/*
 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions
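
// A note on TIMES_OOP: with compressed oops an element is a 4-byte
// narrow oop, otherwise an 8-byte machine oop, so exact_log2 yields a
// shift of 2 or 3 respectively. As a sketch (assuming a hypothetical
// array base in r1 and a signed int index in r2):
//
//   __ lea(r0, Address(r1, r2, TIMES_OOP));
//
// computes r0 = r1 + (sign-extended r2 << 2) when compressed oops are
// enabled, i.e. the address of element r2 of an oop array at r1.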

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp      (r29)  ] <--- fp == saved sp (r31)
  //   1 [ saved lr      (r30)  ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
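
  // A hedged sketch of how this stub is reached (VM-side code, not
  // part of this file): JavaCalls::call_helper in the shared runtime
  // invokes the generated code through a function pointer of roughly
  // this shape, so the eight c_rarg values above are simply the C ABI
  // arguments:
  //
  //   StubRoutines::call_stub()(
  //       (address)&link,             // call wrapper
  //       result_val_address,         // where to store the Java result
  //       result_type,                // BasicType of the result
  //       method(),                   // Method* to invoke
  //       entry_point,                // interpreter entry point
  //       args->parameters(),         // intptr_t* argument block
  //       args->size_of_parameters(), // argument count in words
  //       thread);                    // current thread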

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);
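
    // A worked example of the two instructions above (assuming
    // wordSize == 8): -2 * wordSize == -16 == ~0xf, so the andr
    // rounds the parameter-adjusted SP down to the 16-byte boundary
    // the AArch64 ABI requires. E.g. with sp == 0x1000 and a
    // parameter count of 3 words, rscratch1 == 0x1000 - 24 == 0xfe8
    // and sp becomes 0xfe0.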

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.
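
  // A hedged sketch of the VM-side consumer (not code from this
  // file): after the stub below stores the oop and jumps back to
  // _call_stub_return_address, C++ callers observe the exception
  // through the usual Thread accessors, roughly:
  //
  //   if (thread->has_pending_exception()) {
  //     oop ex = thread->pending_exception();
  //     ...  // the file/line fields stored below identify the setter
  //   }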

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', i.e. non-zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  // n.b. this implementation currently just branches straight to the
  // no-overlap continuation.
  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }
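
  // A note on the DC ZVA path above: the AArch64 "DC ZVA, Xt"
  // instruction zeroes a whole aligned block of zva_length bytes at
  // once (zva_length is CPU-specific, commonly 64). As a worked
  // example, assuming zva_length == 64 and BlockZeroingLowLimit <=
  // 128: low_limit == 128 bytes, so low_limit >> 3 == 16 words, and
  // the ZVA path is taken only when cnt is large enough to cover at
  // least one full block after the 8-byte alignment step.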

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }
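
    // The main loop above is software-pipelined: each iteration
    // stores the 8 words loaded on the previous iteration while
    // issuing the loads for the next block, so the loads for block
    // N+1 overlap the stores for block N, and the drain block flushes
    // the final set of registers once count runs out.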

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
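
    // As a worked example of the bit tests below, take a byte copy
    // (granularity == 1) with count == 13 == 0b1101: bit 3 selects an
    // 8-byte chunk, bit 2 a 4-byte chunk, bit 1 nothing and bit 0 a
    // single byte, so the tail is copied as 8 + 4 + 1 bytes. For
    // larger granularities the same bits are tested shifted down by
    // exact_log2(granularity).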

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96 : 80) / granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
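          // Worked example: for count == 3, after the lsr below count
          // holds 1, so we copy s[0]->d[0], s[2]->d[2] (via send/dend
          // - 1) and s[1]->d[1]; for count == 1 all three load/store
          // pairs hit the single byte s[0]->d[0].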
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }
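
  // A note on clobber_registers above: the mov/orr pair builds the
  // 64-bit pattern 0xdeadbeefdeadbeef (0xdeadbeef ORed with itself
  // shifted left 32 bits), which is then written to r3 through r18
  // (bar rscratch1) so that debug builds fail fast if stub code ever
  // consumes a register it never initialized.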

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }
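
  // A worked example of the overlap test in generate_conjoint_copy
  // above: the comparison is unsigned, so if d < s the difference
  // d - s wraps around to a huge unsigned value, HS is taken, and the
  // forward (no-overlap) copy runs, which is safe when the
  // destination lies below the source. E.g. copying 4 jints with
  // s == 0x1008, d == 0x1000 gives an unsigned d - s far above 16, so
  // we branch to the forward copy; with s == 0x1000, d == 0x1008 we
  // get 8 < 16 and fall through to the overlap-safe backward copy.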

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);

  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }
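
  // A note on the oop copy stubs below: with UseCompressedOops each
  // element is a 4-byte narrow oop and the copy degenerates to an int
  // copy, otherwise it is an 8-byte copy, which is why the element
  // size is chosen as sizeof (jint) versus sizeof (jlong) and the
  // same generate_{disjoint,conjoint}_copy machinery is reused.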

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
1747 1748 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1749 copied_oop, r19_klass, count_save); 1750 1751 __ align(CodeEntryAlignment); 1752 StubCodeMark mark(this, "StubRoutines", name); 1753 address start = __ pc(); 1754 1755 __ enter(); // required for proper stackwalking of RuntimeStub frame 1756 1757 #ifdef ASSERT 1758 // caller guarantees that the arrays really are different 1759 // otherwise, we would have to make conjoint checks 1760 { Label L; 1761 array_overlap_test(L, TIMES_OOP); 1762 __ stop("checkcast_copy within a single array"); 1763 __ bind(L); 1764 } 1765 #endif //ASSERT 1766 1767 // Caller of this entry point must set up the argument registers. 1768 if (entry != NULL) { 1769 *entry = __ pc(); 1770 BLOCK_COMMENT("Entry:"); 1771 } 1772 1773 // Empty array: Nothing to do. 1774 __ cbz(count, L_done); 1775 1776 __ push(RegSet::of(r18, r19, r20, r21), sp); 1777 1778 #ifdef ASSERT 1779 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1780 // The ckoff and ckval must be mutually consistent, 1781 // even though caller generates both. 1782 { Label L; 1783 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1784 __ ldrw(start_to, Address(ckval, sco_offset)); 1785 __ cmpw(ckoff, start_to); 1786 __ br(Assembler::EQ, L); 1787 __ stop("super_check_offset inconsistent"); 1788 __ bind(L); 1789 } 1790 #endif //ASSERT 1791 1792 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 1793 bool is_oop = true; 1794 if (dest_uninitialized) { 1795 decorators |= IS_DEST_UNINITIALIZED; 1796 } 1797 1798 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1799 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1800 1801 // save the original count 1802 __ mov(count_save, count); 1803 1804 // Copy from low to high addresses 1805 __ mov(start_to, to); // Save destination array start address 1806 __ b(L_load_element); 1807 1808 // ======== begin loop ======== 1809 // (Loop is rotated; its entry is L_load_element.) 1810 // Loop control: 1811 // for (; count != 0; count--) { 1812 // copied_oop = load_heap_oop(from++); 1813 // ... generate_type_check ...; 1814 // store_heap_oop(to++, copied_oop); 1815 // } 1816 __ align(OptoLoopAlignment); 1817 1818 __ BIND(L_store_element); 1819 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1820 __ sub(count, count, 1); 1821 __ cbz(count, L_do_card_marks); 1822 1823 // ======== loop entry is here ======== 1824 __ BIND(L_load_element); 1825 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1826 __ cbz(copied_oop, L_store_element); 1827 1828 __ load_klass(r19_klass, copied_oop);// query the object klass 1829 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1830 // ======== end loop ======== 1831 1832 // It was a real error; we must depend on the caller to finish the job. 1833 // Register count = remaining oops, count_orig = total oops. 1834 // Emit GC store barriers for the oops we have copied and report 1835 // their number to the caller. 
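// Note that "-1^K" is bitwise NOT, i.e. r0 = ~K = -1 - K. The subs/eon
// pair below computes exactly that: for example K == 3 copied elements
// yields r0 == -4, and the caller recovers K as ~r0. The EQ branch covers
// K == 0, where no oops were stored and the card marks can be skipped.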
1836
1837 __ subs(count, count_save, count); // K = partially copied oop count
1838 __ eon(count, count, zr); // report (-1^K) to caller
1839 __ br(Assembler::EQ, L_done_pop);
1840
1841 __ BIND(L_do_card_marks);
1842 __ add(to, to, -heapOopSize); // make an inclusive end pointer
1843 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1844
1845 __ bind(L_done_pop);
1846 __ pop(RegSet::of(r18, r19, r20, r21), sp);
1847 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1848
1849 __ bind(L_done);
1850 __ mov(r0, count);
1851 __ leave();
1852 __ ret(lr);
1853
1854 return start;
1855 }
1856
1857 // Perform range checks on the proposed arraycopy.
1858 // Kills temp, but nothing else.
1859 // Also, clean the sign bits of src_pos and dst_pos.
1860 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
1861 Register src_pos, // source position (c_rarg1)
1862 Register dst, // destination array oop (c_rarg2)
1863 Register dst_pos, // destination position (c_rarg3)
1864 Register length,
1865 Register temp,
1866 Label& L_failed) {
1867 BLOCK_COMMENT("arraycopy_range_checks:");
1868
1869 assert_different_registers(rscratch1, temp);
1870
1871 // if (src_pos + length > arrayOop(src)->length()) FAIL;
1872 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1873 __ addw(temp, length, src_pos);
1874 __ cmpw(temp, rscratch1);
1875 __ br(Assembler::HI, L_failed);
1876
1877 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
1878 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1879 __ addw(temp, length, dst_pos);
1880 __ cmpw(temp, rscratch1);
1881 __ br(Assembler::HI, L_failed);
1882
1883 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1884 __ movw(src_pos, src_pos);
1885 __ movw(dst_pos, dst_pos);
1886
1887 BLOCK_COMMENT("arraycopy_range_checks done");
1888 }
1889
1890 // These stubs get called from some dumb test routine.
1891 // I'll write them properly when they're called from
1892 // something that's actually doing something.
1893 static void fake_arraycopy_stub(address src, address dst, int count) {
1894 assert(count == 0, "huh?");
1895 }
1896
1897
1898 //
1899 // Generate 'unsafe' array copy stub
1900 // Though just as safe as the other stubs, it takes an unscaled
1901 // size_t argument instead of an element count.
1902 //
1903 // Input:
1904 // c_rarg0 - source array address
1905 // c_rarg1 - destination array address
1906 // c_rarg2 - byte count, treated as ssize_t, can be zero
1907 //
1908 // Examines the alignment of the operands and dispatches
1909 // to a long, int, short, or byte copy loop.
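// Sketched in C (assuming equivalent integer types), the dispatch below is:
//
//   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
//   if ((bits & (BytesPerLong - 1)) == 0) goto long_copy;  // all 8-aligned
//   if ((bits & (BytesPerInt - 1)) == 0)  goto int_copy;   // all 4-aligned
//   if ((bits & 1) == 0)                  goto short_copy; // all 2-aligned
//   goto byte_copy;
//
// ORing the two addresses and the count together lets one test cover all
// three values at once.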
1910 // 1911 address generate_unsafe_copy(const char *name, 1912 address byte_copy_entry, 1913 address short_copy_entry, 1914 address int_copy_entry, 1915 address long_copy_entry) { 1916 Label L_long_aligned, L_int_aligned, L_short_aligned; 1917 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1918 1919 __ align(CodeEntryAlignment); 1920 StubCodeMark mark(this, "StubRoutines", name); 1921 address start = __ pc(); 1922 __ enter(); // required for proper stackwalking of RuntimeStub frame 1923 1924 // bump this on entry, not on exit: 1925 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1926 1927 __ orr(rscratch1, s, d); 1928 __ orr(rscratch1, rscratch1, count); 1929 1930 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1931 __ cbz(rscratch1, L_long_aligned); 1932 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1933 __ cbz(rscratch1, L_int_aligned); 1934 __ tbz(rscratch1, 0, L_short_aligned); 1935 __ b(RuntimeAddress(byte_copy_entry)); 1936 1937 __ BIND(L_short_aligned); 1938 __ lsr(count, count, LogBytesPerShort); // size => short_count 1939 __ b(RuntimeAddress(short_copy_entry)); 1940 __ BIND(L_int_aligned); 1941 __ lsr(count, count, LogBytesPerInt); // size => int_count 1942 __ b(RuntimeAddress(int_copy_entry)); 1943 __ BIND(L_long_aligned); 1944 __ lsr(count, count, LogBytesPerLong); // size => long_count 1945 __ b(RuntimeAddress(long_copy_entry)); 1946 1947 return start; 1948 } 1949 1950 // 1951 // Generate generic array copy stubs 1952 // 1953 // Input: 1954 // c_rarg0 - src oop 1955 // c_rarg1 - src_pos (32-bits) 1956 // c_rarg2 - dst oop 1957 // c_rarg3 - dst_pos (32-bits) 1958 // c_rarg4 - element count (32-bits) 1959 // 1960 // Output: 1961 // r0 == 0 - success 1962 // r0 == -1^K - failure, where K is partial transfer count 1963 // 1964 address generate_generic_copy(const char *name, 1965 address byte_copy_entry, address short_copy_entry, 1966 address int_copy_entry, address oop_copy_entry, 1967 address long_copy_entry, address checkcast_copy_entry) { 1968 1969 Label L_failed, L_failed_0, L_objArray; 1970 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1971 1972 // Input registers 1973 const Register src = c_rarg0; // source array oop 1974 const Register src_pos = c_rarg1; // source position 1975 const Register dst = c_rarg2; // destination array oop 1976 const Register dst_pos = c_rarg3; // destination position 1977 const Register length = c_rarg4; 1978 1979 StubCodeMark mark(this, "StubRoutines", name); 1980 1981 __ align(CodeEntryAlignment); 1982 address start = __ pc(); 1983 1984 __ enter(); // required for proper stackwalking of RuntimeStub frame 1985 1986 // bump this on entry, not on exit: 1987 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1988 1989 //----------------------------------------------------------------------- 1990 // Assembler stub will be used for this call to arraycopy 1991 // if the following conditions are met: 1992 // 1993 // (1) src and dst must not be null. 1994 // (2) src_pos must not be negative. 1995 // (3) dst_pos must not be negative. 1996 // (4) length must not be negative. 1997 // (5) src klass and dst klass should be the same and not NULL. 1998 // (6) src and dst should be arrays. 1999 // (7) src_pos + length must not exceed length of src. 2000 // (8) dst_pos + length must not exceed length of dst. 2001 // 2002 2003 // if (src == NULL) return -1; 2004 __ cbz(src, L_failed); 2005 2006 // if (src_pos < 0) return -1; 2007 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2008 2009 // if (dst == NULL) return -1; 2010 __ cbz(dst, L_failed); 2011 2012 // if (dst_pos < 0) return -1; 2013 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2014 2015 // registers used as temp 2016 const Register scratch_length = r16; // elements count to copy 2017 const Register scratch_src_klass = r17; // array klass 2018 const Register lh = r18; // layout helper 2019 2020 // if (length < 0) return -1; 2021 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2022 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2023 2024 __ load_klass(scratch_src_klass, src); 2025 #ifdef ASSERT 2026 // assert(src->klass() != NULL); 2027 { 2028 BLOCK_COMMENT("assert klasses not null {"); 2029 Label L1, L2; 2030 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2031 __ bind(L1); 2032 __ stop("broken null klass"); 2033 __ bind(L2); 2034 __ load_klass(rscratch1, dst); 2035 __ cbz(rscratch1, L1); // this would be broken also 2036 BLOCK_COMMENT("} assert klasses not null done"); 2037 } 2038 #endif 2039 2040 // Load layout helper (32-bits) 2041 // 2042 // |array_tag| | header_size | element_type | |log2_element_size| 2043 // 32 30 24 16 8 2 0 2044 // 2045 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2046 // 2047 2048 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2049 2050 // Handle objArrays completely differently... 2051 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2052 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2053 __ movw(rscratch1, objArray_lh); 2054 __ eorw(rscratch2, lh, rscratch1); 2055 __ cbzw(rscratch2, L_objArray); 2056 2057 // if (src->klass() != dst->klass()) return -1; 2058 __ load_klass(rscratch2, dst); 2059 __ eor(rscratch2, rscratch2, scratch_src_klass); 2060 __ cbnz(rscratch2, L_failed); 2061 2062 // if (!src->is_Array()) return -1; 2063 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2064 2065 // At this point, it is known to be a typeArray (array_tag 0x3). 2066 #ifdef ASSERT 2067 { 2068 BLOCK_COMMENT("assert primitive array {"); 2069 Label L; 2070 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2071 __ cmpw(lh, rscratch2); 2072 __ br(Assembler::GE, L); 2073 __ stop("must be a primitive array"); 2074 __ bind(L); 2075 BLOCK_COMMENT("} assert primitive array done"); 2076 } 2077 #endif 2078 2079 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2080 rscratch2, L_failed); 2081 2082 // TypeArrayKlass 2083 // 2084 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2085 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2086 // 2087 2088 const Register rscratch1_offset = rscratch1; // array offset 2089 const Register r18_elsize = lh; // element size 2090 2091 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2092 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2093 __ add(src, src, rscratch1_offset); // src array offset 2094 __ add(dst, dst, rscratch1_offset); // dst array offset 2095 BLOCK_COMMENT("choose copy loop based on element size"); 2096 2097 // next registers should be set before the jump to corresponding stub 2098 const Register from = c_rarg0; // source array address 2099 const Register to = c_rarg1; // destination array address 2100 const Register count = c_rarg2; // elements count 2101 2102 // 'from', 'to', 'count' registers should be set in such order 2103 // since they are the same as 'src', 'src_pos', 'dst'. 
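// In C terms (a sketch using the field names above, not executable as-is),
// the ubfx/add sequence just performed computes:
//
//   int hdr = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
//   src += hdr;  dst += hdr;
//
// and the copy-loop dispatch below then adds (pos << log2_element_size),
// where log2_element_size (0..3) is the low bits of lh.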
2104 2105 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2106 2107 // The possible values of elsize are 0-3, i.e. exact_log2(element 2108 // size in bytes). We do a simple bitwise binary search. 2109 __ BIND(L_copy_bytes); 2110 __ tbnz(r18_elsize, 1, L_copy_ints); 2111 __ tbnz(r18_elsize, 0, L_copy_shorts); 2112 __ lea(from, Address(src, src_pos));// src_addr 2113 __ lea(to, Address(dst, dst_pos));// dst_addr 2114 __ movw(count, scratch_length); // length 2115 __ b(RuntimeAddress(byte_copy_entry)); 2116 2117 __ BIND(L_copy_shorts); 2118 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2119 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2120 __ movw(count, scratch_length); // length 2121 __ b(RuntimeAddress(short_copy_entry)); 2122 2123 __ BIND(L_copy_ints); 2124 __ tbnz(r18_elsize, 0, L_copy_longs); 2125 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2126 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2127 __ movw(count, scratch_length); // length 2128 __ b(RuntimeAddress(int_copy_entry)); 2129 2130 __ BIND(L_copy_longs); 2131 #ifdef ASSERT 2132 { 2133 BLOCK_COMMENT("assert long copy {"); 2134 Label L; 2135 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2136 __ cmpw(r18_elsize, LogBytesPerLong); 2137 __ br(Assembler::EQ, L); 2138 __ stop("must be long copy, but elsize is wrong"); 2139 __ bind(L); 2140 BLOCK_COMMENT("} assert long copy done"); 2141 } 2142 #endif 2143 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2144 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2145 __ movw(count, scratch_length); // length 2146 __ b(RuntimeAddress(long_copy_entry)); 2147 2148 // ObjArrayKlass 2149 __ BIND(L_objArray); 2150 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2151 2152 Label L_plain_copy, L_checkcast_copy; 2153 // test array classes for subtyping 2154 __ load_klass(r18, dst); 2155 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2156 __ br(Assembler::NE, L_checkcast_copy); 2157 2158 // Identically typed arrays can be copied without element-wise checks. 2159 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2160 rscratch2, L_failed); 2161 2162 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2163 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2164 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2165 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2166 __ movw(count, scratch_length); // length 2167 __ BIND(L_plain_copy); 2168 __ b(RuntimeAddress(oop_copy_entry)); 2169 2170 __ BIND(L_checkcast_copy); 2171 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2172 { 2173 // Before looking at dst.length, make sure dst is also an objArray. 2174 __ ldrw(rscratch1, Address(r18, lh_offset)); 2175 __ movw(rscratch2, objArray_lh); 2176 __ eorw(rscratch1, rscratch1, rscratch2); 2177 __ cbnzw(rscratch1, L_failed); 2178 2179 // It is safe to examine both src.length and dst.length. 2180 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2181 r18, L_failed); 2182 2183 const Register rscratch2_dst_klass = rscratch2; 2184 __ load_klass(rscratch2_dst_klass, dst); // reload 2185 2186 // Marshal the base address arguments now, freeing registers. 
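// The checkcast handoff further below can be sketched as (illustrative;
// taken only after the fast type check fails to prove the copy is plain):
//
//   Klass* dst_elem = ObjArrayKlass::cast(dst_klass)->element_klass();
//   c_rarg3 = dst_elem->super_check_offset();   // ckoff
//   c_rarg4 = dst_elem;                         // ckval
//   goto checkcast_copy_entry;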
2187 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2188 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2189 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2190 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2191 __ movw(count, length); // length (reloaded) 2192 Register sco_temp = c_rarg3; // this register is free now 2193 assert_different_registers(from, to, count, sco_temp, 2194 rscratch2_dst_klass, scratch_src_klass); 2195 // assert_clean_int(count, sco_temp); 2196 2197 // Generate the type check. 2198 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2199 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2200 // assert_clean_int(sco_temp, r18); 2201 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2202 2203 // Fetch destination element klass from the ObjArrayKlass header. 2204 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2205 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2206 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2207 2208 // the checkcast_copy loop needs two extra arguments: 2209 assert(c_rarg3 == sco_temp, "#3 already in place"); 2210 // Set up arguments for checkcast_copy_entry. 2211 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2212 __ b(RuntimeAddress(checkcast_copy_entry)); 2213 } 2214 2215 __ BIND(L_failed); 2216 __ mov(r0, -1); 2217 __ leave(); // required for proper stackwalking of RuntimeStub frame 2218 __ ret(lr); 2219 2220 return start; 2221 } 2222 2223 // 2224 // Generate stub for array fill. If "aligned" is true, the 2225 // "to" address is assumed to be heapword aligned. 2226 // 2227 // Arguments for generated stub: 2228 // to: c_rarg0 2229 // value: c_rarg1 2230 // count: c_rarg2 treated as signed 2231 // 2232 address generate_fill(BasicType t, bool aligned, const char *name) { 2233 __ align(CodeEntryAlignment); 2234 StubCodeMark mark(this, "StubRoutines", name); 2235 address start = __ pc(); 2236 2237 BLOCK_COMMENT("Entry:"); 2238 2239 const Register to = c_rarg0; // source array address 2240 const Register value = c_rarg1; // value 2241 const Register count = c_rarg2; // elements count 2242 2243 const Register bz_base = r10; // base for block_zero routine 2244 const Register cnt_words = r11; // temp register 2245 2246 __ enter(); 2247 2248 Label L_fill_elements, L_exit1; 2249 2250 int shift = -1; 2251 switch (t) { 2252 case T_BYTE: 2253 shift = 0; 2254 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2255 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2256 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2257 __ br(Assembler::LO, L_fill_elements); 2258 break; 2259 case T_SHORT: 2260 shift = 1; 2261 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2262 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2263 __ br(Assembler::LO, L_fill_elements); 2264 break; 2265 case T_INT: 2266 shift = 2; 2267 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2268 __ br(Assembler::LO, L_fill_elements); 2269 break; 2270 default: ShouldNotReachHere(); 2271 } 2272 2273 // Align source address at 8 bytes address boundary. 2274 Label L_skip_align1, L_skip_align2, L_skip_align4; 2275 if (!aligned) { 2276 switch (t) { 2277 case T_BYTE: 2278 // One byte misalignment happens only for byte arrays. 
2279 __ tbz(to, 0, L_skip_align1); 2280 __ strb(value, Address(__ post(to, 1))); 2281 __ subw(count, count, 1); 2282 __ bind(L_skip_align1); 2283 // Fallthrough 2284 case T_SHORT: 2285 // Two bytes misalignment happens only for byte and short (char) arrays. 2286 __ tbz(to, 1, L_skip_align2); 2287 __ strh(value, Address(__ post(to, 2))); 2288 __ subw(count, count, 2 >> shift); 2289 __ bind(L_skip_align2); 2290 // Fallthrough 2291 case T_INT: 2292 // Align to 8 bytes, we know we are 4 byte aligned to start. 2293 __ tbz(to, 2, L_skip_align4); 2294 __ strw(value, Address(__ post(to, 4))); 2295 __ subw(count, count, 4 >> shift); 2296 __ bind(L_skip_align4); 2297 break; 2298 default: ShouldNotReachHere(); 2299 } 2300 } 2301 2302 // 2303 // Fill large chunks 2304 // 2305 __ lsrw(cnt_words, count, 3 - shift); // number of words 2306 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2307 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2308 if (UseBlockZeroing) { 2309 Label non_block_zeroing, rest; 2310 // If the fill value is zero we can use the fast zero_words(). 2311 __ cbnz(value, non_block_zeroing); 2312 __ mov(bz_base, to); 2313 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2314 __ zero_words(bz_base, cnt_words); 2315 __ b(rest); 2316 __ bind(non_block_zeroing); 2317 __ fill_words(to, cnt_words, value); 2318 __ bind(rest); 2319 } else { 2320 __ fill_words(to, cnt_words, value); 2321 } 2322 2323 // Remaining count is less than 8 bytes. Fill it by a single store. 2324 // Note that the total length is no less than 8 bytes. 2325 if (t == T_BYTE || t == T_SHORT) { 2326 Label L_exit1; 2327 __ cbzw(count, L_exit1); 2328 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2329 __ str(value, Address(to, -8)); // overwrite some elements 2330 __ bind(L_exit1); 2331 __ leave(); 2332 __ ret(lr); 2333 } 2334 2335 // Handle copies less than 8 bytes. 
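// Worked example of the bfi splat used above, for T_BYTE value 0xAB:
//   bfi(value, value, 8, 8)   -> 0x0000ABAB
//   bfi(value, value, 16, 16) -> 0xABABABAB
//   bfi(value, value, 32, 32) -> 0xABABABABABABABAB  (large-chunk path)
// so each 64-bit store issued by fill_words covers eight byte elements.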
2336 Label L_fill_2, L_fill_4, L_exit2; 2337 __ bind(L_fill_elements); 2338 switch (t) { 2339 case T_BYTE: 2340 __ tbz(count, 0, L_fill_2); 2341 __ strb(value, Address(__ post(to, 1))); 2342 __ bind(L_fill_2); 2343 __ tbz(count, 1, L_fill_4); 2344 __ strh(value, Address(__ post(to, 2))); 2345 __ bind(L_fill_4); 2346 __ tbz(count, 2, L_exit2); 2347 __ strw(value, Address(to)); 2348 break; 2349 case T_SHORT: 2350 __ tbz(count, 0, L_fill_4); 2351 __ strh(value, Address(__ post(to, 2))); 2352 __ bind(L_fill_4); 2353 __ tbz(count, 1, L_exit2); 2354 __ strw(value, Address(to)); 2355 break; 2356 case T_INT: 2357 __ cbzw(count, L_exit2); 2358 __ strw(value, Address(to)); 2359 break; 2360 default: ShouldNotReachHere(); 2361 } 2362 __ bind(L_exit2); 2363 __ leave(); 2364 __ ret(lr); 2365 return start; 2366 } 2367 2368 address generate_data_cache_writeback() { 2369 const Register line = c_rarg0; // address of line to write back 2370 2371 __ align(CodeEntryAlignment); 2372 2373 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2374 2375 address start = __ pc(); 2376 __ enter(); 2377 __ cache_wb(Address(line, 0)); 2378 __ leave(); 2379 __ ret(lr); 2380 2381 return start; 2382 } 2383 2384 address generate_data_cache_writeback_sync() { 2385 const Register is_pre = c_rarg0; // pre or post sync 2386 2387 __ align(CodeEntryAlignment); 2388 2389 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2390 2391 // pre wbsync is a no-op 2392 // post wbsync translates to an sfence 2393 2394 Label skip; 2395 address start = __ pc(); 2396 __ enter(); 2397 __ cbnz(is_pre, skip); 2398 __ cache_wbsync(false); 2399 __ bind(skip); 2400 __ leave(); 2401 __ ret(lr); 2402 2403 return start; 2404 } 2405 2406 void generate_arraycopy_stubs() { 2407 address entry; 2408 address entry_jbyte_arraycopy; 2409 address entry_jshort_arraycopy; 2410 address entry_jint_arraycopy; 2411 address entry_oop_arraycopy; 2412 address entry_jlong_arraycopy; 2413 address entry_checkcast_arraycopy; 2414 2415 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2416 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2417 2418 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2419 2420 //*** jbyte 2421 // Always need aligned and unaligned versions 2422 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2423 "jbyte_disjoint_arraycopy"); 2424 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2425 &entry_jbyte_arraycopy, 2426 "jbyte_arraycopy"); 2427 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2428 "arrayof_jbyte_disjoint_arraycopy"); 2429 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2430 "arrayof_jbyte_arraycopy"); 2431 2432 //*** jshort 2433 // Always need aligned and unaligned versions 2434 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2435 "jshort_disjoint_arraycopy"); 2436 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2437 &entry_jshort_arraycopy, 2438 "jshort_arraycopy"); 2439 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2440 "arrayof_jshort_disjoint_arraycopy"); 2441 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2442 "arrayof_jshort_arraycopy"); 2443 2444 //*** jint 2445 // Aligned versions 2446 StubRoutines::_arrayof_jint_disjoint_arraycopy = 
generate_disjoint_int_copy(true, &entry, 2447 "arrayof_jint_disjoint_arraycopy"); 2448 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2449 "arrayof_jint_arraycopy"); 2450 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2451 // entry_jint_arraycopy always points to the unaligned version 2452 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2453 "jint_disjoint_arraycopy"); 2454 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2455 &entry_jint_arraycopy, 2456 "jint_arraycopy"); 2457 2458 //*** jlong 2459 // It is always aligned 2460 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2461 "arrayof_jlong_disjoint_arraycopy"); 2462 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2463 "arrayof_jlong_arraycopy"); 2464 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2465 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2466 2467 //*** oops 2468 { 2469 // With compressed oops we need unaligned versions; notice that 2470 // we overwrite entry_oop_arraycopy. 2471 bool aligned = !UseCompressedOops; 2472 2473 StubRoutines::_arrayof_oop_disjoint_arraycopy 2474 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2475 /*dest_uninitialized*/false); 2476 StubRoutines::_arrayof_oop_arraycopy 2477 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2478 /*dest_uninitialized*/false); 2479 // Aligned versions without pre-barriers 2480 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2481 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2482 /*dest_uninitialized*/true); 2483 StubRoutines::_arrayof_oop_arraycopy_uninit 2484 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2485 /*dest_uninitialized*/true); 2486 } 2487 2488 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2489 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2490 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2491 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2492 2493 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2494 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2495 /*dest_uninitialized*/true); 2496 2497 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2498 entry_jbyte_arraycopy, 2499 entry_jshort_arraycopy, 2500 entry_jint_arraycopy, 2501 entry_jlong_arraycopy); 2502 2503 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2504 entry_jbyte_arraycopy, 2505 entry_jshort_arraycopy, 2506 entry_jint_arraycopy, 2507 entry_oop_arraycopy, 2508 entry_jlong_arraycopy, 2509 entry_checkcast_arraycopy); 2510 2511 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2512 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2513 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2514 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2515 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, 
"arrayof_jshort_fill"); 2516 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2517 } 2518 2519 void generate_math_stubs() { Unimplemented(); } 2520 2521 // Arguments: 2522 // 2523 // Inputs: 2524 // c_rarg0 - source byte array address 2525 // c_rarg1 - destination byte array address 2526 // c_rarg2 - K (key) in little endian int array 2527 // 2528 address generate_aescrypt_encryptBlock() { 2529 __ align(CodeEntryAlignment); 2530 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2531 2532 Label L_doLast; 2533 2534 const Register from = c_rarg0; // source array address 2535 const Register to = c_rarg1; // destination array address 2536 const Register key = c_rarg2; // key array address 2537 const Register keylen = rscratch1; 2538 2539 address start = __ pc(); 2540 __ enter(); 2541 2542 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2543 2544 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2545 2546 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2547 __ rev32(v1, __ T16B, v1); 2548 __ rev32(v2, __ T16B, v2); 2549 __ rev32(v3, __ T16B, v3); 2550 __ rev32(v4, __ T16B, v4); 2551 __ aese(v0, v1); 2552 __ aesmc(v0, v0); 2553 __ aese(v0, v2); 2554 __ aesmc(v0, v0); 2555 __ aese(v0, v3); 2556 __ aesmc(v0, v0); 2557 __ aese(v0, v4); 2558 __ aesmc(v0, v0); 2559 2560 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2561 __ rev32(v1, __ T16B, v1); 2562 __ rev32(v2, __ T16B, v2); 2563 __ rev32(v3, __ T16B, v3); 2564 __ rev32(v4, __ T16B, v4); 2565 __ aese(v0, v1); 2566 __ aesmc(v0, v0); 2567 __ aese(v0, v2); 2568 __ aesmc(v0, v0); 2569 __ aese(v0, v3); 2570 __ aesmc(v0, v0); 2571 __ aese(v0, v4); 2572 __ aesmc(v0, v0); 2573 2574 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2575 __ rev32(v1, __ T16B, v1); 2576 __ rev32(v2, __ T16B, v2); 2577 2578 __ cmpw(keylen, 44); 2579 __ br(Assembler::EQ, L_doLast); 2580 2581 __ aese(v0, v1); 2582 __ aesmc(v0, v0); 2583 __ aese(v0, v2); 2584 __ aesmc(v0, v0); 2585 2586 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2587 __ rev32(v1, __ T16B, v1); 2588 __ rev32(v2, __ T16B, v2); 2589 2590 __ cmpw(keylen, 52); 2591 __ br(Assembler::EQ, L_doLast); 2592 2593 __ aese(v0, v1); 2594 __ aesmc(v0, v0); 2595 __ aese(v0, v2); 2596 __ aesmc(v0, v0); 2597 2598 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2599 __ rev32(v1, __ T16B, v1); 2600 __ rev32(v2, __ T16B, v2); 2601 2602 __ BIND(L_doLast); 2603 2604 __ aese(v0, v1); 2605 __ aesmc(v0, v0); 2606 __ aese(v0, v2); 2607 2608 __ ld1(v1, __ T16B, key); 2609 __ rev32(v1, __ T16B, v1); 2610 __ eor(v0, __ T16B, v0, v1); 2611 2612 __ st1(v0, __ T16B, to); 2613 2614 __ mov(r0, 0); 2615 2616 __ leave(); 2617 __ ret(lr); 2618 2619 return start; 2620 } 2621 2622 // Arguments: 2623 // 2624 // Inputs: 2625 // c_rarg0 - source byte array address 2626 // c_rarg1 - destination byte array address 2627 // c_rarg2 - K (key) in little endian int array 2628 // 2629 address generate_aescrypt_decryptBlock() { 2630 assert(UseAES, "need AES instructions and misaligned SSE support"); 2631 __ align(CodeEntryAlignment); 2632 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2633 Label L_doLast; 2634 2635 const Register from = c_rarg0; // source array address 2636 const Register to = c_rarg1; // destination array address 2637 const Register key = c_rarg2; // key array address 2638 const Register keylen = rscratch1; 2639 2640 address start = __ pc(); 2641 __ enter(); // required for proper stackwalking of RuntimeStub frame 
2642 2643 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2644 2645 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2646 2647 __ ld1(v5, __ T16B, __ post(key, 16)); 2648 __ rev32(v5, __ T16B, v5); 2649 2650 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2651 __ rev32(v1, __ T16B, v1); 2652 __ rev32(v2, __ T16B, v2); 2653 __ rev32(v3, __ T16B, v3); 2654 __ rev32(v4, __ T16B, v4); 2655 __ aesd(v0, v1); 2656 __ aesimc(v0, v0); 2657 __ aesd(v0, v2); 2658 __ aesimc(v0, v0); 2659 __ aesd(v0, v3); 2660 __ aesimc(v0, v0); 2661 __ aesd(v0, v4); 2662 __ aesimc(v0, v0); 2663 2664 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2665 __ rev32(v1, __ T16B, v1); 2666 __ rev32(v2, __ T16B, v2); 2667 __ rev32(v3, __ T16B, v3); 2668 __ rev32(v4, __ T16B, v4); 2669 __ aesd(v0, v1); 2670 __ aesimc(v0, v0); 2671 __ aesd(v0, v2); 2672 __ aesimc(v0, v0); 2673 __ aesd(v0, v3); 2674 __ aesimc(v0, v0); 2675 __ aesd(v0, v4); 2676 __ aesimc(v0, v0); 2677 2678 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2679 __ rev32(v1, __ T16B, v1); 2680 __ rev32(v2, __ T16B, v2); 2681 2682 __ cmpw(keylen, 44); 2683 __ br(Assembler::EQ, L_doLast); 2684 2685 __ aesd(v0, v1); 2686 __ aesimc(v0, v0); 2687 __ aesd(v0, v2); 2688 __ aesimc(v0, v0); 2689 2690 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2691 __ rev32(v1, __ T16B, v1); 2692 __ rev32(v2, __ T16B, v2); 2693 2694 __ cmpw(keylen, 52); 2695 __ br(Assembler::EQ, L_doLast); 2696 2697 __ aesd(v0, v1); 2698 __ aesimc(v0, v0); 2699 __ aesd(v0, v2); 2700 __ aesimc(v0, v0); 2701 2702 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2703 __ rev32(v1, __ T16B, v1); 2704 __ rev32(v2, __ T16B, v2); 2705 2706 __ BIND(L_doLast); 2707 2708 __ aesd(v0, v1); 2709 __ aesimc(v0, v0); 2710 __ aesd(v0, v2); 2711 2712 __ eor(v0, __ T16B, v0, v5); 2713 2714 __ st1(v0, __ T16B, to); 2715 2716 __ mov(r0, 0); 2717 2718 __ leave(); 2719 __ ret(lr); 2720 2721 return start; 2722 } 2723 2724 // Arguments: 2725 // 2726 // Inputs: 2727 // c_rarg0 - source byte array address 2728 // c_rarg1 - destination byte array address 2729 // c_rarg2 - K (key) in little endian int array 2730 // c_rarg3 - r vector byte array address 2731 // c_rarg4 - input length 2732 // 2733 // Output: 2734 // x0 - input length 2735 // 2736 address generate_cipherBlockChaining_encryptAESCrypt() { 2737 assert(UseAES, "need AES instructions and misaligned SSE support"); 2738 __ align(CodeEntryAlignment); 2739 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2740 2741 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2742 2743 const Register from = c_rarg0; // source array address 2744 const Register to = c_rarg1; // destination array address 2745 const Register key = c_rarg2; // key array address 2746 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2747 // and left with the results of the last encryption block 2748 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2749 const Register keylen = rscratch1; 2750 2751 address start = __ pc(); 2752 2753 __ enter(); 2754 2755 __ movw(rscratch2, len_reg); 2756 2757 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2758 2759 __ ld1(v0, __ T16B, rvec); 2760 2761 __ cmpw(keylen, 52); 2762 __ br(Assembler::CC, L_loadkeys_44); 2763 __ br(Assembler::EQ, L_loadkeys_52); 2764 2765 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2766 __ rev32(v17, __ 
T16B, v17); 2767 __ rev32(v18, __ T16B, v18); 2768 __ BIND(L_loadkeys_52); 2769 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2770 __ rev32(v19, __ T16B, v19); 2771 __ rev32(v20, __ T16B, v20); 2772 __ BIND(L_loadkeys_44); 2773 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2774 __ rev32(v21, __ T16B, v21); 2775 __ rev32(v22, __ T16B, v22); 2776 __ rev32(v23, __ T16B, v23); 2777 __ rev32(v24, __ T16B, v24); 2778 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2779 __ rev32(v25, __ T16B, v25); 2780 __ rev32(v26, __ T16B, v26); 2781 __ rev32(v27, __ T16B, v27); 2782 __ rev32(v28, __ T16B, v28); 2783 __ ld1(v29, v30, v31, __ T16B, key); 2784 __ rev32(v29, __ T16B, v29); 2785 __ rev32(v30, __ T16B, v30); 2786 __ rev32(v31, __ T16B, v31); 2787 2788 __ BIND(L_aes_loop); 2789 __ ld1(v1, __ T16B, __ post(from, 16)); 2790 __ eor(v0, __ T16B, v0, v1); 2791 2792 __ br(Assembler::CC, L_rounds_44); 2793 __ br(Assembler::EQ, L_rounds_52); 2794 2795 __ aese(v0, v17); __ aesmc(v0, v0); 2796 __ aese(v0, v18); __ aesmc(v0, v0); 2797 __ BIND(L_rounds_52); 2798 __ aese(v0, v19); __ aesmc(v0, v0); 2799 __ aese(v0, v20); __ aesmc(v0, v0); 2800 __ BIND(L_rounds_44); 2801 __ aese(v0, v21); __ aesmc(v0, v0); 2802 __ aese(v0, v22); __ aesmc(v0, v0); 2803 __ aese(v0, v23); __ aesmc(v0, v0); 2804 __ aese(v0, v24); __ aesmc(v0, v0); 2805 __ aese(v0, v25); __ aesmc(v0, v0); 2806 __ aese(v0, v26); __ aesmc(v0, v0); 2807 __ aese(v0, v27); __ aesmc(v0, v0); 2808 __ aese(v0, v28); __ aesmc(v0, v0); 2809 __ aese(v0, v29); __ aesmc(v0, v0); 2810 __ aese(v0, v30); 2811 __ eor(v0, __ T16B, v0, v31); 2812 2813 __ st1(v0, __ T16B, __ post(to, 16)); 2814 2815 __ subw(len_reg, len_reg, 16); 2816 __ cbnzw(len_reg, L_aes_loop); 2817 2818 __ st1(v0, __ T16B, rvec); 2819 2820 __ mov(r0, rscratch2); 2821 2822 __ leave(); 2823 __ ret(lr); 2824 2825 return start; 2826 } 2827 2828 // Arguments: 2829 // 2830 // Inputs: 2831 // c_rarg0 - source byte array address 2832 // c_rarg1 - destination byte array address 2833 // c_rarg2 - K (key) in little endian int array 2834 // c_rarg3 - r vector byte array address 2835 // c_rarg4 - input length 2836 // 2837 // Output: 2838 // r0 - input length 2839 // 2840 address generate_cipherBlockChaining_decryptAESCrypt() { 2841 assert(UseAES, "need AES instructions and misaligned SSE support"); 2842 __ align(CodeEntryAlignment); 2843 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2844 2845 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2846 2847 const Register from = c_rarg0; // source array address 2848 const Register to = c_rarg1; // destination array address 2849 const Register key = c_rarg2; // key array address 2850 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2851 // and left with the results of the last encryption block 2852 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2853 const Register keylen = rscratch1; 2854 2855 address start = __ pc(); 2856 2857 __ enter(); 2858 2859 __ movw(rscratch2, len_reg); 2860 2861 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2862 2863 __ ld1(v2, __ T16B, rvec); 2864 2865 __ ld1(v31, __ T16B, __ post(key, 16)); 2866 __ rev32(v31, __ T16B, v31); 2867 2868 __ cmpw(keylen, 52); 2869 __ br(Assembler::CC, L_loadkeys_44); 2870 __ br(Assembler::EQ, L_loadkeys_52); 2871 2872 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2873 __ rev32(v17, __ T16B, v17); 2874 
__ rev32(v18, __ T16B, v18); 2875 __ BIND(L_loadkeys_52); 2876 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2877 __ rev32(v19, __ T16B, v19); 2878 __ rev32(v20, __ T16B, v20); 2879 __ BIND(L_loadkeys_44); 2880 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2881 __ rev32(v21, __ T16B, v21); 2882 __ rev32(v22, __ T16B, v22); 2883 __ rev32(v23, __ T16B, v23); 2884 __ rev32(v24, __ T16B, v24); 2885 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2886 __ rev32(v25, __ T16B, v25); 2887 __ rev32(v26, __ T16B, v26); 2888 __ rev32(v27, __ T16B, v27); 2889 __ rev32(v28, __ T16B, v28); 2890 __ ld1(v29, v30, __ T16B, key); 2891 __ rev32(v29, __ T16B, v29); 2892 __ rev32(v30, __ T16B, v30); 2893 2894 __ BIND(L_aes_loop); 2895 __ ld1(v0, __ T16B, __ post(from, 16)); 2896 __ orr(v1, __ T16B, v0, v0); 2897 2898 __ br(Assembler::CC, L_rounds_44); 2899 __ br(Assembler::EQ, L_rounds_52); 2900 2901 __ aesd(v0, v17); __ aesimc(v0, v0); 2902 __ aesd(v0, v18); __ aesimc(v0, v0); 2903 __ BIND(L_rounds_52); 2904 __ aesd(v0, v19); __ aesimc(v0, v0); 2905 __ aesd(v0, v20); __ aesimc(v0, v0); 2906 __ BIND(L_rounds_44); 2907 __ aesd(v0, v21); __ aesimc(v0, v0); 2908 __ aesd(v0, v22); __ aesimc(v0, v0); 2909 __ aesd(v0, v23); __ aesimc(v0, v0); 2910 __ aesd(v0, v24); __ aesimc(v0, v0); 2911 __ aesd(v0, v25); __ aesimc(v0, v0); 2912 __ aesd(v0, v26); __ aesimc(v0, v0); 2913 __ aesd(v0, v27); __ aesimc(v0, v0); 2914 __ aesd(v0, v28); __ aesimc(v0, v0); 2915 __ aesd(v0, v29); __ aesimc(v0, v0); 2916 __ aesd(v0, v30); 2917 __ eor(v0, __ T16B, v0, v31); 2918 __ eor(v0, __ T16B, v0, v2); 2919 2920 __ st1(v0, __ T16B, __ post(to, 16)); 2921 __ orr(v2, __ T16B, v1, v1); 2922 2923 __ subw(len_reg, len_reg, 16); 2924 __ cbnzw(len_reg, L_aes_loop); 2925 2926 __ st1(v2, __ T16B, rvec); 2927 2928 __ mov(r0, rscratch2); 2929 2930 __ leave(); 2931 __ ret(lr); 2932 2933 return start; 2934 } 2935 2936 // Arguments: 2937 // 2938 // Inputs: 2939 // c_rarg0 - byte[] source+offset 2940 // c_rarg1 - int[] SHA.state 2941 // c_rarg2 - int offset 2942 // c_rarg3 - int limit 2943 // 2944 address generate_sha1_implCompress(bool multi_block, const char *name) { 2945 __ align(CodeEntryAlignment); 2946 StubCodeMark mark(this, "StubRoutines", name); 2947 address start = __ pc(); 2948 2949 Register buf = c_rarg0; 2950 Register state = c_rarg1; 2951 Register ofs = c_rarg2; 2952 Register limit = c_rarg3; 2953 2954 Label keys; 2955 Label sha1_loop; 2956 2957 // load the keys into v0..v3 2958 __ adr(rscratch1, keys); 2959 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2960 // load 5 words state into v6, v7 2961 __ ldrq(v6, Address(state, 0)); 2962 __ ldrs(v7, Address(state, 16)); 2963 2964 2965 __ BIND(sha1_loop); 2966 // load 64 bytes of data into v16..v19 2967 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2968 __ rev32(v16, __ T16B, v16); 2969 __ rev32(v17, __ T16B, v17); 2970 __ rev32(v18, __ T16B, v18); 2971 __ rev32(v19, __ T16B, v19); 2972 2973 // do the sha1 2974 __ addv(v4, __ T4S, v16, v0); 2975 __ orr(v20, __ T16B, v6, v6); 2976 2977 FloatRegister d0 = v16; 2978 FloatRegister d1 = v17; 2979 FloatRegister d2 = v18; 2980 FloatRegister d3 = v19; 2981 2982 for (int round = 0; round < 20; round++) { 2983 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2984 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2985 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2986 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2987 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2988 2989 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2990 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2991 __ sha1h(tmp2, __ T4S, v20); 2992 if (round < 5) 2993 __ sha1c(v20, __ T4S, tmp3, tmp4); 2994 else if (round < 10 || round >= 15) 2995 __ sha1p(v20, __ T4S, tmp3, tmp4); 2996 else 2997 __ sha1m(v20, __ T4S, tmp3, tmp4); 2998 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2999 3000 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3001 } 3002 3003 __ addv(v7, __ T2S, v7, v21); 3004 __ addv(v6, __ T4S, v6, v20); 3005 3006 if (multi_block) { 3007 __ add(ofs, ofs, 64); 3008 __ cmp(ofs, limit); 3009 __ br(Assembler::LE, sha1_loop); 3010 __ mov(c_rarg0, ofs); // return ofs 3011 } 3012 3013 __ strq(v6, Address(state, 0)); 3014 __ strs(v7, Address(state, 16)); 3015 3016 __ ret(lr); 3017 3018 __ bind(keys); 3019 __ emit_int32(0x5a827999); 3020 __ emit_int32(0x6ed9eba1); 3021 __ emit_int32(0x8f1bbcdc); 3022 __ emit_int32(0xca62c1d6); 3023 3024 return start; 3025 } 3026 3027 3028 // Arguments: 3029 // 3030 // Inputs: 3031 // c_rarg0 - byte[] source+offset 3032 // c_rarg1 - int[] SHA.state 3033 // c_rarg2 - int offset 3034 // c_rarg3 - int limit 3035 // 3036 address generate_sha256_implCompress(bool multi_block, const char *name) { 3037 static const uint32_t round_consts[64] = { 3038 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3039 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3040 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3041 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3042 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3043 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3044 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3045 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3046 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3047 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3048 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3049 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3050 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3051 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3052 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3053 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3054 }; 3055 __ align(CodeEntryAlignment); 3056 StubCodeMark mark(this, "StubRoutines", name); 3057 address start = __ pc(); 3058 3059 Register buf = c_rarg0; 3060 Register state = c_rarg1; 3061 Register ofs = c_rarg2; 3062 Register limit = c_rarg3; 3063 3064 Label sha1_loop; 3065 3066 __ stpd(v8, v9, __ pre(sp, -32)); 3067 __ stpd(v10, v11, Address(sp, 16)); 3068 3069 // dga == v0 3070 // dgb == v1 3071 // dg0 == v2 3072 // dg1 == v3 3073 // dg2 == v4 3074 // t0 == v6 3075 // t1 == v7 3076 3077 // load 16 keys to v16..v31 3078 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3079 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3080 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3081 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3082 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3083 3084 // load 8 words (256 bits) state 3085 __ ldpq(v0, v1, state); 3086 3087 __ BIND(sha1_loop); 3088 // load 64 bytes of data into v8..v11 3089 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
3090 __ rev32(v8, __ T16B, v8);
3091 __ rev32(v9, __ T16B, v9);
3092 __ rev32(v10, __ T16B, v10);
3093 __ rev32(v11, __ T16B, v11);
3094
3095 __ addv(v6, __ T4S, v8, v16);
3096 __ orr(v2, __ T16B, v0, v0);
3097 __ orr(v3, __ T16B, v1, v1);
3098
3099 FloatRegister d0 = v8;
3100 FloatRegister d1 = v9;
3101 FloatRegister d2 = v10;
3102 FloatRegister d3 = v11;
3103
3104
3105 for (int round = 0; round < 16; round++) {
3106 FloatRegister tmp1 = (round & 1) ? v6 : v7;
3107 FloatRegister tmp2 = (round & 1) ? v7 : v6;
3108 FloatRegister tmp3 = (round & 1) ? v2 : v4;
3109 FloatRegister tmp4 = (round & 1) ? v4 : v2;
3110
3111 if (round < 12) __ sha256su0(d0, __ T4S, d1);
3112 __ orr(v4, __ T16B, v2, v2);
3113 if (round < 15)
3114 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3115 __ sha256h(v2, __ T4S, v3, tmp2);
3116 __ sha256h2(v3, __ T4S, v4, tmp2);
3117 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3118
3119 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3120 }
3121
3122 __ addv(v0, __ T4S, v0, v2);
3123 __ addv(v1, __ T4S, v1, v3);
3124
3125 if (multi_block) {
3126 __ add(ofs, ofs, 64);
3127 __ cmp(ofs, limit);
3128 __ br(Assembler::LE, sha1_loop);
3129 __ mov(c_rarg0, ofs); // return ofs
3130 }
3131
3132 __ ldpd(v10, v11, Address(sp, 16));
3133 __ ldpd(v8, v9, __ post(sp, 32));
3134
3135 __ stpq(v0, v1, state);
3136
3137 __ ret(lr);
3138
3139 return start;
3140 }
3141
3142 #ifndef BUILTIN_SIM
3143 // Safefetch stubs.
3144 void generate_safefetch(const char* name, int size, address* entry,
3145 address* fault_pc, address* continuation_pc) {
3146 // safefetch signatures:
3147 // int SafeFetch32(int* adr, int errValue);
3148 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3149 //
3150 // arguments:
3151 // c_rarg0 = adr
3152 // c_rarg1 = errValue
3153 //
3154 // result:
3155 // r0 = *adr or errValue
3156
3157 StubCodeMark mark(this, "StubRoutines", name);
3158
3159 // Entry point, pc or function descriptor.
3160 *entry = __ pc();
3161
3162 // Load *adr into c_rarg1, may fault.
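// Conceptually (a sketch of the intended semantics):
//
//   int SafeFetch32(int* adr, int errValue) {
//     int v = errValue;   // c_rarg1 holds errValue on entry
//     v = *adr;           // the only instruction allowed to fault
//     return v;           // on a fault, the handler resumes at
//   }                     // continuation_pc with errValue still in c_rarg1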
3163 *fault_pc = __ pc();
3164 switch (size) {
3165 case 4:
3166 // int32_t
3167 __ ldrw(c_rarg1, Address(c_rarg0, 0));
3168 break;
3169 case 8:
3170 // int64_t
3171 __ ldr(c_rarg1, Address(c_rarg0, 0));
3172 break;
3173 default:
3174 ShouldNotReachHere();
3175 }
3176
3177 // return errValue or *adr
3178 *continuation_pc = __ pc();
3179 __ mov(r0, c_rarg1);
3180 __ ret(lr);
3181 }
3182 #endif
3183
3184 /**
3185 * Arguments:
3186 *
3187 * Inputs:
3188 * c_rarg0 - int crc
3189 * c_rarg1 - byte* buf
3190 * c_rarg2 - int length
3191 *
3192 * Output:
3193 * r0 - int crc result
3194 */
3195 address generate_updateBytesCRC32() {
3196 assert(UseCRC32Intrinsics, "what are we doing here?");
3197
3198 __ align(CodeEntryAlignment);
3199 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3200
3201 address start = __ pc();
3202
3203 const Register crc = c_rarg0; // crc
3204 const Register buf = c_rarg1; // source java byte array address
3205 const Register len = c_rarg2; // length
3206 const Register table0 = c_rarg3; // crc_table address
3207 const Register table1 = c_rarg4;
3208 const Register table2 = c_rarg5;
3209 const Register table3 = c_rarg6;
3210 const Register tmp3 = c_rarg7;
3211
3212 BLOCK_COMMENT("Entry:");
3213 __ enter(); // required for proper stackwalking of RuntimeStub frame
3214
3215 __ kernel_crc32(crc, buf, len,
3216 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3217
3218 __ leave(); // required for proper stackwalking of RuntimeStub frame
3219 __ ret(lr);
3220
3221 return start;
3222 }
3223
3224 /**
3225 * Arguments:
3226 *
3227 * Inputs:
3228 * c_rarg0 - int crc
3229 * c_rarg1 - byte* buf
3230 * c_rarg2 - int length
3231 * c_rarg3 - int* table
3232 *
3233 * Output:
3234 * r0 - int crc result
3235 */
3236 address generate_updateBytesCRC32C() {
3237 assert(UseCRC32CIntrinsics, "what are we doing here?");
3238
3239 __ align(CodeEntryAlignment);
3240 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3241
3242 address start = __ pc();
3243
3244 const Register crc = c_rarg0; // crc
3245 const Register buf = c_rarg1; // source java byte array address
3246 const Register len = c_rarg2; // length
3247 const Register table0 = c_rarg3; // crc_table address
3248 const Register table1 = c_rarg4;
3249 const Register table2 = c_rarg5;
3250 const Register table3 = c_rarg6;
3251 const Register tmp3 = c_rarg7;
3252
3253 BLOCK_COMMENT("Entry:");
3254 __ enter(); // required for proper stackwalking of RuntimeStub frame
3255
3256 __ kernel_crc32c(crc, buf, len,
3257 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3258
3259 __ leave(); // required for proper stackwalking of RuntimeStub frame
3260 __ ret(lr);
3261
3262 return start;
3263 }
3264
3265 /***
3266 * Arguments:
3267 *
3268 * Inputs:
3269 * c_rarg0 - int adler
3270 * c_rarg1 - byte* buff
3271 * c_rarg2 - int len
3272 *
3273 * Output:
3274 * c_rarg0 - int adler result
3275 */
3276 address generate_updateBytesAdler32() {
3277 __ align(CodeEntryAlignment);
3278 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3279 address start = __ pc();
3280
3281 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3282
3283 // Aliases
3284 Register adler = c_rarg0;
3285 Register s1 = c_rarg0;
3286 Register s2 = c_rarg3;
3287 Register buff = c_rarg1;
3288 Register len = c_rarg2;
3289 Register nmax = r4;
3290 Register base = r5;
3291 Register count = r6;
3292 Register temp0 = rscratch1;
3293 Register temp1 = rscratch2;
3294 Register temp2 = r7;
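// Background for the arithmetic below: Adler-32 maintains
//   s1 = 1 + sum of the bytes        (mod 65521)
//   s2 = sum of the successive s1    (mod 65521)
//   adler = (s2 << 16) | s1
// The lsr/lsl/sub/add reduction sequences rely on 2^16 == 15 (mod 65521):
//   x mod 65521 ~= (x & 0xffff) + 15 * (x >> 16)
// applied twice, followed by one conditional subtract of 65521 (the
// subs/csel pairs).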
3295
3296 // Max number of bytes we can process before having to take the mod
3297 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3298 unsigned long BASE = 0xfff1;
3299 unsigned long NMAX = 0x15B0;
3300
3301 __ mov(base, BASE);
3302 __ mov(nmax, NMAX);
3303
3304 // s1 is initialized to the lower 16 bits of adler
3305 // s2 is initialized to the upper 16 bits of adler
3306 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
3307 __ uxth(s1, adler); // s1 = (adler & 0xffff)
3308
3309 // The pipelined loop needs at least 16 elements for 1 iteration
3310 // It does check this, but it is more effective to skip to the cleanup loop
3311 __ cmp(len, 16);
3312 __ br(Assembler::HS, L_nmax);
3313 __ cbz(len, L_combine);
3314
3315 __ bind(L_simple_by1_loop);
3316 __ ldrb(temp0, Address(__ post(buff, 1)));
3317 __ add(s1, s1, temp0);
3318 __ add(s2, s2, s1);
3319 __ subs(len, len, 1);
3320 __ br(Assembler::HI, L_simple_by1_loop);
3321
3322 // s1 = s1 % BASE
3323 __ subs(temp0, s1, base);
3324 __ csel(s1, temp0, s1, Assembler::HS);
3325
3326 // s2 = s2 % BASE
3327 __ lsr(temp0, s2, 16);
3328 __ lsl(temp1, temp0, 4);
3329 __ sub(temp1, temp1, temp0);
3330 __ add(s2, temp1, s2, ext::uxth);
3331
3332 __ subs(temp0, s2, base);
3333 __ csel(s2, temp0, s2, Assembler::HS);
3334
3335 __ b(L_combine);
3336
3337 __ bind(L_nmax);
3338 __ subs(len, len, nmax);
3339 __ sub(count, nmax, 16);
3340 __ br(Assembler::LO, L_by16);
3341
3342 __ bind(L_nmax_loop);
3343
3344 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3345
3346 __ add(s1, s1, temp0, ext::uxtb);
3347 __ ubfx(temp2, temp0, 8, 8);
3348 __ add(s2, s2, s1);
3349 __ add(s1, s1, temp2);
3350 __ ubfx(temp2, temp0, 16, 8);
3351 __ add(s2, s2, s1);
3352 __ add(s1, s1, temp2);
3353 __ ubfx(temp2, temp0, 24, 8);
3354 __ add(s2, s2, s1);
3355 __ add(s1, s1, temp2);
3356 __ ubfx(temp2, temp0, 32, 8);
3357 __ add(s2, s2, s1);
3358 __ add(s1, s1, temp2);
3359 __ ubfx(temp2, temp0, 40, 8);
3360 __ add(s2, s2, s1);
3361 __ add(s1, s1, temp2);
3362 __ ubfx(temp2, temp0, 48, 8);
3363 __ add(s2, s2, s1);
3364 __ add(s1, s1, temp2);
3365 __ add(s2, s2, s1);
3366 __ add(s1, s1, temp0, Assembler::LSR, 56);
3367 __ add(s2, s2, s1);
3368
3369 __ add(s1, s1, temp1, ext::uxtb);
3370 __ ubfx(temp2, temp1, 8, 8);
3371 __ add(s2, s2, s1);
3372 __ add(s1, s1, temp2);
3373 __ ubfx(temp2, temp1, 16, 8);
3374 __ add(s2, s2, s1);
3375 __ add(s1, s1, temp2);
3376 __ ubfx(temp2, temp1, 24, 8);
3377 __ add(s2, s2, s1);
3378 __ add(s1, s1, temp2);
3379 __ ubfx(temp2, temp1, 32, 8);
3380 __ add(s2, s2, s1);
3381 __ add(s1, s1, temp2);
3382 __ ubfx(temp2, temp1, 40, 8);
3383 __ add(s2, s2, s1);
3384 __ add(s1, s1, temp2);
3385 __ ubfx(temp2, temp1, 48, 8);
3386 __ add(s2, s2, s1);
3387 __ add(s1, s1, temp2);
3388 __ add(s2, s2, s1);
3389 __ add(s1, s1, temp1, Assembler::LSR, 56);
3390 __ add(s2, s2, s1);
3391
3392 __ subs(count, count, 16);
3393 __ br(Assembler::HS, L_nmax_loop);
3394
3395 // s1 = s1 % BASE
3396 __ lsr(temp0, s1, 16);
3397 __ lsl(temp1, temp0, 4);
3398 __ sub(temp1, temp1, temp0);
3399 __ add(temp1, temp1, s1, ext::uxth);
3400
3401 __ lsr(temp0, temp1, 16);
3402 __ lsl(s1, temp0, 4);
3403 __ sub(s1, s1, temp0);
3404 __ add(s1, s1, temp1, ext::uxth);
3405
3406 __ subs(temp0, s1, base);
3407 __ csel(s1, temp0, s1, Assembler::HS);
3408
3409 // s2 = s2 % BASE
3410 __ lsr(temp0, s2, 16);
3411 __ lsl(temp1, temp0, 4);
3412 __ sub(temp1, temp1, temp0);
3413 __ add(temp1, temp1, s2, ext::uxth);
3414
3415 __ lsr(temp0, temp1, 16);
3416 __ lsl(s2, temp0, 4);
3417 __ sub(s2, s2, temp0);
3418 __ add(s2, s2, temp1, ext::uxth);
3419
3420 __ subs(temp0, s2, base);
3421 __ csel(s2, temp0, s2, Assembler::HS);
3422
3423 __ subs(len, len, nmax);
3424 __ sub(count, nmax, 16);
3425 __ br(Assembler::HS, L_nmax_loop);
3426
3427 __ bind(L_by16);
3428 __ adds(len, len, count);
3429 __ br(Assembler::LO, L_by1);
3430
3431 __ bind(L_by16_loop);
3432
3433 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3434
3435 __ add(s1, s1, temp0, ext::uxtb);
3436 __ ubfx(temp2, temp0, 8, 8);
3437 __ add(s2, s2, s1);
3438 __ add(s1, s1, temp2);
3439 __ ubfx(temp2, temp0, 16, 8);
3440 __ add(s2, s2, s1);
3441 __ add(s1, s1, temp2);
3442 __ ubfx(temp2, temp0, 24, 8);
3443 __ add(s2, s2, s1);
3444 __ add(s1, s1, temp2);
3445 __ ubfx(temp2, temp0, 32, 8);
3446 __ add(s2, s2, s1);
3447 __ add(s1, s1, temp2);
3448 __ ubfx(temp2, temp0, 40, 8);
3449 __ add(s2, s2, s1);
3450 __ add(s1, s1, temp2);
3451 __ ubfx(temp2, temp0, 48, 8);
3452 __ add(s2, s2, s1);
3453 __ add(s1, s1, temp2);
3454 __ add(s2, s2, s1);
3455 __ add(s1, s1, temp0, Assembler::LSR, 56);
3456 __ add(s2, s2, s1);
3457
3458 __ add(s1, s1, temp1, ext::uxtb);
3459 __ ubfx(temp2, temp1, 8, 8);
3460 __ add(s2, s2, s1);
3461 __ add(s1, s1, temp2);
3462 __ ubfx(temp2, temp1, 16, 8);
3463 __ add(s2, s2, s1);
3464 __ add(s1, s1, temp2);
3465 __ ubfx(temp2, temp1, 24, 8);
3466 __ add(s2, s2, s1);
3467 __ add(s1, s1, temp2);
3468 __ ubfx(temp2, temp1, 32, 8);
3469 __ add(s2, s2, s1);
3470 __ add(s1, s1, temp2);
3471 __ ubfx(temp2, temp1, 40, 8);
3472 __ add(s2, s2, s1);
3473 __ add(s1, s1, temp2);
3474 __ ubfx(temp2, temp1, 48, 8);
3475 __ add(s2, s2, s1);
3476 __ add(s1, s1, temp2);
3477 __ add(s2, s2, s1);
3478 __ add(s1, s1, temp1, Assembler::LSR, 56);
3479 __ add(s2, s2, s1);
3480
3481 __ subs(len, len, 16);
3482 __ br(Assembler::HS, L_by16_loop);
3483
3484 __ bind(L_by1);
3485 __ adds(len, len, 15);
3486 __ br(Assembler::LO, L_do_mod);
3487
3488 __ bind(L_by1_loop);
3489 __ ldrb(temp0, Address(__ post(buff, 1)));
3490 __ add(s1, temp0, s1);
3491 __ add(s2, s2, s1);
3492 __ subs(len, len, 1);
3493 __ br(Assembler::HS, L_by1_loop);
3494
3495 __ bind(L_do_mod);
3496 // s1 = s1 % BASE
3497 __ lsr(temp0, s1, 16);
3498 __ lsl(temp1, temp0, 4);
3499 __ sub(temp1, temp1, temp0);
3500 __ add(temp1, temp1, s1, ext::uxth);
3501
3502 __ lsr(temp0, temp1, 16);
3503 __ lsl(s1, temp0, 4);
3504 __ sub(s1, s1, temp0);
3505 __ add(s1, s1, temp1, ext::uxth);
3506
3507 __ subs(temp0, s1, base);
3508 __ csel(s1, temp0, s1, Assembler::HS);
3509
3510 // s2 = s2 % BASE
3511 __ lsr(temp0, s2, 16);
3512 __ lsl(temp1, temp0, 4);
3513 __ sub(temp1, temp1, temp0);
3514 __ add(temp1, temp1, s2, ext::uxth);
3515
3516 __ lsr(temp0, temp1, 16);
3517 __ lsl(s2, temp0, 4);
3518 __ sub(s2, s2, temp0);
3519 __ add(s2, s2, temp1, ext::uxth);
3520
3521 __ subs(temp0, s2, base);
3522 __ csel(s2, temp0, s2, Assembler::HS);
3523
3524 // Combine lower bits and higher bits
3525 __ bind(L_combine);
3526 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3527
3528 __ ret(lr);
3529
3530 return start;
3531 }
3532
3533 /**
3534 * Arguments:
3535 *
3536 * Input:
3537 * c_rarg0 - x address
3538 * c_rarg1 - x length
3539 * c_rarg2 - y address
3540 * c_rarg3 - y length
3541 * c_rarg4 - z address
3542 * c_rarg5 - z length
3543 */
3544 address generate_multiplyToLen() {
3545 __ align(CodeEntryAlignment);
3546 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3547
3548 address start = __ pc();
3549 const Register x = r0;
3550 const Register xlen = r1; 3551 const Register y = r2; 3552 const Register ylen = r3; 3553 const Register z = r4; 3554 const Register zlen = r5; 3555 3556 const Register tmp1 = r10; 3557 const Register tmp2 = r11; 3558 const Register tmp3 = r12; 3559 const Register tmp4 = r13; 3560 const Register tmp5 = r14; 3561 const Register tmp6 = r15; 3562 const Register tmp7 = r16; 3563 3564 BLOCK_COMMENT("Entry:"); 3565 __ enter(); // required for proper stackwalking of RuntimeStub frame 3566 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3567 __ leave(); // required for proper stackwalking of RuntimeStub frame 3568 __ ret(lr); 3569 3570 return start; 3571 } 3572 3573 address generate_squareToLen() { 3574 // squareToLen algorithm for sizes 1..127 described in java code works 3575 // faster than multiply_to_len on some CPUs and slower on others, but 3576 // multiply_to_len shows a bit better overall results 3577 __ align(CodeEntryAlignment); 3578 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3579 address start = __ pc(); 3580 3581 const Register x = r0; 3582 const Register xlen = r1; 3583 const Register z = r2; 3584 const Register zlen = r3; 3585 const Register y = r4; // == x 3586 const Register ylen = r5; // == xlen 3587 3588 const Register tmp1 = r10; 3589 const Register tmp2 = r11; 3590 const Register tmp3 = r12; 3591 const Register tmp4 = r13; 3592 const Register tmp5 = r14; 3593 const Register tmp6 = r15; 3594 const Register tmp7 = r16; 3595 3596 RegSet spilled_regs = RegSet::of(y, ylen); 3597 BLOCK_COMMENT("Entry:"); 3598 __ enter(); 3599 __ push(spilled_regs, sp); 3600 __ mov(y, x); 3601 __ mov(ylen, xlen); 3602 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3603 __ pop(spilled_regs, sp); 3604 __ leave(); 3605 __ ret(lr); 3606 return start; 3607 } 3608 3609 address generate_mulAdd() { 3610 __ align(CodeEntryAlignment); 3611 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3612 3613 address start = __ pc(); 3614 3615 const Register out = r0; 3616 const Register in = r1; 3617 const Register offset = r2; 3618 const Register len = r3; 3619 const Register k = r4; 3620 3621 BLOCK_COMMENT("Entry:"); 3622 __ enter(); 3623 __ mul_add(out, in, offset, len, k); 3624 __ leave(); 3625 __ ret(lr); 3626 3627 return start; 3628 } 3629 3630 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3631 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3632 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3633 // Karatsuba multiplication performs a 128*128 -> 256-bit 3634 // multiplication in three 128-bit multiplications and a few 3635 // additions. 
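    // (Here the arithmetic is carry-less, over GF(2): every '+' in the
    // identities below is an XOR, so the three pmull products are combined
    // using only eor instructions.)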
3636 // 3637 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3638 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3639 // 3640 // Inputs: 3641 // 3642 // A0 in a.d[0] (subkey) 3643 // A1 in a.d[1] 3644 // (A1+A0) in a1_xor_a0.d[0] 3645 // 3646 // B0 in b.d[0] (state) 3647 // B1 in b.d[1] 3648 3649 __ ext(tmp1, __ T16B, b, b, 0x08); 3650 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3651 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3652 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3653 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3654 3655 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3656 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3657 __ eor(tmp2, __ T16B, tmp2, tmp4); 3658 __ eor(tmp2, __ T16B, tmp2, tmp3); 3659 3660 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3661 __ ins(result_hi, __ D, tmp2, 0, 1); 3662 __ ins(result_lo, __ D, tmp2, 1, 0); 3663 } 3664 3665 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3666 FloatRegister p, FloatRegister z, FloatRegister t1) { 3667 const FloatRegister t0 = result; 3668 3669 // The GCM field polynomial f is z^128 + p(z), where p = 3670 // z^7+z^2+z+1. 3671 // 3672 // z^128 === -p(z) (mod (z^128 + p(z))) 3673 // 3674 // so, given that the product we're reducing is 3675 // a == lo + hi * z^128 3676 // substituting, 3677 // === lo - hi * p(z) (mod (z^128 + p(z))) 3678 // 3679 // we reduce by multiplying hi by p(z) and subtracting the result 3680 // from (i.e. XORing it with) lo. Because p has no nonzero high 3681 // bits we can do this with two 64-bit multiplications, lo*p and 3682 // hi*p. 3683 3684 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3685 __ ext(t1, __ T16B, t0, z, 8); 3686 __ eor(hi, __ T16B, hi, t1); 3687 __ ext(t1, __ T16B, z, t0, 8); 3688 __ eor(lo, __ T16B, lo, t1); 3689 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3690 __ eor(result, __ T16B, lo, t0); 3691 } 3692 3693 address generate_has_negatives(address &has_negatives_long) { 3694 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3695 const int large_loop_size = 64; 3696 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3697 int dcache_line = VM_Version::dcache_line_size(); 3698 3699 Register ary1 = r1, len = r2, result = r0; 3700 3701 __ align(CodeEntryAlignment); 3702 address entry = __ pc(); 3703 3704 __ enter(); 3705 3706 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3707 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3708 3709 __ cmp(len, 15); 3710 __ br(Assembler::GT, LEN_OVER_15); 3711 // The only case when execution falls into this code is when pointer is near 3712 // the end of memory page and we have to avoid reading next page 3713 __ add(ary1, ary1, len); 3714 __ subs(len, len, 8); 3715 __ br(Assembler::GT, LEN_OVER_8); 3716 __ ldr(rscratch2, Address(ary1, -8)); 3717 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
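    // At this point len holds (length - 8), which is <= 0, so
    // rscratch1 == (8 - length) * 8. The variable shift below discards the
    // low-order bytes that precede the start of the array (the load above
    // read the 8 bytes ending at the array's end), leaving only the valid
    // bytes to be tested against UPPER_BIT_MASK.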
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    has_negatives_long = __ pc(); // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
    __ add(ary1, ary1, rscratch1);
    __ sub(len, len, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop. This
    // handles the case where an initially aligned large array has negative
    // values in its first bytes; without this check LARGE_LOOP would do 4
    // reads instead of 1 in the worst case, which is slower. Cases with
    // negative bytes further ahead are not affected much; in fact they get
    // faster thanks to the early loads, fewer instructions and fewer
    // branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue the load instructions first, since that can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
    // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions per iteration and has fewer
    // branches. The trade-off is that this disables early return: all 64
    // bytes are loaded and checked every time.
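    // For reference, the 64-byte check below corresponds to this C-like
    // sketch (illustrative only; the names are not from the generated code):
    //
    //   uint64_t w[8], m = 0;                  // one 64-byte chunk
    //   for (int i = 0; i < 8; i++) m |= w[i];
    //   if (m & UPPER_BIT_MASK) return true;   // some byte has its top bit set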
3786 __ ldp(tmp2, tmp3, Address(ary1)); 3787 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3788 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3789 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3790 __ add(ary1, ary1, large_loop_size); 3791 __ sub(len, len, large_loop_size); 3792 __ orr(tmp2, tmp2, tmp3); 3793 __ orr(tmp4, tmp4, tmp5); 3794 __ orr(rscratch1, rscratch1, rscratch2); 3795 __ orr(tmp6, tmp6, tmp1); 3796 __ orr(tmp2, tmp2, tmp4); 3797 __ orr(rscratch1, rscratch1, tmp6); 3798 __ orr(tmp2, tmp2, rscratch1); 3799 __ tst(tmp2, UPPER_BIT_MASK); 3800 __ br(Assembler::NE, RET_TRUE); 3801 __ cmp(len, large_loop_size); 3802 __ br(Assembler::GE, LARGE_LOOP); 3803 3804 __ bind(CHECK_16); // small 16-byte load pre-loop 3805 __ cmp(len, 16); 3806 __ br(Assembler::LT, POST_LOOP16); 3807 3808 __ bind(LOOP16); // small 16-byte load loop 3809 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3810 __ sub(len, len, 16); 3811 __ orr(tmp2, tmp2, tmp3); 3812 __ tst(tmp2, UPPER_BIT_MASK); 3813 __ br(Assembler::NE, RET_TRUE); 3814 __ cmp(len, 16); 3815 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3816 3817 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3818 __ cmp(len, 8); 3819 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3820 __ ldr(tmp3, Address(__ post(ary1, 8))); 3821 __ sub(len, len, 8); 3822 __ tst(tmp3, UPPER_BIT_MASK); 3823 __ br(Assembler::NE, RET_TRUE); 3824 3825 __ bind(POST_LOOP16_LOAD_TAIL); 3826 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3827 __ ldr(tmp1, Address(ary1)); 3828 __ mov(tmp2, 64); 3829 __ sub(tmp4, tmp2, len, __ LSL, 3); 3830 __ lslv(tmp1, tmp1, tmp4); 3831 __ tst(tmp1, UPPER_BIT_MASK); 3832 __ br(Assembler::NE, RET_TRUE); 3833 // Fallthrough 3834 3835 __ bind(RET_FALSE); 3836 __ pop(spilled_regs, sp); 3837 __ leave(); 3838 __ mov(result, zr); 3839 __ ret(lr); 3840 3841 __ bind(RET_TRUE); 3842 __ pop(spilled_regs, sp); 3843 __ bind(RET_TRUE_NO_POP); 3844 __ leave(); 3845 __ mov(result, 1); 3846 __ ret(lr); 3847 3848 __ bind(DONE); 3849 __ pop(spilled_regs, sp); 3850 __ leave(); 3851 __ ret(lr); 3852 return entry; 3853 } 3854 3855 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3856 bool usePrefetch, Label &NOT_EQUAL) { 3857 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3858 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3859 tmp7 = r12, tmp8 = r13; 3860 Label LOOP; 3861 3862 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3863 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3864 __ bind(LOOP); 3865 if (usePrefetch) { 3866 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3867 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3868 } 3869 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3870 __ eor(tmp1, tmp1, tmp2); 3871 __ eor(tmp3, tmp3, tmp4); 3872 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3873 __ orr(tmp1, tmp1, tmp3); 3874 __ cbnz(tmp1, NOT_EQUAL); 3875 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3876 __ eor(tmp5, tmp5, tmp6); 3877 __ eor(tmp7, tmp7, tmp8); 3878 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3879 __ orr(tmp5, tmp5, tmp7); 3880 __ cbnz(tmp5, NOT_EQUAL); 3881 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3882 __ eor(tmp1, tmp1, tmp2); 3883 __ eor(tmp3, tmp3, tmp4); 3884 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3885 __ orr(tmp1, tmp1, tmp3); 3886 __ cbnz(tmp1, NOT_EQUAL); 3887 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3888 __ eor(tmp5, tmp5, tmp6); 3889 __ 
sub(cnt1, cnt1, 8 * wordSize);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
    __ subs(tmp6, cnt1, loopThreshold);
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ br(__ GE, LOOP);
    // post-loop
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ orr(tmp1, tmp1, tmp3);
    __ sub(cnt1, cnt1, 2 * wordSize);
    __ cbnz(tmp1, NOT_EQUAL);
  }

  void generate_large_array_equals_loop_simd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2;
    Label LOOP;

    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
    __ subs(tmp1, cnt1, loopThreshold);
    __ eor(v0, __ T16B, v0, v4);
    __ eor(v1, __ T16B, v1, v5);
    __ eor(v2, __ T16B, v2, v6);
    __ eor(v3, __ T16B, v3, v7);
    __ orr(v0, __ T16B, v0, v1);
    __ orr(v1, __ T16B, v2, v3);
    __ orr(v0, __ T16B, v0, v1);
    __ umov(tmp1, v0, __ D, 0);
    __ umov(tmp2, v0, __ D, 1);
    __ orr(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GE, LOOP);
  }

  // a1 = r1 - array1 address
  // a2 = r2 - array2 address
  // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
  // r3-r5 are reserved temporary registers
  address generate_large_array_equals() {
    StubCodeMark mark(this, "StubRoutines", "large_array_equals");
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
        SMALL_LOOP, POST_LOOP;
    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // stay in the prefetch loop only while at least 32 of the prefetched
    // bytes will actually be used
    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
        tmp5, tmp6, tmp7, tmp8);

    __ align(CodeEntryAlignment);
    address entry = __ pc();
    __ enter();
    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
    // also advance pointers to use post-increment instead of pre-increment
    __ add(a1, a1, wordSize);
    __ add(a2, a2, wordSize);
    if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a huge penalty (up to 2x
      // execution time) on some CPUs when the address is not at least
      // 16-byte aligned. Arrays are currently 8-byte aligned, so do an
      // additional 8-byte load if needed, at least for the first address,
      // to make it 16-byte aligned.
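      // Roughly, in C-like pseudocode (illustrative only):
      //
      //   if (((uintptr_t)a1 & 8) != 0) {   // 8- but not 16-byte aligned
      //     if (*(uint64_t*)a1 != *(uint64_t*)a2) goto not_equal;
      //     a1 += 8; a2 += 8; cnt1 -= 8;
      //   }
      //   // a1 is now 16-byte aligned (a2 may still be only 8-byte aligned)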
3968 Label ALIGNED16; 3969 __ tbz(a1, 3, ALIGNED16); 3970 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3971 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3972 __ sub(cnt1, cnt1, wordSize); 3973 __ eor(tmp1, tmp1, tmp2); 3974 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3975 __ bind(ALIGNED16); 3976 } 3977 if (UseSIMDForArrayEquals) { 3978 if (SoftwarePrefetchHintDistance >= 0) { 3979 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3980 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3981 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3982 /* prfm = */ true, NOT_EQUAL); 3983 __ cmp(cnt1, nonPrefetchLoopThreshold); 3984 __ br(__ LT, TAIL); 3985 } 3986 __ bind(NO_PREFETCH_LARGE_LOOP); 3987 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3988 /* prfm = */ false, NOT_EQUAL); 3989 } else { 3990 __ push(spilled_regs, sp); 3991 if (SoftwarePrefetchHintDistance >= 0) { 3992 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3993 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3994 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3995 /* prfm = */ true, NOT_EQUAL); 3996 __ cmp(cnt1, nonPrefetchLoopThreshold); 3997 __ br(__ LT, TAIL); 3998 } 3999 __ bind(NO_PREFETCH_LARGE_LOOP); 4000 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 4001 /* prfm = */ false, NOT_EQUAL); 4002 } 4003 __ bind(TAIL); 4004 __ cbz(cnt1, EQUAL); 4005 __ subs(cnt1, cnt1, wordSize); 4006 __ br(__ LE, POST_LOOP); 4007 __ bind(SMALL_LOOP); 4008 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4009 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4010 __ subs(cnt1, cnt1, wordSize); 4011 __ eor(tmp1, tmp1, tmp2); 4012 __ cbnz(tmp1, NOT_EQUAL); 4013 __ br(__ GT, SMALL_LOOP); 4014 __ bind(POST_LOOP); 4015 __ ldr(tmp1, Address(a1, cnt1)); 4016 __ ldr(tmp2, Address(a2, cnt1)); 4017 __ eor(tmp1, tmp1, tmp2); 4018 __ cbnz(tmp1, NOT_EQUAL); 4019 __ bind(EQUAL); 4020 __ mov(result, true); 4021 __ bind(NOT_EQUAL); 4022 if (!UseSIMDForArrayEquals) { 4023 __ pop(spilled_regs, sp); 4024 } 4025 __ bind(NOT_EQUAL_NO_POP); 4026 __ leave(); 4027 __ ret(lr); 4028 return entry; 4029 } 4030 4031 address generate_dsin_dcos(bool isCos) { 4032 __ align(CodeEntryAlignment); 4033 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 4034 address start = __ pc(); 4035 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 4036 (address)StubRoutines::aarch64::_two_over_pi, 4037 (address)StubRoutines::aarch64::_pio2, 4038 (address)StubRoutines::aarch64::_dsin_coef, 4039 (address)StubRoutines::aarch64::_dcos_coef); 4040 return start; 4041 } 4042 4043 address generate_dlog() { 4044 __ align(CodeEntryAlignment); 4045 StubCodeMark mark(this, "StubRoutines", "dlog"); 4046 address entry = __ pc(); 4047 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4048 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4049 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4050 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4051 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4052 return entry; 4053 } 4054 4055 // code for comparing 16 bytes of strings with same encoding 4056 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4057 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4058 __ ldr(rscratch1, Address(__ post(str1, 8))); 4059 __ eor(rscratch2, tmp1, tmp2); 4060 __ ldr(cnt1, Address(__ post(str2, 8))); 4061 __ cbnz(rscratch2, DIFF1); 4062 __ ldr(tmp1, Address(__ post(str1, 8))); 4063 __ eor(rscratch2, rscratch1, cnt1); 4064 __ ldr(tmp2, Address(__ post(str2, 8))); 4065 __ cbnz(rscratch2, DIFF2); 4066 } 4067 4068 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4069 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4070 Label &DIFF2) { 4071 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4072 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4073 4074 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4075 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4076 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4077 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4078 4079 __ fmovd(tmpL, vtmp3); 4080 __ eor(rscratch2, tmp3, tmpL); 4081 __ cbnz(rscratch2, DIFF2); 4082 4083 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4084 __ umov(tmpL, vtmp3, __ D, 1); 4085 __ eor(rscratch2, tmpU, tmpL); 4086 __ cbnz(rscratch2, DIFF1); 4087 4088 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4089 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4090 __ fmovd(tmpL, vtmp); 4091 __ eor(rscratch2, tmp3, tmpL); 4092 __ cbnz(rscratch2, DIFF2); 4093 4094 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4095 __ umov(tmpL, vtmp, __ D, 1); 4096 __ eor(rscratch2, tmpU, tmpL); 4097 __ cbnz(rscratch2, DIFF1); 4098 } 4099 4100 // r0 = result 4101 // r1 = str1 4102 // r2 = cnt1 4103 // r3 = str2 4104 // r4 = cnt2 4105 // r10 = tmp1 4106 // r11 = tmp2 4107 address generate_compare_long_string_different_encoding(bool isLU) { 4108 __ align(CodeEntryAlignment); 4109 StubCodeMark mark(this, "StubRoutines", isLU 4110 ? 
"compare_long_string_different_encoding LU" 4111 : "compare_long_string_different_encoding UL"); 4112 address entry = __ pc(); 4113 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4114 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER, 4115 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4116 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4117 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4118 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4119 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4120 4121 int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2); 4122 4123 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4124 // cnt2 == amount of characters left to compare 4125 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4126 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4127 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4128 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4129 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4130 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4131 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4132 __ eor(rscratch2, tmp1, tmp2); 4133 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4134 __ mov(rscratch1, tmp2); 4135 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4136 Register strU = isLU ? str2 : str1, 4137 strL = isLU ? str1 : str2, 4138 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4139 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4140 __ push(spilled_regs, sp); 4141 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4142 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4143 4144 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4145 4146 if (SoftwarePrefetchHintDistance >= 0) { 4147 __ cmp(cnt2, prefetchLoopExitCondition); 4148 __ br(__ LT, SMALL_LOOP); 4149 __ bind(LARGE_LOOP_PREFETCH); 4150 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4151 __ mov(tmp4, 2); 4152 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4153 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4154 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4155 __ subs(tmp4, tmp4, 1); 4156 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4157 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4158 __ mov(tmp4, 2); 4159 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4160 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4161 __ subs(tmp4, tmp4, 1); 4162 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4163 __ sub(cnt2, cnt2, 64); 4164 __ cmp(cnt2, prefetchLoopExitCondition); 4165 __ br(__ GE, LARGE_LOOP_PREFETCH); 4166 } 4167 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4168 __ subs(cnt2, cnt2, 16); 4169 __ br(__ LT, TAIL); 4170 __ b(SMALL_LOOP_ENTER); 4171 __ bind(SMALL_LOOP); // smaller loop 4172 __ subs(cnt2, cnt2, 16); 4173 __ bind(SMALL_LOOP_ENTER); 4174 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4175 __ br(__ GE, SMALL_LOOP); 4176 __ cbz(cnt2, LOAD_LAST); 4177 __ bind(TAIL); // 1..15 characters left 4178 __ cmp(cnt2, -8); 4179 __ br(__ GT, TAIL_LOAD_16); 4180 __ ldrd(vtmp, Address(tmp2)); 4181 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ); 4182 4183 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4184 __ fmovd(tmpL, vtmp3); 4185 __ eor(rscratch2, tmp3, tmpL); 4186 __ cbnz(rscratch2, DIFF2); 4187 __ umov(tmpL, vtmp3, __ D, 1); 4188 __ eor(rscratch2, tmpU, tmpL); 4189 __ cbnz(rscratch2, DIFF1); 4190 __ b(LOAD_LAST); 4191 __ bind(TAIL_LOAD_16); 4192 __ ldrq(vtmp, Address(tmp2)); 4193 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4194 __ zip1(vtmp3, __ 
T16B, vtmp, vtmpZ); 4195 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4196 __ fmovd(tmpL, vtmp3); 4197 __ eor(rscratch2, tmp3, tmpL); 4198 __ cbnz(rscratch2, DIFF2); 4199 4200 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4201 __ umov(tmpL, vtmp3, __ D, 1); 4202 __ eor(rscratch2, tmpU, tmpL); 4203 __ cbnz(rscratch2, DIFF1); 4204 4205 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4206 __ fmovd(tmpL, vtmp); 4207 __ eor(rscratch2, tmp3, tmpL); 4208 __ cbnz(rscratch2, DIFF2); 4209 4210 __ umov(tmpL, vtmp, __ D, 1); 4211 __ eor(rscratch2, tmpU, tmpL); 4212 __ cbnz(rscratch2, DIFF1); 4213 __ b(LOAD_LAST); 4214 __ bind(DIFF2); 4215 __ mov(tmpU, tmp3); 4216 __ bind(DIFF1); 4217 __ pop(spilled_regs, sp); 4218 __ b(CALCULATE_DIFFERENCE); 4219 __ bind(LOAD_LAST); 4220 __ pop(spilled_regs, sp); 4221 4222 __ ldrs(vtmp, Address(strL)); 4223 __ ldr(tmpU, Address(strU)); 4224 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4225 __ fmovd(tmpL, vtmp); 4226 4227 __ eor(rscratch2, tmpU, tmpL); 4228 __ cbz(rscratch2, DONE); 4229 4230 // Find the first different characters in the longwords and 4231 // compute their difference. 4232 __ bind(CALCULATE_DIFFERENCE); 4233 __ rev(rscratch2, rscratch2); 4234 __ clz(rscratch2, rscratch2); 4235 __ andr(rscratch2, rscratch2, -16); 4236 __ lsrv(tmp1, tmp1, rscratch2); 4237 __ uxthw(tmp1, tmp1); 4238 __ lsrv(rscratch1, rscratch1, rscratch2); 4239 __ uxthw(rscratch1, rscratch1); 4240 __ subw(result, tmp1, rscratch1); 4241 __ bind(DONE); 4242 __ ret(lr); 4243 return entry; 4244 } 4245 4246 // r0 = result 4247 // r1 = str1 4248 // r2 = cnt1 4249 // r3 = str2 4250 // r4 = cnt2 4251 // r10 = tmp1 4252 // r11 = tmp2 4253 address generate_compare_long_string_same_encoding(bool isLL) { 4254 __ align(CodeEntryAlignment); 4255 StubCodeMark mark(this, "StubRoutines", isLL 4256 ? "compare_long_string_same_encoding LL" 4257 : "compare_long_string_same_encoding UU"); 4258 address entry = __ pc(); 4259 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4260 tmp1 = r10, tmp2 = r11; 4261 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4262 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4263 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4264 // exit from large loop when less than 64 bytes left to read or we're about 4265 // to prefetch memory behind array border 4266 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4267 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4268 // update cnt2 counter with already loaded 8 bytes 4269 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4270 // update pointers, because of previous read 4271 __ add(str1, str1, wordSize); 4272 __ add(str2, str2, wordSize); 4273 if (SoftwarePrefetchHintDistance >= 0) { 4274 __ bind(LARGE_LOOP_PREFETCH); 4275 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4276 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4277 compare_string_16_bytes_same(DIFF, DIFF2); 4278 compare_string_16_bytes_same(DIFF, DIFF2); 4279 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4280 compare_string_16_bytes_same(DIFF, DIFF2); 4281 __ cmp(cnt2, largeLoopExitCondition); 4282 compare_string_16_bytes_same(DIFF, DIFF2); 4283 __ br(__ GT, LARGE_LOOP_PREFETCH); 4284 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4285 // less than 16 bytes left? 4286 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4287 __ br(__ LT, TAIL); 4288 } 4289 __ bind(SMALL_LOOP); 4290 compare_string_16_bytes_same(DIFF, DIFF2); 4291 __ subs(cnt2, cnt2, isLL ? 
16 : 8);
    __ br(__ GE, SMALL_LOOP);
    __ bind(TAIL);
    __ adds(cnt2, cnt2, isLL ? 16 : 8);
    __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
    __ subs(cnt2, cnt2, isLL ? 8 : 4);
    __ br(__ LE, CHECK_LAST);
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, DIFF);
    __ ldr(tmp1, Address(__ post(str1, 8)));
    __ ldr(tmp2, Address(__ post(str2, 8)));
    __ sub(cnt2, cnt2, isLL ? 8 : 4);
    __ bind(CHECK_LAST);
    if (!isLL) {
      __ add(cnt2, cnt2, cnt2); // now in bytes
    }
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, DIFF);
    __ ldr(rscratch1, Address(str1, cnt2));
    __ ldr(cnt1, Address(str2, cnt2));
    __ eor(rscratch2, rscratch1, cnt1);
    __ cbz(rscratch2, LENGTH_DIFF);
    // Find the first different characters in the longwords and
    // compute their difference.
    __ bind(DIFF2);
    __ rev(rscratch2, rscratch2);
    __ clz(rscratch2, rscratch2);
    __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
    __ lsrv(rscratch1, rscratch1, rscratch2);
    if (isLL) {
      __ lsrv(cnt1, cnt1, rscratch2);
      __ uxtbw(rscratch1, rscratch1);
      __ uxtbw(cnt1, cnt1);
    } else {
      __ lsrv(cnt1, cnt1, rscratch2);
      __ uxthw(rscratch1, rscratch1);
      __ uxthw(cnt1, cnt1);
    }
    __ subw(result, rscratch1, cnt1);
    __ b(LENGTH_DIFF);
    __ bind(DIFF);
    __ rev(rscratch2, rscratch2);
    __ clz(rscratch2, rscratch2);
    __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
    __ lsrv(tmp1, tmp1, rscratch2);
    if (isLL) {
      __ lsrv(tmp2, tmp2, rscratch2);
      __ uxtbw(tmp1, tmp1);
      __ uxtbw(tmp2, tmp2);
    } else {
      __ lsrv(tmp2, tmp2, rscratch2);
      __ uxthw(tmp1, tmp1);
      __ uxthw(tmp2, tmp2);
    }
    __ subw(result, tmp1, tmp2);
    __ b(LENGTH_DIFF);
    __ bind(LAST_CHECK_AND_LENGTH_DIFF);
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, DIFF);
    __ bind(LENGTH_DIFF);
    __ ret(lr);
    return entry;
  }

  void generate_compare_long_strings() {
    StubRoutines::aarch64::_compare_long_string_LL
        = generate_compare_long_string_same_encoding(true);
    StubRoutines::aarch64::_compare_long_string_UU
        = generate_compare_long_string_same_encoding(false);
    StubRoutines::aarch64::_compare_long_string_LU
        = generate_compare_long_string_different_encoding(true);
    StubRoutines::aarch64::_compare_long_string_UL
        = generate_compare_long_string_different_encoding(false);
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // This generic linear code uses a few additional ideas that make it
  // faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  //    length >= 8) in order to skip the initial load (helps on systems
  //    with a single load pipeline)
  // 2) we can use a "fast" algorithm to find the first character with
  //    fewer branches (one branch per loaded register instead of one per
  //    symbol); this is where constants like 0x0101...01,
  //    0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string,
  //    it can be used to search for every occurrence of the 1st character,
  //    saving a few loads compared with a "simpler but slower"
  //    implementation
  // 4) in order to avoid lots of push/pop operations the code below
  //    heavily re-uses/re-initializes/compresses register values, which
  //    makes the code larger and a bit less readable; however,
most of extra operations are 4384 // issued during loads or branches, so, penalty is minimal 4385 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4386 const char* stubName = str1_isL 4387 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4388 : "indexof_linear_uu"; 4389 __ align(CodeEntryAlignment); 4390 StubCodeMark mark(this, "StubRoutines", stubName); 4391 address entry = __ pc(); 4392 4393 int str1_chr_size = str1_isL ? 1 : 2; 4394 int str2_chr_size = str2_isL ? 1 : 2; 4395 int str1_chr_shift = str1_isL ? 0 : 1; 4396 int str2_chr_shift = str2_isL ? 0 : 1; 4397 bool isL = str1_isL && str2_isL; 4398 // parameters 4399 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4400 // temporary registers 4401 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4402 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4403 // redefinitions 4404 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4405 4406 __ push(spilled_regs, sp); 4407 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, L_SMALL_MATCH_LOOP, 4408 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4409 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4410 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4411 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4412 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4413 // Read whole register from str1. It is safe, because length >=8 here 4414 __ ldr(ch1, Address(str1)); 4415 // Read whole register from str2. It is safe, because length >=8 here 4416 __ ldr(ch2, Address(str2)); 4417 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4418 if (str1_isL != str2_isL) { 4419 __ eor(v0, __ T16B, v0, v0); 4420 } 4421 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4422 __ mul(first, first, tmp1); 4423 // check if we have less than 1 register to check 4424 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4425 if (str1_isL != str2_isL) { 4426 __ fmovd(v1, ch1); 4427 } 4428 __ br(__ LE, L_SMALL); 4429 __ eor(ch2, first, ch2); 4430 if (str1_isL != str2_isL) { 4431 __ zip1(v1, __ T16B, v1, v0); 4432 } 4433 __ sub(tmp2, ch2, tmp1); 4434 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4435 __ bics(tmp2, tmp2, ch2); 4436 if (str1_isL != str2_isL) { 4437 __ fmovd(ch1, v1); 4438 } 4439 __ br(__ NE, L_HAS_ZERO); 4440 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4441 __ add(result, result, wordSize/str2_chr_size); 4442 __ add(str2, str2, wordSize); 4443 __ br(__ LT, L_POST_LOOP); 4444 __ BIND(L_LOOP); 4445 __ ldr(ch2, Address(str2)); 4446 __ eor(ch2, first, ch2); 4447 __ sub(tmp2, ch2, tmp1); 4448 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4449 __ bics(tmp2, tmp2, ch2); 4450 __ br(__ NE, L_HAS_ZERO); 4451 __ BIND(L_LOOP_PROCEED); 4452 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4453 __ add(str2, str2, wordSize); 4454 __ add(result, result, wordSize/str2_chr_size); 4455 __ br(__ GE, L_LOOP); 4456 __ BIND(L_POST_LOOP); 4457 __ cmp(cnt2, -wordSize/str2_chr_size); // no extra characters to check 4458 __ br(__ LE, NOMATCH); 4459 __ ldr(ch2, Address(str2)); 4460 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4461 __ eor(ch2, first, ch2); 4462 __ sub(tmp2, ch2, tmp1); 4463 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4464 __ mov(tmp4, -1); // all bits set 4465 __ b(L_SMALL_PROCEED); 4466 __ align(OptoLoopAlignment); 4467 __ BIND(L_SMALL); 4468 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4469 __ eor(ch2, first, ch2); 4470 if (str1_isL != str2_isL) { 4471 __ zip1(v1, __ T16B, v1, v0); 4472 } 4473 __ sub(tmp2, ch2, tmp1); 4474 __ mov(tmp4, -1); // all bits set 4475 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4476 if (str1_isL != str2_isL) { 4477 __ fmovd(ch1, v1); // move converted 4 symbols 4478 } 4479 __ BIND(L_SMALL_PROCEED); 4480 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4481 __ bic(tmp2, tmp2, ch2); 4482 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4483 __ rbit(tmp2, tmp2); 4484 __ br(__ EQ, NOMATCH); 4485 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4486 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4487 __ cmp(cnt1, wordSize/str2_chr_size); 4488 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4489 if (str2_isL) { // LL 4490 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4491 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4492 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4493 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4494 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4495 } else { 4496 __ mov(ch2, 0xE); // all bits in byte set except last one 4497 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4498 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4499 __ lslv(tmp2, tmp2, tmp4); 4500 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4501 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4502 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4503 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4504 } 4505 __ cmp(ch1, ch2); 4506 __ mov(tmp4, wordSize/str2_chr_size); 4507 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4508 __ BIND(L_SMALL_CMP_LOOP); 4509 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4510 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4511 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4512 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4513 __ add(tmp4, tmp4, 1); 4514 __ cmp(tmp4, cnt1); 4515 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4516 __ cmp(first, ch2); 4517 __ br(__ EQ, L_SMALL_CMP_LOOP); 4518 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4519 __ cbz(tmp2, NOMATCH); // no more matches. exit 4520 __ clz(tmp4, tmp2); 4521 __ add(result, result, 1); // advance index 4522 __ add(str2, str2, str2_chr_size); // advance pointer 4523 __ b(L_SMALL_HAS_ZERO_LOOP); 4524 __ align(OptoLoopAlignment); 4525 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4526 __ cmp(first, ch2); 4527 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4528 __ b(DONE); 4529 __ align(OptoLoopAlignment); 4530 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4531 if (str2_isL) { // LL 4532 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4533 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 
4534 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4535 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4536 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4537 } else { 4538 __ mov(ch2, 0xE); // all bits in byte set except last one 4539 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4540 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4541 __ lslv(tmp2, tmp2, tmp4); 4542 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4543 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4544 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4545 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4546 } 4547 __ cmp(ch1, ch2); 4548 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4549 __ b(DONE); 4550 __ align(OptoLoopAlignment); 4551 __ BIND(L_HAS_ZERO); 4552 __ rbit(tmp2, tmp2); 4553 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4554 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4555 // It's fine because both counters are 32bit and are not changed in this 4556 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4557 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4558 __ sub(result, result, 1); 4559 __ BIND(L_HAS_ZERO_LOOP); 4560 __ mov(cnt1, wordSize/str2_chr_size); 4561 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4562 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4563 if (str2_isL) { 4564 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4565 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4566 __ lslv(tmp2, tmp2, tmp4); 4567 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4568 __ add(tmp4, tmp4, 1); 4569 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4570 __ lsl(tmp2, tmp2, 1); 4571 __ mov(tmp4, wordSize/str2_chr_size); 4572 } else { 4573 __ mov(ch2, 0xE); 4574 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4575 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4576 __ lslv(tmp2, tmp2, tmp4); 4577 __ add(tmp4, tmp4, 1); 4578 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4579 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4580 __ lsl(tmp2, tmp2, 1); 4581 __ mov(tmp4, wordSize/str2_chr_size); 4582 __ sub(str2, str2, str2_chr_size); 4583 } 4584 __ cmp(ch1, ch2); 4585 __ mov(tmp4, wordSize/str2_chr_size); 4586 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4587 __ BIND(L_CMP_LOOP); 4588 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4589 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4590 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4591 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4592 __ add(tmp4, tmp4, 1); 4593 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4594 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4595 __ cmp(cnt1, ch2); 4596 __ br(__ EQ, L_CMP_LOOP); 4597 __ BIND(L_CMP_LOOP_NOMATCH); 4598 // here we're not matched 4599 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. 
Proceed to main loop 4600 __ clz(tmp4, tmp2); 4601 __ add(str2, str2, str2_chr_size); // advance pointer 4602 __ b(L_HAS_ZERO_LOOP); 4603 __ align(OptoLoopAlignment); 4604 __ BIND(L_CMP_LOOP_LAST_CMP); 4605 __ cmp(cnt1, ch2); 4606 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4607 __ b(DONE); 4608 __ align(OptoLoopAlignment); 4609 __ BIND(L_CMP_LOOP_LAST_CMP2); 4610 if (str2_isL) { 4611 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4612 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4613 __ lslv(tmp2, tmp2, tmp4); 4614 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4615 __ add(tmp4, tmp4, 1); 4616 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4617 __ lsl(tmp2, tmp2, 1); 4618 } else { 4619 __ mov(ch2, 0xE); 4620 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4621 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4622 __ lslv(tmp2, tmp2, tmp4); 4623 __ add(tmp4, tmp4, 1); 4624 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4625 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4626 __ lsl(tmp2, tmp2, 1); 4627 __ sub(str2, str2, str2_chr_size); 4628 } 4629 __ cmp(ch1, ch2); 4630 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4631 __ b(DONE); 4632 __ align(OptoLoopAlignment); 4633 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4634 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4635 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4636 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4637 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4638 // result by analyzed characters value, so, we can just reset lower bits 4639 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4640 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4641 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4642 // index of last analyzed substring inside current octet. So, str2 in at 4643 // respective start address. 
We need to advance it to next octet 4644 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4645 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4646 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4647 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4648 __ movw(cnt2, cnt2); 4649 __ b(L_LOOP_PROCEED); 4650 __ align(OptoLoopAlignment); 4651 __ BIND(NOMATCH); 4652 __ mov(result, -1); 4653 __ BIND(DONE); 4654 __ pop(spilled_regs, sp); 4655 __ ret(lr); 4656 return entry; 4657 } 4658 4659 void generate_string_indexof_stubs() { 4660 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4661 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4662 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4663 } 4664 4665 void inflate_and_store_2_fp_registers(bool generatePrfm, 4666 FloatRegister src1, FloatRegister src2) { 4667 Register dst = r1; 4668 __ zip1(v1, __ T16B, src1, v0); 4669 __ zip2(v2, __ T16B, src1, v0); 4670 if (generatePrfm) { 4671 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4672 } 4673 __ zip1(v3, __ T16B, src2, v0); 4674 __ zip2(v4, __ T16B, src2, v0); 4675 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4676 } 4677 4678 // R0 = src 4679 // R1 = dst 4680 // R2 = len 4681 // R3 = len >> 3 4682 // V0 = 0 4683 // v1 = loaded 8 bytes 4684 address generate_large_byte_array_inflate() { 4685 __ align(CodeEntryAlignment); 4686 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4687 address entry = __ pc(); 4688 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4689 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4690 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4691 4692 // do one more 8-byte read to have address 16-byte aligned in most cases 4693 // also use single store instruction 4694 __ ldrd(v2, __ post(src, 8)); 4695 __ sub(octetCounter, octetCounter, 2); 4696 __ zip1(v1, __ T16B, v1, v0); 4697 __ zip1(v2, __ T16B, v2, v0); 4698 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4699 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4700 __ cmp(octetCounter, large_loop_threshold); 4701 __ br(__ LE, LOOP_START); 4702 __ b(LOOP_PRFM_START); 4703 __ bind(LOOP_PRFM); 4704 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4705 __ bind(LOOP_PRFM_START); 4706 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4707 __ sub(octetCounter, octetCounter, 8); 4708 __ cmp(octetCounter, large_loop_threshold); 4709 inflate_and_store_2_fp_registers(true, v3, v4); 4710 inflate_and_store_2_fp_registers(true, v5, v6); 4711 __ br(__ GT, LOOP_PRFM); 4712 __ cmp(octetCounter, 8); 4713 __ br(__ LT, DONE); 4714 __ bind(LOOP); 4715 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4716 __ bind(LOOP_START); 4717 __ sub(octetCounter, octetCounter, 8); 4718 __ cmp(octetCounter, 8); 4719 inflate_and_store_2_fp_registers(false, v3, v4); 4720 inflate_and_store_2_fp_registers(false, v5, v6); 4721 __ br(__ GE, LOOP); 4722 __ bind(DONE); 4723 __ ret(lr); 4724 return entry; 4725 } 4726 4727 /** 4728 * Arguments: 4729 * 4730 * Input: 4731 * c_rarg0 - current state address 4732 * c_rarg1 - H key address 4733 * c_rarg2 - data address 4734 * c_rarg3 - number of blocks 4735 * 4736 * Output: 4737 * Updated state at c_rarg0 4738 */ 4739 address generate_ghash_processBlocks() { 4740 // Bafflingly, GCM uses little-endian for the byte order, but 
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ldrq(v26, p);

    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                     /*temps*/v6, v20, v18, v21);
      // Reduce v7:v5 by the field polynomial
      ghash_reduce(v0, v5, v7, v26, vzr, v20);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v1, __ T16B, v0);
    __ rbit(v1, __ T16B, v1);

    __ st1(v1, __ T16B, state);
    __ ret(lr);

    return start;
  }

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.
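  // In outline, each stub generated below behaves like the following
  // pseudocode (illustrative only):
  //
  //   enter();                              // save fp/lr for stack walking
  //   set_last_Java_frame(...);             // let the runtime find this frame
  //   runtime_entry(thread[, arg1, arg2]);  // fabricates the exception oop
  //   reset_last_Java_frame(...);
  //   leave();
  //   goto StubRoutines::forward_exception_entry();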
4829 4830 #undef __ 4831 #define __ masm-> 4832 4833 address generate_throw_exception(const char* name, 4834 address runtime_entry, 4835 Register arg1 = noreg, 4836 Register arg2 = noreg) { 4837 // Information about frame layout at time of blocking runtime call. 4838 // Note that we only have to preserve callee-saved registers since 4839 // the compilers are responsible for supplying a continuation point 4840 // if they expect all registers to be preserved. 4841 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4842 enum layout { 4843 rfp_off = 0, 4844 rfp_off2, 4845 return_off, 4846 return_off2, 4847 framesize // inclusive of return address 4848 }; 4849 4850 int insts_size = 512; 4851 int locs_size = 64; 4852 4853 CodeBuffer code(name, insts_size, locs_size); 4854 OopMapSet* oop_maps = new OopMapSet(); 4855 MacroAssembler* masm = new MacroAssembler(&code); 4856 4857 address start = __ pc(); 4858 4859 // This is an inlined and slightly modified version of call_VM 4860 // which has the ability to fetch the return PC out of 4861 // thread-local storage and also sets up last_Java_sp slightly 4862 // differently than the real call_VM 4863 4864 __ enter(); // Save FP and LR before call 4865 4866 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4867 4868 // lr and fp are already in place 4869 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4870 4871 int frame_complete = __ pc() - start; 4872 4873 // Set up last_Java_sp and last_Java_fp 4874 address the_pc = __ pc(); 4875 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 4876 4877 // Call runtime 4878 if (arg1 != noreg) { 4879 assert(arg2 != c_rarg1, "clobbered"); 4880 __ mov(c_rarg1, arg1); 4881 } 4882 if (arg2 != noreg) { 4883 __ mov(c_rarg2, arg2); 4884 } 4885 __ mov(c_rarg0, rthread); 4886 BLOCK_COMMENT("call runtime_entry"); 4887 __ mov(rscratch1, runtime_entry); 4888 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4889 4890 // Generate oop map 4891 OopMap* map = new OopMap(framesize, 0); 4892 4893 oop_maps->add_gc_map(the_pc - start, map); 4894 4895 __ reset_last_Java_frame(true); 4896 __ maybe_isb(); 4897 4898 __ leave(); 4899 4900 // check for pending exceptions 4901 #ifdef ASSERT 4902 Label L; 4903 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4904 __ cbnz(rscratch1, L); 4905 __ should_not_reach_here(); 4906 __ bind(L); 4907 #endif // ASSERT 4908 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4909 4910 4911 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4912 RuntimeStub* stub = 4913 RuntimeStub::new_runtime_stub(name, 4914 &code, 4915 frame_complete, 4916 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4917 oop_maps, false); 4918 return stub->entry_point(); 4919 } 4920 4921 class MontgomeryMultiplyGenerator : public MacroAssembler { 4922 4923 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4924 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4925 4926 RegSet _toSave; 4927 bool _squaring; 4928 4929 public: 4930 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4931 : MacroAssembler(as->code()), _squaring(squaring) { 4932 4933 // Register allocation 4934 4935 Register reg = c_rarg0; 4936 Pa_base = reg; // Argument registers 4937 if (squaring) 4938 Pb_base = Pa_base; 4939 else 4940 Pb_base = ++reg; 4941 Pn_base = ++reg; 4942 Rlen= ++reg; 4943 inv = ++reg; 4944 Pm_base = ++reg; 4945 4946 // Working registers: 4947 Ra = ++reg; // The current digit of a, b, n, and m. 
      Rb = ++reg;
      Rm = ++reg;
      Rn = ++reg;

      Pa = ++reg;      // Pointers to the current/next digit of a, b, n, and m.
      Pb = ++reg;
      Pm = ++reg;
      Pn = ++reg;

      t0 = ++reg;      // Three registers which form a
      t1 = ++reg;      // triple-precision accumulator.
      t2 = ++reg;

      Ri = ++reg;      // Inner and outer loop indexes.
      Rj = ++reg;

      Rhi_ab = ++reg;  // Product registers: low and high parts
      Rlo_ab = ++reg;  // of a*b and m*n.
      Rhi_mn = ++reg;
      Rlo_mn = ++reg;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
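      // In the MACC(x, y, t0, t1, t2) pseudocode used in these comments,
      // t2:t1:t0 is a triple-precision accumulator and MACC adds the
      // 128-bit product x*y into it; acc() implements the corresponding
      // add-with-carry chain.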
5053 // MACC(Rm, Rn, t0, t1, t2); 5054 // Rm = *++Pm; 5055 // Rn = *--Pn; 5056 umulh(Rhi_mn, Rm, Rn); 5057 mul(Rlo_mn, Rm, Rn); 5058 ldr(Rm, pre(Pm, wordSize)); 5059 ldr(Rn, pre(Pn, -wordSize)); 5060 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5061 } 5062 5063 void post1() { 5064 block_comment("post1"); 5065 5066 // MACC(Ra, Rb, t0, t1, t2); 5067 // Ra = *++Pa; 5068 // Rb = *--Pb; 5069 umulh(Rhi_ab, Ra, Rb); 5070 mul(Rlo_ab, Ra, Rb); 5071 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5072 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5073 5074 // *Pm = Rm = t0 * inv; 5075 mul(Rm, t0, inv); 5076 str(Rm, Address(Pm)); 5077 5078 // MACC(Rm, Rn, t0, t1, t2); 5079 // t0 = t1; t1 = t2; t2 = 0; 5080 umulh(Rhi_mn, Rm, Rn); 5081 5082 #ifndef PRODUCT 5083 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5084 { 5085 mul(Rlo_mn, Rm, Rn); 5086 add(Rlo_mn, t0, Rlo_mn); 5087 Label ok; 5088 cbz(Rlo_mn, ok); { 5089 stop("broken Montgomery multiply"); 5090 } bind(ok); 5091 } 5092 #endif 5093 // We have very carefully set things up so that 5094 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5095 // the lower half of Rm * Rn because we know the result already: 5096 // it must be -t0. t0 + (-t0) must generate a carry iff 5097 // t0 != 0. So, rather than do a mul and an adds we just set 5098 // the carry flag iff t0 is nonzero. 5099 // 5100 // mul(Rlo_mn, Rm, Rn); 5101 // adds(zr, t0, Rlo_mn); 5102 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5103 adcs(t0, t1, Rhi_mn); 5104 adc(t1, t2, zr); 5105 mov(t2, zr); 5106 } 5107 5108 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5109 block_comment("pre2"); 5110 // Pa = Pa_base + i-len; 5111 // Pb = Pb_base + len; 5112 // Pm = Pm_base + i-len; 5113 // Pn = Pn_base + len; 5114 5115 if (i.is_register()) { 5116 sub(Rj, i.as_register(), len); 5117 } else { 5118 mov(Rj, i.as_constant()); 5119 sub(Rj, Rj, len); 5120 } 5121 // Rj == i-len 5122 5123 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5124 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5125 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5126 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5127 5128 // Ra = *++Pa; 5129 // Rb = *--Pb; 5130 // Rm = *++Pm; 5131 // Rn = *--Pn; 5132 ldr(Ra, pre(Pa, wordSize)); 5133 ldr(Rb, pre(Pb, -wordSize)); 5134 ldr(Rm, pre(Pm, wordSize)); 5135 ldr(Rn, pre(Pn, -wordSize)); 5136 5137 mov(Rhi_mn, zr); 5138 mov(Rlo_mn, zr); 5139 } 5140 5141 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5142 block_comment("post2"); 5143 if (i.is_constant()) { 5144 mov(Rj, i.as_constant()-len.as_constant()); 5145 } else { 5146 sub(Rj, i.as_register(), len); 5147 } 5148 5149 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5150 5151 // As soon as we know the least significant digit of our result, 5152 // store it. 5153 // Pm_base[i-len] = t0; 5154 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5155 5156 // t0 = t1; t1 = t2; t2 = 0; 5157 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5158 adc(t1, t2, zr); 5159 mov(t2, zr); 5160 } 5161 5162 // A carry in t0 after Montgomery multiplication means that we 5163 // should subtract multiples of n from our result in m. We'll 5164 // keep doing that until there is no carry. 
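    // One pass of that subtraction, in C-like pseudocode (illustrative only):
    //
    //   uint64_t borrow = 0;
    //   for (size_t i = 0; i < len; i++) {
    //     unsigned __int128 d = (unsigned __int128)m[i] - n[i] - borrow;
    //     m[i] = (uint64_t)d;
    //     borrow = (uint64_t)(d >> 64) & 1;  // 1 if the subtraction wrapped
    //   }
    //   t0 -= borrow;                        // repeat while t0 != 0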
    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }
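    // The sub() helper that normalize() compiles (and that the "In C,
    // approximately" pseudocode further down calls) is never spelled
    // out in this file.  Approximately, as a sketch under the same
    // assumptions as above:
    //
    // // Pm -= Pn (len words); returns t0 minus the borrow out of the
    // // subtraction, as the sbc(t0, t0, zr) above does.
    // static unsigned long sub(unsigned long Pm[], unsigned long Pn[],
    //                          unsigned long t0, int len) {
    //   unsigned long borrow = 0;
    //   for (int i = 0; i < len; i++) {
    //     unsigned __int128 d = (unsigned __int128)Pm[i] - Pn[i] - borrow;
    //     Pm[i] = (unsigned long)d;
    //     borrow = (unsigned long)(d >> 64) & 1; // 1 iff the subtract wrapped
    //   }
    //   return t0 - borrow;
    // }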
    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }
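    // What reverse()/reverse1() compute, as a sketch (not generated
    // code; assumes a little-endian LP64 host): the incoming int array
    // holds its most significant 32-bit digit first, while the loops
    // here want least-significant-first 64-bit digits, so we reverse
    // the order of the 64-bit words and swap the halves of each word,
    // which is what the ror(tmp, tmp, 32) does.
    //
    // static void reverse(unsigned long d[], const unsigned int s[],
    //                     int len /* in 64-bit words */) {
    //   for (int i = 0; i < len; i++) {
    //     int j = 2 * (len - 1 - i);
    //     d[i] = ((unsigned long)s[j] << 32) | s[j + 1];
    //   }
    // }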
    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }
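    // The squaring variants above exploit the symmetry
    // a[j]*a[i-j] == a[i-j]*a[j]: step_squaring() accumulates each
    // off-diagonal product twice, which is what the MACC2 in the
    // squaring pseudocode further down stands for.  Approximately, as
    // a sketch under the same assumptions as the MACC sketch above:
    //
    // static void MACC2(unsigned long a, unsigned long b,
    //                   unsigned long &t0, unsigned long &t1, unsigned long &t2) {
    //   unsigned __int128 prod = (unsigned __int128)a * b;
    //   for (int k = 0; k < 2; k++) {  // accumulate the product twice
    //     unsigned __int128 s = (unsigned __int128)t0 + (unsigned long)prod;
    //     t0 = (unsigned long)s;
    //     s = (s >> 64) + (unsigned long)(prod >> 64) + t1;
    //     t1 = (unsigned long)s;
    //     t2 += (unsigned long)(s >> 64);
    //   }
    // }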
  public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        cmp(Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
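    // A note on the "<= 8192" bound checked in generate_multiply()
    // above and generate_square() below: Rlen is the length in jints,
    // capped at 512 by the cmpw/br(HI, argh), and the
    // sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)))
    // reserves 4 * sizeof(jint) = 16 bytes per jint, i.e. room for
    // four arrays (a, b, n and m) of Rlen jints each:
    //
    //   total_allocation = 4 * sizeof(jint) * Rlen
    //                   <= 4 * 4 * 512 = 8192 bytes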
    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("  for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };
  // Initialization
  void generate_initial() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist in all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // set the table address before generating the stubs that use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
      StubRoutines::_dlog = generate_dlog();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }
  }
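  // For orientation: the initial/all split above and below is driven
  // from shared code.  From memory (a sketch, not part of this file),
  // the call sites in stubRoutines.cpp look approximately like:
  //
  //   StubGenerator_generate(&buffer, false);  // StubRoutines::initialize1(),
  //                                            // early in VM startup
  //   StubGenerator_generate(&buffer, true);   // StubRoutines::initialize2(),
  //                                            // after universe_init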
  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
#endif
    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}