1 /* 2 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "gc/shared/barrierSet.hpp" 30 #include "gc/shared/barrierSetAssembler.hpp" 31 #include "interpreter/interpreter.hpp" 32 #include "nativeInst_aarch64.hpp" 33 #include "oops/instanceOop.hpp" 34 #include "oops/method.hpp" 35 #include "oops/objArrayKlass.hpp" 36 #include "oops/oop.inline.hpp" 37 #include "prims/methodHandles.hpp" 38 #include "runtime/frame.inline.hpp" 39 #include "runtime/handles.inline.hpp" 40 #include "runtime/sharedRuntime.hpp" 41 #include "runtime/stubCodeGenerator.hpp" 42 #include "runtime/stubRoutines.hpp" 43 #include "runtime/thread.inline.hpp" 44 #include "utilities/align.hpp" 45 #ifdef COMPILER2 46 #include "opto/runtime.hpp" 47 #endif 48 49 #ifdef BUILTIN_SIM 50 #include "../../../../../../simulator/simulator.hpp" 51 #endif 52 53 // Declaration and definition of StubGenerator (no .hpp file). 54 // For a more detailed description of the stub routine structure 55 // see the comment in stubRoutines.hpp 56 57 #undef __ 58 #define __ _masm-> 59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8)) 60 61 #ifdef PRODUCT 62 #define BLOCK_COMMENT(str) /* nothing */ 63 #else 64 #define BLOCK_COMMENT(str) __ block_comment(str) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Stub Code definitions 70 71 class StubGenerator: public StubCodeGenerator { 72 private: 73 74 #ifdef PRODUCT 75 #define inc_counter_np(counter) ((void)0) 76 #else 77 void inc_counter_np_(int& counter) { 78 __ lea(rscratch2, ExternalAddress((address)&counter)); 79 __ ldrw(rscratch1, Address(rscratch2)); 80 __ addw(rscratch1, rscratch1, 1); 81 __ strw(rscratch1, Address(rscratch2)); 82 } 83 #define inc_counter_np(counter) \ 84 BLOCK_COMMENT("inc_counter " #counter); \ 85 inc_counter_np_(counter); 86 #endif 87 88 // Call stubs are used to call Java from C 89 // 90 // Arguments: 91 // c_rarg0: call wrapper address address 92 // c_rarg1: result address 93 // c_rarg2: result type BasicType 94 // c_rarg3: method Method* 95 // c_rarg4: (interpreter) entry point address 96 // c_rarg5: parameters intptr_t* 97 // c_rarg6: parameter size (in words) int 98 // c_rarg7: thread Thread* 99 // 100 // There is no return from the stub itself as any Java result 101 // is written to result 102 // 103 // we save r30 (lr) as the return PC at the base of the frame and 104 // link r29 (fp) below it as the frame pointer installing sp (r31) 105 // into fp. 106 // 107 // we save r0-r7, which accounts for all the c arguments. 108 // 109 // TODO: strictly do we need to save them all? they are treated as 110 // volatile by C so could we omit saving the ones we are going to 111 // place in global registers (thread? method?) or those we only use 112 // during setup of the Java call? 113 // 114 // we don't need to save r8 which C uses as an indirect result location 115 // return register. 116 // 117 // we don't need to save r9-r15 which both C and Java treat as 118 // volatile 119 // 120 // we don't need to save r16-18 because Java does not use them 121 // 122 // we save r19-r28 which Java uses as scratch registers and C 123 // expects to be callee-save 124 // 125 // we save the bottom 64 bits of each value stored in v8-v15; it is 126 // the responsibility of the caller to preserve larger values. 127 // 128 // so the stub frame looks like this when we enter Java code 129 // 130 // [ return_from_Java ] <--- sp 131 // [ argument word n ] 132 // ... 133 // -27 [ argument word 1 ] 134 // -26 [ saved v15 ] <--- sp_after_call 135 // -25 [ saved v14 ] 136 // -24 [ saved v13 ] 137 // -23 [ saved v12 ] 138 // -22 [ saved v11 ] 139 // -21 [ saved v10 ] 140 // -20 [ saved v9 ] 141 // -19 [ saved v8 ] 142 // -18 [ saved r28 ] 143 // -17 [ saved r27 ] 144 // -16 [ saved r26 ] 145 // -15 [ saved r25 ] 146 // -14 [ saved r24 ] 147 // -13 [ saved r23 ] 148 // -12 [ saved r22 ] 149 // -11 [ saved r21 ] 150 // -10 [ saved r20 ] 151 // -9 [ saved r19 ] 152 // -8 [ call wrapper (r0) ] 153 // -7 [ result (r1) ] 154 // -6 [ result type (r2) ] 155 // -5 [ method (r3) ] 156 // -4 [ entry point (r4) ] 157 // -3 [ parameters (r5) ] 158 // -2 [ parameter size (r6) ] 159 // -1 [ thread (r7) ] 160 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 161 // 1 [ saved lr (r30) ] 162 163 // Call stub stack layout word offsets from fp 164 enum call_stub_layout { 165 sp_after_call_off = -26, 166 167 d15_off = -26, 168 d13_off = -24, 169 d11_off = -22, 170 d9_off = -20, 171 172 r28_off = -18, 173 r26_off = -16, 174 r24_off = -14, 175 r22_off = -12, 176 r20_off = -10, 177 call_wrapper_off = -8, 178 result_off = -7, 179 result_type_off = -6, 180 method_off = -5, 181 entry_point_off = -4, 182 parameter_size_off = -2, 183 thread_off = -1, 184 fp_f = 0, 185 retaddr_off = 1, 186 }; 187 188 address generate_call_stub(address& return_address) { 189 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 190 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 191 "adjust this code"); 192 193 StubCodeMark mark(this, "StubRoutines", "call_stub"); 194 address start = __ pc(); 195 196 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 197 198 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 199 const Address result (rfp, result_off * wordSize); 200 const Address result_type (rfp, result_type_off * wordSize); 201 const Address method (rfp, method_off * wordSize); 202 const Address entry_point (rfp, entry_point_off * wordSize); 203 const Address parameter_size(rfp, parameter_size_off * wordSize); 204 205 const Address thread (rfp, thread_off * wordSize); 206 207 const Address d15_save (rfp, d15_off * wordSize); 208 const Address d13_save (rfp, d13_off * wordSize); 209 const Address d11_save (rfp, d11_off * wordSize); 210 const Address d9_save (rfp, d9_off * wordSize); 211 212 const Address r28_save (rfp, r28_off * wordSize); 213 const Address r26_save (rfp, r26_off * wordSize); 214 const Address r24_save (rfp, r24_off * wordSize); 215 const Address r22_save (rfp, r22_off * wordSize); 216 const Address r20_save (rfp, r20_off * wordSize); 217 218 // stub code 219 220 // we need a C prolog to bootstrap the x86 caller into the sim 221 __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void); 222 223 address aarch64_entry = __ pc(); 224 225 #ifdef BUILTIN_SIM 226 // Save sender's SP for stack traces. 227 __ mov(rscratch1, sp); 228 __ str(rscratch1, Address(__ pre(sp, -2 * wordSize))); 229 #endif 230 // set up frame and move sp to end of save area 231 __ enter(); 232 __ sub(sp, rfp, -sp_after_call_off * wordSize); 233 234 // save register parameters and Java scratch/global registers 235 // n.b. we save thread even though it gets installed in 236 // rthread because we want to sanity check rthread later 237 __ str(c_rarg7, thread); 238 __ strw(c_rarg6, parameter_size); 239 __ stp(c_rarg4, c_rarg5, entry_point); 240 __ stp(c_rarg2, c_rarg3, result_type); 241 __ stp(c_rarg0, c_rarg1, call_wrapper); 242 243 __ stp(r20, r19, r20_save); 244 __ stp(r22, r21, r22_save); 245 __ stp(r24, r23, r24_save); 246 __ stp(r26, r25, r26_save); 247 __ stp(r28, r27, r28_save); 248 249 __ stpd(v9, v8, d9_save); 250 __ stpd(v11, v10, d11_save); 251 __ stpd(v13, v12, d13_save); 252 __ stpd(v15, v14, d15_save); 253 254 // install Java thread in global register now we have saved 255 // whatever value it held 256 __ mov(rthread, c_rarg7); 257 // And method 258 __ mov(rmethod, c_rarg3); 259 260 // set up the heapbase register 261 __ reinit_heapbase(); 262 263 #ifdef ASSERT 264 // make sure we have no pending exceptions 265 { 266 Label L; 267 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 268 __ cmp(rscratch1, (unsigned)NULL_WORD); 269 __ br(Assembler::EQ, L); 270 __ stop("StubRoutines::call_stub: entered with pending exception"); 271 __ BIND(L); 272 } 273 #endif 274 // pass parameters if any 275 __ mov(esp, sp); 276 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 277 __ andr(sp, rscratch1, -2 * wordSize); 278 279 BLOCK_COMMENT("pass parameters if any"); 280 Label parameters_done; 281 // parameter count is still in c_rarg6 282 // and parameter pointer identifying param 1 is in c_rarg5 283 __ cbzw(c_rarg6, parameters_done); 284 285 address loop = __ pc(); 286 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 287 __ subsw(c_rarg6, c_rarg6, 1); 288 __ push(rscratch1); 289 __ br(Assembler::GT, loop); 290 291 __ BIND(parameters_done); 292 293 // call Java entry -- passing methdoOop, and current sp 294 // rmethod: Method* 295 // r13: sender sp 296 BLOCK_COMMENT("call Java function"); 297 __ mov(r13, sp); 298 __ blr(c_rarg4); 299 300 // tell the simulator we have returned to the stub 301 302 // we do this here because the notify will already have been done 303 // if we get to the next instruction via an exception 304 // 305 // n.b. adding this instruction here affects the calculation of 306 // whether or not a routine returns to the call stub (used when 307 // doing stack walks) since the normal test is to check the return 308 // pc against the address saved below. so we may need to allow for 309 // this extra instruction in the check. 310 311 if (NotifySimulator) { 312 __ notify(Assembler::method_reentry); 313 } 314 // save current address for use by exception handling code 315 316 return_address = __ pc(); 317 318 // store result depending on type (everything that is not 319 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 320 // n.b. this assumes Java returns an integral result in r0 321 // and a floating result in j_farg0 322 __ ldr(j_rarg2, result); 323 Label is_long, is_float, is_double, exit; 324 __ ldr(j_rarg1, result_type); 325 __ cmp(j_rarg1, T_OBJECT); 326 __ br(Assembler::EQ, is_long); 327 __ cmp(j_rarg1, T_LONG); 328 __ br(Assembler::EQ, is_long); 329 __ cmp(j_rarg1, T_FLOAT); 330 __ br(Assembler::EQ, is_float); 331 __ cmp(j_rarg1, T_DOUBLE); 332 __ br(Assembler::EQ, is_double); 333 334 // handle T_INT case 335 __ strw(r0, Address(j_rarg2)); 336 337 __ BIND(exit); 338 339 // pop parameters 340 __ sub(esp, rfp, -sp_after_call_off * wordSize); 341 342 #ifdef ASSERT 343 // verify that threads correspond 344 { 345 Label L, S; 346 __ ldr(rscratch1, thread); 347 __ cmp(rthread, rscratch1); 348 __ br(Assembler::NE, S); 349 __ get_thread(rscratch1); 350 __ cmp(rthread, rscratch1); 351 __ br(Assembler::EQ, L); 352 __ BIND(S); 353 __ stop("StubRoutines::call_stub: threads must correspond"); 354 __ BIND(L); 355 } 356 #endif 357 358 // restore callee-save registers 359 __ ldpd(v15, v14, d15_save); 360 __ ldpd(v13, v12, d13_save); 361 __ ldpd(v11, v10, d11_save); 362 __ ldpd(v9, v8, d9_save); 363 364 __ ldp(r28, r27, r28_save); 365 __ ldp(r26, r25, r26_save); 366 __ ldp(r24, r23, r24_save); 367 __ ldp(r22, r21, r22_save); 368 __ ldp(r20, r19, r20_save); 369 370 __ ldp(c_rarg0, c_rarg1, call_wrapper); 371 __ ldrw(c_rarg2, result_type); 372 __ ldr(c_rarg3, method); 373 __ ldp(c_rarg4, c_rarg5, entry_point); 374 __ ldp(c_rarg6, c_rarg7, parameter_size); 375 376 #ifndef PRODUCT 377 // tell the simulator we are about to end Java execution 378 if (NotifySimulator) { 379 __ notify(Assembler::method_exit); 380 } 381 #endif 382 // leave frame and return to caller 383 __ leave(); 384 __ ret(lr); 385 386 // handle return types different from T_INT 387 388 __ BIND(is_long); 389 __ str(r0, Address(j_rarg2, 0)); 390 __ br(Assembler::AL, exit); 391 392 __ BIND(is_float); 393 __ strs(j_farg0, Address(j_rarg2, 0)); 394 __ br(Assembler::AL, exit); 395 396 __ BIND(is_double); 397 __ strd(j_farg0, Address(j_rarg2, 0)); 398 __ br(Assembler::AL, exit); 399 400 return start; 401 } 402 403 // Return point for a Java call if there's an exception thrown in 404 // Java code. The exception is caught and transformed into a 405 // pending exception stored in JavaThread that can be tested from 406 // within the VM. 407 // 408 // Note: Usually the parameters are removed by the callee. In case 409 // of an exception crossing an activation frame boundary, that is 410 // not the case if the callee is compiled code => need to setup the 411 // rsp. 412 // 413 // r0: exception oop 414 415 // NOTE: this is used as a target from the signal handler so it 416 // needs an x86 prolog which returns into the current simulator 417 // executing the generated catch_exception code. so the prolog 418 // needs to install rax in a sim register and adjust the sim's 419 // restart pc to enter the generated code at the start position 420 // then return from native to simulated execution. 421 422 address generate_catch_exception() { 423 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 424 address start = __ pc(); 425 426 // same as in generate_call_stub(): 427 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 428 const Address thread (rfp, thread_off * wordSize); 429 430 #ifdef ASSERT 431 // verify that threads correspond 432 { 433 Label L, S; 434 __ ldr(rscratch1, thread); 435 __ cmp(rthread, rscratch1); 436 __ br(Assembler::NE, S); 437 __ get_thread(rscratch1); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::EQ, L); 440 __ bind(S); 441 __ stop("StubRoutines::catch_exception: threads must correspond"); 442 __ bind(L); 443 } 444 #endif 445 446 // set pending exception 447 __ verify_oop(r0); 448 449 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 450 __ mov(rscratch1, (address)__FILE__); 451 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 452 __ movw(rscratch1, (int)__LINE__); 453 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 454 455 // complete return to VM 456 assert(StubRoutines::_call_stub_return_address != NULL, 457 "_call_stub_return_address must have been generated before"); 458 __ b(StubRoutines::_call_stub_return_address); 459 460 return start; 461 } 462 463 // Continuation point for runtime calls returning with a pending 464 // exception. The pending exception check happened in the runtime 465 // or native call stub. The pending exception in Thread is 466 // converted into a Java-level exception. 467 // 468 // Contract with Java-level exception handlers: 469 // r0: exception 470 // r3: throwing pc 471 // 472 // NOTE: At entry of this stub, exception-pc must be in LR !! 473 474 // NOTE: this is always used as a jump target within generated code 475 // so it just needs to be generated code wiht no x86 prolog 476 477 address generate_forward_exception() { 478 StubCodeMark mark(this, "StubRoutines", "forward exception"); 479 address start = __ pc(); 480 481 // Upon entry, LR points to the return address returning into 482 // Java (interpreted or compiled) code; i.e., the return address 483 // becomes the throwing pc. 484 // 485 // Arguments pushed before the runtime call are still on the stack 486 // but the exception handler will reset the stack pointer -> 487 // ignore them. A potential result in registers can be ignored as 488 // well. 489 490 #ifdef ASSERT 491 // make sure this code is only executed if there is a pending exception 492 { 493 Label L; 494 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 495 __ cbnz(rscratch1, L); 496 __ stop("StubRoutines::forward exception: no pending exception (1)"); 497 __ bind(L); 498 } 499 #endif 500 501 // compute exception handler into r19 502 503 // call the VM to find the handler address associated with the 504 // caller address. pass thread in r0 and caller pc (ret address) 505 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 506 // the stack. 507 __ mov(c_rarg1, lr); 508 // lr will be trashed by the VM call so we move it to R19 509 // (callee-saved) because we also need to pass it to the handler 510 // returned by this call. 511 __ mov(r19, lr); 512 BLOCK_COMMENT("call exception_handler_for_return_address"); 513 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 514 SharedRuntime::exception_handler_for_return_address), 515 rthread, c_rarg1); 516 // we should not really care that lr is no longer the callee 517 // address. we saved the value the handler needs in r19 so we can 518 // just copy it to r3. however, the C2 handler will push its own 519 // frame and then calls into the VM and the VM code asserts that 520 // the PC for the frame above the handler belongs to a compiled 521 // Java method. So, we restore lr here to satisfy that assert. 522 __ mov(lr, r19); 523 // setup r0 & r3 & clear pending exception 524 __ mov(r3, r19); 525 __ mov(r19, r0); 526 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 527 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 528 529 #ifdef ASSERT 530 // make sure exception is set 531 { 532 Label L; 533 __ cbnz(r0, L); 534 __ stop("StubRoutines::forward exception: no pending exception (2)"); 535 __ bind(L); 536 } 537 #endif 538 539 // continue at exception handler 540 // r0: exception 541 // r3: throwing pc 542 // r19: exception handler 543 __ verify_oop(r0); 544 __ br(r19); 545 546 return start; 547 } 548 549 // Non-destructive plausibility checks for oops 550 // 551 // Arguments: 552 // r0: oop to verify 553 // rscratch1: error message 554 // 555 // Stack after saving c_rarg3: 556 // [tos + 0]: saved c_rarg3 557 // [tos + 1]: saved c_rarg2 558 // [tos + 2]: saved lr 559 // [tos + 3]: saved rscratch2 560 // [tos + 4]: saved r0 561 // [tos + 5]: saved rscratch1 562 address generate_verify_oop() { 563 564 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 565 address start = __ pc(); 566 567 Label exit, error; 568 569 // save c_rarg2 and c_rarg3 570 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 571 572 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 573 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 574 __ ldr(c_rarg3, Address(c_rarg2)); 575 __ add(c_rarg3, c_rarg3, 1); 576 __ str(c_rarg3, Address(c_rarg2)); 577 578 // object is in r0 579 // make sure object is 'reasonable' 580 __ cbz(r0, exit); // if obj is NULL it is OK 581 582 // Check if the oop is in the right area of memory 583 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 584 __ andr(c_rarg2, r0, c_rarg3); 585 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 586 587 // Compare c_rarg2 and c_rarg3. We don't use a compare 588 // instruction here because the flags register is live. 589 __ eor(c_rarg2, c_rarg2, c_rarg3); 590 __ cbnz(c_rarg2, error); 591 592 // make sure klass is 'reasonable', which is not zero. 593 __ load_klass(r0, r0); // get klass 594 __ cbz(r0, error); // if klass is NULL it is broken 595 596 // return if everything seems ok 597 __ bind(exit); 598 599 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 600 __ ret(lr); 601 602 // handle errors 603 __ bind(error); 604 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 605 606 __ push(RegSet::range(r0, r29), sp); 607 // debug(char* msg, int64_t pc, int64_t regs[]) 608 __ mov(c_rarg0, rscratch1); // pass address of error message 609 __ mov(c_rarg1, lr); // pass return address 610 __ mov(c_rarg2, sp); // pass address of regs on stack 611 #ifndef PRODUCT 612 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 613 #endif 614 BLOCK_COMMENT("call MacroAssembler::debug"); 615 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 616 __ blrt(rscratch1, 3, 0, 1); 617 618 return start; 619 } 620 621 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 622 623 // The inner part of zero_words(). This is the bulk operation, 624 // zeroing words in blocks, possibly using DC ZVA to do it. The 625 // caller is responsible for zeroing the last few words. 626 // 627 // Inputs: 628 // r10: the HeapWord-aligned base address of an array to zero. 629 // r11: the count in HeapWords, r11 > 0. 630 // 631 // Returns r10 and r11, adjusted for the caller to clear. 632 // r10: the base address of the tail of words left to clear. 633 // r11: the number of words in the tail. 634 // r11 < MacroAssembler::zero_words_block_size. 635 636 address generate_zero_blocks() { 637 Label store_pair, loop_store_pair, done; 638 Label base_aligned; 639 640 Register base = r10, cnt = r11; 641 642 __ align(CodeEntryAlignment); 643 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 644 address start = __ pc(); 645 646 if (UseBlockZeroing) { 647 int zva_length = VM_Version::zva_length(); 648 649 // Ensure ZVA length can be divided by 16. This is required by 650 // the subsequent operations. 651 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 652 653 __ tbz(base, 3, base_aligned); 654 __ str(zr, Address(__ post(base, 8))); 655 __ sub(cnt, cnt, 1); 656 __ bind(base_aligned); 657 658 // Ensure count >= zva_length * 2 so that it still deserves a zva after 659 // alignment. 660 Label small; 661 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 662 __ subs(rscratch1, cnt, low_limit >> 3); 663 __ br(Assembler::LT, small); 664 __ zero_dcache_blocks(base, cnt); 665 __ bind(small); 666 } 667 668 { 669 // Number of stp instructions we'll unroll 670 const int unroll = 671 MacroAssembler::zero_words_block_size / 2; 672 // Clear the remaining blocks. 673 Label loop; 674 __ subs(cnt, cnt, unroll * 2); 675 __ br(Assembler::LT, done); 676 __ bind(loop); 677 for (int i = 0; i < unroll; i++) 678 __ stp(zr, zr, __ post(base, 16)); 679 __ subs(cnt, cnt, unroll * 2); 680 __ br(Assembler::GE, loop); 681 __ bind(done); 682 __ add(cnt, cnt, unroll * 2); 683 } 684 685 __ ret(lr); 686 687 return start; 688 } 689 690 691 typedef enum { 692 copy_forwards = 1, 693 copy_backwards = -1 694 } copy_direction; 695 696 // Bulk copy of blocks of 8 words. 697 // 698 // count is a count of words. 699 // 700 // Precondition: count >= 8 701 // 702 // Postconditions: 703 // 704 // The least significant bit of count contains the remaining count 705 // of words to copy. The rest of count is trash. 706 // 707 // s and d are adjusted to point to the remaining words to copy 708 // 709 void generate_copy_longs(Label &start, Register s, Register d, Register count, 710 copy_direction direction) { 711 int unit = wordSize * direction; 712 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; 713 714 int offset; 715 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 716 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 717 const Register stride = r13; 718 719 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 720 assert_different_registers(s, d, count, rscratch1); 721 722 Label again, drain; 723 const char *stub_name; 724 if (direction == copy_forwards) 725 stub_name = "forward_copy_longs"; 726 else 727 stub_name = "backward_copy_longs"; 728 StubCodeMark mark(this, "StubRoutines", stub_name); 729 __ align(CodeEntryAlignment); 730 __ bind(start); 731 732 Label unaligned_copy_long; 733 if (AvoidUnalignedAccesses) { 734 __ tbnz(d, 3, unaligned_copy_long); 735 } 736 737 if (direction == copy_forwards) { 738 __ sub(s, s, bias); 739 __ sub(d, d, bias); 740 } 741 742 #ifdef ASSERT 743 // Make sure we are never given < 8 words 744 { 745 Label L; 746 __ cmp(count, 8); 747 __ br(Assembler::GE, L); 748 __ stop("genrate_copy_longs called with < 8 words"); 749 __ bind(L); 750 } 751 #endif 752 753 // Fill 8 registers 754 if (UseSIMDForMemoryOps) { 755 __ ldpq(v0, v1, Address(s, 4 * unit)); 756 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 757 } else { 758 __ ldp(t0, t1, Address(s, 2 * unit)); 759 __ ldp(t2, t3, Address(s, 4 * unit)); 760 __ ldp(t4, t5, Address(s, 6 * unit)); 761 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 762 } 763 764 __ subs(count, count, 16); 765 __ br(Assembler::LO, drain); 766 767 int prefetch = PrefetchCopyIntervalInBytes; 768 bool use_stride = false; 769 if (direction == copy_backwards) { 770 use_stride = prefetch > 256; 771 prefetch = -prefetch; 772 if (use_stride) __ mov(stride, prefetch); 773 } 774 775 __ bind(again); 776 777 if (PrefetchCopyIntervalInBytes > 0) 778 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 779 780 if (UseSIMDForMemoryOps) { 781 __ stpq(v0, v1, Address(d, 4 * unit)); 782 __ ldpq(v0, v1, Address(s, 4 * unit)); 783 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 784 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 785 } else { 786 __ stp(t0, t1, Address(d, 2 * unit)); 787 __ ldp(t0, t1, Address(s, 2 * unit)); 788 __ stp(t2, t3, Address(d, 4 * unit)); 789 __ ldp(t2, t3, Address(s, 4 * unit)); 790 __ stp(t4, t5, Address(d, 6 * unit)); 791 __ ldp(t4, t5, Address(s, 6 * unit)); 792 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 793 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 794 } 795 796 __ subs(count, count, 8); 797 __ br(Assembler::HS, again); 798 799 // Drain 800 __ bind(drain); 801 if (UseSIMDForMemoryOps) { 802 __ stpq(v0, v1, Address(d, 4 * unit)); 803 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 804 } else { 805 __ stp(t0, t1, Address(d, 2 * unit)); 806 __ stp(t2, t3, Address(d, 4 * unit)); 807 __ stp(t4, t5, Address(d, 6 * unit)); 808 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 809 } 810 811 { 812 Label L1, L2; 813 __ tbz(count, exact_log2(4), L1); 814 if (UseSIMDForMemoryOps) { 815 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit))); 816 __ stpq(v0, v1, Address(__ pre(d, 4 * unit))); 817 } else { 818 __ ldp(t0, t1, Address(s, 2 * unit)); 819 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 820 __ stp(t0, t1, Address(d, 2 * unit)); 821 __ stp(t2, t3, Address(__ pre(d, 4 * unit))); 822 } 823 __ bind(L1); 824 825 if (direction == copy_forwards) { 826 __ add(s, s, bias); 827 __ add(d, d, bias); 828 } 829 830 __ tbz(count, 1, L2); 831 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 832 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 833 __ bind(L2); 834 } 835 836 __ ret(lr); 837 838 if (AvoidUnalignedAccesses) { 839 Label drain, again; 840 // Register order for storing. Order is different for backward copy. 841 842 __ bind(unaligned_copy_long); 843 844 // source address is even aligned, target odd aligned 845 // 846 // when forward copying word pairs we read long pairs at offsets 847 // {0, 2, 4, 6} (in long words). when backwards copying we read 848 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 849 // address by -2 in the forwards case so we can compute the 850 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 851 // or -1. 852 // 853 // when forward copying we need to store 1 word, 3 pairs and 854 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather thna use a 855 // zero offset We adjust the destination by -1 which means we 856 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 857 // 858 // When backwards copyng we need to store 1 word, 3 pairs and 859 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 860 // offsets {1, 3, 5, 7, 8} * unit. 861 862 if (direction == copy_forwards) { 863 __ sub(s, s, 16); 864 __ sub(d, d, 8); 865 } 866 867 // Fill 8 registers 868 // 869 // for forwards copy s was offset by -16 from the original input 870 // value of s so the register contents are at these offsets 871 // relative to the 64 bit block addressed by that original input 872 // and so on for each successive 64 byte block when s is updated 873 // 874 // t0 at offset 0, t1 at offset 8 875 // t2 at offset 16, t3 at offset 24 876 // t4 at offset 32, t5 at offset 40 877 // t6 at offset 48, t7 at offset 56 878 879 // for backwards copy s was not offset so the register contents 880 // are at these offsets into the preceding 64 byte block 881 // relative to that original input and so on for each successive 882 // preceding 64 byte block when s is updated. this explains the 883 // slightly counter-intuitive looking pattern of register usage 884 // in the stp instructions for backwards copy. 885 // 886 // t0 at offset -16, t1 at offset -8 887 // t2 at offset -32, t3 at offset -24 888 // t4 at offset -48, t5 at offset -40 889 // t6 at offset -64, t7 at offset -56 890 891 __ ldp(t0, t1, Address(s, 2 * unit)); 892 __ ldp(t2, t3, Address(s, 4 * unit)); 893 __ ldp(t4, t5, Address(s, 6 * unit)); 894 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 895 896 __ subs(count, count, 16); 897 __ br(Assembler::LO, drain); 898 899 int prefetch = PrefetchCopyIntervalInBytes; 900 bool use_stride = false; 901 if (direction == copy_backwards) { 902 use_stride = prefetch > 256; 903 prefetch = -prefetch; 904 if (use_stride) __ mov(stride, prefetch); 905 } 906 907 __ bind(again); 908 909 if (PrefetchCopyIntervalInBytes > 0) 910 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 911 912 if (direction == copy_forwards) { 913 // allowing for the offset of -8 the store instructions place 914 // registers into the target 64 bit block at the following 915 // offsets 916 // 917 // t0 at offset 0 918 // t1 at offset 8, t2 at offset 16 919 // t3 at offset 24, t4 at offset 32 920 // t5 at offset 40, t6 at offset 48 921 // t7 at offset 56 922 923 __ str(t0, Address(d, 1 * unit)); 924 __ stp(t1, t2, Address(d, 2 * unit)); 925 __ ldp(t0, t1, Address(s, 2 * unit)); 926 __ stp(t3, t4, Address(d, 4 * unit)); 927 __ ldp(t2, t3, Address(s, 4 * unit)); 928 __ stp(t5, t6, Address(d, 6 * unit)); 929 __ ldp(t4, t5, Address(s, 6 * unit)); 930 __ str(t7, Address(__ pre(d, 8 * unit))); 931 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 932 } else { 933 // d was not offset when we started so the registers are 934 // written into the 64 bit block preceding d with the following 935 // offsets 936 // 937 // t1 at offset -8 938 // t3 at offset -24, t0 at offset -16 939 // t5 at offset -48, t2 at offset -32 940 // t7 at offset -56, t4 at offset -48 941 // t6 at offset -64 942 // 943 // note that this matches the offsets previously noted for the 944 // loads 945 946 __ str(t1, Address(d, 1 * unit)); 947 __ stp(t3, t0, Address(d, 3 * unit)); 948 __ ldp(t0, t1, Address(s, 2 * unit)); 949 __ stp(t5, t2, Address(d, 5 * unit)); 950 __ ldp(t2, t3, Address(s, 4 * unit)); 951 __ stp(t7, t4, Address(d, 7 * unit)); 952 __ ldp(t4, t5, Address(s, 6 * unit)); 953 __ str(t6, Address(__ pre(d, 8 * unit))); 954 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 955 } 956 957 __ subs(count, count, 8); 958 __ br(Assembler::HS, again); 959 960 // Drain 961 // 962 // this uses the same pattern of offsets and register arguments 963 // as above 964 __ bind(drain); 965 if (direction == copy_forwards) { 966 __ str(t0, Address(d, 1 * unit)); 967 __ stp(t1, t2, Address(d, 2 * unit)); 968 __ stp(t3, t4, Address(d, 4 * unit)); 969 __ stp(t5, t6, Address(d, 6 * unit)); 970 __ str(t7, Address(__ pre(d, 8 * unit))); 971 } else { 972 __ str(t1, Address(d, 1 * unit)); 973 __ stp(t3, t0, Address(d, 3 * unit)); 974 __ stp(t5, t2, Address(d, 5 * unit)); 975 __ stp(t7, t4, Address(d, 7 * unit)); 976 __ str(t6, Address(__ pre(d, 8 * unit))); 977 } 978 // now we need to copy any remaining part block which may 979 // include a 4 word block subblock and/or a 2 word subblock. 980 // bits 2 and 1 in the count are the tell-tale for whetehr we 981 // have each such subblock 982 { 983 Label L1, L2; 984 __ tbz(count, exact_log2(4), L1); 985 // this is the same as above but copying only 4 longs hence 986 // with ony one intervening stp between the str instructions 987 // but note that the offsets and registers still follow the 988 // same pattern 989 __ ldp(t0, t1, Address(s, 2 * unit)); 990 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 991 if (direction == copy_forwards) { 992 __ str(t0, Address(d, 1 * unit)); 993 __ stp(t1, t2, Address(d, 2 * unit)); 994 __ str(t3, Address(__ pre(d, 4 * unit))); 995 } else { 996 __ str(t1, Address(d, 1 * unit)); 997 __ stp(t3, t0, Address(d, 3 * unit)); 998 __ str(t2, Address(__ pre(d, 4 * unit))); 999 } 1000 __ bind(L1); 1001 1002 __ tbz(count, 1, L2); 1003 // this is the same as above but copying only 2 longs hence 1004 // there is no intervening stp between the str instructions 1005 // but note that the offset and register patterns are still 1006 // the same 1007 __ ldp(t0, t1, Address(__ pre(s, 2 * unit))); 1008 if (direction == copy_forwards) { 1009 __ str(t0, Address(d, 1 * unit)); 1010 __ str(t1, Address(__ pre(d, 2 * unit))); 1011 } else { 1012 __ str(t1, Address(d, 1 * unit)); 1013 __ str(t0, Address(__ pre(d, 2 * unit))); 1014 } 1015 __ bind(L2); 1016 1017 // for forwards copy we need to re-adjust the offsets we 1018 // applied so that s and d are follow the last words written 1019 1020 if (direction == copy_forwards) { 1021 __ add(s, s, 16); 1022 __ add(d, d, 8); 1023 } 1024 1025 } 1026 1027 __ ret(lr); 1028 } 1029 } 1030 1031 // Small copy: less than 16 bytes. 1032 // 1033 // NB: Ignores all of the bits of count which represent more than 15 1034 // bytes, so a caller doesn't have to mask them. 1035 1036 void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) { 1037 bool is_backwards = step < 0; 1038 size_t granularity = uabs(step); 1039 int direction = is_backwards ? -1 : 1; 1040 int unit = wordSize * direction; 1041 1042 Label Lpair, Lword, Lint, Lshort, Lbyte; 1043 1044 assert(granularity 1045 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1046 1047 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6; 1048 1049 // ??? I don't know if this bit-test-and-branch is the right thing 1050 // to do. It does a lot of jumping, resulting in several 1051 // mispredicted branches. It might make more sense to do this 1052 // with something like Duff's device with a single computed branch. 1053 1054 __ tbz(count, 3 - exact_log2(granularity), Lword); 1055 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1056 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1057 __ bind(Lword); 1058 1059 if (granularity <= sizeof (jint)) { 1060 __ tbz(count, 2 - exact_log2(granularity), Lint); 1061 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1062 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1063 __ bind(Lint); 1064 } 1065 1066 if (granularity <= sizeof (jshort)) { 1067 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1068 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1069 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1070 __ bind(Lshort); 1071 } 1072 1073 if (granularity <= sizeof (jbyte)) { 1074 __ tbz(count, 0, Lbyte); 1075 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1076 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1077 __ bind(Lbyte); 1078 } 1079 } 1080 1081 Label copy_f, copy_b; 1082 1083 // All-singing all-dancing memory copy. 1084 // 1085 // Copy count units of memory from s to d. The size of a unit is 1086 // step, which can be positive or negative depending on the direction 1087 // of copy. If is_aligned is false, we align the source address. 1088 // 1089 1090 void copy_memory(bool is_aligned, Register s, Register d, 1091 Register count, Register tmp, int step) { 1092 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1093 bool is_backwards = step < 0; 1094 int granularity = uabs(step); 1095 const Register t0 = r3, t1 = r4; 1096 1097 // <= 96 bytes do inline. Direction doesn't matter because we always 1098 // load all the data before writing anything 1099 Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish; 1100 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1101 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1102 const Register send = r17, dend = r18; 1103 1104 if (PrefetchCopyIntervalInBytes > 0) 1105 __ prfm(Address(s, 0), PLDL1KEEP); 1106 __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity); 1107 __ br(Assembler::HI, copy_big); 1108 1109 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1110 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1111 1112 __ cmp(count, 16/granularity); 1113 __ br(Assembler::LS, copy16); 1114 1115 __ cmp(count, 64/granularity); 1116 __ br(Assembler::HI, copy80); 1117 1118 __ cmp(count, 32/granularity); 1119 __ br(Assembler::LS, copy32); 1120 1121 // 33..64 bytes 1122 if (UseSIMDForMemoryOps) { 1123 __ ldpq(v0, v1, Address(s, 0)); 1124 __ ldpq(v2, v3, Address(send, -32)); 1125 __ stpq(v0, v1, Address(d, 0)); 1126 __ stpq(v2, v3, Address(dend, -32)); 1127 } else { 1128 __ ldp(t0, t1, Address(s, 0)); 1129 __ ldp(t2, t3, Address(s, 16)); 1130 __ ldp(t4, t5, Address(send, -32)); 1131 __ ldp(t6, t7, Address(send, -16)); 1132 1133 __ stp(t0, t1, Address(d, 0)); 1134 __ stp(t2, t3, Address(d, 16)); 1135 __ stp(t4, t5, Address(dend, -32)); 1136 __ stp(t6, t7, Address(dend, -16)); 1137 } 1138 __ b(finish); 1139 1140 // 17..32 bytes 1141 __ bind(copy32); 1142 __ ldp(t0, t1, Address(s, 0)); 1143 __ ldp(t2, t3, Address(send, -16)); 1144 __ stp(t0, t1, Address(d, 0)); 1145 __ stp(t2, t3, Address(dend, -16)); 1146 __ b(finish); 1147 1148 // 65..80/96 bytes 1149 // (96 bytes if SIMD because we do 32 byes per instruction) 1150 __ bind(copy80); 1151 if (UseSIMDForMemoryOps) { 1152 __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0)); 1153 __ ldpq(v4, v5, Address(send, -32)); 1154 __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0)); 1155 __ stpq(v4, v5, Address(dend, -32)); 1156 } else { 1157 __ ldp(t0, t1, Address(s, 0)); 1158 __ ldp(t2, t3, Address(s, 16)); 1159 __ ldp(t4, t5, Address(s, 32)); 1160 __ ldp(t6, t7, Address(s, 48)); 1161 __ ldp(t8, t9, Address(send, -16)); 1162 1163 __ stp(t0, t1, Address(d, 0)); 1164 __ stp(t2, t3, Address(d, 16)); 1165 __ stp(t4, t5, Address(d, 32)); 1166 __ stp(t6, t7, Address(d, 48)); 1167 __ stp(t8, t9, Address(dend, -16)); 1168 } 1169 __ b(finish); 1170 1171 // 0..16 bytes 1172 __ bind(copy16); 1173 __ cmp(count, 8/granularity); 1174 __ br(Assembler::LO, copy8); 1175 1176 // 8..16 bytes 1177 __ ldr(t0, Address(s, 0)); 1178 __ ldr(t1, Address(send, -8)); 1179 __ str(t0, Address(d, 0)); 1180 __ str(t1, Address(dend, -8)); 1181 __ b(finish); 1182 1183 if (granularity < 8) { 1184 // 4..7 bytes 1185 __ bind(copy8); 1186 __ tbz(count, 2 - exact_log2(granularity), copy4); 1187 __ ldrw(t0, Address(s, 0)); 1188 __ ldrw(t1, Address(send, -4)); 1189 __ strw(t0, Address(d, 0)); 1190 __ strw(t1, Address(dend, -4)); 1191 __ b(finish); 1192 if (granularity < 4) { 1193 // 0..3 bytes 1194 __ bind(copy4); 1195 __ cbz(count, finish); // get rid of 0 case 1196 if (granularity == 2) { 1197 __ ldrh(t0, Address(s, 0)); 1198 __ strh(t0, Address(d, 0)); 1199 } else { // granularity == 1 1200 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1201 // the first and last byte. 1202 // Handle the 3 byte case by loading and storing base + count/2 1203 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1204 // This does means in the 1 byte case we load/store the same 1205 // byte 3 times. 1206 __ lsr(count, count, 1); 1207 __ ldrb(t0, Address(s, 0)); 1208 __ ldrb(t1, Address(send, -1)); 1209 __ ldrb(t2, Address(s, count)); 1210 __ strb(t0, Address(d, 0)); 1211 __ strb(t1, Address(dend, -1)); 1212 __ strb(t2, Address(d, count)); 1213 } 1214 __ b(finish); 1215 } 1216 } 1217 1218 __ bind(copy_big); 1219 if (is_backwards) { 1220 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1221 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1222 } 1223 1224 // Now we've got the small case out of the way we can align the 1225 // source address on a 2-word boundary. 1226 1227 Label aligned; 1228 1229 if (is_aligned) { 1230 // We may have to adjust by 1 word to get s 2-word-aligned. 1231 __ tbz(s, exact_log2(wordSize), aligned); 1232 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1233 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1234 __ sub(count, count, wordSize/granularity); 1235 } else { 1236 if (is_backwards) { 1237 __ andr(rscratch2, s, 2 * wordSize - 1); 1238 } else { 1239 __ neg(rscratch2, s); 1240 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1241 } 1242 // rscratch2 is the byte adjustment needed to align s. 1243 __ cbz(rscratch2, aligned); 1244 int shift = exact_log2(granularity); 1245 if (shift) __ lsr(rscratch2, rscratch2, shift); 1246 __ sub(count, count, rscratch2); 1247 1248 #if 0 1249 // ?? This code is only correct for a disjoint copy. It may or 1250 // may not make sense to use it in that case. 1251 1252 // Copy the first pair; s and d may not be aligned. 1253 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1254 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1255 1256 // Align s and d, adjust count 1257 if (is_backwards) { 1258 __ sub(s, s, rscratch2); 1259 __ sub(d, d, rscratch2); 1260 } else { 1261 __ add(s, s, rscratch2); 1262 __ add(d, d, rscratch2); 1263 } 1264 #else 1265 copy_memory_small(s, d, rscratch2, rscratch1, step); 1266 #endif 1267 } 1268 1269 __ bind(aligned); 1270 1271 // s is now 2-word-aligned. 1272 1273 // We have a count of units and some trailing bytes. Adjust the 1274 // count and do a bulk copy of words. 1275 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1276 if (direction == copy_forwards) 1277 __ bl(copy_f); 1278 else 1279 __ bl(copy_b); 1280 1281 // And the tail. 1282 copy_memory_small(s, d, count, tmp, step); 1283 1284 if (granularity >= 8) __ bind(copy8); 1285 if (granularity >= 4) __ bind(copy4); 1286 __ bind(finish); 1287 } 1288 1289 1290 void clobber_registers() { 1291 #ifdef ASSERT 1292 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1293 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1294 for (Register r = r3; r <= r18; r++) 1295 if (r != rscratch1) __ mov(r, rscratch1); 1296 #endif 1297 } 1298 1299 // Scan over array at a for count oops, verifying each one. 1300 // Preserves a and count, clobbers rscratch1 and rscratch2. 1301 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1302 Label loop, end; 1303 __ mov(rscratch1, a); 1304 __ mov(rscratch2, zr); 1305 __ bind(loop); 1306 __ cmp(rscratch2, count); 1307 __ br(Assembler::HS, end); 1308 if (size == (size_t)wordSize) { 1309 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1310 __ verify_oop(temp); 1311 } else { 1312 __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1313 __ decode_heap_oop(temp); // calls verify_oop 1314 } 1315 __ add(rscratch2, rscratch2, size); 1316 __ b(loop); 1317 __ bind(end); 1318 } 1319 1320 // Arguments: 1321 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1322 // ignored 1323 // is_oop - true => oop array, so generate store check code 1324 // name - stub name string 1325 // 1326 // Inputs: 1327 // c_rarg0 - source array address 1328 // c_rarg1 - destination array address 1329 // c_rarg2 - element count, treated as ssize_t, can be zero 1330 // 1331 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1332 // the hardware handle it. The two dwords within qwords that span 1333 // cache line boundaries will still be loaded and stored atomicly. 1334 // 1335 // Side Effects: 1336 // disjoint_int_copy_entry is set to the no-overlap entry point 1337 // used by generate_conjoint_int_oop_copy(). 1338 // 1339 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry, 1340 const char *name, bool dest_uninitialized = false) { 1341 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1342 RegSet saved_reg = RegSet::of(s, d, count); 1343 __ align(CodeEntryAlignment); 1344 StubCodeMark mark(this, "StubRoutines", name); 1345 address start = __ pc(); 1346 __ enter(); 1347 1348 if (entry != NULL) { 1349 *entry = __ pc(); 1350 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1351 BLOCK_COMMENT("Entry:"); 1352 } 1353 1354 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1355 if (dest_uninitialized) { 1356 decorators |= IS_DEST_UNINITIALIZED; 1357 } 1358 if (aligned) { 1359 decorators |= ARRAYCOPY_ALIGNED; 1360 } 1361 1362 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1363 bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg); 1364 1365 if (is_oop) { 1366 // save regs before copy_memory 1367 __ push(RegSet::of(d, count), sp); 1368 } 1369 copy_memory(aligned, s, d, count, rscratch1, size); 1370 1371 if (is_oop) { 1372 __ pop(RegSet::of(d, count), sp); 1373 if (VerifyOops) 1374 verify_oop_array(size, d, count, r16); 1375 __ sub(count, count, 1); // make an inclusive end pointer 1376 __ lea(count, Address(d, count, Address::lsl(exact_log2(size)))); 1377 } 1378 1379 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1380 1381 __ leave(); 1382 __ mov(r0, zr); // return 0 1383 __ ret(lr); 1384 #ifdef BUILTIN_SIM 1385 { 1386 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1387 sim->notifyCompile(const_cast<char*>(name), start); 1388 } 1389 #endif 1390 return start; 1391 } 1392 1393 // Arguments: 1394 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1395 // ignored 1396 // is_oop - true => oop array, so generate store check code 1397 // name - stub name string 1398 // 1399 // Inputs: 1400 // c_rarg0 - source array address 1401 // c_rarg1 - destination array address 1402 // c_rarg2 - element count, treated as ssize_t, can be zero 1403 // 1404 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1405 // the hardware handle it. The two dwords within qwords that span 1406 // cache line boundaries will still be loaded and stored atomicly. 1407 // 1408 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, 1409 address *entry, const char *name, 1410 bool dest_uninitialized = false) { 1411 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1412 RegSet saved_regs = RegSet::of(s, d, count); 1413 StubCodeMark mark(this, "StubRoutines", name); 1414 address start = __ pc(); 1415 __ enter(); 1416 1417 if (entry != NULL) { 1418 *entry = __ pc(); 1419 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1420 BLOCK_COMMENT("Entry:"); 1421 } 1422 1423 // use fwd copy when (d-s) above_equal (count*size) 1424 __ sub(rscratch1, d, s); 1425 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1426 __ br(Assembler::HS, nooverlap_target); 1427 1428 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1429 if (dest_uninitialized) { 1430 decorators |= IS_DEST_UNINITIALIZED; 1431 } 1432 if (aligned) { 1433 decorators |= ARRAYCOPY_ALIGNED; 1434 } 1435 1436 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1437 bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs); 1438 1439 if (is_oop) { 1440 // save regs before copy_memory 1441 __ push(RegSet::of(d, count), sp); 1442 } 1443 copy_memory(aligned, s, d, count, rscratch1, -size); 1444 if (is_oop) { 1445 __ pop(RegSet::of(d, count), sp); 1446 if (VerifyOops) 1447 verify_oop_array(size, d, count, r16); 1448 __ sub(count, count, 1); // make an inclusive end pointer 1449 __ lea(count, Address(d, count, Address::lsl(exact_log2(size)))); 1450 } 1451 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1452 __ leave(); 1453 __ mov(r0, zr); // return 0 1454 __ ret(lr); 1455 #ifdef BUILTIN_SIM 1456 { 1457 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1458 sim->notifyCompile(const_cast<char*>(name), start); 1459 } 1460 #endif 1461 return start; 1462 } 1463 1464 // Arguments: 1465 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1466 // ignored 1467 // name - stub name string 1468 // 1469 // Inputs: 1470 // c_rarg0 - source array address 1471 // c_rarg1 - destination array address 1472 // c_rarg2 - element count, treated as ssize_t, can be zero 1473 // 1474 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1475 // we let the hardware handle it. The one to eight bytes within words, 1476 // dwords or qwords that span cache line boundaries will still be loaded 1477 // and stored atomically. 1478 // 1479 // Side Effects: 1480 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1481 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1482 // we let the hardware handle it. The one to eight bytes within words, 1483 // dwords or qwords that span cache line boundaries will still be loaded 1484 // and stored atomically. 1485 // 1486 // Side Effects: 1487 // disjoint_byte_copy_entry is set to the no-overlap entry point 1488 // used by generate_conjoint_byte_copy(). 1489 // 1490 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1491 const bool not_oop = false; 1492 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1493 } 1494 1495 // Arguments: 1496 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1497 // ignored 1498 // name - stub name string 1499 // 1500 // Inputs: 1501 // c_rarg0 - source array address 1502 // c_rarg1 - destination array address 1503 // c_rarg2 - element count, treated as ssize_t, can be zero 1504 // 1505 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1506 // we let the hardware handle it. The one to eight bytes within words, 1507 // dwords or qwords that span cache line boundaries will still be loaded 1508 // and stored atomically. 1509 // 1510 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1511 address* entry, const char *name) { 1512 const bool not_oop = false; 1513 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1514 } 1515 1516 // Arguments: 1517 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1518 // ignored 1519 // name - stub name string 1520 // 1521 // Inputs: 1522 // c_rarg0 - source array address 1523 // c_rarg1 - destination array address 1524 // c_rarg2 - element count, treated as ssize_t, can be zero 1525 // 1526 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1527 // let the hardware handle it. The two or four words within dwords 1528 // or qwords that span cache line boundaries will still be loaded 1529 // and stored atomically. 1530 // 1531 // Side Effects: 1532 // disjoint_short_copy_entry is set to the no-overlap entry point 1533 // used by generate_conjoint_short_copy(). 1534 // 1535 address generate_disjoint_short_copy(bool aligned, 1536 address* entry, const char *name) { 1537 const bool not_oop = false; 1538 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1539 } 1540 1541 // Arguments: 1542 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1543 // ignored 1544 // name - stub name string 1545 // 1546 // Inputs: 1547 // c_rarg0 - source array address 1548 // c_rarg1 - destination array address 1549 // c_rarg2 - element count, treated as ssize_t, can be zero 1550 // 1551 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1552 // let the hardware handle it. The two or four words within dwords 1553 // or qwords that span cache line boundaries will still be loaded 1554 // and stored atomically. 1555 // 1556 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1557 address *entry, const char *name) { 1558 const bool not_oop = false; 1559 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1560 1561 } 1562 // Arguments: 1563 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1564 // ignored 1565 // name - stub name string 1566 // 1567 // Inputs: 1568 // c_rarg0 - source array address 1569 // c_rarg1 - destination array address 1570 // c_rarg2 - element count, treated as ssize_t, can be zero 1571 // 1572 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1573 // the hardware handle it. The two dwords within qwords that span 1574 // cache line boundaries will still be loaded and stored atomicly. 1575 // 1576 // Side Effects: 1577 // disjoint_int_copy_entry is set to the no-overlap entry point 1578 // used by generate_conjoint_int_oop_copy(). 1579 // 1580 address generate_disjoint_int_copy(bool aligned, address *entry, 1581 const char *name, bool dest_uninitialized = false) { 1582 const bool not_oop = false; 1583 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1584 } 1585 1586 // Arguments: 1587 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1588 // ignored 1589 // name - stub name string 1590 // 1591 // Inputs: 1592 // c_rarg0 - source array address 1593 // c_rarg1 - destination array address 1594 // c_rarg2 - element count, treated as ssize_t, can be zero 1595 // 1596 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1597 // the hardware handle it. The two dwords within qwords that span 1598 // cache line boundaries will still be loaded and stored atomicly. 1599 // 1600 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1601 address *entry, const char *name, 1602 bool dest_uninitialized = false) { 1603 const bool not_oop = false; 1604 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1605 } 1606 1607 1608 // Arguments: 1609 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1610 // ignored 1611 // name - stub name string 1612 // 1613 // Inputs: 1614 // c_rarg0 - source array address 1615 // c_rarg1 - destination array address 1616 // c_rarg2 - element count, treated as size_t, can be zero 1617 // 1618 // Side Effects: 1619 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1620 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1621 // 1622 address generate_disjoint_long_copy(bool aligned, address *entry, 1623 const char *name, bool dest_uninitialized = false) { 1624 const bool not_oop = false; 1625 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1626 } 1627 1628 // Arguments: 1629 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1630 // ignored 1631 // name - stub name string 1632 // 1633 // Inputs: 1634 // c_rarg0 - source array address 1635 // c_rarg1 - destination array address 1636 // c_rarg2 - element count, treated as size_t, can be zero 1637 // 1638 address generate_conjoint_long_copy(bool aligned, 1639 address nooverlap_target, address *entry, 1640 const char *name, bool dest_uninitialized = false) { 1641 const bool not_oop = false; 1642 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1643 } 1644 1645 // Arguments: 1646 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1647 // ignored 1648 // name - stub name string 1649 // 1650 // Inputs: 1651 // c_rarg0 - source array address 1652 // c_rarg1 - destination array address 1653 // c_rarg2 - element count, treated as size_t, can be zero 1654 // 1655 // Side Effects: 1656 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1657 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1658 // 1659 address generate_disjoint_oop_copy(bool aligned, address *entry, 1660 const char *name, bool dest_uninitialized) { 1661 const bool is_oop = true; 1662 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1663 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1664 } 1665 1666 // Arguments: 1667 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1668 // ignored 1669 // name - stub name string 1670 // 1671 // Inputs: 1672 // c_rarg0 - source array address 1673 // c_rarg1 - destination array address 1674 // c_rarg2 - element count, treated as size_t, can be zero 1675 // 1676 address generate_conjoint_oop_copy(bool aligned, 1677 address nooverlap_target, address *entry, 1678 const char *name, bool dest_uninitialized) { 1679 const bool is_oop = true; 1680 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1681 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1682 name, dest_uninitialized); 1683 } 1684 1685 1686 // Helper for generating a dynamic type check. 1687 // Smashes rscratch1. 1688 void generate_type_check(Register sub_klass, 1689 Register super_check_offset, 1690 Register super_klass, 1691 Label& L_success) { 1692 assert_different_registers(sub_klass, super_check_offset, super_klass); 1693 1694 BLOCK_COMMENT("type_check:"); 1695 1696 Label L_miss; 1697 1698 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1699 super_check_offset); 1700 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1701 1702 // Fall through on failure! 1703 __ BIND(L_miss); 1704 } 1705 1706 // 1707 // Generate checkcasting array copy stub 1708 // 1709 // Input: 1710 // c_rarg0 - source array address 1711 // c_rarg1 - destination array address 1712 // c_rarg2 - element count, treated as ssize_t, can be zero 1713 // c_rarg3 - size_t ckoff (super_check_offset) 1714 // c_rarg4 - oop ckval (super_klass) 1715 // 1716 // Output: 1717 // r0 == 0 - success 1718 // r0 == -1^K - failure, where K is partial transfer count 1719 // 1720 address generate_checkcast_copy(const char *name, address *entry, 1721 bool dest_uninitialized = false) { 1722 1723 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1724 1725 // Input registers (after setup_arg_regs) 1726 const Register from = c_rarg0; // source array address 1727 const Register to = c_rarg1; // destination array address 1728 const Register count = c_rarg2; // elementscount 1729 const Register ckoff = c_rarg3; // super_check_offset 1730 const Register ckval = c_rarg4; // super_klass 1731 1732 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1733 RegSet wb_post_saved_regs = RegSet::of(count); 1734 1735 // Registers used as temps (r18, r19, r20 are save-on-entry) 1736 const Register count_save = r21; // orig elementscount 1737 const Register start_to = r20; // destination array start address 1738 const Register copied_oop = r18; // actual oop copied 1739 const Register r19_klass = r19; // oop._klass 1740 1741 //--------------------------------------------------------------- 1742 // Assembler stub will be used for this call to arraycopy 1743 // if the two arrays are subtypes of Object[] but the 1744 // destination array type is not equal to or a supertype 1745 // of the source type. Each element must be separately 1746 // checked. 1747 1748 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1749 copied_oop, r19_klass, count_save); 1750 1751 __ align(CodeEntryAlignment); 1752 StubCodeMark mark(this, "StubRoutines", name); 1753 address start = __ pc(); 1754 1755 __ enter(); // required for proper stackwalking of RuntimeStub frame 1756 1757 #ifdef ASSERT 1758 // caller guarantees that the arrays really are different 1759 // otherwise, we would have to make conjoint checks 1760 { Label L; 1761 array_overlap_test(L, TIMES_OOP); 1762 __ stop("checkcast_copy within a single array"); 1763 __ bind(L); 1764 } 1765 #endif //ASSERT 1766 1767 // Caller of this entry point must set up the argument registers. 1768 if (entry != NULL) { 1769 *entry = __ pc(); 1770 BLOCK_COMMENT("Entry:"); 1771 } 1772 1773 // Empty array: Nothing to do. 1774 __ cbz(count, L_done); 1775 1776 __ push(RegSet::of(r18, r19, r20, r21), sp); 1777 1778 #ifdef ASSERT 1779 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1780 // The ckoff and ckval must be mutually consistent, 1781 // even though caller generates both. 1782 { Label L; 1783 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1784 __ ldrw(start_to, Address(ckval, sco_offset)); 1785 __ cmpw(ckoff, start_to); 1786 __ br(Assembler::EQ, L); 1787 __ stop("super_check_offset inconsistent"); 1788 __ bind(L); 1789 } 1790 #endif //ASSERT 1791 1792 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 1793 bool is_oop = true; 1794 if (dest_uninitialized) { 1795 decorators |= IS_DEST_UNINITIALIZED; 1796 } 1797 1798 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1799 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1800 1801 // save the original count 1802 __ mov(count_save, count); 1803 1804 // Copy from low to high addresses 1805 __ mov(start_to, to); // Save destination array start address 1806 __ b(L_load_element); 1807 1808 // ======== begin loop ======== 1809 // (Loop is rotated; its entry is L_load_element.) 1810 // Loop control: 1811 // for (; count != 0; count--) { 1812 // copied_oop = load_heap_oop(from++); 1813 // ... generate_type_check ...; 1814 // store_heap_oop(to++, copied_oop); 1815 // } 1816 __ align(OptoLoopAlignment); 1817 1818 __ BIND(L_store_element); 1819 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1820 __ sub(count, count, 1); 1821 __ cbz(count, L_do_card_marks); 1822 1823 // ======== loop entry is here ======== 1824 __ BIND(L_load_element); 1825 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1826 __ cbz(copied_oop, L_store_element); 1827 1828 __ load_klass(r19_klass, copied_oop);// query the object klass 1829 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1830 // ======== end loop ======== 1831 1832 // It was a real error; we must depend on the caller to finish the job. 1833 // Register count = remaining oops, count_orig = total oops. 1834 // Emit GC store barriers for the oops we have copied and report 1835 // their number to the caller. 1836 1837 __ subs(count, count_save, count); // K = partially copied oop count 1838 __ eon(count, count, zr); // report (-1^K) to caller 1839 __ br(Assembler::EQ, L_done_pop); 1840 1841 __ BIND(L_do_card_marks); 1842 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1843 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs); 1844 1845 __ bind(L_done_pop); 1846 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1847 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1848 1849 __ bind(L_done); 1850 __ mov(r0, count); 1851 __ leave(); 1852 __ ret(lr); 1853 1854 return start; 1855 } 1856 1857 // Perform range checks on the proposed arraycopy. 1858 // Kills temp, but nothing else. 1859 // Also, clean the sign bits of src_pos and dst_pos. 1860 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1861 Register src_pos, // source position (c_rarg1) 1862 Register dst, // destination array oo (c_rarg2) 1863 Register dst_pos, // destination position (c_rarg3) 1864 Register length, 1865 Register temp, 1866 Label& L_failed) { 1867 BLOCK_COMMENT("arraycopy_range_checks:"); 1868 1869 assert_different_registers(rscratch1, temp); 1870 1871 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1872 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1873 __ addw(temp, length, src_pos); 1874 __ cmpw(temp, rscratch1); 1875 __ br(Assembler::HI, L_failed); 1876 1877 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1878 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1879 __ addw(temp, length, dst_pos); 1880 __ cmpw(temp, rscratch1); 1881 __ br(Assembler::HI, L_failed); 1882 1883 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1884 __ movw(src_pos, src_pos); 1885 __ movw(dst_pos, dst_pos); 1886 1887 BLOCK_COMMENT("arraycopy_range_checks done"); 1888 } 1889 1890 // These stubs get called from some dumb test routine. 1891 // I'll write them properly when they're called from 1892 // something that's actually doing something. 1893 static void fake_arraycopy_stub(address src, address dst, int count) { 1894 assert(count == 0, "huh?"); 1895 } 1896 1897 1898 // 1899 // Generate 'unsafe' array copy stub 1900 // Though just as safe as the other stubs, it takes an unscaled 1901 // size_t argument instead of an element count. 1902 // 1903 // Input: 1904 // c_rarg0 - source array address 1905 // c_rarg1 - destination array address 1906 // c_rarg2 - byte count, treated as ssize_t, can be zero 1907 // 1908 // Examines the alignment of the operands and dispatches 1909 // to a long, int, short, or byte copy loop. 1910 // 1911 address generate_unsafe_copy(const char *name, 1912 address byte_copy_entry, 1913 address short_copy_entry, 1914 address int_copy_entry, 1915 address long_copy_entry) { 1916 Label L_long_aligned, L_int_aligned, L_short_aligned; 1917 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1918 1919 __ align(CodeEntryAlignment); 1920 StubCodeMark mark(this, "StubRoutines", name); 1921 address start = __ pc(); 1922 __ enter(); // required for proper stackwalking of RuntimeStub frame 1923 1924 // bump this on entry, not on exit: 1925 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1926 1927 __ orr(rscratch1, s, d); 1928 __ orr(rscratch1, rscratch1, count); 1929 1930 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1931 __ cbz(rscratch1, L_long_aligned); 1932 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1933 __ cbz(rscratch1, L_int_aligned); 1934 __ tbz(rscratch1, 0, L_short_aligned); 1935 __ b(RuntimeAddress(byte_copy_entry)); 1936 1937 __ BIND(L_short_aligned); 1938 __ lsr(count, count, LogBytesPerShort); // size => short_count 1939 __ b(RuntimeAddress(short_copy_entry)); 1940 __ BIND(L_int_aligned); 1941 __ lsr(count, count, LogBytesPerInt); // size => int_count 1942 __ b(RuntimeAddress(int_copy_entry)); 1943 __ BIND(L_long_aligned); 1944 __ lsr(count, count, LogBytesPerLong); // size => long_count 1945 __ b(RuntimeAddress(long_copy_entry)); 1946 1947 return start; 1948 } 1949 1950 // 1951 // Generate generic array copy stubs 1952 // 1953 // Input: 1954 // c_rarg0 - src oop 1955 // c_rarg1 - src_pos (32-bits) 1956 // c_rarg2 - dst oop 1957 // c_rarg3 - dst_pos (32-bits) 1958 // c_rarg4 - element count (32-bits) 1959 // 1960 // Output: 1961 // r0 == 0 - success 1962 // r0 == -1^K - failure, where K is partial transfer count 1963 // 1964 address generate_generic_copy(const char *name, 1965 address byte_copy_entry, address short_copy_entry, 1966 address int_copy_entry, address oop_copy_entry, 1967 address long_copy_entry, address checkcast_copy_entry) { 1968 1969 Label L_failed, L_failed_0, L_objArray; 1970 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1971 1972 // Input registers 1973 const Register src = c_rarg0; // source array oop 1974 const Register src_pos = c_rarg1; // source position 1975 const Register dst = c_rarg2; // destination array oop 1976 const Register dst_pos = c_rarg3; // destination position 1977 const Register length = c_rarg4; 1978 1979 StubCodeMark mark(this, "StubRoutines", name); 1980 1981 __ align(CodeEntryAlignment); 1982 address start = __ pc(); 1983 1984 __ enter(); // required for proper stackwalking of RuntimeStub frame 1985 1986 // bump this on entry, not on exit: 1987 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1988 1989 //----------------------------------------------------------------------- 1990 // Assembler stub will be used for this call to arraycopy 1991 // if the following conditions are met: 1992 // 1993 // (1) src and dst must not be null. 1994 // (2) src_pos must not be negative. 1995 // (3) dst_pos must not be negative. 1996 // (4) length must not be negative. 1997 // (5) src klass and dst klass should be the same and not NULL. 1998 // (6) src and dst should be arrays. 1999 // (7) src_pos + length must not exceed length of src. 2000 // (8) dst_pos + length must not exceed length of dst. 2001 // 2002 2003 // if (src == NULL) return -1; 2004 __ cbz(src, L_failed); 2005 2006 // if (src_pos < 0) return -1; 2007 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2008 2009 // if (dst == NULL) return -1; 2010 __ cbz(dst, L_failed); 2011 2012 // if (dst_pos < 0) return -1; 2013 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2014 2015 // registers used as temp 2016 const Register scratch_length = r16; // elements count to copy 2017 const Register scratch_src_klass = r17; // array klass 2018 const Register lh = r18; // layout helper 2019 2020 // if (length < 0) return -1; 2021 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2022 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2023 2024 __ load_klass(scratch_src_klass, src); 2025 #ifdef ASSERT 2026 // assert(src->klass() != NULL); 2027 { 2028 BLOCK_COMMENT("assert klasses not null {"); 2029 Label L1, L2; 2030 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2031 __ bind(L1); 2032 __ stop("broken null klass"); 2033 __ bind(L2); 2034 __ load_klass(rscratch1, dst); 2035 __ cbz(rscratch1, L1); // this would be broken also 2036 BLOCK_COMMENT("} assert klasses not null done"); 2037 } 2038 #endif 2039 2040 // Load layout helper (32-bits) 2041 // 2042 // |array_tag| | header_size | element_type | |log2_element_size| 2043 // 32 30 24 16 8 2 0 2044 // 2045 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2046 // 2047 2048 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2049 2050 // Handle objArrays completely differently... 2051 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2052 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2053 __ movw(rscratch1, objArray_lh); 2054 __ eorw(rscratch2, lh, rscratch1); 2055 __ cbzw(rscratch2, L_objArray); 2056 2057 // if (src->klass() != dst->klass()) return -1; 2058 __ load_klass(rscratch2, dst); 2059 __ eor(rscratch2, rscratch2, scratch_src_klass); 2060 __ cbnz(rscratch2, L_failed); 2061 2062 // if (!src->is_Array()) return -1; 2063 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2064 2065 // At this point, it is known to be a typeArray (array_tag 0x3). 2066 #ifdef ASSERT 2067 { 2068 BLOCK_COMMENT("assert primitive array {"); 2069 Label L; 2070 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2071 __ cmpw(lh, rscratch2); 2072 __ br(Assembler::GE, L); 2073 __ stop("must be a primitive array"); 2074 __ bind(L); 2075 BLOCK_COMMENT("} assert primitive array done"); 2076 } 2077 #endif 2078 2079 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2080 rscratch2, L_failed); 2081 2082 // TypeArrayKlass 2083 // 2084 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2085 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2086 // 2087 2088 const Register rscratch1_offset = rscratch1; // array offset 2089 const Register r18_elsize = lh; // element size 2090 2091 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2092 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2093 __ add(src, src, rscratch1_offset); // src array offset 2094 __ add(dst, dst, rscratch1_offset); // dst array offset 2095 BLOCK_COMMENT("choose copy loop based on element size"); 2096 2097 // next registers should be set before the jump to corresponding stub 2098 const Register from = c_rarg0; // source array address 2099 const Register to = c_rarg1; // destination array address 2100 const Register count = c_rarg2; // elements count 2101 2102 // 'from', 'to', 'count' registers should be set in such order 2103 // since they are the same as 'src', 'src_pos', 'dst'. 2104 2105 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2106 2107 // The possible values of elsize are 0-3, i.e. exact_log2(element 2108 // size in bytes). We do a simple bitwise binary search. 2109 __ BIND(L_copy_bytes); 2110 __ tbnz(r18_elsize, 1, L_copy_ints); 2111 __ tbnz(r18_elsize, 0, L_copy_shorts); 2112 __ lea(from, Address(src, src_pos));// src_addr 2113 __ lea(to, Address(dst, dst_pos));// dst_addr 2114 __ movw(count, scratch_length); // length 2115 __ b(RuntimeAddress(byte_copy_entry)); 2116 2117 __ BIND(L_copy_shorts); 2118 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2119 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2120 __ movw(count, scratch_length); // length 2121 __ b(RuntimeAddress(short_copy_entry)); 2122 2123 __ BIND(L_copy_ints); 2124 __ tbnz(r18_elsize, 0, L_copy_longs); 2125 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2126 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2127 __ movw(count, scratch_length); // length 2128 __ b(RuntimeAddress(int_copy_entry)); 2129 2130 __ BIND(L_copy_longs); 2131 #ifdef ASSERT 2132 { 2133 BLOCK_COMMENT("assert long copy {"); 2134 Label L; 2135 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2136 __ cmpw(r18_elsize, LogBytesPerLong); 2137 __ br(Assembler::EQ, L); 2138 __ stop("must be long copy, but elsize is wrong"); 2139 __ bind(L); 2140 BLOCK_COMMENT("} assert long copy done"); 2141 } 2142 #endif 2143 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2144 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2145 __ movw(count, scratch_length); // length 2146 __ b(RuntimeAddress(long_copy_entry)); 2147 2148 // ObjArrayKlass 2149 __ BIND(L_objArray); 2150 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2151 2152 Label L_plain_copy, L_checkcast_copy; 2153 // test array classes for subtyping 2154 __ load_klass(r18, dst); 2155 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2156 __ br(Assembler::NE, L_checkcast_copy); 2157 2158 // Identically typed arrays can be copied without element-wise checks. 2159 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2160 rscratch2, L_failed); 2161 2162 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2163 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2164 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2165 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2166 __ movw(count, scratch_length); // length 2167 __ BIND(L_plain_copy); 2168 __ b(RuntimeAddress(oop_copy_entry)); 2169 2170 __ BIND(L_checkcast_copy); 2171 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2172 { 2173 // Before looking at dst.length, make sure dst is also an objArray. 2174 __ ldrw(rscratch1, Address(r18, lh_offset)); 2175 __ movw(rscratch2, objArray_lh); 2176 __ eorw(rscratch1, rscratch1, rscratch2); 2177 __ cbnzw(rscratch1, L_failed); 2178 2179 // It is safe to examine both src.length and dst.length. 2180 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2181 r18, L_failed); 2182 2183 const Register rscratch2_dst_klass = rscratch2; 2184 __ load_klass(rscratch2_dst_klass, dst); // reload 2185 2186 // Marshal the base address arguments now, freeing registers. 2187 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2188 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2189 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2190 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2191 __ movw(count, length); // length (reloaded) 2192 Register sco_temp = c_rarg3; // this register is free now 2193 assert_different_registers(from, to, count, sco_temp, 2194 rscratch2_dst_klass, scratch_src_klass); 2195 // assert_clean_int(count, sco_temp); 2196 2197 // Generate the type check. 2198 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2199 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2200 // assert_clean_int(sco_temp, r18); 2201 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2202 2203 // Fetch destination element klass from the ObjArrayKlass header. 2204 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2205 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2206 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2207 2208 // the checkcast_copy loop needs two extra arguments: 2209 assert(c_rarg3 == sco_temp, "#3 already in place"); 2210 // Set up arguments for checkcast_copy_entry. 2211 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2212 __ b(RuntimeAddress(checkcast_copy_entry)); 2213 } 2214 2215 __ BIND(L_failed); 2216 __ mov(r0, -1); 2217 __ leave(); // required for proper stackwalking of RuntimeStub frame 2218 __ ret(lr); 2219 2220 return start; 2221 } 2222 2223 // 2224 // Generate stub for array fill. If "aligned" is true, the 2225 // "to" address is assumed to be heapword aligned. 2226 // 2227 // Arguments for generated stub: 2228 // to: c_rarg0 2229 // value: c_rarg1 2230 // count: c_rarg2 treated as signed 2231 // 2232 address generate_fill(BasicType t, bool aligned, const char *name) { 2233 __ align(CodeEntryAlignment); 2234 StubCodeMark mark(this, "StubRoutines", name); 2235 address start = __ pc(); 2236 2237 BLOCK_COMMENT("Entry:"); 2238 2239 const Register to = c_rarg0; // source array address 2240 const Register value = c_rarg1; // value 2241 const Register count = c_rarg2; // elements count 2242 2243 const Register bz_base = r10; // base for block_zero routine 2244 const Register cnt_words = r11; // temp register 2245 2246 __ enter(); 2247 2248 Label L_fill_elements, L_exit1; 2249 2250 int shift = -1; 2251 switch (t) { 2252 case T_BYTE: 2253 shift = 0; 2254 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2255 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2256 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2257 __ br(Assembler::LO, L_fill_elements); 2258 break; 2259 case T_SHORT: 2260 shift = 1; 2261 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2262 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2263 __ br(Assembler::LO, L_fill_elements); 2264 break; 2265 case T_INT: 2266 shift = 2; 2267 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2268 __ br(Assembler::LO, L_fill_elements); 2269 break; 2270 default: ShouldNotReachHere(); 2271 } 2272 2273 // Align source address at 8 bytes address boundary. 2274 Label L_skip_align1, L_skip_align2, L_skip_align4; 2275 if (!aligned) { 2276 switch (t) { 2277 case T_BYTE: 2278 // One byte misalignment happens only for byte arrays. 2279 __ tbz(to, 0, L_skip_align1); 2280 __ strb(value, Address(__ post(to, 1))); 2281 __ subw(count, count, 1); 2282 __ bind(L_skip_align1); 2283 // Fallthrough 2284 case T_SHORT: 2285 // Two bytes misalignment happens only for byte and short (char) arrays. 2286 __ tbz(to, 1, L_skip_align2); 2287 __ strh(value, Address(__ post(to, 2))); 2288 __ subw(count, count, 2 >> shift); 2289 __ bind(L_skip_align2); 2290 // Fallthrough 2291 case T_INT: 2292 // Align to 8 bytes, we know we are 4 byte aligned to start. 2293 __ tbz(to, 2, L_skip_align4); 2294 __ strw(value, Address(__ post(to, 4))); 2295 __ subw(count, count, 4 >> shift); 2296 __ bind(L_skip_align4); 2297 break; 2298 default: ShouldNotReachHere(); 2299 } 2300 } 2301 2302 // 2303 // Fill large chunks 2304 // 2305 __ lsrw(cnt_words, count, 3 - shift); // number of words 2306 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2307 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2308 if (UseBlockZeroing) { 2309 Label non_block_zeroing, rest; 2310 // If the fill value is zero we can use the fast zero_words(). 2311 __ cbnz(value, non_block_zeroing); 2312 __ mov(bz_base, to); 2313 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2314 __ zero_words(bz_base, cnt_words); 2315 __ b(rest); 2316 __ bind(non_block_zeroing); 2317 __ fill_words(to, cnt_words, value); 2318 __ bind(rest); 2319 } else { 2320 __ fill_words(to, cnt_words, value); 2321 } 2322 2323 // Remaining count is less than 8 bytes. Fill it by a single store. 2324 // Note that the total length is no less than 8 bytes. 2325 if (t == T_BYTE || t == T_SHORT) { 2326 Label L_exit1; 2327 __ cbzw(count, L_exit1); 2328 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2329 __ str(value, Address(to, -8)); // overwrite some elements 2330 __ bind(L_exit1); 2331 __ leave(); 2332 __ ret(lr); 2333 } 2334 2335 // Handle copies less than 8 bytes. 2336 Label L_fill_2, L_fill_4, L_exit2; 2337 __ bind(L_fill_elements); 2338 switch (t) { 2339 case T_BYTE: 2340 __ tbz(count, 0, L_fill_2); 2341 __ strb(value, Address(__ post(to, 1))); 2342 __ bind(L_fill_2); 2343 __ tbz(count, 1, L_fill_4); 2344 __ strh(value, Address(__ post(to, 2))); 2345 __ bind(L_fill_4); 2346 __ tbz(count, 2, L_exit2); 2347 __ strw(value, Address(to)); 2348 break; 2349 case T_SHORT: 2350 __ tbz(count, 0, L_fill_4); 2351 __ strh(value, Address(__ post(to, 2))); 2352 __ bind(L_fill_4); 2353 __ tbz(count, 1, L_exit2); 2354 __ strw(value, Address(to)); 2355 break; 2356 case T_INT: 2357 __ cbzw(count, L_exit2); 2358 __ strw(value, Address(to)); 2359 break; 2360 default: ShouldNotReachHere(); 2361 } 2362 __ bind(L_exit2); 2363 __ leave(); 2364 __ ret(lr); 2365 return start; 2366 } 2367 2368 address generate_data_cache_writeback() { 2369 const Register line = c_rarg0; // address of line to write back 2370 2371 __ align(CodeEntryAlignment); 2372 2373 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2374 2375 address start = __ pc(); 2376 __ enter(); 2377 __ cache_wb(Address(line, 0)); 2378 __ leave(); 2379 __ ret(lr); 2380 2381 return start; 2382 } 2383 2384 address generate_data_cache_writeback_sync() { 2385 const Register kind = c_rarg0; // pre or post sync (unused) 2386 2387 __ align(CodeEntryAlignment); 2388 2389 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2390 2391 address start = __ pc(); 2392 __ enter(); 2393 __ cache_wbsync(); 2394 __ leave(); 2395 __ ret(lr); 2396 2397 return start; 2398 } 2399 2400 void generate_arraycopy_stubs() { 2401 address entry; 2402 address entry_jbyte_arraycopy; 2403 address entry_jshort_arraycopy; 2404 address entry_jint_arraycopy; 2405 address entry_oop_arraycopy; 2406 address entry_jlong_arraycopy; 2407 address entry_checkcast_arraycopy; 2408 2409 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2410 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2411 2412 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2413 2414 //*** jbyte 2415 // Always need aligned and unaligned versions 2416 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2417 "jbyte_disjoint_arraycopy"); 2418 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2419 &entry_jbyte_arraycopy, 2420 "jbyte_arraycopy"); 2421 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2422 "arrayof_jbyte_disjoint_arraycopy"); 2423 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2424 "arrayof_jbyte_arraycopy"); 2425 2426 //*** jshort 2427 // Always need aligned and unaligned versions 2428 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2429 "jshort_disjoint_arraycopy"); 2430 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2431 &entry_jshort_arraycopy, 2432 "jshort_arraycopy"); 2433 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2434 "arrayof_jshort_disjoint_arraycopy"); 2435 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2436 "arrayof_jshort_arraycopy"); 2437 2438 //*** jint 2439 // Aligned versions 2440 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2441 "arrayof_jint_disjoint_arraycopy"); 2442 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2443 "arrayof_jint_arraycopy"); 2444 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2445 // entry_jint_arraycopy always points to the unaligned version 2446 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2447 "jint_disjoint_arraycopy"); 2448 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2449 &entry_jint_arraycopy, 2450 "jint_arraycopy"); 2451 2452 //*** jlong 2453 // It is always aligned 2454 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2455 "arrayof_jlong_disjoint_arraycopy"); 2456 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2457 "arrayof_jlong_arraycopy"); 2458 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2459 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2460 2461 //*** oops 2462 { 2463 // With compressed oops we need unaligned versions; notice that 2464 // we overwrite entry_oop_arraycopy. 2465 bool aligned = !UseCompressedOops; 2466 2467 StubRoutines::_arrayof_oop_disjoint_arraycopy 2468 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2469 /*dest_uninitialized*/false); 2470 StubRoutines::_arrayof_oop_arraycopy 2471 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2472 /*dest_uninitialized*/false); 2473 // Aligned versions without pre-barriers 2474 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2475 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2476 /*dest_uninitialized*/true); 2477 StubRoutines::_arrayof_oop_arraycopy_uninit 2478 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2479 /*dest_uninitialized*/true); 2480 } 2481 2482 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2483 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2484 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2485 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2486 2487 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2488 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2489 /*dest_uninitialized*/true); 2490 2491 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2492 entry_jbyte_arraycopy, 2493 entry_jshort_arraycopy, 2494 entry_jint_arraycopy, 2495 entry_jlong_arraycopy); 2496 2497 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2498 entry_jbyte_arraycopy, 2499 entry_jshort_arraycopy, 2500 entry_jint_arraycopy, 2501 entry_oop_arraycopy, 2502 entry_jlong_arraycopy, 2503 entry_checkcast_arraycopy); 2504 2505 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2506 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2507 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2508 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2509 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2510 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2511 } 2512 2513 void generate_math_stubs() { Unimplemented(); } 2514 2515 // Arguments: 2516 // 2517 // Inputs: 2518 // c_rarg0 - source byte array address 2519 // c_rarg1 - destination byte array address 2520 // c_rarg2 - K (key) in little endian int array 2521 // 2522 address generate_aescrypt_encryptBlock() { 2523 __ align(CodeEntryAlignment); 2524 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2525 2526 Label L_doLast; 2527 2528 const Register from = c_rarg0; // source array address 2529 const Register to = c_rarg1; // destination array address 2530 const Register key = c_rarg2; // key array address 2531 const Register keylen = rscratch1; 2532 2533 address start = __ pc(); 2534 __ enter(); 2535 2536 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2537 2538 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2539 2540 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2541 __ rev32(v1, __ T16B, v1); 2542 __ rev32(v2, __ T16B, v2); 2543 __ rev32(v3, __ T16B, v3); 2544 __ rev32(v4, __ T16B, v4); 2545 __ aese(v0, v1); 2546 __ aesmc(v0, v0); 2547 __ aese(v0, v2); 2548 __ aesmc(v0, v0); 2549 __ aese(v0, v3); 2550 __ aesmc(v0, v0); 2551 __ aese(v0, v4); 2552 __ aesmc(v0, v0); 2553 2554 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2555 __ rev32(v1, __ T16B, v1); 2556 __ rev32(v2, __ T16B, v2); 2557 __ rev32(v3, __ T16B, v3); 2558 __ rev32(v4, __ T16B, v4); 2559 __ aese(v0, v1); 2560 __ aesmc(v0, v0); 2561 __ aese(v0, v2); 2562 __ aesmc(v0, v0); 2563 __ aese(v0, v3); 2564 __ aesmc(v0, v0); 2565 __ aese(v0, v4); 2566 __ aesmc(v0, v0); 2567 2568 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2569 __ rev32(v1, __ T16B, v1); 2570 __ rev32(v2, __ T16B, v2); 2571 2572 __ cmpw(keylen, 44); 2573 __ br(Assembler::EQ, L_doLast); 2574 2575 __ aese(v0, v1); 2576 __ aesmc(v0, v0); 2577 __ aese(v0, v2); 2578 __ aesmc(v0, v0); 2579 2580 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2581 __ rev32(v1, __ T16B, v1); 2582 __ rev32(v2, __ T16B, v2); 2583 2584 __ cmpw(keylen, 52); 2585 __ br(Assembler::EQ, L_doLast); 2586 2587 __ aese(v0, v1); 2588 __ aesmc(v0, v0); 2589 __ aese(v0, v2); 2590 __ aesmc(v0, v0); 2591 2592 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2593 __ rev32(v1, __ T16B, v1); 2594 __ rev32(v2, __ T16B, v2); 2595 2596 __ BIND(L_doLast); 2597 2598 __ aese(v0, v1); 2599 __ aesmc(v0, v0); 2600 __ aese(v0, v2); 2601 2602 __ ld1(v1, __ T16B, key); 2603 __ rev32(v1, __ T16B, v1); 2604 __ eor(v0, __ T16B, v0, v1); 2605 2606 __ st1(v0, __ T16B, to); 2607 2608 __ mov(r0, 0); 2609 2610 __ leave(); 2611 __ ret(lr); 2612 2613 return start; 2614 } 2615 2616 // Arguments: 2617 // 2618 // Inputs: 2619 // c_rarg0 - source byte array address 2620 // c_rarg1 - destination byte array address 2621 // c_rarg2 - K (key) in little endian int array 2622 // 2623 address generate_aescrypt_decryptBlock() { 2624 assert(UseAES, "need AES instructions and misaligned SSE support"); 2625 __ align(CodeEntryAlignment); 2626 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2627 Label L_doLast; 2628 2629 const Register from = c_rarg0; // source array address 2630 const Register to = c_rarg1; // destination array address 2631 const Register key = c_rarg2; // key array address 2632 const Register keylen = rscratch1; 2633 2634 address start = __ pc(); 2635 __ enter(); // required for proper stackwalking of RuntimeStub frame 2636 2637 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2638 2639 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2640 2641 __ ld1(v5, __ T16B, __ post(key, 16)); 2642 __ rev32(v5, __ T16B, v5); 2643 2644 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2645 __ rev32(v1, __ T16B, v1); 2646 __ rev32(v2, __ T16B, v2); 2647 __ rev32(v3, __ T16B, v3); 2648 __ rev32(v4, __ T16B, v4); 2649 __ aesd(v0, v1); 2650 __ aesimc(v0, v0); 2651 __ aesd(v0, v2); 2652 __ aesimc(v0, v0); 2653 __ aesd(v0, v3); 2654 __ aesimc(v0, v0); 2655 __ aesd(v0, v4); 2656 __ aesimc(v0, v0); 2657 2658 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2659 __ rev32(v1, __ T16B, v1); 2660 __ rev32(v2, __ T16B, v2); 2661 __ rev32(v3, __ T16B, v3); 2662 __ rev32(v4, __ T16B, v4); 2663 __ aesd(v0, v1); 2664 __ aesimc(v0, v0); 2665 __ aesd(v0, v2); 2666 __ aesimc(v0, v0); 2667 __ aesd(v0, v3); 2668 __ aesimc(v0, v0); 2669 __ aesd(v0, v4); 2670 __ aesimc(v0, v0); 2671 2672 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2673 __ rev32(v1, __ T16B, v1); 2674 __ rev32(v2, __ T16B, v2); 2675 2676 __ cmpw(keylen, 44); 2677 __ br(Assembler::EQ, L_doLast); 2678 2679 __ aesd(v0, v1); 2680 __ aesimc(v0, v0); 2681 __ aesd(v0, v2); 2682 __ aesimc(v0, v0); 2683 2684 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2685 __ rev32(v1, __ T16B, v1); 2686 __ rev32(v2, __ T16B, v2); 2687 2688 __ cmpw(keylen, 52); 2689 __ br(Assembler::EQ, L_doLast); 2690 2691 __ aesd(v0, v1); 2692 __ aesimc(v0, v0); 2693 __ aesd(v0, v2); 2694 __ aesimc(v0, v0); 2695 2696 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2697 __ rev32(v1, __ T16B, v1); 2698 __ rev32(v2, __ T16B, v2); 2699 2700 __ BIND(L_doLast); 2701 2702 __ aesd(v0, v1); 2703 __ aesimc(v0, v0); 2704 __ aesd(v0, v2); 2705 2706 __ eor(v0, __ T16B, v0, v5); 2707 2708 __ st1(v0, __ T16B, to); 2709 2710 __ mov(r0, 0); 2711 2712 __ leave(); 2713 __ ret(lr); 2714 2715 return start; 2716 } 2717 2718 // Arguments: 2719 // 2720 // Inputs: 2721 // c_rarg0 - source byte array address 2722 // c_rarg1 - destination byte array address 2723 // c_rarg2 - K (key) in little endian int array 2724 // c_rarg3 - r vector byte array address 2725 // c_rarg4 - input length 2726 // 2727 // Output: 2728 // x0 - input length 2729 // 2730 address generate_cipherBlockChaining_encryptAESCrypt() { 2731 assert(UseAES, "need AES instructions and misaligned SSE support"); 2732 __ align(CodeEntryAlignment); 2733 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2734 2735 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2736 2737 const Register from = c_rarg0; // source array address 2738 const Register to = c_rarg1; // destination array address 2739 const Register key = c_rarg2; // key array address 2740 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2741 // and left with the results of the last encryption block 2742 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2743 const Register keylen = rscratch1; 2744 2745 address start = __ pc(); 2746 2747 __ enter(); 2748 2749 __ movw(rscratch2, len_reg); 2750 2751 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2752 2753 __ ld1(v0, __ T16B, rvec); 2754 2755 __ cmpw(keylen, 52); 2756 __ br(Assembler::CC, L_loadkeys_44); 2757 __ br(Assembler::EQ, L_loadkeys_52); 2758 2759 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2760 __ rev32(v17, __ T16B, v17); 2761 __ rev32(v18, __ T16B, v18); 2762 __ BIND(L_loadkeys_52); 2763 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2764 __ rev32(v19, __ T16B, v19); 2765 __ rev32(v20, __ T16B, v20); 2766 __ BIND(L_loadkeys_44); 2767 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2768 __ rev32(v21, __ T16B, v21); 2769 __ rev32(v22, __ T16B, v22); 2770 __ rev32(v23, __ T16B, v23); 2771 __ rev32(v24, __ T16B, v24); 2772 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2773 __ rev32(v25, __ T16B, v25); 2774 __ rev32(v26, __ T16B, v26); 2775 __ rev32(v27, __ T16B, v27); 2776 __ rev32(v28, __ T16B, v28); 2777 __ ld1(v29, v30, v31, __ T16B, key); 2778 __ rev32(v29, __ T16B, v29); 2779 __ rev32(v30, __ T16B, v30); 2780 __ rev32(v31, __ T16B, v31); 2781 2782 __ BIND(L_aes_loop); 2783 __ ld1(v1, __ T16B, __ post(from, 16)); 2784 __ eor(v0, __ T16B, v0, v1); 2785 2786 __ br(Assembler::CC, L_rounds_44); 2787 __ br(Assembler::EQ, L_rounds_52); 2788 2789 __ aese(v0, v17); __ aesmc(v0, v0); 2790 __ aese(v0, v18); __ aesmc(v0, v0); 2791 __ BIND(L_rounds_52); 2792 __ aese(v0, v19); __ aesmc(v0, v0); 2793 __ aese(v0, v20); __ aesmc(v0, v0); 2794 __ BIND(L_rounds_44); 2795 __ aese(v0, v21); __ aesmc(v0, v0); 2796 __ aese(v0, v22); __ aesmc(v0, v0); 2797 __ aese(v0, v23); __ aesmc(v0, v0); 2798 __ aese(v0, v24); __ aesmc(v0, v0); 2799 __ aese(v0, v25); __ aesmc(v0, v0); 2800 __ aese(v0, v26); __ aesmc(v0, v0); 2801 __ aese(v0, v27); __ aesmc(v0, v0); 2802 __ aese(v0, v28); __ aesmc(v0, v0); 2803 __ aese(v0, v29); __ aesmc(v0, v0); 2804 __ aese(v0, v30); 2805 __ eor(v0, __ T16B, v0, v31); 2806 2807 __ st1(v0, __ T16B, __ post(to, 16)); 2808 2809 __ subw(len_reg, len_reg, 16); 2810 __ cbnzw(len_reg, L_aes_loop); 2811 2812 __ st1(v0, __ T16B, rvec); 2813 2814 __ mov(r0, rscratch2); 2815 2816 __ leave(); 2817 __ ret(lr); 2818 2819 return start; 2820 } 2821 2822 // Arguments: 2823 // 2824 // Inputs: 2825 // c_rarg0 - source byte array address 2826 // c_rarg1 - destination byte array address 2827 // c_rarg2 - K (key) in little endian int array 2828 // c_rarg3 - r vector byte array address 2829 // c_rarg4 - input length 2830 // 2831 // Output: 2832 // r0 - input length 2833 // 2834 address generate_cipherBlockChaining_decryptAESCrypt() { 2835 assert(UseAES, "need AES instructions and misaligned SSE support"); 2836 __ align(CodeEntryAlignment); 2837 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2838 2839 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2840 2841 const Register from = c_rarg0; // source array address 2842 const Register to = c_rarg1; // destination array address 2843 const Register key = c_rarg2; // key array address 2844 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2845 // and left with the results of the last encryption block 2846 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2847 const Register keylen = rscratch1; 2848 2849 address start = __ pc(); 2850 2851 __ enter(); 2852 2853 __ movw(rscratch2, len_reg); 2854 2855 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2856 2857 __ ld1(v2, __ T16B, rvec); 2858 2859 __ ld1(v31, __ T16B, __ post(key, 16)); 2860 __ rev32(v31, __ T16B, v31); 2861 2862 __ cmpw(keylen, 52); 2863 __ br(Assembler::CC, L_loadkeys_44); 2864 __ br(Assembler::EQ, L_loadkeys_52); 2865 2866 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2867 __ rev32(v17, __ T16B, v17); 2868 __ rev32(v18, __ T16B, v18); 2869 __ BIND(L_loadkeys_52); 2870 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2871 __ rev32(v19, __ T16B, v19); 2872 __ rev32(v20, __ T16B, v20); 2873 __ BIND(L_loadkeys_44); 2874 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2875 __ rev32(v21, __ T16B, v21); 2876 __ rev32(v22, __ T16B, v22); 2877 __ rev32(v23, __ T16B, v23); 2878 __ rev32(v24, __ T16B, v24); 2879 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2880 __ rev32(v25, __ T16B, v25); 2881 __ rev32(v26, __ T16B, v26); 2882 __ rev32(v27, __ T16B, v27); 2883 __ rev32(v28, __ T16B, v28); 2884 __ ld1(v29, v30, __ T16B, key); 2885 __ rev32(v29, __ T16B, v29); 2886 __ rev32(v30, __ T16B, v30); 2887 2888 __ BIND(L_aes_loop); 2889 __ ld1(v0, __ T16B, __ post(from, 16)); 2890 __ orr(v1, __ T16B, v0, v0); 2891 2892 __ br(Assembler::CC, L_rounds_44); 2893 __ br(Assembler::EQ, L_rounds_52); 2894 2895 __ aesd(v0, v17); __ aesimc(v0, v0); 2896 __ aesd(v0, v18); __ aesimc(v0, v0); 2897 __ BIND(L_rounds_52); 2898 __ aesd(v0, v19); __ aesimc(v0, v0); 2899 __ aesd(v0, v20); __ aesimc(v0, v0); 2900 __ BIND(L_rounds_44); 2901 __ aesd(v0, v21); __ aesimc(v0, v0); 2902 __ aesd(v0, v22); __ aesimc(v0, v0); 2903 __ aesd(v0, v23); __ aesimc(v0, v0); 2904 __ aesd(v0, v24); __ aesimc(v0, v0); 2905 __ aesd(v0, v25); __ aesimc(v0, v0); 2906 __ aesd(v0, v26); __ aesimc(v0, v0); 2907 __ aesd(v0, v27); __ aesimc(v0, v0); 2908 __ aesd(v0, v28); __ aesimc(v0, v0); 2909 __ aesd(v0, v29); __ aesimc(v0, v0); 2910 __ aesd(v0, v30); 2911 __ eor(v0, __ T16B, v0, v31); 2912 __ eor(v0, __ T16B, v0, v2); 2913 2914 __ st1(v0, __ T16B, __ post(to, 16)); 2915 __ orr(v2, __ T16B, v1, v1); 2916 2917 __ subw(len_reg, len_reg, 16); 2918 __ cbnzw(len_reg, L_aes_loop); 2919 2920 __ st1(v2, __ T16B, rvec); 2921 2922 __ mov(r0, rscratch2); 2923 2924 __ leave(); 2925 __ ret(lr); 2926 2927 return start; 2928 } 2929 2930 // Arguments: 2931 // 2932 // Inputs: 2933 // c_rarg0 - byte[] source+offset 2934 // c_rarg1 - int[] SHA.state 2935 // c_rarg2 - int offset 2936 // c_rarg3 - int limit 2937 // 2938 address generate_sha1_implCompress(bool multi_block, const char *name) { 2939 __ align(CodeEntryAlignment); 2940 StubCodeMark mark(this, "StubRoutines", name); 2941 address start = __ pc(); 2942 2943 Register buf = c_rarg0; 2944 Register state = c_rarg1; 2945 Register ofs = c_rarg2; 2946 Register limit = c_rarg3; 2947 2948 Label keys; 2949 Label sha1_loop; 2950 2951 // load the keys into v0..v3 2952 __ adr(rscratch1, keys); 2953 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2954 // load 5 words state into v6, v7 2955 __ ldrq(v6, Address(state, 0)); 2956 __ ldrs(v7, Address(state, 16)); 2957 2958 2959 __ BIND(sha1_loop); 2960 // load 64 bytes of data into v16..v19 2961 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2962 __ rev32(v16, __ T16B, v16); 2963 __ rev32(v17, __ T16B, v17); 2964 __ rev32(v18, __ T16B, v18); 2965 __ rev32(v19, __ T16B, v19); 2966 2967 // do the sha1 2968 __ addv(v4, __ T4S, v16, v0); 2969 __ orr(v20, __ T16B, v6, v6); 2970 2971 FloatRegister d0 = v16; 2972 FloatRegister d1 = v17; 2973 FloatRegister d2 = v18; 2974 FloatRegister d3 = v19; 2975 2976 for (int round = 0; round < 20; round++) { 2977 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2978 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2979 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2980 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2981 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 2982 2983 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2984 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2985 __ sha1h(tmp2, __ T4S, v20); 2986 if (round < 5) 2987 __ sha1c(v20, __ T4S, tmp3, tmp4); 2988 else if (round < 10 || round >= 15) 2989 __ sha1p(v20, __ T4S, tmp3, tmp4); 2990 else 2991 __ sha1m(v20, __ T4S, tmp3, tmp4); 2992 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2993 2994 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2995 } 2996 2997 __ addv(v7, __ T2S, v7, v21); 2998 __ addv(v6, __ T4S, v6, v20); 2999 3000 if (multi_block) { 3001 __ add(ofs, ofs, 64); 3002 __ cmp(ofs, limit); 3003 __ br(Assembler::LE, sha1_loop); 3004 __ mov(c_rarg0, ofs); // return ofs 3005 } 3006 3007 __ strq(v6, Address(state, 0)); 3008 __ strs(v7, Address(state, 16)); 3009 3010 __ ret(lr); 3011 3012 __ bind(keys); 3013 __ emit_int32(0x5a827999); 3014 __ emit_int32(0x6ed9eba1); 3015 __ emit_int32(0x8f1bbcdc); 3016 __ emit_int32(0xca62c1d6); 3017 3018 return start; 3019 } 3020 3021 3022 // Arguments: 3023 // 3024 // Inputs: 3025 // c_rarg0 - byte[] source+offset 3026 // c_rarg1 - int[] SHA.state 3027 // c_rarg2 - int offset 3028 // c_rarg3 - int limit 3029 // 3030 address generate_sha256_implCompress(bool multi_block, const char *name) { 3031 static const uint32_t round_consts[64] = { 3032 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3033 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3034 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3035 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3036 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3037 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3038 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3039 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3040 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3041 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3042 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3043 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3044 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3045 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3046 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3047 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3048 }; 3049 __ align(CodeEntryAlignment); 3050 StubCodeMark mark(this, "StubRoutines", name); 3051 address start = __ pc(); 3052 3053 Register buf = c_rarg0; 3054 Register state = c_rarg1; 3055 Register ofs = c_rarg2; 3056 Register limit = c_rarg3; 3057 3058 Label sha1_loop; 3059 3060 __ stpd(v8, v9, __ pre(sp, -32)); 3061 __ stpd(v10, v11, Address(sp, 16)); 3062 3063 // dga == v0 3064 // dgb == v1 3065 // dg0 == v2 3066 // dg1 == v3 3067 // dg2 == v4 3068 // t0 == v6 3069 // t1 == v7 3070 3071 // load 16 keys to v16..v31 3072 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3073 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3074 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3075 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3076 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3077 3078 // load 8 words (256 bits) state 3079 __ ldpq(v0, v1, state); 3080 3081 __ BIND(sha1_loop); 3082 // load 64 bytes of data into v8..v11 3083 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3084 __ rev32(v8, __ T16B, v8); 3085 __ rev32(v9, __ T16B, v9); 3086 __ rev32(v10, __ T16B, v10); 3087 __ rev32(v11, __ T16B, v11); 3088 3089 __ addv(v6, __ T4S, v8, v16); 3090 __ orr(v2, __ T16B, v0, v0); 3091 __ orr(v3, __ T16B, v1, v1); 3092 3093 FloatRegister d0 = v8; 3094 FloatRegister d1 = v9; 3095 FloatRegister d2 = v10; 3096 FloatRegister d3 = v11; 3097 3098 3099 for (int round = 0; round < 16; round++) { 3100 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3101 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3102 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3103 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3104 3105 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3106 __ orr(v4, __ T16B, v2, v2); 3107 if (round < 15) 3108 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3109 __ sha256h(v2, __ T4S, v3, tmp2); 3110 __ sha256h2(v3, __ T4S, v4, tmp2); 3111 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3112 3113 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3114 } 3115 3116 __ addv(v0, __ T4S, v0, v2); 3117 __ addv(v1, __ T4S, v1, v3); 3118 3119 if (multi_block) { 3120 __ add(ofs, ofs, 64); 3121 __ cmp(ofs, limit); 3122 __ br(Assembler::LE, sha1_loop); 3123 __ mov(c_rarg0, ofs); // return ofs 3124 } 3125 3126 __ ldpd(v10, v11, Address(sp, 16)); 3127 __ ldpd(v8, v9, __ post(sp, 32)); 3128 3129 __ stpq(v0, v1, state); 3130 3131 __ ret(lr); 3132 3133 return start; 3134 } 3135 3136 #ifndef BUILTIN_SIM 3137 // Safefetch stubs. 3138 void generate_safefetch(const char* name, int size, address* entry, 3139 address* fault_pc, address* continuation_pc) { 3140 // safefetch signatures: 3141 // int SafeFetch32(int* adr, int errValue); 3142 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3143 // 3144 // arguments: 3145 // c_rarg0 = adr 3146 // c_rarg1 = errValue 3147 // 3148 // result: 3149 // PPC_RET = *adr or errValue 3150 3151 StubCodeMark mark(this, "StubRoutines", name); 3152 3153 // Entry point, pc or function descriptor. 3154 *entry = __ pc(); 3155 3156 // Load *adr into c_rarg1, may fault. 3157 *fault_pc = __ pc(); 3158 switch (size) { 3159 case 4: 3160 // int32_t 3161 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3162 break; 3163 case 8: 3164 // int64_t 3165 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3166 break; 3167 default: 3168 ShouldNotReachHere(); 3169 } 3170 3171 // return errValue or *adr 3172 *continuation_pc = __ pc(); 3173 __ mov(r0, c_rarg1); 3174 __ ret(lr); 3175 } 3176 #endif 3177 3178 /** 3179 * Arguments: 3180 * 3181 * Inputs: 3182 * c_rarg0 - int crc 3183 * c_rarg1 - byte* buf 3184 * c_rarg2 - int length 3185 * 3186 * Ouput: 3187 * rax - int crc result 3188 */ 3189 address generate_updateBytesCRC32() { 3190 assert(UseCRC32Intrinsics, "what are we doing here?"); 3191 3192 __ align(CodeEntryAlignment); 3193 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3194 3195 address start = __ pc(); 3196 3197 const Register crc = c_rarg0; // crc 3198 const Register buf = c_rarg1; // source java byte array address 3199 const Register len = c_rarg2; // length 3200 const Register table0 = c_rarg3; // crc_table address 3201 const Register table1 = c_rarg4; 3202 const Register table2 = c_rarg5; 3203 const Register table3 = c_rarg6; 3204 const Register tmp3 = c_rarg7; 3205 3206 BLOCK_COMMENT("Entry:"); 3207 __ enter(); // required for proper stackwalking of RuntimeStub frame 3208 3209 __ kernel_crc32(crc, buf, len, 3210 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3211 3212 __ leave(); // required for proper stackwalking of RuntimeStub frame 3213 __ ret(lr); 3214 3215 return start; 3216 } 3217 3218 /** 3219 * Arguments: 3220 * 3221 * Inputs: 3222 * c_rarg0 - int crc 3223 * c_rarg1 - byte* buf 3224 * c_rarg2 - int length 3225 * c_rarg3 - int* table 3226 * 3227 * Ouput: 3228 * r0 - int crc result 3229 */ 3230 address generate_updateBytesCRC32C() { 3231 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3232 3233 __ align(CodeEntryAlignment); 3234 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3235 3236 address start = __ pc(); 3237 3238 const Register crc = c_rarg0; // crc 3239 const Register buf = c_rarg1; // source java byte array address 3240 const Register len = c_rarg2; // length 3241 const Register table0 = c_rarg3; // crc_table address 3242 const Register table1 = c_rarg4; 3243 const Register table2 = c_rarg5; 3244 const Register table3 = c_rarg6; 3245 const Register tmp3 = c_rarg7; 3246 3247 BLOCK_COMMENT("Entry:"); 3248 __ enter(); // required for proper stackwalking of RuntimeStub frame 3249 3250 __ kernel_crc32c(crc, buf, len, 3251 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3252 3253 __ leave(); // required for proper stackwalking of RuntimeStub frame 3254 __ ret(lr); 3255 3256 return start; 3257 } 3258 3259 /*** 3260 * Arguments: 3261 * 3262 * Inputs: 3263 * c_rarg0 - int adler 3264 * c_rarg1 - byte* buff 3265 * c_rarg2 - int len 3266 * 3267 * Output: 3268 * c_rarg0 - int adler result 3269 */ 3270 address generate_updateBytesAdler32() { 3271 __ align(CodeEntryAlignment); 3272 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3273 address start = __ pc(); 3274 3275 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3276 3277 // Aliases 3278 Register adler = c_rarg0; 3279 Register s1 = c_rarg0; 3280 Register s2 = c_rarg3; 3281 Register buff = c_rarg1; 3282 Register len = c_rarg2; 3283 Register nmax = r4; 3284 Register base = r5; 3285 Register count = r6; 3286 Register temp0 = rscratch1; 3287 Register temp1 = rscratch2; 3288 Register temp2 = r7; 3289 3290 // Max number of bytes we can process before having to take the mod 3291 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3292 unsigned long BASE = 0xfff1; 3293 unsigned long NMAX = 0x15B0; 3294 3295 __ mov(base, BASE); 3296 __ mov(nmax, NMAX); 3297 3298 // s1 is initialized to the lower 16 bits of adler 3299 // s2 is initialized to the upper 16 bits of adler 3300 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3301 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3302 3303 // The pipelined loop needs at least 16 elements for 1 iteration 3304 // It does check this, but it is more effective to skip to the cleanup loop 3305 __ cmp(len, 16); 3306 __ br(Assembler::HS, L_nmax); 3307 __ cbz(len, L_combine); 3308 3309 __ bind(L_simple_by1_loop); 3310 __ ldrb(temp0, Address(__ post(buff, 1))); 3311 __ add(s1, s1, temp0); 3312 __ add(s2, s2, s1); 3313 __ subs(len, len, 1); 3314 __ br(Assembler::HI, L_simple_by1_loop); 3315 3316 // s1 = s1 % BASE 3317 __ subs(temp0, s1, base); 3318 __ csel(s1, temp0, s1, Assembler::HS); 3319 3320 // s2 = s2 % BASE 3321 __ lsr(temp0, s2, 16); 3322 __ lsl(temp1, temp0, 4); 3323 __ sub(temp1, temp1, temp0); 3324 __ add(s2, temp1, s2, ext::uxth); 3325 3326 __ subs(temp0, s2, base); 3327 __ csel(s2, temp0, s2, Assembler::HS); 3328 3329 __ b(L_combine); 3330 3331 __ bind(L_nmax); 3332 __ subs(len, len, nmax); 3333 __ sub(count, nmax, 16); 3334 __ br(Assembler::LO, L_by16); 3335 3336 __ bind(L_nmax_loop); 3337 3338 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3339 3340 __ add(s1, s1, temp0, ext::uxtb); 3341 __ ubfx(temp2, temp0, 8, 8); 3342 __ add(s2, s2, s1); 3343 __ add(s1, s1, temp2); 3344 __ ubfx(temp2, temp0, 16, 8); 3345 __ add(s2, s2, s1); 3346 __ add(s1, s1, temp2); 3347 __ ubfx(temp2, temp0, 24, 8); 3348 __ add(s2, s2, s1); 3349 __ add(s1, s1, temp2); 3350 __ ubfx(temp2, temp0, 32, 8); 3351 __ add(s2, s2, s1); 3352 __ add(s1, s1, temp2); 3353 __ ubfx(temp2, temp0, 40, 8); 3354 __ add(s2, s2, s1); 3355 __ add(s1, s1, temp2); 3356 __ ubfx(temp2, temp0, 48, 8); 3357 __ add(s2, s2, s1); 3358 __ add(s1, s1, temp2); 3359 __ add(s2, s2, s1); 3360 __ add(s1, s1, temp0, Assembler::LSR, 56); 3361 __ add(s2, s2, s1); 3362 3363 __ add(s1, s1, temp1, ext::uxtb); 3364 __ ubfx(temp2, temp1, 8, 8); 3365 __ add(s2, s2, s1); 3366 __ add(s1, s1, temp2); 3367 __ ubfx(temp2, temp1, 16, 8); 3368 __ add(s2, s2, s1); 3369 __ add(s1, s1, temp2); 3370 __ ubfx(temp2, temp1, 24, 8); 3371 __ add(s2, s2, s1); 3372 __ add(s1, s1, temp2); 3373 __ ubfx(temp2, temp1, 32, 8); 3374 __ add(s2, s2, s1); 3375 __ add(s1, s1, temp2); 3376 __ ubfx(temp2, temp1, 40, 8); 3377 __ add(s2, s2, s1); 3378 __ add(s1, s1, temp2); 3379 __ ubfx(temp2, temp1, 48, 8); 3380 __ add(s2, s2, s1); 3381 __ add(s1, s1, temp2); 3382 __ add(s2, s2, s1); 3383 __ add(s1, s1, temp1, Assembler::LSR, 56); 3384 __ add(s2, s2, s1); 3385 3386 __ subs(count, count, 16); 3387 __ br(Assembler::HS, L_nmax_loop); 3388 3389 // s1 = s1 % BASE 3390 __ lsr(temp0, s1, 16); 3391 __ lsl(temp1, temp0, 4); 3392 __ sub(temp1, temp1, temp0); 3393 __ add(temp1, temp1, s1, ext::uxth); 3394 3395 __ lsr(temp0, temp1, 16); 3396 __ lsl(s1, temp0, 4); 3397 __ sub(s1, s1, temp0); 3398 __ add(s1, s1, temp1, ext:: uxth); 3399 3400 __ subs(temp0, s1, base); 3401 __ csel(s1, temp0, s1, Assembler::HS); 3402 3403 // s2 = s2 % BASE 3404 __ lsr(temp0, s2, 16); 3405 __ lsl(temp1, temp0, 4); 3406 __ sub(temp1, temp1, temp0); 3407 __ add(temp1, temp1, s2, ext::uxth); 3408 3409 __ lsr(temp0, temp1, 16); 3410 __ lsl(s2, temp0, 4); 3411 __ sub(s2, s2, temp0); 3412 __ add(s2, s2, temp1, ext:: uxth); 3413 3414 __ subs(temp0, s2, base); 3415 __ csel(s2, temp0, s2, Assembler::HS); 3416 3417 __ subs(len, len, nmax); 3418 __ sub(count, nmax, 16); 3419 __ br(Assembler::HS, L_nmax_loop); 3420 3421 __ bind(L_by16); 3422 __ adds(len, len, count); 3423 __ br(Assembler::LO, L_by1); 3424 3425 __ bind(L_by16_loop); 3426 3427 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3428 3429 __ add(s1, s1, temp0, ext::uxtb); 3430 __ ubfx(temp2, temp0, 8, 8); 3431 __ add(s2, s2, s1); 3432 __ add(s1, s1, temp2); 3433 __ ubfx(temp2, temp0, 16, 8); 3434 __ add(s2, s2, s1); 3435 __ add(s1, s1, temp2); 3436 __ ubfx(temp2, temp0, 24, 8); 3437 __ add(s2, s2, s1); 3438 __ add(s1, s1, temp2); 3439 __ ubfx(temp2, temp0, 32, 8); 3440 __ add(s2, s2, s1); 3441 __ add(s1, s1, temp2); 3442 __ ubfx(temp2, temp0, 40, 8); 3443 __ add(s2, s2, s1); 3444 __ add(s1, s1, temp2); 3445 __ ubfx(temp2, temp0, 48, 8); 3446 __ add(s2, s2, s1); 3447 __ add(s1, s1, temp2); 3448 __ add(s2, s2, s1); 3449 __ add(s1, s1, temp0, Assembler::LSR, 56); 3450 __ add(s2, s2, s1); 3451 3452 __ add(s1, s1, temp1, ext::uxtb); 3453 __ ubfx(temp2, temp1, 8, 8); 3454 __ add(s2, s2, s1); 3455 __ add(s1, s1, temp2); 3456 __ ubfx(temp2, temp1, 16, 8); 3457 __ add(s2, s2, s1); 3458 __ add(s1, s1, temp2); 3459 __ ubfx(temp2, temp1, 24, 8); 3460 __ add(s2, s2, s1); 3461 __ add(s1, s1, temp2); 3462 __ ubfx(temp2, temp1, 32, 8); 3463 __ add(s2, s2, s1); 3464 __ add(s1, s1, temp2); 3465 __ ubfx(temp2, temp1, 40, 8); 3466 __ add(s2, s2, s1); 3467 __ add(s1, s1, temp2); 3468 __ ubfx(temp2, temp1, 48, 8); 3469 __ add(s2, s2, s1); 3470 __ add(s1, s1, temp2); 3471 __ add(s2, s2, s1); 3472 __ add(s1, s1, temp1, Assembler::LSR, 56); 3473 __ add(s2, s2, s1); 3474 3475 __ subs(len, len, 16); 3476 __ br(Assembler::HS, L_by16_loop); 3477 3478 __ bind(L_by1); 3479 __ adds(len, len, 15); 3480 __ br(Assembler::LO, L_do_mod); 3481 3482 __ bind(L_by1_loop); 3483 __ ldrb(temp0, Address(__ post(buff, 1))); 3484 __ add(s1, temp0, s1); 3485 __ add(s2, s2, s1); 3486 __ subs(len, len, 1); 3487 __ br(Assembler::HS, L_by1_loop); 3488 3489 __ bind(L_do_mod); 3490 // s1 = s1 % BASE 3491 __ lsr(temp0, s1, 16); 3492 __ lsl(temp1, temp0, 4); 3493 __ sub(temp1, temp1, temp0); 3494 __ add(temp1, temp1, s1, ext::uxth); 3495 3496 __ lsr(temp0, temp1, 16); 3497 __ lsl(s1, temp0, 4); 3498 __ sub(s1, s1, temp0); 3499 __ add(s1, s1, temp1, ext:: uxth); 3500 3501 __ subs(temp0, s1, base); 3502 __ csel(s1, temp0, s1, Assembler::HS); 3503 3504 // s2 = s2 % BASE 3505 __ lsr(temp0, s2, 16); 3506 __ lsl(temp1, temp0, 4); 3507 __ sub(temp1, temp1, temp0); 3508 __ add(temp1, temp1, s2, ext::uxth); 3509 3510 __ lsr(temp0, temp1, 16); 3511 __ lsl(s2, temp0, 4); 3512 __ sub(s2, s2, temp0); 3513 __ add(s2, s2, temp1, ext:: uxth); 3514 3515 __ subs(temp0, s2, base); 3516 __ csel(s2, temp0, s2, Assembler::HS); 3517 3518 // Combine lower bits and higher bits 3519 __ bind(L_combine); 3520 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3521 3522 __ ret(lr); 3523 3524 return start; 3525 } 3526 3527 /** 3528 * Arguments: 3529 * 3530 * Input: 3531 * c_rarg0 - x address 3532 * c_rarg1 - x length 3533 * c_rarg2 - y address 3534 * c_rarg3 - y lenth 3535 * c_rarg4 - z address 3536 * c_rarg5 - z length 3537 */ 3538 address generate_multiplyToLen() { 3539 __ align(CodeEntryAlignment); 3540 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3541 3542 address start = __ pc(); 3543 const Register x = r0; 3544 const Register xlen = r1; 3545 const Register y = r2; 3546 const Register ylen = r3; 3547 const Register z = r4; 3548 const Register zlen = r5; 3549 3550 const Register tmp1 = r10; 3551 const Register tmp2 = r11; 3552 const Register tmp3 = r12; 3553 const Register tmp4 = r13; 3554 const Register tmp5 = r14; 3555 const Register tmp6 = r15; 3556 const Register tmp7 = r16; 3557 3558 BLOCK_COMMENT("Entry:"); 3559 __ enter(); // required for proper stackwalking of RuntimeStub frame 3560 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3561 __ leave(); // required for proper stackwalking of RuntimeStub frame 3562 __ ret(lr); 3563 3564 return start; 3565 } 3566 3567 address generate_squareToLen() { 3568 // squareToLen algorithm for sizes 1..127 described in java code works 3569 // faster than multiply_to_len on some CPUs and slower on others, but 3570 // multiply_to_len shows a bit better overall results 3571 __ align(CodeEntryAlignment); 3572 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3573 address start = __ pc(); 3574 3575 const Register x = r0; 3576 const Register xlen = r1; 3577 const Register z = r2; 3578 const Register zlen = r3; 3579 const Register y = r4; // == x 3580 const Register ylen = r5; // == xlen 3581 3582 const Register tmp1 = r10; 3583 const Register tmp2 = r11; 3584 const Register tmp3 = r12; 3585 const Register tmp4 = r13; 3586 const Register tmp5 = r14; 3587 const Register tmp6 = r15; 3588 const Register tmp7 = r16; 3589 3590 RegSet spilled_regs = RegSet::of(y, ylen); 3591 BLOCK_COMMENT("Entry:"); 3592 __ enter(); 3593 __ push(spilled_regs, sp); 3594 __ mov(y, x); 3595 __ mov(ylen, xlen); 3596 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3597 __ pop(spilled_regs, sp); 3598 __ leave(); 3599 __ ret(lr); 3600 return start; 3601 } 3602 3603 address generate_mulAdd() { 3604 __ align(CodeEntryAlignment); 3605 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3606 3607 address start = __ pc(); 3608 3609 const Register out = r0; 3610 const Register in = r1; 3611 const Register offset = r2; 3612 const Register len = r3; 3613 const Register k = r4; 3614 3615 BLOCK_COMMENT("Entry:"); 3616 __ enter(); 3617 __ mul_add(out, in, offset, len, k); 3618 __ leave(); 3619 __ ret(lr); 3620 3621 return start; 3622 } 3623 3624 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3625 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3626 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3627 // Karatsuba multiplication performs a 128*128 -> 256-bit 3628 // multiplication in three 128-bit multiplications and a few 3629 // additions. 3630 // 3631 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3632 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3633 // 3634 // Inputs: 3635 // 3636 // A0 in a.d[0] (subkey) 3637 // A1 in a.d[1] 3638 // (A1+A0) in a1_xor_a0.d[0] 3639 // 3640 // B0 in b.d[0] (state) 3641 // B1 in b.d[1] 3642 3643 __ ext(tmp1, __ T16B, b, b, 0x08); 3644 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3645 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3646 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3647 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3648 3649 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3650 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3651 __ eor(tmp2, __ T16B, tmp2, tmp4); 3652 __ eor(tmp2, __ T16B, tmp2, tmp3); 3653 3654 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3655 __ ins(result_hi, __ D, tmp2, 0, 1); 3656 __ ins(result_lo, __ D, tmp2, 1, 0); 3657 } 3658 3659 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3660 FloatRegister p, FloatRegister z, FloatRegister t1) { 3661 const FloatRegister t0 = result; 3662 3663 // The GCM field polynomial f is z^128 + p(z), where p = 3664 // z^7+z^2+z+1. 3665 // 3666 // z^128 === -p(z) (mod (z^128 + p(z))) 3667 // 3668 // so, given that the product we're reducing is 3669 // a == lo + hi * z^128 3670 // substituting, 3671 // === lo - hi * p(z) (mod (z^128 + p(z))) 3672 // 3673 // we reduce by multiplying hi by p(z) and subtracting the result 3674 // from (i.e. XORing it with) lo. Because p has no nonzero high 3675 // bits we can do this with two 64-bit multiplications, lo*p and 3676 // hi*p. 3677 3678 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3679 __ ext(t1, __ T16B, t0, z, 8); 3680 __ eor(hi, __ T16B, hi, t1); 3681 __ ext(t1, __ T16B, z, t0, 8); 3682 __ eor(lo, __ T16B, lo, t1); 3683 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3684 __ eor(result, __ T16B, lo, t0); 3685 } 3686 3687 address generate_has_negatives(address &has_negatives_long) { 3688 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3689 const int large_loop_size = 64; 3690 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3691 int dcache_line = VM_Version::dcache_line_size(); 3692 3693 Register ary1 = r1, len = r2, result = r0; 3694 3695 __ align(CodeEntryAlignment); 3696 address entry = __ pc(); 3697 3698 __ enter(); 3699 3700 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3701 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3702 3703 __ cmp(len, 15); 3704 __ br(Assembler::GT, LEN_OVER_15); 3705 // The only case when execution falls into this code is when pointer is near 3706 // the end of memory page and we have to avoid reading next page 3707 __ add(ary1, ary1, len); 3708 __ subs(len, len, 8); 3709 __ br(Assembler::GT, LEN_OVER_8); 3710 __ ldr(rscratch2, Address(ary1, -8)); 3711 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 3712 __ lsrv(rscratch2, rscratch2, rscratch1); 3713 __ tst(rscratch2, UPPER_BIT_MASK); 3714 __ cset(result, Assembler::NE); 3715 __ leave(); 3716 __ ret(lr); 3717 __ bind(LEN_OVER_8); 3718 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3719 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3720 __ tst(rscratch2, UPPER_BIT_MASK); 3721 __ br(Assembler::NE, RET_TRUE_NO_POP); 3722 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3723 __ lsrv(rscratch1, rscratch1, rscratch2); 3724 __ tst(rscratch1, UPPER_BIT_MASK); 3725 __ cset(result, Assembler::NE); 3726 __ leave(); 3727 __ ret(lr); 3728 3729 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3730 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3731 3732 has_negatives_long = __ pc(); // 2nd entry point 3733 3734 __ enter(); 3735 3736 __ bind(LEN_OVER_15); 3737 __ push(spilled_regs, sp); 3738 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3739 __ cbz(rscratch2, ALIGNED); 3740 __ ldp(tmp6, tmp1, Address(ary1)); 3741 __ mov(tmp5, 16); 3742 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3743 __ add(ary1, ary1, rscratch1); 3744 __ sub(len, len, rscratch1); 3745 __ orr(tmp6, tmp6, tmp1); 3746 __ tst(tmp6, UPPER_BIT_MASK); 3747 __ br(Assembler::NE, RET_TRUE); 3748 3749 __ bind(ALIGNED); 3750 __ cmp(len, large_loop_size); 3751 __ br(Assembler::LT, CHECK_16); 3752 // Perform 16-byte load as early return in pre-loop to handle situation 3753 // when initially aligned large array has negative values at starting bytes, 3754 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3755 // slower. Cases with negative bytes further ahead won't be affected that 3756 // much. In fact, it'll be faster due to early loads, less instructions and 3757 // less branches in LARGE_LOOP. 3758 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3759 __ sub(len, len, 16); 3760 __ orr(tmp6, tmp6, tmp1); 3761 __ tst(tmp6, UPPER_BIT_MASK); 3762 __ br(Assembler::NE, RET_TRUE); 3763 __ cmp(len, large_loop_size); 3764 __ br(Assembler::LT, CHECK_16); 3765 3766 if (SoftwarePrefetchHintDistance >= 0 3767 && SoftwarePrefetchHintDistance >= dcache_line) { 3768 // initial prefetch 3769 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3770 } 3771 __ bind(LARGE_LOOP); 3772 if (SoftwarePrefetchHintDistance >= 0) { 3773 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3774 } 3775 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3776 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3777 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3778 // instructions per cycle and have less branches, but this approach disables 3779 // early return, thus, all 64 bytes are loaded and checked every time. 3780 __ ldp(tmp2, tmp3, Address(ary1)); 3781 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3782 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3783 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3784 __ add(ary1, ary1, large_loop_size); 3785 __ sub(len, len, large_loop_size); 3786 __ orr(tmp2, tmp2, tmp3); 3787 __ orr(tmp4, tmp4, tmp5); 3788 __ orr(rscratch1, rscratch1, rscratch2); 3789 __ orr(tmp6, tmp6, tmp1); 3790 __ orr(tmp2, tmp2, tmp4); 3791 __ orr(rscratch1, rscratch1, tmp6); 3792 __ orr(tmp2, tmp2, rscratch1); 3793 __ tst(tmp2, UPPER_BIT_MASK); 3794 __ br(Assembler::NE, RET_TRUE); 3795 __ cmp(len, large_loop_size); 3796 __ br(Assembler::GE, LARGE_LOOP); 3797 3798 __ bind(CHECK_16); // small 16-byte load pre-loop 3799 __ cmp(len, 16); 3800 __ br(Assembler::LT, POST_LOOP16); 3801 3802 __ bind(LOOP16); // small 16-byte load loop 3803 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3804 __ sub(len, len, 16); 3805 __ orr(tmp2, tmp2, tmp3); 3806 __ tst(tmp2, UPPER_BIT_MASK); 3807 __ br(Assembler::NE, RET_TRUE); 3808 __ cmp(len, 16); 3809 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3810 3811 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3812 __ cmp(len, 8); 3813 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3814 __ ldr(tmp3, Address(__ post(ary1, 8))); 3815 __ sub(len, len, 8); 3816 __ tst(tmp3, UPPER_BIT_MASK); 3817 __ br(Assembler::NE, RET_TRUE); 3818 3819 __ bind(POST_LOOP16_LOAD_TAIL); 3820 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3821 __ ldr(tmp1, Address(ary1)); 3822 __ mov(tmp2, 64); 3823 __ sub(tmp4, tmp2, len, __ LSL, 3); 3824 __ lslv(tmp1, tmp1, tmp4); 3825 __ tst(tmp1, UPPER_BIT_MASK); 3826 __ br(Assembler::NE, RET_TRUE); 3827 // Fallthrough 3828 3829 __ bind(RET_FALSE); 3830 __ pop(spilled_regs, sp); 3831 __ leave(); 3832 __ mov(result, zr); 3833 __ ret(lr); 3834 3835 __ bind(RET_TRUE); 3836 __ pop(spilled_regs, sp); 3837 __ bind(RET_TRUE_NO_POP); 3838 __ leave(); 3839 __ mov(result, 1); 3840 __ ret(lr); 3841 3842 __ bind(DONE); 3843 __ pop(spilled_regs, sp); 3844 __ leave(); 3845 __ ret(lr); 3846 return entry; 3847 } 3848 3849 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3850 bool usePrefetch, Label &NOT_EQUAL) { 3851 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3852 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3853 tmp7 = r12, tmp8 = r13; 3854 Label LOOP; 3855 3856 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3857 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3858 __ bind(LOOP); 3859 if (usePrefetch) { 3860 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3861 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3862 } 3863 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3864 __ eor(tmp1, tmp1, tmp2); 3865 __ eor(tmp3, tmp3, tmp4); 3866 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3867 __ orr(tmp1, tmp1, tmp3); 3868 __ cbnz(tmp1, NOT_EQUAL); 3869 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3870 __ eor(tmp5, tmp5, tmp6); 3871 __ eor(tmp7, tmp7, tmp8); 3872 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3873 __ orr(tmp5, tmp5, tmp7); 3874 __ cbnz(tmp5, NOT_EQUAL); 3875 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3876 __ eor(tmp1, tmp1, tmp2); 3877 __ eor(tmp3, tmp3, tmp4); 3878 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3879 __ orr(tmp1, tmp1, tmp3); 3880 __ cbnz(tmp1, NOT_EQUAL); 3881 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3882 __ eor(tmp5, tmp5, tmp6); 3883 __ sub(cnt1, cnt1, 8 * wordSize); 3884 __ eor(tmp7, tmp7, tmp8); 3885 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3886 // tmp6 is not used. MacroAssembler::subs is used here (rather than 3887 // cmp) because subs allows an unlimited range of immediate operand. 3888 __ subs(tmp6, cnt1, loopThreshold); 3889 __ orr(tmp5, tmp5, tmp7); 3890 __ cbnz(tmp5, NOT_EQUAL); 3891 __ br(__ GE, LOOP); 3892 // post-loop 3893 __ eor(tmp1, tmp1, tmp2); 3894 __ eor(tmp3, tmp3, tmp4); 3895 __ orr(tmp1, tmp1, tmp3); 3896 __ sub(cnt1, cnt1, 2 * wordSize); 3897 __ cbnz(tmp1, NOT_EQUAL); 3898 } 3899 3900 void generate_large_array_equals_loop_simd(int loopThreshold, 3901 bool usePrefetch, Label &NOT_EQUAL) { 3902 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3903 tmp2 = rscratch2; 3904 Label LOOP; 3905 3906 __ bind(LOOP); 3907 if (usePrefetch) { 3908 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3909 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3910 } 3911 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 3912 __ sub(cnt1, cnt1, 8 * wordSize); 3913 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 3914 __ subs(tmp1, cnt1, loopThreshold); 3915 __ eor(v0, __ T16B, v0, v4); 3916 __ eor(v1, __ T16B, v1, v5); 3917 __ eor(v2, __ T16B, v2, v6); 3918 __ eor(v3, __ T16B, v3, v7); 3919 __ orr(v0, __ T16B, v0, v1); 3920 __ orr(v1, __ T16B, v2, v3); 3921 __ orr(v0, __ T16B, v0, v1); 3922 __ umov(tmp1, v0, __ D, 0); 3923 __ umov(tmp2, v0, __ D, 1); 3924 __ orr(tmp1, tmp1, tmp2); 3925 __ cbnz(tmp1, NOT_EQUAL); 3926 __ br(__ GE, LOOP); 3927 } 3928 3929 // a1 = r1 - array1 address 3930 // a2 = r2 - array2 address 3931 // result = r0 - return value. Already contains "false" 3932 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 3933 // r3-r5 are reserved temporary registers 3934 address generate_large_array_equals() { 3935 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 3936 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3937 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3938 tmp7 = r12, tmp8 = r13; 3939 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 3940 SMALL_LOOP, POST_LOOP; 3941 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 3942 // calculate if at least 32 prefetched bytes are used 3943 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 3944 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 3945 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 3946 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 3947 tmp5, tmp6, tmp7, tmp8); 3948 3949 __ align(CodeEntryAlignment); 3950 address entry = __ pc(); 3951 __ enter(); 3952 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 3953 // also advance pointers to use post-increment instead of pre-increment 3954 __ add(a1, a1, wordSize); 3955 __ add(a2, a2, wordSize); 3956 if (AvoidUnalignedAccesses) { 3957 // both implementations (SIMD/nonSIMD) are using relatively large load 3958 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 3959 // on some CPUs in case of address is not at least 16-byte aligned. 3960 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 3961 // load if needed at least for 1st address and make if 16-byte aligned. 3962 Label ALIGNED16; 3963 __ tbz(a1, 3, ALIGNED16); 3964 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3965 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3966 __ sub(cnt1, cnt1, wordSize); 3967 __ eor(tmp1, tmp1, tmp2); 3968 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3969 __ bind(ALIGNED16); 3970 } 3971 if (UseSIMDForArrayEquals) { 3972 if (SoftwarePrefetchHintDistance >= 0) { 3973 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3974 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3975 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3976 /* prfm = */ true, NOT_EQUAL); 3977 __ cmp(cnt1, nonPrefetchLoopThreshold); 3978 __ br(__ LT, TAIL); 3979 } 3980 __ bind(NO_PREFETCH_LARGE_LOOP); 3981 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3982 /* prfm = */ false, NOT_EQUAL); 3983 } else { 3984 __ push(spilled_regs, sp); 3985 if (SoftwarePrefetchHintDistance >= 0) { 3986 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3987 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3988 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3989 /* prfm = */ true, NOT_EQUAL); 3990 __ cmp(cnt1, nonPrefetchLoopThreshold); 3991 __ br(__ LT, TAIL); 3992 } 3993 __ bind(NO_PREFETCH_LARGE_LOOP); 3994 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3995 /* prfm = */ false, NOT_EQUAL); 3996 } 3997 __ bind(TAIL); 3998 __ cbz(cnt1, EQUAL); 3999 __ subs(cnt1, cnt1, wordSize); 4000 __ br(__ LE, POST_LOOP); 4001 __ bind(SMALL_LOOP); 4002 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4003 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4004 __ subs(cnt1, cnt1, wordSize); 4005 __ eor(tmp1, tmp1, tmp2); 4006 __ cbnz(tmp1, NOT_EQUAL); 4007 __ br(__ GT, SMALL_LOOP); 4008 __ bind(POST_LOOP); 4009 __ ldr(tmp1, Address(a1, cnt1)); 4010 __ ldr(tmp2, Address(a2, cnt1)); 4011 __ eor(tmp1, tmp1, tmp2); 4012 __ cbnz(tmp1, NOT_EQUAL); 4013 __ bind(EQUAL); 4014 __ mov(result, true); 4015 __ bind(NOT_EQUAL); 4016 if (!UseSIMDForArrayEquals) { 4017 __ pop(spilled_regs, sp); 4018 } 4019 __ bind(NOT_EQUAL_NO_POP); 4020 __ leave(); 4021 __ ret(lr); 4022 return entry; 4023 } 4024 4025 address generate_dsin_dcos(bool isCos) { 4026 __ align(CodeEntryAlignment); 4027 StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin"); 4028 address start = __ pc(); 4029 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 4030 (address)StubRoutines::aarch64::_two_over_pi, 4031 (address)StubRoutines::aarch64::_pio2, 4032 (address)StubRoutines::aarch64::_dsin_coef, 4033 (address)StubRoutines::aarch64::_dcos_coef); 4034 return start; 4035 } 4036 4037 address generate_dlog() { 4038 __ align(CodeEntryAlignment); 4039 StubCodeMark mark(this, "StubRoutines", "dlog"); 4040 address entry = __ pc(); 4041 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4042 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4043 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4044 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4045 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4046 return entry; 4047 } 4048 4049 // code for comparing 16 bytes of strings with same encoding 4050 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4051 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4052 __ ldr(rscratch1, Address(__ post(str1, 8))); 4053 __ eor(rscratch2, tmp1, tmp2); 4054 __ ldr(cnt1, Address(__ post(str2, 8))); 4055 __ cbnz(rscratch2, DIFF1); 4056 __ ldr(tmp1, Address(__ post(str1, 8))); 4057 __ eor(rscratch2, rscratch1, cnt1); 4058 __ ldr(tmp2, Address(__ post(str2, 8))); 4059 __ cbnz(rscratch2, DIFF2); 4060 } 4061 4062 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4063 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4064 Label &DIFF2) { 4065 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4066 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4067 4068 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4069 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4070 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4071 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4072 4073 __ fmovd(tmpL, vtmp3); 4074 __ eor(rscratch2, tmp3, tmpL); 4075 __ cbnz(rscratch2, DIFF2); 4076 4077 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4078 __ umov(tmpL, vtmp3, __ D, 1); 4079 __ eor(rscratch2, tmpU, tmpL); 4080 __ cbnz(rscratch2, DIFF1); 4081 4082 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4083 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4084 __ fmovd(tmpL, vtmp); 4085 __ eor(rscratch2, tmp3, tmpL); 4086 __ cbnz(rscratch2, DIFF2); 4087 4088 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4089 __ umov(tmpL, vtmp, __ D, 1); 4090 __ eor(rscratch2, tmpU, tmpL); 4091 __ cbnz(rscratch2, DIFF1); 4092 } 4093 4094 // r0 = result 4095 // r1 = str1 4096 // r2 = cnt1 4097 // r3 = str2 4098 // r4 = cnt2 4099 // r10 = tmp1 4100 // r11 = tmp2 4101 address generate_compare_long_string_different_encoding(bool isLU) { 4102 __ align(CodeEntryAlignment); 4103 StubCodeMark mark(this, "StubRoutines", isLU 4104 ? "compare_long_string_different_encoding LU" 4105 : "compare_long_string_different_encoding UL"); 4106 address entry = __ pc(); 4107 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4108 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER, 4109 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4110 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4111 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4112 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4113 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4114 4115 int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2); 4116 4117 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4118 // cnt2 == amount of characters left to compare 4119 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4120 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4121 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4122 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4123 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4124 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4125 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4126 __ eor(rscratch2, tmp1, tmp2); 4127 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4128 __ mov(rscratch1, tmp2); 4129 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4130 Register strU = isLU ? str2 : str1, 4131 strL = isLU ? str1 : str2, 4132 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4133 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4134 __ push(spilled_regs, sp); 4135 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4136 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4137 4138 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4139 4140 if (SoftwarePrefetchHintDistance >= 0) { 4141 __ cmp(cnt2, prefetchLoopExitCondition); 4142 __ br(__ LT, SMALL_LOOP); 4143 __ bind(LARGE_LOOP_PREFETCH); 4144 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4145 __ mov(tmp4, 2); 4146 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4147 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4148 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4149 __ subs(tmp4, tmp4, 1); 4150 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4151 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4152 __ mov(tmp4, 2); 4153 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4154 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4155 __ subs(tmp4, tmp4, 1); 4156 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4157 __ sub(cnt2, cnt2, 64); 4158 __ cmp(cnt2, prefetchLoopExitCondition); 4159 __ br(__ GE, LARGE_LOOP_PREFETCH); 4160 } 4161 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4162 __ subs(cnt2, cnt2, 16); 4163 __ br(__ LT, TAIL); 4164 __ b(SMALL_LOOP_ENTER); 4165 __ bind(SMALL_LOOP); // smaller loop 4166 __ subs(cnt2, cnt2, 16); 4167 __ bind(SMALL_LOOP_ENTER); 4168 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4169 __ br(__ GE, SMALL_LOOP); 4170 __ cbz(cnt2, LOAD_LAST); 4171 __ bind(TAIL); // 1..15 characters left 4172 __ cmp(cnt2, -8); 4173 __ br(__ GT, TAIL_LOAD_16); 4174 __ ldrd(vtmp, Address(tmp2)); 4175 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ); 4176 4177 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4178 __ fmovd(tmpL, vtmp3); 4179 __ eor(rscratch2, tmp3, tmpL); 4180 __ cbnz(rscratch2, DIFF2); 4181 __ umov(tmpL, vtmp3, __ D, 1); 4182 __ eor(rscratch2, tmpU, tmpL); 4183 __ cbnz(rscratch2, DIFF1); 4184 __ b(LOAD_LAST); 4185 __ bind(TAIL_LOAD_16); 4186 __ ldrq(vtmp, Address(tmp2)); 4187 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4188 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4189 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4190 __ fmovd(tmpL, vtmp3); 4191 __ eor(rscratch2, tmp3, tmpL); 4192 __ cbnz(rscratch2, DIFF2); 4193 4194 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4195 __ umov(tmpL, vtmp3, __ D, 1); 4196 __ eor(rscratch2, tmpU, tmpL); 4197 __ cbnz(rscratch2, DIFF1); 4198 4199 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4200 __ fmovd(tmpL, vtmp); 4201 __ eor(rscratch2, tmp3, tmpL); 4202 __ cbnz(rscratch2, DIFF2); 4203 4204 __ umov(tmpL, vtmp, __ D, 1); 4205 __ eor(rscratch2, tmpU, tmpL); 4206 __ cbnz(rscratch2, DIFF1); 4207 __ b(LOAD_LAST); 4208 __ bind(DIFF2); 4209 __ mov(tmpU, tmp3); 4210 __ bind(DIFF1); 4211 __ pop(spilled_regs, sp); 4212 __ b(CALCULATE_DIFFERENCE); 4213 __ bind(LOAD_LAST); 4214 __ pop(spilled_regs, sp); 4215 4216 __ ldrs(vtmp, Address(strL)); 4217 __ ldr(tmpU, Address(strU)); 4218 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4219 __ fmovd(tmpL, vtmp); 4220 4221 __ eor(rscratch2, tmpU, tmpL); 4222 __ cbz(rscratch2, DONE); 4223 4224 // Find the first different characters in the longwords and 4225 // compute their difference. 4226 __ bind(CALCULATE_DIFFERENCE); 4227 __ rev(rscratch2, rscratch2); 4228 __ clz(rscratch2, rscratch2); 4229 __ andr(rscratch2, rscratch2, -16); 4230 __ lsrv(tmp1, tmp1, rscratch2); 4231 __ uxthw(tmp1, tmp1); 4232 __ lsrv(rscratch1, rscratch1, rscratch2); 4233 __ uxthw(rscratch1, rscratch1); 4234 __ subw(result, tmp1, rscratch1); 4235 __ bind(DONE); 4236 __ ret(lr); 4237 return entry; 4238 } 4239 4240 // r0 = result 4241 // r1 = str1 4242 // r2 = cnt1 4243 // r3 = str2 4244 // r4 = cnt2 4245 // r10 = tmp1 4246 // r11 = tmp2 4247 address generate_compare_long_string_same_encoding(bool isLL) { 4248 __ align(CodeEntryAlignment); 4249 StubCodeMark mark(this, "StubRoutines", isLL 4250 ? "compare_long_string_same_encoding LL" 4251 : "compare_long_string_same_encoding UU"); 4252 address entry = __ pc(); 4253 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4254 tmp1 = r10, tmp2 = r11; 4255 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4256 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4257 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4258 // exit from large loop when less than 64 bytes left to read or we're about 4259 // to prefetch memory behind array border 4260 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4261 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4262 // update cnt2 counter with already loaded 8 bytes 4263 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4264 // update pointers, because of previous read 4265 __ add(str1, str1, wordSize); 4266 __ add(str2, str2, wordSize); 4267 if (SoftwarePrefetchHintDistance >= 0) { 4268 __ bind(LARGE_LOOP_PREFETCH); 4269 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4270 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4271 compare_string_16_bytes_same(DIFF, DIFF2); 4272 compare_string_16_bytes_same(DIFF, DIFF2); 4273 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4274 compare_string_16_bytes_same(DIFF, DIFF2); 4275 __ cmp(cnt2, largeLoopExitCondition); 4276 compare_string_16_bytes_same(DIFF, DIFF2); 4277 __ br(__ GT, LARGE_LOOP_PREFETCH); 4278 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4279 // less than 16 bytes left? 4280 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4281 __ br(__ LT, TAIL); 4282 } 4283 __ bind(SMALL_LOOP); 4284 compare_string_16_bytes_same(DIFF, DIFF2); 4285 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4286 __ br(__ GE, SMALL_LOOP); 4287 __ bind(TAIL); 4288 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4289 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4290 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4291 __ br(__ LE, CHECK_LAST); 4292 __ eor(rscratch2, tmp1, tmp2); 4293 __ cbnz(rscratch2, DIFF); 4294 __ ldr(tmp1, Address(__ post(str1, 8))); 4295 __ ldr(tmp2, Address(__ post(str2, 8))); 4296 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4297 __ bind(CHECK_LAST); 4298 if (!isLL) { 4299 __ add(cnt2, cnt2, cnt2); // now in bytes 4300 } 4301 __ eor(rscratch2, tmp1, tmp2); 4302 __ cbnz(rscratch2, DIFF); 4303 __ ldr(rscratch1, Address(str1, cnt2)); 4304 __ ldr(cnt1, Address(str2, cnt2)); 4305 __ eor(rscratch2, rscratch1, cnt1); 4306 __ cbz(rscratch2, LENGTH_DIFF); 4307 // Find the first different characters in the longwords and 4308 // compute their difference. 4309 __ bind(DIFF2); 4310 __ rev(rscratch2, rscratch2); 4311 __ clz(rscratch2, rscratch2); 4312 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4313 __ lsrv(rscratch1, rscratch1, rscratch2); 4314 if (isLL) { 4315 __ lsrv(cnt1, cnt1, rscratch2); 4316 __ uxtbw(rscratch1, rscratch1); 4317 __ uxtbw(cnt1, cnt1); 4318 } else { 4319 __ lsrv(cnt1, cnt1, rscratch2); 4320 __ uxthw(rscratch1, rscratch1); 4321 __ uxthw(cnt1, cnt1); 4322 } 4323 __ subw(result, rscratch1, cnt1); 4324 __ b(LENGTH_DIFF); 4325 __ bind(DIFF); 4326 __ rev(rscratch2, rscratch2); 4327 __ clz(rscratch2, rscratch2); 4328 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4329 __ lsrv(tmp1, tmp1, rscratch2); 4330 if (isLL) { 4331 __ lsrv(tmp2, tmp2, rscratch2); 4332 __ uxtbw(tmp1, tmp1); 4333 __ uxtbw(tmp2, tmp2); 4334 } else { 4335 __ lsrv(tmp2, tmp2, rscratch2); 4336 __ uxthw(tmp1, tmp1); 4337 __ uxthw(tmp2, tmp2); 4338 } 4339 __ subw(result, tmp1, tmp2); 4340 __ b(LENGTH_DIFF); 4341 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4342 __ eor(rscratch2, tmp1, tmp2); 4343 __ cbnz(rscratch2, DIFF); 4344 __ bind(LENGTH_DIFF); 4345 __ ret(lr); 4346 return entry; 4347 } 4348 4349 void generate_compare_long_strings() { 4350 StubRoutines::aarch64::_compare_long_string_LL 4351 = generate_compare_long_string_same_encoding(true); 4352 StubRoutines::aarch64::_compare_long_string_UU 4353 = generate_compare_long_string_same_encoding(false); 4354 StubRoutines::aarch64::_compare_long_string_LU 4355 = generate_compare_long_string_different_encoding(true); 4356 StubRoutines::aarch64::_compare_long_string_UL 4357 = generate_compare_long_string_different_encoding(false); 4358 } 4359 4360 // R0 = result 4361 // R1 = str2 4362 // R2 = cnt1 4363 // R3 = str1 4364 // R4 = cnt2 4365 // This generic linear code use few additional ideas, which makes it faster: 4366 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 4367 // in order to skip initial loading(help in systems with 1 ld pipeline) 4368 // 2) we can use "fast" algorithm of finding single character to search for 4369 // first symbol with less branches(1 branch per each loaded register instead 4370 // of branch for each symbol), so, this is where constants like 4371 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 4372 // 3) after loading and analyzing 1st register of source string, it can be 4373 // used to search for every 1st character entry, saving few loads in 4374 // comparison with "simplier-but-slower" implementation 4375 // 4) in order to avoid lots of push/pop operations, code below is heavily 4376 // re-using/re-initializing/compressing register values, which makes code 4377 // larger and a bit less readable, however, most of extra operations are 4378 // issued during loads or branches, so, penalty is minimal 4379 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4380 const char* stubName = str1_isL 4381 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4382 : "indexof_linear_uu"; 4383 __ align(CodeEntryAlignment); 4384 StubCodeMark mark(this, "StubRoutines", stubName); 4385 address entry = __ pc(); 4386 4387 int str1_chr_size = str1_isL ? 1 : 2; 4388 int str2_chr_size = str2_isL ? 1 : 2; 4389 int str1_chr_shift = str1_isL ? 0 : 1; 4390 int str2_chr_shift = str2_isL ? 0 : 1; 4391 bool isL = str1_isL && str2_isL; 4392 // parameters 4393 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4394 // temporary registers 4395 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4396 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4397 // redefinitions 4398 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4399 4400 __ push(spilled_regs, sp); 4401 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, L_SMALL_MATCH_LOOP, 4402 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4403 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4404 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4405 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4406 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4407 // Read whole register from str1. It is safe, because length >=8 here 4408 __ ldr(ch1, Address(str1)); 4409 // Read whole register from str2. It is safe, because length >=8 here 4410 __ ldr(ch2, Address(str2)); 4411 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4412 if (str1_isL != str2_isL) { 4413 __ eor(v0, __ T16B, v0, v0); 4414 } 4415 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4416 __ mul(first, first, tmp1); 4417 // check if we have less than 1 register to check 4418 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4419 if (str1_isL != str2_isL) { 4420 __ fmovd(v1, ch1); 4421 } 4422 __ br(__ LE, L_SMALL); 4423 __ eor(ch2, first, ch2); 4424 if (str1_isL != str2_isL) { 4425 __ zip1(v1, __ T16B, v1, v0); 4426 } 4427 __ sub(tmp2, ch2, tmp1); 4428 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4429 __ bics(tmp2, tmp2, ch2); 4430 if (str1_isL != str2_isL) { 4431 __ fmovd(ch1, v1); 4432 } 4433 __ br(__ NE, L_HAS_ZERO); 4434 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4435 __ add(result, result, wordSize/str2_chr_size); 4436 __ add(str2, str2, wordSize); 4437 __ br(__ LT, L_POST_LOOP); 4438 __ BIND(L_LOOP); 4439 __ ldr(ch2, Address(str2)); 4440 __ eor(ch2, first, ch2); 4441 __ sub(tmp2, ch2, tmp1); 4442 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4443 __ bics(tmp2, tmp2, ch2); 4444 __ br(__ NE, L_HAS_ZERO); 4445 __ BIND(L_LOOP_PROCEED); 4446 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4447 __ add(str2, str2, wordSize); 4448 __ add(result, result, wordSize/str2_chr_size); 4449 __ br(__ GE, L_LOOP); 4450 __ BIND(L_POST_LOOP); 4451 __ cmp(cnt2, -wordSize/str2_chr_size); // no extra characters to check 4452 __ br(__ LE, NOMATCH); 4453 __ ldr(ch2, Address(str2)); 4454 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4455 __ eor(ch2, first, ch2); 4456 __ sub(tmp2, ch2, tmp1); 4457 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4458 __ mov(tmp4, -1); // all bits set 4459 __ b(L_SMALL_PROCEED); 4460 __ align(OptoLoopAlignment); 4461 __ BIND(L_SMALL); 4462 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4463 __ eor(ch2, first, ch2); 4464 if (str1_isL != str2_isL) { 4465 __ zip1(v1, __ T16B, v1, v0); 4466 } 4467 __ sub(tmp2, ch2, tmp1); 4468 __ mov(tmp4, -1); // all bits set 4469 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4470 if (str1_isL != str2_isL) { 4471 __ fmovd(ch1, v1); // move converted 4 symbols 4472 } 4473 __ BIND(L_SMALL_PROCEED); 4474 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4475 __ bic(tmp2, tmp2, ch2); 4476 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4477 __ rbit(tmp2, tmp2); 4478 __ br(__ EQ, NOMATCH); 4479 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4480 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4481 __ cmp(cnt1, wordSize/str2_chr_size); 4482 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4483 if (str2_isL) { // LL 4484 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4485 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4486 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4487 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4488 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4489 } else { 4490 __ mov(ch2, 0xE); // all bits in byte set except last one 4491 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4492 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4493 __ lslv(tmp2, tmp2, tmp4); 4494 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4495 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4496 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4497 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4498 } 4499 __ cmp(ch1, ch2); 4500 __ mov(tmp4, wordSize/str2_chr_size); 4501 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4502 __ BIND(L_SMALL_CMP_LOOP); 4503 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4504 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4505 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4506 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4507 __ add(tmp4, tmp4, 1); 4508 __ cmp(tmp4, cnt1); 4509 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4510 __ cmp(first, ch2); 4511 __ br(__ EQ, L_SMALL_CMP_LOOP); 4512 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4513 __ cbz(tmp2, NOMATCH); // no more matches. exit 4514 __ clz(tmp4, tmp2); 4515 __ add(result, result, 1); // advance index 4516 __ add(str2, str2, str2_chr_size); // advance pointer 4517 __ b(L_SMALL_HAS_ZERO_LOOP); 4518 __ align(OptoLoopAlignment); 4519 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4520 __ cmp(first, ch2); 4521 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4522 __ b(DONE); 4523 __ align(OptoLoopAlignment); 4524 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4525 if (str2_isL) { // LL 4526 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4527 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4528 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4529 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4530 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4531 } else { 4532 __ mov(ch2, 0xE); // all bits in byte set except last one 4533 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4534 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4535 __ lslv(tmp2, tmp2, tmp4); 4536 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4537 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4538 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4539 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4540 } 4541 __ cmp(ch1, ch2); 4542 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4543 __ b(DONE); 4544 __ align(OptoLoopAlignment); 4545 __ BIND(L_HAS_ZERO); 4546 __ rbit(tmp2, tmp2); 4547 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4548 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4549 // It's fine because both counters are 32bit and are not changed in this 4550 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4551 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4552 __ sub(result, result, 1); 4553 __ BIND(L_HAS_ZERO_LOOP); 4554 __ mov(cnt1, wordSize/str2_chr_size); 4555 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4556 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4557 if (str2_isL) { 4558 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4559 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4560 __ lslv(tmp2, tmp2, tmp4); 4561 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4562 __ add(tmp4, tmp4, 1); 4563 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4564 __ lsl(tmp2, tmp2, 1); 4565 __ mov(tmp4, wordSize/str2_chr_size); 4566 } else { 4567 __ mov(ch2, 0xE); 4568 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4569 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4570 __ lslv(tmp2, tmp2, tmp4); 4571 __ add(tmp4, tmp4, 1); 4572 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4573 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4574 __ lsl(tmp2, tmp2, 1); 4575 __ mov(tmp4, wordSize/str2_chr_size); 4576 __ sub(str2, str2, str2_chr_size); 4577 } 4578 __ cmp(ch1, ch2); 4579 __ mov(tmp4, wordSize/str2_chr_size); 4580 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4581 __ BIND(L_CMP_LOOP); 4582 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4583 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4584 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4585 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4586 __ add(tmp4, tmp4, 1); 4587 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4588 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4589 __ cmp(cnt1, ch2); 4590 __ br(__ EQ, L_CMP_LOOP); 4591 __ BIND(L_CMP_LOOP_NOMATCH); 4592 // here we're not matched 4593 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 4594 __ clz(tmp4, tmp2); 4595 __ add(str2, str2, str2_chr_size); // advance pointer 4596 __ b(L_HAS_ZERO_LOOP); 4597 __ align(OptoLoopAlignment); 4598 __ BIND(L_CMP_LOOP_LAST_CMP); 4599 __ cmp(cnt1, ch2); 4600 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4601 __ b(DONE); 4602 __ align(OptoLoopAlignment); 4603 __ BIND(L_CMP_LOOP_LAST_CMP2); 4604 if (str2_isL) { 4605 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4606 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4607 __ lslv(tmp2, tmp2, tmp4); 4608 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4609 __ add(tmp4, tmp4, 1); 4610 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4611 __ lsl(tmp2, tmp2, 1); 4612 } else { 4613 __ mov(ch2, 0xE); 4614 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4615 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4616 __ lslv(tmp2, tmp2, tmp4); 4617 __ add(tmp4, tmp4, 1); 4618 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4619 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4620 __ lsl(tmp2, tmp2, 1); 4621 __ sub(str2, str2, str2_chr_size); 4622 } 4623 __ cmp(ch1, ch2); 4624 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4625 __ b(DONE); 4626 __ align(OptoLoopAlignment); 4627 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4628 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4629 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4630 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4631 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4632 // result by analyzed characters value, so, we can just reset lower bits 4633 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4634 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4635 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4636 // index of last analyzed substring inside current octet. So, str2 in at 4637 // respective start address. We need to advance it to next octet 4638 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4639 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4640 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4641 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4642 __ movw(cnt2, cnt2); 4643 __ b(L_LOOP_PROCEED); 4644 __ align(OptoLoopAlignment); 4645 __ BIND(NOMATCH); 4646 __ mov(result, -1); 4647 __ BIND(DONE); 4648 __ pop(spilled_regs, sp); 4649 __ ret(lr); 4650 return entry; 4651 } 4652 4653 void generate_string_indexof_stubs() { 4654 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4655 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4656 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4657 } 4658 4659 void inflate_and_store_2_fp_registers(bool generatePrfm, 4660 FloatRegister src1, FloatRegister src2) { 4661 Register dst = r1; 4662 __ zip1(v1, __ T16B, src1, v0); 4663 __ zip2(v2, __ T16B, src1, v0); 4664 if (generatePrfm) { 4665 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4666 } 4667 __ zip1(v3, __ T16B, src2, v0); 4668 __ zip2(v4, __ T16B, src2, v0); 4669 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4670 } 4671 4672 // R0 = src 4673 // R1 = dst 4674 // R2 = len 4675 // R3 = len >> 3 4676 // V0 = 0 4677 // v1 = loaded 8 bytes 4678 address generate_large_byte_array_inflate() { 4679 __ align(CodeEntryAlignment); 4680 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4681 address entry = __ pc(); 4682 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4683 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4684 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4685 4686 // do one more 8-byte read to have address 16-byte aligned in most cases 4687 // also use single store instruction 4688 __ ldrd(v2, __ post(src, 8)); 4689 __ sub(octetCounter, octetCounter, 2); 4690 __ zip1(v1, __ T16B, v1, v0); 4691 __ zip1(v2, __ T16B, v2, v0); 4692 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4693 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4694 __ cmp(octetCounter, large_loop_threshold); 4695 __ br(__ LE, LOOP_START); 4696 __ b(LOOP_PRFM_START); 4697 __ bind(LOOP_PRFM); 4698 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4699 __ bind(LOOP_PRFM_START); 4700 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4701 __ sub(octetCounter, octetCounter, 8); 4702 __ cmp(octetCounter, large_loop_threshold); 4703 inflate_and_store_2_fp_registers(true, v3, v4); 4704 inflate_and_store_2_fp_registers(true, v5, v6); 4705 __ br(__ GT, LOOP_PRFM); 4706 __ cmp(octetCounter, 8); 4707 __ br(__ LT, DONE); 4708 __ bind(LOOP); 4709 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4710 __ bind(LOOP_START); 4711 __ sub(octetCounter, octetCounter, 8); 4712 __ cmp(octetCounter, 8); 4713 inflate_and_store_2_fp_registers(false, v3, v4); 4714 inflate_and_store_2_fp_registers(false, v5, v6); 4715 __ br(__ GE, LOOP); 4716 __ bind(DONE); 4717 __ ret(lr); 4718 return entry; 4719 } 4720 4721 /** 4722 * Arguments: 4723 * 4724 * Input: 4725 * c_rarg0 - current state address 4726 * c_rarg1 - H key address 4727 * c_rarg2 - data address 4728 * c_rarg3 - number of blocks 4729 * 4730 * Output: 4731 * Updated state at c_rarg0 4732 */ 4733 address generate_ghash_processBlocks() { 4734 // Bafflingly, GCM uses little-endian for the byte order, but 4735 // big-endian for the bit order. For example, the polynomial 1 is 4736 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4737 // 4738 // So, we must either reverse the bytes in each word and do 4739 // everything big-endian or reverse the bits in each byte and do 4740 // it little-endian. On AArch64 it's more idiomatic to reverse 4741 // the bits in each byte (we have an instruction, RBIT, to do 4742 // that) and keep the data in little-endian bit order throught the 4743 // calculation, bit-reversing the inputs and outputs. 4744 4745 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4746 __ align(wordSize * 2); 4747 address p = __ pc(); 4748 __ emit_int64(0x87); // The low-order bits of the field 4749 // polynomial (i.e. p = z^7+z^2+z+1) 4750 // repeated in the low and high parts of a 4751 // 128-bit vector 4752 __ emit_int64(0x87); 4753 4754 __ align(CodeEntryAlignment); 4755 address start = __ pc(); 4756 4757 Register state = c_rarg0; 4758 Register subkeyH = c_rarg1; 4759 Register data = c_rarg2; 4760 Register blocks = c_rarg3; 4761 4762 FloatRegister vzr = v30; 4763 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4764 4765 __ ldrq(v0, Address(state)); 4766 __ ldrq(v1, Address(subkeyH)); 4767 4768 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4769 __ rbit(v0, __ T16B, v0); 4770 __ rev64(v1, __ T16B, v1); 4771 __ rbit(v1, __ T16B, v1); 4772 4773 __ ldrq(v26, p); 4774 4775 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4776 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4777 4778 { 4779 Label L_ghash_loop; 4780 __ bind(L_ghash_loop); 4781 4782 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4783 // reversing each byte 4784 __ rbit(v2, __ T16B, v2); 4785 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4786 4787 // Multiply state in v2 by subkey in v1 4788 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4789 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4790 /*temps*/v6, v20, v18, v21); 4791 // Reduce v7:v5 by the field polynomial 4792 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4793 4794 __ sub(blocks, blocks, 1); 4795 __ cbnz(blocks, L_ghash_loop); 4796 } 4797 4798 // The bit-reversed result is at this point in v0 4799 __ rev64(v1, __ T16B, v0); 4800 __ rbit(v1, __ T16B, v1); 4801 4802 __ st1(v1, __ T16B, state); 4803 __ ret(lr); 4804 4805 return start; 4806 } 4807 4808 // Continuation point for throwing of implicit exceptions that are 4809 // not handled in the current activation. Fabricates an exception 4810 // oop and initiates normal exception dispatching in this 4811 // frame. Since we need to preserve callee-saved values (currently 4812 // only for C2, but done for C1 as well) we need a callee-saved oop 4813 // map and therefore have to make these stubs into RuntimeStubs 4814 // rather than BufferBlobs. If the compiler needs all registers to 4815 // be preserved between the fault point and the exception handler 4816 // then it must assume responsibility for that in 4817 // AbstractCompiler::continuation_for_implicit_null_exception or 4818 // continuation_for_implicit_division_by_zero_exception. All other 4819 // implicit exceptions (e.g., NullPointerException or 4820 // AbstractMethodError on entry) are either at call sites or 4821 // otherwise assume that stack unwinding will be initiated, so 4822 // caller saved registers were assumed volatile in the compiler. 4823 4824 #undef __ 4825 #define __ masm-> 4826 4827 address generate_throw_exception(const char* name, 4828 address runtime_entry, 4829 Register arg1 = noreg, 4830 Register arg2 = noreg) { 4831 // Information about frame layout at time of blocking runtime call. 4832 // Note that we only have to preserve callee-saved registers since 4833 // the compilers are responsible for supplying a continuation point 4834 // if they expect all registers to be preserved. 4835 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4836 enum layout { 4837 rfp_off = 0, 4838 rfp_off2, 4839 return_off, 4840 return_off2, 4841 framesize // inclusive of return address 4842 }; 4843 4844 int insts_size = 512; 4845 int locs_size = 64; 4846 4847 CodeBuffer code(name, insts_size, locs_size); 4848 OopMapSet* oop_maps = new OopMapSet(); 4849 MacroAssembler* masm = new MacroAssembler(&code); 4850 4851 address start = __ pc(); 4852 4853 // This is an inlined and slightly modified version of call_VM 4854 // which has the ability to fetch the return PC out of 4855 // thread-local storage and also sets up last_Java_sp slightly 4856 // differently than the real call_VM 4857 4858 __ enter(); // Save FP and LR before call 4859 4860 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4861 4862 // lr and fp are already in place 4863 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4864 4865 int frame_complete = __ pc() - start; 4866 4867 // Set up last_Java_sp and last_Java_fp 4868 address the_pc = __ pc(); 4869 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 4870 4871 // Call runtime 4872 if (arg1 != noreg) { 4873 assert(arg2 != c_rarg1, "clobbered"); 4874 __ mov(c_rarg1, arg1); 4875 } 4876 if (arg2 != noreg) { 4877 __ mov(c_rarg2, arg2); 4878 } 4879 __ mov(c_rarg0, rthread); 4880 BLOCK_COMMENT("call runtime_entry"); 4881 __ mov(rscratch1, runtime_entry); 4882 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4883 4884 // Generate oop map 4885 OopMap* map = new OopMap(framesize, 0); 4886 4887 oop_maps->add_gc_map(the_pc - start, map); 4888 4889 __ reset_last_Java_frame(true); 4890 __ maybe_isb(); 4891 4892 __ leave(); 4893 4894 // check for pending exceptions 4895 #ifdef ASSERT 4896 Label L; 4897 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4898 __ cbnz(rscratch1, L); 4899 __ should_not_reach_here(); 4900 __ bind(L); 4901 #endif // ASSERT 4902 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4903 4904 4905 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4906 RuntimeStub* stub = 4907 RuntimeStub::new_runtime_stub(name, 4908 &code, 4909 frame_complete, 4910 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4911 oop_maps, false); 4912 return stub->entry_point(); 4913 } 4914 4915 class MontgomeryMultiplyGenerator : public MacroAssembler { 4916 4917 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4918 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4919 4920 RegSet _toSave; 4921 bool _squaring; 4922 4923 public: 4924 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4925 : MacroAssembler(as->code()), _squaring(squaring) { 4926 4927 // Register allocation 4928 4929 Register reg = c_rarg0; 4930 Pa_base = reg; // Argument registers 4931 if (squaring) 4932 Pb_base = Pa_base; 4933 else 4934 Pb_base = ++reg; 4935 Pn_base = ++reg; 4936 Rlen= ++reg; 4937 inv = ++reg; 4938 Pm_base = ++reg; 4939 4940 // Working registers: 4941 Ra = ++reg; // The current digit of a, b, n, and m. 4942 Rb = ++reg; 4943 Rm = ++reg; 4944 Rn = ++reg; 4945 4946 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4947 Pb = ++reg; 4948 Pm = ++reg; 4949 Pn = ++reg; 4950 4951 t0 = ++reg; // Three registers which form a 4952 t1 = ++reg; // triple-precision accumuator. 4953 t2 = ++reg; 4954 4955 Ri = ++reg; // Inner and outer loop indexes. 4956 Rj = ++reg; 4957 4958 Rhi_ab = ++reg; // Product registers: low and high parts 4959 Rlo_ab = ++reg; // of a*b and m*n. 4960 Rhi_mn = ++reg; 4961 Rlo_mn = ++reg; 4962 4963 // r19 and up are callee-saved. 4964 _toSave = RegSet::range(r19, reg) + Pm_base; 4965 } 4966 4967 private: 4968 void save_regs() { 4969 push(_toSave, sp); 4970 } 4971 4972 void restore_regs() { 4973 pop(_toSave, sp); 4974 } 4975 4976 template <typename T> 4977 void unroll_2(Register count, T block) { 4978 Label loop, end, odd; 4979 tbnz(count, 0, odd); 4980 cbz(count, end); 4981 align(16); 4982 bind(loop); 4983 (this->*block)(); 4984 bind(odd); 4985 (this->*block)(); 4986 subs(count, count, 2); 4987 br(Assembler::GT, loop); 4988 bind(end); 4989 } 4990 4991 template <typename T> 4992 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4993 Label loop, end, odd; 4994 tbnz(count, 0, odd); 4995 cbz(count, end); 4996 align(16); 4997 bind(loop); 4998 (this->*block)(d, s, tmp); 4999 bind(odd); 5000 (this->*block)(d, s, tmp); 5001 subs(count, count, 2); 5002 br(Assembler::GT, loop); 5003 bind(end); 5004 } 5005 5006 void pre1(RegisterOrConstant i) { 5007 block_comment("pre1"); 5008 // Pa = Pa_base; 5009 // Pb = Pb_base + i; 5010 // Pm = Pm_base; 5011 // Pn = Pn_base + i; 5012 // Ra = *Pa; 5013 // Rb = *Pb; 5014 // Rm = *Pm; 5015 // Rn = *Pn; 5016 ldr(Ra, Address(Pa_base)); 5017 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 5018 ldr(Rm, Address(Pm_base)); 5019 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5020 lea(Pa, Address(Pa_base)); 5021 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 5022 lea(Pm, Address(Pm_base)); 5023 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5024 5025 // Zero the m*n result. 5026 mov(Rhi_mn, zr); 5027 mov(Rlo_mn, zr); 5028 } 5029 5030 // The core multiply-accumulate step of a Montgomery 5031 // multiplication. The idea is to schedule operations as a 5032 // pipeline so that instructions with long latencies (loads and 5033 // multiplies) have time to complete before their results are 5034 // used. This most benefits in-order implementations of the 5035 // architecture but out-of-order ones also benefit. 5036 void step() { 5037 block_comment("step"); 5038 // MACC(Ra, Rb, t0, t1, t2); 5039 // Ra = *++Pa; 5040 // Rb = *--Pb; 5041 umulh(Rhi_ab, Ra, Rb); 5042 mul(Rlo_ab, Ra, Rb); 5043 ldr(Ra, pre(Pa, wordSize)); 5044 ldr(Rb, pre(Pb, -wordSize)); 5045 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 5046 // previous iteration. 5047 // MACC(Rm, Rn, t0, t1, t2); 5048 // Rm = *++Pm; 5049 // Rn = *--Pn; 5050 umulh(Rhi_mn, Rm, Rn); 5051 mul(Rlo_mn, Rm, Rn); 5052 ldr(Rm, pre(Pm, wordSize)); 5053 ldr(Rn, pre(Pn, -wordSize)); 5054 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5055 } 5056 5057 void post1() { 5058 block_comment("post1"); 5059 5060 // MACC(Ra, Rb, t0, t1, t2); 5061 // Ra = *++Pa; 5062 // Rb = *--Pb; 5063 umulh(Rhi_ab, Ra, Rb); 5064 mul(Rlo_ab, Ra, Rb); 5065 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5066 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5067 5068 // *Pm = Rm = t0 * inv; 5069 mul(Rm, t0, inv); 5070 str(Rm, Address(Pm)); 5071 5072 // MACC(Rm, Rn, t0, t1, t2); 5073 // t0 = t1; t1 = t2; t2 = 0; 5074 umulh(Rhi_mn, Rm, Rn); 5075 5076 #ifndef PRODUCT 5077 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5078 { 5079 mul(Rlo_mn, Rm, Rn); 5080 add(Rlo_mn, t0, Rlo_mn); 5081 Label ok; 5082 cbz(Rlo_mn, ok); { 5083 stop("broken Montgomery multiply"); 5084 } bind(ok); 5085 } 5086 #endif 5087 // We have very carefully set things up so that 5088 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5089 // the lower half of Rm * Rn because we know the result already: 5090 // it must be -t0. t0 + (-t0) must generate a carry iff 5091 // t0 != 0. So, rather than do a mul and an adds we just set 5092 // the carry flag iff t0 is nonzero. 5093 // 5094 // mul(Rlo_mn, Rm, Rn); 5095 // adds(zr, t0, Rlo_mn); 5096 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5097 adcs(t0, t1, Rhi_mn); 5098 adc(t1, t2, zr); 5099 mov(t2, zr); 5100 } 5101 5102 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5103 block_comment("pre2"); 5104 // Pa = Pa_base + i-len; 5105 // Pb = Pb_base + len; 5106 // Pm = Pm_base + i-len; 5107 // Pn = Pn_base + len; 5108 5109 if (i.is_register()) { 5110 sub(Rj, i.as_register(), len); 5111 } else { 5112 mov(Rj, i.as_constant()); 5113 sub(Rj, Rj, len); 5114 } 5115 // Rj == i-len 5116 5117 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5118 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5119 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5120 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5121 5122 // Ra = *++Pa; 5123 // Rb = *--Pb; 5124 // Rm = *++Pm; 5125 // Rn = *--Pn; 5126 ldr(Ra, pre(Pa, wordSize)); 5127 ldr(Rb, pre(Pb, -wordSize)); 5128 ldr(Rm, pre(Pm, wordSize)); 5129 ldr(Rn, pre(Pn, -wordSize)); 5130 5131 mov(Rhi_mn, zr); 5132 mov(Rlo_mn, zr); 5133 } 5134 5135 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5136 block_comment("post2"); 5137 if (i.is_constant()) { 5138 mov(Rj, i.as_constant()-len.as_constant()); 5139 } else { 5140 sub(Rj, i.as_register(), len); 5141 } 5142 5143 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5144 5145 // As soon as we know the least significant digit of our result, 5146 // store it. 5147 // Pm_base[i-len] = t0; 5148 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5149 5150 // t0 = t1; t1 = t2; t2 = 0; 5151 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5152 adc(t1, t2, zr); 5153 mov(t2, zr); 5154 } 5155 5156 // A carry in t0 after Montgomery multiplication means that we 5157 // should subtract multiples of n from our result in m. We'll 5158 // keep doing that until there is no carry. 5159 void normalize(RegisterOrConstant len) { 5160 block_comment("normalize"); 5161 // while (t0) 5162 // t0 = sub(Pm_base, Pn_base, t0, len); 5163 Label loop, post, again; 5164 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5165 cbz(t0, post); { 5166 bind(again); { 5167 mov(i, zr); 5168 mov(cnt, len); 5169 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5170 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5171 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5172 align(16); 5173 bind(loop); { 5174 sbcs(Rm, Rm, Rn); 5175 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5176 add(i, i, 1); 5177 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5178 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5179 sub(cnt, cnt, 1); 5180 } cbnz(cnt, loop); 5181 sbc(t0, t0, zr); 5182 } cbnz(t0, again); 5183 } bind(post); 5184 } 5185 5186 // Move memory at s to d, reversing words. 5187 // Increments d to end of copied memory 5188 // Destroys tmp1, tmp2 5189 // Preserves len 5190 // Leaves s pointing to the address which was in d at start 5191 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5192 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5193 5194 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5195 mov(tmp1, len); 5196 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5197 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5198 } 5199 // where 5200 void reverse1(Register d, Register s, Register tmp) { 5201 ldr(tmp, pre(s, -wordSize)); 5202 ror(tmp, tmp, 32); 5203 str(tmp, post(d, wordSize)); 5204 } 5205 5206 void step_squaring() { 5207 // An extra ACC 5208 step(); 5209 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5210 } 5211 5212 void last_squaring(RegisterOrConstant i) { 5213 Label dont; 5214 // if ((i & 1) == 0) { 5215 tbnz(i.as_register(), 0, dont); { 5216 // MACC(Ra, Rb, t0, t1, t2); 5217 // Ra = *++Pa; 5218 // Rb = *--Pb; 5219 umulh(Rhi_ab, Ra, Rb); 5220 mul(Rlo_ab, Ra, Rb); 5221 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5222 } bind(dont); 5223 } 5224 5225 void extra_step_squaring() { 5226 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5227 5228 // MACC(Rm, Rn, t0, t1, t2); 5229 // Rm = *++Pm; 5230 // Rn = *--Pn; 5231 umulh(Rhi_mn, Rm, Rn); 5232 mul(Rlo_mn, Rm, Rn); 5233 ldr(Rm, pre(Pm, wordSize)); 5234 ldr(Rn, pre(Pn, -wordSize)); 5235 } 5236 5237 void post1_squaring() { 5238 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5239 5240 // *Pm = Rm = t0 * inv; 5241 mul(Rm, t0, inv); 5242 str(Rm, Address(Pm)); 5243 5244 // MACC(Rm, Rn, t0, t1, t2); 5245 // t0 = t1; t1 = t2; t2 = 0; 5246 umulh(Rhi_mn, Rm, Rn); 5247 5248 #ifndef PRODUCT 5249 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5250 { 5251 mul(Rlo_mn, Rm, Rn); 5252 add(Rlo_mn, t0, Rlo_mn); 5253 Label ok; 5254 cbz(Rlo_mn, ok); { 5255 stop("broken Montgomery multiply"); 5256 } bind(ok); 5257 } 5258 #endif 5259 // We have very carefully set things up so that 5260 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5261 // the lower half of Rm * Rn because we know the result already: 5262 // it must be -t0. t0 + (-t0) must generate a carry iff 5263 // t0 != 0. So, rather than do a mul and an adds we just set 5264 // the carry flag iff t0 is nonzero. 5265 // 5266 // mul(Rlo_mn, Rm, Rn); 5267 // adds(zr, t0, Rlo_mn); 5268 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5269 adcs(t0, t1, Rhi_mn); 5270 adc(t1, t2, zr); 5271 mov(t2, zr); 5272 } 5273 5274 void acc(Register Rhi, Register Rlo, 5275 Register t0, Register t1, Register t2) { 5276 adds(t0, t0, Rlo); 5277 adcs(t1, t1, Rhi); 5278 adc(t2, t2, zr); 5279 } 5280 5281 public: 5282 /** 5283 * Fast Montgomery multiplication. The derivation of the 5284 * algorithm is in A Cryptographic Library for the Motorola 5285 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5286 * 5287 * Arguments: 5288 * 5289 * Inputs for multiplication: 5290 * c_rarg0 - int array elements a 5291 * c_rarg1 - int array elements b 5292 * c_rarg2 - int array elements n (the modulus) 5293 * c_rarg3 - int length 5294 * c_rarg4 - int inv 5295 * c_rarg5 - int array elements m (the result) 5296 * 5297 * Inputs for squaring: 5298 * c_rarg0 - int array elements a 5299 * c_rarg1 - int array elements n (the modulus) 5300 * c_rarg2 - int length 5301 * c_rarg3 - int inv 5302 * c_rarg4 - int array elements m (the result) 5303 * 5304 */ 5305 address generate_multiply() { 5306 Label argh, nothing; 5307 bind(argh); 5308 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5309 5310 align(CodeEntryAlignment); 5311 address entry = pc(); 5312 5313 cbzw(Rlen, nothing); 5314 5315 enter(); 5316 5317 // Make room. 5318 cmpw(Rlen, 512); 5319 br(Assembler::HI, argh); 5320 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5321 andr(sp, Ra, -2 * wordSize); 5322 5323 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5324 5325 { 5326 // Copy input args, reversing as we go. We use Ra as a 5327 // temporary variable. 5328 reverse(Ra, Pa_base, Rlen, t0, t1); 5329 if (!_squaring) 5330 reverse(Ra, Pb_base, Rlen, t0, t1); 5331 reverse(Ra, Pn_base, Rlen, t0, t1); 5332 } 5333 5334 // Push all call-saved registers and also Pm_base which we'll need 5335 // at the end. 5336 save_regs(); 5337 5338 #ifndef PRODUCT 5339 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5340 { 5341 ldr(Rn, Address(Pn_base, 0)); 5342 mul(Rlo_mn, Rn, inv); 5343 cmp(Rlo_mn, -1); 5344 Label ok; 5345 br(EQ, ok); { 5346 stop("broken inverse in Montgomery multiply"); 5347 } bind(ok); 5348 } 5349 #endif 5350 5351 mov(Pm_base, Ra); 5352 5353 mov(t0, zr); 5354 mov(t1, zr); 5355 mov(t2, zr); 5356 5357 block_comment("for (int i = 0; i < len; i++) {"); 5358 mov(Ri, zr); { 5359 Label loop, end; 5360 cmpw(Ri, Rlen); 5361 br(Assembler::GE, end); 5362 5363 bind(loop); 5364 pre1(Ri); 5365 5366 block_comment(" for (j = i; j; j--) {"); { 5367 movw(Rj, Ri); 5368 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5369 } block_comment(" } // j"); 5370 5371 post1(); 5372 addw(Ri, Ri, 1); 5373 cmpw(Ri, Rlen); 5374 br(Assembler::LT, loop); 5375 bind(end); 5376 block_comment("} // i"); 5377 } 5378 5379 block_comment("for (int i = len; i < 2*len; i++) {"); 5380 mov(Ri, Rlen); { 5381 Label loop, end; 5382 cmpw(Ri, Rlen, Assembler::LSL, 1); 5383 br(Assembler::GE, end); 5384 5385 bind(loop); 5386 pre2(Ri, Rlen); 5387 5388 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5389 lslw(Rj, Rlen, 1); 5390 subw(Rj, Rj, Ri); 5391 subw(Rj, Rj, 1); 5392 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5393 } block_comment(" } // j"); 5394 5395 post2(Ri, Rlen); 5396 addw(Ri, Ri, 1); 5397 cmpw(Ri, Rlen, Assembler::LSL, 1); 5398 br(Assembler::LT, loop); 5399 bind(end); 5400 } 5401 block_comment("} // i"); 5402 5403 normalize(Rlen); 5404 5405 mov(Ra, Pm_base); // Save Pm_base in Ra 5406 restore_regs(); // Restore caller's Pm_base 5407 5408 // Copy our result into caller's Pm_base 5409 reverse(Pm_base, Ra, Rlen, t0, t1); 5410 5411 leave(); 5412 bind(nothing); 5413 ret(lr); 5414 5415 return entry; 5416 } 5417 // In C, approximately: 5418 5419 // void 5420 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5421 // unsigned long Pn_base[], unsigned long Pm_base[], 5422 // unsigned long inv, int len) { 5423 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5424 // unsigned long *Pa, *Pb, *Pn, *Pm; 5425 // unsigned long Ra, Rb, Rn, Rm; 5426 5427 // int i; 5428 5429 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5430 5431 // for (i = 0; i < len; i++) { 5432 // int j; 5433 5434 // Pa = Pa_base; 5435 // Pb = Pb_base + i; 5436 // Pm = Pm_base; 5437 // Pn = Pn_base + i; 5438 5439 // Ra = *Pa; 5440 // Rb = *Pb; 5441 // Rm = *Pm; 5442 // Rn = *Pn; 5443 5444 // int iters = i; 5445 // for (j = 0; iters--; j++) { 5446 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5447 // MACC(Ra, Rb, t0, t1, t2); 5448 // Ra = *++Pa; 5449 // Rb = *--Pb; 5450 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5451 // MACC(Rm, Rn, t0, t1, t2); 5452 // Rm = *++Pm; 5453 // Rn = *--Pn; 5454 // } 5455 5456 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5457 // MACC(Ra, Rb, t0, t1, t2); 5458 // *Pm = Rm = t0 * inv; 5459 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5460 // MACC(Rm, Rn, t0, t1, t2); 5461 5462 // assert(t0 == 0, "broken Montgomery multiply"); 5463 5464 // t0 = t1; t1 = t2; t2 = 0; 5465 // } 5466 5467 // for (i = len; i < 2*len; i++) { 5468 // int j; 5469 5470 // Pa = Pa_base + i-len; 5471 // Pb = Pb_base + len; 5472 // Pm = Pm_base + i-len; 5473 // Pn = Pn_base + len; 5474 5475 // Ra = *++Pa; 5476 // Rb = *--Pb; 5477 // Rm = *++Pm; 5478 // Rn = *--Pn; 5479 5480 // int iters = len*2-i-1; 5481 // for (j = i-len+1; iters--; j++) { 5482 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5483 // MACC(Ra, Rb, t0, t1, t2); 5484 // Ra = *++Pa; 5485 // Rb = *--Pb; 5486 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5487 // MACC(Rm, Rn, t0, t1, t2); 5488 // Rm = *++Pm; 5489 // Rn = *--Pn; 5490 // } 5491 5492 // Pm_base[i-len] = t0; 5493 // t0 = t1; t1 = t2; t2 = 0; 5494 // } 5495 5496 // while (t0) 5497 // t0 = sub(Pm_base, Pn_base, t0, len); 5498 // } 5499 5500 /** 5501 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5502 * multiplies than Montgomery multiplication so it should be up to 5503 * 25% faster. However, its loop control is more complex and it 5504 * may actually run slower on some machines. 5505 * 5506 * Arguments: 5507 * 5508 * Inputs: 5509 * c_rarg0 - int array elements a 5510 * c_rarg1 - int array elements n (the modulus) 5511 * c_rarg2 - int length 5512 * c_rarg3 - int inv 5513 * c_rarg4 - int array elements m (the result) 5514 * 5515 */ 5516 address generate_square() { 5517 Label argh; 5518 bind(argh); 5519 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5520 5521 align(CodeEntryAlignment); 5522 address entry = pc(); 5523 5524 enter(); 5525 5526 // Make room. 5527 cmpw(Rlen, 512); 5528 br(Assembler::HI, argh); 5529 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5530 andr(sp, Ra, -2 * wordSize); 5531 5532 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5533 5534 { 5535 // Copy input args, reversing as we go. We use Ra as a 5536 // temporary variable. 5537 reverse(Ra, Pa_base, Rlen, t0, t1); 5538 reverse(Ra, Pn_base, Rlen, t0, t1); 5539 } 5540 5541 // Push all call-saved registers and also Pm_base which we'll need 5542 // at the end. 5543 save_regs(); 5544 5545 mov(Pm_base, Ra); 5546 5547 mov(t0, zr); 5548 mov(t1, zr); 5549 mov(t2, zr); 5550 5551 block_comment("for (int i = 0; i < len; i++) {"); 5552 mov(Ri, zr); { 5553 Label loop, end; 5554 bind(loop); 5555 cmp(Ri, Rlen); 5556 br(Assembler::GE, end); 5557 5558 pre1(Ri); 5559 5560 block_comment("for (j = (i+1)/2; j; j--) {"); { 5561 add(Rj, Ri, 1); 5562 lsr(Rj, Rj, 1); 5563 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5564 } block_comment(" } // j"); 5565 5566 last_squaring(Ri); 5567 5568 block_comment(" for (j = i/2; j; j--) {"); { 5569 lsr(Rj, Ri, 1); 5570 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5571 } block_comment(" } // j"); 5572 5573 post1_squaring(); 5574 add(Ri, Ri, 1); 5575 cmp(Ri, Rlen); 5576 br(Assembler::LT, loop); 5577 5578 bind(end); 5579 block_comment("} // i"); 5580 } 5581 5582 block_comment("for (int i = len; i < 2*len; i++) {"); 5583 mov(Ri, Rlen); { 5584 Label loop, end; 5585 bind(loop); 5586 cmp(Ri, Rlen, Assembler::LSL, 1); 5587 br(Assembler::GE, end); 5588 5589 pre2(Ri, Rlen); 5590 5591 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5592 lsl(Rj, Rlen, 1); 5593 sub(Rj, Rj, Ri); 5594 sub(Rj, Rj, 1); 5595 lsr(Rj, Rj, 1); 5596 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5597 } block_comment(" } // j"); 5598 5599 last_squaring(Ri); 5600 5601 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5602 lsl(Rj, Rlen, 1); 5603 sub(Rj, Rj, Ri); 5604 lsr(Rj, Rj, 1); 5605 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5606 } block_comment(" } // j"); 5607 5608 post2(Ri, Rlen); 5609 add(Ri, Ri, 1); 5610 cmp(Ri, Rlen, Assembler::LSL, 1); 5611 5612 br(Assembler::LT, loop); 5613 bind(end); 5614 block_comment("} // i"); 5615 } 5616 5617 normalize(Rlen); 5618 5619 mov(Ra, Pm_base); // Save Pm_base in Ra 5620 restore_regs(); // Restore caller's Pm_base 5621 5622 // Copy our result into caller's Pm_base 5623 reverse(Pm_base, Ra, Rlen, t0, t1); 5624 5625 leave(); 5626 ret(lr); 5627 5628 return entry; 5629 } 5630 // In C, approximately: 5631 5632 // void 5633 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 5634 // unsigned long Pm_base[], unsigned long inv, int len) { 5635 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5636 // unsigned long *Pa, *Pb, *Pn, *Pm; 5637 // unsigned long Ra, Rb, Rn, Rm; 5638 5639 // int i; 5640 5641 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5642 5643 // for (i = 0; i < len; i++) { 5644 // int j; 5645 5646 // Pa = Pa_base; 5647 // Pb = Pa_base + i; 5648 // Pm = Pm_base; 5649 // Pn = Pn_base + i; 5650 5651 // Ra = *Pa; 5652 // Rb = *Pb; 5653 // Rm = *Pm; 5654 // Rn = *Pn; 5655 5656 // int iters = (i+1)/2; 5657 // for (j = 0; iters--; j++) { 5658 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5659 // MACC2(Ra, Rb, t0, t1, t2); 5660 // Ra = *++Pa; 5661 // Rb = *--Pb; 5662 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5663 // MACC(Rm, Rn, t0, t1, t2); 5664 // Rm = *++Pm; 5665 // Rn = *--Pn; 5666 // } 5667 // if ((i & 1) == 0) { 5668 // assert(Ra == Pa_base[j], "must be"); 5669 // MACC(Ra, Ra, t0, t1, t2); 5670 // } 5671 // iters = i/2; 5672 // assert(iters == i-j, "must be"); 5673 // for (; iters--; j++) { 5674 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5675 // MACC(Rm, Rn, t0, t1, t2); 5676 // Rm = *++Pm; 5677 // Rn = *--Pn; 5678 // } 5679 5680 // *Pm = Rm = t0 * inv; 5681 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5682 // MACC(Rm, Rn, t0, t1, t2); 5683 5684 // assert(t0 == 0, "broken Montgomery multiply"); 5685 5686 // t0 = t1; t1 = t2; t2 = 0; 5687 // } 5688 5689 // for (i = len; i < 2*len; i++) { 5690 // int start = i-len+1; 5691 // int end = start + (len - start)/2; 5692 // int j; 5693 5694 // Pa = Pa_base + i-len; 5695 // Pb = Pa_base + len; 5696 // Pm = Pm_base + i-len; 5697 // Pn = Pn_base + len; 5698 5699 // Ra = *++Pa; 5700 // Rb = *--Pb; 5701 // Rm = *++Pm; 5702 // Rn = *--Pn; 5703 5704 // int iters = (2*len-i-1)/2; 5705 // assert(iters == end-start, "must be"); 5706 // for (j = start; iters--; j++) { 5707 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5708 // MACC2(Ra, Rb, t0, t1, t2); 5709 // Ra = *++Pa; 5710 // Rb = *--Pb; 5711 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5712 // MACC(Rm, Rn, t0, t1, t2); 5713 // Rm = *++Pm; 5714 // Rn = *--Pn; 5715 // } 5716 // if ((i & 1) == 0) { 5717 // assert(Ra == Pa_base[j], "must be"); 5718 // MACC(Ra, Ra, t0, t1, t2); 5719 // } 5720 // iters = (2*len-i)/2; 5721 // assert(iters == len-j, "must be"); 5722 // for (; iters--; j++) { 5723 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5724 // MACC(Rm, Rn, t0, t1, t2); 5725 // Rm = *++Pm; 5726 // Rn = *--Pn; 5727 // } 5728 // Pm_base[i-len] = t0; 5729 // t0 = t1; t1 = t2; t2 = 0; 5730 // } 5731 5732 // while (t0) 5733 // t0 = sub(Pm_base, Pn_base, t0, len); 5734 // } 5735 }; 5736 5737 5738 // Initialization 5739 void generate_initial() { 5740 // Generate initial stubs and initializes the entry points 5741 5742 // entry points that exist in all platforms Note: This is code 5743 // that could be shared among different platforms - however the 5744 // benefit seems to be smaller than the disadvantage of having a 5745 // much more complicated generator structure. See also comment in 5746 // stubRoutines.hpp. 5747 5748 StubRoutines::_forward_exception_entry = generate_forward_exception(); 5749 5750 StubRoutines::_call_stub_entry = 5751 generate_call_stub(StubRoutines::_call_stub_return_address); 5752 5753 // is referenced by megamorphic call 5754 StubRoutines::_catch_exception_entry = generate_catch_exception(); 5755 5756 // Build this early so it's available for the interpreter. 5757 StubRoutines::_throw_StackOverflowError_entry = 5758 generate_throw_exception("StackOverflowError throw_exception", 5759 CAST_FROM_FN_PTR(address, 5760 SharedRuntime::throw_StackOverflowError)); 5761 StubRoutines::_throw_delayed_StackOverflowError_entry = 5762 generate_throw_exception("delayed StackOverflowError throw_exception", 5763 CAST_FROM_FN_PTR(address, 5764 SharedRuntime::throw_delayed_StackOverflowError)); 5765 if (UseCRC32Intrinsics) { 5766 // set table address before stub generation which use it 5767 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 5768 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 5769 } 5770 5771 if (UseCRC32CIntrinsics) { 5772 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 5773 } 5774 5775 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 5776 StubRoutines::_dlog = generate_dlog(); 5777 } 5778 5779 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 5780 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 5781 } 5782 5783 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 5784 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 5785 } 5786 } 5787 5788 void generate_all() { 5789 // support for verify_oop (must happen after universe_init) 5790 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5791 StubRoutines::_throw_AbstractMethodError_entry = 5792 generate_throw_exception("AbstractMethodError throw_exception", 5793 CAST_FROM_FN_PTR(address, 5794 SharedRuntime:: 5795 throw_AbstractMethodError)); 5796 5797 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5798 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5799 CAST_FROM_FN_PTR(address, 5800 SharedRuntime:: 5801 throw_IncompatibleClassChangeError)); 5802 5803 StubRoutines::_throw_NullPointerException_at_call_entry = 5804 generate_throw_exception("NullPointerException at call throw_exception", 5805 CAST_FROM_FN_PTR(address, 5806 SharedRuntime:: 5807 throw_NullPointerException_at_call)); 5808 5809 // arraycopy stubs used by compilers 5810 generate_arraycopy_stubs(); 5811 5812 // has negatives stub for large arrays. 5813 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5814 5815 // array equals stub for large arrays. 5816 if (!UseSimpleArrayEquals) { 5817 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5818 } 5819 5820 generate_compare_long_strings(); 5821 5822 generate_string_indexof_stubs(); 5823 5824 // byte_array_inflate stub for large arrays. 5825 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5826 5827 if (UseMultiplyToLenIntrinsic) { 5828 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5829 } 5830 5831 if (UseSquareToLenIntrinsic) { 5832 StubRoutines::_squareToLen = generate_squareToLen(); 5833 } 5834 5835 if (UseMulAddIntrinsic) { 5836 StubRoutines::_mulAdd = generate_mulAdd(); 5837 } 5838 5839 if (UseMontgomeryMultiplyIntrinsic) { 5840 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5841 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5842 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5843 } 5844 5845 if (UseMontgomerySquareIntrinsic) { 5846 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5847 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5848 // We use generate_multiply() rather than generate_square() 5849 // because it's faster for the sizes of modulus we care about. 5850 StubRoutines::_montgomerySquare = g.generate_multiply(); 5851 } 5852 5853 #ifndef BUILTIN_SIM 5854 // generate GHASH intrinsics code 5855 if (UseGHASHIntrinsics) { 5856 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5857 } 5858 5859 // data cache line writeback 5860 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 5861 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 5862 5863 if (UseAESIntrinsics) { 5864 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5865 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5866 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5867 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5868 } 5869 5870 if (UseSHA1Intrinsics) { 5871 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5872 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5873 } 5874 if (UseSHA256Intrinsics) { 5875 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5876 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5877 } 5878 5879 // generate Adler32 intrinsics code 5880 if (UseAdler32Intrinsics) { 5881 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5882 } 5883 5884 // Safefetch stubs. 5885 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5886 &StubRoutines::_safefetch32_fault_pc, 5887 &StubRoutines::_safefetch32_continuation_pc); 5888 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5889 &StubRoutines::_safefetchN_fault_pc, 5890 &StubRoutines::_safefetchN_continuation_pc); 5891 #endif 5892 StubRoutines::aarch64::set_completed(); 5893 } 5894 5895 public: 5896 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5897 if (all) { 5898 generate_all(); 5899 } else { 5900 generate_initial(); 5901 } 5902 } 5903 }; // end class declaration 5904 5905 void StubGenerator_generate(CodeBuffer* code, bool all) { 5906 StubGenerator g(code, all); 5907 }