/*
 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19,  r20_save);
    __ stp(r22, r21,  r22_save);
    __ stp(r24, r23,  r24_save);
    __ stp(r26, r25,  r26_save);
    __ stp(r28, r27,  r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27,  r28_save);
    __ ldp(r26, r25,  r26_save);
    __ ldp(r24, r23,  r24_save);
    __ ldp(r22, r21,  r22_save);
    __ ldp(r20, r19,  r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.
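  //
  // A rough sketch of the flow, as read from the code below: the
  // exception oop arriving in r0 is recorded as the thread's pending
  // exception (together with a file/line marker for diagnostics) and
  // control then rejoins the call stub at _call_stub_return_address,
  // so the normal register-restore and return sequence runs with the
  // exception pending.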

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
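      //
      // As a concrete check of the scheme above (forward copy, so
      // unit == 1 long): after biasing s by -16 bytes and d by -8
      // bytes, the loads at {2, 4, 6, 8} * unit land on source bytes
      // {0, 16, 32, 48} of the original block, and the str/stp stores
      // at {1, 2, 4, 6, 8} * unit land on destination bytes
      // {0, 8, 24, 40, 56}, i.e. word offsets {0, 1, 3, 5, 7}.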

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
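    //
    // Each tbz below tests one bit of the residual count, scaled by the
    // copy granularity: with byte-sized elements bit 3 of count selects an
    // 8-byte chunk, bit 2 a 4-byte chunk, bit 1 a 2-byte chunk and bit 0 a
    // single byte, so at most one load/store is emitted per power of two.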

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
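          //
          // Spelling the cases out: count == 1 copies only byte 0 (all
          // three loads hit the same byte); count == 2 copies byte 0,
          // the last byte (offset 1) and byte count/2 == 1, which is
          // again the last byte; count == 3 copies bytes 0, 2 and 1.
          // Redundant stores of the same value are harmless.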
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift) __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
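  // n.b. in the compressed-oops case the narrow oop is loaded into r16,
  // so this helper assumes callers pass r16 as temp (both call sites in
  // this file do).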
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
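  //
  // n.b. this and the short/int/long/oop variants below are thin wrappers:
  // each simply calls generate_disjoint_copy or generate_conjoint_copy with
  // the appropriate element size (and, for the oop variants, is_oop set).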
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);

  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // element count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig element count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
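    //
    // The copy loop below is rotated (its entry is L_load_element):
    // every element is loaded, type-checked against ckval via
    // generate_type_check and only then stored, so on a failure the
    // destination holds exactly the prefix of elements that passed and
    // the stub reports how many were transferred.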
1750 1751 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1752 copied_oop, r19_klass, count_save); 1753 1754 __ align(CodeEntryAlignment); 1755 StubCodeMark mark(this, "StubRoutines", name); 1756 address start = __ pc(); 1757 1758 __ enter(); // required for proper stackwalking of RuntimeStub frame 1759 1760 #ifdef ASSERT 1761 // caller guarantees that the arrays really are different 1762 // otherwise, we would have to make conjoint checks 1763 { Label L; 1764 array_overlap_test(L, TIMES_OOP); 1765 __ stop("checkcast_copy within a single array"); 1766 __ bind(L); 1767 } 1768 #endif //ASSERT 1769 1770 // Caller of this entry point must set up the argument registers. 1771 if (entry != NULL) { 1772 *entry = __ pc(); 1773 BLOCK_COMMENT("Entry:"); 1774 } 1775 1776 // Empty array: Nothing to do. 1777 __ cbz(count, L_done); 1778 1779 __ push(RegSet::of(r18, r19, r20, r21), sp); 1780 1781 #ifdef ASSERT 1782 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1783 // The ckoff and ckval must be mutually consistent, 1784 // even though caller generates both. 1785 { Label L; 1786 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1787 __ ldrw(start_to, Address(ckval, sco_offset)); 1788 __ cmpw(ckoff, start_to); 1789 __ br(Assembler::EQ, L); 1790 __ stop("super_check_offset inconsistent"); 1791 __ bind(L); 1792 } 1793 #endif //ASSERT 1794 1795 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 1796 bool is_oop = true; 1797 if (dest_uninitialized) { 1798 decorators |= IS_DEST_UNINITIALIZED; 1799 } 1800 1801 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1802 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1803 1804 // save the original count 1805 __ mov(count_save, count); 1806 1807 // Copy from low to high addresses 1808 __ mov(start_to, to); // Save destination array start address 1809 __ b(L_load_element); 1810 1811 // ======== begin loop ======== 1812 // (Loop is rotated; its entry is L_load_element.) 1813 // Loop control: 1814 // for (; count != 0; count--) { 1815 // copied_oop = load_heap_oop(from++); 1816 // ... generate_type_check ...; 1817 // store_heap_oop(to++, copied_oop); 1818 // } 1819 __ align(OptoLoopAlignment); 1820 1821 __ BIND(L_store_element); 1822 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1823 __ sub(count, count, 1); 1824 __ cbz(count, L_do_card_marks); 1825 1826 // ======== loop entry is here ======== 1827 __ BIND(L_load_element); 1828 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1829 __ cbz(copied_oop, L_store_element); 1830 1831 __ load_klass(r19_klass, copied_oop);// query the object klass 1832 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1833 // ======== end loop ======== 1834 1835 // It was a real error; we must depend on the caller to finish the job. 1836 // Register count = remaining oops, count_orig = total oops. 1837 // Emit GC store barriers for the oops we have copied and report 1838 // their number to the caller. 
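    //
    // In C terms the next two instructions compute (illustrative only):
    //   K  = count_save - count;   // oops successfully copied before the failure
    //   r0 = ~K;                   // i.e. -1^K, as documented in the stub header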
1839 1840 __ subs(count, count_save, count); // K = partially copied oop count 1841 __ eon(count, count, zr); // report (-1^K) to caller 1842 __ br(Assembler::EQ, L_done_pop); 1843 1844 __ BIND(L_do_card_marks); 1845 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1846 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs); 1847 1848 __ bind(L_done_pop); 1849 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1850 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1851 1852 __ bind(L_done); 1853 __ mov(r0, count); 1854 __ leave(); 1855 __ ret(lr); 1856 1857 return start; 1858 } 1859 1860 // Perform range checks on the proposed arraycopy. 1861 // Kills temp, but nothing else. 1862 // Also, clean the sign bits of src_pos and dst_pos. 1863 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1864 Register src_pos, // source position (c_rarg1) 1865 Register dst, // destination array oo (c_rarg2) 1866 Register dst_pos, // destination position (c_rarg3) 1867 Register length, 1868 Register temp, 1869 Label& L_failed) { 1870 BLOCK_COMMENT("arraycopy_range_checks:"); 1871 1872 assert_different_registers(rscratch1, temp); 1873 1874 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1875 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1876 __ addw(temp, length, src_pos); 1877 __ cmpw(temp, rscratch1); 1878 __ br(Assembler::HI, L_failed); 1879 1880 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1881 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1882 __ addw(temp, length, dst_pos); 1883 __ cmpw(temp, rscratch1); 1884 __ br(Assembler::HI, L_failed); 1885 1886 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1887 __ movw(src_pos, src_pos); 1888 __ movw(dst_pos, dst_pos); 1889 1890 BLOCK_COMMENT("arraycopy_range_checks done"); 1891 } 1892 1893 // These stubs get called from some dumb test routine. 1894 // I'll write them properly when they're called from 1895 // something that's actually doing something. 1896 static void fake_arraycopy_stub(address src, address dst, int count) { 1897 assert(count == 0, "huh?"); 1898 } 1899 1900 1901 // 1902 // Generate 'unsafe' array copy stub 1903 // Though just as safe as the other stubs, it takes an unscaled 1904 // size_t argument instead of an element count. 1905 // 1906 // Input: 1907 // c_rarg0 - source array address 1908 // c_rarg1 - destination array address 1909 // c_rarg2 - byte count, treated as ssize_t, can be zero 1910 // 1911 // Examines the alignment of the operands and dispatches 1912 // to a long, int, short, or byte copy loop. 
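  //
  // Illustrative C model of that dispatch (copy_bytes() and friends are
  // hypothetical names standing in for the byte/short/int/long copy entry
  // points passed to this stub):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)size;
  //   if      ((bits & (BytesPerLong  - 1)) == 0) copy_longs (s, d, size >> LogBytesPerLong);
  //   else if ((bits & (BytesPerInt   - 1)) == 0) copy_ints  (s, d, size >> LogBytesPerInt);
  //   else if ((bits & (BytesPerShort - 1)) == 0) copy_shorts(s, d, size >> LogBytesPerShort);
  //   else                                        copy_bytes (s, d, size);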
1913 // 1914 address generate_unsafe_copy(const char *name, 1915 address byte_copy_entry, 1916 address short_copy_entry, 1917 address int_copy_entry, 1918 address long_copy_entry) { 1919 Label L_long_aligned, L_int_aligned, L_short_aligned; 1920 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1921 1922 __ align(CodeEntryAlignment); 1923 StubCodeMark mark(this, "StubRoutines", name); 1924 address start = __ pc(); 1925 __ enter(); // required for proper stackwalking of RuntimeStub frame 1926 1927 // bump this on entry, not on exit: 1928 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1929 1930 __ orr(rscratch1, s, d); 1931 __ orr(rscratch1, rscratch1, count); 1932 1933 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1934 __ cbz(rscratch1, L_long_aligned); 1935 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1936 __ cbz(rscratch1, L_int_aligned); 1937 __ tbz(rscratch1, 0, L_short_aligned); 1938 __ b(RuntimeAddress(byte_copy_entry)); 1939 1940 __ BIND(L_short_aligned); 1941 __ lsr(count, count, LogBytesPerShort); // size => short_count 1942 __ b(RuntimeAddress(short_copy_entry)); 1943 __ BIND(L_int_aligned); 1944 __ lsr(count, count, LogBytesPerInt); // size => int_count 1945 __ b(RuntimeAddress(int_copy_entry)); 1946 __ BIND(L_long_aligned); 1947 __ lsr(count, count, LogBytesPerLong); // size => long_count 1948 __ b(RuntimeAddress(long_copy_entry)); 1949 1950 return start; 1951 } 1952 1953 // 1954 // Generate generic array copy stubs 1955 // 1956 // Input: 1957 // c_rarg0 - src oop 1958 // c_rarg1 - src_pos (32-bits) 1959 // c_rarg2 - dst oop 1960 // c_rarg3 - dst_pos (32-bits) 1961 // c_rarg4 - element count (32-bits) 1962 // 1963 // Output: 1964 // r0 == 0 - success 1965 // r0 == -1^K - failure, where K is partial transfer count 1966 // 1967 address generate_generic_copy(const char *name, 1968 address byte_copy_entry, address short_copy_entry, 1969 address int_copy_entry, address oop_copy_entry, 1970 address long_copy_entry, address checkcast_copy_entry) { 1971 1972 Label L_failed, L_objArray; 1973 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1974 1975 // Input registers 1976 const Register src = c_rarg0; // source array oop 1977 const Register src_pos = c_rarg1; // source position 1978 const Register dst = c_rarg2; // destination array oop 1979 const Register dst_pos = c_rarg3; // destination position 1980 const Register length = c_rarg4; 1981 1982 __ align(CodeEntryAlignment); 1983 1984 StubCodeMark mark(this, "StubRoutines", name); 1985 1986 address start = __ pc(); 1987 1988 __ enter(); // required for proper stackwalking of RuntimeStub frame 1989 1990 // bump this on entry, not on exit: 1991 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1992 1993 //----------------------------------------------------------------------- 1994 // Assembler stub will be used for this call to arraycopy 1995 // if the following conditions are met: 1996 // 1997 // (1) src and dst must not be null. 1998 // (2) src_pos must not be negative. 1999 // (3) dst_pos must not be negative. 2000 // (4) length must not be negative. 2001 // (5) src klass and dst klass should be the same and not NULL. 2002 // (6) src and dst should be arrays. 2003 // (7) src_pos + length must not exceed length of src. 2004 // (8) dst_pos + length must not exceed length of dst. 2005 // 2006 2007 // if (src == NULL) return -1; 2008 __ cbz(src, L_failed); 2009 2010 // if (src_pos < 0) return -1; 2011 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2012 2013 // if (dst == NULL) return -1; 2014 __ cbz(dst, L_failed); 2015 2016 // if (dst_pos < 0) return -1; 2017 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2018 2019 // registers used as temp 2020 const Register scratch_length = r16; // elements count to copy 2021 const Register scratch_src_klass = r17; // array klass 2022 const Register lh = r18; // layout helper 2023 2024 // if (length < 0) return -1; 2025 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2026 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2027 2028 __ load_klass(scratch_src_klass, src); 2029 #ifdef ASSERT 2030 // assert(src->klass() != NULL); 2031 { 2032 BLOCK_COMMENT("assert klasses not null {"); 2033 Label L1, L2; 2034 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2035 __ bind(L1); 2036 __ stop("broken null klass"); 2037 __ bind(L2); 2038 __ load_klass(rscratch1, dst); 2039 __ cbz(rscratch1, L1); // this would be broken also 2040 BLOCK_COMMENT("} assert klasses not null done"); 2041 } 2042 #endif 2043 2044 // Load layout helper (32-bits) 2045 // 2046 // |array_tag| | header_size | element_type | |log2_element_size| 2047 // 32 30 24 16 8 2 0 2048 // 2049 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2050 // 2051 2052 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2053 2054 // Handle objArrays completely differently... 2055 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2056 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2057 __ movw(rscratch1, objArray_lh); 2058 __ eorw(rscratch2, lh, rscratch1); 2059 __ cbzw(rscratch2, L_objArray); 2060 2061 // if (src->klass() != dst->klass()) return -1; 2062 __ load_klass(rscratch2, dst); 2063 __ eor(rscratch2, rscratch2, scratch_src_klass); 2064 __ cbnz(rscratch2, L_failed); 2065 2066 // if (!src->is_Array()) return -1; 2067 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2068 2069 // At this point, it is known to be a typeArray (array_tag 0x3). 2070 #ifdef ASSERT 2071 { 2072 BLOCK_COMMENT("assert primitive array {"); 2073 Label L; 2074 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2075 __ cmpw(lh, rscratch2); 2076 __ br(Assembler::GE, L); 2077 __ stop("must be a primitive array"); 2078 __ bind(L); 2079 BLOCK_COMMENT("} assert primitive array done"); 2080 } 2081 #endif 2082 2083 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2084 rscratch2, L_failed); 2085 2086 // TypeArrayKlass 2087 // 2088 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2089 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2090 // 2091 2092 const Register rscratch1_offset = rscratch1; // array offset 2093 const Register r18_elsize = lh; // element size 2094 2095 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2096 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2097 __ add(src, src, rscratch1_offset); // src array offset 2098 __ add(dst, dst, rscratch1_offset); // dst array offset 2099 BLOCK_COMMENT("choose copy loop based on element size"); 2100 2101 // next registers should be set before the jump to corresponding stub 2102 const Register from = c_rarg0; // source array address 2103 const Register to = c_rarg1; // destination array address 2104 const Register count = c_rarg2; // elements count 2105 2106 // 'from', 'to', 'count' registers should be set in such order 2107 // since they are the same as 'src', 'src_pos', 'dst'. 
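    //
    // Illustrative C view of the element-size dispatch that follows: for a
    // primitive array the low bits of lh hold log2(element size), in the
    // range 0-3, so the bitwise binary search below amounts to (sketch only):
    //
    //   switch (lh & 3) {
    //     case 0: goto byte_copy;    // 1-byte elements
    //     case 1: goto short_copy;   // 2-byte elements
    //     case 2: goto int_copy;     // 4-byte elements
    //     case 3: goto long_copy;    // 8-byte elements
    //   }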
2108 2109 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2110 2111 // The possible values of elsize are 0-3, i.e. exact_log2(element 2112 // size in bytes). We do a simple bitwise binary search. 2113 __ BIND(L_copy_bytes); 2114 __ tbnz(r18_elsize, 1, L_copy_ints); 2115 __ tbnz(r18_elsize, 0, L_copy_shorts); 2116 __ lea(from, Address(src, src_pos));// src_addr 2117 __ lea(to, Address(dst, dst_pos));// dst_addr 2118 __ movw(count, scratch_length); // length 2119 __ b(RuntimeAddress(byte_copy_entry)); 2120 2121 __ BIND(L_copy_shorts); 2122 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2123 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2124 __ movw(count, scratch_length); // length 2125 __ b(RuntimeAddress(short_copy_entry)); 2126 2127 __ BIND(L_copy_ints); 2128 __ tbnz(r18_elsize, 0, L_copy_longs); 2129 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2130 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2131 __ movw(count, scratch_length); // length 2132 __ b(RuntimeAddress(int_copy_entry)); 2133 2134 __ BIND(L_copy_longs); 2135 #ifdef ASSERT 2136 { 2137 BLOCK_COMMENT("assert long copy {"); 2138 Label L; 2139 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2140 __ cmpw(r18_elsize, LogBytesPerLong); 2141 __ br(Assembler::EQ, L); 2142 __ stop("must be long copy, but elsize is wrong"); 2143 __ bind(L); 2144 BLOCK_COMMENT("} assert long copy done"); 2145 } 2146 #endif 2147 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2148 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2149 __ movw(count, scratch_length); // length 2150 __ b(RuntimeAddress(long_copy_entry)); 2151 2152 // ObjArrayKlass 2153 __ BIND(L_objArray); 2154 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2155 2156 Label L_plain_copy, L_checkcast_copy; 2157 // test array classes for subtyping 2158 __ load_klass(r18, dst); 2159 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2160 __ br(Assembler::NE, L_checkcast_copy); 2161 2162 // Identically typed arrays can be copied without element-wise checks. 2163 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2164 rscratch2, L_failed); 2165 2166 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2167 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2168 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2169 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2170 __ movw(count, scratch_length); // length 2171 __ BIND(L_plain_copy); 2172 __ b(RuntimeAddress(oop_copy_entry)); 2173 2174 __ BIND(L_checkcast_copy); 2175 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2176 { 2177 // Before looking at dst.length, make sure dst is also an objArray. 2178 __ ldrw(rscratch1, Address(r18, lh_offset)); 2179 __ movw(rscratch2, objArray_lh); 2180 __ eorw(rscratch1, rscratch1, rscratch2); 2181 __ cbnzw(rscratch1, L_failed); 2182 2183 // It is safe to examine both src.length and dst.length. 2184 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2185 r18, L_failed); 2186 2187 const Register rscratch2_dst_klass = rscratch2; 2188 __ load_klass(rscratch2_dst_klass, dst); // reload 2189 2190 // Marshal the base address arguments now, freeing registers. 
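      //
      // In pointer-arithmetic terms the lea/add pairs below compute (sketch):
      //   from = (address)src + ((size_t)src_pos << LogBytesPerHeapOop)
      //                       + arrayOopDesc::base_offset_in_bytes(T_OBJECT);
      //   to   = (address)dst + ((size_t)dst_pos << LogBytesPerHeapOop)
      //                       + arrayOopDesc::base_offset_in_bytes(T_OBJECT);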
2191 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2192 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2193 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2194 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2195 __ movw(count, length); // length (reloaded) 2196 Register sco_temp = c_rarg3; // this register is free now 2197 assert_different_registers(from, to, count, sco_temp, 2198 rscratch2_dst_klass, scratch_src_klass); 2199 // assert_clean_int(count, sco_temp); 2200 2201 // Generate the type check. 2202 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2203 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2204 // assert_clean_int(sco_temp, r18); 2205 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2206 2207 // Fetch destination element klass from the ObjArrayKlass header. 2208 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2209 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2210 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2211 2212 // the checkcast_copy loop needs two extra arguments: 2213 assert(c_rarg3 == sco_temp, "#3 already in place"); 2214 // Set up arguments for checkcast_copy_entry. 2215 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2216 __ b(RuntimeAddress(checkcast_copy_entry)); 2217 } 2218 2219 __ BIND(L_failed); 2220 __ mov(r0, -1); 2221 __ leave(); // required for proper stackwalking of RuntimeStub frame 2222 __ ret(lr); 2223 2224 return start; 2225 } 2226 2227 // 2228 // Generate stub for array fill. If "aligned" is true, the 2229 // "to" address is assumed to be heapword aligned. 2230 // 2231 // Arguments for generated stub: 2232 // to: c_rarg0 2233 // value: c_rarg1 2234 // count: c_rarg2 treated as signed 2235 // 2236 address generate_fill(BasicType t, bool aligned, const char *name) { 2237 __ align(CodeEntryAlignment); 2238 StubCodeMark mark(this, "StubRoutines", name); 2239 address start = __ pc(); 2240 2241 BLOCK_COMMENT("Entry:"); 2242 2243 const Register to = c_rarg0; // source array address 2244 const Register value = c_rarg1; // value 2245 const Register count = c_rarg2; // elements count 2246 2247 const Register bz_base = r10; // base for block_zero routine 2248 const Register cnt_words = r11; // temp register 2249 2250 __ enter(); 2251 2252 Label L_fill_elements, L_exit1; 2253 2254 int shift = -1; 2255 switch (t) { 2256 case T_BYTE: 2257 shift = 0; 2258 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2259 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2260 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2261 __ br(Assembler::LO, L_fill_elements); 2262 break; 2263 case T_SHORT: 2264 shift = 1; 2265 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2266 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2267 __ br(Assembler::LO, L_fill_elements); 2268 break; 2269 case T_INT: 2270 shift = 2; 2271 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2272 __ br(Assembler::LO, L_fill_elements); 2273 break; 2274 default: ShouldNotReachHere(); 2275 } 2276 2277 // Align source address at 8 bytes address boundary. 2278 Label L_skip_align1, L_skip_align2, L_skip_align4; 2279 if (!aligned) { 2280 switch (t) { 2281 case T_BYTE: 2282 // One byte misalignment happens only for byte arrays. 
2283 __ tbz(to, 0, L_skip_align1); 2284 __ strb(value, Address(__ post(to, 1))); 2285 __ subw(count, count, 1); 2286 __ bind(L_skip_align1); 2287 // Fallthrough 2288 case T_SHORT: 2289 // Two bytes misalignment happens only for byte and short (char) arrays. 2290 __ tbz(to, 1, L_skip_align2); 2291 __ strh(value, Address(__ post(to, 2))); 2292 __ subw(count, count, 2 >> shift); 2293 __ bind(L_skip_align2); 2294 // Fallthrough 2295 case T_INT: 2296 // Align to 8 bytes, we know we are 4 byte aligned to start. 2297 __ tbz(to, 2, L_skip_align4); 2298 __ strw(value, Address(__ post(to, 4))); 2299 __ subw(count, count, 4 >> shift); 2300 __ bind(L_skip_align4); 2301 break; 2302 default: ShouldNotReachHere(); 2303 } 2304 } 2305 2306 // 2307 // Fill large chunks 2308 // 2309 __ lsrw(cnt_words, count, 3 - shift); // number of words 2310 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2311 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2312 if (UseBlockZeroing) { 2313 Label non_block_zeroing, rest; 2314 // If the fill value is zero we can use the fast zero_words(). 2315 __ cbnz(value, non_block_zeroing); 2316 __ mov(bz_base, to); 2317 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2318 __ zero_words(bz_base, cnt_words); 2319 __ b(rest); 2320 __ bind(non_block_zeroing); 2321 __ fill_words(to, cnt_words, value); 2322 __ bind(rest); 2323 } else { 2324 __ fill_words(to, cnt_words, value); 2325 } 2326 2327 // Remaining count is less than 8 bytes. Fill it by a single store. 2328 // Note that the total length is no less than 8 bytes. 2329 if (t == T_BYTE || t == T_SHORT) { 2330 Label L_exit1; 2331 __ cbzw(count, L_exit1); 2332 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2333 __ str(value, Address(to, -8)); // overwrite some elements 2334 __ bind(L_exit1); 2335 __ leave(); 2336 __ ret(lr); 2337 } 2338 2339 // Handle copies less than 8 bytes. 
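    //
    // Illustrative C sketch of this fill-by-element tail: the set bits of the
    // element count select which stores are needed (byte case shown; 'value'
    // has already been replicated, so the wider stores write several elements):
    //
    //   if (count & 1) { *(jbyte*) p = (jbyte) value; p += 1; }
    //   if (count & 2) { *(jshort*)p = (jshort)value; p += 2; }
    //   if (count & 4) { *(jint*)  p = (jint)  value;         }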
2340 Label L_fill_2, L_fill_4, L_exit2; 2341 __ bind(L_fill_elements); 2342 switch (t) { 2343 case T_BYTE: 2344 __ tbz(count, 0, L_fill_2); 2345 __ strb(value, Address(__ post(to, 1))); 2346 __ bind(L_fill_2); 2347 __ tbz(count, 1, L_fill_4); 2348 __ strh(value, Address(__ post(to, 2))); 2349 __ bind(L_fill_4); 2350 __ tbz(count, 2, L_exit2); 2351 __ strw(value, Address(to)); 2352 break; 2353 case T_SHORT: 2354 __ tbz(count, 0, L_fill_4); 2355 __ strh(value, Address(__ post(to, 2))); 2356 __ bind(L_fill_4); 2357 __ tbz(count, 1, L_exit2); 2358 __ strw(value, Address(to)); 2359 break; 2360 case T_INT: 2361 __ cbzw(count, L_exit2); 2362 __ strw(value, Address(to)); 2363 break; 2364 default: ShouldNotReachHere(); 2365 } 2366 __ bind(L_exit2); 2367 __ leave(); 2368 __ ret(lr); 2369 return start; 2370 } 2371 2372 void generate_arraycopy_stubs() { 2373 address entry; 2374 address entry_jbyte_arraycopy; 2375 address entry_jshort_arraycopy; 2376 address entry_jint_arraycopy; 2377 address entry_oop_arraycopy; 2378 address entry_jlong_arraycopy; 2379 address entry_checkcast_arraycopy; 2380 2381 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2382 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2383 2384 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2385 2386 //*** jbyte 2387 // Always need aligned and unaligned versions 2388 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2389 "jbyte_disjoint_arraycopy"); 2390 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2391 &entry_jbyte_arraycopy, 2392 "jbyte_arraycopy"); 2393 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2394 "arrayof_jbyte_disjoint_arraycopy"); 2395 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2396 "arrayof_jbyte_arraycopy"); 2397 2398 //*** jshort 2399 // Always need aligned and unaligned versions 2400 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2401 "jshort_disjoint_arraycopy"); 2402 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2403 &entry_jshort_arraycopy, 2404 "jshort_arraycopy"); 2405 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2406 "arrayof_jshort_disjoint_arraycopy"); 2407 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2408 "arrayof_jshort_arraycopy"); 2409 2410 //*** jint 2411 // Aligned versions 2412 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2413 "arrayof_jint_disjoint_arraycopy"); 2414 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2415 "arrayof_jint_arraycopy"); 2416 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2417 // entry_jint_arraycopy always points to the unaligned version 2418 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2419 "jint_disjoint_arraycopy"); 2420 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2421 &entry_jint_arraycopy, 2422 "jint_arraycopy"); 2423 2424 //*** jlong 2425 // It is always aligned 2426 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2427 "arrayof_jlong_disjoint_arraycopy"); 2428 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2429 "arrayof_jlong_arraycopy"); 2430 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2431 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2432 2433 //*** oops 2434 { 2435 // With compressed oops we need unaligned versions; notice that 2436 // we overwrite entry_oop_arraycopy. 2437 bool aligned = !UseCompressedOops; 2438 2439 StubRoutines::_arrayof_oop_disjoint_arraycopy 2440 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2441 /*dest_uninitialized*/false); 2442 StubRoutines::_arrayof_oop_arraycopy 2443 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2444 /*dest_uninitialized*/false); 2445 // Aligned versions without pre-barriers 2446 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2447 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2448 /*dest_uninitialized*/true); 2449 StubRoutines::_arrayof_oop_arraycopy_uninit 2450 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2451 /*dest_uninitialized*/true); 2452 } 2453 2454 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2455 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2456 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2457 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2458 2459 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2460 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2461 /*dest_uninitialized*/true); 2462 2463 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2464 entry_jbyte_arraycopy, 2465 entry_jshort_arraycopy, 2466 entry_jint_arraycopy, 2467 entry_jlong_arraycopy); 2468 2469 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2470 entry_jbyte_arraycopy, 2471 entry_jshort_arraycopy, 2472 entry_jint_arraycopy, 2473 entry_oop_arraycopy, 2474 entry_jlong_arraycopy, 2475 entry_checkcast_arraycopy); 2476 2477 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2478 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2479 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2480 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2481 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2482 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2483 } 2484 2485 void generate_math_stubs() { Unimplemented(); } 2486 2487 // Arguments: 2488 // 2489 // Inputs: 2490 // c_rarg0 - source byte array address 2491 // c_rarg1 - destination 
byte array address 2492 // c_rarg2 - K (key) in little endian int array 2493 // 2494 address generate_aescrypt_encryptBlock() { 2495 __ align(CodeEntryAlignment); 2496 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2497 2498 Label L_doLast; 2499 2500 const Register from = c_rarg0; // source array address 2501 const Register to = c_rarg1; // destination array address 2502 const Register key = c_rarg2; // key array address 2503 const Register keylen = rscratch1; 2504 2505 address start = __ pc(); 2506 __ enter(); 2507 2508 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2509 2510 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2511 2512 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2513 __ rev32(v1, __ T16B, v1); 2514 __ rev32(v2, __ T16B, v2); 2515 __ rev32(v3, __ T16B, v3); 2516 __ rev32(v4, __ T16B, v4); 2517 __ aese(v0, v1); 2518 __ aesmc(v0, v0); 2519 __ aese(v0, v2); 2520 __ aesmc(v0, v0); 2521 __ aese(v0, v3); 2522 __ aesmc(v0, v0); 2523 __ aese(v0, v4); 2524 __ aesmc(v0, v0); 2525 2526 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2527 __ rev32(v1, __ T16B, v1); 2528 __ rev32(v2, __ T16B, v2); 2529 __ rev32(v3, __ T16B, v3); 2530 __ rev32(v4, __ T16B, v4); 2531 __ aese(v0, v1); 2532 __ aesmc(v0, v0); 2533 __ aese(v0, v2); 2534 __ aesmc(v0, v0); 2535 __ aese(v0, v3); 2536 __ aesmc(v0, v0); 2537 __ aese(v0, v4); 2538 __ aesmc(v0, v0); 2539 2540 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2541 __ rev32(v1, __ T16B, v1); 2542 __ rev32(v2, __ T16B, v2); 2543 2544 __ cmpw(keylen, 44); 2545 __ br(Assembler::EQ, L_doLast); 2546 2547 __ aese(v0, v1); 2548 __ aesmc(v0, v0); 2549 __ aese(v0, v2); 2550 __ aesmc(v0, v0); 2551 2552 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2553 __ rev32(v1, __ T16B, v1); 2554 __ rev32(v2, __ T16B, v2); 2555 2556 __ cmpw(keylen, 52); 2557 __ br(Assembler::EQ, L_doLast); 2558 2559 __ aese(v0, v1); 2560 __ aesmc(v0, v0); 2561 __ aese(v0, v2); 2562 __ aesmc(v0, v0); 2563 2564 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2565 __ rev32(v1, __ T16B, v1); 2566 __ rev32(v2, __ T16B, v2); 2567 2568 __ BIND(L_doLast); 2569 2570 __ aese(v0, v1); 2571 __ aesmc(v0, v0); 2572 __ aese(v0, v2); 2573 2574 __ ld1(v1, __ T16B, key); 2575 __ rev32(v1, __ T16B, v1); 2576 __ eor(v0, __ T16B, v0, v1); 2577 2578 __ st1(v0, __ T16B, to); 2579 2580 __ mov(r0, 0); 2581 2582 __ leave(); 2583 __ ret(lr); 2584 2585 return start; 2586 } 2587 2588 // Arguments: 2589 // 2590 // Inputs: 2591 // c_rarg0 - source byte array address 2592 // c_rarg1 - destination byte array address 2593 // c_rarg2 - K (key) in little endian int array 2594 // 2595 address generate_aescrypt_decryptBlock() { 2596 assert(UseAES, "need AES instructions and misaligned SSE support"); 2597 __ align(CodeEntryAlignment); 2598 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2599 Label L_doLast; 2600 2601 const Register from = c_rarg0; // source array address 2602 const Register to = c_rarg1; // destination array address 2603 const Register key = c_rarg2; // key array address 2604 const Register keylen = rscratch1; 2605 2606 address start = __ pc(); 2607 __ enter(); // required for proper stackwalking of RuntimeStub frame 2608 2609 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2610 2611 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2612 2613 __ ld1(v5, __ T16B, __ post(key, 16)); 2614 __ rev32(v5, __ T16B, v5); 2615 2616 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2617 __ rev32(v1, __ T16B, v1); 2618 __ rev32(v2, __ T16B, v2); 2619 __ rev32(v3, __ T16B, v3); 2620 __ rev32(v4, __ T16B, v4); 2621 __ aesd(v0, v1); 2622 __ aesimc(v0, v0); 2623 __ aesd(v0, v2); 2624 __ aesimc(v0, v0); 2625 __ aesd(v0, v3); 2626 __ aesimc(v0, v0); 2627 __ aesd(v0, v4); 2628 __ aesimc(v0, v0); 2629 2630 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2631 __ rev32(v1, __ T16B, v1); 2632 __ rev32(v2, __ T16B, v2); 2633 __ rev32(v3, __ T16B, v3); 2634 __ rev32(v4, __ T16B, v4); 2635 __ aesd(v0, v1); 2636 __ aesimc(v0, v0); 2637 __ aesd(v0, v2); 2638 __ aesimc(v0, v0); 2639 __ aesd(v0, v3); 2640 __ aesimc(v0, v0); 2641 __ aesd(v0, v4); 2642 __ aesimc(v0, v0); 2643 2644 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2645 __ rev32(v1, __ T16B, v1); 2646 __ rev32(v2, __ T16B, v2); 2647 2648 __ cmpw(keylen, 44); 2649 __ br(Assembler::EQ, L_doLast); 2650 2651 __ aesd(v0, v1); 2652 __ aesimc(v0, v0); 2653 __ aesd(v0, v2); 2654 __ aesimc(v0, v0); 2655 2656 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2657 __ rev32(v1, __ T16B, v1); 2658 __ rev32(v2, __ T16B, v2); 2659 2660 __ cmpw(keylen, 52); 2661 __ br(Assembler::EQ, L_doLast); 2662 2663 __ aesd(v0, v1); 2664 __ aesimc(v0, v0); 2665 __ aesd(v0, v2); 2666 __ aesimc(v0, v0); 2667 2668 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2669 __ rev32(v1, __ T16B, v1); 2670 __ rev32(v2, __ T16B, v2); 2671 2672 __ BIND(L_doLast); 2673 2674 __ aesd(v0, v1); 2675 __ aesimc(v0, v0); 2676 __ aesd(v0, v2); 2677 2678 __ eor(v0, __ T16B, v0, v5); 2679 2680 __ st1(v0, __ T16B, to); 2681 2682 __ mov(r0, 0); 2683 2684 __ leave(); 2685 __ ret(lr); 2686 2687 return start; 2688 } 2689 2690 // Arguments: 2691 // 2692 // Inputs: 2693 // c_rarg0 - source byte array address 2694 // c_rarg1 - destination byte array address 2695 // c_rarg2 - K (key) in little endian int array 2696 // c_rarg3 - r vector byte array address 2697 // c_rarg4 - input length 2698 // 2699 // Output: 2700 // x0 - input length 2701 // 2702 address generate_cipherBlockChaining_encryptAESCrypt() { 2703 assert(UseAES, "need AES instructions and misaligned SSE support"); 2704 __ align(CodeEntryAlignment); 2705 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2706 2707 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2708 2709 const Register from = c_rarg0; // source array address 2710 const Register to = c_rarg1; // destination array address 2711 const Register key = c_rarg2; // key array address 2712 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2713 // and left with the results of the last encryption block 2714 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2715 const Register keylen = rscratch1; 2716 2717 address start = __ pc(); 2718 2719 __ enter(); 2720 2721 __ movw(rscratch2, len_reg); 2722 2723 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2724 2725 __ ld1(v0, __ T16B, rvec); 2726 2727 __ cmpw(keylen, 52); 2728 __ br(Assembler::CC, L_loadkeys_44); 2729 __ br(Assembler::EQ, L_loadkeys_52); 2730 2731 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2732 __ rev32(v17, __ T16B, v17); 2733 __ rev32(v18, __ T16B, v18); 2734 __ BIND(L_loadkeys_52); 2735 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2736 __ rev32(v19, __ T16B, v19); 2737 __ rev32(v20, __ T16B, v20); 2738 __ BIND(L_loadkeys_44); 2739 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2740 __ rev32(v21, __ 
T16B, v21); 2741 __ rev32(v22, __ T16B, v22); 2742 __ rev32(v23, __ T16B, v23); 2743 __ rev32(v24, __ T16B, v24); 2744 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2745 __ rev32(v25, __ T16B, v25); 2746 __ rev32(v26, __ T16B, v26); 2747 __ rev32(v27, __ T16B, v27); 2748 __ rev32(v28, __ T16B, v28); 2749 __ ld1(v29, v30, v31, __ T16B, key); 2750 __ rev32(v29, __ T16B, v29); 2751 __ rev32(v30, __ T16B, v30); 2752 __ rev32(v31, __ T16B, v31); 2753 2754 __ BIND(L_aes_loop); 2755 __ ld1(v1, __ T16B, __ post(from, 16)); 2756 __ eor(v0, __ T16B, v0, v1); 2757 2758 __ br(Assembler::CC, L_rounds_44); 2759 __ br(Assembler::EQ, L_rounds_52); 2760 2761 __ aese(v0, v17); __ aesmc(v0, v0); 2762 __ aese(v0, v18); __ aesmc(v0, v0); 2763 __ BIND(L_rounds_52); 2764 __ aese(v0, v19); __ aesmc(v0, v0); 2765 __ aese(v0, v20); __ aesmc(v0, v0); 2766 __ BIND(L_rounds_44); 2767 __ aese(v0, v21); __ aesmc(v0, v0); 2768 __ aese(v0, v22); __ aesmc(v0, v0); 2769 __ aese(v0, v23); __ aesmc(v0, v0); 2770 __ aese(v0, v24); __ aesmc(v0, v0); 2771 __ aese(v0, v25); __ aesmc(v0, v0); 2772 __ aese(v0, v26); __ aesmc(v0, v0); 2773 __ aese(v0, v27); __ aesmc(v0, v0); 2774 __ aese(v0, v28); __ aesmc(v0, v0); 2775 __ aese(v0, v29); __ aesmc(v0, v0); 2776 __ aese(v0, v30); 2777 __ eor(v0, __ T16B, v0, v31); 2778 2779 __ st1(v0, __ T16B, __ post(to, 16)); 2780 2781 __ subw(len_reg, len_reg, 16); 2782 __ cbnzw(len_reg, L_aes_loop); 2783 2784 __ st1(v0, __ T16B, rvec); 2785 2786 __ mov(r0, rscratch2); 2787 2788 __ leave(); 2789 __ ret(lr); 2790 2791 return start; 2792 } 2793 2794 // Arguments: 2795 // 2796 // Inputs: 2797 // c_rarg0 - source byte array address 2798 // c_rarg1 - destination byte array address 2799 // c_rarg2 - K (key) in little endian int array 2800 // c_rarg3 - r vector byte array address 2801 // c_rarg4 - input length 2802 // 2803 // Output: 2804 // r0 - input length 2805 // 2806 address generate_cipherBlockChaining_decryptAESCrypt() { 2807 assert(UseAES, "need AES instructions and misaligned SSE support"); 2808 __ align(CodeEntryAlignment); 2809 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2810 2811 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2812 2813 const Register from = c_rarg0; // source array address 2814 const Register to = c_rarg1; // destination array address 2815 const Register key = c_rarg2; // key array address 2816 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2817 // and left with the results of the last encryption block 2818 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2819 const Register keylen = rscratch1; 2820 2821 address start = __ pc(); 2822 2823 __ enter(); 2824 2825 __ movw(rscratch2, len_reg); 2826 2827 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2828 2829 __ ld1(v2, __ T16B, rvec); 2830 2831 __ ld1(v31, __ T16B, __ post(key, 16)); 2832 __ rev32(v31, __ T16B, v31); 2833 2834 __ cmpw(keylen, 52); 2835 __ br(Assembler::CC, L_loadkeys_44); 2836 __ br(Assembler::EQ, L_loadkeys_52); 2837 2838 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2839 __ rev32(v17, __ T16B, v17); 2840 __ rev32(v18, __ T16B, v18); 2841 __ BIND(L_loadkeys_52); 2842 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2843 __ rev32(v19, __ T16B, v19); 2844 __ rev32(v20, __ T16B, v20); 2845 __ BIND(L_loadkeys_44); 2846 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2847 __ rev32(v21, __ T16B, v21); 2848 
__ rev32(v22, __ T16B, v22); 2849 __ rev32(v23, __ T16B, v23); 2850 __ rev32(v24, __ T16B, v24); 2851 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2852 __ rev32(v25, __ T16B, v25); 2853 __ rev32(v26, __ T16B, v26); 2854 __ rev32(v27, __ T16B, v27); 2855 __ rev32(v28, __ T16B, v28); 2856 __ ld1(v29, v30, __ T16B, key); 2857 __ rev32(v29, __ T16B, v29); 2858 __ rev32(v30, __ T16B, v30); 2859 2860 __ BIND(L_aes_loop); 2861 __ ld1(v0, __ T16B, __ post(from, 16)); 2862 __ orr(v1, __ T16B, v0, v0); 2863 2864 __ br(Assembler::CC, L_rounds_44); 2865 __ br(Assembler::EQ, L_rounds_52); 2866 2867 __ aesd(v0, v17); __ aesimc(v0, v0); 2868 __ aesd(v0, v18); __ aesimc(v0, v0); 2869 __ BIND(L_rounds_52); 2870 __ aesd(v0, v19); __ aesimc(v0, v0); 2871 __ aesd(v0, v20); __ aesimc(v0, v0); 2872 __ BIND(L_rounds_44); 2873 __ aesd(v0, v21); __ aesimc(v0, v0); 2874 __ aesd(v0, v22); __ aesimc(v0, v0); 2875 __ aesd(v0, v23); __ aesimc(v0, v0); 2876 __ aesd(v0, v24); __ aesimc(v0, v0); 2877 __ aesd(v0, v25); __ aesimc(v0, v0); 2878 __ aesd(v0, v26); __ aesimc(v0, v0); 2879 __ aesd(v0, v27); __ aesimc(v0, v0); 2880 __ aesd(v0, v28); __ aesimc(v0, v0); 2881 __ aesd(v0, v29); __ aesimc(v0, v0); 2882 __ aesd(v0, v30); 2883 __ eor(v0, __ T16B, v0, v31); 2884 __ eor(v0, __ T16B, v0, v2); 2885 2886 __ st1(v0, __ T16B, __ post(to, 16)); 2887 __ orr(v2, __ T16B, v1, v1); 2888 2889 __ subw(len_reg, len_reg, 16); 2890 __ cbnzw(len_reg, L_aes_loop); 2891 2892 __ st1(v2, __ T16B, rvec); 2893 2894 __ mov(r0, rscratch2); 2895 2896 __ leave(); 2897 __ ret(lr); 2898 2899 return start; 2900 } 2901 2902 // Arguments: 2903 // 2904 // Inputs: 2905 // c_rarg0 - byte[] source+offset 2906 // c_rarg1 - int[] SHA.state 2907 // c_rarg2 - int offset 2908 // c_rarg3 - int limit 2909 // 2910 address generate_sha1_implCompress(bool multi_block, const char *name) { 2911 __ align(CodeEntryAlignment); 2912 StubCodeMark mark(this, "StubRoutines", name); 2913 address start = __ pc(); 2914 2915 Register buf = c_rarg0; 2916 Register state = c_rarg1; 2917 Register ofs = c_rarg2; 2918 Register limit = c_rarg3; 2919 2920 Label keys; 2921 Label sha1_loop; 2922 2923 // load the keys into v0..v3 2924 __ adr(rscratch1, keys); 2925 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2926 // load 5 words state into v6, v7 2927 __ ldrq(v6, Address(state, 0)); 2928 __ ldrs(v7, Address(state, 16)); 2929 2930 2931 __ BIND(sha1_loop); 2932 // load 64 bytes of data into v16..v19 2933 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2934 __ rev32(v16, __ T16B, v16); 2935 __ rev32(v17, __ T16B, v17); 2936 __ rev32(v18, __ T16B, v18); 2937 __ rev32(v19, __ T16B, v19); 2938 2939 // do the sha1 2940 __ addv(v4, __ T4S, v16, v0); 2941 __ orr(v20, __ T16B, v6, v6); 2942 2943 FloatRegister d0 = v16; 2944 FloatRegister d1 = v17; 2945 FloatRegister d2 = v18; 2946 FloatRegister d3 = v19; 2947 2948 for (int round = 0; round < 20; round++) { 2949 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2950 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2951 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2952 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2953 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2954 2955 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2956 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2957 __ sha1h(tmp2, __ T4S, v20); 2958 if (round < 5) 2959 __ sha1c(v20, __ T4S, tmp3, tmp4); 2960 else if (round < 10 || round >= 15) 2961 __ sha1p(v20, __ T4S, tmp3, tmp4); 2962 else 2963 __ sha1m(v20, __ T4S, tmp3, tmp4); 2964 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2965 2966 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2967 } 2968 2969 __ addv(v7, __ T2S, v7, v21); 2970 __ addv(v6, __ T4S, v6, v20); 2971 2972 if (multi_block) { 2973 __ add(ofs, ofs, 64); 2974 __ cmp(ofs, limit); 2975 __ br(Assembler::LE, sha1_loop); 2976 __ mov(c_rarg0, ofs); // return ofs 2977 } 2978 2979 __ strq(v6, Address(state, 0)); 2980 __ strs(v7, Address(state, 16)); 2981 2982 __ ret(lr); 2983 2984 __ bind(keys); 2985 __ emit_int32(0x5a827999); 2986 __ emit_int32(0x6ed9eba1); 2987 __ emit_int32(0x8f1bbcdc); 2988 __ emit_int32(0xca62c1d6); 2989 2990 return start; 2991 } 2992 2993 2994 // Arguments: 2995 // 2996 // Inputs: 2997 // c_rarg0 - byte[] source+offset 2998 // c_rarg1 - int[] SHA.state 2999 // c_rarg2 - int offset 3000 // c_rarg3 - int limit 3001 // 3002 address generate_sha256_implCompress(bool multi_block, const char *name) { 3003 static const uint32_t round_consts[64] = { 3004 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3005 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3006 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3007 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3008 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3009 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3010 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3011 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3012 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3013 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3014 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3015 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3016 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3017 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3018 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3019 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3020 }; 3021 __ align(CodeEntryAlignment); 3022 StubCodeMark mark(this, "StubRoutines", name); 3023 address start = __ pc(); 3024 3025 Register buf = c_rarg0; 3026 Register state = c_rarg1; 3027 Register ofs = c_rarg2; 3028 Register limit = c_rarg3; 3029 3030 Label sha1_loop; 3031 3032 __ stpd(v8, v9, __ pre(sp, -32)); 3033 __ stpd(v10, v11, Address(sp, 16)); 3034 3035 // dga == v0 3036 // dgb == v1 3037 // dg0 == v2 3038 // dg1 == v3 3039 // dg2 == v4 3040 // t0 == v6 3041 // t1 == v7 3042 3043 // load 16 keys to v16..v31 3044 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3045 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3046 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3047 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3048 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3049 3050 // load 8 words (256 bits) state 3051 __ ldpq(v0, v1, state); 3052 3053 __ BIND(sha1_loop); 3054 // load 64 bytes of data into v8..v11 3055 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3056 __ rev32(v8, __ T16B, v8); 3057 __ rev32(v9, __ T16B, v9); 3058 __ rev32(v10, __ T16B, v10); 3059 __ rev32(v11, __ T16B, v11); 3060 3061 __ addv(v6, __ T4S, v8, v16); 3062 __ orr(v2, __ T16B, v0, v0); 3063 __ orr(v3, __ T16B, v1, v1); 3064 3065 FloatRegister d0 = v8; 3066 FloatRegister d1 = v9; 3067 FloatRegister d2 = v10; 3068 FloatRegister d3 = v11; 3069 3070 3071 for (int round = 0; round < 16; round++) { 3072 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3073 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3074 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3075 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3076 3077 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3078 __ orr(v4, __ T16B, v2, v2); 3079 if (round < 15) 3080 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3081 __ sha256h(v2, __ T4S, v3, tmp2); 3082 __ sha256h2(v3, __ T4S, v4, tmp2); 3083 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3084 3085 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3086 } 3087 3088 __ addv(v0, __ T4S, v0, v2); 3089 __ addv(v1, __ T4S, v1, v3); 3090 3091 if (multi_block) { 3092 __ add(ofs, ofs, 64); 3093 __ cmp(ofs, limit); 3094 __ br(Assembler::LE, sha1_loop); 3095 __ mov(c_rarg0, ofs); // return ofs 3096 } 3097 3098 __ ldpd(v10, v11, Address(sp, 16)); 3099 __ ldpd(v8, v9, __ post(sp, 32)); 3100 3101 __ stpq(v0, v1, state); 3102 3103 __ ret(lr); 3104 3105 return start; 3106 } 3107 3108 #ifndef BUILTIN_SIM 3109 // Safefetch stubs. 3110 void generate_safefetch(const char* name, int size, address* entry, 3111 address* fault_pc, address* continuation_pc) { 3112 // safefetch signatures: 3113 // int SafeFetch32(int* adr, int errValue); 3114 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3115 // 3116 // arguments: 3117 // c_rarg0 = adr 3118 // c_rarg1 = errValue 3119 // 3120 // result: 3121 // PPC_RET = *adr or errValue 3122 3123 StubCodeMark mark(this, "StubRoutines", name); 3124 3125 // Entry point, pc or function descriptor. 3126 *entry = __ pc(); 3127 3128 // Load *adr into c_rarg1, may fault. 
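    // Conceptually the stub behaves like the following C (illustrative sketch;
    // it relies on the VM's signal handler resuming execution at
    // continuation_pc when the load at fault_pc faults):
    //
    //   int SafeFetch32(int* adr, int errValue) {
    //     int result = errValue;   // c_rarg1 already holds errValue on entry
    //     result = *adr;           // may fault; on a fault, result keeps errValue
    //     return result;           // continuation: r0 = c_rarg1
    //   }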
3129 *fault_pc = __ pc(); 3130 switch (size) { 3131 case 4: 3132 // int32_t 3133 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3134 break; 3135 case 8: 3136 // int64_t 3137 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3138 break; 3139 default: 3140 ShouldNotReachHere(); 3141 } 3142 3143 // return errValue or *adr 3144 *continuation_pc = __ pc(); 3145 __ mov(r0, c_rarg1); 3146 __ ret(lr); 3147 } 3148 #endif 3149 3150 /** 3151 * Arguments: 3152 * 3153 * Inputs: 3154 * c_rarg0 - int crc 3155 * c_rarg1 - byte* buf 3156 * c_rarg2 - int length 3157 * 3158 * Ouput: 3159 * rax - int crc result 3160 */ 3161 address generate_updateBytesCRC32() { 3162 assert(UseCRC32Intrinsics, "what are we doing here?"); 3163 3164 __ align(CodeEntryAlignment); 3165 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3166 3167 address start = __ pc(); 3168 3169 const Register crc = c_rarg0; // crc 3170 const Register buf = c_rarg1; // source java byte array address 3171 const Register len = c_rarg2; // length 3172 const Register table0 = c_rarg3; // crc_table address 3173 const Register table1 = c_rarg4; 3174 const Register table2 = c_rarg5; 3175 const Register table3 = c_rarg6; 3176 const Register tmp3 = c_rarg7; 3177 3178 BLOCK_COMMENT("Entry:"); 3179 __ enter(); // required for proper stackwalking of RuntimeStub frame 3180 3181 __ kernel_crc32(crc, buf, len, 3182 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3183 3184 __ leave(); // required for proper stackwalking of RuntimeStub frame 3185 __ ret(lr); 3186 3187 return start; 3188 } 3189 3190 /** 3191 * Arguments: 3192 * 3193 * Inputs: 3194 * c_rarg0 - int crc 3195 * c_rarg1 - byte* buf 3196 * c_rarg2 - int length 3197 * c_rarg3 - int* table 3198 * 3199 * Ouput: 3200 * r0 - int crc result 3201 */ 3202 address generate_updateBytesCRC32C() { 3203 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3204 3205 __ align(CodeEntryAlignment); 3206 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3207 3208 address start = __ pc(); 3209 3210 const Register crc = c_rarg0; // crc 3211 const Register buf = c_rarg1; // source java byte array address 3212 const Register len = c_rarg2; // length 3213 const Register table0 = c_rarg3; // crc_table address 3214 const Register table1 = c_rarg4; 3215 const Register table2 = c_rarg5; 3216 const Register table3 = c_rarg6; 3217 const Register tmp3 = c_rarg7; 3218 3219 BLOCK_COMMENT("Entry:"); 3220 __ enter(); // required for proper stackwalking of RuntimeStub frame 3221 3222 __ kernel_crc32c(crc, buf, len, 3223 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3224 3225 __ leave(); // required for proper stackwalking of RuntimeStub frame 3226 __ ret(lr); 3227 3228 return start; 3229 } 3230 3231 /*** 3232 * Arguments: 3233 * 3234 * Inputs: 3235 * c_rarg0 - int adler 3236 * c_rarg1 - byte* buff 3237 * c_rarg2 - int len 3238 * 3239 * Output: 3240 * c_rarg0 - int adler result 3241 */ 3242 address generate_updateBytesAdler32() { 3243 __ align(CodeEntryAlignment); 3244 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3245 address start = __ pc(); 3246 3247 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3248 3249 // Aliases 3250 Register adler = c_rarg0; 3251 Register s1 = c_rarg0; 3252 Register s2 = c_rarg3; 3253 Register buff = c_rarg1; 3254 Register len = c_rarg2; 3255 Register nmax = r4; 3256 Register base = r5; 3257 Register count = r6; 3258 Register temp0 = rscratch1; 3259 Register temp1 = rscratch2; 3260 Register temp2 = r7; 
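    // For reference, Adler-32 over a buffer is defined as (illustrative C):
    //
    //   uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
    //   for (int i = 0; i < len; i++) {
    //     s1 = (s1 + (buff[i] & 0xff)) % 65521;   // 65521 == BASE
    //     s2 = (s2 + s1)               % 65521;
    //   }
    //   return (s2 << 16) | s1;
    //
    // The stub below avoids a division per byte: it accumulates the sums for a
    // bounded run of bytes and only then reduces them mod BASE, using
    // shift/add arithmetic rather than a udiv.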
3261 3262 // Max number of bytes we can process before having to take the mod 3263 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3264 unsigned long BASE = 0xfff1; 3265 unsigned long NMAX = 0x15B0; 3266 3267 __ mov(base, BASE); 3268 __ mov(nmax, NMAX); 3269 3270 // s1 is initialized to the lower 16 bits of adler 3271 // s2 is initialized to the upper 16 bits of adler 3272 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3273 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3274 3275 // The pipelined loop needs at least 16 elements for 1 iteration 3276 // It does check this, but it is more effective to skip to the cleanup loop 3277 __ cmp(len, (u1)16); 3278 __ br(Assembler::HS, L_nmax); 3279 __ cbz(len, L_combine); 3280 3281 __ bind(L_simple_by1_loop); 3282 __ ldrb(temp0, Address(__ post(buff, 1))); 3283 __ add(s1, s1, temp0); 3284 __ add(s2, s2, s1); 3285 __ subs(len, len, 1); 3286 __ br(Assembler::HI, L_simple_by1_loop); 3287 3288 // s1 = s1 % BASE 3289 __ subs(temp0, s1, base); 3290 __ csel(s1, temp0, s1, Assembler::HS); 3291 3292 // s2 = s2 % BASE 3293 __ lsr(temp0, s2, 16); 3294 __ lsl(temp1, temp0, 4); 3295 __ sub(temp1, temp1, temp0); 3296 __ add(s2, temp1, s2, ext::uxth); 3297 3298 __ subs(temp0, s2, base); 3299 __ csel(s2, temp0, s2, Assembler::HS); 3300 3301 __ b(L_combine); 3302 3303 __ bind(L_nmax); 3304 __ subs(len, len, nmax); 3305 __ sub(count, nmax, 16); 3306 __ br(Assembler::LO, L_by16); 3307 3308 __ bind(L_nmax_loop); 3309 3310 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3311 3312 __ add(s1, s1, temp0, ext::uxtb); 3313 __ ubfx(temp2, temp0, 8, 8); 3314 __ add(s2, s2, s1); 3315 __ add(s1, s1, temp2); 3316 __ ubfx(temp2, temp0, 16, 8); 3317 __ add(s2, s2, s1); 3318 __ add(s1, s1, temp2); 3319 __ ubfx(temp2, temp0, 24, 8); 3320 __ add(s2, s2, s1); 3321 __ add(s1, s1, temp2); 3322 __ ubfx(temp2, temp0, 32, 8); 3323 __ add(s2, s2, s1); 3324 __ add(s1, s1, temp2); 3325 __ ubfx(temp2, temp0, 40, 8); 3326 __ add(s2, s2, s1); 3327 __ add(s1, s1, temp2); 3328 __ ubfx(temp2, temp0, 48, 8); 3329 __ add(s2, s2, s1); 3330 __ add(s1, s1, temp2); 3331 __ add(s2, s2, s1); 3332 __ add(s1, s1, temp0, Assembler::LSR, 56); 3333 __ add(s2, s2, s1); 3334 3335 __ add(s1, s1, temp1, ext::uxtb); 3336 __ ubfx(temp2, temp1, 8, 8); 3337 __ add(s2, s2, s1); 3338 __ add(s1, s1, temp2); 3339 __ ubfx(temp2, temp1, 16, 8); 3340 __ add(s2, s2, s1); 3341 __ add(s1, s1, temp2); 3342 __ ubfx(temp2, temp1, 24, 8); 3343 __ add(s2, s2, s1); 3344 __ add(s1, s1, temp2); 3345 __ ubfx(temp2, temp1, 32, 8); 3346 __ add(s2, s2, s1); 3347 __ add(s1, s1, temp2); 3348 __ ubfx(temp2, temp1, 40, 8); 3349 __ add(s2, s2, s1); 3350 __ add(s1, s1, temp2); 3351 __ ubfx(temp2, temp1, 48, 8); 3352 __ add(s2, s2, s1); 3353 __ add(s1, s1, temp2); 3354 __ add(s2, s2, s1); 3355 __ add(s1, s1, temp1, Assembler::LSR, 56); 3356 __ add(s2, s2, s1); 3357 3358 __ subs(count, count, 16); 3359 __ br(Assembler::HS, L_nmax_loop); 3360 3361 // s1 = s1 % BASE 3362 __ lsr(temp0, s1, 16); 3363 __ lsl(temp1, temp0, 4); 3364 __ sub(temp1, temp1, temp0); 3365 __ add(temp1, temp1, s1, ext::uxth); 3366 3367 __ lsr(temp0, temp1, 16); 3368 __ lsl(s1, temp0, 4); 3369 __ sub(s1, s1, temp0); 3370 __ add(s1, s1, temp1, ext:: uxth); 3371 3372 __ subs(temp0, s1, base); 3373 __ csel(s1, temp0, s1, Assembler::HS); 3374 3375 // s2 = s2 % BASE 3376 __ lsr(temp0, s2, 16); 3377 __ lsl(temp1, temp0, 4); 3378 __ sub(temp1, temp1, temp0); 3379 __ add(temp1, temp1, s2, ext::uxth); 3380 3381 __ lsr(temp0, temp1, 
16); 3382 __ lsl(s2, temp0, 4); 3383 __ sub(s2, s2, temp0); 3384 __ add(s2, s2, temp1, ext:: uxth); 3385 3386 __ subs(temp0, s2, base); 3387 __ csel(s2, temp0, s2, Assembler::HS); 3388 3389 __ subs(len, len, nmax); 3390 __ sub(count, nmax, 16); 3391 __ br(Assembler::HS, L_nmax_loop); 3392 3393 __ bind(L_by16); 3394 __ adds(len, len, count); 3395 __ br(Assembler::LO, L_by1); 3396 3397 __ bind(L_by16_loop); 3398 3399 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3400 3401 __ add(s1, s1, temp0, ext::uxtb); 3402 __ ubfx(temp2, temp0, 8, 8); 3403 __ add(s2, s2, s1); 3404 __ add(s1, s1, temp2); 3405 __ ubfx(temp2, temp0, 16, 8); 3406 __ add(s2, s2, s1); 3407 __ add(s1, s1, temp2); 3408 __ ubfx(temp2, temp0, 24, 8); 3409 __ add(s2, s2, s1); 3410 __ add(s1, s1, temp2); 3411 __ ubfx(temp2, temp0, 32, 8); 3412 __ add(s2, s2, s1); 3413 __ add(s1, s1, temp2); 3414 __ ubfx(temp2, temp0, 40, 8); 3415 __ add(s2, s2, s1); 3416 __ add(s1, s1, temp2); 3417 __ ubfx(temp2, temp0, 48, 8); 3418 __ add(s2, s2, s1); 3419 __ add(s1, s1, temp2); 3420 __ add(s2, s2, s1); 3421 __ add(s1, s1, temp0, Assembler::LSR, 56); 3422 __ add(s2, s2, s1); 3423 3424 __ add(s1, s1, temp1, ext::uxtb); 3425 __ ubfx(temp2, temp1, 8, 8); 3426 __ add(s2, s2, s1); 3427 __ add(s1, s1, temp2); 3428 __ ubfx(temp2, temp1, 16, 8); 3429 __ add(s2, s2, s1); 3430 __ add(s1, s1, temp2); 3431 __ ubfx(temp2, temp1, 24, 8); 3432 __ add(s2, s2, s1); 3433 __ add(s1, s1, temp2); 3434 __ ubfx(temp2, temp1, 32, 8); 3435 __ add(s2, s2, s1); 3436 __ add(s1, s1, temp2); 3437 __ ubfx(temp2, temp1, 40, 8); 3438 __ add(s2, s2, s1); 3439 __ add(s1, s1, temp2); 3440 __ ubfx(temp2, temp1, 48, 8); 3441 __ add(s2, s2, s1); 3442 __ add(s1, s1, temp2); 3443 __ add(s2, s2, s1); 3444 __ add(s1, s1, temp1, Assembler::LSR, 56); 3445 __ add(s2, s2, s1); 3446 3447 __ subs(len, len, 16); 3448 __ br(Assembler::HS, L_by16_loop); 3449 3450 __ bind(L_by1); 3451 __ adds(len, len, 15); 3452 __ br(Assembler::LO, L_do_mod); 3453 3454 __ bind(L_by1_loop); 3455 __ ldrb(temp0, Address(__ post(buff, 1))); 3456 __ add(s1, temp0, s1); 3457 __ add(s2, s2, s1); 3458 __ subs(len, len, 1); 3459 __ br(Assembler::HS, L_by1_loop); 3460 3461 __ bind(L_do_mod); 3462 // s1 = s1 % BASE 3463 __ lsr(temp0, s1, 16); 3464 __ lsl(temp1, temp0, 4); 3465 __ sub(temp1, temp1, temp0); 3466 __ add(temp1, temp1, s1, ext::uxth); 3467 3468 __ lsr(temp0, temp1, 16); 3469 __ lsl(s1, temp0, 4); 3470 __ sub(s1, s1, temp0); 3471 __ add(s1, s1, temp1, ext:: uxth); 3472 3473 __ subs(temp0, s1, base); 3474 __ csel(s1, temp0, s1, Assembler::HS); 3475 3476 // s2 = s2 % BASE 3477 __ lsr(temp0, s2, 16); 3478 __ lsl(temp1, temp0, 4); 3479 __ sub(temp1, temp1, temp0); 3480 __ add(temp1, temp1, s2, ext::uxth); 3481 3482 __ lsr(temp0, temp1, 16); 3483 __ lsl(s2, temp0, 4); 3484 __ sub(s2, s2, temp0); 3485 __ add(s2, s2, temp1, ext:: uxth); 3486 3487 __ subs(temp0, s2, base); 3488 __ csel(s2, temp0, s2, Assembler::HS); 3489 3490 // Combine lower bits and higher bits 3491 __ bind(L_combine); 3492 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3493 3494 __ ret(lr); 3495 3496 return start; 3497 } 3498 3499 /** 3500 * Arguments: 3501 * 3502 * Input: 3503 * c_rarg0 - x address 3504 * c_rarg1 - x length 3505 * c_rarg2 - y address 3506 * c_rarg3 - y lenth 3507 * c_rarg4 - z address 3508 * c_rarg5 - z length 3509 */ 3510 address generate_multiplyToLen() { 3511 __ align(CodeEntryAlignment); 3512 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3513 3514 address start = __ pc(); 3515 const Register x = 
r0; 3516 const Register xlen = r1; 3517 const Register y = r2; 3518 const Register ylen = r3; 3519 const Register z = r4; 3520 const Register zlen = r5; 3521 3522 const Register tmp1 = r10; 3523 const Register tmp2 = r11; 3524 const Register tmp3 = r12; 3525 const Register tmp4 = r13; 3526 const Register tmp5 = r14; 3527 const Register tmp6 = r15; 3528 const Register tmp7 = r16; 3529 3530 BLOCK_COMMENT("Entry:"); 3531 __ enter(); // required for proper stackwalking of RuntimeStub frame 3532 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3533 __ leave(); // required for proper stackwalking of RuntimeStub frame 3534 __ ret(lr); 3535 3536 return start; 3537 } 3538 3539 address generate_squareToLen() { 3540 // squareToLen algorithm for sizes 1..127 described in java code works 3541 // faster than multiply_to_len on some CPUs and slower on others, but 3542 // multiply_to_len shows a bit better overall results 3543 __ align(CodeEntryAlignment); 3544 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3545 address start = __ pc(); 3546 3547 const Register x = r0; 3548 const Register xlen = r1; 3549 const Register z = r2; 3550 const Register zlen = r3; 3551 const Register y = r4; // == x 3552 const Register ylen = r5; // == xlen 3553 3554 const Register tmp1 = r10; 3555 const Register tmp2 = r11; 3556 const Register tmp3 = r12; 3557 const Register tmp4 = r13; 3558 const Register tmp5 = r14; 3559 const Register tmp6 = r15; 3560 const Register tmp7 = r16; 3561 3562 RegSet spilled_regs = RegSet::of(y, ylen); 3563 BLOCK_COMMENT("Entry:"); 3564 __ enter(); 3565 __ push(spilled_regs, sp); 3566 __ mov(y, x); 3567 __ mov(ylen, xlen); 3568 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3569 __ pop(spilled_regs, sp); 3570 __ leave(); 3571 __ ret(lr); 3572 return start; 3573 } 3574 3575 address generate_mulAdd() { 3576 __ align(CodeEntryAlignment); 3577 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3578 3579 address start = __ pc(); 3580 3581 const Register out = r0; 3582 const Register in = r1; 3583 const Register offset = r2; 3584 const Register len = r3; 3585 const Register k = r4; 3586 3587 BLOCK_COMMENT("Entry:"); 3588 __ enter(); 3589 __ mul_add(out, in, offset, len, k); 3590 __ leave(); 3591 __ ret(lr); 3592 3593 return start; 3594 } 3595 3596 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3597 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3598 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3599 // Karatsuba multiplication performs a 128*128 -> 256-bit 3600 // multiplication in three 128-bit multiplications and a few 3601 // additions. 
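  // In other words, as a rough sketch (illustration only; clmul() is a
  // hypothetical helper standing in for the carry-less PMULL product, and
  // "^" is XOR):
  //   C = clmul(A1, B1);  D = clmul(A0, B0);  E = clmul(A1 ^ A0, B1 ^ B0);
  //   A * B = (C << 128) ^ ((C ^ D ^ E) << 64) ^ D
  // which, split into 64-bit halves, gives the identity below.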
3602 // 3603 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3604 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3605 // 3606 // Inputs: 3607 // 3608 // A0 in a.d[0] (subkey) 3609 // A1 in a.d[1] 3610 // (A1+A0) in a1_xor_a0.d[0] 3611 // 3612 // B0 in b.d[0] (state) 3613 // B1 in b.d[1] 3614 3615 __ ext(tmp1, __ T16B, b, b, 0x08); 3616 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3617 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3618 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3619 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3620 3621 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3622 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3623 __ eor(tmp2, __ T16B, tmp2, tmp4); 3624 __ eor(tmp2, __ T16B, tmp2, tmp3); 3625 3626 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3627 __ ins(result_hi, __ D, tmp2, 0, 1); 3628 __ ins(result_lo, __ D, tmp2, 1, 0); 3629 } 3630 3631 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3632 FloatRegister p, FloatRegister z, FloatRegister t1) { 3633 const FloatRegister t0 = result; 3634 3635 // The GCM field polynomial f is z^128 + p(z), where p = 3636 // z^7+z^2+z+1. 3637 // 3638 // z^128 === -p(z) (mod (z^128 + p(z))) 3639 // 3640 // so, given that the product we're reducing is 3641 // a == lo + hi * z^128 3642 // substituting, 3643 // === lo - hi * p(z) (mod (z^128 + p(z))) 3644 // 3645 // we reduce by multiplying hi by p(z) and subtracting the result 3646 // from (i.e. XORing it with) lo. Because p has no nonzero high 3647 // bits we can do this with two 64-bit multiplications, lo*p and 3648 // hi*p. 3649 3650 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3651 __ ext(t1, __ T16B, t0, z, 8); 3652 __ eor(hi, __ T16B, hi, t1); 3653 __ ext(t1, __ T16B, z, t0, 8); 3654 __ eor(lo, __ T16B, lo, t1); 3655 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3656 __ eor(result, __ T16B, lo, t0); 3657 } 3658 3659 address generate_has_negatives(address &has_negatives_long) { 3660 const u1 large_loop_size = 64; 3661 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3662 int dcache_line = VM_Version::dcache_line_size(); 3663 3664 Register ary1 = r1, len = r2, result = r0; 3665 3666 __ align(CodeEntryAlignment); 3667 3668 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3669 3670 address entry = __ pc(); 3671 3672 __ enter(); 3673 3674 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3675 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3676 3677 __ cmp(len, (u1)15); 3678 __ br(Assembler::GT, LEN_OVER_15); 3679 // The only case when execution falls into this code is when pointer is near 3680 // the end of memory page and we have to avoid reading next page 3681 __ add(ary1, ary1, len); 3682 __ subs(len, len, 8); 3683 __ br(Assembler::GT, LEN_OVER_8); 3684 __ ldr(rscratch2, Address(ary1, -8)); 3685 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
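    // At this point ary1 points just past the range and len = length - 8 <= 0
    // (length being the original value of len). The load above plus the shift
    // below amount to this rough C sketch (illustration only):
    //   uint64_t w = *(const uint64_t *)(ary1 - 8);   // last 8 bytes, ending at the range end
    //   w >>= (8 - length) * 8;                       // drop the bytes that precede the range
    //   return (w & 0x8080808080808080) != 0;         // any byte with its sign bit set?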
3686 __ lsrv(rscratch2, rscratch2, rscratch1); 3687 __ tst(rscratch2, UPPER_BIT_MASK); 3688 __ cset(result, Assembler::NE); 3689 __ leave(); 3690 __ ret(lr); 3691 __ bind(LEN_OVER_8); 3692 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3693 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3694 __ tst(rscratch2, UPPER_BIT_MASK); 3695 __ br(Assembler::NE, RET_TRUE_NO_POP); 3696 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3697 __ lsrv(rscratch1, rscratch1, rscratch2); 3698 __ tst(rscratch1, UPPER_BIT_MASK); 3699 __ cset(result, Assembler::NE); 3700 __ leave(); 3701 __ ret(lr); 3702 3703 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3704 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3705 3706 has_negatives_long = __ pc(); // 2nd entry point 3707 3708 __ enter(); 3709 3710 __ bind(LEN_OVER_15); 3711 __ push(spilled_regs, sp); 3712 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3713 __ cbz(rscratch2, ALIGNED); 3714 __ ldp(tmp6, tmp1, Address(ary1)); 3715 __ mov(tmp5, 16); 3716 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3717 __ add(ary1, ary1, rscratch1); 3718 __ sub(len, len, rscratch1); 3719 __ orr(tmp6, tmp6, tmp1); 3720 __ tst(tmp6, UPPER_BIT_MASK); 3721 __ br(Assembler::NE, RET_TRUE); 3722 3723 __ bind(ALIGNED); 3724 __ cmp(len, large_loop_size); 3725 __ br(Assembler::LT, CHECK_16); 3726 // Perform 16-byte load as early return in pre-loop to handle situation 3727 // when initially aligned large array has negative values at starting bytes, 3728 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3729 // slower. Cases with negative bytes further ahead won't be affected that 3730 // much. In fact, it'll be faster due to early loads, less instructions and 3731 // less branches in LARGE_LOOP. 3732 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3733 __ sub(len, len, 16); 3734 __ orr(tmp6, tmp6, tmp1); 3735 __ tst(tmp6, UPPER_BIT_MASK); 3736 __ br(Assembler::NE, RET_TRUE); 3737 __ cmp(len, large_loop_size); 3738 __ br(Assembler::LT, CHECK_16); 3739 3740 if (SoftwarePrefetchHintDistance >= 0 3741 && SoftwarePrefetchHintDistance >= dcache_line) { 3742 // initial prefetch 3743 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3744 } 3745 __ bind(LARGE_LOOP); 3746 if (SoftwarePrefetchHintDistance >= 0) { 3747 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3748 } 3749 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3750 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3751 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3752 // instructions per cycle and have less branches, but this approach disables 3753 // early return, thus, all 64 bytes are loaded and checked every time. 
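    // Rough sketch of the per-64-byte check performed below (illustration only):
    //   uint64_t m = w0 | w1 | w2 | w3 | w4 | w5 | w6 | w7;  // the eight 8-byte words loaded below
    //   if (m & 0x8080808080808080) return true;             // some byte has its top bit set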
3754 __ ldp(tmp2, tmp3, Address(ary1)); 3755 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3756 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3757 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3758 __ add(ary1, ary1, large_loop_size); 3759 __ sub(len, len, large_loop_size); 3760 __ orr(tmp2, tmp2, tmp3); 3761 __ orr(tmp4, tmp4, tmp5); 3762 __ orr(rscratch1, rscratch1, rscratch2); 3763 __ orr(tmp6, tmp6, tmp1); 3764 __ orr(tmp2, tmp2, tmp4); 3765 __ orr(rscratch1, rscratch1, tmp6); 3766 __ orr(tmp2, tmp2, rscratch1); 3767 __ tst(tmp2, UPPER_BIT_MASK); 3768 __ br(Assembler::NE, RET_TRUE); 3769 __ cmp(len, large_loop_size); 3770 __ br(Assembler::GE, LARGE_LOOP); 3771 3772 __ bind(CHECK_16); // small 16-byte load pre-loop 3773 __ cmp(len, (u1)16); 3774 __ br(Assembler::LT, POST_LOOP16); 3775 3776 __ bind(LOOP16); // small 16-byte load loop 3777 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3778 __ sub(len, len, 16); 3779 __ orr(tmp2, tmp2, tmp3); 3780 __ tst(tmp2, UPPER_BIT_MASK); 3781 __ br(Assembler::NE, RET_TRUE); 3782 __ cmp(len, (u1)16); 3783 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3784 3785 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3786 __ cmp(len, (u1)8); 3787 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3788 __ ldr(tmp3, Address(__ post(ary1, 8))); 3789 __ sub(len, len, 8); 3790 __ tst(tmp3, UPPER_BIT_MASK); 3791 __ br(Assembler::NE, RET_TRUE); 3792 3793 __ bind(POST_LOOP16_LOAD_TAIL); 3794 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3795 __ ldr(tmp1, Address(ary1)); 3796 __ mov(tmp2, 64); 3797 __ sub(tmp4, tmp2, len, __ LSL, 3); 3798 __ lslv(tmp1, tmp1, tmp4); 3799 __ tst(tmp1, UPPER_BIT_MASK); 3800 __ br(Assembler::NE, RET_TRUE); 3801 // Fallthrough 3802 3803 __ bind(RET_FALSE); 3804 __ pop(spilled_regs, sp); 3805 __ leave(); 3806 __ mov(result, zr); 3807 __ ret(lr); 3808 3809 __ bind(RET_TRUE); 3810 __ pop(spilled_regs, sp); 3811 __ bind(RET_TRUE_NO_POP); 3812 __ leave(); 3813 __ mov(result, 1); 3814 __ ret(lr); 3815 3816 __ bind(DONE); 3817 __ pop(spilled_regs, sp); 3818 __ leave(); 3819 __ ret(lr); 3820 return entry; 3821 } 3822 3823 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3824 bool usePrefetch, Label &NOT_EQUAL) { 3825 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3826 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3827 tmp7 = r12, tmp8 = r13; 3828 Label LOOP; 3829 3830 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3831 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3832 __ bind(LOOP); 3833 if (usePrefetch) { 3834 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3835 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3836 } 3837 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3838 __ eor(tmp1, tmp1, tmp2); 3839 __ eor(tmp3, tmp3, tmp4); 3840 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3841 __ orr(tmp1, tmp1, tmp3); 3842 __ cbnz(tmp1, NOT_EQUAL); 3843 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3844 __ eor(tmp5, tmp5, tmp6); 3845 __ eor(tmp7, tmp7, tmp8); 3846 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3847 __ orr(tmp5, tmp5, tmp7); 3848 __ cbnz(tmp5, NOT_EQUAL); 3849 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3850 __ eor(tmp1, tmp1, tmp2); 3851 __ eor(tmp3, tmp3, tmp4); 3852 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3853 __ orr(tmp1, tmp1, tmp3); 3854 __ cbnz(tmp1, NOT_EQUAL); 3855 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3856 __ eor(tmp5, tmp5, tmp6); 
3857 __ sub(cnt1, cnt1, 8 * wordSize); 3858 __ eor(tmp7, tmp7, tmp8); 3859 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3860 // tmp6 is not used. MacroAssembler::subs is used here (rather than 3861 // cmp) because subs allows an unlimited range of immediate operand. 3862 __ subs(tmp6, cnt1, loopThreshold); 3863 __ orr(tmp5, tmp5, tmp7); 3864 __ cbnz(tmp5, NOT_EQUAL); 3865 __ br(__ GE, LOOP); 3866 // post-loop 3867 __ eor(tmp1, tmp1, tmp2); 3868 __ eor(tmp3, tmp3, tmp4); 3869 __ orr(tmp1, tmp1, tmp3); 3870 __ sub(cnt1, cnt1, 2 * wordSize); 3871 __ cbnz(tmp1, NOT_EQUAL); 3872 } 3873 3874 void generate_large_array_equals_loop_simd(int loopThreshold, 3875 bool usePrefetch, Label &NOT_EQUAL) { 3876 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3877 tmp2 = rscratch2; 3878 Label LOOP; 3879 3880 __ bind(LOOP); 3881 if (usePrefetch) { 3882 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3883 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3884 } 3885 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 3886 __ sub(cnt1, cnt1, 8 * wordSize); 3887 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 3888 __ subs(tmp1, cnt1, loopThreshold); 3889 __ eor(v0, __ T16B, v0, v4); 3890 __ eor(v1, __ T16B, v1, v5); 3891 __ eor(v2, __ T16B, v2, v6); 3892 __ eor(v3, __ T16B, v3, v7); 3893 __ orr(v0, __ T16B, v0, v1); 3894 __ orr(v1, __ T16B, v2, v3); 3895 __ orr(v0, __ T16B, v0, v1); 3896 __ umov(tmp1, v0, __ D, 0); 3897 __ umov(tmp2, v0, __ D, 1); 3898 __ orr(tmp1, tmp1, tmp2); 3899 __ cbnz(tmp1, NOT_EQUAL); 3900 __ br(__ GE, LOOP); 3901 } 3902 3903 // a1 = r1 - array1 address 3904 // a2 = r2 - array2 address 3905 // result = r0 - return value. Already contains "false" 3906 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 3907 // r3-r5 are reserved temporary registers 3908 address generate_large_array_equals() { 3909 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3910 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3911 tmp7 = r12, tmp8 = r13; 3912 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 3913 SMALL_LOOP, POST_LOOP; 3914 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 3915 // calculate if at least 32 prefetched bytes are used 3916 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 3917 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 3918 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 3919 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 3920 tmp5, tmp6, tmp7, tmp8); 3921 3922 __ align(CodeEntryAlignment); 3923 3924 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 3925 3926 address entry = __ pc(); 3927 __ enter(); 3928 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 3929 // also advance pointers to use post-increment instead of pre-increment 3930 __ add(a1, a1, wordSize); 3931 __ add(a2, a2, wordSize); 3932 if (AvoidUnalignedAccesses) { 3933 // both implementations (SIMD/nonSIMD) are using relatively large load 3934 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 3935 // on some CPUs in case of address is not at least 16-byte aligned. 3936 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 3937 // load if needed at least for 1st address and make if 16-byte aligned. 
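      // Rough sketch of the alignment step below (illustration only):
      //   if ((uintptr_t)a1 & 8) {                            // only 8-byte aligned
      //     if (*(uint64_t *)a1 != *(uint64_t *)a2) return false;
      //     a1 += 8; a2 += 8; cnt1 -= 8;                      // now 16-byte aligned
      //   }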
3938 Label ALIGNED16; 3939 __ tbz(a1, 3, ALIGNED16); 3940 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3941 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3942 __ sub(cnt1, cnt1, wordSize); 3943 __ eor(tmp1, tmp1, tmp2); 3944 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3945 __ bind(ALIGNED16); 3946 } 3947 if (UseSIMDForArrayEquals) { 3948 if (SoftwarePrefetchHintDistance >= 0) { 3949 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3950 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3951 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3952 /* prfm = */ true, NOT_EQUAL); 3953 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3954 __ br(__ LT, TAIL); 3955 } 3956 __ bind(NO_PREFETCH_LARGE_LOOP); 3957 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3958 /* prfm = */ false, NOT_EQUAL); 3959 } else { 3960 __ push(spilled_regs, sp); 3961 if (SoftwarePrefetchHintDistance >= 0) { 3962 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3963 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3964 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3965 /* prfm = */ true, NOT_EQUAL); 3966 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3967 __ br(__ LT, TAIL); 3968 } 3969 __ bind(NO_PREFETCH_LARGE_LOOP); 3970 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3971 /* prfm = */ false, NOT_EQUAL); 3972 } 3973 __ bind(TAIL); 3974 __ cbz(cnt1, EQUAL); 3975 __ subs(cnt1, cnt1, wordSize); 3976 __ br(__ LE, POST_LOOP); 3977 __ bind(SMALL_LOOP); 3978 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3979 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3980 __ subs(cnt1, cnt1, wordSize); 3981 __ eor(tmp1, tmp1, tmp2); 3982 __ cbnz(tmp1, NOT_EQUAL); 3983 __ br(__ GT, SMALL_LOOP); 3984 __ bind(POST_LOOP); 3985 __ ldr(tmp1, Address(a1, cnt1)); 3986 __ ldr(tmp2, Address(a2, cnt1)); 3987 __ eor(tmp1, tmp1, tmp2); 3988 __ cbnz(tmp1, NOT_EQUAL); 3989 __ bind(EQUAL); 3990 __ mov(result, true); 3991 __ bind(NOT_EQUAL); 3992 if (!UseSIMDForArrayEquals) { 3993 __ pop(spilled_regs, sp); 3994 } 3995 __ bind(NOT_EQUAL_NO_POP); 3996 __ leave(); 3997 __ ret(lr); 3998 return entry; 3999 } 4000 4001 address generate_dsin_dcos(bool isCos) { 4002 __ align(CodeEntryAlignment); 4003 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 4004 address start = __ pc(); 4005 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 4006 (address)StubRoutines::aarch64::_two_over_pi, 4007 (address)StubRoutines::aarch64::_pio2, 4008 (address)StubRoutines::aarch64::_dsin_coef, 4009 (address)StubRoutines::aarch64::_dcos_coef); 4010 return start; 4011 } 4012 4013 address generate_dlog() { 4014 __ align(CodeEntryAlignment); 4015 StubCodeMark mark(this, "StubRoutines", "dlog"); 4016 address entry = __ pc(); 4017 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4018 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4019 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4020 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4021 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4022 return entry; 4023 } 4024 4025 // code for comparing 16 bytes of strings with same encoding 4026 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4027 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4028 __ ldr(rscratch1, Address(__ post(str1, 8))); 4029 __ eor(rscratch2, tmp1, tmp2); 4030 __ ldr(cnt1, Address(__ post(str2, 8))); 4031 __ cbnz(rscratch2, DIFF1); 4032 __ ldr(tmp1, Address(__ post(str1, 8))); 4033 __ eor(rscratch2, rscratch1, cnt1); 4034 __ ldr(tmp2, Address(__ post(str2, 8))); 4035 __ cbnz(rscratch2, DIFF2); 4036 } 4037 4038 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4039 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4040 Label &DIFF2) { 4041 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4042 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4043 4044 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4045 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4046 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4047 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4048 4049 __ fmovd(tmpL, vtmp3); 4050 __ eor(rscratch2, tmp3, tmpL); 4051 __ cbnz(rscratch2, DIFF2); 4052 4053 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4054 __ umov(tmpL, vtmp3, __ D, 1); 4055 __ eor(rscratch2, tmpU, tmpL); 4056 __ cbnz(rscratch2, DIFF1); 4057 4058 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4059 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4060 __ fmovd(tmpL, vtmp); 4061 __ eor(rscratch2, tmp3, tmpL); 4062 __ cbnz(rscratch2, DIFF2); 4063 4064 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4065 __ umov(tmpL, vtmp, __ D, 1); 4066 __ eor(rscratch2, tmpU, tmpL); 4067 __ cbnz(rscratch2, DIFF1); 4068 } 4069 4070 // r0 = result 4071 // r1 = str1 4072 // r2 = cnt1 4073 // r3 = str2 4074 // r4 = cnt2 4075 // r10 = tmp1 4076 // r11 = tmp2 4077 address generate_compare_long_string_different_encoding(bool isLU) { 4078 __ align(CodeEntryAlignment); 4079 StubCodeMark mark(this, "StubRoutines", isLU 4080 ? 
"compare_long_string_different_encoding LU" 4081 : "compare_long_string_different_encoding UL"); 4082 address entry = __ pc(); 4083 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4084 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER, 4085 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4086 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4087 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4088 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4089 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4090 4091 int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2); 4092 4093 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4094 // cnt2 == amount of characters left to compare 4095 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4096 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4097 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4098 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4099 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4100 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4101 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4102 __ eor(rscratch2, tmp1, tmp2); 4103 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4104 __ mov(rscratch1, tmp2); 4105 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4106 Register strU = isLU ? str2 : str1, 4107 strL = isLU ? str1 : str2, 4108 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4109 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4110 __ push(spilled_regs, sp); 4111 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4112 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4113 4114 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4115 4116 if (SoftwarePrefetchHintDistance >= 0) { 4117 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4118 __ br(__ LT, SMALL_LOOP); 4119 __ bind(LARGE_LOOP_PREFETCH); 4120 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4121 __ mov(tmp4, 2); 4122 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4123 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4124 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4125 __ subs(tmp4, tmp4, 1); 4126 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4127 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4128 __ mov(tmp4, 2); 4129 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4130 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4131 __ subs(tmp4, tmp4, 1); 4132 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4133 __ sub(cnt2, cnt2, 64); 4134 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4135 __ br(__ GE, LARGE_LOOP_PREFETCH); 4136 } 4137 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4138 __ subs(cnt2, cnt2, 16); 4139 __ br(__ LT, TAIL); 4140 __ b(SMALL_LOOP_ENTER); 4141 __ bind(SMALL_LOOP); // smaller loop 4142 __ subs(cnt2, cnt2, 16); 4143 __ bind(SMALL_LOOP_ENTER); 4144 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4145 __ br(__ GE, SMALL_LOOP); 4146 __ cbz(cnt2, LOAD_LAST); 4147 __ bind(TAIL); // 1..15 characters left 4148 __ subs(zr, cnt2, -8); 4149 __ br(__ GT, TAIL_LOAD_16); 4150 __ ldrd(vtmp, Address(tmp2)); 4151 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ); 4152 4153 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4154 __ fmovd(tmpL, vtmp3); 4155 __ eor(rscratch2, tmp3, tmpL); 4156 __ cbnz(rscratch2, DIFF2); 4157 __ umov(tmpL, vtmp3, __ D, 1); 4158 __ eor(rscratch2, tmpU, tmpL); 4159 __ cbnz(rscratch2, DIFF1); 4160 __ b(LOAD_LAST); 4161 __ bind(TAIL_LOAD_16); 4162 __ ldrq(vtmp, Address(tmp2)); 4163 __ ldr(tmpU, Address(__ post(cnt1, 
8))); 4164 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4165 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4166 __ fmovd(tmpL, vtmp3); 4167 __ eor(rscratch2, tmp3, tmpL); 4168 __ cbnz(rscratch2, DIFF2); 4169 4170 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4171 __ umov(tmpL, vtmp3, __ D, 1); 4172 __ eor(rscratch2, tmpU, tmpL); 4173 __ cbnz(rscratch2, DIFF1); 4174 4175 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4176 __ fmovd(tmpL, vtmp); 4177 __ eor(rscratch2, tmp3, tmpL); 4178 __ cbnz(rscratch2, DIFF2); 4179 4180 __ umov(tmpL, vtmp, __ D, 1); 4181 __ eor(rscratch2, tmpU, tmpL); 4182 __ cbnz(rscratch2, DIFF1); 4183 __ b(LOAD_LAST); 4184 __ bind(DIFF2); 4185 __ mov(tmpU, tmp3); 4186 __ bind(DIFF1); 4187 __ pop(spilled_regs, sp); 4188 __ b(CALCULATE_DIFFERENCE); 4189 __ bind(LOAD_LAST); 4190 __ pop(spilled_regs, sp); 4191 4192 __ ldrs(vtmp, Address(strL)); 4193 __ ldr(tmpU, Address(strU)); 4194 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4195 __ fmovd(tmpL, vtmp); 4196 4197 __ eor(rscratch2, tmpU, tmpL); 4198 __ cbz(rscratch2, DONE); 4199 4200 // Find the first different characters in the longwords and 4201 // compute their difference. 4202 __ bind(CALCULATE_DIFFERENCE); 4203 __ rev(rscratch2, rscratch2); 4204 __ clz(rscratch2, rscratch2); 4205 __ andr(rscratch2, rscratch2, -16); 4206 __ lsrv(tmp1, tmp1, rscratch2); 4207 __ uxthw(tmp1, tmp1); 4208 __ lsrv(rscratch1, rscratch1, rscratch2); 4209 __ uxthw(rscratch1, rscratch1); 4210 __ subw(result, tmp1, rscratch1); 4211 __ bind(DONE); 4212 __ ret(lr); 4213 return entry; 4214 } 4215 4216 // r0 = result 4217 // r1 = str1 4218 // r2 = cnt1 4219 // r3 = str2 4220 // r4 = cnt2 4221 // r10 = tmp1 4222 // r11 = tmp2 4223 address generate_compare_long_string_same_encoding(bool isLL) { 4224 __ align(CodeEntryAlignment); 4225 StubCodeMark mark(this, "StubRoutines", isLL 4226 ? "compare_long_string_same_encoding LL" 4227 : "compare_long_string_same_encoding UU"); 4228 address entry = __ pc(); 4229 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4230 tmp1 = r10, tmp2 = r11; 4231 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4232 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4233 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4234 // exit from large loop when less than 64 bytes left to read or we're about 4235 // to prefetch memory behind array border 4236 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4237 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4238 // update cnt2 counter with already loaded 8 bytes 4239 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4240 // update pointers, because of previous read 4241 __ add(str1, str1, wordSize); 4242 __ add(str2, str2, wordSize); 4243 if (SoftwarePrefetchHintDistance >= 0) { 4244 __ bind(LARGE_LOOP_PREFETCH); 4245 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4246 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4247 compare_string_16_bytes_same(DIFF, DIFF2); 4248 compare_string_16_bytes_same(DIFF, DIFF2); 4249 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4250 compare_string_16_bytes_same(DIFF, DIFF2); 4251 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4252 compare_string_16_bytes_same(DIFF, DIFF2); 4253 __ br(__ GT, LARGE_LOOP_PREFETCH); 4254 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4255 // less than 16 bytes left? 4256 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4257 __ br(__ LT, TAIL); 4258 } 4259 __ bind(SMALL_LOOP); 4260 compare_string_16_bytes_same(DIFF, DIFF2); 4261 __ subs(cnt2, cnt2, isLL ? 
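    // How the DIFF / CALCULATE_DIFFERENCE blocks in these long-string compare
    // stubs locate the first differing character, as a rough C sketch
    // (illustration only; byte_reverse_64() and count_leading_zeros() are
    // hypothetical stand-ins for REV and CLZ):
    //   uint64_t x = a ^ b;                          // nonzero: the two chunks differ
    //   int n = count_leading_zeros(byte_reverse_64(x));
    //   n &= isLL ? ~7 : ~15;                        // round down to a byte/char boundary
    //   result = (int)((a >> n) & msk) - (int)((b >> n) & msk);  // msk = 0xff or 0xffff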
16 : 8); 4262 __ br(__ GE, SMALL_LOOP); 4263 __ bind(TAIL); 4264 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4265 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4266 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4267 __ br(__ LE, CHECK_LAST); 4268 __ eor(rscratch2, tmp1, tmp2); 4269 __ cbnz(rscratch2, DIFF); 4270 __ ldr(tmp1, Address(__ post(str1, 8))); 4271 __ ldr(tmp2, Address(__ post(str2, 8))); 4272 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4273 __ bind(CHECK_LAST); 4274 if (!isLL) { 4275 __ add(cnt2, cnt2, cnt2); // now in bytes 4276 } 4277 __ eor(rscratch2, tmp1, tmp2); 4278 __ cbnz(rscratch2, DIFF); 4279 __ ldr(rscratch1, Address(str1, cnt2)); 4280 __ ldr(cnt1, Address(str2, cnt2)); 4281 __ eor(rscratch2, rscratch1, cnt1); 4282 __ cbz(rscratch2, LENGTH_DIFF); 4283 // Find the first different characters in the longwords and 4284 // compute their difference. 4285 __ bind(DIFF2); 4286 __ rev(rscratch2, rscratch2); 4287 __ clz(rscratch2, rscratch2); 4288 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4289 __ lsrv(rscratch1, rscratch1, rscratch2); 4290 if (isLL) { 4291 __ lsrv(cnt1, cnt1, rscratch2); 4292 __ uxtbw(rscratch1, rscratch1); 4293 __ uxtbw(cnt1, cnt1); 4294 } else { 4295 __ lsrv(cnt1, cnt1, rscratch2); 4296 __ uxthw(rscratch1, rscratch1); 4297 __ uxthw(cnt1, cnt1); 4298 } 4299 __ subw(result, rscratch1, cnt1); 4300 __ b(LENGTH_DIFF); 4301 __ bind(DIFF); 4302 __ rev(rscratch2, rscratch2); 4303 __ clz(rscratch2, rscratch2); 4304 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4305 __ lsrv(tmp1, tmp1, rscratch2); 4306 if (isLL) { 4307 __ lsrv(tmp2, tmp2, rscratch2); 4308 __ uxtbw(tmp1, tmp1); 4309 __ uxtbw(tmp2, tmp2); 4310 } else { 4311 __ lsrv(tmp2, tmp2, rscratch2); 4312 __ uxthw(tmp1, tmp1); 4313 __ uxthw(tmp2, tmp2); 4314 } 4315 __ subw(result, tmp1, tmp2); 4316 __ b(LENGTH_DIFF); 4317 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4318 __ eor(rscratch2, tmp1, tmp2); 4319 __ cbnz(rscratch2, DIFF); 4320 __ bind(LENGTH_DIFF); 4321 __ ret(lr); 4322 return entry; 4323 } 4324 4325 void generate_compare_long_strings() { 4326 StubRoutines::aarch64::_compare_long_string_LL 4327 = generate_compare_long_string_same_encoding(true); 4328 StubRoutines::aarch64::_compare_long_string_UU 4329 = generate_compare_long_string_same_encoding(false); 4330 StubRoutines::aarch64::_compare_long_string_LU 4331 = generate_compare_long_string_different_encoding(true); 4332 StubRoutines::aarch64::_compare_long_string_UL 4333 = generate_compare_long_string_different_encoding(false); 4334 } 4335 4336 // R0 = result 4337 // R1 = str2 4338 // R2 = cnt1 4339 // R3 = str1 4340 // R4 = cnt2 4341 // This generic linear code use few additional ideas, which makes it faster: 4342 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 4343 // in order to skip initial loading(help in systems with 1 ld pipeline) 4344 // 2) we can use "fast" algorithm of finding single character to search for 4345 // first symbol with less branches(1 branch per each loaded register instead 4346 // of branch for each symbol), so, this is where constants like 4347 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 4348 // 3) after loading and analyzing 1st register of source string, it can be 4349 // used to search for every 1st character entry, saving few loads in 4350 // comparison with "simplier-but-slower" implementation 4351 // 4) in order to avoid lots of push/pop operations, code below is heavily 4352 // re-using/re-initializing/compressing register values, which makes code 4353 // larger and a bit less readable, however, 
most of extra operations are 4354 // issued during loads or branches, so, penalty is minimal 4355 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4356 const char* stubName = str1_isL 4357 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4358 : "indexof_linear_uu"; 4359 __ align(CodeEntryAlignment); 4360 StubCodeMark mark(this, "StubRoutines", stubName); 4361 address entry = __ pc(); 4362 4363 int str1_chr_size = str1_isL ? 1 : 2; 4364 int str2_chr_size = str2_isL ? 1 : 2; 4365 int str1_chr_shift = str1_isL ? 0 : 1; 4366 int str2_chr_shift = str2_isL ? 0 : 1; 4367 bool isL = str1_isL && str2_isL; 4368 // parameters 4369 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4370 // temporary registers 4371 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4372 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4373 // redefinitions 4374 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4375 4376 __ push(spilled_regs, sp); 4377 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4378 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4379 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4380 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4381 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4382 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4383 // Read whole register from str1. It is safe, because length >=8 here 4384 __ ldr(ch1, Address(str1)); 4385 // Read whole register from str2. It is safe, because length >=8 here 4386 __ ldr(ch2, Address(str2)); 4387 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4388 if (str1_isL != str2_isL) { 4389 __ eor(v0, __ T16B, v0, v0); 4390 } 4391 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4392 __ mul(first, first, tmp1); 4393 // check if we have less than 1 register to check 4394 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4395 if (str1_isL != str2_isL) { 4396 __ fmovd(v1, ch1); 4397 } 4398 __ br(__ LE, L_SMALL); 4399 __ eor(ch2, first, ch2); 4400 if (str1_isL != str2_isL) { 4401 __ zip1(v1, __ T16B, v1, v0); 4402 } 4403 __ sub(tmp2, ch2, tmp1); 4404 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4405 __ bics(tmp2, tmp2, ch2); 4406 if (str1_isL != str2_isL) { 4407 __ fmovd(ch1, v1); 4408 } 4409 __ br(__ NE, L_HAS_ZERO); 4410 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4411 __ add(result, result, wordSize/str2_chr_size); 4412 __ add(str2, str2, wordSize); 4413 __ br(__ LT, L_POST_LOOP); 4414 __ BIND(L_LOOP); 4415 __ ldr(ch2, Address(str2)); 4416 __ eor(ch2, first, ch2); 4417 __ sub(tmp2, ch2, tmp1); 4418 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4419 __ bics(tmp2, tmp2, ch2); 4420 __ br(__ NE, L_HAS_ZERO); 4421 __ BIND(L_LOOP_PROCEED); 4422 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4423 __ add(str2, str2, wordSize); 4424 __ add(result, result, wordSize/str2_chr_size); 4425 __ br(__ GE, L_LOOP); 4426 __ BIND(L_POST_LOOP); 4427 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4428 __ br(__ LE, NOMATCH); 4429 __ ldr(ch2, Address(str2)); 4430 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4431 __ eor(ch2, first, ch2); 4432 __ sub(tmp2, ch2, tmp1); 4433 __ orr(ch2, ch2, str2_isL ? 
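    // The first-character search here (and in the main loop above) uses the
    // classic SWAR zero-lane test, roughly (a sketch for the Latin1 case; the
    // UTF-16 case is identical with 16-bit lanes and the 0x0001.../0x7fff...
    // constants):
    //   x = chunk ^ (first * 0x0101010101010101);   // lanes equal to 'first' become 0
    //   t = (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f);
    //   // t now has 0x80 set in every matching lane; nonzero => candidate match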
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4434 __ mov(tmp4, -1); // all bits set 4435 __ b(L_SMALL_PROCEED); 4436 __ align(OptoLoopAlignment); 4437 __ BIND(L_SMALL); 4438 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4439 __ eor(ch2, first, ch2); 4440 if (str1_isL != str2_isL) { 4441 __ zip1(v1, __ T16B, v1, v0); 4442 } 4443 __ sub(tmp2, ch2, tmp1); 4444 __ mov(tmp4, -1); // all bits set 4445 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4446 if (str1_isL != str2_isL) { 4447 __ fmovd(ch1, v1); // move converted 4 symbols 4448 } 4449 __ BIND(L_SMALL_PROCEED); 4450 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4451 __ bic(tmp2, tmp2, ch2); 4452 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4453 __ rbit(tmp2, tmp2); 4454 __ br(__ EQ, NOMATCH); 4455 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4456 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4457 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4458 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4459 if (str2_isL) { // LL 4460 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4461 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4462 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4463 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4464 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4465 } else { 4466 __ mov(ch2, 0xE); // all bits in byte set except last one 4467 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4468 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4469 __ lslv(tmp2, tmp2, tmp4); 4470 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4471 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4472 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4473 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4474 } 4475 __ cmp(ch1, ch2); 4476 __ mov(tmp4, wordSize/str2_chr_size); 4477 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4478 __ BIND(L_SMALL_CMP_LOOP); 4479 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4480 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4481 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4482 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4483 __ add(tmp4, tmp4, 1); 4484 __ cmp(tmp4, cnt1); 4485 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4486 __ cmp(first, ch2); 4487 __ br(__ EQ, L_SMALL_CMP_LOOP); 4488 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4489 __ cbz(tmp2, NOMATCH); // no more matches. exit 4490 __ clz(tmp4, tmp2); 4491 __ add(result, result, 1); // advance index 4492 __ add(str2, str2, str2_chr_size); // advance pointer 4493 __ b(L_SMALL_HAS_ZERO_LOOP); 4494 __ align(OptoLoopAlignment); 4495 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4496 __ cmp(first, ch2); 4497 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4498 __ b(DONE); 4499 __ align(OptoLoopAlignment); 4500 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4501 if (str2_isL) { // LL 4502 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4503 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 
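    // (rough sketch, illustration only) tmp2 holds the bit-reversed 0x80/0x8000
    // candidate markers and tmp4 = clz(tmp2), so at this point:
    //   byte offset of the candidate within the chunk = tmp4 >> LogBitsPerByte;
    //   tmp2 = (tmp2 << tmp4) << 1;   // the lslv + lsl pair drops this candidate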
4504 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4505 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4506 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4507 } else { 4508 __ mov(ch2, 0xE); // all bits in byte set except last one 4509 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4510 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4511 __ lslv(tmp2, tmp2, tmp4); 4512 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4513 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4514 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4515 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4516 } 4517 __ cmp(ch1, ch2); 4518 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4519 __ b(DONE); 4520 __ align(OptoLoopAlignment); 4521 __ BIND(L_HAS_ZERO); 4522 __ rbit(tmp2, tmp2); 4523 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4524 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4525 // It's fine because both counters are 32bit and are not changed in this 4526 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4527 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4528 __ sub(result, result, 1); 4529 __ BIND(L_HAS_ZERO_LOOP); 4530 __ mov(cnt1, wordSize/str2_chr_size); 4531 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4532 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4533 if (str2_isL) { 4534 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4535 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4536 __ lslv(tmp2, tmp2, tmp4); 4537 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4538 __ add(tmp4, tmp4, 1); 4539 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4540 __ lsl(tmp2, tmp2, 1); 4541 __ mov(tmp4, wordSize/str2_chr_size); 4542 } else { 4543 __ mov(ch2, 0xE); 4544 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4545 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4546 __ lslv(tmp2, tmp2, tmp4); 4547 __ add(tmp4, tmp4, 1); 4548 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4549 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4550 __ lsl(tmp2, tmp2, 1); 4551 __ mov(tmp4, wordSize/str2_chr_size); 4552 __ sub(str2, str2, str2_chr_size); 4553 } 4554 __ cmp(ch1, ch2); 4555 __ mov(tmp4, wordSize/str2_chr_size); 4556 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4557 __ BIND(L_CMP_LOOP); 4558 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4559 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4560 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4561 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4562 __ add(tmp4, tmp4, 1); 4563 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4564 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4565 __ cmp(cnt1, ch2); 4566 __ br(__ EQ, L_CMP_LOOP); 4567 __ BIND(L_CMP_LOOP_NOMATCH); 4568 // here we're not matched 4569 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. 
Proceed to main loop 4570 __ clz(tmp4, tmp2); 4571 __ add(str2, str2, str2_chr_size); // advance pointer 4572 __ b(L_HAS_ZERO_LOOP); 4573 __ align(OptoLoopAlignment); 4574 __ BIND(L_CMP_LOOP_LAST_CMP); 4575 __ cmp(cnt1, ch2); 4576 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4577 __ b(DONE); 4578 __ align(OptoLoopAlignment); 4579 __ BIND(L_CMP_LOOP_LAST_CMP2); 4580 if (str2_isL) { 4581 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4582 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4583 __ lslv(tmp2, tmp2, tmp4); 4584 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4585 __ add(tmp4, tmp4, 1); 4586 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4587 __ lsl(tmp2, tmp2, 1); 4588 } else { 4589 __ mov(ch2, 0xE); 4590 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4591 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4592 __ lslv(tmp2, tmp2, tmp4); 4593 __ add(tmp4, tmp4, 1); 4594 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4595 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4596 __ lsl(tmp2, tmp2, 1); 4597 __ sub(str2, str2, str2_chr_size); 4598 } 4599 __ cmp(ch1, ch2); 4600 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4601 __ b(DONE); 4602 __ align(OptoLoopAlignment); 4603 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4604 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4605 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4606 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4607 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4608 // result by analyzed characters value, so, we can just reset lower bits 4609 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4610 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4611 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4612 // index of last analyzed substring inside current octet. So, str2 in at 4613 // respective start address. 
We need to advance it to next octet 4614 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4615 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4616 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4617 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4618 __ movw(cnt2, cnt2); 4619 __ b(L_LOOP_PROCEED); 4620 __ align(OptoLoopAlignment); 4621 __ BIND(NOMATCH); 4622 __ mov(result, -1); 4623 __ BIND(DONE); 4624 __ pop(spilled_regs, sp); 4625 __ ret(lr); 4626 return entry; 4627 } 4628 4629 void generate_string_indexof_stubs() { 4630 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4631 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4632 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4633 } 4634 4635 void inflate_and_store_2_fp_registers(bool generatePrfm, 4636 FloatRegister src1, FloatRegister src2) { 4637 Register dst = r1; 4638 __ zip1(v1, __ T16B, src1, v0); 4639 __ zip2(v2, __ T16B, src1, v0); 4640 if (generatePrfm) { 4641 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4642 } 4643 __ zip1(v3, __ T16B, src2, v0); 4644 __ zip2(v4, __ T16B, src2, v0); 4645 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4646 } 4647 4648 // R0 = src 4649 // R1 = dst 4650 // R2 = len 4651 // R3 = len >> 3 4652 // V0 = 0 4653 // v1 = loaded 8 bytes 4654 address generate_large_byte_array_inflate() { 4655 __ align(CodeEntryAlignment); 4656 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4657 address entry = __ pc(); 4658 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4659 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4660 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4661 4662 // do one more 8-byte read to have address 16-byte aligned in most cases 4663 // also use single store instruction 4664 __ ldrd(v2, __ post(src, 8)); 4665 __ sub(octetCounter, octetCounter, 2); 4666 __ zip1(v1, __ T16B, v1, v0); 4667 __ zip1(v2, __ T16B, v2, v0); 4668 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4669 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4670 __ subs(rscratch1, octetCounter, large_loop_threshold); 4671 __ br(__ LE, LOOP_START); 4672 __ b(LOOP_PRFM_START); 4673 __ bind(LOOP_PRFM); 4674 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4675 __ bind(LOOP_PRFM_START); 4676 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4677 __ sub(octetCounter, octetCounter, 8); 4678 __ subs(rscratch1, octetCounter, large_loop_threshold); 4679 inflate_and_store_2_fp_registers(true, v3, v4); 4680 inflate_and_store_2_fp_registers(true, v5, v6); 4681 __ br(__ GT, LOOP_PRFM); 4682 __ cmp(octetCounter, (u1)8); 4683 __ br(__ LT, DONE); 4684 __ bind(LOOP); 4685 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4686 __ bind(LOOP_START); 4687 __ sub(octetCounter, octetCounter, 8); 4688 __ cmp(octetCounter, (u1)8); 4689 inflate_and_store_2_fp_registers(false, v3, v4); 4690 inflate_and_store_2_fp_registers(false, v5, v6); 4691 __ br(__ GE, LOOP); 4692 __ bind(DONE); 4693 __ ret(lr); 4694 return entry; 4695 } 4696 4697 /** 4698 * Arguments: 4699 * 4700 * Input: 4701 * c_rarg0 - current state address 4702 * c_rarg1 - H key address 4703 * c_rarg2 - data address 4704 * c_rarg3 - number of blocks 4705 * 4706 * Output: 4707 * Updated state at c_rarg0 4708 */ 4709 address generate_ghash_processBlocks() { 4710 // Bafflingly, GCM uses 
little-endian for the byte order, but 4711 // big-endian for the bit order. For example, the polynomial 1 is 4712 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4713 // 4714 // So, we must either reverse the bytes in each word and do 4715 // everything big-endian or reverse the bits in each byte and do 4716 // it little-endian. On AArch64 it's more idiomatic to reverse 4717 // the bits in each byte (we have an instruction, RBIT, to do 4718 // that) and keep the data in little-endian bit order throught the 4719 // calculation, bit-reversing the inputs and outputs. 4720 4721 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4722 __ align(wordSize * 2); 4723 address p = __ pc(); 4724 __ emit_int64(0x87); // The low-order bits of the field 4725 // polynomial (i.e. p = z^7+z^2+z+1) 4726 // repeated in the low and high parts of a 4727 // 128-bit vector 4728 __ emit_int64(0x87); 4729 4730 __ align(CodeEntryAlignment); 4731 address start = __ pc(); 4732 4733 Register state = c_rarg0; 4734 Register subkeyH = c_rarg1; 4735 Register data = c_rarg2; 4736 Register blocks = c_rarg3; 4737 4738 FloatRegister vzr = v30; 4739 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4740 4741 __ ldrq(v0, Address(state)); 4742 __ ldrq(v1, Address(subkeyH)); 4743 4744 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4745 __ rbit(v0, __ T16B, v0); 4746 __ rev64(v1, __ T16B, v1); 4747 __ rbit(v1, __ T16B, v1); 4748 4749 __ ldrq(v26, p); 4750 4751 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4752 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4753 4754 { 4755 Label L_ghash_loop; 4756 __ bind(L_ghash_loop); 4757 4758 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4759 // reversing each byte 4760 __ rbit(v2, __ T16B, v2); 4761 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4762 4763 // Multiply state in v2 by subkey in v1 4764 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4765 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4766 /*temps*/v6, v20, v18, v21); 4767 // Reduce v7:v5 by the field polynomial 4768 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4769 4770 __ sub(blocks, blocks, 1); 4771 __ cbnz(blocks, L_ghash_loop); 4772 } 4773 4774 // The bit-reversed result is at this point in v0 4775 __ rev64(v1, __ T16B, v0); 4776 __ rbit(v1, __ T16B, v1); 4777 4778 __ st1(v1, __ T16B, state); 4779 __ ret(lr); 4780 4781 return start; 4782 } 4783 4784 // Continuation point for throwing of implicit exceptions that are 4785 // not handled in the current activation. Fabricates an exception 4786 // oop and initiates normal exception dispatching in this 4787 // frame. Since we need to preserve callee-saved values (currently 4788 // only for C2, but done for C1 as well) we need a callee-saved oop 4789 // map and therefore have to make these stubs into RuntimeStubs 4790 // rather than BufferBlobs. If the compiler needs all registers to 4791 // be preserved between the fault point and the exception handler 4792 // then it must assume responsibility for that in 4793 // AbstractCompiler::continuation_for_implicit_null_exception or 4794 // continuation_for_implicit_division_by_zero_exception. All other 4795 // implicit exceptions (e.g., NullPointerException or 4796 // AbstractMethodError on entry) are either at call sites or 4797 // otherwise assume that stack unwinding will be initiated, so 4798 // caller saved registers were assumed volatile in the compiler. 
4799 4800 #undef __ 4801 #define __ masm-> 4802 4803 address generate_throw_exception(const char* name, 4804 address runtime_entry, 4805 Register arg1 = noreg, 4806 Register arg2 = noreg) { 4807 // Information about frame layout at time of blocking runtime call. 4808 // Note that we only have to preserve callee-saved registers since 4809 // the compilers are responsible for supplying a continuation point 4810 // if they expect all registers to be preserved. 4811 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4812 enum layout { 4813 rfp_off = 0, 4814 rfp_off2, 4815 return_off, 4816 return_off2, 4817 framesize // inclusive of return address 4818 }; 4819 4820 int insts_size = 512; 4821 int locs_size = 64; 4822 4823 CodeBuffer code(name, insts_size, locs_size); 4824 OopMapSet* oop_maps = new OopMapSet(); 4825 MacroAssembler* masm = new MacroAssembler(&code); 4826 4827 address start = __ pc(); 4828 4829 // This is an inlined and slightly modified version of call_VM 4830 // which has the ability to fetch the return PC out of 4831 // thread-local storage and also sets up last_Java_sp slightly 4832 // differently than the real call_VM 4833 4834 __ enter(); // Save FP and LR before call 4835 4836 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4837 4838 // lr and fp are already in place 4839 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4840 4841 int frame_complete = __ pc() - start; 4842 4843 // Set up last_Java_sp and last_Java_fp 4844 address the_pc = __ pc(); 4845 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 4846 4847 // Call runtime 4848 if (arg1 != noreg) { 4849 assert(arg2 != c_rarg1, "clobbered"); 4850 __ mov(c_rarg1, arg1); 4851 } 4852 if (arg2 != noreg) { 4853 __ mov(c_rarg2, arg2); 4854 } 4855 __ mov(c_rarg0, rthread); 4856 BLOCK_COMMENT("call runtime_entry"); 4857 __ mov(rscratch1, runtime_entry); 4858 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4859 4860 // Generate oop map 4861 OopMap* map = new OopMap(framesize, 0); 4862 4863 oop_maps->add_gc_map(the_pc - start, map); 4864 4865 __ reset_last_Java_frame(true); 4866 __ maybe_isb(); 4867 4868 __ leave(); 4869 4870 // check for pending exceptions 4871 #ifdef ASSERT 4872 Label L; 4873 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4874 __ cbnz(rscratch1, L); 4875 __ should_not_reach_here(); 4876 __ bind(L); 4877 #endif // ASSERT 4878 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4879 4880 4881 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4882 RuntimeStub* stub = 4883 RuntimeStub::new_runtime_stub(name, 4884 &code, 4885 frame_complete, 4886 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4887 oop_maps, false); 4888 return stub->entry_point(); 4889 } 4890 4891 class MontgomeryMultiplyGenerator : public MacroAssembler { 4892 4893 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4894 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4895 4896 RegSet _toSave; 4897 bool _squaring; 4898 4899 public: 4900 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4901 : MacroAssembler(as->code()), _squaring(squaring) { 4902 4903 // Register allocation 4904 4905 Register reg = c_rarg0; 4906 Pa_base = reg; // Argument registers 4907 if (squaring) 4908 Pb_base = Pa_base; 4909 else 4910 Pb_base = ++reg; 4911 Pn_base = ++reg; 4912 Rlen= ++reg; 4913 inv = ++reg; 4914 Pm_base = ++reg; 4915 4916 // Working registers: 4917 Ra = ++reg; // The current digit of a, b, n, and m. 
4918 Rb = ++reg; 4919 Rm = ++reg; 4920 Rn = ++reg; 4921 4922 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4923 Pb = ++reg; 4924 Pm = ++reg; 4925 Pn = ++reg; 4926 4927 t0 = ++reg; // Three registers which form a 4928 t1 = ++reg; // triple-precision accumuator. 4929 t2 = ++reg; 4930 4931 Ri = ++reg; // Inner and outer loop indexes. 4932 Rj = ++reg; 4933 4934 Rhi_ab = ++reg; // Product registers: low and high parts 4935 Rlo_ab = ++reg; // of a*b and m*n. 4936 Rhi_mn = ++reg; 4937 Rlo_mn = ++reg; 4938 4939 // r19 and up are callee-saved. 4940 _toSave = RegSet::range(r19, reg) + Pm_base; 4941 } 4942 4943 private: 4944 void save_regs() { 4945 push(_toSave, sp); 4946 } 4947 4948 void restore_regs() { 4949 pop(_toSave, sp); 4950 } 4951 4952 template <typename T> 4953 void unroll_2(Register count, T block) { 4954 Label loop, end, odd; 4955 tbnz(count, 0, odd); 4956 cbz(count, end); 4957 align(16); 4958 bind(loop); 4959 (this->*block)(); 4960 bind(odd); 4961 (this->*block)(); 4962 subs(count, count, 2); 4963 br(Assembler::GT, loop); 4964 bind(end); 4965 } 4966 4967 template <typename T> 4968 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4969 Label loop, end, odd; 4970 tbnz(count, 0, odd); 4971 cbz(count, end); 4972 align(16); 4973 bind(loop); 4974 (this->*block)(d, s, tmp); 4975 bind(odd); 4976 (this->*block)(d, s, tmp); 4977 subs(count, count, 2); 4978 br(Assembler::GT, loop); 4979 bind(end); 4980 } 4981 4982 void pre1(RegisterOrConstant i) { 4983 block_comment("pre1"); 4984 // Pa = Pa_base; 4985 // Pb = Pb_base + i; 4986 // Pm = Pm_base; 4987 // Pn = Pn_base + i; 4988 // Ra = *Pa; 4989 // Rb = *Pb; 4990 // Rm = *Pm; 4991 // Rn = *Pn; 4992 ldr(Ra, Address(Pa_base)); 4993 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4994 ldr(Rm, Address(Pm_base)); 4995 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4996 lea(Pa, Address(Pa_base)); 4997 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4998 lea(Pm, Address(Pm_base)); 4999 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5000 5001 // Zero the m*n result. 5002 mov(Rhi_mn, zr); 5003 mov(Rlo_mn, zr); 5004 } 5005 5006 // The core multiply-accumulate step of a Montgomery 5007 // multiplication. The idea is to schedule operations as a 5008 // pipeline so that instructions with long latencies (loads and 5009 // multiplies) have time to complete before their results are 5010 // used. This most benefits in-order implementations of the 5011 // architecture but out-of-order ones also benefit. 5012 void step() { 5013 block_comment("step"); 5014 // MACC(Ra, Rb, t0, t1, t2); 5015 // Ra = *++Pa; 5016 // Rb = *--Pb; 5017 umulh(Rhi_ab, Ra, Rb); 5018 mul(Rlo_ab, Ra, Rb); 5019 ldr(Ra, pre(Pa, wordSize)); 5020 ldr(Rb, pre(Pb, -wordSize)); 5021 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 5022 // previous iteration. 
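    // For reference, MACC(a, b, t0, t1, t2) in these pseudo-code comments is a
    // multiply-accumulate into the triple-precision value t2:t1:t0, roughly
    // (a sketch only; uint128 is a hypothetical 128-bit type):
    //   uint128 p = (uint128)a * b;
    //   t0 += (uint64_t)p;                    // adds
    //   t1 += (uint64_t)(p >> 64) + carry;    // adcs
    //   t2 += carry;                          // adc
    // acc(Rhi, Rlo, t0, t1, t2) performs the same accumulation once the
    // product is already available as Rhi:Rlo.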
5023 // MACC(Rm, Rn, t0, t1, t2); 5024 // Rm = *++Pm; 5025 // Rn = *--Pn; 5026 umulh(Rhi_mn, Rm, Rn); 5027 mul(Rlo_mn, Rm, Rn); 5028 ldr(Rm, pre(Pm, wordSize)); 5029 ldr(Rn, pre(Pn, -wordSize)); 5030 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5031 } 5032 5033 void post1() { 5034 block_comment("post1"); 5035 5036 // MACC(Ra, Rb, t0, t1, t2); 5037 // Ra = *++Pa; 5038 // Rb = *--Pb; 5039 umulh(Rhi_ab, Ra, Rb); 5040 mul(Rlo_ab, Ra, Rb); 5041 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5042 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5043 5044 // *Pm = Rm = t0 * inv; 5045 mul(Rm, t0, inv); 5046 str(Rm, Address(Pm)); 5047 5048 // MACC(Rm, Rn, t0, t1, t2); 5049 // t0 = t1; t1 = t2; t2 = 0; 5050 umulh(Rhi_mn, Rm, Rn); 5051 5052 #ifndef PRODUCT 5053 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5054 { 5055 mul(Rlo_mn, Rm, Rn); 5056 add(Rlo_mn, t0, Rlo_mn); 5057 Label ok; 5058 cbz(Rlo_mn, ok); { 5059 stop("broken Montgomery multiply"); 5060 } bind(ok); 5061 } 5062 #endif 5063 // We have very carefully set things up so that 5064 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5065 // the lower half of Rm * Rn because we know the result already: 5066 // it must be -t0. t0 + (-t0) must generate a carry iff 5067 // t0 != 0. So, rather than do a mul and an adds we just set 5068 // the carry flag iff t0 is nonzero. 5069 // 5070 // mul(Rlo_mn, Rm, Rn); 5071 // adds(zr, t0, Rlo_mn); 5072 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5073 adcs(t0, t1, Rhi_mn); 5074 adc(t1, t2, zr); 5075 mov(t2, zr); 5076 } 5077 5078 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5079 block_comment("pre2"); 5080 // Pa = Pa_base + i-len; 5081 // Pb = Pb_base + len; 5082 // Pm = Pm_base + i-len; 5083 // Pn = Pn_base + len; 5084 5085 if (i.is_register()) { 5086 sub(Rj, i.as_register(), len); 5087 } else { 5088 mov(Rj, i.as_constant()); 5089 sub(Rj, Rj, len); 5090 } 5091 // Rj == i-len 5092 5093 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5094 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5095 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5096 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5097 5098 // Ra = *++Pa; 5099 // Rb = *--Pb; 5100 // Rm = *++Pm; 5101 // Rn = *--Pn; 5102 ldr(Ra, pre(Pa, wordSize)); 5103 ldr(Rb, pre(Pb, -wordSize)); 5104 ldr(Rm, pre(Pm, wordSize)); 5105 ldr(Rn, pre(Pn, -wordSize)); 5106 5107 mov(Rhi_mn, zr); 5108 mov(Rlo_mn, zr); 5109 } 5110 5111 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5112 block_comment("post2"); 5113 if (i.is_constant()) { 5114 mov(Rj, i.as_constant()-len.as_constant()); 5115 } else { 5116 sub(Rj, i.as_register(), len); 5117 } 5118 5119 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5120 5121 // As soon as we know the least significant digit of our result, 5122 // store it. 5123 // Pm_base[i-len] = t0; 5124 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5125 5126 // t0 = t1; t1 = t2; t2 = 0; 5127 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5128 adc(t1, t2, zr); 5129 mov(t2, zr); 5130 } 5131 5132 // A carry in t0 after Montgomery multiplication means that we 5133 // should subtract multiples of n from our result in m. We'll 5134 // keep doing that until there is no carry. 
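  // In C-like pseudo-code, normalize() is roughly (a sketch only;
  // sub_with_borrow() is a hypothetical helper standing in for SBCS):
  //   while (t0 != 0) {
  //     uint64_t borrow = 0;
  //     for (int i = 0; i < len; i++)              // m -= n, word by word
  //       borrow = sub_with_borrow(&Pm_base[i], Pn_base[i], borrow);
  //     t0 -= borrow;
  //   }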
5135 void normalize(RegisterOrConstant len) { 5136 block_comment("normalize"); 5137 // while (t0) 5138 // t0 = sub(Pm_base, Pn_base, t0, len); 5139 Label loop, post, again; 5140 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5141 cbz(t0, post); { 5142 bind(again); { 5143 mov(i, zr); 5144 mov(cnt, len); 5145 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5146 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5147 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5148 align(16); 5149 bind(loop); { 5150 sbcs(Rm, Rm, Rn); 5151 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5152 add(i, i, 1); 5153 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5154 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5155 sub(cnt, cnt, 1); 5156 } cbnz(cnt, loop); 5157 sbc(t0, t0, zr); 5158 } cbnz(t0, again); 5159 } bind(post); 5160 } 5161 5162 // Move memory at s to d, reversing words. 5163 // Increments d to end of copied memory 5164 // Destroys tmp1, tmp2 5165 // Preserves len 5166 // Leaves s pointing to the address which was in d at start 5167 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5168 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5169 5170 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5171 mov(tmp1, len); 5172 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5173 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5174 } 5175 // where 5176 void reverse1(Register d, Register s, Register tmp) { 5177 ldr(tmp, pre(s, -wordSize)); 5178 ror(tmp, tmp, 32); 5179 str(tmp, post(d, wordSize)); 5180 } 5181 5182 void step_squaring() { 5183 // An extra ACC 5184 step(); 5185 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5186 } 5187 5188 void last_squaring(RegisterOrConstant i) { 5189 Label dont; 5190 // if ((i & 1) == 0) { 5191 tbnz(i.as_register(), 0, dont); { 5192 // MACC(Ra, Rb, t0, t1, t2); 5193 // Ra = *++Pa; 5194 // Rb = *--Pb; 5195 umulh(Rhi_ab, Ra, Rb); 5196 mul(Rlo_ab, Ra, Rb); 5197 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5198 } bind(dont); 5199 } 5200 5201 void extra_step_squaring() { 5202 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5203 5204 // MACC(Rm, Rn, t0, t1, t2); 5205 // Rm = *++Pm; 5206 // Rn = *--Pn; 5207 umulh(Rhi_mn, Rm, Rn); 5208 mul(Rlo_mn, Rm, Rn); 5209 ldr(Rm, pre(Pm, wordSize)); 5210 ldr(Rn, pre(Pn, -wordSize)); 5211 } 5212 5213 void post1_squaring() { 5214 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5215 5216 // *Pm = Rm = t0 * inv; 5217 mul(Rm, t0, inv); 5218 str(Rm, Address(Pm)); 5219 5220 // MACC(Rm, Rn, t0, t1, t2); 5221 // t0 = t1; t1 = t2; t2 = 0; 5222 umulh(Rhi_mn, Rm, Rn); 5223 5224 #ifndef PRODUCT 5225 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5226 { 5227 mul(Rlo_mn, Rm, Rn); 5228 add(Rlo_mn, t0, Rlo_mn); 5229 Label ok; 5230 cbz(Rlo_mn, ok); { 5231 stop("broken Montgomery multiply"); 5232 } bind(ok); 5233 } 5234 #endif 5235 // We have very carefully set things up so that 5236 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5237 // the lower half of Rm * Rn because we know the result already: 5238 // it must be -t0. t0 + (-t0) must generate a carry iff 5239 // t0 != 0. So, rather than do a mul and an adds we just set 5240 // the carry flag iff t0 is nonzero. 
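    // (Concretely, with b == 2^64: when t0 != 0 the low half Rlo_mn would be
    //  b - t0, so adds(zr, t0, Rlo_mn) would set the carry; when t0 == 0 it
    //  would not.  subs(zr, t0, 1) sets the carry under exactly the same
    //  condition, because an unsigned t0 - 1 borrows only when t0 == 0.)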
5241 // 5242 // mul(Rlo_mn, Rm, Rn); 5243 // adds(zr, t0, Rlo_mn); 5244 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5245 adcs(t0, t1, Rhi_mn); 5246 adc(t1, t2, zr); 5247 mov(t2, zr); 5248 } 5249 5250 void acc(Register Rhi, Register Rlo, 5251 Register t0, Register t1, Register t2) { 5252 adds(t0, t0, Rlo); 5253 adcs(t1, t1, Rhi); 5254 adc(t2, t2, zr); 5255 } 5256 5257 public: 5258 /** 5259 * Fast Montgomery multiplication. The derivation of the 5260 * algorithm is in A Cryptographic Library for the Motorola 5261 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5262 * 5263 * Arguments: 5264 * 5265 * Inputs for multiplication: 5266 * c_rarg0 - int array elements a 5267 * c_rarg1 - int array elements b 5268 * c_rarg2 - int array elements n (the modulus) 5269 * c_rarg3 - int length 5270 * c_rarg4 - int inv 5271 * c_rarg5 - int array elements m (the result) 5272 * 5273 * Inputs for squaring: 5274 * c_rarg0 - int array elements a 5275 * c_rarg1 - int array elements n (the modulus) 5276 * c_rarg2 - int length 5277 * c_rarg3 - int inv 5278 * c_rarg4 - int array elements m (the result) 5279 * 5280 */ 5281 address generate_multiply() { 5282 Label argh, nothing; 5283 bind(argh); 5284 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5285 5286 align(CodeEntryAlignment); 5287 address entry = pc(); 5288 5289 cbzw(Rlen, nothing); 5290 5291 enter(); 5292 5293 // Make room. 5294 cmpw(Rlen, 512); 5295 br(Assembler::HI, argh); 5296 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5297 andr(sp, Ra, -2 * wordSize); 5298 5299 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5300 5301 { 5302 // Copy input args, reversing as we go. We use Ra as a 5303 // temporary variable. 5304 reverse(Ra, Pa_base, Rlen, t0, t1); 5305 if (!_squaring) 5306 reverse(Ra, Pb_base, Rlen, t0, t1); 5307 reverse(Ra, Pn_base, Rlen, t0, t1); 5308 } 5309 5310 // Push all call-saved registers and also Pm_base which we'll need 5311 // at the end. 
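    // (For the size check at the top of this stub: the scratch area carved
    //  out of the stack is len * 4 * sizeof(jint) = 16 * len bytes, enough
    //  for the reversed copies of a, b and n plus the result m, so the
    //  len <= 512 test bounds the allocation by 8192 bytes, the limit named
    //  in the stop() message at "argh"; andr(sp, Ra, -2 * wordSize) also
    //  keeps sp 16-byte aligned.)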
5312 save_regs(); 5313 5314 #ifndef PRODUCT 5315 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5316 { 5317 ldr(Rn, Address(Pn_base, 0)); 5318 mul(Rlo_mn, Rn, inv); 5319 subs(zr, Rlo_mn, -1); 5320 Label ok; 5321 br(EQ, ok); { 5322 stop("broken inverse in Montgomery multiply"); 5323 } bind(ok); 5324 } 5325 #endif 5326 5327 mov(Pm_base, Ra); 5328 5329 mov(t0, zr); 5330 mov(t1, zr); 5331 mov(t2, zr); 5332 5333 block_comment("for (int i = 0; i < len; i++) {"); 5334 mov(Ri, zr); { 5335 Label loop, end; 5336 cmpw(Ri, Rlen); 5337 br(Assembler::GE, end); 5338 5339 bind(loop); 5340 pre1(Ri); 5341 5342 block_comment(" for (j = i; j; j--) {"); { 5343 movw(Rj, Ri); 5344 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5345 } block_comment(" } // j"); 5346 5347 post1(); 5348 addw(Ri, Ri, 1); 5349 cmpw(Ri, Rlen); 5350 br(Assembler::LT, loop); 5351 bind(end); 5352 block_comment("} // i"); 5353 } 5354 5355 block_comment("for (int i = len; i < 2*len; i++) {"); 5356 mov(Ri, Rlen); { 5357 Label loop, end; 5358 cmpw(Ri, Rlen, Assembler::LSL, 1); 5359 br(Assembler::GE, end); 5360 5361 bind(loop); 5362 pre2(Ri, Rlen); 5363 5364 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5365 lslw(Rj, Rlen, 1); 5366 subw(Rj, Rj, Ri); 5367 subw(Rj, Rj, 1); 5368 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5369 } block_comment(" } // j"); 5370 5371 post2(Ri, Rlen); 5372 addw(Ri, Ri, 1); 5373 cmpw(Ri, Rlen, Assembler::LSL, 1); 5374 br(Assembler::LT, loop); 5375 bind(end); 5376 } 5377 block_comment("} // i"); 5378 5379 normalize(Rlen); 5380 5381 mov(Ra, Pm_base); // Save Pm_base in Ra 5382 restore_regs(); // Restore caller's Pm_base 5383 5384 // Copy our result into caller's Pm_base 5385 reverse(Pm_base, Ra, Rlen, t0, t1); 5386 5387 leave(); 5388 bind(nothing); 5389 ret(lr); 5390 5391 return entry; 5392 } 5393 // In C, approximately: 5394 5395 // void 5396 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5397 // unsigned long Pn_base[], unsigned long Pm_base[], 5398 // unsigned long inv, int len) { 5399 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5400 // unsigned long *Pa, *Pb, *Pn, *Pm; 5401 // unsigned long Ra, Rb, Rn, Rm; 5402 5403 // int i; 5404 5405 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5406 5407 // for (i = 0; i < len; i++) { 5408 // int j; 5409 5410 // Pa = Pa_base; 5411 // Pb = Pb_base + i; 5412 // Pm = Pm_base; 5413 // Pn = Pn_base + i; 5414 5415 // Ra = *Pa; 5416 // Rb = *Pb; 5417 // Rm = *Pm; 5418 // Rn = *Pn; 5419 5420 // int iters = i; 5421 // for (j = 0; iters--; j++) { 5422 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5423 // MACC(Ra, Rb, t0, t1, t2); 5424 // Ra = *++Pa; 5425 // Rb = *--Pb; 5426 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5427 // MACC(Rm, Rn, t0, t1, t2); 5428 // Rm = *++Pm; 5429 // Rn = *--Pn; 5430 // } 5431 5432 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5433 // MACC(Ra, Rb, t0, t1, t2); 5434 // *Pm = Rm = t0 * inv; 5435 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5436 // MACC(Rm, Rn, t0, t1, t2); 5437 5438 // assert(t0 == 0, "broken Montgomery multiply"); 5439 5440 // t0 = t1; t1 = t2; t2 = 0; 5441 // } 5442 5443 // for (i = len; i < 2*len; i++) { 5444 // int j; 5445 5446 // Pa = Pa_base + i-len; 5447 // Pb = Pb_base + len; 5448 // Pm = Pm_base + i-len; 5449 // Pn = Pn_base + len; 5450 5451 // Ra = *++Pa; 5452 // Rb = *--Pb; 5453 // Rm = *++Pm; 5454 // Rn = *--Pn; 5455 5456 // int iters = len*2-i-1; 
5457 // for (j = i-len+1; iters--; j++) { 5458 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5459 // MACC(Ra, Rb, t0, t1, t2); 5460 // Ra = *++Pa; 5461 // Rb = *--Pb; 5462 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5463 // MACC(Rm, Rn, t0, t1, t2); 5464 // Rm = *++Pm; 5465 // Rn = *--Pn; 5466 // } 5467 5468 // Pm_base[i-len] = t0; 5469 // t0 = t1; t1 = t2; t2 = 0; 5470 // } 5471 5472 // while (t0) 5473 // t0 = sub(Pm_base, Pn_base, t0, len); 5474 // } 5475 5476 /** 5477 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5478 * multiplies than Montgomery multiplication so it should be up to 5479 * 25% faster. However, its loop control is more complex and it 5480 * may actually run slower on some machines. 5481 * 5482 * Arguments: 5483 * 5484 * Inputs: 5485 * c_rarg0 - int array elements a 5486 * c_rarg1 - int array elements n (the modulus) 5487 * c_rarg2 - int length 5488 * c_rarg3 - int inv 5489 * c_rarg4 - int array elements m (the result) 5490 * 5491 */ 5492 address generate_square() { 5493 Label argh; 5494 bind(argh); 5495 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5496 5497 align(CodeEntryAlignment); 5498 address entry = pc(); 5499 5500 enter(); 5501 5502 // Make room. 5503 cmpw(Rlen, 512); 5504 br(Assembler::HI, argh); 5505 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5506 andr(sp, Ra, -2 * wordSize); 5507 5508 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5509 5510 { 5511 // Copy input args, reversing as we go. We use Ra as a 5512 // temporary variable. 5513 reverse(Ra, Pa_base, Rlen, t0, t1); 5514 reverse(Ra, Pn_base, Rlen, t0, t1); 5515 } 5516 5517 // Push all call-saved registers and also Pm_base which we'll need 5518 // at the end. 5519 save_regs(); 5520 5521 mov(Pm_base, Ra); 5522 5523 mov(t0, zr); 5524 mov(t1, zr); 5525 mov(t2, zr); 5526 5527 block_comment("for (int i = 0; i < len; i++) {"); 5528 mov(Ri, zr); { 5529 Label loop, end; 5530 bind(loop); 5531 cmp(Ri, Rlen); 5532 br(Assembler::GE, end); 5533 5534 pre1(Ri); 5535 5536 block_comment("for (j = (i+1)/2; j; j--) {"); { 5537 add(Rj, Ri, 1); 5538 lsr(Rj, Rj, 1); 5539 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5540 } block_comment(" } // j"); 5541 5542 last_squaring(Ri); 5543 5544 block_comment(" for (j = i/2; j; j--) {"); { 5545 lsr(Rj, Ri, 1); 5546 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5547 } block_comment(" } // j"); 5548 5549 post1_squaring(); 5550 add(Ri, Ri, 1); 5551 cmp(Ri, Rlen); 5552 br(Assembler::LT, loop); 5553 5554 bind(end); 5555 block_comment("} // i"); 5556 } 5557 5558 block_comment("for (int i = len; i < 2*len; i++) {"); 5559 mov(Ri, Rlen); { 5560 Label loop, end; 5561 bind(loop); 5562 cmp(Ri, Rlen, Assembler::LSL, 1); 5563 br(Assembler::GE, end); 5564 5565 pre2(Ri, Rlen); 5566 5567 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5568 lsl(Rj, Rlen, 1); 5569 sub(Rj, Rj, Ri); 5570 sub(Rj, Rj, 1); 5571 lsr(Rj, Rj, 1); 5572 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5573 } block_comment(" } // j"); 5574 5575 last_squaring(Ri); 5576 5577 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5578 lsl(Rj, Rlen, 1); 5579 sub(Rj, Rj, Ri); 5580 lsr(Rj, Rj, 1); 5581 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5582 } block_comment(" } // j"); 5583 5584 post2(Ri, Rlen); 5585 add(Ri, Ri, 1); 5586 cmp(Ri, Rlen, Assembler::LSL, 1); 5587 5588 br(Assembler::LT, loop); 5589 bind(end); 5590 block_comment("} // i"); 5591 } 5592 5593 normalize(Rlen); 5594 5595 
mov(Ra, Pm_base); // Save Pm_base in Ra
5596 restore_regs(); // Restore caller's Pm_base
5597
5598 // Copy our result into caller's Pm_base
5599 reverse(Pm_base, Ra, Rlen, t0, t1);
5600
5601 leave();
5602 ret(lr);
5603
5604 return entry;
5605 }
5606 // In C, approximately:
5607
5608 // void
5609 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5610 // unsigned long Pm_base[], unsigned long inv, int len) {
5611 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5612 // unsigned long *Pa, *Pb, *Pn, *Pm;
5613 // unsigned long Ra, Rb, Rn, Rm;
5614
5615 // int i;
5616
5617 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5618
5619 // for (i = 0; i < len; i++) {
5620 // int j;
5621
5622 // Pa = Pa_base;
5623 // Pb = Pa_base + i;
5624 // Pm = Pm_base;
5625 // Pn = Pn_base + i;
5626
5627 // Ra = *Pa;
5628 // Rb = *Pb;
5629 // Rm = *Pm;
5630 // Rn = *Pn;
5631
5632 // int iters = (i+1)/2;
5633 // for (j = 0; iters--; j++) {
5634 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5635 // MACC2(Ra, Rb, t0, t1, t2);
5636 // Ra = *++Pa;
5637 // Rb = *--Pb;
5638 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5639 // MACC(Rm, Rn, t0, t1, t2);
5640 // Rm = *++Pm;
5641 // Rn = *--Pn;
5642 // }
5643 // if ((i & 1) == 0) {
5644 // assert(Ra == Pa_base[j], "must be");
5645 // MACC(Ra, Ra, t0, t1, t2);
5646 // }
5647 // iters = i/2;
5648 // assert(iters == i-j, "must be");
5649 // for (; iters--; j++) {
5650 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5651 // MACC(Rm, Rn, t0, t1, t2);
5652 // Rm = *++Pm;
5653 // Rn = *--Pn;
5654 // }
5655
5656 // *Pm = Rm = t0 * inv;
5657 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5658 // MACC(Rm, Rn, t0, t1, t2);
5659
5660 // assert(t0 == 0, "broken Montgomery multiply");
5661
5662 // t0 = t1; t1 = t2; t2 = 0;
5663 // }
5664
5665 // for (i = len; i < 2*len; i++) {
5666 // int start = i-len+1;
5667 // int end = start + (len - start)/2;
5668 // int j;
5669
5670 // Pa = Pa_base + i-len;
5671 // Pb = Pa_base + len;
5672 // Pm = Pm_base + i-len;
5673 // Pn = Pn_base + len;
5674
5675 // Ra = *++Pa;
5676 // Rb = *--Pb;
5677 // Rm = *++Pm;
5678 // Rn = *--Pn;
5679
5680 // int iters = (2*len-i-1)/2;
5681 // assert(iters == end-start, "must be");
5682 // for (j = start; iters--; j++) {
5683 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5684 // MACC2(Ra, Rb, t0, t1, t2);
5685 // Ra = *++Pa;
5686 // Rb = *--Pb;
5687 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5688 // MACC(Rm, Rn, t0, t1, t2);
5689 // Rm = *++Pm;
5690 // Rn = *--Pn;
5691 // }
5692 // if ((i & 1) == 0) {
5693 // assert(Ra == Pa_base[j], "must be");
5694 // MACC(Ra, Ra, t0, t1, t2);
5695 // }
5696 // iters = (2*len-i)/2;
5697 // assert(iters == len-j, "must be");
5698 // for (; iters--; j++) {
5699 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5700 // MACC(Rm, Rn, t0, t1, t2);
5701 // Rm = *++Pm;
5702 // Rn = *--Pn;
5703 // }
5704 // Pm_base[i-len] = t0;
5705 // t0 = t1; t1 = t2; t2 = 0;
5706 // }
5707
5708 // while (t0)
5709 // t0 = sub(Pm_base, Pn_base, t0, len);
5710 // }
5711 };
5712
5713
5714 // Initialization
5715 void generate_initial() {
5716 // Generates initial stubs and initializes the entry points
5717
5718 // Entry points that exist on all platforms. Note: this is code
5719 // that could be shared among different platforms; however, the
5720 // benefit seems to be smaller than the disadvantage of having a
5721 // much more complicated generator structure. See also comment in
5722 // stubRoutines.hpp.
5723
5724 StubRoutines::_forward_exception_entry = generate_forward_exception();
5725
5726 StubRoutines::_call_stub_entry =
5727 generate_call_stub(StubRoutines::_call_stub_return_address);
5728
5729 // is referenced by megamorphic call
5730 StubRoutines::_catch_exception_entry = generate_catch_exception();
5731
5732 // Build this early so it's available for the interpreter.
5733 StubRoutines::_throw_StackOverflowError_entry =
5734 generate_throw_exception("StackOverflowError throw_exception",
5735 CAST_FROM_FN_PTR(address,
5736 SharedRuntime::throw_StackOverflowError));
5737 StubRoutines::_throw_delayed_StackOverflowError_entry =
5738 generate_throw_exception("delayed StackOverflowError throw_exception",
5739 CAST_FROM_FN_PTR(address,
5740 SharedRuntime::throw_delayed_StackOverflowError));
5741 if (UseCRC32Intrinsics) {
5742 // Set the table address before generating the stubs that use it.
5743 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5744 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5745 }
5746
5747 if (UseCRC32CIntrinsics) {
5748 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5749 }
5750
5751 // Disabled until JDK-8210858 is fixed
5752 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5753 // StubRoutines::_dlog = generate_dlog();
5754 // }
5755
5756 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5757 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5758 }
5759
5760 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5761 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5762 }
5763 }
5764
5765 void generate_all() {
5766 // support for verify_oop (must happen after universe_init)
5767 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5768 StubRoutines::_throw_AbstractMethodError_entry =
5769 generate_throw_exception("AbstractMethodError throw_exception",
5770 CAST_FROM_FN_PTR(address,
5771 SharedRuntime::
5772 throw_AbstractMethodError));
5773
5774 StubRoutines::_throw_IncompatibleClassChangeError_entry =
5775 generate_throw_exception("IncompatibleClassChangeError throw_exception",
5776 CAST_FROM_FN_PTR(address,
5777 SharedRuntime::
5778 throw_IncompatibleClassChangeError));
5779
5780 StubRoutines::_throw_NullPointerException_at_call_entry =
5781 generate_throw_exception("NullPointerException at call throw_exception",
5782 CAST_FROM_FN_PTR(address,
5783 SharedRuntime::
5784 throw_NullPointerException_at_call));
5785
5786 // arraycopy stubs used by compilers
5787 generate_arraycopy_stubs();
5788
5789 // has negatives stub for large arrays.
5790 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5791
5792 // array equals stub for large arrays.
5793 if (!UseSimpleArrayEquals) {
5794 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5795 }
5796
5797 generate_compare_long_strings();
5798
5799 generate_string_indexof_stubs();
5800
5801 // byte_array_inflate stub for large arrays.
5802 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5803 5804 #ifdef COMPILER2 5805 if (UseMultiplyToLenIntrinsic) { 5806 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5807 } 5808 5809 if (UseSquareToLenIntrinsic) { 5810 StubRoutines::_squareToLen = generate_squareToLen(); 5811 } 5812 5813 if (UseMulAddIntrinsic) { 5814 StubRoutines::_mulAdd = generate_mulAdd(); 5815 } 5816 5817 if (UseMontgomeryMultiplyIntrinsic) { 5818 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5819 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5820 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5821 } 5822 5823 if (UseMontgomerySquareIntrinsic) { 5824 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5825 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5826 // We use generate_multiply() rather than generate_square() 5827 // because it's faster for the sizes of modulus we care about. 5828 StubRoutines::_montgomerySquare = g.generate_multiply(); 5829 } 5830 #endif // COMPILER2 5831 5832 #ifndef BUILTIN_SIM 5833 // generate GHASH intrinsics code 5834 if (UseGHASHIntrinsics) { 5835 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5836 } 5837 5838 if (UseAESIntrinsics) { 5839 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5840 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5841 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5842 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5843 } 5844 5845 if (UseSHA1Intrinsics) { 5846 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5847 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5848 } 5849 if (UseSHA256Intrinsics) { 5850 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5851 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5852 } 5853 5854 // generate Adler32 intrinsics code 5855 if (UseAdler32Intrinsics) { 5856 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5857 } 5858 5859 // Safefetch stubs. 5860 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5861 &StubRoutines::_safefetch32_fault_pc, 5862 &StubRoutines::_safefetch32_continuation_pc); 5863 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5864 &StubRoutines::_safefetchN_fault_pc, 5865 &StubRoutines::_safefetchN_continuation_pc); 5866 #endif 5867 StubRoutines::aarch64::set_completed(); 5868 } 5869 5870 public: 5871 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5872 if (all) { 5873 generate_all(); 5874 } else { 5875 generate_initial(); 5876 } 5877 } 5878 }; // end class declaration 5879 5880 void StubGenerator_generate(CodeBuffer* code, bool all) { 5881 StubGenerator g(code, all); 5882 }
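// The 'all' flag selects between the two generation passes: the early pass
// (all == false) runs generate_initial() for the stubs needed early in VM
// startup, and the later pass (all == true) runs generate_all() for the
// remainder, some of which (such as verify_oop) must wait until after
// universe_init.  See the comment in stubRoutines.hpp for the overall
// stub-routine structure.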