1 /* 2 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "gc/shared/barrierSet.hpp" 30 #include "gc/shared/barrierSetAssembler.hpp" 31 #include "interpreter/interpreter.hpp" 32 #include "memory/universe.hpp" 33 #include "nativeInst_aarch64.hpp" 34 #include "oops/instanceOop.hpp" 35 #include "oops/method.hpp" 36 #include "oops/objArrayKlass.hpp" 37 #include "oops/oop.inline.hpp" 38 #include "prims/methodHandles.hpp" 39 #include "runtime/frame.inline.hpp" 40 #include "runtime/handles.inline.hpp" 41 #include "runtime/sharedRuntime.hpp" 42 #include "runtime/stubCodeGenerator.hpp" 43 #include "runtime/stubRoutines.hpp" 44 #include "runtime/thread.inline.hpp" 45 #include "utilities/align.hpp" 46 #ifdef COMPILER2 47 #include "opto/runtime.hpp" 48 #endif 49 #if INCLUDE_ZGC 50 #include "gc/z/zThreadLocalData.hpp" 51 #endif 52 53 // Declaration and definition of StubGenerator (no .hpp file). 54 // For a more detailed description of the stub routine structure 55 // see the comment in stubRoutines.hpp 56 57 #undef __ 58 #define __ _masm-> 59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8)) 60 61 #ifdef PRODUCT 62 #define BLOCK_COMMENT(str) /* nothing */ 63 #else 64 #define BLOCK_COMMENT(str) __ block_comment(str) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Stub Code definitions 70 71 class StubGenerator: public StubCodeGenerator { 72 private: 73 74 #ifdef PRODUCT 75 #define inc_counter_np(counter) ((void)0) 76 #else 77 void inc_counter_np_(int& counter) { 78 __ lea(rscratch2, ExternalAddress((address)&counter)); 79 __ ldrw(rscratch1, Address(rscratch2)); 80 __ addw(rscratch1, rscratch1, 1); 81 __ strw(rscratch1, Address(rscratch2)); 82 } 83 #define inc_counter_np(counter) \ 84 BLOCK_COMMENT("inc_counter " #counter); \ 85 inc_counter_np_(counter); 86 #endif 87 88 // Call stubs are used to call Java from C 89 // 90 // Arguments: 91 // c_rarg0: call wrapper address address 92 // c_rarg1: result address 93 // c_rarg2: result type BasicType 94 // c_rarg3: method Method* 95 // c_rarg4: (interpreter) entry point address 96 // c_rarg5: parameters intptr_t* 97 // c_rarg6: parameter size (in words) int 98 // c_rarg7: thread Thread* 99 // 100 // There is no return from the stub itself as any Java result 101 // is written to result 102 // 103 // we save r30 (lr) as the return PC at the base of the frame and 104 // link r29 (fp) below it as the frame pointer installing sp (r31) 105 // into fp. 106 // 107 // we save r0-r7, which accounts for all the c arguments. 108 // 109 // TODO: strictly do we need to save them all? they are treated as 110 // volatile by C so could we omit saving the ones we are going to 111 // place in global registers (thread? method?) or those we only use 112 // during setup of the Java call? 113 // 114 // we don't need to save r8 which C uses as an indirect result location 115 // return register. 116 // 117 // we don't need to save r9-r15 which both C and Java treat as 118 // volatile 119 // 120 // we don't need to save r16-18 because Java does not use them 121 // 122 // we save r19-r28 which Java uses as scratch registers and C 123 // expects to be callee-save 124 // 125 // we save the bottom 64 bits of each value stored in v8-v15; it is 126 // the responsibility of the caller to preserve larger values. 127 // 128 // so the stub frame looks like this when we enter Java code 129 // 130 // [ return_from_Java ] <--- sp 131 // [ argument word n ] 132 // ... 
133 // -27 [ argument word 1 ] 134 // -26 [ saved v15 ] <--- sp_after_call 135 // -25 [ saved v14 ] 136 // -24 [ saved v13 ] 137 // -23 [ saved v12 ] 138 // -22 [ saved v11 ] 139 // -21 [ saved v10 ] 140 // -20 [ saved v9 ] 141 // -19 [ saved v8 ] 142 // -18 [ saved r28 ] 143 // -17 [ saved r27 ] 144 // -16 [ saved r26 ] 145 // -15 [ saved r25 ] 146 // -14 [ saved r24 ] 147 // -13 [ saved r23 ] 148 // -12 [ saved r22 ] 149 // -11 [ saved r21 ] 150 // -10 [ saved r20 ] 151 // -9 [ saved r19 ] 152 // -8 [ call wrapper (r0) ] 153 // -7 [ result (r1) ] 154 // -6 [ result type (r2) ] 155 // -5 [ method (r3) ] 156 // -4 [ entry point (r4) ] 157 // -3 [ parameters (r5) ] 158 // -2 [ parameter size (r6) ] 159 // -1 [ thread (r7) ] 160 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 161 // 1 [ saved lr (r30) ] 162 163 // Call stub stack layout word offsets from fp 164 enum call_stub_layout { 165 sp_after_call_off = -26, 166 167 d15_off = -26, 168 d13_off = -24, 169 d11_off = -22, 170 d9_off = -20, 171 172 r28_off = -18, 173 r26_off = -16, 174 r24_off = -14, 175 r22_off = -12, 176 r20_off = -10, 177 call_wrapper_off = -8, 178 result_off = -7, 179 result_type_off = -6, 180 method_off = -5, 181 entry_point_off = -4, 182 parameter_size_off = -2, 183 thread_off = -1, 184 fp_f = 0, 185 retaddr_off = 1, 186 }; 187 188 address generate_call_stub(address& return_address) { 189 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 190 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 191 "adjust this code"); 192 193 StubCodeMark mark(this, "StubRoutines", "call_stub"); 194 address start = __ pc(); 195 196 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 197 198 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 199 const Address result (rfp, result_off * wordSize); 200 const Address result_type (rfp, result_type_off * wordSize); 201 const Address method (rfp, method_off * wordSize); 202 const Address entry_point (rfp, entry_point_off * wordSize); 203 const Address parameter_size(rfp, parameter_size_off * wordSize); 204 205 const Address thread (rfp, thread_off * wordSize); 206 207 const Address d15_save (rfp, d15_off * wordSize); 208 const Address d13_save (rfp, d13_off * wordSize); 209 const Address d11_save (rfp, d11_off * wordSize); 210 const Address d9_save (rfp, d9_off * wordSize); 211 212 const Address r28_save (rfp, r28_off * wordSize); 213 const Address r26_save (rfp, r26_off * wordSize); 214 const Address r24_save (rfp, r24_off * wordSize); 215 const Address r22_save (rfp, r22_off * wordSize); 216 const Address r20_save (rfp, r20_off * wordSize); 217 218 // stub code 219 220 address aarch64_entry = __ pc(); 221 222 // set up frame and move sp to end of save area 223 __ enter(); 224 __ sub(sp, rfp, -sp_after_call_off * wordSize); 225 226 // save register parameters and Java scratch/global registers 227 // n.b. 
we save thread even though it gets installed in 228 // rthread because we want to sanity check rthread later 229 __ str(c_rarg7, thread); 230 __ strw(c_rarg6, parameter_size); 231 __ stp(c_rarg4, c_rarg5, entry_point); 232 __ stp(c_rarg2, c_rarg3, result_type); 233 __ stp(c_rarg0, c_rarg1, call_wrapper); 234 235 __ stp(r20, r19, r20_save); 236 __ stp(r22, r21, r22_save); 237 __ stp(r24, r23, r24_save); 238 __ stp(r26, r25, r26_save); 239 __ stp(r28, r27, r28_save); 240 241 __ stpd(v9, v8, d9_save); 242 __ stpd(v11, v10, d11_save); 243 __ stpd(v13, v12, d13_save); 244 __ stpd(v15, v14, d15_save); 245 246 // install Java thread in global register now we have saved 247 // whatever value it held 248 __ mov(rthread, c_rarg7); 249 // And method 250 __ mov(rmethod, c_rarg3); 251 252 // set up the heapbase register 253 __ reinit_heapbase(); 254 255 #ifdef ASSERT 256 // make sure we have no pending exceptions 257 { 258 Label L; 259 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 260 __ cmp(rscratch1, (u1)NULL_WORD); 261 __ br(Assembler::EQ, L); 262 __ stop("StubRoutines::call_stub: entered with pending exception"); 263 __ BIND(L); 264 } 265 #endif 266 // pass parameters if any 267 __ mov(esp, sp); 268 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 269 __ andr(sp, rscratch1, -2 * wordSize); 270 271 BLOCK_COMMENT("pass parameters if any"); 272 Label parameters_done; 273 // parameter count is still in c_rarg6 274 // and parameter pointer identifying param 1 is in c_rarg5 275 __ cbzw(c_rarg6, parameters_done); 276 277 address loop = __ pc(); 278 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 279 __ subsw(c_rarg6, c_rarg6, 1); 280 __ push(rscratch1); 281 __ br(Assembler::GT, loop); 282 283 __ BIND(parameters_done); 284 285 // call Java entry -- passing methdoOop, and current sp 286 // rmethod: Method* 287 // r13: sender sp 288 BLOCK_COMMENT("call Java function"); 289 __ mov(r13, sp); 290 __ blr(c_rarg4); 291 292 // we do this here because the notify will already have been done 293 // if we get to the next instruction via an exception 294 // 295 // n.b. adding this instruction here affects the calculation of 296 // whether or not a routine returns to the call stub (used when 297 // doing stack walks) since the normal test is to check the return 298 // pc against the address saved below. so we may need to allow for 299 // this extra instruction in the check. 300 301 // save current address for use by exception handling code 302 303 return_address = __ pc(); 304 305 // store result depending on type (everything that is not 306 // T_OBJECT, T_VALUETYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 307 // n.b. 
this assumes Java returns an integral result in r0 308 // and a floating result in j_farg0 309 __ ldr(j_rarg2, result); 310 Label is_long, is_float, is_double, is_value, exit; 311 __ ldr(j_rarg1, result_type); 312 __ cmp(j_rarg1, (u1)T_OBJECT); 313 __ br(Assembler::EQ, is_long); 314 __ cmp(j_rarg1, (u1)T_VALUETYPE); 315 __ br(Assembler::EQ, is_value); 316 __ cmp(j_rarg1, (u1)T_LONG); 317 __ br(Assembler::EQ, is_long); 318 __ cmp(j_rarg1, (u1)T_FLOAT); 319 __ br(Assembler::EQ, is_float); 320 __ cmp(j_rarg1, (u1)T_DOUBLE); 321 __ br(Assembler::EQ, is_double); 322 323 // handle T_INT case 324 __ strw(r0, Address(j_rarg2)); 325 326 __ BIND(exit); 327 328 // pop parameters 329 __ sub(esp, rfp, -sp_after_call_off * wordSize); 330 331 #ifdef ASSERT 332 // verify that threads correspond 333 { 334 Label L, S; 335 __ ldr(rscratch1, thread); 336 __ cmp(rthread, rscratch1); 337 __ br(Assembler::NE, S); 338 __ get_thread(rscratch1); 339 __ cmp(rthread, rscratch1); 340 __ br(Assembler::EQ, L); 341 __ BIND(S); 342 __ stop("StubRoutines::call_stub: threads must correspond"); 343 __ BIND(L); 344 } 345 #endif 346 347 // restore callee-save registers 348 __ ldpd(v15, v14, d15_save); 349 __ ldpd(v13, v12, d13_save); 350 __ ldpd(v11, v10, d11_save); 351 __ ldpd(v9, v8, d9_save); 352 353 __ ldp(r28, r27, r28_save); 354 __ ldp(r26, r25, r26_save); 355 __ ldp(r24, r23, r24_save); 356 __ ldp(r22, r21, r22_save); 357 __ ldp(r20, r19, r20_save); 358 359 __ ldp(c_rarg0, c_rarg1, call_wrapper); 360 __ ldrw(c_rarg2, result_type); 361 __ ldr(c_rarg3, method); 362 __ ldp(c_rarg4, c_rarg5, entry_point); 363 __ ldp(c_rarg6, c_rarg7, parameter_size); 364 365 // leave frame and return to caller 366 __ leave(); 367 __ ret(lr); 368 369 // handle return types different from T_INT 370 __ BIND(is_value); 371 if (ValueTypeReturnedAsFields) { 372 // Check for flattened return value 373 __ cbz(r0, is_long); 374 // Initialize pre-allocated buffer 375 __ mov(r1, r0); 376 __ andr(r1, r1, -2); 377 __ ldr(r1, Address(r1, InstanceKlass::adr_valueklass_fixed_block_offset())); 378 __ ldr(r1, Address(r1, ValueKlass::pack_handler_offset())); 379 __ ldr(r0, Address(j_rarg2, 0)); 380 __ blr(r1); 381 __ b(exit); 382 } 383 384 __ BIND(is_long); 385 __ str(r0, Address(j_rarg2, 0)); 386 __ br(Assembler::AL, exit); 387 388 __ BIND(is_float); 389 __ strs(j_farg0, Address(j_rarg2, 0)); 390 __ br(Assembler::AL, exit); 391 392 __ BIND(is_double); 393 __ strd(j_farg0, Address(j_rarg2, 0)); 394 __ br(Assembler::AL, exit); 395 396 return start; 397 } 398 399 // Return point for a Java call if there's an exception thrown in 400 // Java code. The exception is caught and transformed into a 401 // pending exception stored in JavaThread that can be tested from 402 // within the VM. 403 // 404 // Note: Usually the parameters are removed by the callee. In case 405 // of an exception crossing an activation frame boundary, that is 406 // not the case if the callee is compiled code => need to setup the 407 // rsp. 
408 // 409 // r0: exception oop 410 411 address generate_catch_exception() { 412 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 413 address start = __ pc(); 414 415 // same as in generate_call_stub(): 416 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 417 const Address thread (rfp, thread_off * wordSize); 418 419 #ifdef ASSERT 420 // verify that threads correspond 421 { 422 Label L, S; 423 __ ldr(rscratch1, thread); 424 __ cmp(rthread, rscratch1); 425 __ br(Assembler::NE, S); 426 __ get_thread(rscratch1); 427 __ cmp(rthread, rscratch1); 428 __ br(Assembler::EQ, L); 429 __ bind(S); 430 __ stop("StubRoutines::catch_exception: threads must correspond"); 431 __ bind(L); 432 } 433 #endif 434 435 // set pending exception 436 __ verify_oop(r0); 437 438 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 439 __ mov(rscratch1, (address)__FILE__); 440 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 441 __ movw(rscratch1, (int)__LINE__); 442 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 443 444 // complete return to VM 445 assert(StubRoutines::_call_stub_return_address != NULL, 446 "_call_stub_return_address must have been generated before"); 447 __ b(StubRoutines::_call_stub_return_address); 448 449 return start; 450 } 451 452 // Continuation point for runtime calls returning with a pending 453 // exception. The pending exception check happened in the runtime 454 // or native call stub. The pending exception in Thread is 455 // converted into a Java-level exception. 456 // 457 // Contract with Java-level exception handlers: 458 // r0: exception 459 // r3: throwing pc 460 // 461 // NOTE: At entry of this stub, exception-pc must be in LR !! 462 463 // NOTE: this is always used as a jump target within generated code 464 // so it just needs to be generated code wiht no x86 prolog 465 466 address generate_forward_exception() { 467 StubCodeMark mark(this, "StubRoutines", "forward exception"); 468 address start = __ pc(); 469 470 // Upon entry, LR points to the return address returning into 471 // Java (interpreted or compiled) code; i.e., the return address 472 // becomes the throwing pc. 473 // 474 // Arguments pushed before the runtime call are still on the stack 475 // but the exception handler will reset the stack pointer -> 476 // ignore them. A potential result in registers can be ignored as 477 // well. 478 479 #ifdef ASSERT 480 // make sure this code is only executed if there is a pending exception 481 { 482 Label L; 483 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 484 __ cbnz(rscratch1, L); 485 __ stop("StubRoutines::forward exception: no pending exception (1)"); 486 __ bind(L); 487 } 488 #endif 489 490 // compute exception handler into r19 491 492 // call the VM to find the handler address associated with the 493 // caller address. pass thread in r0 and caller pc (ret address) 494 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 495 // the stack. 496 __ mov(c_rarg1, lr); 497 // lr will be trashed by the VM call so we move it to R19 498 // (callee-saved) because we also need to pass it to the handler 499 // returned by this call. 500 __ mov(r19, lr); 501 BLOCK_COMMENT("call exception_handler_for_return_address"); 502 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 503 SharedRuntime::exception_handler_for_return_address), 504 rthread, c_rarg1); 505 // we should not really care that lr is no longer the callee 506 // address. 
we saved the value the handler needs in r19 so we can 507 // just copy it to r3. however, the C2 handler will push its own 508 // frame and then calls into the VM and the VM code asserts that 509 // the PC for the frame above the handler belongs to a compiled 510 // Java method. So, we restore lr here to satisfy that assert. 511 __ mov(lr, r19); 512 // setup r0 & r3 & clear pending exception 513 __ mov(r3, r19); 514 __ mov(r19, r0); 515 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 516 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 517 518 #ifdef ASSERT 519 // make sure exception is set 520 { 521 Label L; 522 __ cbnz(r0, L); 523 __ stop("StubRoutines::forward exception: no pending exception (2)"); 524 __ bind(L); 525 } 526 #endif 527 528 // continue at exception handler 529 // r0: exception 530 // r3: throwing pc 531 // r19: exception handler 532 __ verify_oop(r0); 533 __ br(r19); 534 535 return start; 536 } 537 538 // Non-destructive plausibility checks for oops 539 // 540 // Arguments: 541 // r0: oop to verify 542 // rscratch1: error message 543 // 544 // Stack after saving c_rarg3: 545 // [tos + 0]: saved c_rarg3 546 // [tos + 1]: saved c_rarg2 547 // [tos + 2]: saved lr 548 // [tos + 3]: saved rscratch2 549 // [tos + 4]: saved r0 550 // [tos + 5]: saved rscratch1 551 address generate_verify_oop() { 552 553 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 554 address start = __ pc(); 555 556 Label exit, error; 557 558 // save c_rarg2 and c_rarg3 559 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 560 561 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 562 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 563 __ ldr(c_rarg3, Address(c_rarg2)); 564 __ add(c_rarg3, c_rarg3, 1); 565 __ str(c_rarg3, Address(c_rarg2)); 566 567 // object is in r0 568 // make sure object is 'reasonable' 569 __ cbz(r0, exit); // if obj is NULL it is OK 570 571 #if INCLUDE_ZGC 572 if (UseZGC) { 573 // Check if mask is good. 574 // verifies that ZAddressBadMask & r0 == 0 575 __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset())); 576 __ andr(c_rarg2, r0, c_rarg3); 577 __ cbnz(c_rarg2, error); 578 } 579 #endif 580 581 // Check if the oop is in the right area of memory 582 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 583 __ andr(c_rarg2, r0, c_rarg3); 584 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 585 586 // Compare c_rarg2 and c_rarg3. We don't use a compare 587 // instruction here because the flags register is live. 588 __ eor(c_rarg2, c_rarg2, c_rarg3); 589 __ cbnz(c_rarg2, error); 590 591 // make sure klass is 'reasonable', which is not zero. 
592 __ load_klass(r0, r0); // get klass 593 __ cbz(r0, error); // if klass is NULL it is broken 594 595 // return if everything seems ok 596 __ bind(exit); 597 598 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 599 __ ret(lr); 600 601 // handle errors 602 __ bind(error); 603 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 604 605 __ push(RegSet::range(r0, r29), sp); 606 // debug(char* msg, int64_t pc, int64_t regs[]) 607 __ mov(c_rarg0, rscratch1); // pass address of error message 608 __ mov(c_rarg1, lr); // pass return address 609 __ mov(c_rarg2, sp); // pass address of regs on stack 610 #ifndef PRODUCT 611 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 612 #endif 613 BLOCK_COMMENT("call MacroAssembler::debug"); 614 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 615 __ blr(rscratch1); 616 __ hlt(0); 617 618 return start; 619 } 620 621 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 622 623 // The inner part of zero_words(). This is the bulk operation, 624 // zeroing words in blocks, possibly using DC ZVA to do it. The 625 // caller is responsible for zeroing the last few words. 626 // 627 // Inputs: 628 // r10: the HeapWord-aligned base address of an array to zero. 629 // r11: the count in HeapWords, r11 > 0. 630 // 631 // Returns r10 and r11, adjusted for the caller to clear. 632 // r10: the base address of the tail of words left to clear. 633 // r11: the number of words in the tail. 634 // r11 < MacroAssembler::zero_words_block_size. 635 636 address generate_zero_blocks() { 637 Label done; 638 Label base_aligned; 639 640 Register base = r10, cnt = r11; 641 642 __ align(CodeEntryAlignment); 643 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 644 address start = __ pc(); 645 646 if (UseBlockZeroing) { 647 int zva_length = VM_Version::zva_length(); 648 649 // Ensure ZVA length can be divided by 16. This is required by 650 // the subsequent operations. 651 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 652 653 __ tbz(base, 3, base_aligned); 654 __ str(zr, Address(__ post(base, 8))); 655 __ sub(cnt, cnt, 1); 656 __ bind(base_aligned); 657 658 // Ensure count >= zva_length * 2 so that it still deserves a zva after 659 // alignment. 660 Label small; 661 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 662 __ subs(rscratch1, cnt, low_limit >> 3); 663 __ br(Assembler::LT, small); 664 __ zero_dcache_blocks(base, cnt); 665 __ bind(small); 666 } 667 668 { 669 // Number of stp instructions we'll unroll 670 const int unroll = 671 MacroAssembler::zero_words_block_size / 2; 672 // Clear the remaining blocks. 673 Label loop; 674 __ subs(cnt, cnt, unroll * 2); 675 __ br(Assembler::LT, done); 676 __ bind(loop); 677 for (int i = 0; i < unroll; i++) 678 __ stp(zr, zr, __ post(base, 16)); 679 __ subs(cnt, cnt, unroll * 2); 680 __ br(Assembler::GE, loop); 681 __ bind(done); 682 __ add(cnt, cnt, unroll * 2); 683 } 684 685 __ ret(lr); 686 687 return start; 688 } 689 690 691 typedef enum { 692 copy_forwards = 1, 693 copy_backwards = -1 694 } copy_direction; 695 696 // Bulk copy of blocks of 8 words. 697 // 698 // count is a count of words. 699 // 700 // Precondition: count >= 8 701 // 702 // Postconditions: 703 // 704 // The least significant bit of count contains the remaining count 705 // of words to copy. The rest of count is trash. 
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
                   t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
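    // Illustrative C-level sketch of the tail copy performed by the bit
    // tests below (a reading aid, not the emitted code), shown for byte
    // granularity, i.e. step == +1 or -1:
    //
    //   if (count & 8) { move 8 bytes; step s and d by 8 in the copy direction; }
    //   if (count & 4) { move 4 bytes; step s and d by 4; }
    //   if (count & 2) { move 2 bytes; step s and d by 2; }
    //   if (count & 1) { move 1 byte; }
    //
    // For wider elements the same idea applies but the bit positions shift:
    // e.g. for jint granularity only bits 1 and 0 of the element count
    // remain to be tested.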
1056 1057 __ tbz(count, 3 - exact_log2(granularity), Lword); 1058 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1059 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1060 __ bind(Lword); 1061 1062 if (granularity <= sizeof (jint)) { 1063 __ tbz(count, 2 - exact_log2(granularity), Lint); 1064 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1065 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1066 __ bind(Lint); 1067 } 1068 1069 if (granularity <= sizeof (jshort)) { 1070 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1071 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1072 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1073 __ bind(Lshort); 1074 } 1075 1076 if (granularity <= sizeof (jbyte)) { 1077 __ tbz(count, 0, Lbyte); 1078 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1079 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1080 __ bind(Lbyte); 1081 } 1082 } 1083 1084 Label copy_f, copy_b; 1085 1086 // All-singing all-dancing memory copy. 1087 // 1088 // Copy count units of memory from s to d. The size of a unit is 1089 // step, which can be positive or negative depending on the direction 1090 // of copy. If is_aligned is false, we align the source address. 1091 // 1092 1093 void copy_memory(bool is_aligned, Register s, Register d, 1094 Register count, Register tmp, int step) { 1095 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1096 bool is_backwards = step < 0; 1097 int granularity = uabs(step); 1098 const Register t0 = r3, t1 = r4; 1099 1100 // <= 96 bytes do inline. Direction doesn't matter because we always 1101 // load all the data before writing anything 1102 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1103 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1104 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1105 const Register send = r17, dend = r18; 1106 1107 if (PrefetchCopyIntervalInBytes > 0) 1108 __ prfm(Address(s, 0), PLDL1KEEP); 1109 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity)); 1110 __ br(Assembler::HI, copy_big); 1111 1112 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1113 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1114 1115 __ cmp(count, u1(16/granularity)); 1116 __ br(Assembler::LS, copy16); 1117 1118 __ cmp(count, u1(64/granularity)); 1119 __ br(Assembler::HI, copy80); 1120 1121 __ cmp(count, u1(32/granularity)); 1122 __ br(Assembler::LS, copy32); 1123 1124 // 33..64 bytes 1125 if (UseSIMDForMemoryOps) { 1126 __ ldpq(v0, v1, Address(s, 0)); 1127 __ ldpq(v2, v3, Address(send, -32)); 1128 __ stpq(v0, v1, Address(d, 0)); 1129 __ stpq(v2, v3, Address(dend, -32)); 1130 } else { 1131 __ ldp(t0, t1, Address(s, 0)); 1132 __ ldp(t2, t3, Address(s, 16)); 1133 __ ldp(t4, t5, Address(send, -32)); 1134 __ ldp(t6, t7, Address(send, -16)); 1135 1136 __ stp(t0, t1, Address(d, 0)); 1137 __ stp(t2, t3, Address(d, 16)); 1138 __ stp(t4, t5, Address(dend, -32)); 1139 __ stp(t6, t7, Address(dend, -16)); 1140 } 1141 __ b(finish); 1142 1143 // 17..32 bytes 1144 __ bind(copy32); 1145 __ ldp(t0, t1, Address(s, 0)); 1146 __ ldp(t2, t3, Address(send, -16)); 1147 __ stp(t0, t1, Address(d, 0)); 1148 __ stp(t2, t3, Address(dend, -16)); 1149 __ b(finish); 1150 1151 // 65..80/96 bytes 1152 // (96 bytes if SIMD because we do 32 byes per instruction) 1153 __ bind(copy80); 1154 if (UseSIMDForMemoryOps) { 1155 __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0)); 1156 __ ldpq(v4, v5, Address(send, -32)); 1157 __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0)); 1158 __ stpq(v4, v5, Address(dend, -32)); 1159 } else { 1160 __ ldp(t0, t1, Address(s, 0)); 1161 __ ldp(t2, t3, Address(s, 16)); 1162 __ ldp(t4, t5, Address(s, 32)); 1163 __ ldp(t6, t7, Address(s, 48)); 1164 __ ldp(t8, t9, Address(send, -16)); 1165 1166 __ stp(t0, t1, Address(d, 0)); 1167 __ stp(t2, t3, Address(d, 16)); 1168 __ stp(t4, t5, Address(d, 32)); 1169 __ stp(t6, t7, Address(d, 48)); 1170 __ stp(t8, t9, Address(dend, -16)); 1171 } 1172 __ b(finish); 1173 1174 // 0..16 bytes 1175 __ bind(copy16); 1176 __ cmp(count, u1(8/granularity)); 1177 __ br(Assembler::LO, copy8); 1178 1179 // 8..16 bytes 1180 __ ldr(t0, Address(s, 0)); 1181 __ ldr(t1, Address(send, -8)); 1182 __ str(t0, Address(d, 0)); 1183 __ str(t1, Address(dend, -8)); 1184 __ b(finish); 1185 1186 if (granularity < 8) { 1187 // 4..7 bytes 1188 __ bind(copy8); 1189 __ tbz(count, 2 - exact_log2(granularity), copy4); 1190 __ ldrw(t0, Address(s, 0)); 1191 __ ldrw(t1, Address(send, -4)); 1192 __ strw(t0, Address(d, 0)); 1193 __ strw(t1, Address(dend, -4)); 1194 __ b(finish); 1195 if (granularity < 4) { 1196 // 0..3 bytes 1197 __ bind(copy4); 1198 __ cbz(count, finish); // get rid of 0 case 1199 if (granularity == 2) { 1200 __ ldrh(t0, Address(s, 0)); 1201 __ strh(t0, Address(d, 0)); 1202 } else { // granularity == 1 1203 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1204 // the first and last byte. 1205 // Handle the 3 byte case by loading and storing base + count/2 1206 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1207 // This does means in the 1 byte case we load/store the same 1208 // byte 3 times. 
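        // Illustrative sketch of the 1..3 byte tail handled below (a reading
        // aid, not the emitted code); count is the original element count:
        //
        //   d[0]         = s[0];          // always valid since count >= 1
        //   d[count - 1] = s[count - 1];  // last byte (same byte when count == 1)
        //   d[count / 2] = s[count / 2];  // middle byte, only distinct when count == 3
        //
        // so counts 1, 2 and 3 are all covered by three unconditional moves.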
1209 __ lsr(count, count, 1); 1210 __ ldrb(t0, Address(s, 0)); 1211 __ ldrb(t1, Address(send, -1)); 1212 __ ldrb(t2, Address(s, count)); 1213 __ strb(t0, Address(d, 0)); 1214 __ strb(t1, Address(dend, -1)); 1215 __ strb(t2, Address(d, count)); 1216 } 1217 __ b(finish); 1218 } 1219 } 1220 1221 __ bind(copy_big); 1222 if (is_backwards) { 1223 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1224 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1225 } 1226 1227 // Now we've got the small case out of the way we can align the 1228 // source address on a 2-word boundary. 1229 1230 Label aligned; 1231 1232 if (is_aligned) { 1233 // We may have to adjust by 1 word to get s 2-word-aligned. 1234 __ tbz(s, exact_log2(wordSize), aligned); 1235 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1236 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1237 __ sub(count, count, wordSize/granularity); 1238 } else { 1239 if (is_backwards) { 1240 __ andr(rscratch2, s, 2 * wordSize - 1); 1241 } else { 1242 __ neg(rscratch2, s); 1243 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1244 } 1245 // rscratch2 is the byte adjustment needed to align s. 1246 __ cbz(rscratch2, aligned); 1247 int shift = exact_log2(granularity); 1248 if (shift) __ lsr(rscratch2, rscratch2, shift); 1249 __ sub(count, count, rscratch2); 1250 1251 #if 0 1252 // ?? This code is only correct for a disjoint copy. It may or 1253 // may not make sense to use it in that case. 1254 1255 // Copy the first pair; s and d may not be aligned. 1256 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1257 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1258 1259 // Align s and d, adjust count 1260 if (is_backwards) { 1261 __ sub(s, s, rscratch2); 1262 __ sub(d, d, rscratch2); 1263 } else { 1264 __ add(s, s, rscratch2); 1265 __ add(d, d, rscratch2); 1266 } 1267 #else 1268 copy_memory_small(s, d, rscratch2, rscratch1, step); 1269 #endif 1270 } 1271 1272 __ bind(aligned); 1273 1274 // s is now 2-word-aligned. 1275 1276 // We have a count of units and some trailing bytes. Adjust the 1277 // count and do a bulk copy of words. 1278 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1279 if (direction == copy_forwards) 1280 __ bl(copy_f); 1281 else 1282 __ bl(copy_b); 1283 1284 // And the tail. 1285 copy_memory_small(s, d, count, tmp, step); 1286 1287 if (granularity >= 8) __ bind(copy8); 1288 if (granularity >= 4) __ bind(copy4); 1289 __ bind(finish); 1290 } 1291 1292 1293 void clobber_registers() { 1294 #ifdef ASSERT 1295 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1296 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1297 for (Register r = r3; r <= r18; r++) 1298 if (r != rscratch1) __ mov(r, rscratch1); 1299 #endif 1300 } 1301 1302 // Scan over array at a for count oops, verifying each one. 1303 // Preserves a and count, clobbers rscratch1 and rscratch2. 
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it. The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it. The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it. The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
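  // The conjoint stubs above pick the copy direction with a single unsigned
  // compare. A C-level sketch of that decision (illustrative only, not the
  // emitted code):
  //
  //   if ((uintptr_t)(d - s) >= (uintptr_t)count * size) {
  //     goto nooverlap_target;   // forward (disjoint) copy is safe
  //   }
  //   // otherwise copy backwards, i.e. copy_memory(..., -size)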
1486 // 1487 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1488 const bool not_oop = false; 1489 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1490 } 1491 1492 // Arguments: 1493 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1494 // ignored 1495 // name - stub name string 1496 // 1497 // Inputs: 1498 // c_rarg0 - source array address 1499 // c_rarg1 - destination array address 1500 // c_rarg2 - element count, treated as ssize_t, can be zero 1501 // 1502 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1503 // we let the hardware handle it. The one to eight bytes within words, 1504 // dwords or qwords that span cache line boundaries will still be loaded 1505 // and stored atomically. 1506 // 1507 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1508 address* entry, const char *name) { 1509 const bool not_oop = false; 1510 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1511 } 1512 1513 // Arguments: 1514 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1515 // ignored 1516 // name - stub name string 1517 // 1518 // Inputs: 1519 // c_rarg0 - source array address 1520 // c_rarg1 - destination array address 1521 // c_rarg2 - element count, treated as ssize_t, can be zero 1522 // 1523 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1524 // let the hardware handle it. The two or four words within dwords 1525 // or qwords that span cache line boundaries will still be loaded 1526 // and stored atomically. 1527 // 1528 // Side Effects: 1529 // disjoint_short_copy_entry is set to the no-overlap entry point 1530 // used by generate_conjoint_short_copy(). 1531 // 1532 address generate_disjoint_short_copy(bool aligned, 1533 address* entry, const char *name) { 1534 const bool not_oop = false; 1535 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1536 } 1537 1538 // Arguments: 1539 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1540 // ignored 1541 // name - stub name string 1542 // 1543 // Inputs: 1544 // c_rarg0 - source array address 1545 // c_rarg1 - destination array address 1546 // c_rarg2 - element count, treated as ssize_t, can be zero 1547 // 1548 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1549 // let the hardware handle it. The two or four words within dwords 1550 // or qwords that span cache line boundaries will still be loaded 1551 // and stored atomically. 1552 // 1553 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1554 address *entry, const char *name) { 1555 const bool not_oop = false; 1556 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1557 1558 } 1559 // Arguments: 1560 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1561 // ignored 1562 // name - stub name string 1563 // 1564 // Inputs: 1565 // c_rarg0 - source array address 1566 // c_rarg1 - destination array address 1567 // c_rarg2 - element count, treated as ssize_t, can be zero 1568 // 1569 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1570 // the hardware handle it. The two dwords within qwords that span 1571 // cache line boundaries will still be loaded and stored atomicly. 
1572 // 1573 // Side Effects: 1574 // disjoint_int_copy_entry is set to the no-overlap entry point 1575 // used by generate_conjoint_int_oop_copy(). 1576 // 1577 address generate_disjoint_int_copy(bool aligned, address *entry, 1578 const char *name, bool dest_uninitialized = false) { 1579 const bool not_oop = false; 1580 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1581 } 1582 1583 // Arguments: 1584 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1585 // ignored 1586 // name - stub name string 1587 // 1588 // Inputs: 1589 // c_rarg0 - source array address 1590 // c_rarg1 - destination array address 1591 // c_rarg2 - element count, treated as ssize_t, can be zero 1592 // 1593 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1594 // the hardware handle it. The two dwords within qwords that span 1595 // cache line boundaries will still be loaded and stored atomicly. 1596 // 1597 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1598 address *entry, const char *name, 1599 bool dest_uninitialized = false) { 1600 const bool not_oop = false; 1601 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1602 } 1603 1604 1605 // Arguments: 1606 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1607 // ignored 1608 // name - stub name string 1609 // 1610 // Inputs: 1611 // c_rarg0 - source array address 1612 // c_rarg1 - destination array address 1613 // c_rarg2 - element count, treated as size_t, can be zero 1614 // 1615 // Side Effects: 1616 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1617 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1618 // 1619 address generate_disjoint_long_copy(bool aligned, address *entry, 1620 const char *name, bool dest_uninitialized = false) { 1621 const bool not_oop = false; 1622 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1623 } 1624 1625 // Arguments: 1626 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1627 // ignored 1628 // name - stub name string 1629 // 1630 // Inputs: 1631 // c_rarg0 - source array address 1632 // c_rarg1 - destination array address 1633 // c_rarg2 - element count, treated as size_t, can be zero 1634 // 1635 address generate_conjoint_long_copy(bool aligned, 1636 address nooverlap_target, address *entry, 1637 const char *name, bool dest_uninitialized = false) { 1638 const bool not_oop = false; 1639 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1640 } 1641 1642 // Arguments: 1643 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1644 // ignored 1645 // name - stub name string 1646 // 1647 // Inputs: 1648 // c_rarg0 - source array address 1649 // c_rarg1 - destination array address 1650 // c_rarg2 - element count, treated as size_t, can be zero 1651 // 1652 // Side Effects: 1653 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1654 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1655 // 1656 address generate_disjoint_oop_copy(bool aligned, address *entry, 1657 const char *name, bool dest_uninitialized) { 1658 const bool is_oop = true; 1659 const size_t size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1660 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1661 } 1662 1663 // Arguments: 1664 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1665 // ignored 1666 // name - stub name string 1667 // 1668 // Inputs: 1669 // c_rarg0 - source array address 1670 // c_rarg1 - destination array address 1671 // c_rarg2 - element count, treated as size_t, can be zero 1672 // 1673 address generate_conjoint_oop_copy(bool aligned, 1674 address nooverlap_target, address *entry, 1675 const char *name, bool dest_uninitialized) { 1676 const bool is_oop = true; 1677 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1678 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1679 name, dest_uninitialized); 1680 } 1681 1682 1683 // Helper for generating a dynamic type check. 1684 // Smashes rscratch1, rscratch2. 1685 void generate_type_check(Register sub_klass, 1686 Register super_check_offset, 1687 Register super_klass, 1688 Label& L_success) { 1689 assert_different_registers(sub_klass, super_check_offset, super_klass); 1690 1691 BLOCK_COMMENT("type_check:"); 1692 1693 Label L_miss; 1694 1695 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1696 super_check_offset); 1697 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1698 1699 // Fall through on failure! 1700 __ BIND(L_miss); 1701 } 1702 1703 // 1704 // Generate checkcasting array copy stub 1705 // 1706 // Input: 1707 // c_rarg0 - source array address 1708 // c_rarg1 - destination array address 1709 // c_rarg2 - element count, treated as ssize_t, can be zero 1710 // c_rarg3 - size_t ckoff (super_check_offset) 1711 // c_rarg4 - oop ckval (super_klass) 1712 // 1713 // Output: 1714 // r0 == 0 - success 1715 // r0 == -1^K - failure, where K is partial transfer count 1716 // 1717 address generate_checkcast_copy(const char *name, address *entry, 1718 bool dest_uninitialized = false) { 1719 1720 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1721 1722 // Input registers (after setup_arg_regs) 1723 const Register from = c_rarg0; // source array address 1724 const Register to = c_rarg1; // destination array address 1725 const Register count = c_rarg2; // elementscount 1726 const Register ckoff = c_rarg3; // super_check_offset 1727 const Register ckval = c_rarg4; // super_klass 1728 1729 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1730 RegSet wb_post_saved_regs = RegSet::of(count); 1731 1732 // Registers used as temps (r18, r19, r20 are save-on-entry) 1733 const Register count_save = r21; // orig elementscount 1734 const Register start_to = r20; // destination array start address 1735 const Register copied_oop = r18; // actual oop copied 1736 const Register r19_klass = r19; // oop._klass 1737 1738 //--------------------------------------------------------------- 1739 // Assembler stub will be used for this call to arraycopy 1740 // if the two arrays are subtypes of Object[] but the 1741 // destination array type is not equal to or a supertype 1742 // of the source type. Each element must be separately 1743 // checked. 
1744 1745 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1746 copied_oop, r19_klass, count_save); 1747 1748 __ align(CodeEntryAlignment); 1749 StubCodeMark mark(this, "StubRoutines", name); 1750 address start = __ pc(); 1751 1752 __ enter(); // required for proper stackwalking of RuntimeStub frame 1753 1754 #ifdef ASSERT 1755 // caller guarantees that the arrays really are different 1756 // otherwise, we would have to make conjoint checks 1757 { Label L; 1758 array_overlap_test(L, TIMES_OOP); 1759 __ stop("checkcast_copy within a single array"); 1760 __ bind(L); 1761 } 1762 #endif //ASSERT 1763 1764 // Caller of this entry point must set up the argument registers. 1765 if (entry != NULL) { 1766 *entry = __ pc(); 1767 BLOCK_COMMENT("Entry:"); 1768 } 1769 1770 // Empty array: Nothing to do. 1771 __ cbz(count, L_done); 1772 1773 __ push(RegSet::of(r18, r19, r20, r21), sp); 1774 1775 #ifdef ASSERT 1776 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1777 // The ckoff and ckval must be mutually consistent, 1778 // even though caller generates both. 1779 { Label L; 1780 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1781 __ ldrw(start_to, Address(ckval, sco_offset)); 1782 __ cmpw(ckoff, start_to); 1783 __ br(Assembler::EQ, L); 1784 __ stop("super_check_offset inconsistent"); 1785 __ bind(L); 1786 } 1787 #endif //ASSERT 1788 1789 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1790 bool is_oop = true; 1791 if (dest_uninitialized) { 1792 decorators |= IS_DEST_UNINITIALIZED; 1793 } 1794 1795 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1796 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1797 1798 // save the original count 1799 __ mov(count_save, count); 1800 1801 // Copy from low to high addresses 1802 __ mov(start_to, to); // Save destination array start address 1803 __ b(L_load_element); 1804 1805 // ======== begin loop ======== 1806 // (Loop is rotated; its entry is L_load_element.) 1807 // Loop control: 1808 // for (; count != 0; count--) { 1809 // copied_oop = load_heap_oop(from++); 1810 // ... generate_type_check ...; 1811 // store_heap_oop(to++, copied_oop); 1812 // } 1813 __ align(OptoLoopAlignment); 1814 1815 __ BIND(L_store_element); 1816 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW); // store the oop 1817 __ sub(count, count, 1); 1818 __ cbz(count, L_do_card_marks); 1819 1820 // ======== loop entry is here ======== 1821 __ BIND(L_load_element); 1822 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1823 __ cbz(copied_oop, L_store_element); 1824 1825 __ load_klass(r19_klass, copied_oop);// query the object klass 1826 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1827 // ======== end loop ======== 1828 1829 // It was a real error; we must depend on the caller to finish the job. 1830 // Register count = remaining oops, count_orig = total oops. 1831 // Emit GC store barriers for the oops we have copied and report 1832 // their number to the caller. 
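    //
    // For reference (illustrative only): with the (-1 ^ K) encoding produced
    // below, a caller recovers the partial transfer count as K = ~r0, e.g.
    //
    //   int r0 = -1 ^ 3;   // stub copied 3 elements, then hit a type-check failure
    //   int K  = ~r0;      // K == 3; r0 == 0 still means complete success
    //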
    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
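  //
  // Illustrative sketch only (not compiled): the dispatch decision made
  // below.  ORing both addresses and the byte count into one value lets a
  // single test observe the worst-case misalignment of all three operands.
  // The *_copy names are placeholders for the entry points passed in:
  //
  //   size_t bits = (size_t)s | (size_t)d | (size_t)count;
  //   if      ((bits & (BytesPerLong - 1)) == 0)
  //     long_copy(s, d, count >> LogBytesPerLong);    // all 8-byte aligned
  //   else if ((bits & (BytesPerInt - 1)) == 0)
  //     int_copy(s, d, count >> LogBytesPerInt);      // all 4-byte aligned
  //   else if ((bits & 1) == 0)
  //     short_copy(s, d, count >> LogBytesPerShort);  // all 2-byte aligned
  //   else
  //     byte_copy(s, d, count);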
1906 // 1907 address generate_unsafe_copy(const char *name, 1908 address byte_copy_entry, 1909 address short_copy_entry, 1910 address int_copy_entry, 1911 address long_copy_entry) { 1912 Label L_long_aligned, L_int_aligned, L_short_aligned; 1913 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1914 1915 __ align(CodeEntryAlignment); 1916 StubCodeMark mark(this, "StubRoutines", name); 1917 address start = __ pc(); 1918 __ enter(); // required for proper stackwalking of RuntimeStub frame 1919 1920 // bump this on entry, not on exit: 1921 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1922 1923 __ orr(rscratch1, s, d); 1924 __ orr(rscratch1, rscratch1, count); 1925 1926 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1927 __ cbz(rscratch1, L_long_aligned); 1928 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1929 __ cbz(rscratch1, L_int_aligned); 1930 __ tbz(rscratch1, 0, L_short_aligned); 1931 __ b(RuntimeAddress(byte_copy_entry)); 1932 1933 __ BIND(L_short_aligned); 1934 __ lsr(count, count, LogBytesPerShort); // size => short_count 1935 __ b(RuntimeAddress(short_copy_entry)); 1936 __ BIND(L_int_aligned); 1937 __ lsr(count, count, LogBytesPerInt); // size => int_count 1938 __ b(RuntimeAddress(int_copy_entry)); 1939 __ BIND(L_long_aligned); 1940 __ lsr(count, count, LogBytesPerLong); // size => long_count 1941 __ b(RuntimeAddress(long_copy_entry)); 1942 1943 return start; 1944 } 1945 1946 // 1947 // Generate generic array copy stubs 1948 // 1949 // Input: 1950 // c_rarg0 - src oop 1951 // c_rarg1 - src_pos (32-bits) 1952 // c_rarg2 - dst oop 1953 // c_rarg3 - dst_pos (32-bits) 1954 // c_rarg4 - element count (32-bits) 1955 // 1956 // Output: 1957 // r0 == 0 - success 1958 // r0 == -1^K - failure, where K is partial transfer count 1959 // 1960 address generate_generic_copy(const char *name, 1961 address byte_copy_entry, address short_copy_entry, 1962 address int_copy_entry, address oop_copy_entry, 1963 address long_copy_entry, address checkcast_copy_entry) { 1964 1965 Label L_failed, L_objArray; 1966 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1967 1968 // Input registers 1969 const Register src = c_rarg0; // source array oop 1970 const Register src_pos = c_rarg1; // source position 1971 const Register dst = c_rarg2; // destination array oop 1972 const Register dst_pos = c_rarg3; // destination position 1973 const Register length = c_rarg4; 1974 1975 1976 // Registers used as temps 1977 const Register dst_klass = c_rarg5; 1978 1979 __ align(CodeEntryAlignment); 1980 1981 StubCodeMark mark(this, "StubRoutines", name); 1982 1983 address start = __ pc(); 1984 1985 __ enter(); // required for proper stackwalking of RuntimeStub frame 1986 1987 // bump this on entry, not on exit: 1988 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1989 1990 //----------------------------------------------------------------------- 1991 // Assembler stub will be used for this call to arraycopy 1992 // if the following conditions are met: 1993 // 1994 // (1) src and dst must not be null. 1995 // (2) src_pos must not be negative. 1996 // (3) dst_pos must not be negative. 1997 // (4) length must not be negative. 1998 // (5) src klass and dst klass should be the same and not NULL. 1999 // (6) src and dst should be arrays. 2000 // (7) src_pos + length must not exceed length of src. 2001 // (8) dst_pos + length must not exceed length of dst. 
2002 // 2003 2004 // if (src == NULL) return -1; 2005 __ cbz(src, L_failed); 2006 2007 // if (src_pos < 0) return -1; 2008 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2009 2010 // if (dst == NULL) return -1; 2011 __ cbz(dst, L_failed); 2012 2013 // if (dst_pos < 0) return -1; 2014 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2015 2016 // registers used as temp 2017 const Register scratch_length = r16; // elements count to copy 2018 const Register scratch_src_klass = r17; // array klass 2019 const Register lh = r18; // layout helper 2020 2021 // if (length < 0) return -1; 2022 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2023 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2024 2025 __ load_klass(scratch_src_klass, src); 2026 #ifdef ASSERT 2027 // assert(src->klass() != NULL); 2028 { 2029 BLOCK_COMMENT("assert klasses not null {"); 2030 Label L1, L2; 2031 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2032 __ bind(L1); 2033 __ stop("broken null klass"); 2034 __ bind(L2); 2035 __ load_klass(rscratch1, dst); 2036 __ cbz(rscratch1, L1); // this would be broken also 2037 BLOCK_COMMENT("} assert klasses not null done"); 2038 } 2039 #endif 2040 2041 // Load layout helper (32-bits) 2042 // 2043 // |array_tag| | header_size | element_type | |log2_element_size| 2044 // 32 30 24 16 8 2 0 2045 // 2046 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2047 // 2048 2049 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2050 2051 // Handle objArrays completely differently... 2052 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2053 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2054 __ movw(rscratch1, objArray_lh); 2055 __ eorw(rscratch2, lh, rscratch1); 2056 __ cbzw(rscratch2, L_objArray); 2057 2058 // if (src->klass() != dst->klass()) return -1; 2059 __ load_klass(rscratch2, dst); 2060 __ eor(rscratch2, rscratch2, scratch_src_klass); 2061 __ cbnz(rscratch2, L_failed); 2062 2063 // if (!src->is_Array()) return -1; 2064 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2065 2066 // At this point, it is known to be a typeArray (array_tag 0x3). 
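    // Illustrative sketch only (not compiled): the layout-helper fields used
    // below, decoded with the shift/mask constants from klass.hpp:
    //
    //   int tag        = ((juint)lh) >> Klass::_lh_array_tag_shift;
    //   int hdr_size   = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int log2_esize = (lh >> Klass::_lh_log2_element_size_shift) & Klass::_lh_log2_element_size_mask;
    //
    //   // which feed the address computation noted further down:
    //   //   addr = array_base + hdr_size + (pos << log2_esize)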
2067 #ifdef ASSERT 2068 { 2069 BLOCK_COMMENT("assert primitive array {"); 2070 Label L; 2071 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2072 __ cmpw(lh, rscratch2); 2073 __ br(Assembler::GE, L); 2074 __ stop("must be a primitive array"); 2075 __ bind(L); 2076 BLOCK_COMMENT("} assert primitive array done"); 2077 } 2078 #endif 2079 2080 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2081 rscratch2, L_failed); 2082 2083 // TypeArrayKlass 2084 // 2085 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2086 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2087 // 2088 2089 const Register rscratch1_offset = rscratch1; // array offset 2090 const Register r18_elsize = lh; // element size 2091 2092 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2093 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2094 __ add(src, src, rscratch1_offset); // src array offset 2095 __ add(dst, dst, rscratch1_offset); // dst array offset 2096 BLOCK_COMMENT("choose copy loop based on element size"); 2097 2098 // next registers should be set before the jump to corresponding stub 2099 const Register from = c_rarg0; // source array address 2100 const Register to = c_rarg1; // destination array address 2101 const Register count = c_rarg2; // elements count 2102 2103 // 'from', 'to', 'count' registers should be set in such order 2104 // since they are the same as 'src', 'src_pos', 'dst'. 2105 2106 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2107 2108 // The possible values of elsize are 0-3, i.e. exact_log2(element 2109 // size in bytes). We do a simple bitwise binary search. 2110 __ BIND(L_copy_bytes); 2111 __ tbnz(r18_elsize, 1, L_copy_ints); 2112 __ tbnz(r18_elsize, 0, L_copy_shorts); 2113 __ lea(from, Address(src, src_pos));// src_addr 2114 __ lea(to, Address(dst, dst_pos));// dst_addr 2115 __ movw(count, scratch_length); // length 2116 __ b(RuntimeAddress(byte_copy_entry)); 2117 2118 __ BIND(L_copy_shorts); 2119 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2120 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2121 __ movw(count, scratch_length); // length 2122 __ b(RuntimeAddress(short_copy_entry)); 2123 2124 __ BIND(L_copy_ints); 2125 __ tbnz(r18_elsize, 0, L_copy_longs); 2126 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2127 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2128 __ movw(count, scratch_length); // length 2129 __ b(RuntimeAddress(int_copy_entry)); 2130 2131 __ BIND(L_copy_longs); 2132 #ifdef ASSERT 2133 { 2134 BLOCK_COMMENT("assert long copy {"); 2135 Label L; 2136 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2137 __ cmpw(r18_elsize, LogBytesPerLong); 2138 __ br(Assembler::EQ, L); 2139 __ stop("must be long copy, but elsize is wrong"); 2140 __ bind(L); 2141 BLOCK_COMMENT("} assert long copy done"); 2142 } 2143 #endif 2144 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2145 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2146 __ movw(count, scratch_length); // length 2147 __ b(RuntimeAddress(long_copy_entry)); 2148 2149 // ObjArrayKlass 2150 __ BIND(L_objArray); 2151 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2152 2153 Label L_plain_copy, L_checkcast_copy; 2154 // test array classes for subtyping 2155 __ load_klass(r18, dst); 2156 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2157 __ br(Assembler::NE, L_checkcast_copy); 2158 2159 // Identically typed arrays can be copied without element-wise checks. 2160 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2161 rscratch2, L_failed); 2162 2163 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2164 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2165 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2166 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2167 __ movw(count, scratch_length); // length 2168 __ BIND(L_plain_copy); 2169 __ b(RuntimeAddress(oop_copy_entry)); 2170 2171 __ BIND(L_checkcast_copy); 2172 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2173 { 2174 // Before looking at dst.length, make sure dst is also an objArray. 2175 __ ldrw(rscratch1, Address(r18, lh_offset)); 2176 __ movw(rscratch2, objArray_lh); 2177 __ eorw(rscratch1, rscratch1, rscratch2); 2178 __ cbnzw(rscratch1, L_failed); 2179 2180 // It is safe to examine both src.length and dst.length. 2181 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2182 r18, L_failed); 2183 2184 __ load_klass(dst_klass, dst); // reload 2185 2186 // Marshal the base address arguments now, freeing registers. 2187 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2188 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2189 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2190 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2191 __ movw(count, length); // length (reloaded) 2192 Register sco_temp = c_rarg3; // this register is free now 2193 assert_different_registers(from, to, count, sco_temp, 2194 dst_klass, scratch_src_klass); 2195 // assert_clean_int(count, sco_temp); 2196 2197 // Generate the type check. 2198 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2199 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2200 2201 // Smashes rscratch1, rscratch2 2202 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2203 2204 // Fetch destination element klass from the ObjArrayKlass header. 2205 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2206 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2207 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2208 2209 // the checkcast_copy loop needs two extra arguments: 2210 assert(c_rarg3 == sco_temp, "#3 already in place"); 2211 // Set up arguments for checkcast_copy_entry. 2212 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2213 __ b(RuntimeAddress(checkcast_copy_entry)); 2214 } 2215 2216 __ BIND(L_failed); 2217 __ mov(r0, -1); 2218 __ leave(); // required for proper stackwalking of RuntimeStub frame 2219 __ ret(lr); 2220 2221 return start; 2222 } 2223 2224 // 2225 // Generate stub for array fill. If "aligned" is true, the 2226 // "to" address is assumed to be heapword aligned. 
2227 // 2228 // Arguments for generated stub: 2229 // to: c_rarg0 2230 // value: c_rarg1 2231 // count: c_rarg2 treated as signed 2232 // 2233 address generate_fill(BasicType t, bool aligned, const char *name) { 2234 __ align(CodeEntryAlignment); 2235 StubCodeMark mark(this, "StubRoutines", name); 2236 address start = __ pc(); 2237 2238 BLOCK_COMMENT("Entry:"); 2239 2240 const Register to = c_rarg0; // source array address 2241 const Register value = c_rarg1; // value 2242 const Register count = c_rarg2; // elements count 2243 2244 const Register bz_base = r10; // base for block_zero routine 2245 const Register cnt_words = r11; // temp register 2246 2247 __ enter(); 2248 2249 Label L_fill_elements, L_exit1; 2250 2251 int shift = -1; 2252 switch (t) { 2253 case T_BYTE: 2254 shift = 0; 2255 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2256 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2257 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2258 __ br(Assembler::LO, L_fill_elements); 2259 break; 2260 case T_SHORT: 2261 shift = 1; 2262 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2263 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2264 __ br(Assembler::LO, L_fill_elements); 2265 break; 2266 case T_INT: 2267 shift = 2; 2268 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2269 __ br(Assembler::LO, L_fill_elements); 2270 break; 2271 default: ShouldNotReachHere(); 2272 } 2273 2274 // Align source address at 8 bytes address boundary. 2275 Label L_skip_align1, L_skip_align2, L_skip_align4; 2276 if (!aligned) { 2277 switch (t) { 2278 case T_BYTE: 2279 // One byte misalignment happens only for byte arrays. 2280 __ tbz(to, 0, L_skip_align1); 2281 __ strb(value, Address(__ post(to, 1))); 2282 __ subw(count, count, 1); 2283 __ bind(L_skip_align1); 2284 // Fallthrough 2285 case T_SHORT: 2286 // Two bytes misalignment happens only for byte and short (char) arrays. 2287 __ tbz(to, 1, L_skip_align2); 2288 __ strh(value, Address(__ post(to, 2))); 2289 __ subw(count, count, 2 >> shift); 2290 __ bind(L_skip_align2); 2291 // Fallthrough 2292 case T_INT: 2293 // Align to 8 bytes, we know we are 4 byte aligned to start. 2294 __ tbz(to, 2, L_skip_align4); 2295 __ strw(value, Address(__ post(to, 4))); 2296 __ subw(count, count, 4 >> shift); 2297 __ bind(L_skip_align4); 2298 break; 2299 default: ShouldNotReachHere(); 2300 } 2301 } 2302 2303 // 2304 // Fill large chunks 2305 // 2306 __ lsrw(cnt_words, count, 3 - shift); // number of words 2307 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2308 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2309 if (UseBlockZeroing) { 2310 Label non_block_zeroing, rest; 2311 // If the fill value is zero we can use the fast zero_words(). 2312 __ cbnz(value, non_block_zeroing); 2313 __ mov(bz_base, to); 2314 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2315 __ zero_words(bz_base, cnt_words); 2316 __ b(rest); 2317 __ bind(non_block_zeroing); 2318 __ fill_words(to, cnt_words, value); 2319 __ bind(rest); 2320 } else { 2321 __ fill_words(to, cnt_words, value); 2322 } 2323 2324 // Remaining count is less than 8 bytes. Fill it by a single store. 2325 // Note that the total length is no less than 8 bytes. 
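    //
    // Worked example (illustrative only): a T_SHORT fill of 13 elements
    // (26 bytes, 'to' already 8-byte aligned) leaves cnt_words == 3 and
    // count == 1 after the word loop above, so bytes [0, 24) are filled and
    // one 2-byte element remains.  The code below advances 'to' to the end
    // of the destination and stores a single 64-bit value at to - 8, i.e.
    // bytes [18, 26): bytes [18, 24) are merely rewritten with the same
    // fill value, so the overlap is harmless and no tail loop is needed.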
2326 if (t == T_BYTE || t == T_SHORT) { 2327 Label L_exit1; 2328 __ cbzw(count, L_exit1); 2329 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2330 __ str(value, Address(to, -8)); // overwrite some elements 2331 __ bind(L_exit1); 2332 __ leave(); 2333 __ ret(lr); 2334 } 2335 2336 // Handle copies less than 8 bytes. 2337 Label L_fill_2, L_fill_4, L_exit2; 2338 __ bind(L_fill_elements); 2339 switch (t) { 2340 case T_BYTE: 2341 __ tbz(count, 0, L_fill_2); 2342 __ strb(value, Address(__ post(to, 1))); 2343 __ bind(L_fill_2); 2344 __ tbz(count, 1, L_fill_4); 2345 __ strh(value, Address(__ post(to, 2))); 2346 __ bind(L_fill_4); 2347 __ tbz(count, 2, L_exit2); 2348 __ strw(value, Address(to)); 2349 break; 2350 case T_SHORT: 2351 __ tbz(count, 0, L_fill_4); 2352 __ strh(value, Address(__ post(to, 2))); 2353 __ bind(L_fill_4); 2354 __ tbz(count, 1, L_exit2); 2355 __ strw(value, Address(to)); 2356 break; 2357 case T_INT: 2358 __ cbzw(count, L_exit2); 2359 __ strw(value, Address(to)); 2360 break; 2361 default: ShouldNotReachHere(); 2362 } 2363 __ bind(L_exit2); 2364 __ leave(); 2365 __ ret(lr); 2366 return start; 2367 } 2368 2369 address generate_data_cache_writeback() { 2370 const Register line = c_rarg0; // address of line to write back 2371 2372 __ align(CodeEntryAlignment); 2373 2374 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2375 2376 address start = __ pc(); 2377 __ enter(); 2378 __ cache_wb(Address(line, 0)); 2379 __ leave(); 2380 __ ret(lr); 2381 2382 return start; 2383 } 2384 2385 address generate_data_cache_writeback_sync() { 2386 const Register is_pre = c_rarg0; // pre or post sync 2387 2388 __ align(CodeEntryAlignment); 2389 2390 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2391 2392 // pre wbsync is a no-op 2393 // post wbsync translates to an sfence 2394 2395 Label skip; 2396 address start = __ pc(); 2397 __ enter(); 2398 __ cbnz(is_pre, skip); 2399 __ cache_wbsync(false); 2400 __ bind(skip); 2401 __ leave(); 2402 __ ret(lr); 2403 2404 return start; 2405 } 2406 2407 void generate_arraycopy_stubs() { 2408 address entry; 2409 address entry_jbyte_arraycopy; 2410 address entry_jshort_arraycopy; 2411 address entry_jint_arraycopy; 2412 address entry_oop_arraycopy; 2413 address entry_jlong_arraycopy; 2414 address entry_checkcast_arraycopy; 2415 2416 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2417 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2418 2419 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2420 2421 //*** jbyte 2422 // Always need aligned and unaligned versions 2423 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2424 "jbyte_disjoint_arraycopy"); 2425 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2426 &entry_jbyte_arraycopy, 2427 "jbyte_arraycopy"); 2428 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2429 "arrayof_jbyte_disjoint_arraycopy"); 2430 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2431 "arrayof_jbyte_arraycopy"); 2432 2433 //*** jshort 2434 // Always need aligned and unaligned versions 2435 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2436 "jshort_disjoint_arraycopy"); 2437 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2438 &entry_jshort_arraycopy, 2439 "jshort_arraycopy"); 2440 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2441 "arrayof_jshort_disjoint_arraycopy"); 2442 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2443 "arrayof_jshort_arraycopy"); 2444 2445 //*** jint 2446 // Aligned versions 2447 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2448 "arrayof_jint_disjoint_arraycopy"); 2449 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2450 "arrayof_jint_arraycopy"); 2451 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2452 // entry_jint_arraycopy always points to the unaligned version 2453 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2454 "jint_disjoint_arraycopy"); 2455 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2456 &entry_jint_arraycopy, 2457 "jint_arraycopy"); 2458 2459 //*** jlong 2460 // It is always aligned 2461 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2462 "arrayof_jlong_disjoint_arraycopy"); 2463 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2464 "arrayof_jlong_arraycopy"); 2465 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2466 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2467 2468 //*** oops 2469 { 2470 // With compressed oops we need unaligned versions; notice that 2471 // we overwrite entry_oop_arraycopy. 2472 bool aligned = !UseCompressedOops; 2473 2474 StubRoutines::_arrayof_oop_disjoint_arraycopy 2475 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2476 /*dest_uninitialized*/false); 2477 StubRoutines::_arrayof_oop_arraycopy 2478 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2479 /*dest_uninitialized*/false); 2480 // Aligned versions without pre-barriers 2481 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2482 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2483 /*dest_uninitialized*/true); 2484 StubRoutines::_arrayof_oop_arraycopy_uninit 2485 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2486 /*dest_uninitialized*/true); 2487 } 2488 2489 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2490 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2491 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2492 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2493 2494 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2495 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2496 /*dest_uninitialized*/true); 2497 2498 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2499 entry_jbyte_arraycopy, 2500 entry_jshort_arraycopy, 2501 entry_jint_arraycopy, 2502 entry_jlong_arraycopy); 2503 2504 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2505 entry_jbyte_arraycopy, 2506 entry_jshort_arraycopy, 2507 entry_jint_arraycopy, 2508 entry_oop_arraycopy, 2509 entry_jlong_arraycopy, 2510 entry_checkcast_arraycopy); 2511 2512 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2513 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2514 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2515 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2516 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2517 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2518 } 2519 2520 void generate_math_stubs() { Unimplemented(); } 2521 2522 // Arguments: 2523 // 2524 // Inputs: 2525 // c_rarg0 - source byte array address 2526 // c_rarg1 - destination byte array address 2527 // c_rarg2 - K (key) in little endian int array 2528 // 2529 address generate_aescrypt_encryptBlock() { 2530 __ align(CodeEntryAlignment); 2531 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2532 2533 Label L_doLast; 2534 2535 const Register from = c_rarg0; // source array address 2536 const Register to = c_rarg1; // destination array address 2537 const Register key = c_rarg2; // key array address 2538 const Register keylen = rscratch1; 2539 2540 address start = __ pc(); 2541 __ enter(); 2542 2543 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2544 2545 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2546 2547 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2548 __ rev32(v1, __ T16B, v1); 2549 __ rev32(v2, __ T16B, v2); 2550 __ rev32(v3, __ T16B, v3); 2551 __ rev32(v4, __ T16B, v4); 2552 __ aese(v0, v1); 2553 __ aesmc(v0, v0); 2554 __ aese(v0, v2); 2555 __ aesmc(v0, v0); 2556 __ aese(v0, v3); 2557 __ aesmc(v0, v0); 2558 __ aese(v0, v4); 2559 __ aesmc(v0, v0); 2560 2561 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2562 __ rev32(v1, __ T16B, v1); 2563 __ rev32(v2, __ T16B, v2); 2564 __ rev32(v3, __ T16B, v3); 2565 __ rev32(v4, __ T16B, v4); 2566 __ aese(v0, v1); 2567 __ aesmc(v0, v0); 2568 __ aese(v0, v2); 2569 __ aesmc(v0, v0); 2570 __ aese(v0, v3); 2571 __ aesmc(v0, v0); 2572 __ aese(v0, v4); 2573 __ aesmc(v0, v0); 2574 2575 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2576 __ rev32(v1, __ T16B, v1); 2577 __ rev32(v2, __ T16B, v2); 2578 2579 __ cmpw(keylen, 44); 2580 __ br(Assembler::EQ, L_doLast); 2581 2582 __ aese(v0, v1); 2583 __ aesmc(v0, v0); 2584 __ aese(v0, v2); 2585 __ aesmc(v0, v0); 2586 2587 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2588 __ rev32(v1, __ T16B, v1); 2589 __ rev32(v2, __ T16B, v2); 2590 2591 __ cmpw(keylen, 52); 2592 __ br(Assembler::EQ, L_doLast); 2593 2594 __ aese(v0, v1); 2595 __ aesmc(v0, v0); 2596 __ aese(v0, v2); 2597 __ aesmc(v0, v0); 2598 2599 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2600 __ rev32(v1, __ T16B, v1); 2601 __ rev32(v2, __ T16B, v2); 2602 2603 __ BIND(L_doLast); 2604 2605 __ aese(v0, v1); 2606 __ aesmc(v0, v0); 2607 __ aese(v0, v2); 2608 2609 __ ld1(v1, __ T16B, key); 2610 __ rev32(v1, __ T16B, v1); 2611 __ eor(v0, __ T16B, v0, v1); 2612 2613 __ st1(v0, __ T16B, to); 2614 2615 __ mov(r0, 0); 2616 2617 __ leave(); 2618 __ ret(lr); 2619 2620 return start; 2621 } 2622 2623 // Arguments: 2624 // 2625 // Inputs: 2626 // c_rarg0 - source byte array address 2627 // c_rarg1 - destination byte array address 2628 // c_rarg2 - K (key) in little endian int array 2629 // 2630 address generate_aescrypt_decryptBlock() { 2631 assert(UseAES, "need AES instructions and misaligned SSE support"); 2632 __ align(CodeEntryAlignment); 2633 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2634 Label L_doLast; 2635 2636 const 
Register from = c_rarg0; // source array address 2637 const Register to = c_rarg1; // destination array address 2638 const Register key = c_rarg2; // key array address 2639 const Register keylen = rscratch1; 2640 2641 address start = __ pc(); 2642 __ enter(); // required for proper stackwalking of RuntimeStub frame 2643 2644 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2645 2646 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2647 2648 __ ld1(v5, __ T16B, __ post(key, 16)); 2649 __ rev32(v5, __ T16B, v5); 2650 2651 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2652 __ rev32(v1, __ T16B, v1); 2653 __ rev32(v2, __ T16B, v2); 2654 __ rev32(v3, __ T16B, v3); 2655 __ rev32(v4, __ T16B, v4); 2656 __ aesd(v0, v1); 2657 __ aesimc(v0, v0); 2658 __ aesd(v0, v2); 2659 __ aesimc(v0, v0); 2660 __ aesd(v0, v3); 2661 __ aesimc(v0, v0); 2662 __ aesd(v0, v4); 2663 __ aesimc(v0, v0); 2664 2665 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2666 __ rev32(v1, __ T16B, v1); 2667 __ rev32(v2, __ T16B, v2); 2668 __ rev32(v3, __ T16B, v3); 2669 __ rev32(v4, __ T16B, v4); 2670 __ aesd(v0, v1); 2671 __ aesimc(v0, v0); 2672 __ aesd(v0, v2); 2673 __ aesimc(v0, v0); 2674 __ aesd(v0, v3); 2675 __ aesimc(v0, v0); 2676 __ aesd(v0, v4); 2677 __ aesimc(v0, v0); 2678 2679 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2680 __ rev32(v1, __ T16B, v1); 2681 __ rev32(v2, __ T16B, v2); 2682 2683 __ cmpw(keylen, 44); 2684 __ br(Assembler::EQ, L_doLast); 2685 2686 __ aesd(v0, v1); 2687 __ aesimc(v0, v0); 2688 __ aesd(v0, v2); 2689 __ aesimc(v0, v0); 2690 2691 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2692 __ rev32(v1, __ T16B, v1); 2693 __ rev32(v2, __ T16B, v2); 2694 2695 __ cmpw(keylen, 52); 2696 __ br(Assembler::EQ, L_doLast); 2697 2698 __ aesd(v0, v1); 2699 __ aesimc(v0, v0); 2700 __ aesd(v0, v2); 2701 __ aesimc(v0, v0); 2702 2703 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2704 __ rev32(v1, __ T16B, v1); 2705 __ rev32(v2, __ T16B, v2); 2706 2707 __ BIND(L_doLast); 2708 2709 __ aesd(v0, v1); 2710 __ aesimc(v0, v0); 2711 __ aesd(v0, v2); 2712 2713 __ eor(v0, __ T16B, v0, v5); 2714 2715 __ st1(v0, __ T16B, to); 2716 2717 __ mov(r0, 0); 2718 2719 __ leave(); 2720 __ ret(lr); 2721 2722 return start; 2723 } 2724 2725 // Arguments: 2726 // 2727 // Inputs: 2728 // c_rarg0 - source byte array address 2729 // c_rarg1 - destination byte array address 2730 // c_rarg2 - K (key) in little endian int array 2731 // c_rarg3 - r vector byte array address 2732 // c_rarg4 - input length 2733 // 2734 // Output: 2735 // x0 - input length 2736 // 2737 address generate_cipherBlockChaining_encryptAESCrypt() { 2738 assert(UseAES, "need AES instructions and misaligned SSE support"); 2739 __ align(CodeEntryAlignment); 2740 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2741 2742 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2743 2744 const Register from = c_rarg0; // source array address 2745 const Register to = c_rarg1; // destination array address 2746 const Register key = c_rarg2; // key array address 2747 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2748 // and left with the results of the last encryption block 2749 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2750 const Register keylen = rscratch1; 2751 2752 address start = __ pc(); 2753 2754 __ enter(); 2755 2756 __ movw(rscratch2, len_reg); 2757 2758 __ ldrw(keylen, Address(key, 
arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2759 2760 __ ld1(v0, __ T16B, rvec); 2761 2762 __ cmpw(keylen, 52); 2763 __ br(Assembler::CC, L_loadkeys_44); 2764 __ br(Assembler::EQ, L_loadkeys_52); 2765 2766 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2767 __ rev32(v17, __ T16B, v17); 2768 __ rev32(v18, __ T16B, v18); 2769 __ BIND(L_loadkeys_52); 2770 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2771 __ rev32(v19, __ T16B, v19); 2772 __ rev32(v20, __ T16B, v20); 2773 __ BIND(L_loadkeys_44); 2774 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2775 __ rev32(v21, __ T16B, v21); 2776 __ rev32(v22, __ T16B, v22); 2777 __ rev32(v23, __ T16B, v23); 2778 __ rev32(v24, __ T16B, v24); 2779 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2780 __ rev32(v25, __ T16B, v25); 2781 __ rev32(v26, __ T16B, v26); 2782 __ rev32(v27, __ T16B, v27); 2783 __ rev32(v28, __ T16B, v28); 2784 __ ld1(v29, v30, v31, __ T16B, key); 2785 __ rev32(v29, __ T16B, v29); 2786 __ rev32(v30, __ T16B, v30); 2787 __ rev32(v31, __ T16B, v31); 2788 2789 __ BIND(L_aes_loop); 2790 __ ld1(v1, __ T16B, __ post(from, 16)); 2791 __ eor(v0, __ T16B, v0, v1); 2792 2793 __ br(Assembler::CC, L_rounds_44); 2794 __ br(Assembler::EQ, L_rounds_52); 2795 2796 __ aese(v0, v17); __ aesmc(v0, v0); 2797 __ aese(v0, v18); __ aesmc(v0, v0); 2798 __ BIND(L_rounds_52); 2799 __ aese(v0, v19); __ aesmc(v0, v0); 2800 __ aese(v0, v20); __ aesmc(v0, v0); 2801 __ BIND(L_rounds_44); 2802 __ aese(v0, v21); __ aesmc(v0, v0); 2803 __ aese(v0, v22); __ aesmc(v0, v0); 2804 __ aese(v0, v23); __ aesmc(v0, v0); 2805 __ aese(v0, v24); __ aesmc(v0, v0); 2806 __ aese(v0, v25); __ aesmc(v0, v0); 2807 __ aese(v0, v26); __ aesmc(v0, v0); 2808 __ aese(v0, v27); __ aesmc(v0, v0); 2809 __ aese(v0, v28); __ aesmc(v0, v0); 2810 __ aese(v0, v29); __ aesmc(v0, v0); 2811 __ aese(v0, v30); 2812 __ eor(v0, __ T16B, v0, v31); 2813 2814 __ st1(v0, __ T16B, __ post(to, 16)); 2815 2816 __ subw(len_reg, len_reg, 16); 2817 __ cbnzw(len_reg, L_aes_loop); 2818 2819 __ st1(v0, __ T16B, rvec); 2820 2821 __ mov(r0, rscratch2); 2822 2823 __ leave(); 2824 __ ret(lr); 2825 2826 return start; 2827 } 2828 2829 // Arguments: 2830 // 2831 // Inputs: 2832 // c_rarg0 - source byte array address 2833 // c_rarg1 - destination byte array address 2834 // c_rarg2 - K (key) in little endian int array 2835 // c_rarg3 - r vector byte array address 2836 // c_rarg4 - input length 2837 // 2838 // Output: 2839 // r0 - input length 2840 // 2841 address generate_cipherBlockChaining_decryptAESCrypt() { 2842 assert(UseAES, "need AES instructions and misaligned SSE support"); 2843 __ align(CodeEntryAlignment); 2844 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2845 2846 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2847 2848 const Register from = c_rarg0; // source array address 2849 const Register to = c_rarg1; // destination array address 2850 const Register key = c_rarg2; // key array address 2851 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2852 // and left with the results of the last encryption block 2853 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2854 const Register keylen = rscratch1; 2855 2856 address start = __ pc(); 2857 2858 __ enter(); 2859 2860 __ movw(rscratch2, len_reg); 2861 2862 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2863 2864 __ 
ld1(v2, __ T16B, rvec); 2865 2866 __ ld1(v31, __ T16B, __ post(key, 16)); 2867 __ rev32(v31, __ T16B, v31); 2868 2869 __ cmpw(keylen, 52); 2870 __ br(Assembler::CC, L_loadkeys_44); 2871 __ br(Assembler::EQ, L_loadkeys_52); 2872 2873 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2874 __ rev32(v17, __ T16B, v17); 2875 __ rev32(v18, __ T16B, v18); 2876 __ BIND(L_loadkeys_52); 2877 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2878 __ rev32(v19, __ T16B, v19); 2879 __ rev32(v20, __ T16B, v20); 2880 __ BIND(L_loadkeys_44); 2881 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2882 __ rev32(v21, __ T16B, v21); 2883 __ rev32(v22, __ T16B, v22); 2884 __ rev32(v23, __ T16B, v23); 2885 __ rev32(v24, __ T16B, v24); 2886 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2887 __ rev32(v25, __ T16B, v25); 2888 __ rev32(v26, __ T16B, v26); 2889 __ rev32(v27, __ T16B, v27); 2890 __ rev32(v28, __ T16B, v28); 2891 __ ld1(v29, v30, __ T16B, key); 2892 __ rev32(v29, __ T16B, v29); 2893 __ rev32(v30, __ T16B, v30); 2894 2895 __ BIND(L_aes_loop); 2896 __ ld1(v0, __ T16B, __ post(from, 16)); 2897 __ orr(v1, __ T16B, v0, v0); 2898 2899 __ br(Assembler::CC, L_rounds_44); 2900 __ br(Assembler::EQ, L_rounds_52); 2901 2902 __ aesd(v0, v17); __ aesimc(v0, v0); 2903 __ aesd(v0, v18); __ aesimc(v0, v0); 2904 __ BIND(L_rounds_52); 2905 __ aesd(v0, v19); __ aesimc(v0, v0); 2906 __ aesd(v0, v20); __ aesimc(v0, v0); 2907 __ BIND(L_rounds_44); 2908 __ aesd(v0, v21); __ aesimc(v0, v0); 2909 __ aesd(v0, v22); __ aesimc(v0, v0); 2910 __ aesd(v0, v23); __ aesimc(v0, v0); 2911 __ aesd(v0, v24); __ aesimc(v0, v0); 2912 __ aesd(v0, v25); __ aesimc(v0, v0); 2913 __ aesd(v0, v26); __ aesimc(v0, v0); 2914 __ aesd(v0, v27); __ aesimc(v0, v0); 2915 __ aesd(v0, v28); __ aesimc(v0, v0); 2916 __ aesd(v0, v29); __ aesimc(v0, v0); 2917 __ aesd(v0, v30); 2918 __ eor(v0, __ T16B, v0, v31); 2919 __ eor(v0, __ T16B, v0, v2); 2920 2921 __ st1(v0, __ T16B, __ post(to, 16)); 2922 __ orr(v2, __ T16B, v1, v1); 2923 2924 __ subw(len_reg, len_reg, 16); 2925 __ cbnzw(len_reg, L_aes_loop); 2926 2927 __ st1(v2, __ T16B, rvec); 2928 2929 __ mov(r0, rscratch2); 2930 2931 __ leave(); 2932 __ ret(lr); 2933 2934 return start; 2935 } 2936 2937 // Arguments: 2938 // 2939 // Inputs: 2940 // c_rarg0 - byte[] source+offset 2941 // c_rarg1 - int[] SHA.state 2942 // c_rarg2 - int offset 2943 // c_rarg3 - int limit 2944 // 2945 address generate_sha1_implCompress(bool multi_block, const char *name) { 2946 __ align(CodeEntryAlignment); 2947 StubCodeMark mark(this, "StubRoutines", name); 2948 address start = __ pc(); 2949 2950 Register buf = c_rarg0; 2951 Register state = c_rarg1; 2952 Register ofs = c_rarg2; 2953 Register limit = c_rarg3; 2954 2955 Label keys; 2956 Label sha1_loop; 2957 2958 // load the keys into v0..v3 2959 __ adr(rscratch1, keys); 2960 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2961 // load 5 words state into v6, v7 2962 __ ldrq(v6, Address(state, 0)); 2963 __ ldrs(v7, Address(state, 16)); 2964 2965 2966 __ BIND(sha1_loop); 2967 // load 64 bytes of data into v16..v19 2968 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 2969 __ rev32(v16, __ T16B, v16); 2970 __ rev32(v17, __ T16B, v17); 2971 __ rev32(v18, __ T16B, v18); 2972 __ rev32(v19, __ T16B, v19); 2973 2974 // do the sha1 2975 __ addv(v4, __ T4S, v16, v0); 2976 __ orr(v20, __ T16B, v6, v6); 2977 2978 FloatRegister d0 = v16; 2979 FloatRegister d1 = v17; 2980 FloatRegister d2 = v18; 2981 FloatRegister d3 = v19; 2982 2983 for (int round = 0; round < 20; round++) { 2984 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2985 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2986 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2987 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2988 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 2989 2990 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2991 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2992 __ sha1h(tmp2, __ T4S, v20); 2993 if (round < 5) 2994 __ sha1c(v20, __ T4S, tmp3, tmp4); 2995 else if (round < 10 || round >= 15) 2996 __ sha1p(v20, __ T4S, tmp3, tmp4); 2997 else 2998 __ sha1m(v20, __ T4S, tmp3, tmp4); 2999 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3000 3001 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3002 } 3003 3004 __ addv(v7, __ T2S, v7, v21); 3005 __ addv(v6, __ T4S, v6, v20); 3006 3007 if (multi_block) { 3008 __ add(ofs, ofs, 64); 3009 __ cmp(ofs, limit); 3010 __ br(Assembler::LE, sha1_loop); 3011 __ mov(c_rarg0, ofs); // return ofs 3012 } 3013 3014 __ strq(v6, Address(state, 0)); 3015 __ strs(v7, Address(state, 16)); 3016 3017 __ ret(lr); 3018 3019 __ bind(keys); 3020 __ emit_int32(0x5a827999); 3021 __ emit_int32(0x6ed9eba1); 3022 __ emit_int32(0x8f1bbcdc); 3023 __ emit_int32(0xca62c1d6); 3024 3025 return start; 3026 } 3027 3028 3029 // Arguments: 3030 // 3031 // Inputs: 3032 // c_rarg0 - byte[] source+offset 3033 // c_rarg1 - int[] SHA.state 3034 // c_rarg2 - int offset 3035 // c_rarg3 - int limit 3036 // 3037 address generate_sha256_implCompress(bool multi_block, const char *name) { 3038 static const uint32_t round_consts[64] = { 3039 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3040 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3041 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3042 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3043 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3044 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3045 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3046 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3047 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3048 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3049 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3050 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3051 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3052 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3053 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3054 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3055 }; 3056 __ align(CodeEntryAlignment); 3057 StubCodeMark mark(this, "StubRoutines", name); 3058 address start = __ pc(); 3059 3060 Register buf = c_rarg0; 3061 Register state = c_rarg1; 3062 Register ofs = c_rarg2; 3063 Register limit = c_rarg3; 3064 3065 Label sha1_loop; 3066 3067 __ stpd(v8, v9, __ pre(sp, -32)); 3068 __ stpd(v10, v11, Address(sp, 16)); 3069 3070 // dga == v0 3071 // dgb == v1 3072 // dg0 == v2 3073 // dg1 == v3 3074 // dg2 == v4 3075 // t0 == v6 3076 // t1 == v7 3077 3078 // load 16 keys to v16..v31 3079 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3080 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3081 __ 
ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3082 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3083 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3084 3085 // load 8 words (256 bits) state 3086 __ ldpq(v0, v1, state); 3087 3088 __ BIND(sha1_loop); 3089 // load 64 bytes of data into v8..v11 3090 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3091 __ rev32(v8, __ T16B, v8); 3092 __ rev32(v9, __ T16B, v9); 3093 __ rev32(v10, __ T16B, v10); 3094 __ rev32(v11, __ T16B, v11); 3095 3096 __ addv(v6, __ T4S, v8, v16); 3097 __ orr(v2, __ T16B, v0, v0); 3098 __ orr(v3, __ T16B, v1, v1); 3099 3100 FloatRegister d0 = v8; 3101 FloatRegister d1 = v9; 3102 FloatRegister d2 = v10; 3103 FloatRegister d3 = v11; 3104 3105 3106 for (int round = 0; round < 16; round++) { 3107 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3108 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3109 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3110 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3111 3112 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3113 __ orr(v4, __ T16B, v2, v2); 3114 if (round < 15) 3115 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3116 __ sha256h(v2, __ T4S, v3, tmp2); 3117 __ sha256h2(v3, __ T4S, v4, tmp2); 3118 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3119 3120 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3121 } 3122 3123 __ addv(v0, __ T4S, v0, v2); 3124 __ addv(v1, __ T4S, v1, v3); 3125 3126 if (multi_block) { 3127 __ add(ofs, ofs, 64); 3128 __ cmp(ofs, limit); 3129 __ br(Assembler::LE, sha1_loop); 3130 __ mov(c_rarg0, ofs); // return ofs 3131 } 3132 3133 __ ldpd(v10, v11, Address(sp, 16)); 3134 __ ldpd(v8, v9, __ post(sp, 32)); 3135 3136 __ stpq(v0, v1, state); 3137 3138 __ ret(lr); 3139 3140 return start; 3141 } 3142 3143 // Safefetch stubs. 3144 void generate_safefetch(const char* name, int size, address* entry, 3145 address* fault_pc, address* continuation_pc) { 3146 // safefetch signatures: 3147 // int SafeFetch32(int* adr, int errValue); 3148 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3149 // 3150 // arguments: 3151 // c_rarg0 = adr 3152 // c_rarg1 = errValue 3153 // 3154 // result: 3155 // PPC_RET = *adr or errValue 3156 3157 StubCodeMark mark(this, "StubRoutines", name); 3158 3159 // Entry point, pc or function descriptor. 3160 *entry = __ pc(); 3161 3162 // Load *adr into c_rarg1, may fault. 
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   r0        - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *   c_rarg3   - int* table
   *
   * Output:
   *   r0        - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32c(crc, buf, len,
                     table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /***
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int   len
   *
   * Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler  = c_rarg0;
    Register s1     = c_rarg0;
    Register s2     = c_rarg3;
    Register buff   = c_rarg1;
    Register len    = c_rarg2;
    Register nmax   = r4;
    Register base   = r5;
    Register count  = r6;
    Register temp0  = rscratch1;
    Register temp1  = rscratch2;
    FloatRegister vbytes = v0;
FloatRegister vs1acc = v1; 3295 FloatRegister vs2acc = v2; 3296 FloatRegister vtable = v3; 3297 3298 // Max number of bytes we can process before having to take the mod 3299 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3300 unsigned long BASE = 0xfff1; 3301 unsigned long NMAX = 0x15B0; 3302 3303 __ mov(base, BASE); 3304 __ mov(nmax, NMAX); 3305 3306 // Load accumulation coefficients for the upper 16 bits 3307 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3308 __ ld1(vtable, __ T16B, Address(temp0)); 3309 3310 // s1 is initialized to the lower 16 bits of adler 3311 // s2 is initialized to the upper 16 bits of adler 3312 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3313 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3314 3315 // The pipelined loop needs at least 16 elements for 1 iteration 3316 // It does check this, but it is more effective to skip to the cleanup loop 3317 __ cmp(len, (u1)16); 3318 __ br(Assembler::HS, L_nmax); 3319 __ cbz(len, L_combine); 3320 3321 __ bind(L_simple_by1_loop); 3322 __ ldrb(temp0, Address(__ post(buff, 1))); 3323 __ add(s1, s1, temp0); 3324 __ add(s2, s2, s1); 3325 __ subs(len, len, 1); 3326 __ br(Assembler::HI, L_simple_by1_loop); 3327 3328 // s1 = s1 % BASE 3329 __ subs(temp0, s1, base); 3330 __ csel(s1, temp0, s1, Assembler::HS); 3331 3332 // s2 = s2 % BASE 3333 __ lsr(temp0, s2, 16); 3334 __ lsl(temp1, temp0, 4); 3335 __ sub(temp1, temp1, temp0); 3336 __ add(s2, temp1, s2, ext::uxth); 3337 3338 __ subs(temp0, s2, base); 3339 __ csel(s2, temp0, s2, Assembler::HS); 3340 3341 __ b(L_combine); 3342 3343 __ bind(L_nmax); 3344 __ subs(len, len, nmax); 3345 __ sub(count, nmax, 16); 3346 __ br(Assembler::LO, L_by16); 3347 3348 __ bind(L_nmax_loop); 3349 3350 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3351 vbytes, vs1acc, vs2acc, vtable); 3352 3353 __ subs(count, count, 16); 3354 __ br(Assembler::HS, L_nmax_loop); 3355 3356 // s1 = s1 % BASE 3357 __ lsr(temp0, s1, 16); 3358 __ lsl(temp1, temp0, 4); 3359 __ sub(temp1, temp1, temp0); 3360 __ add(temp1, temp1, s1, ext::uxth); 3361 3362 __ lsr(temp0, temp1, 16); 3363 __ lsl(s1, temp0, 4); 3364 __ sub(s1, s1, temp0); 3365 __ add(s1, s1, temp1, ext:: uxth); 3366 3367 __ subs(temp0, s1, base); 3368 __ csel(s1, temp0, s1, Assembler::HS); 3369 3370 // s2 = s2 % BASE 3371 __ lsr(temp0, s2, 16); 3372 __ lsl(temp1, temp0, 4); 3373 __ sub(temp1, temp1, temp0); 3374 __ add(temp1, temp1, s2, ext::uxth); 3375 3376 __ lsr(temp0, temp1, 16); 3377 __ lsl(s2, temp0, 4); 3378 __ sub(s2, s2, temp0); 3379 __ add(s2, s2, temp1, ext:: uxth); 3380 3381 __ subs(temp0, s2, base); 3382 __ csel(s2, temp0, s2, Assembler::HS); 3383 3384 __ subs(len, len, nmax); 3385 __ sub(count, nmax, 16); 3386 __ br(Assembler::HS, L_nmax_loop); 3387 3388 __ bind(L_by16); 3389 __ adds(len, len, count); 3390 __ br(Assembler::LO, L_by1); 3391 3392 __ bind(L_by16_loop); 3393 3394 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3395 vbytes, vs1acc, vs2acc, vtable); 3396 3397 __ subs(len, len, 16); 3398 __ br(Assembler::HS, L_by16_loop); 3399 3400 __ bind(L_by1); 3401 __ adds(len, len, 15); 3402 __ br(Assembler::LO, L_do_mod); 3403 3404 __ bind(L_by1_loop); 3405 __ ldrb(temp0, Address(__ post(buff, 1))); 3406 __ add(s1, temp0, s1); 3407 __ add(s2, s2, s1); 3408 __ subs(len, len, 1); 3409 __ br(Assembler::HS, L_by1_loop); 3410 3411 __ bind(L_do_mod); 3412 // s1 = s1 % BASE 3413 __ lsr(temp0, s1, 16); 3414 __ lsl(temp1, temp0, 4); 3415 __ 
sub(temp1, temp1, temp0); 3416 __ add(temp1, temp1, s1, ext::uxth); 3417 3418 __ lsr(temp0, temp1, 16); 3419 __ lsl(s1, temp0, 4); 3420 __ sub(s1, s1, temp0); 3421 __ add(s1, s1, temp1, ext:: uxth); 3422 3423 __ subs(temp0, s1, base); 3424 __ csel(s1, temp0, s1, Assembler::HS); 3425 3426 // s2 = s2 % BASE 3427 __ lsr(temp0, s2, 16); 3428 __ lsl(temp1, temp0, 4); 3429 __ sub(temp1, temp1, temp0); 3430 __ add(temp1, temp1, s2, ext::uxth); 3431 3432 __ lsr(temp0, temp1, 16); 3433 __ lsl(s2, temp0, 4); 3434 __ sub(s2, s2, temp0); 3435 __ add(s2, s2, temp1, ext:: uxth); 3436 3437 __ subs(temp0, s2, base); 3438 __ csel(s2, temp0, s2, Assembler::HS); 3439 3440 // Combine lower bits and higher bits 3441 __ bind(L_combine); 3442 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3443 3444 __ ret(lr); 3445 3446 return start; 3447 } 3448 3449 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3450 Register temp0, Register temp1, FloatRegister vbytes, 3451 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3452 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3453 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3454 // In non-vectorized code, we update s1 and s2 as: 3455 // s1 <- s1 + b1 3456 // s2 <- s2 + s1 3457 // s1 <- s1 + b2 3458 // s2 <- s2 + b1 3459 // ... 3460 // s1 <- s1 + b16 3461 // s2 <- s2 + s1 3462 // Putting above assignments together, we have: 3463 // s1_new = s1 + b1 + b2 + ... + b16 3464 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3465 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3466 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3467 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3468 3469 // s2 = s2 + s1 * 16 3470 __ add(s2, s2, s1, Assembler::LSL, 4); 3471 3472 // vs1acc = b1 + b2 + b3 + ... + b16 3473 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
    //              + (b16 * 1)
    __ umullv(vs2acc, __ T8B, vtable, vbytes);
    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
    __ uaddlv(vs1acc, __ T16B, vbytes);
    __ uaddlv(vs2acc, __ T8H, vs2acc);

    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
    __ fmovd(temp0, vs1acc);
    __ fmovd(temp1, vs2acc);
    __ add(s1, s1, temp0);
    __ add(s2, s2, temp1);
  }

  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   *    c_rarg5   - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x = r0;
    const Register xlen = r1;
    const Register y = r2;
    const Register ylen = r3;
    const Register z = r4;
    const Register zlen = r5;

    const Register tmp1 = r10;
    const Register tmp2 = r11;
    const Register tmp3 = r12;
    const Register tmp4 = r13;
    const Register tmp5 = r14;
    const Register tmp6 = r15;
    const Register tmp7 = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127 described in the Java code is
    // faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len shows slightly better overall results.
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "squareToLen");
    address start = __ pc();

    const Register x = r0;
    const Register xlen = r1;
    const Register z = r2;
    const Register zlen = r3;
    const Register y = r4; // == x
    const Register ylen = r5; // == xlen

    const Register tmp1 = r10;
    const Register tmp2 = r11;
    const Register tmp3 = r12;
    const Register tmp4 = r13;
    const Register tmp5 = r14;
    const Register tmp6 = r15;
    const Register tmp7 = r16;

    RegSet spilled_regs = RegSet::of(y, ylen);
    BLOCK_COMMENT("Entry:");
    __ enter();
    __ push(spilled_regs, sp);
    __ mov(y, x);
    __ mov(ylen, xlen);
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return start;
  }

  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "mulAdd");

    address start = __ pc();

    const Register out = r0;
    const Register in = r1;
    const Register offset = r2;
    const Register len = r3;
    const Register k = r4;

    BLOCK_COMMENT("Entry:");
    __ enter();
    __ mul_add(out, in, offset, len, k);
    __ leave();
    __ ret(lr);

    return start;
  }

  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
    // Karatsuba multiplication performs a 128*128 -> 256-bit
    // multiplication in three 128-bit multiplications and a few
    // additions.
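    // Writing the 128-bit inputs as 64-bit halves, A = A1:A0 and B = B1:B0
    // (this is carry-less arithmetic, so "+" below is XOR), the decomposition
    // used here is: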
3589 // 3590 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3591 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3592 // 3593 // Inputs: 3594 // 3595 // A0 in a.d[0] (subkey) 3596 // A1 in a.d[1] 3597 // (A1+A0) in a1_xor_a0.d[0] 3598 // 3599 // B0 in b.d[0] (state) 3600 // B1 in b.d[1] 3601 3602 __ ext(tmp1, __ T16B, b, b, 0x08); 3603 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3604 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3605 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3606 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3607 3608 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3609 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3610 __ eor(tmp2, __ T16B, tmp2, tmp4); 3611 __ eor(tmp2, __ T16B, tmp2, tmp3); 3612 3613 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3614 __ ins(result_hi, __ D, tmp2, 0, 1); 3615 __ ins(result_lo, __ D, tmp2, 1, 0); 3616 } 3617 3618 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3619 FloatRegister p, FloatRegister z, FloatRegister t1) { 3620 const FloatRegister t0 = result; 3621 3622 // The GCM field polynomial f is z^128 + p(z), where p = 3623 // z^7+z^2+z+1. 3624 // 3625 // z^128 === -p(z) (mod (z^128 + p(z))) 3626 // 3627 // so, given that the product we're reducing is 3628 // a == lo + hi * z^128 3629 // substituting, 3630 // === lo - hi * p(z) (mod (z^128 + p(z))) 3631 // 3632 // we reduce by multiplying hi by p(z) and subtracting the result 3633 // from (i.e. XORing it with) lo. Because p has no nonzero high 3634 // bits we can do this with two 64-bit multiplications, lo*p and 3635 // hi*p. 3636 3637 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3638 __ ext(t1, __ T16B, t0, z, 8); 3639 __ eor(hi, __ T16B, hi, t1); 3640 __ ext(t1, __ T16B, z, t0, 8); 3641 __ eor(lo, __ T16B, lo, t1); 3642 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3643 __ eor(result, __ T16B, lo, t0); 3644 } 3645 3646 address generate_has_negatives(address &has_negatives_long) { 3647 const u1 large_loop_size = 64; 3648 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3649 int dcache_line = VM_Version::dcache_line_size(); 3650 3651 Register ary1 = r1, len = r2, result = r0; 3652 3653 __ align(CodeEntryAlignment); 3654 3655 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3656 3657 address entry = __ pc(); 3658 3659 __ enter(); 3660 3661 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3662 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3663 3664 __ cmp(len, (u1)15); 3665 __ br(Assembler::GT, LEN_OVER_15); 3666 // The only case when execution falls into this code is when pointer is near 3667 // the end of memory page and we have to avoid reading next page 3668 __ add(ary1, ary1, len); 3669 __ subs(len, len, 8); 3670 __ br(Assembler::GT, LEN_OVER_8); 3671 __ ldr(rscratch2, Address(ary1, -8)); 3672 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
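    // In scalar terms this small-length path is roughly (illustrative sketch
    // only, with ary1/len denoting their original values and relying on the
    // little-endian load; this is not generated code):
    //
    //   uint64_t w = *(uint64_t*)(ary1 + len - 8); // last 8 bytes; the low
    //                                              // (8 - len) bytes precede
    //                                              // the array
    //   w >>= (8 - len) * 8;                       // drop those bytes
    //   return (w & 0x8080808080808080) != 0;      // any top bit set?
    //
    // e.g. for len == 3 the shift is 40 bits, discarding the five bytes that
    // come before the array.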
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dep., then sub can be executed while loading
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    has_negatives_long = __ pc(); // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
    __ add(ary1, ary1, rscratch1);
    __ sub(len, len, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // situation where an initially aligned large array has negative values in
    // its starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1
    // (in the worst case), which is slower. Cases with negative bytes further
    // ahead won't be affected much; in fact, they will be faster due to the
    // early loads, fewer instructions and fewer branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue the load instructions first, since that can save a few CPU/MEM
    // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...)"
    // (one for each ldp), it is better to generate 7 * orr(...) + 1 andr(...)
    // + 1 cbnz(...), which saves 3 instructions per iteration and has fewer
    // branches; however, this approach disables the early return, so all 64
    // bytes are loaded and checked every time.
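    // One LARGE_LOOP iteration therefore reduces, in scalar terms, to the
    // following (illustrative sketch only, not generated code):
    //
    //   uint64_t acc = 0;
    //   for (int i = 0; i < 8; i++)            // 64 bytes = 8 dwords
    //     acc |= ((const uint64_t*)ary1)[i];
    //   ary1 += 64; len -= 64;
    //   if (acc & 0x8080808080808080) goto RET_TRUE;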
3741 __ ldp(tmp2, tmp3, Address(ary1)); 3742 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3743 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3744 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3745 __ add(ary1, ary1, large_loop_size); 3746 __ sub(len, len, large_loop_size); 3747 __ orr(tmp2, tmp2, tmp3); 3748 __ orr(tmp4, tmp4, tmp5); 3749 __ orr(rscratch1, rscratch1, rscratch2); 3750 __ orr(tmp6, tmp6, tmp1); 3751 __ orr(tmp2, tmp2, tmp4); 3752 __ orr(rscratch1, rscratch1, tmp6); 3753 __ orr(tmp2, tmp2, rscratch1); 3754 __ tst(tmp2, UPPER_BIT_MASK); 3755 __ br(Assembler::NE, RET_TRUE); 3756 __ cmp(len, large_loop_size); 3757 __ br(Assembler::GE, LARGE_LOOP); 3758 3759 __ bind(CHECK_16); // small 16-byte load pre-loop 3760 __ cmp(len, (u1)16); 3761 __ br(Assembler::LT, POST_LOOP16); 3762 3763 __ bind(LOOP16); // small 16-byte load loop 3764 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3765 __ sub(len, len, 16); 3766 __ orr(tmp2, tmp2, tmp3); 3767 __ tst(tmp2, UPPER_BIT_MASK); 3768 __ br(Assembler::NE, RET_TRUE); 3769 __ cmp(len, (u1)16); 3770 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3771 3772 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3773 __ cmp(len, (u1)8); 3774 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3775 __ ldr(tmp3, Address(__ post(ary1, 8))); 3776 __ sub(len, len, 8); 3777 __ tst(tmp3, UPPER_BIT_MASK); 3778 __ br(Assembler::NE, RET_TRUE); 3779 3780 __ bind(POST_LOOP16_LOAD_TAIL); 3781 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3782 __ ldr(tmp1, Address(ary1)); 3783 __ mov(tmp2, 64); 3784 __ sub(tmp4, tmp2, len, __ LSL, 3); 3785 __ lslv(tmp1, tmp1, tmp4); 3786 __ tst(tmp1, UPPER_BIT_MASK); 3787 __ br(Assembler::NE, RET_TRUE); 3788 // Fallthrough 3789 3790 __ bind(RET_FALSE); 3791 __ pop(spilled_regs, sp); 3792 __ leave(); 3793 __ mov(result, zr); 3794 __ ret(lr); 3795 3796 __ bind(RET_TRUE); 3797 __ pop(spilled_regs, sp); 3798 __ bind(RET_TRUE_NO_POP); 3799 __ leave(); 3800 __ mov(result, 1); 3801 __ ret(lr); 3802 3803 __ bind(DONE); 3804 __ pop(spilled_regs, sp); 3805 __ leave(); 3806 __ ret(lr); 3807 return entry; 3808 } 3809 3810 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3811 bool usePrefetch, Label &NOT_EQUAL) { 3812 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3813 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3814 tmp7 = r12, tmp8 = r13; 3815 Label LOOP; 3816 3817 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3818 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3819 __ bind(LOOP); 3820 if (usePrefetch) { 3821 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3822 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3823 } 3824 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3825 __ eor(tmp1, tmp1, tmp2); 3826 __ eor(tmp3, tmp3, tmp4); 3827 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3828 __ orr(tmp1, tmp1, tmp3); 3829 __ cbnz(tmp1, NOT_EQUAL); 3830 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3831 __ eor(tmp5, tmp5, tmp6); 3832 __ eor(tmp7, tmp7, tmp8); 3833 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3834 __ orr(tmp5, tmp5, tmp7); 3835 __ cbnz(tmp5, NOT_EQUAL); 3836 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3837 __ eor(tmp1, tmp1, tmp2); 3838 __ eor(tmp3, tmp3, tmp4); 3839 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3840 __ orr(tmp1, tmp1, tmp3); 3841 __ cbnz(tmp1, NOT_EQUAL); 3842 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3843 __ eor(tmp5, tmp5, tmp6); 
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
    __ subs(tmp6, cnt1, loopThreshold);
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ br(__ GE, LOOP);
    // post-loop
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ orr(tmp1, tmp1, tmp3);
    __ sub(cnt1, cnt1, 2 * wordSize);
    __ cbnz(tmp1, NOT_EQUAL);
  }

  void generate_large_array_equals_loop_simd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2;
    Label LOOP;

    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
    __ subs(tmp1, cnt1, loopThreshold);
    __ eor(v0, __ T16B, v0, v4);
    __ eor(v1, __ T16B, v1, v5);
    __ eor(v2, __ T16B, v2, v6);
    __ eor(v3, __ T16B, v3, v7);
    __ orr(v0, __ T16B, v0, v1);
    __ orr(v1, __ T16B, v2, v3);
    __ orr(v0, __ T16B, v0, v1);
    __ umov(tmp1, v0, __ D, 0);
    __ umov(tmp2, v0, __ D, 1);
    __ orr(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GE, LOOP);
  }

  // a1 = r1 - array1 address
  // a2 = r2 - array2 address
  // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
  // r3-r5 are reserved temporary registers
  address generate_large_array_equals() {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
        SMALL_LOOP, POST_LOOP;
    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // calculate if at least 32 prefetched bytes are used
    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
        tmp5, tmp6, tmp7, tmp8);

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "large_array_equals");

    address entry = __ pc();
    __ enter();
    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
    // also advance pointers to use post-increment instead of pre-increment
    __ add(a1, a1, wordSize);
    __ add(a2, a2, wordSize);
    if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD and non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so, if needed, do one additional
      // 8-byte load for the first address to make it 16-byte aligned.
3925 Label ALIGNED16; 3926 __ tbz(a1, 3, ALIGNED16); 3927 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3928 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3929 __ sub(cnt1, cnt1, wordSize); 3930 __ eor(tmp1, tmp1, tmp2); 3931 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3932 __ bind(ALIGNED16); 3933 } 3934 if (UseSIMDForArrayEquals) { 3935 if (SoftwarePrefetchHintDistance >= 0) { 3936 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3937 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3938 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3939 /* prfm = */ true, NOT_EQUAL); 3940 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3941 __ br(__ LT, TAIL); 3942 } 3943 __ bind(NO_PREFETCH_LARGE_LOOP); 3944 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3945 /* prfm = */ false, NOT_EQUAL); 3946 } else { 3947 __ push(spilled_regs, sp); 3948 if (SoftwarePrefetchHintDistance >= 0) { 3949 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3950 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3951 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3952 /* prfm = */ true, NOT_EQUAL); 3953 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3954 __ br(__ LT, TAIL); 3955 } 3956 __ bind(NO_PREFETCH_LARGE_LOOP); 3957 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3958 /* prfm = */ false, NOT_EQUAL); 3959 } 3960 __ bind(TAIL); 3961 __ cbz(cnt1, EQUAL); 3962 __ subs(cnt1, cnt1, wordSize); 3963 __ br(__ LE, POST_LOOP); 3964 __ bind(SMALL_LOOP); 3965 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3966 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3967 __ subs(cnt1, cnt1, wordSize); 3968 __ eor(tmp1, tmp1, tmp2); 3969 __ cbnz(tmp1, NOT_EQUAL); 3970 __ br(__ GT, SMALL_LOOP); 3971 __ bind(POST_LOOP); 3972 __ ldr(tmp1, Address(a1, cnt1)); 3973 __ ldr(tmp2, Address(a2, cnt1)); 3974 __ eor(tmp1, tmp1, tmp2); 3975 __ cbnz(tmp1, NOT_EQUAL); 3976 __ bind(EQUAL); 3977 __ mov(result, true); 3978 __ bind(NOT_EQUAL); 3979 if (!UseSIMDForArrayEquals) { 3980 __ pop(spilled_regs, sp); 3981 } 3982 __ bind(NOT_EQUAL_NO_POP); 3983 __ leave(); 3984 __ ret(lr); 3985 return entry; 3986 } 3987 3988 address generate_dsin_dcos(bool isCos) { 3989 __ align(CodeEntryAlignment); 3990 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3991 address start = __ pc(); 3992 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3993 (address)StubRoutines::aarch64::_two_over_pi, 3994 (address)StubRoutines::aarch64::_pio2, 3995 (address)StubRoutines::aarch64::_dsin_coef, 3996 (address)StubRoutines::aarch64::_dcos_coef); 3997 return start; 3998 } 3999 4000 address generate_dlog() { 4001 __ align(CodeEntryAlignment); 4002 StubCodeMark mark(this, "StubRoutines", "dlog"); 4003 address entry = __ pc(); 4004 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4005 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4006 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4007 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4008 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4009 return entry; 4010 } 4011 4012 // code for comparing 16 bytes of strings with same encoding 4013 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4014 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4015 __ ldr(rscratch1, Address(__ post(str1, 8))); 4016 __ eor(rscratch2, tmp1, tmp2); 4017 __ ldr(cnt1, Address(__ post(str2, 8))); 4018 __ cbnz(rscratch2, DIFF1); 4019 __ ldr(tmp1, Address(__ post(str1, 8))); 4020 __ eor(rscratch2, rscratch1, cnt1); 4021 __ ldr(tmp2, Address(__ post(str2, 8))); 4022 __ cbnz(rscratch2, DIFF2); 4023 } 4024 4025 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4026 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4027 Label &DIFF2) { 4028 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4029 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4030 4031 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4032 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4033 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4034 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4035 4036 __ fmovd(tmpL, vtmp3); 4037 __ eor(rscratch2, tmp3, tmpL); 4038 __ cbnz(rscratch2, DIFF2); 4039 4040 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4041 __ umov(tmpL, vtmp3, __ D, 1); 4042 __ eor(rscratch2, tmpU, tmpL); 4043 __ cbnz(rscratch2, DIFF1); 4044 4045 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4046 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4047 __ fmovd(tmpL, vtmp); 4048 __ eor(rscratch2, tmp3, tmpL); 4049 __ cbnz(rscratch2, DIFF2); 4050 4051 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4052 __ umov(tmpL, vtmp, __ D, 1); 4053 __ eor(rscratch2, tmpU, tmpL); 4054 __ cbnz(rscratch2, DIFF1); 4055 } 4056 4057 // r0 = result 4058 // r1 = str1 4059 // r2 = cnt1 4060 // r3 = str2 4061 // r4 = cnt2 4062 // r10 = tmp1 4063 // r11 = tmp2 4064 address generate_compare_long_string_different_encoding(bool isLU) { 4065 __ align(CodeEntryAlignment); 4066 StubCodeMark mark(this, "StubRoutines", isLU 4067 ? 
"compare_long_string_different_encoding LU" 4068 : "compare_long_string_different_encoding UL"); 4069 address entry = __ pc(); 4070 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4071 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 4072 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4073 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4074 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4075 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4076 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4077 4078 int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2); 4079 4080 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4081 // cnt2 == amount of characters left to compare 4082 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4083 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4084 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4085 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4086 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4087 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4088 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4089 __ eor(rscratch2, tmp1, tmp2); 4090 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4091 __ mov(rscratch1, tmp2); 4092 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4093 Register strU = isLU ? str2 : str1, 4094 strL = isLU ? str1 : str2, 4095 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4096 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4097 __ push(spilled_regs, sp); 4098 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4099 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4100 4101 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4102 4103 if (SoftwarePrefetchHintDistance >= 0) { 4104 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4105 __ br(__ LT, NO_PREFETCH); 4106 __ bind(LARGE_LOOP_PREFETCH); 4107 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4108 __ mov(tmp4, 2); 4109 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4110 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4111 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4112 __ subs(tmp4, tmp4, 1); 4113 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4114 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4115 __ mov(tmp4, 2); 4116 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4117 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4118 __ subs(tmp4, tmp4, 1); 4119 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4120 __ sub(cnt2, cnt2, 64); 4121 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4122 __ br(__ GE, LARGE_LOOP_PREFETCH); 4123 } 4124 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4125 __ bind(NO_PREFETCH); 4126 __ subs(cnt2, cnt2, 16); 4127 __ br(__ LT, TAIL); 4128 __ bind(SMALL_LOOP); // smaller loop 4129 __ subs(cnt2, cnt2, 16); 4130 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4131 __ br(__ GE, SMALL_LOOP); 4132 __ cmn(cnt2, (u1)16); 4133 __ br(__ EQ, LOAD_LAST); 4134 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 4135 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string 4136 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 4137 __ ldr(tmp3, Address(cnt1, -8)); 4138 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 4139 __ b(LOAD_LAST); 4140 __ bind(DIFF2); 4141 __ mov(tmpU, tmp3); 4142 __ bind(DIFF1); 4143 __ pop(spilled_regs, sp); 4144 __ b(CALCULATE_DIFFERENCE); 4145 
__ bind(LOAD_LAST); 4146 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 4147 // No need to load it again 4148 __ mov(tmpU, tmp3); 4149 __ pop(spilled_regs, sp); 4150 4151 __ ldrs(vtmp, Address(strL)); 4152 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4153 __ fmovd(tmpL, vtmp); 4154 4155 __ eor(rscratch2, tmpU, tmpL); 4156 __ cbz(rscratch2, DONE); 4157 4158 // Find the first different characters in the longwords and 4159 // compute their difference. 4160 __ bind(CALCULATE_DIFFERENCE); 4161 __ rev(rscratch2, rscratch2); 4162 __ clz(rscratch2, rscratch2); 4163 __ andr(rscratch2, rscratch2, -16); 4164 __ lsrv(tmp1, tmp1, rscratch2); 4165 __ uxthw(tmp1, tmp1); 4166 __ lsrv(rscratch1, rscratch1, rscratch2); 4167 __ uxthw(rscratch1, rscratch1); 4168 __ subw(result, tmp1, rscratch1); 4169 __ bind(DONE); 4170 __ ret(lr); 4171 return entry; 4172 } 4173 4174 // r0 = result 4175 // r1 = str1 4176 // r2 = cnt1 4177 // r3 = str2 4178 // r4 = cnt2 4179 // r10 = tmp1 4180 // r11 = tmp2 4181 address generate_compare_long_string_same_encoding(bool isLL) { 4182 __ align(CodeEntryAlignment); 4183 StubCodeMark mark(this, "StubRoutines", isLL 4184 ? "compare_long_string_same_encoding LL" 4185 : "compare_long_string_same_encoding UU"); 4186 address entry = __ pc(); 4187 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4188 tmp1 = r10, tmp2 = r11; 4189 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4190 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4191 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4192 // exit from large loop when less than 64 bytes left to read or we're about 4193 // to prefetch memory behind array border 4194 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4195 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4196 // update cnt2 counter with already loaded 8 bytes 4197 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4198 // update pointers, because of previous read 4199 __ add(str1, str1, wordSize); 4200 __ add(str2, str2, wordSize); 4201 if (SoftwarePrefetchHintDistance >= 0) { 4202 __ bind(LARGE_LOOP_PREFETCH); 4203 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4204 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4205 compare_string_16_bytes_same(DIFF, DIFF2); 4206 compare_string_16_bytes_same(DIFF, DIFF2); 4207 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4208 compare_string_16_bytes_same(DIFF, DIFF2); 4209 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4210 compare_string_16_bytes_same(DIFF, DIFF2); 4211 __ br(__ GT, LARGE_LOOP_PREFETCH); 4212 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4213 } 4214 // less than 16 bytes left? 4215 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4216 __ br(__ LT, TAIL); 4217 __ bind(SMALL_LOOP); 4218 compare_string_16_bytes_same(DIFF, DIFF2); 4219 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4220 __ br(__ GE, SMALL_LOOP); 4221 __ bind(TAIL); 4222 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4223 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4224 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4225 __ br(__ LE, CHECK_LAST); 4226 __ eor(rscratch2, tmp1, tmp2); 4227 __ cbnz(rscratch2, DIFF); 4228 __ ldr(tmp1, Address(__ post(str1, 8))); 4229 __ ldr(tmp2, Address(__ post(str2, 8))); 4230 __ sub(cnt2, cnt2, isLL ? 
                      8 : 4);
    __ bind(CHECK_LAST);
    if (!isLL) {
      __ add(cnt2, cnt2, cnt2); // now in bytes
    }
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, DIFF);
    __ ldr(rscratch1, Address(str1, cnt2));
    __ ldr(cnt1, Address(str2, cnt2));
    __ eor(rscratch2, rscratch1, cnt1);
    __ cbz(rscratch2, LENGTH_DIFF);
    // Find the first different characters in the longwords and
    // compute their difference.
    __ bind(DIFF2);
    __ rev(rscratch2, rscratch2);
    __ clz(rscratch2, rscratch2);
    __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
    __ lsrv(rscratch1, rscratch1, rscratch2);
    if (isLL) {
      __ lsrv(cnt1, cnt1, rscratch2);
      __ uxtbw(rscratch1, rscratch1);
      __ uxtbw(cnt1, cnt1);
    } else {
      __ lsrv(cnt1, cnt1, rscratch2);
      __ uxthw(rscratch1, rscratch1);
      __ uxthw(cnt1, cnt1);
    }
    __ subw(result, rscratch1, cnt1);
    __ b(LENGTH_DIFF);
    __ bind(DIFF);
    __ rev(rscratch2, rscratch2);
    __ clz(rscratch2, rscratch2);
    __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
    __ lsrv(tmp1, tmp1, rscratch2);
    if (isLL) {
      __ lsrv(tmp2, tmp2, rscratch2);
      __ uxtbw(tmp1, tmp1);
      __ uxtbw(tmp2, tmp2);
    } else {
      __ lsrv(tmp2, tmp2, rscratch2);
      __ uxthw(tmp1, tmp1);
      __ uxthw(tmp2, tmp2);
    }
    __ subw(result, tmp1, tmp2);
    __ b(LENGTH_DIFF);
    __ bind(LAST_CHECK_AND_LENGTH_DIFF);
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, DIFF);
    __ bind(LENGTH_DIFF);
    __ ret(lr);
    return entry;
  }

  void generate_compare_long_strings() {
    StubRoutines::aarch64::_compare_long_string_LL
        = generate_compare_long_string_same_encoding(true);
    StubRoutines::aarch64::_compare_long_string_UU
        = generate_compare_long_string_same_encoding(false);
    StubRoutines::aarch64::_compare_long_string_LU
        = generate_compare_long_string_different_encoding(true);
    StubRoutines::aarch64::_compare_long_string_UL
        = generate_compare_long_string_different_encoding(false);
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // This generic linear code uses a few additional ideas which make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  //    length >= 8), which skips the initial load (helps on systems with a
  //    single load pipeline)
  // 2) we can use the "fast" single-character search algorithm to find the
  //    first symbol, with fewer branches (1 branch per loaded register instead
  //    of one per symbol); this is where constants like 0x0101...01,
  //    0x00010001...0001, 0x7f7f...7f and 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  //    can be used to search for every first-character occurrence, saving a
  //    few loads compared with a simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  //    re-initializes and compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    const char* stubName = str1_isL
        ? (str2_isL ?
"indexof_linear_ll" : "indexof_linear_ul") 4316 : "indexof_linear_uu"; 4317 __ align(CodeEntryAlignment); 4318 StubCodeMark mark(this, "StubRoutines", stubName); 4319 address entry = __ pc(); 4320 4321 int str1_chr_size = str1_isL ? 1 : 2; 4322 int str2_chr_size = str2_isL ? 1 : 2; 4323 int str1_chr_shift = str1_isL ? 0 : 1; 4324 int str2_chr_shift = str2_isL ? 0 : 1; 4325 bool isL = str1_isL && str2_isL; 4326 // parameters 4327 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4328 // temporary registers 4329 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4330 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4331 // redefinitions 4332 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4333 4334 __ push(spilled_regs, sp); 4335 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4336 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4337 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4338 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4339 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4340 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4341 // Read whole register from str1. It is safe, because length >=8 here 4342 __ ldr(ch1, Address(str1)); 4343 // Read whole register from str2. It is safe, because length >=8 here 4344 __ ldr(ch2, Address(str2)); 4345 __ sub(cnt2, cnt2, cnt1); 4346 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4347 if (str1_isL != str2_isL) { 4348 __ eor(v0, __ T16B, v0, v0); 4349 } 4350 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4351 __ mul(first, first, tmp1); 4352 // check if we have less than 1 register to check 4353 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4354 if (str1_isL != str2_isL) { 4355 __ fmovd(v1, ch1); 4356 } 4357 __ br(__ LE, L_SMALL); 4358 __ eor(ch2, first, ch2); 4359 if (str1_isL != str2_isL) { 4360 __ zip1(v1, __ T16B, v1, v0); 4361 } 4362 __ sub(tmp2, ch2, tmp1); 4363 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4364 __ bics(tmp2, tmp2, ch2); 4365 if (str1_isL != str2_isL) { 4366 __ fmovd(ch1, v1); 4367 } 4368 __ br(__ NE, L_HAS_ZERO); 4369 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4370 __ add(result, result, wordSize/str2_chr_size); 4371 __ add(str2, str2, wordSize); 4372 __ br(__ LT, L_POST_LOOP); 4373 __ BIND(L_LOOP); 4374 __ ldr(ch2, Address(str2)); 4375 __ eor(ch2, first, ch2); 4376 __ sub(tmp2, ch2, tmp1); 4377 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4378 __ bics(tmp2, tmp2, ch2); 4379 __ br(__ NE, L_HAS_ZERO); 4380 __ BIND(L_LOOP_PROCEED); 4381 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4382 __ add(str2, str2, wordSize); 4383 __ add(result, result, wordSize/str2_chr_size); 4384 __ br(__ GE, L_LOOP); 4385 __ BIND(L_POST_LOOP); 4386 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4387 __ br(__ LE, NOMATCH); 4388 __ ldr(ch2, Address(str2)); 4389 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4390 __ eor(ch2, first, ch2); 4391 __ sub(tmp2, ch2, tmp1); 4392 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4393 __ mov(tmp4, -1); // all bits set 4394 __ b(L_SMALL_PROCEED); 4395 __ align(OptoLoopAlignment); 4396 __ BIND(L_SMALL); 4397 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4398 __ eor(ch2, first, ch2); 4399 if (str1_isL != str2_isL) { 4400 __ zip1(v1, __ T16B, v1, v0); 4401 } 4402 __ sub(tmp2, ch2, tmp1); 4403 __ mov(tmp4, -1); // all bits set 4404 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4405 if (str1_isL != str2_isL) { 4406 __ fmovd(ch1, v1); // move converted 4 symbols 4407 } 4408 __ BIND(L_SMALL_PROCEED); 4409 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4410 __ bic(tmp2, tmp2, ch2); 4411 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4412 __ rbit(tmp2, tmp2); 4413 __ br(__ EQ, NOMATCH); 4414 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4415 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4416 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4417 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4418 if (str2_isL) { // LL 4419 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4420 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4421 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4422 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4423 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4424 } else { 4425 __ mov(ch2, 0xE); // all bits in byte set except last one 4426 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4427 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4428 __ lslv(tmp2, tmp2, tmp4); 4429 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4430 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4431 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4432 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4433 } 4434 __ cmp(ch1, ch2); 4435 __ mov(tmp4, wordSize/str2_chr_size); 4436 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4437 __ BIND(L_SMALL_CMP_LOOP); 4438 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4439 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4440 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4441 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4442 __ add(tmp4, tmp4, 1); 4443 __ cmp(tmp4, cnt1); 4444 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4445 __ cmp(first, ch2); 4446 __ br(__ EQ, L_SMALL_CMP_LOOP); 4447 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4448 __ cbz(tmp2, NOMATCH); // no more matches. exit 4449 __ clz(tmp4, tmp2); 4450 __ add(result, result, 1); // advance index 4451 __ add(str2, str2, str2_chr_size); // advance pointer 4452 __ b(L_SMALL_HAS_ZERO_LOOP); 4453 __ align(OptoLoopAlignment); 4454 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4455 __ cmp(first, ch2); 4456 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4457 __ b(DONE); 4458 __ align(OptoLoopAlignment); 4459 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4460 if (str2_isL) { // LL 4461 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4462 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4463 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4464 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4465 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4466 } else { 4467 __ mov(ch2, 0xE); // all bits in byte set except last one 4468 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4469 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
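      // tmp2 holds the (bit-reversed) match mask and tmp4 the bit index of
      // the current candidate. Shifting tmp2 left by tmp4 and then by one
      // more bit below discards the candidates already consumed, so the next
      // clz(tmp2) finds the following one.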
4470 __ lslv(tmp2, tmp2, tmp4); 4471 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4472 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4473 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4474 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4475 } 4476 __ cmp(ch1, ch2); 4477 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4478 __ b(DONE); 4479 __ align(OptoLoopAlignment); 4480 __ BIND(L_HAS_ZERO); 4481 __ rbit(tmp2, tmp2); 4482 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4483 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4484 // It's fine because both counters are 32bit and are not changed in this 4485 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4486 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4487 __ sub(result, result, 1); 4488 __ BIND(L_HAS_ZERO_LOOP); 4489 __ mov(cnt1, wordSize/str2_chr_size); 4490 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4491 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4492 if (str2_isL) { 4493 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4494 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4495 __ lslv(tmp2, tmp2, tmp4); 4496 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4497 __ add(tmp4, tmp4, 1); 4498 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4499 __ lsl(tmp2, tmp2, 1); 4500 __ mov(tmp4, wordSize/str2_chr_size); 4501 } else { 4502 __ mov(ch2, 0xE); 4503 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4504 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4505 __ lslv(tmp2, tmp2, tmp4); 4506 __ add(tmp4, tmp4, 1); 4507 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4508 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4509 __ lsl(tmp2, tmp2, 1); 4510 __ mov(tmp4, wordSize/str2_chr_size); 4511 __ sub(str2, str2, str2_chr_size); 4512 } 4513 __ cmp(ch1, ch2); 4514 __ mov(tmp4, wordSize/str2_chr_size); 4515 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4516 __ BIND(L_CMP_LOOP); 4517 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4518 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4519 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4520 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4521 __ add(tmp4, tmp4, 1); 4522 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4523 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4524 __ cmp(cnt1, ch2); 4525 __ br(__ EQ, L_CMP_LOOP); 4526 __ BIND(L_CMP_LOOP_NOMATCH); 4527 // here we're not matched 4528 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 4529 __ clz(tmp4, tmp2); 4530 __ add(str2, str2, str2_chr_size); // advance pointer 4531 __ b(L_HAS_ZERO_LOOP); 4532 __ align(OptoLoopAlignment); 4533 __ BIND(L_CMP_LOOP_LAST_CMP); 4534 __ cmp(cnt1, ch2); 4535 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4536 __ b(DONE); 4537 __ align(OptoLoopAlignment); 4538 __ BIND(L_CMP_LOOP_LAST_CMP2); 4539 if (str2_isL) { 4540 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4541 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
      __ lslv(tmp2, tmp2, tmp4);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ lsl(tmp2, tmp2, 1);
    } else {
      __ mov(ch2, 0xE);
      __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
      __ lsl(tmp2, tmp2, 1);
      __ sub(str2, str2, str2_chr_size);
    }
    __ cmp(ch1, ch2);
    __ br(__ NE, L_CMP_LOOP_NOMATCH);
    __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
    //    until the L_HAS_ZERO block. A byte octet was analyzed in
    //    L_HAS_ZERO_LOOP, so result was increased by at most
    //    wordSize/str2_chr_size - 1 and the respective high bit wasn't
    //    changed. L_LOOP_PROCEED will increase result by the number of
    //    analyzed characters, so we can just reset the lower bits of result
    //    here: clear the 2 lower bits for UU/UL and 3 bits for LL.
    // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
    // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
    //    index of the last analyzed substring inside the current octet, so
    //    str2 is at the respective start address; we need to advance it to
    //    the next octet.
    __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
    __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
    __ bfm(result, zr, 0, 2 - str2_chr_shift);
    __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
    __ movw(cnt2, cnt2);
    __ b(L_LOOP_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(NOMATCH);
    __ mov(result, -1);
    __ BIND(DONE);
    __ pop(spilled_regs, sp);
    __ ret(lr);
    return entry;
  }

  void generate_string_indexof_stubs() {
    StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
    StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
    StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
  }

  void inflate_and_store_2_fp_registers(bool generatePrfm,
      FloatRegister src1, FloatRegister src2) {
    Register dst = r1;
    __ zip1(v1, __ T16B, src1, v0);
    __ zip2(v2, __ T16B, src1, v0);
    if (generatePrfm) {
      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
    }
    __ zip1(v3, __ T16B, src2, v0);
    __ zip2(v4, __ T16B, src2, v0);
    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
  }

  // R0 = src
  // R1 = dst
  // R2 = len
  // R3 = len >> 3
  // V0 = 0
  // v1 = loaded 8 bytes
  address generate_large_byte_array_inflate() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
    address entry = __ pc();
    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
    Register src = r0, dst = r1, len = r2, octetCounter = r3;
    const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;

    // Do one more 8-byte read so the address is 16-byte aligned in most cases;
    // this also lets us use a single store instruction.
    __ ldrd(v2, __ post(src, 8));
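    // Inflation itself is just an interleave with the zero register: zip1 of
    // the loaded bytes with v0 (== 0) yields the corresponding UTF-16 code
    // units, i.e. in scalar terms (illustrative sketch only):
    //
    //   for (int i = 0; i < 8; i++)
    //     ((jchar*)dst)[i] = (jchar)(src[i] & 0xff);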
    __ sub(octetCounter, octetCounter, 2);
    __ zip1(v1, __ T16B, v1, v0);
    __ zip1(v2, __ T16B, v2, v0);
    __ st1(v1, v2, __ T16B, __ post(dst, 32));
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    __ br(__ LE, LOOP_START);
    __ b(LOOP_PRFM_START);
    __ bind(LOOP_PRFM);
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_PRFM_START);
    __ prfm(Address(src, SoftwarePrefetchHintDistance));
    __ sub(octetCounter, octetCounter, 8);
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    inflate_and_store_2_fp_registers(true, v3, v4);
    inflate_and_store_2_fp_registers(true, v5, v6);
    __ br(__ GT, LOOP_PRFM);
    __ cmp(octetCounter, (u1)8);
    __ br(__ LT, DONE);
    __ bind(LOOP);
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_START);
    __ sub(octetCounter, octetCounter, 8);
    __ cmp(octetCounter, (u1)8);
    inflate_and_store_2_fp_registers(false, v3, v4);
    inflate_and_store_2_fp_registers(false, v5, v6);
    __ br(__ GE, LOOP);
    __ bind(DONE);
    __ ret(lr);
    return entry;
  }

  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - current state address
   *    c_rarg1   - H key address
   *    c_rarg2   - data address
   *    c_rarg3   - number of blocks
   *
   *  Output:
   *    Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e.
p = z^7+z^2+z+1) 4685 // repeated in the low and high parts of a 4686 // 128-bit vector 4687 __ emit_int64(0x87); 4688 4689 __ align(CodeEntryAlignment); 4690 address start = __ pc(); 4691 4692 Register state = c_rarg0; 4693 Register subkeyH = c_rarg1; 4694 Register data = c_rarg2; 4695 Register blocks = c_rarg3; 4696 4697 FloatRegister vzr = v30; 4698 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4699 4700 __ ldrq(v0, Address(state)); 4701 __ ldrq(v1, Address(subkeyH)); 4702 4703 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4704 __ rbit(v0, __ T16B, v0); 4705 __ rev64(v1, __ T16B, v1); 4706 __ rbit(v1, __ T16B, v1); 4707 4708 __ ldrq(v26, p); 4709 4710 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4711 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4712 4713 { 4714 Label L_ghash_loop; 4715 __ bind(L_ghash_loop); 4716 4717 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4718 // reversing each byte 4719 __ rbit(v2, __ T16B, v2); 4720 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4721 4722 // Multiply state in v2 by subkey in v1 4723 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4724 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4725 /*temps*/v6, v20, v18, v21); 4726 // Reduce v7:v5 by the field polynomial 4727 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4728 4729 __ sub(blocks, blocks, 1); 4730 __ cbnz(blocks, L_ghash_loop); 4731 } 4732 4733 // The bit-reversed result is at this point in v0 4734 __ rev64(v1, __ T16B, v0); 4735 __ rbit(v1, __ T16B, v1); 4736 4737 __ st1(v1, __ T16B, state); 4738 __ ret(lr); 4739 4740 return start; 4741 } 4742 4743 // Continuation point for throwing of implicit exceptions that are 4744 // not handled in the current activation. Fabricates an exception 4745 // oop and initiates normal exception dispatching in this 4746 // frame. Since we need to preserve callee-saved values (currently 4747 // only for C2, but done for C1 as well) we need a callee-saved oop 4748 // map and therefore have to make these stubs into RuntimeStubs 4749 // rather than BufferBlobs. If the compiler needs all registers to 4750 // be preserved between the fault point and the exception handler 4751 // then it must assume responsibility for that in 4752 // AbstractCompiler::continuation_for_implicit_null_exception or 4753 // continuation_for_implicit_division_by_zero_exception. All other 4754 // implicit exceptions (e.g., NullPointerException or 4755 // AbstractMethodError on entry) are either at call sites or 4756 // otherwise assume that stack unwinding will be initiated, so 4757 // caller saved registers were assumed volatile in the compiler. 4758 4759 #undef __ 4760 #define __ masm-> 4761 4762 address generate_throw_exception(const char* name, 4763 address runtime_entry, 4764 Register arg1 = noreg, 4765 Register arg2 = noreg) { 4766 // Information about frame layout at time of blocking runtime call. 4767 // Note that we only have to preserve callee-saved registers since 4768 // the compilers are responsible for supplying a continuation point 4769 // if they expect all registers to be preserved. 4770 // n.b. 
  // aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      rfp_off = 0,
      rfp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save FP and LR before call

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // lr and fp are already in place
    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);

    // Call runtime
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ mov(rscratch1, runtime_entry);
    __ blr(rscratch1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub->entry_point();
  }

  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      Register reg = c_rarg0;
      Pa_base = reg;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen = ++reg;
      inv = ++reg;
      Pm_base = ++reg;

      // Working registers:
      Ra = ++reg;   // The current digit of a, b, n, and m.
      Rb = ++reg;
      Rm = ++reg;
      Rn = ++reg;

      Pa = ++reg;   // Pointers to the current/next digit of a, b, n, and m.
      Pb = ++reg;
      Pm = ++reg;
      Pn = ++reg;

      t0 = ++reg;   // Three registers which form a
      t1 = ++reg;   // triple-precision accumulator.
      t2 = ++reg;

      Ri = ++reg;   // Inner and outer loop indexes.
      Rj = ++reg;

      Rhi_ab = ++reg;   // Product registers: low and high parts
      Rlo_ab = ++reg;   // of a*b and m*n.
4895 Rhi_mn = ++reg; 4896 Rlo_mn = ++reg; 4897 4898 // r19 and up are callee-saved. 4899 _toSave = RegSet::range(r19, reg) + Pm_base; 4900 } 4901 4902 private: 4903 void save_regs() { 4904 push(_toSave, sp); 4905 } 4906 4907 void restore_regs() { 4908 pop(_toSave, sp); 4909 } 4910 4911 template <typename T> 4912 void unroll_2(Register count, T block) { 4913 Label loop, end, odd; 4914 tbnz(count, 0, odd); 4915 cbz(count, end); 4916 align(16); 4917 bind(loop); 4918 (this->*block)(); 4919 bind(odd); 4920 (this->*block)(); 4921 subs(count, count, 2); 4922 br(Assembler::GT, loop); 4923 bind(end); 4924 } 4925 4926 template <typename T> 4927 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4928 Label loop, end, odd; 4929 tbnz(count, 0, odd); 4930 cbz(count, end); 4931 align(16); 4932 bind(loop); 4933 (this->*block)(d, s, tmp); 4934 bind(odd); 4935 (this->*block)(d, s, tmp); 4936 subs(count, count, 2); 4937 br(Assembler::GT, loop); 4938 bind(end); 4939 } 4940 4941 void pre1(RegisterOrConstant i) { 4942 block_comment("pre1"); 4943 // Pa = Pa_base; 4944 // Pb = Pb_base + i; 4945 // Pm = Pm_base; 4946 // Pn = Pn_base + i; 4947 // Ra = *Pa; 4948 // Rb = *Pb; 4949 // Rm = *Pm; 4950 // Rn = *Pn; 4951 ldr(Ra, Address(Pa_base)); 4952 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4953 ldr(Rm, Address(Pm_base)); 4954 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4955 lea(Pa, Address(Pa_base)); 4956 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4957 lea(Pm, Address(Pm_base)); 4958 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4959 4960 // Zero the m*n result. 4961 mov(Rhi_mn, zr); 4962 mov(Rlo_mn, zr); 4963 } 4964 4965 // The core multiply-accumulate step of a Montgomery 4966 // multiplication. The idea is to schedule operations as a 4967 // pipeline so that instructions with long latencies (loads and 4968 // multiplies) have time to complete before their results are 4969 // used. This most benefits in-order implementations of the 4970 // architecture but out-of-order ones also benefit. 4971 void step() { 4972 block_comment("step"); 4973 // MACC(Ra, Rb, t0, t1, t2); 4974 // Ra = *++Pa; 4975 // Rb = *--Pb; 4976 umulh(Rhi_ab, Ra, Rb); 4977 mul(Rlo_ab, Ra, Rb); 4978 ldr(Ra, pre(Pa, wordSize)); 4979 ldr(Rb, pre(Pb, -wordSize)); 4980 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4981 // previous iteration. 
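      // For reference, MACC(x, y, t0, t1, t2) in these comments denotes the
      // triple-precision multiply-accumulate t2:t1:t0 += x * y; acc() below
      // implements the add half. The loads for the next digits are issued
      // before the pending accumulate so the load and multiply latencies can
      // overlap.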
4982 // MACC(Rm, Rn, t0, t1, t2); 4983 // Rm = *++Pm; 4984 // Rn = *--Pn; 4985 umulh(Rhi_mn, Rm, Rn); 4986 mul(Rlo_mn, Rm, Rn); 4987 ldr(Rm, pre(Pm, wordSize)); 4988 ldr(Rn, pre(Pn, -wordSize)); 4989 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4990 } 4991 4992 void post1() { 4993 block_comment("post1"); 4994 4995 // MACC(Ra, Rb, t0, t1, t2); 4996 // Ra = *++Pa; 4997 // Rb = *--Pb; 4998 umulh(Rhi_ab, Ra, Rb); 4999 mul(Rlo_ab, Ra, Rb); 5000 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5001 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5002 5003 // *Pm = Rm = t0 * inv; 5004 mul(Rm, t0, inv); 5005 str(Rm, Address(Pm)); 5006 5007 // MACC(Rm, Rn, t0, t1, t2); 5008 // t0 = t1; t1 = t2; t2 = 0; 5009 umulh(Rhi_mn, Rm, Rn); 5010 5011 #ifndef PRODUCT 5012 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5013 { 5014 mul(Rlo_mn, Rm, Rn); 5015 add(Rlo_mn, t0, Rlo_mn); 5016 Label ok; 5017 cbz(Rlo_mn, ok); { 5018 stop("broken Montgomery multiply"); 5019 } bind(ok); 5020 } 5021 #endif 5022 // We have very carefully set things up so that 5023 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5024 // the lower half of Rm * Rn because we know the result already: 5025 // it must be -t0. t0 + (-t0) must generate a carry iff 5026 // t0 != 0. So, rather than do a mul and an adds we just set 5027 // the carry flag iff t0 is nonzero. 5028 // 5029 // mul(Rlo_mn, Rm, Rn); 5030 // adds(zr, t0, Rlo_mn); 5031 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5032 adcs(t0, t1, Rhi_mn); 5033 adc(t1, t2, zr); 5034 mov(t2, zr); 5035 } 5036 5037 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5038 block_comment("pre2"); 5039 // Pa = Pa_base + i-len; 5040 // Pb = Pb_base + len; 5041 // Pm = Pm_base + i-len; 5042 // Pn = Pn_base + len; 5043 5044 if (i.is_register()) { 5045 sub(Rj, i.as_register(), len); 5046 } else { 5047 mov(Rj, i.as_constant()); 5048 sub(Rj, Rj, len); 5049 } 5050 // Rj == i-len 5051 5052 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5053 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5054 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5055 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5056 5057 // Ra = *++Pa; 5058 // Rb = *--Pb; 5059 // Rm = *++Pm; 5060 // Rn = *--Pn; 5061 ldr(Ra, pre(Pa, wordSize)); 5062 ldr(Rb, pre(Pb, -wordSize)); 5063 ldr(Rm, pre(Pm, wordSize)); 5064 ldr(Rn, pre(Pn, -wordSize)); 5065 5066 mov(Rhi_mn, zr); 5067 mov(Rlo_mn, zr); 5068 } 5069 5070 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5071 block_comment("post2"); 5072 if (i.is_constant()) { 5073 mov(Rj, i.as_constant()-len.as_constant()); 5074 } else { 5075 sub(Rj, i.as_register(), len); 5076 } 5077 5078 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5079 5080 // As soon as we know the least significant digit of our result, 5081 // store it. 5082 // Pm_base[i-len] = t0; 5083 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5084 5085 // t0 = t1; t1 = t2; t2 = 0; 5086 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5087 adc(t1, t2, zr); 5088 mov(t2, zr); 5089 } 5090 5091 // A carry in t0 after Montgomery multiplication means that we 5092 // should subtract multiples of n from our result in m. We'll 5093 // keep doing that until there is no carry. 
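    // The loop below is a standard multi-word subtract: subs(zr, zr, zr)
    // pre-sets the carry flag so the first sbcs starts with no borrow, and
    // the final sbc(t0, t0, zr) folds the borrow that propagates out of the
    // most significant word back into the carry word t0.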
5094 void normalize(RegisterOrConstant len) { 5095 block_comment("normalize"); 5096 // while (t0) 5097 // t0 = sub(Pm_base, Pn_base, t0, len); 5098 Label loop, post, again; 5099 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5100 cbz(t0, post); { 5101 bind(again); { 5102 mov(i, zr); 5103 mov(cnt, len); 5104 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5105 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5106 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5107 align(16); 5108 bind(loop); { 5109 sbcs(Rm, Rm, Rn); 5110 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5111 add(i, i, 1); 5112 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5113 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5114 sub(cnt, cnt, 1); 5115 } cbnz(cnt, loop); 5116 sbc(t0, t0, zr); 5117 } cbnz(t0, again); 5118 } bind(post); 5119 } 5120 5121 // Move memory at s to d, reversing words. 5122 // Increments d to end of copied memory 5123 // Destroys tmp1, tmp2 5124 // Preserves len 5125 // Leaves s pointing to the address which was in d at start 5126 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5127 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5128 5129 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5130 mov(tmp1, len); 5131 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5132 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5133 } 5134 // where 5135 void reverse1(Register d, Register s, Register tmp) { 5136 ldr(tmp, pre(s, -wordSize)); 5137 ror(tmp, tmp, 32); 5138 str(tmp, post(d, wordSize)); 5139 } 5140 5141 void step_squaring() { 5142 // An extra ACC 5143 step(); 5144 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5145 } 5146 5147 void last_squaring(RegisterOrConstant i) { 5148 Label dont; 5149 // if ((i & 1) == 0) { 5150 tbnz(i.as_register(), 0, dont); { 5151 // MACC(Ra, Rb, t0, t1, t2); 5152 // Ra = *++Pa; 5153 // Rb = *--Pb; 5154 umulh(Rhi_ab, Ra, Rb); 5155 mul(Rlo_ab, Ra, Rb); 5156 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5157 } bind(dont); 5158 } 5159 5160 void extra_step_squaring() { 5161 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5162 5163 // MACC(Rm, Rn, t0, t1, t2); 5164 // Rm = *++Pm; 5165 // Rn = *--Pn; 5166 umulh(Rhi_mn, Rm, Rn); 5167 mul(Rlo_mn, Rm, Rn); 5168 ldr(Rm, pre(Pm, wordSize)); 5169 ldr(Rn, pre(Pn, -wordSize)); 5170 } 5171 5172 void post1_squaring() { 5173 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5174 5175 // *Pm = Rm = t0 * inv; 5176 mul(Rm, t0, inv); 5177 str(Rm, Address(Pm)); 5178 5179 // MACC(Rm, Rn, t0, t1, t2); 5180 // t0 = t1; t1 = t2; t2 = 0; 5181 umulh(Rhi_mn, Rm, Rn); 5182 5183 #ifndef PRODUCT 5184 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5185 { 5186 mul(Rlo_mn, Rm, Rn); 5187 add(Rlo_mn, t0, Rlo_mn); 5188 Label ok; 5189 cbz(Rlo_mn, ok); { 5190 stop("broken Montgomery multiply"); 5191 } bind(ok); 5192 } 5193 #endif 5194 // We have very carefully set things up so that 5195 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5196 // the lower half of Rm * Rn because we know the result already: 5197 // it must be -t0. t0 + (-t0) must generate a carry iff 5198 // t0 != 0. So, rather than do a mul and an adds we just set 5199 // the carry flag iff t0 is nonzero. 
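// To see why the substitution is exact: Rlo_mn == -t0 (mod 2^64), so the
// discarded mul/adds pair shown below would compute t0 + (2^64 - t0).
// For any t0 != 0 that is exactly 2^64, i.e. a zero result with the carry
// set; for t0 == 0 it is 0 + 0 with the carry clear.  subs(zr, t0, 1)
// produces the same carry, because SUBS sets C to NOT(borrow): C is set
// whenever t0 >= 1 and clear only when t0 == 0.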
5200 // 5201 // mul(Rlo_mn, Rm, Rn); 5202 // adds(zr, t0, Rlo_mn); 5203 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5204 adcs(t0, t1, Rhi_mn); 5205 adc(t1, t2, zr); 5206 mov(t2, zr); 5207 } 5208 5209 void acc(Register Rhi, Register Rlo, 5210 Register t0, Register t1, Register t2) { 5211 adds(t0, t0, Rlo); 5212 adcs(t1, t1, Rhi); 5213 adc(t2, t2, zr); 5214 } 5215 5216 public: 5217 /** 5218 * Fast Montgomery multiplication. The derivation of the 5219 * algorithm is in A Cryptographic Library for the Motorola 5220 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5221 * 5222 * Arguments: 5223 * 5224 * Inputs for multiplication: 5225 * c_rarg0 - int array elements a 5226 * c_rarg1 - int array elements b 5227 * c_rarg2 - int array elements n (the modulus) 5228 * c_rarg3 - int length 5229 * c_rarg4 - int inv 5230 * c_rarg5 - int array elements m (the result) 5231 * 5232 * Inputs for squaring: 5233 * c_rarg0 - int array elements a 5234 * c_rarg1 - int array elements n (the modulus) 5235 * c_rarg2 - int length 5236 * c_rarg3 - int inv 5237 * c_rarg4 - int array elements m (the result) 5238 * 5239 */ 5240 address generate_multiply() { 5241 Label argh, nothing; 5242 bind(argh); 5243 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5244 5245 align(CodeEntryAlignment); 5246 address entry = pc(); 5247 5248 cbzw(Rlen, nothing); 5249 5250 enter(); 5251 5252 // Make room. 5253 cmpw(Rlen, 512); 5254 br(Assembler::HI, argh); 5255 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5256 andr(sp, Ra, -2 * wordSize); 5257 5258 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5259 5260 { 5261 // Copy input args, reversing as we go. We use Ra as a 5262 // temporary variable. 5263 reverse(Ra, Pa_base, Rlen, t0, t1); 5264 if (!_squaring) 5265 reverse(Ra, Pb_base, Rlen, t0, t1); 5266 reverse(Ra, Pn_base, Rlen, t0, t1); 5267 } 5268 5269 // Push all call-saved registers and also Pm_base which we'll need 5270 // at the end. 
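// (Pm_base is included in _toSave alongside the callee-saved range, see
// the constructor: below we redirect Pm_base at the scratch copy of the
// result, and restore_regs() pops the caller's pointer back so that the
// final reverse() can copy the result into the caller's buffer.)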
5271 save_regs(); 5272 5273 #ifndef PRODUCT 5274 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5275 { 5276 ldr(Rn, Address(Pn_base, 0)); 5277 mul(Rlo_mn, Rn, inv); 5278 subs(zr, Rlo_mn, -1); 5279 Label ok; 5280 br(EQ, ok); { 5281 stop("broken inverse in Montgomery multiply"); 5282 } bind(ok); 5283 } 5284 #endif 5285 5286 mov(Pm_base, Ra); 5287 5288 mov(t0, zr); 5289 mov(t1, zr); 5290 mov(t2, zr); 5291 5292 block_comment("for (int i = 0; i < len; i++) {"); 5293 mov(Ri, zr); { 5294 Label loop, end; 5295 cmpw(Ri, Rlen); 5296 br(Assembler::GE, end); 5297 5298 bind(loop); 5299 pre1(Ri); 5300 5301 block_comment(" for (j = i; j; j--) {"); { 5302 movw(Rj, Ri); 5303 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5304 } block_comment(" } // j"); 5305 5306 post1(); 5307 addw(Ri, Ri, 1); 5308 cmpw(Ri, Rlen); 5309 br(Assembler::LT, loop); 5310 bind(end); 5311 block_comment("} // i"); 5312 } 5313 5314 block_comment("for (int i = len; i < 2*len; i++) {"); 5315 mov(Ri, Rlen); { 5316 Label loop, end; 5317 cmpw(Ri, Rlen, Assembler::LSL, 1); 5318 br(Assembler::GE, end); 5319 5320 bind(loop); 5321 pre2(Ri, Rlen); 5322 5323 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5324 lslw(Rj, Rlen, 1); 5325 subw(Rj, Rj, Ri); 5326 subw(Rj, Rj, 1); 5327 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5328 } block_comment(" } // j"); 5329 5330 post2(Ri, Rlen); 5331 addw(Ri, Ri, 1); 5332 cmpw(Ri, Rlen, Assembler::LSL, 1); 5333 br(Assembler::LT, loop); 5334 bind(end); 5335 } 5336 block_comment("} // i"); 5337 5338 normalize(Rlen); 5339 5340 mov(Ra, Pm_base); // Save Pm_base in Ra 5341 restore_regs(); // Restore caller's Pm_base 5342 5343 // Copy our result into caller's Pm_base 5344 reverse(Pm_base, Ra, Rlen, t0, t1); 5345 5346 leave(); 5347 bind(nothing); 5348 ret(lr); 5349 5350 return entry; 5351 } 5352 // In C, approximately: 5353 5354 // void 5355 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5356 // unsigned long Pn_base[], unsigned long Pm_base[], 5357 // unsigned long inv, int len) { 5358 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5359 // unsigned long *Pa, *Pb, *Pn, *Pm; 5360 // unsigned long Ra, Rb, Rn, Rm; 5361 5362 // int i; 5363 5364 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5365 5366 // for (i = 0; i < len; i++) { 5367 // int j; 5368 5369 // Pa = Pa_base; 5370 // Pb = Pb_base + i; 5371 // Pm = Pm_base; 5372 // Pn = Pn_base + i; 5373 5374 // Ra = *Pa; 5375 // Rb = *Pb; 5376 // Rm = *Pm; 5377 // Rn = *Pn; 5378 5379 // int iters = i; 5380 // for (j = 0; iters--; j++) { 5381 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5382 // MACC(Ra, Rb, t0, t1, t2); 5383 // Ra = *++Pa; 5384 // Rb = *--Pb; 5385 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5386 // MACC(Rm, Rn, t0, t1, t2); 5387 // Rm = *++Pm; 5388 // Rn = *--Pn; 5389 // } 5390 5391 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5392 // MACC(Ra, Rb, t0, t1, t2); 5393 // *Pm = Rm = t0 * inv; 5394 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5395 // MACC(Rm, Rn, t0, t1, t2); 5396 5397 // assert(t0 == 0, "broken Montgomery multiply"); 5398 5399 // t0 = t1; t1 = t2; t2 = 0; 5400 // } 5401 5402 // for (i = len; i < 2*len; i++) { 5403 // int j; 5404 5405 // Pa = Pa_base + i-len; 5406 // Pb = Pb_base + len; 5407 // Pm = Pm_base + i-len; 5408 // Pn = Pn_base + len; 5409 5410 // Ra = *++Pa; 5411 // Rb = *--Pb; 5412 // Rm = *++Pm; 5413 // Rn = *--Pn; 5414 5415 // int iters = len*2-i-1; 
5416 // for (j = i-len+1; iters--; j++) { 5417 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5418 // MACC(Ra, Rb, t0, t1, t2); 5419 // Ra = *++Pa; 5420 // Rb = *--Pb; 5421 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5422 // MACC(Rm, Rn, t0, t1, t2); 5423 // Rm = *++Pm; 5424 // Rn = *--Pn; 5425 // } 5426 5427 // Pm_base[i-len] = t0; 5428 // t0 = t1; t1 = t2; t2 = 0; 5429 // } 5430 5431 // while (t0) 5432 // t0 = sub(Pm_base, Pn_base, t0, len); 5433 // } 5434 5435 /** 5436 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5437 * multiplies than Montgomery multiplication so it should be up to 5438 * 25% faster. However, its loop control is more complex and it 5439 * may actually run slower on some machines. 5440 * 5441 * Arguments: 5442 * 5443 * Inputs: 5444 * c_rarg0 - int array elements a 5445 * c_rarg1 - int array elements n (the modulus) 5446 * c_rarg2 - int length 5447 * c_rarg3 - int inv 5448 * c_rarg4 - int array elements m (the result) 5449 * 5450 */ 5451 address generate_square() { 5452 Label argh; 5453 bind(argh); 5454 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5455 5456 align(CodeEntryAlignment); 5457 address entry = pc(); 5458 5459 enter(); 5460 5461 // Make room. 5462 cmpw(Rlen, 512); 5463 br(Assembler::HI, argh); 5464 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5465 andr(sp, Ra, -2 * wordSize); 5466 5467 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5468 5469 { 5470 // Copy input args, reversing as we go. We use Ra as a 5471 // temporary variable. 5472 reverse(Ra, Pa_base, Rlen, t0, t1); 5473 reverse(Ra, Pn_base, Rlen, t0, t1); 5474 } 5475 5476 // Push all call-saved registers and also Pm_base which we'll need 5477 // at the end. 5478 save_regs(); 5479 5480 mov(Pm_base, Ra); 5481 5482 mov(t0, zr); 5483 mov(t1, zr); 5484 mov(t2, zr); 5485 5486 block_comment("for (int i = 0; i < len; i++) {"); 5487 mov(Ri, zr); { 5488 Label loop, end; 5489 bind(loop); 5490 cmp(Ri, Rlen); 5491 br(Assembler::GE, end); 5492 5493 pre1(Ri); 5494 5495 block_comment("for (j = (i+1)/2; j; j--) {"); { 5496 add(Rj, Ri, 1); 5497 lsr(Rj, Rj, 1); 5498 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5499 } block_comment(" } // j"); 5500 5501 last_squaring(Ri); 5502 5503 block_comment(" for (j = i/2; j; j--) {"); { 5504 lsr(Rj, Ri, 1); 5505 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5506 } block_comment(" } // j"); 5507 5508 post1_squaring(); 5509 add(Ri, Ri, 1); 5510 cmp(Ri, Rlen); 5511 br(Assembler::LT, loop); 5512 5513 bind(end); 5514 block_comment("} // i"); 5515 } 5516 5517 block_comment("for (int i = len; i < 2*len; i++) {"); 5518 mov(Ri, Rlen); { 5519 Label loop, end; 5520 bind(loop); 5521 cmp(Ri, Rlen, Assembler::LSL, 1); 5522 br(Assembler::GE, end); 5523 5524 pre2(Ri, Rlen); 5525 5526 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5527 lsl(Rj, Rlen, 1); 5528 sub(Rj, Rj, Ri); 5529 sub(Rj, Rj, 1); 5530 lsr(Rj, Rj, 1); 5531 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5532 } block_comment(" } // j"); 5533 5534 last_squaring(Ri); 5535 5536 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5537 lsl(Rj, Rlen, 1); 5538 sub(Rj, Rj, Ri); 5539 lsr(Rj, Rj, 1); 5540 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5541 } block_comment(" } // j"); 5542 5543 post2(Ri, Rlen); 5544 add(Ri, Ri, 1); 5545 cmp(Ri, Rlen, Assembler::LSL, 1); 5546 5547 br(Assembler::LT, loop); 5548 bind(end); 5549 block_comment("} // i"); 5550 } 5551 5552 normalize(Rlen); 5553 5554 
mov(Ra, Pm_base); // Save Pm_base in Ra 5555 restore_regs(); // Restore caller's Pm_base 5556 5557 // Copy our result into caller's Pm_base 5558 reverse(Pm_base, Ra, Rlen, t0, t1); 5559 5560 leave(); 5561 ret(lr); 5562 5563 return entry; 5564 } 5565 // In C, approximately: 5566 5567 // void 5568 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 5569 // unsigned long Pm_base[], unsigned long inv, int len) { 5570 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5571 // unsigned long *Pa, *Pb, *Pn, *Pm; 5572 // unsigned long Ra, Rb, Rn, Rm; 5573 5574 // int i; 5575 5576 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5577 5578 // for (i = 0; i < len; i++) { 5579 // int j; 5580 5581 // Pa = Pa_base; 5582 // Pb = Pa_base + i; 5583 // Pm = Pm_base; 5584 // Pn = Pn_base + i; 5585 5586 // Ra = *Pa; 5587 // Rb = *Pb; 5588 // Rm = *Pm; 5589 // Rn = *Pn; 5590 5591 // int iters = (i+1)/2; 5592 // for (j = 0; iters--; j++) { 5593 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5594 // MACC2(Ra, Rb, t0, t1, t2); 5595 // Ra = *++Pa; 5596 // Rb = *--Pb; 5597 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5598 // MACC(Rm, Rn, t0, t1, t2); 5599 // Rm = *++Pm; 5600 // Rn = *--Pn; 5601 // } 5602 // if ((i & 1) == 0) { 5603 // assert(Ra == Pa_base[j], "must be"); 5604 // MACC(Ra, Ra, t0, t1, t2); 5605 // } 5606 // iters = i/2; 5607 // assert(iters == i-j, "must be"); 5608 // for (; iters--; j++) { 5609 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5610 // MACC(Rm, Rn, t0, t1, t2); 5611 // Rm = *++Pm; 5612 // Rn = *--Pn; 5613 // } 5614 5615 // *Pm = Rm = t0 * inv; 5616 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5617 // MACC(Rm, Rn, t0, t1, t2); 5618 5619 // assert(t0 == 0, "broken Montgomery multiply"); 5620 5621 // t0 = t1; t1 = t2; t2 = 0; 5622 // } 5623 5624 // for (i = len; i < 2*len; i++) { 5625 // int start = i-len+1; 5626 // int end = start + (len - start)/2; 5627 // int j; 5628 5629 // Pa = Pa_base + i-len; 5630 // Pb = Pa_base + len; 5631 // Pm = Pm_base + i-len; 5632 // Pn = Pn_base + len; 5633 5634 // Ra = *++Pa; 5635 // Rb = *--Pb; 5636 // Rm = *++Pm; 5637 // Rn = *--Pn; 5638 5639 // int iters = (2*len-i-1)/2; 5640 // assert(iters == end-start, "must be"); 5641 // for (j = start; iters--; j++) { 5642 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5643 // MACC2(Ra, Rb, t0, t1, t2); 5644 // Ra = *++Pa; 5645 // Rb = *--Pb; 5646 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5647 // MACC(Rm, Rn, t0, t1, t2); 5648 // Rm = *++Pm; 5649 // Rn = *--Pn; 5650 // } 5651 // if ((i & 1) == 0) { 5652 // assert(Ra == Pa_base[j], "must be"); 5653 // MACC(Ra, Ra, t0, t1, t2); 5654 // } 5655 // iters = (2*len-i)/2; 5656 // assert(iters == len-j, "must be"); 5657 // for (; iters--; j++) { 5658 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5659 // MACC(Rm, Rn, t0, t1, t2); 5660 // Rm = *++Pm; 5661 // Rn = *--Pn; 5662 // } 5663 // Pm_base[i-len] = t0; 5664 // t0 = t1; t1 = t2; t2 = 0; 5665 // } 5666 5667 // while (t0) 5668 // t0 = sub(Pm_base, Pn_base, t0, len); 5669 // } 5670 }; 5671 5672 5673 // Call here from the interpreter or compiled code to either load 5674 // multiple returned values from the value type instance being 5675 // returned to registers or to store returned values to a newly 5676 // allocated value type instance. 
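// The two stubs built from this generator are installed in
// generate_initial() as StubRoutines::_load_value_type_fields_in_regs and
// StubRoutines::_store_value_type_fields_to_buf.  In pseudocode, roughly
// (an informal summary of the code below, not additional behaviour):
//
//   save the Java argument registers (j_rarg0-7, j_farg0-7) in the frame
//   set up last_Java_frame and call destination(current thread, r0)
//   restore the saved registers and tear down the frame
//   if a pending exception is set, jump to the forward_exception stub
//   otherwise, if has_res, fetch the VM result into r0 and return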
5677 address generate_return_value_stub(address destination, const char* name, bool has_res) { 5678 5679 // Information about frame layout at time of blocking runtime call. 5680 // Note that we only have to preserve callee-saved registers since 5681 // the compilers are responsible for supplying a continuation point 5682 // if they expect all registers to be preserved. 5683 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 5684 enum layout { 5685 rfp_off = 0, rfp_off2, 5686 5687 j_rarg7_off, j_rarg7_2, 5688 j_rarg6_off, j_rarg6_2, 5689 j_rarg5_off, j_rarg5_2, 5690 j_rarg4_off, j_rarg4_2, 5691 j_rarg3_off, j_rarg3_2, 5692 j_rarg2_off, j_rarg2_2, 5693 j_rarg1_off, j_rarg1_2, 5694 j_rarg0_off, j_rarg0_2, 5695 5696 j_farg0_off, j_farg0_2, 5697 j_farg1_off, j_farg1_2, 5698 j_farg2_off, j_farg2_2, 5699 j_farg3_off, j_farg3_2, 5700 j_farg4_off, j_farg4_2, 5701 j_farg5_off, j_farg5_2, 5702 j_farg6_off, j_farg6_2, 5703 j_farg7_off, j_farg7_2, 5704 5705 return_off, return_off2, 5706 framesize // inclusive of return address 5707 }; 5708 5709 int insts_size = 512; 5710 int locs_size = 64; 5711 5712 CodeBuffer code(name, insts_size, locs_size); 5713 OopMapSet* oop_maps = new OopMapSet(); 5714 MacroAssembler* masm = new MacroAssembler(&code); 5715 5716 address start = __ pc(); 5717 5718 const Address f7_save (rfp, j_farg7_off * wordSize); 5719 const Address f6_save (rfp, j_farg6_off * wordSize); 5720 const Address f5_save (rfp, j_farg5_off * wordSize); 5721 const Address f4_save (rfp, j_farg4_off * wordSize); 5722 const Address f3_save (rfp, j_farg3_off * wordSize); 5723 const Address f2_save (rfp, j_farg2_off * wordSize); 5724 const Address f1_save (rfp, j_farg1_off * wordSize); 5725 const Address f0_save (rfp, j_farg0_off * wordSize); 5726 5727 const Address r0_save (rfp, j_rarg0_off * wordSize); 5728 const Address r1_save (rfp, j_rarg1_off * wordSize); 5729 const Address r2_save (rfp, j_rarg2_off * wordSize); 5730 const Address r3_save (rfp, j_rarg3_off * wordSize); 5731 const Address r4_save (rfp, j_rarg4_off * wordSize); 5732 const Address r5_save (rfp, j_rarg5_off * wordSize); 5733 const Address r6_save (rfp, j_rarg6_off * wordSize); 5734 const Address r7_save (rfp, j_rarg7_off * wordSize); 5735 5736 // Generate oop map 5737 OopMap* map = new OopMap(framesize, 0); 5738 5739 map->set_callee_saved(VMRegImpl::stack2reg(rfp_off), rfp->as_VMReg()); 5740 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 5741 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 5742 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 5743 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 5744 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 5745 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 5746 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 5747 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 5748 5749 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 5750 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 5751 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 5752 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 5753 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 5754 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), 
j_farg5->as_VMReg());
5755 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
5756 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
5757
5758 // This is an inlined and slightly modified version of call_VM
5759 // which can fetch the return PC out of
5760 // thread-local storage and also sets up last_Java_sp slightly
5761 // differently than the real call_VM does.
5762
5763 __ enter(); // Save FP and LR before call
5764
5765 assert(is_even(framesize/2), "sp not 16-byte aligned");
5766
5767 // lr and fp are already in place
5768 __ sub(sp, rfp, ((unsigned)framesize - 4) << LogBytesPerInt); // prolog
5769
5770 __ strd(j_farg7, f7_save);
5771 __ strd(j_farg6, f6_save);
5772 __ strd(j_farg5, f5_save);
5773 __ strd(j_farg4, f4_save);
5774 __ strd(j_farg3, f3_save);
5775 __ strd(j_farg2, f2_save);
5776 __ strd(j_farg1, f1_save);
5777 __ strd(j_farg0, f0_save);
5778
5779 __ str(j_rarg0, r0_save);
5780 __ str(j_rarg1, r1_save);
5781 __ str(j_rarg2, r2_save);
5782 __ str(j_rarg3, r3_save);
5783 __ str(j_rarg4, r4_save);
5784 __ str(j_rarg5, r5_save);
5785 __ str(j_rarg6, r6_save);
5786 __ str(j_rarg7, r7_save);
5787
5788 int frame_complete = __ pc() - start;
5789
5790 // Set up last_Java_sp and last_Java_fp
5791 address the_pc = __ pc();
5792 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
5793
5794 // Call runtime
5795 __ mov(c_rarg0, rthread);
5796 __ mov(c_rarg1, r0);
5797
5798 BLOCK_COMMENT("call runtime_entry");
5799 __ mov(rscratch1, destination);
5800 __ blrt(rscratch1, 2 /* number_of_arguments */, 0, 1);
5801
5802 oop_maps->add_gc_map(the_pc - start, map);
5803
5804 __ reset_last_Java_frame(false);
5805 __ maybe_isb();
5806
5807 __ ldrd(j_farg7, f7_save);
5808 __ ldrd(j_farg6, f6_save);
5809 __ ldrd(j_farg5, f5_save);
5810 __ ldrd(j_farg4, f4_save);
5811 __ ldrd(j_farg3, f3_save);
5812 __ ldrd(j_farg2, f2_save);
5813 __ ldrd(j_farg1, f1_save);
5814 __ ldrd(j_farg0, f0_save);
5815
5816 __ ldr(j_rarg0, r0_save);
5817 __ ldr(j_rarg1, r1_save);
5818 __ ldr(j_rarg2, r2_save);
5819 __ ldr(j_rarg3, r3_save);
5820 __ ldr(j_rarg4, r4_save);
5821 __ ldr(j_rarg5, r5_save);
5822 __ ldr(j_rarg6, r6_save);
5823 __ ldr(j_rarg7, r7_save);
5824
5825 __ leave();
5826
5827 // check for pending exceptions
5828 Label pending;
5829 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
5830 __ cmp(rscratch1, (u1)NULL_WORD);
5831 __ br(Assembler::NE, pending);
5832
5833 if (has_res) {
5834 __ get_vm_result(r0, rthread);
5835 }
5836 __ ret(lr);
5837
5838 __ bind(pending);
5839 __ ldr(r0, Address(rthread, in_bytes(Thread::pending_exception_offset())));
5840 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5841
5842
5843 // codeBlob framesize is in words (not VMRegImpl::slot_size)
5844 int frame_size_in_words = (framesize >> (LogBytesPerWord - LogBytesPerInt));
5845 RuntimeStub* stub =
5846 RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
5847
5848 return stub->entry_point();
5849 }
5850
5851 // Initialization
5852 void generate_initial() {
5853 // Generates the initial stubs and initializes the entry points
5854
5855 // entry points that exist on all platforms. Note: This is code
5856 // that could be shared among different platforms - however the
5857 // benefit seems to be smaller than the disadvantage of having a
5858 // much more complicated generator structure. See also comment in
5859 // stubRoutines.hpp.
5860 5861 StubRoutines::_forward_exception_entry = generate_forward_exception(); 5862 5863 StubRoutines::_call_stub_entry = 5864 generate_call_stub(StubRoutines::_call_stub_return_address); 5865 5866 // is referenced by megamorphic call 5867 StubRoutines::_catch_exception_entry = generate_catch_exception(); 5868 5869 // Build this early so it's available for the interpreter. 5870 StubRoutines::_throw_StackOverflowError_entry = 5871 generate_throw_exception("StackOverflowError throw_exception", 5872 CAST_FROM_FN_PTR(address, 5873 SharedRuntime::throw_StackOverflowError)); 5874 StubRoutines::_throw_delayed_StackOverflowError_entry = 5875 generate_throw_exception("delayed StackOverflowError throw_exception", 5876 CAST_FROM_FN_PTR(address, 5877 SharedRuntime::throw_delayed_StackOverflowError)); 5878 if (UseCRC32Intrinsics) { 5879 // set table address before stub generation which use it 5880 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 5881 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 5882 } 5883 5884 if (UseCRC32CIntrinsics) { 5885 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 5886 } 5887 5888 // Disabled until JDK-8210858 is fixed 5889 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 5890 // StubRoutines::_dlog = generate_dlog(); 5891 // } 5892 5893 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 5894 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 5895 } 5896 5897 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 5898 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 5899 } 5900 5901 5902 StubRoutines::_load_value_type_fields_in_regs = 5903 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_value_type_fields_in_regs), "load_value_type_fields_in_regs", false); 5904 StubRoutines::_store_value_type_fields_to_buf = 5905 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_value_type_fields_to_buf), "store_value_type_fields_to_buf", true); 5906 } 5907 5908 void generate_all() { 5909 // support for verify_oop (must happen after universe_init) 5910 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5911 StubRoutines::_throw_AbstractMethodError_entry = 5912 generate_throw_exception("AbstractMethodError throw_exception", 5913 CAST_FROM_FN_PTR(address, 5914 SharedRuntime:: 5915 throw_AbstractMethodError)); 5916 5917 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5918 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5919 CAST_FROM_FN_PTR(address, 5920 SharedRuntime:: 5921 throw_IncompatibleClassChangeError)); 5922 5923 StubRoutines::_throw_NullPointerException_at_call_entry = 5924 generate_throw_exception("NullPointerException at call throw_exception", 5925 CAST_FROM_FN_PTR(address, 5926 SharedRuntime:: 5927 throw_NullPointerException_at_call)); 5928 5929 // arraycopy stubs used by compilers 5930 generate_arraycopy_stubs(); 5931 5932 // has negatives stub for large arrays. 5933 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5934 5935 // array equals stub for large arrays. 5936 if (!UseSimpleArrayEquals) { 5937 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5938 } 5939 5940 generate_compare_long_strings(); 5941 5942 generate_string_indexof_stubs(); 5943 5944 // byte_array_inflate stub for large arrays. 
5945 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5946 5947 #ifdef COMPILER2 5948 if (UseMultiplyToLenIntrinsic) { 5949 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5950 } 5951 5952 if (UseSquareToLenIntrinsic) { 5953 StubRoutines::_squareToLen = generate_squareToLen(); 5954 } 5955 5956 if (UseMulAddIntrinsic) { 5957 StubRoutines::_mulAdd = generate_mulAdd(); 5958 } 5959 5960 if (UseMontgomeryMultiplyIntrinsic) { 5961 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5962 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5963 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5964 } 5965 5966 if (UseMontgomerySquareIntrinsic) { 5967 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5968 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5969 // We use generate_multiply() rather than generate_square() 5970 // because it's faster for the sizes of modulus we care about. 5971 StubRoutines::_montgomerySquare = g.generate_multiply(); 5972 } 5973 #endif // COMPILER2 5974 5975 // generate GHASH intrinsics code 5976 if (UseGHASHIntrinsics) { 5977 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5978 } 5979 5980 // data cache line writeback 5981 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 5982 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 5983 5984 if (UseAESIntrinsics) { 5985 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5986 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5987 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5988 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5989 } 5990 5991 if (UseSHA1Intrinsics) { 5992 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5993 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5994 } 5995 if (UseSHA256Intrinsics) { 5996 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5997 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5998 } 5999 6000 // generate Adler32 intrinsics code 6001 if (UseAdler32Intrinsics) { 6002 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 6003 } 6004 6005 // Safefetch stubs. 6006 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 6007 &StubRoutines::_safefetch32_fault_pc, 6008 &StubRoutines::_safefetch32_continuation_pc); 6009 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 6010 &StubRoutines::_safefetchN_fault_pc, 6011 &StubRoutines::_safefetchN_continuation_pc); 6012 StubRoutines::aarch64::set_completed(); 6013 } 6014 6015 public: 6016 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 6017 if (all) { 6018 generate_all(); 6019 } else { 6020 generate_initial(); 6021 } 6022 } 6023 }; // end class declaration 6024 6025 #define UCM_TABLE_MAX_ENTRIES 8 6026 void StubGenerator_generate(CodeBuffer* code, bool all) { 6027 if (UnsafeCopyMemory::_table == NULL) { 6028 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 6029 } 6030 StubGenerator g(code, all); 6031 }
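// For context, and approximately (the authoritative driver is in
// stubRoutines.cpp): stub generation runs in two phases.
// StubRoutines::initialize1() calls StubGenerator_generate(code, false)
// early in VM startup to produce the initial stubs (call stub, exception
// forwarding, CRC tables, ...) that the interpreter depends on, and
// StubRoutines::initialize2() calls StubGenerator_generate(code, true)
// after universe initialization to produce the remaining stubs
// (arraycopy, crypto and other compiler intrinsics).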