/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSetCodeGen.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
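// n.b. TIMES_OOP is the extend/scale operand used when indexing oop
// array elements: exact_log2(4) == 2 with compressed oops and
// exact_log2(8) == 3 otherwise, so a 32-bit index is sign-extended
// and shifted by the oop element size.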

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]
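  // n.b. the enum below names only every other save slot because the
  // registers are saved and restored in pairs: each stp/stpd writes
  // two adjacent words, so e.g. stpd(v9, v8, d9_save) fills both the
  // v9 slot at offset -20 and the v8 slot at offset -19.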

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
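    // n.b. with sp_after_call_off == -26 the sub above computes
    // sp = rfp - 26 * 8 = rfp - 208, i.e. sp now points at the
    // sp_after_call slot (the v15 save slot at the bottom of the
    // save area in the layout shown above).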

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();
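    // n.b. the pc recorded here is published as
    // StubRoutines::_call_stub_return_address; stack walking code
    // compares return pcs against it to recognize frames that return
    // into the call stub (and generate_catch_exception below branches
    // back to this address).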

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
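  // n.b. a sketch of how the VM invokes this stub, as made by
  // JavaCalls::call_helper in javaCalls.cpp (argument names
  // abbreviated); the argument order matches the c_rarg0..c_rarg7
  // register list documented above:
  //
  //   StubRoutines::call_stub()(
  //       (address)&link,             // c_rarg0: call wrapper
  //       result_val_address,         // c_rarg1: result
  //       result_type,                // c_rarg2: BasicType
  //       method(),                   // c_rarg3: Method*
  //       entry_point,                // c_rarg4: interpreter entry
  //       args->parameters(),         // c_rarg5: intptr_t* params
  //       args->size_of_parameters(), // c_rarg6: count in words
  //       CHECK);                     // c_rarg7: thread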

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }
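  // n.b. a worked example of the thresholds above, assuming a CPU
  // with zva_length == 64 bytes: the DC ZVA path is taken only when
  // the count is at least max(2 * 64, BlockZeroingLowLimit) bytes
  // (the subs compares against low_limit >> 3 because cnt is in
  // 8-byte words); smaller counts fall straight through to the
  // unrolled stp loop, which clears zero_words_block_size words per
  // iteration.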

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);
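    // n.b. the loop above is software pipelined: 8 registers are
    // pre-loaded before entry, each iteration stores the previous
    // batch while loading the next, and the drain block stores the
    // final batch with no load behind it.  count was biased by -16 up
    // front (8 words in flight plus 8 more needed to iterate), so on
    // exit the low bits of count hold the 0..7 word tail handled
    // above and by the caller.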

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }
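      // n.b. when copying backwards the prefetch offset is negative;
      // an immediate prfm offset below -256 cannot be encoded, so for
      // large prefetch distances the offset is materialized in the
      // stride register and the register-offset form is used instead.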

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
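    // n.b. a worked example of the bit tests below: for a byte copy
    // (granularity == 1) we test count bits 3, 2, 1 and 0 in turn,
    // moving 8, 4, 2 and then 1 bytes; for an int copy
    // (granularity == 4) only the word and int tests remain, testing
    // count bits 1 and 0 to move 8 and then 4 bytes.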

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;
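    // n.b. the small cases below rely on loading both ends of the
    // block before storing anything: a 50 byte copy, for example,
    // takes the 33..64 path, loading 32 bytes from s and another 32
    // ending at send; the two stores overlap by 14 bytes in the
    // middle, which is harmless precisely because every load precedes
    // every store.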

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96 : 80) / granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the
          // same byte 3 times.
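          // e.g. count == 3: count/2 == 1, so we copy s[0]->d[0],
          // s[2]->d[2] (the last byte) and s[1]->d[1]; count == 1:
          // all three load/store pairs hit byte 0.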
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }
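  // n.b. the mov/orr pair above materializes 0xdeadbeefdeadbeef:
  // orr-ing the register with itself shifted left 32 duplicates the
  // 32-bit pattern into the high half.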

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen();
    DecoratorSet decorators = ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }
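  // n.b. for oop arrays the epilogue above receives the copied range
  // as [d, count] with count rewritten to an inclusive end address:
  // e.g. d == 0x1000 and 4 elements of size 8 yield count == 0x1018,
  // the address of the last element copied.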

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);
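    // n.b. the unsigned comparison above folds two cases into one
    // branch: if d < s then d - s wraps to a huge unsigned value, so
    // a forward copy is taken (safe even when the ranges overlap),
    // and if d >= s + count*size the ranges are disjoint, so a
    // forward copy is again fine; only 0 < d - s < count*size falls
    // through to the backward copy below.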

    BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen();
    DecoratorSet decorators = DECORATOR_DEFAULT;
    if (dest_uninitialized) {
      decorators |= DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);

  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
1747 1748 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1749 copied_oop, r19_klass, count_save); 1750 1751 __ align(CodeEntryAlignment); 1752 StubCodeMark mark(this, "StubRoutines", name); 1753 address start = __ pc(); 1754 1755 __ enter(); // required for proper stackwalking of RuntimeStub frame 1756 1757 #ifdef ASSERT 1758 // caller guarantees that the arrays really are different 1759 // otherwise, we would have to make conjoint checks 1760 { Label L; 1761 array_overlap_test(L, TIMES_OOP); 1762 __ stop("checkcast_copy within a single array"); 1763 __ bind(L); 1764 } 1765 #endif //ASSERT 1766 1767 // Caller of this entry point must set up the argument registers. 1768 if (entry != NULL) { 1769 *entry = __ pc(); 1770 BLOCK_COMMENT("Entry:"); 1771 } 1772 1773 // Empty array: Nothing to do. 1774 __ cbz(count, L_done); 1775 1776 __ push(RegSet::of(r18, r19, r20, r21), sp); 1777 1778 #ifdef ASSERT 1779 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1780 // The ckoff and ckval must be mutually consistent, 1781 // even though caller generates both. 1782 { Label L; 1783 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1784 __ ldrw(start_to, Address(ckval, sco_offset)); 1785 __ cmpw(ckoff, start_to); 1786 __ br(Assembler::EQ, L); 1787 __ stop("super_check_offset inconsistent"); 1788 __ bind(L); 1789 } 1790 #endif //ASSERT 1791 1792 BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen(); 1793 DecoratorSet decorators = ARRAYCOPY_CONTRAVARIANT | ARRAYCOPY_DISJOINT; 1794 bool is_oop = true; 1795 if (dest_uninitialized) { 1796 decorators |= DEST_NOT_INITIALIZED; 1797 } 1798 1799 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1800 1801 // save the original count 1802 __ mov(count_save, count); 1803 1804 // Copy from low to high addresses 1805 __ mov(start_to, to); // Save destination array start address 1806 __ b(L_load_element); 1807 1808 // ======== begin loop ======== 1809 // (Loop is rotated; its entry is L_load_element.) 1810 // Loop control: 1811 // for (; count != 0; count--) { 1812 // copied_oop = load_heap_oop(from++); 1813 // ... generate_type_check ...; 1814 // store_heap_oop(to++, copied_oop); 1815 // } 1816 __ align(OptoLoopAlignment); 1817 1818 __ BIND(L_store_element); 1819 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop 1820 __ sub(count, count, 1); 1821 __ cbz(count, L_do_card_marks); 1822 1823 // ======== loop entry is here ======== 1824 __ BIND(L_load_element); 1825 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop 1826 __ cbz(copied_oop, L_store_element); 1827 1828 __ load_klass(r19_klass, copied_oop);// query the object klass 1829 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1830 // ======== end loop ======== 1831 1832 // It was a real error; we must depend on the caller to finish the job. 1833 // Register count = remaining oops, count_orig = total oops. 1834 // Emit GC store barriers for the oops we have copied and report 1835 // their number to the caller. 
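// For example, if K == 3 oops were stored before the type check
// failed, the caller receives r0 == ~3 == -4 and recovers K as ~r0;
// r0 == -1 (~0) means no element was copied at all. This convention is
// produced by the subs/eon pair below.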
1836 1837 __ subs(count, count_save, count); // K = partially copied oop count 1838 __ eon(count, count, zr); // report (-1^K) to caller 1839 __ br(Assembler::EQ, L_done_pop); 1840 1841 __ BIND(L_do_card_marks); 1842 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1843 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs); 1844 1845 __ bind(L_done_pop); 1846 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1847 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1848 1849 __ bind(L_done); 1850 __ mov(r0, count); 1851 __ leave(); 1852 __ ret(lr); 1853 1854 return start; 1855 } 1856 1857 // Perform range checks on the proposed arraycopy. 1858 // Kills temp, but nothing else. 1859 // Also, clean the sign bits of src_pos and dst_pos. 1860 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1861 Register src_pos, // source position (c_rarg1) 1862 Register dst, // destination array oop (c_rarg2) 1863 Register dst_pos, // destination position (c_rarg3) 1864 Register length, 1865 Register temp, 1866 Label& L_failed) { 1867 BLOCK_COMMENT("arraycopy_range_checks:"); 1868 1869 assert_different_registers(rscratch1, temp); 1870 1871 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1872 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1873 __ addw(temp, length, src_pos); 1874 __ cmpw(temp, rscratch1); 1875 __ br(Assembler::HI, L_failed); 1876 1877 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1878 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1879 __ addw(temp, length, dst_pos); 1880 __ cmpw(temp, rscratch1); 1881 __ br(Assembler::HI, L_failed); 1882 1883 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1884 __ movw(src_pos, src_pos); 1885 __ movw(dst_pos, dst_pos); 1886 1887 BLOCK_COMMENT("arraycopy_range_checks done"); 1888 } 1889 1890 // These stubs get called from some dumb test routine. 1891 // I'll write them properly when they're called from 1892 // something that's actually doing something. 1893 static void fake_arraycopy_stub(address src, address dst, int count) { 1894 assert(count == 0, "huh?"); 1895 } 1896 1897 1898 // 1899 // Generate 'unsafe' array copy stub 1900 // Though just as safe as the other stubs, it takes an unscaled 1901 // size_t argument instead of an element count. 1902 // 1903 // Input: 1904 // c_rarg0 - source array address 1905 // c_rarg1 - destination array address 1906 // c_rarg2 - byte count, treated as ssize_t, can be zero 1907 // 1908 // Examines the alignment of the operands and dispatches 1909 // to a long, int, short, or byte copy loop.
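// In C terms, the dispatch below is roughly (a sketch, not the
// generated code):
//
//   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
//   if ((bits & (BytesPerLong - 1)) == 0)     long_copy (s, d, count >> 3);
//   else if ((bits & (BytesPerInt - 1)) == 0) int_copy  (s, d, count >> 2);
//   else if ((bits & 1) == 0)                 short_copy(s, d, count >> 1);
//   else                                      byte_copy (s, d, count);
//
// i.e. the widest element size that divides the source address, the
// destination address and the byte count wins.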
1910 // 1911 address generate_unsafe_copy(const char *name, 1912 address byte_copy_entry, 1913 address short_copy_entry, 1914 address int_copy_entry, 1915 address long_copy_entry) { 1916 Label L_long_aligned, L_int_aligned, L_short_aligned; 1917 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1918 1919 __ align(CodeEntryAlignment); 1920 StubCodeMark mark(this, "StubRoutines", name); 1921 address start = __ pc(); 1922 __ enter(); // required for proper stackwalking of RuntimeStub frame 1923 1924 // bump this on entry, not on exit: 1925 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1926 1927 __ orr(rscratch1, s, d); 1928 __ orr(rscratch1, rscratch1, count); 1929 1930 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1931 __ cbz(rscratch1, L_long_aligned); 1932 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1933 __ cbz(rscratch1, L_int_aligned); 1934 __ tbz(rscratch1, 0, L_short_aligned); 1935 __ b(RuntimeAddress(byte_copy_entry)); 1936 1937 __ BIND(L_short_aligned); 1938 __ lsr(count, count, LogBytesPerShort); // size => short_count 1939 __ b(RuntimeAddress(short_copy_entry)); 1940 __ BIND(L_int_aligned); 1941 __ lsr(count, count, LogBytesPerInt); // size => int_count 1942 __ b(RuntimeAddress(int_copy_entry)); 1943 __ BIND(L_long_aligned); 1944 __ lsr(count, count, LogBytesPerLong); // size => long_count 1945 __ b(RuntimeAddress(long_copy_entry)); 1946 1947 return start; 1948 } 1949 1950 // 1951 // Generate generic array copy stubs 1952 // 1953 // Input: 1954 // c_rarg0 - src oop 1955 // c_rarg1 - src_pos (32-bits) 1956 // c_rarg2 - dst oop 1957 // c_rarg3 - dst_pos (32-bits) 1958 // c_rarg4 - element count (32-bits) 1959 // 1960 // Output: 1961 // r0 == 0 - success 1962 // r0 == -1^K - failure, where K is partial transfer count 1963 // 1964 address generate_generic_copy(const char *name, 1965 address byte_copy_entry, address short_copy_entry, 1966 address int_copy_entry, address oop_copy_entry, 1967 address long_copy_entry, address checkcast_copy_entry) { 1968 1969 Label L_failed, L_failed_0, L_objArray; 1970 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1971 1972 // Input registers 1973 const Register src = c_rarg0; // source array oop 1974 const Register src_pos = c_rarg1; // source position 1975 const Register dst = c_rarg2; // destination array oop 1976 const Register dst_pos = c_rarg3; // destination position 1977 const Register length = c_rarg4; 1978 1979 StubCodeMark mark(this, "StubRoutines", name); 1980 1981 __ align(CodeEntryAlignment); 1982 address start = __ pc(); 1983 1984 __ enter(); // required for proper stackwalking of RuntimeStub frame 1985 1986 // bump this on entry, not on exit: 1987 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1988 1989 //----------------------------------------------------------------------- 1990 // Assembler stub will be used for this call to arraycopy 1991 // if the following conditions are met: 1992 // 1993 // (1) src and dst must not be null. 1994 // (2) src_pos must not be negative. 1995 // (3) dst_pos must not be negative. 1996 // (4) length must not be negative. 1997 // (5) src klass and dst klass should be the same and not NULL. 1998 // (6) src and dst should be arrays. 1999 // (7) src_pos + length must not exceed length of src. 2000 // (8) dst_pos + length must not exceed length of dst. 2001 // 2002 2003 // if (src == NULL) return -1; 2004 __ cbz(src, L_failed); 2005 2006 // if (src_pos < 0) return -1; 2007 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2008 2009 // if (dst == NULL) return -1; 2010 __ cbz(dst, L_failed); 2011 2012 // if (dst_pos < 0) return -1; 2013 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2014 2015 // registers used as temp 2016 const Register scratch_length = r16; // elements count to copy 2017 const Register scratch_src_klass = r17; // array klass 2018 const Register lh = r18; // layout helper 2019 2020 // if (length < 0) return -1; 2021 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2022 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2023 2024 __ load_klass(scratch_src_klass, src); 2025 #ifdef ASSERT 2026 // assert(src->klass() != NULL); 2027 { 2028 BLOCK_COMMENT("assert klasses not null {"); 2029 Label L1, L2; 2030 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2031 __ bind(L1); 2032 __ stop("broken null klass"); 2033 __ bind(L2); 2034 __ load_klass(rscratch1, dst); 2035 __ cbz(rscratch1, L1); // this would be broken also 2036 BLOCK_COMMENT("} assert klasses not null done"); 2037 } 2038 #endif 2039 2040 // Load layout helper (32-bits) 2041 // 2042 // |array_tag| | header_size | element_type | |log2_element_size| 2043 // 32 30 24 16 8 2 0 2044 // 2045 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2046 // 2047 2048 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2049 2050 // Handle objArrays completely differently... 2051 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2052 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2053 __ movw(rscratch1, objArray_lh); 2054 __ eorw(rscratch2, lh, rscratch1); 2055 __ cbzw(rscratch2, L_objArray); 2056 2057 // if (src->klass() != dst->klass()) return -1; 2058 __ load_klass(rscratch2, dst); 2059 __ eor(rscratch2, rscratch2, scratch_src_klass); 2060 __ cbnz(rscratch2, L_failed); 2061 2062 // if (!src->is_Array()) return -1; 2063 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2064 2065 // At this point, it is known to be a typeArray (array_tag 0x3). 2066 #ifdef ASSERT 2067 { 2068 BLOCK_COMMENT("assert primitive array {"); 2069 Label L; 2070 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2071 __ cmpw(lh, rscratch2); 2072 __ br(Assembler::GE, L); 2073 __ stop("must be a primitive array"); 2074 __ bind(L); 2075 BLOCK_COMMENT("} assert primitive array done"); 2076 } 2077 #endif 2078 2079 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2080 rscratch2, L_failed); 2081 2082 // TypeArrayKlass 2083 // 2084 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2085 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2086 // 2087 2088 const Register rscratch1_offset = rscratch1; // array offset 2089 const Register r18_elsize = lh; // element size 2090 2091 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2092 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2093 __ add(src, src, rscratch1_offset); // src array offset 2094 __ add(dst, dst, rscratch1_offset); // dst array offset 2095 BLOCK_COMMENT("choose copy loop based on element size"); 2096 2097 // next registers should be set before the jump to corresponding stub 2098 const Register from = c_rarg0; // source array address 2099 const Register to = c_rarg1; // destination array address 2100 const Register count = c_rarg2; // elements count 2101 2102 // 'from', 'to', 'count' registers should be set in such order 2103 // since they are the same as 'src', 'src_pos', 'dst'. 
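// In effect the dispatch below performs (sketch only):
//
//   switch (lh & Klass::_lh_log2_element_size_mask) { // log2(elsize), 0..3
//     case 0: byte_copy (from, to, count); break;
//     case 1: short_copy(from, to, count); break;
//     case 2: int_copy  (from, to, count); break;
//     case 3: long_copy (from, to, count); break;
//   }
//
// realized as two bit tests (tbnz on bits 1 and 0 of the element size)
// rather than a jump table.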
2104 2105 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2106 2107 // The possible values of elsize are 0-3, i.e. exact_log2(element 2108 // size in bytes). We do a simple bitwise binary search. 2109 __ BIND(L_copy_bytes); 2110 __ tbnz(r18_elsize, 1, L_copy_ints); 2111 __ tbnz(r18_elsize, 0, L_copy_shorts); 2112 __ lea(from, Address(src, src_pos));// src_addr 2113 __ lea(to, Address(dst, dst_pos));// dst_addr 2114 __ movw(count, scratch_length); // length 2115 __ b(RuntimeAddress(byte_copy_entry)); 2116 2117 __ BIND(L_copy_shorts); 2118 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2119 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2120 __ movw(count, scratch_length); // length 2121 __ b(RuntimeAddress(short_copy_entry)); 2122 2123 __ BIND(L_copy_ints); 2124 __ tbnz(r18_elsize, 0, L_copy_longs); 2125 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2126 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2127 __ movw(count, scratch_length); // length 2128 __ b(RuntimeAddress(int_copy_entry)); 2129 2130 __ BIND(L_copy_longs); 2131 #ifdef ASSERT 2132 { 2133 BLOCK_COMMENT("assert long copy {"); 2134 Label L; 2135 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2136 __ cmpw(r18_elsize, LogBytesPerLong); 2137 __ br(Assembler::EQ, L); 2138 __ stop("must be long copy, but elsize is wrong"); 2139 __ bind(L); 2140 BLOCK_COMMENT("} assert long copy done"); 2141 } 2142 #endif 2143 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2144 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2145 __ movw(count, scratch_length); // length 2146 __ b(RuntimeAddress(long_copy_entry)); 2147 2148 // ObjArrayKlass 2149 __ BIND(L_objArray); 2150 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2151 2152 Label L_plain_copy, L_checkcast_copy; 2153 // test array classes for subtyping 2154 __ load_klass(r18, dst); 2155 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2156 __ br(Assembler::NE, L_checkcast_copy); 2157 2158 // Identically typed arrays can be copied without element-wise checks. 2159 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2160 rscratch2, L_failed); 2161 2162 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2163 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2164 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2165 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2166 __ movw(count, scratch_length); // length 2167 __ BIND(L_plain_copy); 2168 __ b(RuntimeAddress(oop_copy_entry)); 2169 2170 __ BIND(L_checkcast_copy); 2171 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2172 { 2173 // Before looking at dst.length, make sure dst is also an objArray. 2174 __ ldrw(rscratch1, Address(r18, lh_offset)); 2175 __ movw(rscratch2, objArray_lh); 2176 __ eorw(rscratch1, rscratch1, rscratch2); 2177 __ cbnzw(rscratch1, L_failed); 2178 2179 // It is safe to examine both src.length and dst.length. 2180 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2181 r18, L_failed); 2182 2183 const Register rscratch2_dst_klass = rscratch2; 2184 __ load_klass(rscratch2_dst_klass, dst); // reload 2185 2186 // Marshal the base address arguments now, freeing registers. 
2187 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2188 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2189 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2190 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2191 __ movw(count, length); // length (reloaded) 2192 Register sco_temp = c_rarg3; // this register is free now 2193 assert_different_registers(from, to, count, sco_temp, 2194 rscratch2_dst_klass, scratch_src_klass); 2195 // assert_clean_int(count, sco_temp); 2196 2197 // Generate the type check. 2198 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2199 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2200 // assert_clean_int(sco_temp, r18); 2201 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2202 2203 // Fetch destination element klass from the ObjArrayKlass header. 2204 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2205 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2206 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2207 2208 // the checkcast_copy loop needs two extra arguments: 2209 assert(c_rarg3 == sco_temp, "#3 already in place"); 2210 // Set up arguments for checkcast_copy_entry. 2211 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2212 __ b(RuntimeAddress(checkcast_copy_entry)); 2213 } 2214 2215 __ BIND(L_failed); 2216 __ mov(r0, -1); 2217 __ leave(); // required for proper stackwalking of RuntimeStub frame 2218 __ ret(lr); 2219 2220 return start; 2221 } 2222 2223 // 2224 // Generate stub for array fill. If "aligned" is true, the 2225 // "to" address is assumed to be heapword aligned. 2226 // 2227 // Arguments for generated stub: 2228 // to: c_rarg0 2229 // value: c_rarg1 2230 // count: c_rarg2 treated as signed 2231 // 2232 address generate_fill(BasicType t, bool aligned, const char *name) { 2233 __ align(CodeEntryAlignment); 2234 StubCodeMark mark(this, "StubRoutines", name); 2235 address start = __ pc(); 2236 2237 BLOCK_COMMENT("Entry:"); 2238 2239 const Register to = c_rarg0; // destination array address 2240 const Register value = c_rarg1; // value 2241 const Register count = c_rarg2; // elements count 2242 2243 const Register bz_base = r10; // base for block_zero routine 2244 const Register cnt_words = r11; // temp register 2245 2246 __ enter(); 2247 2248 Label L_fill_elements, L_exit1; 2249 2250 int shift = -1; 2251 switch (t) { 2252 case T_BYTE: 2253 shift = 0; 2254 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2255 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2256 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2257 __ br(Assembler::LO, L_fill_elements); 2258 break; 2259 case T_SHORT: 2260 shift = 1; 2261 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2262 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2263 __ br(Assembler::LO, L_fill_elements); 2264 break; 2265 case T_INT: 2266 shift = 2; 2267 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2268 __ br(Assembler::LO, L_fill_elements); 2269 break; 2270 default: ShouldNotReachHere(); 2271 } 2272 2273 // Align the destination address at an 8-byte boundary. 2274 Label L_skip_align1, L_skip_align2, L_skip_align4; 2275 if (!aligned) { 2276 switch (t) { 2277 case T_BYTE: 2278 // One byte misalignment happens only for byte arrays.
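// Worked example (sketch): filling a byte array whose start address
// ends in 0x5. Bit 0 is set, so one byte is stored (address -> ...6);
// bit 1 is then set, so a halfword is stored (address -> ...8); bit 2
// is now clear, so the word step is skipped and the address is 8-byte
// aligned. Each step also deducts the elements consumed from 'count'.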
2279 __ tbz(to, 0, L_skip_align1); 2280 __ strb(value, Address(__ post(to, 1))); 2281 __ subw(count, count, 1); 2282 __ bind(L_skip_align1); 2283 // Fallthrough 2284 case T_SHORT: 2285 // Two-byte misalignment happens only for byte and short (char) arrays. 2286 __ tbz(to, 1, L_skip_align2); 2287 __ strh(value, Address(__ post(to, 2))); 2288 __ subw(count, count, 2 >> shift); 2289 __ bind(L_skip_align2); 2290 // Fallthrough 2291 case T_INT: 2292 // Align to 8 bytes; we know we are 4-byte aligned to start. 2293 __ tbz(to, 2, L_skip_align4); 2294 __ strw(value, Address(__ post(to, 4))); 2295 __ subw(count, count, 4 >> shift); 2296 __ bind(L_skip_align4); 2297 break; 2298 default: ShouldNotReachHere(); 2299 } 2300 } 2301 2302 // 2303 // Fill large chunks 2304 // 2305 __ lsrw(cnt_words, count, 3 - shift); // number of words 2306 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2307 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2308 if (UseBlockZeroing) { 2309 Label non_block_zeroing, rest; 2310 // If the fill value is zero we can use the fast zero_words(). 2311 __ cbnz(value, non_block_zeroing); 2312 __ mov(bz_base, to); 2313 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2314 __ zero_words(bz_base, cnt_words); 2315 __ b(rest); 2316 __ bind(non_block_zeroing); 2317 __ fill_words(to, cnt_words, value); 2318 __ bind(rest); 2319 } else { 2320 __ fill_words(to, cnt_words, value); 2321 } 2322 2323 // Remaining count is less than 8 bytes. Fill it by a single store. 2324 // Note that the total length is no less than 8 bytes. 2325 if (t == T_BYTE || t == T_SHORT) { 2326 Label L_exit1; 2327 __ cbzw(count, L_exit1); 2328 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2329 __ str(value, Address(to, -8)); // overwrite some elements 2330 __ bind(L_exit1); 2331 __ leave(); 2332 __ ret(lr); 2333 } 2334 2335 // Handle fills of less than 8 bytes.
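// For T_BYTE the element path below decomposes the remaining count
// (0..7) into its bits; in C terms (a sketch, not the generated code):
//
//   if (count & 1) { *(u1*)p = v; p += 1; }
//   if (count & 2) { *(u2*)p = v; p += 2; }
//   if (count & 4) { *(u4*)p = v; }
//
// so at most three stores cover any residue. T_SHORT and T_INT use the
// same idea with fewer bits; 'value' was already replicated into the
// wider lanes by the bfi instructions at entry.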
2336 Label L_fill_2, L_fill_4, L_exit2; 2337 __ bind(L_fill_elements); 2338 switch (t) { 2339 case T_BYTE: 2340 __ tbz(count, 0, L_fill_2); 2341 __ strb(value, Address(__ post(to, 1))); 2342 __ bind(L_fill_2); 2343 __ tbz(count, 1, L_fill_4); 2344 __ strh(value, Address(__ post(to, 2))); 2345 __ bind(L_fill_4); 2346 __ tbz(count, 2, L_exit2); 2347 __ strw(value, Address(to)); 2348 break; 2349 case T_SHORT: 2350 __ tbz(count, 0, L_fill_4); 2351 __ strh(value, Address(__ post(to, 2))); 2352 __ bind(L_fill_4); 2353 __ tbz(count, 1, L_exit2); 2354 __ strw(value, Address(to)); 2355 break; 2356 case T_INT: 2357 __ cbzw(count, L_exit2); 2358 __ strw(value, Address(to)); 2359 break; 2360 default: ShouldNotReachHere(); 2361 } 2362 __ bind(L_exit2); 2363 __ leave(); 2364 __ ret(lr); 2365 return start; 2366 } 2367 2368 void generate_arraycopy_stubs() { 2369 address entry; 2370 address entry_jbyte_arraycopy; 2371 address entry_jshort_arraycopy; 2372 address entry_jint_arraycopy; 2373 address entry_oop_arraycopy; 2374 address entry_jlong_arraycopy; 2375 address entry_checkcast_arraycopy; 2376 2377 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2378 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2379 2380 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2381 2382 //*** jbyte 2383 // Always need aligned and unaligned versions 2384 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2385 "jbyte_disjoint_arraycopy"); 2386 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2387 &entry_jbyte_arraycopy, 2388 "jbyte_arraycopy"); 2389 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2390 "arrayof_jbyte_disjoint_arraycopy"); 2391 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2392 "arrayof_jbyte_arraycopy"); 2393 2394 //*** jshort 2395 // Always need aligned and unaligned versions 2396 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2397 "jshort_disjoint_arraycopy"); 2398 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2399 &entry_jshort_arraycopy, 2400 "jshort_arraycopy"); 2401 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2402 "arrayof_jshort_disjoint_arraycopy"); 2403 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2404 "arrayof_jshort_arraycopy"); 2405 2406 //*** jint 2407 // Aligned versions 2408 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2409 "arrayof_jint_disjoint_arraycopy"); 2410 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2411 "arrayof_jint_arraycopy"); 2412 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2413 // entry_jint_arraycopy always points to the unaligned version 2414 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2415 "jint_disjoint_arraycopy"); 2416 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2417 &entry_jint_arraycopy, 2418 "jint_arraycopy"); 2419 2420 //*** jlong 2421 // It is always aligned 2422 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2423 "arrayof_jlong_disjoint_arraycopy"); 2424 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2425 "arrayof_jlong_arraycopy"); 2426 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2427 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2428 2429 //*** oops 2430 { 2431 // With compressed oops we need unaligned versions; notice that 2432 // we overwrite entry_oop_arraycopy. 2433 bool aligned = !UseCompressedOops; 2434 2435 StubRoutines::_arrayof_oop_disjoint_arraycopy 2436 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2437 /*dest_uninitialized*/false); 2438 StubRoutines::_arrayof_oop_arraycopy 2439 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2440 /*dest_uninitialized*/false); 2441 // Aligned versions without pre-barriers 2442 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2443 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2444 /*dest_uninitialized*/true); 2445 StubRoutines::_arrayof_oop_arraycopy_uninit 2446 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2447 /*dest_uninitialized*/true); 2448 } 2449 2450 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2451 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2452 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2453 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2454 2455 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2456 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2457 /*dest_uninitialized*/true); 2458 2459 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2460 entry_jbyte_arraycopy, 2461 entry_jshort_arraycopy, 2462 entry_jint_arraycopy, 2463 entry_jlong_arraycopy); 2464 2465 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2466 entry_jbyte_arraycopy, 2467 entry_jshort_arraycopy, 2468 entry_jint_arraycopy, 2469 entry_oop_arraycopy, 2470 entry_jlong_arraycopy, 2471 entry_checkcast_arraycopy); 2472 2473 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2474 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2475 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2476 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2477 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2478 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2479 } 2480 2481 void generate_math_stubs() { Unimplemented(); } 2482 2483 // Arguments: 2484 // 2485 // Inputs: 2486 // c_rarg0 - source byte array address 2487 // c_rarg1 - destination 
byte array address 2488 // c_rarg2 - K (key) in little endian int array 2489 // 2490 address generate_aescrypt_encryptBlock() { 2491 __ align(CodeEntryAlignment); 2492 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2493 2494 Label L_doLast; 2495 2496 const Register from = c_rarg0; // source array address 2497 const Register to = c_rarg1; // destination array address 2498 const Register key = c_rarg2; // key array address 2499 const Register keylen = rscratch1; 2500 2501 address start = __ pc(); 2502 __ enter(); 2503 2504 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2505 2506 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2507 2508 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2509 __ rev32(v1, __ T16B, v1); 2510 __ rev32(v2, __ T16B, v2); 2511 __ rev32(v3, __ T16B, v3); 2512 __ rev32(v4, __ T16B, v4); 2513 __ aese(v0, v1); 2514 __ aesmc(v0, v0); 2515 __ aese(v0, v2); 2516 __ aesmc(v0, v0); 2517 __ aese(v0, v3); 2518 __ aesmc(v0, v0); 2519 __ aese(v0, v4); 2520 __ aesmc(v0, v0); 2521 2522 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2523 __ rev32(v1, __ T16B, v1); 2524 __ rev32(v2, __ T16B, v2); 2525 __ rev32(v3, __ T16B, v3); 2526 __ rev32(v4, __ T16B, v4); 2527 __ aese(v0, v1); 2528 __ aesmc(v0, v0); 2529 __ aese(v0, v2); 2530 __ aesmc(v0, v0); 2531 __ aese(v0, v3); 2532 __ aesmc(v0, v0); 2533 __ aese(v0, v4); 2534 __ aesmc(v0, v0); 2535 2536 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2537 __ rev32(v1, __ T16B, v1); 2538 __ rev32(v2, __ T16B, v2); 2539 2540 __ cmpw(keylen, 44); 2541 __ br(Assembler::EQ, L_doLast); 2542 2543 __ aese(v0, v1); 2544 __ aesmc(v0, v0); 2545 __ aese(v0, v2); 2546 __ aesmc(v0, v0); 2547 2548 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2549 __ rev32(v1, __ T16B, v1); 2550 __ rev32(v2, __ T16B, v2); 2551 2552 __ cmpw(keylen, 52); 2553 __ br(Assembler::EQ, L_doLast); 2554 2555 __ aese(v0, v1); 2556 __ aesmc(v0, v0); 2557 __ aese(v0, v2); 2558 __ aesmc(v0, v0); 2559 2560 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2561 __ rev32(v1, __ T16B, v1); 2562 __ rev32(v2, __ T16B, v2); 2563 2564 __ BIND(L_doLast); 2565 2566 __ aese(v0, v1); 2567 __ aesmc(v0, v0); 2568 __ aese(v0, v2); 2569 2570 __ ld1(v1, __ T16B, key); 2571 __ rev32(v1, __ T16B, v1); 2572 __ eor(v0, __ T16B, v0, v1); 2573 2574 __ st1(v0, __ T16B, to); 2575 2576 __ mov(r0, 0); 2577 2578 __ leave(); 2579 __ ret(lr); 2580 2581 return start; 2582 } 2583 2584 // Arguments: 2585 // 2586 // Inputs: 2587 // c_rarg0 - source byte array address 2588 // c_rarg1 - destination byte array address 2589 // c_rarg2 - K (key) in little endian int array 2590 // 2591 address generate_aescrypt_decryptBlock() { 2592 assert(UseAES, "need AES instructions and misaligned SSE support"); 2593 __ align(CodeEntryAlignment); 2594 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2595 Label L_doLast; 2596 2597 const Register from = c_rarg0; // source array address 2598 const Register to = c_rarg1; // destination array address 2599 const Register key = c_rarg2; // key array address 2600 const Register keylen = rscratch1; 2601 2602 address start = __ pc(); 2603 __ enter(); // required for proper stackwalking of RuntimeStub frame 2604 2605 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2606 2607 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2608 2609 __ ld1(v5, __ T16B, __ post(key, 16)); 2610 __ rev32(v5, __ T16B, v5); 2611 2612 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2613 __ rev32(v1, __ T16B, v1); 2614 __ rev32(v2, __ T16B, v2); 2615 __ rev32(v3, __ T16B, v3); 2616 __ rev32(v4, __ T16B, v4); 2617 __ aesd(v0, v1); 2618 __ aesimc(v0, v0); 2619 __ aesd(v0, v2); 2620 __ aesimc(v0, v0); 2621 __ aesd(v0, v3); 2622 __ aesimc(v0, v0); 2623 __ aesd(v0, v4); 2624 __ aesimc(v0, v0); 2625 2626 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2627 __ rev32(v1, __ T16B, v1); 2628 __ rev32(v2, __ T16B, v2); 2629 __ rev32(v3, __ T16B, v3); 2630 __ rev32(v4, __ T16B, v4); 2631 __ aesd(v0, v1); 2632 __ aesimc(v0, v0); 2633 __ aesd(v0, v2); 2634 __ aesimc(v0, v0); 2635 __ aesd(v0, v3); 2636 __ aesimc(v0, v0); 2637 __ aesd(v0, v4); 2638 __ aesimc(v0, v0); 2639 2640 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2641 __ rev32(v1, __ T16B, v1); 2642 __ rev32(v2, __ T16B, v2); 2643 2644 __ cmpw(keylen, 44); 2645 __ br(Assembler::EQ, L_doLast); 2646 2647 __ aesd(v0, v1); 2648 __ aesimc(v0, v0); 2649 __ aesd(v0, v2); 2650 __ aesimc(v0, v0); 2651 2652 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2653 __ rev32(v1, __ T16B, v1); 2654 __ rev32(v2, __ T16B, v2); 2655 2656 __ cmpw(keylen, 52); 2657 __ br(Assembler::EQ, L_doLast); 2658 2659 __ aesd(v0, v1); 2660 __ aesimc(v0, v0); 2661 __ aesd(v0, v2); 2662 __ aesimc(v0, v0); 2663 2664 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2665 __ rev32(v1, __ T16B, v1); 2666 __ rev32(v2, __ T16B, v2); 2667 2668 __ BIND(L_doLast); 2669 2670 __ aesd(v0, v1); 2671 __ aesimc(v0, v0); 2672 __ aesd(v0, v2); 2673 2674 __ eor(v0, __ T16B, v0, v5); 2675 2676 __ st1(v0, __ T16B, to); 2677 2678 __ mov(r0, 0); 2679 2680 __ leave(); 2681 __ ret(lr); 2682 2683 return start; 2684 } 2685 2686 // Arguments: 2687 // 2688 // Inputs: 2689 // c_rarg0 - source byte array address 2690 // c_rarg1 - destination byte array address 2691 // c_rarg2 - K (key) in little endian int array 2692 // c_rarg3 - r vector byte array address 2693 // c_rarg4 - input length 2694 // 2695 // Output: 2696 // x0 - input length 2697 // 2698 address generate_cipherBlockChaining_encryptAESCrypt() { 2699 assert(UseAES, "need AES instructions and misaligned SSE support"); 2700 __ align(CodeEntryAlignment); 2701 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2702 2703 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2704 2705 const Register from = c_rarg0; // source array address 2706 const Register to = c_rarg1; // destination array address 2707 const Register key = c_rarg2; // key array address 2708 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2709 // and left with the results of the last encryption block 2710 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2711 const Register keylen = rscratch1; 2712 2713 address start = __ pc(); 2714 2715 __ enter(); 2716 2717 __ movw(rscratch2, len_reg); 2718 2719 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2720 2721 __ ld1(v0, __ T16B, rvec); 2722 2723 __ cmpw(keylen, 52); 2724 __ br(Assembler::CC, L_loadkeys_44); 2725 __ br(Assembler::EQ, L_loadkeys_52); 2726 2727 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2728 __ rev32(v17, __ T16B, v17); 2729 __ rev32(v18, __ T16B, v18); 2730 __ BIND(L_loadkeys_52); 2731 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2732 __ rev32(v19, __ T16B, v19); 2733 __ rev32(v20, __ T16B, v20); 2734 __ BIND(L_loadkeys_44); 2735 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2736 __ rev32(v21, __ 
T16B, v21); 2737 __ rev32(v22, __ T16B, v22); 2738 __ rev32(v23, __ T16B, v23); 2739 __ rev32(v24, __ T16B, v24); 2740 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2741 __ rev32(v25, __ T16B, v25); 2742 __ rev32(v26, __ T16B, v26); 2743 __ rev32(v27, __ T16B, v27); 2744 __ rev32(v28, __ T16B, v28); 2745 __ ld1(v29, v30, v31, __ T16B, key); 2746 __ rev32(v29, __ T16B, v29); 2747 __ rev32(v30, __ T16B, v30); 2748 __ rev32(v31, __ T16B, v31); 2749 2750 __ BIND(L_aes_loop); 2751 __ ld1(v1, __ T16B, __ post(from, 16)); 2752 __ eor(v0, __ T16B, v0, v1); 2753 2754 __ br(Assembler::CC, L_rounds_44); 2755 __ br(Assembler::EQ, L_rounds_52); 2756 2757 __ aese(v0, v17); __ aesmc(v0, v0); 2758 __ aese(v0, v18); __ aesmc(v0, v0); 2759 __ BIND(L_rounds_52); 2760 __ aese(v0, v19); __ aesmc(v0, v0); 2761 __ aese(v0, v20); __ aesmc(v0, v0); 2762 __ BIND(L_rounds_44); 2763 __ aese(v0, v21); __ aesmc(v0, v0); 2764 __ aese(v0, v22); __ aesmc(v0, v0); 2765 __ aese(v0, v23); __ aesmc(v0, v0); 2766 __ aese(v0, v24); __ aesmc(v0, v0); 2767 __ aese(v0, v25); __ aesmc(v0, v0); 2768 __ aese(v0, v26); __ aesmc(v0, v0); 2769 __ aese(v0, v27); __ aesmc(v0, v0); 2770 __ aese(v0, v28); __ aesmc(v0, v0); 2771 __ aese(v0, v29); __ aesmc(v0, v0); 2772 __ aese(v0, v30); 2773 __ eor(v0, __ T16B, v0, v31); 2774 2775 __ st1(v0, __ T16B, __ post(to, 16)); 2776 2777 __ subw(len_reg, len_reg, 16); 2778 __ cbnzw(len_reg, L_aes_loop); 2779 2780 __ st1(v0, __ T16B, rvec); 2781 2782 __ mov(r0, rscratch2); 2783 2784 __ leave(); 2785 __ ret(lr); 2786 2787 return start; 2788 } 2789 2790 // Arguments: 2791 // 2792 // Inputs: 2793 // c_rarg0 - source byte array address 2794 // c_rarg1 - destination byte array address 2795 // c_rarg2 - K (key) in little endian int array 2796 // c_rarg3 - r vector byte array address 2797 // c_rarg4 - input length 2798 // 2799 // Output: 2800 // r0 - input length 2801 // 2802 address generate_cipherBlockChaining_decryptAESCrypt() { 2803 assert(UseAES, "need AES instructions and misaligned SSE support"); 2804 __ align(CodeEntryAlignment); 2805 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2806 2807 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2808 2809 const Register from = c_rarg0; // source array address 2810 const Register to = c_rarg1; // destination array address 2811 const Register key = c_rarg2; // key array address 2812 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2813 // and left with the results of the last encryption block 2814 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2815 const Register keylen = rscratch1; 2816 2817 address start = __ pc(); 2818 2819 __ enter(); 2820 2821 __ movw(rscratch2, len_reg); 2822 2823 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2824 2825 __ ld1(v2, __ T16B, rvec); 2826 2827 __ ld1(v31, __ T16B, __ post(key, 16)); 2828 __ rev32(v31, __ T16B, v31); 2829 2830 __ cmpw(keylen, 52); 2831 __ br(Assembler::CC, L_loadkeys_44); 2832 __ br(Assembler::EQ, L_loadkeys_52); 2833 2834 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2835 __ rev32(v17, __ T16B, v17); 2836 __ rev32(v18, __ T16B, v18); 2837 __ BIND(L_loadkeys_52); 2838 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2839 __ rev32(v19, __ T16B, v19); 2840 __ rev32(v20, __ T16B, v20); 2841 __ BIND(L_loadkeys_44); 2842 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2843 __ rev32(v21, __ T16B, v21); 2844 
__ rev32(v22, __ T16B, v22); 2845 __ rev32(v23, __ T16B, v23); 2846 __ rev32(v24, __ T16B, v24); 2847 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2848 __ rev32(v25, __ T16B, v25); 2849 __ rev32(v26, __ T16B, v26); 2850 __ rev32(v27, __ T16B, v27); 2851 __ rev32(v28, __ T16B, v28); 2852 __ ld1(v29, v30, __ T16B, key); 2853 __ rev32(v29, __ T16B, v29); 2854 __ rev32(v30, __ T16B, v30); 2855 2856 __ BIND(L_aes_loop); 2857 __ ld1(v0, __ T16B, __ post(from, 16)); 2858 __ orr(v1, __ T16B, v0, v0); 2859 2860 __ br(Assembler::CC, L_rounds_44); 2861 __ br(Assembler::EQ, L_rounds_52); 2862 2863 __ aesd(v0, v17); __ aesimc(v0, v0); 2864 __ aesd(v0, v18); __ aesimc(v0, v0); 2865 __ BIND(L_rounds_52); 2866 __ aesd(v0, v19); __ aesimc(v0, v0); 2867 __ aesd(v0, v20); __ aesimc(v0, v0); 2868 __ BIND(L_rounds_44); 2869 __ aesd(v0, v21); __ aesimc(v0, v0); 2870 __ aesd(v0, v22); __ aesimc(v0, v0); 2871 __ aesd(v0, v23); __ aesimc(v0, v0); 2872 __ aesd(v0, v24); __ aesimc(v0, v0); 2873 __ aesd(v0, v25); __ aesimc(v0, v0); 2874 __ aesd(v0, v26); __ aesimc(v0, v0); 2875 __ aesd(v0, v27); __ aesimc(v0, v0); 2876 __ aesd(v0, v28); __ aesimc(v0, v0); 2877 __ aesd(v0, v29); __ aesimc(v0, v0); 2878 __ aesd(v0, v30); 2879 __ eor(v0, __ T16B, v0, v31); 2880 __ eor(v0, __ T16B, v0, v2); 2881 2882 __ st1(v0, __ T16B, __ post(to, 16)); 2883 __ orr(v2, __ T16B, v1, v1); 2884 2885 __ subw(len_reg, len_reg, 16); 2886 __ cbnzw(len_reg, L_aes_loop); 2887 2888 __ st1(v2, __ T16B, rvec); 2889 2890 __ mov(r0, rscratch2); 2891 2892 __ leave(); 2893 __ ret(lr); 2894 2895 return start; 2896 } 2897 2898 // Arguments: 2899 // 2900 // Inputs: 2901 // c_rarg0 - byte[] source+offset 2902 // c_rarg1 - int[] SHA.state 2903 // c_rarg2 - int offset 2904 // c_rarg3 - int limit 2905 // 2906 address generate_sha1_implCompress(bool multi_block, const char *name) { 2907 __ align(CodeEntryAlignment); 2908 StubCodeMark mark(this, "StubRoutines", name); 2909 address start = __ pc(); 2910 2911 Register buf = c_rarg0; 2912 Register state = c_rarg1; 2913 Register ofs = c_rarg2; 2914 Register limit = c_rarg3; 2915 2916 Label keys; 2917 Label sha1_loop; 2918 2919 // load the keys into v0..v3 2920 __ adr(rscratch1, keys); 2921 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2922 // load 5 words state into v6, v7 2923 __ ldrq(v6, Address(state, 0)); 2924 __ ldrs(v7, Address(state, 16)); 2925 2926 2927 __ BIND(sha1_loop); 2928 // load 64 bytes of data into v16..v19 2929 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2930 __ rev32(v16, __ T16B, v16); 2931 __ rev32(v17, __ T16B, v17); 2932 __ rev32(v18, __ T16B, v18); 2933 __ rev32(v19, __ T16B, v19); 2934 2935 // do the sha1 2936 __ addv(v4, __ T4S, v16, v0); 2937 __ orr(v20, __ T16B, v6, v6); 2938 2939 FloatRegister d0 = v16; 2940 FloatRegister d1 = v17; 2941 FloatRegister d2 = v18; 2942 FloatRegister d3 = v19; 2943 2944 for (int round = 0; round < 20; round++) { 2945 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2946 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2947 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2948 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2949 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2950 2951 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2952 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2953 __ sha1h(tmp2, __ T4S, v20); 2954 if (round < 5) 2955 __ sha1c(v20, __ T4S, tmp3, tmp4); 2956 else if (round < 10 || round >= 15) 2957 __ sha1p(v20, __ T4S, tmp3, tmp4); 2958 else 2959 __ sha1m(v20, __ T4S, tmp3, tmp4); 2960 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2961 2962 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2963 } 2964 2965 __ addv(v7, __ T2S, v7, v21); 2966 __ addv(v6, __ T4S, v6, v20); 2967 2968 if (multi_block) { 2969 __ add(ofs, ofs, 64); 2970 __ cmp(ofs, limit); 2971 __ br(Assembler::LE, sha1_loop); 2972 __ mov(c_rarg0, ofs); // return ofs 2973 } 2974 2975 __ strq(v6, Address(state, 0)); 2976 __ strs(v7, Address(state, 16)); 2977 2978 __ ret(lr); 2979 2980 __ bind(keys); 2981 __ emit_int32(0x5a827999); 2982 __ emit_int32(0x6ed9eba1); 2983 __ emit_int32(0x8f1bbcdc); 2984 __ emit_int32(0xca62c1d6); 2985 2986 return start; 2987 } 2988 2989 2990 // Arguments: 2991 // 2992 // Inputs: 2993 // c_rarg0 - byte[] source+offset 2994 // c_rarg1 - int[] SHA.state 2995 // c_rarg2 - int offset 2996 // c_rarg3 - int limit 2997 // 2998 address generate_sha256_implCompress(bool multi_block, const char *name) { 2999 static const uint32_t round_consts[64] = { 3000 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3001 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3002 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3003 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3004 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3005 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3006 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3007 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3008 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3009 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3010 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3011 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3012 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3013 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3014 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3015 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3016 }; 3017 __ align(CodeEntryAlignment); 3018 StubCodeMark mark(this, "StubRoutines", name); 3019 address start = __ pc(); 3020 3021 Register buf = c_rarg0; 3022 Register state = c_rarg1; 3023 Register ofs = c_rarg2; 3024 Register limit = c_rarg3; 3025 3026 Label sha1_loop; 3027 3028 __ stpd(v8, v9, __ pre(sp, -32)); 3029 __ stpd(v10, v11, Address(sp, 16)); 3030 3031 // dga == v0 3032 // dgb == v1 3033 // dg0 == v2 3034 // dg1 == v3 3035 // dg2 == v4 3036 // t0 == v6 3037 // t1 == v7 3038 3039 // load 16 keys to v16..v31 3040 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3041 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3042 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3043 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3044 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3045 3046 // load 8 words (256 bits) state 3047 __ ldpq(v0, v1, state); 3048 3049 __ BIND(sha1_loop); 3050 // load 64 bytes of data into v8..v11 3051 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3052 __ rev32(v8, __ T16B, v8); 3053 __ rev32(v9, __ T16B, v9); 3054 __ rev32(v10, __ T16B, v10); 3055 __ rev32(v11, __ T16B, v11); 3056 3057 __ addv(v6, __ T4S, v8, v16); 3058 __ orr(v2, __ T16B, v0, v0); 3059 __ orr(v3, __ T16B, v1, v1); 3060 3061 FloatRegister d0 = v8; 3062 FloatRegister d1 = v9; 3063 FloatRegister d2 = v10; 3064 FloatRegister d3 = v11; 3065 3066 3067 for (int round = 0; round < 16; round++) { 3068 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3069 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3070 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3071 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3072 3073 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3074 __ orr(v4, __ T16B, v2, v2); 3075 if (round < 15) 3076 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3077 __ sha256h(v2, __ T4S, v3, tmp2); 3078 __ sha256h2(v3, __ T4S, v4, tmp2); 3079 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3080 3081 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3082 } 3083 3084 __ addv(v0, __ T4S, v0, v2); 3085 __ addv(v1, __ T4S, v1, v3); 3086 3087 if (multi_block) { 3088 __ add(ofs, ofs, 64); 3089 __ cmp(ofs, limit); 3090 __ br(Assembler::LE, sha1_loop); 3091 __ mov(c_rarg0, ofs); // return ofs 3092 } 3093 3094 __ ldpd(v10, v11, Address(sp, 16)); 3095 __ ldpd(v8, v9, __ post(sp, 32)); 3096 3097 __ stpq(v0, v1, state); 3098 3099 __ ret(lr); 3100 3101 return start; 3102 } 3103 3104 #ifndef BUILTIN_SIM 3105 // Safefetch stubs. 3106 void generate_safefetch(const char* name, int size, address* entry, 3107 address* fault_pc, address* continuation_pc) { 3108 // safefetch signatures: 3109 // int SafeFetch32(int* adr, int errValue); 3110 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3111 // 3112 // arguments: 3113 // c_rarg0 = adr 3114 // c_rarg1 = errValue 3115 // 3116 // result: 3117 // r0 = *adr or errValue 3118 3119 StubCodeMark mark(this, "StubRoutines", name); 3120 3121 // Entry point, pc or function descriptor. 3122 *entry = __ pc(); 3123 3124 // Load *adr into c_rarg1, may fault.
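// If this load faults, the signal handler redirects execution to
// *continuation_pc with c_rarg1 still holding errValue, so the result
// in r0 is *adr on success and errValue on a fault. Typical use is a
// probe such as (sketch):
//
//   int v = SafeFetch32((int*)p, 0xdeadbeef);
//   if (v == 0xdeadbeef) { /* p was likely unreadable */ }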
3125 *fault_pc = __ pc(); 3126 switch (size) { 3127 case 4: 3128 // int32_t 3129 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3130 break; 3131 case 8: 3132 // int64_t 3133 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3134 break; 3135 default: 3136 ShouldNotReachHere(); 3137 } 3138 3139 // return errValue or *adr 3140 *continuation_pc = __ pc(); 3141 __ mov(r0, c_rarg1); 3142 __ ret(lr); 3143 } 3144 #endif 3145 3146 /** 3147 * Arguments: 3148 * 3149 * Inputs: 3150 * c_rarg0 - int crc 3151 * c_rarg1 - byte* buf 3152 * c_rarg2 - int length 3153 * 3154 * Output: 3155 * r0 - int crc result 3156 */ 3157 address generate_updateBytesCRC32() { 3158 assert(UseCRC32Intrinsics, "what are we doing here?"); 3159 3160 __ align(CodeEntryAlignment); 3161 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3162 3163 address start = __ pc(); 3164 3165 const Register crc = c_rarg0; // crc 3166 const Register buf = c_rarg1; // source java byte array address 3167 const Register len = c_rarg2; // length 3168 const Register table0 = c_rarg3; // crc_table address 3169 const Register table1 = c_rarg4; 3170 const Register table2 = c_rarg5; 3171 const Register table3 = c_rarg6; 3172 const Register tmp3 = c_rarg7; 3173 3174 BLOCK_COMMENT("Entry:"); 3175 __ enter(); // required for proper stackwalking of RuntimeStub frame 3176 3177 __ kernel_crc32(crc, buf, len, 3178 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3179 3180 __ leave(); // required for proper stackwalking of RuntimeStub frame 3181 __ ret(lr); 3182 3183 return start; 3184 } 3185 3186 /** 3187 * Arguments: 3188 * 3189 * Inputs: 3190 * c_rarg0 - int crc 3191 * c_rarg1 - byte* buf 3192 * c_rarg2 - int length 3193 * c_rarg3 - int* table 3194 * 3195 * Output: 3196 * r0 - int crc result 3197 */ 3198 address generate_updateBytesCRC32C() { 3199 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3200 3201 __ align(CodeEntryAlignment); 3202 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3203 3204 address start = __ pc(); 3205 3206 const Register crc = c_rarg0; // crc 3207 const Register buf = c_rarg1; // source java byte array address 3208 const Register len = c_rarg2; // length 3209 const Register table0 = c_rarg3; // crc_table address 3210 const Register table1 = c_rarg4; 3211 const Register table2 = c_rarg5; 3212 const Register table3 = c_rarg6; 3213 const Register tmp3 = c_rarg7; 3214 3215 BLOCK_COMMENT("Entry:"); 3216 __ enter(); // required for proper stackwalking of RuntimeStub frame 3217 3218 __ kernel_crc32c(crc, buf, len, 3219 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3220 3221 __ leave(); // required for proper stackwalking of RuntimeStub frame 3222 __ ret(lr); 3223 3224 return start; 3225 } 3226 3227 /** 3228 * Arguments: 3229 * 3230 * Inputs: 3231 * c_rarg0 - int adler 3232 * c_rarg1 - byte* buff 3233 * c_rarg2 - int len 3234 * 3235 * Output: 3236 * c_rarg0 - int adler result 3237 */ 3238 address generate_updateBytesAdler32() { 3239 __ align(CodeEntryAlignment); 3240 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3241 address start = __ pc(); 3242 3243 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3244 3245 // Aliases 3246 Register adler = c_rarg0; 3247 Register s1 = c_rarg0; 3248 Register s2 = c_rarg3; 3249 Register buff = c_rarg1; 3250 Register len = c_rarg2; 3251 Register nmax = r4; 3252 Register base = r5; 3253 Register count = r6; 3254 Register temp0 = rscratch1; 3255 Register temp1 = rscratch2; 3256 Register temp2 = r7;
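// Adler-32 maintains s1 = 1 + sum of bytes (mod 65521) and s2 = sum of
// the successive s1 values (mod 65521); the result is (s2 << 16) | s1.
// The reductions below rely on 2^16 == 15 (mod 65521); in C (a sketch
// of the idiom, not the generated code):
//
//   uint64_t reduce(uint64_t x) {          // x < 2^32
//     x = (x & 0xffff) + (x >> 16) * 15;   // applied twice below
//     x = (x & 0xffff) + (x >> 16) * 15;
//     return (x >= 65521) ? x - 65521 : x;
//   }
//
// with the *15 computed as (t << 4) - t by the lsl/sub pairs.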
3257 3258 // Max number of bytes we can process before having to take the mod 3259 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3260 unsigned long BASE = 0xfff1; 3261 unsigned long NMAX = 0x15B0; 3262 3263 __ mov(base, BASE); 3264 __ mov(nmax, NMAX); 3265 3266 // s1 is initialized to the lower 16 bits of adler 3267 // s2 is initialized to the upper 16 bits of adler 3268 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3269 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3270 3271 // The pipelined loop needs at least 16 elements for 1 iteration 3272 // It does check this, but it is more efficient to skip straight to the cleanup loop 3273 __ cmp(len, 16); 3274 __ br(Assembler::HS, L_nmax); 3275 __ cbz(len, L_combine); 3276 3277 __ bind(L_simple_by1_loop); 3278 __ ldrb(temp0, Address(__ post(buff, 1))); 3279 __ add(s1, s1, temp0); 3280 __ add(s2, s2, s1); 3281 __ subs(len, len, 1); 3282 __ br(Assembler::HI, L_simple_by1_loop); 3283 3284 // s1 = s1 % BASE 3285 __ subs(temp0, s1, base); 3286 __ csel(s1, temp0, s1, Assembler::HS); 3287 3288 // s2 = s2 % BASE 3289 __ lsr(temp0, s2, 16); 3290 __ lsl(temp1, temp0, 4); 3291 __ sub(temp1, temp1, temp0); 3292 __ add(s2, temp1, s2, ext::uxth); 3293 3294 __ subs(temp0, s2, base); 3295 __ csel(s2, temp0, s2, Assembler::HS); 3296 3297 __ b(L_combine); 3298 3299 __ bind(L_nmax); 3300 __ subs(len, len, nmax); 3301 __ sub(count, nmax, 16); 3302 __ br(Assembler::LO, L_by16); 3303 3304 __ bind(L_nmax_loop); 3305 3306 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3307 3308 __ add(s1, s1, temp0, ext::uxtb); 3309 __ ubfx(temp2, temp0, 8, 8); 3310 __ add(s2, s2, s1); 3311 __ add(s1, s1, temp2); 3312 __ ubfx(temp2, temp0, 16, 8); 3313 __ add(s2, s2, s1); 3314 __ add(s1, s1, temp2); 3315 __ ubfx(temp2, temp0, 24, 8); 3316 __ add(s2, s2, s1); 3317 __ add(s1, s1, temp2); 3318 __ ubfx(temp2, temp0, 32, 8); 3319 __ add(s2, s2, s1); 3320 __ add(s1, s1, temp2); 3321 __ ubfx(temp2, temp0, 40, 8); 3322 __ add(s2, s2, s1); 3323 __ add(s1, s1, temp2); 3324 __ ubfx(temp2, temp0, 48, 8); 3325 __ add(s2, s2, s1); 3326 __ add(s1, s1, temp2); 3327 __ add(s2, s2, s1); 3328 __ add(s1, s1, temp0, Assembler::LSR, 56); 3329 __ add(s2, s2, s1); 3330 3331 __ add(s1, s1, temp1, ext::uxtb); 3332 __ ubfx(temp2, temp1, 8, 8); 3333 __ add(s2, s2, s1); 3334 __ add(s1, s1, temp2); 3335 __ ubfx(temp2, temp1, 16, 8); 3336 __ add(s2, s2, s1); 3337 __ add(s1, s1, temp2); 3338 __ ubfx(temp2, temp1, 24, 8); 3339 __ add(s2, s2, s1); 3340 __ add(s1, s1, temp2); 3341 __ ubfx(temp2, temp1, 32, 8); 3342 __ add(s2, s2, s1); 3343 __ add(s1, s1, temp2); 3344 __ ubfx(temp2, temp1, 40, 8); 3345 __ add(s2, s2, s1); 3346 __ add(s1, s1, temp2); 3347 __ ubfx(temp2, temp1, 48, 8); 3348 __ add(s2, s2, s1); 3349 __ add(s1, s1, temp2); 3350 __ add(s2, s2, s1); 3351 __ add(s1, s1, temp1, Assembler::LSR, 56); 3352 __ add(s2, s2, s1); 3353 3354 __ subs(count, count, 16); 3355 __ br(Assembler::HS, L_nmax_loop); 3356 3357 // s1 = s1 % BASE 3358 __ lsr(temp0, s1, 16); 3359 __ lsl(temp1, temp0, 4); 3360 __ sub(temp1, temp1, temp0); 3361 __ add(temp1, temp1, s1, ext::uxth); 3362 3363 __ lsr(temp0, temp1, 16); 3364 __ lsl(s1, temp0, 4); 3365 __ sub(s1, s1, temp0); 3366 __ add(s1, s1, temp1, ext::uxth); 3367 3368 __ subs(temp0, s1, base); 3369 __ csel(s1, temp0, s1, Assembler::HS); 3370 3371 // s2 = s2 % BASE 3372 __ lsr(temp0, s2, 16); 3373 __ lsl(temp1, temp0, 4); 3374 __ sub(temp1, temp1, temp0); 3375 __ add(temp1, temp1, s2, ext::uxth); 3376 3377 __ lsr(temp0, temp1, 16);
3378 __ lsl(s2, temp0, 4); 3379 __ sub(s2, s2, temp0); 3380 __ add(s2, s2, temp1, ext::uxth); 3381 3382 __ subs(temp0, s2, base); 3383 __ csel(s2, temp0, s2, Assembler::HS); 3384 3385 __ subs(len, len, nmax); 3386 __ sub(count, nmax, 16); 3387 __ br(Assembler::HS, L_nmax_loop); 3388 3389 __ bind(L_by16); 3390 __ adds(len, len, count); 3391 __ br(Assembler::LO, L_by1); 3392 3393 __ bind(L_by16_loop); 3394 3395 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3396 3397 __ add(s1, s1, temp0, ext::uxtb); 3398 __ ubfx(temp2, temp0, 8, 8); 3399 __ add(s2, s2, s1); 3400 __ add(s1, s1, temp2); 3401 __ ubfx(temp2, temp0, 16, 8); 3402 __ add(s2, s2, s1); 3403 __ add(s1, s1, temp2); 3404 __ ubfx(temp2, temp0, 24, 8); 3405 __ add(s2, s2, s1); 3406 __ add(s1, s1, temp2); 3407 __ ubfx(temp2, temp0, 32, 8); 3408 __ add(s2, s2, s1); 3409 __ add(s1, s1, temp2); 3410 __ ubfx(temp2, temp0, 40, 8); 3411 __ add(s2, s2, s1); 3412 __ add(s1, s1, temp2); 3413 __ ubfx(temp2, temp0, 48, 8); 3414 __ add(s2, s2, s1); 3415 __ add(s1, s1, temp2); 3416 __ add(s2, s2, s1); 3417 __ add(s1, s1, temp0, Assembler::LSR, 56); 3418 __ add(s2, s2, s1); 3419 3420 __ add(s1, s1, temp1, ext::uxtb); 3421 __ ubfx(temp2, temp1, 8, 8); 3422 __ add(s2, s2, s1); 3423 __ add(s1, s1, temp2); 3424 __ ubfx(temp2, temp1, 16, 8); 3425 __ add(s2, s2, s1); 3426 __ add(s1, s1, temp2); 3427 __ ubfx(temp2, temp1, 24, 8); 3428 __ add(s2, s2, s1); 3429 __ add(s1, s1, temp2); 3430 __ ubfx(temp2, temp1, 32, 8); 3431 __ add(s2, s2, s1); 3432 __ add(s1, s1, temp2); 3433 __ ubfx(temp2, temp1, 40, 8); 3434 __ add(s2, s2, s1); 3435 __ add(s1, s1, temp2); 3436 __ ubfx(temp2, temp1, 48, 8); 3437 __ add(s2, s2, s1); 3438 __ add(s1, s1, temp2); 3439 __ add(s2, s2, s1); 3440 __ add(s1, s1, temp1, Assembler::LSR, 56); 3441 __ add(s2, s2, s1); 3442 3443 __ subs(len, len, 16); 3444 __ br(Assembler::HS, L_by16_loop); 3445 3446 __ bind(L_by1); 3447 __ adds(len, len, 15); 3448 __ br(Assembler::LO, L_do_mod); 3449 3450 __ bind(L_by1_loop); 3451 __ ldrb(temp0, Address(__ post(buff, 1))); 3452 __ add(s1, temp0, s1); 3453 __ add(s2, s2, s1); 3454 __ subs(len, len, 1); 3455 __ br(Assembler::HS, L_by1_loop); 3456 3457 __ bind(L_do_mod); 3458 // s1 = s1 % BASE 3459 __ lsr(temp0, s1, 16); 3460 __ lsl(temp1, temp0, 4); 3461 __ sub(temp1, temp1, temp0); 3462 __ add(temp1, temp1, s1, ext::uxth); 3463 3464 __ lsr(temp0, temp1, 16); 3465 __ lsl(s1, temp0, 4); 3466 __ sub(s1, s1, temp0); 3467 __ add(s1, s1, temp1, ext::uxth); 3468 3469 __ subs(temp0, s1, base); 3470 __ csel(s1, temp0, s1, Assembler::HS); 3471 3472 // s2 = s2 % BASE 3473 __ lsr(temp0, s2, 16); 3474 __ lsl(temp1, temp0, 4); 3475 __ sub(temp1, temp1, temp0); 3476 __ add(temp1, temp1, s2, ext::uxth); 3477 3478 __ lsr(temp0, temp1, 16); 3479 __ lsl(s2, temp0, 4); 3480 __ sub(s2, s2, temp0); 3481 __ add(s2, s2, temp1, ext::uxth); 3482 3483 __ subs(temp0, s2, base); 3484 __ csel(s2, temp0, s2, Assembler::HS); 3485 3486 // Combine lower bits and higher bits 3487 __ bind(L_combine); 3488 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3489 3490 __ ret(lr); 3491 3492 return start; 3493 } 3494 3495 /** 3496 * Arguments: 3497 * 3498 * Input: 3499 * c_rarg0 - x address 3500 * c_rarg1 - x length 3501 * c_rarg2 - y address 3502 * c_rarg3 - y length 3503 * c_rarg4 - z address 3504 * c_rarg5 - z length 3505 */ 3506 address generate_multiplyToLen() { 3507 __ align(CodeEntryAlignment); 3508 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3509 3510 address start = __ pc(); 3511 const Register x = r0;
3512 const Register xlen = r1; 3513 const Register y = r2; 3514 const Register ylen = r3; 3515 const Register z = r4; 3516 const Register zlen = r5; 3517 3518 const Register tmp1 = r10; 3519 const Register tmp2 = r11; 3520 const Register tmp3 = r12; 3521 const Register tmp4 = r13; 3522 const Register tmp5 = r14; 3523 const Register tmp6 = r15; 3524 const Register tmp7 = r16; 3525 3526 BLOCK_COMMENT("Entry:"); 3527 __ enter(); // required for proper stackwalking of RuntimeStub frame 3528 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3529 __ leave(); // required for proper stackwalking of RuntimeStub frame 3530 __ ret(lr); 3531 3532 return start; 3533 } 3534 3535 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3536 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3537 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3538 // Karatsuba multiplication performs a 128*128 -> 256-bit 3539 // multiplication in three 128-bit multiplications and a few 3540 // additions. 3541 // 3542 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3543 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3544 // 3545 // Inputs: 3546 // 3547 // A0 in a.d[0] (subkey) 3548 // A1 in a.d[1] 3549 // (A1+A0) in a1_xor_a0.d[0] 3550 // 3551 // B0 in b.d[0] (state) 3552 // B1 in b.d[1] 3553 3554 __ ext(tmp1, __ T16B, b, b, 0x08); 3555 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3556 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3557 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3558 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3559 3560 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3561 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3562 __ eor(tmp2, __ T16B, tmp2, tmp4); 3563 __ eor(tmp2, __ T16B, tmp2, tmp3); 3564 3565 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3566 __ ins(result_hi, __ D, tmp2, 0, 1); 3567 __ ins(result_lo, __ D, tmp2, 1, 0); 3568 } 3569 3570 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3571 FloatRegister p, FloatRegister z, FloatRegister t1) { 3572 const FloatRegister t0 = result; 3573 3574 // The GCM field polynomial f is z^128 + p(z), where p = 3575 // z^7+z^2+z+1. 3576 // 3577 // z^128 === -p(z) (mod (z^128 + p(z))) 3578 // 3579 // so, given that the product we're reducing is 3580 // a == lo + hi * z^128 3581 // substituting, 3582 // === lo - hi * p(z) (mod (z^128 + p(z))) 3583 // 3584 // we reduce by multiplying hi by p(z) and subtracting the result 3585 // from (i.e. XORing it with) lo. Because p has no nonzero high 3586 // bits we can do this with two 64-bit multiplications, lo*p and 3587 // hi*p. 
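  // As a rough C-level sketch of the instruction sequence below
  // (an illustration only, not the generated code: clmul() stands for a
  // hypothetical 64x64 -> 128-bit carry-less multiply, .d[0]/.d[1] are
  // the low/high 64-bit halves of a 128-bit value, and p == 0x87):
  //
  //   u128 t = clmul(hi.d[1], p);  // fold the top 64 bits of hi
  //   hi.d[0] ^= t.d[1];           // high half of t goes back into hi
  //   lo.d[1] ^= t.d[0];           // low half of t goes into lo
  //   t = clmul(hi.d[0], p);       // fold the remaining 64 bits of hi
  //   result = lo ^ t;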
3588
3589   __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3590   __ ext(t1, __ T16B, t0, z, 8);
3591   __ eor(hi, __ T16B, hi, t1);
3592   __ ext(t1, __ T16B, z, t0, 8);
3593   __ eor(lo, __ T16B, lo, t1);
3594   __ pmull(t0, __ T1Q, hi, p, __ T1D);
3595   __ eor(result, __ T16B, lo, t0);
3596 }
3597
3598 address generate_has_negatives(address &has_negatives_long) {
3599   StubCodeMark mark(this, "StubRoutines", "has_negatives");
3600   const int large_loop_size = 64;
3601   const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3602   int dcache_line = VM_Version::dcache_line_size();
3603
3604   Register ary1 = r1, len = r2, result = r0;
3605
3606   __ align(CodeEntryAlignment);
3607   address entry = __ pc();
3608
3609   __ enter();
3610
3611   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3612       LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3613
3614   __ cmp(len, 15);
3615   __ br(Assembler::GT, LEN_OVER_15);
3616   // The only case in which execution falls into this code is when the pointer
3617   // is near the end of a memory page and we have to avoid reading the next page.
3618   __ add(ary1, ary1, len);
3619   __ subs(len, len, 8);
3620   __ br(Assembler::GT, LEN_OVER_8);
3621   __ ldr(rscratch2, Address(ary1, -8));
3622   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3623   __ lsrv(rscratch2, rscratch2, rscratch1);
3624   __ tst(rscratch2, UPPER_BIT_MASK);
3625   __ cset(result, Assembler::NE);
3626   __ leave();
3627   __ ret(lr);
3628   __ bind(LEN_OVER_8);
3629   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3630   __ sub(len, len, 8); // no data dependency, so the sub can execute while loading
3631   __ tst(rscratch2, UPPER_BIT_MASK);
3632   __ br(Assembler::NE, RET_TRUE_NO_POP);
3633   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3634   __ lsrv(rscratch1, rscratch1, rscratch2);
3635   __ tst(rscratch1, UPPER_BIT_MASK);
3636   __ cset(result, Assembler::NE);
3637   __ leave();
3638   __ ret(lr);
3639
3640   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3641   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3642
3643   has_negatives_long = __ pc(); // 2nd entry point
3644
3645   __ enter();
3646
3647   __ bind(LEN_OVER_15);
3648   __ push(spilled_regs, sp);
3649   __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3650   __ cbz(rscratch2, ALIGNED);
3651   __ ldp(tmp6, tmp1, Address(ary1));
3652   __ mov(tmp5, 16);
3653   __ sub(rscratch1, tmp5, rscratch2); // number of bytes until aligned address
3654   __ add(ary1, ary1, rscratch1);
3655   __ sub(len, len, rscratch1);
3656   __ orr(tmp6, tmp6, tmp1);
3657   __ tst(tmp6, UPPER_BIT_MASK);
3658   __ br(Assembler::NE, RET_TRUE);
3659
3660   __ bind(ALIGNED);
3661   __ cmp(len, large_loop_size);
3662   __ br(Assembler::LT, CHECK_16);
3663   // Perform a 16-byte load as an early return in the pre-loop to handle the
3664   // situation where an initially aligned large array has negative values in
3665   // its starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1
3666   // (in the worst case), which is slower. Cases with negative bytes further
3667   // ahead won't be affected much. In fact, it'll be faster due to early
3668   // loads, fewer instructions and fewer branches in LARGE_LOOP.
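  // In C, the check performed on each 64-byte block by LARGE_LOOP below is
  // approximately (an illustration only; alignment and tail handling are
  // done by the surrounding code):
  //
  //   uint64_t acc = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
  //   if (acc & 0x8080808080808080) return true; // some byte has its sign bit set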
3669   __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3670   __ sub(len, len, 16);
3671   __ orr(tmp6, tmp6, tmp1);
3672   __ tst(tmp6, UPPER_BIT_MASK);
3673   __ br(Assembler::NE, RET_TRUE);
3674   __ cmp(len, large_loop_size);
3675   __ br(Assembler::LT, CHECK_16);
3676
3677   if (SoftwarePrefetchHintDistance >= 0
3678       && SoftwarePrefetchHintDistance >= dcache_line) {
3679     // initial prefetch
3680     __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3681   }
3682   __ bind(LARGE_LOOP);
3683   if (SoftwarePrefetchHintDistance >= 0) {
3684     __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3685   }
3686   // Issue the load instructions first, since that can save a few CPU/MEM
3687   // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);"
3688   // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) + 1
3689   // cbnz(...), which saves 3 instructions per iteration and has fewer
3690   // branches, but disables early return: all 64 bytes are loaded and checked every time.
3691   __ ldp(tmp2, tmp3, Address(ary1));
3692   __ ldp(tmp4, tmp5, Address(ary1, 16));
3693   __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3694   __ ldp(tmp6, tmp1, Address(ary1, 48));
3695   __ add(ary1, ary1, large_loop_size);
3696   __ sub(len, len, large_loop_size);
3697   __ orr(tmp2, tmp2, tmp3);
3698   __ orr(tmp4, tmp4, tmp5);
3699   __ orr(rscratch1, rscratch1, rscratch2);
3700   __ orr(tmp6, tmp6, tmp1);
3701   __ orr(tmp2, tmp2, tmp4);
3702   __ orr(rscratch1, rscratch1, tmp6);
3703   __ orr(tmp2, tmp2, rscratch1);
3704   __ tst(tmp2, UPPER_BIT_MASK);
3705   __ br(Assembler::NE, RET_TRUE);
3706   __ cmp(len, large_loop_size);
3707   __ br(Assembler::GE, LARGE_LOOP);
3708
3709   __ bind(CHECK_16); // small 16-byte load pre-loop
3710   __ cmp(len, 16);
3711   __ br(Assembler::LT, POST_LOOP16);
3712
3713   __ bind(LOOP16); // small 16-byte load loop
3714   __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3715   __ sub(len, len, 16);
3716   __ orr(tmp2, tmp2, tmp3);
3717   __ tst(tmp2, UPPER_BIT_MASK);
3718   __ br(Assembler::NE, RET_TRUE);
3719   __ cmp(len, 16);
3720   __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3721
3722   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3723   __ cmp(len, 8);
3724   __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3725   __ ldr(tmp3, Address(__ post(ary1, 8)));
3726   __ sub(len, len, 8);
3727   __ tst(tmp3, UPPER_BIT_MASK);
3728   __ br(Assembler::NE, RET_TRUE);
3729
3730   __ bind(POST_LOOP16_LOAD_TAIL);
3731   __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3732   __ ldr(tmp1, Address(ary1));
3733   __ mov(tmp2, 64);
3734   __ sub(tmp4, tmp2, len, __ LSL, 3);
3735   __ lslv(tmp1, tmp1, tmp4);
3736   __ tst(tmp1, UPPER_BIT_MASK);
3737   __ br(Assembler::NE, RET_TRUE);
3738   // Fallthrough
3739
3740   __ bind(RET_FALSE);
3741   __ pop(spilled_regs, sp);
3742   __ leave();
3743   __ mov(result, zr);
3744   __ ret(lr);
3745
3746   __ bind(RET_TRUE);
3747   __ pop(spilled_regs, sp);
3748   __ bind(RET_TRUE_NO_POP);
3749   __ leave();
3750   __ mov(result, 1);
3751   __ ret(lr);
3752
3753   __ bind(DONE);
3754   __ pop(spilled_regs, sp);
3755   __ leave();
3756   __ ret(lr);
3757   return entry;
3758 }
3759 /**
3760  * Arguments:
3761  *
3762  * Input:
3763  *   c_rarg0   - current state address
3764  *   c_rarg1   - H key address
3765  *   c_rarg2   - data address
3766  *   c_rarg3   - number of blocks
3767  *
3768  * Output:
3769  *   Updated state at c_rarg0
3770  */
3771 address generate_ghash_processBlocks() {
3772   // Bafflingly, GCM uses little-endian for the byte order, but
3773   // big-endian for the bit order. For example, the polynomial 1 is
3774   // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3775   //
3776   // So, we must either reverse the bytes in each word and do
3777   // everything big-endian or reverse the bits in each byte and do
3778   // it little-endian. On AArch64 it's more idiomatic to reverse
3779   // the bits in each byte (we have an instruction, RBIT, to do
3780   // that) and keep the data in little-endian bit order throughout the
3781   // calculation, bit-reversing the inputs and outputs.
3782
3783   StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3784   __ align(wordSize * 2);
3785   address p = __ pc();
3786   __ emit_int64(0x87); // The low-order bits of the field
3787                        // polynomial (i.e. p = z^7+z^2+z+1)
3788                        // repeated in the low and high parts of a
3789                        // 128-bit vector
3790   __ emit_int64(0x87);
3791
3792   __ align(CodeEntryAlignment);
3793   address start = __ pc();
3794
3795   Register state = c_rarg0;
3796   Register subkeyH = c_rarg1;
3797   Register data = c_rarg2;
3798   Register blocks = c_rarg3;
3799
3800   FloatRegister vzr = v30;
3801   __ eor(vzr, __ T16B, vzr, vzr); // zero register
3802
3803   __ ldrq(v0, Address(state));
3804   __ ldrq(v1, Address(subkeyH));
3805
3806   __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
3807   __ rbit(v0, __ T16B, v0);
3808   __ rev64(v1, __ T16B, v1);
3809   __ rbit(v1, __ T16B, v1);
3810
3811   __ ldrq(v26, p);
3812
3813   __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
3814   __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3815
3816   {
3817     Label L_ghash_loop;
3818     __ bind(L_ghash_loop);
3819
3820     __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3821                                                // reversing each byte
3822     __ rbit(v2, __ T16B, v2);
3823     __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
3824
3825     // Multiply state in v2 by subkey in v1
3826     ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3827                    /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3828                    /*temps*/v6, v20, v18, v21);
3829     // Reduce v7:v5 by the field polynomial
3830     ghash_reduce(v0, v5, v7, v26, vzr, v20);
3831
3832     __ sub(blocks, blocks, 1);
3833     __ cbnz(blocks, L_ghash_loop);
3834   }
3835
3836   // The bit-reversed result is now in v0
3837   __ rev64(v1, __ T16B, v0);
3838   __ rbit(v1, __ T16B, v1);
3839
3840   __ st1(v1, __ T16B, state);
3841   __ ret(lr);
3842
3843   return start;
3844 }
3845
3846 // Continuation point for throwing of implicit exceptions that are
3847 // not handled in the current activation. Fabricates an exception
3848 // oop and initiates normal exception dispatching in this
3849 // frame. Since we need to preserve callee-saved values (currently
3850 // only for C2, but done for C1 as well) we need a callee-saved oop
3851 // map and therefore have to make these stubs into RuntimeStubs
3852 // rather than BufferBlobs. If the compiler needs all registers to
3853 // be preserved between the fault point and the exception handler
3854 // then it must assume responsibility for that in
3855 // AbstractCompiler::continuation_for_implicit_null_exception or
3856 // continuation_for_implicit_division_by_zero_exception. All other
3857 // implicit exceptions (e.g., NullPointerException or
3858 // AbstractMethodError on entry) are either at call sites or
3859 // otherwise assume that stack unwinding will be initiated, so
3860 // caller saved registers were assumed volatile in the compiler.
3861 3862 #undef __ 3863 #define __ masm-> 3864 3865 address generate_throw_exception(const char* name, 3866 address runtime_entry, 3867 Register arg1 = noreg, 3868 Register arg2 = noreg) { 3869 // Information about frame layout at time of blocking runtime call. 3870 // Note that we only have to preserve callee-saved registers since 3871 // the compilers are responsible for supplying a continuation point 3872 // if they expect all registers to be preserved. 3873 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 3874 enum layout { 3875 rfp_off = 0, 3876 rfp_off2, 3877 return_off, 3878 return_off2, 3879 framesize // inclusive of return address 3880 }; 3881 3882 int insts_size = 512; 3883 int locs_size = 64; 3884 3885 CodeBuffer code(name, insts_size, locs_size); 3886 OopMapSet* oop_maps = new OopMapSet(); 3887 MacroAssembler* masm = new MacroAssembler(&code); 3888 3889 address start = __ pc(); 3890 3891 // This is an inlined and slightly modified version of call_VM 3892 // which has the ability to fetch the return PC out of 3893 // thread-local storage and also sets up last_Java_sp slightly 3894 // differently than the real call_VM 3895 3896 __ enter(); // Save FP and LR before call 3897 3898 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3899 3900 // lr and fp are already in place 3901 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 3902 3903 int frame_complete = __ pc() - start; 3904 3905 // Set up last_Java_sp and last_Java_fp 3906 address the_pc = __ pc(); 3907 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 3908 3909 // Call runtime 3910 if (arg1 != noreg) { 3911 assert(arg2 != c_rarg1, "clobbered"); 3912 __ mov(c_rarg1, arg1); 3913 } 3914 if (arg2 != noreg) { 3915 __ mov(c_rarg2, arg2); 3916 } 3917 __ mov(c_rarg0, rthread); 3918 BLOCK_COMMENT("call runtime_entry"); 3919 __ mov(rscratch1, runtime_entry); 3920 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 3921 3922 // Generate oop map 3923 OopMap* map = new OopMap(framesize, 0); 3924 3925 oop_maps->add_gc_map(the_pc - start, map); 3926 3927 __ reset_last_Java_frame(true); 3928 __ maybe_isb(); 3929 3930 __ leave(); 3931 3932 // check for pending exceptions 3933 #ifdef ASSERT 3934 Label L; 3935 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 3936 __ cbnz(rscratch1, L); 3937 __ should_not_reach_here(); 3938 __ bind(L); 3939 #endif // ASSERT 3940 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3941 3942 3943 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3944 RuntimeStub* stub = 3945 RuntimeStub::new_runtime_stub(name, 3946 &code, 3947 frame_complete, 3948 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3949 oop_maps, false); 3950 return stub->entry_point(); 3951 } 3952 3953 class MontgomeryMultiplyGenerator : public MacroAssembler { 3954 3955 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 3956 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 3957 3958 RegSet _toSave; 3959 bool _squaring; 3960 3961 public: 3962 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 3963 : MacroAssembler(as->code()), _squaring(squaring) { 3964 3965 // Register allocation 3966 3967 Register reg = c_rarg0; 3968 Pa_base = reg; // Argument registers 3969 if (squaring) 3970 Pb_base = Pa_base; 3971 else 3972 Pb_base = ++reg; 3973 Pn_base = ++reg; 3974 Rlen= ++reg; 3975 inv = ++reg; 3976 Pm_base = ++reg; 3977 3978 // Working registers: 3979 Ra = ++reg; // The current digit of a, b, n, and m. 
3980     Rb = ++reg;
3981     Rm = ++reg;
3982     Rn = ++reg;
3983
3984     Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
3985     Pb = ++reg;
3986     Pm = ++reg;
3987     Pn = ++reg;
3988
3989     t0 = ++reg; // Three registers which form a
3990     t1 = ++reg; // triple-precision accumulator.
3991     t2 = ++reg;
3992
3993     Ri = ++reg; // Inner and outer loop indexes.
3994     Rj = ++reg;
3995
3996     Rhi_ab = ++reg; // Product registers: low and high parts
3997     Rlo_ab = ++reg; // of a*b and m*n.
3998     Rhi_mn = ++reg;
3999     Rlo_mn = ++reg;
4000
4001     // r19 and up are callee-saved.
4002     _toSave = RegSet::range(r19, reg) + Pm_base;
4003   }
4004
4005  private:
4006   void save_regs() {
4007     push(_toSave, sp);
4008   }
4009
4010   void restore_regs() {
4011     pop(_toSave, sp);
4012   }
4013
4014   template <typename T>
4015   void unroll_2(Register count, T block) {
4016     Label loop, end, odd;
4017     tbnz(count, 0, odd);
4018     cbz(count, end);
4019     align(16);
4020     bind(loop);
4021     (this->*block)();
4022     bind(odd);
4023     (this->*block)();
4024     subs(count, count, 2);
4025     br(Assembler::GT, loop);
4026     bind(end);
4027   }
4028
4029   template <typename T>
4030   void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4031     Label loop, end, odd;
4032     tbnz(count, 0, odd);
4033     cbz(count, end);
4034     align(16);
4035     bind(loop);
4036     (this->*block)(d, s, tmp);
4037     bind(odd);
4038     (this->*block)(d, s, tmp);
4039     subs(count, count, 2);
4040     br(Assembler::GT, loop);
4041     bind(end);
4042   }
4043
4044   void pre1(RegisterOrConstant i) {
4045     block_comment("pre1");
4046     // Pa = Pa_base;
4047     // Pb = Pb_base + i;
4048     // Pm = Pm_base;
4049     // Pn = Pn_base + i;
4050     // Ra = *Pa;
4051     // Rb = *Pb;
4052     // Rm = *Pm;
4053     // Rn = *Pn;
4054     ldr(Ra, Address(Pa_base));
4055     ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4056     ldr(Rm, Address(Pm_base));
4057     ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4058     lea(Pa, Address(Pa_base));
4059     lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4060     lea(Pm, Address(Pm_base));
4061     lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4062
4063     // Zero the m*n result.
4064     mov(Rhi_mn, zr);
4065     mov(Rlo_mn, zr);
4066   }
4067
4068   // The core multiply-accumulate step of a Montgomery
4069   // multiplication. The idea is to schedule operations as a
4070   // pipeline so that instructions with long latencies (loads and
4071   // multiplies) have time to complete before their results are
4072   // used. This benefits in-order implementations of the
4073   // architecture the most, but out-of-order ones also benefit.
4074   void step() {
4075     block_comment("step");
4076     // MACC(Ra, Rb, t0, t1, t2);
4077     // Ra = *++Pa;
4078     // Rb = *--Pb;
4079     umulh(Rhi_ab, Ra, Rb);
4080     mul(Rlo_ab, Ra, Rb);
4081     ldr(Ra, pre(Pa, wordSize));
4082     ldr(Rb, pre(Pb, -wordSize));
4083     acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4084                                      // previous iteration.
4085 // MACC(Rm, Rn, t0, t1, t2); 4086 // Rm = *++Pm; 4087 // Rn = *--Pn; 4088 umulh(Rhi_mn, Rm, Rn); 4089 mul(Rlo_mn, Rm, Rn); 4090 ldr(Rm, pre(Pm, wordSize)); 4091 ldr(Rn, pre(Pn, -wordSize)); 4092 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4093 } 4094 4095 void post1() { 4096 block_comment("post1"); 4097 4098 // MACC(Ra, Rb, t0, t1, t2); 4099 // Ra = *++Pa; 4100 // Rb = *--Pb; 4101 umulh(Rhi_ab, Ra, Rb); 4102 mul(Rlo_ab, Ra, Rb); 4103 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4104 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4105 4106 // *Pm = Rm = t0 * inv; 4107 mul(Rm, t0, inv); 4108 str(Rm, Address(Pm)); 4109 4110 // MACC(Rm, Rn, t0, t1, t2); 4111 // t0 = t1; t1 = t2; t2 = 0; 4112 umulh(Rhi_mn, Rm, Rn); 4113 4114 #ifndef PRODUCT 4115 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4116 { 4117 mul(Rlo_mn, Rm, Rn); 4118 add(Rlo_mn, t0, Rlo_mn); 4119 Label ok; 4120 cbz(Rlo_mn, ok); { 4121 stop("broken Montgomery multiply"); 4122 } bind(ok); 4123 } 4124 #endif 4125 // We have very carefully set things up so that 4126 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4127 // the lower half of Rm * Rn because we know the result already: 4128 // it must be -t0. t0 + (-t0) must generate a carry iff 4129 // t0 != 0. So, rather than do a mul and an adds we just set 4130 // the carry flag iff t0 is nonzero. 4131 // 4132 // mul(Rlo_mn, Rm, Rn); 4133 // adds(zr, t0, Rlo_mn); 4134 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4135 adcs(t0, t1, Rhi_mn); 4136 adc(t1, t2, zr); 4137 mov(t2, zr); 4138 } 4139 4140 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4141 block_comment("pre2"); 4142 // Pa = Pa_base + i-len; 4143 // Pb = Pb_base + len; 4144 // Pm = Pm_base + i-len; 4145 // Pn = Pn_base + len; 4146 4147 if (i.is_register()) { 4148 sub(Rj, i.as_register(), len); 4149 } else { 4150 mov(Rj, i.as_constant()); 4151 sub(Rj, Rj, len); 4152 } 4153 // Rj == i-len 4154 4155 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4156 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4157 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4158 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4159 4160 // Ra = *++Pa; 4161 // Rb = *--Pb; 4162 // Rm = *++Pm; 4163 // Rn = *--Pn; 4164 ldr(Ra, pre(Pa, wordSize)); 4165 ldr(Rb, pre(Pb, -wordSize)); 4166 ldr(Rm, pre(Pm, wordSize)); 4167 ldr(Rn, pre(Pn, -wordSize)); 4168 4169 mov(Rhi_mn, zr); 4170 mov(Rlo_mn, zr); 4171 } 4172 4173 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4174 block_comment("post2"); 4175 if (i.is_constant()) { 4176 mov(Rj, i.as_constant()-len.as_constant()); 4177 } else { 4178 sub(Rj, i.as_register(), len); 4179 } 4180 4181 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4182 4183 // As soon as we know the least significant digit of our result, 4184 // store it. 4185 // Pm_base[i-len] = t0; 4186 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4187 4188 // t0 = t1; t1 = t2; t2 = 0; 4189 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4190 adc(t1, t2, zr); 4191 mov(t2, zr); 4192 } 4193 4194 // A carry in t0 after Montgomery multiplication means that we 4195 // should subtract multiples of n from our result in m. We'll 4196 // keep doing that until there is no carry. 
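  // In C, approximately. sub() here is the multi-word subtract referenced
  // by the "while (t0) t0 = sub(...)" reference code further down; this
  // definition is an assumed illustration matching the sbcs/sbc sequence
  // that normalize() generates, not code taken from the library:
  //
  //   unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
  //                     unsigned long t0, int len) {
  //     unsigned long borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned long m = Pm_base[i], n = Pn_base[i];
  //       Pm_base[i] = m - n - borrow;
  //       borrow = (m < n) || (m == n && borrow); // borrow out of this word
  //     }
  //     return t0 - borrow; // cancel the carry held in t0
  //   }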
4197 void normalize(RegisterOrConstant len) { 4198 block_comment("normalize"); 4199 // while (t0) 4200 // t0 = sub(Pm_base, Pn_base, t0, len); 4201 Label loop, post, again; 4202 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 4203 cbz(t0, post); { 4204 bind(again); { 4205 mov(i, zr); 4206 mov(cnt, len); 4207 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4208 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4209 subs(zr, zr, zr); // set carry flag, i.e. no borrow 4210 align(16); 4211 bind(loop); { 4212 sbcs(Rm, Rm, Rn); 4213 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4214 add(i, i, 1); 4215 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4216 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4217 sub(cnt, cnt, 1); 4218 } cbnz(cnt, loop); 4219 sbc(t0, t0, zr); 4220 } cbnz(t0, again); 4221 } bind(post); 4222 } 4223 4224 // Move memory at s to d, reversing words. 4225 // Increments d to end of copied memory 4226 // Destroys tmp1, tmp2 4227 // Preserves len 4228 // Leaves s pointing to the address which was in d at start 4229 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 4230 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 4231 4232 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 4233 mov(tmp1, len); 4234 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 4235 sub(s, d, len, ext::uxtw, LogBytesPerWord); 4236 } 4237 // where 4238 void reverse1(Register d, Register s, Register tmp) { 4239 ldr(tmp, pre(s, -wordSize)); 4240 ror(tmp, tmp, 32); 4241 str(tmp, post(d, wordSize)); 4242 } 4243 4244 void step_squaring() { 4245 // An extra ACC 4246 step(); 4247 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4248 } 4249 4250 void last_squaring(RegisterOrConstant i) { 4251 Label dont; 4252 // if ((i & 1) == 0) { 4253 tbnz(i.as_register(), 0, dont); { 4254 // MACC(Ra, Rb, t0, t1, t2); 4255 // Ra = *++Pa; 4256 // Rb = *--Pb; 4257 umulh(Rhi_ab, Ra, Rb); 4258 mul(Rlo_ab, Ra, Rb); 4259 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4260 } bind(dont); 4261 } 4262 4263 void extra_step_squaring() { 4264 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4265 4266 // MACC(Rm, Rn, t0, t1, t2); 4267 // Rm = *++Pm; 4268 // Rn = *--Pn; 4269 umulh(Rhi_mn, Rm, Rn); 4270 mul(Rlo_mn, Rm, Rn); 4271 ldr(Rm, pre(Pm, wordSize)); 4272 ldr(Rn, pre(Pn, -wordSize)); 4273 } 4274 4275 void post1_squaring() { 4276 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4277 4278 // *Pm = Rm = t0 * inv; 4279 mul(Rm, t0, inv); 4280 str(Rm, Address(Pm)); 4281 4282 // MACC(Rm, Rn, t0, t1, t2); 4283 // t0 = t1; t1 = t2; t2 = 0; 4284 umulh(Rhi_mn, Rm, Rn); 4285 4286 #ifndef PRODUCT 4287 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4288 { 4289 mul(Rlo_mn, Rm, Rn); 4290 add(Rlo_mn, t0, Rlo_mn); 4291 Label ok; 4292 cbz(Rlo_mn, ok); { 4293 stop("broken Montgomery multiply"); 4294 } bind(ok); 4295 } 4296 #endif 4297 // We have very carefully set things up so that 4298 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4299 // the lower half of Rm * Rn because we know the result already: 4300 // it must be -t0. t0 + (-t0) must generate a carry iff 4301 // t0 != 0. So, rather than do a mul and an adds we just set 4302 // the carry flag iff t0 is nonzero. 
4303 // 4304 // mul(Rlo_mn, Rm, Rn); 4305 // adds(zr, t0, Rlo_mn); 4306 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4307 adcs(t0, t1, Rhi_mn); 4308 adc(t1, t2, zr); 4309 mov(t2, zr); 4310 } 4311 4312 void acc(Register Rhi, Register Rlo, 4313 Register t0, Register t1, Register t2) { 4314 adds(t0, t0, Rlo); 4315 adcs(t1, t1, Rhi); 4316 adc(t2, t2, zr); 4317 } 4318 4319 public: 4320 /** 4321 * Fast Montgomery multiplication. The derivation of the 4322 * algorithm is in A Cryptographic Library for the Motorola 4323 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 4324 * 4325 * Arguments: 4326 * 4327 * Inputs for multiplication: 4328 * c_rarg0 - int array elements a 4329 * c_rarg1 - int array elements b 4330 * c_rarg2 - int array elements n (the modulus) 4331 * c_rarg3 - int length 4332 * c_rarg4 - int inv 4333 * c_rarg5 - int array elements m (the result) 4334 * 4335 * Inputs for squaring: 4336 * c_rarg0 - int array elements a 4337 * c_rarg1 - int array elements n (the modulus) 4338 * c_rarg2 - int length 4339 * c_rarg3 - int inv 4340 * c_rarg4 - int array elements m (the result) 4341 * 4342 */ 4343 address generate_multiply() { 4344 Label argh, nothing; 4345 bind(argh); 4346 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4347 4348 align(CodeEntryAlignment); 4349 address entry = pc(); 4350 4351 cbzw(Rlen, nothing); 4352 4353 enter(); 4354 4355 // Make room. 4356 cmpw(Rlen, 512); 4357 br(Assembler::HI, argh); 4358 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4359 andr(sp, Ra, -2 * wordSize); 4360 4361 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4362 4363 { 4364 // Copy input args, reversing as we go. We use Ra as a 4365 // temporary variable. 4366 reverse(Ra, Pa_base, Rlen, t0, t1); 4367 if (!_squaring) 4368 reverse(Ra, Pb_base, Rlen, t0, t1); 4369 reverse(Ra, Pn_base, Rlen, t0, t1); 4370 } 4371 4372 // Push all call-saved registers and also Pm_base which we'll need 4373 // at the end. 
4374 save_regs(); 4375 4376 #ifndef PRODUCT 4377 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4378 { 4379 ldr(Rn, Address(Pn_base, 0)); 4380 mul(Rlo_mn, Rn, inv); 4381 cmp(Rlo_mn, -1); 4382 Label ok; 4383 br(EQ, ok); { 4384 stop("broken inverse in Montgomery multiply"); 4385 } bind(ok); 4386 } 4387 #endif 4388 4389 mov(Pm_base, Ra); 4390 4391 mov(t0, zr); 4392 mov(t1, zr); 4393 mov(t2, zr); 4394 4395 block_comment("for (int i = 0; i < len; i++) {"); 4396 mov(Ri, zr); { 4397 Label loop, end; 4398 cmpw(Ri, Rlen); 4399 br(Assembler::GE, end); 4400 4401 bind(loop); 4402 pre1(Ri); 4403 4404 block_comment(" for (j = i; j; j--) {"); { 4405 movw(Rj, Ri); 4406 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4407 } block_comment(" } // j"); 4408 4409 post1(); 4410 addw(Ri, Ri, 1); 4411 cmpw(Ri, Rlen); 4412 br(Assembler::LT, loop); 4413 bind(end); 4414 block_comment("} // i"); 4415 } 4416 4417 block_comment("for (int i = len; i < 2*len; i++) {"); 4418 mov(Ri, Rlen); { 4419 Label loop, end; 4420 cmpw(Ri, Rlen, Assembler::LSL, 1); 4421 br(Assembler::GE, end); 4422 4423 bind(loop); 4424 pre2(Ri, Rlen); 4425 4426 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4427 lslw(Rj, Rlen, 1); 4428 subw(Rj, Rj, Ri); 4429 subw(Rj, Rj, 1); 4430 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4431 } block_comment(" } // j"); 4432 4433 post2(Ri, Rlen); 4434 addw(Ri, Ri, 1); 4435 cmpw(Ri, Rlen, Assembler::LSL, 1); 4436 br(Assembler::LT, loop); 4437 bind(end); 4438 } 4439 block_comment("} // i"); 4440 4441 normalize(Rlen); 4442 4443 mov(Ra, Pm_base); // Save Pm_base in Ra 4444 restore_regs(); // Restore caller's Pm_base 4445 4446 // Copy our result into caller's Pm_base 4447 reverse(Pm_base, Ra, Rlen, t0, t1); 4448 4449 leave(); 4450 bind(nothing); 4451 ret(lr); 4452 4453 return entry; 4454 } 4455 // In C, approximately: 4456 4457 // void 4458 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4459 // unsigned long Pn_base[], unsigned long Pm_base[], 4460 // unsigned long inv, int len) { 4461 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4462 // unsigned long *Pa, *Pb, *Pn, *Pm; 4463 // unsigned long Ra, Rb, Rn, Rm; 4464 4465 // int i; 4466 4467 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4468 4469 // for (i = 0; i < len; i++) { 4470 // int j; 4471 4472 // Pa = Pa_base; 4473 // Pb = Pb_base + i; 4474 // Pm = Pm_base; 4475 // Pn = Pn_base + i; 4476 4477 // Ra = *Pa; 4478 // Rb = *Pb; 4479 // Rm = *Pm; 4480 // Rn = *Pn; 4481 4482 // int iters = i; 4483 // for (j = 0; iters--; j++) { 4484 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4485 // MACC(Ra, Rb, t0, t1, t2); 4486 // Ra = *++Pa; 4487 // Rb = *--Pb; 4488 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4489 // MACC(Rm, Rn, t0, t1, t2); 4490 // Rm = *++Pm; 4491 // Rn = *--Pn; 4492 // } 4493 4494 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4495 // MACC(Ra, Rb, t0, t1, t2); 4496 // *Pm = Rm = t0 * inv; 4497 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4498 // MACC(Rm, Rn, t0, t1, t2); 4499 4500 // assert(t0 == 0, "broken Montgomery multiply"); 4501 4502 // t0 = t1; t1 = t2; t2 = 0; 4503 // } 4504 4505 // for (i = len; i < 2*len; i++) { 4506 // int j; 4507 4508 // Pa = Pa_base + i-len; 4509 // Pb = Pb_base + len; 4510 // Pm = Pm_base + i-len; 4511 // Pn = Pn_base + len; 4512 4513 // Ra = *++Pa; 4514 // Rb = *--Pb; 4515 // Rm = *++Pm; 4516 // Rn = *--Pn; 4517 4518 // int iters = len*2-i-1; 4519 // 
for (j = i-len+1; iters--; j++) { 4520 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4521 // MACC(Ra, Rb, t0, t1, t2); 4522 // Ra = *++Pa; 4523 // Rb = *--Pb; 4524 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4525 // MACC(Rm, Rn, t0, t1, t2); 4526 // Rm = *++Pm; 4527 // Rn = *--Pn; 4528 // } 4529 4530 // Pm_base[i-len] = t0; 4531 // t0 = t1; t1 = t2; t2 = 0; 4532 // } 4533 4534 // while (t0) 4535 // t0 = sub(Pm_base, Pn_base, t0, len); 4536 // } 4537 4538 /** 4539 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4540 * multiplies than Montgomery multiplication so it should be up to 4541 * 25% faster. However, its loop control is more complex and it 4542 * may actually run slower on some machines. 4543 * 4544 * Arguments: 4545 * 4546 * Inputs: 4547 * c_rarg0 - int array elements a 4548 * c_rarg1 - int array elements n (the modulus) 4549 * c_rarg2 - int length 4550 * c_rarg3 - int inv 4551 * c_rarg4 - int array elements m (the result) 4552 * 4553 */ 4554 address generate_square() { 4555 Label argh; 4556 bind(argh); 4557 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4558 4559 align(CodeEntryAlignment); 4560 address entry = pc(); 4561 4562 enter(); 4563 4564 // Make room. 4565 cmpw(Rlen, 512); 4566 br(Assembler::HI, argh); 4567 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4568 andr(sp, Ra, -2 * wordSize); 4569 4570 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4571 4572 { 4573 // Copy input args, reversing as we go. We use Ra as a 4574 // temporary variable. 4575 reverse(Ra, Pa_base, Rlen, t0, t1); 4576 reverse(Ra, Pn_base, Rlen, t0, t1); 4577 } 4578 4579 // Push all call-saved registers and also Pm_base which we'll need 4580 // at the end. 4581 save_regs(); 4582 4583 mov(Pm_base, Ra); 4584 4585 mov(t0, zr); 4586 mov(t1, zr); 4587 mov(t2, zr); 4588 4589 block_comment("for (int i = 0; i < len; i++) {"); 4590 mov(Ri, zr); { 4591 Label loop, end; 4592 bind(loop); 4593 cmp(Ri, Rlen); 4594 br(Assembler::GE, end); 4595 4596 pre1(Ri); 4597 4598 block_comment("for (j = (i+1)/2; j; j--) {"); { 4599 add(Rj, Ri, 1); 4600 lsr(Rj, Rj, 1); 4601 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4602 } block_comment(" } // j"); 4603 4604 last_squaring(Ri); 4605 4606 block_comment(" for (j = i/2; j; j--) {"); { 4607 lsr(Rj, Ri, 1); 4608 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4609 } block_comment(" } // j"); 4610 4611 post1_squaring(); 4612 add(Ri, Ri, 1); 4613 cmp(Ri, Rlen); 4614 br(Assembler::LT, loop); 4615 4616 bind(end); 4617 block_comment("} // i"); 4618 } 4619 4620 block_comment("for (int i = len; i < 2*len; i++) {"); 4621 mov(Ri, Rlen); { 4622 Label loop, end; 4623 bind(loop); 4624 cmp(Ri, Rlen, Assembler::LSL, 1); 4625 br(Assembler::GE, end); 4626 4627 pre2(Ri, Rlen); 4628 4629 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4630 lsl(Rj, Rlen, 1); 4631 sub(Rj, Rj, Ri); 4632 sub(Rj, Rj, 1); 4633 lsr(Rj, Rj, 1); 4634 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4635 } block_comment(" } // j"); 4636 4637 last_squaring(Ri); 4638 4639 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4640 lsl(Rj, Rlen, 1); 4641 sub(Rj, Rj, Ri); 4642 lsr(Rj, Rj, 1); 4643 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4644 } block_comment(" } // j"); 4645 4646 post2(Ri, Rlen); 4647 add(Ri, Ri, 1); 4648 cmp(Ri, Rlen, Assembler::LSL, 1); 4649 4650 br(Assembler::LT, loop); 4651 bind(end); 4652 block_comment("} // i"); 4653 } 4654 4655 normalize(Rlen); 4656 4657 mov(Ra, 
Pm_base); // Save Pm_base in Ra
4658     restore_regs(); // Restore caller's Pm_base
4659
4660     // Copy our result into caller's Pm_base
4661     reverse(Pm_base, Ra, Rlen, t0, t1);
4662
4663     leave();
4664     ret(lr);
4665
4666     return entry;
4667   }
4668   // In C, approximately:
4669
4670   // void
4671   // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4672   //                   unsigned long Pm_base[], unsigned long inv, int len) {
4673   //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4674   //   unsigned long *Pa, *Pb, *Pn, *Pm;
4675   //   unsigned long Ra, Rb, Rn, Rm;

4676
4677   //   int i;
4678
4679   //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4680
4681   //   for (i = 0; i < len; i++) {
4682   //     int j;
4683
4684   //     Pa = Pa_base;
4685   //     Pb = Pa_base + i;
4686   //     Pm = Pm_base;
4687   //     Pn = Pn_base + i;
4688
4689   //     Ra = *Pa;
4690   //     Rb = *Pb;
4691   //     Rm = *Pm;
4692   //     Rn = *Pn;
4693
4694   //     int iters = (i+1)/2;
4695   //     for (j = 0; iters--; j++) {
4696   //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4697   //       MACC2(Ra, Rb, t0, t1, t2);
4698   //       Ra = *++Pa;
4699   //       Rb = *--Pb;
4700   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4701   //       MACC(Rm, Rn, t0, t1, t2);
4702   //       Rm = *++Pm;
4703   //       Rn = *--Pn;
4704   //     }
4705   //     if ((i & 1) == 0) {
4706   //       assert(Ra == Pa_base[j], "must be");
4707   //       MACC(Ra, Ra, t0, t1, t2);
4708   //     }
4709   //     iters = i/2;
4710   //     assert(iters == i-j, "must be");
4711   //     for (; iters--; j++) {
4712   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4713   //       MACC(Rm, Rn, t0, t1, t2);
4714   //       Rm = *++Pm;
4715   //       Rn = *--Pn;
4716   //     }
4717
4718   //     *Pm = Rm = t0 * inv;
4719   //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4720   //     MACC(Rm, Rn, t0, t1, t2);
4721
4722   //     assert(t0 == 0, "broken Montgomery multiply");
4723
4724   //     t0 = t1; t1 = t2; t2 = 0;
4725   //   }
4726
4727   //   for (i = len; i < 2*len; i++) {
4728   //     int start = i-len+1;
4729   //     int end = start + (len - start)/2;
4730   //     int j;
4731
4732   //     Pa = Pa_base + i-len;
4733   //     Pb = Pa_base + len;
4734   //     Pm = Pm_base + i-len;
4735   //     Pn = Pn_base + len;
4736
4737   //     Ra = *++Pa;
4738   //     Rb = *--Pb;
4739   //     Rm = *++Pm;
4740   //     Rn = *--Pn;
4741
4742   //     int iters = (2*len-i-1)/2;
4743   //     assert(iters == end-start, "must be");
4744   //     for (j = start; iters--; j++) {
4745   //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4746   //       MACC2(Ra, Rb, t0, t1, t2);
4747   //       Ra = *++Pa;
4748   //       Rb = *--Pb;
4749   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4750   //       MACC(Rm, Rn, t0, t1, t2);
4751   //       Rm = *++Pm;
4752   //       Rn = *--Pn;
4753   //     }
4754   //     if ((i & 1) == 0) {
4755   //       assert(Ra == Pa_base[j], "must be");
4756   //       MACC(Ra, Ra, t0, t1, t2);
4757   //     }
4758   //     iters = (2*len-i)/2;
4759   //     assert(iters == len-j, "must be");
4760   //     for (; iters--; j++) {
4761   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4762   //       MACC(Rm, Rn, t0, t1, t2);
4763   //       Rm = *++Pm;
4764   //       Rn = *--Pn;
4765   //     }
4766   //     Pm_base[i-len] = t0;
4767   //     t0 = t1; t1 = t2; t2 = 0;
4768   //   }
4769
4770   //   while (t0)
4771   //     t0 = sub(Pm_base, Pn_base, t0, len);
4772   // }
4773 };
4774
4775
4776   // Initialization
4777   void generate_initial() {
4778     // Generate the initial stubs and initialize the entry points.
4779
4780     // Entry points that exist on all platforms. Note: this is code
4781     // that could be shared among different platforms - however the
4782     // benefit seems to be smaller than the disadvantage of having a
4783     // much more complicated generator structure. See also the comment in
4784     // stubRoutines.hpp.
4785
4786     StubRoutines::_forward_exception_entry = generate_forward_exception();
4787
4788     StubRoutines::_call_stub_entry =
4789       generate_call_stub(StubRoutines::_call_stub_return_address);
4790
4791     // is referenced by megamorphic call
4792     StubRoutines::_catch_exception_entry = generate_catch_exception();
4793
4794     // Build this early so it's available for the interpreter.
4795     StubRoutines::_throw_StackOverflowError_entry =
4796       generate_throw_exception("StackOverflowError throw_exception",
4797                                CAST_FROM_FN_PTR(address,
4798                                                 SharedRuntime::throw_StackOverflowError));
4799     StubRoutines::_throw_delayed_StackOverflowError_entry =
4800       generate_throw_exception("delayed StackOverflowError throw_exception",
4801                                CAST_FROM_FN_PTR(address,
4802                                                 SharedRuntime::throw_delayed_StackOverflowError));
4803     if (UseCRC32Intrinsics) {
4804       // set the table address before generating the stubs that use it
4805       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4806       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4807     }
4808   }
4809
4810   void generate_all() {
4811     // support for verify_oop (must happen after universe_init)
4812     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
4813     StubRoutines::_throw_AbstractMethodError_entry =
4814       generate_throw_exception("AbstractMethodError throw_exception",
4815                                CAST_FROM_FN_PTR(address,
4816                                                 SharedRuntime::
4817                                                 throw_AbstractMethodError));
4818
4819     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4820       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4821                                CAST_FROM_FN_PTR(address,
4822                                                 SharedRuntime::
4823                                                 throw_IncompatibleClassChangeError));
4824
4825     StubRoutines::_throw_NullPointerException_at_call_entry =
4826       generate_throw_exception("NullPointerException at call throw_exception",
4827                                CAST_FROM_FN_PTR(address,
4828                                                 SharedRuntime::
4829                                                 throw_NullPointerException_at_call));
4830
4831     // arraycopy stubs used by compilers
4832     generate_arraycopy_stubs();
4833
4834     // has negatives stub for large arrays.
4835     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
4836
4837     if (UseMultiplyToLenIntrinsic) {
4838       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4839     }
4840
4841     if (UseMontgomeryMultiplyIntrinsic) {
4842       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4843       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4844       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4845     }
4846
4847     if (UseMontgomerySquareIntrinsic) {
4848       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4849       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4850       // We use generate_multiply() rather than generate_square()
4851       // because it's faster for the sizes of modulus we care about.
4852 StubRoutines::_montgomerySquare = g.generate_multiply(); 4853 } 4854 4855 #ifndef BUILTIN_SIM 4856 // generate GHASH intrinsics code 4857 if (UseGHASHIntrinsics) { 4858 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 4859 } 4860 4861 if (UseAESIntrinsics) { 4862 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4863 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4864 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4865 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4866 } 4867 4868 if (UseSHA1Intrinsics) { 4869 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 4870 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 4871 } 4872 if (UseSHA256Intrinsics) { 4873 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 4874 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 4875 } 4876 4877 if (UseCRC32CIntrinsics) { 4878 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 4879 } 4880 4881 // generate Adler32 intrinsics code 4882 if (UseAdler32Intrinsics) { 4883 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 4884 } 4885 4886 // Safefetch stubs. 4887 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4888 &StubRoutines::_safefetch32_fault_pc, 4889 &StubRoutines::_safefetch32_continuation_pc); 4890 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4891 &StubRoutines::_safefetchN_fault_pc, 4892 &StubRoutines::_safefetchN_continuation_pc); 4893 #endif 4894 StubRoutines::aarch64::set_completed(); 4895 } 4896 4897 public: 4898 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4899 if (all) { 4900 generate_all(); 4901 } else { 4902 generate_initial(); 4903 } 4904 } 4905 }; // end class declaration 4906 4907 void StubGenerator_generate(CodeBuffer* code, bool all) { 4908 StubGenerator g(code, all); 4909 }