/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shenandoah/brooksPointer.hpp"
#include "gc/shenandoah/shenandoahBarrierSet.hpp"
#include "gc/shenandoah/shenandoahHeap.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
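
// n.b. TIMES_OOP builds the scaled-index addressing mode for an oop
// array element: a (signed 32-bit) index is sign-extended and shifted
// left by log2 of the oop size, i.e. by 2 when compressed oops are in
// use and by 3 otherwise.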

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp      (r29)  ] <--- fp == saved sp (r31)
  //   1 [ saved lr      (r30)  ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
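
  // n.b. the enum names only every other save slot because the
  // registers are spilled and reloaded in pairs using stp/stpd and
  // ldp/ldpd. Each named offset identifies the base of a two word
  // slot pair, e.g. r20_save covers r20 and r19 and d9_save covers
  // v9 and v8 (see the frame layout above).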

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

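    // n.b. generate_catch_exception (below) branches back to this
    // address to complete the return to the VM when a Java call ends
    // with a pending exception, so the result-store code that follows
    // runs on both the normal and the exceptional return path.
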
    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Shenandoah write barrier.
  //
  // Input:
  //   r0: OOP to evacuate.  Not null.
  //
  // Output:
  //   r0: Pointer to evacuated OOP.
  //
  // Trashes rscratch1, rscratch2.  Preserves everything else.

  address generate_shenandoah_wb() {
    StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");

    __ align(6);
    address start = __ pc();

    Label work, slow_case, lose, not_an_instance, is_array;
    Address evacuation_in_progress
      = Address(rthread, in_bytes(JavaThread::evacuation_in_progress_offset()));

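    // in-cset fast test: the heap keeps one byte per region in a
    // table indexed by region number (the object address shifted
    // right by the region size shift). If the low bit of that byte
    // is set the object is in the collection set and must be
    // evacuated; otherwise return the oop unchanged.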
    __ mov(rscratch2, ShenandoahHeap::in_cset_fast_test_addr());
    __ lsr(rscratch1, r0, ShenandoahHeapRegion::RegionSizeShift);
    __ ldrb(rscratch2, Address(rscratch2, rscratch1));
    __ tbnz(rscratch2, 0, work);
    __ ret(lr);

    __ bind(work);

    RegSet saved = RegSet::range(r1, r4);
    __ push(saved, sp);

    Register obj = r0, size = r2, newobj = r3, newobj_end = rscratch2;

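    // Try to bump-allocate the copy out of the thread-local GCLAB:
    // read the current allocation top, compute the end of the new
    // object (including the extra word(s) for the Brooks pointer)
    // and take the slow path if there is no GCLAB or not enough room
    // left in it.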
    __ ldr(newobj, Address(rthread, JavaThread::gclab_top_offset()));
    __ cbz(newobj, slow_case); // No GCLAB

    __ load_klass(r1, obj);
    __ ldrw(size, Address(r1, Klass::layout_helper_offset()));
    __ tbnz(size, BitsPerInt - 1, not_an_instance); // make sure it's an instance (LH > 0)
    assert(Klass::_lh_neutral_value == 0, "must be");
    __ cbzw(size, slow_case);
    __ tbnz(size, exact_log2(Klass::_lh_instance_slow_path_bit), slow_case);
    __ bind(is_array);

    // size contains the size (in bytes) of the object.

    // Make sure it's not a really big object.
    // ??? Maybe this test is not necessary.
    __ cmp(size, 128 * HeapWordSize);
    __ br(Assembler::GE, slow_case);

    int oop_extra_words = Universe::heap()->oop_extra_words();
    __ add(newobj_end, newobj, oop_extra_words * HeapWordSize);
    __ add(newobj_end, newobj_end, size, ext::uxtw);

    // Pointer to end of new object is in newobj_end.

    __ ldr(rscratch1, Address(rthread, JavaThread::gclab_end_offset()));
    __ cmp(newobj_end, rscratch1);
    __ br(Assembler::HS, slow_case); // No room in GCLAB

    // Store Brooks pointer and adjust start of newobj.
    Universe::heap()->compile_prepare_oop(_masm, newobj);

    // We can reuse newobj_end (rscratch2) to hold dst.
    Register src = r1, dst = newobj_end;

    // Copy the object from obj to newobj.  This loop is short and
    // sweet: the typical size of an object is about eight HeapWords
    // so it makes no sense to optimize for a large memory copy.
    // There might be some sense to calling generate_copy_longs from
    // here if the object to be copied is very large.
    Label loop, odd_count;
    {
      __ mov(src, obj);
      __ mov(dst, newobj);
      __ tbnz(size, exact_log2(HeapWordSize), odd_count);

      // Live registers: obj, newobj, size, src, dst.

      __ bind(loop);
      // Count is even.  Copy pairs of HeapWords.
      __ ldp(rscratch1, r4, __ post(src, 2 * HeapWordSize));
      __ stp(rscratch1, r4, __ post(dst, 2 * HeapWordSize));
      __ subs(size, size, 2 * HeapWordSize);
      __ br(Assembler::GT, loop);
    }

    // All copied.  Now try to CAS the Brooks pointer.
    Label succeed;
    __ lea(r2, Address(obj, BrooksPointer::byte_offset()));
    __ cmpxchgptr(obj, newobj, r2, rscratch1, succeed, NULL);
    // If we lose the CAS we are racing with someone who just beat
    // us evacuating the object.  This leaves the address of the
    // evacuated object in r0.

    // We lost.
    __ pop(saved, sp);
    __ ret(lr);

    // We won.
    __ bind(succeed);
    __ mov(obj, newobj);
    // dst points to end of newobj.
    __ str(dst, Address(rthread, JavaThread::gclab_top_offset()));
    __ pop(saved, sp);
    __ ret(lr);

    // Come here if the count of HeapWords is odd.
    {
      __ bind(odd_count);
      __ ldr(rscratch1, __ post(src, HeapWordSize));
      __ str(rscratch1, __ post(dst, HeapWordSize));
      __ subs(size, size, HeapWordSize);
      __ b(loop);
    }

    // Come here if obj is an array of some kind.
    {
      __ bind(not_an_instance);
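
      // The klass layout helper for an array encodes the header size
      // and log2 of the element size in separate bit fields, so the
      // size in bytes is computed here as
      //   header_size + (length << log2_element_size)
      // rounded up to the minimum object alignment.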
      // It's an array.  Calculate the size in r4.
      __ ubfx(r4, size, Klass::_lh_header_size_shift,
              exact_log2(Klass::_lh_header_size_mask+1));
      __ ldrw(rscratch1, Address(obj, arrayOopDesc::length_offset_in_bytes()));
      __ ubfx(rscratch2, size, Klass::_lh_log2_element_size_shift,
              exact_log2(Klass::_lh_log2_element_size_mask+1));
      __ lslv(rscratch1, rscratch1, rscratch2);
      __ add(size, rscratch1, r4);

      // Round up the size.
      __ add(size, size, MinObjAlignmentInBytes-1);
      __ andr(size, size, -MinObjAlignmentInBytes);

      __ b(is_array);
    }

    {
      // Make a runtime call to evacuate the object.
      __ bind(slow_case);
      __ pop(saved, sp);

      __ enter(); // required for proper stackwalking of RuntimeStub frame

      __ push_call_clobbered_registers();

      __ mov(lr, CAST_FROM_FN_PTR(address, ShenandoahBarrierSet::write_barrier_c2));
      __ blrt(lr, 1, 0, MacroAssembler::ret_type_integral);
      __ mov(rscratch1, obj);

      __ pop_call_clobbered_registers();
      __ mov(obj, rscratch1);

      __ leave(); // required for proper stackwalking of RuntimeStub frame
      __ ret(lr);
    }

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr    - starting address
  //     count   - element count
  //     tmp     - scratch register
  //
  //     Destroys no registers except rscratch1 and rscratch2
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
    case BarrierSet::ShenandoahBarrierSet:
      // Don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }


  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
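  //
  //  For the card-table collectors this dirties one byte in the card
  //  table for every 2^card_shift byte card spanned by [start, end];
  //  for G1 and Shenandoah it calls into the runtime with the start
  //  address and the (recomputed) element count.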
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
    case BarrierSet::ShenandoahBarrierSet:
      {
        __ push_call_clobbered_registers();
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::GE, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();

    }
  }

  address generate_zero_longs(Register base, Register cnt) {
    Register tmp = rscratch1;
    Register tmp2 = rscratch2;
    int zva_length = VM_Version::zva_length();
    Label initial_table_end, loop_zva;
    Label fini;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_longs");
    address start = __ pc();

    // Base must be 16 byte aligned. If not just return and let caller handle it
    __ tst(base, 0x0f);
    __ br(Assembler::NE, fini);
    // Align base with ZVA length.
    __ neg(tmp, base);
    __ andr(tmp, tmp, zva_length - 1);

    // tmp: the number of bytes to be filled to align the base with ZVA length.
    __ add(base, base, tmp);
    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
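
    // Branch backwards into the table of stp instructions below:
    // each stp zeroes 16 bytes and occupies 4 bytes of code, so to
    // zero the tmp leading bytes we enter the table tmp/4 code bytes
    // (i.e. tmp/16 instructions) before its end.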
    __ adr(tmp2, initial_table_end);
    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
    __ br(tmp2);

    for (int i = -zva_length + 16; i < 0; i += 16)
      __ stp(zr, zr, Address(base, i));
    __ bind(initial_table_end);

    __ sub(cnt, cnt, zva_length >> 3);
    __ bind(loop_zva);
    __ dc(Assembler::ZVA, base);
    __ subs(cnt, cnt, zva_length >> 3);
    __ add(base, base, zva_length);
    __ br(Assembler::GE, loop_zva);
    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
    __ bind(fini);
    __ ret(lr);

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);
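
    // If software prefetch is enabled (PrefetchCopyIntervalInBytes
    // > 0), prefetch that many bytes ahead of the source pointer on
    // every iteration of the main loop. For a backwards copy the
    // interval is negated; if it is too large to encode as an
    // immediate offset it is kept in a register instead.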
    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
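
    // Each tbz below tests the bit of the (element) count that
    // corresponds to one power-of-two chunk size: the word chunk is
    // bit 3 - log2(granularity), the int chunk bit 2 -
    // log2(granularity), and so on down to single bytes. The chunk
    // is copied when its bit is set and skipped otherwise.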
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);
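
    // send/dend point one past the last byte to copy. Each size
    // class below loads a block from the start of the range and a
    // block that ends exactly at send, then stores both; the two
    // blocks may overlap in the middle, which makes every length in
    // the class correct without any per-byte branching.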
    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
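  //
  // n.b. in the compressed-oops branch below the narrow oop is
  // loaded into r16 and then decode_heap_oop(temp) is applied, so
  // temp is evidently expected to be r16; both call sites in this
  // file pass r16 for temp.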
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
1631 // 1632 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, 1633 address *entry, const char *name, 1634 bool dest_uninitialized = false) { 1635 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1636 1637 StubCodeMark mark(this, "StubRoutines", name); 1638 address start = __ pc(); 1639 __ enter(); 1640 1641 if (entry != NULL) { 1642 *entry = __ pc(); 1643 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1644 BLOCK_COMMENT("Entry:"); 1645 } 1646 1647 // use fwd copy when (d-s) above_equal (count*size) 1648 __ sub(rscratch1, d, s); 1649 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1650 __ br(Assembler::HS, nooverlap_target); 1651 1652 if (is_oop) { 1653 __ push(RegSet::of(d, count), sp); 1654 // no registers are destroyed by this call 1655 gen_write_ref_array_pre_barrier(d, count, dest_uninitialized); 1656 } 1657 copy_memory(aligned, s, d, count, rscratch1, -size); 1658 if (is_oop) { 1659 __ pop(RegSet::of(d, count), sp); 1660 if (VerifyOops) 1661 verify_oop_array(size, d, count, r16); 1662 __ sub(count, count, 1); // make an inclusive end pointer 1663 __ lea(count, Address(d, count, Address::lsl(exact_log2(size)))); 1664 gen_write_ref_array_post_barrier(d, count, rscratch1); 1665 } 1666 __ leave(); 1667 __ mov(r0, zr); // return 0 1668 __ ret(lr); 1669 #ifdef BUILTIN_SIM 1670 { 1671 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1672 sim->notifyCompile(const_cast<char*>(name), start); 1673 } 1674 #endif 1675 return start; 1676 } 1677 1678 // Arguments: 1679 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1680 // ignored 1681 // name - stub name string 1682 // 1683 // Inputs: 1684 // c_rarg0 - source array address 1685 // c_rarg1 - destination array address 1686 // c_rarg2 - element count, treated as ssize_t, can be zero 1687 // 1688 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1689 // we let the hardware handle it. The one to eight bytes within words, 1690 // dwords or qwords that span cache line boundaries will still be loaded 1691 // and stored atomically. 1692 // 1693 // Side Effects: 1694 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1695 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1696 // we let the hardware handle it. The one to eight bytes within words, 1697 // dwords or qwords that span cache line boundaries will still be loaded 1698 // and stored atomically. 1699 // 1700 // Side Effects: 1701 // disjoint_byte_copy_entry is set to the no-overlap entry point 1702 // used by generate_conjoint_byte_copy(). 1703 // 1704 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1705 const bool not_oop = false; 1706 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1707 } 1708 1709 // Arguments: 1710 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1711 // ignored 1712 // name - stub name string 1713 // 1714 // Inputs: 1715 // c_rarg0 - source array address 1716 // c_rarg1 - destination array address 1717 // c_rarg2 - element count, treated as ssize_t, can be zero 1718 // 1719 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1720 // we let the hardware handle it. The one to eight bytes within words, 1721 // dwords or qwords that span cache line boundaries will still be loaded 1722 // and stored atomically. 
1723 //
1724 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1725 address* entry, const char *name) {
1726 const bool not_oop = false;
1727 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1728 }
1729
1730 // Arguments:
1731 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1732 // ignored
1733 // name - stub name string
1734 //
1735 // Inputs:
1736 // c_rarg0 - source array address
1737 // c_rarg1 - destination array address
1738 // c_rarg2 - element count, treated as ssize_t, can be zero
1739 //
1740 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1741 // let the hardware handle it. The two or four words within dwords
1742 // or qwords that span cache line boundaries will still be loaded
1743 // and stored atomically.
1744 //
1745 // Side Effects:
1746 // disjoint_short_copy_entry is set to the no-overlap entry point
1747 // used by generate_conjoint_short_copy().
1748 //
1749 address generate_disjoint_short_copy(bool aligned,
1750 address* entry, const char *name) {
1751 const bool not_oop = false;
1752 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1753 }
1754
1755 // Arguments:
1756 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1757 // ignored
1758 // name - stub name string
1759 //
1760 // Inputs:
1761 // c_rarg0 - source array address
1762 // c_rarg1 - destination array address
1763 // c_rarg2 - element count, treated as ssize_t, can be zero
1764 //
1765 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1766 // let the hardware handle it. The two or four words within dwords
1767 // or qwords that span cache line boundaries will still be loaded
1768 // and stored atomically.
1769 //
1770 address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1771 address *entry, const char *name) {
1772 const bool not_oop = false;
1773 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1774 }
1775
1776 // Arguments:
1777 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1778 // ignored
1779 // name - stub name string
1780 //
1781 // Inputs:
1782 // c_rarg0 - source array address
1783 // c_rarg1 - destination array address
1784 // c_rarg2 - element count, treated as ssize_t, can be zero
1785 //
1786 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1787 // the hardware handle it. The two dwords within qwords that span
1788 // cache line boundaries will still be loaded and stored atomically.
1789 //
1790 // Side Effects:
1791 // disjoint_int_copy_entry is set to the no-overlap entry point
1792 // used by generate_conjoint_int_oop_copy().
1793 //
1794 address generate_disjoint_int_copy(bool aligned, address *entry,
1795 const char *name) {
1796 const bool not_oop = false;
1797 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1798 }
1799
1800 // Arguments:
1801 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1802 // ignored
1803 // name - stub name string
1804 //
1805 // Inputs:
1806 // c_rarg0 - source array address
1807 // c_rarg1 - destination array address
1808 // c_rarg2 - element count, treated as ssize_t, can be zero
1809 //
1810 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1811 // the hardware handle it. The two dwords within qwords that span
1812 // cache line boundaries will still be loaded and stored atomically.
1813 //
1814 address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1815 address *entry, const char *name,
1816 bool dest_uninitialized = false) {
1817 const bool not_oop = false;
1818 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1819 }
1820
1821
1822 // Arguments:
1823 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1824 // ignored
1825 // name - stub name string
1826 //
1827 // Inputs:
1828 // c_rarg0 - source array address
1829 // c_rarg1 - destination array address
1830 // c_rarg2 - element count, treated as size_t, can be zero
1831 //
1832 // Side Effects:
1833 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1834 // no-overlap entry point used by generate_conjoint_long_oop_copy().
1835 //
1836 address generate_disjoint_long_copy(bool aligned, address *entry,
1837 const char *name, bool dest_uninitialized = false) {
1838 const bool not_oop = false;
1839 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1840 }
1841
1842 // Arguments:
1843 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1844 // ignored
1845 // name - stub name string
1846 //
1847 // Inputs:
1848 // c_rarg0 - source array address
1849 // c_rarg1 - destination array address
1850 // c_rarg2 - element count, treated as size_t, can be zero
1851 //
1852 address generate_conjoint_long_copy(bool aligned,
1853 address nooverlap_target, address *entry,
1854 const char *name, bool dest_uninitialized = false) {
1855 const bool not_oop = false;
1856 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1857 }
1858
1859 // Arguments:
1860 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1861 // ignored
1862 // name - stub name string
1863 //
1864 // Inputs:
1865 // c_rarg0 - source array address
1866 // c_rarg1 - destination array address
1867 // c_rarg2 - element count, treated as size_t, can be zero
1868 //
1869 // Side Effects:
1870 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1871 // no-overlap entry point used by generate_conjoint_long_oop_copy().
1872 //
1873 address generate_disjoint_oop_copy(bool aligned, address *entry,
1874 const char *name, bool dest_uninitialized) {
1875 const bool is_oop = true;
1876 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1877 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1878 }
1879
1880 // Arguments:
1881 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1882 // ignored
1883 // name - stub name string
1884 //
1885 // Inputs:
1886 // c_rarg0 - source array address
1887 // c_rarg1 - destination array address
1888 // c_rarg2 - element count, treated as size_t, can be zero
1889 //
1890 address generate_conjoint_oop_copy(bool aligned,
1891 address nooverlap_target, address *entry,
1892 const char *name, bool dest_uninitialized) {
1893 const bool is_oop = true;
1894 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1895 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1896 name, dest_uninitialized);
1897 }
1898
1899
1900 // Helper for generating a dynamic type check.
1901 // Smashes rscratch1.
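//
// Conceptually the check below does (a sketch; the real work is the
// two-part fast/slow path emitted by check_klass_subtype_fast_path
// and check_klass_subtype_slow_path):
//
//   if (sub_klass == super_klass) goto L_success;        // trivial hit
//   if (*(address)(sub_klass + super_check_offset) == super_klass)
//     goto L_success;                                    // cached/primary hit
//   if (linear_scan(sub_klass->secondary_supers, super_klass))
//     goto L_success;                                    // slow path hit
//   /* otherwise fall through => failure */
//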
1902 void generate_type_check(Register sub_klass,
1903 Register super_check_offset,
1904 Register super_klass,
1905 Label& L_success) {
1906 assert_different_registers(sub_klass, super_check_offset, super_klass);
1907
1908 BLOCK_COMMENT("type_check:");
1909
1910 Label L_miss;
1911
1912 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
1913 super_check_offset);
1914 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1915
1916 // Fall through on failure!
1917 __ BIND(L_miss);
1918 }
1919
1920 //
1921 // Generate checkcasting array copy stub
1922 //
1923 // Input:
1924 // c_rarg0 - source array address
1925 // c_rarg1 - destination array address
1926 // c_rarg2 - element count, treated as ssize_t, can be zero
1927 // c_rarg3 - size_t ckoff (super_check_offset)
1928 // c_rarg4 - oop ckval (super_klass)
1929 //
1930 // Output:
1931 // r0 == 0 - success
1932 // r0 == -1^K - failure, where K is partial transfer count
1933 //
1934 address generate_checkcast_copy(const char *name, address *entry,
1935 bool dest_uninitialized = false) {
1936
1937 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1938
1939 // Input registers (after setup_arg_regs)
1940 const Register from = c_rarg0; // source array address
1941 const Register to = c_rarg1; // destination array address
1942 const Register count = c_rarg2; // elements count
1943 const Register ckoff = c_rarg3; // super_check_offset
1944 const Register ckval = c_rarg4; // super_klass
1945
1946 // Registers used as temps (r18, r19, r20 are save-on-entry)
1947 const Register count_save = r21; // orig elements count
1948 const Register start_to = r20; // destination array start address
1949 const Register copied_oop = r18; // actual oop copied
1950 const Register r19_klass = r19; // oop._klass
1951
1952 //---------------------------------------------------------------
1953 // Assembler stub will be used for this call to arraycopy
1954 // if the two arrays are subtypes of Object[] but the
1955 // destination array type is not equal to or a supertype
1956 // of the source type. Each element must be separately
1957 // checked.
1958
1959 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1960 copied_oop, r19_klass, count_save);
1961
1962 __ align(CodeEntryAlignment);
1963 StubCodeMark mark(this, "StubRoutines", name);
1964 address start = __ pc();
1965
1966 __ enter(); // required for proper stackwalking of RuntimeStub frame
1967
1968 #ifdef ASSERT
1969 // caller guarantees that the arrays really are different
1970 // otherwise, we would have to make conjoint checks
1971 { Label L;
1972 array_overlap_test(L, TIMES_OOP);
1973 __ stop("checkcast_copy within a single array");
1974 __ bind(L);
1975 }
1976 #endif //ASSERT
1977
1978 // Caller of this entry point must set up the argument registers.
1979 if (entry != NULL) {
1980 *entry = __ pc();
1981 BLOCK_COMMENT("Entry:");
1982 }
1983
1984 // Empty array: Nothing to do.
1985 __ cbz(count, L_done);
1986
1987 __ push(RegSet::of(r18, r19, r20, r21), sp);
1988
1989 #ifdef ASSERT
1990 BLOCK_COMMENT("assert consistent ckoff/ckval");
1991 // The ckoff and ckval must be mutually consistent,
1992 // even though caller generates both.
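// In pseudo-code the assertion below is simply:
//   assert(ckoff == ckval->super_check_offset(), "inconsistent ckoff/ckval");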
1993 { Label L;
1994 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1995 __ ldrw(start_to, Address(ckval, sco_offset));
1996 __ cmpw(ckoff, start_to);
1997 __ br(Assembler::EQ, L);
1998 __ stop("super_check_offset inconsistent");
1999 __ bind(L);
2000 }
2001 #endif //ASSERT
2002
2003 gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2004
2005 // save the original count
2006 __ mov(count_save, count);
2007
2008 // Copy from low to high addresses
2009 __ mov(start_to, to); // Save destination array start address
2010 __ b(L_load_element);
2011
2012 // ======== begin loop ========
2013 // (Loop is rotated; its entry is L_load_element.)
2014 // Loop control:
2015 // for (; count != 0; count--) {
2016 // copied_oop = load_heap_oop(from++);
2017 // ... generate_type_check ...;
2018 // store_heap_oop(to++, copied_oop);
2019 // }
2020 __ align(OptoLoopAlignment);
2021
2022 __ BIND(L_store_element);
2023 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop
2024 __ sub(count, count, 1);
2025 __ cbz(count, L_do_card_marks);
2026
2027 // ======== loop entry is here ========
2028 __ BIND(L_load_element);
2029 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
2030 __ cbz(copied_oop, L_store_element);
2031
2032 __ load_klass(r19_klass, copied_oop);// query the object klass
2033 generate_type_check(r19_klass, ckoff, ckval, L_store_element);
2034 // ======== end loop ========
2035
2036 // It was a real error; we must depend on the caller to finish the job.
2037 // Register count = remaining oops, count_orig = total oops.
2038 // Emit GC store barriers for the oops we have copied and report
2039 // their number to the caller.
2040
2041 __ subs(count, count_save, count); // K = partially copied oop count
2042 __ eon(count, count, zr); // report (-1^K) to caller
2043 __ br(Assembler::EQ, L_done_pop);
2044
2045 __ BIND(L_do_card_marks);
2046 __ add(to, to, -heapOopSize); // make an inclusive end pointer
2047 gen_write_ref_array_post_barrier(start_to, to, rscratch1);
2048
2049 __ bind(L_done_pop);
2050 __ pop(RegSet::of(r18, r19, r20, r21), sp);
2051 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2052
2053 __ bind(L_done);
2054 __ mov(r0, count);
2055 __ leave();
2056 __ ret(lr);
2057
2058 return start;
2059 }
2060
2061 // Perform range checks on the proposed arraycopy.
2062 // Kills temp, but nothing else.
2063 // Also, clean the high 32 bits of 'src_pos' and 'dst_pos'.
2064 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2065 Register src_pos, // source position (c_rarg1)
2066 Register dst, // destination array oop (c_rarg2)
2067 Register dst_pos, // destination position (c_rarg3)
2068 Register length,
2069 Register temp,
2070 Label& L_failed) {
2071 BLOCK_COMMENT("arraycopy_range_checks:");
2072
2073 assert_different_registers(rscratch1, temp);
2074
2075 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2076 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2077 __ addw(temp, length, src_pos);
2078 __ cmpw(temp, rscratch1);
2079 __ br(Assembler::HI, L_failed);
2080
2081 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2082 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2083 __ addw(temp, length, dst_pos);
2084 __ cmpw(temp, rscratch1);
2085 __ br(Assembler::HI, L_failed);
2086
2087 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
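// (A 32-bit register write zero-extends on AArch64, so 'movw(r, r)'
// clears bits 63:32; in effect: src_pos = (uint64_t)(uint32_t)src_pos.)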
2088 __ movw(src_pos, src_pos); 2089 __ movw(dst_pos, dst_pos); 2090 2091 BLOCK_COMMENT("arraycopy_range_checks done"); 2092 } 2093 2094 // These stubs get called from some dumb test routine. 2095 // I'll write them properly when they're called from 2096 // something that's actually doing something. 2097 static void fake_arraycopy_stub(address src, address dst, int count) { 2098 assert(count == 0, "huh?"); 2099 } 2100 2101 2102 // 2103 // Generate 'unsafe' array copy stub 2104 // Though just as safe as the other stubs, it takes an unscaled 2105 // size_t argument instead of an element count. 2106 // 2107 // Input: 2108 // c_rarg0 - source array address 2109 // c_rarg1 - destination array address 2110 // c_rarg2 - byte count, treated as ssize_t, can be zero 2111 // 2112 // Examines the alignment of the operands and dispatches 2113 // to a long, int, short, or byte copy loop. 2114 // 2115 address generate_unsafe_copy(const char *name, 2116 address byte_copy_entry, 2117 address short_copy_entry, 2118 address int_copy_entry, 2119 address long_copy_entry) { 2120 Label L_long_aligned, L_int_aligned, L_short_aligned; 2121 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2122 2123 __ align(CodeEntryAlignment); 2124 StubCodeMark mark(this, "StubRoutines", name); 2125 address start = __ pc(); 2126 __ enter(); // required for proper stackwalking of RuntimeStub frame 2127 2128 // bump this on entry, not on exit: 2129 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2130 2131 __ orr(rscratch1, s, d); 2132 __ orr(rscratch1, rscratch1, count); 2133 2134 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2135 __ cbz(rscratch1, L_long_aligned); 2136 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2137 __ cbz(rscratch1, L_int_aligned); 2138 __ tbz(rscratch1, 0, L_short_aligned); 2139 __ b(RuntimeAddress(byte_copy_entry)); 2140 2141 __ BIND(L_short_aligned); 2142 __ lsr(count, count, LogBytesPerShort); // size => short_count 2143 __ b(RuntimeAddress(short_copy_entry)); 2144 __ BIND(L_int_aligned); 2145 __ lsr(count, count, LogBytesPerInt); // size => int_count 2146 __ b(RuntimeAddress(int_copy_entry)); 2147 __ BIND(L_long_aligned); 2148 __ lsr(count, count, LogBytesPerLong); // size => long_count 2149 __ b(RuntimeAddress(long_copy_entry)); 2150 2151 return start; 2152 } 2153 2154 // 2155 // Generate generic array copy stubs 2156 // 2157 // Input: 2158 // c_rarg0 - src oop 2159 // c_rarg1 - src_pos (32-bits) 2160 // c_rarg2 - dst oop 2161 // c_rarg3 - dst_pos (32-bits) 2162 // c_rarg4 - element count (32-bits) 2163 // 2164 // Output: 2165 // r0 == 0 - success 2166 // r0 == -1^K - failure, where K is partial transfer count 2167 // 2168 address generate_generic_copy(const char *name, 2169 address byte_copy_entry, address short_copy_entry, 2170 address int_copy_entry, address oop_copy_entry, 2171 address long_copy_entry, address checkcast_copy_entry) { 2172 2173 Label L_failed, L_failed_0, L_objArray; 2174 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2175 2176 // Input registers 2177 const Register src = c_rarg0; // source array oop 2178 const Register src_pos = c_rarg1; // source position 2179 const Register dst = c_rarg2; // destination array oop 2180 const Register dst_pos = c_rarg3; // destination position 2181 const Register length = c_rarg4; 2182 2183 StubCodeMark mark(this, "StubRoutines", name); 2184 2185 __ align(CodeEntryAlignment); 2186 address start = __ pc(); 2187 2188 __ enter(); // required for proper stackwalking of RuntimeStub frame 2189 2190 // bump this on entry, not on 
exit: 2191 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2192 2193 //----------------------------------------------------------------------- 2194 // Assembler stub will be used for this call to arraycopy 2195 // if the following conditions are met: 2196 // 2197 // (1) src and dst must not be null. 2198 // (2) src_pos must not be negative. 2199 // (3) dst_pos must not be negative. 2200 // (4) length must not be negative. 2201 // (5) src klass and dst klass should be the same and not NULL. 2202 // (6) src and dst should be arrays. 2203 // (7) src_pos + length must not exceed length of src. 2204 // (8) dst_pos + length must not exceed length of dst. 2205 // 2206 2207 // if (src == NULL) return -1; 2208 __ cbz(src, L_failed); 2209 2210 // if (src_pos < 0) return -1; 2211 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2212 2213 // if (dst == NULL) return -1; 2214 __ cbz(dst, L_failed); 2215 2216 // if (dst_pos < 0) return -1; 2217 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2218 2219 // registers used as temp 2220 const Register scratch_length = r16; // elements count to copy 2221 const Register scratch_src_klass = r17; // array klass 2222 const Register lh = r18; // layout helper 2223 2224 // if (length < 0) return -1; 2225 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2226 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2227 2228 __ load_klass(scratch_src_klass, src); 2229 #ifdef ASSERT 2230 // assert(src->klass() != NULL); 2231 { 2232 BLOCK_COMMENT("assert klasses not null {"); 2233 Label L1, L2; 2234 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2235 __ bind(L1); 2236 __ stop("broken null klass"); 2237 __ bind(L2); 2238 __ load_klass(rscratch1, dst); 2239 __ cbz(rscratch1, L1); // this would be broken also 2240 BLOCK_COMMENT("} assert klasses not null done"); 2241 } 2242 #endif 2243 2244 // Load layout helper (32-bits) 2245 // 2246 // |array_tag| | header_size | element_type | |log2_element_size| 2247 // 32 30 24 16 8 2 0 2248 // 2249 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2250 // 2251 2252 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2253 2254 // Handle objArrays completely differently... 2255 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2256 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2257 __ movw(rscratch1, objArray_lh); 2258 __ eorw(rscratch2, lh, rscratch1); 2259 __ cbzw(rscratch2, L_objArray); 2260 2261 // if (src->klass() != dst->klass()) return -1; 2262 __ load_klass(rscratch2, dst); 2263 __ eor(rscratch2, rscratch2, scratch_src_klass); 2264 __ cbnz(rscratch2, L_failed); 2265 2266 // if (!src->is_Array()) return -1; 2267 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2268 2269 // At this point, it is known to be a typeArray (array_tag 0x3). 
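// From here the stub needs just two fields of the layout helper; in
// rough C (a sketch only, masks and shifts as in the diagram above):
//
//   int hsize = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
//   int l2es  = lh & Klass::_lh_log2_element_size_mask;  // log2(element size)
//   from = src + hsize + (src_pos << l2es);
//   to   = dst + hsize + (dst_pos << l2es);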
2270 #ifdef ASSERT 2271 { 2272 BLOCK_COMMENT("assert primitive array {"); 2273 Label L; 2274 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2275 __ cmpw(lh, rscratch2); 2276 __ br(Assembler::GE, L); 2277 __ stop("must be a primitive array"); 2278 __ bind(L); 2279 BLOCK_COMMENT("} assert primitive array done"); 2280 } 2281 #endif 2282 2283 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2284 rscratch2, L_failed); 2285 2286 // TypeArrayKlass 2287 // 2288 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2289 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2290 // 2291 2292 const Register rscratch1_offset = rscratch1; // array offset 2293 const Register r18_elsize = lh; // element size 2294 2295 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2296 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2297 __ add(src, src, rscratch1_offset); // src array offset 2298 __ add(dst, dst, rscratch1_offset); // dst array offset 2299 BLOCK_COMMENT("choose copy loop based on element size"); 2300 2301 // next registers should be set before the jump to corresponding stub 2302 const Register from = c_rarg0; // source array address 2303 const Register to = c_rarg1; // destination array address 2304 const Register count = c_rarg2; // elements count 2305 2306 // 'from', 'to', 'count' registers should be set in such order 2307 // since they are the same as 'src', 'src_pos', 'dst'. 2308 2309 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2310 2311 // The possible values of elsize are 0-3, i.e. exact_log2(element 2312 // size in bytes). We do a simple bitwise binary search. 2313 __ BIND(L_copy_bytes); 2314 __ tbnz(r18_elsize, 1, L_copy_ints); 2315 __ tbnz(r18_elsize, 0, L_copy_shorts); 2316 __ lea(from, Address(src, src_pos));// src_addr 2317 __ lea(to, Address(dst, dst_pos));// dst_addr 2318 __ movw(count, scratch_length); // length 2319 __ b(RuntimeAddress(byte_copy_entry)); 2320 2321 __ BIND(L_copy_shorts); 2322 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2323 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2324 __ movw(count, scratch_length); // length 2325 __ b(RuntimeAddress(short_copy_entry)); 2326 2327 __ BIND(L_copy_ints); 2328 __ tbnz(r18_elsize, 0, L_copy_longs); 2329 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2330 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2331 __ movw(count, scratch_length); // length 2332 __ b(RuntimeAddress(int_copy_entry)); 2333 2334 __ BIND(L_copy_longs); 2335 #ifdef ASSERT 2336 { 2337 BLOCK_COMMENT("assert long copy {"); 2338 Label L; 2339 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2340 __ cmpw(r18_elsize, LogBytesPerLong); 2341 __ br(Assembler::EQ, L); 2342 __ stop("must be long copy, but elsize is wrong"); 2343 __ bind(L); 2344 BLOCK_COMMENT("} assert long copy done"); 2345 } 2346 #endif 2347 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2348 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2349 __ movw(count, scratch_length); // length 2350 __ b(RuntimeAddress(long_copy_entry)); 2351 2352 // ObjArrayKlass 2353 __ BIND(L_objArray); 2354 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2355 2356 Label L_plain_copy, L_checkcast_copy; 2357 // test array classes for subtyping 2358 __ load_klass(r18, dst); 2359 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2360 __ br(Assembler::NE, L_checkcast_copy); 2361 2362 // Identically typed arrays can be copied without element-wise checks. 2363 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2364 rscratch2, L_failed); 2365 2366 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2367 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2368 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2369 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2370 __ movw(count, scratch_length); // length 2371 __ BIND(L_plain_copy); 2372 __ b(RuntimeAddress(oop_copy_entry)); 2373 2374 __ BIND(L_checkcast_copy); 2375 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2376 { 2377 // Before looking at dst.length, make sure dst is also an objArray. 2378 __ ldrw(rscratch1, Address(r18, lh_offset)); 2379 __ movw(rscratch2, objArray_lh); 2380 __ eorw(rscratch1, rscratch1, rscratch2); 2381 __ cbnzw(rscratch1, L_failed); 2382 2383 // It is safe to examine both src.length and dst.length. 2384 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2385 r18, L_failed); 2386 2387 const Register rscratch2_dst_klass = rscratch2; 2388 __ load_klass(rscratch2_dst_klass, dst); // reload 2389 2390 // Marshal the base address arguments now, freeing registers. 2391 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2392 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2393 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2394 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2395 __ movw(count, length); // length (reloaded) 2396 Register sco_temp = c_rarg3; // this register is free now 2397 assert_different_registers(from, to, count, sco_temp, 2398 rscratch2_dst_klass, scratch_src_klass); 2399 // assert_clean_int(count, sco_temp); 2400 2401 // Generate the type check. 2402 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2403 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2404 // assert_clean_int(sco_temp, r18); 2405 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2406 2407 // Fetch destination element klass from the ObjArrayKlass header. 2408 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2409 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2410 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2411 2412 // the checkcast_copy loop needs two extra arguments: 2413 assert(c_rarg3 == sco_temp, "#3 already in place"); 2414 // Set up arguments for checkcast_copy_entry. 2415 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2416 __ b(RuntimeAddress(checkcast_copy_entry)); 2417 } 2418 2419 __ BIND(L_failed); 2420 __ mov(r0, -1); 2421 __ leave(); // required for proper stackwalking of RuntimeStub frame 2422 __ ret(lr); 2423 2424 return start; 2425 } 2426 2427 // 2428 // Generate stub for array fill. If "aligned" is true, the 2429 // "to" address is assumed to be heapword aligned. 
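// Semantically the stub implements, for the given element type
// (a sketch; the generated code widens 'value' and fills a word
// at a time):
//
//   void fill(elem_t *to, jint value, int count) {
//     while (count-- > 0) *to++ = (elem_t)value;
//   }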
2430 //
2431 // Arguments for generated stub:
2432 // to: c_rarg0
2433 // value: c_rarg1
2434 // count: c_rarg2 treated as signed
2435 //
2436 address generate_fill(BasicType t, bool aligned, const char *name) {
2437 __ align(CodeEntryAlignment);
2438 StubCodeMark mark(this, "StubRoutines", name);
2439 address start = __ pc();
2440
2441 BLOCK_COMMENT("Entry:");
2442
2443 const Register to = c_rarg0; // destination array address
2444 const Register value = c_rarg1; // value
2445 const Register count = c_rarg2; // elements count
2446
2447 const Register bz_base = r10; // base for block_zero routine
2448 const Register cnt_words = r11; // temp register
2449
2450 __ enter();
2451
2452 Label L_fill_elements, L_exit1;
2453
2454 int shift = -1;
2455 switch (t) {
2456 case T_BYTE:
2457 shift = 0;
2458 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2459 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2460 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2461 __ br(Assembler::LO, L_fill_elements);
2462 break;
2463 case T_SHORT:
2464 shift = 1;
2465 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2466 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2467 __ br(Assembler::LO, L_fill_elements);
2468 break;
2469 case T_INT:
2470 shift = 2;
2471 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2472 __ br(Assembler::LO, L_fill_elements);
2473 break;
2474 default: ShouldNotReachHere();
2475 }
2476
2477 // Align the destination address to an 8-byte boundary.
2478 Label L_skip_align1, L_skip_align2, L_skip_align4;
2479 if (!aligned) {
2480 switch (t) {
2481 case T_BYTE:
2482 // One byte misalignment happens only for byte arrays.
2483 __ tbz(to, 0, L_skip_align1);
2484 __ strb(value, Address(__ post(to, 1)));
2485 __ subw(count, count, 1);
2486 __ bind(L_skip_align1);
2487 // Fallthrough
2488 case T_SHORT:
2489 // Two bytes misalignment happens only for byte and short (char) arrays.
2490 __ tbz(to, 1, L_skip_align2);
2491 __ strh(value, Address(__ post(to, 2)));
2492 __ subw(count, count, 2 >> shift);
2493 __ bind(L_skip_align2);
2494 // Fallthrough
2495 case T_INT:
2496 // Align to 8 bytes, we know we are 4 byte aligned to start.
2497 __ tbz(to, 2, L_skip_align4);
2498 __ strw(value, Address(__ post(to, 4)));
2499 __ subw(count, count, 4 >> shift);
2500 __ bind(L_skip_align4);
2501 break;
2502 default: ShouldNotReachHere();
2503 }
2504 }
2505
2506 //
2507 // Fill large chunks
2508 //
2509 __ lsrw(cnt_words, count, 3 - shift); // number of words
2510 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2511 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2512 if (UseBlockZeroing) {
2513 Label non_block_zeroing, rest;
2514 Register tmp = rscratch1;
2515 // count >= BlockZeroingLowLimit && value == 0
2516 __ subs(tmp, cnt_words, BlockZeroingLowLimit >> 3);
2517 __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
2518 __ br(Assembler::NE, non_block_zeroing);
2519 __ mov(bz_base, to);
2520 __ block_zero(bz_base, cnt_words, true);
2521 __ mov(to, bz_base);
2522 __ b(rest);
2523 __ bind(non_block_zeroing);
2524 __ fill_words(to, cnt_words, value);
2525 __ bind(rest);
2526 }
2527 else {
2528 __ fill_words(to, cnt_words, value);
2529 }
2530
2531 // Remaining count is less than 8 bytes. Fill it by a single store.
2532 // Note that the total length is no less than 8 bytes.
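// The tail is finished with one overlapping 64-bit store: at least
// 8 bytes were already filled above, so backing up to 'end - 8' only
// rewrites bytes that already hold the fill pattern. Roughly:
//   *(uint64_t *)(to + (count << shift) - 8) = value; // 64-bit pattern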
2533 if (t == T_BYTE || t == T_SHORT) { 2534 Label L_exit1; 2535 __ cbzw(count, L_exit1); 2536 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2537 __ str(value, Address(to, -8)); // overwrite some elements 2538 __ bind(L_exit1); 2539 __ leave(); 2540 __ ret(lr); 2541 } 2542 2543 // Handle copies less than 8 bytes. 2544 Label L_fill_2, L_fill_4, L_exit2; 2545 __ bind(L_fill_elements); 2546 switch (t) { 2547 case T_BYTE: 2548 __ tbz(count, 0, L_fill_2); 2549 __ strb(value, Address(__ post(to, 1))); 2550 __ bind(L_fill_2); 2551 __ tbz(count, 1, L_fill_4); 2552 __ strh(value, Address(__ post(to, 2))); 2553 __ bind(L_fill_4); 2554 __ tbz(count, 2, L_exit2); 2555 __ strw(value, Address(to)); 2556 break; 2557 case T_SHORT: 2558 __ tbz(count, 0, L_fill_4); 2559 __ strh(value, Address(__ post(to, 2))); 2560 __ bind(L_fill_4); 2561 __ tbz(count, 1, L_exit2); 2562 __ strw(value, Address(to)); 2563 break; 2564 case T_INT: 2565 __ cbzw(count, L_exit2); 2566 __ strw(value, Address(to)); 2567 break; 2568 default: ShouldNotReachHere(); 2569 } 2570 __ bind(L_exit2); 2571 __ leave(); 2572 __ ret(lr); 2573 return start; 2574 } 2575 2576 void generate_arraycopy_stubs() { 2577 address entry; 2578 address entry_jbyte_arraycopy; 2579 address entry_jshort_arraycopy; 2580 address entry_jint_arraycopy; 2581 address entry_oop_arraycopy; 2582 address entry_jlong_arraycopy; 2583 address entry_checkcast_arraycopy; 2584 2585 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2586 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2587 2588 StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11); 2589 2590 //*** jbyte 2591 // Always need aligned and unaligned versions 2592 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2593 "jbyte_disjoint_arraycopy"); 2594 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2595 &entry_jbyte_arraycopy, 2596 "jbyte_arraycopy"); 2597 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2598 "arrayof_jbyte_disjoint_arraycopy"); 2599 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2600 "arrayof_jbyte_arraycopy"); 2601 2602 //*** jshort 2603 // Always need aligned and unaligned versions 2604 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2605 "jshort_disjoint_arraycopy"); 2606 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2607 &entry_jshort_arraycopy, 2608 "jshort_arraycopy"); 2609 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2610 "arrayof_jshort_disjoint_arraycopy"); 2611 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2612 "arrayof_jshort_arraycopy"); 2613 2614 //*** jint 2615 // Aligned versions 2616 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2617 "arrayof_jint_disjoint_arraycopy"); 2618 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2619 "arrayof_jint_arraycopy"); 2620 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2621 // entry_jint_arraycopy always points to the unaligned version 2622 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2623 "jint_disjoint_arraycopy"); 2624 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2625 &entry_jint_arraycopy, 2626 "jint_arraycopy"); 2627 2628 //*** jlong 2629 // It is always aligned 2630 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2631 "arrayof_jlong_disjoint_arraycopy"); 2632 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2633 "arrayof_jlong_arraycopy"); 2634 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2635 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2636 2637 //*** oops 2638 { 2639 // With compressed oops we need unaligned versions; notice that 2640 // we overwrite entry_oop_arraycopy. 2641 bool aligned = !UseCompressedOops; 2642 2643 StubRoutines::_arrayof_oop_disjoint_arraycopy 2644 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2645 /*dest_uninitialized*/false); 2646 StubRoutines::_arrayof_oop_arraycopy 2647 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2648 /*dest_uninitialized*/false); 2649 // Aligned versions without pre-barriers 2650 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2651 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2652 /*dest_uninitialized*/true); 2653 StubRoutines::_arrayof_oop_arraycopy_uninit 2654 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2655 /*dest_uninitialized*/true); 2656 } 2657 2658 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2659 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2660 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2661 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2662 2663 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2664 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2665 /*dest_uninitialized*/true); 2666 2667 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2668 entry_jbyte_arraycopy, 2669 entry_jshort_arraycopy, 2670 entry_jint_arraycopy, 2671 entry_jlong_arraycopy); 2672 2673 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2674 entry_jbyte_arraycopy, 2675 entry_jshort_arraycopy, 2676 entry_jint_arraycopy, 2677 entry_oop_arraycopy, 2678 entry_jlong_arraycopy, 2679 entry_checkcast_arraycopy); 2680 2681 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2682 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2683 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2684 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2685 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2686 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2687 } 2688 2689 void generate_math_stubs() { Unimplemented(); } 2690 2691 // Arguments: 2692 // 2693 // Inputs: 2694 // c_rarg0 - source byte array address 2695 // c_rarg1 - destination 
byte array address 2696 // c_rarg2 - K (key) in little endian int array 2697 // 2698 address generate_aescrypt_encryptBlock() { 2699 __ align(CodeEntryAlignment); 2700 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2701 2702 Label L_doLast; 2703 2704 const Register from = c_rarg0; // source array address 2705 const Register to = c_rarg1; // destination array address 2706 const Register key = c_rarg2; // key array address 2707 const Register keylen = rscratch1; 2708 2709 address start = __ pc(); 2710 __ enter(); 2711 2712 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2713 2714 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2715 2716 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2717 __ rev32(v1, __ T16B, v1); 2718 __ rev32(v2, __ T16B, v2); 2719 __ rev32(v3, __ T16B, v3); 2720 __ rev32(v4, __ T16B, v4); 2721 __ aese(v0, v1); 2722 __ aesmc(v0, v0); 2723 __ aese(v0, v2); 2724 __ aesmc(v0, v0); 2725 __ aese(v0, v3); 2726 __ aesmc(v0, v0); 2727 __ aese(v0, v4); 2728 __ aesmc(v0, v0); 2729 2730 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2731 __ rev32(v1, __ T16B, v1); 2732 __ rev32(v2, __ T16B, v2); 2733 __ rev32(v3, __ T16B, v3); 2734 __ rev32(v4, __ T16B, v4); 2735 __ aese(v0, v1); 2736 __ aesmc(v0, v0); 2737 __ aese(v0, v2); 2738 __ aesmc(v0, v0); 2739 __ aese(v0, v3); 2740 __ aesmc(v0, v0); 2741 __ aese(v0, v4); 2742 __ aesmc(v0, v0); 2743 2744 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2745 __ rev32(v1, __ T16B, v1); 2746 __ rev32(v2, __ T16B, v2); 2747 2748 __ cmpw(keylen, 44); 2749 __ br(Assembler::EQ, L_doLast); 2750 2751 __ aese(v0, v1); 2752 __ aesmc(v0, v0); 2753 __ aese(v0, v2); 2754 __ aesmc(v0, v0); 2755 2756 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2757 __ rev32(v1, __ T16B, v1); 2758 __ rev32(v2, __ T16B, v2); 2759 2760 __ cmpw(keylen, 52); 2761 __ br(Assembler::EQ, L_doLast); 2762 2763 __ aese(v0, v1); 2764 __ aesmc(v0, v0); 2765 __ aese(v0, v2); 2766 __ aesmc(v0, v0); 2767 2768 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2769 __ rev32(v1, __ T16B, v1); 2770 __ rev32(v2, __ T16B, v2); 2771 2772 __ BIND(L_doLast); 2773 2774 __ aese(v0, v1); 2775 __ aesmc(v0, v0); 2776 __ aese(v0, v2); 2777 2778 __ ld1(v1, __ T16B, key); 2779 __ rev32(v1, __ T16B, v1); 2780 __ eor(v0, __ T16B, v0, v1); 2781 2782 __ st1(v0, __ T16B, to); 2783 2784 __ mov(r0, 0); 2785 2786 __ leave(); 2787 __ ret(lr); 2788 2789 return start; 2790 } 2791 2792 // Arguments: 2793 // 2794 // Inputs: 2795 // c_rarg0 - source byte array address 2796 // c_rarg1 - destination byte array address 2797 // c_rarg2 - K (key) in little endian int array 2798 // 2799 address generate_aescrypt_decryptBlock() { 2800 assert(UseAES, "need AES instructions and misaligned SSE support"); 2801 __ align(CodeEntryAlignment); 2802 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2803 Label L_doLast; 2804 2805 const Register from = c_rarg0; // source array address 2806 const Register to = c_rarg1; // destination array address 2807 const Register key = c_rarg2; // key array address 2808 const Register keylen = rscratch1; 2809 2810 address start = __ pc(); 2811 __ enter(); // required for proper stackwalking of RuntimeStub frame 2812 2813 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2814 2815 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2816 2817 __ ld1(v5, __ T16B, __ post(key, 16)); 2818 __ rev32(v5, __ T16B, v5); 2819 2820 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2821 __ rev32(v1, __ T16B, v1); 2822 __ rev32(v2, __ T16B, v2); 2823 __ rev32(v3, __ T16B, v3); 2824 __ rev32(v4, __ T16B, v4); 2825 __ aesd(v0, v1); 2826 __ aesimc(v0, v0); 2827 __ aesd(v0, v2); 2828 __ aesimc(v0, v0); 2829 __ aesd(v0, v3); 2830 __ aesimc(v0, v0); 2831 __ aesd(v0, v4); 2832 __ aesimc(v0, v0); 2833 2834 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2835 __ rev32(v1, __ T16B, v1); 2836 __ rev32(v2, __ T16B, v2); 2837 __ rev32(v3, __ T16B, v3); 2838 __ rev32(v4, __ T16B, v4); 2839 __ aesd(v0, v1); 2840 __ aesimc(v0, v0); 2841 __ aesd(v0, v2); 2842 __ aesimc(v0, v0); 2843 __ aesd(v0, v3); 2844 __ aesimc(v0, v0); 2845 __ aesd(v0, v4); 2846 __ aesimc(v0, v0); 2847 2848 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2849 __ rev32(v1, __ T16B, v1); 2850 __ rev32(v2, __ T16B, v2); 2851 2852 __ cmpw(keylen, 44); 2853 __ br(Assembler::EQ, L_doLast); 2854 2855 __ aesd(v0, v1); 2856 __ aesimc(v0, v0); 2857 __ aesd(v0, v2); 2858 __ aesimc(v0, v0); 2859 2860 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2861 __ rev32(v1, __ T16B, v1); 2862 __ rev32(v2, __ T16B, v2); 2863 2864 __ cmpw(keylen, 52); 2865 __ br(Assembler::EQ, L_doLast); 2866 2867 __ aesd(v0, v1); 2868 __ aesimc(v0, v0); 2869 __ aesd(v0, v2); 2870 __ aesimc(v0, v0); 2871 2872 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2873 __ rev32(v1, __ T16B, v1); 2874 __ rev32(v2, __ T16B, v2); 2875 2876 __ BIND(L_doLast); 2877 2878 __ aesd(v0, v1); 2879 __ aesimc(v0, v0); 2880 __ aesd(v0, v2); 2881 2882 __ eor(v0, __ T16B, v0, v5); 2883 2884 __ st1(v0, __ T16B, to); 2885 2886 __ mov(r0, 0); 2887 2888 __ leave(); 2889 __ ret(lr); 2890 2891 return start; 2892 } 2893 2894 // Arguments: 2895 // 2896 // Inputs: 2897 // c_rarg0 - source byte array address 2898 // c_rarg1 - destination byte array address 2899 // c_rarg2 - K (key) in little endian int array 2900 // c_rarg3 - r vector byte array address 2901 // c_rarg4 - input length 2902 // 2903 // Output: 2904 // x0 - input length 2905 // 2906 address generate_cipherBlockChaining_encryptAESCrypt() { 2907 assert(UseAES, "need AES instructions and misaligned SSE support"); 2908 __ align(CodeEntryAlignment); 2909 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2910 2911 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2912 2913 const Register from = c_rarg0; // source array address 2914 const Register to = c_rarg1; // destination array address 2915 const Register key = c_rarg2; // key array address 2916 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2917 // and left with the results of the last encryption block 2918 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2919 const Register keylen = rscratch1; 2920 2921 address start = __ pc(); 2922 __ enter(); 2923 2924 __ mov(rscratch2, len_reg); 2925 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2926 2927 __ ld1(v0, __ T16B, rvec); 2928 2929 __ cmpw(keylen, 52); 2930 __ br(Assembler::CC, L_loadkeys_44); 2931 __ br(Assembler::EQ, L_loadkeys_52); 2932 2933 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2934 __ rev32(v17, __ T16B, v17); 2935 __ rev32(v18, __ T16B, v18); 2936 __ BIND(L_loadkeys_52); 2937 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2938 __ rev32(v19, __ T16B, v19); 2939 __ rev32(v20, __ T16B, v20); 2940 __ BIND(L_loadkeys_44); 2941 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2942 __ rev32(v21, __ T16B, v21); 
2943 __ rev32(v22, __ T16B, v22); 2944 __ rev32(v23, __ T16B, v23); 2945 __ rev32(v24, __ T16B, v24); 2946 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2947 __ rev32(v25, __ T16B, v25); 2948 __ rev32(v26, __ T16B, v26); 2949 __ rev32(v27, __ T16B, v27); 2950 __ rev32(v28, __ T16B, v28); 2951 __ ld1(v29, v30, v31, __ T16B, key); 2952 __ rev32(v29, __ T16B, v29); 2953 __ rev32(v30, __ T16B, v30); 2954 __ rev32(v31, __ T16B, v31); 2955 2956 __ BIND(L_aes_loop); 2957 __ ld1(v1, __ T16B, __ post(from, 16)); 2958 __ eor(v0, __ T16B, v0, v1); 2959 2960 __ br(Assembler::CC, L_rounds_44); 2961 __ br(Assembler::EQ, L_rounds_52); 2962 2963 __ aese(v0, v17); __ aesmc(v0, v0); 2964 __ aese(v0, v18); __ aesmc(v0, v0); 2965 __ BIND(L_rounds_52); 2966 __ aese(v0, v19); __ aesmc(v0, v0); 2967 __ aese(v0, v20); __ aesmc(v0, v0); 2968 __ BIND(L_rounds_44); 2969 __ aese(v0, v21); __ aesmc(v0, v0); 2970 __ aese(v0, v22); __ aesmc(v0, v0); 2971 __ aese(v0, v23); __ aesmc(v0, v0); 2972 __ aese(v0, v24); __ aesmc(v0, v0); 2973 __ aese(v0, v25); __ aesmc(v0, v0); 2974 __ aese(v0, v26); __ aesmc(v0, v0); 2975 __ aese(v0, v27); __ aesmc(v0, v0); 2976 __ aese(v0, v28); __ aesmc(v0, v0); 2977 __ aese(v0, v29); __ aesmc(v0, v0); 2978 __ aese(v0, v30); 2979 __ eor(v0, __ T16B, v0, v31); 2980 2981 __ st1(v0, __ T16B, __ post(to, 16)); 2982 __ sub(len_reg, len_reg, 16); 2983 __ cbnz(len_reg, L_aes_loop); 2984 2985 __ st1(v0, __ T16B, rvec); 2986 2987 __ mov(r0, rscratch2); 2988 2989 __ leave(); 2990 __ ret(lr); 2991 2992 return start; 2993 } 2994 2995 // Arguments: 2996 // 2997 // Inputs: 2998 // c_rarg0 - source byte array address 2999 // c_rarg1 - destination byte array address 3000 // c_rarg2 - K (key) in little endian int array 3001 // c_rarg3 - r vector byte array address 3002 // c_rarg4 - input length 3003 // 3004 // Output: 3005 // r0 - input length 3006 // 3007 address generate_cipherBlockChaining_decryptAESCrypt() { 3008 assert(UseAES, "need AES instructions and misaligned SSE support"); 3009 __ align(CodeEntryAlignment); 3010 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 3011 3012 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 3013 3014 const Register from = c_rarg0; // source array address 3015 const Register to = c_rarg1; // destination array address 3016 const Register key = c_rarg2; // key array address 3017 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3018 // and left with the results of the last encryption block 3019 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3020 const Register keylen = rscratch1; 3021 3022 address start = __ pc(); 3023 __ enter(); 3024 3025 __ mov(rscratch2, len_reg); 3026 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3027 3028 __ ld1(v2, __ T16B, rvec); 3029 3030 __ ld1(v31, __ T16B, __ post(key, 16)); 3031 __ rev32(v31, __ T16B, v31); 3032 3033 __ cmpw(keylen, 52); 3034 __ br(Assembler::CC, L_loadkeys_44); 3035 __ br(Assembler::EQ, L_loadkeys_52); 3036 3037 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 3038 __ rev32(v17, __ T16B, v17); 3039 __ rev32(v18, __ T16B, v18); 3040 __ BIND(L_loadkeys_52); 3041 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 3042 __ rev32(v19, __ T16B, v19); 3043 __ rev32(v20, __ T16B, v20); 3044 __ BIND(L_loadkeys_44); 3045 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 3046 __ rev32(v21, __ T16B, v21); 3047 __ rev32(v22, __ T16B, v22); 
3048 __ rev32(v23, __ T16B, v23); 3049 __ rev32(v24, __ T16B, v24); 3050 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3051 __ rev32(v25, __ T16B, v25); 3052 __ rev32(v26, __ T16B, v26); 3053 __ rev32(v27, __ T16B, v27); 3054 __ rev32(v28, __ T16B, v28); 3055 __ ld1(v29, v30, __ T16B, key); 3056 __ rev32(v29, __ T16B, v29); 3057 __ rev32(v30, __ T16B, v30); 3058 3059 __ BIND(L_aes_loop); 3060 __ ld1(v0, __ T16B, __ post(from, 16)); 3061 __ orr(v1, __ T16B, v0, v0); 3062 3063 __ br(Assembler::CC, L_rounds_44); 3064 __ br(Assembler::EQ, L_rounds_52); 3065 3066 __ aesd(v0, v17); __ aesimc(v0, v0); 3067 __ aesd(v0, v18); __ aesimc(v0, v0); 3068 __ BIND(L_rounds_52); 3069 __ aesd(v0, v19); __ aesimc(v0, v0); 3070 __ aesd(v0, v20); __ aesimc(v0, v0); 3071 __ BIND(L_rounds_44); 3072 __ aesd(v0, v21); __ aesimc(v0, v0); 3073 __ aesd(v0, v22); __ aesimc(v0, v0); 3074 __ aesd(v0, v23); __ aesimc(v0, v0); 3075 __ aesd(v0, v24); __ aesimc(v0, v0); 3076 __ aesd(v0, v25); __ aesimc(v0, v0); 3077 __ aesd(v0, v26); __ aesimc(v0, v0); 3078 __ aesd(v0, v27); __ aesimc(v0, v0); 3079 __ aesd(v0, v28); __ aesimc(v0, v0); 3080 __ aesd(v0, v29); __ aesimc(v0, v0); 3081 __ aesd(v0, v30); 3082 __ eor(v0, __ T16B, v0, v31); 3083 __ eor(v0, __ T16B, v0, v2); 3084 3085 __ st1(v0, __ T16B, __ post(to, 16)); 3086 __ orr(v2, __ T16B, v1, v1); 3087 3088 __ sub(len_reg, len_reg, 16); 3089 __ cbnz(len_reg, L_aes_loop); 3090 3091 __ st1(v2, __ T16B, rvec); 3092 3093 __ mov(r0, rscratch2); 3094 3095 __ leave(); 3096 __ ret(lr); 3097 3098 return start; 3099 } 3100 3101 // Arguments: 3102 // 3103 // Inputs: 3104 // c_rarg0 - byte[] source+offset 3105 // c_rarg1 - int[] SHA.state 3106 // c_rarg2 - int offset 3107 // c_rarg3 - int limit 3108 // 3109 address generate_sha1_implCompress(bool multi_block, const char *name) { 3110 __ align(CodeEntryAlignment); 3111 StubCodeMark mark(this, "StubRoutines", name); 3112 address start = __ pc(); 3113 3114 Register buf = c_rarg0; 3115 Register state = c_rarg1; 3116 Register ofs = c_rarg2; 3117 Register limit = c_rarg3; 3118 3119 Label keys; 3120 Label sha1_loop; 3121 3122 // load the keys into v0..v3 3123 __ adr(rscratch1, keys); 3124 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3125 // load 5 words state into v6, v7 3126 __ ldrq(v6, Address(state, 0)); 3127 __ ldrs(v7, Address(state, 16)); 3128 3129 3130 __ BIND(sha1_loop); 3131 // load 64 bytes of data into v16..v19 3132 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3133 __ rev32(v16, __ T16B, v16); 3134 __ rev32(v17, __ T16B, v17); 3135 __ rev32(v18, __ T16B, v18); 3136 __ rev32(v19, __ T16B, v19); 3137 3138 // do the sha1 3139 __ addv(v4, __ T4S, v16, v0); 3140 __ orr(v20, __ T16B, v6, v6); 3141 3142 FloatRegister d0 = v16; 3143 FloatRegister d1 = v17; 3144 FloatRegister d2 = v18; 3145 FloatRegister d3 = v19; 3146 3147 for (int round = 0; round < 20; round++) { 3148 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3149 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3150 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3151 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3152 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3153 3154 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3155 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3156 __ sha1h(tmp2, __ T4S, v20); 3157 if (round < 5) 3158 __ sha1c(v20, __ T4S, tmp3, tmp4); 3159 else if (round < 10 || round >= 15) 3160 __ sha1p(v20, __ T4S, tmp3, tmp4); 3161 else 3162 __ sha1m(v20, __ T4S, tmp3, tmp4); 3163 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3164 3165 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3166 } 3167 3168 __ addv(v7, __ T2S, v7, v21); 3169 __ addv(v6, __ T4S, v6, v20); 3170 3171 if (multi_block) { 3172 __ add(ofs, ofs, 64); 3173 __ cmp(ofs, limit); 3174 __ br(Assembler::LE, sha1_loop); 3175 __ mov(c_rarg0, ofs); // return ofs 3176 } 3177 3178 __ strq(v6, Address(state, 0)); 3179 __ strs(v7, Address(state, 16)); 3180 3181 __ ret(lr); 3182 3183 __ bind(keys); 3184 __ emit_int32(0x5a827999); 3185 __ emit_int32(0x6ed9eba1); 3186 __ emit_int32(0x8f1bbcdc); 3187 __ emit_int32(0xca62c1d6); 3188 3189 return start; 3190 } 3191 3192 3193 // Arguments: 3194 // 3195 // Inputs: 3196 // c_rarg0 - byte[] source+offset 3197 // c_rarg1 - int[] SHA.state 3198 // c_rarg2 - int offset 3199 // c_rarg3 - int limit 3200 // 3201 address generate_sha256_implCompress(bool multi_block, const char *name) { 3202 static const uint32_t round_consts[64] = { 3203 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3204 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3205 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3206 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3207 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3208 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3209 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3210 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3211 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3212 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3213 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3214 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3215 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3216 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3217 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3218 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3219 }; 3220 __ align(CodeEntryAlignment); 3221 StubCodeMark mark(this, "StubRoutines", name); 3222 address start = __ pc(); 3223 3224 Register buf = c_rarg0; 3225 Register state = c_rarg1; 3226 Register ofs = c_rarg2; 3227 Register limit = c_rarg3; 3228 3229 Label sha1_loop; 3230 3231 __ stpd(v8, v9, __ pre(sp, -32)); 3232 __ stpd(v10, v11, Address(sp, 16)); 3233 3234 // dga == v0 3235 // dgb == v1 3236 // dg0 == v2 3237 // dg1 == v3 3238 // dg2 == v4 3239 // t0 == v6 3240 // t1 == v7 3241 3242 // load 16 keys to v16..v31 3243 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3244 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3245 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3246 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3247 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3248 3249 // load 8 words (256 bits) state 3250 __ ldpq(v0, v1, state); 3251 3252 __ BIND(sha1_loop); 3253 // load 64 bytes of data into v8..v11 3254 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
3255 __ rev32(v8, __ T16B, v8);
3256 __ rev32(v9, __ T16B, v9);
3257 __ rev32(v10, __ T16B, v10);
3258 __ rev32(v11, __ T16B, v11);
3259
3260 __ addv(v6, __ T4S, v8, v16);
3261 __ orr(v2, __ T16B, v0, v0);
3262 __ orr(v3, __ T16B, v1, v1);
3263
3264 FloatRegister d0 = v8;
3265 FloatRegister d1 = v9;
3266 FloatRegister d2 = v10;
3267 FloatRegister d3 = v11;
3268
3269
3270 for (int round = 0; round < 16; round++) {
3271 FloatRegister tmp1 = (round & 1) ? v6 : v7;
3272 FloatRegister tmp2 = (round & 1) ? v7 : v6;
3273 FloatRegister tmp3 = (round & 1) ? v2 : v4;
3274 FloatRegister tmp4 = (round & 1) ? v4 : v2;
3275
3276 if (round < 12) __ sha256su0(d0, __ T4S, d1);
3277 __ orr(v4, __ T16B, v2, v2);
3278 if (round < 15)
3279 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3280 __ sha256h(v2, __ T4S, v3, tmp2);
3281 __ sha256h2(v3, __ T4S, v4, tmp2);
3282 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3283
3284 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3285 }
3286
3287 __ addv(v0, __ T4S, v0, v2);
3288 __ addv(v1, __ T4S, v1, v3);
3289
3290 if (multi_block) {
3291 __ add(ofs, ofs, 64);
3292 __ cmp(ofs, limit);
3293 __ br(Assembler::LE, sha1_loop);
3294 __ mov(c_rarg0, ofs); // return ofs
3295 }
3296
3297 __ ldpd(v10, v11, Address(sp, 16));
3298 __ ldpd(v8, v9, __ post(sp, 32));
3299
3300 __ stpq(v0, v1, state);
3301
3302 __ ret(lr);
3303
3304 return start;
3305 }
3306
3307 #ifndef BUILTIN_SIM
3308 // Safefetch stubs.
3309 void generate_safefetch(const char* name, int size, address* entry,
3310 address* fault_pc, address* continuation_pc) {
3311 // safefetch signatures:
3312 // int SafeFetch32(int* adr, int errValue);
3313 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3314 //
3315 // arguments:
3316 // c_rarg0 = adr
3317 // c_rarg1 = errValue
3318 //
3319 // result:
3320 // r0 = *adr or errValue
3321
3322 StubCodeMark mark(this, "StubRoutines", name);
3323
3324 // Entry point, pc or function descriptor.
3325 *entry = __ pc();
3326
3327 // Load *adr into c_rarg1, may fault.
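// (If the load faults, the VM's signal handler transfers control to
// *continuation_pc, leaving the errValue that is already in c_rarg1
// intact; the net effect is: return can_read(adr) ? *adr : errValue.)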
3328 *fault_pc = __ pc();
3329 switch (size) {
3330 case 4:
3331 // int32_t
3332 __ ldrw(c_rarg1, Address(c_rarg0, 0));
3333 break;
3334 case 8:
3335 // int64_t
3336 __ ldr(c_rarg1, Address(c_rarg0, 0));
3337 break;
3338 default:
3339 ShouldNotReachHere();
3340 }
3341
3342 // return errValue or *adr
3343 *continuation_pc = __ pc();
3344 __ mov(r0, c_rarg1);
3345 __ ret(lr);
3346 }
3347 #endif
3348
3349 /**
3350 * Arguments:
3351 *
3352 * Inputs:
3353 * c_rarg0 - int crc
3354 * c_rarg1 - byte* buf
3355 * c_rarg2 - int length
3356 *
3357 * Output:
3358 * r0 - int crc result
3359 */
3360 address generate_updateBytesCRC32() {
3361 assert(UseCRC32Intrinsics, "what are we doing here?");
3362
3363 __ align(CodeEntryAlignment);
3364 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3365
3366 address start = __ pc();
3367
3368 const Register crc = c_rarg0; // crc
3369 const Register buf = c_rarg1; // source java byte array address
3370 const Register len = c_rarg2; // length
3371 const Register table0 = c_rarg3; // crc_table address
3372 const Register table1 = c_rarg4;
3373 const Register table2 = c_rarg5;
3374 const Register table3 = c_rarg6;
3375 const Register tmp3 = c_rarg7;
3376
3377 BLOCK_COMMENT("Entry:");
3378 __ enter(); // required for proper stackwalking of RuntimeStub frame
3379
3380 __ kernel_crc32(crc, buf, len,
3381 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3382
3383 __ leave(); // required for proper stackwalking of RuntimeStub frame
3384 __ ret(lr);
3385
3386 return start;
3387 }
3388
3389 /**
3390 * Arguments:
3391 *
3392 * Inputs:
3393 * c_rarg0 - int crc
3394 * c_rarg1 - byte* buf
3395 * c_rarg2 - int length
3396 * c_rarg3 - int* table
3397 *
3398 * Output:
3399 * r0 - int crc result
3400 */
3401 address generate_updateBytesCRC32C() {
3402 assert(UseCRC32CIntrinsics, "what are we doing here?");
3403
3404 __ align(CodeEntryAlignment);
3405 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3406
3407 address start = __ pc();
3408
3409 const Register crc = c_rarg0; // crc
3410 const Register buf = c_rarg1; // source java byte array address
3411 const Register len = c_rarg2; // length
3412 const Register table0 = c_rarg3; // crc_table address
3413 const Register table1 = c_rarg4;
3414 const Register table2 = c_rarg5;
3415 const Register table3 = c_rarg6;
3416 const Register tmp3 = c_rarg7;
3417
3418 BLOCK_COMMENT("Entry:");
3419 __ enter(); // required for proper stackwalking of RuntimeStub frame
3420
3421 __ kernel_crc32c(crc, buf, len,
3422 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3423
3424 __ leave(); // required for proper stackwalking of RuntimeStub frame
3425 __ ret(lr);
3426
3427 return start;
3428 }
3429
3430 /**
3431 * Arguments:
3432 *
3433 * Inputs:
3434 * c_rarg0 - int adler
3435 * c_rarg1 - byte* buff
3436 * c_rarg2 - int len
3437 *
3438 * Output:
3439 * c_rarg0 - int adler result
3440 */
3441 address generate_updateBytesAdler32() {
3442 __ align(CodeEntryAlignment);
3443 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3444 address start = __ pc();
3445
3446 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3447
3448 // Aliases
3449 Register adler = c_rarg0;
3450 Register s1 = c_rarg0;
3451 Register s2 = c_rarg3;
3452 Register buff = c_rarg1;
3453 Register len = c_rarg2;
3454 Register nmax = r4;
3455 Register base = r5;
3456 Register count = r6;
3457 Register temp0 = rscratch1;
3458 Register temp1 = rscratch2;
3459 Register temp2 = r7;
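// Adler-32 reference recurrence (zlib): starting from s1 = adler & 0xffff
// and s2 = adler >> 16, for each input byte b:
//   s1 = (s1 + b)  % 65521;   // 65521 == BASE == 0xfff1
//   s2 = (s2 + s1) % 65521;
// and the result is (s2 << 16) | s1. The code below defers the
// expensive % BASE for up to NMAX bytes at a time, which is the
// largest run that cannot overflow 32 bits.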
3460
3461 // Max number of bytes we can process before having to take the mod
3462 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3463 unsigned long BASE = 0xfff1;
3464 unsigned long NMAX = 0x15B0;
3465
3466 __ mov(base, BASE);
3467 __ mov(nmax, NMAX);
3468
3469 // s1 is initialized to the lower 16 bits of adler
3470 // s2 is initialized to the upper 16 bits of adler
3471 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
3472 __ uxth(s1, adler); // s1 = (adler & 0xffff)
3473
3474 // The pipelined loop needs at least 16 elements for 1 iteration
3475 // It checks this itself, but it is cheaper to branch straight to the cleanup loop here
3476 __ cmp(len, 16);
3477 __ br(Assembler::HS, L_nmax);
3478 __ cbz(len, L_combine);
3479
3480 __ bind(L_simple_by1_loop);
3481 __ ldrb(temp0, Address(__ post(buff, 1)));
3482 __ add(s1, s1, temp0);
3483 __ add(s2, s2, s1);
3484 __ subs(len, len, 1);
3485 __ br(Assembler::HI, L_simple_by1_loop);
3486
3487 // s1 = s1 % BASE
3488 __ subs(temp0, s1, base);
3489 __ csel(s1, temp0, s1, Assembler::HS);
3490
3491 // s2 = s2 % BASE. Since 2^16 mod BASE == 15, fold the high half in: x mod BASE == ((x >> 16) * 15 + (x & 0xffff)) mod BASE; a conditional subtract then completes the reduction
3492 __ lsr(temp0, s2, 16);
3493 __ lsl(temp1, temp0, 4);
3494 __ sub(temp1, temp1, temp0);
3495 __ add(s2, temp1, s2, ext::uxth);
3496
3497 __ subs(temp0, s2, base);
3498 __ csel(s2, temp0, s2, Assembler::HS);
3499
3500 __ b(L_combine);
3501
3502 __ bind(L_nmax);
3503 __ subs(len, len, nmax);
3504 __ sub(count, nmax, 16);
3505 __ br(Assembler::LO, L_by16);
3506
3507 __ bind(L_nmax_loop);
3508
3509 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3510
3511 __ add(s1, s1, temp0, ext::uxtb);
3512 __ ubfx(temp2, temp0, 8, 8);
3513 __ add(s2, s2, s1);
3514 __ add(s1, s1, temp2);
3515 __ ubfx(temp2, temp0, 16, 8);
3516 __ add(s2, s2, s1);
3517 __ add(s1, s1, temp2);
3518 __ ubfx(temp2, temp0, 24, 8);
3519 __ add(s2, s2, s1);
3520 __ add(s1, s1, temp2);
3521 __ ubfx(temp2, temp0, 32, 8);
3522 __ add(s2, s2, s1);
3523 __ add(s1, s1, temp2);
3524 __ ubfx(temp2, temp0, 40, 8);
3525 __ add(s2, s2, s1);
3526 __ add(s1, s1, temp2);
3527 __ ubfx(temp2, temp0, 48, 8);
3528 __ add(s2, s2, s1);
3529 __ add(s1, s1, temp2);
3530 __ add(s2, s2, s1);
3531 __ add(s1, s1, temp0, Assembler::LSR, 56);
3532 __ add(s2, s2, s1);
3533
3534 __ add(s1, s1, temp1, ext::uxtb);
3535 __ ubfx(temp2, temp1, 8, 8);
3536 __ add(s2, s2, s1);
3537 __ add(s1, s1, temp2);
3538 __ ubfx(temp2, temp1, 16, 8);
3539 __ add(s2, s2, s1);
3540 __ add(s1, s1, temp2);
3541 __ ubfx(temp2, temp1, 24, 8);
3542 __ add(s2, s2, s1);
3543 __ add(s1, s1, temp2);
3544 __ ubfx(temp2, temp1, 32, 8);
3545 __ add(s2, s2, s1);
3546 __ add(s1, s1, temp2);
3547 __ ubfx(temp2, temp1, 40, 8);
3548 __ add(s2, s2, s1);
3549 __ add(s1, s1, temp2);
3550 __ ubfx(temp2, temp1, 48, 8);
3551 __ add(s2, s2, s1);
3552 __ add(s1, s1, temp2);
3553 __ add(s2, s2, s1);
3554 __ add(s1, s1, temp1, Assembler::LSR, 56);
3555 __ add(s2, s2, s1);
3556
3557 __ subs(count, count, 16);
3558 __ br(Assembler::HS, L_nmax_loop);
3559
3560 // s1 = s1 % BASE (fold the high half in twice, then conditionally subtract BASE)
3561 __ lsr(temp0, s1, 16);
3562 __ lsl(temp1, temp0, 4);
3563 __ sub(temp1, temp1, temp0);
3564 __ add(temp1, temp1, s1, ext::uxth);
3565
3566 __ lsr(temp0, temp1, 16);
3567 __ lsl(s1, temp0, 4);
3568 __ sub(s1, s1, temp0);
3569 __ add(s1, s1, temp1, ext::uxth);
3570
3571 __ subs(temp0, s1, base);
3572 __ csel(s1, temp0, s1, Assembler::HS);
3573
3574 // s2 = s2 % BASE
3575 __ lsr(temp0, s2, 16);
3576 __ lsl(temp1, temp0, 4);
3577 __ sub(temp1, temp1, temp0);
3578 __ add(temp1, temp1, s2, ext::uxth);
3579
3580 __ lsr(temp0, temp1, 16);
3581 __ lsl(s2, temp0, 4);
3582 __ sub(s2, s2, temp0);
3583 __ add(s2, s2, temp1, ext::uxth);
3584
3585 __ subs(temp0, s2, base);
3586 __ csel(s2, temp0, s2, Assembler::HS);
3587
3588 __ subs(len, len, nmax);
3589 __ sub(count, nmax, 16);
3590 __ br(Assembler::HS, L_nmax_loop);
3591
3592 __ bind(L_by16);
3593 __ adds(len, len, count);
3594 __ br(Assembler::LO, L_by1);
3595
3596 __ bind(L_by16_loop);
3597
3598 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3599
3600 __ add(s1, s1, temp0, ext::uxtb);
3601 __ ubfx(temp2, temp0, 8, 8);
3602 __ add(s2, s2, s1);
3603 __ add(s1, s1, temp2);
3604 __ ubfx(temp2, temp0, 16, 8);
3605 __ add(s2, s2, s1);
3606 __ add(s1, s1, temp2);
3607 __ ubfx(temp2, temp0, 24, 8);
3608 __ add(s2, s2, s1);
3609 __ add(s1, s1, temp2);
3610 __ ubfx(temp2, temp0, 32, 8);
3611 __ add(s2, s2, s1);
3612 __ add(s1, s1, temp2);
3613 __ ubfx(temp2, temp0, 40, 8);
3614 __ add(s2, s2, s1);
3615 __ add(s1, s1, temp2);
3616 __ ubfx(temp2, temp0, 48, 8);
3617 __ add(s2, s2, s1);
3618 __ add(s1, s1, temp2);
3619 __ add(s2, s2, s1);
3620 __ add(s1, s1, temp0, Assembler::LSR, 56);
3621 __ add(s2, s2, s1);
3622
3623 __ add(s1, s1, temp1, ext::uxtb);
3624 __ ubfx(temp2, temp1, 8, 8);
3625 __ add(s2, s2, s1);
3626 __ add(s1, s1, temp2);
3627 __ ubfx(temp2, temp1, 16, 8);
3628 __ add(s2, s2, s1);
3629 __ add(s1, s1, temp2);
3630 __ ubfx(temp2, temp1, 24, 8);
3631 __ add(s2, s2, s1);
3632 __ add(s1, s1, temp2);
3633 __ ubfx(temp2, temp1, 32, 8);
3634 __ add(s2, s2, s1);
3635 __ add(s1, s1, temp2);
3636 __ ubfx(temp2, temp1, 40, 8);
3637 __ add(s2, s2, s1);
3638 __ add(s1, s1, temp2);
3639 __ ubfx(temp2, temp1, 48, 8);
3640 __ add(s2, s2, s1);
3641 __ add(s1, s1, temp2);
3642 __ add(s2, s2, s1);
3643 __ add(s1, s1, temp1, Assembler::LSR, 56);
3644 __ add(s2, s2, s1);
3645
3646 __ subs(len, len, 16);
3647 __ br(Assembler::HS, L_by16_loop);
3648
3649 __ bind(L_by1);
3650 __ adds(len, len, 15);
3651 __ br(Assembler::LO, L_do_mod);
3652
3653 __ bind(L_by1_loop);
3654 __ ldrb(temp0, Address(__ post(buff, 1)));
3655 __ add(s1, temp0, s1);
3656 __ add(s2, s2, s1);
3657 __ subs(len, len, 1);
3658 __ br(Assembler::HS, L_by1_loop);
3659
3660 __ bind(L_do_mod);
3661 // s1 = s1 % BASE
3662 __ lsr(temp0, s1, 16);
3663 __ lsl(temp1, temp0, 4);
3664 __ sub(temp1, temp1, temp0);
3665 __ add(temp1, temp1, s1, ext::uxth);
3666
3667 __ lsr(temp0, temp1, 16);
3668 __ lsl(s1, temp0, 4);
3669 __ sub(s1, s1, temp0);
3670 __ add(s1, s1, temp1, ext::uxth);
3671
3672 __ subs(temp0, s1, base);
3673 __ csel(s1, temp0, s1, Assembler::HS);
3674
3675 // s2 = s2 % BASE
3676 __ lsr(temp0, s2, 16);
3677 __ lsl(temp1, temp0, 4);
3678 __ sub(temp1, temp1, temp0);
3679 __ add(temp1, temp1, s2, ext::uxth);
3680
3681 __ lsr(temp0, temp1, 16);
3682 __ lsl(s2, temp0, 4);
3683 __ sub(s2, s2, temp0);
3684 __ add(s2, s2, temp1, ext::uxth);
3685
3686 __ subs(temp0, s2, base);
3687 __ csel(s2, temp0, s2, Assembler::HS);
3688
3689 // Combine lower bits and higher bits
3690 __ bind(L_combine);
3691 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3692
3693 __ ret(lr);
3694
3695 return start;
3696 }
3697
3698 /**
3699 * Arguments:
3700 *
3701 * Input:
3702 * c_rarg0 - x address
3703 * c_rarg1 - x length
3704 * c_rarg2 - y address
3705 * c_rarg3 - y length
3706 * c_rarg4 - z address
3707 * c_rarg5 - z length
3708 */
3709 address generate_multiplyToLen() {
3710 __ align(CodeEntryAlignment);
3711 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3712
3713 address start = __ pc();
3714 const Register x = r0;
3715 const Register xlen = r1;
3716 const Register y = r2;
3717 const Register ylen = r3;
3718 const Register z = r4;
3719 const Register zlen = r5;
3720
3721 const Register tmp1 = r10;
3722 const Register tmp2 = r11;
3723 const Register tmp3 = r12;
3724 const Register tmp4 = r13;
3725 const Register tmp5 = r14;
3726 const Register tmp6 = r15;
3727 const Register tmp7 = r16;
3728
3729 BLOCK_COMMENT("Entry:");
3730 __ enter(); // required for proper stackwalking of RuntimeStub frame
3731 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3732 __ leave(); // required for proper stackwalking of RuntimeStub frame
3733 __ ret(lr);
3734
3735 return start;
3736 }
3737
3738 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3739 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3740 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3741 // Karatsuba multiplication performs a 128*128 -> 256-bit
3742 // multiplication using three 64*64 -> 128-bit carry-less
3743 // multiplications and a few additions.
3744 //
3745 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3746 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3747 //
3748 // Inputs:
3749 //
3750 // A0 in a.d[0] (subkey)
3751 // A1 in a.d[1]
3752 // (A1+A0) in a1_xor_a0.d[0]
3753 //
3754 // B0 in b.d[0] (state)
3755 // B1 in b.d[1]
3756
3757 __ ext(tmp1, __ T16B, b, b, 0x08);
3758 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1
3759 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0)
3760 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0
3761 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3762
3763 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3764 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3765 __ eor(tmp2, __ T16B, tmp2, tmp4);
3766 __ eor(tmp2, __ T16B, tmp2, tmp3);
3767
3768 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3769 __ ins(result_hi, __ D, tmp2, 0, 1);
3770 __ ins(result_lo, __ D, tmp2, 1, 0);
3771 }
3772
3773 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3774 FloatRegister p, FloatRegister z, FloatRegister t1) {
3775 const FloatRegister t0 = result;
3776
3777 // The GCM field polynomial f is z^128 + p(z), where p =
3778 // z^7+z^2+z+1.
3779 //
3780 // z^128 === -p(z) (mod (z^128 + p(z)))
3781 //
3782 // so, given that the product we're reducing is
3783 // a == lo + hi * z^128
3784 // substituting,
3785 // === lo - hi * p(z) (mod (z^128 + p(z)))
3786 //
3787 // we reduce by multiplying hi by p(z) and subtracting the result
3788 // from (i.e. XORing it with) lo. Because p has no nonzero high
3789 // bits we can do this with two 64-bit multiplications, lo*p and
3790 // hi*p.
3791
3792 __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3793 __ ext(t1, __ T16B, t0, z, 8);
3794 __ eor(hi, __ T16B, hi, t1);
3795 __ ext(t1, __ T16B, z, t0, 8);
3796 __ eor(lo, __ T16B, lo, t1);
3797 __ pmull(t0, __ T1Q, hi, p, __ T1D);
3798 __ eor(result, __ T16B, lo, t0);
3799 }
3800
3801 /**
3802 * Arguments:
3803 *
3804 * Input:
3805 * c_rarg0 - current state address
3806 * c_rarg1 - H key address
3807 * c_rarg2 - data address
3808 * c_rarg3 - number of blocks
3809 *
3810 * Output:
3811 * Updated state at c_rarg0
3812 */
3813 address generate_ghash_processBlocks() {
3814 // Bafflingly, GCM uses little-endian for the byte order, but
3815 // big-endian for the bit order.
For example, the polynomial 1 is
3816 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3817 //
3818 // So, we must either reverse the bytes in each word and do
3819 // everything big-endian or reverse the bits in each byte and do
3820 // it little-endian. On AArch64 it's more idiomatic to reverse
3821 // the bits in each byte (we have an instruction, RBIT, to do
3822 // that) and keep the data in little-endian bit order throughout the
3823 // calculation, bit-reversing the inputs and outputs.
3824
3825 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3826 __ align(wordSize * 2);
3827 address p = __ pc();
3828 __ emit_int64(0x87); // The low-order bits of the field
3829 // polynomial (i.e. p = z^7+z^2+z+1)
3830 // repeated in the low and high parts of a
3831 // 128-bit vector
3832 __ emit_int64(0x87);
3833
3834 __ align(CodeEntryAlignment);
3835 address start = __ pc();
3836
3837 Register state = c_rarg0;
3838 Register subkeyH = c_rarg1;
3839 Register data = c_rarg2;
3840 Register blocks = c_rarg3;
3841
3842 FloatRegister vzr = v30;
3843 __ eor(vzr, __ T16B, vzr, vzr); // zero register
3844
3845 __ ldrq(v0, Address(state));
3846 __ ldrq(v1, Address(subkeyH));
3847
3848 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
3849 __ rbit(v0, __ T16B, v0);
3850 __ rev64(v1, __ T16B, v1);
3851 __ rbit(v1, __ T16B, v1);
3852
3853 __ ldrq(v26, p);
3854
3855 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3856 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3857
3858 {
3859 Label L_ghash_loop;
3860 __ bind(L_ghash_loop);
3861
3862 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3863 // reversing each byte
3864 __ rbit(v2, __ T16B, v2);
3865 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
3866
3867 // Multiply state in v2 by subkey in v1
3868 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3869 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3870 /*temps*/v6, v20, v18, v21);
3871 // Reduce v7:v5 by the field polynomial
3872 ghash_reduce(v0, v5, v7, v26, vzr, v20);
3873
3874 __ sub(blocks, blocks, 1);
3875 __ cbnz(blocks, L_ghash_loop);
3876 }
3877
3878 // The bit-reversed result is at this point in v0
3879 __ rev64(v1, __ T16B, v0);
3880 __ rbit(v1, __ T16B, v1);
3881
3882 __ st1(v1, __ T16B, state);
3883 __ ret(lr);
3884
3885 return start;
3886 }
3887
3888 // Continuation point for throwing of implicit exceptions that are
3889 // not handled in the current activation. Fabricates an exception
3890 // oop and initiates normal exception dispatching in this
3891 // frame. Since we need to preserve callee-saved values (currently
3892 // only for C2, but done for C1 as well) we need a callee-saved oop
3893 // map and therefore have to make these stubs into RuntimeStubs
3894 // rather than BufferBlobs. If the compiler needs all registers to
3895 // be preserved between the fault point and the exception handler
3896 // then it must assume responsibility for that in
3897 // AbstractCompiler::continuation_for_implicit_null_exception or
3898 // continuation_for_implicit_division_by_zero_exception. All other
3899 // implicit exceptions (e.g., NullPointerException or
3900 // AbstractMethodError on entry) are either at call sites or
3901 // otherwise assume that stack unwinding will be initiated, so
3902 // caller-saved registers were assumed volatile in the compiler.
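// In C, approximately, each stub produced by generate_throw_exception()
// below does the following (an illustrative sketch; runtime_entry is
// one of the SharedRuntime::throw_* functions passed in by the caller):
//
//   void throw_stub() {
//     // enter(): save FP and LR, then set up last_Java_sp/fp
//     runtime_entry(current_thread /*, arg1, arg2 */);  // fabricates and
//                                                       // installs the
//                                                       // pending exception
//     // reset last_Java_frame, leave(), then
//     goto *StubRoutines::forward_exception_entry();    // dispatch it
//   }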
3903 3904 #undef __ 3905 #define __ masm-> 3906 3907 address generate_throw_exception(const char* name, 3908 address runtime_entry, 3909 Register arg1 = noreg, 3910 Register arg2 = noreg) { 3911 // Information about frame layout at time of blocking runtime call. 3912 // Note that we only have to preserve callee-saved registers since 3913 // the compilers are responsible for supplying a continuation point 3914 // if they expect all registers to be preserved. 3915 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 3916 enum layout { 3917 rfp_off = 0, 3918 rfp_off2, 3919 return_off, 3920 return_off2, 3921 framesize // inclusive of return address 3922 }; 3923 3924 int insts_size = 512; 3925 int locs_size = 64; 3926 3927 CodeBuffer code(name, insts_size, locs_size); 3928 OopMapSet* oop_maps = new OopMapSet(); 3929 MacroAssembler* masm = new MacroAssembler(&code); 3930 3931 address start = __ pc(); 3932 3933 // This is an inlined and slightly modified version of call_VM 3934 // which has the ability to fetch the return PC out of 3935 // thread-local storage and also sets up last_Java_sp slightly 3936 // differently than the real call_VM 3937 3938 __ enter(); // Save FP and LR before call 3939 3940 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3941 3942 // lr and fp are already in place 3943 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 3944 3945 int frame_complete = __ pc() - start; 3946 3947 // Set up last_Java_sp and last_Java_fp 3948 address the_pc = __ pc(); 3949 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 3950 3951 // Call runtime 3952 if (arg1 != noreg) { 3953 assert(arg2 != c_rarg1, "clobbered"); 3954 __ mov(c_rarg1, arg1); 3955 } 3956 if (arg2 != noreg) { 3957 __ mov(c_rarg2, arg2); 3958 } 3959 __ mov(c_rarg0, rthread); 3960 BLOCK_COMMENT("call runtime_entry"); 3961 __ mov(rscratch1, runtime_entry); 3962 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 3963 3964 // Generate oop map 3965 OopMap* map = new OopMap(framesize, 0); 3966 3967 oop_maps->add_gc_map(the_pc - start, map); 3968 3969 __ reset_last_Java_frame(true); 3970 __ maybe_isb(); 3971 3972 __ leave(); 3973 3974 // check for pending exceptions 3975 #ifdef ASSERT 3976 Label L; 3977 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 3978 __ cbnz(rscratch1, L); 3979 __ should_not_reach_here(); 3980 __ bind(L); 3981 #endif // ASSERT 3982 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3983 3984 3985 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3986 RuntimeStub* stub = 3987 RuntimeStub::new_runtime_stub(name, 3988 &code, 3989 frame_complete, 3990 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3991 oop_maps, false); 3992 return stub->entry_point(); 3993 } 3994 3995 class MontgomeryMultiplyGenerator : public MacroAssembler { 3996 3997 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 3998 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 3999 4000 RegSet _toSave; 4001 bool _squaring; 4002 4003 public: 4004 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4005 : MacroAssembler(as->code()), _squaring(squaring) { 4006 4007 // Register allocation 4008 4009 Register reg = c_rarg0; 4010 Pa_base = reg; // Argument registers 4011 if (squaring) 4012 Pb_base = Pa_base; 4013 else 4014 Pb_base = ++reg; 4015 Pn_base = ++reg; 4016 Rlen= ++reg; 4017 inv = ++reg; 4018 Pm_base = ++reg; 4019 4020 // Working registers: 4021 Ra = ++reg; // The current digit of a, b, n, and m. 
4022 Rb = ++reg;
4023 Rm = ++reg;
4024 Rn = ++reg;
4025
4026 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
4027 Pb = ++reg;
4028 Pm = ++reg;
4029 Pn = ++reg;
4030
4031 t0 = ++reg; // Three registers which form a
4032 t1 = ++reg; // triple-precision accumulator.
4033 t2 = ++reg;
4034
4035 Ri = ++reg; // Inner and outer loop indexes.
4036 Rj = ++reg;
4037
4038 Rhi_ab = ++reg; // Product registers: low and high parts
4039 Rlo_ab = ++reg; // of a*b and m*n.
4040 Rhi_mn = ++reg;
4041 Rlo_mn = ++reg;
4042
4043 // r19 and up are callee-saved.
4044 _toSave = RegSet::range(r19, reg) + Pm_base;
4045 }
4046
4047 private:
4048 void save_regs() {
4049 push(_toSave, sp);
4050 }
4051
4052 void restore_regs() {
4053 pop(_toSave, sp);
4054 }
4055
4056 template <typename T>
4057 void unroll_2(Register count, T block) {
4058 Label loop, end, odd;
4059 tbnz(count, 0, odd);
4060 cbz(count, end);
4061 align(16);
4062 bind(loop);
4063 (this->*block)();
4064 bind(odd);
4065 (this->*block)();
4066 subs(count, count, 2);
4067 br(Assembler::GT, loop);
4068 bind(end);
4069 }
4070
4071 template <typename T>
4072 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4073 Label loop, end, odd;
4074 tbnz(count, 0, odd);
4075 cbz(count, end);
4076 align(16);
4077 bind(loop);
4078 (this->*block)(d, s, tmp);
4079 bind(odd);
4080 (this->*block)(d, s, tmp);
4081 subs(count, count, 2);
4082 br(Assembler::GT, loop);
4083 bind(end);
4084 }
4085
4086 void pre1(RegisterOrConstant i) {
4087 block_comment("pre1");
4088 // Pa = Pa_base;
4089 // Pb = Pb_base + i;
4090 // Pm = Pm_base;
4091 // Pn = Pn_base + i;
4092 // Ra = *Pa;
4093 // Rb = *Pb;
4094 // Rm = *Pm;
4095 // Rn = *Pn;
4096 ldr(Ra, Address(Pa_base));
4097 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4098 ldr(Rm, Address(Pm_base));
4099 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4100 lea(Pa, Address(Pa_base));
4101 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4102 lea(Pm, Address(Pm_base));
4103 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4104
4105 // Zero the m*n result.
4106 mov(Rhi_mn, zr);
4107 mov(Rlo_mn, zr);
4108 }
4109
4110 // The core multiply-accumulate step of a Montgomery
4111 // multiplication. The idea is to schedule operations as a
4112 // pipeline so that instructions with long latencies (loads and
4113 // multiplies) have time to complete before their results are
4114 // used. This most benefits in-order implementations of the
4115 // architecture but out-of-order ones also benefit.
4116 void step() {
4117 block_comment("step");
4118 // MACC(Ra, Rb, t0, t1, t2);
4119 // Ra = *++Pa;
4120 // Rb = *--Pb;
4121 umulh(Rhi_ab, Ra, Rb);
4122 mul(Rlo_ab, Ra, Rb);
4123 ldr(Ra, pre(Pa, wordSize));
4124 ldr(Rb, pre(Pb, -wordSize));
4125 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4126 // previous iteration.
4127 // MACC(Rm, Rn, t0, t1, t2); 4128 // Rm = *++Pm; 4129 // Rn = *--Pn; 4130 umulh(Rhi_mn, Rm, Rn); 4131 mul(Rlo_mn, Rm, Rn); 4132 ldr(Rm, pre(Pm, wordSize)); 4133 ldr(Rn, pre(Pn, -wordSize)); 4134 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4135 } 4136 4137 void post1() { 4138 block_comment("post1"); 4139 4140 // MACC(Ra, Rb, t0, t1, t2); 4141 // Ra = *++Pa; 4142 // Rb = *--Pb; 4143 umulh(Rhi_ab, Ra, Rb); 4144 mul(Rlo_ab, Ra, Rb); 4145 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4146 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4147 4148 // *Pm = Rm = t0 * inv; 4149 mul(Rm, t0, inv); 4150 str(Rm, Address(Pm)); 4151 4152 // MACC(Rm, Rn, t0, t1, t2); 4153 // t0 = t1; t1 = t2; t2 = 0; 4154 umulh(Rhi_mn, Rm, Rn); 4155 4156 #ifndef PRODUCT 4157 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4158 { 4159 mul(Rlo_mn, Rm, Rn); 4160 add(Rlo_mn, t0, Rlo_mn); 4161 Label ok; 4162 cbz(Rlo_mn, ok); { 4163 stop("broken Montgomery multiply"); 4164 } bind(ok); 4165 } 4166 #endif 4167 // We have very carefully set things up so that 4168 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4169 // the lower half of Rm * Rn because we know the result already: 4170 // it must be -t0. t0 + (-t0) must generate a carry iff 4171 // t0 != 0. So, rather than do a mul and an adds we just set 4172 // the carry flag iff t0 is nonzero. 4173 // 4174 // mul(Rlo_mn, Rm, Rn); 4175 // adds(zr, t0, Rlo_mn); 4176 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4177 adcs(t0, t1, Rhi_mn); 4178 adc(t1, t2, zr); 4179 mov(t2, zr); 4180 } 4181 4182 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4183 block_comment("pre2"); 4184 // Pa = Pa_base + i-len; 4185 // Pb = Pb_base + len; 4186 // Pm = Pm_base + i-len; 4187 // Pn = Pn_base + len; 4188 4189 if (i.is_register()) { 4190 sub(Rj, i.as_register(), len); 4191 } else { 4192 mov(Rj, i.as_constant()); 4193 sub(Rj, Rj, len); 4194 } 4195 // Rj == i-len 4196 4197 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4198 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4199 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4200 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4201 4202 // Ra = *++Pa; 4203 // Rb = *--Pb; 4204 // Rm = *++Pm; 4205 // Rn = *--Pn; 4206 ldr(Ra, pre(Pa, wordSize)); 4207 ldr(Rb, pre(Pb, -wordSize)); 4208 ldr(Rm, pre(Pm, wordSize)); 4209 ldr(Rn, pre(Pn, -wordSize)); 4210 4211 mov(Rhi_mn, zr); 4212 mov(Rlo_mn, zr); 4213 } 4214 4215 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4216 block_comment("post2"); 4217 if (i.is_constant()) { 4218 mov(Rj, i.as_constant()-len.as_constant()); 4219 } else { 4220 sub(Rj, i.as_register(), len); 4221 } 4222 4223 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4224 4225 // As soon as we know the least significant digit of our result, 4226 // store it. 4227 // Pm_base[i-len] = t0; 4228 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4229 4230 // t0 = t1; t1 = t2; t2 = 0; 4231 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4232 adc(t1, t2, zr); 4233 mov(t2, zr); 4234 } 4235 4236 // A carry in t0 after Montgomery multiplication means that we 4237 // should subtract multiples of n from our result in m. We'll 4238 // keep doing that until there is no carry. 
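// In C, approximately (an illustrative sketch: the helper sub() named
// in the comments below is not defined in this file; it subtracts n
// from m with borrow propagation and returns t0 less the final borrow):
//
//   static unsigned long sub(unsigned long Pm[], unsigned long Pn[],
//                            unsigned long t0, int len) {
//     unsigned long borrow = 0;
//     for (int i = 0; i < len; i++) {
//       unsigned long x = Pm[i], y = Pn[i];
//       Pm[i] = x - y - borrow;
//       // Borrow out iff x < y, or x == y with a borrow already owed.
//       borrow = (x < y) || (x == y && borrow);
//     }
//     return t0 - borrow;
//   }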
4239 void normalize(RegisterOrConstant len) { 4240 block_comment("normalize"); 4241 // while (t0) 4242 // t0 = sub(Pm_base, Pn_base, t0, len); 4243 Label loop, post, again; 4244 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 4245 cbz(t0, post); { 4246 bind(again); { 4247 mov(i, zr); 4248 mov(cnt, len); 4249 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4250 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4251 subs(zr, zr, zr); // set carry flag, i.e. no borrow 4252 align(16); 4253 bind(loop); { 4254 sbcs(Rm, Rm, Rn); 4255 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4256 add(i, i, 1); 4257 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4258 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4259 sub(cnt, cnt, 1); 4260 } cbnz(cnt, loop); 4261 sbc(t0, t0, zr); 4262 } cbnz(t0, again); 4263 } bind(post); 4264 } 4265 4266 // Move memory at s to d, reversing words. 4267 // Increments d to end of copied memory 4268 // Destroys tmp1, tmp2 4269 // Preserves len 4270 // Leaves s pointing to the address which was in d at start 4271 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 4272 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 4273 4274 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 4275 mov(tmp1, len); 4276 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 4277 sub(s, d, len, ext::uxtw, LogBytesPerWord); 4278 } 4279 // where 4280 void reverse1(Register d, Register s, Register tmp) { 4281 ldr(tmp, pre(s, -wordSize)); 4282 ror(tmp, tmp, 32); 4283 str(tmp, post(d, wordSize)); 4284 } 4285 4286 void step_squaring() { 4287 // An extra ACC 4288 step(); 4289 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4290 } 4291 4292 void last_squaring(RegisterOrConstant i) { 4293 Label dont; 4294 // if ((i & 1) == 0) { 4295 tbnz(i.as_register(), 0, dont); { 4296 // MACC(Ra, Rb, t0, t1, t2); 4297 // Ra = *++Pa; 4298 // Rb = *--Pb; 4299 umulh(Rhi_ab, Ra, Rb); 4300 mul(Rlo_ab, Ra, Rb); 4301 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4302 } bind(dont); 4303 } 4304 4305 void extra_step_squaring() { 4306 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4307 4308 // MACC(Rm, Rn, t0, t1, t2); 4309 // Rm = *++Pm; 4310 // Rn = *--Pn; 4311 umulh(Rhi_mn, Rm, Rn); 4312 mul(Rlo_mn, Rm, Rn); 4313 ldr(Rm, pre(Pm, wordSize)); 4314 ldr(Rn, pre(Pn, -wordSize)); 4315 } 4316 4317 void post1_squaring() { 4318 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4319 4320 // *Pm = Rm = t0 * inv; 4321 mul(Rm, t0, inv); 4322 str(Rm, Address(Pm)); 4323 4324 // MACC(Rm, Rn, t0, t1, t2); 4325 // t0 = t1; t1 = t2; t2 = 0; 4326 umulh(Rhi_mn, Rm, Rn); 4327 4328 #ifndef PRODUCT 4329 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4330 { 4331 mul(Rlo_mn, Rm, Rn); 4332 add(Rlo_mn, t0, Rlo_mn); 4333 Label ok; 4334 cbz(Rlo_mn, ok); { 4335 stop("broken Montgomery multiply"); 4336 } bind(ok); 4337 } 4338 #endif 4339 // We have very carefully set things up so that 4340 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4341 // the lower half of Rm * Rn because we know the result already: 4342 // it must be -t0. t0 + (-t0) must generate a carry iff 4343 // t0 != 0. So, rather than do a mul and an adds we just set 4344 // the carry flag iff t0 is nonzero. 
4345 // 4346 // mul(Rlo_mn, Rm, Rn); 4347 // adds(zr, t0, Rlo_mn); 4348 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4349 adcs(t0, t1, Rhi_mn); 4350 adc(t1, t2, zr); 4351 mov(t2, zr); 4352 } 4353 4354 void acc(Register Rhi, Register Rlo, 4355 Register t0, Register t1, Register t2) { 4356 adds(t0, t0, Rlo); 4357 adcs(t1, t1, Rhi); 4358 adc(t2, t2, zr); 4359 } 4360 4361 public: 4362 /** 4363 * Fast Montgomery multiplication. The derivation of the 4364 * algorithm is in A Cryptographic Library for the Motorola 4365 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 4366 * 4367 * Arguments: 4368 * 4369 * Inputs for multiplication: 4370 * c_rarg0 - int array elements a 4371 * c_rarg1 - int array elements b 4372 * c_rarg2 - int array elements n (the modulus) 4373 * c_rarg3 - int length 4374 * c_rarg4 - int inv 4375 * c_rarg5 - int array elements m (the result) 4376 * 4377 * Inputs for squaring: 4378 * c_rarg0 - int array elements a 4379 * c_rarg1 - int array elements n (the modulus) 4380 * c_rarg2 - int length 4381 * c_rarg3 - int inv 4382 * c_rarg4 - int array elements m (the result) 4383 * 4384 */ 4385 address generate_multiply() { 4386 Label argh, nothing; 4387 bind(argh); 4388 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4389 4390 align(CodeEntryAlignment); 4391 address entry = pc(); 4392 4393 cbzw(Rlen, nothing); 4394 4395 enter(); 4396 4397 // Make room. 4398 cmpw(Rlen, 512); 4399 br(Assembler::HI, argh); 4400 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4401 andr(sp, Ra, -2 * wordSize); 4402 4403 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4404 4405 { 4406 // Copy input args, reversing as we go. We use Ra as a 4407 // temporary variable. 4408 reverse(Ra, Pa_base, Rlen, t0, t1); 4409 if (!_squaring) 4410 reverse(Ra, Pb_base, Rlen, t0, t1); 4411 reverse(Ra, Pn_base, Rlen, t0, t1); 4412 } 4413 4414 // Push all call-saved registers and also Pm_base which we'll need 4415 // at the end. 
4416 save_regs(); 4417 4418 #ifndef PRODUCT 4419 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4420 { 4421 ldr(Rn, Address(Pn_base, 0)); 4422 mul(Rlo_mn, Rn, inv); 4423 cmp(Rlo_mn, -1); 4424 Label ok; 4425 br(EQ, ok); { 4426 stop("broken inverse in Montgomery multiply"); 4427 } bind(ok); 4428 } 4429 #endif 4430 4431 mov(Pm_base, Ra); 4432 4433 mov(t0, zr); 4434 mov(t1, zr); 4435 mov(t2, zr); 4436 4437 block_comment("for (int i = 0; i < len; i++) {"); 4438 mov(Ri, zr); { 4439 Label loop, end; 4440 cmpw(Ri, Rlen); 4441 br(Assembler::GE, end); 4442 4443 bind(loop); 4444 pre1(Ri); 4445 4446 block_comment(" for (j = i; j; j--) {"); { 4447 movw(Rj, Ri); 4448 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4449 } block_comment(" } // j"); 4450 4451 post1(); 4452 addw(Ri, Ri, 1); 4453 cmpw(Ri, Rlen); 4454 br(Assembler::LT, loop); 4455 bind(end); 4456 block_comment("} // i"); 4457 } 4458 4459 block_comment("for (int i = len; i < 2*len; i++) {"); 4460 mov(Ri, Rlen); { 4461 Label loop, end; 4462 cmpw(Ri, Rlen, Assembler::LSL, 1); 4463 br(Assembler::GE, end); 4464 4465 bind(loop); 4466 pre2(Ri, Rlen); 4467 4468 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4469 lslw(Rj, Rlen, 1); 4470 subw(Rj, Rj, Ri); 4471 subw(Rj, Rj, 1); 4472 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4473 } block_comment(" } // j"); 4474 4475 post2(Ri, Rlen); 4476 addw(Ri, Ri, 1); 4477 cmpw(Ri, Rlen, Assembler::LSL, 1); 4478 br(Assembler::LT, loop); 4479 bind(end); 4480 } 4481 block_comment("} // i"); 4482 4483 normalize(Rlen); 4484 4485 mov(Ra, Pm_base); // Save Pm_base in Ra 4486 restore_regs(); // Restore caller's Pm_base 4487 4488 // Copy our result into caller's Pm_base 4489 reverse(Pm_base, Ra, Rlen, t0, t1); 4490 4491 leave(); 4492 bind(nothing); 4493 ret(lr); 4494 4495 return entry; 4496 } 4497 // In C, approximately: 4498 4499 // void 4500 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4501 // unsigned long Pn_base[], unsigned long Pm_base[], 4502 // unsigned long inv, int len) { 4503 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4504 // unsigned long *Pa, *Pb, *Pn, *Pm; 4505 // unsigned long Ra, Rb, Rn, Rm; 4506 4507 // int i; 4508 4509 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4510 4511 // for (i = 0; i < len; i++) { 4512 // int j; 4513 4514 // Pa = Pa_base; 4515 // Pb = Pb_base + i; 4516 // Pm = Pm_base; 4517 // Pn = Pn_base + i; 4518 4519 // Ra = *Pa; 4520 // Rb = *Pb; 4521 // Rm = *Pm; 4522 // Rn = *Pn; 4523 4524 // int iters = i; 4525 // for (j = 0; iters--; j++) { 4526 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4527 // MACC(Ra, Rb, t0, t1, t2); 4528 // Ra = *++Pa; 4529 // Rb = *--Pb; 4530 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4531 // MACC(Rm, Rn, t0, t1, t2); 4532 // Rm = *++Pm; 4533 // Rn = *--Pn; 4534 // } 4535 4536 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4537 // MACC(Ra, Rb, t0, t1, t2); 4538 // *Pm = Rm = t0 * inv; 4539 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4540 // MACC(Rm, Rn, t0, t1, t2); 4541 4542 // assert(t0 == 0, "broken Montgomery multiply"); 4543 4544 // t0 = t1; t1 = t2; t2 = 0; 4545 // } 4546 4547 // for (i = len; i < 2*len; i++) { 4548 // int j; 4549 4550 // Pa = Pa_base + i-len; 4551 // Pb = Pb_base + len; 4552 // Pm = Pm_base + i-len; 4553 // Pn = Pn_base + len; 4554 4555 // Ra = *++Pa; 4556 // Rb = *--Pb; 4557 // Rm = *++Pm; 4558 // Rn = *--Pn; 4559 4560 // int iters = len*2-i-1; 4561 // 
for (j = i-len+1; iters--; j++) { 4562 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4563 // MACC(Ra, Rb, t0, t1, t2); 4564 // Ra = *++Pa; 4565 // Rb = *--Pb; 4566 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4567 // MACC(Rm, Rn, t0, t1, t2); 4568 // Rm = *++Pm; 4569 // Rn = *--Pn; 4570 // } 4571 4572 // Pm_base[i-len] = t0; 4573 // t0 = t1; t1 = t2; t2 = 0; 4574 // } 4575 4576 // while (t0) 4577 // t0 = sub(Pm_base, Pn_base, t0, len); 4578 // } 4579 4580 /** 4581 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4582 * multiplies than Montgomery multiplication so it should be up to 4583 * 25% faster. However, its loop control is more complex and it 4584 * may actually run slower on some machines. 4585 * 4586 * Arguments: 4587 * 4588 * Inputs: 4589 * c_rarg0 - int array elements a 4590 * c_rarg1 - int array elements n (the modulus) 4591 * c_rarg2 - int length 4592 * c_rarg3 - int inv 4593 * c_rarg4 - int array elements m (the result) 4594 * 4595 */ 4596 address generate_square() { 4597 Label argh; 4598 bind(argh); 4599 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4600 4601 align(CodeEntryAlignment); 4602 address entry = pc(); 4603 4604 enter(); 4605 4606 // Make room. 4607 cmpw(Rlen, 512); 4608 br(Assembler::HI, argh); 4609 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4610 andr(sp, Ra, -2 * wordSize); 4611 4612 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4613 4614 { 4615 // Copy input args, reversing as we go. We use Ra as a 4616 // temporary variable. 4617 reverse(Ra, Pa_base, Rlen, t0, t1); 4618 reverse(Ra, Pn_base, Rlen, t0, t1); 4619 } 4620 4621 // Push all call-saved registers and also Pm_base which we'll need 4622 // at the end. 4623 save_regs(); 4624 4625 mov(Pm_base, Ra); 4626 4627 mov(t0, zr); 4628 mov(t1, zr); 4629 mov(t2, zr); 4630 4631 block_comment("for (int i = 0; i < len; i++) {"); 4632 mov(Ri, zr); { 4633 Label loop, end; 4634 bind(loop); 4635 cmp(Ri, Rlen); 4636 br(Assembler::GE, end); 4637 4638 pre1(Ri); 4639 4640 block_comment("for (j = (i+1)/2; j; j--) {"); { 4641 add(Rj, Ri, 1); 4642 lsr(Rj, Rj, 1); 4643 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4644 } block_comment(" } // j"); 4645 4646 last_squaring(Ri); 4647 4648 block_comment(" for (j = i/2; j; j--) {"); { 4649 lsr(Rj, Ri, 1); 4650 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4651 } block_comment(" } // j"); 4652 4653 post1_squaring(); 4654 add(Ri, Ri, 1); 4655 cmp(Ri, Rlen); 4656 br(Assembler::LT, loop); 4657 4658 bind(end); 4659 block_comment("} // i"); 4660 } 4661 4662 block_comment("for (int i = len; i < 2*len; i++) {"); 4663 mov(Ri, Rlen); { 4664 Label loop, end; 4665 bind(loop); 4666 cmp(Ri, Rlen, Assembler::LSL, 1); 4667 br(Assembler::GE, end); 4668 4669 pre2(Ri, Rlen); 4670 4671 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4672 lsl(Rj, Rlen, 1); 4673 sub(Rj, Rj, Ri); 4674 sub(Rj, Rj, 1); 4675 lsr(Rj, Rj, 1); 4676 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4677 } block_comment(" } // j"); 4678 4679 last_squaring(Ri); 4680 4681 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4682 lsl(Rj, Rlen, 1); 4683 sub(Rj, Rj, Ri); 4684 lsr(Rj, Rj, 1); 4685 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4686 } block_comment(" } // j"); 4687 4688 post2(Ri, Rlen); 4689 add(Ri, Ri, 1); 4690 cmp(Ri, Rlen, Assembler::LSL, 1); 4691 4692 br(Assembler::LT, loop); 4693 bind(end); 4694 block_comment("} // i"); 4695 } 4696 4697 normalize(Rlen); 4698 4699 mov(Ra, 
Pm_base); // Save Pm_base in Ra
4700 restore_regs(); // Restore caller's Pm_base
4701
4702 // Copy our result into caller's Pm_base
4703 reverse(Pm_base, Ra, Rlen, t0, t1);
4704
4705 leave();
4706 ret(lr);
4707
4708 return entry;
4709 }
4710 // In C, approximately:
4711
4712 // void
4713 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4714 // unsigned long Pm_base[], unsigned long inv, int len) {
4715 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4716 // unsigned long *Pa, *Pb, *Pn, *Pm;
4717 // unsigned long Ra, Rb, Rn, Rm;
4718
4719 // int i;
4720
4721 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4722
4723 // for (i = 0; i < len; i++) {
4724 // int j;
4725
4726 // Pa = Pa_base;
4727 // Pb = Pa_base + i;
4728 // Pm = Pm_base;
4729 // Pn = Pn_base + i;
4730
4731 // Ra = *Pa;
4732 // Rb = *Pb;
4733 // Rm = *Pm;
4734 // Rn = *Pn;
4735
4736 // int iters = (i+1)/2;
4737 // for (j = 0; iters--; j++) {
4738 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4739 // MACC2(Ra, Rb, t0, t1, t2);
4740 // Ra = *++Pa;
4741 // Rb = *--Pb;
4742 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4743 // MACC(Rm, Rn, t0, t1, t2);
4744 // Rm = *++Pm;
4745 // Rn = *--Pn;
4746 // }
4747 // if ((i & 1) == 0) {
4748 // assert(Ra == Pa_base[j], "must be");
4749 // MACC(Ra, Ra, t0, t1, t2);
4750 // }
4751 // iters = i/2;
4752 // assert(iters == i-j, "must be");
4753 // for (; iters--; j++) {
4754 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4755 // MACC(Rm, Rn, t0, t1, t2);
4756 // Rm = *++Pm;
4757 // Rn = *--Pn;
4758 // }
4759
4760 // *Pm = Rm = t0 * inv;
4761 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4762 // MACC(Rm, Rn, t0, t1, t2);
4763
4764 // assert(t0 == 0, "broken Montgomery multiply");
4765
4766 // t0 = t1; t1 = t2; t2 = 0;
4767 // }
4768
4769 // for (i = len; i < 2*len; i++) {
4770 // int start = i-len+1;
4771 // int end = start + (len - start)/2;
4772 // int j;
4773
4774 // Pa = Pa_base + i-len;
4775 // Pb = Pa_base + len;
4776 // Pm = Pm_base + i-len;
4777 // Pn = Pn_base + len;
4778
4779 // Ra = *++Pa;
4780 // Rb = *--Pb;
4781 // Rm = *++Pm;
4782 // Rn = *--Pn;
4783
4784 // int iters = (2*len-i-1)/2;
4785 // assert(iters == end-start, "must be");
4786 // for (j = start; iters--; j++) {
4787 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4788 // MACC2(Ra, Rb, t0, t1, t2);
4789 // Ra = *++Pa;
4790 // Rb = *--Pb;
4791 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4792 // MACC(Rm, Rn, t0, t1, t2);
4793 // Rm = *++Pm;
4794 // Rn = *--Pn;
4795 // }
4796 // if ((i & 1) == 0) {
4797 // assert(Ra == Pa_base[j], "must be");
4798 // MACC(Ra, Ra, t0, t1, t2);
4799 // }
4800 // iters = (2*len-i)/2;
4801 // assert(iters == len-j, "must be");
4802 // for (; iters--; j++) {
4803 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4804 // MACC(Rm, Rn, t0, t1, t2);
4805 // Rm = *++Pm;
4806 // Rn = *--Pn;
4807 // }
4808 // Pm_base[i-len] = t0;
4809 // t0 = t1; t1 = t2; t2 = 0;
4810 // }
4811
4812 // while (t0)
4813 // t0 = sub(Pm_base, Pn_base, t0, len);
4814 // }
4815 };
4816
4817 // Initialization
4818 void generate_initial() {
4819 // Generates the initial stubs and initializes their entry points
4820
4821 // entry points that exist on all platforms. Note: this is code
4822 // that could be shared among different platforms - however the
4823 // benefit seems to be smaller than the disadvantage of having a
4824 // much more complicated generator
structure. See also comment in
4825 // stubRoutines.hpp.
4826
4827 StubRoutines::_forward_exception_entry = generate_forward_exception();
4828
4829 StubRoutines::_call_stub_entry =
4830 generate_call_stub(StubRoutines::_call_stub_return_address);
4831
4832 // is referenced by megamorphic call
4833 StubRoutines::_catch_exception_entry = generate_catch_exception();
4834
4835 // Build this early so it's available for the interpreter.
4836 StubRoutines::_throw_StackOverflowError_entry =
4837 generate_throw_exception("StackOverflowError throw_exception",
4838 CAST_FROM_FN_PTR(address,
4839 SharedRuntime::
4840 throw_StackOverflowError));
4841 if (UseCRC32Intrinsics) {
4842 // set the table address before generating stubs that use it
4843 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4844 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4845 }
4846 }
4847
4848 void generate_all() {
4849 // support for verify_oop (must happen after universe_init)
4850 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
4851 StubRoutines::_throw_AbstractMethodError_entry =
4852 generate_throw_exception("AbstractMethodError throw_exception",
4853 CAST_FROM_FN_PTR(address,
4854 SharedRuntime::
4855 throw_AbstractMethodError));
4856
4857 StubRoutines::_throw_IncompatibleClassChangeError_entry =
4858 generate_throw_exception("IncompatibleClassChangeError throw_exception",
4859 CAST_FROM_FN_PTR(address,
4860 SharedRuntime::
4861 throw_IncompatibleClassChangeError));
4862
4863 StubRoutines::_throw_NullPointerException_at_call_entry =
4864 generate_throw_exception("NullPointerException at call throw_exception",
4865 CAST_FROM_FN_PTR(address,
4866 SharedRuntime::
4867 throw_NullPointerException_at_call));
4868
4869 // arraycopy stubs used by compilers
4870 generate_arraycopy_stubs();
4871
4872 if (UseMultiplyToLenIntrinsic) {
4873 StubRoutines::_multiplyToLen = generate_multiplyToLen();
4874 }
4875
4876 if (UseMontgomeryMultiplyIntrinsic) {
4877 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4878 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4879 StubRoutines::_montgomeryMultiply = g.generate_multiply();
4880 }
4881
4882 if (UseMontgomerySquareIntrinsic) {
4883 StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4884 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4885 // We use generate_multiply() rather than generate_square()
4886 // because it's faster for the sizes of modulus we care about.
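// (generate_square() saves multiplies asymptotically but, as its
// header comment notes, its more complex loop control can make it
// slower in practice.)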
4887 StubRoutines::_montgomerySquare = g.generate_multiply(); 4888 } 4889 4890 if (UseShenandoahGC) { 4891 StubRoutines::aarch64::_shenandoah_wb = generate_shenandoah_wb(); 4892 } 4893 4894 #ifndef BUILTIN_SIM 4895 // generate GHASH intrinsics code 4896 if (UseGHASHIntrinsics) { 4897 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 4898 } 4899 4900 if (UseAESIntrinsics) { 4901 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4902 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4903 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4904 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4905 } 4906 4907 if (UseSHA1Intrinsics) { 4908 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 4909 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 4910 } 4911 if (UseSHA256Intrinsics) { 4912 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 4913 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 4914 } 4915 4916 if (UseCRC32CIntrinsics) { 4917 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 4918 } 4919 4920 // generate Adler32 intrinsics code 4921 if (UseAdler32Intrinsics) { 4922 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 4923 } 4924 4925 // Safefetch stubs. 4926 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4927 &StubRoutines::_safefetch32_fault_pc, 4928 &StubRoutines::_safefetch32_continuation_pc); 4929 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4930 &StubRoutines::_safefetchN_fault_pc, 4931 &StubRoutines::_safefetchN_continuation_pc); 4932 #endif 4933 } 4934 4935 public: 4936 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4937 if (all) { 4938 generate_all(); 4939 } else { 4940 generate_initial(); 4941 } 4942 } 4943 }; // end class declaration 4944 4945 void StubGenerator_generate(CodeBuffer* code, bool all) { 4946 StubGenerator g(code, all); 4947 }