/*
 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp      (r29)  ] <--- fp == saved sp (r31)
  //   1 [ saved lr      (r30)  ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr       - starting address
  //     count      - element count
  //     tmp        - scratch register
  //     saved_regs - registers to be saved before calling static_write_ref_array_pre
  //
  //     Callers must specify which registers to preserve in saved_regs.
  //     Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized, RegSet saved_regs) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push(saved_regs, sp);
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop(saved_regs, sp);
        break;
      case BarrierSet::CardTableModRef:
        break;
      default:
        ShouldNotReachHere();

      }
    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start      - register containing starting address of destination array
  //     end        - register containing ending address of destination array
  //     scratch    - scratch register
  //     saved_regs - registers to be saved before calling static_write_ref_array_post
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  //  Callers must specify which registers to preserve in saved_regs.
  //  Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch, RegSet saved_regs) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCTLogging:

        {
          __ push(saved_regs, sp);
          // must compute element count unless barrier set interface is changed (other platforms supply count)
          assert_different_registers(start, end, scratch);
          __ lea(scratch, Address(end, BytesPerHeapOop));
          __ sub(scratch, scratch, start);               // subtract start to get #bytes
          __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
          __ mov(c_rarg0, start);
          __ mov(c_rarg1, scratch);
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ pop(saved_regs, sp);
        }
        break;
      case BarrierSet::CardTableModRef:
        {
          CardTableModRefBS* ctbs = barrier_set_cast<CardTableModRefBS>(bs);
          CardTable* ct = ctbs->card_table();
          assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");

          Label L_loop;

          __ lsr(start, start, CardTable::card_shift);
          __ lsr(end, end, CardTable::card_shift);
          __ sub(end, end, start); // number of bytes to copy

          const Register count = end; // 'end' register contains bytes count now
          __ load_byte_map_base(scratch);
          __ add(start, start, scratch);
          if (UseConcMarkSweepGC) {
            __ membar(__ StoreStore);
          }
          __ BIND(L_loop);
          __ strb(zr, Address(start, count));
          __ subs(count, count, 1);
          __ br(Assembler::GE, L_loop);
        }
        break;
      default:
        ShouldNotReachHere();

    }
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
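      //
      // As a worked example of the scheme above (forward copy, so
      // unit == 8): after the bias below (s -= 16, d -= 8) the first
      // str writes the destination word at d+8, the three stps write
      // pairs starting at d+16, d+32 and d+48, and the final
      // pre-indexed str writes d+64 while advancing d by 64 -- that
      // is, word offsets {0, 1, 3, 5, 7} relative to the unbiased
      // destination, exactly as described.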

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
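    //
    // For example, in a byte copy (granularity == 1) a count of
    // 11 == 0b1011 moves one 8-byte word (bit 3 set), skips the
    // 4-byte move (bit 2 clear), then moves 2 bytes (bit 1 set) and
    // finally 1 byte (bit 0 set).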
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
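          //
          // Concretely: count == 1 stores byte 0 three times over,
          // count == 2 stores bytes {0, 1} (byte 1 twice), and
          // count == 3 stores bytes {0, 2, 1}, each exactly once.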
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_reg);
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_regs);
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.
The one to eight bytes within words, 1589 // dwords or qwords that span cache line boundaries will still be loaded 1590 // and stored atomically. 1591 // 1592 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1593 address* entry, const char *name) { 1594 const bool not_oop = false; 1595 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1596 } 1597 1598 // Arguments: 1599 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1600 // ignored 1601 // name - stub name string 1602 // 1603 // Inputs: 1604 // c_rarg0 - source array address 1605 // c_rarg1 - destination array address 1606 // c_rarg2 - element count, treated as ssize_t, can be zero 1607 // 1608 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1609 // let the hardware handle it. The two or four words within dwords 1610 // or qwords that span cache line boundaries will still be loaded 1611 // and stored atomically. 1612 // 1613 // Side Effects: 1614 // disjoint_short_copy_entry is set to the no-overlap entry point 1615 // used by generate_conjoint_short_copy(). 1616 // 1617 address generate_disjoint_short_copy(bool aligned, 1618 address* entry, const char *name) { 1619 const bool not_oop = false; 1620 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1621 } 1622 1623 // Arguments: 1624 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1625 // ignored 1626 // name - stub name string 1627 // 1628 // Inputs: 1629 // c_rarg0 - source array address 1630 // c_rarg1 - destination array address 1631 // c_rarg2 - element count, treated as ssize_t, can be zero 1632 // 1633 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1634 // let the hardware handle it. The two or four words within dwords 1635 // or qwords that span cache line boundaries will still be loaded 1636 // and stored atomically. 1637 // 1638 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1639 address *entry, const char *name) { 1640 const bool not_oop = false; 1641 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1642 1643 } 1644 // Arguments: 1645 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1646 // ignored 1647 // name - stub name string 1648 // 1649 // Inputs: 1650 // c_rarg0 - source array address 1651 // c_rarg1 - destination array address 1652 // c_rarg2 - element count, treated as ssize_t, can be zero 1653 // 1654 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1655 // the hardware handle it. The two dwords within qwords that span 1656 // cache line boundaries will still be loaded and stored atomicly. 1657 // 1658 // Side Effects: 1659 // disjoint_int_copy_entry is set to the no-overlap entry point 1660 // used by generate_conjoint_int_oop_copy(). 
1661 // 1662 address generate_disjoint_int_copy(bool aligned, address *entry, 1663 const char *name, bool dest_uninitialized = false) { 1664 const bool not_oop = false; 1665 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1666 } 1667 1668 // Arguments: 1669 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1670 // ignored 1671 // name - stub name string 1672 // 1673 // Inputs: 1674 // c_rarg0 - source array address 1675 // c_rarg1 - destination array address 1676 // c_rarg2 - element count, treated as ssize_t, can be zero 1677 // 1678 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1679 // the hardware handle it. The two dwords within qwords that span 1680 // cache line boundaries will still be loaded and stored atomicly. 1681 // 1682 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1683 address *entry, const char *name, 1684 bool dest_uninitialized = false) { 1685 const bool not_oop = false; 1686 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1687 } 1688 1689 1690 // Arguments: 1691 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1692 // ignored 1693 // name - stub name string 1694 // 1695 // Inputs: 1696 // c_rarg0 - source array address 1697 // c_rarg1 - destination array address 1698 // c_rarg2 - element count, treated as size_t, can be zero 1699 // 1700 // Side Effects: 1701 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1702 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1703 // 1704 address generate_disjoint_long_copy(bool aligned, address *entry, 1705 const char *name, bool dest_uninitialized = false) { 1706 const bool not_oop = false; 1707 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1708 } 1709 1710 // Arguments: 1711 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1712 // ignored 1713 // name - stub name string 1714 // 1715 // Inputs: 1716 // c_rarg0 - source array address 1717 // c_rarg1 - destination array address 1718 // c_rarg2 - element count, treated as size_t, can be zero 1719 // 1720 address generate_conjoint_long_copy(bool aligned, 1721 address nooverlap_target, address *entry, 1722 const char *name, bool dest_uninitialized = false) { 1723 const bool not_oop = false; 1724 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1725 } 1726 1727 // Arguments: 1728 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1729 // ignored 1730 // name - stub name string 1731 // 1732 // Inputs: 1733 // c_rarg0 - source array address 1734 // c_rarg1 - destination array address 1735 // c_rarg2 - element count, treated as size_t, can be zero 1736 // 1737 // Side Effects: 1738 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1739 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1740 // 1741 address generate_disjoint_oop_copy(bool aligned, address *entry, 1742 const char *name, bool dest_uninitialized) { 1743 const bool is_oop = true; 1744 const size_t size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1745 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1746 } 1747 1748 // Arguments: 1749 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1750 // ignored 1751 // name - stub name string 1752 // 1753 // Inputs: 1754 // c_rarg0 - source array address 1755 // c_rarg1 - destination array address 1756 // c_rarg2 - element count, treated as size_t, can be zero 1757 // 1758 address generate_conjoint_oop_copy(bool aligned, 1759 address nooverlap_target, address *entry, 1760 const char *name, bool dest_uninitialized) { 1761 const bool is_oop = true; 1762 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1763 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1764 name, dest_uninitialized); 1765 } 1766 1767 1768 // Helper for generating a dynamic type check. 1769 // Smashes rscratch1. 1770 void generate_type_check(Register sub_klass, 1771 Register super_check_offset, 1772 Register super_klass, 1773 Label& L_success) { 1774 assert_different_registers(sub_klass, super_check_offset, super_klass); 1775 1776 BLOCK_COMMENT("type_check:"); 1777 1778 Label L_miss; 1779 1780 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1781 super_check_offset); 1782 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1783 1784 // Fall through on failure! 1785 __ BIND(L_miss); 1786 } 1787 1788 // 1789 // Generate checkcasting array copy stub 1790 // 1791 // Input: 1792 // c_rarg0 - source array address 1793 // c_rarg1 - destination array address 1794 // c_rarg2 - element count, treated as ssize_t, can be zero 1795 // c_rarg3 - size_t ckoff (super_check_offset) 1796 // c_rarg4 - oop ckval (super_klass) 1797 // 1798 // Output: 1799 // r0 == 0 - success 1800 // r0 == -1^K - failure, where K is partial transfer count 1801 // 1802 address generate_checkcast_copy(const char *name, address *entry, 1803 bool dest_uninitialized = false) { 1804 1805 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1806 1807 // Input registers (after setup_arg_regs) 1808 const Register from = c_rarg0; // source array address 1809 const Register to = c_rarg1; // destination array address 1810 const Register count = c_rarg2; // elements count 1811 const Register ckoff = c_rarg3; // super_check_offset 1812 const Register ckval = c_rarg4; // super_klass 1813 1814 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1815 RegSet wb_post_saved_regs = RegSet::of(count); 1816 1817 // Registers used as temps (r18, r19, r20 are save-on-entry) 1818 const Register count_save = r21; // orig elements count 1819 const Register start_to = r20; // destination array start address 1820 const Register copied_oop = r18; // actual oop copied 1821 const Register r19_klass = r19; // oop._klass 1822 1823 //--------------------------------------------------------------- 1824 // Assembler stub will be used for this call to arraycopy 1825 // if the two arrays are subtypes of Object[] but the 1826 // destination array type is not equal to or a supertype 1827 // of the source type. Each element must be separately 1828 // checked.
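    // Illustrative sketch only (not generated code, and assuming the
    // usual arrayOop layout): the element-wise copy emitted below
    // behaves roughly like
    //
    //   int i;
    //   for (i = 0; i < count; i++) {
    //     oop e = src[i];
    //     if (e != NULL && !is_subtype_of(e->klass(), ckval)) break;  // type check fails
    //     dst[i] = e;
    //   }
    //   return (i == count) ? 0 : ~i;   // i.e. -1 ^ (elements copied)
    //
    // so a caller that sees a non-zero r0 recovers the number of
    // elements actually transferred as ~r0.  'is_subtype_of' here is
    // shorthand for the fast/slow path check in generate_type_check().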
1829 1830 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1831 copied_oop, r19_klass, count_save); 1832 1833 __ align(CodeEntryAlignment); 1834 StubCodeMark mark(this, "StubRoutines", name); 1835 address start = __ pc(); 1836 1837 __ enter(); // required for proper stackwalking of RuntimeStub frame 1838 1839 #ifdef ASSERT 1840 // caller guarantees that the arrays really are different 1841 // otherwise, we would have to make conjoint checks 1842 { Label L; 1843 array_overlap_test(L, TIMES_OOP); 1844 __ stop("checkcast_copy within a single array"); 1845 __ bind(L); 1846 } 1847 #endif //ASSERT 1848 1849 // Caller of this entry point must set up the argument registers. 1850 if (entry != NULL) { 1851 *entry = __ pc(); 1852 BLOCK_COMMENT("Entry:"); 1853 } 1854 1855 // Empty array: Nothing to do. 1856 __ cbz(count, L_done); 1857 1858 __ push(RegSet::of(r18, r19, r20, r21), sp); 1859 1860 #ifdef ASSERT 1861 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1862 // The ckoff and ckval must be mutually consistent, 1863 // even though caller generates both. 1864 { Label L; 1865 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1866 __ ldrw(start_to, Address(ckval, sco_offset)); 1867 __ cmpw(ckoff, start_to); 1868 __ br(Assembler::EQ, L); 1869 __ stop("super_check_offset inconsistent"); 1870 __ bind(L); 1871 } 1872 #endif //ASSERT 1873 1874 gen_write_ref_array_pre_barrier(to, count, dest_uninitialized, wb_pre_saved_regs); 1875 1876 // save the original count 1877 __ mov(count_save, count); 1878 1879 // Copy from low to high addresses 1880 __ mov(start_to, to); // Save destination array start address 1881 __ b(L_load_element); 1882 1883 // ======== begin loop ======== 1884 // (Loop is rotated; its entry is L_load_element.) 1885 // Loop control: 1886 // for (; count != 0; count--) { 1887 // copied_oop = load_heap_oop(from++); 1888 // ... generate_type_check ...; 1889 // store_heap_oop(to++, copied_oop); 1890 // } 1891 __ align(OptoLoopAlignment); 1892 1893 __ BIND(L_store_element); 1894 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop 1895 __ sub(count, count, 1); 1896 __ cbz(count, L_do_card_marks); 1897 1898 // ======== loop entry is here ======== 1899 __ BIND(L_load_element); 1900 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop 1901 __ cbz(copied_oop, L_store_element); 1902 1903 __ load_klass(r19_klass, copied_oop);// query the object klass 1904 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1905 // ======== end loop ======== 1906 1907 // It was a real error; we must depend on the caller to finish the job. 1908 // Register count = remaining oops, count_orig = total oops. 1909 // Emit GC store barriers for the oops we have copied and report 1910 // their number to the caller. 1911 1912 __ subs(count, count_save, count); // K = partially copied oop count 1913 __ eon(count, count, zr); // report (-1^K) to caller 1914 __ br(Assembler::EQ, L_done_pop); 1915 1916 __ BIND(L_do_card_marks); 1917 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1918 gen_write_ref_array_post_barrier(start_to, to, rscratch1, wb_post_saved_regs); 1919 1920 __ bind(L_done_pop); 1921 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1922 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1923 1924 __ bind(L_done); 1925 __ mov(r0, count); 1926 __ leave(); 1927 __ ret(lr); 1928 1929 return start; 1930 } 1931 1932 // Perform range checks on the proposed arraycopy. 
1933 // Kills temp, but nothing else. 1934 // Also, clean the sign bits of src_pos and dst_pos. 1935 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1936 Register src_pos, // source position (c_rarg1) 1937 Register dst, // destination array oop (c_rarg2) 1938 Register dst_pos, // destination position (c_rarg3) 1939 Register length, 1940 Register temp, 1941 Label& L_failed) { 1942 BLOCK_COMMENT("arraycopy_range_checks:"); 1943 1944 assert_different_registers(rscratch1, temp); 1945 1946 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1947 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1948 __ addw(temp, length, src_pos); 1949 __ cmpw(temp, rscratch1); 1950 __ br(Assembler::HI, L_failed); 1951 1952 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1953 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1954 __ addw(temp, length, dst_pos); 1955 __ cmpw(temp, rscratch1); 1956 __ br(Assembler::HI, L_failed); 1957 1958 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1959 __ movw(src_pos, src_pos); 1960 __ movw(dst_pos, dst_pos); 1961 1962 BLOCK_COMMENT("arraycopy_range_checks done"); 1963 } 1964 1965 // These stubs get called from some dumb test routine. 1966 // I'll write them properly when they're called from 1967 // something that's actually doing something. 1968 static void fake_arraycopy_stub(address src, address dst, int count) { 1969 assert(count == 0, "huh?"); 1970 } 1971 1972 1973 // 1974 // Generate 'unsafe' array copy stub 1975 // Though just as safe as the other stubs, it takes an unscaled 1976 // size_t argument instead of an element count. 1977 // 1978 // Input: 1979 // c_rarg0 - source array address 1980 // c_rarg1 - destination array address 1981 // c_rarg2 - byte count, treated as ssize_t, can be zero 1982 // 1983 // Examines the alignment of the operands and dispatches 1984 // to a long, int, short, or byte copy loop.
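  // A minimal C-style sketch of that dispatch (illustrative only, not
  // generated code): the stub ORs together the source address, the
  // destination address and the byte count and tests the low bits of
  // the result, so the copy is routed to the widest element size that
  // all three values are aligned for:
  //
  //   size_t bits = (size_t)s | (size_t)d | (size_t)count;
  //   if ((bits & 7) == 0)      long_copy (s, d, count >> 3);
  //   else if ((bits & 3) == 0) int_copy  (s, d, count >> 2);
  //   else if ((bits & 1) == 0) short_copy(s, d, count >> 1);
  //   else                      byte_copy (s, d, count);
  //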
1985 // 1986 address generate_unsafe_copy(const char *name, 1987 address byte_copy_entry, 1988 address short_copy_entry, 1989 address int_copy_entry, 1990 address long_copy_entry) { 1991 Label L_long_aligned, L_int_aligned, L_short_aligned; 1992 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1993 1994 __ align(CodeEntryAlignment); 1995 StubCodeMark mark(this, "StubRoutines", name); 1996 address start = __ pc(); 1997 __ enter(); // required for proper stackwalking of RuntimeStub frame 1998 1999 // bump this on entry, not on exit: 2000 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2001 2002 __ orr(rscratch1, s, d); 2003 __ orr(rscratch1, rscratch1, count); 2004 2005 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2006 __ cbz(rscratch1, L_long_aligned); 2007 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2008 __ cbz(rscratch1, L_int_aligned); 2009 __ tbz(rscratch1, 0, L_short_aligned); 2010 __ b(RuntimeAddress(byte_copy_entry)); 2011 2012 __ BIND(L_short_aligned); 2013 __ lsr(count, count, LogBytesPerShort); // size => short_count 2014 __ b(RuntimeAddress(short_copy_entry)); 2015 __ BIND(L_int_aligned); 2016 __ lsr(count, count, LogBytesPerInt); // size => int_count 2017 __ b(RuntimeAddress(int_copy_entry)); 2018 __ BIND(L_long_aligned); 2019 __ lsr(count, count, LogBytesPerLong); // size => long_count 2020 __ b(RuntimeAddress(long_copy_entry)); 2021 2022 return start; 2023 } 2024 2025 // 2026 // Generate generic array copy stubs 2027 // 2028 // Input: 2029 // c_rarg0 - src oop 2030 // c_rarg1 - src_pos (32-bits) 2031 // c_rarg2 - dst oop 2032 // c_rarg3 - dst_pos (32-bits) 2033 // c_rarg4 - element count (32-bits) 2034 // 2035 // Output: 2036 // r0 == 0 - success 2037 // r0 == -1^K - failure, where K is partial transfer count 2038 // 2039 address generate_generic_copy(const char *name, 2040 address byte_copy_entry, address short_copy_entry, 2041 address int_copy_entry, address oop_copy_entry, 2042 address long_copy_entry, address checkcast_copy_entry) { 2043 2044 Label L_failed, L_failed_0, L_objArray; 2045 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2046 2047 // Input registers 2048 const Register src = c_rarg0; // source array oop 2049 const Register src_pos = c_rarg1; // source position 2050 const Register dst = c_rarg2; // destination array oop 2051 const Register dst_pos = c_rarg3; // destination position 2052 const Register length = c_rarg4; 2053 2054 StubCodeMark mark(this, "StubRoutines", name); 2055 2056 __ align(CodeEntryAlignment); 2057 address start = __ pc(); 2058 2059 __ enter(); // required for proper stackwalking of RuntimeStub frame 2060 2061 // bump this on entry, not on exit: 2062 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2063 2064 //----------------------------------------------------------------------- 2065 // Assembler stub will be used for this call to arraycopy 2066 // if the following conditions are met: 2067 // 2068 // (1) src and dst must not be null. 2069 // (2) src_pos must not be negative. 2070 // (3) dst_pos must not be negative. 2071 // (4) length must not be negative. 2072 // (5) src klass and dst klass should be the same and not NULL. 2073 // (6) src and dst should be arrays. 2074 // (7) src_pos + length must not exceed length of src. 2075 // (8) dst_pos + length must not exceed length of dst. 2076 // 2077 2078 // if (src == NULL) return -1; 2079 __ cbz(src, L_failed); 2080 2081 // if (src_pos < 0) return -1; 2082 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2083 2084 // if (dst == NULL) return -1; 2085 __ cbz(dst, L_failed); 2086 2087 // if (dst_pos < 0) return -1; 2088 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2089 2090 // registers used as temp 2091 const Register scratch_length = r16; // elements count to copy 2092 const Register scratch_src_klass = r17; // array klass 2093 const Register lh = r18; // layout helper 2094 2095 // if (length < 0) return -1; 2096 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2097 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2098 2099 __ load_klass(scratch_src_klass, src); 2100 #ifdef ASSERT 2101 // assert(src->klass() != NULL); 2102 { 2103 BLOCK_COMMENT("assert klasses not null {"); 2104 Label L1, L2; 2105 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2106 __ bind(L1); 2107 __ stop("broken null klass"); 2108 __ bind(L2); 2109 __ load_klass(rscratch1, dst); 2110 __ cbz(rscratch1, L1); // this would be broken also 2111 BLOCK_COMMENT("} assert klasses not null done"); 2112 } 2113 #endif 2114 2115 // Load layout helper (32-bits) 2116 // 2117 // |array_tag| | header_size | element_type | |log2_element_size| 2118 // 32 30 24 16 8 2 0 2119 // 2120 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2121 // 2122 2123 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2124 2125 // Handle objArrays completely differently... 2126 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2127 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2128 __ movw(rscratch1, objArray_lh); 2129 __ eorw(rscratch2, lh, rscratch1); 2130 __ cbzw(rscratch2, L_objArray); 2131 2132 // if (src->klass() != dst->klass()) return -1; 2133 __ load_klass(rscratch2, dst); 2134 __ eor(rscratch2, rscratch2, scratch_src_klass); 2135 __ cbnz(rscratch2, L_failed); 2136 2137 // if (!src->is_Array()) return -1; 2138 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2139 2140 // At this point, it is known to be a typeArray (array_tag 0x3). 2141 #ifdef ASSERT 2142 { 2143 BLOCK_COMMENT("assert primitive array {"); 2144 Label L; 2145 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2146 __ cmpw(lh, rscratch2); 2147 __ br(Assembler::GE, L); 2148 __ stop("must be a primitive array"); 2149 __ bind(L); 2150 BLOCK_COMMENT("} assert primitive array done"); 2151 } 2152 #endif 2153 2154 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2155 rscratch2, L_failed); 2156 2157 // TypeArrayKlass 2158 // 2159 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2160 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2161 // 2162 2163 const Register rscratch1_offset = rscratch1; // array offset 2164 const Register r18_elsize = lh; // element size 2165 2166 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2167 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2168 __ add(src, src, rscratch1_offset); // src array offset 2169 __ add(dst, dst, rscratch1_offset); // dst array offset 2170 BLOCK_COMMENT("choose copy loop based on element size"); 2171 2172 // next registers should be set before the jump to corresponding stub 2173 const Register from = c_rarg0; // source array address 2174 const Register to = c_rarg1; // destination array address 2175 const Register count = c_rarg2; // elements count 2176 2177 // 'from', 'to', 'count' registers should be set in such order 2178 // since they are the same as 'src', 'src_pos', 'dst'. 
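    // Illustrative sketch (not generated code): at this point 'src' and
    // 'dst' have already been advanced past the array header, and the
    // low two bits of the layout helper hold log2(element size).  The
    // tbnz tests below perform a bitwise binary search over that value:
    //
    //   int elsize = lh & 3;                  // 0..3 == log2(bytes per element)
    //   from  = src + (src_pos << elsize);
    //   to    = dst + (dst_pos << elsize);
    //   count = length;
    //   goto {byte,short,int,long}_copy_entry;   // picked by elsize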
2179 2180 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2181 2182 // The possible values of elsize are 0-3, i.e. exact_log2(element 2183 // size in bytes). We do a simple bitwise binary search. 2184 __ BIND(L_copy_bytes); 2185 __ tbnz(r18_elsize, 1, L_copy_ints); 2186 __ tbnz(r18_elsize, 0, L_copy_shorts); 2187 __ lea(from, Address(src, src_pos));// src_addr 2188 __ lea(to, Address(dst, dst_pos));// dst_addr 2189 __ movw(count, scratch_length); // length 2190 __ b(RuntimeAddress(byte_copy_entry)); 2191 2192 __ BIND(L_copy_shorts); 2193 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2194 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2195 __ movw(count, scratch_length); // length 2196 __ b(RuntimeAddress(short_copy_entry)); 2197 2198 __ BIND(L_copy_ints); 2199 __ tbnz(r18_elsize, 0, L_copy_longs); 2200 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2201 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2202 __ movw(count, scratch_length); // length 2203 __ b(RuntimeAddress(int_copy_entry)); 2204 2205 __ BIND(L_copy_longs); 2206 #ifdef ASSERT 2207 { 2208 BLOCK_COMMENT("assert long copy {"); 2209 Label L; 2210 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2211 __ cmpw(r18_elsize, LogBytesPerLong); 2212 __ br(Assembler::EQ, L); 2213 __ stop("must be long copy, but elsize is wrong"); 2214 __ bind(L); 2215 BLOCK_COMMENT("} assert long copy done"); 2216 } 2217 #endif 2218 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2219 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2220 __ movw(count, scratch_length); // length 2221 __ b(RuntimeAddress(long_copy_entry)); 2222 2223 // ObjArrayKlass 2224 __ BIND(L_objArray); 2225 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2226 2227 Label L_plain_copy, L_checkcast_copy; 2228 // test array classes for subtyping 2229 __ load_klass(r18, dst); 2230 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2231 __ br(Assembler::NE, L_checkcast_copy); 2232 2233 // Identically typed arrays can be copied without element-wise checks. 2234 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2235 rscratch2, L_failed); 2236 2237 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2238 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2239 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2240 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2241 __ movw(count, scratch_length); // length 2242 __ BIND(L_plain_copy); 2243 __ b(RuntimeAddress(oop_copy_entry)); 2244 2245 __ BIND(L_checkcast_copy); 2246 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2247 { 2248 // Before looking at dst.length, make sure dst is also an objArray. 2249 __ ldrw(rscratch1, Address(r18, lh_offset)); 2250 __ movw(rscratch2, objArray_lh); 2251 __ eorw(rscratch1, rscratch1, rscratch2); 2252 __ cbnzw(rscratch1, L_failed); 2253 2254 // It is safe to examine both src.length and dst.length. 2255 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2256 r18, L_failed); 2257 2258 const Register rscratch2_dst_klass = rscratch2; 2259 __ load_klass(rscratch2_dst_klass, dst); // reload 2260 2261 // Marshal the base address arguments now, freeing registers. 
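      // Illustrative sketch (not generated code) of what this branch
      // arranges: if the source array klass still proves to be a
      // subtype of the destination array klass we branch back to the
      // plain oop copy; otherwise the checkcast stub is tail-called
      // with the destination's element klass and its super_check_offset
      // as the two extra arguments, roughly
      //
      //   from  = src + header + (src_pos << LogBytesPerHeapOop);
      //   to    = dst + header + (dst_pos << LogBytesPerHeapOop);
      //   count = length;
      //   ckoff = dst_klass->element_klass()->super_check_offset();
      //   ckval = dst_klass->element_klass();
      //   goto checkcast_copy_entry;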
2262 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2263 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2264 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2265 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2266 __ movw(count, length); // length (reloaded) 2267 Register sco_temp = c_rarg3; // this register is free now 2268 assert_different_registers(from, to, count, sco_temp, 2269 rscratch2_dst_klass, scratch_src_klass); 2270 // assert_clean_int(count, sco_temp); 2271 2272 // Generate the type check. 2273 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2274 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2275 // assert_clean_int(sco_temp, r18); 2276 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2277 2278 // Fetch destination element klass from the ObjArrayKlass header. 2279 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2280 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2281 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2282 2283 // the checkcast_copy loop needs two extra arguments: 2284 assert(c_rarg3 == sco_temp, "#3 already in place"); 2285 // Set up arguments for checkcast_copy_entry. 2286 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2287 __ b(RuntimeAddress(checkcast_copy_entry)); 2288 } 2289 2290 __ BIND(L_failed); 2291 __ mov(r0, -1); 2292 __ leave(); // required for proper stackwalking of RuntimeStub frame 2293 __ ret(lr); 2294 2295 return start; 2296 } 2297 2298 // 2299 // Generate stub for array fill. If "aligned" is true, the 2300 // "to" address is assumed to be heapword aligned. 2301 // 2302 // Arguments for generated stub: 2303 // to: c_rarg0 2304 // value: c_rarg1 2305 // count: c_rarg2 treated as signed 2306 // 2307 address generate_fill(BasicType t, bool aligned, const char *name) { 2308 __ align(CodeEntryAlignment); 2309 StubCodeMark mark(this, "StubRoutines", name); 2310 address start = __ pc(); 2311 2312 BLOCK_COMMENT("Entry:"); 2313 2314 const Register to = c_rarg0; // source array address 2315 const Register value = c_rarg1; // value 2316 const Register count = c_rarg2; // elements count 2317 2318 const Register bz_base = r10; // base for block_zero routine 2319 const Register cnt_words = r11; // temp register 2320 2321 __ enter(); 2322 2323 Label L_fill_elements, L_exit1; 2324 2325 int shift = -1; 2326 switch (t) { 2327 case T_BYTE: 2328 shift = 0; 2329 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2330 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2331 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2332 __ br(Assembler::LO, L_fill_elements); 2333 break; 2334 case T_SHORT: 2335 shift = 1; 2336 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2337 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2338 __ br(Assembler::LO, L_fill_elements); 2339 break; 2340 case T_INT: 2341 shift = 2; 2342 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2343 __ br(Assembler::LO, L_fill_elements); 2344 break; 2345 default: ShouldNotReachHere(); 2346 } 2347 2348 // Align source address at 8 bytes address boundary. 2349 Label L_skip_align1, L_skip_align2, L_skip_align4; 2350 if (!aligned) { 2351 switch (t) { 2352 case T_BYTE: 2353 // One byte misalignment happens only for byte arrays. 
2354 __ tbz(to, 0, L_skip_align1); 2355 __ strb(value, Address(__ post(to, 1))); 2356 __ subw(count, count, 1); 2357 __ bind(L_skip_align1); 2358 // Fallthrough 2359 case T_SHORT: 2360 // Two bytes misalignment happens only for byte and short (char) arrays. 2361 __ tbz(to, 1, L_skip_align2); 2362 __ strh(value, Address(__ post(to, 2))); 2363 __ subw(count, count, 2 >> shift); 2364 __ bind(L_skip_align2); 2365 // Fallthrough 2366 case T_INT: 2367 // Align to 8 bytes, we know we are 4 byte aligned to start. 2368 __ tbz(to, 2, L_skip_align4); 2369 __ strw(value, Address(__ post(to, 4))); 2370 __ subw(count, count, 4 >> shift); 2371 __ bind(L_skip_align4); 2372 break; 2373 default: ShouldNotReachHere(); 2374 } 2375 } 2376 2377 // 2378 // Fill large chunks 2379 // 2380 __ lsrw(cnt_words, count, 3 - shift); // number of words 2381 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2382 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2383 if (UseBlockZeroing) { 2384 Label non_block_zeroing, rest; 2385 // If the fill value is zero we can use the fast zero_words(). 2386 __ cbnz(value, non_block_zeroing); 2387 __ mov(bz_base, to); 2388 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2389 __ zero_words(bz_base, cnt_words); 2390 __ b(rest); 2391 __ bind(non_block_zeroing); 2392 __ fill_words(to, cnt_words, value); 2393 __ bind(rest); 2394 } else { 2395 __ fill_words(to, cnt_words, value); 2396 } 2397 2398 // Remaining count is less than 8 bytes. Fill it by a single store. 2399 // Note that the total length is no less than 8 bytes. 2400 if (t == T_BYTE || t == T_SHORT) { 2401 Label L_exit1; 2402 __ cbzw(count, L_exit1); 2403 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2404 __ str(value, Address(to, -8)); // overwrite some elements 2405 __ bind(L_exit1); 2406 __ leave(); 2407 __ ret(lr); 2408 } 2409 2410 // Handle copies less than 8 bytes. 
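    // Illustrative sketch (not generated code): on this path the whole
    // remaining fill is less than 8 bytes, so the low bits of 'count'
    // fully describe it and each set bit is handled with one store of
    // the matching width.  For T_BYTE, for example:
    //
    //   if (count & 1) { *(u1*)to = value; to += 1; }
    //   if (count & 2) { *(u2*)to = value; to += 2; }
    //   if (count & 4) { *(u4*)to = value;          }
    //
    // ('value' has already been replicated into the wider lanes above.)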
2411 Label L_fill_2, L_fill_4, L_exit2; 2412 __ bind(L_fill_elements); 2413 switch (t) { 2414 case T_BYTE: 2415 __ tbz(count, 0, L_fill_2); 2416 __ strb(value, Address(__ post(to, 1))); 2417 __ bind(L_fill_2); 2418 __ tbz(count, 1, L_fill_4); 2419 __ strh(value, Address(__ post(to, 2))); 2420 __ bind(L_fill_4); 2421 __ tbz(count, 2, L_exit2); 2422 __ strw(value, Address(to)); 2423 break; 2424 case T_SHORT: 2425 __ tbz(count, 0, L_fill_4); 2426 __ strh(value, Address(__ post(to, 2))); 2427 __ bind(L_fill_4); 2428 __ tbz(count, 1, L_exit2); 2429 __ strw(value, Address(to)); 2430 break; 2431 case T_INT: 2432 __ cbzw(count, L_exit2); 2433 __ strw(value, Address(to)); 2434 break; 2435 default: ShouldNotReachHere(); 2436 } 2437 __ bind(L_exit2); 2438 __ leave(); 2439 __ ret(lr); 2440 return start; 2441 } 2442 2443 void generate_arraycopy_stubs() { 2444 address entry; 2445 address entry_jbyte_arraycopy; 2446 address entry_jshort_arraycopy; 2447 address entry_jint_arraycopy; 2448 address entry_oop_arraycopy; 2449 address entry_jlong_arraycopy; 2450 address entry_checkcast_arraycopy; 2451 2452 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2453 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2454 2455 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2456 2457 //*** jbyte 2458 // Always need aligned and unaligned versions 2459 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2460 "jbyte_disjoint_arraycopy"); 2461 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2462 &entry_jbyte_arraycopy, 2463 "jbyte_arraycopy"); 2464 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2465 "arrayof_jbyte_disjoint_arraycopy"); 2466 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2467 "arrayof_jbyte_arraycopy"); 2468 2469 //*** jshort 2470 // Always need aligned and unaligned versions 2471 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2472 "jshort_disjoint_arraycopy"); 2473 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2474 &entry_jshort_arraycopy, 2475 "jshort_arraycopy"); 2476 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2477 "arrayof_jshort_disjoint_arraycopy"); 2478 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2479 "arrayof_jshort_arraycopy"); 2480 2481 //*** jint 2482 // Aligned versions 2483 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2484 "arrayof_jint_disjoint_arraycopy"); 2485 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2486 "arrayof_jint_arraycopy"); 2487 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2488 // entry_jint_arraycopy always points to the unaligned version 2489 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2490 "jint_disjoint_arraycopy"); 2491 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2492 &entry_jint_arraycopy, 2493 "jint_arraycopy"); 2494 2495 //*** jlong 2496 // It is always aligned 2497 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2498 "arrayof_jlong_disjoint_arraycopy"); 2499 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2500 "arrayof_jlong_arraycopy"); 2501 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2502 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2503 2504 //*** oops 2505 { 2506 // With compressed oops we need unaligned versions; notice that 2507 // we overwrite entry_oop_arraycopy. 2508 bool aligned = !UseCompressedOops; 2509 2510 StubRoutines::_arrayof_oop_disjoint_arraycopy 2511 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2512 /*dest_uninitialized*/false); 2513 StubRoutines::_arrayof_oop_arraycopy 2514 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2515 /*dest_uninitialized*/false); 2516 // Aligned versions without pre-barriers 2517 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2518 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2519 /*dest_uninitialized*/true); 2520 StubRoutines::_arrayof_oop_arraycopy_uninit 2521 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2522 /*dest_uninitialized*/true); 2523 } 2524 2525 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2526 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2527 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2528 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2529 2530 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2531 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2532 /*dest_uninitialized*/true); 2533 2534 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2535 entry_jbyte_arraycopy, 2536 entry_jshort_arraycopy, 2537 entry_jint_arraycopy, 2538 entry_jlong_arraycopy); 2539 2540 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2541 entry_jbyte_arraycopy, 2542 entry_jshort_arraycopy, 2543 entry_jint_arraycopy, 2544 entry_oop_arraycopy, 2545 entry_jlong_arraycopy, 2546 entry_checkcast_arraycopy); 2547 2548 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2549 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2550 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2551 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2552 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2553 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2554 } 2555 2556 void generate_math_stubs() { Unimplemented(); } 2557 2558 // Arguments: 2559 // 2560 // Inputs: 2561 // c_rarg0 - source byte array address 2562 // c_rarg1 - destination 
byte array address 2563 // c_rarg2 - K (key) in little endian int array 2564 // 2565 address generate_aescrypt_encryptBlock() { 2566 __ align(CodeEntryAlignment); 2567 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2568 2569 Label L_doLast; 2570 2571 const Register from = c_rarg0; // source array address 2572 const Register to = c_rarg1; // destination array address 2573 const Register key = c_rarg2; // key array address 2574 const Register keylen = rscratch1; 2575 2576 address start = __ pc(); 2577 __ enter(); 2578 2579 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2580 2581 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2582 2583 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2584 __ rev32(v1, __ T16B, v1); 2585 __ rev32(v2, __ T16B, v2); 2586 __ rev32(v3, __ T16B, v3); 2587 __ rev32(v4, __ T16B, v4); 2588 __ aese(v0, v1); 2589 __ aesmc(v0, v0); 2590 __ aese(v0, v2); 2591 __ aesmc(v0, v0); 2592 __ aese(v0, v3); 2593 __ aesmc(v0, v0); 2594 __ aese(v0, v4); 2595 __ aesmc(v0, v0); 2596 2597 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2598 __ rev32(v1, __ T16B, v1); 2599 __ rev32(v2, __ T16B, v2); 2600 __ rev32(v3, __ T16B, v3); 2601 __ rev32(v4, __ T16B, v4); 2602 __ aese(v0, v1); 2603 __ aesmc(v0, v0); 2604 __ aese(v0, v2); 2605 __ aesmc(v0, v0); 2606 __ aese(v0, v3); 2607 __ aesmc(v0, v0); 2608 __ aese(v0, v4); 2609 __ aesmc(v0, v0); 2610 2611 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2612 __ rev32(v1, __ T16B, v1); 2613 __ rev32(v2, __ T16B, v2); 2614 2615 __ cmpw(keylen, 44); 2616 __ br(Assembler::EQ, L_doLast); 2617 2618 __ aese(v0, v1); 2619 __ aesmc(v0, v0); 2620 __ aese(v0, v2); 2621 __ aesmc(v0, v0); 2622 2623 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2624 __ rev32(v1, __ T16B, v1); 2625 __ rev32(v2, __ T16B, v2); 2626 2627 __ cmpw(keylen, 52); 2628 __ br(Assembler::EQ, L_doLast); 2629 2630 __ aese(v0, v1); 2631 __ aesmc(v0, v0); 2632 __ aese(v0, v2); 2633 __ aesmc(v0, v0); 2634 2635 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2636 __ rev32(v1, __ T16B, v1); 2637 __ rev32(v2, __ T16B, v2); 2638 2639 __ BIND(L_doLast); 2640 2641 __ aese(v0, v1); 2642 __ aesmc(v0, v0); 2643 __ aese(v0, v2); 2644 2645 __ ld1(v1, __ T16B, key); 2646 __ rev32(v1, __ T16B, v1); 2647 __ eor(v0, __ T16B, v0, v1); 2648 2649 __ st1(v0, __ T16B, to); 2650 2651 __ mov(r0, 0); 2652 2653 __ leave(); 2654 __ ret(lr); 2655 2656 return start; 2657 } 2658 2659 // Arguments: 2660 // 2661 // Inputs: 2662 // c_rarg0 - source byte array address 2663 // c_rarg1 - destination byte array address 2664 // c_rarg2 - K (key) in little endian int array 2665 // 2666 address generate_aescrypt_decryptBlock() { 2667 assert(UseAES, "need AES instructions and misaligned SSE support"); 2668 __ align(CodeEntryAlignment); 2669 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2670 Label L_doLast; 2671 2672 const Register from = c_rarg0; // source array address 2673 const Register to = c_rarg1; // destination array address 2674 const Register key = c_rarg2; // key array address 2675 const Register keylen = rscratch1; 2676 2677 address start = __ pc(); 2678 __ enter(); // required for proper stackwalking of RuntimeStub frame 2679 2680 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2681 2682 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2683 2684 __ ld1(v5, __ T16B, __ post(key, 16)); 2685 __ rev32(v5, __ T16B, v5); 2686 2687 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2688 __ rev32(v1, __ T16B, v1); 2689 __ rev32(v2, __ T16B, v2); 2690 __ rev32(v3, __ T16B, v3); 2691 __ rev32(v4, __ T16B, v4); 2692 __ aesd(v0, v1); 2693 __ aesimc(v0, v0); 2694 __ aesd(v0, v2); 2695 __ aesimc(v0, v0); 2696 __ aesd(v0, v3); 2697 __ aesimc(v0, v0); 2698 __ aesd(v0, v4); 2699 __ aesimc(v0, v0); 2700 2701 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2702 __ rev32(v1, __ T16B, v1); 2703 __ rev32(v2, __ T16B, v2); 2704 __ rev32(v3, __ T16B, v3); 2705 __ rev32(v4, __ T16B, v4); 2706 __ aesd(v0, v1); 2707 __ aesimc(v0, v0); 2708 __ aesd(v0, v2); 2709 __ aesimc(v0, v0); 2710 __ aesd(v0, v3); 2711 __ aesimc(v0, v0); 2712 __ aesd(v0, v4); 2713 __ aesimc(v0, v0); 2714 2715 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2716 __ rev32(v1, __ T16B, v1); 2717 __ rev32(v2, __ T16B, v2); 2718 2719 __ cmpw(keylen, 44); 2720 __ br(Assembler::EQ, L_doLast); 2721 2722 __ aesd(v0, v1); 2723 __ aesimc(v0, v0); 2724 __ aesd(v0, v2); 2725 __ aesimc(v0, v0); 2726 2727 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2728 __ rev32(v1, __ T16B, v1); 2729 __ rev32(v2, __ T16B, v2); 2730 2731 __ cmpw(keylen, 52); 2732 __ br(Assembler::EQ, L_doLast); 2733 2734 __ aesd(v0, v1); 2735 __ aesimc(v0, v0); 2736 __ aesd(v0, v2); 2737 __ aesimc(v0, v0); 2738 2739 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2740 __ rev32(v1, __ T16B, v1); 2741 __ rev32(v2, __ T16B, v2); 2742 2743 __ BIND(L_doLast); 2744 2745 __ aesd(v0, v1); 2746 __ aesimc(v0, v0); 2747 __ aesd(v0, v2); 2748 2749 __ eor(v0, __ T16B, v0, v5); 2750 2751 __ st1(v0, __ T16B, to); 2752 2753 __ mov(r0, 0); 2754 2755 __ leave(); 2756 __ ret(lr); 2757 2758 return start; 2759 } 2760 2761 // Arguments: 2762 // 2763 // Inputs: 2764 // c_rarg0 - source byte array address 2765 // c_rarg1 - destination byte array address 2766 // c_rarg2 - K (key) in little endian int array 2767 // c_rarg3 - r vector byte array address 2768 // c_rarg4 - input length 2769 // 2770 // Output: 2771 // x0 - input length 2772 // 2773 address generate_cipherBlockChaining_encryptAESCrypt() { 2774 assert(UseAES, "need AES instructions and misaligned SSE support"); 2775 __ align(CodeEntryAlignment); 2776 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2777 2778 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2779 2780 const Register from = c_rarg0; // source array address 2781 const Register to = c_rarg1; // destination array address 2782 const Register key = c_rarg2; // key array address 2783 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2784 // and left with the results of the last encryption block 2785 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2786 const Register keylen = rscratch1; 2787 2788 address start = __ pc(); 2789 2790 __ enter(); 2791 2792 __ movw(rscratch2, len_reg); 2793 2794 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2795 2796 __ ld1(v0, __ T16B, rvec); 2797 2798 __ cmpw(keylen, 52); 2799 __ br(Assembler::CC, L_loadkeys_44); 2800 __ br(Assembler::EQ, L_loadkeys_52); 2801 2802 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2803 __ rev32(v17, __ T16B, v17); 2804 __ rev32(v18, __ T16B, v18); 2805 __ BIND(L_loadkeys_52); 2806 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2807 __ rev32(v19, __ T16B, v19); 2808 __ rev32(v20, __ T16B, v20); 2809 __ BIND(L_loadkeys_44); 2810 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2811 __ rev32(v21, __ 
T16B, v21); 2812 __ rev32(v22, __ T16B, v22); 2813 __ rev32(v23, __ T16B, v23); 2814 __ rev32(v24, __ T16B, v24); 2815 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2816 __ rev32(v25, __ T16B, v25); 2817 __ rev32(v26, __ T16B, v26); 2818 __ rev32(v27, __ T16B, v27); 2819 __ rev32(v28, __ T16B, v28); 2820 __ ld1(v29, v30, v31, __ T16B, key); 2821 __ rev32(v29, __ T16B, v29); 2822 __ rev32(v30, __ T16B, v30); 2823 __ rev32(v31, __ T16B, v31); 2824 2825 __ BIND(L_aes_loop); 2826 __ ld1(v1, __ T16B, __ post(from, 16)); 2827 __ eor(v0, __ T16B, v0, v1); 2828 2829 __ br(Assembler::CC, L_rounds_44); 2830 __ br(Assembler::EQ, L_rounds_52); 2831 2832 __ aese(v0, v17); __ aesmc(v0, v0); 2833 __ aese(v0, v18); __ aesmc(v0, v0); 2834 __ BIND(L_rounds_52); 2835 __ aese(v0, v19); __ aesmc(v0, v0); 2836 __ aese(v0, v20); __ aesmc(v0, v0); 2837 __ BIND(L_rounds_44); 2838 __ aese(v0, v21); __ aesmc(v0, v0); 2839 __ aese(v0, v22); __ aesmc(v0, v0); 2840 __ aese(v0, v23); __ aesmc(v0, v0); 2841 __ aese(v0, v24); __ aesmc(v0, v0); 2842 __ aese(v0, v25); __ aesmc(v0, v0); 2843 __ aese(v0, v26); __ aesmc(v0, v0); 2844 __ aese(v0, v27); __ aesmc(v0, v0); 2845 __ aese(v0, v28); __ aesmc(v0, v0); 2846 __ aese(v0, v29); __ aesmc(v0, v0); 2847 __ aese(v0, v30); 2848 __ eor(v0, __ T16B, v0, v31); 2849 2850 __ st1(v0, __ T16B, __ post(to, 16)); 2851 2852 __ subw(len_reg, len_reg, 16); 2853 __ cbnzw(len_reg, L_aes_loop); 2854 2855 __ st1(v0, __ T16B, rvec); 2856 2857 __ mov(r0, rscratch2); 2858 2859 __ leave(); 2860 __ ret(lr); 2861 2862 return start; 2863 } 2864 2865 // Arguments: 2866 // 2867 // Inputs: 2868 // c_rarg0 - source byte array address 2869 // c_rarg1 - destination byte array address 2870 // c_rarg2 - K (key) in little endian int array 2871 // c_rarg3 - r vector byte array address 2872 // c_rarg4 - input length 2873 // 2874 // Output: 2875 // r0 - input length 2876 // 2877 address generate_cipherBlockChaining_decryptAESCrypt() { 2878 assert(UseAES, "need AES instructions and misaligned SSE support"); 2879 __ align(CodeEntryAlignment); 2880 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2881 2882 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2883 2884 const Register from = c_rarg0; // source array address 2885 const Register to = c_rarg1; // destination array address 2886 const Register key = c_rarg2; // key array address 2887 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2888 // and left with the results of the last encryption block 2889 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2890 const Register keylen = rscratch1; 2891 2892 address start = __ pc(); 2893 2894 __ enter(); 2895 2896 __ movw(rscratch2, len_reg); 2897 2898 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2899 2900 __ ld1(v2, __ T16B, rvec); 2901 2902 __ ld1(v31, __ T16B, __ post(key, 16)); 2903 __ rev32(v31, __ T16B, v31); 2904 2905 __ cmpw(keylen, 52); 2906 __ br(Assembler::CC, L_loadkeys_44); 2907 __ br(Assembler::EQ, L_loadkeys_52); 2908 2909 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2910 __ rev32(v17, __ T16B, v17); 2911 __ rev32(v18, __ T16B, v18); 2912 __ BIND(L_loadkeys_52); 2913 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2914 __ rev32(v19, __ T16B, v19); 2915 __ rev32(v20, __ T16B, v20); 2916 __ BIND(L_loadkeys_44); 2917 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2918 __ rev32(v21, __ T16B, v21); 2919 
__ rev32(v22, __ T16B, v22); 2920 __ rev32(v23, __ T16B, v23); 2921 __ rev32(v24, __ T16B, v24); 2922 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2923 __ rev32(v25, __ T16B, v25); 2924 __ rev32(v26, __ T16B, v26); 2925 __ rev32(v27, __ T16B, v27); 2926 __ rev32(v28, __ T16B, v28); 2927 __ ld1(v29, v30, __ T16B, key); 2928 __ rev32(v29, __ T16B, v29); 2929 __ rev32(v30, __ T16B, v30); 2930 2931 __ BIND(L_aes_loop); 2932 __ ld1(v0, __ T16B, __ post(from, 16)); 2933 __ orr(v1, __ T16B, v0, v0); 2934 2935 __ br(Assembler::CC, L_rounds_44); 2936 __ br(Assembler::EQ, L_rounds_52); 2937 2938 __ aesd(v0, v17); __ aesimc(v0, v0); 2939 __ aesd(v0, v18); __ aesimc(v0, v0); 2940 __ BIND(L_rounds_52); 2941 __ aesd(v0, v19); __ aesimc(v0, v0); 2942 __ aesd(v0, v20); __ aesimc(v0, v0); 2943 __ BIND(L_rounds_44); 2944 __ aesd(v0, v21); __ aesimc(v0, v0); 2945 __ aesd(v0, v22); __ aesimc(v0, v0); 2946 __ aesd(v0, v23); __ aesimc(v0, v0); 2947 __ aesd(v0, v24); __ aesimc(v0, v0); 2948 __ aesd(v0, v25); __ aesimc(v0, v0); 2949 __ aesd(v0, v26); __ aesimc(v0, v0); 2950 __ aesd(v0, v27); __ aesimc(v0, v0); 2951 __ aesd(v0, v28); __ aesimc(v0, v0); 2952 __ aesd(v0, v29); __ aesimc(v0, v0); 2953 __ aesd(v0, v30); 2954 __ eor(v0, __ T16B, v0, v31); 2955 __ eor(v0, __ T16B, v0, v2); 2956 2957 __ st1(v0, __ T16B, __ post(to, 16)); 2958 __ orr(v2, __ T16B, v1, v1); 2959 2960 __ subw(len_reg, len_reg, 16); 2961 __ cbnzw(len_reg, L_aes_loop); 2962 2963 __ st1(v2, __ T16B, rvec); 2964 2965 __ mov(r0, rscratch2); 2966 2967 __ leave(); 2968 __ ret(lr); 2969 2970 return start; 2971 } 2972 2973 // Arguments: 2974 // 2975 // Inputs: 2976 // c_rarg0 - byte[] source+offset 2977 // c_rarg1 - int[] SHA.state 2978 // c_rarg2 - int offset 2979 // c_rarg3 - int limit 2980 // 2981 address generate_sha1_implCompress(bool multi_block, const char *name) { 2982 __ align(CodeEntryAlignment); 2983 StubCodeMark mark(this, "StubRoutines", name); 2984 address start = __ pc(); 2985 2986 Register buf = c_rarg0; 2987 Register state = c_rarg1; 2988 Register ofs = c_rarg2; 2989 Register limit = c_rarg3; 2990 2991 Label keys; 2992 Label sha1_loop; 2993 2994 // load the keys into v0..v3 2995 __ adr(rscratch1, keys); 2996 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2997 // load 5 words state into v6, v7 2998 __ ldrq(v6, Address(state, 0)); 2999 __ ldrs(v7, Address(state, 16)); 3000 3001 3002 __ BIND(sha1_loop); 3003 // load 64 bytes of data into v16..v19 3004 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3005 __ rev32(v16, __ T16B, v16); 3006 __ rev32(v17, __ T16B, v17); 3007 __ rev32(v18, __ T16B, v18); 3008 __ rev32(v19, __ T16B, v19); 3009 3010 // do the sha1 3011 __ addv(v4, __ T4S, v16, v0); 3012 __ orr(v20, __ T16B, v6, v6); 3013 3014 FloatRegister d0 = v16; 3015 FloatRegister d1 = v17; 3016 FloatRegister d2 = v18; 3017 FloatRegister d3 = v19; 3018 3019 for (int round = 0; round < 20; round++) { 3020 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3021 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3022 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3023 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3024 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3025 3026 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3027 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3028 __ sha1h(tmp2, __ T4S, v20); 3029 if (round < 5) 3030 __ sha1c(v20, __ T4S, tmp3, tmp4); 3031 else if (round < 10 || round >= 15) 3032 __ sha1p(v20, __ T4S, tmp3, tmp4); 3033 else 3034 __ sha1m(v20, __ T4S, tmp3, tmp4); 3035 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3036 3037 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3038 } 3039 3040 __ addv(v7, __ T2S, v7, v21); 3041 __ addv(v6, __ T4S, v6, v20); 3042 3043 if (multi_block) { 3044 __ add(ofs, ofs, 64); 3045 __ cmp(ofs, limit); 3046 __ br(Assembler::LE, sha1_loop); 3047 __ mov(c_rarg0, ofs); // return ofs 3048 } 3049 3050 __ strq(v6, Address(state, 0)); 3051 __ strs(v7, Address(state, 16)); 3052 3053 __ ret(lr); 3054 3055 __ bind(keys); 3056 __ emit_int32(0x5a827999); 3057 __ emit_int32(0x6ed9eba1); 3058 __ emit_int32(0x8f1bbcdc); 3059 __ emit_int32(0xca62c1d6); 3060 3061 return start; 3062 } 3063 3064 3065 // Arguments: 3066 // 3067 // Inputs: 3068 // c_rarg0 - byte[] source+offset 3069 // c_rarg1 - int[] SHA.state 3070 // c_rarg2 - int offset 3071 // c_rarg3 - int limit 3072 // 3073 address generate_sha256_implCompress(bool multi_block, const char *name) { 3074 static const uint32_t round_consts[64] = { 3075 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3076 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3077 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3078 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3079 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3080 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3081 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3082 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3083 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3084 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3085 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3086 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3087 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3088 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3089 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3090 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3091 }; 3092 __ align(CodeEntryAlignment); 3093 StubCodeMark mark(this, "StubRoutines", name); 3094 address start = __ pc(); 3095 3096 Register buf = c_rarg0; 3097 Register state = c_rarg1; 3098 Register ofs = c_rarg2; 3099 Register limit = c_rarg3; 3100 3101 Label sha1_loop; 3102 3103 __ stpd(v8, v9, __ pre(sp, -32)); 3104 __ stpd(v10, v11, Address(sp, 16)); 3105 3106 // dga == v0 3107 // dgb == v1 3108 // dg0 == v2 3109 // dg1 == v3 3110 // dg2 == v4 3111 // t0 == v6 3112 // t1 == v7 3113 3114 // load 16 keys to v16..v31 3115 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3116 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3117 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3118 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3119 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3120 3121 // load 8 words (256 bits) state 3122 __ ldpq(v0, v1, state); 3123 3124 __ BIND(sha1_loop); 3125 // load 64 bytes of data into v8..v11 3126 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3127 __ rev32(v8, __ T16B, v8); 3128 __ rev32(v9, __ T16B, v9); 3129 __ rev32(v10, __ T16B, v10); 3130 __ rev32(v11, __ T16B, v11); 3131 3132 __ addv(v6, __ T4S, v8, v16); 3133 __ orr(v2, __ T16B, v0, v0); 3134 __ orr(v3, __ T16B, v1, v1); 3135 3136 FloatRegister d0 = v8; 3137 FloatRegister d1 = v9; 3138 FloatRegister d2 = v10; 3139 FloatRegister d3 = v11; 3140 3141 3142 for (int round = 0; round < 16; round++) { 3143 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3144 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3145 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3146 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3147 3148 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3149 __ orr(v4, __ T16B, v2, v2); 3150 if (round < 15) 3151 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3152 __ sha256h(v2, __ T4S, v3, tmp2); 3153 __ sha256h2(v3, __ T4S, v4, tmp2); 3154 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3155 3156 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3157 } 3158 3159 __ addv(v0, __ T4S, v0, v2); 3160 __ addv(v1, __ T4S, v1, v3); 3161 3162 if (multi_block) { 3163 __ add(ofs, ofs, 64); 3164 __ cmp(ofs, limit); 3165 __ br(Assembler::LE, sha1_loop); 3166 __ mov(c_rarg0, ofs); // return ofs 3167 } 3168 3169 __ ldpd(v10, v11, Address(sp, 16)); 3170 __ ldpd(v8, v9, __ post(sp, 32)); 3171 3172 __ stpq(v0, v1, state); 3173 3174 __ ret(lr); 3175 3176 return start; 3177 } 3178 3179 #ifndef BUILTIN_SIM 3180 // Safefetch stubs. 3181 void generate_safefetch(const char* name, int size, address* entry, 3182 address* fault_pc, address* continuation_pc) { 3183 // safefetch signatures: 3184 // int SafeFetch32(int* adr, int errValue); 3185 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3186 // 3187 // arguments: 3188 // c_rarg0 = adr 3189 // c_rarg1 = errValue 3190 // 3191 // result: 3192 // PPC_RET = *adr or errValue 3193 3194 StubCodeMark mark(this, "StubRoutines", name); 3195 3196 // Entry point, pc or function descriptor. 3197 *entry = __ pc(); 3198 3199 // Load *adr into c_rarg1, may fault. 
3200 *fault_pc = __ pc(); 3201 switch (size) { 3202 case 4: 3203 // int32_t 3204 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3205 break; 3206 case 8: 3207 // int64_t 3208 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3209 break; 3210 default: 3211 ShouldNotReachHere(); 3212 } 3213 3214 // return errValue or *adr 3215 *continuation_pc = __ pc(); 3216 __ mov(r0, c_rarg1); 3217 __ ret(lr); 3218 } 3219 #endif 3220 3221 /** 3222 * Arguments: 3223 * 3224 * Inputs: 3225 * c_rarg0 - int crc 3226 * c_rarg1 - byte* buf 3227 * c_rarg2 - int length 3228 * 3229 * Output: 3230 * r0 - int crc result 3231 */ 3232 address generate_updateBytesCRC32() { 3233 assert(UseCRC32Intrinsics, "what are we doing here?"); 3234 3235 __ align(CodeEntryAlignment); 3236 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3237 3238 address start = __ pc(); 3239 3240 const Register crc = c_rarg0; // crc 3241 const Register buf = c_rarg1; // source java byte array address 3242 const Register len = c_rarg2; // length 3243 const Register table0 = c_rarg3; // crc_table address 3244 const Register table1 = c_rarg4; 3245 const Register table2 = c_rarg5; 3246 const Register table3 = c_rarg6; 3247 const Register tmp3 = c_rarg7; 3248 3249 BLOCK_COMMENT("Entry:"); 3250 __ enter(); // required for proper stackwalking of RuntimeStub frame 3251 3252 __ kernel_crc32(crc, buf, len, 3253 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3254 3255 __ leave(); // required for proper stackwalking of RuntimeStub frame 3256 __ ret(lr); 3257 3258 return start; 3259 } 3260 3261 /** 3262 * Arguments: 3263 * 3264 * Inputs: 3265 * c_rarg0 - int crc 3266 * c_rarg1 - byte* buf 3267 * c_rarg2 - int length 3268 * c_rarg3 - int* table 3269 * 3270 * Output: 3271 * r0 - int crc result 3272 */ 3273 address generate_updateBytesCRC32C() { 3274 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3275 3276 __ align(CodeEntryAlignment); 3277 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3278 3279 address start = __ pc(); 3280 3281 const Register crc = c_rarg0; // crc 3282 const Register buf = c_rarg1; // source java byte array address 3283 const Register len = c_rarg2; // length 3284 const Register table0 = c_rarg3; // crc_table address 3285 const Register table1 = c_rarg4; 3286 const Register table2 = c_rarg5; 3287 const Register table3 = c_rarg6; 3288 const Register tmp3 = c_rarg7; 3289 3290 BLOCK_COMMENT("Entry:"); 3291 __ enter(); // required for proper stackwalking of RuntimeStub frame 3292 3293 __ kernel_crc32c(crc, buf, len, 3294 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3295 3296 __ leave(); // required for proper stackwalking of RuntimeStub frame 3297 __ ret(lr); 3298 3299 return start; 3300 } 3301 3302 /*** 3303 * Arguments: 3304 * 3305 * Inputs: 3306 * c_rarg0 - int adler 3307 * c_rarg1 - byte* buff 3308 * c_rarg2 - int len 3309 * 3310 * Output: 3311 * c_rarg0 - int adler result 3312 */ 3313 address generate_updateBytesAdler32() { 3314 __ align(CodeEntryAlignment); 3315 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3316 address start = __ pc(); 3317 3318 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3319 3320 // Aliases 3321 Register adler = c_rarg0; 3322 Register s1 = c_rarg0; 3323 Register s2 = c_rarg3; 3324 Register buff = c_rarg1; 3325 Register len = c_rarg2; 3326 Register nmax = r4; 3327 Register base = r5; 3328 Register count = r6; 3329 Register temp0 = rscratch1; 3330 Register temp1 = rscratch2; 3331 Register temp2 = r7;
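    // Illustrative sketch (not generated code) of the checksum this
    // stub computes.  Adler-32 keeps two running sums modulo 65521:
    //
    //   for (i = 0; i < len; i++) {
    //     s1 = (s1 + buff[i]) % 65521;
    //     s2 = (s2 + s1)      % 65521;
    //   }
    //   return (s2 << 16) | s1;
    //
    // The code below defers the expensive modulo: sums are accumulated
    // for up to NMAX bytes (the largest run that cannot overflow a
    // 32-bit accumulator, per the bound quoted below) and are then
    // reduced using 2^16 mod 65521 == 15, i.e. x mod 65521 can be
    // folded step by step as (x & 0xffff) + 15 * (x >> 16), finishing
    // with one conditional subtract of 65521.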
3332 3333 // Max number of bytes we can process before having to take the mod 3334 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3335 unsigned long BASE = 0xfff1; 3336 unsigned long NMAX = 0x15B0; 3337 3338 __ mov(base, BASE); 3339 __ mov(nmax, NMAX); 3340 3341 // s1 is initialized to the lower 16 bits of adler 3342 // s2 is initialized to the upper 16 bits of adler 3343 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3344 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3345 3346 // The pipelined loop needs at least 16 elements for 1 iteration 3347 // It does check this, but it is more effective to skip to the cleanup loop 3348 __ cmp(len, 16); 3349 __ br(Assembler::HS, L_nmax); 3350 __ cbz(len, L_combine); 3351 3352 __ bind(L_simple_by1_loop); 3353 __ ldrb(temp0, Address(__ post(buff, 1))); 3354 __ add(s1, s1, temp0); 3355 __ add(s2, s2, s1); 3356 __ subs(len, len, 1); 3357 __ br(Assembler::HI, L_simple_by1_loop); 3358 3359 // s1 = s1 % BASE 3360 __ subs(temp0, s1, base); 3361 __ csel(s1, temp0, s1, Assembler::HS); 3362 3363 // s2 = s2 % BASE 3364 __ lsr(temp0, s2, 16); 3365 __ lsl(temp1, temp0, 4); 3366 __ sub(temp1, temp1, temp0); 3367 __ add(s2, temp1, s2, ext::uxth); 3368 3369 __ subs(temp0, s2, base); 3370 __ csel(s2, temp0, s2, Assembler::HS); 3371 3372 __ b(L_combine); 3373 3374 __ bind(L_nmax); 3375 __ subs(len, len, nmax); 3376 __ sub(count, nmax, 16); 3377 __ br(Assembler::LO, L_by16); 3378 3379 __ bind(L_nmax_loop); 3380 3381 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3382 3383 __ add(s1, s1, temp0, ext::uxtb); 3384 __ ubfx(temp2, temp0, 8, 8); 3385 __ add(s2, s2, s1); 3386 __ add(s1, s1, temp2); 3387 __ ubfx(temp2, temp0, 16, 8); 3388 __ add(s2, s2, s1); 3389 __ add(s1, s1, temp2); 3390 __ ubfx(temp2, temp0, 24, 8); 3391 __ add(s2, s2, s1); 3392 __ add(s1, s1, temp2); 3393 __ ubfx(temp2, temp0, 32, 8); 3394 __ add(s2, s2, s1); 3395 __ add(s1, s1, temp2); 3396 __ ubfx(temp2, temp0, 40, 8); 3397 __ add(s2, s2, s1); 3398 __ add(s1, s1, temp2); 3399 __ ubfx(temp2, temp0, 48, 8); 3400 __ add(s2, s2, s1); 3401 __ add(s1, s1, temp2); 3402 __ add(s2, s2, s1); 3403 __ add(s1, s1, temp0, Assembler::LSR, 56); 3404 __ add(s2, s2, s1); 3405 3406 __ add(s1, s1, temp1, ext::uxtb); 3407 __ ubfx(temp2, temp1, 8, 8); 3408 __ add(s2, s2, s1); 3409 __ add(s1, s1, temp2); 3410 __ ubfx(temp2, temp1, 16, 8); 3411 __ add(s2, s2, s1); 3412 __ add(s1, s1, temp2); 3413 __ ubfx(temp2, temp1, 24, 8); 3414 __ add(s2, s2, s1); 3415 __ add(s1, s1, temp2); 3416 __ ubfx(temp2, temp1, 32, 8); 3417 __ add(s2, s2, s1); 3418 __ add(s1, s1, temp2); 3419 __ ubfx(temp2, temp1, 40, 8); 3420 __ add(s2, s2, s1); 3421 __ add(s1, s1, temp2); 3422 __ ubfx(temp2, temp1, 48, 8); 3423 __ add(s2, s2, s1); 3424 __ add(s1, s1, temp2); 3425 __ add(s2, s2, s1); 3426 __ add(s1, s1, temp1, Assembler::LSR, 56); 3427 __ add(s2, s2, s1); 3428 3429 __ subs(count, count, 16); 3430 __ br(Assembler::HS, L_nmax_loop); 3431 3432 // s1 = s1 % BASE 3433 __ lsr(temp0, s1, 16); 3434 __ lsl(temp1, temp0, 4); 3435 __ sub(temp1, temp1, temp0); 3436 __ add(temp1, temp1, s1, ext::uxth); 3437 3438 __ lsr(temp0, temp1, 16); 3439 __ lsl(s1, temp0, 4); 3440 __ sub(s1, s1, temp0); 3441 __ add(s1, s1, temp1, ext:: uxth); 3442 3443 __ subs(temp0, s1, base); 3444 __ csel(s1, temp0, s1, Assembler::HS); 3445 3446 // s2 = s2 % BASE 3447 __ lsr(temp0, s2, 16); 3448 __ lsl(temp1, temp0, 4); 3449 __ sub(temp1, temp1, temp0); 3450 __ add(temp1, temp1, s2, ext::uxth); 3451 3452 __ lsr(temp0, temp1, 16); 
3453 __ lsl(s2, temp0, 4); 3454 __ sub(s2, s2, temp0); 3455 __ add(s2, s2, temp1, ext:: uxth); 3456 3457 __ subs(temp0, s2, base); 3458 __ csel(s2, temp0, s2, Assembler::HS); 3459 3460 __ subs(len, len, nmax); 3461 __ sub(count, nmax, 16); 3462 __ br(Assembler::HS, L_nmax_loop); 3463 3464 __ bind(L_by16); 3465 __ adds(len, len, count); 3466 __ br(Assembler::LO, L_by1); 3467 3468 __ bind(L_by16_loop); 3469 3470 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3471 3472 __ add(s1, s1, temp0, ext::uxtb); 3473 __ ubfx(temp2, temp0, 8, 8); 3474 __ add(s2, s2, s1); 3475 __ add(s1, s1, temp2); 3476 __ ubfx(temp2, temp0, 16, 8); 3477 __ add(s2, s2, s1); 3478 __ add(s1, s1, temp2); 3479 __ ubfx(temp2, temp0, 24, 8); 3480 __ add(s2, s2, s1); 3481 __ add(s1, s1, temp2); 3482 __ ubfx(temp2, temp0, 32, 8); 3483 __ add(s2, s2, s1); 3484 __ add(s1, s1, temp2); 3485 __ ubfx(temp2, temp0, 40, 8); 3486 __ add(s2, s2, s1); 3487 __ add(s1, s1, temp2); 3488 __ ubfx(temp2, temp0, 48, 8); 3489 __ add(s2, s2, s1); 3490 __ add(s1, s1, temp2); 3491 __ add(s2, s2, s1); 3492 __ add(s1, s1, temp0, Assembler::LSR, 56); 3493 __ add(s2, s2, s1); 3494 3495 __ add(s1, s1, temp1, ext::uxtb); 3496 __ ubfx(temp2, temp1, 8, 8); 3497 __ add(s2, s2, s1); 3498 __ add(s1, s1, temp2); 3499 __ ubfx(temp2, temp1, 16, 8); 3500 __ add(s2, s2, s1); 3501 __ add(s1, s1, temp2); 3502 __ ubfx(temp2, temp1, 24, 8); 3503 __ add(s2, s2, s1); 3504 __ add(s1, s1, temp2); 3505 __ ubfx(temp2, temp1, 32, 8); 3506 __ add(s2, s2, s1); 3507 __ add(s1, s1, temp2); 3508 __ ubfx(temp2, temp1, 40, 8); 3509 __ add(s2, s2, s1); 3510 __ add(s1, s1, temp2); 3511 __ ubfx(temp2, temp1, 48, 8); 3512 __ add(s2, s2, s1); 3513 __ add(s1, s1, temp2); 3514 __ add(s2, s2, s1); 3515 __ add(s1, s1, temp1, Assembler::LSR, 56); 3516 __ add(s2, s2, s1); 3517 3518 __ subs(len, len, 16); 3519 __ br(Assembler::HS, L_by16_loop); 3520 3521 __ bind(L_by1); 3522 __ adds(len, len, 15); 3523 __ br(Assembler::LO, L_do_mod); 3524 3525 __ bind(L_by1_loop); 3526 __ ldrb(temp0, Address(__ post(buff, 1))); 3527 __ add(s1, temp0, s1); 3528 __ add(s2, s2, s1); 3529 __ subs(len, len, 1); 3530 __ br(Assembler::HS, L_by1_loop); 3531 3532 __ bind(L_do_mod); 3533 // s1 = s1 % BASE 3534 __ lsr(temp0, s1, 16); 3535 __ lsl(temp1, temp0, 4); 3536 __ sub(temp1, temp1, temp0); 3537 __ add(temp1, temp1, s1, ext::uxth); 3538 3539 __ lsr(temp0, temp1, 16); 3540 __ lsl(s1, temp0, 4); 3541 __ sub(s1, s1, temp0); 3542 __ add(s1, s1, temp1, ext:: uxth); 3543 3544 __ subs(temp0, s1, base); 3545 __ csel(s1, temp0, s1, Assembler::HS); 3546 3547 // s2 = s2 % BASE 3548 __ lsr(temp0, s2, 16); 3549 __ lsl(temp1, temp0, 4); 3550 __ sub(temp1, temp1, temp0); 3551 __ add(temp1, temp1, s2, ext::uxth); 3552 3553 __ lsr(temp0, temp1, 16); 3554 __ lsl(s2, temp0, 4); 3555 __ sub(s2, s2, temp0); 3556 __ add(s2, s2, temp1, ext:: uxth); 3557 3558 __ subs(temp0, s2, base); 3559 __ csel(s2, temp0, s2, Assembler::HS); 3560 3561 // Combine lower bits and higher bits 3562 __ bind(L_combine); 3563 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3564 3565 __ ret(lr); 3566 3567 return start; 3568 } 3569 3570 /** 3571 * Arguments: 3572 * 3573 * Input: 3574 * c_rarg0 - x address 3575 * c_rarg1 - x length 3576 * c_rarg2 - y address 3577 * c_rarg3 - y lenth 3578 * c_rarg4 - z address 3579 * c_rarg5 - z length 3580 */ 3581 address generate_multiplyToLen() { 3582 __ align(CodeEntryAlignment); 3583 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3584 3585 address start = __ pc(); 3586 const Register x = r0; 
3587 const Register xlen = r1; 3588 const Register y = r2; 3589 const Register ylen = r3; 3590 const Register z = r4; 3591 const Register zlen = r5; 3592 3593 const Register tmp1 = r10; 3594 const Register tmp2 = r11; 3595 const Register tmp3 = r12; 3596 const Register tmp4 = r13; 3597 const Register tmp5 = r14; 3598 const Register tmp6 = r15; 3599 const Register tmp7 = r16; 3600 3601 BLOCK_COMMENT("Entry:"); 3602 __ enter(); // required for proper stackwalking of RuntimeStub frame 3603 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3604 __ leave(); // required for proper stackwalking of RuntimeStub frame 3605 __ ret(lr); 3606 3607 return start; 3608 } 3609 3610 address generate_squareToLen() { 3611 // squareToLen algorithm for sizes 1..127 described in java code works 3612 // faster than multiply_to_len on some CPUs and slower on others, but 3613 // multiply_to_len shows a bit better overall results 3614 __ align(CodeEntryAlignment); 3615 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3616 address start = __ pc(); 3617 3618 const Register x = r0; 3619 const Register xlen = r1; 3620 const Register z = r2; 3621 const Register zlen = r3; 3622 const Register y = r4; // == x 3623 const Register ylen = r5; // == xlen 3624 3625 const Register tmp1 = r10; 3626 const Register tmp2 = r11; 3627 const Register tmp3 = r12; 3628 const Register tmp4 = r13; 3629 const Register tmp5 = r14; 3630 const Register tmp6 = r15; 3631 const Register tmp7 = r16; 3632 3633 RegSet spilled_regs = RegSet::of(y, ylen); 3634 BLOCK_COMMENT("Entry:"); 3635 __ enter(); 3636 __ push(spilled_regs, sp); 3637 __ mov(y, x); 3638 __ mov(ylen, xlen); 3639 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3640 __ pop(spilled_regs, sp); 3641 __ leave(); 3642 __ ret(lr); 3643 return start; 3644 } 3645 3646 address generate_mulAdd() { 3647 __ align(CodeEntryAlignment); 3648 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3649 3650 address start = __ pc(); 3651 3652 const Register out = r0; 3653 const Register in = r1; 3654 const Register offset = r2; 3655 const Register len = r3; 3656 const Register k = r4; 3657 3658 BLOCK_COMMENT("Entry:"); 3659 __ enter(); 3660 __ mul_add(out, in, offset, len, k); 3661 __ leave(); 3662 __ ret(lr); 3663 3664 return start; 3665 } 3666 3667 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3668 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3669 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3670 // Karatsuba multiplication performs a 128*128 -> 256-bit 3671 // multiplication in three 128-bit multiplications and a few 3672 // additions. 
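  // (As a sketch of the identity being used, over GF(2)[x] with
  //  A = A1*x^64 + A0 and B = B1*x^64 + B0:
  //    A*B = A1*B1*x^128 + ((A1+A0)*(B1+B0) + A1*B1 + A0*B0)*x^64 + A0*B0
  //  Only three carry-less multiplications are needed, and every "+" above is
  //  an XOR because the coefficients live in GF(2).)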
3673 // 3674 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3675 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3676 // 3677 // Inputs: 3678 // 3679 // A0 in a.d[0] (subkey) 3680 // A1 in a.d[1] 3681 // (A1+A0) in a1_xor_a0.d[0] 3682 // 3683 // B0 in b.d[0] (state) 3684 // B1 in b.d[1] 3685 3686 __ ext(tmp1, __ T16B, b, b, 0x08); 3687 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3688 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3689 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3690 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3691 3692 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3693 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3694 __ eor(tmp2, __ T16B, tmp2, tmp4); 3695 __ eor(tmp2, __ T16B, tmp2, tmp3); 3696 3697 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3698 __ ins(result_hi, __ D, tmp2, 0, 1); 3699 __ ins(result_lo, __ D, tmp2, 1, 0); 3700 } 3701 3702 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3703 FloatRegister p, FloatRegister z, FloatRegister t1) { 3704 const FloatRegister t0 = result; 3705 3706 // The GCM field polynomial f is z^128 + p(z), where p = 3707 // z^7+z^2+z+1. 3708 // 3709 // z^128 === -p(z) (mod (z^128 + p(z))) 3710 // 3711 // so, given that the product we're reducing is 3712 // a == lo + hi * z^128 3713 // substituting, 3714 // === lo - hi * p(z) (mod (z^128 + p(z))) 3715 // 3716 // we reduce by multiplying hi by p(z) and subtracting the result 3717 // from (i.e. XORing it with) lo. Because p has no nonzero high 3718 // bits we can do this with two 64-bit multiplications, lo*p and 3719 // hi*p. 3720 3721 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3722 __ ext(t1, __ T16B, t0, z, 8); 3723 __ eor(hi, __ T16B, hi, t1); 3724 __ ext(t1, __ T16B, z, t0, 8); 3725 __ eor(lo, __ T16B, lo, t1); 3726 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3727 __ eor(result, __ T16B, lo, t0); 3728 } 3729 3730 address generate_has_negatives(address &has_negatives_long) { 3731 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3732 const int large_loop_size = 64; 3733 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3734 int dcache_line = VM_Version::dcache_line_size(); 3735 3736 Register ary1 = r1, len = r2, result = r0; 3737 3738 __ align(CodeEntryAlignment); 3739 address entry = __ pc(); 3740 3741 __ enter(); 3742 3743 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3744 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3745 3746 __ cmp(len, 15); 3747 __ br(Assembler::GT, LEN_OVER_15); 3748 // The only case when execution falls into this code is when pointer is near 3749 // the end of memory page and we have to avoid reading next page 3750 __ add(ary1, ary1, len); 3751 __ subs(len, len, 8); 3752 __ br(Assembler::GT, LEN_OVER_8); 3753 __ ldr(rscratch2, Address(ary1, -8)); 3754 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
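    // len now holds (original length - 8) <= 0, so rscratch1 == (8 - length) * 8.
    // The 8-byte load above ends exactly at the end of the array; on a
    // little-endian load the (8 - length) bytes that precede the array occupy
    // the low-order bits, and the shift below discards them so that only
    // genuine array bytes are tested against UPPER_BIT_MASK.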
3755 __ lsrv(rscratch2, rscratch2, rscratch1); 3756 __ tst(rscratch2, UPPER_BIT_MASK); 3757 __ cset(result, Assembler::NE); 3758 __ leave(); 3759 __ ret(lr); 3760 __ bind(LEN_OVER_8); 3761 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3762 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3763 __ tst(rscratch2, UPPER_BIT_MASK); 3764 __ br(Assembler::NE, RET_TRUE_NO_POP); 3765 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3766 __ lsrv(rscratch1, rscratch1, rscratch2); 3767 __ tst(rscratch1, UPPER_BIT_MASK); 3768 __ cset(result, Assembler::NE); 3769 __ leave(); 3770 __ ret(lr); 3771 3772 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3773 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3774 3775 has_negatives_long = __ pc(); // 2nd entry point 3776 3777 __ enter(); 3778 3779 __ bind(LEN_OVER_15); 3780 __ push(spilled_regs, sp); 3781 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3782 __ cbz(rscratch2, ALIGNED); 3783 __ ldp(tmp6, tmp1, Address(ary1)); 3784 __ mov(tmp5, 16); 3785 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3786 __ add(ary1, ary1, rscratch1); 3787 __ sub(len, len, rscratch1); 3788 __ orr(tmp6, tmp6, tmp1); 3789 __ tst(tmp6, UPPER_BIT_MASK); 3790 __ br(Assembler::NE, RET_TRUE); 3791 3792 __ bind(ALIGNED); 3793 __ cmp(len, large_loop_size); 3794 __ br(Assembler::LT, CHECK_16); 3795 // Perform 16-byte load as early return in pre-loop to handle situation 3796 // when initially aligned large array has negative values at starting bytes, 3797 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3798 // slower. Cases with negative bytes further ahead won't be affected that 3799 // much. In fact, it'll be faster due to early loads, less instructions and 3800 // less branches in LARGE_LOOP. 3801 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3802 __ sub(len, len, 16); 3803 __ orr(tmp6, tmp6, tmp1); 3804 __ tst(tmp6, UPPER_BIT_MASK); 3805 __ br(Assembler::NE, RET_TRUE); 3806 __ cmp(len, large_loop_size); 3807 __ br(Assembler::LT, CHECK_16); 3808 3809 if (SoftwarePrefetchHintDistance >= 0 3810 && SoftwarePrefetchHintDistance >= dcache_line) { 3811 // initial prefetch 3812 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3813 } 3814 __ bind(LARGE_LOOP); 3815 if (SoftwarePrefetchHintDistance >= 0) { 3816 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3817 } 3818 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3819 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3820 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3821 // instructions per cycle and have less branches, but this approach disables 3822 // early return, thus, all 64 bytes are loaded and checked every time. 
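    // In C, one LARGE_LOOP iteration is approximately (a sketch, not the exact
    // instruction scheduling used below):
    //
    //   uint64_t acc = 0;
    //   for (int i = 0; i < 8; i++)            // 64 bytes == 8 longwords
    //     acc |= ((const uint64_t *)ary1)[i];
    //   ary1 += 64;
    //   len  -= 64;
    //   if (acc & UPPER_BIT_MASK)              // some byte has its sign bit set
    //     return true;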
3823 __ ldp(tmp2, tmp3, Address(ary1)); 3824 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3825 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3826 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3827 __ add(ary1, ary1, large_loop_size); 3828 __ sub(len, len, large_loop_size); 3829 __ orr(tmp2, tmp2, tmp3); 3830 __ orr(tmp4, tmp4, tmp5); 3831 __ orr(rscratch1, rscratch1, rscratch2); 3832 __ orr(tmp6, tmp6, tmp1); 3833 __ orr(tmp2, tmp2, tmp4); 3834 __ orr(rscratch1, rscratch1, tmp6); 3835 __ orr(tmp2, tmp2, rscratch1); 3836 __ tst(tmp2, UPPER_BIT_MASK); 3837 __ br(Assembler::NE, RET_TRUE); 3838 __ cmp(len, large_loop_size); 3839 __ br(Assembler::GE, LARGE_LOOP); 3840 3841 __ bind(CHECK_16); // small 16-byte load pre-loop 3842 __ cmp(len, 16); 3843 __ br(Assembler::LT, POST_LOOP16); 3844 3845 __ bind(LOOP16); // small 16-byte load loop 3846 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3847 __ sub(len, len, 16); 3848 __ orr(tmp2, tmp2, tmp3); 3849 __ tst(tmp2, UPPER_BIT_MASK); 3850 __ br(Assembler::NE, RET_TRUE); 3851 __ cmp(len, 16); 3852 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3853 3854 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3855 __ cmp(len, 8); 3856 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3857 __ ldr(tmp3, Address(__ post(ary1, 8))); 3858 __ sub(len, len, 8); 3859 __ tst(tmp3, UPPER_BIT_MASK); 3860 __ br(Assembler::NE, RET_TRUE); 3861 3862 __ bind(POST_LOOP16_LOAD_TAIL); 3863 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3864 __ ldr(tmp1, Address(ary1)); 3865 __ mov(tmp2, 64); 3866 __ sub(tmp4, tmp2, len, __ LSL, 3); 3867 __ lslv(tmp1, tmp1, tmp4); 3868 __ tst(tmp1, UPPER_BIT_MASK); 3869 __ br(Assembler::NE, RET_TRUE); 3870 // Fallthrough 3871 3872 __ bind(RET_FALSE); 3873 __ pop(spilled_regs, sp); 3874 __ leave(); 3875 __ mov(result, zr); 3876 __ ret(lr); 3877 3878 __ bind(RET_TRUE); 3879 __ pop(spilled_regs, sp); 3880 __ bind(RET_TRUE_NO_POP); 3881 __ leave(); 3882 __ mov(result, 1); 3883 __ ret(lr); 3884 3885 __ bind(DONE); 3886 __ pop(spilled_regs, sp); 3887 __ leave(); 3888 __ ret(lr); 3889 return entry; 3890 } 3891 3892 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3893 bool usePrefetch, Label &NOT_EQUAL) { 3894 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3895 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3896 tmp7 = r12, tmp8 = r13; 3897 Label LOOP; 3898 3899 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3900 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3901 __ bind(LOOP); 3902 if (usePrefetch) { 3903 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3904 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3905 } 3906 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3907 __ eor(tmp1, tmp1, tmp2); 3908 __ eor(tmp3, tmp3, tmp4); 3909 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3910 __ orr(tmp1, tmp1, tmp3); 3911 __ cbnz(tmp1, NOT_EQUAL); 3912 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3913 __ eor(tmp5, tmp5, tmp6); 3914 __ eor(tmp7, tmp7, tmp8); 3915 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3916 __ orr(tmp5, tmp5, tmp7); 3917 __ cbnz(tmp5, NOT_EQUAL); 3918 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3919 __ eor(tmp1, tmp1, tmp2); 3920 __ eor(tmp3, tmp3, tmp4); 3921 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3922 __ orr(tmp1, tmp1, tmp3); 3923 __ cbnz(tmp1, NOT_EQUAL); 3924 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3925 __ eor(tmp5, tmp5, tmp6); 3926 __ 
sub(cnt1, cnt1, 8 * wordSize); 3927 __ eor(tmp7, tmp7, tmp8); 3928 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3929 __ cmp(cnt1, loopThreshold); 3930 __ orr(tmp5, tmp5, tmp7); 3931 __ cbnz(tmp5, NOT_EQUAL); 3932 __ br(__ GE, LOOP); 3933 // post-loop 3934 __ eor(tmp1, tmp1, tmp2); 3935 __ eor(tmp3, tmp3, tmp4); 3936 __ orr(tmp1, tmp1, tmp3); 3937 __ sub(cnt1, cnt1, 2 * wordSize); 3938 __ cbnz(tmp1, NOT_EQUAL); 3939 } 3940 3941 void generate_large_array_equals_loop_simd(int loopThreshold, 3942 bool usePrefetch, Label &NOT_EQUAL) { 3943 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3944 tmp2 = rscratch2; 3945 Label LOOP; 3946 3947 __ bind(LOOP); 3948 if (usePrefetch) { 3949 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3950 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3951 } 3952 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 3953 __ sub(cnt1, cnt1, 8 * wordSize); 3954 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 3955 __ cmp(cnt1, loopThreshold); 3956 __ eor(v0, __ T16B, v0, v4); 3957 __ eor(v1, __ T16B, v1, v5); 3958 __ eor(v2, __ T16B, v2, v6); 3959 __ eor(v3, __ T16B, v3, v7); 3960 __ orr(v0, __ T16B, v0, v1); 3961 __ orr(v1, __ T16B, v2, v3); 3962 __ orr(v0, __ T16B, v0, v1); 3963 __ umov(tmp1, v0, __ D, 0); 3964 __ umov(tmp2, v0, __ D, 1); 3965 __ orr(tmp1, tmp1, tmp2); 3966 __ cbnz(tmp1, NOT_EQUAL); 3967 __ br(__ GE, LOOP); 3968 } 3969 3970 // a1 = r1 - array1 address 3971 // a2 = r2 - array2 address 3972 // result = r0 - return value. Already contains "false" 3973 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 3974 // r3-r5 are reserved temporary registers 3975 address generate_large_array_equals() { 3976 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 3977 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3978 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3979 tmp7 = r12, tmp8 = r13; 3980 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 3981 SMALL_LOOP, POST_LOOP; 3982 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 3983 // calculate if at least 32 prefetched bytes are used 3984 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 3985 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 3986 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 3987 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 3988 tmp5, tmp6, tmp7, tmp8); 3989 3990 __ align(CodeEntryAlignment); 3991 address entry = __ pc(); 3992 __ enter(); 3993 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 3994 // also advance pointers to use post-increment instead of pre-increment 3995 __ add(a1, a1, wordSize); 3996 __ add(a2, a2, wordSize); 3997 if (AvoidUnalignedAccesses) { 3998 // both implementations (SIMD/nonSIMD) are using relatively large load 3999 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 4000 // on some CPUs in case of address is not at least 16-byte aligned. 4001 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 4002 // load if needed at least for 1st address and make if 16-byte aligned. 
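      // The tbz below tests bit 3 of a1: if it is clear, a1 is already
      // 16-byte aligned (arrays are at least 8-byte aligned); otherwise one
      // extra 8-byte word from each array is compared here so that the main
      // loops start from a 16-byte-aligned a1.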
4003 Label ALIGNED16; 4004 __ tbz(a1, 3, ALIGNED16); 4005 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4006 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4007 __ sub(cnt1, cnt1, wordSize); 4008 __ eor(tmp1, tmp1, tmp2); 4009 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 4010 __ bind(ALIGNED16); 4011 } 4012 if (UseSIMDForArrayEquals) { 4013 if (SoftwarePrefetchHintDistance >= 0) { 4014 __ cmp(cnt1, prefetchLoopThreshold); 4015 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 4016 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 4017 /* prfm = */ true, NOT_EQUAL); 4018 __ cmp(cnt1, nonPrefetchLoopThreshold); 4019 __ br(__ LT, TAIL); 4020 } 4021 __ bind(NO_PREFETCH_LARGE_LOOP); 4022 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 4023 /* prfm = */ false, NOT_EQUAL); 4024 } else { 4025 __ push(spilled_regs, sp); 4026 if (SoftwarePrefetchHintDistance >= 0) { 4027 __ cmp(cnt1, prefetchLoopThreshold); 4028 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 4029 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 4030 /* prfm = */ true, NOT_EQUAL); 4031 __ cmp(cnt1, nonPrefetchLoopThreshold); 4032 __ br(__ LT, TAIL); 4033 } 4034 __ bind(NO_PREFETCH_LARGE_LOOP); 4035 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 4036 /* prfm = */ false, NOT_EQUAL); 4037 } 4038 __ bind(TAIL); 4039 __ cbz(cnt1, EQUAL); 4040 __ subs(cnt1, cnt1, wordSize); 4041 __ br(__ LE, POST_LOOP); 4042 __ bind(SMALL_LOOP); 4043 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4044 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4045 __ subs(cnt1, cnt1, wordSize); 4046 __ eor(tmp1, tmp1, tmp2); 4047 __ cbnz(tmp1, NOT_EQUAL); 4048 __ br(__ GT, SMALL_LOOP); 4049 __ bind(POST_LOOP); 4050 __ ldr(tmp1, Address(a1, cnt1)); 4051 __ ldr(tmp2, Address(a2, cnt1)); 4052 __ eor(tmp1, tmp1, tmp2); 4053 __ cbnz(tmp1, NOT_EQUAL); 4054 __ bind(EQUAL); 4055 __ mov(result, true); 4056 __ bind(NOT_EQUAL); 4057 if (!UseSIMDForArrayEquals) { 4058 __ pop(spilled_regs, sp); 4059 } 4060 __ bind(NOT_EQUAL_NO_POP); 4061 __ leave(); 4062 __ ret(lr); 4063 return entry; 4064 } 4065 4066 4067 /** 4068 * Arguments: 4069 * 4070 * Input: 4071 * c_rarg0 - current state address 4072 * c_rarg1 - H key address 4073 * c_rarg2 - data address 4074 * c_rarg3 - number of blocks 4075 * 4076 * Output: 4077 * Updated state at c_rarg0 4078 */ 4079 address generate_ghash_processBlocks() { 4080 // Bafflingly, GCM uses little-endian for the byte order, but 4081 // big-endian for the bit order. For example, the polynomial 1 is 4082 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4083 // 4084 // So, we must either reverse the bytes in each word and do 4085 // everything big-endian or reverse the bits in each byte and do 4086 // it little-endian. On AArch64 it's more idiomatic to reverse 4087 // the bits in each byte (we have an instruction, RBIT, to do 4088 // that) and keep the data in little-endian bit order throught the 4089 // calculation, bit-reversing the inputs and outputs. 4090 4091 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4092 __ align(wordSize * 2); 4093 address p = __ pc(); 4094 __ emit_int64(0x87); // The low-order bits of the field 4095 // polynomial (i.e. 
p = z^7+z^2+z+1) 4096 // repeated in the low and high parts of a 4097 // 128-bit vector 4098 __ emit_int64(0x87); 4099 4100 __ align(CodeEntryAlignment); 4101 address start = __ pc(); 4102 4103 Register state = c_rarg0; 4104 Register subkeyH = c_rarg1; 4105 Register data = c_rarg2; 4106 Register blocks = c_rarg3; 4107 4108 FloatRegister vzr = v30; 4109 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4110 4111 __ ldrq(v0, Address(state)); 4112 __ ldrq(v1, Address(subkeyH)); 4113 4114 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4115 __ rbit(v0, __ T16B, v0); 4116 __ rev64(v1, __ T16B, v1); 4117 __ rbit(v1, __ T16B, v1); 4118 4119 __ ldrq(v26, p); 4120 4121 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4122 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4123 4124 { 4125 Label L_ghash_loop; 4126 __ bind(L_ghash_loop); 4127 4128 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4129 // reversing each byte 4130 __ rbit(v2, __ T16B, v2); 4131 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4132 4133 // Multiply state in v2 by subkey in v1 4134 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4135 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4136 /*temps*/v6, v20, v18, v21); 4137 // Reduce v7:v5 by the field polynomial 4138 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4139 4140 __ sub(blocks, blocks, 1); 4141 __ cbnz(blocks, L_ghash_loop); 4142 } 4143 4144 // The bit-reversed result is at this point in v0 4145 __ rev64(v1, __ T16B, v0); 4146 __ rbit(v1, __ T16B, v1); 4147 4148 __ st1(v1, __ T16B, state); 4149 __ ret(lr); 4150 4151 return start; 4152 } 4153 4154 // Continuation point for throwing of implicit exceptions that are 4155 // not handled in the current activation. Fabricates an exception 4156 // oop and initiates normal exception dispatching in this 4157 // frame. Since we need to preserve callee-saved values (currently 4158 // only for C2, but done for C1 as well) we need a callee-saved oop 4159 // map and therefore have to make these stubs into RuntimeStubs 4160 // rather than BufferBlobs. If the compiler needs all registers to 4161 // be preserved between the fault point and the exception handler 4162 // then it must assume responsibility for that in 4163 // AbstractCompiler::continuation_for_implicit_null_exception or 4164 // continuation_for_implicit_division_by_zero_exception. All other 4165 // implicit exceptions (e.g., NullPointerException or 4166 // AbstractMethodError on entry) are either at call sites or 4167 // otherwise assume that stack unwinding will be initiated, so 4168 // caller saved registers were assumed volatile in the compiler. 4169 4170 #undef __ 4171 #define __ masm-> 4172 4173 address generate_throw_exception(const char* name, 4174 address runtime_entry, 4175 Register arg1 = noreg, 4176 Register arg2 = noreg) { 4177 // Information about frame layout at time of blocking runtime call. 4178 // Note that we only have to preserve callee-saved registers since 4179 // the compilers are responsible for supplying a continuation point 4180 // if they expect all registers to be preserved. 4181 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4182 enum layout { 4183 rfp_off = 0, 4184 rfp_off2, 4185 return_off, 4186 return_off2, 4187 framesize // inclusive of return address 4188 }; 4189 4190 int insts_size = 512; 4191 int locs_size = 64; 4192 4193 CodeBuffer code(name, insts_size, locs_size); 4194 OopMapSet* oop_maps = new OopMapSet(); 4195 MacroAssembler* masm = new MacroAssembler(&code); 4196 4197 address start = __ pc(); 4198 4199 // This is an inlined and slightly modified version of call_VM 4200 // which has the ability to fetch the return PC out of 4201 // thread-local storage and also sets up last_Java_sp slightly 4202 // differently than the real call_VM 4203 4204 __ enter(); // Save FP and LR before call 4205 4206 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4207 4208 // lr and fp are already in place 4209 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4210 4211 int frame_complete = __ pc() - start; 4212 4213 // Set up last_Java_sp and last_Java_fp 4214 address the_pc = __ pc(); 4215 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 4216 4217 // Call runtime 4218 if (arg1 != noreg) { 4219 assert(arg2 != c_rarg1, "clobbered"); 4220 __ mov(c_rarg1, arg1); 4221 } 4222 if (arg2 != noreg) { 4223 __ mov(c_rarg2, arg2); 4224 } 4225 __ mov(c_rarg0, rthread); 4226 BLOCK_COMMENT("call runtime_entry"); 4227 __ mov(rscratch1, runtime_entry); 4228 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4229 4230 // Generate oop map 4231 OopMap* map = new OopMap(framesize, 0); 4232 4233 oop_maps->add_gc_map(the_pc - start, map); 4234 4235 __ reset_last_Java_frame(true); 4236 __ maybe_isb(); 4237 4238 __ leave(); 4239 4240 // check for pending exceptions 4241 #ifdef ASSERT 4242 Label L; 4243 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4244 __ cbnz(rscratch1, L); 4245 __ should_not_reach_here(); 4246 __ bind(L); 4247 #endif // ASSERT 4248 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4249 4250 4251 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4252 RuntimeStub* stub = 4253 RuntimeStub::new_runtime_stub(name, 4254 &code, 4255 frame_complete, 4256 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4257 oop_maps, false); 4258 return stub->entry_point(); 4259 } 4260 4261 class MontgomeryMultiplyGenerator : public MacroAssembler { 4262 4263 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4264 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4265 4266 RegSet _toSave; 4267 bool _squaring; 4268 4269 public: 4270 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4271 : MacroAssembler(as->code()), _squaring(squaring) { 4272 4273 // Register allocation 4274 4275 Register reg = c_rarg0; 4276 Pa_base = reg; // Argument registers 4277 if (squaring) 4278 Pb_base = Pa_base; 4279 else 4280 Pb_base = ++reg; 4281 Pn_base = ++reg; 4282 Rlen= ++reg; 4283 inv = ++reg; 4284 Pm_base = ++reg; 4285 4286 // Working registers: 4287 Ra = ++reg; // The current digit of a, b, n, and m. 4288 Rb = ++reg; 4289 Rm = ++reg; 4290 Rn = ++reg; 4291 4292 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4293 Pb = ++reg; 4294 Pm = ++reg; 4295 Pn = ++reg; 4296 4297 t0 = ++reg; // Three registers which form a 4298 t1 = ++reg; // triple-precision accumuator. 4299 t2 = ++reg; 4300 4301 Ri = ++reg; // Inner and outer loop indexes. 
4302 Rj = ++reg; 4303 4304 Rhi_ab = ++reg; // Product registers: low and high parts 4305 Rlo_ab = ++reg; // of a*b and m*n. 4306 Rhi_mn = ++reg; 4307 Rlo_mn = ++reg; 4308 4309 // r19 and up are callee-saved. 4310 _toSave = RegSet::range(r19, reg) + Pm_base; 4311 } 4312 4313 private: 4314 void save_regs() { 4315 push(_toSave, sp); 4316 } 4317 4318 void restore_regs() { 4319 pop(_toSave, sp); 4320 } 4321 4322 template <typename T> 4323 void unroll_2(Register count, T block) { 4324 Label loop, end, odd; 4325 tbnz(count, 0, odd); 4326 cbz(count, end); 4327 align(16); 4328 bind(loop); 4329 (this->*block)(); 4330 bind(odd); 4331 (this->*block)(); 4332 subs(count, count, 2); 4333 br(Assembler::GT, loop); 4334 bind(end); 4335 } 4336 4337 template <typename T> 4338 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4339 Label loop, end, odd; 4340 tbnz(count, 0, odd); 4341 cbz(count, end); 4342 align(16); 4343 bind(loop); 4344 (this->*block)(d, s, tmp); 4345 bind(odd); 4346 (this->*block)(d, s, tmp); 4347 subs(count, count, 2); 4348 br(Assembler::GT, loop); 4349 bind(end); 4350 } 4351 4352 void pre1(RegisterOrConstant i) { 4353 block_comment("pre1"); 4354 // Pa = Pa_base; 4355 // Pb = Pb_base + i; 4356 // Pm = Pm_base; 4357 // Pn = Pn_base + i; 4358 // Ra = *Pa; 4359 // Rb = *Pb; 4360 // Rm = *Pm; 4361 // Rn = *Pn; 4362 ldr(Ra, Address(Pa_base)); 4363 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4364 ldr(Rm, Address(Pm_base)); 4365 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4366 lea(Pa, Address(Pa_base)); 4367 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4368 lea(Pm, Address(Pm_base)); 4369 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4370 4371 // Zero the m*n result. 4372 mov(Rhi_mn, zr); 4373 mov(Rlo_mn, zr); 4374 } 4375 4376 // The core multiply-accumulate step of a Montgomery 4377 // multiplication. The idea is to schedule operations as a 4378 // pipeline so that instructions with long latencies (loads and 4379 // multiplies) have time to complete before their results are 4380 // used. This most benefits in-order implementations of the 4381 // architecture but out-of-order ones also benefit. 4382 void step() { 4383 block_comment("step"); 4384 // MACC(Ra, Rb, t0, t1, t2); 4385 // Ra = *++Pa; 4386 // Rb = *--Pb; 4387 umulh(Rhi_ab, Ra, Rb); 4388 mul(Rlo_ab, Ra, Rb); 4389 ldr(Ra, pre(Pa, wordSize)); 4390 ldr(Rb, pre(Pb, -wordSize)); 4391 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4392 // previous iteration. 
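      // (For reference: MACC(A, B, t0, t1, t2) in the commented C code further
      //  below stands for the triple-precision multiply-accumulate
      //      t2:t1:t0 += (unsigned __int128)A * B
      //  which the generated code expresses as a umulh/mul pair producing the
      //  high and low 64-bit halves of the product, followed by acc() to add
      //  them into t0/t1/t2 with carry propagation.)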
4393 // MACC(Rm, Rn, t0, t1, t2); 4394 // Rm = *++Pm; 4395 // Rn = *--Pn; 4396 umulh(Rhi_mn, Rm, Rn); 4397 mul(Rlo_mn, Rm, Rn); 4398 ldr(Rm, pre(Pm, wordSize)); 4399 ldr(Rn, pre(Pn, -wordSize)); 4400 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4401 } 4402 4403 void post1() { 4404 block_comment("post1"); 4405 4406 // MACC(Ra, Rb, t0, t1, t2); 4407 // Ra = *++Pa; 4408 // Rb = *--Pb; 4409 umulh(Rhi_ab, Ra, Rb); 4410 mul(Rlo_ab, Ra, Rb); 4411 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4412 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4413 4414 // *Pm = Rm = t0 * inv; 4415 mul(Rm, t0, inv); 4416 str(Rm, Address(Pm)); 4417 4418 // MACC(Rm, Rn, t0, t1, t2); 4419 // t0 = t1; t1 = t2; t2 = 0; 4420 umulh(Rhi_mn, Rm, Rn); 4421 4422 #ifndef PRODUCT 4423 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4424 { 4425 mul(Rlo_mn, Rm, Rn); 4426 add(Rlo_mn, t0, Rlo_mn); 4427 Label ok; 4428 cbz(Rlo_mn, ok); { 4429 stop("broken Montgomery multiply"); 4430 } bind(ok); 4431 } 4432 #endif 4433 // We have very carefully set things up so that 4434 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4435 // the lower half of Rm * Rn because we know the result already: 4436 // it must be -t0. t0 + (-t0) must generate a carry iff 4437 // t0 != 0. So, rather than do a mul and an adds we just set 4438 // the carry flag iff t0 is nonzero. 4439 // 4440 // mul(Rlo_mn, Rm, Rn); 4441 // adds(zr, t0, Rlo_mn); 4442 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4443 adcs(t0, t1, Rhi_mn); 4444 adc(t1, t2, zr); 4445 mov(t2, zr); 4446 } 4447 4448 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4449 block_comment("pre2"); 4450 // Pa = Pa_base + i-len; 4451 // Pb = Pb_base + len; 4452 // Pm = Pm_base + i-len; 4453 // Pn = Pn_base + len; 4454 4455 if (i.is_register()) { 4456 sub(Rj, i.as_register(), len); 4457 } else { 4458 mov(Rj, i.as_constant()); 4459 sub(Rj, Rj, len); 4460 } 4461 // Rj == i-len 4462 4463 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4464 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4465 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4466 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4467 4468 // Ra = *++Pa; 4469 // Rb = *--Pb; 4470 // Rm = *++Pm; 4471 // Rn = *--Pn; 4472 ldr(Ra, pre(Pa, wordSize)); 4473 ldr(Rb, pre(Pb, -wordSize)); 4474 ldr(Rm, pre(Pm, wordSize)); 4475 ldr(Rn, pre(Pn, -wordSize)); 4476 4477 mov(Rhi_mn, zr); 4478 mov(Rlo_mn, zr); 4479 } 4480 4481 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4482 block_comment("post2"); 4483 if (i.is_constant()) { 4484 mov(Rj, i.as_constant()-len.as_constant()); 4485 } else { 4486 sub(Rj, i.as_register(), len); 4487 } 4488 4489 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4490 4491 // As soon as we know the least significant digit of our result, 4492 // store it. 4493 // Pm_base[i-len] = t0; 4494 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4495 4496 // t0 = t1; t1 = t2; t2 = 0; 4497 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4498 adc(t1, t2, zr); 4499 mov(t2, zr); 4500 } 4501 4502 // A carry in t0 after Montgomery multiplication means that we 4503 // should subtract multiples of n from our result in m. We'll 4504 // keep doing that until there is no carry. 
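    // In C, approximately (a sketch; the code below tracks the borrow in the
    // carry flag with sbcs rather than with explicit comparisons):
    //
    //   while (t0 != 0) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long m = Pm_base[i], n = Pn_base[i];
    //       Pm_base[i] = m - n - borrow;
    //       borrow = (m < n) || (m == n && borrow);   // did the subtract borrow?
    //     }
    //     t0 -= borrow;
    //   }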
4505 void normalize(RegisterOrConstant len) { 4506 block_comment("normalize"); 4507 // while (t0) 4508 // t0 = sub(Pm_base, Pn_base, t0, len); 4509 Label loop, post, again; 4510 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 4511 cbz(t0, post); { 4512 bind(again); { 4513 mov(i, zr); 4514 mov(cnt, len); 4515 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4516 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4517 subs(zr, zr, zr); // set carry flag, i.e. no borrow 4518 align(16); 4519 bind(loop); { 4520 sbcs(Rm, Rm, Rn); 4521 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4522 add(i, i, 1); 4523 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4524 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4525 sub(cnt, cnt, 1); 4526 } cbnz(cnt, loop); 4527 sbc(t0, t0, zr); 4528 } cbnz(t0, again); 4529 } bind(post); 4530 } 4531 4532 // Move memory at s to d, reversing words. 4533 // Increments d to end of copied memory 4534 // Destroys tmp1, tmp2 4535 // Preserves len 4536 // Leaves s pointing to the address which was in d at start 4537 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 4538 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 4539 4540 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 4541 mov(tmp1, len); 4542 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 4543 sub(s, d, len, ext::uxtw, LogBytesPerWord); 4544 } 4545 // where 4546 void reverse1(Register d, Register s, Register tmp) { 4547 ldr(tmp, pre(s, -wordSize)); 4548 ror(tmp, tmp, 32); 4549 str(tmp, post(d, wordSize)); 4550 } 4551 4552 void step_squaring() { 4553 // An extra ACC 4554 step(); 4555 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4556 } 4557 4558 void last_squaring(RegisterOrConstant i) { 4559 Label dont; 4560 // if ((i & 1) == 0) { 4561 tbnz(i.as_register(), 0, dont); { 4562 // MACC(Ra, Rb, t0, t1, t2); 4563 // Ra = *++Pa; 4564 // Rb = *--Pb; 4565 umulh(Rhi_ab, Ra, Rb); 4566 mul(Rlo_ab, Ra, Rb); 4567 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4568 } bind(dont); 4569 } 4570 4571 void extra_step_squaring() { 4572 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4573 4574 // MACC(Rm, Rn, t0, t1, t2); 4575 // Rm = *++Pm; 4576 // Rn = *--Pn; 4577 umulh(Rhi_mn, Rm, Rn); 4578 mul(Rlo_mn, Rm, Rn); 4579 ldr(Rm, pre(Pm, wordSize)); 4580 ldr(Rn, pre(Pn, -wordSize)); 4581 } 4582 4583 void post1_squaring() { 4584 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4585 4586 // *Pm = Rm = t0 * inv; 4587 mul(Rm, t0, inv); 4588 str(Rm, Address(Pm)); 4589 4590 // MACC(Rm, Rn, t0, t1, t2); 4591 // t0 = t1; t1 = t2; t2 = 0; 4592 umulh(Rhi_mn, Rm, Rn); 4593 4594 #ifndef PRODUCT 4595 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4596 { 4597 mul(Rlo_mn, Rm, Rn); 4598 add(Rlo_mn, t0, Rlo_mn); 4599 Label ok; 4600 cbz(Rlo_mn, ok); { 4601 stop("broken Montgomery multiply"); 4602 } bind(ok); 4603 } 4604 #endif 4605 // We have very carefully set things up so that 4606 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4607 // the lower half of Rm * Rn because we know the result already: 4608 // it must be -t0. t0 + (-t0) must generate a carry iff 4609 // t0 != 0. So, rather than do a mul and an adds we just set 4610 // the carry flag iff t0 is nonzero. 
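    // (subs(zr, t0, 1) below leaves the carry flag set exactly when t0 >= 1,
    //  i.e. when t0 is nonzero, which is the same carry the commented-out
    //  adds(zr, t0, Rlo_mn) would produce, since Rlo_mn would hold -t0 here.)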
4611 // 4612 // mul(Rlo_mn, Rm, Rn); 4613 // adds(zr, t0, Rlo_mn); 4614 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4615 adcs(t0, t1, Rhi_mn); 4616 adc(t1, t2, zr); 4617 mov(t2, zr); 4618 } 4619 4620 void acc(Register Rhi, Register Rlo, 4621 Register t0, Register t1, Register t2) { 4622 adds(t0, t0, Rlo); 4623 adcs(t1, t1, Rhi); 4624 adc(t2, t2, zr); 4625 } 4626 4627 public: 4628 /** 4629 * Fast Montgomery multiplication. The derivation of the 4630 * algorithm is in A Cryptographic Library for the Motorola 4631 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 4632 * 4633 * Arguments: 4634 * 4635 * Inputs for multiplication: 4636 * c_rarg0 - int array elements a 4637 * c_rarg1 - int array elements b 4638 * c_rarg2 - int array elements n (the modulus) 4639 * c_rarg3 - int length 4640 * c_rarg4 - int inv 4641 * c_rarg5 - int array elements m (the result) 4642 * 4643 * Inputs for squaring: 4644 * c_rarg0 - int array elements a 4645 * c_rarg1 - int array elements n (the modulus) 4646 * c_rarg2 - int length 4647 * c_rarg3 - int inv 4648 * c_rarg4 - int array elements m (the result) 4649 * 4650 */ 4651 address generate_multiply() { 4652 Label argh, nothing; 4653 bind(argh); 4654 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4655 4656 align(CodeEntryAlignment); 4657 address entry = pc(); 4658 4659 cbzw(Rlen, nothing); 4660 4661 enter(); 4662 4663 // Make room. 4664 cmpw(Rlen, 512); 4665 br(Assembler::HI, argh); 4666 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4667 andr(sp, Ra, -2 * wordSize); 4668 4669 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4670 4671 { 4672 // Copy input args, reversing as we go. We use Ra as a 4673 // temporary variable. 4674 reverse(Ra, Pa_base, Rlen, t0, t1); 4675 if (!_squaring) 4676 reverse(Ra, Pb_base, Rlen, t0, t1); 4677 reverse(Ra, Pn_base, Rlen, t0, t1); 4678 } 4679 4680 // Push all call-saved registers and also Pm_base which we'll need 4681 // at the end. 
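    // (At this point the reverse() calls above have copied the input arrays,
    //  word-reversed, into the scratch area carved out of the stack, and
    //  Pa_base/Pb_base/Pn_base have been redirected to those copies; Ra points
    //  just past them and becomes the working Pm_base below. The caller's
    //  Pm_base is included in _toSave, so it survives until the result is
    //  copied back out at the end.)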
4682 save_regs(); 4683 4684 #ifndef PRODUCT 4685 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4686 { 4687 ldr(Rn, Address(Pn_base, 0)); 4688 mul(Rlo_mn, Rn, inv); 4689 cmp(Rlo_mn, -1); 4690 Label ok; 4691 br(EQ, ok); { 4692 stop("broken inverse in Montgomery multiply"); 4693 } bind(ok); 4694 } 4695 #endif 4696 4697 mov(Pm_base, Ra); 4698 4699 mov(t0, zr); 4700 mov(t1, zr); 4701 mov(t2, zr); 4702 4703 block_comment("for (int i = 0; i < len; i++) {"); 4704 mov(Ri, zr); { 4705 Label loop, end; 4706 cmpw(Ri, Rlen); 4707 br(Assembler::GE, end); 4708 4709 bind(loop); 4710 pre1(Ri); 4711 4712 block_comment(" for (j = i; j; j--) {"); { 4713 movw(Rj, Ri); 4714 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4715 } block_comment(" } // j"); 4716 4717 post1(); 4718 addw(Ri, Ri, 1); 4719 cmpw(Ri, Rlen); 4720 br(Assembler::LT, loop); 4721 bind(end); 4722 block_comment("} // i"); 4723 } 4724 4725 block_comment("for (int i = len; i < 2*len; i++) {"); 4726 mov(Ri, Rlen); { 4727 Label loop, end; 4728 cmpw(Ri, Rlen, Assembler::LSL, 1); 4729 br(Assembler::GE, end); 4730 4731 bind(loop); 4732 pre2(Ri, Rlen); 4733 4734 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4735 lslw(Rj, Rlen, 1); 4736 subw(Rj, Rj, Ri); 4737 subw(Rj, Rj, 1); 4738 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4739 } block_comment(" } // j"); 4740 4741 post2(Ri, Rlen); 4742 addw(Ri, Ri, 1); 4743 cmpw(Ri, Rlen, Assembler::LSL, 1); 4744 br(Assembler::LT, loop); 4745 bind(end); 4746 } 4747 block_comment("} // i"); 4748 4749 normalize(Rlen); 4750 4751 mov(Ra, Pm_base); // Save Pm_base in Ra 4752 restore_regs(); // Restore caller's Pm_base 4753 4754 // Copy our result into caller's Pm_base 4755 reverse(Pm_base, Ra, Rlen, t0, t1); 4756 4757 leave(); 4758 bind(nothing); 4759 ret(lr); 4760 4761 return entry; 4762 } 4763 // In C, approximately: 4764 4765 // void 4766 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4767 // unsigned long Pn_base[], unsigned long Pm_base[], 4768 // unsigned long inv, int len) { 4769 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4770 // unsigned long *Pa, *Pb, *Pn, *Pm; 4771 // unsigned long Ra, Rb, Rn, Rm; 4772 4773 // int i; 4774 4775 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4776 4777 // for (i = 0; i < len; i++) { 4778 // int j; 4779 4780 // Pa = Pa_base; 4781 // Pb = Pb_base + i; 4782 // Pm = Pm_base; 4783 // Pn = Pn_base + i; 4784 4785 // Ra = *Pa; 4786 // Rb = *Pb; 4787 // Rm = *Pm; 4788 // Rn = *Pn; 4789 4790 // int iters = i; 4791 // for (j = 0; iters--; j++) { 4792 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4793 // MACC(Ra, Rb, t0, t1, t2); 4794 // Ra = *++Pa; 4795 // Rb = *--Pb; 4796 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4797 // MACC(Rm, Rn, t0, t1, t2); 4798 // Rm = *++Pm; 4799 // Rn = *--Pn; 4800 // } 4801 4802 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4803 // MACC(Ra, Rb, t0, t1, t2); 4804 // *Pm = Rm = t0 * inv; 4805 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4806 // MACC(Rm, Rn, t0, t1, t2); 4807 4808 // assert(t0 == 0, "broken Montgomery multiply"); 4809 4810 // t0 = t1; t1 = t2; t2 = 0; 4811 // } 4812 4813 // for (i = len; i < 2*len; i++) { 4814 // int j; 4815 4816 // Pa = Pa_base + i-len; 4817 // Pb = Pb_base + len; 4818 // Pm = Pm_base + i-len; 4819 // Pn = Pn_base + len; 4820 4821 // Ra = *++Pa; 4822 // Rb = *--Pb; 4823 // Rm = *++Pm; 4824 // Rn = *--Pn; 4825 4826 // int iters = len*2-i-1; 4827 // 
for (j = i-len+1; iters--; j++) { 4828 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4829 // MACC(Ra, Rb, t0, t1, t2); 4830 // Ra = *++Pa; 4831 // Rb = *--Pb; 4832 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4833 // MACC(Rm, Rn, t0, t1, t2); 4834 // Rm = *++Pm; 4835 // Rn = *--Pn; 4836 // } 4837 4838 // Pm_base[i-len] = t0; 4839 // t0 = t1; t1 = t2; t2 = 0; 4840 // } 4841 4842 // while (t0) 4843 // t0 = sub(Pm_base, Pn_base, t0, len); 4844 // } 4845 4846 /** 4847 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4848 * multiplies than Montgomery multiplication so it should be up to 4849 * 25% faster. However, its loop control is more complex and it 4850 * may actually run slower on some machines. 4851 * 4852 * Arguments: 4853 * 4854 * Inputs: 4855 * c_rarg0 - int array elements a 4856 * c_rarg1 - int array elements n (the modulus) 4857 * c_rarg2 - int length 4858 * c_rarg3 - int inv 4859 * c_rarg4 - int array elements m (the result) 4860 * 4861 */ 4862 address generate_square() { 4863 Label argh; 4864 bind(argh); 4865 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4866 4867 align(CodeEntryAlignment); 4868 address entry = pc(); 4869 4870 enter(); 4871 4872 // Make room. 4873 cmpw(Rlen, 512); 4874 br(Assembler::HI, argh); 4875 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4876 andr(sp, Ra, -2 * wordSize); 4877 4878 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4879 4880 { 4881 // Copy input args, reversing as we go. We use Ra as a 4882 // temporary variable. 4883 reverse(Ra, Pa_base, Rlen, t0, t1); 4884 reverse(Ra, Pn_base, Rlen, t0, t1); 4885 } 4886 4887 // Push all call-saved registers and also Pm_base which we'll need 4888 // at the end. 4889 save_regs(); 4890 4891 mov(Pm_base, Ra); 4892 4893 mov(t0, zr); 4894 mov(t1, zr); 4895 mov(t2, zr); 4896 4897 block_comment("for (int i = 0; i < len; i++) {"); 4898 mov(Ri, zr); { 4899 Label loop, end; 4900 bind(loop); 4901 cmp(Ri, Rlen); 4902 br(Assembler::GE, end); 4903 4904 pre1(Ri); 4905 4906 block_comment("for (j = (i+1)/2; j; j--) {"); { 4907 add(Rj, Ri, 1); 4908 lsr(Rj, Rj, 1); 4909 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4910 } block_comment(" } // j"); 4911 4912 last_squaring(Ri); 4913 4914 block_comment(" for (j = i/2; j; j--) {"); { 4915 lsr(Rj, Ri, 1); 4916 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4917 } block_comment(" } // j"); 4918 4919 post1_squaring(); 4920 add(Ri, Ri, 1); 4921 cmp(Ri, Rlen); 4922 br(Assembler::LT, loop); 4923 4924 bind(end); 4925 block_comment("} // i"); 4926 } 4927 4928 block_comment("for (int i = len; i < 2*len; i++) {"); 4929 mov(Ri, Rlen); { 4930 Label loop, end; 4931 bind(loop); 4932 cmp(Ri, Rlen, Assembler::LSL, 1); 4933 br(Assembler::GE, end); 4934 4935 pre2(Ri, Rlen); 4936 4937 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4938 lsl(Rj, Rlen, 1); 4939 sub(Rj, Rj, Ri); 4940 sub(Rj, Rj, 1); 4941 lsr(Rj, Rj, 1); 4942 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4943 } block_comment(" } // j"); 4944 4945 last_squaring(Ri); 4946 4947 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4948 lsl(Rj, Rlen, 1); 4949 sub(Rj, Rj, Ri); 4950 lsr(Rj, Rj, 1); 4951 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4952 } block_comment(" } // j"); 4953 4954 post2(Ri, Rlen); 4955 add(Ri, Ri, 1); 4956 cmp(Ri, Rlen, Assembler::LSL, 1); 4957 4958 br(Assembler::LT, loop); 4959 bind(end); 4960 block_comment("} // i"); 4961 } 4962 4963 normalize(Rlen); 4964 4965 mov(Ra, 
Pm_base); // Save Pm_base in Ra 4966 restore_regs(); // Restore caller's Pm_base 4967 4968 // Copy our result into caller's Pm_base 4969 reverse(Pm_base, Ra, Rlen, t0, t1); 4970 4971 leave(); 4972 ret(lr); 4973 4974 return entry; 4975 } 4976 // In C, approximately: 4977 4978 // void 4979 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 4980 // unsigned long Pm_base[], unsigned long inv, int len) { 4981 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4982 // unsigned long *Pa, *Pb, *Pn, *Pm; 4983 // unsigned long Ra, Rb, Rn, Rm; 4984 4985 // int i; 4986 4987 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4988 4989 // for (i = 0; i < len; i++) { 4990 // int j; 4991 4992 // Pa = Pa_base; 4993 // Pb = Pa_base + i; 4994 // Pm = Pm_base; 4995 // Pn = Pn_base + i; 4996 4997 // Ra = *Pa; 4998 // Rb = *Pb; 4999 // Rm = *Pm; 5000 // Rn = *Pn; 5001 5002 // int iters = (i+1)/2; 5003 // for (j = 0; iters--; j++) { 5004 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5005 // MACC2(Ra, Rb, t0, t1, t2); 5006 // Ra = *++Pa; 5007 // Rb = *--Pb; 5008 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5009 // MACC(Rm, Rn, t0, t1, t2); 5010 // Rm = *++Pm; 5011 // Rn = *--Pn; 5012 // } 5013 // if ((i & 1) == 0) { 5014 // assert(Ra == Pa_base[j], "must be"); 5015 // MACC(Ra, Ra, t0, t1, t2); 5016 // } 5017 // iters = i/2; 5018 // assert(iters == i-j, "must be"); 5019 // for (; iters--; j++) { 5020 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5021 // MACC(Rm, Rn, t0, t1, t2); 5022 // Rm = *++Pm; 5023 // Rn = *--Pn; 5024 // } 5025 5026 // *Pm = Rm = t0 * inv; 5027 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5028 // MACC(Rm, Rn, t0, t1, t2); 5029 5030 // assert(t0 == 0, "broken Montgomery multiply"); 5031 5032 // t0 = t1; t1 = t2; t2 = 0; 5033 // } 5034 5035 // for (i = len; i < 2*len; i++) { 5036 // int start = i-len+1; 5037 // int end = start + (len - start)/2; 5038 // int j; 5039 5040 // Pa = Pa_base + i-len; 5041 // Pb = Pa_base + len; 5042 // Pm = Pm_base + i-len; 5043 // Pn = Pn_base + len; 5044 5045 // Ra = *++Pa; 5046 // Rb = *--Pb; 5047 // Rm = *++Pm; 5048 // Rn = *--Pn; 5049 5050 // int iters = (2*len-i-1)/2; 5051 // assert(iters == end-start, "must be"); 5052 // for (j = start; iters--; j++) { 5053 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5054 // MACC2(Ra, Rb, t0, t1, t2); 5055 // Ra = *++Pa; 5056 // Rb = *--Pb; 5057 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5058 // MACC(Rm, Rn, t0, t1, t2); 5059 // Rm = *++Pm; 5060 // Rn = *--Pn; 5061 // } 5062 // if ((i & 1) == 0) { 5063 // assert(Ra == Pa_base[j], "must be"); 5064 // MACC(Ra, Ra, t0, t1, t2); 5065 // } 5066 // iters = (2*len-i)/2; 5067 // assert(iters == len-j, "must be"); 5068 // for (; iters--; j++) { 5069 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5070 // MACC(Rm, Rn, t0, t1, t2); 5071 // Rm = *++Pm; 5072 // Rn = *--Pn; 5073 // } 5074 // Pm_base[i-len] = t0; 5075 // t0 = t1; t1 = t2; t2 = 0; 5076 // } 5077 5078 // while (t0) 5079 // t0 = sub(Pm_base, Pn_base, t0, len); 5080 // } 5081 }; 5082 5083 5084 // Initialization 5085 void generate_initial() { 5086 // Generate initial stubs and initializes the entry points 5087 5088 // entry points that exist in all platforms Note: This is code 5089 // that could be shared among different platforms - however the 5090 // benefit seems to be smaller than the disadvantage of having a 5091 // much more complicated 
generator structure. See also comment in 5092 // stubRoutines.hpp. 5093 5094 StubRoutines::_forward_exception_entry = generate_forward_exception(); 5095 5096 StubRoutines::_call_stub_entry = 5097 generate_call_stub(StubRoutines::_call_stub_return_address); 5098 5099 // is referenced by megamorphic call 5100 StubRoutines::_catch_exception_entry = generate_catch_exception(); 5101 5102 // Build this early so it's available for the interpreter. 5103 StubRoutines::_throw_StackOverflowError_entry = 5104 generate_throw_exception("StackOverflowError throw_exception", 5105 CAST_FROM_FN_PTR(address, 5106 SharedRuntime::throw_StackOverflowError)); 5107 StubRoutines::_throw_delayed_StackOverflowError_entry = 5108 generate_throw_exception("delayed StackOverflowError throw_exception", 5109 CAST_FROM_FN_PTR(address, 5110 SharedRuntime::throw_delayed_StackOverflowError)); 5111 if (UseCRC32Intrinsics) { 5112 // set table address before stub generation which use it 5113 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 5114 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 5115 } 5116 5117 if (UseCRC32CIntrinsics) { 5118 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 5119 } 5120 } 5121 5122 void generate_all() { 5123 // support for verify_oop (must happen after universe_init) 5124 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5125 StubRoutines::_throw_AbstractMethodError_entry = 5126 generate_throw_exception("AbstractMethodError throw_exception", 5127 CAST_FROM_FN_PTR(address, 5128 SharedRuntime:: 5129 throw_AbstractMethodError)); 5130 5131 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5132 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5133 CAST_FROM_FN_PTR(address, 5134 SharedRuntime:: 5135 throw_IncompatibleClassChangeError)); 5136 5137 StubRoutines::_throw_NullPointerException_at_call_entry = 5138 generate_throw_exception("NullPointerException at call throw_exception", 5139 CAST_FROM_FN_PTR(address, 5140 SharedRuntime:: 5141 throw_NullPointerException_at_call)); 5142 5143 // arraycopy stubs used by compilers 5144 generate_arraycopy_stubs(); 5145 5146 // has negatives stub for large arrays. 5147 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5148 5149 // array equals stub for large arrays. 5150 if (!UseSimpleArrayEquals) { 5151 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5152 } 5153 5154 if (UseMultiplyToLenIntrinsic) { 5155 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5156 } 5157 5158 if (UseSquareToLenIntrinsic) { 5159 StubRoutines::_squareToLen = generate_squareToLen(); 5160 } 5161 5162 if (UseMulAddIntrinsic) { 5163 StubRoutines::_mulAdd = generate_mulAdd(); 5164 } 5165 5166 if (UseMontgomeryMultiplyIntrinsic) { 5167 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5168 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5169 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5170 } 5171 5172 if (UseMontgomerySquareIntrinsic) { 5173 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5174 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5175 // We use generate_multiply() rather than generate_square() 5176 // because it's faster for the sizes of modulus we care about. 
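      // (The generator was constructed with squaring == true, so Pb_base
      //  aliases Pa_base and generate_multiply() effectively squares its
      //  input; generate_square() itself is left unused here.)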
5177 StubRoutines::_montgomerySquare = g.generate_multiply(); 5178 } 5179 5180 #ifndef BUILTIN_SIM 5181 // generate GHASH intrinsics code 5182 if (UseGHASHIntrinsics) { 5183 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5184 } 5185 5186 if (UseAESIntrinsics) { 5187 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5188 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5189 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5190 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5191 } 5192 5193 if (UseSHA1Intrinsics) { 5194 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5195 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5196 } 5197 if (UseSHA256Intrinsics) { 5198 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5199 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5200 } 5201 5202 // generate Adler32 intrinsics code 5203 if (UseAdler32Intrinsics) { 5204 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5205 } 5206 5207 // Safefetch stubs. 5208 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5209 &StubRoutines::_safefetch32_fault_pc, 5210 &StubRoutines::_safefetch32_continuation_pc); 5211 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5212 &StubRoutines::_safefetchN_fault_pc, 5213 &StubRoutines::_safefetchN_continuation_pc); 5214 #endif 5215 StubRoutines::aarch64::set_completed(); 5216 } 5217 5218 public: 5219 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5220 if (all) { 5221 generate_all(); 5222 } else { 5223 generate_initial(); 5224 } 5225 } 5226 }; // end class declaration 5227 5228 void StubGenerator_generate(CodeBuffer* code, bool all) { 5229 StubGenerator g(code, all); 5230 }