/*
 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * Copyright (c) 2015, Linaro Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch32.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "vm_version_aarch32.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif


// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#undef __
#define __ _masm->
#define TIMES_OOP lsl(exact_log2(4))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldr(rscratch1, Address(rscratch2));
    __ add(rscratch1, rscratch1, 1);
    __ str(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C.
  //
  // There are only four registers available to house arguments and we're
  // expecting eight; the layout will be as follows:
  //
  // c_rarg0 = call wrapper address
  // c_rarg1 = result
  // c_rarg2 = result type
  // c_rarg3 = method
  // sp -> [ entry_point
  //         parameters -> java params
  //         parameter size (in words)
  //         thread ]                    (address increasing)
  //
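  // For reference, this stub implements (roughly) the CallStub signature
  // declared in stubRoutines.hpp; a hedged sketch of the C++-side call:
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // so the first four arguments arrive in c_rarg0..c_rarg3 and the
  // remainder spill to the stack, as laid out above.
  //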
  // We use a NEW layout for aarch32 so that save and restore can be
  // collapsed into a single load/store.
  // The layout of saved registers now is:
  //   0 [ saved lr       ] <- rfp
  //  -1 [ saved fp       ]
  //  -2 [ r12/rthread    ] Thread passed in args
  //  -3 [ r10/rmethod    ] NOTE omitted rfp as restored automatically
  //  -4 [ r9/rscratch1   ] Platform register?
  //  -5 [ r8/thread      ]
  //  -6 [ r7/rcpool      ]
  //  -7 [ r6/rlocals     ]
  //  -8 [ r5/rbcp        ]
  //  -9 [ r4/rdispatch   ]
  // -10 [ r2/res type    ]
  // -11 [ r1/result      ]
  // -12 [ r0/call wrapper] <- sp (when restored from fp value)
  // -13 maybe alignment
  // -YY [ java arg0      ]
  //  ...
  // -xx [ java argn      ] <- sp on branch into java
  //
  // XXX Note we do not save floating point registers.  Only floating point
  // registers s16-31 / d8-15 need to be saved and these are never touched
  // by template interpreted code.  On a sequence such as C -> Java -> C,
  // the C functions will save them if used.

  address generate_call_stub(address& return_address) {
    /*assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");*/
    const int thread_off = -frame::get_frame_size(VMFrameAPCS) * wordSize;

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();
    __ reg_printf("entering call stub with { sp : %p, rfp : %p, lr : %p}\n", sp, rfp, lr);
    __ enter(VMFrameAPCS); // save rfp & lr and possibly another 2 words

    const int entry_point_arg_off = 1 * wordSize,
              params_arg_off      = 2 * wordSize,
              param_sz_arg_off    = 3 * wordSize,
              thread_arg_off      = 4 * wordSize;
    // r12 is a scratch register so we can clobber it to save thread,
    // which is needed at the end
    __ ldr(r12, Address(rfp, thread_arg_off));
    // r0, r1, r2, r4 - r10, r12
    // we save r0 as the call_wrapper_address is needed elsewhere
    // we save r1, r2 as they hold the result and its type,
    // which are needed on return
    // r12 holds the thread ptr
    unsigned c_save_regset = 0b0001011111110111;
    int nsaved = __ count_bits(c_save_regset);
    __ stmdb(sp, c_save_regset);
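    // A hedged decoding of the mask for reference: bit i selects register
    // ri, so 0b0001011111110111 == {r0, r1, r2, r4-r10, r12}, i.e. the 11
    // registers listed above, and count_bits() yields nsaved == 11.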
    // Offset from rfp to end of stack.
    const int rfp_tos_offset_bytes = frame::get_offset_from_rfp_bytes() + nsaved * wordSize;

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, r12);
    // And method
    __ mov(rmethod, c_rarg3);

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ b(L, Assembler::EQ);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    __ ldr(rscratch2, Address(rfp, param_sz_arg_off));
    // align sp at the time we call java
    __ sub(sp, sp, rscratch2, lsl(LogBytesPerWord));
    __ align_stack();
    __ add(sp, sp, rscratch2, lsl(LogBytesPerWord));

    __ ldr(rscratch1, Address(rfp, params_arg_off));

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;

    __ reg_printf("call_stub param_off = %p, param_sz = %d\n", rscratch1, rscratch2);
    __ cmp(rscratch2, 0);
    __ b(parameters_done, Assembler::EQ);

    // r14 makes an ok temp as it's already saved in the frame header
    address loop = __ pc();
    __ ldr(r14, Address(__ post(rscratch1, wordSize)));
    __ subs(rscratch2, rscratch2, 1);

    // TODO remove
    __ reg_printf("\tARG SP[%d] : 0x%08x\n", rscratch2, r14);
    __ cmp(rscratch2, 0);
    // END TODO
    __ push(r14);
    __ b(loop, Assembler::GT);

    __ BIND(parameters_done);

#ifdef ASSERT
    __ verify_stack_alignment();
#endif

    BLOCK_COMMENT("call Java function");
    __ ldr(rscratch1, Address(rfp, entry_point_arg_off));
    __ reg_printf("Calling Java function with rfp = %p, sp = %p\n", rfp, sp);
    __ mov(r4, sp); // set sender sp
    __ bl(rscratch1);
    // save current address for use by exception handling code
    return_address = __ pc();

    __ reg_printf("Returned to call_stub with rfp = %p, sp = %p\n", rfp, sp);

    // At this point rfp should be restored to the value it was set to
    // before; use it to set the top of stack.
    __ sub(sp, rfp, rfp_tos_offset_bytes);

#ifdef ASSERT
    // verify that threads correspond
    __ ldr(r12, Address(rfp, thread_off));
    // rfp points to the register stored at the highest memory location -
    // the first one on the stack, which is the saved lr.  The saved thread
    // is just below that; it is held in r12 at this point.
    {
      Label L, S;
      __ cmp(rthread, r12);
      __ b(S, Assembler::NE);
      __ get_thread(r12);
      __ cmp(rthread, r12);
      __ b(L, Assembler::EQ);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    if (MacroAssembler::enable_debugging_static) {
      // FIXME Remove this hacky debugging code
      Label L;
      __ ldr(rscratch2, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch2, L);
      // If we're returning via an exception then we shouldn't report exit;
      // the exception handler will have already reported the exit, and
      // reporting via our progress through the call stub would result in
      // an extra method being reported as exited.
      __ print_method_exit();
      __ bind(L);
    }

    // NOTE Horrible tricks here.
    // We need to preserve the current r0 and r1 values as they contain the
    // return value.  First we discard the r0 saved to the stack, as it is
    // no longer needed.  We have saved the result and type as c_rarg1 and
    // c_rarg2, so now we alter the regset to load as follows:
    // c_rarg2 = result
    // c_rarg3 = result_type
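    // A hedged sketch of the trick: the ldmia below uses regset
    // {r2, r3, r4-r10, r12}.  After dropping the saved-r0 slot, the saved
    // r1 (result address) is the lowest word and so is loaded into c_rarg2,
    // and the saved r2 (result type) into c_rarg3, because ldmia fills
    // registers in ascending order from ascending addresses; r4-r10 and
    // r12 simply reload their own saved values.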
    assert((c_save_regset & 0xf) == 0b0111, "change me");
    __ add(sp, sp, wordSize);
    const int altered_saved_regset = (~0xf & c_save_regset) | 0xc;
    __ ldmia(sp, altered_saved_regset);

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0

    Label is_object, is_long, is_float, is_double, exit;
    __ cmp(c_rarg3, T_OBJECT);
    __ b(is_object, Assembler::EQ);
    __ cmp(c_rarg3, T_LONG);
    __ b(is_long, Assembler::EQ);
    if (hasFPU()) {
      // soft FP falls through to the T_INT case
      __ cmp(c_rarg3, T_FLOAT);
      __ b(is_float, Assembler::EQ);
    }
    __ cmp(c_rarg3, T_DOUBLE);
    if (hasFPU()) {
      __ b(is_double, Assembler::EQ);
    } else {
      __ b(is_long, Assembler::EQ);
    }

    // handle T_INT case
    __ str(r0, Address(c_rarg2));

    __ BIND(exit);
    __ leave(VMFrameAPCS); // restore rfp, sp, lr
    __ reg_printf("leaving call stub with { sp : %p, rfp : %p, lr : %p}\n", sp, rfp, lr);
    // Pop arguments from stack.
    //__ add(sp, sp, 4 * wordSize);

    __ b(lr);

    // handle return types different from T_INT
    __ BIND(is_object);
    __ mov(r1, 0);

    __ BIND(is_long);
    __ strd(r0, r1, Address(c_rarg2, 0));
    __ b(exit, Assembler::AL);

    if (hasFPU()) {
      __ BIND(is_float);
      __ vstr_f32(f0, Address(c_rarg2, 0));
      __ b(exit, Assembler::AL);

      __ BIND(is_double);
      __ vstr_f64(d0, Address(c_rarg2, 0));
      __ b(exit, Assembler::AL);
    }
    return start;
  }
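  // The result-storing tail of the call stub above, as a hedged C-like
  // sketch (soft-float: T_FLOAT falls into T_INT, T_DOUBLE into T_LONG):
  //
  //   switch (result_type) {
  //     case T_OBJECT: r1 = 0;                       // fall into T_LONG
  //     case T_LONG:   *(jlong*)result = r1:r0; break;
  //     case T_FLOAT:  *(jfloat*)result = f0;   break; // hasFPU() only
  //     case T_DOUBLE: *(jdouble*)result = d0;  break; // hasFPU() only
  //     default:       *(jint*)result = r0;     break; // T_INT and friends
  //   }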
  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee.  In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code.  So the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position,
  // then return from native to simulated execution.

  address generate_catch_exception() {
    const int thread_off = -frame::get_frame_size(VMFrameAPCS) * wordSize;

    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address thread(rfp, thread_off);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ b(S, Assembler::NE);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ b(L, Assembler::EQ);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ mov(rscratch1, (int)__LINE__);
    __ str(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    // FIXME NOTE ON ALTERATION TO ARM32: it was assumed that rmethod
    // won't be used anymore and is set on entry to the handler - is this true?

    Register spare = rmethod;

    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif
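    // What follows, as a hedged sketch of the contract:
    //
    //   handler = SharedRuntime::exception_handler_for_return_address(thread, lr);
    //   r0 = thread->pending_exception();  thread->clear_pending_exception();
    //   r3 = saved lr;                     // the throwing pc
    //   goto handler;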
    // compute exception handler into r2

    // call the VM to find the handler address associated with the
    // caller address.  Pass thread in r0 and caller pc (ret address)
    // in r1.  n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to a spare
    // (callee-saved) register, because we also need to pass it to the
    // handler returned by this call.
    __ mov(spare, lr); // note spare (rmethod) is a callee-saved register
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address.  We saved the value the handler needs in spare so we can
    // just copy it to r3.  However, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method.  So, we restore lr here to satisfy that assert.
    __ mov(lr, spare);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, spare);
    __ mov(spare, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, 0);
    __ str(rscratch1, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // spare: exception handler

    __ verify_oop(r0);
    __ b(spare);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //   r0: oop to verify
  //   rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //   [tos + 0]: saved c_rarg3
  //   [tos + 1]: saved c_rarg2
  //   [tos + 2]: saved lr
  //   [tos + 3]: saved rscratch2
  //   [tos + 4]: saved r1
  //   [tos + 5]: saved r0
  //   [tos + 6]: saved rscratch1
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stmdb(sp, RegSet::of(c_rarg2, c_rarg3).bits());

    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
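    // i.e. a hedged C view of the check just emitted:
    //
    //   if ((obj & Universe::verify_oop_mask()) != Universe::verify_oop_bits())
    //     goto error;
    //
    // computed with eor/cbnz because cmp would clobber the live flags.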
    // make sure klass is 'reasonable', i.e. non-zero
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldmia(sp, RegSet::of(c_rarg2, c_rarg3).bits());
    __ b(lr);

    // handle errors
    __ bind(error);
    __ ldmia(sp, RegSet::of(c_rarg2, c_rarg3).bits());

    __ pusha();
    // Save old sp
    __ add(c_rarg2, sp, 14 * wordSize);
    __ str(c_rarg2, Address(__ pre(sp, -wordSize)));
    __ mov(c_rarg0, rscratch1); // pass address of error message
    __ mov(c_rarg1, lr);        // pass return address
    __ mov(c_rarg2, sp);        // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug32));
    __ bl(rscratch1);
    __ hlt(0);

    return start;
  }

  // NOTE: very strange - this was changed, but it's not clear why the
  // Address (signed extend word) parameter was here in the first place.
  //void array_overlap_test(Label& L_no_overlap, Address sf) { __ b(L_no_overlap); }
  void array_overlap_test(Label& L_no_overlap) { __ b(L_no_overlap); }
  // no test being performed?

  //
  // Small copy: less than 4 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 3
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, bool is_aligned, int step) {
    const int granularity = uabs(step);
    const bool gen_always = !is_aligned || (-4 < step && step < 0);
    Label halfword, done;

    if ((granularity <= 1) || gen_always) {
      __ tst(count, 1);
      __ b(halfword, Assembler::EQ);
      __ ldrb(tmp, step < 0 ? __ pre(s, -1) : __ post(s, 1));
      __ strb(tmp, step < 0 ? __ pre(d, -1) : __ post(d, 1));
    }

    if ((granularity <= 2) || gen_always) {
      __ bind(halfword);
      __ tst(count, 2);
      __ b(done, Assembler::EQ);
      __ ldrh(tmp, step < 0 ? __ pre(s, -2) : __ post(s, 2));
      __ strh(tmp, step < 0 ? __ pre(d, -2) : __ post(d, 2));
    }

    __ bind(done);
  }

  void copy_memory_simd(Register s, Register d,
                        Register count, Register tmp, int step,
                        DoubleFloatRegSet tmp_set, size_t tmp_set_size) {
    assert(UseSIMDForMemoryOps, "should be available");
    Label simd_loop, simd_small;

    __ cmp(count, tmp_set_size);
    __ b(simd_small, Assembler::LT);

    __ mov(tmp, count, __ lsr(exact_log2(tmp_set_size)));
    __ sub(count, count, tmp, __ lsl(exact_log2(tmp_set_size)));

    __ bind(simd_loop);

    __ pld(Address(s, step < 0 ? -2 * tmp_set_size : tmp_set_size));

    if (step < 0) {
      __ vldmdb_f64(s, tmp_set.bits());
      __ vstmdb_f64(d, tmp_set.bits());
    } else {
      __ vldmia_f64(s, tmp_set.bits());
      __ vstmia_f64(d, tmp_set.bits());
    }

    __ subs(tmp, tmp, 1);
    __ b(simd_loop, Assembler::NE);

    __ bind(simd_small);
  }
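  // For reference, copy_memory_small above is (a hedged sketch, forward
  // case, byte pointers):
  //
  //   if (count & 1) { *d++ = *s++; }                        // ldrb/strb
  //   if (count & 2) { *(u2*)d = *(u2*)s; s += 2; d += 2; }  // ldrh/strh
  //
  // so only the low two bits of count matter, as the NB above says.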
  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, int step) {
    const int small_copy_size = 32; // one copy by ldm pays off the alignment efforts and the push/pop of the temp set
    const int granularity = uabs(step);
    const Register tmp2 = rscratch2;
    const Register t0 = r3;
    Label small;

    assert_different_registers(s, d, count, tmp2, t0);

    __ mov(count, count, __ lsl(exact_log2(granularity)));

    if (step < 0) {
      __ add(s, s, count);
      __ add(d, d, count);
    }

    __ cmp(count, small_copy_size);
    __ b(small, Assembler::LT);

    // aligning
    if (!is_aligned || (-4 < step && step < 0)) {
      assert(3 <= small_copy_size, "may copy number of bytes required for alignment");
      if (step < 0) {
        __ andr(tmp2, s, 3);
      } else {
        __ rsb(tmp2, s, 0);
        __ andr(tmp2, tmp2, 3);
      }
      __ sub(count, count, tmp2);
      copy_memory_small(s, d, tmp2, t0, is_aligned, step);
    }

#ifdef ASSERT
    Label src_aligned;
    __ tst(s, 3);
    __ b(src_aligned, Assembler::EQ);
    __ stop("src is not aligned");
    __ bind(src_aligned);
#endif

    // if destination is unaligned, copying by words is the only option
    __ tst(d, 3);
    __ b(small, Assembler::NE);
    if (UseSIMDForMemoryOps && (VM_Version::features() & FT_AdvSIMD)) {
      copy_memory_simd(s, d, count, tmp2, step, DoubleFloatRegSet::range(d0, d7), 64);
      copy_memory_simd(s, d, count, tmp2, step, DoubleFloatRegSet::range(d0, d1), 16);
    } else {
      const RegSet tmp_set = RegSet::range(r4, r7);
      const int tmp_set_size = 16;
      Label ldm_loop;

      assert_different_registers(s, d, count, tmp2, r4, r5, r6, r7);

      __ cmp(count, tmp_set_size);
      __ b(small, Assembler::LT);

      __ push(tmp_set, sp);

      __ mov(tmp2, count, __ lsr(exact_log2(tmp_set_size)));
      __ sub(count, count, tmp2, __ lsl(exact_log2(tmp_set_size)));

      __ bind(ldm_loop);

      __ pld(Address(s, step < 0 ? -2 * tmp_set_size : tmp_set_size));

      if (step < 0) {
        __ ldmdb(s, tmp_set.bits());
        __ stmdb(d, tmp_set.bits());
      } else {
        __ ldmia(s, tmp_set.bits());
        __ stmia(d, tmp_set.bits());
      }

      __ subs(tmp2, tmp2, 1);
      __ b(ldm_loop, Assembler::NE);

      __ pop(tmp_set, sp);
    }

    __ bind(small);

    Label words_loop, words_done;
    __ cmp(count, BytesPerWord);
    __ b(words_done, Assembler::LT);

    __ mov(tmp2, count, __ lsr(exact_log2(BytesPerWord)));
    __ sub(count, count, tmp2, __ lsl(exact_log2(BytesPerWord)));

    __ bind(words_loop);

    Address src = step < 0 ? __ pre(s, -BytesPerWord) : __ post(s, BytesPerWord);
    Address dst = step < 0 ? __ pre(d, -BytesPerWord) : __ post(d, BytesPerWord);

    __ pld(Address(s, step < 0 ? -2 * BytesPerWord : BytesPerWord));
    __ ldr(t0, src);
    __ str(t0, dst);
    __ subs(tmp2, tmp2, 1);

    __ b(words_loop, Assembler::NE);

    __ bind(words_done);
    copy_memory_small(s, d, count, t0, is_aligned, step);
  }
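  // The overall shape of copy_memory above, as a hedged sketch (forward
  // copy):
  //
  //   count *= granularity;               // count is now in bytes
  //   if (count >= small_copy_size) {
  //     copy up to 3 bytes so s becomes word-aligned; // copy_memory_small
  //     if (d is word-aligned)
  //       copy big blocks;                // 64/16-byte SIMD or 16-byte ldm/stm
  //   }
  //   copy remaining whole words;         // words_loop
  //   copy remaining 0-3 bytes;           // copy_memory_small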
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 4-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }
    __ enter(VMFrameAPCS);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
    }

    // copy_memory is free to use rscratch2 and r3
    copy_memory(aligned, s, d, count, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, lsl(exact_log2(size))));
    }

    // barriers are for oop arrays only, so don't worry about s, d and count being lost before
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch2);

    __ leave(VMFrameAPCS);
    __ b(lr);
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 4-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ cmp(d, s);
    __ b(nooverlap_target, Assembler::LS);

    __ enter(VMFrameAPCS);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
    }

    // copy_memory is free to use rscratch2 and r3
    copy_memory(aligned, s, d, count, -size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, lsl(exact_log2(size))));
    }

    // barriers are for oop arrays only, so don't worry about s, d and count being lost before
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch2);

    __ leave(VMFrameAPCS);
    __ b(lr);
    return start;
  }

  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }
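  // generate_type_check above, roughly (a hedged sketch):
  //
  //   if (fast path hits: sub_klass == super_klass, or the check at
  //       super_check_offset inside sub_klass matches)     goto L_success;
  //   if (slow path: a scan of the secondary supers finds super_klass)
  //                                                        goto L_success;
  //   // fall through on failure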
  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //   c_rarg3 - size_t ckoff (super_check_offset)
  //   [sp]    - oop ckval (super_klass)
  //
  // Output:
  //   r0 ==  0   - success
  //   r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {
    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from  = c_rarg0; // source array address
    const Register to    = c_rarg1; // destination array address
    const Register count = c_rarg2; // element count
    const Register ckoff = c_rarg3; // super_check_offset

    // Registers used as temps
    const Register ckval      = r4; // super_klass
    const Register count_save = r5; // orig element count
    const Register copied_oop = r6; // actual oop copied
    const Register oop_klass  = r7; // oop._klass
    const Register start_to   = lr;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval,
                               copied_oop, oop_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(VMFrameAPCS); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L);//, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array: Nothing to do.
    __ cbz(count, L_done);

    // rscratch1 used as temp, rscratch2 can be killed by inc_counter_np
    __ push(RegSet::of(count_save, copied_oop, oop_klass, ckval, rscratch1, rscratch2), sp);
    __ ldr(ckval, Address(rfp, wordSize));

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldr(rscratch1, Address(ckval, sco_offset));
      __ cmp(ckoff, rscratch1);
      __ b(L, Assembler::EQ);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
    bool is_oop = true;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, to, count);

    // save the original count
    __ mov(count_save, count);

    // save destination array start address
    __ mov(start_to, to);

    // Copy from low to high addresses
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, 4), copied_oop, noreg, noreg, AS_RAW); // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, 4), noreg, noreg, AS_RAW); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(oop_klass, copied_oop); // query the object klass
    generate_type_check(oop_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========
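    // Hedged note on the failure exit below: K = count_save - count is the
    // number of oops already copied; the caller gets r0 = ~K (the -1^K of
    // the contract above), and the EQ branch skips the card marks when
    // K == 0 since no stores were performed.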
    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count); // K = partially copied oop count
    __ inv(count, count);              // report (-1^K) to caller
    __ b(L_done_pop, Assembler::EQ);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize); // make an inclusive end pointer
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1);

    __ bind(L_done_pop);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
    __ pop(RegSet::of(count_save, copied_oop, oop_klass, ckval, rscratch1, rscratch2), sp);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave(VMFrameAPCS);
    __ b(lr);
    return start;
  }

  void generate_arraycopy_stubs() {
    address entry;

    // jbyte
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_copy(sizeof(jbyte), true, false, &entry, "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_copy(sizeof(jbyte), true, false, entry, NULL, "arrayof_jbyte_arraycopy");
    StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_copy(sizeof(jbyte), false, false, &entry, "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy                   = generate_conjoint_copy(sizeof(jbyte), false, false, entry, NULL, "jbyte_arraycopy");
    // jshort
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(sizeof(jshort), true, false, &entry, "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(sizeof(jshort), true, false, entry, NULL, "arrayof_jshort_arraycopy");
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(sizeof(jshort), false, false, &entry, "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(sizeof(jshort), false, false, entry, NULL, "jshort_arraycopy");
    // jint (always aligned)
    StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_copy(sizeof(jint), true, false, &entry, "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_copy(sizeof(jint), true, false, entry, NULL, "arrayof_jint_arraycopy");
    StubRoutines::_jint_disjoint_arraycopy           = StubRoutines::_arrayof_jint_disjoint_arraycopy;
    StubRoutines::_jint_arraycopy                    = StubRoutines::_arrayof_jint_arraycopy;
    // jlong (always aligned)
    StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_copy(sizeof(jlong), true, false, &entry, "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy           = generate_conjoint_copy(sizeof(jlong), true, false, entry, NULL, "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy          = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy                   = StubRoutines::_arrayof_jlong_arraycopy;
    // OOP (always aligned)
    StubRoutines::_arrayof_oop_disjoint_arraycopy    = generate_disjoint_copy(sizeof(jint), true, true, &entry, "arrayof_oop_disjoint_arraycopy");
    StubRoutines::_arrayof_oop_arraycopy             = generate_conjoint_copy(sizeof(jint), true, true, entry, NULL, "arrayof_oop_arraycopy");
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_copy(sizeof(jint), true, true, &entry, "arrayof_oop_disjoint_arraycopy_uninit", true);
    StubRoutines::_arrayof_oop_arraycopy_uninit      = generate_conjoint_copy(sizeof(jint), true, true, entry, NULL, "arrayof_oop_arraycopy_uninit", true);
    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", NULL);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, true);
  }

  void generate_math_stubs() { Unimplemented(); }

  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   r0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg0, may fault.
    __ mov(c_rarg2, c_rarg0);
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldr(c_rarg0, Address(c_rarg2, 0));
        break;
      default:
        ShouldNotReachHere();
    }
    __ b(lr);
    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ b(lr);
  }
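  // SafeFetch, as a hedged C sketch of the intended semantics:
  //
  //   int SafeFetch32(int* adr, int errValue) {
  //     return *adr;   // if this load faults, the signal handler resumes
  //   }                // execution at continuation_pc, yielding errValue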
  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   r0   - int crc result
   *
   * Preserves:
   *   r13
   *
   */
  address generate_updateBytesCRC32(int is_crc32c) {
    assert(!is_crc32c ? UseCRC32Intrinsics : UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", !is_crc32c ? "updateBytesCRC32" : "updateBytesCRC32C");

    address start = __ pc();

    const Register crc    = c_rarg0; // crc
    const Register buf    = c_rarg1; // source java byte array address
    const Register len    = c_rarg2; // length
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = r4;
    const Register table2 = r5;
    const Register table3 = lr;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(RegSet::of(table1, table2, r6, r7, rscratch1, rscratch2), sp);

    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, r6, is_crc32c);

    __ pop(RegSet::of(table1, table2, r6, r7, rscratch1, rscratch2), sp);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - x address
   *   c_rarg1   - x length
   *   c_rarg2   - y address
   *   c_rarg3   - y length
   *   sp[0]     - z address
   *   sp[1]     - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x    = r0;
    const Register xlen = r1;
    const Register y    = r2;
    const Register ylen = r3;

    const Register z    = r4;
    const Register zlen = r5;

    const Register tmp1 = r6;
    const Register tmp2 = r7;
    const Register tmp3 = r8;
    const Register tmp4 = r9;
    const Register tmp5 = r12;
    const Register tmp6 = r14;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(RegSet::of(z, zlen, tmp1, tmp2) + RegSet::of(tmp3, tmp4, tmp5, tmp6), sp);
    __ ldr(z, Address(rfp, 4));
    __ ldr(zlen, Address(rfp, 8));
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
    __ pop(RegSet::of(z, zlen, tmp1, tmp2) + RegSet::of(tmp3, tmp4, tmp5, tmp6), sp);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - out
   *   c_rarg1   - in
   *   c_rarg2   - offset
   *   c_rarg3   - len
   *   sp[0]     - k
   */
  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "mulAdd");

    address start = __ pc();
    const Register out    = r0;
    const Register in     = r1;
    const Register offset = r2;
    const Register len    = r3;

    const Register k      = r4;

    const Register tmp1   = r6;
    const Register tmp2   = r7;
    const Register tmp3   = r8;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(RegSet::of(k, tmp1, tmp2, tmp3), sp);
    __ ldr(k, Address(rfp, 4));
    __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3);
    __ pop(RegSet::of(k, tmp1, tmp2, tmp3), sp);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }
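  // For orientation, mul_add computes (a hedged sketch of the Java
  // intrinsic's semantics, cf. java.math.BigInteger::implMulAdd; unsigned
  // 32-bit digits, most significant first):
  //
  //   carry = 0;
  //   for (j = len - 1; j >= 0; j--) {
  //     p = (julong)in[j] * k + out[offset] + carry;
  //     out[offset--] = (jint)p;
  //     carry = p >> 32;
  //   }
  //   return (jint)carry;   // delivered in r0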
"updateBytesCRC32" : "updateBytesCRC32C"); 1112 1113 address start = __ pc(); 1114 1115 const Register crc = c_rarg0; // crc 1116 const Register buf = c_rarg1; // source java byte array address 1117 const Register len = c_rarg2; // length 1118 const Register table0 = c_rarg3; // crc_table address 1119 const Register table1 = r4; 1120 const Register table2 = r5; 1121 const Register table3 = lr; 1122 1123 BLOCK_COMMENT("Entry:"); 1124 __ enter(); // required for proper stackwalking of RuntimeStub frame 1125 __ push(RegSet::of(table1, table2, r6, r7, rscratch1, rscratch2), sp); 1126 1127 __ kernel_crc32(crc, buf, len, 1128 table0, table1, table2, table3, rscratch1, rscratch2, r6, is_crc32c); 1129 1130 __ pop(RegSet::of(table1, table2, r6, r7, rscratch1, rscratch2), sp); 1131 __ leave(); // required for proper stackwalking of RuntimeStub frame 1132 __ ret(lr); 1133 1134 return start; 1135 } 1136 1137 /** 1138 * Arguments: 1139 * 1140 * Input: 1141 * c_rarg0 - x address 1142 * c_rarg1 - x length 1143 * c_rarg2 - y address 1144 * c_rarg3 - y lenth 1145 * sp[0] - z address 1146 * sp[1] - z length 1147 */ 1148 address generate_multiplyToLen() { 1149 __ align(CodeEntryAlignment); 1150 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 1151 1152 address start = __ pc(); 1153 const Register x = r0; 1154 const Register xlen = r1; 1155 const Register y = r2; 1156 const Register ylen = r3; 1157 1158 const Register z = r4; 1159 const Register zlen = r5; 1160 1161 const Register tmp1 = r6; 1162 const Register tmp2 = r7; 1163 const Register tmp3 = r8; 1164 const Register tmp4 = r9; 1165 const Register tmp5 = r12; 1166 const Register tmp6 = r14; 1167 1168 BLOCK_COMMENT("Entry:"); 1169 __ enter(); // required for proper stackwalking of RuntimeStub frame 1170 __ push(RegSet::of(z, zlen, tmp1, tmp2)+RegSet::of(tmp3, tmp4, tmp5, tmp6), sp); 1171 __ ldr(z, Address(rfp, 4)); 1172 __ ldr(zlen, Address(rfp, 8)); 1173 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 1174 __ pop(RegSet::of(z, zlen, tmp1, tmp2)+RegSet::of(tmp3, tmp4, tmp5, tmp6), sp); 1175 __ leave(); // required for proper stackwalking of RuntimeStub frame 1176 __ ret(lr); 1177 1178 return start; 1179 } 1180 1181 /** 1182 * Arguments: 1183 * 1184 * Input: 1185 * c_rarg0 - out 1186 * c_rarg1 - int 1187 * c_rarg2 - offset 1188 * c_rarg3 - len 1189 * sp[0] - k 1190 */ 1191 address generate_mulAdd() { 1192 __ align(CodeEntryAlignment); 1193 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 1194 1195 address start = __ pc(); 1196 const Register out = r0; 1197 const Register in = r1; 1198 const Register offset = r2; 1199 const Register len = r3; 1200 1201 const Register k = r4; 1202 1203 const Register tmp1 = r6; 1204 const Register tmp2 = r7; 1205 const Register tmp3 = r8; 1206 1207 BLOCK_COMMENT("Entry:"); 1208 __ enter(); // required for proper stackwalking of RuntimeStub frame 1209 __ push(RegSet::of(k, tmp1, tmp2, tmp3), sp); 1210 __ ldr(k, Address(rfp, 4)); 1211 __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3); 1212 __ pop(RegSet::of(k, tmp1, tmp2, tmp3), sp); 1213 __ leave(); // required for proper stackwalking of RuntimeStub frame 1214 __ ret(lr); 1215 1216 return start; 1217 } 1218 1219 1220 // Arguments: 1221 // 1222 // Inputs: 1223 // c_rarg0 - source byte array address 1224 // c_rarg1 - destination byte array address 1225 // c_rarg2 - K (key) in little endian int array 1226 // 1227 1228 address generate_aescrypt_encryptBlock() { 1229 assert(UseAESIntrinsics, "what are we doing here?"); 1230 __ 
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //

  address generate_cipherBlockChaining_encryptAESCrypt(bool len_on_stack) {
    assert(UseAESIntrinsics && UseNeon, "what are we doing here?");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    address start = __ pc();

    const Register from   = c_rarg0; // source array address
    const Register to     = c_rarg1; // destination array address
    const Register key    = c_rarg2; // key array address
    const Register rvec   = c_rarg3; // r byte array initialized from initvector array address
                                     // and left with the results of the last encryption block
    const Register len    = r4;      // src len (must be multiple of blocksize 16)
    const Register keylen = r5;
    const Register table  = r6;
    const Register t0     = r7;
    const Register t1     = r8;
    const Register t2     = r9;
    const Register t3     = r10;
    const Register t4     = r11;
    const Register t5     = r12;
    const Register t6     = lr;

    BLOCK_COMMENT("Entry:");
    __ enter();

    __ push(RegSet::of(r4, r5, r6, r7, r8), sp);
    __ push(RegSet::of(r9, r10, r11, r12), sp);
    __ vstmdb_f64(sp, 0xff00); // d8-d15 are callee save registers

    if (len_on_stack)
      __ ldr(len, Address(rfp, wordSize));
    __ kernel_aescrypt_encrypt(from, to, key, rvec, len, keylen, table,
                               t0, t1, t2, t3, t4, t5, t6);

    __ vldmia_f64(sp, 0xff00);
    __ pop(RegSet::of(r9, r10, r11, r12), sp);
    __ pop(RegSet::of(r4, r5, r6, r7, r8), sp);

    __ leave();
    __ ret(lr);

    return start;
  }
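  // For reference, the CBC recurrences these kernels implement (a hedged
  // sketch; blocksize is 16 bytes, rvec carries the chaining value):
  //
  //   encrypt:  for each block P:  rvec = AES_encrypt(P ^ rvec); emit rvec;
  //   decrypt:  for each block C:  emit AES_decrypt(C) ^ rvec; rvec = C;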
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //

  address generate_cipherBlockChaining_decryptAESCrypt(bool len_on_stack) {
    assert(UseAESIntrinsics && UseNeon, "what are we doing here?");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    address start = __ pc();

    const Register from   = c_rarg0; // source array address
    const Register to     = c_rarg1; // destination array address
    const Register key    = c_rarg2; // key array address
    const Register rvec   = c_rarg3; // r byte array initialized from initvector array address
                                     // and left with the results of the last encryption block
    const Register len    = r4;      // src len (must be multiple of blocksize 16)
    const Register keylen = r5;
    const Register table  = r6;
    const Register t0     = r7;
    const Register t1     = r8;
    const Register t2     = r9;
    const Register t3     = r10;
    const Register t4     = r11;
    const Register t5     = r12;
    const Register t6     = lr;

    BLOCK_COMMENT("Entry:");
    __ enter();

    __ push(RegSet::of(r4, r5, r6, r7, r8), sp);
    __ push(RegSet::of(r9, r10, r11, r12), sp);
    __ vstmdb_f64(sp, 0xff00); // d8-d15 are callee save registers

    if (len_on_stack)
      __ ldr(len, Address(rfp, wordSize));
    __ kernel_aescrypt_decrypt(from, to, key, rvec, len, keylen, table,
                               t0, t1, t2, t3, t4, t5, t6);

    __ vldmia_f64(sp, 0xff00);
    __ pop(RegSet::of(r9, r10, r11, r12), sp);
    __ pop(RegSet::of(r4, r5, r6, r7, r8), sp);

    __ leave();
    __ ret(lr);

    return start;
  }
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - state array

  address generate_sha_implCompress() {
    assert(UseSHA1Intrinsics, "what are we doing here?");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "sha_implCompress");
    address start = __ pc();

    const Register from  = c_rarg0; // source array address
    const Register state = c_rarg1; // state array address
    const Register t0    = c_rarg2;
    const Register t1    = c_rarg3;
    const Register t2    = r4;
    const Register t3    = r5;
    const Register t4    = r6;
    const Register t5    = r7;
    const Register t6    = r8;
    const Register t7    = r9;
    const Register t8    = r10;
    const Register t9    = r11;
    const Register t10   = r12;
    DoubleFloatRegSet _fToSave = DoubleFloatRegSet::range(d0, d15);

    BLOCK_COMMENT("Entry:");
    __ enter();

    __ push(RegSet::of(r4, r5, r6, r7, r8), sp);
    __ push(RegSet::of(r9, r10, r11, r12), sp);
    __ vstmdb_f64(sp, _fToSave.bits());

    __ kernel_sha_implCompress(from, state, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10);

    __ vldmia_f64(sp, _fToSave.bits(), true);
    __ pop(RegSet::of(r9, r10, r11, r12), sp);
    __ pop(RegSet::of(r4, r5, r6, r7, r8), sp);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - state array

  address generate_sha256_implCompress() {
    assert(UseSHA256Intrinsics, "what are we doing here?");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "sha256_implCompress");
    address start = __ pc();

    const Register from  = c_rarg0; // source array address
    const Register state = c_rarg1; // state array address
    const Register t0    = c_rarg2;
    const Register t1    = c_rarg3;
    const Register t2    = r4;
    const Register t3    = r5;
    const Register t4    = r6;
    const Register t5    = r7;
    const Register t6    = r8;
    const Register t7    = r9;
    const Register t8    = r10;
    const Register t9    = r11;
    const Register t10   = r12;
    const Register t11   = lr;
    DoubleFloatRegSet _fToSave1 = DoubleFloatRegSet::range(d0, d15);
    DoubleFloatRegSet _fToSave2 = DoubleFloatRegSet::range(d16, d31);

    BLOCK_COMMENT("Entry:");
    __ enter();

    __ push(RegSet::of(r4, r5, r6, r7, r8), sp);
    __ push(RegSet::of(r9, r10, r11, r12, lr), sp);
    __ vstmdb_f64(sp, _fToSave1.bits());
    __ vstmdb_f64(sp, _fToSave2.bits());

    __ kernel_sha256_implCompress(from, state, t0, t1,
                                  t2, t3, t4, t5, t6, t7, t8, t9, t10, t11);

    __ vldmia_f64(sp, _fToSave2.bits(), true);
    __ vldmia_f64(sp, _fToSave1.bits(), true);
    __ pop(RegSet::of(r9, r10, r11, r12, lr), sp);
    __ pop(RegSet::of(r4, r5, r6, r7, r8), sp);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - state array

  address generate_sha512_implCompress() {
    assert(UseSHA512Intrinsics, "what are we doing here?");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "sha512_implCompress");
    address start = __ pc();

    const Register from  = c_rarg0; // source array address
    const Register state = c_rarg1; // state array address
    const Register t0    = c_rarg2;
    const Register t1    = c_rarg3;
    DoubleFloatRegSet _fToSave1 = DoubleFloatRegSet::range(d0, d15);
    DoubleFloatRegSet _fToSave2 = DoubleFloatRegSet::range(d16, d31);


    BLOCK_COMMENT("Entry:");
    __ enter();

    __ vstmdb_f64(sp, _fToSave1.bits());
    __ vstmdb_f64(sp, _fToSave2.bits());

    __ kernel_sha512_implCompress(from, state, t0, t1);

    __ vldmia_f64(sp, _fToSave2.bits(), true);
    __ vldmia_f64(sp, _fToSave1.bits(), true);

    __ leave();
    __ ret(lr);

    return start;
  }
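  // Hedged note on the saves above: the AAPCS only requires d8-d15 to be
  // preserved across calls, but the SHA kernels may clobber the entire
  // SIMD file, so d0-d15 (and d16-d31 where present) are saved as well.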
  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation.  Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame.  Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.

#undef __
#define __ masm->

  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b. aarch32 asserts that frame::arg_reg_save_area_bytes == 0
    const int framesize = frame::get_frame_size();
    const int insts_size = 512;
    const int locs_size = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save at least FP and LR before call

    assert(is_even(framesize), "sp not 8-byte aligned");

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

    // Call runtime
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ align_stack();
    __ mov(rscratch1, runtime_entry);
    __ bl(rscratch1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    framesize,
                                    oop_maps, false);
    return stub->entry_point();
  }
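  // Typical use, as a hedged example (the actual set of throw stubs is
  // wired up elsewhere, outside this section):
  //
  //   StubRoutines::_throw_StackOverflowError_entry =
  //     generate_throw_exception("StackOverflowError throw_exception",
  //                              CAST_FROM_FN_PTR(address,
  //                                SharedRuntime::throw_StackOverflowError));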

  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, Rlen, Ri, Rj, Pa, Pb, Pn, Pm;
    FloatRegister inv, Ra, Rb, Rm, Rn, RabAB, RaBAb, s0, s1, s2, tmp;

    RegSet _toSave;
    DoubleFloatRegSet _fToSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      Register reg = c_rarg0;

      Pa_base = reg++; // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = reg++;
      Pn_base = reg++;
      Rlen = reg++;
      Pm_base = r4;

      Ri = r5; // Inner and outer loop indexes.
      Rj = r6;

      Pa = r7; // Pointers to the current/next digit of a, b, n, and m.
      Pb = r8;
      Pm = r9;
      Pn = r12;

      _toSave = RegSet::range(r4, r8) + RegSet::of(r9, r12);

      // Now NEON registers

      // Working registers:
      Ra = d0; // The current digit of a, b, n, and m.
      Rb = d1; // The values are stored as read, that is high and
      Rm = d2; // low 32-bit parts are exchanged.
      Rn = d3;

      // Three registers which form a triple-precision accumulator.
      // For sake of performance these are 128-bit and are overlapping
      // (hence the name is s, not t). The schema is the following:
      //   w4|w3|w2|w1|w0| (32-bit words)
      //   s0 lo: |**|**|
      //   s0 hi: |**|**|
      //   s1 lo: |**|**|
      //   s1 hi: |**|**|
      //   s2 lo: |**|**|
      //   s2 hi: |**|**|
      // The idea is that each 64-bit s register accumulates only 32-bit
      // values and hence never needs a carry operation.

      s0 = q2;
      s1 = q3;
      s2 = q4;

      RabAB = q5; // Product registers: low, high and middle parts
      RaBAb = q6; // of a*b and m*n. hi(A)*hi(B) is the same quad as lo(a)*lo(b).

      inv = d14;
      tmp = d15;

      _fToSave = DoubleFloatRegSet::range(d8, tmp);
    }

  private:
    void save_regs() {
      vstmdb_f64(sp, _fToSave.bits());
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
      vldmia_f64(sp, _fToSave.bits(), true);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      b(loop, Assembler::GT);
      bind(end);
    }

    void pre1(Register i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, lsl(LogBytesPerLong), Address::SUB));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, lsl(LogBytesPerLong), Address::SUB));

      vld1_64(Ra, Address(Pa), Assembler::ALIGN_STD);
      vld1_64(Rb, Address(Pb), Assembler::ALIGN_STD);
      vld1_64(Rm, Address(Pm), Assembler::ALIGN_STD);
      vld1_64(Rn, Address(Pn), Assembler::ALIGN_STD);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication. The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used. This benefits in-order implementations of the
    // architecture the most, but out-of-order ones also benefit.
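    //
    // In the commented pseudocode throughout this class, MACC(A, B, T0, T1,
    // T2) is the usual multiply-accumulate into the triple-precision
    // accumulator t2:t1:t0. A hedged C sketch (not part of this port;
    // assumes a compiler with unsigned __int128 support):
    //
    //   #define MACC(A, B, T0, T1, T2)                                  \
    //   do {                                                            \
    //     unsigned __int128 p = (unsigned __int128)(A) * (B);           \
    //     unsigned long lo = (unsigned long)p;                          \
    //     unsigned long hi = (unsigned long)(p >> 64);                  \
    //     T0 += lo; if (T0 < lo) hi++;  /* carry out of t0 */           \
    //     T1 += hi; if (T1 < hi) T2++;  /* carry out of t1 */           \
    //   } while (0)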
    void step() {
      block_comment("step");
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      sub(Pm, Pm, BytesPerLong);
      add(Pn, Pn, BytesPerLong);
      vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb);
      vld1_64(Rm, Address(Pm), Assembler::ALIGN_STD);
      vld1_64(Rn, Address(Pn), Assembler::ALIGN_STD);
      vmul_acc2(tmp, RabAB, RaBAb);

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      sub(Pa, Pa, BytesPerLong);
      add(Pb, Pb, BytesPerLong);
      vmul_acc1(Ra, Rb, tmp, RabAB, RaBAb);
      vld1_64(Ra, Address(Pa), Assembler::ALIGN_STD);
      vld1_64(Rb, Address(Pb), Assembler::ALIGN_STD);
      vmul_acc2(tmp, RabAB, RaBAb);
    }

    void post1() {
      FloatRegister t0 = RabAB;

      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      vmul_acc1(Ra, Rb, tmp, RabAB, RaBAb);
      vmul_acc2(tmp, RabAB, RaBAb);

      // *Pm = Rm = t0 * inv;
      vmul_fin(t0, tmp);
      vmul_simple(Rm, t0, inv, RaBAb); // RaBAb is tmp
      vrev64_64_32(Rm, Rm);            // write in reversed, big-endian format
      vst1_64(Rm, Address(Pm), ALIGN_STD);

      // MACC(Rm, Rn, t0, t1, t2);
      vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb);
      vmul_acc2(tmp, RabAB, RaBAb);

#ifndef PRODUCT
      // assert(t0 == 0, "broken Montgomery multiply");
      {
        vmul_fin(t0, tmp);
        Label ok;
        push(RegSet::of(Ri, Rj), sp);
        vmov_f64(Ri, Rj, t0);
        orr(Ri, Ri, Rj);
        cbz(Ri, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
        pop(RegSet::of(Ri, Rj), sp);
      }
#endif

      // t0 = t1; t1 = t2; t2 = 0;
      shift_t(RabAB);
    }

    void pre2(Register i, Register len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      // Rj == i-len
      sub(Rj, i, len);

      lea(Pa, Address(Pa_base, Rj, lsl(LogBytesPerLong), Address::SUB));
      lea(Pb, Address(Pb_base, len, lsl(LogBytesPerLong), Address::SUB));
      lea(Pm, Address(Pm_base, Rj, lsl(LogBytesPerLong), Address::SUB));
      lea(Pn, Address(Pn_base, len, lsl(LogBytesPerLong), Address::SUB));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      sub(Pa, Pa, BytesPerLong);
      add(Pb, Pb, BytesPerLong);
      sub(Pm, Pm, BytesPerLong);
      add(Pn, Pn, BytesPerLong);

      vld1_64(Ra, Address(Pa), ALIGN_STD);
      vld1_64(Rb, Address(Pb), ALIGN_STD);
      vld1_64(Rm, Address(Pm), ALIGN_STD);
      vld1_64(Rn, Address(Pn), ALIGN_STD);
    }

    void post2(Register i, Register len) {
      FloatRegister t0 = RabAB;

      block_comment("post2");

      vmul_fin(t0, tmp);

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      sub(Rj, i, len);
      lea(Rj, Address(Pm_base, Rj, lsl(LogBytesPerLong), Address::SUB));
      vrev64_64_32(t0, t0);
      vst1_64(t0, Address(Rj), ALIGN_STD);

      // t0 = t1; t1 = t2; t2 = 0;
      shift_t(RabAB);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m. We'll
    // keep doing that until there is no carry. ARM core registers are
    // used for this operation, as this is faster than using NEON.
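    //
    // The sub() referenced in the pseudocode at the end of this class is,
    // approximately, a multi-word subtract-with-borrow (a hedged sketch;
    // the helper is not spelled out in this file):
    //
    //   unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {   // m -= n, word by word
    //       unsigned long m = Pm_base[i], n = Pn_base[i];
    //       unsigned long d = m - n;
    //       Pm_base[i] = d - borrow;
    //       borrow = (m < n) || (d < borrow);
    //     }
    //     return t0 - borrow;               // borrow out of the top word
    //   }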
    void normalize(Register len, Register t0lo, Register t0hi,
                   Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = tmp1, i = tmp2, m = tmp3, n = tmp4, flags = tmp5;
      // let them point to the last 32-bit element now
      add(Pn_base, Pn_base, BytesPerInt);
      add(Pm_base, Pm_base, BytesPerInt);
      orrs(n, t0lo, t0hi);
      b(post, EQ); {
        bind(again); {
          mov(i, 0);
          mov(cnt, len); // each loop iteration processes 64 bits
          ldr(m, Address(Pm_base));
          ldr(n, Address(Pn_base));
          cmp(n, n); // set carry flag, i.e. no borrow
          mrs(flags);
          align(16);
          bind(loop); {
            msr(flags, true, false);
            sbcs(m, m, n);
            str(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB));
            add(i, i, 1);
            ldr(n, Address(Pn_base, i, lsl(LogBytesPerWord), Address::SUB));
            ldr(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB));
            sbcs(m, m, n);
            mrs(flags);
            str(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB));
            add(i, i, 1);
            ldr(n, Address(Pn_base, i, lsl(LogBytesPerWord), Address::SUB));
            ldr(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          msr(flags, true, false);
          sbcs(t0lo, t0lo, 0);
          sbc(t0hi, t0hi, 0);
          orrs(n, t0lo, t0hi);
        } b(again, NE);
      } bind(post);
    }

    void step_squaring() {
      // An extra ACC for A*B
      step();
      vmul_acc2(tmp, RabAB, RaBAb, false);
    }

    void last_squaring(Register i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i, 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        sub(Pa, Pa, BytesPerLong);
        add(Pb, Pb, BytesPerLong);
        vmul_acc1(Ra, Rb, tmp, RabAB, RaBAb);
        vmul_acc2(tmp, RabAB, RaBAb);
      } bind(dont);
    }

    void extra_step_squaring() {
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      sub(Pm, Pm, BytesPerLong);
      add(Pn, Pn, BytesPerLong);
      vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb);
      vld1_64(Rm, Address(Pm), Assembler::ALIGN_STD);
      vld1_64(Rn, Address(Pn), Assembler::ALIGN_STD);
      vmul_acc2(tmp, RabAB, RaBAb);
    }

    void post1_squaring() {
      FloatRegister t0 = RabAB;

      // *Pm = Rm = t0 * inv;
      vmul_fin(t0, tmp);
      vmul_simple(Rm, t0, inv, RaBAb); // RaBAb is tmp
      vrev64_64_32(Rm, Rm);
      vst1_64(Rm, Address(Pm), ALIGN_STD);

      // MACC(Rm, Rn, t0, t1, t2);
      vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb);
      vmul_acc2(tmp, RabAB, RaBAb);

#ifndef PRODUCT
      // assert(t0 == 0, "broken Montgomery multiply");
      {
        vmul_fin(t0, tmp);
        Label ok;
        push(RegSet::of(Ri, Rj), sp);
        vmov_f64(Ri, Rj, t0);
        orr(Ri, Ri, Rj);
        cbz(Ri, ok); {
          stop("broken Montgomery square");
        } bind(ok);
        pop(RegSet::of(Ri, Rj), sp);
      }
#endif

      // t0 = t1; t1 = t2; t2 = 0;
      shift_t(RabAB);
    }

    /**
     * Initializes the accumulators.
     */
    void vmul_init() {
      vmov_128_32(s0, 0);
      vmov_128_32(s1, 0);
      vmov_128_32(s2, 0);
    }
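
    // (Hedged overview, not in the original comments:) with each 64-bit
    // digit split into 32-bit halves, one vmull.u32 on 64-bit inputs
    // produces two 32x32->64 partial products at once: RabAB receives
    // lo(a)*lo(b) and hi(a)*hi(b), while RaBAb (after the vrev of one
    // operand) receives the two cross products. vmul_acc2 then adds the
    // partial products into s0..s2 at their respective 32-bit weights, so
    // no carries have to be propagated until vmul_fin converges the lanes.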
    /**
     * Multiplies unsigned 64-bit a by unsigned 64-bit b, accumulating the
     * result into the temp array (s0-s2). The temp array is not converged
     * into the resulting number; see vmul_fin.
     * Performance-critical part.
     * @param a first operand
     * @param b second operand
     */
    void vmul_acc1(FloatRegister a, FloatRegister b, FloatRegister tmp, FloatRegister RabAB, FloatRegister RaBAb) {
      vrev64_64_32(tmp, b);
      vmull_32u(RabAB, a, b);
      vmull_32u(RaBAb, a, tmp);
    }

    void vmul_acc2(FloatRegister tmp, FloatRegister RabAB, FloatRegister RaBAb, bool trn_aBAb = true) {
      // words 2-0 of accumulator
      vaddw_32u(s0, s0, RabAB->successor(FloatRegisterImpl::DOUBLE));
      if (trn_aBAb) {
        // words 3-1 of accumulator, phase 1
        vtrn_64_32(RaBAb, RaBAb->successor(FloatRegisterImpl::DOUBLE));
      }
      // words 4-2 of accumulator
      vaddw_32u(s2, s2, RabAB);
      // words 3-1 of accumulator, phase 2
      vpadal_128_u32(s1, RaBAb);
    }

    /**
     * Simple unsigned 64-bit multiply of a by b.
     * The least significant 64 bits of the result are written into register
     * res; the rest are discarded.
     * @param res 64-bit result
     * @param a   64-bit operand
     * @param b   64-bit operand
     * @param tmp 128-bit temporary register
     */
    void vmul_simple(FloatRegister res, FloatRegister a, FloatRegister b, FloatRegister tmp) {
      FloatRegister tmp2 = tmp->successor(FloatRegisterImpl::DOUBLE);
      vmull_32u(tmp, a, b);
      vrev64_64_32(tmp2, b);
      vmul_64_32(tmp2, a, tmp2);
      vpaddl_64_u32(tmp2, tmp2);
      vshl_64_64(tmp2, tmp2, 32);
      vadd_64_64(res, tmp, tmp2);
    }

    /**
     * Converges the temp array and returns the least significant 64 bits of
     * the result.
     * @param t0   the register to receive the least significant 64 bits
     * @param tmp1 64-bit temporary register
     */
    void vmul_fin(FloatRegister t0, FloatRegister tmp1) {
      FloatRegister abLow   = s0;
      FloatRegister abHigh  = s0->successor(FloatRegisterImpl::DOUBLE);
      FloatRegister aBAbLow = s1;

      // words 0 and 1
      vshr_64_u64(tmp1, abLow, 32);
      vadd_64_64(tmp1, tmp1, abHigh);
      vadd_64_64(tmp1, tmp1, aBAbLow);
      vmov_64(t0, abLow);
      vsli_64_64(t0, tmp1, 32);
    }

    /**
     * Performs t0 = t1; t1 = t2; t2 = 0; on the accumulator represented
     * as s0-s2.
     * @param tmp 128-bit register
     */
    void shift_t(FloatRegister tmp) {
      FloatRegister s0hi  = s0->successor(FloatRegisterImpl::DOUBLE);
      FloatRegister s1hi  = s1->successor(FloatRegisterImpl::DOUBLE);
      FloatRegister s2hi  = s2->successor(FloatRegisterImpl::DOUBLE);
      FloatRegister tmphi = tmp->successor(FloatRegisterImpl::DOUBLE);
      vshr_64_u64(s0, s0, 32);
      vaddl_32u(tmp, s1, s0hi);
      vadd_64_64(s0, s0, tmp);
      vshr_64_u64(s0, s0, 32);
      vadd_64_64(tmphi, s0, tmphi);
      vaddl_32u(s0, s1hi, s2);
      vadd_64_64(s0, s0, tmphi);
      vmov_64(s1, s2hi);
      vmov_64_32(s1hi, 0);
      vmov_128_32(s2, 0);
    }

  public:
    /**
     * Fast Montgomery multiplication. The derivation of the
     * algorithm is in "A Cryptographic Library for the Motorola
     * DSP56000", Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0 - int64 array elements a
     *   c_rarg1 - int64 array elements b
     *   c_rarg2 - int64 array elements n (the modulus)
     *   c_rarg3 - int64 length
     *   [sp]    - int64 inv
     *   [sp+8]  - int64 array elements m (the result)
     *
     */
    address generate_multiply() {
      Label nothing;
      align(CodeEntryAlignment);
      address entry = pc();

      cbz(Rlen, nothing);

      enter();

      // Push all call-saved registers
      save_regs();

      // load inv and the m array pointer
      add(Ri, rfp, 4);
      vld1_64(inv, Address(Ri), ALIGN_STD);
      ldr(Pm_base, Address(Ri, BytesPerLong));

      lsr(Rlen, Rlen, 1); // length in longwords = len/2

      // let Px_base point to the last 64-bit element of each array
      add(Pa_base, Pa_base, Rlen, lsl(LogBytesPerLong));
      sub(Pa_base, Pa_base, BytesPerLong);
      if (!_squaring) {
        add(Pb_base, Pb_base, Rlen, lsl(LogBytesPerLong));
        sub(Pb_base, Pb_base, BytesPerLong);
      }
      add(Pn_base, Pn_base, Rlen, lsl(LogBytesPerLong));
      sub(Pn_base, Pn_base, BytesPerLong);
      add(Pm_base, Pm_base, Rlen, lsl(LogBytesPerLong));
      sub(Pm_base, Pm_base, BytesPerLong);

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        // Pn, Pm and s0 are used as temporaries
        vld1_64(Rn, Address(Pn_base), Assembler::ALIGN_STD);
        vrev64_64_32(Rn, Rn);
        vmul_simple(tmp, Rn, inv, s0);
        vmov_f64(Pm, Pn, tmp);
        andr(Pm, Pm, Pn);
        cmn(Pm, 1);
        Label ok;
        b(ok, EQ); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      vmul_init();

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, 0); {
        Label loop, end;
        cmp(Ri, Rlen);
        b(end, Assembler::GE);

        bind(loop);
        pre1(Ri);

        block_comment(" for (j = i; j; j--) {"); {
          mov(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post1();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        b(loop, Assembler::LT);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmp(Ri, Rlen, lsl(1));
        b(end, Assembler::GE);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment(" for (j = len*2-i-1; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, lsl(1));
        b(loop, Assembler::LT);
        bind(end);
      }
      block_comment("} // i");

      FloatRegister t0 = RabAB; // use as temporary
      vmul_fin(t0, tmp);
      vmov_f64(Pa, Pb, t0);
      normalize(Rlen, Pa, Pb, Pm, Pn, Ri, Rj, Pa_base);

      restore_regs();
      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
multiply"); 2234 2235 // for (i = 0; i < len; i++) { 2236 // int j; 2237 2238 // Pa = Pa_base; 2239 // Pb = Pb_base + i; 2240 // Pm = Pm_base; 2241 // Pn = Pn_base + i; 2242 2243 // Ra = *Pa; 2244 // Rb = *Pb; 2245 // Rm = *Pm; 2246 // Rn = *Pn; 2247 2248 // int iters = i; 2249 // for (j = 0; iters--; j++) { 2250 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 2251 // MACC(Ra, Rb, t0, t1, t2); 2252 // Ra = *++Pa; 2253 // Rb = *--Pb; 2254 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 2255 // MACC(Rm, Rn, t0, t1, t2); 2256 // Rm = *++Pm; 2257 // Rn = *--Pn; 2258 // } 2259 2260 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 2261 // MACC(Ra, Rb, t0, t1, t2); 2262 // *Pm = Rm = t0 * inv; 2263 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 2264 // MACC(Rm, Rn, t0, t1, t2); 2265 2266 // assert(t0 == 0, "broken Montgomery multiply"); 2267 2268 // t0 = t1; t1 = t2; t2 = 0; 2269 // } 2270 2271 // for (i = len; i < 2*len; i++) { 2272 // int j; 2273 2274 // Pa = Pa_base + i-len; 2275 // Pb = Pb_base + len; 2276 // Pm = Pm_base + i-len; 2277 // Pn = Pn_base + len; 2278 2279 // Ra = *++Pa; 2280 // Rb = *--Pb; 2281 // Rm = *++Pm; 2282 // Rn = *--Pn; 2283 2284 // int iters = len*2-i-1; 2285 // for (j = i-len+1; iters--; j++) { 2286 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 2287 // MACC(Ra, Rb, t0, t1, t2); 2288 // Ra = *++Pa; 2289 // Rb = *--Pb; 2290 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 2291 // MACC(Rm, Rn, t0, t1, t2); 2292 // Rm = *++Pm; 2293 // Rn = *--Pn; 2294 // } 2295 2296 // Pm_base[i-len] = t0; 2297 // t0 = t1; t1 = t2; t2 = 0; 2298 // } 2299 2300 // while (t0) 2301 // t0 = sub(Pm_base, Pn_base, t0, len); 2302 // } 2303 2304 /** 2305 * Fast Montgomery squaring. This uses asymptotically 25% fewer 2306 * multiplies than Montgomery multiplication so it should be up to 2307 * 25% faster. However, its loop control is more complex and it 2308 * may actually run slower on some machines. 
    /**
     * Fast Montgomery squaring. This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster. However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0 - int64 array elements a
     *   c_rarg1 - int64 array elements n (the modulus)
     *   c_rarg2 - int64 length
     *   [sp]    - int64 inv
     *   [sp+8]  - int64 array elements m (the result)
     *
     */
    address generate_square() {
      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      save_regs();

      // load inv and the m array pointer
      add(Ri, rfp, 4);
      vld1_64(inv, Address(Ri), ALIGN_STD);
      ldr(Pm_base, Address(Ri, BytesPerLong));

      lsr(Rlen, Rlen, 1); // length in longwords = len/2

      // let Px_base point to the last 64-bit element of each array
      add(Pa_base, Pa_base, Rlen, lsl(LogBytesPerLong));
      sub(Pa_base, Pa_base, BytesPerLong);
      add(Pn_base, Pn_base, Rlen, lsl(LogBytesPerLong));
      sub(Pn_base, Pn_base, BytesPerLong);
      add(Pm_base, Pm_base, Rlen, lsl(LogBytesPerLong));
      sub(Pm_base, Pm_base, BytesPerLong);

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        // Pn, Pm and s0 are used as temporaries
        vld1_64(Rn, Address(Pn_base), Assembler::ALIGN_STD);
        vrev64_64_32(Rn, Rn);
        vmul_simple(tmp, Rn, inv, s0);
        vmov_f64(Pm, Pn, tmp);
        andr(Pm, Pm, Pn);
        cmn(Pm, 1);
        Label ok;
        b(ok, EQ); {
          stop("broken inverse in Montgomery square");
        } bind(ok);
      }
#endif

      vmul_init();

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, 0); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        b(end, GE);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        b(loop, LT);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, lsl(1));
        b(end, GE);

        pre2(Ri, Rlen);

        block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, lsl(1));

        b(loop, LT);
        bind(end);
        block_comment("} // i");
      }

      FloatRegister t0 = RabAB; // use as temporary
      vmul_fin(t0, tmp);
      vmov_f64(Pa, Pb, t0);
      normalize(Rlen, Pa, Pb, Pm, Pn, Ri, Rj, Pa_base);

      restore_regs();
      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };

  // Initialization
  void generate_initial() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // set the table address before generating the stubs that use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch32::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(false);
    }

    if (UseCRC32CIntrinsics) {
      // set the table address before generating the stubs that use it
      StubRoutines::_crc32c_table_addr = (address)StubRoutines::aarch32::_crc32c_table;
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32(true);
    }

    if (UseAESIntrinsics) {
      // set the table addresses before generating the stubs that use them
      StubRoutines::_aes_table_te_addr = (address)StubRoutines::aarch32::_aes_te_table;
      StubRoutines::_aes_table_td_addr = (address)StubRoutines::aarch32::_aes_td_table;

      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();

      if (UseNeon) {
        // The AES CBC implementation uses NEON instructions
        StubRoutines::_cipherBlockChaining_encryptAESCrypt_special = generate_cipherBlockChaining_encryptAESCrypt(false);
        StubRoutines::_cipherBlockChaining_decryptAESCrypt_special = generate_cipherBlockChaining_decryptAESCrypt(false);
        StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(true);
        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(true);
      }
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_table_addr = (address)StubRoutines::aarch32::_sha1_table;
      StubRoutines::_sha1_implCompress = generate_sha_implCompress();
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_table_addr = (address)StubRoutines::aarch32::_sha256_table;
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress();
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_table_addr = (address)StubRoutines::aarch32::_sha512_table;
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress();
    }

    NativeCall::init();
  }
#undef __
#define __ _masm->

#ifdef COMPILER2
  address generate_idiv_irem_stub(const char *name, bool want_mod) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    // C2 knows this kills rscratch1 and rscratch2, so we do not save them

    __ divide(r0, r1, r2, 32, want_mod);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  // Support for uint StubRoutine::Arm::partial_subtype_check(Klass sub, Klass super);
  // Arguments:
  //
  //   ret    : R0, returned
  //   icc/xcc: set as R0 (depending on wordSize)
  //   sub    : R1, argument, not changed
  //   super  : R2, argument, not changed
  //   raddr  : LR, blown by call
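  //
  // In C, approximately (a hedged sketch of the slow path emitted below,
  // not a literal copy of any shared-runtime routine):
  //
  //   uint partial_subtype_check(Klass* sub, Klass* super) {
  //     Array<Klass*>* secondary_supers = sub->secondary_supers();
  //     for (int i = 0; i < secondary_supers->length(); i++) {
  //       if (secondary_supers->at(i) == super) {
  //         sub->set_secondary_super_cache(super); // cache the hit
  //         return 0;                              // success: R0 == 0, flags EQ
  //       }
  //     }
  //     return 1;                                  // failure: R0 != 0, flags NE
  //   }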
  address generate_partial_subtype_check() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
    address start = __ pc();

    // based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops)

    // R0 is used as a tmp_reg (in addition to the return reg)
    Register sub_klass   = r1;
    Register super_klass = r2;
    Register tmp_reg2    = r3;
    Register tmp_reg3    = r4;

    // inc_counter_np kills rscratch1 and rscratch2
#define saved_set RegSet::of(tmp_reg2, tmp_reg3, rscratch1, rscratch2)

    Label L_loop, L_fail;

    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());

    // the fast check should be redundant here

    // slow check
    {
      __ push(saved_set, sp);

      // a couple of useful fields in sub_klass:
      int ss_offset = in_bytes(Klass::secondary_supers_offset());

      // Do a linear scan of the secondary super-klass chain.
      // This code is rarely used, so simplicity is a virtue here.

      inc_counter_np(SharedRuntime::_partial_subtype_ctr);

      Register scan_temp  = tmp_reg2;
      Register count_temp = tmp_reg3;

      // We will consult the secondary-super array.
      __ ldr(scan_temp, Address(sub_klass, ss_offset));

      Register search_key = super_klass;

      // Load the array length.
      __ ldr(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
      __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes());

      __ add(count_temp, count_temp, 1);

      // Top of search loop
      __ bind(L_loop);
      // Notes:
      //   scan_temp starts at the array elements
      //   count_temp is 1+size
      __ subs(count_temp, count_temp, 1);
      __ b(L_fail, Assembler::EQ); // not found in the array

      // Load the next super to check.
      // In the array of super classes elements are pointer sized.
      int element_size = wordSize;
      __ ldr(r0, __ post(scan_temp, element_size));

      // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
      __ subs(r0, r0, search_key); // set R0 to 0 on success (and flags to eq)

      // A miss means we are NOT a subtype and need to keep looping
      __ b(L_loop, Assembler::NE);

      // Falling out the bottom means we found a hit; we ARE a subtype

      // Success. Cache the super we found and proceed in triumph.
      __ str(super_klass, Address(sub_klass, sc_offset));

      // Return success
      // R0 is already 0 and flags are already set to eq
      __ pop(saved_set, sp);
      __ ret(lr);

      // Return failure
      __ bind(L_fail);
      __ movs_i(r0, 1); // sets the flags
      __ pop(saved_set, sp);
      __ ret(lr);
    }
    return start;
  }
#undef saved_set

  address generate_string_compress_neon() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "string_compress_neon");
    address start = __ pc();

    Register src = r2;
    Register dst = r1;
    Register len = r3;
    Register t   = r9;
    Register t2  = r12;
    FloatRegister a1 = d0;
    FloatRegister a2 = d1;
    FloatRegister b1 = d2;
    FloatRegister b2 = d3;
    Register result = r0;

    Label Lloop2, Lset_result;

    __ sub(len, len, 8+16);
    __ vld1_64(a1, a2, __ post(src, 16), Assembler::ALIGN_STD);
    __ bind(Lloop2); {
      __ vld1_64(b1, __ post(src, 8), Assembler::ALIGN_STD);
      __ vuzp_64_8(a1, a2); // a1 now has the lower bytes, a2 the upper
      __ vld1_64(b2, __ post(src, 8), Assembler::ALIGN_STD);
      __ vmov_f64(t, t2, a2);
      __ vst1_64(a1, __ post(dst, 8), Assembler::ALIGN_STD);
      __ orrs(t, t, t2);
      __ b(Lset_result, Assembler::NE);

      __ vld1_64(a1, __ post(src, 8), Assembler::ALIGN_STD);
      __ vuzp_64_8(b1, b2); // b1 now has the lower bytes, b2 the upper
      __ vld1_64(a2, __ post(src, 8), Assembler::ALIGN_STD);
      __ vmov_f64(t, t2, b2);
      __ vst1_64(b1, __ post(dst, 8), Assembler::ALIGN_STD);
      __ orrs(t, t, t2);
      __ b(Lset_result, Assembler::NE);
      __ subs(len, len, 16);
      __ b(Lloop2, Assembler::GE);
    }

    __ vuzp_64_8(a1, a2); // a1 now has the lower bytes, a2 the upper
    __ vmov_f64(t, t2, a2);
    __ vst1_64(a1, __ post(dst, 8), Assembler::ALIGN_STD);
    __ orrs(t, t, t2);
    __ b(Lset_result, Assembler::NE);
    __ adds(len, len, 16);
    __ ret(lr); // leaves the Z flag set for the per-char slow case check

    __ bind(Lset_result);
    __ movs_i(result, 0, Assembler::NE); // sets the Z flag
    __ ret(lr);

    return start;
  }

  address generate_string_inflate_neon() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "string_inflate_neon");
    address start = __ pc();

    Register src = r0;
    Register dst = r1;
    Register len = r2;
    FloatRegister a1 = d0;

    Label Lloop2;

    __ sub(len, len, 16);
    __ bind(Lloop2); {
      __ vld1_64(d0, __ post(src, 8), Assembler::ALIGN_STD);
      __ vmovl_8u(q0, d0);
      __ vst1_64(d0, d1, __ post(dst, 16), Assembler::ALIGN_STD);
      __ vld1_64(d0, __ post(src, 8), Assembler::ALIGN_STD);
      __ vmovl_8u(q0, d0);
      __ vst1_64(d0, d1, __ post(dst, 16), Assembler::ALIGN_STD);
      __ subs(len, len, 16);
      __ b(Lloop2, Assembler::HS);
    }

    __ adds(len, len, 16); // sets the Z flag for the check in the intrinsic
    __ ret(lr);

    return start;
  }
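
  // In C, approximately (a hedged sketch of the inflate loop above; the
  // intrinsic caller is assumed to guarantee at least 16 characters and to
  // handle the remainder indicated by the Z flag):
  //
  //   len -= 16;
  //   do {
  //     for (int k = 0; k < 16; k++)       // widen 16 bytes per iteration
  //       dst[k] = (jchar)(unsigned char)src[k];
  //     src += 16; dst += 16;
  //   } while ((len -= 16) >= 0);
  //   len += 16;                           // chars left for the slow path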

  void generate_c2_stubs() {
    StubRoutines::aarch32::_idiv_entry =
      generate_idiv_irem_stub("idiv_c2_stub", false);
    StubRoutines::aarch32::_irem_entry =
      generate_idiv_irem_stub("irem_c2_stub", true);
    StubRoutines::aarch32::_partial_subtype_check =
      generate_partial_subtype_check();
    if (VM_Version::features() & FT_AdvSIMD) {
      StubRoutines::aarch32::_string_compress_neon =
        generate_string_compress_neon();
      StubRoutines::aarch32::_string_inflate_neon =
        generate_string_inflate_neon();
    }
  }
#endif

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by the compilers
    generate_arraycopy_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
      StubRoutines::_mulAdd = generate_mulAdd();
    }
#endif

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      StubRoutines::_montgomerySquare = g.generate_square();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);

#ifdef COMPILER2
    generate_c2_stubs();
#endif
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}