1 /*
2 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
2226 if (prolog_ptr)
2227 patch_end[-2] = (u_int64_t)prolog_ptr;
2228 patch_end[-1] = calltype;
2229 }
2230 #endif
2231
2232 void MacroAssembler::push_CPU_state() {
     // Emits code that saves register state on the stack: first the integer
     // registers selected by the push mask, then the 32 floating-point
     // registers as 16 pairs. pop_CPU_state() undoes this in reverse order.
2233 push(0x3fffffff, sp); // integer registers except lr & sp (mask bits 0-29 set)
2234
     // Store the pairs (i, i+1) from (30, 31) down to (0, 1); every stpd uses
     // pre-indexed addressing that moves sp down by two words before storing.
2235 for (int i = 30; i >= 0; i -= 2)
2236 stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2237 Address(pre(sp, -2 * wordSize)));
2238 }
2239
2240 void MacroAssembler::pop_CPU_state() {
     // Emits code that restores the state saved by push_CPU_state(): the
     // floating-point pairs are reloaded first, then the integer registers.
     // Pairs are loaded from (0, 1) up to (30, 31), the reverse of the store
     // order; every ldpd uses post-indexed addressing that moves sp up by two
     // words after loading.
2241 for (int i = 0; i < 32; i += 2)
2242 ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2243 Address(post(sp, 2 * wordSize)));
2244
2245 pop(0x3fffffff, sp); // integer registers except lr & sp (mask bits 0-29 set)
2246 }
2247
2248 /**
2249 * Emits code to update CRC-32 with a byte value according to constants in table
2250 *
2251 * @param [in,out]crc Register containing the crc.
2252 * @param [in]val Register containing the byte to fold into the CRC.
     *        Note: val is overwritten; it is reused for the table index and
     *        then for the loaded table entry.
2253 * @param [in]table Register containing the table of crc constants.
2254 *
     * C equivalent of the emitted sequence:
2255 * uint32_t crc;
2256 * val = crc_table[(val ^ crc) & 0xFF];
2257 * crc = val ^ (crc >> 8);
2258 *
2259 */
2260 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2261 eor(val, val, crc); // val ^= crc
2262 andr(val, val, 0xff); // keep only the low byte: the table index
2263 ldrw(val, Address(table, val, Address::lsl(2))); // 32-bit load from table + (val << 2)
2264 eor(crc, val, crc, Assembler::LSR, 8); // crc = table entry ^ (crc >> 8)
2265 }
|
1 /*
2 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
2226 if (prolog_ptr)
2227 patch_end[-2] = (u_int64_t)prolog_ptr;
2228 patch_end[-1] = calltype;
2229 }
2230 #endif
2231
2232 void MacroAssembler::push_CPU_state() {
     // Emits code that saves register state on the stack: first the integer
     // registers selected by the push mask, then the 32 floating-point
     // registers as 16 pairs. pop_CPU_state() undoes this in reverse order.
2233 push(0x3fffffff, sp); // integer registers except lr & sp (mask bits 0-29 set)
2234
     // Store the pairs (i, i+1) from (30, 31) down to (0, 1); every stpd uses
     // pre-indexed addressing that moves sp down by two words before storing.
2235 for (int i = 30; i >= 0; i -= 2)
2236 stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2237 Address(pre(sp, -2 * wordSize)));
2238 }
2239
2240 void MacroAssembler::pop_CPU_state() {
     // Emits code that restores the state saved by push_CPU_state(): the
     // floating-point pairs are reloaded first, then the integer registers.
     // Pairs are loaded from (0, 1) up to (30, 31), the reverse of the store
     // order; every ldpd uses post-indexed addressing that moves sp up by two
     // words after loading.
2241 for (int i = 0; i < 32; i += 2)
2242 ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2243 Address(post(sp, 2 * wordSize)));
2244
2245 pop(0x3fffffff, sp); // integer registers except lr & sp (mask bits 0-29 set)
2246 }
2247
2248 /**
2249 * Helpers for multiply_to_len().
     *
     * add2_with_carry emits code computing
     *   final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2
     * Each addend is added into dest_lo and the resulting carry-out is folded
     * into the high word. dest_hi is itself updated by the first carry, and
     * the condition flags are modified.
2250 */
2251 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2252 Register src1, Register src2) {
2253 adds(dest_lo, dest_lo, src1); // dest_lo += src1, setting the carry flag
2254 adc(dest_hi, dest_hi, zr); // dest_hi += carry
2255 adds(dest_lo, dest_lo, src2); // dest_lo += src2, setting the carry flag
2256 adc(final_dest_hi, dest_hi, zr); // final_dest_hi = dest_hi + carry
2257 }
2258
2259 // Generate an address from (r + r1 extend offset). "size" is the
2260 // size of the operand. The result may be in rscratch2.
     //
     // When offset is non-zero, or the extend's shift amount is not a multiple
     // of size, the extended sum of r and r1 is first materialized into
     // rscratch2 (clobbering it) and the returned address is rscratch2 + offset.
     // Otherwise the register-offset address is returned as-is and this
     // function makes no assembler call.
     // NOTE(review): ext.shift() is compared against size directly; shift looks
     // like a log2 amount while size looks like a byte count -- confirm the
     // units are meant to match.
2261 Address MacroAssembler::offsetted_address(Register r, Register r1,
2262 Address::extend ext, int offset, int size) {
2263 if (offset || (ext.shift() % size != 0)) {
2264 lea(rscratch2, Address(r, r1, ext)); // rscratch2 = r + extended r1
2265 return Address(rscratch2, offset);
2266 } else {
2267 return Address(r, r1, ext);
2268 }
2269 }
2270
2271 /**
2272 * Multiply 64 bit by 64 bit first loop.
     *
     * Emits the first pass of the multiplication: one chunk of x is multiplied
     * by successive chunks of y (idx counting down), the running carry is
     * added, and the low half of each product is stored into z at kdx.
     *
     * Registers written: xstart, x_xstart, y_idx, product, idx, kdx, carry,
     * rscratch1 and (through offsetted_address) possibly rscratch2; the
     * condition flags are modified as well. On exit carry holds the high half
     * of the last product; this function does not store it into z.
2273 */
2274 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2275 Register y, Register y_idx, Register z,
2276 Register carry, Register product,
2277 Register idx, Register kdx) {
2278 //
2279 // jlong carry, x[], y[], z[];
2280 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2281 // huge_128 product = y[idx] * x[xstart] + carry;
2282 // z[kdx] = (jlong)product;
2283 // carry = (jlong)(product >>> 64);
2284 // }
2285 // z[xstart] = carry;
2286 //
2287
2288 Label L_first_loop, L_first_loop_exit;
2289 Label L_one_x, L_one_y, L_multiply;
2290
     // Step xstart back by one int. A negative result means there is no full
     // 64-bit chunk of x; L_one_x then loads the single 32-bit word x[0].
2291 subsw(xstart, xstart, 1);
2292 br(Assembler::MI, L_one_x);
2293
     // Load the 64 bits (two ints) of x that start at int index xstart.
2294 lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2295 ldr(x_xstart, Address(rscratch1));
2296 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian (swaps the 32-bit halves)
2297
2298 bind(L_first_loop);
     // idx is decremented twice per iteration (two ints per 64-bit chunk):
     // negative after the first step -> y is exhausted, leave the loop;
     // negative after the second step -> only y[0] is left, handled at L_one_y.
2299 subsw(idx, idx, 1);
2300 br(Assembler::MI, L_first_loop_exit);
2301 subsw(idx, idx, 1);
2302 br(Assembler::MI, L_one_y);
2303 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2304 ldr(y_idx, Address(rscratch1));
2305 ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2306 bind(L_multiply);
2307
2308 // AArch64 has multiply-accumulate instruction that we can't use
2309 // here because it has no way to process carries so we have to use
2310 // separate add and adc instructions. Bah.
2311 umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2312 mul(product, x_xstart, y_idx);
2313 adds(product, product, carry);
2314 adc(carry, rscratch1, zr); // x_xstart * y_idx + carry -> carry:product
2315
     // Move the destination index back by two ints and store the low 64 bits.
2316 subw(kdx, kdx, 2);
2317 ror(product, product, 32); // back to big-endian
2318 str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerWord));
2319
2320 b(L_first_loop);
2321
     // Single remaining 32-bit word of y: load y[0] and multiply it. The next
     // pass through L_first_loop decrements idx again and exits.
2322 bind(L_one_y);
2323 ldrw(y_idx, Address(y, 0));
2324 b(L_multiply);
2325
     // Single 32-bit word of x: load x[0] and enter the loop.
2326 bind(L_one_x);
2327 ldrw(x_xstart, Address(x, 0));
2328 b(L_first_loop);
2329
2330 bind(L_first_loop_exit);
2331 }
2332
2333 /**
2334 * Multiply 128 bit by 128. Unrolled inner loop.
2335 *
     * Emits the inner loop that accumulates product_hi * y into z. The
     * unrolled body consumes four ints (two 64-bit chunks) of y and z per
     * iteration; afterwards a tail handles a remaining two-int chunk and/or a
     * single remaining int.
     *
     * carry is both input and output. idx is consumed. jdx, yz_idx1, yz_idx2,
     * tmp, tmp3, tmp4, tmp6, carry2, rscratch1, rscratch2 and the condition
     * flags are written as scratch.
2336 */
2337 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2338 Register carry, Register carry2,
2339 Register idx, Register jdx,
2340 Register yz_idx1, Register yz_idx2,
2341 Register tmp, Register tmp3, Register tmp4,
2342 Register tmp6, Register product_hi) {
2343
2344 // jlong carry, x[], y[], z[];
2345 // int kdx = ystart+1;
2346 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2347 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2348 // jlong carry2 = (jlong)(tmp3 >>> 64);
2349 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2;
2350 // carry = (jlong)(tmp4 >>> 64);
2351 // z[kdx+idx+1] = (jlong)tmp3;
2352 // z[kdx+idx] = (jlong)tmp4;
2353 // }
2354 // idx += 2;
2355 // if (idx > 0) {
2356 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2357 // z[kdx+idx] = (jlong)yz_idx1;
2358 // carry = (jlong)(yz_idx1 >>> 64);
2359 // }
2360 //
2361
2362 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2363
     // jdx = idx / 4: number of unrolled iterations (four ints each).
2364 lsrw(jdx, idx, 2);
2365
2366 bind(L_third_loop);
2367
2368 subsw(jdx, jdx, 1);
2369 br(Assembler::MI, L_third_loop_exit);
2370 subw(idx, idx, 4);
2371
     // Load two 64-bit chunks of y starting at int index idx.
2372 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2373
2374 ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2375
     // tmp6 = address of the matching chunks in z (read here, written back below).
2376 lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2377
2378 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2379 ror(yz_idx2, yz_idx2, 32);
2380
2381 ldp(rscratch2, rscratch1, Address(tmp6, 0));
2382
2383 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
2384 umulh(tmp4, product_hi, yz_idx1);
2385
2386 ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2387 ror(rscratch2, rscratch2, 32);
2388
2389 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp
2390 umulh(carry2, product_hi, yz_idx2);
2391
2392 // propagate sum of both multiplications into carry:tmp4:tmp3
2393 adds(tmp3, tmp3, carry);
2394 adc(tmp4, tmp4, zr);
2395 adds(tmp3, tmp3, rscratch1);
2396 adcs(tmp4, tmp4, tmp);
2397 adc(carry, carry2, zr);
2398 adds(tmp4, tmp4, rscratch2);
2399 adc(carry, carry, zr);
2400
2401 ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2402 ror(tmp4, tmp4, 32);
2403 stp(tmp4, tmp3, Address(tmp6, 0));
2404
2405 b(L_third_loop);
2406 bind (L_third_loop_exit);
2407
     // Tail: idx & 3 ints are left over; nothing to do when that is zero.
2408 andw (idx, idx, 0x3);
2409 cbz(idx, L_post_third_loop_done);
2410
     // If at least two ints remain, process one more 64-bit chunk.
2411 Label L_check_1;
2412 subsw(idx, idx, 2);
2413 br(Assembler::MI, L_check_1);
2414
2415 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2416 ldr(yz_idx1, Address(rscratch1, 0));
2417 ror(yz_idx1, yz_idx1, 32);
2418 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
2419 umulh(tmp4, product_hi, yz_idx1);
2420 lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2421 ldr(yz_idx2, Address(rscratch1, 0));
2422 ror(yz_idx2, yz_idx2, 32);
2423
     // carry:tmp3 = tmp4:tmp3 + carry + yz_idx2 (the z chunk)
2424 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2425
2426 ror(tmp3, tmp3, 32);
2427 str(tmp3, Address(rscratch1, 0));
2428
2429 bind (L_check_1);
2430
     // The low bit of idx is unchanged by the subtraction of 2 above, so
     // idx & 1 tells whether a single int is still left; it is at index 0.
2431 andw (idx, idx, 0x1);
2432 subsw(idx, idx, 1);
2433 br(Assembler::MI, L_post_third_loop_done);
2434 ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2435 mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3
2436 umulh(carry2, tmp4, product_hi);
2437 ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2438
     // Four-argument overload, defined outside this excerpt; presumably
     // carry2:tmp3 += tmp4 + carry -- confirm against its definition.
2439 add2_with_carry(carry2, tmp3, tmp4, carry);
2440
     // Store the low 32 bits; the new carry is carry2:tmp3 shifted right by 32.
2441 strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2442 extr(carry, carry2, tmp3, 32);
2443
2444 bind(L_post_third_loop_done);
2445 }
2446
2447 /**
2448 * Code for BigInteger::multiplyToLen() intrinsic.
2449 *
     * Emits the complete multiplication: a first pass over y with one chunk
     * of x (multiply_64_x_64_loop), then an outer loop over the remaining
     * chunks of x, each of which runs multiply_128_x_128_loop to accumulate
     * into z. The input registers xlen, zlen, x, ylen and z are reused as
     * scratch along the way; a four-word stack frame preserves those that
     * are needed again.
     *
2450 * r0: x
2451 * r1: xlen
2452 * r2: y
2453 * r3: ylen
2454 * r4: z
2455 * r5: zlen
2456 * r10: tmp1
2457 * r11: tmp2
2458 * r12: tmp3
2459 * r13: tmp4
2460 * r14: tmp5
2461 * r15: tmp6
2462 * r16: tmp7
     *      NOTE(review): the 13th parameter is named product_hi; presumably it
     *      is what this list calls tmp7 -- confirm against the caller.
2463 *
2464 */
2465 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2466 Register z, Register zlen,
2467 Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2468 Register tmp5, Register tmp6, Register product_hi) {
2469
     // NOTE(review): product_hi is not included in this distinctness check.
2470 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2471
2472 const Register idx = tmp1;
2473 const Register kdx = tmp2;
2474 const Register xstart = tmp3;
2475
2476 const Register y_idx = tmp4;
2477 const Register carry = tmp5;
     // xlen and zlen are copied into xstart and kdx below before the first
     // loop runs, so their registers can be reused as scratch.
2478 const Register product = xlen;
2479 const Register x_xstart = zlen; // reuse register
2480
2481 // First Loop.
2482 //
2483 // final static long LONG_MASK = 0xffffffffL;
2484 // int xstart = xlen - 1;
2485 // int ystart = ylen - 1;
2486 // long carry = 0;
2487 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2488 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2489 // z[kdx] = (int)product;
2490 // carry = product >>> 32;
2491 // }
2492 // z[xstart] = (int)carry;
2493 //
2494
2495 movw(idx, ylen); // idx = ylen;
2496 movw(kdx, zlen); // kdx = xlen+ylen;
2497 mov(carry, zr); // carry = 0;
2498
2499 Label L_done;
2500
     // xstart = xlen - 1; a negative value (xlen == 0) skips everything.
2501 movw(xstart, xlen);
2502 subsw(xstart, xstart, 1);
2503 br(Assembler::MI, L_done);
2504
2505 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
2506
     // Store the carry left by the first loop into the next lower int(s) of z:
     // nothing if kdx is 0, one int if kdx is 1, otherwise two ints (low
     // 32 bits first, then the high 32 bits one index lower).
2507 Label L_second_loop;
2508 cbzw(kdx, L_second_loop);
2509
2510 Label L_carry;
2511 subw(kdx, kdx, 1);
2512 cbzw(kdx, L_carry);
2513
2514 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2515 lsr(carry, carry, 32);
2516 subw(kdx, kdx, 1);
2517
2518 bind(L_carry);
2519 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2520
2521 // Second and third (nested) loops.
2522 //
2523 // for (int i = xstart-1; i >= 0; i--) { // Second loop
2524 // carry = 0;
2525 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
2526 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
2527 // (z[k] & LONG_MASK) + carry;
2528 // z[k] = (int)product;
2529 // carry = product >>> 32;
2530 // }
2531 // z[i] = (int)carry;
2532 // }
2533 //
2534 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
2535
2536 const Register jdx = tmp1;
2537
2538 bind(L_second_loop);
2539 mov(carry, zr); // carry = 0;
2540 movw(jdx, ylen); // j = ystart+1
2541
2542 subsw(xstart, xstart, 1); // i = xstart-1;
2543 br(Assembler::MI, L_done);
2544
     // Open a four-word stack frame and save the original z in its first slot;
     // z is repointed below for the inner loop.
2545 str(z, Address(pre(sp, -4 * wordSize)));
2546
2547 Label L_last_x;
2548 lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
2549 subsw(xstart, xstart, 1); // i = xstart-1;
2550 br(Assembler::MI, L_last_x);
2551
     // Load the next 64-bit chunk of x as the multiplier for the inner loop.
2552 lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
2553 ldr(product_hi, Address(rscratch1));
2554 ror(product_hi, product_hi, 32); // convert big-endian to little-endian
2555
2556 Label L_third_loop_prologue;
2557 bind(L_third_loop_prologue);
2558
     // Save ylen, x and xstart in the remaining frame slots: the call below
     // passes x as carry2, ylen as jdx and tmp3 (xstart) as a temporary, all
     // of which the inner loop overwrites.
2559 str(ylen, Address(sp, wordSize));
2560 stp(x, xstart, Address(sp, 2 * wordSize));
2561 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
2562 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
     // Restore z and ylen, then x and the saved xstart, releasing the frame.
2563 ldp(z, ylen, Address(post(sp, 2 * wordSize)));
2564 ldp(x, xlen, Address(post(sp, 2 * wordSize))); // copy old xstart -> xlen
2565
     // Store the inner loop's carry: low 32 bits at z[xlen + 1], then the
     // high 32 bits at z[xlen]. The decrement leaves tmp3 (xstart) equal to
     // the saved xstart for the next outer iteration.
2566 addw(tmp3, xlen, 1);
2567 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2568 subsw(tmp3, tmp3, 1);
2569 br(Assembler::MI, L_done);
2570
2571 lsr(carry, carry, 32);
2572 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2573 b(L_second_loop);
2574
2575 // Next infrequent code is moved outside loops.
     // Only one 32-bit word of x is left: use x[0] as the multiplier.
2576 bind(L_last_x);
2577 ldrw(product_hi, Address(x, 0));
2578 b(L_third_loop_prologue);
2579
2580 bind(L_done);
2581 }
2582
2583 /**
2584 * Emits code to update CRC-32 with a byte value according to constants in table
2585 *
2586 * @param [in,out]crc Register containing the crc.
2587 * @param [in]val Register containing the byte to fold into the CRC.
     *        Note: val is overwritten; it is reused for the table index and
     *        then for the loaded table entry.
2588 * @param [in]table Register containing the table of crc constants.
2589 *
     * C equivalent of the emitted sequence:
2590 * uint32_t crc;
2591 * val = crc_table[(val ^ crc) & 0xFF];
2592 * crc = val ^ (crc >> 8);
2593 *
2594 */
2595 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2596 eor(val, val, crc); // val ^= crc
2597 andr(val, val, 0xff); // keep only the low byte: the table index
2598 ldrw(val, Address(table, val, Address::lsl(2))); // 32-bit load from table + (val << 2)
2599 eor(crc, val, crc, Assembler::LSR, 8); // crc = table entry ^ (crc >> 8)
2600 }
|