hotspot Sdiff src/cpu/aarch64/vm

src/cpu/aarch64/vm/macroAssembler_aarch64.cpp

rev 8067 : 8077615: AARCH64: Add C2 intrinsic for BigInteger::multiplyToLen() method
Summary: Add C2 intrinsic for BigInteger::multiplyToLen() on AArch64.
Reviewed-by: kvn

   1 /*
   2  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *

2226   if (prolog_ptr)
2227     patch_end[-2] = (u_int64_t)prolog_ptr;
2228   patch_end[-1] = calltype;
2229 }
2230 #endif
2231 
2232 void MacroAssembler::push_CPU_state() {
         // Emits code that saves CPU state on the stack: first the integer
         // registers selected by the mask below, then floating-point
         // registers 0..31.  pop_CPU_state() emits the matching restore.
2233     push(0x3fffffff, sp);         // mask has bits 0..29 set: integer registers except lr & sp
2234 
         // FP registers are stored as pairs (i, i+1), highest pair first; each
         // store pre-decrements sp by two words.
2235     for (int i = 30; i >= 0; i -= 2)
2236       stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2237            Address(pre(sp, -2 * wordSize)));
2238 }
2239 
2240 void MacroAssembler::pop_CPU_state() {
2241   for (int i = 0; i < 32; i += 2)
2242     ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2243          Address(post(sp, 2 * wordSize)));
2244 
2245   pop(0x3fffffff, sp);         // integer registers except lr & sp















































































































































































































































































































































2246 }
2247 
2248 /**
2249  * Emits code to update CRC-32 with a byte value according to constants in table
2250  *
2251  * @param [in,out]crc   Register containing the crc.
2252  * @param [in]val       Register containing the byte to fold into the CRC.
      *                      It is overwritten with the loaded table entry.
2253  * @param [in]table     Register containing the table of crc constants.
2254  *
2255  * uint32_t crc;
2256  * val = crc_table[(val ^ crc) & 0xFF];
2257  * crc = val ^ (crc >> 8);
2258  *
2259  */
2260 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2261   eor(val, val, crc);                               // val ^= crc
2262   andr(val, val, 0xff);                             // keep only the low byte as the table index
2263   ldrw(val, Address(table, val, Address::lsl(2)));  // 32-bit load from table + (val << 2)
2264   eor(crc, val, crc, Assembler::LSR, 8);            // crc = val ^ (crc >> 8)
2265 }

   1 /*
   2  * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *

2226   if (prolog_ptr)
2227     patch_end[-2] = (u_int64_t)prolog_ptr;
2228   patch_end[-1] = calltype;
2229 }
2230 #endif
2231 
2232 void MacroAssembler::push_CPU_state() {
         // Emits code that saves CPU state on the stack: first the integer
         // registers selected by the mask below, then floating-point
         // registers 0..31.  pop_CPU_state() emits the matching restore.
2233     push(0x3fffffff, sp);         // mask has bits 0..29 set: integer registers except lr & sp
2234 
         // FP registers are stored as pairs (i, i+1), highest pair first; each
         // store pre-decrements sp by two words.
2235     for (int i = 30; i >= 0; i -= 2)
2236       stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2237            Address(pre(sp, -2 * wordSize)));
2238 }
2239 
2240 void MacroAssembler::pop_CPU_state() {
       // Emits the inverse of push_CPU_state(): FP register pairs (i, i+1) are
       // reloaded lowest pair first, each load post-incrementing sp by two
       // words, and the integer registers are popped last.
2241   for (int i = 0; i < 32; i += 2)
2242     ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2243          Address(post(sp, 2 * wordSize)));
2244 
2245   pop(0x3fffffff, sp);         // integer registers except lr & sp
2246 }
2247 
2248 /**
2249  * Helpers for multiply_to_len().
      *
      * add2_with_carry emits a double-word accumulate: src1 and then src2 are
      * added to dest_lo, and the carry out of each addition is added to the
      * high word.
      *
      * final_dest_hi  receives the high word after both carries.
      * dest_hi        high word on entry; it is also updated with the carry
      *                from the first addition.
      * dest_lo        low word, updated in place.
      * src1, src2     the two single-word addends.
2250  */
2251 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2252                                      Register src1, Register src2) {
2253   adds(dest_lo, dest_lo, src1);     // dest_lo += src1, setting the carry flag
2254   adc(dest_hi, dest_hi, zr);        // dest_hi += carry
2255   adds(dest_lo, dest_lo, src2);     // dest_lo += src2, setting the carry flag
2256   adc(final_dest_hi, dest_hi, zr);  // final_dest_hi = dest_hi + carry
2257 }
2258 
2259 // Generate an address from (r + r1 extend offset).  "size" is the
2260 // size of the operand.  The result may be in rscratch2.
     //
     // When offset is non-zero, or ext.shift() is not a multiple of size, this
     // emits a lea that materializes (r + extended r1) in rscratch2 and returns
     // the address [rscratch2 + offset].  Otherwise no code is emitted and the
     // register-plus-extended-register address is returned directly.
     // NOTE(review): the test compares a shift amount against an operand size
     // in bytes -- confirm this is the intended condition.
2261 Address MacroAssembler::offsetted_address(Register r, Register r1,
2262                                           Address::extend ext, int offset, int size) {
2263   if (offset || (ext.shift() % size != 0)) {
2264     lea(rscratch2, Address(r, r1, ext));  // overwrites rscratch2
2265     return Address(rscratch2, offset);
2266   } else {
2267     return Address(r, r1, ext);
2268   }
2269 }
2270 
2271 /**
2272  * Multiply 64 bit by 64 bit first loop.
      *
      * Emits the first pass of the multiplication: one chunk of x (held in
      * x_xstart) is multiplied by successive chunks of y, and each low 64 bits
      * of the running result are stored to z.  Chunks are normally 64 bits
      * (two ints); when only one int is left in x or y, element 0 is loaded
      * with a 32-bit load instead (L_one_x / L_one_y).
      *
      * x, y, z     array base addresses; these registers are not written here.
      * xstart      in/out: decremented by one on entry.
      * idx         in/out: ints of y still to process; decremented twice per
      *             iteration.
      * kdx         in/out: int index into z; decremented by two before each
      *             store.
      * carry       in/out: high 64 bits carried from one product to the next.
      * x_xstart, y_idx, product   written by the emitted code.
      *
      * rscratch1 is overwritten; rscratch2 may be overwritten through
      * offsetted_address().
2273  */
2274 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2275                                            Register y, Register y_idx, Register z,
2276                                            Register carry, Register product,
2277                                            Register idx, Register kdx) {
2278   //
2279   //  jlong carry, x[], y[], z[];
2280   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2281   //    huge_128 product = y[idx] * x[xstart] + carry;
2282   //    z[kdx] = (jlong)product;
2283   //    carry  = (jlong)(product >>> 64);
2284   //  }
2285   //  z[xstart] = carry;
2286   //
2287 
2288   Label L_first_loop, L_first_loop_exit;
2289   Label L_one_x, L_one_y, L_multiply;
2290 
2291   subsw(xstart, xstart, 1);
2292   br(Assembler::MI, L_one_x);   // xstart went negative: only one int of x is left
2293 
       // Load the two ints x[xstart], x[xstart+1] as a single 64-bit value.
2294   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2295   ldr(x_xstart, Address(rscratch1));
2296   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2297 
2298   bind(L_first_loop);
2299   subsw(idx, idx, 1);
2300   br(Assembler::MI, L_first_loop_exit);   // no ints of y left
2301   subsw(idx, idx, 1);
2302   br(Assembler::MI, L_one_y);             // exactly one int of y left
2303   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2304   ldr(y_idx, Address(rscratch1));
2305   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2306   bind(L_multiply);
2307 
2308   // AArch64 has a multiply-accumulate instruction that we can't use
2309   // here because it has no way to process carries, so we have to use
2310   // separate add and adc instructions.  Bah.
2311   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2312   mul(product, x_xstart, y_idx);
2313   adds(product, product, carry);
2314   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2315 
2316   subw(kdx, kdx, 2);
2317   ror(product, product, 32); // back to big-endian
2318   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerWord));
2319 
2320   b(L_first_loop);
2321 
       // Single remaining int of y: 32-bit load of y[0], then rejoin the multiply.
2322   bind(L_one_y);
2323   ldrw(y_idx, Address(y,  0));
2324   b(L_multiply);
2325 
       // Single remaining int of x: 32-bit load of x[0], then enter the loop.
2326   bind(L_one_x);
2327   ldrw(x_xstart, Address(x,  0));
2328   b(L_first_loop);
2329 
2330   bind(L_first_loop_exit);
2331 }
2332 
2333 /**
2334  * Multiply 128 bit by 128. Unrolled inner loop.
2335  *
      * Emits the inner ("third") loop.  Each iteration takes four ints (two
      * 64-bit words) of y, multiplies both words by product_hi, adds the
      * matching two words of z and the incoming carry, and stores the two
      * result words back to z.  After the loop, a leftover pair of ints and
      * then a leftover single int are handled separately.
      *
      * y, z        array base addresses; these registers are not written here.
      * carry       in/out: carry between successive products.
      * idx         in/out: number of ints still to process.
      * jdx         written: iteration counter, initialized to idx >> 2.
      * product_hi  the multiplier; read only here.
      * carry2, yz_idx1, yz_idx2, tmp, tmp3, tmp4, tmp6   written as temporaries.
      *
      * rscratch1 and rscratch2 are overwritten.
2336  */
2337 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2338                                              Register carry, Register carry2,
2339                                              Register idx, Register jdx,
2340                                              Register yz_idx1, Register yz_idx2,
2341                                              Register tmp, Register tmp3, Register tmp4,
2342                                              Register tmp6, Register product_hi) {
2343 
2344   //   jlong carry, x[], y[], z[];
2345   //   int kdx = ystart+1;
2346   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2347   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2348   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2349   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2350   //     carry  = (jlong)(tmp4 >>> 64);
2351   //     z[kdx+idx+1] = (jlong)tmp3;
2352   //     z[kdx+idx] = (jlong)tmp4;
2353   //   }
2354   //   idx += 2;
2355   //   if (idx > 0) {
2356   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2357   //     z[kdx+idx] = (jlong)yz_idx1;
2358   //     carry  = (jlong)(yz_idx1 >>> 64);
2359   //   }
2360   //
2361 
2362   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2363 
2364   lsrw(jdx, idx, 2);   // jdx = number of complete four-int groups
2365 
2366   bind(L_third_loop);
2367 
2368   subsw(jdx, jdx, 1);
2369   br(Assembler::MI, L_third_loop_exit);
2370   subw(idx, idx, 4);
2371 
2372   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2373 
2374   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));   // two 64-bit words of y
2375 
2376   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));   // tmp6 = address of the matching words of z
2377 
2378   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2379   ror(yz_idx2, yz_idx2, 32);
2380 
2381   ldp(rscratch2, rscratch1, Address(tmp6, 0));   // two 64-bit words of z
2382 
2383   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2384   umulh(tmp4, product_hi, yz_idx1);
2385 
2386   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2387   ror(rscratch2, rscratch2, 32);
2388 
2389   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2390   umulh(carry2, product_hi, yz_idx2);
2391 
2392   // propagate sum of both multiplications into carry:tmp4:tmp3
2393   adds(tmp3, tmp3, carry);
2394   adc(tmp4, tmp4, zr);
2395   adds(tmp3, tmp3, rscratch1);
2396   adcs(tmp4, tmp4, tmp);
2397   adc(carry, carry2, zr);
2398   adds(tmp4, tmp4, rscratch2);
2399   adc(carry, carry, zr);
2400 
2401   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2402   ror(tmp4, tmp4, 32);
2403   stp(tmp4, tmp3, Address(tmp6, 0));
2404 
2405   b(L_third_loop);
2406   bind (L_third_loop_exit);
2407 
       // Tail: 0..3 ints remain.
2408   andw (idx, idx, 0x3);
2409   cbz(idx, L_post_third_loop_done);
2410 
2411   Label L_check_1;
2412   subsw(idx, idx, 2);
2413   br(Assembler::MI, L_check_1);   // fewer than two ints remain
2414 
       // Two ints remain (possibly plus one more): process one 64-bit word.
2415   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2416   ldr(yz_idx1, Address(rscratch1, 0));
2417   ror(yz_idx1, yz_idx1, 32);
2418   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2419   umulh(tmp4, product_hi, yz_idx1);
2420   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2421   ldr(yz_idx2, Address(rscratch1, 0));   // yz_idx2 is reused for the word of z
2422   ror(yz_idx2, yz_idx2, 32);
2423 
2424   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);   // tmp3 += carry + z word; new carry = tmp4 + carries
2425 
2426   ror(tmp3, tmp3, 32);
2427   str(tmp3, Address(rscratch1, 0));
2428 
2429   bind (L_check_1);
2430 
       // One int may remain: it is handled with 32-bit loads and a 32-bit store.
2431   andw (idx, idx, 0x1);
2432   subsw(idx, idx, 1);
2433   br(Assembler::MI, L_post_third_loop_done);
2434   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2435   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2436   umulh(carry2, tmp4, product_hi);
2437   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2438 
       // NOTE(review): this four-argument add2_with_carry overload is not
       // defined in this excerpt; presumably it adds tmp4 and carry into
       // carry2:tmp3 -- confirm against its declaration.
2439   add2_with_carry(carry2, tmp3, tmp4, carry);
2440 
2441   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2442   extr(carry, carry2, tmp3, 32);   // carry = bits 32..95 of carry2:tmp3
2443 
2444   bind(L_post_third_loop_done);
2445 }
2446 
2447 /**
2448  * Code for BigInteger::multiplyToLen() intrinsic.
2449  *
2450  * r0: x
2451  * r1: xlen
2452  * r2: y
2453  * r3: ylen
2454  * r4: z
2455  * r5: zlen
2456  * r10: tmp1
2457  * r11: tmp2
2458  * r12: tmp3
2459  * r13: tmp4
2460  * r14: tmp5
2461  * r15: tmp6
2462  * r16: tmp7
2463  *
      * tmp7 in the list above presumably corresponds to the product_hi
      * parameter (the last argument) -- TODO confirm against the caller.
      *
      * The emitted code overwrites xlen and zlen (reused as product and
      * x_xstart), uses rscratch1/rscratch2, and temporarily pushes four words
      * on the stack around each inner-loop pass.
2464  */
2465 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2466                                      Register z, Register zlen,
2467                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2468                                      Register tmp5, Register tmp6, Register product_hi) {
2469 
       // NOTE(review): product_hi is not part of this check although it is
       // written below (ldr/ldrw into product_hi) -- confirm that callers always
       // pass a register distinct from the others.
2470   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2471 
2472   const Register idx = tmp1;
2473   const Register kdx = tmp2;
2474   const Register xstart = tmp3;
2475 
2476   const Register y_idx = tmp4;
2477   const Register carry = tmp5;
2478   const Register product  = xlen;  // reuse register
2479   const Register x_xstart = zlen;  // reuse register
2480 
2481   // First Loop.
2482   //
2483   //  final static long LONG_MASK = 0xffffffffL;
2484   //  int xstart = xlen - 1;
2485   //  int ystart = ylen - 1;
2486   //  long carry = 0;
2487   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2488   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2489   //    z[kdx] = (int)product;
2490   //    carry = product >>> 32;
2491   //  }
2492   //  z[xstart] = (int)carry;
2493   //
2494 
2495   movw(idx, ylen);      // idx = ylen;
2496   movw(kdx, zlen);      // kdx = xlen+ylen;
2497   mov(carry, zr);       // carry = 0;
2498 
2499   Label L_done;
2500 
2501   movw(xstart, xlen);
2502   subsw(xstart, xstart, 1);
2503   br(Assembler::MI, L_done);   // xlen - 1 is negative: nothing to do
2504 
2505   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
2506 
       // Store the carry left over from the first loop: skipped when kdx is
       // already 0, otherwise one or two 32-bit stores.
2507   Label L_second_loop;
2508   cbzw(kdx, L_second_loop);
2509 
2510   Label L_carry;
2511   subw(kdx, kdx, 1);
2512   cbzw(kdx, L_carry);
2513 
2514   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2515   lsr(carry, carry, 32);
2516   subw(kdx, kdx, 1);
2517 
2518   bind(L_carry);
2519   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2520 
2521   // Second and third (nested) loops.
2522   //
2523   // for (int i = xstart-1; i >= 0; i--) { // Second loop
2524   //   carry = 0;
2525   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
2526   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
2527   //                    (z[k] & LONG_MASK) + carry;
2528   //     z[k] = (int)product;
2529   //     carry = product >>> 32;
2530   //   }
2531   //   z[i] = (int)carry;
2532   // }
2533   //
2534   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
2535 
2536   const Register jdx = tmp1;
2537 
2538   bind(L_second_loop);
2539   mov(carry, zr);                // carry = 0;
2540   movw(jdx, ylen);               // j = ystart+1
2541 
2542   subsw(xstart, xstart, 1);      // i = xstart-1;
2543   br(Assembler::MI, L_done);
2544 
       // Reserve four stack words; slot 0 holds the original z.
2545   str(z, Address(pre(sp, -4 * wordSize)));
2546 
2547   Label L_last_x;
2548   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
2549   subsw(xstart, xstart, 1);       // i = xstart-1;
2550   br(Assembler::MI, L_last_x);    // only one int of x is left
2551 
       // Load the two ints x[xstart], x[xstart+1] as a single 64-bit value.
2552   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
2553   ldr(product_hi, Address(rscratch1));
2554   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
2555 
2556   Label L_third_loop_prologue;
2557   bind(L_third_loop_prologue);
2558 
       // Save ylen, x and xstart in slots 1..3: the inner loop is handed x
       // (as carry2) and ylen (as jdx) as scratch registers.
2559   str(ylen, Address(sp, wordSize));
2560   stp(x, xstart, Address(sp, 2 * wordSize));
2561   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
2562                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
       // Release the four stack words, restoring z, ylen and x.
2563   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
2564   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
2565 
       // Store the carry of this pass as one or two ints and start the next
       // pass; tmp3 is the same register as xstart.
2566   addw(tmp3, xlen, 1);
2567   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2568   subsw(tmp3, tmp3, 1);
2569   br(Assembler::MI, L_done);
2570 
2571   lsr(carry, carry, 32);
2572   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2573   b(L_second_loop);
2574 
2575   // Next infrequent code is moved outside loops.
2576   bind(L_last_x);
2577   ldrw(product_hi, Address(x,  0));   // single remaining int: 32-bit load of x[0]
2578   b(L_third_loop_prologue);
2579 
2580   bind(L_done);
2581 }
2582 
2583 /**
2584  * Emits code to update CRC-32 with a byte value according to constants in table
2585  *
2586  * @param [in,out]crc   Register containing the crc.
2587  * @param [in]val       Register containing the byte to fold into the CRC.
      *                      It is overwritten with the loaded table entry.
2588  * @param [in]table     Register containing the table of crc constants.
2589  *
2590  * uint32_t crc;
2591  * val = crc_table[(val ^ crc) & 0xFF];
2592  * crc = val ^ (crc >> 8);
2593  *
2594  */
2595 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2596   eor(val, val, crc);                               // val ^= crc
2597   andr(val, val, 0xff);                             // keep only the low byte as the table index
2598   ldrw(val, Address(table, val, Address::lsl(2)));  // 32-bit load from table + (val << 2)
2599   eor(crc, val, crc, Assembler::LSR, 8);            // crc = val ^ (crc >> 8)
2600 }