--- old/src/cpu/aarch64/vm/assembler_aarch64.hpp	2015-04-13 18:56:40.096278259 +0100
+++ new/src/cpu/aarch64/vm/assembler_aarch64.hpp	2015-04-13 18:56:39.906297276 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1469,7 +1469,7 @@
     f(op, 31, 29);
     f(0b11010000, 28, 21);
     f(0b000000, 15, 10);
-    rf(Rm, 16), rf(Rn, 5), rf(Rd, 0);
+    zrf(Rm, 16), zrf(Rn, 5), zrf(Rd, 0);
   }
 
 #define INSN(NAME, op)                                  \
--- old/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	2015-04-13 18:56:41.126175169 +0100
+++ new/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	2015-04-13 18:56:40.946193185 +0100
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2246,6 +2246,341 @@
 }
 
 /**
+ * Helpers for multiply_to_len().
+ */
+void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
+                                     Register src1, Register src2) {
+  adds(dest_lo, dest_lo, src1);
+  adc(dest_hi, dest_hi, zr);
+  adds(dest_lo, dest_lo, src2);
+  adc(final_dest_hi, dest_hi, zr);
+}
+
+// Generate an address from (r + r1 extend offset).  "size" is the
+// size of the operand.  The result may be in rscratch2.
+Address MacroAssembler::offsetted_address(Register r, Register r1,
+                                          Address::extend ext, int offset, int size) {
+  if (offset || (ext.shift() % size != 0)) {
+    lea(rscratch2, Address(r, r1, ext));
+    return Address(rscratch2, offset);
+  } else {
+    return Address(r, r1, ext);
+  }
+}
+
+/**
+ * Multiply 64 bit by 64 bit first loop.
+ */
+void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                                           Register y, Register y_idx, Register z,
+                                           Register carry, Register product,
+                                           Register idx, Register kdx) {
+  //
+  //  jlong carry, x[], y[], z[];
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+  //    huge_128 product = y[idx] * x[xstart] + carry;
+  //    z[kdx] = (jlong)product;
+  //    carry  = (jlong)(product >>> 64);
+  //  }
+  //  z[xstart] = carry;
+  //
+
+  Label L_first_loop, L_first_loop_exit;
+  Label L_one_x, L_one_y, L_multiply;
+
+  subsw(xstart, xstart, 1);
+  br(Assembler::MI, L_one_x);
+
+  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
+  ldr(x_xstart, Address(rscratch1));
+  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
+
+  bind(L_first_loop);
+  subsw(idx, idx, 1);
+  br(Assembler::MI, L_first_loop_exit);
+  subsw(idx, idx, 1);
+  br(Assembler::MI, L_one_y);
+  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+  ldr(y_idx, Address(rscratch1));
+  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
+  bind(L_multiply);
+
+  // AArch64 has a multiply-accumulate instruction that we can't use
+  // here because it has no way to process carries, so we have to use
+  // separate add and adc instructions.  Bah.
+  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
+  mul(product, x_xstart, y_idx);
+  adds(product, product, carry);
+  adc(carry, rscratch1, zr);         // x_xstart * y_idx + carry -> carry:product
+
+  subw(kdx, kdx, 2);
+  ror(product, product, 32); // back to big-endian
+  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerWord));
+
+  b(L_first_loop);
+
+  bind(L_one_y);
+  ldrw(y_idx, Address(y, 0));
+  b(L_multiply);
+
+  bind(L_one_x);
+  ldrw(x_xstart, Address(x, 0));
+  b(L_first_loop);
+
+  bind(L_first_loop_exit);
+}
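Reviewer note: the umulh/mul pair plus adds/adc above is the portable way to get a full 64x64->128 product with a carry fold on AArch64. The same step in stand-alone C++, as a rough sketch (the helper name and the use of unsigned __int128 are illustrative assumptions, not HotSpot code):

    #include <cstdint>

    // One step of the first loop: carry:result <- x * y + carry_in.
    // The 128-bit product mirrors umulh/mul; the carry fold mirrors
    // adds/adc. Sketch only.
    static inline uint64_t mul_add_carry(uint64_t x, uint64_t y,
                                         uint64_t& carry) {
      unsigned __int128 p = (unsigned __int128)x * y + carry;
      carry = (uint64_t)(p >> 64);  // high half, like adc(carry, rscratch1, zr)
      return (uint64_t)p;           // low half, like adds(product, product, carry)
    }

The sum cannot overflow 128 bits: (2^64 - 1)^2 + (2^64 - 1) = 2^128 - 2^64, so folding the carry into the product is always safe.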
+
+/**
+ * Multiply 128 bit by 128 bit.  Unrolled inner loop.
+ *
+ */
+void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
+                                             Register carry, Register carry2,
+                                             Register idx, Register jdx,
+                                             Register yz_idx1, Register yz_idx2,
+                                             Register tmp, Register tmp3, Register tmp4,
+                                             Register tmp6, Register product_hi) {
+
+  //   jlong carry, x[], y[], z[];
+  //   int kdx = ystart+1;
+  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
+  //     jlong carry2  = (jlong)(tmp3 >>> 64);
+  //     huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2;
+  //     carry  = (jlong)(tmp4 >>> 64);
+  //     z[kdx+idx+1] = (jlong)tmp3;
+  //     z[kdx+idx] = (jlong)tmp4;
+  //   }
+  //   idx += 2;
+  //   if (idx > 0) {
+  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
+  //     z[kdx+idx] = (jlong)yz_idx1;
+  //     carry  = (jlong)(yz_idx1 >>> 64);
+  //   }
+  //
+
+  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+  lsrw(jdx, idx, 2);
+
+  bind(L_third_loop);
+
+  subsw(jdx, jdx, 1);
+  br(Assembler::MI, L_third_loop_exit);
+  subw(idx, idx, 4);
+
+  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+
+  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
+
+  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+
+  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
+  ror(yz_idx2, yz_idx2, 32);
+
+  ldp(rscratch2, rscratch1, Address(tmp6, 0));
+
+  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
+  umulh(tmp4, product_hi, yz_idx1);
+
+  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
+  ror(rscratch2, rscratch2, 32);
+
+  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
+  umulh(carry2, product_hi, yz_idx2);
+
+  // propagate sum of both multiplications into carry:tmp4:tmp3
+  adds(tmp3, tmp3, carry);
+  adc(tmp4, tmp4, zr);
+  adds(tmp3, tmp3, rscratch1);
+  adcs(tmp4, tmp4, tmp);
+  adc(carry, carry2, zr);
+  adds(tmp4, tmp4, rscratch2);
+  adc(carry, carry, zr);
+
+  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
+  ror(tmp4, tmp4, 32);
+  stp(tmp4, tmp3, Address(tmp6, 0));
+
+  b(L_third_loop);
+  bind(L_third_loop_exit);
+
+  andw(idx, idx, 0x3);
+  cbz(idx, L_post_third_loop_done);
+
+  Label L_check_1;
+  subsw(idx, idx, 2);
+  br(Assembler::MI, L_check_1);
+
+  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+  ldr(yz_idx1, Address(rscratch1, 0));
+  ror(yz_idx1, yz_idx1, 32);
+  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
+  umulh(tmp4, product_hi, yz_idx1);
+  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+  ldr(yz_idx2, Address(rscratch1, 0));
+  ror(yz_idx2, yz_idx2, 32);
+
+  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
+
+  ror(tmp3, tmp3, 32);
+  str(tmp3, Address(rscratch1, 0));
+
+  bind(L_check_1);
+
+  andw(idx, idx, 0x1);
+  subsw(idx, idx, 1);
+  br(Assembler::MI, L_post_third_loop_done);
+
+  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
+  umulh(carry2, tmp4, product_hi);
+  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+
+  add2_with_carry(carry2, tmp3, tmp4, carry);
+
+  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+  extr(carry, carry2, tmp3, 32);
+
+  bind(L_post_third_loop_done);
+}
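The repeated ror(..., 32) calls are worth a note for reviewers: BigInteger stores its magnitude as 32-bit ints with the most significant word first, so a little-endian 64-bit load of two adjacent words lands them in swapped halves, and rotating by 32 puts them right. A small stand-alone demonstration of the trick (assumes a little-endian host; not HotSpot code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // Two adjacent BigInteger words, most significant first:
      // together they represent the 64-bit value 0x0000000200000001.
      uint32_t words[2] = { 0x00000002, 0x00000001 };

      uint64_t v;
      std::memcpy(&v, words, sizeof v);    // LE load gives 0x0000000100000002
      v = (v >> 32) | (v << 32);           // ror #32 swaps the halves
      assert(v == 0x0000000200000001ULL);  // now a usable 64-bit digit
      return 0;
    }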
+
+/**
+ * Code for BigInteger::multiplyToLen() intrinsic.
+ *
+ * r0: x
+ * r1: xlen
+ * r2: y
+ * r3: ylen
+ * r4: z
+ * r5: zlen
+ * r10: tmp1
+ * r11: tmp2
+ * r12: tmp3
+ * r13: tmp4
+ * r14: tmp5
+ * r15: tmp6
+ * r16: tmp7
+ *
+ */
+void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
+                                     Register z, Register zlen,
+                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
+                                     Register tmp5, Register tmp6, Register product_hi) {
+
+  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
+
+  const Register idx = tmp1;
+  const Register kdx = tmp2;
+  const Register xstart = tmp3;
+
+  const Register y_idx = tmp4;
+  const Register carry = tmp5;
+  const Register product = xlen;
+  const Register x_xstart = zlen;  // reuse register
+
+  // First Loop.
+  //
+  //  final static long LONG_MASK = 0xffffffffL;
+  //  int xstart = xlen - 1;
+  //  int ystart = ylen - 1;
+  //  long carry = 0;
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
+  //    z[kdx] = (int)product;
+  //    carry = product >>> 32;
+  //  }
+  //  z[xstart] = (int)carry;
+  //
+
+  movw(idx, ylen);      // idx = ylen;
+  movw(kdx, zlen);      // kdx = xlen+ylen;
+  mov(carry, zr);       // carry = 0;
+
+  Label L_done;
+
+  movw(xstart, xlen);
+  subsw(xstart, xstart, 1);
+  br(Assembler::MI, L_done);
+
+  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
+
+  Label L_second_loop;
+  cbzw(kdx, L_second_loop);
+
+  Label L_carry;
+  subw(kdx, kdx, 1);
+  cbzw(kdx, L_carry);
+
+  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
+  lsr(carry, carry, 32);
+  subw(kdx, kdx, 1);
+
+  bind(L_carry);
+  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
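In plain C++ terms, the first loop is the first row of schoolbook multiplication, done in 64-bit digits rather than the 32-bit ints of the Java pseudocode above; that doubling of digit width is most of the speedup. A hedged reference version over big-endian 64-bit digit arrays (illustrative names only; the real code additionally handles odd int counts via L_one_x/L_one_y):

    #include <cstdint>

    // z[xstart .. xstart+ystart+1] = x[xstart] * y, digit 0 most significant.
    static void first_loop(const uint64_t* x, int xstart,
                           const uint64_t* y, int ystart, uint64_t* z) {
      uint64_t carry = 0;
      for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
        unsigned __int128 product = (unsigned __int128)y[idx] * x[xstart] + carry;
        z[kdx] = (uint64_t)product;
        carry  = (uint64_t)(product >> 64);
      }
      z[xstart] = carry;  // top digit of the partial product
    }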
+
+  // Second and third (nested) loops.
+  //
+  // for (int i = xstart-1; i >= 0; i--) { // Second loop
+  //   carry = 0;
+  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
+  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
+  //                    (z[k] & LONG_MASK) + carry;
+  //     z[k] = (int)product;
+  //     carry = product >>> 32;
+  //   }
+  //   z[i] = (int)carry;
+  // }
+  //
+  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
+
+  const Register jdx = tmp1;
+
+  bind(L_second_loop);
+  mov(carry, zr);                // carry = 0;
+  movw(jdx, ylen);               // j = ystart+1
+
+  subsw(xstart, xstart, 1);      // i = xstart-1;
+  br(Assembler::MI, L_done);
+
+  str(z, Address(pre(sp, -4 * wordSize)));
+
+  Label L_last_x;
+  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
+  subsw(xstart, xstart, 1);      // i = xstart-1;
+  br(Assembler::MI, L_last_x);
+
+  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
+  ldr(product_hi, Address(rscratch1));
+  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
+
+  Label L_third_loop_prologue;
+  bind(L_third_loop_prologue);
+
+  str(ylen, Address(sp, wordSize));
+  stp(x, xstart, Address(sp, 2 * wordSize));
+  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
+                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
+  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
+  ldp(x, xlen, Address(post(sp, 2 * wordSize)));  // copy old xstart -> xlen
+
+  addw(tmp3, xlen, 1);
+  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
+  subsw(tmp3, tmp3, 1);
+  br(Assembler::MI, L_done);
+
+  lsr(carry, carry, 32);
+  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
+  b(L_second_loop);
+
+  // Next infrequent code is moved outside loops.
+  bind(L_last_x);
+  ldrw(product_hi, Address(x, 0));
+  b(L_third_loop_prologue);
+
+  bind(L_done);
+}
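The second and third loops complete the schoolbook multiply: each outer pass folds one more digit of x into the result already accumulated in z. The same structure in C++, again per 64-bit digit and with illustrative names (the assembly additionally unrolls the inner loop four ints at a time):

    #include <cstdint>

    // Digit 0 is the most significant word, matching the pseudocode's indexing.
    static void second_and_third_loops(const uint64_t* x, int xstart,
                                       const uint64_t* y, int ystart,
                                       uint64_t* z) {
      for (int i = xstart - 1; i >= 0; i--) {  // Second loop
        uint64_t carry = 0;
        for (int jdx = ystart, k = ystart + 1 + i; jdx >= 0; jdx--, k--) {  // Third loop
          unsigned __int128 product =
              (unsigned __int128)y[jdx] * x[i] + z[k] + carry;
          z[k]  = (uint64_t)product;
          carry = (uint64_t)(product >> 64);
        }
        z[i] = carry;
      }
    }

Even here nothing overflows 128 bits: (2^64 - 1)^2 plus two 64-bit addends is at most 2^128 - 1.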
+
+/**
  * Emits code to update CRC-32 with a byte value according to constants in table
  *
  * @param [in,out]crc   Register containing the crc.
--- old/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	2015-04-13 18:56:42.216066073 +0100
+++ new/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	2015-04-13 18:56:42.036084089 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -167,9 +167,8 @@
 
   // aliases defined in AARCH64 spec
 
-  template<class T>
-  inline void cmpw(Register Rd, T imm) { subsw(zr, Rd, imm); }
+  inline void cmpw(Register Rd, T imm) { subsw(zr, Rd, imm); }
 
   inline void cmp(Register Rd, unsigned imm) { subs(zr, Rd, imm); }
   inline void cmnw(Register Rd, unsigned imm) { addsw(zr, Rd, imm); }
@@ -1121,9 +1120,34 @@
                        Register tmp1, Register tmp2,
                        Register tmp3, Register tmp4,
                        int int_cnt1, Register result);
-
+private:
+  void add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
+                       Register src1, Register src2);
+  void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
+    add2_with_carry(dest_hi, dest_hi, dest_lo, src1, src2);
+  }
+  void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                             Register y, Register y_idx, Register z,
+                             Register carry, Register product,
+                             Register idx, Register kdx);
+  void multiply_128_x_128_loop(Register y, Register z,
+                               Register carry, Register carry2,
+                               Register idx, Register jdx,
+                               Register yz_idx1, Register yz_idx2,
+                               Register tmp, Register tmp3, Register tmp4,
+                               Register tmp7, Register product_hi);
+public:
+  void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
+                       Register zlen, Register tmp1, Register tmp2, Register tmp3,
+                       Register tmp4, Register tmp5, Register tmp6, Register tmp7);
   // ISB may be needed because of a safepoint
   void maybe_isb() { isb(); }
+
+private:
+  // Return the effective address r + (r1 << ext) + offset.
+  // Uses rscratch2.
+  Address offsetted_address(Register r, Register r1, Address::extend ext,
+                            int offset, int size);
 };
 
 // Used by aarch64.ad to control code generation
--- old/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	2015-04-13 18:56:43.215965986 +0100
+++ new/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	2015-04-13 18:56:43.035984002 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2356,8 +2356,46 @@
     return start;
   }
 
-#undef __
-#define __ masm->
+  /**
+   *  Arguments:
+   *
+   *  Input:
+   *    c_rarg0   - x address
+   *    c_rarg1   - x length
+   *    c_rarg2   - y address
+   *    c_rarg3   - y length
+   *    c_rarg4   - z address
+   *    c_rarg5   - z length
+   */
+  address generate_multiplyToLen() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
+
+    address start = __ pc();
+    const Register x     = r0;
+    const Register xlen  = r1;
+    const Register y     = r2;
+    const Register ylen  = r3;
+    const Register z     = r4;
+    const Register zlen  = r5;
+
+    const Register tmp1  = r10;
+    const Register tmp2  = r11;
+    const Register tmp3  = r12;
+    const Register tmp4  = r13;
+    const Register tmp5  = r14;
+    const Register tmp6  = r15;
+    const Register tmp7  = r16;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(lr);
+
+    return start;
+  }
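Since the stub keeps to the standard AAPCS64 calling convention (arguments in r0-r5 exactly as listed above, an ordinary enter/leave frame, return through lr), the generated entry point is shaped like a plain C function. A hypothetical view of it, purely for illustration (this typedef is not a HotSpot API; words are most-significant-first 32-bit ints, and zlen must equal xlen + ylen):

    #include <cstdint>

    // Assumed signature matching the register assignments above.
    typedef void (*multiplyToLen_stub_t)(const int32_t* x, int xlen,
                                         const int32_t* y, int ylen,
                                         int32_t* z, int zlen);

    // e.g., given an entry obtained from StubRoutines::_multiplyToLen:
    //   multiplyToLen_stub_t f = (multiplyToLen_stub_t)entry;
    //   f(x_words, 3, y_words, 2, z_words, 5);  // 3-word * 2-word -> 5-word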
 
   // Continuation point for throwing of implicit exceptions that are
   // not handled in the current activation. Fabricates an exception
@@ -2375,6 +2413,9 @@
   // otherwise assume that stack unwinding will be initiated, so
   // caller saved registers were assumed volatile in the compiler.
 
+#undef __
+#define __ masm->
+
   address generate_throw_exception(const char* name,
                                    address runtime_entry,
                                    Register arg1 = noreg,
@@ -2518,6 +2559,10 @@
     // arraycopy stubs used by compilers
     generate_arraycopy_stubs();
 
+    if (UseMultiplyToLenIntrinsic) {
+      StubRoutines::_multiplyToLen = generate_multiplyToLen();
+    }
+
 #ifndef BUILTIN_SIM
     if (UseAESIntrinsics) {
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
--- old/src/cpu/aarch64/vm/vm_version_aarch64.cpp	2015-04-13 18:56:44.265860895 +0100
+++ new/src/cpu/aarch64/vm/vm_version_aarch64.cpp	2015-04-13 18:56:44.085878910 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -193,6 +193,10 @@
     }
   }
 
+  if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
+    UseMultiplyToLenIntrinsic = true;
+  }
+
 #ifdef COMPILER2
   if (FLAG_IS_DEFAULT(OptoScheduling)) {
     OptoScheduling = true;
--- old/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java	2015-04-13 18:56:45.235763811 +0100
+++ new/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java	2015-04-13 18:56:45.055781826 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -98,11 +98,33 @@
       if (!bytecompare(oldres,newres)) {
         System.out.print("mismatch for:b1:" + stringify(b1) + " :b2:" + stringify(b2) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
+        throw new Exception("Failed");
+      }
+    }
+
+    // Test carry propagation.  Multiple carries during bignum
+    // multiplication are rare (especially when using 64-bit
+    // arithmetic) so we have to provoke them deliberately.
+    for (int j = 4; j <= 396; j += 4) {
+      byte[] bytes = new byte[j];
+      Arrays.fill(bytes, (byte)255);
+      b1 = new BigInteger(bytes);
+      b2 = new BigInteger(bytes);
+
+      oldres = base_multiply(b1,b2);
+      newres = new_multiply(b1,b2);
+
+      oldsum = oldsum.add(oldres);
+      newsum = newsum.add(newres);
+
+      if (!bytecompare(oldres,newres)) {
+        System.out.print("mismatch for:b1:" + stringify(b1) + " :b2:" + stringify(b2) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
         System.out.println(b1);
         System.out.println(b2);
         throw new Exception("Failed");
       }
     }
+
     if (!bytecompare(oldsum,newsum)) {
       System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
       throw new Exception("Failed");
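A quick sanity check of why the all-0xFF operands in the new test provoke carries: with n-bit digits, (2^n - 1)^2 = 2^(2n) - 2^(n+1) + 1, so every partial product has the shape 0xFF..FE_00..01 and nearly every accumulation step overflows into the carry chain. The 64-bit case, verified stand-alone (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t w = ~0ULL;  // one all-0xFF digit, 2^64 - 1
      unsigned __int128 p = (unsigned __int128)w * w;
      // (2^64 - 1)^2 = 2^128 - 2^65 + 1: high digit 0xFF..FE, low digit 1
      assert((uint64_t)(p >> 64) == ~0ULL - 1);
      assert((uint64_t)p == 1);
      return 0;
    }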