--- old/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp 2017-09-25 16:39:34.735165362 +0300 +++ new/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp 2017-09-25 16:39:34.679166240 +0300 @@ -2840,6 +2840,44 @@ bind(L_done); } +// Code for BigInteger::mulAdd intrinsic +// out = r0 +// in = r1 +// offset = r2 (already out.length-offset) +// len = r3 +// k = r4 +// +// pseudo code from java implementation: +// carry = 0; +// offset = out.length-offset - 1; +// for (int j=len-1; j >= 0; j--) { +// product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry; +// out[offset--] = (int)product; +// carry = product >>> 32; +// } +// return (int)carry; +void MacroAssembler::mul_add(Register out, Register in, Register offset, + Register len, Register k) { + Label LOOP, END; + // pre-loop: if len == 0, set out to zero and skip the loop + cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => fewer branches + csel(out, zr, out, Assembler::EQ); + br(Assembler::EQ, END); + add(in, in, len, LSL, 2); // in[j+1] address + add(offset, out, offset, LSL, 2); // out[offset + 1] address + mov(out, zr); // out is reused to hold the carry from here on + BIND(LOOP); + ldrw(rscratch1, Address(pre(in, -4))); + madd(rscratch1, rscratch1, k, out); + ldrw(rscratch2, Address(pre(offset, -4))); + add(rscratch1, rscratch1, rscratch2); + strw(rscratch1, Address(offset)); + lsr(out, rscratch1, 32); + subs(len, len, 1); + br(Assembler::NE, LOOP); + BIND(END); +} + /** * Emits code to update CRC-32 with a byte value according to constants in table * --- old/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp 2017-09-25 16:39:34.939162164 +0300 +++ new/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp 2017-09-25 16:39:34.891162916 +0300 @@ -1265,6 +1265,7 @@ void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6, Register tmp7); + void mul_add(Register out, Register in, Register offs, Register len, Register k); // ISB may be needed 
because of a safepoint void maybe_isb() { isb(); } --- old/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp 2017-09-25 16:39:35.135159091 +0300 +++ new/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp 2017-09-25 16:39:35.083159906 +0300 @@ -3607,6 +3607,63 @@ return start; } + address generate_squareToLen() { + // squareToLen algorithm for sizes 1..127 described in java code works + // faster than multiply_to_len on some CPUs and slower on others, but + // multiply_to_len shows a bit better overall results, so it is reused here with y == x + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "squareToLen"); + address start = __ pc(); + + const Register x = r0; + const Register xlen = r1; + const Register z = r2; + const Register zlen = r3; + const Register y = r4; // == x + const Register ylen = r5; // == xlen + + const Register tmp1 = r10; + const Register tmp2 = r11; + const Register tmp3 = r12; + const Register tmp4 = r13; + const Register tmp5 = r14; + const Register tmp6 = r15; + const Register tmp7 = r16; + + RegSet spilled_regs = RegSet::of(y, ylen); + BLOCK_COMMENT("Entry:"); + __ enter(); + __ push(spilled_regs, sp); + __ mov(y, x); + __ mov(ylen, xlen); + __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + __ pop(spilled_regs, sp); + __ leave(); + __ ret(lr); + return start; + } + + address generate_mulAdd() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "mulAdd"); + + address start = __ pc(); + + const Register out = r0; + const Register in = r1; + const Register offset = r2; + const Register len = r3; + const Register k = r4; + + BLOCK_COMMENT("Entry:"); + __ enter(); + __ mul_add(out, in, offset, len, k); + __ leave(); + __ ret(lr); + + return start; + } + void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { @@ -4913,6 +4970,14 @@ 
StubRoutines::_multiplyToLen = generate_multiplyToLen(); } + if (UseSquareToLenIntrinsic) { + StubRoutines::_squareToLen = generate_squareToLen(); + } + + if (UseMulAddIntrinsic) { + StubRoutines::_mulAdd = generate_mulAdd(); + } + if (UseMontgomeryMultiplyIntrinsic) { StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); --- old/src/cpu/aarch64/vm/vm_version_aarch64.cpp 2017-09-25 16:39:35.351155706 +0300 +++ new/src/cpu/aarch64/vm/vm_version_aarch64.cpp 2017-09-25 16:39:35.295156584 +0300 @@ -340,6 +340,14 @@ UseMultiplyToLenIntrinsic = true; } + if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) { + UseSquareToLenIntrinsic = true; + } + + if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) { + UseMulAddIntrinsic = true; + } + if (FLAG_IS_DEFAULT(UseBarriersForVolatile)) { UseBarriersForVolatile = (_features & CPU_DMB_ATOMICS) != 0; }