--- old/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-05-04 15:06:48.000000000 -0700 +++ new/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-05-04 15:06:48.000000000 -0700 @@ -7748,6 +7748,504 @@ pop(tmp2); pop(tmp1); } + +//Helper functions for square_to_len() + +/** + * Store the squares of x[], right shifted one bit (divided by 2) into z[] + * Preserves x and z and modifies rest of the registers. + */ + +void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { + // Perform square and right shift by 1 + // Handle odd xlen case first, then for even xlen do the following + // jlong carry = 0; + // for (int j=0, i=0; j < xlen; j+=2, i+=4) { + // huge_128 product = x[j:j+1] * x[j:j+1]; + // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); + // z[i+2:i+3] = (jlong)(product >>> 1); + // carry = (jlong)product; + // } + + xorq(tmp5, tmp5); // carry + xorq(rdxReg, rdxReg); + xorl(tmp1, tmp1); // index for x + xorl(tmp4, tmp4); // index for z + + Label L_first_loop, L_first_loop_exit; + + testl(xlen, 1); + jccb(Assembler::zero, L_first_loop); //jump if xlen is even + + // Square and right shift by 1 the odd element using 32 bit multiply + movl(raxReg, Address(x, tmp1, Address::times_4, 0)); + imulq(raxReg, raxReg); + shrq(raxReg, 1); + adcq(tmp5, 0); + movq(Address(z, tmp4, Address::times_4, 0), raxReg); + incrementl(tmp1); + addl(tmp4, 2); + + // Square and right shift by 1 the rest using 64 bit multiply + bind(L_first_loop); + cmpptr(tmp1, xlen); + jccb(Assembler::equal, L_first_loop_exit); + + // Square + movq(raxReg, Address(x, tmp1, Address::times_4, 0)); + rorq(raxReg, 32); // convert big-endian to little-endian + mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax + + // Right shift by 1 and save carry + shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 + rcrq(rdxReg, 1); + rcrq(raxReg, 1); + adcq(tmp5, 0); + + // Store result in z + movq(Address(z, tmp4, Address::times_4, 0), rdxReg); + movq(Address(z, tmp4, Address::times_4, 8), raxReg); + + // Update indices for x and z + addl(tmp1, 2); + addl(tmp4, 4); + jmp(L_first_loop); + + bind(L_first_loop_exit); +} + + +/** + * Perform the following multiply add operation using BMI2 instructions + * carry:sum = sum + op1*op2 + carry + * op2 should be in rdx + * op2 is preserved, all other registers are modified + */ +void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) { + // assert op2 is rdx + mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1 + addq(sum, carry); + adcq(tmp2, 0); + addq(sum, op1); + adcq(tmp2, 0); + movq(carry, tmp2); +} + +/** + * Perform the following multiply add operation: + * carry:sum = sum + op1*op2 + carry + * Preserves op1, op2 and modifies rest of registers + */ +void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) { + // rdx:rax = op1 * op2 + movq(raxReg, op2); + mulq(op1); + + // rdx:rax = sum + carry + rdx:rax + addq(sum, carry); + adcq(rdxReg, 0); + addq(sum, raxReg); + adcq(rdxReg, 0); + + // carry:sum = rdx:sum + movq(carry, rdxReg); +} + +/** + * Add 64 bit long carry into z[] with carry propogation. + * Preserves z and carry register values and modifies rest of registers. + * + */ +void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) { + Label L_fourth_loop, L_fourth_loop_exit; + + movl(tmp1, 1); + subl(zlen, 2); + addq(Address(z, zlen, Address::times_4, 0), carry); + + bind(L_fourth_loop); + jccb(Assembler::carryClear, L_fourth_loop_exit); + subl(zlen, 2); + jccb(Assembler::negative, L_fourth_loop_exit); + addq(Address(z, zlen, Address::times_4, 0), tmp1); + jmp(L_fourth_loop); + bind(L_fourth_loop_exit); +} + +/** + * Shift z[] left by 1 bit. + * Preserves x, len, z and zlen registers and modifies rest of the registers. + * + */ +void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { + + Label L_fifth_loop, L_fifth_loop_exit; + + // Fifth loop + // Perform primitiveLeftShift(z, zlen, 1) + + const Register prev_carry = tmp1; + const Register new_carry = tmp4; + const Register value = tmp2; + const Register zidx = tmp3; + + // int zidx, carry; + // long value; + // carry = 0; + // for (zidx = zlen-2; zidx >=0; zidx -= 2) { + // (carry:value) = (z[i] << 1) | carry ; + // z[i] = value; + // } + + movl(zidx, zlen); + xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register + + bind(L_fifth_loop); + decl(zidx); // Use decl to preserve carry flag + decl(zidx); + jccb(Assembler::negative, L_fifth_loop_exit); + + if (UseBMI2Instructions) { + movq(value, Address(z, zidx, Address::times_4, 0)); + rclq(value, 1); + rorxq(value, value, 32); + movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form + } + else { + // clear new_carry + xorl(new_carry, new_carry); + + // Shift z[i] by 1, or in previous carry and save new carry + movq(value, Address(z, zidx, Address::times_4, 0)); + shlq(value, 1); + adcl(new_carry, 0); + + orq(value, prev_carry); + rorq(value, 0x20); + movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form + + // Set previous carry = new carry + movl(prev_carry, new_carry); + } + jmp(L_fifth_loop); + + bind(L_fifth_loop_exit); +} + + +/** + * Code for BigInteger::squareToLen() intrinsic + * + * rdi: x + * rsi: len + * r8: z + * rcx: zlen + * r12: tmp1 + * r13: tmp2 + * r14: tmp3 + * r15: tmp4 + * rbx: tmp5 + * + */ +void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { + + Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply; + push(tmp1); + push(tmp2); + push(tmp3); + push(tmp4); + push(tmp5); + + // First loop + // Store the squares, right shifted one bit (i.e., divided by 2). + square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg); + + // Add in off-diagonal sums. + // + // Second, third (nested) and fourth loops. + // zlen +=2; + // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) { + // carry = 0; + // long op2 = x[xidx:xidx+1]; + // for (int j=xidx-2,k=zidx; j >= 0; j-=2) { + // k -= 2; + // long op1 = x[j:j+1]; + // long sum = z[k:k+1]; + // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs); + // z[k:k+1] = sum; + // } + // add_one_64(z, k, carry, tmp_regs); + // } + + + const Register carry = tmp5; + const Register sum = tmp3; + const Register op1 = tmp4; + Register op2 = tmp2; + + push(zlen); + push(len); + addl(zlen,2); + bind(L_second_loop); + xorq(carry, carry); + subl(zlen, 4); + subl(len, 2); + push(zlen); + push(len); + cmpl(len, 0); + jccb(Assembler::lessEqual, L_second_loop_exit); + + // Multiply an array by one 64 bit long. + if (UseBMI2Instructions) { + op2 = rdxReg; + movq(op2, Address(x, len, Address::times_4, 0)); + rorxq(op2, op2, 32); + } + else { + movq(op2, Address(x, len, Address::times_4, 0)); + rorq(op2, 32); + } + + bind(L_third_loop); + decrementl(len); + jccb(Assembler::negative, L_third_loop_exit); + decrementl(len); + jccb(Assembler::negative, L_last_x); + + movq(op1, Address(x, len, Address::times_4, 0)); + rorq(op1, 32); + + bind(L_multiply); + subl(zlen, 2); + movq(sum, Address(z, zlen, Address::times_4, 0)); + + // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry. + if (UseBMI2Instructions) { + multiply_add_64_bmi2(sum, op1, op2, carry, tmp2); + } + else { + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); + } + + movq(Address(z, zlen, Address::times_4, 0), sum); + + jmp(L_third_loop); + bind(L_third_loop_exit); + + // Fourth loop + // Add 64 bit long carry into z with carry propogation. + // Uses offsetted zlen. + add_one_64(z, zlen, carry, tmp1); + + pop(len); + pop(zlen); + jmp(L_second_loop); + + // Next infrequent code is moved outside loops. + bind(L_last_x); + movl(op1, Address(x, 0)); + jmp(L_multiply); + + bind(L_second_loop_exit); + pop(len); + pop(zlen); + pop(len); + pop(zlen); + + // Fifth loop + // Shift z left 1 bit. + lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4); + + // z[zlen-1] |= x[len-1] & 1; + movl(tmp3, Address(x, len, Address::times_4, -4)); + andl(tmp3, 1); + orl(Address(z, zlen, Address::times_4, -4), tmp3); + + pop(tmp5); + pop(tmp4); + pop(tmp3); + pop(tmp2); + pop(tmp1); +} + +/** + * Helper function for mul_add() + * Multiply the in[] by int k and add to out[] starting at offset offs using + * 128 bit by 32 bit multiply and return the carry in tmp5. + * Only quad int aligned length of in[] is operated on in this function. + * k is in rdxReg for BMI2Instructions, for others it is in tmp2. + * This function preserves out, in and k registers. + * len and offset point to the appropriate index in "in" & "out" correspondingly + * tmp5 has the carry. + * other registers are temporary and are modified. + * + */ +void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, + Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, + Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { + + Label L_first_loop, L_first_loop_exit; + + movl(tmp1, len); + shrl(tmp1, 2); + + bind(L_first_loop); + subl(tmp1, 1); + jccb(Assembler::negative, L_first_loop_exit); + + subl(len, 4); + subl(offset, 4); + + Register op2 = tmp2; + const Register sum = tmp3; + const Register op1 = tmp4; + const Register carry = tmp5; + + if (UseBMI2Instructions) { + op2 = rdxReg; + } + + movq(op1, Address(in, len, Address::times_4, 8)); + rorq(op1, 32); + movq(sum, Address(out, offset, Address::times_4, 8)); + rorq(sum, 32); + if (UseBMI2Instructions) { + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); + } + else { + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); + } + // Store back in big endian from little endian + rorq(sum, 0x20); + movq(Address(out, offset, Address::times_4, 8), sum); + + movq(op1, Address(in, len, Address::times_4, 0)); + rorq(op1, 32); + movq(sum, Address(out, offset, Address::times_4, 0)); + rorq(sum, 32); + if (UseBMI2Instructions) { + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); + } + else { + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); + } + // Store back in big endian from little endian + rorq(sum, 0x20); + movq(Address(out, offset, Address::times_4, 0), sum); + + jmp(L_first_loop); + bind(L_first_loop_exit); +} + +/** + * Code for BigInteger::mulAdd() intrinsic + * + * rdi: out + * rsi: in + * r11: offs (out.length - offset) + * rcx: len + * r8: k + * r12: tmp1 + * r13: tmp2 + * r14: tmp3 + * r15: tmp4 + * rbx: tmp5 + * Multiply the in[] by word k and add to out[], return the carry in rax + */ +void MacroAssembler::mul_add(Register out, Register in, Register offs, + Register len, Register k, Register tmp1, Register tmp2, Register tmp3, + Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { + + Label L_carry, L_last_in, L_done; + +// carry = 0; +// for (int j=len-1; j >= 0; j--) { +// long product = (in[j] & LONG_MASK) * kLong + +// (out[offs] & LONG_MASK) + carry; +// out[offs--] = (int)product; +// carry = product >>> 32; +// } +// + push(tmp1); + push(tmp2); + push(tmp3); + push(tmp4); + push(tmp5); + + Register op2 = tmp2; + const Register sum = tmp3; + const Register op1 = tmp4; + const Register carry = tmp5; + + if (UseBMI2Instructions) { + op2 = rdxReg; + movl(op2, k); + } + else { + movl(op2, k); + } + + xorq(carry, carry); + + //First loop + + //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply + //The carry is in tmp5 + mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); + + //Multiply the trailing in[] entry using 64 bit by 32 bit, if any + decrementl(len); + jccb(Assembler::negative, L_carry); + decrementl(len); + jccb(Assembler::negative, L_last_in); + + movq(op1, Address(in, len, Address::times_4, 0)); + rorq(op1, 32); + + subl(offs, 2); + movq(sum, Address(out, offs, Address::times_4, 0)); + rorq(sum, 32); + + if (UseBMI2Instructions) { + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); + } + else { + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); + } + + // Store back in big endian from little endian + rorq(sum, 0x20); + movq(Address(out, offs, Address::times_4, 0), sum); + + testl(len, len); + jccb(Assembler::zero, L_carry); + + //Multiply the last in[] entry, if any + bind(L_last_in); + movl(op1, Address(in, 0)); + movl(sum, Address(out, offs, Address::times_4, -4)); + + movl(raxReg, k); + mull(op1); //tmp4 * eax -> edx:eax + addl(sum, carry); + adcl(rdxReg, 0); + addl(sum, raxReg); + adcl(rdxReg, 0); + movl(carry, rdxReg); + + movl(Address(out, offs, Address::times_4, -4), sum); + + bind(L_carry); + //return tmp5/carry as carry in rax + movl(rax, carry); + + bind(L_done); + pop(tmp5); + pop(tmp4); + pop(tmp3); + pop(tmp2); + pop(tmp1); +} #endif /**