
src/cpu/ppc/vm/macroAssembler_ppc.cpp

rev 8631 : 8130654: ppc: implement MultiplyToLen intrinsic
Contributed-by: Peter.Januschke@sap.com

*** 3431,3440 **** --- 3431,3810 ----
  }
  li(result_reg, 1);
  bind(Ldone_false);
}
+ // dest_lo += src1 + src2
+ // dest_hi += carry1 + carry2
+ void MacroAssembler::add2_with_carry(Register dest_hi,
+                                      Register dest_lo,
+                                      Register src1, Register src2) {
+   li(R0, 0);
+   addc(dest_lo, dest_lo, src1);
+   adde(dest_hi, dest_hi, R0);
+   addc(dest_lo, dest_lo, src2);
+   adde(dest_hi, dest_hi, R0);
+ }
+
+ // Multiply 64 bit by 64 bit first loop.
+ void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
+                                            Register x_xstart,
+                                            Register y, Register y_idx,
+                                            Register z,
+                                            Register carry,
+                                            Register product_high, Register product,
+                                            Register idx, Register kdx,
+                                            Register tmp) {
+   //  jlong carry, x[], y[], z[];
+   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+   //    huge_128 product = y[idx] * x[xstart] + carry;
+   //    z[kdx] = (jlong)product;
+   //    carry = (jlong)(product >>> 64);
+   //  }
+   //  z[xstart] = carry;
+
+   Label L_first_loop, L_first_loop_exit;
+   Label L_one_x, L_one_y, L_multiply;
+
+   addic_(xstart, xstart, -1);
+   blt(CCR0, L_one_x);   // Special case: length of x is 1.
+
+   // Load next two integers of x.
+   sldi(tmp, xstart, LogBytesPerInt);
+   ldx(x_xstart, x, tmp);
+ #ifdef VM_LITTLE_ENDIAN
+   rldicl(x_xstart, x_xstart, 32, 0);
+ #endif
+
+   align(32, 16);
+   bind(L_first_loop);
+
+   cmpdi(CCR0, idx, 1);
+   blt(CCR0, L_first_loop_exit);
+   addi(idx, idx, -2);
+   beq(CCR0, L_one_y);
+
+   // Load next two integers of y.
+   sldi(tmp, idx, LogBytesPerInt);
+   ldx(y_idx, y, tmp);
+ #ifdef VM_LITTLE_ENDIAN
+   rldicl(y_idx, y_idx, 32, 0);
+ #endif
+
+
+   bind(L_multiply);
+   multiply64(product_high, product, x_xstart, y_idx);
+
+   li(tmp, 0);
+   addc(product, product, carry);         // Add carry to result.
+   adde(product_high, product_high, tmp); // Add carry of the last addition.
+   addi(kdx, kdx, -2);
+
+   // Store result.
+ #ifdef VM_LITTLE_ENDIAN
+   rldicl(product, product, 32, 0);
+ #endif
+   sldi(tmp, kdx, LogBytesPerInt);
+   stdx(product, z, tmp);
+   mr_if_needed(carry, product_high);
+   b(L_first_loop);
+
+
+   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
+
+   lwz(y_idx, 0, y);
+   b(L_multiply);
+
+
+   bind( L_one_x ); // Load one 32 bit portion of x as (0,value).
+
+   lwz(x_xstart, 0, x);
+   b(L_first_loop);
+
+   bind(L_first_loop_exit);
+ }
+
+ // Multiply 64 bit by 64 bit and add 128 bit.
+ void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
+                                             Register z, Register yz_idx,
+                                             Register idx, Register carry,
+                                             Register product_high, Register product,
+                                             Register tmp, int offset) {
+
+   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
+   //  z[kdx] = (jlong)product;
+
+   sldi(tmp, idx, LogBytesPerInt);
+   if ( offset ) {
+     addi(tmp, tmp, offset);
+   }
+   ldx(yz_idx, y, tmp);
+ #ifdef VM_LITTLE_ENDIAN
+   rldicl(yz_idx, yz_idx, 32, 0);
+ #endif
+
+   multiply64(product_high, product, x_xstart, yz_idx);
+   ldx(yz_idx, z, tmp);
+ #ifdef VM_LITTLE_ENDIAN
+   rldicl(yz_idx, yz_idx, 32, 0);
+ #endif
+
+   add2_with_carry(product_high, product, carry, yz_idx);
+
+   sldi(tmp, idx, LogBytesPerInt);
+   if ( offset ) {
+     addi(tmp, tmp, offset);
+   }
+ #ifdef VM_LITTLE_ENDIAN
+   rldicl(product, product, 32, 0);
+ #endif
+   stdx(product, z, tmp);
+ }
+
+ // Multiply 128 bit by 128 bit. Unrolled inner loop.
+ void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
+                                              Register y, Register z,
+                                              Register yz_idx, Register idx, Register carry,
+                                              Register product_high, Register product,
+                                              Register carry2, Register tmp) {
+
+   //  jlong carry, x[], y[], z[];
+   //  int kdx = ystart+1;
+   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
+   //    z[kdx+idx+1] = (jlong)product;
+   //    jlong carry2 = (jlong)(product >>> 64);
+   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
+   //    z[kdx+idx] = (jlong)product;
+   //    carry = (jlong)(product >>> 64);
+   //  }
+   //  idx += 2;
+   //  if (idx > 0) {
+   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
+   //    z[kdx+idx] = (jlong)product;
+   //    carry = (jlong)(product >>> 64);
+   //  }
+
+   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+   const Register jdx = R0;
+
+   // Scale the index.
+   srdi_(jdx, idx, 2);
+   beq(CCR0, L_third_loop_exit);
+   mtctr(jdx);
+
+   align(32, 16);
+   bind(L_third_loop);
+
+   addi(idx, idx, -4);
+
+   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
+   mr_if_needed(carry2, product_high);
+
+   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
+   mr_if_needed(carry, product_high);
+   bdnz(L_third_loop);
+
+   bind(L_third_loop_exit);  // Handle any left-over operand parts.
+
+   andi_(idx, idx, 0x3);
+   beq(CCR0, L_post_third_loop_done);
+
+   Label L_check_1;
+
+   addic_(idx, idx, -2);
+   blt(CCR0, L_check_1);
+
+   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
+   mr_if_needed(carry, product_high);
+
+   bind(L_check_1);
+
+   addi(idx, idx, 0x2);
+   andi_(idx, idx, 0x1) ;
+   addic_(idx, idx, -1);
+   blt(CCR0, L_post_third_loop_done);
+
+   sldi(tmp, idx, LogBytesPerInt);
+   lwzx(yz_idx, y, tmp);
+   multiply64(product_high, product, x_xstart, yz_idx);
+   lwzx(yz_idx, z, tmp);
+
+   add2_with_carry(product_high, product, yz_idx, carry);
+
+   sldi(tmp, idx, LogBytesPerInt);
+   stwx(product, z, tmp);
+   srdi(product, product, 32);
+
+   sldi(product_high, product_high, 32);
+   orr(product, product, product_high);
+   mr_if_needed(carry, product);
+
+   bind(L_post_third_loop_done);
+ } // multiply_128_x_128_loop
+
+ void MacroAssembler::multiply_to_len(Register x, Register xlen,
+                                      Register y, Register ylen,
+                                      Register z, Register zlen,
+                                      Register tmp1, Register tmp2,
+                                      Register tmp3, Register tmp4,
+                                      Register tmp5, Register tmp6,
+                                      Register tmp7, Register tmp8,
+                                      Register tmp9, Register tmp10,
+                                      Register tmp11, Register tmp12,
+                                      Register tmp13) {
+
+   ShortBranchVerifier sbv(this);
+
+   assert_different_registers(x, xlen, y, ylen, z, zlen,
+                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
+   assert_different_registers(x, xlen, y, ylen, z, zlen,
+                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
+   assert_different_registers(x, xlen, y, ylen, z, zlen,
+                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
+
+   const Register idx = tmp1;
+   const Register kdx = tmp2;
+   const Register xstart = tmp3;
+
+   const Register y_idx = tmp4;
+   const Register carry = tmp5;
+   const Register product = tmp6;
+   const Register product_high = tmp7;
+   const Register x_xstart = tmp8;
+   const Register tmp = tmp9;
+
+   // First Loop.
+   //
+   //  final static long LONG_MASK = 0xffffffffL;
+   //  int xstart = xlen - 1;
+   //  int ystart = ylen - 1;
+   //  long carry = 0;
+   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
+   //    z[kdx] = (int)product;
+   //    carry = product >>> 32;
+   //  }
+   //  z[xstart] = (int)carry;
+
+   mr_if_needed(idx, ylen);        // idx = ylen
+   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
+   li(carry, 0);                   // carry = 0
+
+   Label L_done;
+
+   addic_(xstart, xlen, -1);
+   blt(CCR0, L_done);
+
+   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
+                         carry, product_high, product, idx, kdx, tmp);
+
+   Label L_second_loop;
+
+   cmpdi(CCR0, kdx, 0);
+   beq(CCR0, L_second_loop);
+
+   Label L_carry;
+
+   addic_(kdx, kdx, -1);
+   beq(CCR0, L_carry);
+
+   // Store lower 32 bits of carry.
+   sldi(tmp, kdx, LogBytesPerInt);
+   stwx(carry, z, tmp);
+   srdi(carry, carry, 32);
+   addi(kdx, kdx, -1);
+
+
+   bind(L_carry);
+
+   // Store upper 32 bits of carry.
+   sldi(tmp, kdx, LogBytesPerInt);
+   stwx(carry, z, tmp);
+
+   // Second and third (nested) loops.
+   //
+   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
+   //    carry = 0;
+   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
+   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
+   //                     (z[k] & LONG_MASK) + carry;
+   //      z[k] = (int)product;
+   //      carry = product >>> 32;
+   //    }
+   //    z[i] = (int)carry;
+   //  }
+   //
+   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
+
+   bind(L_second_loop);
+
+   li(carry, 0);                   // carry = 0;
+
+   addic_(xstart, xstart, -1);     // i = xstart-1;
+   blt(CCR0, L_done);
+
+   Register zsave = tmp10;
+
+   mr(zsave, z);
+
+
+   Label L_last_x;
+
+   sldi(tmp, xstart, LogBytesPerInt);
+   add(z, z, tmp);                 // z = z + k - j
+   addi(z, z, 4);
+   addic_(xstart, xstart, -1);     // i = xstart-1;
+   blt(CCR0, L_last_x);
+
+   sldi(tmp, xstart, LogBytesPerInt);
+   ldx(x_xstart, x, tmp);
+ #ifdef VM_LITTLE_ENDIAN
+   rldicl(x_xstart, x_xstart, 32, 0);
+ #endif
+
+
+   Label L_third_loop_prologue;
+
+   bind(L_third_loop_prologue);
+
+   Register xsave = tmp11;
+   Register xlensave = tmp12;
+   Register ylensave = tmp13;
+
+   mr(xsave, x);
+   mr(xlensave, xstart);
+   mr(ylensave, ylen);
+
+
+   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
+                           carry, product_high, product, x, tmp);
+
+   mr(z, zsave);
+   mr(x, xsave);
+   mr(xlen, xlensave);   // This is the decrement of the loop counter!
+   mr(ylen, ylensave);
+
+   addi(tmp3, xlen, 1);
+   sldi(tmp, tmp3, LogBytesPerInt);
+   stwx(carry, z, tmp);
+   addic_(tmp3, tmp3, -1);
+   blt(CCR0, L_done);
+
+   srdi(carry, carry, 32);
+   sldi(tmp, tmp3, LogBytesPerInt);
+   stwx(carry, z, tmp);
+   b(L_second_loop);
+
+   // Next infrequent code is moved outside loops.
+   bind(L_last_x);
+
+   lwz(x_xstart, 0, x);
+   b(L_third_loop_prologue);
+
+   bind(L_done);
+ } // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
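Reviewer note: the following is a plain C++ sketch of the algorithm the intrinsic implements, transcribed from the pseudocode comments in the patch above (it mirrors java.math.BigInteger.multiplyToLen on 32-bit limbs stored most-significant-first). It is only a reference to check the assembly against, not HotSpot code; the function name and types here are chosen for illustration. The assembly processes limbs two at a time (64-bit loads, unrolled inner loop) and handles endianness with the VM_LITTLE_ENDIAN rotates, while this sketch works one 32-bit limb per iteration.

#include <stdint.h>

// Reference only: schoolbook multiplication on 32-bit limbs,
// most significant limb first. z must have room for xlen + ylen limbs,
// matching kdx = zlen = xlen + ylen in the assembly above.
static void multiply_to_len_reference(const uint32_t* x, int xlen,
                                      const uint32_t* y, int ylen,
                                      uint32_t* z) {
  const int xstart = xlen - 1;
  const int ystart = ylen - 1;

  // First loop: multiply y by the least significant limb x[xstart] and
  // store the ylen+1 limb result into z[xstart .. xstart+ylen].
  uint64_t carry = 0;
  for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
    uint64_t product = (uint64_t)y[idx] * x[xstart] + carry;
    z[kdx] = (uint32_t)product;
    carry  = product >> 32;
  }
  z[xstart] = (uint32_t)carry;

  // Second loop over the remaining limbs of x; the nested (third) loop
  // multiplies, adds the partial result already in z, and propagates carries.
  for (int i = xstart - 1; i >= 0; i--) {
    carry = 0;
    for (int jdx = ystart, k = ystart + 1 + i; jdx >= 0; jdx--, k--) {
      uint64_t product = (uint64_t)y[jdx] * x[i] + z[k] + carry;
      z[k]  = (uint32_t)product;
      carry = product >> 32;
    }
    z[i] = (uint32_t)carry;
  }
}

All intermediate products fit in 64 bits: (2^32-1)^2 plus two 32-bit addends is still below 2^64, which is why the assembly only needs add2_with_carry to fold z[k] and the running carry into the 128-bit product.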