src/cpu/x86/vm/macroAssembler_x86.cpp
Index Unified diffs Context diffs Sdiffs Wdiffs Patch New Old Previous File Next File
*** old/src/cpu/x86/vm/macroAssembler_x86.cpp	Fri Aug 29 19:42:09 2014
--- new/src/cpu/x86/vm/macroAssembler_x86.cpp	Fri Aug 29 19:42:09 2014

*** 7291,7300 ****
--- 7291,7761 ----
    bind(L_copy_1_char_exit);
    addptr(result, len); // len is negative count of not processed elements
    bind(L_done);
  }
  
+ #ifdef _LP64
+ /**
+  * Helper for multiply_to_len().
+  *
+  * Emits code that adds the 64-bit values in src1 and src2 to the 128-bit
+  * quantity held in dest_hi:dest_lo. Each addq into dest_lo is immediately
+  * followed by an adcq of 0 into dest_hi, so the carry out of the low half
+  * is folded into the high half before the next addition overwrites the
+  * carry flag.
+  */
+ void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
+   addq(dest_lo, src1);  // dest_lo += src1, carry flag = carry out
+   adcq(dest_hi, 0);     // dest_hi += carry flag
+   addq(dest_lo, src2);  // dest_lo += src2, carry flag = carry out
+   adcq(dest_hi, 0);     // dest_hi += carry flag
+ }
+ 
+ /**
+  * Multiply 64 bit by 64 bit first loop.
+  *
+  * Emits the first pass of multiply_to_len(): one multiplier taken from x is
+  * multiplied by successive parts of y and the running result is stored
+  * into z. x, y and z are addressed as arrays of 32-bit words (scale
+  * times_4). Where two words are available they are loaded with a single
+  * 64-bit move and rotated by 32 bits, which swaps the two words so that the
+  * one at the lower index ends up in the upper half of the register.
+  *
+  * x, y, z   - base addresses of the two inputs and of the result
+  * xstart    - word index into x; decremented here, and when that makes it
+  *             negative only the single word x[0] is used as multiplier
+  * x_xstart  - receives the multiplier loaded from x
+  * y_idx     - scratch, receives the current part of y
+  * carry     - running carry, read and updated by the loop; it is not
+  *             initialized here
+  * product   - low half of the mulq result; the comments below assume this
+  *             register is rax, the implicit operand of mulq
+  * idx       - number of words of y still to be processed (counts down)
+  * kdx       - word index into z for the next store (counts down by 2)
+  *
+  * rdx is overwritten by mulq. The final "z[xstart] = carry" of the
+  * pseudo-code is not emitted by this function.
+  */
+ void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                                            Register y, Register y_idx, Register z,
+                                            Register carry, Register product,
+                                            Register idx, Register kdx) {
+   //
+   //  jlong carry, x[], y[], z[];
+   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+   //    huge_128 product = y[idx] * x[xstart] + carry;
+   //    z[kdx] = (jlong)product;
+   //    carry  = (jlong)(product >>> 64);
+   //  }
+   //  z[xstart] = carry;
+   //
+ 
+   Label L_first_loop, L_first_loop_exit;
+   Label L_one_x, L_one_y, L_multiply;
+ 
+   decrementl(xstart);                 // step back one more word so that a 64-bit load covers two words of x
+   jcc(Assembler::negative, L_one_x);  // only one word of x available
+ 
+   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
+   rorq(x_xstart, 32); // convert big-endian to little-endian
+ 
+   bind(L_first_loop);
+   decrementl(idx);
+   jcc(Assembler::negative, L_first_loop_exit);  // no words of y left
+   decrementl(idx);
+   jcc(Assembler::negative, L_one_y);            // exactly one word of y left
+   movq(y_idx, Address(y, idx, Address::times_4,  0));
+   rorq(y_idx, 32); // convert big-endian to little-endian
+   bind(L_multiply);
+   movq(product, x_xstart);
+   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
+   addq(product, carry);  // add the carry of the previous step to the low half ...
+   adcq(rdx, 0);          // ... and propagate the carry out into the high half
+   subl(kdx, 2);
+   movl(Address(z, kdx, Address::times_4,  4), product);  // low 32 bits go to the higher index
+   shrq(product, 32);
+   movl(Address(z, kdx, Address::times_4,  0), product);  // high 32 bits go to the lower index
+   movq(carry, rdx);      // upper 64 bits become the next carry
+   jmp(L_first_loop);
+ 
+   bind(L_one_y);
+   movl(y_idx, Address(y,  0));     // 32-bit load of the single remaining word y[0]
+   jmp(L_multiply);
+ 
+   bind(L_one_x);
+   movl(x_xstart, Address(x,  0));  // 32-bit load of the single word x[0]
+   jmp(L_first_loop);
+ 
+   bind(L_first_loop_exit);
+ }
+ 
+ /**
+  * Multiply 64 bit by 64 bit and add 128 bit.
+  *
+  * Emits one multiply-accumulate step. A 64-bit value is loaded from y and
+  * another from z, both at word index idx (scale times_4) plus the
+  * displacement offset, and each is rotated by 32 bits to swap its two
+  * 32-bit words. The code then forms
+  *
+  *     rdx:product = y_value * x_xstart + z_value + carry
+  *
+  * and writes the low 64 bits back to the same location in z as two 32-bit
+  * stores. The high 64 bits are left in rdx; this function does not copy
+  * them anywhere.
+  *
+  * yz_idx is scratch. product receives the low half of the mulq result; the
+  * comments below assume it is rax, the implicit operand of mulq. product is
+  * shifted right by 32 on exit.
+  */
+ void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
+                                             Register yz_idx, Register idx,
+                                             Register carry, Register product, int offset) {
+   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
+   //     z[kdx] = (jlong)product;
+ 
+   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
+   rorq(yz_idx, 32); // convert big-endian to little-endian
+   movq(product, x_xstart);
+   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
+   movq(yz_idx, Address(z, idx, Address::times_4,  offset));  // yz_idx is reused for the z value
+   rorq(yz_idx, 32); // convert big-endian to little-endian
+ 
+   add2_with_carry(rdx, product, carry, yz_idx);  // rdx:product += carry + z value
+ 
+   movl(Address(z, idx, Address::times_4,  offset+4), product);  // low 32 bits go to the higher address
+   shrq(product, 32);
+   movl(Address(z, idx, Address::times_4,  offset), product);    // high 32 bits go to the lower address
+ 
+ }
+ 
+ /**
+  * Multiply 128 bit by 128 bit. Unrolled inner loop.
+  *
+  * Emits the inner ("third") loop for the non-BMI2 case: the multiplier in
+  * x_xstart is multiplied by the words of y and accumulated into z, which
+  * is indexed with the same word index as y.
+  *
+  * The main loop runs idx / 4 times (the count is kept in jdx) and handles
+  * four 32-bit words per iteration as two 64-bit multiply-accumulate steps
+  * chained through carry2 and carry. The remaining idx % 4 words are handled
+  * after the loop: one more 64-bit step when at least two words remain, and
+  * a 32-bit step when the remainder is odd.
+  *
+  * x_xstart  - multiplier
+  * y, z      - base addresses of the multiplicand and the accumulator
+  * yz_idx    - scratch for values loaded from y and z
+  * idx       - number of words of y to process; modified
+  * jdx       - scratch, iteration counter of the unrolled loop
+  * carry     - carry in/out; it is not initialized here
+  * product   - low half of the mulq results; the comments assume rax
+  * carry2    - scratch, carry between the two halves of one iteration
+  *
+  * rdx is overwritten by mulq.
+  */
+ void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
+                                              Register yz_idx, Register idx, Register jdx,
+                                              Register carry, Register product,
+                                              Register carry2) {
+   //   jlong carry, x[], y[], z[];
+   //   int kdx = ystart+1;
+   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
+   //     z[kdx+idx+1] = (jlong)product;
+   //     jlong carry2  = (jlong)(product >>> 64);
+   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
+   //     z[kdx+idx] = (jlong)product;
+   //     carry  = (jlong)(product >>> 64);
+   //   }
+   //   idx += 2;
+   //   if (idx > 0) {
+   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
+   //     z[kdx+idx] = (jlong)product;
+   //     carry  = (jlong)(product >>> 64);
+   //   }
+   //
+ 
+   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+ 
+   movl(jdx, idx);
+   andl(jdx, 0xFFFFFFFC);
+   shrl(jdx, 2);           // jdx = idx / 4: number of unrolled iterations
+ 
+   bind(L_third_loop);
+   subl(jdx, 1);
+   jcc(Assembler::negative, L_third_loop_exit);
+   subl(idx, 4);           // each iteration consumes four 32-bit words
+ 
+   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);   // words idx+2, idx+3
+   movq(carry2, rdx);      // high 64 bits feed the next step
+ 
+   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);  // words idx, idx+1
+   movq(carry, rdx);       // high 64 bits are the carry into the next iteration
+   jmp(L_third_loop);
+ 
+   bind (L_third_loop_exit);
+ 
+   andl (idx, 0x3);        // words left over after the unrolled loop
+   jcc(Assembler::zero, L_post_third_loop_done);
+ 
+   Label L_check_1;
+   subl(idx, 2);
+   jcc(Assembler::negative, L_check_1);  // fewer than two words left
+ 
+   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);   // one more 64-bit step
+   movq(carry, rdx);
+ 
+   bind (L_check_1);
+   addl (idx, 0x2);        // undo the subtraction above
+   andl (idx, 0x1);        // 1 if a single word is still left
+   subl(idx, 1);           // idx = 0 if a word is left, otherwise negative
+   jcc(Assembler::negative, L_post_third_loop_done);
+ 
+   movl(yz_idx, Address(y, idx, Address::times_4,  0));  // 32-bit load of the last word of y
+   movq(product, x_xstart);
+   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
+   movl(yz_idx, Address(z, idx, Address::times_4,  0));  // 32-bit load of the matching word of z
+ 
+   add2_with_carry(rdx, product, yz_idx, carry);  // rdx:product += z word + carry
+ 
+   movl(Address(z, idx, Address::times_4,  0), product);  // store the low 32 bits of the result
+   shrq(product, 32);
+ 
+   shlq(rdx, 32);
+   orq(product, rdx);      // product = (rdx << 32) | (product >> 32)
+   movq(carry, product);   // everything above the stored 32 bits becomes the carry
+ 
+   bind(L_post_third_loop_done);
+ }
+ 
+ /**
+  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
+  *
+  * BMI2 variant of the inner ("third") loop. The multiplier is not passed as
+  * a parameter: mulxq takes it implicitly from rdx, which has to be loaded
+  * before this code runs. rorxq and mulxq are used for the rotate and the
+  * multiply.
+  *
+  * The main loop runs idx / 4 times (the count is kept in jdx) and handles
+  * four 32-bit words of y and z per iteration as two 64-bit products. When
+  * VM_Version::supports_adx() is true the additions are done with
+  * adcxq/adoxq, otherwise with add2_with_carry(). The remaining idx % 4
+  * words are handled after the loop with a 64-bit step and/or a 32-bit step.
+  *
+  * y, z             - base addresses of the multiplicand and the accumulator
+  * carry            - carry in/out; it is not initialized here
+  * carry2           - scratch, high half of the second product
+  * idx              - number of words of y to process; modified
+  * jdx              - scratch, iteration counter of the unrolled loop
+  * yz_idx1, yz_idx2 - scratch for values loaded from y and z
+  * tmp, tmp3, tmp4  - scratch for the partial products
+  *
+  * NOTE(review): the adcxq/adoxq sequence has no explicit flag clearing in
+  * front of it; it appears to rely on the carry and overflow flags being
+  * clear after the preceding subl(idx, 4) - confirm.
+  */
+ void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
+                                                   Register carry, Register carry2,
+                                                   Register idx, Register jdx,
+                                                   Register yz_idx1, Register yz_idx2,
+                                                   Register tmp, Register tmp3, Register tmp4) {
+   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
+ 
+   //   jlong carry, x[], y[], z[];
+   //   int kdx = ystart+1;
+   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
+   //     jlong carry2  = (jlong)(tmp3 >>> 64);
+   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
+   //     carry  = (jlong)(tmp4 >>> 64);
+   //     z[kdx+idx+1] = (jlong)tmp3;
+   //     z[kdx+idx] = (jlong)tmp4;
+   //   }
+   //   idx += 2;
+   //   if (idx > 0) {
+   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
+   //     z[kdx+idx] = (jlong)yz_idx1;
+   //     carry  = (jlong)(yz_idx1 >>> 64);
+   //   }
+   //
+ 
+   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+ 
+   movl(jdx, idx);
+   andl(jdx, 0xFFFFFFFC);
+   shrl(jdx, 2);           // jdx = idx / 4: number of unrolled iterations
+ 
+   bind(L_third_loop);
+   subl(jdx, 1);
+   jcc(Assembler::negative, L_third_loop_exit);
+   subl(idx, 4);           // each iteration consumes four 32-bit words
+ 
+   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
+   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
+   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
+   rorxq(yz_idx2, yz_idx2, 32); // same word swap for the second value
+ 
+   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
+   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
+ 
+   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));  // yz_idx1/yz_idx2 are reused for the z values
+   rorxq(yz_idx1, yz_idx1, 32);
+   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
+   rorxq(yz_idx2, yz_idx2, 32);
+ 
+   if (VM_Version::supports_adx()) {
+     adcxq(tmp3, carry);     // carry-flag chain: low word += incoming carry
+     adoxq(tmp3, yz_idx1);   // overflow-flag chain: low word += z value
+ 
+     adcxq(tmp4, tmp);       // second word += low half of second product (+ carry flag)
+     adoxq(tmp4, yz_idx2);   // second word += z value (+ overflow flag)
+ 
+     movl(carry, 0); // does not affect flags
+     adcxq(carry2, carry);   // fold the remaining carry flag into carry2
+     adoxq(carry2, carry);   // fold the remaining overflow flag into carry2
+   } else {
+     add2_with_carry(tmp4, tmp3, carry, yz_idx1);   // tmp4:tmp3 += carry + z value
+     add2_with_carry(carry2, tmp4, tmp, yz_idx2);   // carry2:tmp4 += tmp + z value
+   }
+   movq(carry, carry2);      // carry into the next iteration
+ 
+   movl(Address(z, idx, Address::times_4, 12), tmp3);  // tmp3 is stored to words idx+3 (low half) ...
+   shrq(tmp3, 32);
+   movl(Address(z, idx, Address::times_4,  8), tmp3);  // ... and idx+2 (high half)
+ 
+   movl(Address(z, idx, Address::times_4,  4), tmp4);  // tmp4 is stored to words idx+1 (low half) ...
+   shrq(tmp4, 32);
+   movl(Address(z, idx, Address::times_4,  0), tmp4);  // ... and idx (high half)
+ 
+   jmp(L_third_loop);
+ 
+   bind (L_third_loop_exit);
+ 
+   andl (idx, 0x3);        // words left over after the unrolled loop
+   jcc(Assembler::zero, L_post_third_loop_done);
+ 
+   Label L_check_1;
+   subl(idx, 2);
+   jcc(Assembler::negative, L_check_1);  // fewer than two words left
+ 
+   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
+   rorxq(yz_idx1, yz_idx1, 32);
+   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
+   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
+   rorxq(yz_idx2, yz_idx2, 32);
+ 
+   add2_with_carry(tmp4, tmp3, carry, yz_idx2);   // tmp4:tmp3 += carry + z value
+ 
+   movl(Address(z, idx, Address::times_4,  4), tmp3);
+   shrq(tmp3, 32);
+   movl(Address(z, idx, Address::times_4,  0), tmp3);
+   movq(carry, tmp4);      // high 64 bits become the carry
+ 
+   bind (L_check_1);
+   addl (idx, 0x2);        // undo the subtraction above
+   andl (idx, 0x1);        // 1 if a single word is still left
+   subl(idx, 1);           // idx = 0 if a word is left, otherwise negative
+   jcc(Assembler::negative, L_post_third_loop_done);
+   movl(tmp4, Address(y, idx, Address::times_4,  0));  // 32-bit load of the last word of y
+   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
+   movl(tmp4, Address(z, idx, Address::times_4,  0));  // 32-bit load of the matching word of z
+ 
+   add2_with_carry(carry2, tmp3, tmp4, carry);    // carry2:tmp3 += z word + carry
+ 
+   movl(Address(z, idx, Address::times_4,  0), tmp3);  // store the low 32 bits of the result
+   shrq(tmp3, 32);
+ 
+   shlq(carry2, 32);
+   orq(tmp3, carry2);      // tmp3 = (carry2 << 32) | (tmp3 >> 32)
+   movq(carry, tmp3);      // everything above the stored 32 bits becomes the carry
+ 
+   bind(L_post_third_loop_done);
+ }
+ 
+ /**
+  * Code for BigInteger::multiplyToLen() intrinsic.
+  *
+  * rdi: x
+  * rax: xlen
+  * rsi: y
+  * rcx: ylen
+  * r8:  z
+  * r11: zlen
+  * r12: tmp1
+  * r13: tmp2
+  * r14: tmp3
+  * r15: tmp4
+  * rbx: tmp5
+  *
+  * Emits the schoolbook multiplication described by the pseudo-code in the
+  * body: a first loop that multiplies one part of x by all of y into z,
+  * followed by an outer ("second") loop over the remaining parts of x whose
+  * inner ("third") loop multiplies and accumulates into z. x, y and z are
+  * addressed as arrays of 32-bit words; the emitted code consumes two words
+  * at a time where possible.
+  *
+  * tmp1..tmp5, xlen and zlen are pushed on entry and popped again at L_done.
+  * xlen and zlen are reused as the product and x_xstart registers in
+  * between. x, xstart and ylen are saved around the inner loop because they
+  * are handed to it as scratch registers or overwritten by it. rdx is used
+  * implicitly by the multiply instructions and is not saved here, which is
+  * why it is part of the assert_different_registers() check.
+  */
+ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
+                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
+   ShortBranchVerifier sbv(this);
+   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
+ 
+   push(tmp1);
+   push(tmp2);
+   push(tmp3);
+   push(tmp4);
+   push(tmp5);
+ 
+   push(xlen);   // xlen is reused as 'product' below
+   push(zlen);   // zlen is reused as 'x_xstart' below
+ 
+   const Register idx = tmp1;
+   const Register kdx = tmp2;
+   const Register xstart = tmp3;
+ 
+   const Register y_idx = tmp4;
+   const Register carry = tmp5;
+   const Register product  = xlen;
+   const Register x_xstart = zlen;  // reuse register
+ 
+   // First Loop.
+   //
+   //  final static long LONG_MASK = 0xffffffffL;
+   //  int xstart = xlen - 1;
+   //  int ystart = ylen - 1;
+   //  long carry = 0;
+   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
+   //    z[kdx] = (int)product;
+   //    carry = product >>> 32;
+   //  }
+   //  z[xstart] = (int)carry;
+   //
+ 
+   movl(idx, ylen);      // idx = ylen;
+   movl(kdx, zlen);      // kdx = xlen+ylen;
+   xorq(carry, carry);   // carry = 0;
+ 
+   Label L_done;
+ 
+   movl(xstart, xlen);
+   decrementl(xstart);                // xstart = xlen - 1
+   jcc(Assembler::negative, L_done);  // nothing to do when xlen - 1 is negative
+ 
+   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
+ 
+   Label L_second_loop;
+   testl(kdx, kdx);
+   jcc(Assembler::zero, L_second_loop);  // no slot left in z for the carry
+ 
+   Label L_carry;
+   subl(kdx, 1);
+   jcc(Assembler::zero, L_carry);        // one slot left: store only the low 32 bits of carry
+ 
+   movl(Address(z, kdx, Address::times_4,  0), carry);  // low 32 bits of carry
+   shrq(carry, 32);
+   subl(kdx, 1);
+ 
+   bind(L_carry);
+   movl(Address(z, kdx, Address::times_4,  0), carry);  // remaining 32 bits of carry
+ 
+   // Second and third (nested) loops.
+   //
+   // for (int i = xstart-1; i >= 0; i--) { // Second loop
+   //   carry = 0;
+   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
+   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
+   //                    (z[k] & LONG_MASK) + carry;
+   //     z[k] = (int)product;
+   //     carry = product >>> 32;
+   //   }
+   //   z[i] = (int)carry;
+   // }
+   //
+   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
+ 
+   const Register jdx = tmp1;
+ 
+   bind(L_second_loop);
+   xorl(carry, carry);    // carry = 0;
+   movl(jdx, ylen);       // j = ystart+1
+ 
+   subl(xstart, 1);       // i = xstart-1;
+   jcc(Assembler::negative, L_done);
+ 
+   push (z);              // z is advanced for the inner loop and restored afterwards
+ 
+   Label L_last_x;
+   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
+   subl(xstart, 1);       // i = xstart-1;
+   jcc(Assembler::negative, L_last_x);  // only the single word x[0] is left
+ 
+   if (UseBMI2Instructions) {
+     movq(rdx,  Address(x, xstart, Address::times_4,  0));  // mulxq takes the multiplier from rdx
+     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
+   } else {
+     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
+     rorq(x_xstart, 32);  // convert big-endian to little-endian
+   }
+ 
+   Label L_third_loop_prologue;
+   bind(L_third_loop_prologue);
+ 
+   push (x);        // x is passed to the inner loop as the carry2 scratch register
+   push (xstart);   // saved here, popped into xlen after the inner loop
+   push (ylen);     // ylen is passed to the inner loop as its jdx counter
+ 
+ 
+   if (UseBMI2Instructions) {
+     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
+   } else { // !UseBMI2Instructions
+     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
+   }
+ 
+   pop(ylen);
+   pop(xlen);       // receives the value pushed from xstart above
+   pop(x);
+   pop(z);          // back to the z saved before the lea
+ 
+   movl(tmp3, xlen);  // tmp3 (xstart) = saved index
+   addl(tmp3, 1);
+   movl(Address(z, tmp3, Address::times_4,  0), carry);  // z[xstart+1] = low 32 bits of carry
+   subl(tmp3, 1);
+   jccb(Assembler::negative, L_done);   // no lower slot in z: finished
+ 
+   shrq(carry, 32);
+   movl(Address(z, tmp3, Address::times_4,  0), carry);  // z[xstart] = next 32 bits of carry
+   jmp(L_second_loop);
+ 
+   // Next infrequent code is moved outside loops.
+   bind(L_last_x);
+   if (UseBMI2Instructions) {
+     movl(rdx, Address(x,  0));        // 32-bit load of the single word x[0]
+   } else {
+     movl(x_xstart, Address(x,  0));   // 32-bit load of the single word x[0]
+   }
+   jmp(L_third_loop_prologue);
+ 
+   bind(L_done);
+ 
+   pop(zlen);       // restore the registers saved on entry, in reverse order
+   pop(xlen);
+ 
+   pop(tmp5);
+   pop(tmp4);
+   pop(tmp3);
+   pop(tmp2);
+   pop(tmp1);
+ }
+ #endif
+ 
  /**
   * Emits code to update CRC-32 with a byte value according to constants in table
   *
   * @param [in,out]crc   Register containing the crc.
   * @param [in]val       Register containing the byte to fold into the CRC.
src/cpu/x86/vm/macroAssembler_x86.cpp
Index Unified diffs Context diffs Sdiffs Wdiffs Patch New Old Previous File Next File