hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
Index Unified diffs Context diffs Sdiffs Wdiffs Patch New Old Previous File Next File
*** old/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon May  4 15:06:48 2015
--- new/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon May  4 15:06:48 2015

*** 7746,7755 ****
--- 7746,8253 ----
    pop(tmp4);
    pop(tmp3);
    pop(tmp2);
    pop(tmp1);
  }
+ 
+ //Helper functions for square_to_len()
+ 
+ /**
+  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
+  * Preserves x and z and modifies rest of the registers.
+  *
+  * x, z   - base registers of the source and destination arrays; both are
+  *          indexed in 4-byte units (Address::times_4).
+  * xlen   - number of 4-byte elements in x[]; only read by this routine.
+  * tmp1   - running index into x[].
+  * tmp3   - not referenced by this routine.
+  * tmp4   - running index into z[]; advances two slots per slot of x[].
+  * tmp5   - the bit shifted out of the previously stored square (0 or 1).
+  * rdxReg, raxReg - receive the mulq product; the inline comments assume
+  *          these are rdx and rax.
+  *
+  * The 64-bit words are written to z[] exactly as computed; no 32-bit half
+  * swap is applied before the stores in this routine.
+  */
+ 
+ void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+   // Perform square and right shift by 1
+   // Handle odd xlen case first, then for even xlen do the following
+   // jlong carry = 0;
+   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
+   //     huge_128 product = x[j:j+1] * x[j:j+1]; 
+   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
+   //     z[i+2:i+3] = (jlong)(product >>> 1);
+   //     carry = (jlong)product;
+   // }
+ 
+   xorq(tmp5, tmp5);     // carry
+   xorq(rdxReg, rdxReg);
+   xorl(tmp1, tmp1);     // index for x
+   xorl(tmp4, tmp4);     // index for z
+  
+   Label L_first_loop, L_first_loop_exit;
+  
+   testl(xlen, 1);       // low bit of xlen set => odd number of elements
+   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
+ 
+   // Square and right shift by 1 the odd element using 32 bit multiply
+   movl(raxReg, Address(x, tmp1, Address::times_4, 0));  // 32-bit load of x[0] (tmp1 is still 0)
+   imulq(raxReg, raxReg);  // raxReg = raxReg * raxReg
+   shrq(raxReg, 1);        // halve; the bit shifted out lands in CF
+   adcq(tmp5, 0);          // tmp5 = that bit (tmp5 was 0)
+   movq(Address(z, tmp4, Address::times_4, 0), raxReg);  // 64-bit store to z[0:1]
+   incrementl(tmp1);       // one element of x consumed
+   addl(tmp4, 2);          // two elements of z produced
+  
+   // Square and  right shift by 1 the rest using 64 bit multiply
+   bind(L_first_loop);
+   cmpptr(tmp1, xlen);     // stop once the x index reaches xlen
+   jccb(Assembler::equal, L_first_loop_exit);
+ 
+   // Square
+   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
+   rorq(raxReg, 32);    // convert big-endian to little-endian
+   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
+ 
+   // Right shift by 1 and save carry
+   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 
+   rcrq(rdxReg, 1);     // previous carry bit enters the top of rdx
+   rcrq(raxReg, 1);     // low bit of rdx enters the top of rax
+   adcq(tmp5, 0);       // tmp5 (0 after the shrq) = bit shifted out of rax
+ 
+   // Store result in z
+   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);  // high 64 bits at byte offset 0
+   movq(Address(z, tmp4, Address::times_4, 8), raxReg);  // low 64 bits at byte offset 8
+ 
+   // Update indices for x and z 
+   addl(tmp1, 2);
+   addl(tmp4, 4);
+   jmp(L_first_loop);
+ 
+   bind(L_first_loop_exit);
+ }
+ 
+ 
+ /**
+  * Perform the following multiply add operation using BMI2 instructions
+  * carry:sum = sum + op1*op2 + carry 
+  * op2 should be in rdx
+  * op2 is preserved, all other registers are modified
+  *
+  * On return sum holds the low 64 bits of the result and carry the high
+  * 64 bits. op1 is overwritten with the low half of the product and tmp2
+  * is left equal to carry.
+  * op2 is never named in the emitted instructions: mulxq takes it
+  * implicitly, which is why it has to be in rdx.
+  */
+ void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
+   // assert op2 is rdx
+   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
+   addq(sum, carry);       // sum += incoming carry
+   adcq(tmp2, 0);          // fold the carry-out into the high half
+   addq(sum, op1);         // sum += low half of the product
+   adcq(tmp2, 0); 
+   movq(carry, tmp2);      // outgoing carry = accumulated high half
+ }
+ 
+ /**
+  * Perform the following multiply add operation:
+  * carry:sum = sum + op1*op2 + carry
+  * Preserves op1, op2 and modifies rest of registers
+  *
+  * On return sum holds the low 64 bits of the result and carry the high
+  * 64 bits; rdxReg is left equal to carry and raxReg holds the low half of
+  * the product.
+  * The inline comments assume raxReg/rdxReg are rax/rdx, the registers
+  * mulq uses implicitly.
+  */
+ void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
+   // rdx:rax = op1 * op2
+   movq(raxReg, op2);
+   mulq(op1); 
+ 
+   //  rdx:rax = sum + carry + rdx:rax
+   addq(sum, carry);     // sum += incoming carry
+   adcq(rdxReg, 0);      // fold the carry-out into the high half
+   addq(sum, raxReg);    // sum += low half of the product
+   adcq(rdxReg, 0);
+ 
+   // carry:sum = rdx:sum
+   movq(carry, rdxReg);
+ }
+ 
+ /**
+  * Add 64 bit long carry into z[] with carry propagation.
+  * Preserves z and carry register values and modifies rest of registers.
+  *
+  * zlen is an index into z[] in 4-byte units and is modified: it is first
+  * stepped down by 2, carry is added to the 64-bit word found there, and
+  * while that addition carries out, the index keeps stepping down by 2 and
+  * 1 is added to the next word. The walk stops when an addition produces
+  * no carry or the index goes negative.
+  * tmp1 is loaded with the constant 1 used for the propagated additions.
+  */
+ void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
+   Label L_fourth_loop, L_fourth_loop_exit;
+ 
+   movl(tmp1, 1);
+   subl(zlen, 2);
+   addq(Address(z, zlen, Address::times_4, 0), carry);  // sets CF for the test at the loop head
+ 
+   bind(L_fourth_loop);
+   jccb(Assembler::carryClear, L_fourth_loop_exit);  // tests CF from the most recent addq
+   subl(zlen, 2);
+   jccb(Assembler::negative, L_fourth_loop_exit);    // index went below 0
+   addq(Address(z, zlen, Address::times_4, 0), tmp1);  // propagate 1 into the next word
+   jmp(L_fourth_loop);
+   bind(L_fourth_loop_exit);
+ }
+ 
+ /**
+  * Shift z[] left by 1 bit.
+  * Preserves x, len, z and zlen registers and modifies rest of the registers.
+  *
+  * z[] is processed as 64-bit words from index zlen-2 down to 0 (indices in
+  * 4-byte units). Each word is loaded, shifted left by one with the bit
+  * shifted out of the previously processed word entering at bit 0, has its
+  * 32-bit halves swapped, and is stored back.
+  * x and len are not referenced by this routine.
+  * tmp1..tmp4 are scratch (tmp4 only on the non-BMI2 path).
+  */
+ void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
+ 
+   Label L_fifth_loop, L_fifth_loop_exit;
+ 
+   // Fifth loop  
+   // Perform primitiveLeftShift(z, zlen, 1)
+ 
+   const Register prev_carry = tmp1;
+   const Register new_carry = tmp4;
+   const Register value = tmp2;
+   const Register zidx = tmp3;
+ 
+   // int zidx, carry;
+   // long value;
+   // carry = 0;
+   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
+   //    (carry:value)  = (z[zidx:zidx+1] << 1) | carry ;
+   //    z[zidx:zidx+1] = value;
+   // }
+ 
+   movl(zidx, zlen);
+   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
+ 
+   bind(L_fifth_loop);
+   decl(zidx);  // Use decl to preserve carry flag
+   decl(zidx);
+   jccb(Assembler::negative, L_fifth_loop_exit);
+   
+   if (UseBMI2Instructions) {
+      // The inter-word carry lives in CF across iterations on this path.
+      movq(value, Address(z, zidx, Address::times_4, 0));
+      rclq(value, 1);           // CF enters bit 0; old bit 63 becomes the new CF
+      rorxq(value, value, 32);  // swap 32-bit halves; rorx does not write flags, so CF survives
+      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
+   }
+   else {
+     // clear new_carry
+     xorl(new_carry, new_carry);
+ 
+     // Shift z[i] by 1, or in previous carry and save new carry 
+     movq(value, Address(z, zidx, Address::times_4, 0)); 
+     shlq(value, 1);           // old bit 63 goes to CF
+     adcl(new_carry, 0);       // new_carry = CF
+ 
+     orq(value, prev_carry);   // bit 0 = carry from the previously processed word
+     rorq(value, 0x20);        // swap 32-bit halves
+     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
+ 
+     // Set previous carry = new carry
+     movl(prev_carry, new_carry);
+   }
+   jmp(L_fifth_loop);
+ 
+   bind(L_fifth_loop_exit);
+ }
+ 
+ 
+ /**
+  * Code for BigInteger::squareToLen() intrinsic
+  *
+  * rdi: x
+  * rsi: len
+  * r8:  z
+  * rcx: zlen
+  * r12: tmp1
+  * r13: tmp2
+  * r14: tmp3
+  * r15: tmp4
+  * rbx: tmp5
+  *
+  * Phases, in order:
+  *   1. square_rshift(): store the halved squares of x[] into z[].
+  *   2. Second/third/fourth loops: add the off-diagonal products into z[],
+  *      propagating each row's final carry with add_one_64().
+  *   3. lshift_by_1(): shift z[] left by one bit.
+  *   4. OR the low bit of x[len-1] into z[zlen-1].
+  *
+  * tmp1..tmp5 are pushed on entry and popped on exit. len and zlen are used
+  * as running indices in phase 2 and are restored from the stack before
+  * phase 3. rdxReg and raxReg are handed to the multiply helpers and are
+  * not saved here.
+  */
+ void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+     
+   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;  // fifth_loop and fifth_loop_exit are not referenced below
+   push(tmp1);
+   push(tmp2);
+   push(tmp3);
+   push(tmp4);
+   push(tmp5);
+ 
+   // First loop
+   // Store the squares, right shifted one bit (i.e., divided by 2).
+   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
+ 
+   // Add in off-diagonal sums.
+   //
+   // Second, third (nested) and fourth loops.
+   // zlen +=2;
+   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
+   //    carry = 0;
+   //    long op2 = x[xidx:xidx+1];
+   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
+   //       k -= 2;
+   //       long op1 = x[j:j+1];
+   //       long sum = z[k:k+1];
+   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
+   //       z[k:k+1] = sum;
+   //    }
+   //    add_one_64(z, k, carry, tmp_regs);
+   // }
+ 
+   
+   const Register carry = tmp5;
+   const Register sum = tmp3;
+   const Register op1 = tmp4;
+   Register op2 = tmp2;
+ 
+   push(zlen);           // original zlen, restored after the second loop exits
+   push(len);            // original len, restored after the second loop exits
+   addl(zlen,2);
+   bind(L_second_loop);
+   xorq(carry, carry);
+   subl(zlen, 4);
+   subl(len, 2);
+   push(zlen);           // this row's starting indices; the third loop consumes len/zlen
+   push(len);
+   cmpl(len, 0);
+   jccb(Assembler::lessEqual, L_second_loop_exit);
+ 
+   // Multiply an array by one 64 bit long.
+   if (UseBMI2Instructions) {
+     op2 = rdxReg;       // the BMI2 helper expects op2 in rdx
+     movq(op2, Address(x, len, Address::times_4,  0));
+     rorxq(op2, op2, 32);
+   }
+   else {
+     movq(op2, Address(x, len, Address::times_4,  0));
+     rorq(op2, 32);
+   }
+ 
+   bind(L_third_loop);
+   decrementl(len);
+   jccb(Assembler::negative, L_third_loop_exit);  // nothing left in x[] for this row
+   decrementl(len);
+   jccb(Assembler::negative, L_last_x);           // exactly one 4-byte element left: x[0]
+ 
+   movq(op1, Address(x, len, Address::times_4,  0));
+   rorq(op1, 32);
+   
+   bind(L_multiply);
+   subl(zlen, 2);
+   movq(sum, Address(z, zlen, Address::times_4,  0));
+ 
+   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
+   if (UseBMI2Instructions) {
+     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);  // tmp2 is free as scratch: op2 is rdxReg on this path
+   }
+   else {
+     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+   }
+ 
+   movq(Address(z, zlen, Address::times_4, 0), sum);
+     
+   jmp(L_third_loop);
+   bind(L_third_loop_exit);
+   
+   // Fourth loop
+   // Add 64 bit long carry into z with carry propogation.
+   // Uses offsetted zlen.
+   add_one_64(z, zlen, carry, tmp1);
+ 
+   pop(len);             // back to this row's starting indices
+   pop(zlen);
+   jmp(L_second_loop);
+ 
+   // Next infrequent code is moved outside loops.
+   bind(L_last_x);
+   movl(op1, Address(x, 0));  // 32-bit load of x[0]
+   jmp(L_multiply);
+   
+   bind(L_second_loop_exit);
+   pop(len);             // discard the per-row pair pushed just before the exit test
+   pop(zlen);
+   pop(len);             // restore the original len and zlen
+   pop(zlen);
+   
+   // Fifth loop
+   // Shift z left 1 bit.
+   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
+ 
+   // z[zlen-1] |= x[len-1] & 1;
+   movl(tmp3, Address(x, len, Address::times_4, -4));
+   andl(tmp3, 1);
+   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
+ 
+   pop(tmp5);
+   pop(tmp4);
+   pop(tmp3);
+   pop(tmp2);
+   pop(tmp1);
+ }
+ 
+ /**
+  * Helper function for mul_add()
+  * Multiply the in[] by int k and add to out[] starting at offset offs using 
+  * 128 bit by 32 bit multiply and return the carry in tmp5.
+  * Only quad int aligned length of in[] is operated on in this function.
+  * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
+  * This function preserves out, in and k registers. 
+  * len and offset point to the appropriate index in "in" & "out" correspondingly
+  * tmp5 has the carry.
+  * other registers are temporary and are modified.
+  * 
+  * The loop runs len >> 2 times (counter in tmp1). Each iteration steps len
+  * and offset down by 4 and processes two 64-bit words, first the one at
+  * byte offset 8 and then the one at byte offset 0, chaining the carry in
+  * tmp5 from one multiply-add to the next. tmp5 is not initialized here and
+  * is used as the incoming carry.
+  */
+ void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, 
+   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
+   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+ 
+   Label L_first_loop, L_first_loop_exit;
+ 
+   movl(tmp1, len);
+   shrl(tmp1, 2);        // tmp1 = number of 4-element groups
+ 
+   bind(L_first_loop);
+   subl(tmp1, 1);
+   jccb(Assembler::negative, L_first_loop_exit);  
+  
+   subl(len, 4);
+   subl(offset, 4);
+ 
+   Register op2 = tmp2;
+   const Register sum = tmp3;
+   const Register op1 = tmp4;
+   const Register carry = tmp5;
+ 
+   if (UseBMI2Instructions) {
+     op2 = rdxReg;
+   }
+ 
+   movq(op1, Address(in, len, Address::times_4,  8));
+   rorq(op1, 32);        // swap 32-bit halves of the loaded word
+   movq(sum, Address(out, offset, Address::times_4,  8));
+   rorq(sum, 32);
+   if (UseBMI2Instructions) {
+     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);  // raxReg serves as the helper's scratch register
+   }
+   else {
+     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+   }
+   // Store back in big endian from little endian
+   rorq(sum, 0x20);
+   movq(Address(out, offset, Address::times_4,  8), sum);
+ 
+   movq(op1, Address(in, len, Address::times_4,  0));
+   rorq(op1, 32);
+   movq(sum, Address(out, offset, Address::times_4,  0));
+   rorq(sum, 32);
+   if (UseBMI2Instructions) {
+     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
+   }
+   else {
+     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+   }
+   // Store back in big endian from little endian
+   rorq(sum, 0x20);
+   movq(Address(out, offset, Address::times_4,  0), sum);
+ 
+   jmp(L_first_loop);
+   bind(L_first_loop_exit);
+ }
+ 
+ /**
+  * Code for BigInteger::mulAdd() intrinsic
+  *
+  * rdi: out
+  * rsi: in
+  * r11: offs (out.length - offset)
+  * rcx: len
+  * r8:  k
+  * r12: tmp1
+  * r13: tmp2
+  * r14: tmp3
+  * r15: tmp4
+  * rbx: tmp5
+  * Multiply the in[] by word k and add to out[], return the carry in rax
+  *
+  * The bulk of in[] is handled four elements at a time by
+  * mul_add_128_x_32_loop(); what is left in len afterwards is finished
+  * here as one optional 64-bit step followed by one optional 32-bit step.
+  * tmp1..tmp5 are pushed on entry and popped on exit. len and offs are
+  * modified. rdxReg and raxReg are clobbered.
+  */
+ void MacroAssembler::mul_add(Register out, Register in, Register offs, 
+    Register len, Register k, Register tmp1, Register tmp2, Register tmp3, 
+    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+ 
+   Label L_carry, L_last_in, L_done;
+ 
+ // carry = 0;
+ // for (int j=len-1; j >= 0; j--) {
+ //    long product = (in[j] & LONG_MASK) * kLong +
+ //                   (out[offs] & LONG_MASK) + carry;
+ //    out[offs--] = (int)product;
+ //    carry = product >>> 32;
+ // }
+ //
+   push(tmp1);
+   push(tmp2);
+   push(tmp3);
+   push(tmp4);
+   push(tmp5);
+ 
+   Register op2 = tmp2;
+   const Register sum = tmp3;
+   const Register op1 = tmp4;
+   const Register carry =  tmp5;
+ 
+   // Both branches emit the same movl; they differ only in which register op2 names.
+   if (UseBMI2Instructions) {
+     op2 = rdxReg;
+     movl(op2, k);
+   }
+   else {
+     movl(op2, k);
+   }
+ 
+   xorq(carry, carry);  
+ 
+   //First loop
+ 
+   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
+   //The carry is in tmp5
+   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
+ 
+   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
+   decrementl(len);
+   jccb(Assembler::negative, L_carry);    // nothing left over
+   decrementl(len);
+   jccb(Assembler::negative, L_last_in);  // exactly one element left over
+ 
+   movq(op1, Address(in, len, Address::times_4,  0));
+   rorq(op1, 32);
+   
+   subl(offs, 2);
+   movq(sum, Address(out, offs, Address::times_4,  0));
+   rorq(sum, 32);
+ 
+   if (UseBMI2Instructions) {
+     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
+   }
+   else {
+     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+   }
+ 
+   // Store back in big endian from little endian
+   rorq(sum, 0x20);
+   movq(Address(out, offs, Address::times_4,  0), sum);
+ 
+   testl(len, len);
+   jccb(Assembler::zero, L_carry);        // no single element remains after the 64-bit step
+ 
+   //Multiply the last in[] entry, if any
+   bind(L_last_in);
+   movl(op1, Address(in, 0));
+   movl(sum, Address(out, offs, Address::times_4,  -4));
+   
+   movl(raxReg, k);
+   mull(op1); //tmp4 * eax -> edx:eax 
+   addl(sum, carry);     // sum += incoming carry
+   adcl(rdxReg, 0);      // fold the carry-out into the high half
+   addl(sum, raxReg);    // sum += low half of the product
+   adcl(rdxReg, 0);
+   movl(carry, rdxReg);  
+   
+   movl(Address(out, offs, Address::times_4,  -4), sum);
+ 
+   bind(L_carry);
+   //return tmp5/carry as carry in rax
+   movl(rax, carry);     // NOTE(review): names rax directly rather than the raxReg parameter - confirm callers always pass rax
+   
+   bind(L_done);         // bound here, but nothing in this function jumps to it
+   pop(tmp5);
+   pop(tmp4);
+   pop(tmp3);
+   pop(tmp2);
+   pop(tmp1);
+ }
  #endif
  
  /**
   * Emits code to update CRC-32 with a byte value according to constants in table
   *
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
Index Unified diffs Context diffs Sdiffs Wdiffs Patch New Old Previous File Next File