7731 // Next infrequent code is moved outside loops. 7732 bind(L_last_x); 7733 if (UseBMI2Instructions) { 7734 movl(rdx, Address(x, 0)); 7735 } else { 7736 movl(x_xstart, Address(x, 0)); 7737 } 7738 jmp(L_third_loop_prologue); 7739 7740 bind(L_done); 7741 7742 pop(zlen); 7743 pop(xlen); 7744 7745 pop(tmp5); 7746 pop(tmp4); 7747 pop(tmp3); 7748 pop(tmp2); 7749 pop(tmp1); 7750 } 7751 #endif 7752 7753 /** 7754 * Emits code to update CRC-32 with a byte value according to constants in table 7755 * 7756 * @param [in,out]crc Register containing the crc. 7757 * @param [in]val Register containing the byte to fold into the CRC. 7758 * @param [in]table Register containing the table of crc constants. 7759 * 7760 * uint32_t crc; 7761 * val = crc_table[(val ^ crc) & 0xFF]; 7762 * crc = val ^ (crc >> 8); 7763 * 7764 */ 7765 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 7766 xorl(val, crc); 7767 andl(val, 0xFF); 7768 shrl(crc, 8); // unsigned shift 7769 xorl(crc, Address(table, val, Address::times_4, 0)); 7770 } | 7731 // Next infrequent code is moved outside loops. 7732 bind(L_last_x); 7733 if (UseBMI2Instructions) { 7734 movl(rdx, Address(x, 0)); 7735 } else { 7736 movl(x_xstart, Address(x, 0)); 7737 } 7738 jmp(L_third_loop_prologue); 7739 7740 bind(L_done); 7741 7742 pop(zlen); 7743 pop(xlen); 7744 7745 pop(tmp5); 7746 pop(tmp4); 7747 pop(tmp3); 7748 pop(tmp2); 7749 pop(tmp1); 7750 } 7751 7752 //Helper functions for square_to_len() 7753 7754 /** 7755 * Store the squares of x[], right shifted one bit (divided by 2) into z[] 7756 * Preserves x and z and modifies rest of the registers. 
7757 */ 7758 7759 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7760 // Perform square and right shift by 1 7761 // Handle odd xlen case first, then for even xlen do the following 7762 // jlong carry = 0; 7763 // for (int j=0, i=0; j < xlen; j+=2, i+=4) { 7764 // huge_128 product = x[j:j+1] * x[j:j+1]; 7765 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); 7766 // z[i+2:i+3] = (jlong)(product >>> 1); 7767 // carry = (jlong)product; 7768 // } 7769 7770 xorq(tmp5, tmp5); // carry 7771 xorq(rdxReg, rdxReg); 7772 xorl(tmp1, tmp1); // index for x 7773 xorl(tmp4, tmp4); // index for z 7774 7775 Label L_first_loop, L_first_loop_exit; 7776 7777 testl(xlen, 1); 7778 jccb(Assembler::zero, L_first_loop); //jump if xlen is even 7779 7780 // Square and right shift by 1 the odd element using 32 bit multiply 7781 movl(raxReg, Address(x, tmp1, Address::times_4, 0)); 7782 imulq(raxReg, raxReg); 7783 shrq(raxReg, 1); 7784 adcq(tmp5, 0); 7785 movq(Address(z, tmp4, Address::times_4, 0), raxReg); 7786 incrementl(tmp1); 7787 addl(tmp4, 2); 7788 7789 // Square and right shift by 1 the rest using 64 bit multiply 7790 bind(L_first_loop); 7791 cmpptr(tmp1, xlen); 7792 jccb(Assembler::equal, L_first_loop_exit); 7793 7794 // Square 7795 movq(raxReg, Address(x, tmp1, Address::times_4, 0)); 7796 rorq(raxReg, 32); // convert big-endian to little-endian 7797 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax 7798 7799 // Right shift by 1 and save carry 7800 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 7801 rcrq(rdxReg, 1); 7802 rcrq(raxReg, 1); 7803 adcq(tmp5, 0); 7804 7805 // Store result in z 7806 movq(Address(z, tmp4, Address::times_4, 0), rdxReg); 7807 movq(Address(z, tmp4, Address::times_4, 8), raxReg); 7808 7809 // Update indices for x and z 7810 addl(tmp1, 2); 7811 addl(tmp4, 4); 7812 jmp(L_first_loop); 7813 7814 bind(L_first_loop_exit); 7815 } 7816 

/**
 * Perform the following multiply add operation using BMI2 instructions
 * carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx
  // mulxq multiplies the implicit rdx (== op2) by op1 without touching flags;
  // high half -> tmp2, low half -> op1 (op1 is clobbered).
  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
  // Fold carry-in and the low product half into sum; each adcq absorbs the
  // carry-out of the preceding addq into the high half.
  addq(sum, carry);
  adcq(tmp2, 0);
  addq(sum, op1);
  adcq(tmp2, 0);
  // Return the accumulated high half as the new carry.
  movq(carry, tmp2);
}

/**
 * Perform the following multiply add operation:
 * carry:sum = sum + op1*op2 + carry
 * Preserves op1, op2 and modifies rest of registers
 */
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
  //  rdx:rax = op1 * op2  (one-operand mulq: rax * op1 -> rdx:rax)
  movq(raxReg, op2);
  mulq(op1);

  //  rdx:rax = sum + carry + rdx:rax
  // Each adcq folds the carry-out of the preceding addq into the high half.
  addq(sum, carry);
  adcq(rdxReg, 0);
  addq(sum, raxReg);
  adcq(rdxReg, 0);

  // carry:sum = rdx:sum
  movq(carry, rdxReg);
}

/**
 * Add 64 bit long carry into z[] with carry propagation.
 * Preserves z and carry register values and modifies rest of registers.
 *
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);  // increment to propagate while CF remains set
  subl(zlen, 2);  // step back one 64-bit word (two ints) from the given index
  addq(Address(z, zlen, Address::times_4, 0), carry);

  bind(L_fourth_loop);
  // CF here is the carry-out of the most recent addq; stop once an
  // addition did not overflow ...
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  // ... or once we would step past the front of z[].
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit);
  addq(Address(z, zlen, Address::times_4, 0), tmp1);
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}

/**
 * Shift z[] left by 1 bit.
 * Preserves x, len, z and zlen registers and modifies rest of the registers.
 *
 */
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  Label L_fifth_loop, L_fifth_loop_exit;

  // Fifth loop
  // Perform primitiveLeftShift(z, zlen, 1)

  const Register prev_carry = tmp1;
  const Register new_carry = tmp4;
  const Register value = tmp2;
  const Register zidx = tmp3;

  // int zidx, carry;
  // long value;
  // carry = 0;
  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
  //    (carry:value)  = (z[i] << 1) | carry ;
  //    z[i] = value;
  // }

  movl(zidx, zlen);
  // Clears both the prev_carry register and CF (xor always clears CF);
  // the BMI2 path below relies on CF being threaded across iterations.
  xorl(prev_carry, prev_carry);

  bind(L_fifth_loop);
  decl(zidx);  // Use decl to preserve carry flag
  decl(zidx);
  jccb(Assembler::negative, L_fifth_loop_exit);

  if (UseBMI2Instructions) {
     // rclq rotates the previous iteration's CF in at bit 0 and leaves the
     // bit shifted out in CF for the next iteration; decl above and the
     // flag-neutral rorxq/movq keep CF intact in between.
     movq(value, Address(z, zidx, Address::times_4, 0));
     rclq(value, 1);
     rorxq(value, value, 32);
     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
  }
  else {
    // clear new_carry
    xorl(new_carry, new_carry);

    // Shift z[i] by 1, or in previous carry and save new carry
    // (shlq leaves the shifted-out bit in CF; adcl records it).
    movq(value, Address(z, zidx, Address::times_4, 0));
    shlq(value, 1);
    adcl(new_carry, 0);

    orq(value, prev_carry);
    rorq(value, 0x20);
    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form

    // Set previous carry = new carry
    movl(prev_carry, new_carry);
  }
  jmp(L_fifth_loop);

  bind(L_fifth_loop_exit);
}


/**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  // NOTE: fifth_loop/fifth_loop_exit are declared but the fifth loop itself
  // lives inside lshift_by_1() below.
  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  // zlen +=2;
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //      k -= 2;
  //      long op1 = x[j:j+1];
  //      long sum = z[k:k+1];
  //      carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //      z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  // }

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  // Save the outer-loop counters; each iteration of the second loop
  // re-pushes its own decremented copies and pops them before looping.
  push(zlen);
  push(len);
  addl(zlen,2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    // op2 must live in rdx for mulxq inside multiply_add_64_bmi2.
    op2 = rdxReg;
    movq(op2, Address(x, len, Address::times_4, 0));
    rorxq(op2, op2, 32);  // convert big-endian to little-endian
  }
  else {
    movq(op2, Address(x, len, Address::times_4, 0));
    rorq(op2, 32);
  }

  bind(L_third_loop);
  // Two separate decrements so an odd leftover int can be routed to L_last_x.
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);

  movq(op1, Address(x, len, Address::times_4, 0));
  rorq(op1, 32);

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4, 0));

  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses offsetted zlen.
  add_one_64(z, zlen, carry, tmp1);

  pop(len);
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));  // last single int of x, zero-extended
  jmp(L_multiply);

  bind(L_second_loop_exit);
  // Two pops for the per-iteration copies, two for the originals saved above.
  pop(len);
  pop(zlen);
  pop(len);
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  // Restore the lowest bit of the square, lost when square_rshift divided by 2.
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4, -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}

/**
 * Helper function for mul_add()
 * Multiply the in[] by int k and add to out[] starting at offset offs using
 * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only quad int aligned length of in[] is operated on in this function.
 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
 * This function preserves out, in and k registers.
 * len and offset point to the appropriate index in "in" & "out" correspondingly
 * tmp5 has the carry.
 * other registers are temporary and are modified.
 *
 */
void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_first_loop, L_first_loop_exit;

  // tmp1 = number of full 4-int groups; the <4 remainder is left for the caller.
  movl(tmp1, len);
  shrl(tmp1, 2);

  bind(L_first_loop);
  subl(tmp1, 1);
  jccb(Assembler::negative, L_first_loop_exit);

  subl(len, 4);
  subl(offset, 4);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    // k is already in rdx (see mul_add), as required by mulxq.
    op2 = rdxReg;
  }

  // Process the upper 64 bits (two ints) of the 4-int group.
  // rorq converts big-endian int order to a little-endian 64-bit value.
  movq(op1, Address(in, len, Address::times_4, 8));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4, 8));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4, 8), sum);

  // Process the lower 64 bits (two ints) of the 4-int group.
  movq(op1, Address(in, len, Address::times_4, 0));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4, 0));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4, 0), sum);

  jmp(L_first_loop);
  bind(L_first_loop_exit);
}

/**
 * Code for BigInteger::mulAdd() intrinsic
 *
 * rdi: out
 * rsi: in
 * r11: offs (out.length - offset)
 * rcx: len
 * r8:  k
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 * Multiply the in[] by word k and add to out[], return the carry in rax
 */
void MacroAssembler::mul_add(Register out, Register in, Register offs,
  Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_carry, L_last_in, L_done;

  // carry = 0;
  // for (int j=len-1; j >= 0; j--) {
  //    long product = (in[j] & LONG_MASK) * kLong +
  //                   (out[offs] & LONG_MASK) + carry;
  //    out[offs--] = (int)product;
  //    carry = product >>> 32;
  // }
  //
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    // Place k in rdx where mulxq (via multiply_add_64_bmi2) expects it.
    op2 = rdxReg;
    movl(op2, k);
  }
  else {
    movl(op2, k);
  }

  xorq(carry, carry);

  // First loop

  // Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
  // The carry is in tmp5
  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Multiply the trailing in[] entry using 64 bit by 32 bit, if any.
  // Two separate decrements so a single leftover int falls through to L_last_in.
  decrementl(len);
  jccb(Assembler::negative, L_carry);
  decrementl(len);
  jccb(Assembler::negative, L_last_in);

  movq(op1, Address(in, len, Address::times_4, 0));
  rorq(op1, 32);  // convert big-endian int pair to little-endian 64-bit value

  subl(offs, 2);
  movq(sum, Address(out, offs, Address::times_4, 0));
  rorq(sum, 32);

  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offs, Address::times_4, 0), sum);

  testl(len, len);
  jccb(Assembler::zero, L_carry);

  // Multiply the last in[] entry, if any
  bind(L_last_in);
  movl(op1, Address(in, 0));
  movl(sum, Address(out, offs, Address::times_4, -4));

  // 32x32 -> 64 multiply-accumulate, mirroring multiply_add_64 at int width.
  movl(raxReg, k);
  mull(op1); //tmp4 * eax -> edx:eax
  addl(sum, carry);
  adcl(rdxReg, 0);
  addl(sum, raxReg);
  adcl(rdxReg, 0);
  movl(carry, rdxReg);

  movl(Address(out, offs, Address::times_4, -4), sum);

  bind(L_carry);
  //return tmp5/carry as carry in rax
  movl(rax, carry);

  bind(L_done);
  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
#endif

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  xorl(val, crc);
  andl(val, 0xFF);
  shrl(crc, 8); // unsigned shift
  xorl(crc, Address(table, val, Address::times_4, 0));
}