
src/cpu/x86/vm/macroAssembler_x86.cpp

rev 7792 : [mq]: 8081778-Use-Intel-x64-CPU-instructions-for-RSA-acceleration


  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  if (UseBMI2Instructions) {
    movl(rdx, Address(x,  0));
  } else {
    movl(x_xstart, Address(x,  0));
  }
  jmp(L_third_loop_prologue);

  bind(L_done);

  pop(zlen);
  pop(xlen);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}

// Helper functions for square_to_len()

/**
 * Store the squares of x[], right shifted one bit (i.e., divided by 2), into z[].
 * Preserves x and z and modifies the rest of the registers.
 */

void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle the odd xlen case first, then for even xlen do the following:
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //     huge_128 product = x[j:j+1] * x[j:j+1];
  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //     z[i+2:i+3] = (jlong)(product >>> 1);
  //     carry = (jlong)product;
  // }

  xorq(tmp5, tmp5);     // carry
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1);     // index for x
  xorl(tmp4, tmp4);     // index for z

  Label L_first_loop, L_first_loop_exit;

  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); // jump if xlen is even

  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
  imulq(raxReg, raxReg);
  shrq(raxReg, 1);
  adcq(tmp5, 0);
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);
  addl(tmp4, 2);

  // Square and right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);

  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
  rorq(raxReg, 32);    // convert big-endian to little-endian
  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax

  // Right shift by 1 and save carry
  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);
  rcrq(raxReg, 1);
  adcq(tmp5, 0);

  // Store result in z
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);

  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
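
For reference, a minimal C++ sketch (not part of this patch) of what square_rshift computes, with unsigned __int128 standing in for the rdx:rax register pair, the big-endian 32-bit word order (handled by the rorq conversions above) abstracted into plain 64-bit array indexing, and an even count of 32-bit words assumed; all names are illustrative:

#include <cstdint>
#include <cstddef>

void square_rshift_ref(const uint64_t* x, size_t xlen64, uint64_t* z) {
  uint64_t carry = 0;                          // bit shifted out of the previous square
  for (size_t j = 0, i = 0; j < xlen64; j++, i += 2) {
    unsigned __int128 product = (unsigned __int128)x[j] * x[j];
    z[i]     = (carry << 63) | (uint64_t)(product >> 65);
    z[i + 1] = (uint64_t)(product >> 1);
    carry    = (uint64_t)product & 1;          // low bit, folded into the next pair
  }
}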

/**
 * Perform the following multiply add operation using BMI2 instructions
 * carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx
  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
  addq(sum, carry);
  adcq(tmp2, 0);
  addq(sum, op1);
  adcq(tmp2, 0);
  movq(carry, tmp2);
}

/**
 * Perform the following multiply add operation:
 * carry:sum = sum + op1*op2 + carry
 * Preserves op1 and op2, and modifies the rest of the registers.
 */
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
  // rdx:rax = op1 * op2
  movq(raxReg, op2);
  mulq(op1);

  // rdx:rax = sum + carry + rdx:rax
  addq(sum, carry);
  adcq(rdxReg, 0);
  addq(sum, raxReg);
  adcq(rdxReg, 0);

  // carry:sum = rdx:sum
  movq(carry, rdxReg);
}
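
Both variants implement the same 64x64->128-bit multiply-accumulate step. A minimal C++ equivalent (illustrative, not part of this patch), using unsigned __int128 in place of the rdx:rax pair:

#include <cstdint>

// carry:sum = sum + op1*op2 + carry; the 128-bit total cannot overflow, since
// (2^64-1)^2 + 2*(2^64-1) == 2^128 - 1.
void multiply_add_64_ref(uint64_t& sum, uint64_t op1, uint64_t op2, uint64_t& carry) {
  unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
  sum   = (uint64_t)t;          // low 64 bits
  carry = (uint64_t)(t >> 64);  // high 64 bits become the next carry
}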

/**
 * Add a 64 bit long carry into z[] with carry propagation.
 * Preserves the z and carry register values and modifies the rest of the registers.
 *
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);
  subl(zlen, 2);
  addq(Address(z, zlen, Address::times_4, 0), carry);

  bind(L_fourth_loop);
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit);
  addq(Address(z, zlen, Address::times_4, 0), tmp1);
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}
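
In C++ terms (a sketch, not part of this patch): add the carry into the least significant 64-bit word of the big-endian array z, then ripple a single 1 upward for as long as the addition wraps:

#include <cstdint>
#include <cstddef>

void add_one_64_ref(uint64_t* z, size_t zlen64, uint64_t carry) {
  // z is big-endian: z[zlen64 - 1] is the least significant 64-bit word.
  size_t i = zlen64 - 1;
  z[i] += carry;
  bool overflow = z[i] < carry;   // unsigned wrap means a carry out
  while (overflow && i > 0) {
    i--;
    z[i] += 1;
    overflow = (z[i] == 0);       // wrapped to zero: keep propagating
  }
}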

/**
 * Shift z[] left by 1 bit.
 * Preserves the x, len, z and zlen registers and modifies the rest of the registers.
 *
 */
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  Label L_fifth_loop, L_fifth_loop_exit;

  // Fifth loop
  // Perform primitiveLeftShift(z, zlen, 1)

  const Register prev_carry = tmp1;
  const Register new_carry = tmp4;
  const Register value = tmp2;
  const Register zidx = tmp3;

  // int zidx, carry;
  // long value;
  // carry = 0;
  // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
  //    (carry:value) = (z[zidx] << 1) | carry;
  //    z[zidx] = value;
  // }

  movl(zidx, zlen);
  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register

  bind(L_fifth_loop);
  decl(zidx);  // Use decl to preserve carry flag
  decl(zidx);
  jccb(Assembler::negative, L_fifth_loop_exit);

  if (UseBMI2Instructions) {
     movq(value, Address(z, zidx, Address::times_4, 0));
     rclq(value, 1);
     rorxq(value, value, 32);
     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
  }
  else {
    // clear new_carry
    xorl(new_carry, new_carry);

    // Shift z[zidx] by 1, or in the previous carry and save the new carry
    movq(value, Address(z, zidx, Address::times_4, 0));
    shlq(value, 1);
    adcl(new_carry, 0);

    orq(value, prev_carry);
    rorq(value, 0x20);
    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form

    // Set previous carry = new carry
    movl(prev_carry, new_carry);
  }
  jmp(L_fifth_loop);

  bind(L_fifth_loop_exit);
}
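
A minimal C++ sketch of the same shift (not part of this patch), on a big-endian array of 64-bit words with the 32-bit word order abstracted away:

#include <cstdint>
#include <cstddef>

void lshift_by_1_ref(uint64_t* z, size_t zlen64) {
  uint64_t carry = 0;                    // bit shifted out of the word below
  for (size_t i = zlen64; i-- > 0; ) {   // least significant word first
    uint64_t new_carry = z[i] >> 63;
    z[i] = (z[i] << 1) | carry;
    carry = new_carry;
  }
}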

/**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  // zlen += 2;
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //       k -= 2;
  //       long op1 = x[j:j+1];
  //       long sum = z[k:k+1];
  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //       z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  // }

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  push(zlen);
  push(len);
  addl(zlen, 2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    op2 = rdxReg;
    movq(op2, Address(x, len, Address::times_4,  0));
    rorxq(op2, op2, 32);
  }
  else {
    movq(op2, Address(x, len, Address::times_4,  0));
    rorq(op2, 32);
  }

  bind(L_third_loop);
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);

  movq(op1, Address(x, len, Address::times_4,  0));
  rorq(op1, 32);

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4,  0));

  // Multiply 64 bit by 64 bit; add the low 64 bits of the product into sum and keep the upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add the 64 bit long carry into z with carry propagation.
  // Uses the offset-adjusted zlen.
  add_one_64(z, zlen, carry, tmp1);

  pop(len);
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));
  jmp(L_multiply);

  bind(L_second_loop_exit);
  pop(len);
  pop(zlen);
  pop(len);
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4,  -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
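
Why halved squares plus off-diagonal sums recover x*x: writing x with limbs a and b in base B, x^2 = diag + 2*offdiag, where diag = a^2*B^2 + b^2 and offdiag = a*b*B. The loops above build (diag >> 1) + offdiag in z[], so the final left shift and OR of x's low bit restore x^2 exactly, because diag's parity equals x's. A self-checking C++ sketch of the identity (illustrative, not part of this patch):

#include <cstdint>
#include <cassert>

int main() {
  uint64_t x = 0x123456789abcdef1ULL;
  uint64_t a = x >> 32, b = (uint32_t)x;                         // two 32-bit limbs, B = 2^32
  unsigned __int128 diag    = ((unsigned __int128)a * a << 64) | ((unsigned __int128)b * b);
  unsigned __int128 offdiag = (unsigned __int128)a * b << 32;    // added once, doubled by the final shift
  unsigned __int128 half    = (diag >> 1) + offdiag;             // what loops one through four leave in z[]
  unsigned __int128 result  = (half << 1) | (x & 1);             // fifth loop plus the final OR
  assert(result == (unsigned __int128)x * x);
  return 0;
}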

/**
 * Helper function for mul_add().
 * Multiply in[] by the int k and add the result into out[] starting at offset offs,
 * using 128 bit by 32 bit multiplies; the carry is returned in tmp5.
 * Only the quad-int-aligned part of in[]'s length is processed by this function.
 * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
 * This function preserves the out, in and k registers.
 * len and offset point to the appropriate index in "in" and "out" respectively.
 * tmp5 holds the carry.
 * All other registers are temporary and are modified.
 *
 */
void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_first_loop, L_first_loop_exit;

  movl(tmp1, len);
  shrl(tmp1, 2);

  bind(L_first_loop);
  subl(tmp1, 1);
  jccb(Assembler::negative, L_first_loop_exit);

  subl(len, 4);
  subl(offset, 4);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    op2 = rdxReg;
  }

  movq(op1, Address(in, len, Address::times_4,  8));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4,  8));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4,  8), sum);

  movq(op1, Address(in, len, Address::times_4,  0));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4,  0));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4,  0), sum);

  jmp(L_first_loop);
  bind(L_first_loop_exit);
}
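
Abstracting away the big-endian 32-bit word handling, the unrolled body amounts to two 64-bit multiply-accumulates per iteration sharing one running carry, least significant pair first. A C++ sketch under those assumptions (illustrative, not part of this patch; the non-aligned tail is handled by the caller, as in mul_add below):

#include <cstdint>
#include <cstddef>

// carry:sum = sum + a*k + carry, the same step as multiply_add_64 above.
static inline void mac64(uint64_t& sum, uint64_t a, uint64_t k, uint64_t& carry) {
  unsigned __int128 t = (unsigned __int128)a * k + sum + carry;
  sum = (uint64_t)t;
  carry = (uint64_t)(t >> 64);
}

// len64 is assumed even (a quad-int-aligned count of 32-bit words).
void mul_add_128_x_32_ref(uint64_t* out, const uint64_t* in,
                          size_t len64, uint64_t k, uint64_t& carry) {
  for (size_t i = len64; i >= 2; i -= 2) {
    mac64(out[i - 1], in[i - 1], k, carry);   // less significant word of the pair
    mac64(out[i - 2], in[i - 2], k, carry);   // more significant word
  }
}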

/**
 * Code for BigInteger::mulAdd() intrinsic
 *
 * rdi: out
 * rsi: in
 * r11: offs (out.length - offset)
 * rcx: len
 * r8:  k
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 * Multiply in[] by the word k and add the result into out[]; return the carry in rax.
 */
void MacroAssembler::mul_add(Register out, Register in, Register offs,
   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_carry, L_last_in, L_done;

// carry = 0;
// for (int j=len-1; j >= 0; j--) {
//    long product = (in[j] & LONG_MASK) * kLong +
//                   (out[offs] & LONG_MASK) + carry;
//    out[offs--] = (int)product;
//    carry = product >>> 32;
// }
//
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    op2 = rdxReg;
    movl(op2, k);
  }
  else {
    movl(op2, k);
  }

  xorq(carry, carry);

  // First loop

  // Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiplies.
  // The carry is in tmp5.
  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Multiply any trailing 64-bit chunk of in[] using a 64 bit by 32 bit multiply
  decrementl(len);
  jccb(Assembler::negative, L_carry);
  decrementl(len);
  jccb(Assembler::negative, L_last_in);

  movq(op1, Address(in, len, Address::times_4,  0));
  rorq(op1, 32);

  subl(offs, 2);
  movq(sum, Address(out, offs, Address::times_4,  0));
  rorq(sum, 32);

  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offs, Address::times_4,  0), sum);

  testl(len, len);
  jccb(Assembler::zero, L_carry);

  // Multiply the last in[] entry, if any
  bind(L_last_in);
  movl(op1, Address(in, 0));
  movl(sum, Address(out, offs, Address::times_4,  -4));

  movl(raxReg, k);
  mull(op1); // tmp4 * eax -> edx:eax
  addl(sum, carry);
  adcl(rdxReg, 0);
  addl(sum, raxReg);
  adcl(rdxReg, 0);
  movl(carry, rdxReg);

  movl(Address(out, offs, Address::times_4,  -4), sum);

  bind(L_carry);
  // Return tmp5/carry as the carry in rax
  movl(rax, carry);

  bind(L_done);
  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
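
The Java loop in the comment above maps to this minimal C++ sketch (illustrative, not part of this patch), on 32-bit words with in[] and out[] stored most significant first:

#include <cstdint>
#include <cstddef>

uint32_t mul_add_ref(uint32_t* out, const uint32_t* in,
                     ptrdiff_t offs, ptrdiff_t len, uint32_t k) {
  uint64_t carry = 0;
  for (ptrdiff_t j = len - 1; j >= 0; j--) {   // least significant word first
    uint64_t product = (uint64_t)in[j] * k + out[offs] + carry;
    out[offs--] = (uint32_t)product;
    carry = product >> 32;                     // at most 2^32 - 1
  }
  return (uint32_t)carry;
}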
#endif

/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out] crc   Register containing the crc.
 * @param [in] val       Register containing the byte to fold into the CRC.
 * @param [in] table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  xorl(val, crc);
  andl(val, 0xFF);
  shrl(crc, 8); // unsigned shift
  xorl(crc, Address(table, val, Address::times_4, 0));
}
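
The emitted sequence is the standard table-driven byte update from the comment above; as a C++ one-liner (illustrative, not part of this patch), with crc_table assumed to be the usual precomputed 256-entry CRC-32 table:

#include <cstdint>

extern const uint32_t crc_table[256];   // precomputed elsewhere

static inline uint32_t update_byte_crc32_ref(uint32_t crc, uint8_t val) {
  return crc_table[(val ^ crc) & 0xFF] ^ (crc >> 8);
}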

