7731 // Next infrequent code is moved outside loops. 7732 bind(L_last_x); 7733 if (UseBMI2Instructions) { 7734 movl(rdx, Address(x, 0)); 7735 } else { 7736 movl(x_xstart, Address(x, 0)); 7737 } 7738 jmp(L_third_loop_prologue); 7739 7740 bind(L_done); 7741 7742 pop(zlen); 7743 pop(xlen); 7744 7745 pop(tmp5); 7746 pop(tmp4); 7747 pop(tmp3); 7748 pop(tmp2); 7749 pop(tmp1); 7750 } 7751 #endif 7752 7753 /** 7754 * Emits code to update CRC-32 with a byte value according to constants in table 7755 * 7756 * @param [in,out]crc Register containing the crc. 7757 * @param [in]val Register containing the byte to fold into the CRC. 7758 * @param [in]table Register containing the table of crc constants. 7759 * 7760 * uint32_t crc; 7761 * val = crc_table[(val ^ crc) & 0xFF]; 7762 * crc = val ^ (crc >> 8); 7763 * 7764 */ 7765 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 7766 xorl(val, crc); 7767 andl(val, 0xFF); 7768 shrl(crc, 8); // unsigned shift 7769 xorl(crc, Address(table, val, Address::times_4, 0)); 7770 } | 7731 // Next infrequent code is moved outside loops. 7732 bind(L_last_x); 7733 if (UseBMI2Instructions) { 7734 movl(rdx, Address(x, 0)); 7735 } else { 7736 movl(x_xstart, Address(x, 0)); 7737 } 7738 jmp(L_third_loop_prologue); 7739 7740 bind(L_done); 7741 7742 pop(zlen); 7743 pop(xlen); 7744 7745 pop(tmp5); 7746 pop(tmp4); 7747 pop(tmp3); 7748 pop(tmp2); 7749 pop(tmp1); 7750 } 7751 7752 //Helper functions for square_to_len() 7753 7754 /** 7755 * Store the squares of x[], right shifted one bit (divided by 2) into z[] 7756 * Preserves x and z and modifies rest of the registers. 
7757 */ 7758 7759 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7760 // Perform square and right shift by 1 7761 // Handle odd xlen case first, then for even xlen do the following 7762 // jlong carry = 0; 7763 // for (int j=0, i=0; j < xlen; j+=2, i+=4) { 7764 // huge_128 product = x[j:j+1] * x[j:j+1]; 7765 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); 7766 // z[i+2:i+3] = (jlong)(product >>> 1); 7767 // carry = (jlong)product; 7768 // } 7769 7770 xorq(tmp5, tmp5); // carry 7771 xorq(rdxReg, rdxReg); 7772 xorl(tmp1, tmp1); // index for x 7773 xorl(tmp4, tmp4); // index for z 7774 7775 Label L_first_loop, L_first_loop_exit; 7776 7777 testl(xlen, 1); 7778 jccb(Assembler::zero, L_first_loop); //jump if xlen is even 7779 7780 // Square and right shift by 1 the odd element using 32 bit multiply 7781 movl(raxReg, Address(x, tmp1, Address::times_4, 0)); 7782 imulq(raxReg, raxReg); 7783 shrq(raxReg, 1); 7784 adcq(tmp5, 0); 7785 movq(Address(z, tmp4, Address::times_4, 0), raxReg); 7786 incrementl(tmp1); 7787 addl(tmp4, 2); 7788 7789 // Square and right shift by 1 the rest using 64 bit multiply 7790 bind(L_first_loop); 7791 cmpptr(tmp1, xlen); 7792 jccb(Assembler::equal, L_first_loop_exit); 7793 7794 // Square 7795 movq(raxReg, Address(x, tmp1, Address::times_4, 0)); 7796 rorq(raxReg, 32); // convert big-endian to little-endian 7797 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax 7798 7799 // Right shift by 1 and save carry 7800 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 7801 rcrq(rdxReg, 1); 7802 rcrq(raxReg, 1); 7803 adcq(tmp5, 0); 7804 7805 // Store result in z 7806 movq(Address(z, tmp4, Address::times_4, 0), rdxReg); 7807 movq(Address(z, tmp4, Address::times_4, 8), raxReg); 7808 7809 // Update indices for x and z 7810 addl(tmp1, 2); 7811 addl(tmp4, 4); 7812 jmp(L_first_loop); 7813 7814 bind(L_first_loop_exit); 7815 } 7816 

/**
 * Perform the following multiply add operation using BMI2 instructions
 * carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx
  // mulxq multiplies the implicit rdx (== op2) by op1 without touching flags;
  // high half -> tmp2, low half -> op1 (op1 is clobbered).
  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
  // Fold carry-in and the low product half into sum; each adcq absorbs the
  // carry-out of the preceding addq into the high half.
  addq(sum, carry);
  adcq(tmp2, 0);
  addq(sum, op1);
  adcq(tmp2, 0);
  // Return the accumulated high half as the new carry.
  movq(carry, tmp2);
}

/**
 * Perform the following multiply add operation:
 * carry:sum = sum + op1*op2 + carry
 * Preserves op1, op2 and modifies rest of registers
 */
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
  //  rdx:rax = op1 * op2  (one-operand mulq: rax * op1 -> rdx:rax)
  movq(raxReg, op2);
  mulq(op1);

  //  rdx:rax = sum + carry + rdx:rax
  // Each adcq folds the carry-out of the preceding addq into the high half.
  addq(sum, carry);
  adcq(rdxReg, 0);
  addq(sum, raxReg);
  adcq(rdxReg, 0);

  // carry:sum = rdx:sum
  movq(carry, rdxReg);
}

/**
 * Add 64 bit long carry into z[] with carry propagation.
 * Preserves z and carry register values and modifies rest of registers.
 *
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);  // increment to propagate while CF remains set
  subl(zlen, 2);  // step back one 64-bit word (two ints) from the given index
  addq(Address(z, zlen, Address::times_4, 0), carry);

  bind(L_fourth_loop);
  // CF here is the carry-out of the most recent addq; stop once an
  // addition did not overflow ...
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  // ... or once we would step past the front of z[].
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit);
  addq(Address(z, zlen, Address::times_4, 0), tmp1);
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}

/**
 * Shift z[] left by 1 bit.
 * Preserves x, len, z and zlen registers and modifies rest of the registers.
 *
 */
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  Label L_fifth_loop, L_fifth_loop_exit;

  // Fifth loop
  // Perform primitiveLeftShift(z, zlen, 1)

  const Register prev_carry = tmp1;
  const Register new_carry = tmp4;
  const Register value = tmp2;
  const Register zidx = tmp3;

  // int zidx, carry;
  // long value;
  // carry = 0;
  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
  //    (carry:value)  = (z[i] << 1) | carry ;
  //    z[i] = value;
  // }

  movl(zidx, zlen);
  // Clears both the prev_carry register and CF (xor always clears CF);
  // the BMI2 path below relies on CF being threaded across iterations.
  xorl(prev_carry, prev_carry);

  bind(L_fifth_loop);
  decl(zidx);  // Use decl to preserve carry flag
  decl(zidx);
  jccb(Assembler::negative, L_fifth_loop_exit);

  if (UseBMI2Instructions) {
     // rclq rotates the previous iteration's CF in at bit 0 and leaves the
     // bit shifted out in CF for the next iteration; decl above and the
     // flag-neutral rorxq/movq keep CF intact in between.
     movq(value, Address(z, zidx, Address::times_4, 0));
     rclq(value, 1);
     rorxq(value, value, 32);
     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
  }
  else {
    // clear new_carry
    xorl(new_carry, new_carry);

    // Shift z[i] by 1, or in previous carry and save new carry
    // (shlq leaves the shifted-out bit in CF; adcl records it).
    movq(value, Address(z, zidx, Address::times_4, 0));
    shlq(value, 1);
    adcl(new_carry, 0);

    orq(value, prev_carry);
    rorq(value, 0x20);
    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form

    // Set previous carry = new carry
    movl(prev_carry, new_carry);
  }
  jmp(L_fifth_loop);

  bind(L_fifth_loop_exit);
}


/**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  // NOTE: fifth_loop/fifth_loop_exit are declared but the fifth loop itself
  // lives inside lshift_by_1() below.
  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  // zlen +=2;
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //      k -= 2;
  //      long op1 = x[j:j+1];
  //      long sum = z[k:k+1];
  //      carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //      z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  // }

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  // Save the outer-loop counters; each iteration of the second loop
  // re-pushes its own decremented copies and pops them before looping.
  push(zlen);
  push(len);
  addl(zlen,2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    // op2 must live in rdx for mulxq inside multiply_add_64_bmi2.
    op2 = rdxReg;
    movq(op2, Address(x, len, Address::times_4, 0));
    rorxq(op2, op2, 32);  // convert big-endian to little-endian
  }
  else {
    movq(op2, Address(x, len, Address::times_4, 0));
    rorq(op2, 32);
  }

  bind(L_third_loop);
  // Two separate decrements so an odd leftover int can be routed to L_last_x.
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);

  movq(op1, Address(x, len, Address::times_4, 0));
  rorq(op1, 32);

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4, 0));

  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses offsetted zlen.
  add_one_64(z, zlen, carry, tmp1);

  pop(len);
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));  // last single int of x, zero-extended
  jmp(L_multiply);

  bind(L_second_loop_exit);
  // Two pops for the per-iteration copies, two for the originals saved above.
  pop(len);
  pop(zlen);
  pop(len);
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  // Restore the lowest bit of the square, lost when square_rshift divided by 2.
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4, -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}

/**
 * Helper function for mul_add()
 * Multiply the in[] by int k and add to out[] starting at offset offs using
 * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only quad int aligned length of in[] is operated on in this function.
 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
 * This function preserves out, in and k registers.
 * len and offset point to the appropriate index in "in" & "out" correspondingly
 * tmp5 has the carry.
 * other registers are temporary and are modified.
 *
 */
void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_first_loop, L_first_loop_exit;

  // tmp1 = number of full 4-int groups; the <4 remainder is left for the caller.
  movl(tmp1, len);
  shrl(tmp1, 2);

  bind(L_first_loop);
  subl(tmp1, 1);
  jccb(Assembler::negative, L_first_loop_exit);

  subl(len, 4);
  subl(offset, 4);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    // k is already in rdx (see mul_add), as required by mulxq.
    op2 = rdxReg;
  }

  // Process the upper 64 bits (two ints) of the 4-int group.
  // rorq converts big-endian int order to a little-endian 64-bit value.
  movq(op1, Address(in, len, Address::times_4, 8));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4, 8));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4, 8), sum);

  // Process the lower 64 bits (two ints) of the 4-int group.
  movq(op1, Address(in, len, Address::times_4, 0));
  rorq(op1, 32);
  movq(sum, Address(out, offset, Address::times_4, 0));
  rorq(sum, 32);
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }
  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offset, Address::times_4, 0), sum);

  jmp(L_first_loop);
  bind(L_first_loop_exit);
}

/**
 * Code for BigInteger::mulAdd() intrinsic
 *
 * rdi: out
 * rsi: in
 * r11: offs (out.length - offset)
 * rcx: len
 * r8:  k
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 * Multiply the in[] by word k and add to out[], return the carry in rax
 */
void MacroAssembler::mul_add(Register out, Register in, Register offs,
  Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_carry, L_last_in, L_done;

  // carry = 0;
  // for (int j=len-1; j >= 0; j--) {
  //    long product = (in[j] & LONG_MASK) * kLong +
  //                   (out[offs] & LONG_MASK) + carry;
  //    out[offs--] = (int)product;
  //    carry = product >>> 32;
  // }
  //
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  Register op2 = tmp2;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  const Register carry = tmp5;

  if (UseBMI2Instructions) {
    // Place k in rdx where mulxq (via multiply_add_64_bmi2) expects it.
    op2 = rdxReg;
    movl(op2, k);
  }
  else {
    movl(op2, k);
  }

  xorq(carry, carry);

  // First loop

  // Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
  // The carry is in tmp5
  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Multiply the trailing in[] entry using 64 bit by 32 bit, if any.
  // Two separate decrements so a single leftover int falls through to L_last_in.
  decrementl(len);
  jccb(Assembler::negative, L_carry);
  decrementl(len);
  jccb(Assembler::negative, L_last_in);

  movq(op1, Address(in, len, Address::times_4, 0));
  rorq(op1, 32);  // convert big-endian int pair to little-endian 64-bit value

  subl(offs, 2);
  movq(sum, Address(out, offs, Address::times_4, 0));
  rorq(sum, 32);

  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  // Store back in big endian from little endian
  rorq(sum, 0x20);
  movq(Address(out, offs, Address::times_4, 0), sum);

  testl(len, len);
  jccb(Assembler::zero, L_carry);

  // Multiply the last in[] entry, if any
  bind(L_last_in);
  movl(op1, Address(in, 0));
  movl(sum, Address(out, offs, Address::times_4, -4));

  // 32x32 -> 64 multiply-accumulate, mirroring multiply_add_64 at int width.
  movl(raxReg, k);
  mull(op1); //tmp4 * eax -> edx:eax
  addl(sum, carry);
  adcl(rdxReg, 0);
  addl(sum, raxReg);
  adcl(rdxReg, 0);
  movl(carry, rdxReg);

  movl(Address(out, offs, Address::times_4, -4), sum);

  bind(L_carry);
  //return tmp5/carry as carry in rax
  movl(rax, carry);

  bind(L_done);
  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
#endif

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  xorl(val, crc);
  andl(val, 0xFF);
  shrl(crc, 8); // unsigned shift
  xorl(crc, Address(table, val, Address::times_4, 0));
}