src/cpu/x86/vm/macroAssembler_x86.cpp
Index Unified diffs Context diffs Sdiffs Wdiffs Patch New Old Previous File Next File 8055494 Sdiff src/cpu/x86/vm

src/cpu/x86/vm/macroAssembler_x86.cpp

Print this page




7276     jccb(Assembler::lessEqual, L_copy_8_chars);
7277 
7278     bind(L_copy_8_chars_exit);
7279     subptr(len, 8);
7280     jccb(Assembler::zero, L_done);
7281   }
7282 
7283   bind(L_copy_1_char);
7284   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7285   testl(tmp5, 0xff00);      // check if Unicode char
7286   jccb(Assembler::notZero, L_copy_1_char_exit);
7287   movb(Address(dst, len, Address::times_1, 0), tmp5);
7288   addptr(len, 1);
7289   jccb(Assembler::less, L_copy_1_char);
7290 
7291   bind(L_copy_1_char_exit);
7292   addptr(result, len); // len is negative count of not processed elements
7293   bind(L_done);
7294 }
7295 













































































































































































































































































































































































































































































7296 /**
7297  * Emits code to update CRC-32 with a byte value according to constants in table
7298  * (one table-driven step: only the low 8 bits of val ^ crc select the table entry).
7299  * @param [in,out]crc   Register containing the crc.
7300  * @param [in]val       Register containing the byte to fold into the CRC; clobbered (left holding the table index).
7301  * @param [in]table     Register containing the table of crc constants.
7302  *
7303  * uint32_t crc;
7304  * val = crc_table[(val ^ crc) & 0xFF];
7305  * crc = val ^ (crc >> 8);
7306  * (The emitted code never loads the table entry into val: it is XORed into crc straight from memory.)
7307  */
7308 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7309   xorl(val, crc);   // val ^= crc
7310   andl(val, 0xFF);  // keep only the low byte: this is the table index
7311   shrl(crc, 8); // unsigned shift
7312   xorl(crc, Address(table, val, Address::times_4, 0)); // crc ^= 32-bit entry at table + val*4
7313 }
7314 
7315 /**




7276     jccb(Assembler::lessEqual, L_copy_8_chars);
7277 
7278     bind(L_copy_8_chars_exit);
7279     subptr(len, 8);
7280     jccb(Assembler::zero, L_done);
7281   }
7282 
7283   bind(L_copy_1_char);
7284   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7285   testl(tmp5, 0xff00);      // check if Unicode char
7286   jccb(Assembler::notZero, L_copy_1_char_exit);
7287   movb(Address(dst, len, Address::times_1, 0), tmp5);
7288   addptr(len, 1);
7289   jccb(Assembler::less, L_copy_1_char);
7290 
7291   bind(L_copy_1_char_exit);
7292   addptr(result, len); // len is negative count of not processed elements
7293   bind(L_done);
7294 }
7295 
7296 #ifdef _LP64
7297 /**
7298  * Helper for multiply_to_len(): emits dest_hi:dest_lo += src1, then += src2, folding each 64-bit carry-out into dest_hi.
7299  */
7300 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7301   addq(dest_lo, src1); // low half += src1 (sets CF on overflow)
7302   adcq(dest_hi, 0);    // propagate that carry into the high half
7303   addq(dest_lo, src2); // low half += src2
7304   adcq(dest_hi, 0);    // propagate the second carry
7305 }
7306 
7307 /**
7308  * Multiply 64 bit by 64 bit first loop: multiplies each 64-bit chunk of y by one 64-bit chunk of x, stores the running product into z and leaves the final carry in the carry register.
7309  */
7310 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7311                                            Register y, Register y_idx, Register z,
7312                                            Register carry, Register product,
7313                                            Register idx, Register kdx) {
7314   // Pseudo-code of the emitted loop (each 64-bit step covers two 32-bit array words):
7315   //  jlong carry, x[], y[], z[];
7316   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7317   //    huge_128 product = y[idx] * x[xstart] + carry;
7318   //    z[kdx] = (jlong)product;
7319   //    carry  = (jlong)(product >>> 64);
7320   //  }
7321   //  z[xstart] = carry;
7322   //  (the final carry store is not emitted here; carry is simply left in its register)
7323   // mulq below multiplies by rax and writes rdx:rax, so product is expected to be rax.
7324   Label L_first_loop, L_first_loop_exit;
7325   Label L_one_x, L_one_y, L_multiply;
7326 
7327   decrementl(xstart);
7328   jcc(Assembler::negative, L_one_x); // only one 32-bit word of x available: load it alone
7329 
7330   movq(x_xstart, Address(x, xstart, Address::times_4,  0)); // load two adjacent 32-bit words as one 64-bit value
7331   rorq(x_xstart, 32); // convert big-endian to little-endian
7332 
7333   bind(L_first_loop);
7334   decrementl(idx);
7335   jcc(Assembler::negative, L_first_loop_exit); // no y words left
7336   decrementl(idx);
7337   jcc(Assembler::negative, L_one_y); // exactly one y word left
7338   movq(y_idx, Address(y, idx, Address::times_4,  0));
7339   rorq(y_idx, 32); // convert big-endian to little-endian
7340   bind(L_multiply);
7341   movq(product, x_xstart);
7342   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7343   addq(product, carry);
7344   adcq(rdx, 0); // rdx:product += carry
7345   subl(kdx, 2);
7346   movl(Address(z, kdx, Address::times_4,  4), product); // low 32 bits go to z[kdx+1]
7347   shrq(product, 32);
7348   movl(Address(z, kdx, Address::times_4,  0), product); // next 32 bits go to z[kdx]
7349   movq(carry, rdx); // high 64 bits carry into the next step
7350   jmp(L_first_loop);
7351 
7352   bind(L_one_y);
7353   movl(y_idx, Address(y,  0)); // 32-bit load of y[0]
7354   jmp(L_multiply);
7355 
7356   bind(L_one_x);
7357   movl(x_xstart, Address(x,  0)); // 32-bit load of x[0]
7358   jmp(L_first_loop);
7359 
7360   bind(L_first_loop_exit);
7361 }
7362 
7363 /**
7364  * Multiply 64 bit by 64 bit and add 128 bit: emits rdx:product = y-chunk * x_xstart + z-chunk + carry for the 64-bit chunks at word index idx plus byte displacement offset, and stores the low 64 bits back into z.
7365  */
7366 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7367                                             Register yz_idx, Register idx,
7368                                             Register carry, Register product, int offset) {
7369   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7370   //     z[kdx] = (jlong)product;
7371   // (z is addressed with the same idx/offset as y; yz_idx is scratch for both loads)
7372   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
7373   rorq(yz_idx, 32); // convert big-endian to little-endian
7374   movq(product, x_xstart);
7375   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
7376   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
7377   rorq(yz_idx, 32); // convert big-endian to little-endian
7378 
7379   add2_with_carry(rdx, product, carry, yz_idx); // rdx:product += carry + z-chunk
7380 
7381   movl(Address(z, idx, Address::times_4,  offset+4), product); // low 32 bits to the higher-addressed word
7382   shrq(product, 32);
7383   movl(Address(z, idx, Address::times_4,  offset), product);   // next 32 bits to the lower-addressed word
7384   // The high 64 bits of the sum remain in rdx for the caller.
7385 }
7386 
7387 /**
7388  * Multiply 128 bit by 128 bit. Unrolled inner loop: four 32-bit words of y/z per iteration via two multiply_add_128_x_128() steps, followed by a tail for the remaining 0..3 words.
7389  */
7390 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7391                                              Register yz_idx, Register idx, Register jdx,
7392                                              Register carry, Register product,
7393                                              Register carry2) {
7394   //   jlong carry, x[], y[], z[];
7395   //   int kdx = ystart+1;
7396   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7397   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7398   //     z[kdx+idx+1] = (jlong)product;
7399   //     jlong carry2  = (jlong)(product >>> 64);
7400   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7401   //     z[kdx+idx] = (jlong)product;
7402   //     carry  = (jlong)(product >>> 64);
7403   //   }
7404   //   idx += 2;
7405   //   if (idx > 0) {
7406   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7407   //     z[kdx+idx] = (jlong)product;
7408   //     carry  = (jlong)(product >>> 64);
7409   //   }
7410   //   (the emitted tail additionally handles a single leftover 32-bit word, see L_check_1)
7411 
7412   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7413 
7414   movl(jdx, idx);
7415   andl(jdx, 0xFFFFFFFC); // (mask is subsumed by the shift below)
7416   shrl(jdx, 2);          // jdx = idx / 4 = number of unrolled iterations
7417 
7418   bind(L_third_loop);
7419   subl(jdx, 1);
7420   jcc(Assembler::negative, L_third_loop_exit);
7421   subl(idx, 4); // step down by four 32-bit words
7422 
7423   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); // chunk at words idx+2, idx+3
7424   movq(carry2, rdx); // its high half carries into the next chunk
7425 
7426   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); // chunk at words idx, idx+1
7427   movq(carry, rdx);
7428   jmp(L_third_loop);
7429 
7430   bind (L_third_loop_exit);
7431 
7432   andl (idx, 0x3); // 0..3 words left over
7433   jcc(Assembler::zero, L_post_third_loop_done);
7434 
7435   Label L_check_1;
7436   subl(idx, 2);
7437   jcc(Assembler::negative, L_check_1); // fewer than two words left: skip the 64-bit step
7438 
7439   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7440   movq(carry, rdx);
7441 
7442   bind (L_check_1);
7443   addl (idx, 0x2); // undo the subtraction above
7444   andl (idx, 0x1); // is a single 32-bit word still pending?
7445   subl(idx, 1);    // idx becomes 0 when one is, -1 otherwise
7446   jcc(Assembler::negative, L_post_third_loop_done);
7447 
7448   movl(yz_idx, Address(y, idx, Address::times_4,  0)); // 32-bit load of the last y word
7449   movq(product, x_xstart);
7450   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7451   movl(yz_idx, Address(z, idx, Address::times_4,  0)); // 32-bit load of the matching z word
7452 
7453   add2_with_carry(rdx, product, yz_idx, carry); // rdx:product += z word + carry
7454 
7455   movl(Address(z, idx, Address::times_4,  0), product); // store the low 32 bits
7456   shrq(product, 32);
7457   // carry = (rdx << 32) | (product >> 32), i.e. the sum shifted right by one 32-bit word
7458   shlq(rdx, 32);
7459   orq(product, rdx);
7460   movq(carry, product);
7461 
7462   bind(L_post_third_loop_done);
7463 }
7464 
7465 /**
7466  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7467  * The multiplicand is taken implicitly from rdx by mulxq; rorxq/mulxq replace the rorq/mulq used by the non-BMI2 variant.
7468  */
7469 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7470                                                   Register carry, Register carry2,
7471                                                   Register idx, Register jdx,
7472                                                   Register yz_idx1, Register yz_idx2,
7473                                                   Register tmp, Register tmp3, Register tmp4) {
7474   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7475 
7476   //   jlong carry, x[], y[], z[];
7477   //   int kdx = ystart+1;
7478   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7479   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7480   //     jlong carry2  = (jlong)(tmp3 >>> 64);
7481   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
7482   //     carry  = (jlong)(tmp4 >>> 64);
7483   //     z[kdx+idx+1] = (jlong)tmp3;
7484   //     z[kdx+idx] = (jlong)tmp4;
7485   //   }
7486   //   idx += 2;
7487   //   if (idx > 0) {
7488   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7489   //     z[kdx+idx] = (jlong)yz_idx1;
7490   //     carry  = (jlong)(yz_idx1 >>> 64);
7491   //   }
7492   //   (the emitted tail additionally handles a single leftover 32-bit word, see L_check_1)
7493 
7494   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7495 
7496   movl(jdx, idx);
7497   andl(jdx, 0xFFFFFFFC); // (mask is subsumed by the shift below)
7498   shrl(jdx, 2);          // jdx = idx / 4 = number of unrolled iterations
7499 
7500   bind(L_third_loop);
7501   subl(jdx, 1);
7502   jcc(Assembler::negative, L_third_loop_exit);
7503   subl(idx, 4); // step down by four 32-bit words
7504 
7505   movq(yz_idx1,  Address(y, idx, Address::times_4,  8)); // y chunk at words idx+2, idx+3
7506   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7507   movq(yz_idx2, Address(y, idx, Address::times_4,  0));  // y chunk at words idx, idx+1
7508   rorxq(yz_idx2, yz_idx2, 32);
7509 
7510   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
7511   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
7512 
7513   movq(yz_idx1,  Address(z, idx, Address::times_4,  8)); // reuse yz_idx1/yz_idx2 for the matching z chunks
7514   rorxq(yz_idx1, yz_idx1, 32);
7515   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7516   rorxq(yz_idx2, yz_idx2, 32);
7517   // Sum the 192-bit value carry2:tmp4:tmp3 = products + z chunks + carry.
7518   if (VM_Version::supports_adx()) {
7519     adcxq(tmp3, carry);   // CF chain; NOTE(review): relies on CF/OF being clear here (last flag writer looks like subl(idx, 4)) - confirm
7520     adoxq(tmp3, yz_idx1); // OF chain, independent of the CF chain
7521 
7522     adcxq(tmp4, tmp);
7523     adoxq(tmp4, yz_idx2);
7524 
7525     movl(carry, 0); // does not affect flags
7526     adcxq(carry2, carry); // fold the final CF into carry2
7527     adoxq(carry2, carry); // fold the final OF into carry2
7528   } else {
7529     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7530     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7531   }
7532   movq(carry, carry2);
7533 
7534   movl(Address(z, idx, Address::times_4, 12), tmp3); // store tmp3 as words idx+3 (low) and idx+2 (high)
7535   shrq(tmp3, 32);
7536   movl(Address(z, idx, Address::times_4,  8), tmp3);
7537 
7538   movl(Address(z, idx, Address::times_4,  4), tmp4); // store tmp4 as words idx+1 (low) and idx (high)
7539   shrq(tmp4, 32);
7540   movl(Address(z, idx, Address::times_4,  0), tmp4);
7541 
7542   jmp(L_third_loop);
7543 
7544   bind (L_third_loop_exit);
7545 
7546   andl (idx, 0x3); // 0..3 words left over
7547   jcc(Assembler::zero, L_post_third_loop_done);
7548 
7549   Label L_check_1;
7550   subl(idx, 2);
7551   jcc(Assembler::negative, L_check_1); // fewer than two words left: skip the 64-bit step
7552 
7553   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
7554   rorxq(yz_idx1, yz_idx1, 32);
7555   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
7556   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7557   rorxq(yz_idx2, yz_idx2, 32);
7558 
7559   add2_with_carry(tmp4, tmp3, carry, yz_idx2); // tmp4:tmp3 += carry + z chunk
7560 
7561   movl(Address(z, idx, Address::times_4,  4), tmp3);
7562   shrq(tmp3, 32);
7563   movl(Address(z, idx, Address::times_4,  0), tmp3);
7564   movq(carry, tmp4);
7565 
7566   bind (L_check_1);
7567   addl (idx, 0x2); // undo the subtraction above
7568   andl (idx, 0x1); // is a single 32-bit word still pending?
7569   subl(idx, 1);    // idx becomes 0 when one is, -1 otherwise
7570   jcc(Assembler::negative, L_post_third_loop_done);
7571   movl(tmp4, Address(y, idx, Address::times_4,  0)); // 32-bit load of the last y word
7572   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
7573   movl(tmp4, Address(z, idx, Address::times_4,  0)); // 32-bit load of the matching z word
7574 
7575   add2_with_carry(carry2, tmp3, tmp4, carry); // carry2:tmp3 += z word + carry
7576 
7577   movl(Address(z, idx, Address::times_4,  0), tmp3); // store the low 32 bits
7578   shrq(tmp3, 32);
7579   // carry = (carry2 << 32) | (tmp3 >> 32), i.e. the sum shifted right by one 32-bit word
7580   shlq(carry2, 32);
7581   orq(tmp3, carry2);
7582   movq(carry, tmp3);
7583 
7584   bind(L_post_third_loop_done);
7585 }
7586 
7587 /**
7588  * Code for BigInteger::multiplyToLen() intrinsic.
7589  * Emits z = x * y over arrays of 32-bit words, following the Java pseudo-code quoted in the body.
7590  * rdi: x
7591  * rax: xlen
7592  * rsi: y
7593  * rcx: ylen
7594  * r8:  z
7595  * r11: zlen
7596  * r12: tmp1
7597  * r13: tmp2
7598  * r14: tmp3
7599  * r15: tmp4
7600  * rbx: tmp5
7601  * tmp1..tmp5, xlen and zlen are saved on entry and restored at L_done; rdx is used implicitly and is not saved.
7602  */
7603 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
7604                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7605   ShortBranchVerifier sbv(this);
7606   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7607   // Save the scratch registers; they are popped in reverse order at L_done.
7608   push(tmp1);
7609   push(tmp2);
7610   push(tmp3);
7611   push(tmp4);
7612   push(tmp5);
7613   // xlen and zlen are saved too: their registers are reused below as product and x_xstart.
7614   push(xlen);
7615   push(zlen);
7616 
7617   const Register idx = tmp1;
7618   const Register kdx = tmp2;
7619   const Register xstart = tmp3;
7620 
7621   const Register y_idx = tmp4;
7622   const Register carry = tmp5;
7623   const Register product  = xlen;
7624   const Register x_xstart = zlen;  // reuse register
7625 
7626   // First Loop.
7627   //
7628   //  final static long LONG_MASK = 0xffffffffL;
7629   //  int xstart = xlen - 1;
7630   //  int ystart = ylen - 1;
7631   //  long carry = 0;
7632   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7633   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7634   //    z[kdx] = (int)product;
7635   //    carry = product >>> 32;
7636   //  }
7637   //  z[xstart] = (int)carry;
7638   //
7639 
7640   movl(idx, ylen);      // idx = ylen;
7641   movl(kdx, zlen);      // kdx = xlen+ylen;
7642   xorq(carry, carry);   // carry = 0;
7643 
7644   Label L_done;
7645 
7646   movl(xstart, xlen);
7647   decrementl(xstart);   // xstart = xlen - 1
7648   jcc(Assembler::negative, L_done); // no x words: nothing to multiply
7649 
7650   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7651   // Store the carry left by the first loop into the remaining one or two low z words (if any).
7652   Label L_second_loop;
7653   testl(kdx, kdx);
7654   jcc(Assembler::zero, L_second_loop); // no z word left for the carry
7655 
7656   Label L_carry;
7657   subl(kdx, 1);
7658   jcc(Assembler::zero, L_carry); // only z[0] is left: store just the low 32 bits
7659 
7660   movl(Address(z, kdx, Address::times_4,  0), carry); // low 32 bits of carry
7661   shrq(carry, 32);
7662   subl(kdx, 1);
7663 
7664   bind(L_carry);
7665   movl(Address(z, kdx, Address::times_4,  0), carry);
7666 
7667   // Second and third (nested) loops.
7668   //
7669   // for (int i = xstart-1; i >= 0; i--) { // Second loop
7670   //   carry = 0;
7671   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7672   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
7673   //                    (z[k] & LONG_MASK) + carry;
7674   //     z[k] = (int)product;
7675   //     carry = product >>> 32;
7676   //   }
7677   //   z[i] = (int)carry;
7678   // }
7679   //
7680   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
7681 
7682   const Register jdx = tmp1;
7683 
7684   bind(L_second_loop);
7685   xorl(carry, carry);    // carry = 0;
7686   movl(jdx, ylen);       // j = ystart+1
7687 
7688   subl(xstart, 1);       // i = xstart-1;
7689   jcc(Assembler::negative, L_done); // all x words consumed
7690 
7691   push (z); // z is advanced by the lea below; the original pointer is restored by pop(z) after the inner loop
7692 
7693   Label L_last_x;
7694   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
7695   subl(xstart, 1);       // i = xstart-1;
7696   jcc(Assembler::negative, L_last_x); // only one 32-bit word of x left
7697   // Load the next 64-bit chunk of x into the multiplicand register of the selected inner loop.
7698   if (UseBMI2Instructions) {
7699     movq(rdx,  Address(x, xstart, Address::times_4,  0));
7700     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
7701   } else {
7702     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7703     rorq(x_xstart, 32);  // convert big-endian to little-endian
7704   }
7705 
7706   Label L_third_loop_prologue;
7707   bind(L_third_loop_prologue);
7708   // Save the registers handed to the inner loop under other roles (x as carry2, ylen as jdx) and the current xstart.
7709   push (x);
7710   push (xstart);
7711   push (ylen);
7712 
7713 
7714   if (UseBMI2Instructions) {
7715     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
7716   } else { // !UseBMI2Instructions
7717     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
7718   }
7719 
7720   pop(ylen);
7721   pop(xlen); // receives the value pushed from xstart above (xlen's register doubles as product)
7722   pop(x);
7723   pop(z);
7724 
7725   movl(tmp3, xlen); // put the saved xstart back into tmp3 (= xstart)
7726   addl(tmp3, 1);
7727   movl(Address(z, tmp3, Address::times_4,  0), carry); // low 32 bits of carry go to z[xstart+1]
7728   subl(tmp3, 1);
7729   jccb(Assembler::negative, L_done); // xstart was -1 (single-word case): no room for the high half, finished
7730 
7731   shrq(carry, 32);
7732   movl(Address(z, tmp3, Address::times_4,  0), carry); // high 32 bits of carry go to z[xstart]
7733   jmp(L_second_loop);
7734 
7735   // Next infrequent code is moved outside loops.
7736   bind(L_last_x);
7737   if (UseBMI2Instructions) {
7738     movl(rdx, Address(x,  0)); // 32-bit load of x[0]
7739   } else {
7740     movl(x_xstart, Address(x,  0)); // 32-bit load of x[0]
7741   }
7742   jmp(L_third_loop_prologue);
7743 
7744   bind(L_done);
7745   // Restore everything saved on entry, in reverse push order.
7746   pop(zlen);
7747   pop(xlen);
7748 
7749   pop(tmp5);
7750   pop(tmp4);
7751   pop(tmp3);
7752   pop(tmp2);
7753   pop(tmp1);
7754 }
7755 #endif
7756 
7757 /**
7758  * Emits code to update CRC-32 with a byte value according to constants in table
7759  * (one table-driven step: only the low 8 bits of val ^ crc select the table entry).
7760  * @param [in,out]crc   Register containing the crc.
7761  * @param [in]val       Register containing the byte to fold into the CRC; clobbered (left holding the table index).
7762  * @param [in]table     Register containing the table of crc constants.
7763  *
7764  * uint32_t crc;
7765  * val = crc_table[(val ^ crc) & 0xFF];
7766  * crc = val ^ (crc >> 8);
7767  * (The emitted code never loads the table entry into val: it is XORed into crc straight from memory.)
7768  */
7769 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7770   xorl(val, crc);   // val ^= crc
7771   andl(val, 0xFF);  // keep only the low byte: this is the table index
7772   shrl(crc, 8); // unsigned shift
7773   xorl(crc, Address(table, val, Address::times_4, 0)); // crc ^= 32-bit entry at table + val*4
7774 }
7775 
7776 /**


src/cpu/x86/vm/macroAssembler_x86.cpp
Index Unified diffs Context diffs Sdiffs Wdiffs Patch New Old Previous File Next File