7276 jccb(Assembler::lessEqual, L_copy_8_chars); 7277 7278 bind(L_copy_8_chars_exit); 7279 subptr(len, 8); 7280 jccb(Assembler::zero, L_done); 7281 } 7282 7283 bind(L_copy_1_char); 7284 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0)); 7285 testl(tmp5, 0xff00); // check if Unicode char 7286 jccb(Assembler::notZero, L_copy_1_char_exit); 7287 movb(Address(dst, len, Address::times_1, 0), tmp5); 7288 addptr(len, 1); 7289 jccb(Assembler::less, L_copy_1_char); 7290 7291 bind(L_copy_1_char_exit); 7292 addptr(result, len); // len is negative count of not processed elements 7293 bind(L_done); 7294 } 7295 7296 /** 7297 * Emits code to update CRC-32 with a byte value according to constants in table 7298 * 7299 * @param [in,out]crc Register containing the crc. 7300 * @param [in]val Register containing the byte to fold into the CRC. 7301 * @param [in]table Register containing the table of crc constants. 7302 * 7303 * uint32_t crc; 7304 * val = crc_table[(val ^ crc) & 0xFF]; 7305 * crc = val ^ (crc >> 8); 7306 * 7307 */ 7308 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 7309 xorl(val, crc); 7310 andl(val, 0xFF); 7311 shrl(crc, 8); // unsigned shift 7312 xorl(crc, Address(table, val, Address::times_4, 0)); 7313 } 7314 7315 /** | 7276 jccb(Assembler::lessEqual, L_copy_8_chars); 7277 7278 bind(L_copy_8_chars_exit); 7279 subptr(len, 8); 7280 jccb(Assembler::zero, L_done); 7281 } 7282 7283 bind(L_copy_1_char); 7284 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0)); 7285 testl(tmp5, 0xff00); // check if Unicode char 7286 jccb(Assembler::notZero, L_copy_1_char_exit); 7287 movb(Address(dst, len, Address::times_1, 0), tmp5); 7288 addptr(len, 1); 7289 jccb(Assembler::less, L_copy_1_char); 7290 7291 bind(L_copy_1_char_exit); 7292 addptr(result, len); // len is negative count of not processed elements 7293 bind(L_done); 7294 } 7295 7296 #ifdef _LP64 7297 /** 7298 * Helper for multiply_to_len(). 7299 */ 7300 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) { 7301 addq(dest_lo, src1); 7302 adcq(dest_hi, 0); 7303 addq(dest_lo, src2); 7304 adcq(dest_hi, 0); 7305 } 7306 7307 /** 7308 * Multiply 64 bit by 64 bit first loop. 7309 */ 7310 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, 7311 Register y, Register y_idx, Register z, 7312 Register carry, Register product, 7313 Register idx, Register kdx) { 7314 // 7315 // jlong carry, x[], y[], z[]; 7316 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 7317 // huge_128 product = y[idx] * x[xstart] + carry; 7318 // z[kdx] = (jlong)product; 7319 // carry = (jlong)(product >>> 64); 7320 // } 7321 // z[xstart] = carry; 7322 // 7323 7324 Label L_first_loop, L_first_loop_exit; 7325 Label L_one_x, L_one_y, L_multiply; 7326 7327 decrementl(xstart); 7328 jcc(Assembler::negative, L_one_x); 7329 7330 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 7331 rorq(x_xstart, 32); // convert big-endian to little-endian 7332 7333 bind(L_first_loop); 7334 decrementl(idx); 7335 jcc(Assembler::negative, L_first_loop_exit); 7336 decrementl(idx); 7337 jcc(Assembler::negative, L_one_y); 7338 movq(y_idx, Address(y, idx, Address::times_4, 0)); 7339 rorq(y_idx, 32); // convert big-endian to little-endian 7340 bind(L_multiply); 7341 movq(product, x_xstart); 7342 mulq(y_idx); // product(rax) * y_idx -> rdx:rax 7343 addq(product, carry); 7344 adcq(rdx, 0); 7345 subl(kdx, 2); 7346 movl(Address(z, kdx, Address::times_4, 4), product); 7347 shrq(product, 32); 7348 movl(Address(z, kdx, Address::times_4, 0), product); 7349 movq(carry, rdx); 7350 jmp(L_first_loop); 7351 7352 bind(L_one_y); 7353 movl(y_idx, Address(y, 0)); 7354 jmp(L_multiply); 7355 7356 bind(L_one_x); 7357 movl(x_xstart, Address(x, 0)); 7358 jmp(L_first_loop); 7359 7360 bind(L_first_loop_exit); 7361 } 7362 7363 /** 7364 * Multiply 64 bit by 64 bit and add 128 bit. 7365 */ 7366 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z, 7367 Register yz_idx, Register idx, 7368 Register carry, Register product, int offset) { 7369 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 7370 // z[kdx] = (jlong)product; 7371 7372 movq(yz_idx, Address(y, idx, Address::times_4, offset)); 7373 rorq(yz_idx, 32); // convert big-endian to little-endian 7374 movq(product, x_xstart); 7375 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 7376 movq(yz_idx, Address(z, idx, Address::times_4, offset)); 7377 rorq(yz_idx, 32); // convert big-endian to little-endian 7378 7379 add2_with_carry(rdx, product, carry, yz_idx); 7380 7381 movl(Address(z, idx, Address::times_4, offset+4), product); 7382 shrq(product, 32); 7383 movl(Address(z, idx, Address::times_4, offset), product); 7384 7385 } 7386 7387 /** 7388 * Multiply 128 bit by 128 bit. Unrolled inner loop. 7389 */ 7390 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z, 7391 Register yz_idx, Register idx, Register jdx, 7392 Register carry, Register product, 7393 Register carry2) { 7394 // jlong carry, x[], y[], z[]; 7395 // int kdx = ystart+1; 7396 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 7397 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 7398 // z[kdx+idx+1] = (jlong)product; 7399 // jlong carry2 = (jlong)(product >>> 64); 7400 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 7401 // z[kdx+idx] = (jlong)product; 7402 // carry = (jlong)(product >>> 64); 7403 // } 7404 // idx += 2; 7405 // if (idx > 0) { 7406 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 7407 // z[kdx+idx] = (jlong)product; 7408 // carry = (jlong)(product >>> 64); 7409 // } 7410 // 7411 7412 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 7413 7414 movl(jdx, idx); 7415 andl(jdx, 0xFFFFFFFC); 7416 shrl(jdx, 2); 7417 7418 bind(L_third_loop); 7419 subl(jdx, 1); 7420 jcc(Assembler::negative, L_third_loop_exit); 7421 subl(idx, 4); 7422 7423 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); 7424 movq(carry2, rdx); 7425 7426 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); 7427 movq(carry, rdx); 7428 jmp(L_third_loop); 7429 7430 bind (L_third_loop_exit); 7431 7432 andl (idx, 0x3); 7433 jcc(Assembler::zero, L_post_third_loop_done); 7434 7435 Label L_check_1; 7436 subl(idx, 2); 7437 jcc(Assembler::negative, L_check_1); 7438 7439 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0); 7440 movq(carry, rdx); 7441 7442 bind (L_check_1); 7443 addl (idx, 0x2); 7444 andl (idx, 0x1); 7445 subl(idx, 1); 7446 jcc(Assembler::negative, L_post_third_loop_done); 7447 7448 movl(yz_idx, Address(y, idx, Address::times_4, 0)); 7449 movq(product, x_xstart); 7450 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 7451 movl(yz_idx, Address(z, idx, Address::times_4, 0)); 7452 7453 add2_with_carry(rdx, product, yz_idx, carry); 7454 7455 movl(Address(z, idx, Address::times_4, 0), product); 7456 shrq(product, 32); 7457 7458 shlq(rdx, 32); 7459 orq(product, rdx); 7460 movq(carry, product); 7461 7462 bind(L_post_third_loop_done); 7463 } 7464 7465 /** 7466 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop. 7467 * 7468 */ 7469 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z, 7470 Register carry, Register carry2, 7471 Register idx, Register jdx, 7472 Register yz_idx1, Register yz_idx2, 7473 Register tmp, Register tmp3, Register tmp4) { 7474 assert(UseBMI2Instructions, "should be used only when BMI2 is available"); 7475 7476 // jlong carry, x[], y[], z[]; 7477 // int kdx = ystart+1; 7478 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 7479 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry; 7480 // jlong carry2 = (jlong)(tmp3 >>> 64); 7481 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2; 7482 // carry = (jlong)(tmp4 >>> 64); 7483 // z[kdx+idx+1] = (jlong)tmp3; 7484 // z[kdx+idx] = (jlong)tmp4; 7485 // } 7486 // idx += 2; 7487 // if (idx > 0) { 7488 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry; 7489 // z[kdx+idx] = (jlong)yz_idx1; 7490 // carry = (jlong)(yz_idx1 >>> 64); 7491 // } 7492 // 7493 7494 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 7495 7496 movl(jdx, idx); 7497 andl(jdx, 0xFFFFFFFC); 7498 shrl(jdx, 2); 7499 7500 bind(L_third_loop); 7501 subl(jdx, 1); 7502 jcc(Assembler::negative, L_third_loop_exit); 7503 subl(idx, 4); 7504 7505 movq(yz_idx1, Address(y, idx, Address::times_4, 8)); 7506 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 7507 movq(yz_idx2, Address(y, idx, Address::times_4, 0)); 7508 rorxq(yz_idx2, yz_idx2, 32); 7509 7510 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 7511 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp 7512 7513 movq(yz_idx1, Address(z, idx, Address::times_4, 8)); 7514 rorxq(yz_idx1, yz_idx1, 32); 7515 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 7516 rorxq(yz_idx2, yz_idx2, 32); 7517 7518 if (VM_Version::supports_adx()) { 7519 adcxq(tmp3, carry); 7520 adoxq(tmp3, yz_idx1); 7521 7522 adcxq(tmp4, tmp); 7523 adoxq(tmp4, yz_idx2); 7524 7525 movl(carry, 0); // does not affect flags 7526 adcxq(carry2, carry); 7527 adoxq(carry2, carry); 7528 } else { 7529 add2_with_carry(tmp4, tmp3, carry, yz_idx1); 7530 add2_with_carry(carry2, tmp4, tmp, yz_idx2); 7531 } 7532 movq(carry, carry2); 7533 7534 movl(Address(z, idx, Address::times_4, 12), tmp3); 7535 shrq(tmp3, 32); 7536 movl(Address(z, idx, Address::times_4, 8), tmp3); 7537 7538 movl(Address(z, idx, Address::times_4, 4), tmp4); 7539 shrq(tmp4, 32); 7540 movl(Address(z, idx, Address::times_4, 0), tmp4); 7541 7542 jmp(L_third_loop); 7543 7544 bind (L_third_loop_exit); 7545 7546 andl (idx, 0x3); 7547 jcc(Assembler::zero, L_post_third_loop_done); 7548 7549 Label L_check_1; 7550 subl(idx, 2); 7551 jcc(Assembler::negative, L_check_1); 7552 7553 movq(yz_idx1, Address(y, idx, Address::times_4, 0)); 7554 rorxq(yz_idx1, yz_idx1, 32); 7555 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 7556 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 7557 rorxq(yz_idx2, yz_idx2, 32); 7558 7559 add2_with_carry(tmp4, tmp3, carry, yz_idx2); 7560 7561 movl(Address(z, idx, Address::times_4, 4), tmp3); 7562 shrq(tmp3, 32); 7563 movl(Address(z, idx, Address::times_4, 0), tmp3); 7564 movq(carry, tmp4); 7565 7566 bind (L_check_1); 7567 addl (idx, 0x2); 7568 andl (idx, 0x1); 7569 subl(idx, 1); 7570 jcc(Assembler::negative, L_post_third_loop_done); 7571 movl(tmp4, Address(y, idx, Address::times_4, 0)); 7572 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3 7573 movl(tmp4, Address(z, idx, Address::times_4, 0)); 7574 7575 add2_with_carry(carry2, tmp3, tmp4, carry); 7576 7577 movl(Address(z, idx, Address::times_4, 0), tmp3); 7578 shrq(tmp3, 32); 7579 7580 shlq(carry2, 32); 7581 orq(tmp3, carry2); 7582 movq(carry, tmp3); 7583 7584 bind(L_post_third_loop_done); 7585 } 7586 7587 /** 7588 * Code for BigInteger::multiplyToLen() instrinsic. 7589 * 7590 * rdi: x 7591 * rax: xlen 7592 * rsi: y 7593 * rcx: ylen 7594 * r8: z 7595 * r11: zlen 7596 * r12: tmp1 7597 * r13: tmp2 7598 * r14: tmp3 7599 * r15: tmp4 7600 * rbx: tmp5 7601 * 7602 */ 7603 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen, 7604 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 7605 ShortBranchVerifier sbv(this); 7606 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx); 7607 7608 push(tmp1); 7609 push(tmp2); 7610 push(tmp3); 7611 push(tmp4); 7612 push(tmp5); 7613 7614 push(xlen); 7615 push(zlen); 7616 7617 const Register idx = tmp1; 7618 const Register kdx = tmp2; 7619 const Register xstart = tmp3; 7620 7621 const Register y_idx = tmp4; 7622 const Register carry = tmp5; 7623 const Register product = xlen; 7624 const Register x_xstart = zlen; // reuse register 7625 7626 // First Loop. 7627 // 7628 // final static long LONG_MASK = 0xffffffffL; 7629 // int xstart = xlen - 1; 7630 // int ystart = ylen - 1; 7631 // long carry = 0; 7632 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 7633 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 7634 // z[kdx] = (int)product; 7635 // carry = product >>> 32; 7636 // } 7637 // z[xstart] = (int)carry; 7638 // 7639 7640 movl(idx, ylen); // idx = ylen; 7641 movl(kdx, zlen); // kdx = xlen+ylen; 7642 xorq(carry, carry); // carry = 0; 7643 7644 Label L_done; 7645 7646 movl(xstart, xlen); 7647 decrementl(xstart); 7648 jcc(Assembler::negative, L_done); 7649 7650 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); 7651 7652 Label L_second_loop; 7653 testl(kdx, kdx); 7654 jcc(Assembler::zero, L_second_loop); 7655 7656 Label L_carry; 7657 subl(kdx, 1); 7658 jcc(Assembler::zero, L_carry); 7659 7660 movl(Address(z, kdx, Address::times_4, 0), carry); 7661 shrq(carry, 32); 7662 subl(kdx, 1); 7663 7664 bind(L_carry); 7665 movl(Address(z, kdx, Address::times_4, 0), carry); 7666 7667 // Second and third (nested) loops. 7668 // 7669 // for (int i = xstart-1; i >= 0; i--) { // Second loop 7670 // carry = 0; 7671 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 7672 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 7673 // (z[k] & LONG_MASK) + carry; 7674 // z[k] = (int)product; 7675 // carry = product >>> 32; 7676 // } 7677 // z[i] = (int)carry; 7678 // } 7679 // 7680 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 7681 7682 const Register jdx = tmp1; 7683 7684 bind(L_second_loop); 7685 xorl(carry, carry); // carry = 0; 7686 movl(jdx, ylen); // j = ystart+1 7687 7688 subl(xstart, 1); // i = xstart-1; 7689 jcc(Assembler::negative, L_done); 7690 7691 push (z); 7692 7693 Label L_last_x; 7694 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j 7695 subl(xstart, 1); // i = xstart-1; 7696 jcc(Assembler::negative, L_last_x); 7697 7698 if (UseBMI2Instructions) { 7699 movq(rdx, Address(x, xstart, Address::times_4, 0)); 7700 rorxq(rdx, rdx, 32); // convert big-endian to little-endian 7701 } else { 7702 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 7703 rorq(x_xstart, 32); // convert big-endian to little-endian 7704 } 7705 7706 Label L_third_loop_prologue; 7707 bind(L_third_loop_prologue); 7708 7709 push (x); 7710 push (xstart); 7711 push (ylen); 7712 7713 7714 if (UseBMI2Instructions) { 7715 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4); 7716 } else { // !UseBMI2Instructions 7717 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x); 7718 } 7719 7720 pop(ylen); 7721 pop(xlen); 7722 pop(x); 7723 pop(z); 7724 7725 movl(tmp3, xlen); 7726 addl(tmp3, 1); 7727 movl(Address(z, tmp3, Address::times_4, 0), carry); 7728 subl(tmp3, 1); 7729 jccb(Assembler::negative, L_done); 7730 7731 shrq(carry, 32); 7732 movl(Address(z, tmp3, Address::times_4, 0), carry); 7733 jmp(L_second_loop); 7734 7735 // Next infrequent code is moved outside loops. 7736 bind(L_last_x); 7737 if (UseBMI2Instructions) { 7738 movl(rdx, Address(x, 0)); 7739 } else { 7740 movl(x_xstart, Address(x, 0)); 7741 } 7742 jmp(L_third_loop_prologue); 7743 7744 bind(L_done); 7745 7746 pop(zlen); 7747 pop(xlen); 7748 7749 pop(tmp5); 7750 pop(tmp4); 7751 pop(tmp3); 7752 pop(tmp2); 7753 pop(tmp1); 7754 } 7755 #endif 7756 7757 /** 7758 * Emits code to update CRC-32 with a byte value according to constants in table 7759 * 7760 * @param [in,out]crc Register containing the crc. 7761 * @param [in]val Register containing the byte to fold into the CRC. 7762 * @param [in]table Register containing the table of crc constants. 7763 * 7764 * uint32_t crc; 7765 * val = crc_table[(val ^ crc) & 0xFF]; 7766 * crc = val ^ (crc >> 8); 7767 * 7768 */ 7769 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 7770 xorl(val, crc); 7771 andl(val, 0xFF); 7772 shrl(crc, 8); // unsigned shift 7773 xorl(crc, Address(table, val, Address::times_4, 0)); 7774 } 7775 7776 /** |