33 #include "prims/methodHandles.hpp"
34 #include "runtime/biasedLocking.hpp"
35 #include "runtime/icache.hpp"
36 #include "runtime/interfaceSupport.hpp"
37 #include "runtime/objectMonitor.hpp"
38 #include "runtime/os.hpp"
39 #include "runtime/sharedRuntime.hpp"
40 #include "runtime/stubRoutines.hpp"
41 #include "utilities/macros.hpp"
42 #if INCLUDE_ALL_GCS
43 #include "gc/g1/g1CollectedHeap.inline.hpp"
44 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
45 #include "gc/g1/heapRegion.hpp"
46 #endif // INCLUDE_ALL_GCS
47
48 #ifdef PRODUCT
49 #define BLOCK_COMMENT(str) // nothing
50 #else
51 #define BLOCK_COMMENT(str) block_comment(str)
52 #endif
53
54 #ifdef ASSERT
55 // On RISC, there's no benefit to verifying instruction boundaries.
56 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
57 #endif
58
59 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
60 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
61 if (Assembler::is_simm(si31, 16)) {
62 ld(d, si31, a);
63 if (emit_filler_nop) nop();
64 } else {
65 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
66 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
67 addis(d, a, hi);
68 ld(d, lo, d);
69 }
70 }
71
72 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
3416 //14:
3417 if (cntval & 2) {
3418 lwzx(R0, str1_reg, index_reg);
3419 lwzx(tmp2_reg, str2_reg, index_reg);
3420 cmpw(CCR0, R0, tmp2_reg);
3421 bne(CCR0, Ldone_false);
3422 if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3423 }
3424 if (cntval & 1) {
3425 lhzx(R0, str1_reg, index_reg);
3426 lhzx(tmp2_reg, str2_reg, index_reg);
3427 cmpw(CCR0, R0, tmp2_reg);
3428 bne(CCR0, Ldone_false);
3429 }
3430 // fallthru: true
3431 }
3432 li(result_reg, 1);
3433 bind(Ldone_false);
3434 }
3435
// Emit a 128-bit accumulate of two 64-bit operands:
//   dest_lo += src1 + src2
//   dest_hi += carry1 + carry2   (the carries of the two low-word adds)
// Clobbers R0, which serves as a zero source so each adde adds only the
// carry bit (CA) into dest_hi.
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);                      // R0 := 0; adde below then adds just CA.
  addc(dest_lo, dest_lo, src1);   // dest_lo += src1, sets CA.
  adde(dest_hi, dest_hi, R0);     // dest_hi += carry of first add.
  addc(dest_lo, dest_lo, src2);   // dest_lo += src2, sets CA.
  adde(dest_hi, dest_hi, R0);     // dest_hi += carry of second add.
}
3447
3448 // Multiply 64 bit by 64 bit first loop.
3449 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3450 Register x_xstart,
3451 Register y, Register y_idx,
3452 Register z,
3453 Register carry,
3454 Register product_high, Register product,
3455 Register idx, Register kdx,
3498 addc(product, product, carry); // Add carry to result.
3499 adde(product_high, product_high, tmp); // Add carry of the last addition.
3500 addi(kdx, kdx, -2);
3501
3502 // Store result.
3503 #ifdef VM_LITTLE_ENDIAN
3504 rldicl(product, product, 32, 0);
3505 #endif
3506 sldi(tmp, kdx, LogBytesPerInt);
3507 stdx(product, z, tmp);
3508 mr_if_needed(carry, product_high);
3509 b(L_first_loop);
3510
3511
3512 bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3513
3514 lwz(y_idx, 0, y);
3515 b(L_multiply);
3516
3517
3518 bind( L_one_x ); // Load one 32 bit portion of x as (0,value).
3519
3520 lwz(x_xstart, 0, x);
3521 b(L_first_loop);
3522
3523 bind(L_first_loop_exit);
3524 }
3525
// Multiply 64 bit by 64 bit and add 128 bit.
//   (product_high:product) = x_xstart * y[element] + z[element] + carry
//   z[element] = low 64 bits of that sum
// The element address is idx scaled by BytesPerInt plus the byte 'offset'.
// Clobbers R0 (via add2_with_carry) and yz_idx/tmp as scratch.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  // z[kdx] = (jlong)product;

  // Byte offset of the 64-bit element: idx * BytesPerInt (+ offset).
  sldi(tmp, idx, LogBytesPerInt);
  if ( offset ) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);                 // Load 64 bits from y.
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);       // Rotate by 32: swap the two 32-bit halves to int order.
#endif

  multiply64(product_high, product, x_xstart, yz_idx);  // 64x64 -> 128-bit product.
  ldx(yz_idx, z, tmp);                 // Load 64 bits from z.
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);       // Swap 32-bit halves.
#endif

  add2_with_carry(product_high, product, carry, yz_idx);  // Add z element and carry; clobbers R0.

  // Recompute the store offset. NOTE(review): looks redundant, but tmp may
  // alias R0, which add2_with_carry just clobbered — confirm before removing.
  sldi(tmp, idx, LogBytesPerInt);
  if ( offset ) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);     // Swap halves back before storing.
#endif
  stdx(product, z, tmp);               // Store low 64 bits of the result to z.
}
3562
3563 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3564 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3565 Register y, Register z,
3566 Register yz_idx, Register idx, Register carry,
3567 Register product_high, Register product,
3568 Register carry2, Register tmp) {
3569
3570 // jlong carry, x[], y[], z[];
3571 // int kdx = ystart+1;
3572 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3573 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3574 // z[kdx+idx+1] = (jlong)product;
|
33 #include "prims/methodHandles.hpp"
34 #include "runtime/biasedLocking.hpp"
35 #include "runtime/icache.hpp"
36 #include "runtime/interfaceSupport.hpp"
37 #include "runtime/objectMonitor.hpp"
38 #include "runtime/os.hpp"
39 #include "runtime/sharedRuntime.hpp"
40 #include "runtime/stubRoutines.hpp"
41 #include "utilities/macros.hpp"
42 #if INCLUDE_ALL_GCS
43 #include "gc/g1/g1CollectedHeap.inline.hpp"
44 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
45 #include "gc/g1/heapRegion.hpp"
46 #endif // INCLUDE_ALL_GCS
47
48 #ifdef PRODUCT
49 #define BLOCK_COMMENT(str) // nothing
50 #else
51 #define BLOCK_COMMENT(str) block_comment(str)
52 #endif
53 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
54
55 #ifdef ASSERT
56 // On RISC, there's no benefit to verifying instruction boundaries.
57 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
58 #endif
59
// Load a doubleword from [a + si31], where si31 is a non-negative offset that
// fits into 31 bits. If the offset fits a signed 16-bit immediate, a single ld
// is emitted (optionally followed by a filler nop so the emitted pattern has a
// fixed size); otherwise the offset is split into two si16 halves and applied
// with an addis/ld pair through d.
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);   // d = a + (hi << 16)
    ld(d, lo, d);      // d = *(d + lo)
  }
}
72
73 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
3417 //14:
3418 if (cntval & 2) {
3419 lwzx(R0, str1_reg, index_reg);
3420 lwzx(tmp2_reg, str2_reg, index_reg);
3421 cmpw(CCR0, R0, tmp2_reg);
3422 bne(CCR0, Ldone_false);
3423 if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3424 }
3425 if (cntval & 1) {
3426 lhzx(R0, str1_reg, index_reg);
3427 lhzx(tmp2_reg, str2_reg, index_reg);
3428 cmpw(CCR0, R0, tmp2_reg);
3429 bne(CCR0, Ldone_false);
3430 }
3431 // fallthru: true
3432 }
3433 li(result_reg, 1);
3434 bind(Ldone_false);
3435 }
3436
3437 // Helpers for Intrinsic Emitters
3438 //
3439 // Revert the byte order of a 32bit value in a register
3440 // src: 0x44556677
3441 // dst: 0x77665544
3442 // Three steps to obtain the result:
3443 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3444 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3445 // This value initializes dst.
3446 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3447 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3448 // This value is mask inserted into dst with a [0..23] mask of 1s.
3449 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3450 // This value is mask inserted into dst with a [8..15] mask of 1s.
// Reverse the byte order of the 32-bit value in src, placing the result in dst
// (three rotate-and-mask steps, described in the comment block above). dst and
// src must differ because src is read again after dst is partially built.
void MacroAssembler::load_reverse_32(Register dst, Register src) {
  assert_different_registers(dst, src);

  rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
  rlwimi(dst, src, 3*8, 0, 23);  // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
  rlwimi(dst, src, 1*8, 8, 15);  // Insert byte 6 into position 5, leave the rest alone.
}
3458
3459 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3460 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3461 // body size from 20 to 16 instructions.
3462 // Returns the offset that was used to calculate the address of column tc3.
3463 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3464 // at hand, the original table address can be easily reconstructed.
3465 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3466
3467 #ifdef VM_LITTLE_ENDIAN
3468 // This is what we implement (the DOLIT4 part):
3469 // ========================================================================= */
3470 // #define DOLIT4 c ^= *buf4++; \
3471 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3472 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3473 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3474 // ========================================================================= */
3475 const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3476 const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3477 const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3478 const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3479 #else
3480 // This is what we implement (the DOBIG4 part):
3481 // =========================================================================
3482 // #define DOBIG4 c ^= *++buf4; \
3483 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3484 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3485 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3486 // =========================================================================
3487 const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3488 const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3489 const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3490 const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3491 #endif
3492 assert_different_registers(table, tc0, tc1, tc2);
3493 assert(table == tc3, "must be!");
3494
3495 if (ix0 != 0) addi(tc0, table, ix0);
3496 if (ix1 != 0) addi(tc1, table, ix1);
3497 if (ix2 != 0) addi(tc2, table, ix2);
3498 if (ix3 != 0) addi(tc3, table, ix3);
3499
3500 return ix3;
3501 }
3502
/**
 * One table-driven CRC fold step:
 *   uint32_t crc;
 *   timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 * i.e. crc = table[val & 0xFF] ^ (crc >> 8), where the index byte comes
 * from val. 'tmp' receives (val & 0xFF) << 2, the byte pre-scaled to a
 * 4-byte table element offset. val may alias crc; the two emission orders
 * below ensure the unmodified source byte is captured before crc is shifted.
 */
void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
  assert_different_registers(crc, table, tmp);
  assert_different_registers(val, table);

  if (crc == val) {                      // Must rotate first to use the unmodified value.
    rlwinm(tmp, val, 2, 24-2, 31-2);     // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
                                         // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
    srwi(crc, crc, 8);                   // Unsigned shift, clear leftmost 8 bits.
  } else {
    srwi(crc, crc, 8);                   // Unsigned shift, clear leftmost 8 bits.
    rlwinm(tmp, val, 2, 24-2, 31-2);     // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
  }
  lwzx(tmp, table, tmp);                 // Load table element at table + scaled index.
  xorr(crc, crc, tmp);                   // Fold table value into the shifted crc.
}
3522
/**
 * One 8-bit CRC fold step applied to crc itself:
 *   uint32_t crc;
 *   timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 * Convenience wrapper around fold_byte_crc32 with val == crc.
 */
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  fold_byte_crc32(crc, crc, table, tmp);
}
3530
/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 * Note: val is clobbered — it is xor-ed with crc and then reused as the
 * scratch register for the table lookup.
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  BLOCK_COMMENT("update_byte_crc32:");
  xorr(val, val, crc);                    // Index byte = val ^ crc.
  fold_byte_crc32(crc, val, table, val);  // crc = table[index] ^ (crc >> 8).
}
3547
/**
 * Byte-wise CRC update loop: folds 'len' bytes from 'buf' into 'crc'.
 *
 * @param crc   register containing existing CRC (32-bit); updated in place
 * @param buf   register pointing to input byte buffer (byte*); advanced past
 *              the processed bytes
 * @param len   register containing number of bytes; loaded into CTR
 * @param table register pointing to CRC table
 * @param data  scratch register holding the current byte
 * @param loopAlignment  align the loop head to 32 bytes instead of 4
 * @param invertCRC      one's-complement crc before and after the loop
 *                       (CRC-32 pre-/post-conditioning)
 */
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
                                           Register data, bool loopAlignment, bool invertCRC) {
  assert_different_registers(crc, buf, len, table, data);

  Label L_mainLoop, L_done;
  const int mainLoop_stepping  = 1;
  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;

  // Process all bytes in a single-byte loop.
  cmpdi(CCR0, len, 0);                   // Anything to do?
  mtctr(len);                            // CTR := len (harmless when len == 0; we branch away next).
  beq(CCR0, L_done);

  if (invertCRC) {
    nand(crc, crc, crc);                 // ~c
  }

  align(mainLoop_alignment);
  BIND(L_mainLoop);
    lbz(data, 0, buf);                   // Byte from buffer, zero-extended.
    addi(buf, buf, mainLoop_stepping);   // Advance buffer position.
    update_byte_crc32(crc, data, table);
    bdnz(L_mainLoop);                    // Iterate.

  if (invertCRC) {
    nand(crc, crc, crc);                 // ~c
  }

  bind(L_done);
}
3584
/**
 * Emits code to update CRC-32 with a 4-byte value according to constants in table
 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
 */
// A note on the lookup table address(es):
// The lookup table consists of two sets of four columns each.
// The columns {0..3} are used for little-endian machines.
// The columns {4..7} are used for big-endian machines.
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, t3);

  // XOR crc with next four bytes of buffer.
  lwz(t3, bufDisp, buf);
  if (bufInc != 0) {
    addi(buf, buf, bufInc);              // Advance buffer position if requested.
  }
  xorr(t3, t3, crc);

  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  rlwinm(t0, t3, 2,         24-2, 31-2); // ((t1 >>  0) & 0xff) << 2
  rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >>  8) & 0xff) << 2
  rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2
  rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2

  // Use the pre-calculated column addresses.
  // Load pre-calculated table values.
  lwzx(t0, tc0, t0);
  lwzx(t1, tc1, t1);
  lwzx(t2, tc2, t2);
  lwzx(t3, tc3, t3);

  // Calculate new crc from table values.
  xorr(t0, t0, t1);
  xorr(t2, t2, t3);
  xorr(crc, t0, t2);                     // Now crc contains the final checksum value.
}
3627
/**
 * CRC-32 kernel processing two 32-bit words (8 bytes) per main-loop iteration.
 *
 * @param crc   register containing existing CRC (32-bit); updated in place
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table (two 4-column sets, see
 *              crc32_table_columns)
 *
 * Uses R9..R12 as work register. Must be saved/restored by caller!
 */
void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register  tmp  = t0;
  Register  data = t0;
  Register  tmp2 = t1;
  const int mainLoop_stepping  = 8;  // Bytes consumed per main-loop iteration (two words).
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following aghi(len, -stepping) and aghi(len, +stepping).
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_2word {");

  nand(crc, crc, crc);                      // ~c (CRC-32 pre-conditioning).

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                         // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                  // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                  // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                    // For less than one mainloop_stepping left, do only tail processing
      mr(len, tmp);                         // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
  }

  srdi(tmp2, len, log_stepping);            // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);      // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                    // Load_reverse needs separate registers to work on.
                                            // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);             // Revert byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
    update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);             // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                             // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false, false);

  nand(crc, crc, crc);                      // ~c (CRC-32 post-conditioning).
  BLOCK_COMMENT("} kernel_crc32_2word");
}
3721
/**
 * CRC-32 kernel processing one 32-bit word (4 bytes) per main-loop iteration.
 *
 * @param crc   register containing existing CRC (32-bit); updated in place
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table (two 4-column sets, see
 *              crc32_table_columns)
 *
 * uses R9..R12 as work register. Must be saved/restored by caller!
 */
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register  tmp  = t0;
  Register  data = t0;
  Register  tmp2 = t1;
  const int mainLoop_stepping  = 4;  // Bytes consumed per main-loop iteration (one word).
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following aghi(len, -stepping) and aghi(len, +stepping).
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_1word {");

  nand(crc, crc, crc);                      // ~c (CRC-32 pre-conditioning).

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                         // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                  // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                  // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                    // For less than one mainloop_stepping left, do only tail processing
      mr(len, tmp);                         // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
  }

  srdi(tmp2, len, log_stepping);            // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);      // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                    // Load_reverse needs separate registers to work on.
                                            // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);             // Revert byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
    update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);             // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                             // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false, false);

  nand(crc, crc, crc);                      // ~c (CRC-32 post-conditioning).
  BLOCK_COMMENT("} kernel_crc32_1word");
}
3814
3815 /**
3816 * @param crc register containing existing CRC (32-bit)
3817 * @param buf register pointing to input byte buffer (byte*)
3818 * @param len register containing number of bytes
3819 * @param table register pointing to CRC table
3820 *
3821 * Uses R7_ARG5, R8_ARG6 as work registers.
3822 */
3823 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3824 Register t0, Register t1, Register t2, Register t3) {
3825 assert_different_registers(crc, buf, len, table);
3826
3827 Register data = t0; // Holds the current byte to be folded into crc.
3828
3829 BLOCK_COMMENT("kernel_crc32_1byte {");
3830
3831 // Process all bytes in a single-byte loop.
3832 update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3833
3834 BLOCK_COMMENT("} kernel_crc32_1byte");
3835 }
3836
// Fold exactly one byte (buf[0]) into crc, including the CRC-32 pre-/post-
// inversion of crc. 'len' is unused and buf is not advanced.
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
  assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);

  BLOCK_COMMENT("kernel_crc32_singleByte:");
  nand(crc, crc, crc);                // ~c (pre-conditioning)

  lbz(tmp, 0, buf);                   // Byte from buffer, zero-extended.
  update_byte_crc32(crc, tmp, table);

  nand(crc, crc, crc);                // ~c (post-conditioning)
}
3848
// Emit a 128-bit accumulate of two 64-bit operands:
//   dest_lo += src1 + src2
//   dest_hi += carry1 + carry2   (the carries of the two low-word adds)
// Clobbers R0, which serves as a zero source so each adde adds only the
// carry bit (CA) into dest_hi.
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);                      // R0 := 0; adde below then adds just CA.
  addc(dest_lo, dest_lo, src1);   // dest_lo += src1, sets CA.
  adde(dest_hi, dest_hi, R0);     // dest_hi += carry of first add.
  addc(dest_lo, dest_lo, src2);   // dest_lo += src2, sets CA.
  adde(dest_hi, dest_hi, R0);     // dest_hi += carry of second add.
}
3860
3861 // Multiply 64 bit by 64 bit first loop.
3862 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3863 Register x_xstart,
3864 Register y, Register y_idx,
3865 Register z,
3866 Register carry,
3867 Register product_high, Register product,
3868 Register idx, Register kdx,
3911 addc(product, product, carry); // Add carry to result.
3912 adde(product_high, product_high, tmp); // Add carry of the last addition.
3913 addi(kdx, kdx, -2);
3914
3915 // Store result.
3916 #ifdef VM_LITTLE_ENDIAN
3917 rldicl(product, product, 32, 0);
3918 #endif
3919 sldi(tmp, kdx, LogBytesPerInt);
3920 stdx(product, z, tmp);
3921 mr_if_needed(carry, product_high);
3922 b(L_first_loop);
3923
3924
3925 bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3926
3927 lwz(y_idx, 0, y);
3928 b(L_multiply);
3929
3930
3931 bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3932
3933 lwz(x_xstart, 0, x);
3934 b(L_first_loop);
3935
3936 bind(L_first_loop_exit);
3937 }
3938
// Multiply 64 bit by 64 bit and add 128 bit.
//   (product_high:product) = x_xstart * y[element] + z[element] + carry
//   z[element] = low 64 bits of that sum
// The element address is idx scaled by BytesPerInt plus the byte 'offset'.
// Clobbers R0 (via add2_with_carry) and yz_idx/tmp as scratch.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  // z[kdx] = (jlong)product;

  // Byte offset of the 64-bit element: idx * BytesPerInt (+ offset).
  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);                 // Load 64 bits from y.
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);       // Rotate by 32: swap the two 32-bit halves to int order.
#endif

  multiply64(product_high, product, x_xstart, yz_idx);  // 64x64 -> 128-bit product.
  ldx(yz_idx, z, tmp);                 // Load 64 bits from z.
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);       // Swap 32-bit halves.
#endif

  add2_with_carry(product_high, product, carry, yz_idx);  // Add z element and carry; clobbers R0.

  // Recompute the store offset. NOTE(review): looks redundant, but tmp may
  // alias R0, which add2_with_carry just clobbered — confirm before removing.
  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);     // Swap halves back before storing.
#endif
  stdx(product, z, tmp);               // Store low 64 bits of the result to z.
}
3975
3976 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3977 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3978 Register y, Register z,
3979 Register yz_idx, Register idx, Register carry,
3980 Register product_high, Register product,
3981 Register carry2, Register tmp) {
3982
3983 // jlong carry, x[], y[], z[];
3984 // int kdx = ystart+1;
3985 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3986 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3987 // z[kdx+idx+1] = (jlong)product;
|