
src/cpu/ppc/vm/macroAssembler_ppc.cpp

rev 8685 : 8131048: ppc: implement CRC32 intrinsic
Contributed-by: lutz.schmidt@sap.com


  33 #include "prims/methodHandles.hpp"
  34 #include "runtime/biasedLocking.hpp"
  35 #include "runtime/icache.hpp"
  36 #include "runtime/interfaceSupport.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/os.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "utilities/macros.hpp"
  42 #if INCLUDE_ALL_GCS
  43 #include "gc/g1/g1CollectedHeap.inline.hpp"
  44 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  45 #include "gc/g1/heapRegion.hpp"
  46 #endif // INCLUDE_ALL_GCS
  47 
  48 #ifdef PRODUCT
  49 #define BLOCK_COMMENT(str) // nothing
  50 #else
  51 #define BLOCK_COMMENT(str) block_comment(str)
  52 #endif
  53 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  54 
  55 #ifdef ASSERT
  56 // On RISC, there's no benefit to verifying instruction boundaries.
  57 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  58 #endif
  59 
  60 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  61   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  62   if (Assembler::is_simm(si31, 16)) {
  63     ld(d, si31, a);
  64     if (emit_filler_nop) nop();
  65   } else {
  66     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  67     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  68     addis(d, a, hi);
  69     ld(d, lo, d);
  70   }
  71 }
  72 
  73 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {


3417     //14:
3418     if (cntval & 2) {
3419       lwzx(R0, str1_reg, index_reg);
3420       lwzx(tmp2_reg, str2_reg, index_reg);
3421       cmpw(CCR0, R0, tmp2_reg);
3422       bne(CCR0, Ldone_false);
3423       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3424     }
3425     if (cntval & 1) {
3426       lhzx(R0, str1_reg, index_reg);
3427       lhzx(tmp2_reg, str2_reg, index_reg);
3428       cmpw(CCR0, R0, tmp2_reg);
3429       bne(CCR0, Ldone_false);
3430     }
3431     // fallthru: true
3432   }
3433   li(result_reg, 1);
3434   bind(Ldone_false);
3435 }
3436 
3437 // Helpers for Intrinsic Emitters
3438 //
3439 // Reverse the byte order of a 32-bit value in a register
3440 //   src: 0x44556677
3441 //   dst: 0x77665544
3442 // Three steps to obtain the result:
3443 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3444 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3445 //     This value initializes dst.
3446 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3447 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3448 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3449 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3450 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3451 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3452   assert_different_registers(dst, src);
3453 
3454   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3455   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3456   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3457 }
3458 
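As a cross-check: the net effect of the three rotate-and-insert steps above is an ordinary 32-bit byte swap. A minimal C sketch of the same transformation (illustrative only, not the emitted code; the helper name is made up):

  static inline uint32_t reverse_bytes_32(uint32_t src) {
    return ((src >> 24) & 0x000000ffu)    // byte 4 -> byte 7 (rightmost)
         | ((src >>  8) & 0x0000ff00u)    // byte 5 -> byte 6
         | ((src <<  8) & 0x00ff0000u)    // byte 6 -> byte 5
         | ((src << 24) & 0xff000000u);   // byte 7 -> byte 4 (leftmost)
  }
  // reverse_bytes_32(0x44556677) == 0x77665544
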
3459 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3460 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3461 // body size from 20 to 16 instructions.
3462 // Returns the offset that was used to calculate the address of column tc3.
3463 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3464 // at hand, the original table address can be easily reconstructed.
3465 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3466 
3467 #ifdef VM_LITTLE_ENDIAN
3468   // This is what we implement (the DOLIT4 part):
3469   // ========================================================================= */
3470   // #define DOLIT4 c ^= *buf4++; \
3471   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3472   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3473   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3474   // ========================================================================= */
3475   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3476   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3477   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3478   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3479 #else
3480   // This is what we implement (the DOBIG4 part):
3481   // =========================================================================
3482   // #define DOBIG4 c ^= *++buf4; \
3483   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3484   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3485   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3486   // =========================================================================
3487   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3488   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3489   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3490   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3491 #endif
3492   assert_different_registers(table, tc0, tc1, tc2);
3493   assert(table == tc3, "must be!");
3494 
3495   if (ix0 != 0) addi(tc0, table, ix0);
3496   if (ix1 != 0) addi(tc1, table, ix1);
3497   if (ix2 != 0) addi(tc2, table, ix2);
3498   if (ix3 != 0) addi(tc3, table, ix3);
3499 
3500   return ix3;
3501 }
3502 
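For reference, on little-endian the column addresses computed above correspond to the following zlib-style setup. This is a sketch assuming the usual 8-column table with 256 32-bit entries per column; crc_table is an assumed name for the table the stub routines provide:

  extern const uint32_t crc_table[8][256];   // one column = 4*CRC32_COLUMN_SIZE bytes

  const uint32_t* tc0 = crc_table[3];        // ix0 = 3*(4*CRC32_COLUMN_SIZE)
  const uint32_t* tc1 = crc_table[2];        // ix1 = 2*(4*CRC32_COLUMN_SIZE)
  const uint32_t* tc2 = crc_table[1];        // ix2 = 1*(4*CRC32_COLUMN_SIZE)
  const uint32_t* tc3 = crc_table[0];        // ix3 = 0, so tc3 aliases table and 0 is returned

On big-endian the same scheme uses columns 4..7 and returns ix3 = 7*(4*CRC32_COLUMN_SIZE), which the caller later subtracts to reconstruct the original table address.
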
3503 /**
3504  * uint32_t crc;
3505  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3506  */
3507 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3508   assert_different_registers(crc, table, tmp);
3509   assert_different_registers(val, table);
3510 
3511   if (crc == val) {                   // Must rotate first to use the unmodified value.
3512     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3513                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3514     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3515   } else {
3516     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3517     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3518   }
3519   lwzx(tmp, table, tmp);
3520   xorr(crc, crc, tmp);
3521 }
3522 
3523 /**
3524  * uint32_t crc;
3525  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3526  */
3527 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3528   fold_byte_crc32(crc, crc, table, tmp);
3529 }
3530 
3531 /**
3532  * Emits code to update CRC-32 with a byte value according to constants in table.
3533  *
3534  * @param [in,out]crc   Register containing the crc.
3535  * @param [in]val       Register containing the byte to fold into the CRC.
3536  * @param [in]table     Register containing the table of crc constants.
3537  *
3538  * uint32_t crc;
3539  * val = crc_table[(val ^ crc) & 0xFF];
3540  * crc = val ^ (crc >> 8);
3541  */
3542 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3543   BLOCK_COMMENT("update_byte_crc32:");
3544   xorr(val, val, crc);
3545   fold_byte_crc32(crc, val, table, val);
3546 }
3547 
3548 /**
3549  * @param crc   register containing existing CRC (32-bit)
3550  * @param buf   register pointing to input byte buffer (byte*)
3551  * @param len   register containing number of bytes
3552  * @param table register pointing to CRC table
3553  */
3554 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3555                                            Register data, bool loopAlignment, bool invertCRC) {
3556   assert_different_registers(crc, buf, len, table, data);
3557 
3558   Label L_mainLoop, L_done;
3559   const int mainLoop_stepping  = 1;
3560   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3561 
3562   // Process all bytes in a single-byte loop.
3563   cmpdi(CCR0, len, 0);                           // Anything to do?
3564   mtctr(len);
3565   beq(CCR0, L_done);
3566 
3567   if (invertCRC) {
3568     nand(crc, crc, crc);                         // ~c
3569   }
3570 
3571   align(mainLoop_alignment);
3572   BIND(L_mainLoop);
3573     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3574     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3575     update_byte_crc32(crc, data, table);
3576     bdnz(L_mainLoop);                            // Iterate.
3577 
3578   if (invertCRC) {
3579     nand(crc, crc, crc);                         // ~c
3580   }
3581 
3582   bind(L_done);
3583 }
3584 
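Functionally, the emitted loop is the classic byte-at-a-time CRC-32 update. A self-contained C sketch of the invertCRC == true case (illustrative; assumes a single 256-entry table column, function name made up):

  #include <stddef.h>
  #include <stdint.h>

  uint32_t crc32_byte_loop(uint32_t crc, const uint8_t* buf, size_t len,
                           const uint32_t table[256]) {
    crc = ~crc;                                         // nand(crc, crc, crc)
    for (size_t i = 0; i < len; i++) {                  // mtctr(len) / bdnz
      crc = table[(crc ^ buf[i]) & 0xff] ^ (crc >> 8);  // update_byte_crc32
    }
    return ~crc;
  }
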
3585 /**
3586  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3587  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3588  */
3589 // A note on the lookup table address(es):
3590 // The lookup table consists of two sets of four columns each.
3591 // The columns {0..3} are used for little-endian machines.
3592 // The columns {4..7} are used for big-endian machines.
3593 // To save the effort of adding the column offset to the table address each time
3594 // a table element is looked up, it is possible to pass the pre-calculated
3595 // column addresses.
3596 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3597 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3598                                         Register t0,  Register t1,  Register t2,  Register t3,
3599                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3600   assert_different_registers(crc, t3);
3601 
3602   // XOR crc with next four bytes of buffer.
3603   lwz(t3, bufDisp, buf);
3604   if (bufInc != 0) {
3605     addi(buf, buf, bufInc);
3606   }
3607   xorr(t3, t3, crc);
3608 
3609   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3610   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3611   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3612   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3613   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3614 
3615   // Use the pre-calculated column addresses.
3616   // Load pre-calculated table values.
3617   lwzx(t0, tc0, t0);
3618   lwzx(t1, tc1, t1);
3619   lwzx(t2, tc2, t2);
3620   lwzx(t3, tc3, t3);
3621 
3622   // Calculate new crc from table values.
3623   xorr(t0,  t0, t1);
3624   xorr(t2,  t2, t3);
3625   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3626 }
3627 
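Putting the pieces together, one invocation of update_1word_crc32 performs a single slicing-by-four step. A hedged C sketch (names illustrative, big-endian handling omitted; the indices formed by rlwinm are (byte & 0xff) << 2, used directly as byte offsets by lwzx):

  uint32_t c = *(const uint32_t*)buf ^ crc;   // lwz + xorr
  crc = tc0[ c        & 0xff]
      ^ tc1[(c >>  8) & 0xff]
      ^ tc2[(c >> 16) & 0xff]
      ^ tc3[(c >> 24) & 0xff];
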
3628 /**
3629  * @param crc   register containing existing CRC (32-bit)
3630  * @param buf   register pointing to input byte buffer (byte*)
3631  * @param len   register containing number of bytes
3632  * @param table register pointing to CRC table
3633  *
3634  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3635  */
3636 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3637                                         Register t0,  Register t1,  Register t2,  Register t3,
3638                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3639   assert_different_registers(crc, buf, len, table);
3640 
3641   Label L_mainLoop, L_tail;
3642   Register  tmp  = t0;
3643   Register  data = t0;
3644   Register  tmp2 = t1;
3645   const int mainLoop_stepping  = 8;
3646   const int tailLoop_stepping  = 1;
3647   const int log_stepping       = exact_log2(mainLoop_stepping);
3648   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3649   const int complexThreshold   = 2*mainLoop_stepping;
3650 
3651   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3652   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3653   // The situation itself is detected and handled correctly by the conditional branches
3654   // following  aghi(len, -stepping) and aghi(len, +stepping).
3655   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3656 
3657   BLOCK_COMMENT("kernel_crc32_2word {");
3658 
3659   nand(crc, crc, crc);                           // ~c
3660 
3661   // Check for short (<mainLoop_stepping) buffer.
3662   cmpdi(CCR0, len, complexThreshold);
3663   blt(CCR0, L_tail);
3664 
3665   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3666   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3667   {
3668     // Align buf addr to mainLoop_stepping boundary.
3669     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
3670     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the low log_stepping bits (1s in bits 61..63): #bytes to 8-byte alignment.
3671 
3672     if (complexThreshold > mainLoop_stepping) {
3673       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3674     } else {
3675       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3676       cmpdi(CCR0, tmp, mainLoop_stepping);
3677       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3678       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3679     }
3680     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3681   }
3682 
3683   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3684   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3685   mtctr(tmp2);
3686 
3687 #ifdef VM_LITTLE_ENDIAN
3688   Register crc_rv = crc;
3689 #else
3690   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3691                                                  // Occupies tmp, but frees up crc.
3692   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3693   tmp = crc;
3694 #endif
3695 
3696   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3697 
3698   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3699   BIND(L_mainLoop);
3700     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3701     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3702     bdnz(L_mainLoop);
3703 
3704 #ifndef VM_LITTLE_ENDIAN
3705   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3706   tmp = crc_rv;                                  // Tmp uses its original register again.
3707 #endif
3708 
3709   // Restore original table address for tailLoop.
3710   if (reconstructTableOffset != 0) {
3711     addi(table, table, -reconstructTableOffset);
3712   }
3713 
3714   // Process last few (<complexThreshold) bytes of buffer.
3715   BIND(L_tail);
3716   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3717 
3718   nand(crc, crc, crc);                           // ~c
3719   BLOCK_COMMENT("} kernel_crc32_2word");
3720 }
3721 
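The control flow of this kernel (and of kernel_crc32_1word below) follows the usual pre-align / wide main loop / byte tail pattern. A hedged C outline, with byte_update and word_update standing in for update_byte_crc32 and update_1word_crc32 (both helper names hypothetical; the big-endian byte reversal of crc is omitted):

  uint32_t kernel_crc32_outline(uint32_t crc, const uint8_t* buf, size_t len) {
    crc = ~crc;
    if (len >= 2 * 8) {                                  // complexThreshold
      while (((uintptr_t)buf & 7) != 0) {                // pre-loop: align buf to the stepping
        crc = byte_update(crc, *buf++); len--;
      }
      while (len >= 8) {                                 // main loop: two 4-byte steps per iteration
        crc = word_update(crc, buf + 0);
        crc = word_update(crc, buf + 4);
        buf += 8; len -= 8;
      }
    }
    while (len > 0) {                                    // tail: remaining bytes one at a time
      crc = byte_update(crc, *buf++); len--;
    }
    return ~crc;
  }
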
3722 /**
3723  * @param crc   register containing existing CRC (32-bit)
3724  * @param buf   register pointing to input byte buffer (byte*)
3725  * @param len   register containing number of bytes
3726  * @param table register pointing to CRC table
3727  *
3728  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3729  */
3730 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3731                                         Register t0,  Register t1,  Register t2,  Register t3,
3732                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3733   assert_different_registers(crc, buf, len, table);
3734 
3735   Label L_mainLoop, L_tail;
3736   Register  tmp          = t0;
3737   Register  data         = t0;
3738   Register  tmp2         = t1;
3739   const int mainLoop_stepping  = 4;
3740   const int tailLoop_stepping  = 1;
3741   const int log_stepping       = exact_log2(mainLoop_stepping);
3742   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3743   const int complexThreshold   = 2*mainLoop_stepping;
3744 
3745   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3746   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3747   // The situation itself is detected and handled correctly by the conditional branches
3748   // following  aghi(len, -stepping) and aghi(len, +stepping).
3749   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3750 
3751   BLOCK_COMMENT("kernel_crc32_1word {");
3752 
3753   nand(crc, crc, crc);                           // ~c
3754 
3755   // Check for short (<mainLoop_stepping) buffer.
3756   cmpdi(CCR0, len, complexThreshold);
3757   blt(CCR0, L_tail);
3758 
3759   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3760   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3761   {
3762     // Align buf addr to mainLoop_stepping boundary.
3763     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3764     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits (1s in bits 62..63): #bytes to 4-byte alignment.
3765 
3766     if (complexThreshold > mainLoop_stepping) {
3767       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3768     } else {
3769       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3770       cmpdi(CCR0, tmp, mainLoop_stepping);
3771       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3772       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3773     }
3774     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3775   }
3776 
3777   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3778   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3779   mtctr(tmp2);
3780 
3781 #ifdef VM_LITTLE_ENDIAN
3782   Register crc_rv = crc;
3783 #else
3784   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3785                                                  // Occupies tmp, but frees up crc.
3786   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3787   tmp = crc;
3788 #endif
3789 
3790   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3791 
3792   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3793   BIND(L_mainLoop);
3794     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3795     bdnz(L_mainLoop);
3796 
3797 #ifndef VM_LITTLE_ENDIAN
3798   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3800   tmp = crc_rv;                                  // Tmp uses its original register again.
3800 #endif
3801 
3802   // Restore original table address for tailLoop.
3803   if (reconstructTableOffset != 0) {
3804     addi(table, table, -reconstructTableOffset);
3805   }
3806 
3807   // Process last few (<complexThreshold) bytes of buffer.
3808   BIND(L_tail);
3809   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3810 
3811   nand(crc, crc, crc);                           // ~c
3812   BLOCK_COMMENT("} kernel_crc32_1word");
3813 }
3814 
3815 /**
3816  * @param crc   register containing existing CRC (32-bit)
3817  * @param buf   register pointing to input byte buffer (byte*)
3818  * @param len   register containing number of bytes
3819  * @param table register pointing to CRC table
3820  *
3821  * Uses R7_ARG5, R8_ARG6 as work registers.
3822  */
3823 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3824                                         Register t0,  Register t1,  Register t2,  Register t3) {
3825   assert_different_registers(crc, buf, len, table);
3826 
3827   Register  data = t0;                   // Holds the current byte to be folded into crc.
3828 
3829   BLOCK_COMMENT("kernel_crc32_1byte {");
3830 
3831   // Process all bytes in a single-byte loop.
3832   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3833 
3834   BLOCK_COMMENT("} kernel_crc32_1byte");
3835 }
3836 
3837 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
3838   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
3839 
3840   BLOCK_COMMENT("kernel_crc32_singleByte:");
3841   nand(crc, crc, crc);       // ~c
3842 
3843   lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
3844   update_byte_crc32(crc, tmp, table);
3845 
3846   nand(crc, crc, crc);       // ~c
3847 }
3848 
3849 // dest_lo += src1 + src2
3850 // dest_hi += carry1 + carry2
3851 void MacroAssembler::add2_with_carry(Register dest_hi,
3852                                      Register dest_lo,
3853                                      Register src1, Register src2) {
3854   li(R0, 0);
3855   addc(dest_lo, dest_lo, src1);
3856   adde(dest_hi, dest_hi, R0);
3857   addc(dest_lo, dest_lo, src2);
3858   adde(dest_hi, dest_hi, R0);
3859 }
3860 
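In C terms, the addc/adde pairs above accumulate two 64-bit addends into a 128-bit (hi:lo) value. A minimal sketch (illustrative name):

  static inline void add2_with_carry_c(uint64_t* dest_hi, uint64_t* dest_lo,
                                       uint64_t src1, uint64_t src2) {
    uint64_t t = *dest_lo + src1;
    *dest_hi += (t < src1);           // carry out of the first addition
    *dest_lo  = t + src2;
    *dest_hi += (*dest_lo < src2);    // carry out of the second addition
  }
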
3861 // Multiply 64 bit by 64 bit first loop.
3862 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3863                                            Register x_xstart,
3864                                            Register y, Register y_idx,
3865                                            Register z,
3866                                            Register carry,
3867                                            Register product_high, Register product,
3868                                            Register idx, Register kdx,


3911   addc(product, product, carry);         // Add carry to result.
3912   adde(product_high, product_high, tmp); // Add carry of the last addition.
3913   addi(kdx, kdx, -2);
3914 
3915   // Store result.
3916 #ifdef VM_LITTLE_ENDIAN
3917   rldicl(product, product, 32, 0);
3918 #endif
3919   sldi(tmp, kdx, LogBytesPerInt);
3920   stdx(product, z, tmp);
3921   mr_if_needed(carry, product_high);
3922   b(L_first_loop);
3923 
3924 
3925   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3926 
3927   lwz(y_idx, 0, y);
3928   b(L_multiply);
3929 
3930 
3931   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3932 
3933   lwz(x_xstart, 0, x);
3934   b(L_first_loop);
3935 
3936   bind(L_first_loop_exit);
3937 }
3938 
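A note on the VM_LITTLE_ENDIAN rotates used above (and in multiply_add_128_x_128 below): the x, y and z arrays are jint arrays whose more significant 32-bit digit sits at the lower index, so a 64-bit ldx/stdx on a little-endian machine needs the two halves swapped to match that layout. Sketch of the equivalent C operation (illustrative):

  uint64_t swapped = (v << 32) | (v >> 32);   // rldicl(v, v, 32, 0): exchange the 32-bit halves
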
3939 // Multiply 64 bit by 64 bit and add 128 bit.
3940 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3941                                             Register z, Register yz_idx,
3942                                             Register idx, Register carry,
3943                                             Register product_high, Register product,
3944                                             Register tmp, int offset) {
3945 
3946   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3947   //  z[kdx] = (jlong)product;
3948 
3949   sldi(tmp, idx, LogBytesPerInt);
3950   if (offset) {
3951     addi(tmp, tmp, offset);
3952   }
3953   ldx(yz_idx, y, tmp);
3954 #ifdef VM_LITTLE_ENDIAN
3955   rldicl(yz_idx, yz_idx, 32, 0);
3956 #endif
3957 
3958   multiply64(product_high, product, x_xstart, yz_idx);
3959   ldx(yz_idx, z, tmp);
3960 #ifdef VM_LITTLE_ENDIAN
3961   rldicl(yz_idx, yz_idx, 32, 0);
3962 #endif
3963 
3964   add2_with_carry(product_high, product, carry, yz_idx);
3965 
3966   sldi(tmp, idx, LogBytesPerInt);
3967   if (offset) {
3968     addi(tmp, tmp, offset);
3969   }
3970 #ifdef VM_LITTLE_ENDIAN
3971   rldicl(product, product, 32, 0);
3972 #endif
3973   stdx(product, z, tmp);
3974 }
3975 
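The per-element work in this helper matches the pseudocode comment above. A C sketch using a compiler 128-bit type (illustrative; assumes unsigned __int128 is available, and y_val / z_val stand for the 64-bit values loaded and endian-adjusted from y and z):

  unsigned __int128 product = (unsigned __int128)y_val * x_xstart + z_val + carry;
  z_val = (uint64_t)product;           // low 64 bits, written back with stdx
  carry = (uint64_t)(product >> 64);   // high 64 bits become the carry for the next step
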
3976 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3977 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3978                                              Register y, Register z,
3979                                              Register yz_idx, Register idx, Register carry,
3980                                              Register product_high, Register product,
3981                                              Register carry2, Register tmp) {
3982 
3983   //  jlong carry, x[], y[], z[];
3984   //  int kdx = ystart+1;
3985   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3986   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3987   //    z[kdx+idx+1] = (jlong)product;

