
src/cpu/ppc/vm/macroAssembler_ppc.cpp

rev 8691 : 8131048: ppc: implement CRC32 intrinsic
Reviewed-by: kvn, simonis
Contributed-by: lutz.schmidt@sap.com

*** 48,57 **** --- 48,58 ----
  #ifdef PRODUCT
  #define BLOCK_COMMENT(str) // nothing
  #else
  #define BLOCK_COMMENT(str) block_comment(str)
  #endif
+ #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  
  #ifdef ASSERT
  // On RISC, there's no benefit to verifying instruction boundaries.
  bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  #endif
*** 3431,3440 **** --- 3432,3853 ----
    }
    li(result_reg, 1);
    bind(Ldone_false);
  }
  
+ // Helpers for Intrinsic Emitters
+ //
+ // Reverse the byte order of a 32-bit value in a register
+ //   src: 0x44556677
+ //   dst: 0x77665544
+ // Three steps to obtain the result:
+ //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
+ //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
+ //     This value initializes dst.
+ //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
+ //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
+ //     This value is mask inserted into dst with a [0..23] mask of 1s.
+ //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
+ //     This value is mask inserted into dst with a [8..15] mask of 1s.
+ void MacroAssembler::load_reverse_32(Register dst, Register src) {
+   assert_different_registers(dst, src);
+ 
+   rldicl(dst, src, (4+1)*8, 56);     // Rotate byte 4 into position 7 (rightmost), clear all to the left.
+   rlwimi(dst, src,     3*8,  0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
+   rlwimi(dst, src,     1*8,  8, 15); // Insert byte 6 into position 5, leave the rest alone.
+ }
+ 
+ // Calculate the column addresses of the crc32 lookup table into distinct registers.
+ // This loop-invariant calculation is moved out of the loop body, reducing the loop
+ // body size from 20 to 16 instructions.
+ // Returns the offset that was used to calculate the address of column tc3.
+ // Due to register shortage, setting tc3 may overwrite table. With the return offset
+ // at hand, the original table address can be easily reconstructed.
+ int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
+ 
+ #ifdef VM_LITTLE_ENDIAN
+   // This is what we implement (the DOLIT4 part):
+   // =========================================================================
+   // #define DOLIT4 c ^= *buf4++; \
+   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+   // =========================================================================
+   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
+   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
+   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
+   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
+ #else
+   // This is what we implement (the DOBIG4 part):
+   // =========================================================================
+   // #define DOBIG4 c ^= *++buf4; \
+   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+   // =========================================================================
+   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
+   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
+   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
+   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
+ #endif
+   assert_different_registers(table, tc0, tc1, tc2);
+   assert(table == tc3, "must be!");
+ 
+   if (ix0 != 0) addi(tc0, table, ix0);
+   if (ix1 != 0) addi(tc1, table, ix1);
+   if (ix2 != 0) addi(tc2, table, ix2);
+   if (ix3 != 0) addi(tc3, table, ix3);
+ 
+   return ix3;
+ }
+ 
+ /**
+  * uint32_t crc;
+  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
+  */
+ void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
+   assert_different_registers(crc, table, tmp);
+   assert_different_registers(val, table);
+ 
+   if (crc == val) {                      // Must rotate first to use the unmodified value.
+     rlwinm(tmp, val, 2, 24-2, 31-2);     // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
+                                          // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
+     srwi(crc, crc, 8);                   // Unsigned shift, clear leftmost 8 bits.
+   } else {
+     srwi(crc, crc, 8);                   // Unsigned shift, clear leftmost 8 bits.
+     rlwinm(tmp, val, 2, 24-2, 31-2);     // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
+   }
+   lwzx(tmp, table, tmp);
+   xorr(crc, crc, tmp);
+ }
+ 
+ /**
+  * uint32_t crc;
+  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
+  */
+ void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
+   fold_byte_crc32(crc, crc, table, tmp);
+ }
+ 
+ /**
+  * Emits code to update CRC-32 with a byte value according to constants in table.
+  *
+  * @param [in,out]crc   Register containing the crc.
+  * @param [in]val       Register containing the byte to fold into the CRC.
+  * @param [in]table     Register containing the table of crc constants.
+  *
+  * uint32_t crc;
+  * val = crc_table[(val ^ crc) & 0xFF];
+  * crc = val ^ (crc >> 8);
+  */
+ void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
+   BLOCK_COMMENT("update_byte_crc32:");
+   xorr(val, val, crc);
+   fold_byte_crc32(crc, val, table, val);
+ }
+ 
+ /**
+  * @param crc   register containing existing CRC (32-bit)
+  * @param buf   register pointing to input byte buffer (byte*)
+  * @param len   register containing number of bytes
+  * @param table register pointing to CRC table
+  */
+ void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
+                                            Register data, bool loopAlignment, bool invertCRC) {
+   assert_different_registers(crc, buf, len, table, data);
+ 
+   Label L_mainLoop, L_done;
+   const int mainLoop_stepping  = 1;
+   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
+ 
+   // Process all bytes in a single-byte loop.
+   cmpdi(CCR0, len, 0);                   // Anything to do?
+   mtctr(len);
+   beq(CCR0, L_done);
+ 
+   if (invertCRC) {
+     nand(crc, crc, crc);                 // ~c
+   }
+ 
+   align(mainLoop_alignment);
+   BIND(L_mainLoop);
+     lbz(data, 0, buf);                   // Byte from buffer, zero-extended.
+     addi(buf, buf, mainLoop_stepping);   // Advance buffer position.
+     update_byte_crc32(crc, data, table);
+     bdnz(L_mainLoop);                    // Iterate.
+ 
+   if (invertCRC) {
+     nand(crc, crc, crc);                 // ~c
+   }
+ 
+   bind(L_done);
+ }
+ 
+ /**
+  * Emits code to update CRC-32 with a 4-byte value according to constants in table.
+  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
+  */
+ // A note on the lookup table address(es):
+ // The lookup table consists of two sets of four columns each.
+ // The columns {0..3} are used for little-endian machines.
+ // The columns {4..7} are used for big-endian machines.
+ // To save the effort of adding the column offset to the table address each time
+ // a table element is looked up, it is possible to pass the pre-calculated
+ // column addresses.
+ // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
+ void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
+                                         Register t0,  Register t1,  Register t2,  Register t3,
+                                         Register tc0, Register tc1, Register tc2, Register tc3) {
+   assert_different_registers(crc, t3);
+ 
+   // XOR crc with next four bytes of buffer.
+   lwz(t3, bufDisp, buf);
+   if (bufInc != 0) {
+     addi(buf, buf, bufInc);
+   }
+   xorr(t3, t3, crc);
+ 
+   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
+   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
+   rlwinm(t1, t3, 32+(2- 8),  24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
+   rlwinm(t2, t3, 32+(2-16),  24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
+   rlwinm(t3, t3, 32+(2-24),  24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
+ 
+   // Use the pre-calculated column addresses.
+   // Load pre-calculated table values.
+   lwzx(t0, tc0, t0);
+   lwzx(t1, tc1, t1);
+   lwzx(t2, tc2, t2);
+   lwzx(t3, tc3, t3);
+ 
+   // Calculate new crc from table values.
+   xorr(t0,  t0, t1);
+   xorr(t2,  t2, t3);
+   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
+ }
+ 
+ /**
+  * @param crc   register containing existing CRC (32-bit)
+  * @param buf   register pointing to input byte buffer (byte*)
+  * @param len   register containing number of bytes
+  * @param table register pointing to CRC table
+  *
+  * Uses R9..R12 as work registers. Must be saved/restored by caller!
+  */
+ void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
+                                         Register t0,  Register t1,  Register t2,  Register t3,
+                                         Register tc0, Register tc1, Register tc2, Register tc3) {
+   assert_different_registers(crc, buf, len, table);
+ 
+   Label L_mainLoop, L_tail;
+   Register tmp  = t0;
+   Register data = t0;
+   Register tmp2 = t1;
+   const int mainLoop_stepping  = 8;
+   const int tailLoop_stepping  = 1;
+   const int log_stepping       = exact_log2(mainLoop_stepping);
+   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
+   const int complexThreshold   = 2*mainLoop_stepping;
+ 
+   // Don't test for len <= 0 here. This pathological case should not occur anyway.
+   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
+   // The situation itself is detected and handled correctly by the conditional branches
+   // following aghi(len, -stepping) and aghi(len, +stepping).
+   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
+ 
+   BLOCK_COMMENT("kernel_crc32_2word {");
+ 
+   nand(crc, crc, crc);                      // ~c
+ 
+   // Check for short (<mainLoop_stepping) buffer.
+   cmpdi(CCR0, len, complexThreshold);
+   blt(CCR0, L_tail);
+ 
+   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
+   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
+   {
+     // Align buf addr to mainLoop_stepping boundary.
+     neg(tmp2, buf);                         // Calculate # preLoop iterations for alignment.
+     rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
+ 
+     if (complexThreshold > mainLoop_stepping) {
+       sub(len, len, tmp2);                  // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+     } else {
+       sub(tmp, len, tmp2);                  // Remaining bytes for main loop.
+       cmpdi(CCR0, tmp, mainLoop_stepping);
+       blt(CCR0, L_tail);                    // For less than one mainloop_stepping left, do only tail processing.
+       mr(len, tmp);                         // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+     }
+     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
+   }
+ 
+   srdi(tmp2, len, log_stepping);            // #iterations for mainLoop
+   andi(len, len, mainLoop_stepping-1);      // remaining bytes for tailLoop
+   mtctr(tmp2);
+ 
+ #ifdef VM_LITTLE_ENDIAN
+   Register crc_rv = crc;
+ #else
+   Register crc_rv = tmp;                    // Load_reverse needs separate registers to work on.
+                                             // Occupies tmp, but frees up crc.
+   load_reverse_32(crc_rv, crc);             // Reverse byte order because we are dealing with big-endian data.
+   tmp = crc;
+ #endif
+ 
+   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
+ 
+   align(mainLoop_alignment);                // Octoword-aligned loop address. Shows 2% improvement.
+   BIND(L_mainLoop);
+     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+     bdnz(L_mainLoop);
+ 
+ #ifndef VM_LITTLE_ENDIAN
+   load_reverse_32(crc, crc_rv);             // Reverse byte order because we are dealing with big-endian data.
+   tmp = crc_rv;                             // Tmp uses its original register again.
+ #endif
+ 
+   // Restore original table address for tailLoop.
+   if (reconstructTableOffset != 0) {
+     addi(table, table, -reconstructTableOffset);
+   }
+ 
+   // Process last few (<complexThreshold) bytes of buffer.
+   BIND(L_tail);
+   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
+ 
+   nand(crc, crc, crc);                      // ~c
+   BLOCK_COMMENT("} kernel_crc32_2word");
+ }
+ 
+ /**
+  * @param crc   register containing existing CRC (32-bit)
+  * @param buf   register pointing to input byte buffer (byte*)
+  * @param len   register containing number of bytes
+  * @param table register pointing to CRC table
+  *
+  * Uses R9..R12 as work registers. Must be saved/restored by caller!
+  */
+ void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
+                                         Register t0,  Register t1,  Register t2,  Register t3,
+                                         Register tc0, Register tc1, Register tc2, Register tc3) {
+   assert_different_registers(crc, buf, len, table);
+ 
+   Label L_mainLoop, L_tail;
+   Register tmp  = t0;
+   Register data = t0;
+   Register tmp2 = t1;
+   const int mainLoop_stepping  = 4;
+   const int tailLoop_stepping  = 1;
+   const int log_stepping       = exact_log2(mainLoop_stepping);
+   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
+   const int complexThreshold   = 2*mainLoop_stepping;
+ 
+   // Don't test for len <= 0 here. This pathological case should not occur anyway.
+   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
+   // The situation itself is detected and handled correctly by the conditional branches
+   // following aghi(len, -stepping) and aghi(len, +stepping).
+   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
+ 
+   BLOCK_COMMENT("kernel_crc32_1word {");
+ 
+   nand(crc, crc, crc);                      // ~c
+ 
+   // Check for short (<mainLoop_stepping) buffer.
+   cmpdi(CCR0, len, complexThreshold);
+   blt(CCR0, L_tail);
+ 
+   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
+   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
+   {
+     // Align buf addr to mainLoop_stepping boundary.
+     neg(tmp2, buf);                         // Calculate # preLoop iterations for alignment.
+     rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
+ 
+     if (complexThreshold > mainLoop_stepping) {
+       sub(len, len, tmp2);                  // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+     } else {
+       sub(tmp, len, tmp2);                  // Remaining bytes for main loop.
+       cmpdi(CCR0, tmp, mainLoop_stepping);
+       blt(CCR0, L_tail);                    // For less than one mainloop_stepping left, do only tail processing.
+       mr(len, tmp);                         // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+     }
+     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
+   }
+ 
+   srdi(tmp2, len, log_stepping);            // #iterations for mainLoop
+   andi(len, len, mainLoop_stepping-1);      // remaining bytes for tailLoop
+   mtctr(tmp2);
+ 
+ #ifdef VM_LITTLE_ENDIAN
+   Register crc_rv = crc;
+ #else
+   Register crc_rv = tmp;                    // Load_reverse needs separate registers to work on.
+                                             // Occupies tmp, but frees up crc.
+   load_reverse_32(crc_rv, crc);             // Reverse byte order because we are dealing with big-endian data.
+   tmp = crc;
+ #endif
+ 
+   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
+ 
+   align(mainLoop_alignment);                // Octoword-aligned loop address. Shows 2% improvement.
+   BIND(L_mainLoop);
+     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+     bdnz(L_mainLoop);
+ 
+ #ifndef VM_LITTLE_ENDIAN
+   load_reverse_32(crc, crc_rv);             // Reverse byte order because we are dealing with big-endian data.
+   tmp = crc_rv;                             // Tmp uses its original register again.
+ #endif
+ 
+   // Restore original table address for tailLoop.
+   if (reconstructTableOffset != 0) {
+     addi(table, table, -reconstructTableOffset);
+   }
+ 
+   // Process last few (<complexThreshold) bytes of buffer.
+   BIND(L_tail);
+   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
+ 
+   nand(crc, crc, crc);                      // ~c
+   BLOCK_COMMENT("} kernel_crc32_1word");
+ }
+ 
+ /**
+  * @param crc   register containing existing CRC (32-bit)
+  * @param buf   register pointing to input byte buffer (byte*)
+  * @param len   register containing number of bytes
+  * @param table register pointing to CRC table
+  *
+  * Uses R7_ARG5, R8_ARG6 as work registers.
+  */
+ void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
+                                         Register t0, Register t1, Register t2, Register t3) {
+   assert_different_registers(crc, buf, len, table);
+ 
+   Register data = t0;                       // Holds the current byte to be folded into crc.
+ 
+   BLOCK_COMMENT("kernel_crc32_1byte {");
+ 
+   // Process all bytes in a single-byte loop.
+   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
+ 
+   BLOCK_COMMENT("} kernel_crc32_1byte");
+ }
+ 
+ void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
+   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
+ 
+   BLOCK_COMMENT("kernel_crc32_singleByte:");
+   nand(crc, crc, crc);                      // ~c
+ 
+   lbz(tmp, 0, buf);                         // Byte from buffer, zero-extended.
+   update_byte_crc32(crc, tmp, table);
+ 
+   nand(crc, crc, crc);                      // ~c
+ }
+ 
  // dest_lo += src1 + src2
  // dest_hi += carry1 + carry2
  void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
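For orientation, the hunk above boils down to two well-known CRC-32 strategies from zlib's crc32.c: a byte-at-a-time table lookup (update_byte_crc32/update_byteLoop_crc32, used for pre-alignment and the tail) and a word-at-a-time "slicing" update (update_1word_crc32 inside kernel_crc32_1word/kernel_crc32_2word); load_reverse_32 is simply a 32-bit byte swap (what GCC's __builtin_bswap32 does) so big-endian machines can use the same table columns. The following is a minimal C sketch of those two loops, not the HotSpot code: it assumes a zlib-style crc_table[8][256] provided elsewhere, a little-endian host, and the function names are illustrative only. The stub's actual table layout (CRC32_COLUMN_SIZE columns addressed via crc32_table_columns) differs, but the arithmetic is the same.

#include <stddef.h>
#include <stdint.h>

// Assumed zlib-style table: column 0 is the plain byte-wise table, columns 1..3
// hold the same data pre-shifted for the little-endian slicing-by-4 variant.
extern const uint32_t crc_table[8][256];

// Byte-at-a-time update (model of update_byteLoop_crc32): invert the crc,
// fold one buffer byte per iteration, invert again at the end.
static uint32_t crc32_bytewise(uint32_t crc, const uint8_t* buf, size_t len) {
  crc = ~crc;                                             // nand(crc, crc, crc) in the stub
  while (len--) {
    crc = crc_table[0][(crc ^ *buf++) & 0xff] ^ (crc >> 8);
  }
  return ~crc;
}

// Word-at-a-time update (model of update_1word_crc32, zlib's DOLIT4): XOR four
// buffer bytes into crc, then combine four table lookups, one per byte lane.
static uint32_t crc32_slicing_by_4(uint32_t crc, const uint8_t* buf, size_t len) {
  crc = ~crc;
  while (len >= 4) {
    uint32_t c = crc ^ (uint32_t)(buf[0] | buf[1] << 8 | buf[2] << 16 | (uint32_t)buf[3] << 24);
    crc = crc_table[3][c & 0xff]         ^ crc_table[2][(c >> 8) & 0xff] ^
          crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24];
    buf += 4; len -= 4;
  }
  while (len--) {                                         // tail loop, same as the byte-wise kernel
    crc = crc_table[0][(crc ^ *buf++) & 0xff] ^ (crc >> 8);
  }
  return ~crc;
}

kernel_crc32_2word is the same idea with an unrolling factor of two words per iteration, which is why it pre-aligns the buffer to an 8-byte boundary with the byte loop before entering the main loop.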
*** 3513,3523 ****
    lwz(y_idx, 0, y);
    b(L_multiply);
  
!   bind( L_one_x ); // Load one 32 bit portion of x as (0,value).
  
    lwz(x_xstart, 0, x);
    b(L_first_loop);
  
    bind(L_first_loop_exit);
--- 3926,3936 ----
    lwz(y_idx, 0, y);
    b(L_multiply);
  
!   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
  
    lwz(x_xstart, 0, x);
    b(L_first_loop);
  
    bind(L_first_loop_exit);
*** 3532,3542 ****
    // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
    // z[kdx] = (jlong)product;
  
    sldi(tmp, idx, LogBytesPerInt);
!   if ( offset ) {
      addi(tmp, tmp, offset);
    }
    ldx(yz_idx, y, tmp);
  #ifdef VM_LITTLE_ENDIAN
    rldicl(yz_idx, yz_idx, 32, 0);
--- 3945,3955 ----
    // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
    // z[kdx] = (jlong)product;
  
    sldi(tmp, idx, LogBytesPerInt);
!   if (offset) {
      addi(tmp, tmp, offset);
    }
    ldx(yz_idx, y, tmp);
  #ifdef VM_LITTLE_ENDIAN
    rldicl(yz_idx, yz_idx, 32, 0);
*** 3549,3559 ****
  #endif
  
    add2_with_carry(product_high, product, carry, yz_idx);
  
    sldi(tmp, idx, LogBytesPerInt);
!   if ( offset ) {
      addi(tmp, tmp, offset);
    }
  #ifdef VM_LITTLE_ENDIAN
    rldicl(product, product, 32, 0);
  #endif
--- 3962,3972 ----
  #endif
  
    add2_with_carry(product_high, product, carry, yz_idx);
  
    sldi(tmp, idx, LogBytesPerInt);
!   if (offset) {
      addi(tmp, tmp, offset);
    }
  #ifdef VM_LITTLE_ENDIAN
    rldicl(product, product, 32, 0);
  #endif
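The two hunks above are pure style fixes (whitespace in `if (offset)`), but the surrounding comment "huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry" documents the inner step of the multiply-to-len loop. A minimal C sketch of that step, assuming a compiler that provides unsigned __int128 (the helper name is illustrative, not part of this file):

#include <stdint.h>

// One inner step of the 128x128 multiply loop: 64x64 -> 128 multiply,
// accumulate the existing z[kdx] word and the incoming carry, store the
// low half back and return the high half as the next carry. The 128-bit
// type stands in for the (product_high, product) register pair.
static inline uint64_t mul_add_step(uint64_t x_xstart, uint64_t y_idx,
                                    uint64_t z_kdx_in, uint64_t carry,
                                    uint64_t* z_kdx_out) {
  unsigned __int128 product = (unsigned __int128)y_idx * x_xstart + z_kdx_in + carry;
  *z_kdx_out = (uint64_t)product;     // low 64 bits go back into z[kdx]
  return (uint64_t)(product >> 64);   // high 64 bits become the next carry
}

The sum cannot overflow 128 bits, since (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1; add2_with_carry implements the same accumulation on the register pair.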