jdk Cdiff src/hotspot/cpu/ppc/macroAssembler_ppc.cpp

src/hotspot/cpu/ppc/macroAssembler_ppc.cpp

rev 53441 : 8217459: [PPC64] Cleanup non-vector version of CRC32
Reviewed-by:


*** 3857,3891 ****
  // body size from 20 to 16 instructions.
  // Returns the offset that was used to calculate the address of column tc3.
  // Due to register shortage, setting tc3 may overwrite table. With the return offset
  // at hand, the original table address can be easily reconstructed.
  int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
  
  #ifdef VM_LITTLE_ENDIAN
    // This is what we implement (the DOLIT4 part):
    // ========================================================================= */
    // #define DOLIT4 c ^= *buf4++; \
    //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
    //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
    // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
    // ========================================================================= */
!   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
!   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
!   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
!   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
  #else
    // This is what we implement (the DOBIG4 part):
    // =========================================================================
    // #define DOBIG4 c ^= *++buf4; \
    //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
    //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
    // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
    // =========================================================================
!   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
!   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
!   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
!   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
  #endif
    assert_different_registers(table, tc0, tc1, tc2);
    assert(table == tc3, "must be!");
  
    addi(tc0, table, ix0);
--- 3857,3894 ----
  // body size from 20 to 16 instructions.
  // Returns the offset that was used to calculate the address of column tc3.
  // Due to register shortage, setting tc3 may overwrite table. With the return offset
  // at hand, the original table address can be easily reconstructed.
  int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
+   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
  
+   // Point to 4 byte folding tables (byte-reversed for Big Endian)
+   // Layout: See StubRoutines::generate_crc_constants.
  #ifdef VM_LITTLE_ENDIAN
    // This is what we implement (the DOLIT4 part):
    // ========================================================================= */
    // #define DOLIT4 c ^= *buf4++; \
    //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
    //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
    // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
    // ========================================================================= */
!   const int ix0 = 3 * CRC32_TABLE_SIZE;
!   const int ix1 = 2 * CRC32_TABLE_SIZE;
!   const int ix2 = 1 * CRC32_TABLE_SIZE;
!   const int ix3 = 0 * CRC32_TABLE_SIZE;
  #else
    // This is what we implement (the DOBIG4 part):
    // =========================================================================
    // #define DOBIG4 c ^= *++buf4; \
    //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
    //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
    // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
    // =========================================================================
!   const int ix0 = 1 * CRC32_TABLE_SIZE;
!   const int ix1 = 2 * CRC32_TABLE_SIZE;
!   const int ix2 = 3 * CRC32_TABLE_SIZE;
!   const int ix3 = 4 * CRC32_TABLE_SIZE;
  #endif
    assert_different_registers(table, tc0, tc1, tc2);
    assert(table == tc3, "must be!");
  
    addi(tc0, table, ix0);
*** 3915,3932 ****
    lwzx(tmp, table, tmp);
    xorr(crc, crc, tmp);
  }
  
  /**
-  * uint32_t crc;
-  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
-  */
- void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
-   fold_byte_crc32(crc, crc, table, tmp);
- }
- 
- /**
   * Emits code to update CRC-32 with a byte value according to constants in table.
   *
   * @param [in,out]crc   Register containing the crc.
   * @param [in]val       Register containing the byte to fold into the CRC.
   * @param [in]table     Register containing the table of crc constants.
--- 3918,3927 ----
*** 4113,4160 ****
  
  /**
   * @param crc   register containing existing CRC (32-bit)
   * @param buf   register pointing to input byte buffer (byte*)
   * @param len   register containing number of bytes
!  * @param table register pointing to CRC table
!  *
!  * Uses R7_ARG5, R8_ARG6 as work registers.
   */
! void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
                                          Register t0,  Register t1,  Register t2,  Register t3,
!                                         bool invertCRC) {
!   assert_different_registers(crc, buf, len, table);
! 
!   Register  data = t0;                   // Holds the current byte to be folded into crc.
! 
!   BLOCK_COMMENT("kernel_crc32_1byte {");
! 
!   if (invertCRC) {
!     nand(crc, crc, crc);                      // 1s complement of crc
!   }
! 
!   // Process all bytes in a single-byte loop.
!   update_byteLoop_crc32(crc, buf, len, table, data, true);
! 
!   if (invertCRC) {
!     nand(crc, crc, crc);                      // 1s complement of crc
!   }
!   BLOCK_COMMENT("} kernel_crc32_1byte");
! }
! 
! /**
!  * @param crc             register containing existing CRC (32-bit)
!  * @param buf             register pointing to input byte buffer (byte*)
!  * @param len             register containing number of bytes
!  * @param table           register pointing to CRC table
!  * @param constants       register pointing to CRC table for 128-bit aligned memory
!  * @param t0-t5           temp registers
!  */
! void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
!                                          Register constants, Register t0, Register t1, Register t2,
!                                          Register t3, Register t4, Register t5, bool invertCRC) {
!   assert_different_registers(crc, buf, len, table);
  
    Label L_tail;
  
    BLOCK_COMMENT("kernel_crc32_vpmsum {");
  
--- 4108,4124 ----
  
  /**
   * @param crc             register containing existing CRC (32-bit)
   * @param buf             register pointing to input byte buffer (byte*)
   * @param len             register containing number of bytes
!  * @param constants       register pointing to precomputed constants
!  * @param t0-t6           temp registers
   */
! void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
                                           Register t0, Register t1, Register t2, Register t3,
!                                          Register t4, Register t5, Register t6, bool invertCRC) {
!   assert_different_registers(crc, buf, len, constants);
  
    Label L_tail;
  
    BLOCK_COMMENT("kernel_crc32_vpmsum {");
  
*** 4175,4192 ****
    andi(prealign, prealign, alignment - 1);
    cmpw(CCR0, t1, prealign);
    blt(CCR0, L_tail); // len - prealign < threshold?
  
    subf(len, prealign, len);
!   update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
  
    // Calculate from first aligned address as far as possible.
!   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);
  
    // Remaining bytes.
    BIND(L_tail);
!   update_byteLoop_crc32(crc, buf, len, table, t2, false);
  
    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }
  
--- 4139,4158 ----
    andi(prealign, prealign, alignment - 1);
    cmpw(CCR0, t1, prealign);
    blt(CCR0, L_tail); // len - prealign < threshold?
  
    subf(len, prealign, len);
!   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
  
    // Calculate from first aligned address as far as possible.
!   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
!   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
!   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
  
    // Remaining bytes.
    BIND(L_tail);
!   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
  
    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }
  
*** 4196,4209 ****
  /**
   * @param crc             register containing existing CRC (32-bit)
   * @param buf             register pointing to input byte buffer (byte*)
   * @param len             register containing number of bytes (will get updated to remaining bytes)
   * @param constants       register pointing to CRC table for 128-bit aligned memory
!  * @param t0-t5           temp registers
   */
! void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
!     Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {
  
    // Save non-volatile vector registers (frameless).
    Register offset = t1;
    int offsetInt = 0;
    offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
--- 4162,4175 ----
  /**
   * @param crc             register containing existing CRC (32-bit)
   * @param buf             register pointing to input byte buffer (byte*)
   * @param len             register containing number of bytes (will get updated to remaining bytes)
   * @param constants       register pointing to CRC table for 128-bit aligned memory
!  * @param t0-t6           temp registers
   */
! void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
!     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
  
    // Save non-volatile vector registers (frameless).
    Register offset = t1;
    int offsetInt = 0;
    offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
*** 4215,4225 ****
  #ifndef VM_LITTLE_ENDIAN
    offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
  #endif
    offsetInt -= 8; std(R14, offsetInt, R1_SP);
    offsetInt -= 8; std(R15, offsetInt, R1_SP);
-   offsetInt -= 8; std(R16, offsetInt, R1_SP);
  
    // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
    // bytes per iteration. The basic scheme is:
    // lvx: load vector (Big Endian needs reversal)
    // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
--- 4181,4190 ----
*** 4233,4246 ****
  
    const int outer_consts_size = (unroll_factor2 - 1) * 16,
              inner_consts_size = (unroll_factor / unroll_factor2) * 16;
  
    // Support registers.
!   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
    Register num_bytes = R14,
             loop_count = R15,
!            cur_const = R16;
    // Constant array for outer loop: unroll_factor2 - 1 registers,
    // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
    VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
                   consts1[] = { VR23, VR24 };
    // Data register arrays: 2 arrays with unroll_factor2 registers.
--- 4198,4211 ----
  
    const int outer_consts_size = (unroll_factor2 - 1) * 16,
              inner_consts_size = (unroll_factor / unroll_factor2) * 16;
  
    // Support registers.
!   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
    Register num_bytes = R14,
             loop_count = R15,
!            cur_const = crc; // will live in VCRC
    // Constant array for outer loop: unroll_factor2 - 1 registers,
    // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
    VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
                   consts1[] = { VR23, VR24 };
    // Data register arrays: 2 arrays with unroll_factor2 registers.
*** 4468,4488 ****
  #ifndef VM_LITTLE_ENDIAN
    offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
  #endif
    offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
    offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
-   offsetInt -= 8;  ld(R16, offsetInt, R1_SP);
  }
  
  void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
                             Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
    load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
                                       : StubRoutines::crc_table_addr()   , R0);
  
    if (VM_Version::has_vpmsumb()) {
-     load_const_optimized(t1, is_crc32c ? StubRoutines::ppc64::crc32c_constants()
-                                        : StubRoutines::ppc64::crc_constants()   , R0);
      kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
    } else {
      kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
    }
  }
--- 4433,4450 ----

< prev index next >