< prev index next >
src/hotspot/cpu/ppc/macroAssembler_ppc.cpp
Print this page
rev 53441 : 8217459: [PPC64] Cleanup non-vector version of CRC32
Reviewed-by:
*** 3857,3891 ****
// body size from 20 to 16 instructions.
// Returns the offset that was used to calculate the address of column tc3.
// Due to register shortage, setting tc3 may overwrite table. With the return offset
// at hand, the original table address can be easily reconstructed.
int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
#ifdef VM_LITTLE_ENDIAN
// This is what we implement (the DOLIT4 part):
// ========================================================================= */
// #define DOLIT4 c ^= *buf4++; \
// c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
// crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
// #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
// ========================================================================= */
! const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
! const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
! const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
! const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
#else
// This is what we implement (the DOBIG4 part):
// =========================================================================
// #define DOBIG4 c ^= *++buf4; \
// c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
// crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
// #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
// =========================================================================
! const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
! const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
! const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
! const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
#endif
assert_different_registers(table, tc0, tc1, tc2);
assert(table == tc3, "must be!");
addi(tc0, table, ix0);
--- 3857,3894 ----
// body size from 20 to 16 instructions.
// Returns the offset that was used to calculate the address of column tc3.
// Due to register shortage, setting tc3 may overwrite table. With the return offset
// at hand, the original table address can be easily reconstructed.
int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
+ assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
+ // Point to 4 byte folding tables (byte-reversed for Big Endian)
+ // Layout: See StubRoutines::generate_crc_constants.
#ifdef VM_LITTLE_ENDIAN
// This is what we implement (the DOLIT4 part):
// ========================================================================= */
// #define DOLIT4 c ^= *buf4++; \
// c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
// crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
// #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
// ========================================================================= */
! const int ix0 = 3 * CRC32_TABLE_SIZE;
! const int ix1 = 2 * CRC32_TABLE_SIZE;
! const int ix2 = 1 * CRC32_TABLE_SIZE;
! const int ix3 = 0 * CRC32_TABLE_SIZE;
#else
// This is what we implement (the DOBIG4 part):
// =========================================================================
// #define DOBIG4 c ^= *++buf4; \
// c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
// crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
// #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
// =========================================================================
! const int ix0 = 1 * CRC32_TABLE_SIZE;
! const int ix1 = 2 * CRC32_TABLE_SIZE;
! const int ix2 = 3 * CRC32_TABLE_SIZE;
! const int ix3 = 4 * CRC32_TABLE_SIZE;
#endif
assert_different_registers(table, tc0, tc1, tc2);
assert(table == tc3, "must be!");
addi(tc0, table, ix0);
*** 3915,3932 ****
lwzx(tmp, table, tmp);
xorr(crc, crc, tmp);
}
/**
- * uint32_t crc;
- * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
- */
- void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
- fold_byte_crc32(crc, crc, table, tmp);
- }
-
- /**
* Emits code to update CRC-32 with a byte value according to constants in table.
*
* @param [in,out]crc Register containing the crc.
* @param [in]val Register containing the byte to fold into the CRC.
* @param [in]table Register containing the table of crc constants.
--- 3918,3927 ----
*** 4113,4160 ****
/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes
! * @param table register pointing to CRC table
! *
! * Uses R7_ARG5, R8_ARG6 as work registers.
*/
! void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3,
! bool invertCRC) {
! assert_different_registers(crc, buf, len, table);
!
! Register data = t0; // Holds the current byte to be folded into crc.
!
! BLOCK_COMMENT("kernel_crc32_1byte {");
!
! if (invertCRC) {
! nand(crc, crc, crc); // 1s complement of crc
! }
!
! // Process all bytes in a single-byte loop.
! update_byteLoop_crc32(crc, buf, len, table, data, true);
!
! if (invertCRC) {
! nand(crc, crc, crc); // 1s complement of crc
! }
! BLOCK_COMMENT("} kernel_crc32_1byte");
! }
!
! /**
! * @param crc register containing existing CRC (32-bit)
! * @param buf register pointing to input byte buffer (byte*)
! * @param len register containing number of bytes
! * @param table register pointing to CRC table
! * @param constants register pointing to CRC table for 128-bit aligned memory
! * @param t0-t5 temp registers
! */
! void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
! Register constants, Register t0, Register t1, Register t2,
! Register t3, Register t4, Register t5, bool invertCRC) {
! assert_different_registers(crc, buf, len, table);
Label L_tail;
BLOCK_COMMENT("kernel_crc32_vpmsum {");
--- 4108,4124 ----
/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes
! * @param constants register pointing to precomputed constants
! * @param t0-t6 temp registers
*/
! void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
Register t0, Register t1, Register t2, Register t3,
! Register t4, Register t5, Register t6, bool invertCRC) {
! assert_different_registers(crc, buf, len, constants);
Label L_tail;
BLOCK_COMMENT("kernel_crc32_vpmsum {");
*** 4175,4192 ****
andi(prealign, prealign, alignment - 1);
cmpw(CCR0, t1, prealign);
blt(CCR0, L_tail); // len - prealign < threshold?
subf(len, prealign, len);
! update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
// Calculate from first aligned address as far as possible.
! kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);
// Remaining bytes.
BIND(L_tail);
! update_byteLoop_crc32(crc, buf, len, table, t2, false);
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
--- 4139,4158 ----
andi(prealign, prealign, alignment - 1);
cmpw(CCR0, t1, prealign);
blt(CCR0, L_tail); // len - prealign < threshold?
subf(len, prealign, len);
! update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
// Calculate from first aligned address as far as possible.
! addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
! kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
! addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
// Remaining bytes.
BIND(L_tail);
! update_byteLoop_crc32(crc, buf, len, constants, t2, false);
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
*** 4196,4209 ****
/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes (will get updated to remaining bytes)
* @param constants register pointing to CRC table for 128-bit aligned memory
! * @param t0-t5 temp registers
*/
! void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
! Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {
// Save non-volatile vector registers (frameless).
Register offset = t1;
int offsetInt = 0;
offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
--- 4162,4175 ----
/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes (will get updated to remaining bytes)
* @param constants register pointing to CRC table for 128-bit aligned memory
! * @param t0-t6 temp registers
*/
! void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
! Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
// Save non-volatile vector registers (frameless).
Register offset = t1;
int offsetInt = 0;
offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
*** 4215,4225 ****
#ifndef VM_LITTLE_ENDIAN
offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
#endif
offsetInt -= 8; std(R14, offsetInt, R1_SP);
offsetInt -= 8; std(R15, offsetInt, R1_SP);
- offsetInt -= 8; std(R16, offsetInt, R1_SP);
// Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
// bytes per iteration. The basic scheme is:
// lvx: load vector (Big Endian needs reversal)
// vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
--- 4181,4190 ----
*** 4233,4246 ****
const int outer_consts_size = (unroll_factor2 - 1) * 16,
inner_consts_size = (unroll_factor / unroll_factor2) * 16;
// Support registers.
! Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
Register num_bytes = R14,
loop_count = R15,
! cur_const = R16;
// Constant array for outer loop: unroll_factor2 - 1 registers,
// Constant array for inner loop: unroll_factor / unroll_factor2 registers.
VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
consts1[] = { VR23, VR24 };
// Data register arrays: 2 arrays with unroll_factor2 registers.
--- 4198,4211 ----
const int outer_consts_size = (unroll_factor2 - 1) * 16,
inner_consts_size = (unroll_factor / unroll_factor2) * 16;
// Support registers.
! Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
Register num_bytes = R14,
loop_count = R15,
! cur_const = crc; // will live in VCRC
// Constant array for outer loop: unroll_factor2 - 1 registers,
// Constant array for inner loop: unroll_factor / unroll_factor2 registers.
VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
consts1[] = { VR23, VR24 };
// Data register arrays: 2 arrays with unroll_factor2 registers.
*** 4468,4488 ****
#ifndef VM_LITTLE_ENDIAN
offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
#endif
offsetInt -= 8; ld(R14, offsetInt, R1_SP);
offsetInt -= 8; ld(R15, offsetInt, R1_SP);
- offsetInt -= 8; ld(R16, offsetInt, R1_SP);
}
/**
 * Emits the CRC computation, picking the kernel according to CPU capability.
 *
 * @param crc       register handed to the selected kernel as the CRC value
 * @param buf       register handed to the selected kernel as the input buffer pointer
 * @param len       register handed to the selected kernel as the byte count
 * @param t0        loaded here with the table address (crc32c_table_addr() when
 *                  is_crc32c is set, crc_table_addr() otherwise) before the kernel call
 * @param t1-t7     forwarded unchanged to the selected kernel
 * @param is_crc32c selects the CRC-32C table; its negation is passed as the last
 *                  kernel argument (named invertCRC in kernel_crc32_vpmsum)
 */
void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
// t0 <- address of the CRC-32C or CRC-32 table, chosen by is_crc32c.
load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
: StubRoutines::crc_table_addr() , R0);
// Use the vpmsum-based kernel when VM_Version::has_vpmsumb() reports support.
if (VM_Version::has_vpmsumb()) {
- load_const_optimized(t1, is_crc32c ? StubRoutines::ppc64::crc32c_constants()
- : StubRoutines::ppc64::crc_constants() , R0);
kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
} else {
// Fallback kernel. Note that t0 is passed twice: as the 4th and as the 12th argument.
kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
}
}
--- 4433,4450 ----
< prev index next >