src/hotspot/cpu/ppc/macroAssembler_ppc.cpp
rev 53130 : 8216060: [PPC64] Vector CRC implementation should be used by interpreter and be faster for short arrays
Reviewed-by: gromero
@@ -3972,11 +3972,11 @@
/**
* Emits code to update CRC-32 with a 4-byte value according to constants in table
* Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
*/
-// A not on the lookup table address(es):
+// A note on the lookup table address(es):
// The lookup table consists of two sets of four columns each.
// The columns {0..3} are used for little-endian machines.
// The columns {4..7} are used for big-endian machines.
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
@@ -4148,72 +4148,64 @@
* @param table register pointing to CRC table
* @param constants register pointing to CRC table for 128-bit aligned memory
* @param barretConstants register pointing to table for barrett reduction
* @param t0-t4 temp registers
*/
-void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
- Register constants, Register barretConstants,
- Register t0, Register t1, Register t2, Register t3, Register t4,
- bool invertCRC) {
+void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
+ Register constants, Register t0, Register t1, Register t2,
+ Register t3, Register t4, Register t5, bool invertCRC) {
assert_different_registers(crc, buf, len, table);
- Label L_alignedHead, L_tail;
+ Label L_tail;
- BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
+ BLOCK_COMMENT("kernel_crc32_vpmsum {");
- // 1. ~c
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
- // 2. use kernel_crc32_1word for short len
+ // Enforce 32 bit.
clrldi(len, len, 32);
- cmpdi(CCR0, len, 512);
- blt(CCR0, L_tail);
- // 3. calculate from 0 to first aligned address
- const int alignment = 16;
+ // Align if we have enough bytes for the fast version.
+ const int alignment = 16,
+ threshold = 32;
Register prealign = t0;
- andi_(prealign, buf, alignment - 1);
- beq(CCR0, L_alignedHead);
- subfic(prealign, prealign, alignment);
+ neg(prealign, buf);
+ addi(t1, len, -threshold);
+ andi(prealign, prealign, alignment - 1);
+ cmpw(CCR0, t1, prealign);
+ blt(CCR0, L_tail); // len - prealign < threshold?
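+ // At this point prealign = (-buf) & (alignment - 1), i.e. the number of bytes up to the
+ // next 16-byte boundary; the branch above takes the byte-wise tail instead of the vector
+ // kernel when fewer than 'threshold' bytes would remain after alignment.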
subf(len, prealign, len);
update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
- // 4. calculate from first aligned address as far as possible
- BIND(L_alignedHead);
- kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
+ // Calculate from first aligned address as far as possible.
+ kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);
- // 5. remaining bytes
+ // Remaining bytes.
BIND(L_tail);
- Register tc0 = t4;
- Register tc1 = constants;
- Register tc2 = barretConstants;
- kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
+ update_byteLoop_crc32(crc, buf, len, table, t2, false);
- // 6. ~c
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
- BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
+ BLOCK_COMMENT("} kernel_crc32_vpmsum");
}
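// Illustrative sketch (not emitted code) of what the stub generated above computes, using
// hypothetical helper names byte_loop/vpmsum_kernel for update_byteLoop_crc32 and
// kernel_crc32_vpmsum_aligned:
//   if (invertCRC) crc = ~crc;
//   prealign = (-buf) & 15;                          // bytes up to 16-byte alignment
//   if ((int)(len - 32) >= prealign) {               // enough data for the vector path
//     crc = byte_loop(crc, buf, prealign, table);    // align buf
//     crc = vpmsum_kernel(crc, buf, len - prealign); // bulk; leaves len % 16 bytes
//   }
//   crc = byte_loop(crc, buf, len, table);           // remaining bytes
//   if (invertCRC) crc = ~crc;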
/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes (will get updated to remaining bytes)
* @param constants register pointing to CRC table for 128-bit aligned memory
* @param barretConstants register pointing to table for barrett reduction
* @param t0-t4 temp registers
- * Precondition: len should be >= 512. Otherwise, nothing will be done.
*/
-void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
- Register constants, Register barretConstants,
- Register t0, Register t1, Register t2, Register t3, Register t4) {
+void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
+ Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {
// Save non-volatile vector registers (frameless).
Register offset = t1;
int offsetInt = 0;
offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
@@ -4226,29 +4218,31 @@
offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
#endif
offsetInt -= 8; std(R14, offsetInt, R1_SP);
offsetInt -= 8; std(R15, offsetInt, R1_SP);
offsetInt -= 8; std(R16, offsetInt, R1_SP);
- offsetInt -= 8; std(R17, offsetInt, R1_SP);
// Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
// bytes per iteration. The basic scheme is:
// lvx: load vector (Big Endian needs reversal)
// vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
// vxor: xor partial results together to get unroll_factor2 vectors
// Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
// Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
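// Illustrative per-slot step of the inner loop (not the exact emitted schedule), for each
// of the unroll_factor2 accumulators i:
//   v      = lvx(buf + 16 * i);         // load 16 data bytes (byte-reversed on big endian)
//   v      = vpmsumw(v, shift_const);   // carry-less multiply: applies a large CRC shift
//   acc[i] = vxor(acc[i], v);           // fold into accumulator i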
- const int unroll_factor = 2048;
- const int unroll_factor2 = 8;
+ const int unroll_factor = CRC32_UNROLL_FACTOR,
+ unroll_factor2 = CRC32_UNROLL_FACTOR2;
+
+ const int outer_consts_size = (unroll_factor2 - 1) * 16,
+ inner_consts_size = (unroll_factor / unroll_factor2) * 16;
// Support registers.
- Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
- Register num_bytes = R15,
- loop_count = R16,
- cur_const = R17;
+ Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
+ Register num_bytes = R14,
+ loop_count = R15,
+ cur_const = R16;
// Constant array for outer loop: unroll_factor2 - 1 registers,
// Constant array for inner loop: unroll_factor / unroll_factor2 registers.
VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
consts1[] = { VR23, VR24 };
// Data register arrays: 2 arrays with unroll_factor2 registers.
@@ -4266,25 +4260,23 @@
if (VM_Version::has_mfdscr()) {
load_const_optimized(t0, VM_Version::_dscr_val | 7);
mtdscr(t0);
}
- mtvrwz(VCRC, crc); // crc lives lives in VCRC, now
+ mtvrwz(VCRC, crc); // crc lives in VCRC, now
for (int i = 1; i < unroll_factor2; ++i) {
li(offs[i], 16 * i);
}
// Load consts for outer loop
lvx(consts0[0], constants);
for (int i = 1; i < unroll_factor2 - 1; ++i) {
lvx(consts0[i], offs[i], constants);
}
- addi(constants, constants, (unroll_factor2 - 1) * 16);
load_const_optimized(num_bytes, 16 * unroll_factor);
- load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
// Reuse data registers outside of the loop.
VectorRegister Vtmp = data1[0];
VectorRegister Vtmp2 = data1[1];
VectorRegister zeroes = data1[2];
@@ -4308,17 +4300,19 @@
#endif
cmpd(CCR0, len, num_bytes);
blt(CCR0, L_last);
+ addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
+ load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
+
// ********** Main loop start **********
align(32);
bind(L_outer_loop);
// Begin of unrolled first iteration (no xor).
lvx(data1[0], buf);
- mr(cur_const, constants);
for (int i = 1; i < unroll_factor2 / 2; ++i) {
lvx(data1[i], offs[i], buf);
}
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
lvx(consts1[0], cur_const);
@@ -4367,10 +4361,12 @@
}
addi(buf, buf, 16 * unroll_factor2);
}
bdnz(L_inner_loop);
+ addi(cur_const, constants, outer_consts_size); // Reset
+
// Tail of last iteration (no loads).
for (int i = 0; i < unroll_factor2 / 2; ++i) {
BE_swap_bytes(data1[i + unroll_factor2 / 2]);
vxor(data0[i], data0[i], data1[i]);
vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
@@ -4395,33 +4391,65 @@
bge(CCR0, L_outer_loop);
// Last chance with lower num_bytes.
bind(L_last);
srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
- add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
+ // Point behind last const for inner loop.
+ add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
- subf(constants, R0, constants); // Point to constant to be used first.
+ subf(cur_const, R0, cur_const); // Point to constant to be used first.
addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
bgt(CCR0, L_outer_loop);
// ********** Main loop end **********
-#undef BE_swap_bytes
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
load_const_optimized(t0, VM_Version::_dscr_val);
mtdscr(t0);
}
+ // ********** Simple loop for remaining 16 byte blocks **********
+ {
+ Label L_loop, L_done;
+
+ srdi_(t0, len, 4); // 16 bytes per iteration
+ clrldi(len, len, 64-4);
+ beq(CCR0, L_done);
+
+ // Point to const (same as last const for inner loop).
+ add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
+ mtctr(t0);
+ lvx(Vtmp2, cur_const);
+
+ align(32);
+ bind(L_loop);
+
+ lvx(Vtmp, buf);
+ addi(buf, buf, 16);
+ vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
+ BE_swap_bytes(Vtmp);
+ vxor(VCRC, VCRC, Vtmp);
+ vpmsumw(VCRC, VCRC, Vtmp2);
+ bdnz(L_loop);
+
+ bind(L_done);
+ }
+ // ********** Simple loop end **********
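+ // This loop leaves at most 15 bytes for the caller's byte-wise tail, which is what lets
+ // the vector path be entered for buffers as small as the 32-byte threshold.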
+#undef BE_swap_bytes
+
+ // Point to Barrett constants
+ add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
+
vspltisb(zeroes, 0);
// Combine to 64 bit result.
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
// Reduce to 32 bit CRC: Remainder by multiply-high.
- lvx(Vtmp, barretConstants);
+ lvx(Vtmp, cur_const);
vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.
vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.
vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
vsldoi(Vtmp, zeroes, Vtmp, 8);
vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.
@@ -4443,11 +4471,10 @@
offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
#endif
offsetInt -= 8; ld(R14, offsetInt, R1_SP);
offsetInt -= 8; ld(R15, offsetInt, R1_SP);
offsetInt -= 8; ld(R16, offsetInt, R1_SP);
- offsetInt -= 8; ld(R17, offsetInt, R1_SP);
}
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);