src/cpu/ppc/vm/macroAssembler_ppc.cpp
rev 11916 : 8164920: ppc: enhancement of CRC32 intrinsic
@@ -4330,10 +4330,573 @@
update_byteLoop_crc32(crc, buf, len, table, data, true, true);
BLOCK_COMMENT("} kernel_crc32_1byte");
}
+/**
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table register pointing to CRC table
+ * @param constants register pointing to CRC table for 128-bit aligned memory
+ * @param barretConstants register pointing to table for barrett reduction
+ * @param t0 volatile register
+ * @param t1 volatile register
+ * @param t2 volatile register
+ * @param t3 volatile register
+ * @param t4 volatile register
+ */
+void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
+ Register constants, Register barretConstants,
+ Register t0, Register t1, Register t2, Register t3, Register t4) {
+ assert_different_registers(crc, buf, len, table);
+
+ Label L_alignHead, L_tail, L_alignTail, L_start, L_end;
+
+ Register prealign = t0;
+ Register postalign = t0;
+
+ BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
+
+ // 1. Fall back to kernel_crc32_1word for inputs shorter than 384 bytes.
+ clrldi(len, len, 32);
+ cmpdi(CCR0, len, 384);
+ bge(CCR0, L_start);
+
+ Register tc0 = t4;
+ Register tc1 = constants;
+ Register tc2 = barretConstants;
+ kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
+ b(L_end);
+
+ BIND(L_start);
+
+ // 2. ~c
+ nand(crc, crc, crc);
+
+ // 3. Process bytes from the start up to the first 128-byte aligned address.
+ clrldi(prealign, buf, 57);
+
+ subfic(prealign, prealign, 128);
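+ // prealign = 128 - (buf & 0x7f): number of bytes up to the next 128-byte
+ // boundary; 128 means buf is already aligned and the byte loop is skipped.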
+ cmpdi(CCR0, prealign, 128);
+ beq(CCR0, L_alignHead);
+
+ subf(len, prealign, len);
+ update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);
+
+ // 4. Process from the first 128-byte aligned address to the last 128-byte aligned address.
+ BIND(L_alignHead);
+
+ clrldi(postalign, len, 57);
+ subf(len, postalign, len);
+
+ // len is a multiple of 128 here and at least 256 bytes (ensured by the 384-byte check above).
+ kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
+
+ b(L_alignTail);
+
+ // 5. calculate remaining
+ BIND(L_alignTail);
+
+ cmpdi(CCR0, postalign, 0);
+ beq(CCR0, L_tail);
+
+ update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);
+
+ BIND(L_tail);
+
+ // 6. ~c
+ nand(crc, crc, crc);
+
+ BIND(L_end);
+
+ BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
+}
+
+/**
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param constants register pointing to CRC table for 128-bit aligned memory
+ * @param barretConstants register pointing to table for barrett reduction
+ * @param t0 volatile register
+ * @param t1 volatile register
+ * @param t2 volatile register
+ */
+void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
+ Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
+ Label L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR;
+ Label L_1, L_2, L_3, L_4;
+
+ Register rLoaded = t0;
+ Register rTmp1 = t1;
+ Register rTmp2 = t2;
+ Register off16 = R22;
+ Register off32 = R23;
+ Register off48 = R24;
+ Register off64 = R25;
+ Register off80 = R26;
+ Register off96 = R27;
+ Register off112 = R28;
+ Register rIdx = R29;
+ Register rMax = R30;
+ Register constantsPos = R31;
+
+ VectorRegister mask_32bit = VR24;
+ VectorRegister mask_64bit = VR25;
+ VectorRegister zeroes = VR26;
+ VectorRegister const1 = VR27;
+ VectorRegister const2 = VR28;
+
+ // Save non-volatile vector and general purpose registers (frameless).
+ Register offset = t1;
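+ // VR20..VR28 are saved at SP-32 .. SP-288, R22..R31 at SP-296 .. SP-368.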
+ li(offset, -32); stvx(VR20, offset, R1_SP);
+ addi(offset, offset, -32); stvx(VR21, offset, R1_SP);
+ addi(offset, offset, -32); stvx(VR22, offset, R1_SP);
+ addi(offset, offset, -32); stvx(VR23, offset, R1_SP);
+ addi(offset, offset, -32); stvx(VR24, offset, R1_SP);
+ addi(offset, offset, -32); stvx(VR25, offset, R1_SP);
+ addi(offset, offset, -32); stvx(VR26, offset, R1_SP);
+ addi(offset, offset, -32); stvx(VR27, offset, R1_SP);
+ addi(offset, offset, -32); stvx(VR28, offset, R1_SP);
+ addi(offset, offset, -8); std(R22, offset, R1_SP);
+ addi(offset, offset, -8); std(R23, offset, R1_SP);
+ addi(offset, offset, -8); std(R24, offset, R1_SP);
+ addi(offset, offset, -8); std(R25, offset, R1_SP);
+ addi(offset, offset, -8); std(R26, offset, R1_SP);
+ addi(offset, offset, -8); std(R27, offset, R1_SP);
+ addi(offset, offset, -8); std(R28, offset, R1_SP);
+ addi(offset, offset, -8); std(R29, offset, R1_SP);
+ addi(offset, offset, -8); std(R30, offset, R1_SP);
+ addi(offset, offset, -8); std(R31, offset, R1_SP);
+
+ // Set constants
+ li(off16, 16);
+ li(off32, 32);
+ li(off48, 48);
+ li(off64, 64);
+ li(off80, 80);
+ li(off96, 96);
+ li(off112, 112);
+
+ clrldi(crc, crc, 32);
+
+ vxor(zeroes, zeroes, zeroes);
+ vspltisw(VR0, -1);
+
+ vsldoi(mask_32bit, zeroes, VR0, 4);
+ vsldoi(mask_64bit, zeroes, VR0, 8);
+
+ /* Get the initial value into VR8 */
+ vxor(VR8, VR8, VR8);
+ mtvrd(VR8, crc);
+ vsldoi(VR8, zeroes, VR8, 8); /* shift into bottom 32 bits */
+
+ li (rLoaded, 0);
+
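+ // rIdx = len & ~127: round len down to the 128-byte block size of the vector loops.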
+ rldicr(rIdx, len, 0, 56);
+
+ {
+ BIND(L_1);
+ /* Checksum in blocks of MAX_SIZE (32768) */
+ lis(rMax, 0);
+ ori(rMax, rMax, 32768);
+ mr(rTmp2, rMax);
+ cmpd(CCR0, rIdx, rMax);
+ bgt(CCR0, L_2);
+ mr(rMax, rIdx);
+
+ BIND(L_2);
+ subf(rIdx, rMax, rIdx);
+
+ /* our main loop does 128 bytes at a time */
+ srdi(rMax, rMax, 7);
+
+ /*
+ * Work out the offset into the constants table to start at. Each
+ * constant is 16 bytes and is used against 128 bytes of input
+ * data, i.e. 8 bytes of input per byte of constants (128 / 16 = 8).
+ */
+ sldi(rTmp1, rMax, 4);
+ srdi(rTmp2, rTmp2, 3);
+ subf(rTmp1, rTmp1, rTmp2);
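+ // With the MAX_SIZE of 32768 used above this is 32768 / 8 - 16 * (bytes / 128),
+ // e.g. 0 for a full 32768-byte chunk and 4096 - 32 = 4064 for a 256-byte one.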
+
+ /* We reduce our final 128 bytes in a separate step */
+ addi(rMax, rMax, -1);
+ mtctr(rMax);
+
+ /* Find the start of our constants */
+ add(constantsPos, constants, rTmp1);
+
+ /* zero VR0-VR7 which will contain our checksums */
+ vxor(VR0, VR0, VR0);
+ vxor(VR1, VR1, VR1);
+ vxor(VR2, VR2, VR2);
+ vxor(VR3, VR3, VR3);
+ vxor(VR4, VR4, VR4);
+ vxor(VR5, VR5, VR5);
+ vxor(VR6, VR6, VR6);
+ vxor(VR7, VR7, VR7);
+
+ lvx(const1, 0, constantsPos);
+
+ /*
+ * If we are looping back to consume more data we use the values
+ * already in VR16-VR23.
+ */
+ cmpdi(CCR0, rLoaded, 1);
+ beq(CCR0, L_3);
+ {
+
+ /* First warm up pass */
+ lvx(VR16, buf);
+ lvx(VR17, off16, buf);
+ lvx(VR18, off32, buf);
+ lvx(VR19, off48, buf);
+ lvx(VR20, off64, buf);
+ lvx(VR21, off80, buf);
+ lvx(VR22, off96, buf);
+ lvx(VR23, off112, buf);
+ addi(buf, buf, 8*16);
+
+ /* xor in initial value */
+ vxor(VR16, VR16, VR8);
+ }
+
+ BIND(L_3);
+ bdz(L_first_warm_up_done);
+
+ addi(constantsPos, constantsPos, 16);
+ lvx(const2, 0, constantsPos);
+
+ /* Second warm up pass */
+ vpmsumd(VR8, VR16, const1);
+ lvx(VR16, buf);
+
+ vpmsumd(VR9, VR17, const1);
+ lvx(VR17, off16, buf);
+
+ vpmsumd(VR10, VR18, const1);
+ lvx(VR18, off32, buf);
+
+ vpmsumd(VR11, VR19, const1);
+ lvx(VR19, off48, buf);
+
+ vpmsumd(VR12, VR20, const1);
+ lvx(VR20, off64, buf);
+
+ vpmsumd(VR13, VR21, const1);
+ lvx(VR21, off80, buf);
+
+ vpmsumd(VR14, VR22, const1);
+ lvx(VR22, off96, buf);
+
+ vpmsumd(VR15, VR23, const1);
+ lvx(VR23, off112, buf);
+
+ addi(buf, buf, 8 * 16);
+
+ bdz(L_first_cool_down);
+
+ /*
+ * main loop. We modulo schedule it such that it takes three iterations
+ * to complete - first iteration load, second iteration vpmsum, third
+ * iteration xor.
+ */
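+ /*
+ * Concretely: the lvx of iteration i feeds the vpmsumd of iteration i+1,
+ * whose result is xor-ed into the accumulators VR0-VR7 in iteration i+2;
+ * the warm up and cool down passes fill and drain this pipeline.
+ */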
+ {
+ BIND(L_4);
+ lvx(const1, 0, constantsPos); addi(constantsPos, constantsPos, 16);
+
+ vxor(VR0, VR0, VR8);
+ vpmsumd(VR8, VR16, const2);
+ lvx(VR16, buf);
+
+ vxor(VR1, VR1, VR9);
+ vpmsumd(VR9, VR17, const2);
+ lvx(VR17, off16, buf);
+
+ vxor(VR2, VR2, VR10);
+ vpmsumd(VR10, VR18, const2);
+ lvx(VR18, off32, buf);
+
+ vxor(VR3, VR3, VR11);
+ vpmsumd(VR11, VR19, const2);
+ lvx(VR19, off48, buf);
+ lvx(const2, 0, constantsPos);
+
+ vxor(VR4, VR4, VR12);
+ vpmsumd(VR12, VR20, const1);
+ lvx(VR20, off64, buf);
+
+ vxor(VR5, VR5, VR13);
+ vpmsumd(VR13, VR21, const1);
+ lvx(VR21, off80, buf);
+
+ vxor(VR6, VR6, VR14);
+ vpmsumd(VR14, VR22, const1);
+ lvx(VR22, off96, buf);
+
+ vxor(VR7, VR7, VR15);
+ vpmsumd(VR15, VR23, const1);
+ lvx(VR23, off112, buf);
+
+ addi(buf, buf, 8 * 16);
+
+ bdnz(L_4);
+ }
+
+ BIND(L_first_cool_down);
+
+ /* First cool down pass */
+ lvx(const1, 0, constantsPos);
+ addi(constantsPos, constantsPos, 16);
+
+ vxor(VR0, VR0, VR8);
+ vpmsumd(VR8, VR16, const1);
+
+ vxor(VR1, VR1, VR9);
+ vpmsumd(VR9, VR17, const1);
+
+ vxor(VR2, VR2, VR10);
+ vpmsumd(VR10, VR18, const1);
+
+ vxor(VR3, VR3, VR11);
+ vpmsumd(VR11, VR19, const1);
+
+ vxor(VR4, VR4, VR12);
+ vpmsumd(VR12, VR20, const1);
+
+ vxor(VR5, VR5, VR13);
+ vpmsumd(VR13, VR21, const1);
+
+ vxor(VR6, VR6, VR14);
+ vpmsumd(VR14, VR22, const1);
+
+ vxor(VR7, VR7, VR15);
+ vpmsumd(VR15, VR23, const1);
+
+ BIND(L_second_cool_down);
+ /* Second cool down pass */
+ vxor(VR0, VR0, VR8);
+ vxor(VR1, VR1, VR9);
+ vxor(VR2, VR2, VR10);
+ vxor(VR3, VR3, VR11);
+ vxor(VR4, VR4, VR12);
+ vxor(VR5, VR5, VR13);
+ vxor(VR6, VR6, VR14);
+ vxor(VR7, VR7, VR15);
+
+ /*
+ * vpmsumd produces a 96 bit result in the least significant bits
+ * of the register. Since we are bit reflected we have to shift it
+ * left 32 bits so it occupies the least significant bits in the
+ * bit reflected domain.
+ */
+ vsldoi(VR0, VR0, zeroes, 4);
+ vsldoi(VR1, VR1, zeroes, 4);
+ vsldoi(VR2, VR2, zeroes, 4);
+ vsldoi(VR3, VR3, zeroes, 4);
+ vsldoi(VR4, VR4, zeroes, 4);
+ vsldoi(VR5, VR5, zeroes, 4);
+ vsldoi(VR6, VR6, zeroes, 4);
+ vsldoi(VR7, VR7, zeroes, 4);
+
+ /* xor with last 1024 bits */
+ lvx(VR8, buf);
+ lvx(VR9, off16, buf);
+ lvx(VR10, off32, buf);
+ lvx(VR11, off48, buf);
+ lvx(VR12, off64, buf);
+ lvx(VR13, off80, buf);
+ lvx(VR14, off96, buf);
+ lvx(VR15, off112, buf);
+ addi(buf, buf, 8 * 16);
+
+ vxor(VR16, VR0, VR8);
+ vxor(VR17, VR1, VR9);
+ vxor(VR18, VR2, VR10);
+ vxor(VR19, VR3, VR11);
+ vxor(VR20, VR4, VR12);
+ vxor(VR21, VR5, VR13);
+ vxor(VR22, VR6, VR14);
+ vxor(VR23, VR7, VR15);
+
+ li(rLoaded, 1);
+ cmpdi(CCR0, rIdx, 0);
+ addi(rIdx, rIdx, 128);
+ bne(CCR0, L_1);
+ }
+
+ /* Work out how many bytes we have left */
+ andi_(len, len, 127);
+
+ /* Calculate where in the constant table we need to start */
+ subfic(rTmp1, len, 128);
+ add(constantsPos, constantsPos, rTmp1);
+
+ /* How many 16 byte chunks are in the tail */
+ srdi(rIdx, len, 4);
+ mtctr(rIdx);
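+ // E.g. for a 32-byte tail, constantsPos was advanced by 128 - 32 = 96 bytes
+ // above and CTR now holds 2 16-byte chunks for the folding below.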
+
+ /*
+ * Reduce the previously calculated 1024 bits to 64 bits, shifting
+ * 32 bits to include the trailing 32 bits of zeros
+ */
+ lvx(VR0, 0, constantsPos);
+ lvx(VR1, off16, constantsPos);
+ lvx(VR2, off32, constantsPos);
+ lvx(VR3, off48, constantsPos);
+ lvx(VR4, off64, constantsPos);
+ lvx(VR5, off80, constantsPos);
+ lvx(VR6, off96, constantsPos);
+ lvx(VR7, off112, constantsPos);
+ addi(constantsPos, constantsPos, 8 * 16);
+
+ vpmsumw(VR0, VR16, VR0);
+ vpmsumw(VR1, VR17, VR1);
+ vpmsumw(VR2, VR18, VR2);
+ vpmsumw(VR3, VR19, VR3);
+ vpmsumw(VR4, VR20, VR4);
+ vpmsumw(VR5, VR21, VR5);
+ vpmsumw(VR6, VR22, VR6);
+ vpmsumw(VR7, VR23, VR7);
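+ // Each vpmsumw above folds one 128-bit lane with the constant matching that
+ // lane's distance from the end of the data, so the partial results can simply
+ // be xor-ed together at L_XOR below.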
+
+ /* Now reduce the tail (0 - 112 bytes) */
+ cmpdi(CCR0, rIdx, 0);
+ beq(CCR0, L_XOR);
+
+ lvx(VR16, 0, buf); addi(buf, buf, 16);
+ lvx(VR17, constantsPos);
+ vpmsumw(VR16, VR16, VR17);
+ vxor(VR0, VR0, VR16);
+ bdz(L_XOR);
+
+ lvx(VR16, 0, buf); addi(buf, buf, 16);
+ lvx(VR17, off16, constantsPos);
+ vpmsumw(VR16, VR16, VR17);
+ vxor(VR0, VR0, VR16);
+ bdz(L_XOR);
+
+ lvx(VR16, 0, buf); addi(buf, buf, 16);
+ lvx(VR17, off32, constantsPos);
+ vpmsumw(VR16, VR16, VR17);
+ vxor(VR0, VR0, VR16);
+ bdz(L_XOR);
+
+ lvx(VR16, 0, buf); addi(buf, buf, 16);
+ lvx(VR17, off48, constantsPos);
+ vpmsumw(VR16, VR16, VR17);
+ vxor(VR0, VR0, VR16);
+ bdz(L_XOR);
+
+ lvx(VR16, 0, buf); addi(buf, buf, 16);
+ lvx(VR17, off64, constantsPos);
+ vpmsumw(VR16, VR16, VR17);
+ vxor(VR0, VR0, VR16);
+ bdz(L_XOR);
+
+ lvx(VR16, 0, buf); addi(buf, buf, 16);
+ lvx(VR17, off80, constantsPos);
+ vpmsumw(VR16, VR16, VR17);
+ vxor(VR0, VR0, VR16);
+ bdz(L_XOR);
+
+ lvx(VR16, 0, buf); addi(buf, buf, 16);
+ lvx(VR17, off96, constantsPos);
+ vpmsumw(VR16, VR16, VR17);
+ vxor(VR0, VR0, VR16);
+
+ /* Now xor all the parallel chunks together */
+ BIND(L_XOR);
+ vxor(VR0, VR0, VR1);
+ vxor(VR2, VR2, VR3);
+ vxor(VR4, VR4, VR5);
+ vxor(VR6, VR6, VR7);
+
+ vxor(VR0, VR0, VR2);
+ vxor(VR4, VR4, VR6);
+
+ vxor(VR0, VR0, VR4);
+
+ b(L_barrett_reduction);
+
+ BIND(L_first_warm_up_done);
+ lvx(const1, 0, constantsPos);
+ addi(constantsPos, constantsPos, 16);
+ vpmsumd(VR8, VR16, const1);
+ vpmsumd(VR9, VR17, const1);
+ vpmsumd(VR10, VR18, const1);
+ vpmsumd(VR11, VR19, const1);
+ vpmsumd(VR12, VR20, const1);
+ vpmsumd(VR13, VR21, const1);
+ vpmsumd(VR14, VR22, const1);
+ vpmsumd(VR15, VR23, const1);
+ b(L_second_cool_down);
+
+ BIND(L_barrett_reduction);
+
+ lvx(const1, 0, barretConstants);
+ addi(barretConstants, barretConstants, 16);
+ lvx(const2, 0, barretConstants);
+
+ vsldoi(VR1, VR0, VR0, 8);
+ vxor(VR0, VR0, VR1); /* xor two 64 bit results together */
+
+ /* shift left one bit */
+ vspltisb(VR1, 1);
+ vsl(VR0, VR0, VR1);
+
+ vand(VR0, VR0, mask_64bit);
+
+ /*
+ * The reflected version of Barrett reduction. Instead of bit
+ * reflecting our data (which is expensive to do), we bit reflect our
+ * constants and our algorithm, which means the intermediate data in
+ * our vector registers goes from 0-63 instead of 63-0. We can reflect
+ * the algorithm because we don't carry in mod 2 arithmetic.
+ */
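+ /*
+ * For reference, the textbook (non-reflected) form: with CRC polynomial P(x)
+ * and mu = floor(x^64 / P(x)), a 64-bit remainder R(x) is reduced via
+ * q(x) = floor((floor(R(x) / x^32) * mu) / x^32) and
+ * crc = (R(x) + q(x) * P(x)) mod x^32, all arithmetic in GF(2)[x].
+ * The vand/vpmsumd/vxor sequence below computes the mirrored equivalent.
+ */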
+ vand(VR1, VR0, mask_32bit); /* bottom 32 bits of a */
+ vpmsumd(VR1, VR1, const1); /* ma */
+ vand(VR1, VR1, mask_32bit); /* bottom 32bits of ma */
+ vpmsumd(VR1, VR1, const2); /* qn */
+ vxor(VR0, VR0, VR1); /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Since we are bit reflected, the result (i.e. the low 32 bits) is in
+ * the high 32 bits. We just need to shift it left by 4 bytes:
+ * V0 [ 0 1 X 3 ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi(VR0, VR0, zeroes, 4); /* shift result into top 64 bits */
+
+ /* Get it into the crc register */
+ mfvrd(crc, VR0);
+
+ BIND(L_end);
+
+ // Restore non-volatile vector and general purpose registers (frameless).
+ li(offset, -32); lvx(VR20, offset, R1_SP);
+ addi(offset, offset, -32); lvx(VR21, offset, R1_SP);
+ addi(offset, offset, -32); lvx(VR22, offset, R1_SP);
+ addi(offset, offset, -32); lvx(VR23, offset, R1_SP);
+ addi(offset, offset, -32); lvx(VR24, offset, R1_SP);
+ addi(offset, offset, -32); lvx(VR25, offset, R1_SP);
+ addi(offset, offset, -32); lvx(VR26, offset, R1_SP);
+ addi(offset, offset, -32); lvx(VR27, offset, R1_SP);
+ addi(offset, offset, -32); lvx(VR28, offset, R1_SP);
+ addi(offset, offset, -8); ld(R22, offset, R1_SP);
+ addi(offset, offset, -8); ld(R23, offset, R1_SP);
+ addi(offset, offset, -8); ld(R24, offset, R1_SP);
+ addi(offset, offset, -8); ld(R25, offset, R1_SP);
+ addi(offset, offset, -8); ld(R26, offset, R1_SP);
+ addi(offset, offset, -8); ld(R27, offset, R1_SP);
+ addi(offset, offset, -8); ld(R28, offset, R1_SP);
+ addi(offset, offset, -8); ld(R29, offset, R1_SP);
+ addi(offset, offset, -8); ld(R30, offset, R1_SP);
+ addi(offset, offset, -8); ld(R31, offset, R1_SP);
+}
+
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
BLOCK_COMMENT("kernel_crc32_singleByte:");
nand(crc, crc, crc); // ~c