
src/cpu/ppc/vm/macroAssembler_ppc.cpp

rev 11928 : 8164920: ppc: enhancement of CRC32 intrinsic
Reviewed-by: mdoerr
Contributed-by: Hiroshi H Horii <horii@jp.ibm.com>

*** 4330,4339 **** --- 4330,4898 ----
    update_byteLoop_crc32(crc, buf, len, table, data, true, true);
  
    BLOCK_COMMENT("} kernel_crc32_1byte");
  }
  
+ /**
+  * @param crc             register containing existing CRC (32-bit)
+  * @param buf             register pointing to input byte buffer (byte*)
+  * @param len             register containing number of bytes
+  * @param table           register pointing to CRC table
+  * @param constants       register pointing to CRC table for 128-bit aligned memory
+  * @param barretConstants register pointing to table for barrett reduction
+  * @param t0              volatile register
+  * @param t1              volatile register
+  * @param t2              volatile register
+  * @param t3              volatile register
+  * @param t4              volatile register
+  */
+ void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
+                                                 Register constants, Register barretConstants,
+                                                 Register t0, Register t1, Register t2, Register t3, Register t4) {
+   assert_different_registers(crc, buf, len, table);
+ 
+   Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
+ 
+   Register prealign  = t0;
+   Register postalign = t0;
+ 
+   BLOCK_COMMENT("kernel_crc32_1word_vpmsumd {");
+ 
+   // 1. Use kernel_crc32_1word for inputs shorter than 384 bytes.
+   clrldi(len, len, 32);
+   cmpdi(CCR0, len, 384);
+   bge(CCR0, L_start);
+ 
+   Register tc0 = t4;
+   Register tc1 = constants;
+   Register tc2 = barretConstants;
+   kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
+   b(L_end);
+ 
+   BIND(L_start);
+ 
+   // 2. ~c
+   nand(crc, crc, crc);
+ 
+   // 3. Calculate from 0 to the first 128-byte aligned address.
+   clrldi_(prealign, buf, 57);
+   beq(CCR0, L_alignedHead);
+ 
+   subfic(prealign, prealign, 128);
+ 
+   subf(len, prealign, len);
+   update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);
+ 
+   // 4. Calculate from the first to the last 128-byte aligned address.
+   BIND(L_alignedHead);
+ 
+   clrldi(postalign, len, 57);
+   subf(len, postalign, len);
+ 
+   // len is now a multiple of 128 bytes and at least 256 bytes.
+   kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
+ 
+   // 5. Calculate the remaining bytes.
+   cmpdi(CCR0, postalign, 0);
+   beq(CCR0, L_tail);
+ 
+   update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);
+ 
+   BIND(L_tail);
+ 
+   // 6. ~c
+   nand(crc, crc, crc);
+ 
+   BIND(L_end);
+ 
+   BLOCK_COMMENT("} kernel_crc32_1word_vpmsumd");
+ }
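
Steps 2-6 above are the usual head/body/tail split for a vectorized checksum: complement, consume bytes up to a 128-byte boundary, run the vector kernel over whole 128-byte blocks, consume the tail bytewise, complement again. A minimal scalar C++ sketch of the same control flow (crc32_bytewise and crc32_vector_aligned are hypothetical stand-ins for update_byteLoop_crc32 and kernel_crc32_1word_aligned):

    #include <cstddef>
    #include <cstdint>

    // Bit-at-a-time CRC-32 on pre-complemented state (reflected polynomial
    // 0xEDB88320); stands in for update_byteLoop_crc32 and advances buf.
    static uint32_t crc32_bytewise(uint32_t state, const uint8_t*& buf, size_t len) {
      while (len-- > 0) {
        state ^= *buf++;
        for (int i = 0; i < 8; i++)
          state = (state >> 1) ^ ((state & 1) ? 0xEDB88320u : 0u);
      }
      return state;
    }

    // Stand-in for kernel_crc32_1word_aligned; the real kernel uses vpmsumd.
    static uint32_t crc32_vector_aligned(uint32_t state, const uint8_t*& buf, size_t len) {
      return crc32_bytewise(state, buf, len);
    }

    uint32_t crc32_split(uint32_t crc, const uint8_t* buf, size_t len) {
      uint32_t state = ~crc;                    // step 2: ~c
      size_t prealign = (uintptr_t)buf & 127;   // offset past a 128-byte boundary
      if (prealign != 0) {                      // step 3: head, byte by byte
        size_t head = 128 - prealign;
        if (head > len) head = len;             // (real code is entered only for len >= 384)
        state = crc32_bytewise(state, buf, head);
        len -= head;
      }
      size_t postalign = len & 127;             // step 4: whole 128-byte blocks
      state = crc32_vector_aligned(state, buf, len - postalign);
      if (postalign != 0)                       // step 5: tail, byte by byte
        state = crc32_bytewise(state, buf, postalign);
      return ~state;                            // step 6: ~c
    }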
+ 
+ /**
+  * @param crc             register containing existing CRC (32-bit)
+  * @param buf             register pointing to input byte buffer (byte*)
+  * @param len             register containing number of bytes
+  * @param constants       register pointing to CRC table for 128-bit aligned memory
+  * @param barretConstants register pointing to table for barrett reduction
+  * @param t0              volatile register
+  * @param t1              volatile register
+  * @param t2              volatile register
+  */
+ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
+                                                 Register constants, Register barretConstants,
+                                                 Register t0, Register t1, Register t2) {
+   Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end,
+         L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
+   Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7,
+         L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
+   Label L_1, L_2, L_3, L_4;
+ 
+   Register rLoaded      = t0;
+   Register rTmp1        = t1;
+   Register rTmp2        = t2;
+   Register off16        = R22;
+   Register off32        = R23;
+   Register off48        = R24;
+   Register off64        = R25;
+   Register off80        = R26;
+   Register off96        = R27;
+   Register off112       = R28;
+   Register rIdx         = R29;
+   Register rMax         = R30;
+   Register constantsPos = R31;
+ 
+   VectorRegister mask_32bit = VR24;
+   VectorRegister mask_64bit = VR25;
+   VectorRegister zeroes     = VR26;
+   VectorRegister const1     = VR27;
+   VectorRegister const2     = VR28;
+ 
+   // Save non-volatile vector registers (frameless).
+   Register offset = t1;
+   int offsetInt = 0;
+   offsetInt -= 16; li(offset, -16); stvx(VR20, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
+   offsetInt -= 8; std(R22, offsetInt, R1_SP);
+   offsetInt -= 8; std(R23, offsetInt, R1_SP);
+   offsetInt -= 8; std(R24, offsetInt, R1_SP);
+   offsetInt -= 8; std(R25, offsetInt, R1_SP);
+   offsetInt -= 8; std(R26, offsetInt, R1_SP);
+   offsetInt -= 8; std(R27, offsetInt, R1_SP);
+   offsetInt -= 8; std(R28, offsetInt, R1_SP);
+   offsetInt -= 8; std(R29, offsetInt, R1_SP);
+   offsetInt -= 8; std(R30, offsetInt, R1_SP);
+   offsetInt -= 8; std(R31, offsetInt, R1_SP);
+ 
+   // Set constants.
+   li(off16, 16);
+   li(off32, 32);
+   li(off48, 48);
+   li(off64, 64);
+   li(off80, 80);
+   li(off96, 96);
+   li(off112, 112);
+ 
+   clrldi(crc, crc, 32);
+ 
+   vxor(zeroes, zeroes, zeroes);
+   vspltisw(VR0, -1);
+ 
+   vsldoi(mask_32bit, zeroes, VR0, 4);
+   vsldoi(mask_64bit, zeroes, VR0, -8);
+ 
+   // Get the initial value into VR8.
+   vxor(VR8, VR8, VR8);
+   mtvrd(VR8, crc);
+   vsldoi(VR8, zeroes, VR8, -8); // shift into bottom 32 bits
+ 
+   li(rLoaded, 0);
+ 
+   rldicr(rIdx, len, 0, 56);
+ 
+   {
+     BIND(L_1);
+     // Checksum in blocks of MAX_SIZE (32768).
+     lis(rMax, 0);
+     ori(rMax, rMax, 32768);
+     mr(rTmp2, rMax);
+     cmpd(CCR0, rIdx, rMax);
+     bgt(CCR0, L_2);
+     mr(rMax, rIdx);
+ 
+     BIND(L_2);
+     subf(rIdx, rMax, rIdx);
+ 
+     // Our main loop does 128 bytes at a time.
+     srdi(rMax, rMax, 7);
+ 
+     /*
+      * Work out the offset into the constants table to start at. Each
+      * constant is 16 bytes, and it is used against 128 bytes of input
+      * data - 128 / 16 = 8.
+      */
+     sldi(rTmp1, rMax, 4);
+     srdi(rTmp2, rTmp2, 3);
+     subf(rTmp1, rTmp1, rTmp2);
+ 
+     // We reduce our final 128 bytes in a separate step.
+     addi(rMax, rMax, -1);
+     mtctr(rMax);
+ 
+     // Find the start of our constants.
+     add(constantsPos, constants, rTmp1);
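
The offset arithmetic mirrors the constants-table layout: one 16-byte constant per 128 bytes of input, sized for the maximal 32768-byte chunk, so shorter chunks start deeper into the table and every chunk ends on its last entry. The same computation in plain C++:

    #include <cstddef>

    // Offset into the vpmsum constants table for one chunk. chunk_len is the
    // clamped amount processed this trip (<= 32768, a multiple of 128); one
    // 16-byte constant covers 128 bytes of input, so a full chunk consumes
    // (32768 / 128) * 16 = 32768 / 8 = 4096 bytes of constants.
    size_t constants_offset(size_t chunk_len) {
      size_t blocks = chunk_len >> 7; // srdi(rMax, rMax, 7)
      size_t used   = blocks << 4;    // sldi(rTmp1, rMax, 4)
      size_t total  = 32768 >> 3;     // srdi(rTmp2, rTmp2, 3), rTmp2 == 32768
      return total - used;            // subf(rTmp1, rTmp1, rTmp2)
    }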
+ 
+     // Zero VR0-VR7, which will contain our checksums.
+     vxor(VR0, VR0, VR0);
+     vxor(VR1, VR1, VR1);
+     vxor(VR2, VR2, VR2);
+     vxor(VR3, VR3, VR3);
+     vxor(VR4, VR4, VR4);
+     vxor(VR5, VR5, VR5);
+     vxor(VR6, VR6, VR6);
+     vxor(VR7, VR7, VR7);
+ 
+     lvx(const1, constantsPos);
+ 
+     /*
+      * If we are looping back to consume more data we use the values
+      * already in VR16-VR23.
+      */
+     cmpdi(CCR0, rLoaded, 1);
+     beq(CCR0, L_3);
+ 
+     {
+       // First warm up pass.
+       lvx(VR16, buf);
+       lvx(VR17, off16, buf);
+       lvx(VR18, off32, buf);
+       lvx(VR19, off48, buf);
+       lvx(VR20, off64, buf);
+       lvx(VR21, off80, buf);
+       lvx(VR22, off96, buf);
+       lvx(VR23, off112, buf);
+       addi(buf, buf, 8 * 16);
+ 
+       // XOR in initial value.
+       vxor(VR16, VR16, VR8);
+     }
+ 
+     BIND(L_3);
+     bdz(L_first_warm_up_done);
+ 
+     addi(constantsPos, constantsPos, 16);
+     lvx(const2, constantsPos);
+ 
+     // Second warm up pass.
+     vpmsumd(VR8, VR16, const1);
+     lvx(VR16, buf);
+ 
+     vpmsumd(VR9, VR17, const1);
+     lvx(VR17, off16, buf);
+ 
+     vpmsumd(VR10, VR18, const1);
+     lvx(VR18, off32, buf);
+ 
+     vpmsumd(VR11, VR19, const1);
+     lvx(VR19, off48, buf);
+ 
+     vpmsumd(VR12, VR20, const1);
+     lvx(VR20, off64, buf);
+ 
+     vpmsumd(VR13, VR21, const1);
+     lvx(VR21, off80, buf);
+ 
+     vpmsumd(VR14, VR22, const1);
+     lvx(VR22, off96, buf);
+ 
+     vpmsumd(VR15, VR23, const1);
+     lvx(VR23, off112, buf);
+ 
+     addi(buf, buf, 8 * 16);
+ 
+     bdz(L_first_cool_down);
+ 
+     /*
+      * Main loop. We modulo schedule it such that it takes three iterations
+      * to complete - first iteration load, second iteration vpmsum, third
+      * iteration xor.
+      */
+     {
+       BIND(L_4);
+       lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
+ 
+       vxor(VR0, VR0, VR8);
+       vpmsumd(VR8, VR16, const2);
+       lvx(VR16, buf);
+ 
+       vxor(VR1, VR1, VR9);
+       vpmsumd(VR9, VR17, const2);
+       lvx(VR17, off16, buf);
+ 
+       vxor(VR2, VR2, VR10);
+       vpmsumd(VR10, VR18, const2);
+       lvx(VR18, off32, buf);
+ 
+       vxor(VR3, VR3, VR11);
+       vpmsumd(VR11, VR19, const2);
+       lvx(VR19, off48, buf);
+       lvx(const2, constantsPos);
+ 
+       vxor(VR4, VR4, VR12);
+       vpmsumd(VR12, VR20, const1);
+       lvx(VR20, off64, buf);
+ 
+       vxor(VR5, VR5, VR13);
+       vpmsumd(VR13, VR21, const1);
+       lvx(VR21, off80, buf);
+ 
+       vxor(VR6, VR6, VR14);
+       vpmsumd(VR14, VR22, const1);
+       lvx(VR22, off96, buf);
+ 
+       vxor(VR7, VR7, VR15);
+       vpmsumd(VR15, VR23, const1);
+       lvx(VR23, off112, buf);
+ 
+       addi(buf, buf, 8 * 16);
+ 
+       bdnz(L_4);
+     }
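
The modulo-scheduled loop overlaps three iterations so the loads and the long-latency vpmsumd of different trips execute concurrently. A scalar sketch of that schedule for one of the eight parallel streams (clmul is a simplified stand-in for vpmsumd, keeping only the low 64 product bits):

    #include <cstddef>
    #include <cstdint>

    // Simplified stand-in for vpmsumd: carry-less multiply, low 64 product bits.
    static uint64_t clmul(uint64_t a, uint64_t b) {
      uint64_t r = 0;
      for (int i = 0; i < 64; i++)
        if ((b >> i) & 1ull) r ^= a << i;
      return r;
    }

    // Schedule of the main loop for one stream: each steady-state trip retires
    // the product of the previous trip (xor), multiplies the data loaded one
    // trip ago, and loads data for the next trip. Assumes n >= 2, as the real
    // kernel guarantees (at least two 128-byte blocks).
    uint64_t folded_stream(const uint64_t* buf, const uint64_t* consts, size_t n) {
      uint64_t acc  = 0;
      uint64_t data = buf[0];                  // first warm up pass: load only
      uint64_t prod = clmul(data, consts[0]);  // second warm up: multiply + reload
      data = buf[1];
      for (size_t i = 2; i < n; i++) {         // main loop: xor, vpmsum, load
        acc  ^= prod;
        prod  = clmul(data, consts[i - 1]);
        data  = buf[i];
      }
      acc ^= prod;                             // first cool down: retire + last multiply
      prod = clmul(data, consts[n - 1]);
      acc ^= prod;                             // second cool down: retire final product
      return acc;
    }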
+ 
+     BIND(L_first_cool_down);
+ 
+     // First cool down pass.
+     lvx(const1, constantsPos);
+     addi(constantsPos, constantsPos, 16);
+ 
+     vxor(VR0, VR0, VR8);
+     vpmsumd(VR8, VR16, const1);
+ 
+     vxor(VR1, VR1, VR9);
+     vpmsumd(VR9, VR17, const1);
+ 
+     vxor(VR2, VR2, VR10);
+     vpmsumd(VR10, VR18, const1);
+ 
+     vxor(VR3, VR3, VR11);
+     vpmsumd(VR11, VR19, const1);
+ 
+     vxor(VR4, VR4, VR12);
+     vpmsumd(VR12, VR20, const1);
+ 
+     vxor(VR5, VR5, VR13);
+     vpmsumd(VR13, VR21, const1);
+ 
+     vxor(VR6, VR6, VR14);
+     vpmsumd(VR14, VR22, const1);
+ 
+     vxor(VR7, VR7, VR15);
+     vpmsumd(VR15, VR23, const1);
+ 
+     BIND(L_second_cool_down);
+ 
+     // Second cool down pass.
+     vxor(VR0, VR0, VR8);
+     vxor(VR1, VR1, VR9);
+     vxor(VR2, VR2, VR10);
+     vxor(VR3, VR3, VR11);
+     vxor(VR4, VR4, VR12);
+     vxor(VR5, VR5, VR13);
+     vxor(VR6, VR6, VR14);
+     vxor(VR7, VR7, VR15);
+ 
+     /*
+      * vpmsumd produces a 96 bit result in the least significant bits
+      * of the register. Since we are bit reflected we have to shift it
+      * left 32 bits so it occupies the least significant bits in the
+      * bit reflected domain.
+      */
+     vsldoi(VR0, VR0, zeroes, 4);
+     vsldoi(VR1, VR1, zeroes, 4);
+     vsldoi(VR2, VR2, zeroes, 4);
+     vsldoi(VR3, VR3, zeroes, 4);
+     vsldoi(VR4, VR4, zeroes, 4);
+     vsldoi(VR5, VR5, zeroes, 4);
+     vsldoi(VR6, VR6, zeroes, 4);
+     vsldoi(VR7, VR7, zeroes, 4);
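
The bookkeeping shifts here (and the single-bit shift before the Barrett step below) follow from a standard identity of carry-less multiplication in the bit-reflected domain: the product of two reflected values is the reflected product, short by one bit position. A self-contained C++ check of that identity:

    #include <cassert>
    #include <cstdint>

    static uint64_t clmul32(uint32_t a, uint32_t b) { // 32x32 carry-less multiply
      uint64_t r = 0;
      for (int i = 0; i < 32; i++)
        if ((a >> i) & 1u) r ^= (uint64_t)b << i;
      return r;
    }

    static uint32_t reflect32(uint32_t v) {
      uint32_t r = 0;
      for (int i = 0; i < 32; i++) r |= ((v >> i) & 1u) << (31 - i);
      return r;
    }

    static uint64_t reflect64(uint64_t v) {
      uint64_t r = 0;
      for (int i = 0; i < 64; i++) r |= ((v >> i) & 1ull) << (63 - i);
      return r;
    }

    int main() {
      uint32_t a = 0x04C11DB7u, b = 0x1EDC6F41u; // arbitrary test values
      // A 32x32 carry-less product has at most 63 significant bits, so the
      // product of reflected inputs lands one bit short of the reflected
      // product - hence the compensating shifts in the kernel.
      assert(reflect64(clmul32(a, b)) == clmul32(reflect32(a), reflect32(b)) << 1);
      return 0;
    }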
+ 
+     // XOR with last 1024 bits.
+     lvx(VR8, buf);
+     lvx(VR9, off16, buf);
+     lvx(VR10, off32, buf);
+     lvx(VR11, off48, buf);
+     lvx(VR12, off64, buf);
+     lvx(VR13, off80, buf);
+     lvx(VR14, off96, buf);
+     lvx(VR15, off112, buf);
+     addi(buf, buf, 8 * 16);
+ 
+     vxor(VR16, VR0, VR8);
+     vxor(VR17, VR1, VR9);
+     vxor(VR18, VR2, VR10);
+     vxor(VR19, VR3, VR11);
+     vxor(VR20, VR4, VR12);
+     vxor(VR21, VR5, VR13);
+     vxor(VR22, VR6, VR14);
+     vxor(VR23, VR7, VR15);
+ 
+     li(rLoaded, 1);
+     cmpdi(CCR0, rIdx, 0);
+     addi(rIdx, rIdx, 128);
+     bne(CCR0, L_1);
+   }
+ 
+   // Work out how many bytes we have left.
+   andi_(len, len, 127);
+ 
+   // Calculate where in the constant table we need to start.
+   subfic(rTmp1, len, 128);
+   add(constantsPos, constantsPos, rTmp1);
+ 
+   // How many 16 byte chunks are in the tail.
+   srdi(rIdx, len, 4);
+   mtctr(rIdx);
+ 
+   /*
+    * Reduce the previously calculated 1024 bits to 64 bits, shifting
+    * 32 bits to include the trailing 32 bits of zeros.
+    */
+   lvx(VR0, constantsPos);
+   lvx(VR1, off16, constantsPos);
+   lvx(VR2, off32, constantsPos);
+   lvx(VR3, off48, constantsPos);
+   lvx(VR4, off64, constantsPos);
+   lvx(VR5, off80, constantsPos);
+   lvx(VR6, off96, constantsPos);
+   lvx(VR7, off112, constantsPos);
+   addi(constantsPos, constantsPos, 8 * 16);
+ 
+   vpmsumw(VR0, VR16, VR0);
+   vpmsumw(VR1, VR17, VR1);
+   vpmsumw(VR2, VR18, VR2);
+   vpmsumw(VR3, VR19, VR3);
+   vpmsumw(VR4, VR20, VR4);
+   vpmsumw(VR5, VR21, VR5);
+   vpmsumw(VR6, VR22, VR6);
+   vpmsumw(VR7, VR23, VR7);
+ 
+   // Now reduce the tail (0 - 112 bytes).
+   cmpdi(CCR0, rIdx, 0);
+   beq(CCR0, L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off16, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off32, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off48, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off64, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off80, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off96, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+ 
+   // Now XOR all the parallel chunks together.
+   BIND(L_XOR);
+   vxor(VR0, VR0, VR1);
+   vxor(VR2, VR2, VR3);
+   vxor(VR4, VR4, VR5);
+   vxor(VR6, VR6, VR7);
+ 
+   vxor(VR0, VR0, VR2);
+   vxor(VR4, VR4, VR6);
+ 
+   vxor(VR0, VR0, VR4);
+ 
+   b(L_barrett_reduction);
+ 
+   BIND(L_first_warm_up_done);
+   lvx(const1, constantsPos);
+   addi(constantsPos, constantsPos, 16);
+   vpmsumd(VR8,  VR16, const1);
+   vpmsumd(VR9,  VR17, const1);
+   vpmsumd(VR10, VR18, const1);
+   vpmsumd(VR11, VR19, const1);
+   vpmsumd(VR12, VR20, const1);
+   vpmsumd(VR13, VR21, const1);
+   vpmsumd(VR14, VR22, const1);
+   vpmsumd(VR15, VR23, const1);
+   b(L_second_cool_down);
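
What follows is the reflected Barrett reduction of the folded 64-bit value down to a 32-bit CRC. A scalar sketch of the same sequence, with MU and POLY standing in for the two precomputed constants loaded from barretConstants (their values are not shown in this hunk):

    #include <cstdint>

    // Low-64 carry-less multiply (one lane of vpmsumd).
    static uint64_t clmul(uint64_t a, uint64_t b) {
      uint64_t r = 0;
      for (int i = 0; i < 64; i++)
        if ((b >> i) & 1ull) r ^= a << i;
      return r;
    }

    // Reflected Barrett reduction of the folded 64-bit value x to a 32-bit CRC.
    // MU and POLY are assumed to hold the constants from barretConstants.
    uint32_t barrett_reduce(uint64_t x, uint64_t MU, uint64_t POLY) {
      uint64_t t = clmul(x & 0xffffffffull, MU);   // ma: bottom 32 bits of a times mu
      t = clmul(t & 0xffffffffull, POLY);          // qn: bottom 32 bits of ma times poly
      x ^= t;                                      // a - qn; subtraction is xor in GF(2)
      return (uint32_t)(x >> 32);                  // reflected result sits in the high half
    }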
+ 
+   BIND(L_barrett_reduction);
+ 
+   lvx(const1, barretConstants);
+   addi(barretConstants, barretConstants, 16);
+   lvx(const2, barretConstants);
+ 
+   vsldoi(VR1, VR0, VR0, -8);
+   vxor(VR0, VR0, VR1);    // XOR the two 64 bit results together.
+ 
+   // Shift left one bit.
+   vspltisb(VR1, 1);
+   vsl(VR0, VR0, VR1);
+ 
+   vand(VR0, VR0, mask_64bit);
+ 
+   /*
+    * The reflected version of Barrett reduction. Instead of bit
+    * reflecting our data (which is expensive to do), we bit reflect our
+    * constants and our algorithm, which means the intermediate data in
+    * our vector registers goes from 0-63 instead of 63-0. We can reflect
+    * the algorithm because we don't carry in mod 2 arithmetic.
+    */
+   vand(VR1, VR0, mask_32bit); // bottom 32 bits of a
+   vpmsumd(VR1, VR1, const1);  // ma
+   vand(VR1, VR1, mask_32bit); // bottom 32 bits of ma
+   vpmsumd(VR1, VR1, const2);  // qn
+   vxor(VR0, VR0, VR1);        // a - qn, subtraction is xor in GF(2)
+ 
+   /*
+    * Since we are bit reflected, the result (i.e. the low 32 bits) is in
+    * the high 32 bits. We just need to shift it left 4 bytes:
+    * V0 [ 0 1 X 3 ]
+    * V0 [ 0 X 2 3 ]
+    */
+   vsldoi(VR0, VR0, zeroes, 4); // shift result into top 64 bits
+ 
+   // Get the final result into crc.
+   mfvrd(crc, VR0);
+ 
+   BIND(L_end);
+ 
+   // Restore non-volatile vector registers (frameless).
+   offsetInt = 0;
+   offsetInt -= 16; li(offset, -16); lvx(VR20, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
+   offsetInt -= 8; ld(R22, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R23, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R24, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R25, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R26, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R27, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R28, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R29, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R30, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R31, offsetInt, R1_SP);
+ }
+ 
  void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
    assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
  
    BLOCK_COMMENT("kernel_crc32_singleByte:");
    nand(crc, crc, crc); // ~c
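
The surrounding kernels all use the same convention visible in this trailing context: the CRC is kept complemented (nand(crc, crc, crc), i.e. ~c) while being updated. For reference, a one-byte table-driven update with that convention, assuming the standard 256-entry CRC-32 lookup table:

    #include <cstdint>

    // One-byte, table-driven CRC-32 update with the ~c convention: complement
    // on entry, one lookup per byte, complement on exit. 'table' is assumed
    // to be the standard 256-entry CRC-32 table the byte kernels index into.
    uint32_t crc32_update_byte(uint32_t crc, uint8_t b, const uint32_t table[256]) {
      uint32_t c = ~crc;                     // ~c (cf. nand(crc, crc, crc))
      c = (c >> 8) ^ table[(c ^ b) & 0xff];
      return ~c;                             // ~c
    }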