< prev index next >
src/cpu/ppc/vm/macroAssembler_ppc.cpp
Print this page
rev 8691 : 8131048: ppc: implement CRC32 intrinsic
Reviewed-by: kvn, simonis
Contributed-by: lutz.schmidt@sap.com
*** 48,57 ****
--- 48,58 ----
#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
+ #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif
*** 3431,3440 ****
--- 3432,3853 ----
}
li(result_reg, 1);
bind(Ldone_false);
}
+ // Helpers for Intrinsic Emitters
+ //
+ // Revert the byte order of a 32bit value in a register
+ // src: 0x44556677
+ // dst: 0x77665544
+ // Three steps to obtain the result:
+ // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
+ // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
+ // This value initializes dst.
+ // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
+ // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
+ // This value is mask inserted into dst with a [0..23] mask of 1s.
+ // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
+ // This value is mask inserted into dst with a [8..15] mask of 1s.
+ void MacroAssembler::load_reverse_32(Register dst, Register src) {
+ assert_different_registers(dst, src);
+
+ rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
+ rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
+ rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
+ }
+
+ // Calculate the column addresses of the crc32 lookup table into distinct registers.
+ // This loop-invariant calculation is moved out of the loop body, reducing the loop
+ // body size from 20 to 16 instructions.
+ // Returns the offset that was used to calculate the address of column tc3.
+ // Due to register shortage, setting tc3 may overwrite table. With the return offset
+ // at hand, the original table address can be easily reconstructed.
+ int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
+
+ #ifdef VM_LITTLE_ENDIAN
+ // This is what we implement (the DOLIT4 part):
+ // ========================================================================= */
+ // #define DOLIT4 c ^= *buf4++; \
+ // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+ // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+ // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+ // ========================================================================= */
+ const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
+ const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
+ const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
+ const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
+ #else
+ // This is what we implement (the DOBIG4 part):
+ // =========================================================================
+ // #define DOBIG4 c ^= *++buf4; \
+ // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+ // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+ // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+ // =========================================================================
+ const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
+ const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
+ const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
+ const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
+ #endif
+ assert_different_registers(table, tc0, tc1, tc2);
+ assert(table == tc3, "must be!");
+
+ if (ix0 != 0) addi(tc0, table, ix0);
+ if (ix1 != 0) addi(tc1, table, ix1);
+ if (ix2 != 0) addi(tc2, table, ix2);
+ if (ix3 != 0) addi(tc3, table, ix3);
+
+ return ix3;
+ }
+
+ /**
+ * uint32_t crc;
+ * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
+ */
+ void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
+ assert_different_registers(crc, table, tmp);
+ assert_different_registers(val, table);
+
+ if (crc == val) { // Must rotate first to use the unmodified value.
+ rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
+ // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
+ srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
+ } else {
+ srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
+ rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
+ }
+ lwzx(tmp, table, tmp);
+ xorr(crc, crc, tmp);
+ }
+
+ /**
+ * uint32_t crc;
+ * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
+ */
+ void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
+ fold_byte_crc32(crc, crc, table, tmp);
+ }
+
+ /**
+ * Emits code to update CRC-32 with a byte value according to constants in table.
+ *
+ * @param [in,out]crc Register containing the crc.
+ * @param [in]val Register containing the byte to fold into the CRC.
+ * @param [in]table Register containing the table of crc constants.
+ *
+ * uint32_t crc;
+ * val = crc_table[(val ^ crc) & 0xFF];
+ * crc = val ^ (crc >> 8);
+ */
+ void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
+ BLOCK_COMMENT("update_byte_crc32:");
+ xorr(val, val, crc);
+ fold_byte_crc32(crc, val, table, val);
+ }
+
+ /**
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table register pointing to CRC table
+ */
+ void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
+ Register data, bool loopAlignment, bool invertCRC) {
+ assert_different_registers(crc, buf, len, table, data);
+
+ Label L_mainLoop, L_done;
+ const int mainLoop_stepping = 1;
+ const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
+
+ // Process all bytes in a single-byte loop.
+ cmpdi(CCR0, len, 0); // Anything to do?
+ mtctr(len);
+ beq(CCR0, L_done);
+
+ if (invertCRC) {
+ nand(crc, crc, crc); // ~c
+ }
+
+ align(mainLoop_alignment);
+ BIND(L_mainLoop);
+ lbz(data, 0, buf); // Byte from buffer, zero-extended.
+ addi(buf, buf, mainLoop_stepping); // Advance buffer position.
+ update_byte_crc32(crc, data, table);
+ bdnz(L_mainLoop); // Iterate.
+
+ if (invertCRC) {
+ nand(crc, crc, crc); // ~c
+ }
+
+ bind(L_done);
+ }
+
+ /**
+ * Emits code to update CRC-32 with a 4-byte value according to constants in table
+ * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
+ */
+ // A not on the lookup table address(es):
+ // The lookup table consists of two sets of four columns each.
+ // The columns {0..3} are used for little-endian machines.
+ // The columns {4..7} are used for big-endian machines.
+ // To save the effort of adding the column offset to the table address each time
+ // a table element is looked up, it is possible to pass the pre-calculated
+ // column addresses.
+ // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
+ void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
+ Register t0, Register t1, Register t2, Register t3,
+ Register tc0, Register tc1, Register tc2, Register tc3) {
+ assert_different_registers(crc, t3);
+
+ // XOR crc with next four bytes of buffer.
+ lwz(t3, bufDisp, buf);
+ if (bufInc != 0) {
+ addi(buf, buf, bufInc);
+ }
+ xorr(t3, t3, crc);
+
+ // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
+ rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2
+ rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2
+ rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2
+ rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2
+
+ // Use the pre-calculated column addresses.
+ // Load pre-calculated table values.
+ lwzx(t0, tc0, t0);
+ lwzx(t1, tc1, t1);
+ lwzx(t2, tc2, t2);
+ lwzx(t3, tc3, t3);
+
+ // Calculate new crc from table values.
+ xorr(t0, t0, t1);
+ xorr(t2, t2, t3);
+ xorr(crc, t0, t2); // Now crc contains the final checksum value.
+ }
+
+ /**
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * Uses R9..R12 as work register. Must be saved/restored by caller!
+ */
+ void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
+ Register t0, Register t1, Register t2, Register t3,
+ Register tc0, Register tc1, Register tc2, Register tc3) {
+ assert_different_registers(crc, buf, len, table);
+
+ Label L_mainLoop, L_tail;
+ Register tmp = t0;
+ Register data = t0;
+ Register tmp2 = t1;
+ const int mainLoop_stepping = 8;
+ const int tailLoop_stepping = 1;
+ const int log_stepping = exact_log2(mainLoop_stepping);
+ const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
+ const int complexThreshold = 2*mainLoop_stepping;
+
+ // Don't test for len <= 0 here. This pathological case should not occur anyway.
+ // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
+ // The situation itself is detected and handled correctly by the conditional branches
+ // following aghi(len, -stepping) and aghi(len, +stepping).
+ assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
+
+ BLOCK_COMMENT("kernel_crc32_2word {");
+
+ nand(crc, crc, crc); // ~c
+
+ // Check for short (<mainLoop_stepping) buffer.
+ cmpdi(CCR0, len, complexThreshold);
+ blt(CCR0, L_tail);
+
+ // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
+ // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
+ {
+ // Align buf addr to mainLoop_stepping boundary.
+ neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
+ rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
+
+ if (complexThreshold > mainLoop_stepping) {
+ sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+ } else {
+ sub(tmp, len, tmp2); // Remaining bytes for main loop.
+ cmpdi(CCR0, tmp, mainLoop_stepping);
+ blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
+ mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+ }
+ update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
+ }
+
+ srdi(tmp2, len, log_stepping); // #iterations for mainLoop
+ andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
+ mtctr(tmp2);
+
+ #ifdef VM_LITTLE_ENDIAN
+ Register crc_rv = crc;
+ #else
+ Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
+ // Occupies tmp, but frees up crc.
+ load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
+ tmp = crc;
+ #endif
+
+ int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
+
+ align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
+ BIND(L_mainLoop);
+ update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+ update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+ bdnz(L_mainLoop);
+
+ #ifndef VM_LITTLE_ENDIAN
+ load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
+ tmp = crc_rv; // Tmp uses it's original register again.
+ #endif
+
+ // Restore original table address for tailLoop.
+ if (reconstructTableOffset != 0) {
+ addi(table, table, -reconstructTableOffset);
+ }
+
+ // Process last few (<complexThreshold) bytes of buffer.
+ BIND(L_tail);
+ update_byteLoop_crc32(crc, buf, len, table, data, false, false);
+
+ nand(crc, crc, crc); // ~c
+ BLOCK_COMMENT("} kernel_crc32_2word");
+ }
+
+ /**
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * uses R9..R12 as work register. Must be saved/restored by caller!
+ */
+ void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
+ Register t0, Register t1, Register t2, Register t3,
+ Register tc0, Register tc1, Register tc2, Register tc3) {
+ assert_different_registers(crc, buf, len, table);
+
+ Label L_mainLoop, L_tail;
+ Register tmp = t0;
+ Register data = t0;
+ Register tmp2 = t1;
+ const int mainLoop_stepping = 4;
+ const int tailLoop_stepping = 1;
+ const int log_stepping = exact_log2(mainLoop_stepping);
+ const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
+ const int complexThreshold = 2*mainLoop_stepping;
+
+ // Don't test for len <= 0 here. This pathological case should not occur anyway.
+ // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
+ // The situation itself is detected and handled correctly by the conditional branches
+ // following aghi(len, -stepping) and aghi(len, +stepping).
+ assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
+
+ BLOCK_COMMENT("kernel_crc32_1word {");
+
+ nand(crc, crc, crc); // ~c
+
+ // Check for short (<mainLoop_stepping) buffer.
+ cmpdi(CCR0, len, complexThreshold);
+ blt(CCR0, L_tail);
+
+ // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
+ // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
+ {
+ // Align buf addr to mainLoop_stepping boundary.
+ neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
+ rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
+
+ if (complexThreshold > mainLoop_stepping) {
+ sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+ } else {
+ sub(tmp, len, tmp2); // Remaining bytes for main loop.
+ cmpdi(CCR0, tmp, mainLoop_stepping);
+ blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
+ mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+ }
+ update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
+ }
+
+ srdi(tmp2, len, log_stepping); // #iterations for mainLoop
+ andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
+ mtctr(tmp2);
+
+ #ifdef VM_LITTLE_ENDIAN
+ Register crc_rv = crc;
+ #else
+ Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
+ // Occupies tmp, but frees up crc.
+ load_reverse_32(crc_rv, crc); // evert byte order because we are dealing with big-endian data.
+ tmp = crc;
+ #endif
+
+ int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
+
+ align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
+ BIND(L_mainLoop);
+ update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+ bdnz(L_mainLoop);
+
+ #ifndef VM_LITTLE_ENDIAN
+ load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
+ tmp = crc_rv; // Tmp uses it's original register again.
+ #endif
+
+ // Restore original table address for tailLoop.
+ if (reconstructTableOffset != 0) {
+ addi(table, table, -reconstructTableOffset);
+ }
+
+ // Process last few (<complexThreshold) bytes of buffer.
+ BIND(L_tail);
+ update_byteLoop_crc32(crc, buf, len, table, data, false, false);
+
+ nand(crc, crc, crc); // ~c
+ BLOCK_COMMENT("} kernel_crc32_1word");
+ }
+
+ /**
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * Uses R7_ARG5, R8_ARG6 as work registers.
+ */
+ void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
+ Register t0, Register t1, Register t2, Register t3) {
+ assert_different_registers(crc, buf, len, table);
+
+ Register data = t0; // Holds the current byte to be folded into crc.
+
+ BLOCK_COMMENT("kernel_crc32_1byte {");
+
+ // Process all bytes in a single-byte loop.
+ update_byteLoop_crc32(crc, buf, len, table, data, true, true);
+
+ BLOCK_COMMENT("} kernel_crc32_1byte");
+ }
+
+ void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
+ assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
+
+ BLOCK_COMMENT("kernel_crc32_singleByte:");
+ nand(crc, crc, crc); // ~c
+
+ lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
+ update_byte_crc32(crc, tmp, table);
+
+ nand(crc, crc, crc); // ~c
+ }
+
// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
void MacroAssembler::add2_with_carry(Register dest_hi,
Register dest_lo,
Register src1, Register src2) {
*** 3513,3523 ****
lwz(y_idx, 0, y);
b(L_multiply);
! bind( L_one_x ); // Load one 32 bit portion of x as (0,value).
lwz(x_xstart, 0, x);
b(L_first_loop);
bind(L_first_loop_exit);
--- 3926,3936 ----
lwz(y_idx, 0, y);
b(L_multiply);
! bind(L_one_x); // Load one 32 bit portion of x as (0,value).
lwz(x_xstart, 0, x);
b(L_first_loop);
bind(L_first_loop_exit);
*** 3532,3542 ****
// huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
// z[kdx] = (jlong)product;
sldi(tmp, idx, LogBytesPerInt);
! if ( offset ) {
addi(tmp, tmp, offset);
}
ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(yz_idx, yz_idx, 32, 0);
--- 3945,3955 ----
// huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
// z[kdx] = (jlong)product;
sldi(tmp, idx, LogBytesPerInt);
! if (offset) {
addi(tmp, tmp, offset);
}
ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(yz_idx, yz_idx, 32, 0);
*** 3549,3559 ****
#endif
add2_with_carry(product_high, product, carry, yz_idx);
sldi(tmp, idx, LogBytesPerInt);
! if ( offset ) {
addi(tmp, tmp, offset);
}
#ifdef VM_LITTLE_ENDIAN
rldicl(product, product, 32, 0);
#endif
--- 3962,3972 ----
#endif
add2_with_carry(product_high, product, carry, yz_idx);
sldi(tmp, idx, LogBytesPerInt);
! if (offset) {
addi(tmp, tmp, offset);
}
#ifdef VM_LITTLE_ENDIAN
rldicl(product, product, 32, 0);
#endif
< prev index next >