--- old/src/cpu/ppc/vm/macroAssembler_ppc.cpp	2018-03-26 06:04:21.303738621 -0400
+++ new/src/cpu/ppc/vm/macroAssembler_ppc.cpp	2018-03-26 06:04:21.135735378 -0400
@@ -49,6 +49,7 @@
 #else
 #define BLOCK_COMMENT(str) block_comment(str)
 #endif
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
 
 #ifdef ASSERT
 // On RISC, there's no benefit to verifying instruction boundaries.
@@ -3022,6 +3023,418 @@
   bind(Ldone_false);
 }
 
+// Helpers for Intrinsic Emitters
+//
+// Revert the byte order of a 32bit value in a register
+//   src: 0x44556677
+//   dst: 0x77665544
+// Three steps to obtain the result:
+//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
+//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
+//     This value initializes dst.
+//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
+//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
+//     This value is mask inserted into dst with a [0..23] mask of 1s.
+//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
+//     This value is mask inserted into dst with a [8..15] mask of 1s.
+void MacroAssembler::load_reverse_32(Register dst, Register src) {
+  assert_different_registers(dst, src);
+
+  rldicl(dst, src, (4+1)*8, 56);     // Rotate byte 4 into position 7 (rightmost), clear all to the left.
+  rlwimi(dst, src,     3*8,  0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
+  rlwimi(dst, src,     1*8,  8, 15); // Insert byte 6 into position 5, leave the rest alone.
+}
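+// For reference, the net effect of the three rotate-and-insert steps above,
+// written as a plain C sketch (illustrative only, not part of the emitted code):
+//
+//   uint32_t reverse_32(uint32_t x) {
+//     return (x << 24) | ((x & 0xff00) << 8) | ((x >> 8) & 0xff00) | (x >> 24);
+//   }
+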
+// Calculate the column addresses of the crc32 lookup table into distinct registers.
+// This loop-invariant calculation is moved out of the loop body, reducing the loop
+// body size from 20 to 16 instructions.
+// Returns the offset that was used to calculate the address of column tc3.
+// Due to register shortage, setting tc3 may overwrite table. With the return offset
+// at hand, the original table address can be easily reconstructed.
+int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
+
+#ifdef VM_LITTLE_ENDIAN
+  // This is what we implement (the DOLIT4 part):
+  // =========================================================================
+  // #define DOLIT4 c ^= *buf4++; \
+  //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+  //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+  // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+  // =========================================================================
+  const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
+  const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
+  const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
+  const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
+#else
+  // This is what we implement (the DOBIG4 part):
+  // =========================================================================
+  // #define DOBIG4 c ^= *++buf4; \
+  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+  // =========================================================================
+  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
+  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
+  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
+  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
+#endif
+  assert_different_registers(table, tc0, tc1, tc2);
+  assert(table == tc3, "must be!");
+
+  if (ix0 != 0) addi(tc0, table, ix0);
+  if (ix1 != 0) addi(tc1, table, ix1);
+  if (ix2 != 0) addi(tc2, table, ix2);
+  if (ix3 != 0) addi(tc3, table, ix3);
+
+  return ix3;
+}
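+// For reference, the address arithmetic above as a plain C sketch (illustrative
+// only; this assumes CRC32_COLUMN_SIZE counts the 4-byte entries per table column):
+//
+//   juint* tc_n = (juint*)((address)table + n*(4*CRC32_COLUMN_SIZE)); // &crc_table[n][0]
+//
+// On little-endian, columns 3,2,1,0 are used, so ix3 == 0 and tc3 simply aliases
+// table; the returned ix3 lets the caller undo the bias where it is non-zero.
+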
+/**
+ * uint32_t crc;
+ * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
+ */
+void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
+  assert_different_registers(crc, table, tmp);
+  assert_different_registers(val, table);
+
+  if (crc == val) {                    // Must rotate first to use the unmodified value.
+    rlwinm(tmp, val, 2, 24-2, 31-2);   // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
+                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
+    srwi(crc, crc, 8);                 // Unsigned shift, clear leftmost 8 bits.
+  } else {
+    srwi(crc, crc, 8);                 // Unsigned shift, clear leftmost 8 bits.
+    rlwinm(tmp, val, 2, 24-2, 31-2);   // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
+  }
+  lwzx(tmp, table, tmp);
+  xorr(crc, crc, tmp);
+}
+
+/**
+ * uint32_t crc;
+ * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
+ */
+void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
+  fold_byte_crc32(crc, crc, table, tmp);
+}
+
+/**
+ * Emits code to update CRC-32 with a byte value according to constants in table.
+ *
+ * @param [in,out]crc Register containing the crc.
+ * @param [in]val     Register containing the byte to fold into the CRC.
+ * @param [in]table   Register containing the table of crc constants.
+ *
+ * uint32_t crc;
+ * val = crc_table[(val ^ crc) & 0xFF];
+ * crc = val ^ (crc >> 8);
+ */
+void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
+  BLOCK_COMMENT("update_byte_crc32:");
+  xorr(val, val, crc);
+  fold_byte_crc32(crc, val, table, val);
+}
+
+/**
+ * @param crc   register containing existing CRC (32-bit)
+ * @param buf   register pointing to input byte buffer (byte*)
+ * @param len   register containing number of bytes
+ * @param table register pointing to CRC table
+ */
+void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
+                                           Register data, bool loopAlignment, bool invertCRC) {
+  assert_different_registers(crc, buf, len, table, data);
+
+  Label L_mainLoop, L_done;
+  const int mainLoop_stepping  = 1;
+  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
+
+  // Process all bytes in a single-byte loop.
+  cmpdi(CCR0, len, 0);                 // Anything to do?
+  mtctr(len);
+  beq(CCR0, L_done);
+
+  if (invertCRC) {
+    nand(crc, crc, crc);               // ~c
+  }
+
+  align(mainLoop_alignment);
+  BIND(L_mainLoop);
+    lbz(data, 0, buf);                 // Byte from buffer, zero-extended.
+    addi(buf, buf, mainLoop_stepping); // Advance buffer position.
+    update_byte_crc32(crc, data, table);
+  bdnz(L_mainLoop);                    // Iterate.
+
+  if (invertCRC) {
+    nand(crc, crc, crc);               // ~c
+  }
+
+  bind(L_done);
+}
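+// For reference, the byte loop above as a plain C sketch (illustrative only;
+// column 0 of the table is the classic one-byte-at-a-time CRC-32 table):
+//
+//   if (invertCRC) c = ~c;
+//   while (len-- > 0)
+//     c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+//   if (invertCRC) c = ~c;
+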
+/**
+ * Emits code to update CRC-32 with a 4-byte value according to constants in table.
+ * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
+ */
+// A note on the lookup table address(es):
+// The lookup table consists of two sets of four columns each.
+// The columns {0..3} are used for little-endian machines.
+// The columns {4..7} are used for big-endian machines.
+// To save the effort of adding the column offset to the table address each time
+// a table element is looked up, it is possible to pass the pre-calculated
+// column addresses.
+// Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
+void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
+                                        Register t0,  Register t1,  Register t2,  Register t3,
+                                        Register tc0, Register tc1, Register tc2, Register tc3) {
+  assert_different_registers(crc, t3);
+
+  // XOR crc with next four bytes of buffer.
+  lwz(t3, bufDisp, buf);
+  if (bufInc != 0) {
+    addi(buf, buf, bufInc);
+  }
+  xorr(t3, t3, crc);
+
+  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
+  rlwinm(t0, t3,  2,         24-2, 31-2); // ((t1 >>  0) & 0xff) << 2
+  rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2); // ((t1 >>  8) & 0xff) << 2
+  rlwinm(t2, t3,  32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2
+  rlwinm(t3, t3,  32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2
+
+  // Use the pre-calculated column addresses.
+  // Load pre-calculated table values.
+  lwzx(t0, tc0, t0);
+  lwzx(t1, tc1, t1);
+  lwzx(t2, tc2, t2);
+  lwzx(t3, tc3, t3);
+
+  // Calculate new crc from table values.
+  xorr(t0, t0, t1);
+  xorr(t2, t2, t3);
+  xorr(crc, t0, t2); // Now crc contains the final checksum value.
+}
+
+/**
+ * @param crc   register containing existing CRC (32-bit)
+ * @param buf   register pointing to input byte buffer (byte*)
+ * @param len   register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * Uses R9..R12 as work register. Must be saved/restored by caller!
+ */
+void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
+                                        Register t0,  Register t1,  Register t2,  Register t3,
+                                        Register tc0, Register tc1, Register tc2, Register tc3) {
+  assert_different_registers(crc, buf, len, table);
+
+  Label L_mainLoop, L_tail;
+  Register  tmp  = t0;
+  Register  data = t0;
+  Register  tmp2 = t1;
+  const int mainLoop_stepping  = 8;
+  const int tailLoop_stepping  = 1;
+  const int log_stepping       = exact_log2(mainLoop_stepping);
+  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
+  const int complexThreshold   = 2*mainLoop_stepping;
+
+  // Don't test for len <= 0 here. This pathological case should not occur anyway.
+  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
+  // The situation itself is detected and handled correctly by the conditional branches
+  // following aghi(len, -stepping) and aghi(len, +stepping).
+  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
+
+  BLOCK_COMMENT("kernel_crc32_2word {");
+
+  nand(crc, crc, crc);                           // ~c
+
+  // Check for short (<complexThreshold) buffer.
+  cmpdi(CCR0, len, complexThreshold);
+  blt(CCR0, L_tail);
+
+  // Process a few bytes in a single-byte loop until buf is aligned to mainLoop_stepping.
+  {
+    neg(tmp2, buf);                              // Number of bytes until buf is aligned.
+    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits.
+
+    if (complexThreshold > mainLoop_stepping) {
+      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+    } else {
+      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
+      cmpdi(CCR0, tmp, mainLoop_stepping);
+      blt(CCR0, L_tail);                         // For less than one mainLoop_stepping left, do only tail processing.
+      mr(len, tmp);                              // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+    }
+    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
+  }
+
+  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
+  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
+  mtctr(tmp2);
+
+#ifdef VM_LITTLE_ENDIAN
+  Register crc_rv = crc;
+#else
+  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
+                                                 // Occupies tmp, but frees up crc.
+  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
+  tmp = crc;
+#endif
+
+  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
+
+  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
+  BIND(L_mainLoop);
+    update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+    update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+  bdnz(L_mainLoop);
+
+#ifndef VM_LITTLE_ENDIAN
+  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
+  tmp = crc_rv;                                  // Tmp uses its original register again.
+#endif
+
+  // Restore original table address for tailLoop.
+  if (reconstructTableOffset != 0) {
+    addi(table, table, -reconstructTableOffset);
+  }
+
+  // Process last few (<complexThreshold) bytes of buffer.
+  BIND(L_tail);
+  update_byteLoop_crc32(crc, buf, len, table, data, false, false);
+
+  nand(crc, crc, crc);                           // ~c
+
+  BLOCK_COMMENT("} kernel_crc32_2word");
+}
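+// For reference, the overall flow above in plain C terms (illustrative sketch
+// only; BYTE is the single-byte update shown after update_byteLoop_crc32, and
+// DOLIT4 is as quoted in crc32_table_columns):
+//
+//   c = ~c;
+//   if (len >= complexThreshold) {
+//     while ((uintptr_t)buf & (mainLoop_stepping-1)) { BYTE; len--; } // align buf
+//     for (i = len >> log_stepping; i > 0; i--) { DOLIT4; DOLIT4; }  // 8 bytes/iteration
+//     len &= mainLoop_stepping-1;
+//   }
+//   while (len-- > 0) BYTE;                                          // tail
+//   c = ~c;
+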
+/**
+ * @param crc   register containing existing CRC (32-bit)
+ * @param buf   register pointing to input byte buffer (byte*)
+ * @param len   register containing number of bytes
+ * @param table register pointing to CRC table
+ *
+ * Uses R9..R12 as work register. Must be saved/restored by caller!
+ */
+void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
+                                        Register t0,  Register t1,  Register t2,  Register t3,
+                                        Register tc0, Register tc1, Register tc2, Register tc3) {
+  assert_different_registers(crc, buf, len, table);
+
+  Label L_mainLoop, L_tail;
+  Register  tmp  = t0;
+  Register  data = t0;
+  Register  tmp2 = t1;
+  const int mainLoop_stepping  = 4;
+  const int tailLoop_stepping  = 1;
+  const int log_stepping       = exact_log2(mainLoop_stepping);
+  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
+  const int complexThreshold   = 2*mainLoop_stepping;
+
+  // Don't test for len <= 0 here. This pathological case should not occur anyway.
+  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
+  // The situation itself is detected and handled correctly by the conditional branches
+  // following aghi(len, -stepping) and aghi(len, +stepping).
+  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
+
+  BLOCK_COMMENT("kernel_crc32_1word {");
+
+  nand(crc, crc, crc);                           // ~c
+
+  // Check for short (<complexThreshold) buffer.
+  cmpdi(CCR0, len, complexThreshold);
+  blt(CCR0, L_tail);
+
+  // Process a few bytes in a single-byte loop until buf is aligned to mainLoop_stepping.
+  {
+    neg(tmp2, buf);                              // Number of bytes until buf is aligned.
+    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits.
+
+    if (complexThreshold > mainLoop_stepping) {
+      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+    } else {
+      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
+      cmpdi(CCR0, tmp, mainLoop_stepping);
+      blt(CCR0, L_tail);                         // For less than one mainLoop_stepping left, do only tail processing.
+      mr(len, tmp);                              // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
+    }
+    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
+  }
+
+  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
+  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
+  mtctr(tmp2);
+
+#ifdef VM_LITTLE_ENDIAN
+  Register crc_rv = crc;
+#else
+  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
+                                                 // Occupies tmp, but frees up crc.
+  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
+  tmp = crc;
+#endif
+
+  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
+
+  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
+  BIND(L_mainLoop);
+    update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
+  bdnz(L_mainLoop);
+
+#ifndef VM_LITTLE_ENDIAN
+  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
+  tmp = crc_rv;                                  // Tmp uses its original register again.
+#endif
+
+  // Restore original table address for tailLoop.
+  if (reconstructTableOffset != 0) {
+    addi(table, table, -reconstructTableOffset);
+  }
+
+  // Process last few (