
src/hotspot/cpu/ppc/macroAssembler_ppc.cpp

rev 53302 : 8216060: [PPC64] Vector CRC implementation should be used by interpreter and be faster for short arrays
Reviewed-by: gromero, goetz

*** 1,8 ****
  /*
!  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
!  * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.
--- 1,8 ----
  /*
!  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
!  * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.
*** 3972,3982 ****
  /**
   * Emits code to update CRC-32 with a 4-byte value according to constants in table
   * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
   */
! // A not on the lookup table address(es):
  // The lookup table consists of two sets of four columns each.
  // The columns {0..3} are used for little-endian machines.
  // The columns {4..7} are used for big-endian machines.
  // To save the effort of adding the column offset to the table address each time
  // a table element is looked up, it is possible to pass the pre-calculated
--- 3972,3982 ----
  /**
   * Emits code to update CRC-32 with a 4-byte value according to constants in table
   * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
   */
! // A note on the lookup table address(es):
  // The lookup table consists of two sets of four columns each.
  // The columns {0..3} are used for little-endian machines.
  // The columns {4..7} are used for big-endian machines.
  // To save the effort of adding the column offset to the table address each time
  // a table element is looked up, it is possible to pass the pre-calculated
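Note on the hunk above: the column layout that comment describes can be pictured with a small host-side sketch. This is an illustration only, not HotSpot code; the type and helper names are hypothetical, and the 256 entries per byte-indexed column are assumed.

    #include <stdint.h>

    // Hypothetical layout: two sets of four columns, 256 entries each.
    // Columns 0..3 serve little-endian lookups, columns 4..7 big-endian ones.
    typedef uint32_t crc_table_t[8][256];

    // Pre-calculate the address of one column so each per-byte lookup only
    // needs an index instead of "base + column_offset + index".
    static inline const uint32_t* crc_column(const crc_table_t* table, int column) {
      return &(*table)[column][0];
    }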
*** 4148,4219 ****
   * @param table register pointing to CRC table
   * @param constants register pointing to CRC table for 128-bit aligned memory
   * @param barretConstants register pointing to table for barrett reduction
   * @param t0-t4 temp registers
   */
! void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
!                                                Register constants, Register barretConstants,
!                                                Register t0, Register t1, Register t2, Register t3, Register t4,
!                                                bool invertCRC) {
    assert_different_registers(crc, buf, len, table);

!   Label L_alignedHead, L_tail;

!   BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");

-   // 1. ~c
    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }

!   // 2. use kernel_crc32_1word for short len
    clrldi(len, len, 32);
-   cmpdi(CCR0, len, 512);
-   blt(CCR0, L_tail);

!   // 3. calculate from 0 to first aligned address
!   const int alignment = 16;
    Register prealign = t0;

!   andi_(prealign, buf, alignment - 1);
!   beq(CCR0, L_alignedHead);
!   subfic(prealign, prealign, alignment);

    subf(len, prealign, len);
    update_byteLoop_crc32(crc, buf, prealign, table, t2, false);

!   // 4. calculate from first aligned address as far as possible
!   BIND(L_alignedHead);
!   kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);

!   // 5. remaining bytes
    BIND(L_tail);
!   Register tc0 = t4;
!   Register tc1 = constants;
!   Register tc2 = barretConstants;
!   kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);

-   // 6. ~c
    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }

!   BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
  }

  /**
   * @param crc register containing existing CRC (32-bit)
   * @param buf register pointing to input byte buffer (byte*)
   * @param len register containing number of bytes (will get updated to remaining bytes)
   * @param constants register pointing to CRC table for 128-bit aligned memory
   * @param barretConstants register pointing to table for barrett reduction
   * @param t0-t4 temp registers
-  * Precondition: len should be >= 512. Otherwise, nothing will be done.
   */
! void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
!                                                 Register constants, Register barretConstants,
!                                                 Register t0, Register t1, Register t2, Register t3, Register t4) {

    // Save non-volatile vector registers (frameless).
    Register offset = t1;
    int offsetInt = 0;
    offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
--- 4148,4211 ----
   * @param table register pointing to CRC table
   * @param constants register pointing to CRC table for 128-bit aligned memory
   * @param barretConstants register pointing to table for barrett reduction
   * @param t0-t4 temp registers
   */
! void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
!                                          Register constants, Register t0, Register t1, Register t2,
!                                          Register t3, Register t4, Register t5, bool invertCRC) {
    assert_different_registers(crc, buf, len, table);

!   Label L_tail;

!   BLOCK_COMMENT("kernel_crc32_vpmsum {");

    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }

!   // Enforce 32 bit.
    clrldi(len, len, 32);

!   // Align if we have enough bytes for the fast version.
!   const int alignment = 16,
!             threshold = 32;
    Register prealign = t0;

!   neg(prealign, buf);
!   addi(t1, len, -threshold);
!   andi(prealign, prealign, alignment - 1);
!   cmpw(CCR0, t1, prealign);
!   blt(CCR0, L_tail); // len - prealign < threshold?

    subf(len, prealign, len);
    update_byteLoop_crc32(crc, buf, prealign, table, t2, false);

!   // Calculate from first aligned address as far as possible.
!   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);

!   // Remaining bytes.
    BIND(L_tail);
!   update_byteLoop_crc32(crc, buf, len, table, t2, false);

    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }

!   BLOCK_COMMENT("} kernel_crc32_vpmsum");
  }

  /**
   * @param crc register containing existing CRC (32-bit)
   * @param buf register pointing to input byte buffer (byte*)
   * @param len register containing number of bytes (will get updated to remaining bytes)
   * @param constants register pointing to CRC table for 128-bit aligned memory
   * @param barretConstants register pointing to table for barrett reduction
   * @param t0-t4 temp registers
   */
! void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
!                                                  Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {

    // Save non-volatile vector registers (frameless).
    Register offset = t1;
    int offsetInt = 0;
    offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
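Note on the hunk above: the control flow that the new kernel_crc32_vpmsum emits is easier to follow in scalar form. Below is a minimal sketch of that flow, assuming invertCRC == true and using stand-in helpers (crc32_bytewise for the emitted byte loop, crc32_vpmsum_aligned_blocks for the vector kernel); it is an illustration under those assumptions, not the generated code.

    #include <stddef.h>
    #include <stdint.h>

    // Bytewise CRC-32 (reflected polynomial 0xEDB88320), standing in for the
    // emitted update_byteLoop_crc32.
    static uint32_t crc32_bytewise(uint32_t crc, const uint8_t* buf, size_t n) {
      for (size_t i = 0; i < n; i++) {
        crc ^= buf[i];
        for (int b = 0; b < 8; b++) crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
      }
      return crc;
    }

    // Stand-in for the vector kernel: consumes whole 16-byte blocks and reports
    // how many bytes it handled. (The real kernel uses vpmsum instructions.)
    static size_t crc32_vpmsum_aligned_blocks(uint32_t* crc, const uint8_t* buf, size_t len) {
      size_t consumed = len & ~(size_t)15;
      *crc = crc32_bytewise(*crc, buf, consumed);
      return consumed;
    }

    uint32_t crc32_update(uint32_t crc, const uint8_t* buf, size_t len) {
      const size_t alignment = 16, threshold = 32;
      size_t prealign = (size_t)(-(uintptr_t)buf) & (alignment - 1); // bytes up to 16-byte alignment

      crc = ~crc;                                   // invertCRC == true at entry
      if (len >= prealign + threshold) {            // enough bytes for the fast version?
        crc = crc32_bytewise(crc, buf, prealign);   // walk up to the first aligned address
        buf += prealign;  len -= prealign;
        size_t consumed = crc32_vpmsum_aligned_blocks(&crc, buf, len);
        buf += consumed;  len -= consumed;
      }
      crc = crc32_bytewise(crc, buf, len);          // remaining bytes, always bytewise
      return ~crc;                                  // invertCRC == true at exit
    }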
*** 4226,4254 ****
    offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
  #endif
    offsetInt -= 8; std(R14, offsetInt, R1_SP);
    offsetInt -= 8; std(R15, offsetInt, R1_SP);
    offsetInt -= 8; std(R16, offsetInt, R1_SP);
-   offsetInt -= 8; std(R17, offsetInt, R1_SP);

    // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
    // bytes per iteration. The basic scheme is:
    // lvx: load vector (Big Endian needs reversal)
    // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
    // vxor: xor partial results together to get unroll_factor2 vectors

    // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.

    // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
!   const int unroll_factor = 2048;
!   const int unroll_factor2 = 8;

    // Support registers.
!   Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
!   Register num_bytes = R15,
!            loop_count = R16,
!            cur_const = R17;
    // Constant array for outer loop: unroll_factor2 - 1 registers,
    // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
    VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
                   consts1[] = { VR23, VR24 };
    // Data register arrays: 2 arrays with unroll_factor2 registers.
--- 4218,4248 ----
    offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
  #endif
    offsetInt -= 8; std(R14, offsetInt, R1_SP);
    offsetInt -= 8; std(R15, offsetInt, R1_SP);
    offsetInt -= 8; std(R16, offsetInt, R1_SP);

    // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
    // bytes per iteration. The basic scheme is:
    // lvx: load vector (Big Endian needs reversal)
    // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
    // vxor: xor partial results together to get unroll_factor2 vectors

    // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.

    // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
!   const int unroll_factor = CRC32_UNROLL_FACTOR,
!             unroll_factor2 = CRC32_UNROLL_FACTOR2;
!
!   const int outer_consts_size = (unroll_factor2 - 1) * 16,
!             inner_consts_size = (unroll_factor / unroll_factor2) * 16;

    // Support registers.
!   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
!   Register num_bytes = R14,
!            loop_count = R15,
!            cur_const = R16;
    // Constant array for outer loop: unroll_factor2 - 1 registers,
    // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
    VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
                   consts1[] = { VR23, VR24 };
    // Data register arrays: 2 arrays with unroll_factor2 registers.
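Note on the hunk above: the constant-table sizes follow directly from the two unroll factors. A minimal worked example, assuming CRC32_UNROLL_FACTOR and CRC32_UNROLL_FACTOR2 keep the values the old code hard-coded (2048 and 8); the layout comment reflects how later hunks address the table (Barrett constants behind the inner-loop constants).

    #include <cstdio>

    int main() {
      // Assumed values: the old code hard-coded 2048 and 8; the new code reads
      // them from CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2.
      const int unroll_factor  = 2048;
      const int unroll_factor2 = 8;

      const int outer_consts_size = (unroll_factor2 - 1) * 16;             // 7 * 16   = 112 bytes
      const int inner_consts_size = (unroll_factor / unroll_factor2) * 16; // 256 * 16 = 4096 bytes

      // Offsets within the table pointed to by 'constants':
      //   [0, outer_consts_size)                          outer-loop constants
      //   [outer_consts_size, ... + inner_consts_size)    inner-loop constants
      //   [outer_consts_size + inner_consts_size, ...)    Barrett reduction constants
      std::printf("outer: %d bytes, inner: %d bytes, bytes per full outer pass: %d\n",
                  outer_consts_size, inner_consts_size, 16 * unroll_factor);
      return 0;
    }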
*** 4266,4290 ****
    if (VM_Version::has_mfdscr()) {
      load_const_optimized(t0, VM_Version::_dscr_val | 7);
      mtdscr(t0);
    }

!   mtvrwz(VCRC, crc); // crc lives lives in VCRC, now

    for (int i = 1; i < unroll_factor2; ++i) {
      li(offs[i], 16 * i);
    }

    // Load consts for outer loop
    lvx(consts0[0], constants);
    for (int i = 1; i < unroll_factor2 - 1; ++i) {
      lvx(consts0[i], offs[i], constants);
    }
-   addi(constants, constants, (unroll_factor2 - 1) * 16);

    load_const_optimized(num_bytes, 16 * unroll_factor);
-   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.

    // Reuse data registers outside of the loop.
    VectorRegister Vtmp = data1[0];
    VectorRegister Vtmp2 = data1[1];
    VectorRegister zeroes = data1[2];
--- 4260,4282 ----
    if (VM_Version::has_mfdscr()) {
      load_const_optimized(t0, VM_Version::_dscr_val | 7);
      mtdscr(t0);
    }

!   mtvrwz(VCRC, crc); // crc lives in VCRC, now

    for (int i = 1; i < unroll_factor2; ++i) {
      li(offs[i], 16 * i);
    }

    // Load consts for outer loop
    lvx(consts0[0], constants);
    for (int i = 1; i < unroll_factor2 - 1; ++i) {
      lvx(consts0[i], offs[i], constants);
    }

    load_const_optimized(num_bytes, 16 * unroll_factor);

    // Reuse data registers outside of the loop.
    VectorRegister Vtmp = data1[0];
    VectorRegister Vtmp2 = data1[1];
    VectorRegister zeroes = data1[2];
*** 4308,4324 ****
  #endif

    cmpd(CCR0, len, num_bytes);
    blt(CCR0, L_last);

    // ********** Main loop start **********
    align(32);
    bind(L_outer_loop);

    // Begin of unrolled first iteration (no xor).
    lvx(data1[0], buf);
-   mr(cur_const, constants);
    for (int i = 1; i < unroll_factor2 / 2; ++i) {
      lvx(data1[i], offs[i], buf);
    }
    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
    lvx(consts1[0], cur_const);
--- 4300,4318 ----
  #endif

    cmpd(CCR0, len, num_bytes);
    blt(CCR0, L_last);

+   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
+   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
+
    // ********** Main loop start **********
    align(32);
    bind(L_outer_loop);

    // Begin of unrolled first iteration (no xor).
    lvx(data1[0], buf);
    for (int i = 1; i < unroll_factor2 / 2; ++i) {
      lvx(data1[i], offs[i], buf);
    }
    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
    lvx(consts1[0], cur_const);
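Note on the hunk above: loop_count is one less than the number of inner-loop double-iterations because the first double-iteration is emitted inline before the bdnz loop. A minimal sketch of that arithmetic, assuming unroll factors of 2048 and 8 (values not taken from the patch itself):

    // Assumed values of CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2.
    constexpr int unroll_factor  = 2048;
    constexpr int unroll_factor2 = 8;

    // One outer pass runs unroll_factor / (2 * unroll_factor2) double-iterations
    // of the inner loop; the first one is peeled (emitted before the bdnz loop),
    // so the count loaded into loop_count only covers the remaining ones.
    constexpr int double_iterations = unroll_factor / (2 * unroll_factor2); // 128
    constexpr int peeled_loop_count = double_iterations - 1;                // 127
    static_assert(peeled_loop_count == 127, "one double-iteration is peeled off");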
*** 4367,4376 ****
--- 4361,4372 ----
      }
      addi(buf, buf, 16 * unroll_factor2);
    }
    bdnz(L_inner_loop);

+   addi(cur_const, constants, outer_consts_size); // Reset
+
    // Tail of last iteration (no loads).
    for (int i = 0; i < unroll_factor2 / 2; ++i) {
      BE_swap_bytes(data1[i + unroll_factor2 / 2]);
      vxor(data0[i], data0[i], data1[i]);
      vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
*** 4395,4427 ****
    bge(CCR0, L_outer_loop);

    // Last chance with lower num_bytes.
    bind(L_last);
    srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
!   add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
    sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
    clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
!   subf(constants, R0, constants); // Point to constant to be used first.

    addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
    bgt(CCR0, L_outer_loop);

    // ********** Main loop end **********

- #undef BE_swap_bytes

    // Restore DSCR pre-fetch value.
    if (VM_Version::has_mfdscr()) {
      load_const_optimized(t0, VM_Version::_dscr_val);
      mtdscr(t0);
    }

    vspltisb(zeroes, 0);

    // Combine to 64 bit result.
    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.

    // Reduce to 32 bit CRC: Remainder by multiply-high.
!   lvx(Vtmp, barretConstants);
    vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
    vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
    vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
    vsldoi(Vtmp, zeroes, Vtmp, 8);
    vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
--- 4391,4455 ----
    bge(CCR0, L_outer_loop);

    // Last chance with lower num_bytes.
    bind(L_last);
    srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
!   // Point behind last const for inner loop.
!   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
    sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
    clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
!   subf(cur_const, R0, cur_const); // Point to constant to be used first.

    addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
    bgt(CCR0, L_outer_loop);

    // ********** Main loop end **********

    // Restore DSCR pre-fetch value.
    if (VM_Version::has_mfdscr()) {
      load_const_optimized(t0, VM_Version::_dscr_val);
      mtdscr(t0);
    }

+   // ********** Simple loop for remaining 16 byte blocks **********
+   {
+     Label L_loop, L_done;
+
+     srdi_(t0, len, 4); // 16 bytes per iteration
+     clrldi(len, len, 64-4);
+     beq(CCR0, L_done);
+
+     // Point to const (same as last const for inner loop).
+     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
+     mtctr(t0);
+     lvx(Vtmp2, cur_const);
+
+     align(32);
+     bind(L_loop);
+
+     lvx(Vtmp, buf);
+     addi(buf, buf, 16);
+     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
+     BE_swap_bytes(Vtmp);
+     vxor(VCRC, VCRC, Vtmp);
+     vpmsumw(VCRC, VCRC, Vtmp2);
+     bdnz(L_loop);
+
+     bind(L_done);
+   }
+   // ********** Simple loop end **********
+ #undef BE_swap_bytes
+
+   // Point to Barrett constants
+   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
+
    vspltisb(zeroes, 0);

    // Combine to 64 bit result.
    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.

    // Reduce to 32 bit CRC: Remainder by multiply-high.
!   lvx(Vtmp, cur_const);
    vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
    vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
    vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
    vsldoi(Vtmp, zeroes, Vtmp, 8);
    vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
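Note on the hunk above: the "last chance" pass, the new simple loop, and the byte loop back in kernel_crc32_vpmsum split whatever the main loop leaves over into 256-byte double-iterations, 16-byte blocks, and tail bytes. A minimal arithmetic sketch of that decomposition, assuming unroll_factor2 = 8 (so one double-iteration covers 16 * 2 * 8 = 256 bytes); the input length is made up for illustration:

    #include <cstdio>

    int main() {
      unsigned len = 5000;                        // example: bytes left after the 16 * unroll_factor main loop
      const unsigned dbl_iter_bytes = 16 * 2 * 8; // 256, assuming unroll_factor2 = 8

      unsigned loop_count = len / dbl_iter_bytes;        // srdi(loop_count, len, log2(256))
      unsigned num_bytes  = loop_count * dbl_iter_bytes; // clrrdi(num_bytes, len, log2(256))
      len -= num_bytes;                                  // covered by the "last chance" outer-loop pass

      unsigned blocks16 = len >> 4;                      // srdi_(t0, len, 4): simple 16-byte loop
      len &= 15;                                         // clrldi(len, len, 64-4): left for the byte loop

      std::printf("%u double-iterations, %u 16-byte blocks, %u tail bytes\n",
                  loop_count, blocks16, len);
      return 0;
    }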
*** 4443,4453 ****
    offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
  #endif
    offsetInt -= 8; ld(R14, offsetInt, R1_SP);
    offsetInt -= 8; ld(R15, offsetInt, R1_SP);
    offsetInt -= 8; ld(R16, offsetInt, R1_SP);
-   offsetInt -= 8; ld(R17, offsetInt, R1_SP);
  }

  void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
    assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
--- 4471,4480 ----