3842 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3843 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3844 // This value is mask inserted into dst with a [0..23] mask of 1s.
3845 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3846 // This value is mask inserted into dst with a [8..15] mask of 1s.
3847 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3848 assert_different_registers(dst, src);
3849
3850 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3851 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3852 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3853 }
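// Reference sketch (not part of the HotSpot sources; the helper name is hypothetical,
// <stdint.h> types assumed): the net effect of the three instructions above is a plain
// 32-bit byte swap of the low word, zero-extended to 64 bits.
static inline uint64_t load_reverse_32_ref(uint32_t src) {
  return ((src & 0x000000ffu) << 24) |   // byte 7 -> byte 4 (leftmost word byte)
         ((src & 0x0000ff00u) <<  8) |   // byte 6 -> byte 5
         ((src & 0x00ff0000u) >>  8) |   // byte 5 -> byte 6
         ((src & 0xff000000u) >> 24);    // byte 4 -> byte 7 (rightmost)
}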
3854
3855 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3856 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3857 // body size from 20 to 16 instructions.
3858 // Returns the offset that was used to calculate the address of column tc3.
3859 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3860 // at hand, the original table address can be easily reconstructed.
3861 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3862
3863 #ifdef VM_LITTLE_ENDIAN
3864 // This is what we implement (the DOLIT4 part):
3865 // =========================================================================
3866 // #define DOLIT4 c ^= *buf4++; \
3867 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3868 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3869 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3870 // =========================================================================
3871 const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3872 const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3873 const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3874 const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3875 #else
3876 // This is what we implement (the DOBIG4 part):
3877 // =========================================================================
3878 // #define DOBIG4 c ^= *++buf4; \
3879 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3880 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3881 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3882 // =========================================================================
3883 const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3884 const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3885 const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3886 const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3887 #endif
3888 assert_different_registers(table, tc0, tc1, tc2);
3889 assert(table == tc3, "must be!");
3890
3891 addi(tc0, table, ix0);
3892 addi(tc1, table, ix1);
3893 addi(tc2, table, ix2);
3894 if (ix3 != 0) addi(tc3, table, ix3);
3895
3896 return ix3;
3897 }
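// Reference sketch (not part of the HotSpot sources; names are hypothetical, <stdint.h>
// types assumed): one plausible use of the column addresses computed above, mirroring
// zlib's DOLIT4 from the comment. It assumes tc0..tc3 end up pointing at
// crc_table[3]..crc_table[0], as the little-endian offsets ix0..ix3 suggest.
static inline uint32_t dolit4_ref(uint32_t c, uint32_t next_word,
                                  const uint32_t* tc0, const uint32_t* tc1,
                                  const uint32_t* tc2, const uint32_t* tc3) {
  c ^= next_word;                     // fold the next 4 input bytes into the crc
  return tc0[ c        & 0xff] ^      // crc_table[3][c & 0xff]
         tc1[(c >>  8) & 0xff] ^      // crc_table[2][(c >> 8) & 0xff]
         tc2[(c >> 16) & 0xff] ^      // crc_table[1][(c >> 16) & 0xff]
         tc3[ c >> 24        ];       // crc_table[0][c >> 24]
}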
3898
3899 /**
3900 * uint32_t crc;
3901 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3902 */
3903 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3904 assert_different_registers(crc, table, tmp);
3905 assert_different_registers(val, table);
3906
3907 if (crc == val) { // Must rotate first to use the unmodified value.
3908 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3909 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3910 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3911 } else {
3912 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3913 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3914 }
3915 lwzx(tmp, table, tmp);
3916 xorr(crc, crc, tmp);
3917 }
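// Reference sketch (not part of the HotSpot sources; names are hypothetical): the scalar
// computation emitted above. The rlwinm merely pre-scales the byte index by 4 because
// the table entries are 4 bytes wide.
static inline uint32_t fold_byte_crc32_ref(uint32_t crc, uint32_t val, const uint32_t* table) {
  return table[val & 0xff] ^ (crc >> 8);
}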
3918
3919 /**
3920 * uint32_t crc;
3921 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3922 */
3923 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3924 fold_byte_crc32(crc, crc, table, tmp);
3925 }
3926
3927 /**
3928 * Emits code to update CRC-32 with a byte value according to constants in table.
3929 *
3930 * @param [in,out]crc Register containing the crc.
3931 * @param [in]val Register containing the byte to fold into the CRC.
3932 * @param [in]table Register containing the table of crc constants.
3933 *
3934 * uint32_t crc;
3935 * val = crc_table[(val ^ crc) & 0xFF];
3936 * crc = val ^ (crc >> 8);
3937 */
3938 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3939 BLOCK_COMMENT("update_byte_crc32:");
3940 xorr(val, val, crc);
3941 fold_byte_crc32(crc, val, table, val);
3942 }
3943
3944 /**
3945 * @param crc register containing existing CRC (32-bit)
3946 * @param buf register pointing to input byte buffer (byte*)
3947 * @param len register containing number of bytes
4098
4099 // Restore original table address for tailLoop.
4100 if (reconstructTableOffset != 0) {
4101 addi(table, table, -reconstructTableOffset);
4102 }
4103
4104 // Process last few (<complexThreshold) bytes of buffer.
4105 BIND(L_tail);
4106 update_byteLoop_crc32(crc, buf, len, table, data, false);
4107
4108 if (invertCRC) {
4109 nand(crc, crc, crc); // 1s complement of crc
4110 }
4111 BLOCK_COMMENT("} kernel_crc32_1word");
4112 }
4113
4114 /**
4115 * @param crc register containing existing CRC (32-bit)
4116 * @param buf register pointing to input byte buffer (byte*)
4117 * @param len register containing number of bytes
4118 * @param table register pointing to CRC table
4119 *
4120 * Uses R7_ARG5, R8_ARG6 as work registers.
4121 */
4122 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4123 Register t0, Register t1, Register t2, Register t3,
4124 bool invertCRC) {
4125 assert_different_registers(crc, buf, len, table);
4126
4127 Register data = t0; // Holds the current byte to be folded into crc.
4128
4129 BLOCK_COMMENT("kernel_crc32_1byte {");
4130
4131 if (invertCRC) {
4132 nand(crc, crc, crc); // 1s complement of crc
4133 }
4134
4135 // Process all bytes in a single-byte loop.
4136 update_byteLoop_crc32(crc, buf, len, table, data, true);
4137
4138 if (invertCRC) {
4139 nand(crc, crc, crc); // 1s complement of crc
4140 }
4141 BLOCK_COMMENT("} kernel_crc32_1byte");
4142 }
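// Reference sketch (not part of the HotSpot sources; names are hypothetical,
// <stdint.h>/<stddef.h> types assumed): the single-byte kernel above as plain C++,
// including the optional 1s-complement wrapping done by the nand instructions.
static inline uint32_t kernel_crc32_1byte_ref(uint32_t crc, const uint8_t* buf, size_t len,
                                              const uint32_t* table, bool invertCRC) {
  if (invertCRC) crc = ~crc;                            // leading nand
  for (size_t i = 0; i < len; i++) {
    crc = table[(buf[i] ^ crc) & 0xff] ^ (crc >> 8);    // update_byte_crc32 per byte
  }
  if (invertCRC) crc = ~crc;                            // trailing nand
  return crc;
}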
4143
4144 /**
4145 * @param crc register containing existing CRC (32-bit)
4146 * @param buf register pointing to input byte buffer (byte*)
4147 * @param len register containing number of bytes
4148 * @param table register pointing to CRC table
4149 * @param constants register pointing to CRC table for 128-bit aligned memory
4150 * @param t0-t5 temp registers
4151 */
4152 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
4153 Register constants, Register t0, Register t1, Register t2,
4154 Register t3, Register t4, Register t5, bool invertCRC) {
4155 assert_different_registers(crc, buf, len, table);
4156
4157 Label L_tail;
4158
4159 BLOCK_COMMENT("kernel_crc32_vpmsum {");
4160
4161 if (invertCRC) {
4162 nand(crc, crc, crc); // 1s complement of crc
4163 }
4164
4165 // Enforce 32 bit.
4166 clrldi(len, len, 32);
4167
4168 // Align if we have enough bytes for the fast version.
4169 const int alignment = 16,
4170 threshold = 32;
4171 Register prealign = t0;
4172
4173 neg(prealign, buf);
4174 addi(t1, len, -threshold);
4175 andi(prealign, prealign, alignment - 1);
4176 cmpw(CCR0, t1, prealign);
4177 blt(CCR0, L_tail); // len - prealign < threshold?
4178
4179 subf(len, prealign, len);
4180 update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4181
4182 // Calculate from first aligned address as far as possible.
4183 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);
4184
4185 // Remaining bytes.
4186 BIND(L_tail);
4187 update_byteLoop_crc32(crc, buf, len, table, t2, false);
4188
4189 if (invertCRC) {
4190 nand(crc, crc, crc); // 1s complement of crc
4191 }
4192
4193 BLOCK_COMMENT("} kernel_crc32_vpmsum");
4194 }
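// Reference sketch (not part of the HotSpot sources; names are hypothetical): the
// neg/andi sequence above computes the number of pre-alignment bytes, and the cmpw/blt
// guards the fast path.
static inline size_t crc32_prealign_ref(uintptr_t buf, uintptr_t alignment /* 16 */) {
  return (size_t)((0 - buf) & (alignment - 1));  // bytes up to the next aligned address
}
// The vector path is entered only if len - threshold >= prealign, i.e. at least
// 'threshold' (32) bytes remain for kernel_crc32_vpmsum_aligned after the byte prologue.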
4195
4196 /**
4197 * @param crc register containing existing CRC (32-bit)
4198 * @param buf register pointing to input byte buffer (byte*)
4199 * @param len register containing number of bytes (will get updated to remaining bytes)
4200 * @param constants register pointing to CRC table for 128-bit aligned memory
4201 * @param t0-t5 temp registers
4202 */
4203 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
4204 Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {
4205
4206 // Save non-volatile vector registers (frameless).
4207 Register offset = t1;
4208 int offsetInt = 0;
4209 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4210 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4211 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4212 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4213 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4214 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4215 #ifndef VM_LITTLE_ENDIAN
4216 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4217 #endif
4218 offsetInt -= 8; std(R14, offsetInt, R1_SP);
4219 offsetInt -= 8; std(R15, offsetInt, R1_SP);
4220 offsetInt -= 8; std(R16, offsetInt, R1_SP);
4221
4222 // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
4223 // bytes per iteration. The basic scheme is:
4224 // lvx: load vector (Big Endian needs reversal)
4225 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
4226 // vxor: xor partial results together to get unroll_factor2 vectors
4227
4228 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4229
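  // Reference sketch, kept in comments like the DOLIT4 block earlier in this file
  // (hypothetical name): each vpmsumw lane forms carry-less (GF(2)) 32x32-bit products, i.e.
  //   uint64_t clmul32_ref(uint32_t a, uint32_t b) {
  //     uint64_t r = 0;
  //     for (int i = 0; i < 32; i++) {
  //       if ((b >> i) & 1) r ^= (uint64_t)a << i;  // xor instead of add: no carries
  //     }
  //     return r;
  //   }
  // Multiplying the running CRC by a constant of the form x^N mod P(x) this way is what
  // allows whole vectors to be folded forward instead of shifting one byte at a time.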
4230 // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
4231 const int unroll_factor = CRC32_UNROLL_FACTOR,
4232 unroll_factor2 = CRC32_UNROLL_FACTOR2;
4233
4234 const int outer_consts_size = (unroll_factor2 - 1) * 16,
4235 inner_consts_size = (unroll_factor / unroll_factor2) * 16;
4236
4237 // Support registers.
4238 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
4239 Register num_bytes = R14,
4240 loop_count = R15,
4241 cur_const = R16;
4242 // Constant array for outer loop: unroll_factor2 - 1 registers,
4243 // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4244 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4245 consts1[] = { VR23, VR24 };
4246 // Data register arrays: 2 arrays with unroll_factor2 registers.
4247 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4248 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4249
4250 VectorRegister VCRC = data0[0];
4251 VectorRegister Vc = VR25;
4252 VectorRegister swap_bytes = VR26; // Only for Big Endian.
4253
4254 // We have at least 1 iteration (ensured by caller).
4255 Label L_outer_loop, L_inner_loop, L_last;
4256
4257 // If supported, set DSCR pre-fetch to deepest.
4258 if (VM_Version::has_mfdscr()) {
4259 load_const_optimized(t0, VM_Version::_dscr_val | 7);
4260 mtdscr(t0);
4261 }
4453 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.
4454 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.
4455
4456 // Move result. len is already updated.
4457 vsldoi(VCRC, VCRC, zeroes, 8);
4458 mfvrd(crc, VCRC);
4459
4460 // Restore non-volatile Vector registers (frameless).
4461 offsetInt = 0;
4462 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4463 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4464 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4465 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4466 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4467 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4468 #ifndef VM_LITTLE_ENDIAN
4469 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4470 #endif
4471 offsetInt -= 8; ld(R14, offsetInt, R1_SP);
4472 offsetInt -= 8; ld(R15, offsetInt, R1_SP);
4473 offsetInt -= 8; ld(R16, offsetInt, R1_SP);
4474 }
4475
4476 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
4477 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
4478 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
4479 : StubRoutines::crc_table_addr() , R0);
4480
4481 if (VM_Version::has_vpmsumb()) {
4482 load_const_optimized(t1, is_crc32c ? StubRoutines::ppc64::crc32c_constants()
4483 : StubRoutines::ppc64::crc_constants() , R0);
4484 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
4485 } else {
4486 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
4487 }
4488 }
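// Reference sketch (not part of the HotSpot sources; names are hypothetical,
// <stdint.h>/<stddef.h> types assumed): a bit-at-a-time model of the checksum the
// accelerated kernels compute over a whole buffer. CRC-32 uses the reflected polynomial
// 0xEDB88320, CRC-32C (Castagnoli) uses 0x82F63B78; the 1s-complement steps handled via
// invertCRC above are written out explicitly here.
static inline uint32_t crc32_bitwise_ref(uint32_t crc, const uint8_t* buf, size_t len,
                                         bool is_crc32c) {
  const uint32_t poly = is_crc32c ? 0x82F63B78u : 0xEDB88320u;
  crc = ~crc;
  for (size_t i = 0; i < len; i++) {
    crc ^= buf[i];
    for (int b = 0; b < 8; b++) {
      crc = (crc >> 1) ^ ((crc & 1u) ? poly : 0u);  // conditionally fold in the polynomial
    }
  }
  return ~crc;
}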
4489
4490 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4491 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
4492
4493 BLOCK_COMMENT("kernel_crc32_singleByte:");
4494 if (invertCRC) {
4495 nand(crc, crc, crc); // 1s complement of crc
4496 }
4497
4498 lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
4499 update_byte_crc32(crc, tmp, table);
4500
4501 if (invertCRC) {
4502 nand(crc, crc, crc); // 1s complement of crc
4503 }
3842 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3843 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3844 // This value is mask inserted into dst with a [0..23] mask of 1s.
3845 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3846 // This value is mask inserted into dst with a [8..15] mask of 1s.
3847 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3848 assert_different_registers(dst, src);
3849
3850 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3851 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3852 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3853 }
3854
3855 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3856 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3857 // body size from 20 to 16 instructions.
3858 // Returns the offset that was used to calculate the address of column tc3.
3859 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3860 // at hand, the original table address can be easily reconstructed.
3861 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3862 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3863
3864 // Point to 4-byte folding tables (byte-reversed for Big Endian).
3865 // Layout: See StubRoutines::generate_crc_constants.
3866 #ifdef VM_LITTLE_ENDIAN
3867 // This is what we implement (the DOLIT4 part):
3868 // =========================================================================
3869 // #define DOLIT4 c ^= *buf4++; \
3870 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3871 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3872 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3873 // =========================================================================
3874 const int ix0 = 3 * CRC32_TABLE_SIZE;
3875 const int ix1 = 2 * CRC32_TABLE_SIZE;
3876 const int ix2 = 1 * CRC32_TABLE_SIZE;
3877 const int ix3 = 0 * CRC32_TABLE_SIZE;
3878 #else
3879 // This is what we implement (the DOBIG4 part):
3880 // =========================================================================
3881 // #define DOBIG4 c ^= *++buf4; \
3882 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3883 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3884 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3885 // =========================================================================
3886 const int ix0 = 1 * CRC32_TABLE_SIZE;
3887 const int ix1 = 2 * CRC32_TABLE_SIZE;
3888 const int ix2 = 3 * CRC32_TABLE_SIZE;
3889 const int ix3 = 4 * CRC32_TABLE_SIZE;
3890 #endif
3891 assert_different_registers(table, tc0, tc1, tc2);
3892 assert(table == tc3, "must be!");
3893
3894 addi(tc0, table, ix0);
3895 addi(tc1, table, ix1);
3896 addi(tc2, table, ix2);
3897 if (ix3 != 0) addi(tc3, table, ix3);
3898
3899 return ix3;
3900 }
3901
3902 /**
3903 * uint32_t crc;
3904 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3905 */
3906 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3907 assert_different_registers(crc, table, tmp);
3908 assert_different_registers(val, table);
3909
3910 if (crc == val) { // Must rotate first to use the unmodified value.
3911 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3912 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3913 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3914 } else {
3915 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3916 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3917 }
3918 lwzx(tmp, table, tmp);
3919 xorr(crc, crc, tmp);
3920 }
3921
3922 /**
3923 * Emits code to update CRC-32 with a byte value according to constants in table.
3924 *
3925 * @param [in,out]crc Register containing the crc.
3926 * @param [in]val Register containing the byte to fold into the CRC.
3927 * @param [in]table Register containing the table of crc constants.
3928 *
3929 * uint32_t crc;
3930 * val = crc_table[(val ^ crc) & 0xFF];
3931 * crc = val ^ (crc >> 8);
3932 */
3933 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3934 BLOCK_COMMENT("update_byte_crc32:");
3935 xorr(val, val, crc);
3936 fold_byte_crc32(crc, val, table, val);
3937 }
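// Reference sketch (not part of the HotSpot sources; names are hypothetical): the pseudo
// code from the comment above as a compilable helper.
static inline uint32_t update_byte_crc32_ref(uint32_t crc, uint8_t val, const uint32_t* table) {
  return table[(val ^ crc) & 0xff] ^ (crc >> 8);
}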
3938
3939 /**
3940 * @param crc register containing existing CRC (32-bit)
3941 * @param buf register pointing to input byte buffer (byte*)
3942 * @param len register containing number of bytes
4093
4094 // Restore original table address for tailLoop.
4095 if (reconstructTableOffset != 0) {
4096 addi(table, table, -reconstructTableOffset);
4097 }
4098
4099 // Process last few (<complexThreshold) bytes of buffer.
4100 BIND(L_tail);
4101 update_byteLoop_crc32(crc, buf, len, table, data, false);
4102
4103 if (invertCRC) {
4104 nand(crc, crc, crc); // 1s complement of crc
4105 }
4106 BLOCK_COMMENT("} kernel_crc32_1word");
4107 }
4108
4109 /**
4110 * @param crc register containing existing CRC (32-bit)
4111 * @param buf register pointing to input byte buffer (byte*)
4112 * @param len register containing number of bytes
4113 * @param constants register pointing to precomputed constants
4114 * @param t0-t6 temp registers
4115 */
4116 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
4117 Register t0, Register t1, Register t2, Register t3,
4118 Register t4, Register t5, Register t6, bool invertCRC) {
4119 assert_different_registers(crc, buf, len, constants);
4120
4121 Label L_tail;
4122
4123 BLOCK_COMMENT("kernel_crc32_vpmsum {");
4124
4125 if (invertCRC) {
4126 nand(crc, crc, crc); // 1s complement of crc
4127 }
4128
4129 // Enforce 32 bit.
4130 clrldi(len, len, 32);
4131
4132 // Align if we have enough bytes for the fast version.
4133 const int alignment = 16,
4134 threshold = 32;
4135 Register prealign = t0;
4136
4137 neg(prealign, buf);
4138 addi(t1, len, -threshold);
4139 andi(prealign, prealign, alignment - 1);
4140 cmpw(CCR0, t1, prealign);
4141 blt(CCR0, L_tail); // len - prealign < threshold?
4142
4143 subf(len, prealign, len);
4144 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
4145
4146 // Calculate from first aligned address as far as possible.
4147 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
4148 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
4149 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
4150
4151 // Remaining bytes.
4152 BIND(L_tail);
4153 update_byteLoop_crc32(crc, buf, len, constants, t2, false);
4154
4155 if (invertCRC) {
4156 nand(crc, crc, crc); // 1s complement of crc
4157 }
4158
4159 BLOCK_COMMENT("} kernel_crc32_vpmsum");
4160 }
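// Layout sketch, inferred from the +/- CRC32_TABLE_SIZE adjustments above (hypothetical,
// not a HotSpot type): 'constants' appears to point at a combined area in which the
// byte-folding table is immediately followed by the 16-byte aligned vpmsum constants.
//   struct crc_constant_area_ref {
//     uint32_t byte_table[CRC32_TABLE_SIZE / sizeof(uint32_t)];  // used by update_byteLoop_crc32
//     // 16-byte aligned vector constants for kernel_crc32_vpmsum_aligned follow here
//   };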
4161
4162 /**
4163 * @param crc register containing existing CRC (32-bit)
4164 * @param buf register pointing to input byte buffer (byte*)
4165 * @param len register containing number of bytes (will get updated to remaining bytes)
4166 * @param constants register pointing to CRC table for 128-bit aligned memory
4167 * @param t0-t6 temp registers
4168 */
4169 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
4170 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
4171
4172 // Save non-volatile vector registers (frameless).
4173 Register offset = t1;
4174 int offsetInt = 0;
4175 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4176 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4177 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4178 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4179 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4180 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4181 #ifndef VM_LITTLE_ENDIAN
4182 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4183 #endif
4184 offsetInt -= 8; std(R14, offsetInt, R1_SP);
4185 offsetInt -= 8; std(R15, offsetInt, R1_SP);
4186
4187 // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
4188 // bytes per iteration. The basic scheme is:
4189 // lvx: load vector (Big Endian needs reversal)
4190 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
4191 // vxor: xor partial results together to get unroll_factor2 vectors
4192
4193 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4194
4195 // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
4196 const int unroll_factor = CRC32_UNROLL_FACTOR,
4197 unroll_factor2 = CRC32_UNROLL_FACTOR2;
4198
4199 const int outer_consts_size = (unroll_factor2 - 1) * 16,
4200 inner_consts_size = (unroll_factor / unroll_factor2) * 16;
4201
4202 // Support registers.
4203 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
4204 Register num_bytes = R14,
4205 loop_count = R15,
4206 cur_const = crc; // will live in VCRC
4207 // Constant array for outer loop: unroll_factor2 - 1 registers,
4208 // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4209 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4210 consts1[] = { VR23, VR24 };
4211 // Data register arrays: 2 arrays with unroll_factor2 registers.
4212 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4213 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4214
4215 VectorRegister VCRC = data0[0];
4216 VectorRegister Vc = VR25;
4217 VectorRegister swap_bytes = VR26; // Only for Big Endian.
4218
4219 // We have at least 1 iteration (ensured by caller).
4220 Label L_outer_loop, L_inner_loop, L_last;
4221
4222 // If supported, set DSCR pre-fetch to deepest.
4223 if (VM_Version::has_mfdscr()) {
4224 load_const_optimized(t0, VM_Version::_dscr_val | 7);
4225 mtdscr(t0);
4226 }
4418 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.
4419 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.
4420
4421 // Move result. len is already updated.
4422 vsldoi(VCRC, VCRC, zeroes, 8);
4423 mfvrd(crc, VCRC);
4424
4425 // Restore non-volatile Vector registers (frameless).
4426 offsetInt = 0;
4427 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4428 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4429 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4430 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4431 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4432 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4433 #ifndef VM_LITTLE_ENDIAN
4434 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4435 #endif
4436 offsetInt -= 8; ld(R14, offsetInt, R1_SP);
4437 offsetInt -= 8; ld(R15, offsetInt, R1_SP);
4438 }
4439
4440 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
4441 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
4442 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
4443 : StubRoutines::crc_table_addr() , R0);
4444
4445 if (VM_Version::has_vpmsumb()) {
4446 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
4447 } else {
4448 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
4449 }
4450 }
4451
4452 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4453 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
4454
4455 BLOCK_COMMENT("kernel_crc32_singleByte:");
4456 if (invertCRC) {
4457 nand(crc, crc, crc); // 1s complement of crc
4458 }
4459
4460 lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
4461 update_byte_crc32(crc, tmp, table);
4462
4463 if (invertCRC) {
4464 nand(crc, crc, crc); // 1s complement of crc
4465 }