
src/hotspot/cpu/ppc/macroAssembler_ppc.cpp

rev 53441 : 8217459: [PPC64] Cleanup non-vector version of CRC32
Reviewed-by:


3842 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3843 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3844 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3845 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3846 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3847 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3848   assert_different_registers(dst, src);
3849 
3850   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3851   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3852   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3853 }
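For reference, a minimal C++ model of what load_reverse_32 computes on the 32-bit word in the low half of the register (byte 4 is the word's leftmost byte, byte 7 its rightmost, as in the comments above). The name is ours, for illustration only; the net effect is a plain 32-bit byte swap.

  #include <cstdint>

  // Hypothetical C++ model of load_reverse_32: a 32-bit byte swap.
  static uint32_t load_reverse_32_model(uint32_t src) {
    return ((src & 0x000000FFu) << 24) |  // byte 7 -> byte 4 position
           ((src & 0x0000FF00u) <<  8) |  // byte 6 -> byte 5 position
           ((src & 0x00FF0000u) >>  8) |  // byte 5 -> byte 6 position
           ((src & 0xFF000000u) >> 24);   // byte 4 -> byte 7 position
  }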
3854 
3855 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3856 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3857 // body size from 20 to 16 instructions.
3858 // Returns the offset that was used to calculate the address of column tc3.
3859 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3860 // at hand, the original table address can be easily reconstructed.
3861 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3862
3863 #ifdef VM_LITTLE_ENDIAN
3864   // This is what we implement (the DOLIT4 part):
3865   // =========================================================================
3866   // #define DOLIT4 c ^= *buf4++; \
3867   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3868   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3869   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3870   // =========================================================================
3871   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3872   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3873   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3874   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3875 #else
3876   // This is what we implement (the DOBIG4 part):
3877   // =========================================================================
3878   // #define DOBIG4 c ^= *++buf4; \
3879   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3880   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3881   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3882   // =========================================================================
3883   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3884   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3885   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3886   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3887 #endif
3888   assert_different_registers(table, tc0, tc1, tc2);
3889   assert(table == tc3, "must be!");
3890 
3891   addi(tc0, table, ix0);
3892   addi(tc1, table, ix1);
3893   addi(tc2, table, ix2);
3894   if (ix3 != 0) addi(tc3, table, ix3);
3895 
3896   return ix3;
3897 }
3898 
3899 /**
3900  * uint32_t crc;
3901  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3902  */
3903 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3904   assert_different_registers(crc, table, tmp);
3905   assert_different_registers(val, table);
3906 
3907   if (crc == val) {                   // Must rotate first to use the unmodified value.
3908     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3909                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3910     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3911   } else {
3912     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3913     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3914   }
3915   lwzx(tmp, table, tmp);
3916   xorr(crc, crc, tmp);
3917 }
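A hedged C++ model of the fold step (name ours): the rlwinm above merely scales the byte by 4 to form a word offset for the lwzx table load, which in C++ is ordinary array indexing.

  #include <cstdint>

  // Hypothetical model of fold_byte_crc32: look up the low byte of val,
  // XOR with the crc shifted right by one byte (lwzx + xorr above).
  static uint32_t fold_byte_crc32_model(uint32_t crc, uint32_t val,
                                        const uint32_t* table) {
    return table[val & 0xFF] ^ (crc >> 8);
  }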
3918 
3919 /**
3920  * uint32_t crc;
3921  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3922  */
3923 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3924   fold_byte_crc32(crc, crc, table, tmp);
3925 }
3926 
3927 /**
3928  * Emits code to update CRC-32 with a byte value according to constants in table.
3929  *
3930  * @param [in,out] crc   Register containing the crc.
3931  * @param [in]     val   Register containing the byte to fold into the CRC.
3932  * @param [in]     table Register containing the table of crc constants.
3933  *
3934  * uint32_t crc;
3935  * val = crc_table[(val ^ crc) & 0xFF];
3936  * crc = val ^ (crc >> 8);
3937  */
3938 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3939   BLOCK_COMMENT("update_byte_crc32:");
3940   xorr(val, val, crc);
3941   fold_byte_crc32(crc, val, table, val);
3942 }
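Combined with a loop, update_byte_crc32 yields the classic table-driven byte-at-a-time CRC. A hedged C++ sketch of the loop that update_byteLoop_crc32 emits (name ours; the invertCRC pre/post complement is omitted):

  #include <cstddef>
  #include <cstdint>

  // Hypothetical model of the emitted single-byte loop.
  static uint32_t crc32_byteloop_model(uint32_t crc, const uint8_t* buf,
                                       size_t len, const uint32_t* table) {
    while (len-- > 0) {
      crc = table[(crc ^ *buf++) & 0xFF] ^ (crc >> 8);  // update_byte_crc32
    }
    return crc;
  }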
3943 
3944 /**
3945  * @param crc   register containing existing CRC (32-bit)
3946  * @param buf   register pointing to input byte buffer (byte*)
3947  * @param len   register containing number of bytes


4098 
4099   // Restore original table address for tailLoop.
4100   if (reconstructTableOffset != 0) {
4101     addi(table, table, -reconstructTableOffset);
4102   }
4103 
4104   // Process last few (<complexThreshold) bytes of buffer.
4105   BIND(L_tail);
4106   update_byteLoop_crc32(crc, buf, len, table, data, false);
4107 
4108   if (invertCRC) {
4109     nand(crc, crc, crc);                      // 1s complement of crc
4110   }
4111   BLOCK_COMMENT("} kernel_crc32_1word");
4112 }
4113 
4114 /**
4115  * @param crc   register containing existing CRC (32-bit)
4116  * @param buf   register pointing to input byte buffer (byte*)
4117  * @param len   register containing number of bytes
4118  * @param table register pointing to CRC table
4119  *
4120  * Uses R7_ARG5, R8_ARG6 as work registers.
4121  */
4122 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4123                                         Register t0,  Register t1,  Register t2,  Register t3,
4124                                         bool invertCRC) {
4125   assert_different_registers(crc, buf, len, table);
4126 
4127   Register  data = t0;                   // Holds the current byte to be folded into crc.
4128 
4129   BLOCK_COMMENT("kernel_crc32_1byte {");
4130 
4131   if (invertCRC) {
4132     nand(crc, crc, crc);                      // 1s complement of crc
4133   }
4134 
4135   // Process all bytes in a single-byte loop.
4136   update_byteLoop_crc32(crc, buf, len, table, data, true);
4137 
4138   if (invertCRC) {
4139     nand(crc, crc, crc);                      // 1s complement of crc
4140   }
4141   BLOCK_COMMENT("} kernel_crc32_1byte");
4142 }
4143 
4144 /**
4145  * @param crc             register containing existing CRC (32-bit)
4146  * @param buf             register pointing to input byte buffer (byte*)
4147  * @param len             register containing number of bytes
4148  * @param table           register pointing to CRC table
4149  * @param constants       register pointing to CRC table for 128-bit aligned memory
4150  * @param t0-t5           temp registers
4151  */
4152 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
4153                                          Register constants, Register t0, Register t1, Register t2,
4154                                          Register t3, Register t4, Register t5, bool invertCRC) {
4155   assert_different_registers(crc, buf, len, table);
4156 
4157   Label L_tail;
4158 
4159   BLOCK_COMMENT("kernel_crc32_vpmsum {");
4160 
4161   if (invertCRC) {
4162     nand(crc, crc, crc);                      // 1s complement of crc
4163   }
4164 
4165   // Enforce 32 bit.
4166   clrldi(len, len, 32);
4167 
4168   // Align if we have enough bytes for the fast version.
4169   const int alignment = 16,
4170             threshold = 32;
4171   Register prealign = t0;
4172 
4173   neg(prealign, buf);
4174   addi(t1, len, -threshold);
4175   andi(prealign, prealign, alignment - 1);
4176   cmpw(CCR0, t1, prealign);
4177   blt(CCR0, L_tail); // len - prealign < threshold?
4178 
4179   subf(len, prealign, len);
4180   update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4181 
4182   // Calculate from first aligned address as far as possible.
4183   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);
4184 
4185   // Remaining bytes.
4186   BIND(L_tail);
4187   update_byteLoop_crc32(crc, buf, len, table, t2, false);
4188 
4189   if (invertCRC) {
4190     nand(crc, crc, crc);                      // 1s complement of crc
4191   }
4192 
4193   BLOCK_COMMENT("} kernel_crc32_vpmsum");
4194 }
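The prealign computation above relies on two's complement: for a power-of-two alignment, (-buf) & (alignment - 1) is the distance to the next aligned address. A hedged C++ model of the guard (name ours):

  #include <cstdint>

  // Hypothetical model of the neg/andi/cmpw/blt sequence: take the fast
  // path only if at least 'threshold' bytes remain after aligning buf.
  static bool take_fast_path_model(uintptr_t buf, uint32_t len) {
    const uint32_t alignment = 16, threshold = 32;
    uint32_t prealign = (uint32_t)(0 - buf) & (alignment - 1);  // neg + andi
    return (int32_t)(len - threshold) >= (int32_t)prealign;     // branch to L_tail not taken
  }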
4195 
4196 /**
4197  * @param crc             register containing existing CRC (32-bit)
4198  * @param buf             register pointing to input byte buffer (byte*)
4199  * @param len             register containing number of bytes (will get updated to remaining bytes)
4200  * @param constants       register pointing to CRC table for 128-bit aligned memory
4201  * @param t0-t5           temp registers
4202  */
4203 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
4204     Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {
4205 
4206   // Save non-volatile vector registers (frameless).
4207   Register offset = t1;
4208   int offsetInt = 0;
4209   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4210   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4211   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4212   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4213   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4214   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4215 #ifndef VM_LITTLE_ENDIAN
4216   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4217 #endif
4218   offsetInt -= 8; std(R14, offsetInt, R1_SP);
4219   offsetInt -= 8; std(R15, offsetInt, R1_SP);
4220   offsetInt -= 8; std(R16, offsetInt, R1_SP);
4221 
4222   // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
4223   // bytes per iteration. The basic scheme is:
4224   // lvx: load vector (Big Endian needs reversal)
4225   // vpmsumw: carry-less 32 bit multiplications with a constant representing a large CRC shift
4226   // vxor: xor partial results together to get unroll_factor2 vectors
4227 
4228   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4229 
4230   // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
4231   const int unroll_factor = CRC32_UNROLL_FACTOR,
4232             unroll_factor2 = CRC32_UNROLL_FACTOR2;
4233 
4234   const int outer_consts_size = (unroll_factor2 - 1) * 16,
4235             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
4236 
4237   // Support registers.
4238   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
4239   Register num_bytes = R14,
4240            loop_count = R15,
4241            cur_const = R16;
4242   // Constant array for outer loop: unroll_factor2 - 1 registers,
4243   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4244   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4245                  consts1[] = { VR23, VR24 };
4246   // Data register arrays: 2 arrays with unroll_factor2 registers.
4247   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4248                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4249 
4250   VectorRegister VCRC = data0[0];
4251   VectorRegister Vc = VR25;
4252   VectorRegister swap_bytes = VR26; // Only for Big Endian.
4253 
4254   // We have at least 1 iteration (ensured by caller).
4255   Label L_outer_loop, L_inner_loop, L_last;
4256 
4257   // If supported, set DSCR pre-fetch to deepest.
4258   if (VM_Version::has_mfdscr()) {
4259     load_const_optimized(t0, VM_Version::_dscr_val | 7);
4260     mtdscr(t0);
4261   }


4453   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
4454   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
4455 
4456   // Move result. len is already updated.
4457   vsldoi(VCRC, VCRC, zeroes, 8);
4458   mfvrd(crc, VCRC);
4459 
4460   // Restore non-volatile Vector registers (frameless).
4461   offsetInt = 0;
4462   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4463   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4464   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4465   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4466   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4467   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4468 #ifndef VM_LITTLE_ENDIAN
4469   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4470 #endif
4471   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
4472   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
4473   offsetInt -= 8;  ld(R16, offsetInt, R1_SP);
4474 }
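The vpmsumw/vpmsumd instructions this kernel is built on are carry-less (GF(2)) multiplications: multiplying the CRC state by a precomputed constant representing x^k mod P shifts it across k message bits, which is what lets the unrolled lanes be combined with plain vxor. A hedged scalar model of the 32-bit primitive (name ours):

  #include <cstdint>

  // Hypothetical scalar model of a 32x32 -> 64 bit carry-less multiply,
  // the primitive behind vpmsumw: like integer multiplication, except
  // partial products are combined with XOR instead of addition.
  static uint64_t clmul32_model(uint32_t a, uint32_t b) {
    uint64_t acc = 0;
    for (int i = 0; i < 32; i++) {
      if ((b >> i) & 1) {
        acc ^= (uint64_t)a << i;
      }
    }
    return acc;
  }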
4475 
4476 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
4477                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
4478   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
4479                                      : StubRoutines::crc_table_addr()   , R0);
4480 
4481   if (VM_Version::has_vpmsumb()) {
4482     load_const_optimized(t1, is_crc32c ? StubRoutines::ppc64::crc32c_constants()
4483                                        : StubRoutines::ppc64::crc_constants()   , R0);
4484     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
4485   } else {
4486     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
4487   }
4488 }
4489 
4490 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4491   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4492 
4493   BLOCK_COMMENT("kernel_crc32_singleByte:");
4494   if (invertCRC) {
4495     nand(crc, crc, crc);                // 1s complement of crc
4496   }
4497 
4498   lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
4499   update_byte_crc32(crc, tmp, table);
4500 
4501   if (invertCRC) {
4502     nand(crc, crc, crc);                // 1s complement of crc
4503   }




3842 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3843 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3844 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3845 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3846 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3847 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3848   assert_different_registers(dst, src);
3849 
3850   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3851   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3852   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3853 }
3854 
3855 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3856 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3857 // body size from 20 to 16 instructions.
3858 // Returns the offset that was used to calculate the address of column tc3.
3859 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3860 // at hand, the original table address can be easily reconstructed.
3861 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3862   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3863 
3864   // Point to 4 byte folding tables (byte-reversed for Big Endian)
3865   // Layout: See StubRoutines::generate_crc_constants.
3866 #ifdef VM_LITTLE_ENDIAN
3867   // This is what we implement (the DOLIT4 part):
3868   // =========================================================================
3869   // #define DOLIT4 c ^= *buf4++; \
3870   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3871   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3872   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3873   // =========================================================================
3874   const int ix0 = 3 * CRC32_TABLE_SIZE;
3875   const int ix1 = 2 * CRC32_TABLE_SIZE;
3876   const int ix2 = 1 * CRC32_TABLE_SIZE;
3877   const int ix3 = 0 * CRC32_TABLE_SIZE;
3878 #else
3879   // This is what we implement (the DOBIG4 part):
3880   // =========================================================================
3881   // #define DOBIG4 c ^= *++buf4; \
3882   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3883   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3884   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3885   // =========================================================================
3886   const int ix0 = 1 * CRC32_TABLE_SIZE;
3887   const int ix1 = 2 * CRC32_TABLE_SIZE;
3888   const int ix2 = 3 * CRC32_TABLE_SIZE;
3889   const int ix3 = 4 * CRC32_TABLE_SIZE;
3890 #endif
3891   assert_different_registers(table, tc0, tc1, tc2);
3892   assert(table == tc3, "must be!");
3893 
3894   addi(tc0, table, ix0);
3895   addi(tc1, table, ix1);
3896   addi(tc2, table, ix2);
3897   if (ix3 != 0) addi(tc3, table, ix3);
3898 
3899   return ix3;
3900 }
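The quoted zlib DOLIT4 macro consumes one 32-bit word per step via four table columns; crc32_table_columns merely hoists the column base addresses out of the loop. A hedged C++ model of one little-endian step (the column array stands in for the memory addressed through tc0..tc3):

  #include <cstdint>

  // Hypothetical model of one DOLIT4 step (little endian).
  static uint32_t dolit4_model(uint32_t c, const uint32_t* buf4,
                               const uint32_t column[4][256]) {
    c ^= *buf4;  // fold in four message bytes at once
    return column[3][c & 0xFF]         ^ column[2][(c >> 8) & 0xFF] ^
           column[1][(c >> 16) & 0xFF] ^ column[0][c >> 24];
  }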
3901 
3902 /**
3903  * uint32_t crc;
3904  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3905  */
3906 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3907   assert_different_registers(crc, table, tmp);
3908   assert_different_registers(val, table);
3909 
3910   if (crc == val) {                   // Must rotate first to use the unmodified value.
3911     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3912                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3913     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3914   } else {
3915     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3916     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3917   }
3918   lwzx(tmp, table, tmp);
3919   xorr(crc, crc, tmp);
3920 }
3921 
3922 /**
3923  * Emits code to update CRC-32 with a byte value according to constants in table.
3924  *
3925  * @param [in,out] crc   Register containing the crc.
3926  * @param [in]     val   Register containing the byte to fold into the CRC.
3927  * @param [in]     table Register containing the table of crc constants.
3928  *
3929  * uint32_t crc;
3930  * val = crc_table[(val ^ crc) & 0xFF];
3931  * crc = val ^ (crc >> 8);
3932  */
3933 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3934   BLOCK_COMMENT("update_byte_crc32:");
3935   xorr(val, val, crc);
3936   fold_byte_crc32(crc, val, table, val);
3937 }
3938 
3939 /**
3940  * @param crc   register containing existing CRC (32-bit)
3941  * @param buf   register pointing to input byte buffer (byte*)
3942  * @param len   register containing number of bytes


4093 
4094   // Restore original table address for tailLoop.
4095   if (reconstructTableOffset != 0) {
4096     addi(table, table, -reconstructTableOffset);
4097   }
4098 
4099   // Process last few (<complexThreshold) bytes of buffer.
4100   BIND(L_tail);
4101   update_byteLoop_crc32(crc, buf, len, table, data, false);
4102 
4103   if (invertCRC) {
4104     nand(crc, crc, crc);                      // 1s complement of crc
4105   }
4106   BLOCK_COMMENT("} kernel_crc32_1word");
4107 }
4108 
4109 /**
4110  * @param crc             register containing existing CRC (32-bit)
4111  * @param buf             register pointing to input byte buffer (byte*)
4112  * @param len             register containing number of bytes
4113  * @param constants       register pointing to precomputed constants
4114  * @param t0-t6           temp registers
4115  */
4116 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
4117                                          Register t0, Register t1, Register t2, Register t3,
4118                                          Register t4, Register t5, Register t6, bool invertCRC) {
4119   assert_different_registers(crc, buf, len, constants);
4120 
4121   Label L_tail;
4122 
4123   BLOCK_COMMENT("kernel_crc32_vpmsum {");
4124 
4125   if (invertCRC) {
4126     nand(crc, crc, crc);                      // 1s complement of crc
4127   }
4128 
4129   // Enforce 32 bit.
4130   clrldi(len, len, 32);
4131 
4132   // Align if we have enough bytes for the fast version.
4133   const int alignment = 16,
4134             threshold = 32;
4135   Register prealign = t0;
4136 
4137   neg(prealign, buf);
4138   addi(t1, len, -threshold);
4139   andi(prealign, prealign, alignment - 1);
4140   cmpw(CCR0, t1, prealign);
4141   blt(CCR0, L_tail); // len - prealign < threshold?
4142 
4143   subf(len, prealign, len);
4144   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
4145 
4146   // Calculate from first aligned address as far as possible.
4147   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
4148   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
4149   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
4150 
4151   // Remaining bytes.
4152   BIND(L_tail);
4153   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
4154 
4155   if (invertCRC) {
4156     nand(crc, crc, crc);                      // 1s complement of crc
4157   }
4158 
4159   BLOCK_COMMENT("} kernel_crc32_vpmsum");
4160 }
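The single constants register works because the byte-fold table and the vpmsum constants are laid out back to back: the code advances the pointer by CRC32_TABLE_SIZE for the vector kernel and restores it for the tail loop. A minimal sketch of that assumption (the table size is inferred from the ix arithmetic above, 256 four-byte entries; the authoritative layout is in StubRoutines::generate_crc_constants):

  #include <cstddef>
  #include <cstdint>

  // Assumed size of one 256-entry table of 4-byte entries (illustrative only).
  static const size_t kCrc32TableSizeModel = 256 * sizeof(uint32_t);

  // Hypothetical model of the pointer adjustment: byte table at offset 0,
  // vpmsum constants immediately after it.
  static const uint8_t* vpmsum_constants_model(const uint8_t* constants) {
    return constants + kCrc32TableSizeModel;  // addi(constants, constants, CRC32_TABLE_SIZE)
  }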
4161 
4162 /**
4163  * @param crc             register containing existing CRC (32-bit)
4164  * @param buf             register pointing to input byte buffer (byte*)
4165  * @param len             register containing number of bytes (will get updated to remaining bytes)
4166  * @param constants       register pointing to CRC table for 128-bit aligned memory
4167  * @param t0-t6           temp registers
4168  */
4169 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
4170     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
4171 
4172   // Save non-volatile vector registers (frameless).
4173   Register offset = t1;
4174   int offsetInt = 0;
4175   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4176   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4177   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4178   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4179   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4180   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4181 #ifndef VM_LITTLE_ENDIAN
4182   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4183 #endif
4184   offsetInt -= 8; std(R14, offsetInt, R1_SP);
4185   offsetInt -= 8; std(R15, offsetInt, R1_SP);
4186 
4187   // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
4188   // bytes per iteration. The basic scheme is:
4189   // lvx: load vector (Big Endian needs reversal)
4190   // vpmsumw: carry-less 32 bit multiplications with a constant representing a large CRC shift
4191   // vxor: xor partial results together to get unroll_factor2 vectors
4192 
4193   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4194 
4195   // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
4196   const int unroll_factor = CRC32_UNROLL_FACTOR,
4197             unroll_factor2 = CRC32_UNROLL_FACTOR2;
4198 
4199   const int outer_consts_size = (unroll_factor2 - 1) * 16,
4200             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
4201 
4202   // Support registers.
4203   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
4204   Register num_bytes = R14,
4205            loop_count = R15,
4206            cur_const = crc; // will live in VCRC
4207   // Constant array for outer loop: unroll_factor2 - 1 registers,
4208   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4209   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4210                  consts1[] = { VR23, VR24 };
4211   // Data register arrays: 2 arrays with unroll_factor2 registers.
4212   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4213                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4214 
4215   VectorRegister VCRC = data0[0];
4216   VectorRegister Vc = VR25;
4217   VectorRegister swap_bytes = VR26; // Only for Big Endian.
4218 
4219   // We have at least 1 iteration (ensured by caller).
4220   Label L_outer_loop, L_inner_loop, L_last;
4221 
4222   // If supported, set DSCR pre-fetch to deepest.
4223   if (VM_Version::has_mfdscr()) {
4224     load_const_optimized(t0, VM_Version::_dscr_val | 7);
4225     mtdscr(t0);
4226   }


4418   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
4419   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
4420 
4421   // Move result. len is already updated.
4422   vsldoi(VCRC, VCRC, zeroes, 8);
4423   mfvrd(crc, VCRC);
4424 
4425   // Restore non-volatile Vector registers (frameless).
4426   offsetInt = 0;
4427   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4428   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4429   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4430   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4431   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4432   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4433 #ifndef VM_LITTLE_ENDIAN
4434   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4435 #endif
4436   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
4437   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
4438 }
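The epilogue above (multiply the quotient by the long poly, then vxor) is a Barrett-style reduction. A hedged scalar model of what it achieves: reducing a 64-bit GF(2) value modulo the 33-bit CRC polynomial. Plain polynomial long division yields the same remainder the two vpmsumd instructions compute.

  #include <cstdint>

  // Hypothetical model of the final reduction: remainder of a 64-bit GF(2)
  // polynomial modulo a 33-bit CRC polynomial, passed without its implicit
  // leading x^32 term. Subtraction in GF(2) is XOR.
  static uint32_t reduce_mod_poly_model(uint64_t v, uint32_t poly) {
    for (int i = 63; i >= 32; i--) {
      if ((v >> i) & 1) {
        v ^= ((uint64_t)poly << (i - 32)) | ((uint64_t)1 << i);
      }
    }
    return (uint32_t)v;  // remainder fits into 32 bit
  }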
4439 
4440 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
4441                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
4442   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
4443                                      : StubRoutines::crc_table_addr()   , R0);
4444 
4445   if (VM_Version::has_vpmsumb()) {
4446     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
4447   } else {
4448     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
4449   }
4450 }
4451 
4452 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4453   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4454 
4455   BLOCK_COMMENT("kernel_crc32_singleByte:");
4456   if (invertCRC) {
4457     nand(crc, crc, crc);                // 1s complement of crc
4458   }
4459 
4460   lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
4461   update_byte_crc32(crc, tmp, table);
4462 
4463   if (invertCRC) {
4464     nand(crc, crc, crc);                // 1s complement of crc
4465   }