
src/cpu/ppc/vm/macroAssembler_ppc.cpp

rev 11928 : 8164920: ppc: enhancement of CRC32 intrinsic
Reviewed-by: mdoerr
Contributed-by: Hiroshi H Horii <horii@jp.ibm.com>

*** 4330,4339 **** --- 4330,4898 ----
    update_byteLoop_crc32(crc, buf, len, table, data, true, true);
  
    BLOCK_COMMENT("} kernel_crc32_1byte");
  }
  
+ /**
+  * @param crc             register containing existing CRC (32-bit)
+  * @param buf             register pointing to input byte buffer (byte*)
+  * @param len             register containing number of bytes
+  * @param table           register pointing to CRC table
+  * @param constants       register pointing to CRC table for 128-bit aligned memory
+  * @param barretConstants register pointing to table for barrett reduction
+  * @param t0              volatile register
+  * @param t1              volatile register
+  * @param t2              volatile register
+  * @param t3              volatile register
+  * @param t4              volatile register
+  */
+ void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
+                                                 Register constants, Register barretConstants,
+                                                 Register t0, Register t1, Register t2, Register t3, Register t4) {
+   assert_different_registers(crc, buf, len, table);
+ 
+   Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
+ 
+   Register prealign  = t0;
+   Register postalign = t0;
+ 
+   BLOCK_COMMENT("kernel_crc32_1word_vpmsumd {");
+ 
+   // 1. Use kernel_crc32_1word for inputs shorter than 384 bytes.
+   clrldi(len, len, 32);
+   cmpdi(CCR0, len, 384);
+   bge(CCR0, L_start);
+ 
+   Register tc0 = t4;
+   Register tc1 = constants;
+   Register tc2 = barretConstants;
+   kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
+   b(L_end);
+ 
+   BIND(L_start);
+ 
+   // 2. ~c
+   nand(crc, crc, crc);
+ 
+   // 3. Calculate from 0 to the first 128-byte aligned address.
+   clrldi_(prealign, buf, 57);
+   beq(CCR0, L_alignedHead);
+ 
+   subfic(prealign, prealign, 128);
+ 
+   subf(len, prealign, len);
+   update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);
+ 
+   // 4. Calculate from the first to the last 128-byte aligned address.
+   BIND(L_alignedHead);
+ 
+   clrldi(postalign, len, 57);
+   subf(len, postalign, len);
+ 
+   // len is now a multiple of 128 bytes and at least 256 bytes.
+   kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
+ 
+   // 5. Calculate the remaining bytes.
+   cmpdi(CCR0, postalign, 0);
+   beq(CCR0, L_tail);
+ 
+   update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);
+ 
+   BIND(L_tail);
+ 
+   // 6. ~c
+   nand(crc, crc, crc);
+ 
+   BIND(L_end);
+ 
+   BLOCK_COMMENT("} kernel_crc32_1word_vpmsumd");
+ }
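
Steps 2-6 above are the usual head/body/tail split for a vectorized checksum: complement, consume bytes up to a 128-byte boundary, run the vector kernel over whole 128-byte blocks, consume the tail bytewise, complement again. A minimal scalar C++ sketch of the same control flow (crc32_bytewise and crc32_vector_aligned are hypothetical stand-ins for update_byteLoop_crc32 and kernel_crc32_1word_aligned):

    #include <cstddef>
    #include <cstdint>

    // Bit-at-a-time CRC-32 on pre-complemented state (reflected polynomial
    // 0xEDB88320); stands in for update_byteLoop_crc32 and advances buf.
    static uint32_t crc32_bytewise(uint32_t state, const uint8_t*& buf, size_t len) {
      while (len-- > 0) {
        state ^= *buf++;
        for (int i = 0; i < 8; i++)
          state = (state >> 1) ^ ((state & 1) ? 0xEDB88320u : 0u);
      }
      return state;
    }

    // Stand-in for kernel_crc32_1word_aligned; the real kernel uses vpmsumd.
    static uint32_t crc32_vector_aligned(uint32_t state, const uint8_t*& buf, size_t len) {
      return crc32_bytewise(state, buf, len);
    }

    uint32_t crc32_split(uint32_t crc, const uint8_t* buf, size_t len) {
      uint32_t state = ~crc;                    // step 2: ~c
      size_t prealign = (uintptr_t)buf & 127;   // offset past a 128-byte boundary
      if (prealign != 0) {                      // step 3: head, byte by byte
        size_t head = 128 - prealign;
        if (head > len) head = len;             // (real code is entered only for len >= 384)
        state = crc32_bytewise(state, buf, head);
        len -= head;
      }
      size_t postalign = len & 127;             // step 4: whole 128-byte blocks
      state = crc32_vector_aligned(state, buf, len - postalign);
      if (postalign != 0)                       // step 5: tail, byte by byte
        state = crc32_bytewise(state, buf, postalign);
      return ~state;                            // step 6: ~c
    }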
+ 
+ /**
+  * @param crc             register containing existing CRC (32-bit)
+  * @param buf             register pointing to input byte buffer (byte*)
+  * @param len             register containing number of bytes
+  * @param constants       register pointing to CRC table for 128-bit aligned memory
+  * @param barretConstants register pointing to table for barrett reduction
+  * @param t0              volatile register
+  * @param t1              volatile register
+  * @param t2              volatile register
+  */
+ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
+                                                 Register constants, Register barretConstants,
+                                                 Register t0, Register t1, Register t2) {
+   Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end,
+         L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
+   Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7,
+         L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
+   Label L_1, L_2, L_3, L_4;
+ 
+   Register rLoaded      = t0;
+   Register rTmp1        = t1;
+   Register rTmp2        = t2;
+   Register off16        = R22;
+   Register off32        = R23;
+   Register off48        = R24;
+   Register off64        = R25;
+   Register off80        = R26;
+   Register off96        = R27;
+   Register off112       = R28;
+   Register rIdx         = R29;
+   Register rMax         = R30;
+   Register constantsPos = R31;
+ 
+   VectorRegister mask_32bit = VR24;
+   VectorRegister mask_64bit = VR25;
+   VectorRegister zeroes     = VR26;
+   VectorRegister const1     = VR27;
+   VectorRegister const2     = VR28;
+ 
+   // Save non-volatile vector registers (frameless).
+   Register offset = t1;
+   int offsetInt = 0;
+   offsetInt -= 16; li(offset, -16); stvx(VR20, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
+   offsetInt -= 8; std(R22, offsetInt, R1_SP);
+   offsetInt -= 8; std(R23, offsetInt, R1_SP);
+   offsetInt -= 8; std(R24, offsetInt, R1_SP);
+   offsetInt -= 8; std(R25, offsetInt, R1_SP);
+   offsetInt -= 8; std(R26, offsetInt, R1_SP);
+   offsetInt -= 8; std(R27, offsetInt, R1_SP);
+   offsetInt -= 8; std(R28, offsetInt, R1_SP);
+   offsetInt -= 8; std(R29, offsetInt, R1_SP);
+   offsetInt -= 8; std(R30, offsetInt, R1_SP);
+   offsetInt -= 8; std(R31, offsetInt, R1_SP);
+ 
+   // Set constants.
+   li(off16, 16);
+   li(off32, 32);
+   li(off48, 48);
+   li(off64, 64);
+   li(off80, 80);
+   li(off96, 96);
+   li(off112, 112);
+ 
+   clrldi(crc, crc, 32);
+ 
+   vxor(zeroes, zeroes, zeroes);
+   vspltisw(VR0, -1);
+ 
+   vsldoi(mask_32bit, zeroes, VR0, 4);
+   vsldoi(mask_64bit, zeroes, VR0, -8);
+ 
+   // Get the initial value into VR8.
+   vxor(VR8, VR8, VR8);
+   mtvrd(VR8, crc);
+   vsldoi(VR8, zeroes, VR8, -8); // shift into bottom 32 bits
+ 
+   li(rLoaded, 0);
+ 
+   rldicr(rIdx, len, 0, 56);
+ 
+   {
+     BIND(L_1);
+     // Checksum in blocks of MAX_SIZE (32768).
+     lis(rMax, 0);
+     ori(rMax, rMax, 32768);
+     mr(rTmp2, rMax);
+     cmpd(CCR0, rIdx, rMax);
+     bgt(CCR0, L_2);
+     mr(rMax, rIdx);
+ 
+     BIND(L_2);
+     subf(rIdx, rMax, rIdx);
+ 
+     // Our main loop does 128 bytes at a time.
+     srdi(rMax, rMax, 7);
+ 
+     /*
+      * Work out the offset into the constants table to start at. Each
+      * constant is 16 bytes, and it is used against 128 bytes of input
+      * data - 128 / 16 = 8.
+      */
+     sldi(rTmp1, rMax, 4);
+     srdi(rTmp2, rTmp2, 3);
+     subf(rTmp1, rTmp1, rTmp2);
+ 
+     // We reduce our final 128 bytes in a separate step.
+     addi(rMax, rMax, -1);
+     mtctr(rMax);
+ 
+     // Find the start of our constants.
+     add(constantsPos, constants, rTmp1);
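
The offset arithmetic mirrors the constants-table layout: one 16-byte constant per 128 bytes of input, sized for the maximal 32768-byte chunk, so shorter chunks start deeper into the table and every chunk ends on its last entry. The same computation in plain C++:

    #include <cstddef>

    // Offset into the vpmsum constants table for one chunk. chunk_len is the
    // clamped amount processed this trip (<= 32768, a multiple of 128); one
    // 16-byte constant covers 128 bytes of input, so a full chunk consumes
    // (32768 / 128) * 16 = 32768 / 8 = 4096 bytes of constants.
    size_t constants_offset(size_t chunk_len) {
      size_t blocks = chunk_len >> 7; // srdi(rMax, rMax, 7)
      size_t used   = blocks << 4;    // sldi(rTmp1, rMax, 4)
      size_t total  = 32768 >> 3;     // srdi(rTmp2, rTmp2, 3), rTmp2 == 32768
      return total - used;            // subf(rTmp1, rTmp1, rTmp2)
    }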
+ 
+     // Zero VR0-VR7, which will contain our checksums.
+     vxor(VR0, VR0, VR0);
+     vxor(VR1, VR1, VR1);
+     vxor(VR2, VR2, VR2);
+     vxor(VR3, VR3, VR3);
+     vxor(VR4, VR4, VR4);
+     vxor(VR5, VR5, VR5);
+     vxor(VR6, VR6, VR6);
+     vxor(VR7, VR7, VR7);
+ 
+     lvx(const1, constantsPos);
+ 
+     /*
+      * If we are looping back to consume more data we use the values
+      * already in VR16-VR23.
+      */
+     cmpdi(CCR0, rLoaded, 1);
+     beq(CCR0, L_3);
+ 
+     {
+       // First warm up pass.
+       lvx(VR16, buf);
+       lvx(VR17, off16, buf);
+       lvx(VR18, off32, buf);
+       lvx(VR19, off48, buf);
+       lvx(VR20, off64, buf);
+       lvx(VR21, off80, buf);
+       lvx(VR22, off96, buf);
+       lvx(VR23, off112, buf);
+       addi(buf, buf, 8 * 16);
+ 
+       // XOR in initial value.
+       vxor(VR16, VR16, VR8);
+     }
+ 
+     BIND(L_3);
+     bdz(L_first_warm_up_done);
+ 
+     addi(constantsPos, constantsPos, 16);
+     lvx(const2, constantsPos);
+ 
+     // Second warm up pass.
+     vpmsumd(VR8, VR16, const1);
+     lvx(VR16, buf);
+ 
+     vpmsumd(VR9, VR17, const1);
+     lvx(VR17, off16, buf);
+ 
+     vpmsumd(VR10, VR18, const1);
+     lvx(VR18, off32, buf);
+ 
+     vpmsumd(VR11, VR19, const1);
+     lvx(VR19, off48, buf);
+ 
+     vpmsumd(VR12, VR20, const1);
+     lvx(VR20, off64, buf);
+ 
+     vpmsumd(VR13, VR21, const1);
+     lvx(VR21, off80, buf);
+ 
+     vpmsumd(VR14, VR22, const1);
+     lvx(VR22, off96, buf);
+ 
+     vpmsumd(VR15, VR23, const1);
+     lvx(VR23, off112, buf);
+ 
+     addi(buf, buf, 8 * 16);
+ 
+     bdz(L_first_cool_down);
+ 
+     /*
+      * Main loop. We modulo schedule it such that it takes three iterations
+      * to complete - first iteration load, second iteration vpmsum, third
+      * iteration xor.
+      */
+     {
+       BIND(L_4);
+       lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
+ 
+       vxor(VR0, VR0, VR8);
+       vpmsumd(VR8, VR16, const2);
+       lvx(VR16, buf);
+ 
+       vxor(VR1, VR1, VR9);
+       vpmsumd(VR9, VR17, const2);
+       lvx(VR17, off16, buf);
+ 
+       vxor(VR2, VR2, VR10);
+       vpmsumd(VR10, VR18, const2);
+       lvx(VR18, off32, buf);
+ 
+       vxor(VR3, VR3, VR11);
+       vpmsumd(VR11, VR19, const2);
+       lvx(VR19, off48, buf);
+       lvx(const2, constantsPos);
+ 
+       vxor(VR4, VR4, VR12);
+       vpmsumd(VR12, VR20, const1);
+       lvx(VR20, off64, buf);
+ 
+       vxor(VR5, VR5, VR13);
+       vpmsumd(VR13, VR21, const1);
+       lvx(VR21, off80, buf);
+ 
+       vxor(VR6, VR6, VR14);
+       vpmsumd(VR14, VR22, const1);
+       lvx(VR22, off96, buf);
+ 
+       vxor(VR7, VR7, VR15);
+       vpmsumd(VR15, VR23, const1);
+       lvx(VR23, off112, buf);
+ 
+       addi(buf, buf, 8 * 16);
+ 
+       bdnz(L_4);
+     }
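
The modulo-scheduled loop overlaps three iterations so the loads and the long-latency vpmsumd of different trips execute concurrently. A scalar sketch of that schedule for one of the eight parallel streams (clmul is a simplified stand-in for vpmsumd, keeping only the low 64 product bits):

    #include <cstddef>
    #include <cstdint>

    // Simplified stand-in for vpmsumd: carry-less multiply, low 64 product bits.
    static uint64_t clmul(uint64_t a, uint64_t b) {
      uint64_t r = 0;
      for (int i = 0; i < 64; i++)
        if ((b >> i) & 1ull) r ^= a << i;
      return r;
    }

    // Schedule of the main loop for one stream: each steady-state trip retires
    // the product of the previous trip (xor), multiplies the data loaded one
    // trip ago, and loads data for the next trip. Assumes n >= 2, as the real
    // kernel guarantees (at least two 128-byte blocks).
    uint64_t folded_stream(const uint64_t* buf, const uint64_t* consts, size_t n) {
      uint64_t acc  = 0;
      uint64_t data = buf[0];                  // first warm up pass: load only
      uint64_t prod = clmul(data, consts[0]);  // second warm up: multiply + reload
      data = buf[1];
      for (size_t i = 2; i < n; i++) {         // main loop: xor, vpmsum, load
        acc  ^= prod;
        prod  = clmul(data, consts[i - 1]);
        data  = buf[i];
      }
      acc ^= prod;                             // first cool down: retire + last multiply
      prod = clmul(data, consts[n - 1]);
      acc ^= prod;                             // second cool down: retire final product
      return acc;
    }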
+ 
+     BIND(L_first_cool_down);
+ 
+     // First cool down pass.
+     lvx(const1, constantsPos);
+     addi(constantsPos, constantsPos, 16);
+ 
+     vxor(VR0, VR0, VR8);
+     vpmsumd(VR8, VR16, const1);
+ 
+     vxor(VR1, VR1, VR9);
+     vpmsumd(VR9, VR17, const1);
+ 
+     vxor(VR2, VR2, VR10);
+     vpmsumd(VR10, VR18, const1);
+ 
+     vxor(VR3, VR3, VR11);
+     vpmsumd(VR11, VR19, const1);
+ 
+     vxor(VR4, VR4, VR12);
+     vpmsumd(VR12, VR20, const1);
+ 
+     vxor(VR5, VR5, VR13);
+     vpmsumd(VR13, VR21, const1);
+ 
+     vxor(VR6, VR6, VR14);
+     vpmsumd(VR14, VR22, const1);
+ 
+     vxor(VR7, VR7, VR15);
+     vpmsumd(VR15, VR23, const1);
+ 
+     BIND(L_second_cool_down);
+ 
+     // Second cool down pass.
+     vxor(VR0, VR0, VR8);
+     vxor(VR1, VR1, VR9);
+     vxor(VR2, VR2, VR10);
+     vxor(VR3, VR3, VR11);
+     vxor(VR4, VR4, VR12);
+     vxor(VR5, VR5, VR13);
+     vxor(VR6, VR6, VR14);
+     vxor(VR7, VR7, VR15);
+ 
+     /*
+      * vpmsumd produces a 96 bit result in the least significant bits
+      * of the register. Since we are bit reflected we have to shift it
+      * left 32 bits so it occupies the least significant bits in the
+      * bit reflected domain.
+      */
+     vsldoi(VR0, VR0, zeroes, 4);
+     vsldoi(VR1, VR1, zeroes, 4);
+     vsldoi(VR2, VR2, zeroes, 4);
+     vsldoi(VR3, VR3, zeroes, 4);
+     vsldoi(VR4, VR4, zeroes, 4);
+     vsldoi(VR5, VR5, zeroes, 4);
+     vsldoi(VR6, VR6, zeroes, 4);
+     vsldoi(VR7, VR7, zeroes, 4);
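
The bookkeeping shifts here (and the single-bit shift before the Barrett step below) follow from a standard identity of carry-less multiplication in the bit-reflected domain: the product of two reflected values is the reflected product, short by one bit position. A self-contained C++ check of that identity:

    #include <cassert>
    #include <cstdint>

    static uint64_t clmul32(uint32_t a, uint32_t b) { // 32x32 carry-less multiply
      uint64_t r = 0;
      for (int i = 0; i < 32; i++)
        if ((a >> i) & 1u) r ^= (uint64_t)b << i;
      return r;
    }

    static uint32_t reflect32(uint32_t v) {
      uint32_t r = 0;
      for (int i = 0; i < 32; i++) r |= ((v >> i) & 1u) << (31 - i);
      return r;
    }

    static uint64_t reflect64(uint64_t v) {
      uint64_t r = 0;
      for (int i = 0; i < 64; i++) r |= ((v >> i) & 1ull) << (63 - i);
      return r;
    }

    int main() {
      uint32_t a = 0x04C11DB7u, b = 0x1EDC6F41u; // arbitrary test values
      // A 32x32 carry-less product has at most 63 significant bits, so the
      // product of reflected inputs lands one bit short of the reflected
      // product - hence the compensating shifts in the kernel.
      assert(reflect64(clmul32(a, b)) == clmul32(reflect32(a), reflect32(b)) << 1);
      return 0;
    }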
+ 
+     // XOR with last 1024 bits.
+     lvx(VR8, buf);
+     lvx(VR9, off16, buf);
+     lvx(VR10, off32, buf);
+     lvx(VR11, off48, buf);
+     lvx(VR12, off64, buf);
+     lvx(VR13, off80, buf);
+     lvx(VR14, off96, buf);
+     lvx(VR15, off112, buf);
+     addi(buf, buf, 8 * 16);
+ 
+     vxor(VR16, VR0, VR8);
+     vxor(VR17, VR1, VR9);
+     vxor(VR18, VR2, VR10);
+     vxor(VR19, VR3, VR11);
+     vxor(VR20, VR4, VR12);
+     vxor(VR21, VR5, VR13);
+     vxor(VR22, VR6, VR14);
+     vxor(VR23, VR7, VR15);
+ 
+     li(rLoaded, 1);
+     cmpdi(CCR0, rIdx, 0);
+     addi(rIdx, rIdx, 128);
+     bne(CCR0, L_1);
+   }
+ 
+   // Work out how many bytes we have left.
+   andi_(len, len, 127);
+ 
+   // Calculate where in the constant table we need to start.
+   subfic(rTmp1, len, 128);
+   add(constantsPos, constantsPos, rTmp1);
+ 
+   // How many 16 byte chunks are in the tail.
+   srdi(rIdx, len, 4);
+   mtctr(rIdx);
+ 
+   /*
+    * Reduce the previously calculated 1024 bits to 64 bits, shifting
+    * 32 bits to include the trailing 32 bits of zeros.
+    */
+   lvx(VR0, constantsPos);
+   lvx(VR1, off16, constantsPos);
+   lvx(VR2, off32, constantsPos);
+   lvx(VR3, off48, constantsPos);
+   lvx(VR4, off64, constantsPos);
+   lvx(VR5, off80, constantsPos);
+   lvx(VR6, off96, constantsPos);
+   lvx(VR7, off112, constantsPos);
+   addi(constantsPos, constantsPos, 8 * 16);
+ 
+   vpmsumw(VR0, VR16, VR0);
+   vpmsumw(VR1, VR17, VR1);
+   vpmsumw(VR2, VR18, VR2);
+   vpmsumw(VR3, VR19, VR3);
+   vpmsumw(VR4, VR20, VR4);
+   vpmsumw(VR5, VR21, VR5);
+   vpmsumw(VR6, VR22, VR6);
+   vpmsumw(VR7, VR23, VR7);
+ 
+   // Now reduce the tail (0 - 112 bytes).
+   cmpdi(CCR0, rIdx, 0);
+   beq(CCR0, L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off16, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off32, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off48, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off64, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off80, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+   bdz(L_XOR);
+ 
+   lvx(VR16, buf); addi(buf, buf, 16);
+   lvx(VR17, off96, constantsPos);
+   vpmsumw(VR16, VR16, VR17);
+   vxor(VR0, VR0, VR16);
+ 
+   // Now XOR all the parallel chunks together.
+   BIND(L_XOR);
+   vxor(VR0, VR0, VR1);
+   vxor(VR2, VR2, VR3);
+   vxor(VR4, VR4, VR5);
+   vxor(VR6, VR6, VR7);
+ 
+   vxor(VR0, VR0, VR2);
+   vxor(VR4, VR4, VR6);
+ 
+   vxor(VR0, VR0, VR4);
+ 
+   b(L_barrett_reduction);
+ 
+   BIND(L_first_warm_up_done);
+   lvx(const1, constantsPos);
+   addi(constantsPos, constantsPos, 16);
+   vpmsumd(VR8,  VR16, const1);
+   vpmsumd(VR9,  VR17, const1);
+   vpmsumd(VR10, VR18, const1);
+   vpmsumd(VR11, VR19, const1);
+   vpmsumd(VR12, VR20, const1);
+   vpmsumd(VR13, VR21, const1);
+   vpmsumd(VR14, VR22, const1);
+   vpmsumd(VR15, VR23, const1);
+   b(L_second_cool_down);
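
What follows is the reflected Barrett reduction of the folded 64-bit value down to a 32-bit CRC. A scalar sketch of the same sequence, with MU and POLY standing in for the two precomputed constants loaded from barretConstants (their values are not shown in this hunk):

    #include <cstdint>

    // Low-64 carry-less multiply (one lane of vpmsumd).
    static uint64_t clmul(uint64_t a, uint64_t b) {
      uint64_t r = 0;
      for (int i = 0; i < 64; i++)
        if ((b >> i) & 1ull) r ^= a << i;
      return r;
    }

    // Reflected Barrett reduction of the folded 64-bit value x to a 32-bit CRC.
    // MU and POLY are assumed to hold the constants from barretConstants.
    uint32_t barrett_reduce(uint64_t x, uint64_t MU, uint64_t POLY) {
      uint64_t t = clmul(x & 0xffffffffull, MU);   // ma: bottom 32 bits of a times mu
      t = clmul(t & 0xffffffffull, POLY);          // qn: bottom 32 bits of ma times poly
      x ^= t;                                      // a - qn; subtraction is xor in GF(2)
      return (uint32_t)(x >> 32);                  // reflected result sits in the high half
    }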
+ 
+   BIND(L_barrett_reduction);
+ 
+   lvx(const1, barretConstants);
+   addi(barretConstants, barretConstants, 16);
+   lvx(const2, barretConstants);
+ 
+   vsldoi(VR1, VR0, VR0, -8);
+   vxor(VR0, VR0, VR1);    // XOR the two 64 bit results together.
+ 
+   // Shift left one bit.
+   vspltisb(VR1, 1);
+   vsl(VR0, VR0, VR1);
+ 
+   vand(VR0, VR0, mask_64bit);
+ 
+   /*
+    * The reflected version of Barrett reduction. Instead of bit
+    * reflecting our data (which is expensive to do), we bit reflect our
+    * constants and our algorithm, which means the intermediate data in
+    * our vector registers goes from 0-63 instead of 63-0. We can reflect
+    * the algorithm because we don't carry in mod 2 arithmetic.
+    */
+   vand(VR1, VR0, mask_32bit); // bottom 32 bits of a
+   vpmsumd(VR1, VR1, const1);  // ma
+   vand(VR1, VR1, mask_32bit); // bottom 32 bits of ma
+   vpmsumd(VR1, VR1, const2);  // qn
+   vxor(VR0, VR0, VR1);        // a - qn, subtraction is xor in GF(2)
+ 
+   /*
+    * Since we are bit reflected, the result (i.e. the low 32 bits) is in
+    * the high 32 bits. We just need to shift it left 4 bytes:
+    * V0 [ 0 1 X 3 ]
+    * V0 [ 0 X 2 3 ]
+    */
+   vsldoi(VR0, VR0, zeroes, 4); // shift result into top 64 bits
+ 
+   // Get the final result into crc.
+   mfvrd(crc, VR0);
+ 
+   BIND(L_end);
+ 
+   // Restore non-volatile vector registers (frameless).
+   offsetInt = 0;
+   offsetInt -= 16; li(offset, -16); lvx(VR20, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
+   offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
+   offsetInt -= 8; ld(R22, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R23, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R24, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R25, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R26, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R27, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R28, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R29, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R30, offsetInt, R1_SP);
+   offsetInt -= 8; ld(R31, offsetInt, R1_SP);
+ }
+ 
  void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
    assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
  
    BLOCK_COMMENT("kernel_crc32_singleByte:");
    nand(crc, crc, crc); // ~c
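
The surrounding kernels all use the same convention visible in this trailing context: the CRC is kept complemented (nand(crc, crc, crc), i.e. ~c) while being updated. For reference, a one-byte table-driven update with that convention, assuming the standard 256-entry CRC-32 lookup table:

    #include <cstdint>

    // One-byte, table-driven CRC-32 update with the ~c convention: complement
    // on entry, one lookup per byte, complement on exit. 'table' is assumed
    // to be the standard 256-entry CRC-32 table the byte kernels index into.
    uint32_t crc32_update_byte(uint32_t crc, uint8_t b, const uint32_t table[256]) {
      uint32_t c = ~crc;                     // ~c (cf. nand(crc, crc, crc))
      c = (c >> 8) ^ table[(c ^ b) & 0xff];
      return ~c;                             // ~c
    }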