
src/hotspot/cpu/ppc/macroAssembler_ppc.cpp

rev 53302 : 8216060: [PPC64] Vector CRC implementation should be used by interpreter and be faster for short arrays
Reviewed-by: gromero, goetz

*** 1,8 ****
  /*
!  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
!  * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.
--- 1,8 ----
  /*
!  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
!  * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.
*** 3972,3982 ****
  /**
   * Emits code to update CRC-32 with a 4-byte value according to constants in table
   * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
   */
! // A not on the lookup table address(es):
  // The lookup table consists of two sets of four columns each.
  // The columns {0..3} are used for little-endian machines.
  // The columns {4..7} are used for big-endian machines.
  // To save the effort of adding the column offset to the table address each time
  // a table element is looked up, it is possible to pass the pre-calculated
--- 3972,3982 ----
  /**
   * Emits code to update CRC-32 with a 4-byte value according to constants in table
   * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
   */
! // A note on the lookup table address(es):
  // The lookup table consists of two sets of four columns each.
  // The columns {0..3} are used for little-endian machines.
  // The columns {4..7} are used for big-endian machines.
  // To save the effort of adding the column offset to the table address each time
  // a table element is looked up, it is possible to pass the pre-calculated
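Note on the hunk above: the column layout that comment describes can be pictured with a small host-side sketch. This is an illustration only, not HotSpot code; the type and helper names are hypothetical, and the 256 entries per byte-indexed column are assumed.

    #include <stdint.h>

    // Hypothetical layout: two sets of four columns, 256 entries each.
    // Columns 0..3 serve little-endian lookups, columns 4..7 big-endian ones.
    typedef uint32_t crc_table_t[8][256];

    // Pre-calculate the address of one column so each per-byte lookup only
    // needs an index instead of "base + column_offset + index".
    static inline const uint32_t* crc_column(const crc_table_t* table, int column) {
      return &(*table)[column][0];
    }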
*** 4148,4219 ****
   * @param table register pointing to CRC table
   * @param constants register pointing to CRC table for 128-bit aligned memory
   * @param barretConstants register pointing to table for barrett reduction
   * @param t0-t4 temp registers
   */
! void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
!                                                Register constants, Register barretConstants,
!                                                Register t0, Register t1, Register t2, Register t3, Register t4,
!                                                bool invertCRC) {
    assert_different_registers(crc, buf, len, table);

!   Label L_alignedHead, L_tail;

!   BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");

-   // 1. ~c
    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }

!   // 2. use kernel_crc32_1word for short len
    clrldi(len, len, 32);
-   cmpdi(CCR0, len, 512);
-   blt(CCR0, L_tail);

!   // 3. calculate from 0 to first aligned address
!   const int alignment = 16;
    Register prealign = t0;

!   andi_(prealign, buf, alignment - 1);
!   beq(CCR0, L_alignedHead);
!   subfic(prealign, prealign, alignment);

    subf(len, prealign, len);
    update_byteLoop_crc32(crc, buf, prealign, table, t2, false);

!   // 4. calculate from first aligned address as far as possible
!   BIND(L_alignedHead);
!   kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);

!   // 5. remaining bytes
    BIND(L_tail);
!   Register tc0 = t4;
!   Register tc1 = constants;
!   Register tc2 = barretConstants;
!   kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);

-   // 6. ~c
    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }

!   BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
  }

  /**
   * @param crc register containing existing CRC (32-bit)
   * @param buf register pointing to input byte buffer (byte*)
   * @param len register containing number of bytes (will get updated to remaining bytes)
   * @param constants register pointing to CRC table for 128-bit aligned memory
   * @param barretConstants register pointing to table for barrett reduction
   * @param t0-t4 temp registers
-  * Precondition: len should be >= 512. Otherwise, nothing will be done.
   */
! void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
!                                                 Register constants, Register barretConstants,
!                                                 Register t0, Register t1, Register t2, Register t3, Register t4) {

    // Save non-volatile vector registers (frameless).
    Register offset = t1;
    int offsetInt = 0;
    offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
--- 4148,4211 ----
   * @param table register pointing to CRC table
   * @param constants register pointing to CRC table for 128-bit aligned memory
   * @param barretConstants register pointing to table for barrett reduction
   * @param t0-t4 temp registers
   */
! void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
!                                          Register constants, Register t0, Register t1, Register t2,
!                                          Register t3, Register t4, Register t5, bool invertCRC) {
    assert_different_registers(crc, buf, len, table);

!   Label L_tail;

!   BLOCK_COMMENT("kernel_crc32_vpmsum {");

    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }

!   // Enforce 32 bit.
    clrldi(len, len, 32);

!   // Align if we have enough bytes for the fast version.
!   const int alignment = 16,
!             threshold = 32;
    Register prealign = t0;

!   neg(prealign, buf);
!   addi(t1, len, -threshold);
!   andi(prealign, prealign, alignment - 1);
!   cmpw(CCR0, t1, prealign);
!   blt(CCR0, L_tail); // len - prealign < threshold?

    subf(len, prealign, len);
    update_byteLoop_crc32(crc, buf, prealign, table, t2, false);

!   // Calculate from first aligned address as far as possible.
!   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);

!   // Remaining bytes.
    BIND(L_tail);
!   update_byteLoop_crc32(crc, buf, len, table, t2, false);

    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }

!   BLOCK_COMMENT("} kernel_crc32_vpmsum");
  }

  /**
   * @param crc register containing existing CRC (32-bit)
   * @param buf register pointing to input byte buffer (byte*)
   * @param len register containing number of bytes (will get updated to remaining bytes)
   * @param constants register pointing to CRC table for 128-bit aligned memory
   * @param barretConstants register pointing to table for barrett reduction
   * @param t0-t4 temp registers
   */
! void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
!                                                  Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {

    // Save non-volatile vector registers (frameless).
    Register offset = t1;
    int offsetInt = 0;
    offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
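Note on the hunk above: the control flow that the new kernel_crc32_vpmsum emits is easier to follow in scalar form. Below is a minimal sketch of that flow, assuming invertCRC == true and using stand-in helpers (crc32_bytewise for the emitted byte loop, crc32_vpmsum_aligned_blocks for the vector kernel); it is an illustration under those assumptions, not the generated code.

    #include <stddef.h>
    #include <stdint.h>

    // Bytewise CRC-32 (reflected polynomial 0xEDB88320), standing in for the
    // emitted update_byteLoop_crc32.
    static uint32_t crc32_bytewise(uint32_t crc, const uint8_t* buf, size_t n) {
      for (size_t i = 0; i < n; i++) {
        crc ^= buf[i];
        for (int b = 0; b < 8; b++) crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
      }
      return crc;
    }

    // Stand-in for the vector kernel: consumes whole 16-byte blocks and reports
    // how many bytes it handled. (The real kernel uses vpmsum instructions.)
    static size_t crc32_vpmsum_aligned_blocks(uint32_t* crc, const uint8_t* buf, size_t len) {
      size_t consumed = len & ~(size_t)15;
      *crc = crc32_bytewise(*crc, buf, consumed);
      return consumed;
    }

    uint32_t crc32_update(uint32_t crc, const uint8_t* buf, size_t len) {
      const size_t alignment = 16, threshold = 32;
      size_t prealign = (size_t)(-(uintptr_t)buf) & (alignment - 1); // bytes up to 16-byte alignment

      crc = ~crc;                                   // invertCRC == true at entry
      if (len >= prealign + threshold) {            // enough bytes for the fast version?
        crc = crc32_bytewise(crc, buf, prealign);   // walk up to the first aligned address
        buf += prealign;  len -= prealign;
        size_t consumed = crc32_vpmsum_aligned_blocks(&crc, buf, len);
        buf += consumed;  len -= consumed;
      }
      crc = crc32_bytewise(crc, buf, len);          // remaining bytes, always bytewise
      return ~crc;                                  // invertCRC == true at exit
    }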
*** 4226,4254 ****
    offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
  #endif
    offsetInt -= 8; std(R14, offsetInt, R1_SP);
    offsetInt -= 8; std(R15, offsetInt, R1_SP);
    offsetInt -= 8; std(R16, offsetInt, R1_SP);
-   offsetInt -= 8; std(R17, offsetInt, R1_SP);

    // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
    // bytes per iteration. The basic scheme is:
    // lvx: load vector (Big Endian needs reversal)
    // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
    // vxor: xor partial results together to get unroll_factor2 vectors

    // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.

    // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
!   const int unroll_factor = 2048;
!   const int unroll_factor2 = 8;

    // Support registers.
!   Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
!   Register num_bytes = R15,
!            loop_count = R16,
!            cur_const = R17;
    // Constant array for outer loop: unroll_factor2 - 1 registers,
    // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
    VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
                   consts1[] = { VR23, VR24 };
    // Data register arrays: 2 arrays with unroll_factor2 registers.
--- 4218,4248 ----
    offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
  #endif
    offsetInt -= 8; std(R14, offsetInt, R1_SP);
    offsetInt -= 8; std(R15, offsetInt, R1_SP);
    offsetInt -= 8; std(R16, offsetInt, R1_SP);

    // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
    // bytes per iteration. The basic scheme is:
    // lvx: load vector (Big Endian needs reversal)
    // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
    // vxor: xor partial results together to get unroll_factor2 vectors

    // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.

    // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
!   const int unroll_factor = CRC32_UNROLL_FACTOR,
!             unroll_factor2 = CRC32_UNROLL_FACTOR2;
!
!   const int outer_consts_size = (unroll_factor2 - 1) * 16,
!             inner_consts_size = (unroll_factor / unroll_factor2) * 16;

    // Support registers.
!   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
!   Register num_bytes = R14,
!            loop_count = R15,
!            cur_const = R16;
    // Constant array for outer loop: unroll_factor2 - 1 registers,
    // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
    VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
                   consts1[] = { VR23, VR24 };
    // Data register arrays: 2 arrays with unroll_factor2 registers.
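Note on the hunk above: the constant-table sizes follow directly from the two unroll factors. A minimal worked example, assuming CRC32_UNROLL_FACTOR and CRC32_UNROLL_FACTOR2 keep the values the old code hard-coded (2048 and 8); the layout comment reflects how later hunks address the table (Barrett constants behind the inner-loop constants).

    #include <cstdio>

    int main() {
      // Assumed values: the old code hard-coded 2048 and 8; the new code reads
      // them from CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2.
      const int unroll_factor  = 2048;
      const int unroll_factor2 = 8;

      const int outer_consts_size = (unroll_factor2 - 1) * 16;             // 7 * 16   = 112 bytes
      const int inner_consts_size = (unroll_factor / unroll_factor2) * 16; // 256 * 16 = 4096 bytes

      // Offsets within the table pointed to by 'constants':
      //   [0, outer_consts_size)                          outer-loop constants
      //   [outer_consts_size, ... + inner_consts_size)    inner-loop constants
      //   [outer_consts_size + inner_consts_size, ...)    Barrett reduction constants
      std::printf("outer: %d bytes, inner: %d bytes, bytes per full outer pass: %d\n",
                  outer_consts_size, inner_consts_size, 16 * unroll_factor);
      return 0;
    }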
*** 4266,4290 ****
    if (VM_Version::has_mfdscr()) {
      load_const_optimized(t0, VM_Version::_dscr_val | 7);
      mtdscr(t0);
    }

!   mtvrwz(VCRC, crc); // crc lives lives in VCRC, now

    for (int i = 1; i < unroll_factor2; ++i) {
      li(offs[i], 16 * i);
    }

    // Load consts for outer loop
    lvx(consts0[0], constants);
    for (int i = 1; i < unroll_factor2 - 1; ++i) {
      lvx(consts0[i], offs[i], constants);
    }
-   addi(constants, constants, (unroll_factor2 - 1) * 16);

    load_const_optimized(num_bytes, 16 * unroll_factor);
-   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.

    // Reuse data registers outside of the loop.
    VectorRegister Vtmp = data1[0];
    VectorRegister Vtmp2 = data1[1];
    VectorRegister zeroes = data1[2];
--- 4260,4282 ----
    if (VM_Version::has_mfdscr()) {
      load_const_optimized(t0, VM_Version::_dscr_val | 7);
      mtdscr(t0);
    }

!   mtvrwz(VCRC, crc); // crc lives in VCRC, now

    for (int i = 1; i < unroll_factor2; ++i) {
      li(offs[i], 16 * i);
    }

    // Load consts for outer loop
    lvx(consts0[0], constants);
    for (int i = 1; i < unroll_factor2 - 1; ++i) {
      lvx(consts0[i], offs[i], constants);
    }

    load_const_optimized(num_bytes, 16 * unroll_factor);

    // Reuse data registers outside of the loop.
    VectorRegister Vtmp = data1[0];
    VectorRegister Vtmp2 = data1[1];
    VectorRegister zeroes = data1[2];
*** 4308,4324 ****
  #endif

    cmpd(CCR0, len, num_bytes);
    blt(CCR0, L_last);

    // ********** Main loop start **********
    align(32);
    bind(L_outer_loop);

    // Begin of unrolled first iteration (no xor).
    lvx(data1[0], buf);
-   mr(cur_const, constants);
    for (int i = 1; i < unroll_factor2 / 2; ++i) {
      lvx(data1[i], offs[i], buf);
    }
    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
    lvx(consts1[0], cur_const);
--- 4300,4318 ----
  #endif

    cmpd(CCR0, len, num_bytes);
    blt(CCR0, L_last);

+   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
+   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
+
    // ********** Main loop start **********
    align(32);
    bind(L_outer_loop);

    // Begin of unrolled first iteration (no xor).
    lvx(data1[0], buf);
    for (int i = 1; i < unroll_factor2 / 2; ++i) {
      lvx(data1[i], offs[i], buf);
    }
    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
    lvx(consts1[0], cur_const);
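Note on the hunk above: loop_count is one less than the number of inner-loop double-iterations because the first double-iteration is emitted inline before the bdnz loop. A minimal sketch of that arithmetic, assuming unroll factors of 2048 and 8 (values not taken from the patch itself):

    // Assumed values of CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2.
    constexpr int unroll_factor  = 2048;
    constexpr int unroll_factor2 = 8;

    // One outer pass runs unroll_factor / (2 * unroll_factor2) double-iterations
    // of the inner loop; the first one is peeled (emitted before the bdnz loop),
    // so the count loaded into loop_count only covers the remaining ones.
    constexpr int double_iterations = unroll_factor / (2 * unroll_factor2); // 128
    constexpr int peeled_loop_count = double_iterations - 1;                // 127
    static_assert(peeled_loop_count == 127, "one double-iteration is peeled off");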
*** 4367,4376 ****
--- 4361,4372 ----
      }
      addi(buf, buf, 16 * unroll_factor2);
    }
    bdnz(L_inner_loop);

+   addi(cur_const, constants, outer_consts_size); // Reset
+
    // Tail of last iteration (no loads).
    for (int i = 0; i < unroll_factor2 / 2; ++i) {
      BE_swap_bytes(data1[i + unroll_factor2 / 2]);
      vxor(data0[i], data0[i], data1[i]);
      vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
*** 4395,4427 ****
    bge(CCR0, L_outer_loop);

    // Last chance with lower num_bytes.
    bind(L_last);
    srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
!   add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
    sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
    clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
!   subf(constants, R0, constants); // Point to constant to be used first.

    addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
    bgt(CCR0, L_outer_loop);

    // ********** Main loop end **********

- #undef BE_swap_bytes

    // Restore DSCR pre-fetch value.
    if (VM_Version::has_mfdscr()) {
      load_const_optimized(t0, VM_Version::_dscr_val);
      mtdscr(t0);
    }

    vspltisb(zeroes, 0);

    // Combine to 64 bit result.
    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.

    // Reduce to 32 bit CRC: Remainder by multiply-high.
!   lvx(Vtmp, barretConstants);
    vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
    vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
    vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
    vsldoi(Vtmp, zeroes, Vtmp, 8);
    vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
--- 4391,4455 ----
    bge(CCR0, L_outer_loop);

    // Last chance with lower num_bytes.
    bind(L_last);
    srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
!   // Point behind last const for inner loop.
!   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
    sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
    clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
!   subf(cur_const, R0, cur_const); // Point to constant to be used first.

    addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
    bgt(CCR0, L_outer_loop);

    // ********** Main loop end **********

    // Restore DSCR pre-fetch value.
    if (VM_Version::has_mfdscr()) {
      load_const_optimized(t0, VM_Version::_dscr_val);
      mtdscr(t0);
    }

+   // ********** Simple loop for remaining 16 byte blocks **********
+   {
+     Label L_loop, L_done;
+
+     srdi_(t0, len, 4); // 16 bytes per iteration
+     clrldi(len, len, 64-4);
+     beq(CCR0, L_done);
+
+     // Point to const (same as last const for inner loop).
+     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
+     mtctr(t0);
+     lvx(Vtmp2, cur_const);
+
+     align(32);
+     bind(L_loop);
+
+     lvx(Vtmp, buf);
+     addi(buf, buf, 16);
+     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
+     BE_swap_bytes(Vtmp);
+     vxor(VCRC, VCRC, Vtmp);
+     vpmsumw(VCRC, VCRC, Vtmp2);
+     bdnz(L_loop);
+
+     bind(L_done);
+   }
+   // ********** Simple loop end **********
+ #undef BE_swap_bytes
+
+   // Point to Barrett constants
+   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
+
    vspltisb(zeroes, 0);

    // Combine to 64 bit result.
    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.

    // Reduce to 32 bit CRC: Remainder by multiply-high.
!   lvx(Vtmp, cur_const);
    vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
    vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
    vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
    vsldoi(Vtmp, zeroes, Vtmp, 8);
    vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
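Note on the hunk above: the "last chance" pass, the new simple loop, and the byte loop back in kernel_crc32_vpmsum split whatever the main loop leaves over into 256-byte double-iterations, 16-byte blocks, and tail bytes. A minimal arithmetic sketch of that decomposition, assuming unroll_factor2 = 8 (so one double-iteration covers 16 * 2 * 8 = 256 bytes); the input length is made up for illustration:

    #include <cstdio>

    int main() {
      unsigned len = 5000;                        // example: bytes left after the 16 * unroll_factor main loop
      const unsigned dbl_iter_bytes = 16 * 2 * 8; // 256, assuming unroll_factor2 = 8

      unsigned loop_count = len / dbl_iter_bytes;        // srdi(loop_count, len, log2(256))
      unsigned num_bytes  = loop_count * dbl_iter_bytes; // clrrdi(num_bytes, len, log2(256))
      len -= num_bytes;                                  // covered by the "last chance" outer-loop pass

      unsigned blocks16 = len >> 4;                      // srdi_(t0, len, 4): simple 16-byte loop
      len &= 15;                                         // clrldi(len, len, 64-4): left for the byte loop

      std::printf("%u double-iterations, %u 16-byte blocks, %u tail bytes\n",
                  loop_count, blocks16, len);
      return 0;
    }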
*** 4443,4453 ****
    offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
  #endif
    offsetInt -= 8; ld(R14, offsetInt, R1_SP);
    offsetInt -= 8; ld(R15, offsetInt, R1_SP);
    offsetInt -= 8; ld(R16, offsetInt, R1_SP);
-   offsetInt -= 8; ld(R17, offsetInt, R1_SP);
  }

  void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
    assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
--- 4471,4480 ----