
src/cpu/x86/vm/macroAssembler_x86.cpp


        

*** 8253,8266 ****
  }
  
  // Search for Non-ASCII character (Negative byte value) in a byte array,
  // return true if it has any and false otherwise.
  void MacroAssembler::has_negatives(Register ary1, Register len,
    Register result, Register tmp1,
    XMMRegister vec1, XMMRegister vec2) {
- 
    // rsi: byte array
    // rcx: len
    // rax: result
    ShortBranchVerifier sbv(this);
    assert_different_registers(ary1, len, result, tmp1);
--- 8253,8275 ----
  }
  
  // Search for Non-ASCII character (Negative byte value) in a byte array,
  // return true if it has any and false otherwise.
+ // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
+ // @HotSpotIntrinsicCandidate
+ // private static boolean hasNegatives(byte[] ba, int off, int len) {
+ //   for (int i = off; i < off + len; i++) {
+ //     if (ba[i] < 0) {
+ //       return true;
+ //     }
+ //   }
+ //   return false;
+ // }
  void MacroAssembler::has_negatives(Register ary1, Register len,
    Register result, Register tmp1,
    XMMRegister vec1, XMMRegister vec2) {
    // rsi: byte array
    // rcx: len
    // rax: result
    ShortBranchVerifier sbv(this);
    assert_different_registers(ary1, len, result, tmp1);
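
Note (not part of the webrev): a standalone C++ sketch of the predicate this stub computes. A byte is "negative" exactly when its sign bit is set, so testing eight bytes at a time against an 0x80 mask mirrors what the vectorized compare against a zero vector does 64 bytes at a time. The name has_negatives_ref is illustrative only.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Returns true if any byte in ba[0..len) is negative (high bit set).
    bool has_negatives_ref(const int8_t* ba, size_t len) {
      const uint64_t sign_bits = 0x8080808080808080ULL;
      size_t i = 0;
      for (; i + 8 <= len; i += 8) {
        uint64_t word;
        std::memcpy(&word, ba + i, sizeof(word));  // unaligned-safe 8-byte load
        if (word & sign_bits) {
          return true;                             // some byte has its sign bit set
        }
      }
      for (; i < len; i++) {                       // scalar tail
        if (ba[i] < 0) return true;
      }
      return false;
    }
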
*** 8269,8281 ****
  
    // len == 0
    testl(len, len);
    jcc(Assembler::zero, FALSE_LABEL);
  
    movl(result, len); // copy
  
!   if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
  
      // Compare 32-byte vectors
      andl(result, 0x0000001f);   //   tail count (in bytes)
--- 8278,8370 ----
  
    // len == 0
    testl(len, len);
    jcc(Assembler::zero, FALSE_LABEL);
  
+   if ((UseAVX > 2) && // AVX512
+       VM_Version::supports_avx512vlbw() &&
+       VM_Version::supports_bmi2()) {
+ 
+     set_vector_masking();  // opening of the stub context for programming mask registers
+ 
+     Label test_64_loop, test_tail;
+     Register tmp3_aliased = len;
+ 
+     movl(tmp1, len);
+     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
+ 
+     andl(tmp1, 64 - 1);    // tail count (in chars) 0x3F
+     andl(len, ~(64 - 1));  // vector count (in chars)
+     jccb(Assembler::zero, test_tail);
+ 
+     lea(ary1, Address(ary1, len, Address::times_1));
+     negptr(len);
+ 
+     bind(test_64_loop);
+     // Check whether our 64 byte-sized elements contain negatives
+     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
+     kortestql(k2, k2);
+     jcc(Assembler::notZero, TRUE_LABEL);
+ 
+     addptr(len, 64);
+     jccb(Assembler::notZero, test_64_loop);
+ 
+     bind(test_tail);
+     // bail out when there is nothing to be done
+     testl(tmp1, -1);
+     jcc(Assembler::zero, FALSE_LABEL);
+ 
+     // Save k1
+     kmovql(k3, k1);
+ 
+     // ~(~0 << len) applied up to two times (for 32-bit scenario)
+ #ifdef _LP64
+     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
+     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
+     notq(tmp3_aliased);
+     kmovql(k1, tmp3_aliased);
+ #else
+     Label k_init;
+     jmp(k_init);
+ 
+     // We cannot read 64 bits from a general purpose register here, thus we
+     // move the data required to compose 64 1's into the instruction stream:
+     // a 64-byte-wide series of elements 0..63 that is later used as the
+     // compare target for the tail count contained in the tmp1 register.
+     // The result is the k1 register holding tmp1 consecutive 1's, counting
+     // from the least significant bit.
+     address tmp = pc();
+     emit_int64(0x0706050403020100);
+     emit_int64(0x0F0E0D0C0B0A0908);
+     emit_int64(0x1716151413121110);
+     emit_int64(0x1F1E1D1C1B1A1918);
+     emit_int64(0x2726252423222120);
+     emit_int64(0x2F2E2D2C2B2A2928);
+     emit_int64(0x3736353433323130);
+     emit_int64(0x3F3E3D3C3B3A3938);
+ 
+     bind(k_init);
+     lea(len, InternalAddress(tmp));
+     // create mask to test for negative byte inside a vector
+     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
+     evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
+ 
+ #endif
+     evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
+     ktestq(k2, k1);
+     // Restore k1
+     kmovql(k1, k3);
+     jcc(Assembler::notZero, TRUE_LABEL);
+ 
+     jmp(FALSE_LABEL);
+ 
+     clear_vector_masking();  // closing of the stub context for programming mask registers
+   } else {
      movl(result, len); // copy
  
!     if (UseAVX == 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
  
      // Compare 32-byte vectors
      andl(result, 0x0000001f);   //   tail count (in bytes)
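
Note (not part of the webrev): the tail handling above builds a k-mask with exactly the tail count's worth of low bits set, so the final masked compare only inspects the remaining elements. A minimal C++ sketch of both constructions, assuming the tail count is in [0, 63] as in the stub (helper names are illustrative):

    #include <cstdint>

    // 64-bit path: the shlxq/notq sequence, i.e. ~(~0 << n).
    uint64_t tail_mask_shift(unsigned n) {   // n in [0, 63]
      return ~(~0ULL << n);
    }

    // 32-bit path: compare a 0..63 byte ramp (emitted into the code stream)
    // against the broadcast tail count; lane i is set exactly when i < n.
    uint64_t tail_mask_ramp(unsigned n) {    // n in [0, 63]
      uint64_t mask = 0;
      for (unsigned i = 0; i < 64; i++) {
        if (i < n) {
          mask |= 1ULL << i;                 // what evpcmpgtb(k1, bcast(n), ramp) produces
        }
      }
      return mask;
    }

Both helpers produce the same value; the stub picks one or the other depending on whether a 64-bit general purpose register is available.
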
*** 8341,8351 ****
  
      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
! 
    // Compare 4-byte vectors
    andl(len, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  
    lea(ary1, Address(ary1, len, Address::times_1));
--- 8430,8440 ----
  
      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
!   }
    // Compare 4-byte vectors
    andl(len, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  
    lea(ary1, Address(ary1, len, Address::times_1));
*** 8835,8848 ****
--- 8924,8950 ----
    }
    BIND(L_exit);
  }
  
  // encode char[] to byte[] in ISO_8859_1
+ //@HotSpotIntrinsicCandidate
+ //private static int implEncodeISOArray(byte[] sa, int sp,
+ //                                      byte[] da, int dp, int len) {
+ //  int i = 0;
+ //  for (; i < len; i++) {
+ //    char c = StringUTF16.getChar(sa, sp++);
+ //    if (c > '\u00FF')
+ //      break;
+ //    da[dp++] = (byte)c;
+ //  }
+ //  return i;
+ //}
  void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
    XMMRegister tmp1Reg, XMMRegister tmp2Reg, XMMRegister tmp3Reg,
    XMMRegister tmp4Reg, Register tmp5, Register result) {
+   // rsi: src
    // rdi: dst
    // rdx: len
    // rcx: tmp5
    // rax: result
*** 8853,8862 ****
--- 8955,8965 ----
    // set result
    xorl(result, result);
    // check for zero length
    testl(len, len);
    jcc(Assembler::zero, L_done);
+   movl(result, len);
  
    // Setup pointers
    lea(src, Address(src, len, Address::times_2)); // char[]
    lea(dst, Address(dst, len, Address::times_1)); // byte[]
*** 8961,8970 ****
--- 9064,9074 ----
      addptr(len, 1);
      jccb(Assembler::less, L_copy_1_char);
  
    bind(L_copy_1_char_exit);
    addptr(result, len); // len is negative count of not processed elements
+   bind(L_done);
  }
  
  #ifdef _LP64
  /**
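
Note (not part of the webrev): the three encode_iso_array hunks above pre-load result with the total length and add an L_done label so the zero-length path can skip the copy entirely. Assuming result still holds the total length when the loops finish, the final addptr(result, len) reduces to the arithmetic sketched below (names illustrative):

    // After the copy loops, 'len' has counted up from -(remaining) toward 0,
    // i.e. it holds the negative count of not-processed chars. Adding it to
    // the total length gives the number of chars actually encoded.
    int encoded_chars(int total_len, int remaining_not_processed) {
      int len_reg = -remaining_not_processed;  // value left in the len register
      int result_reg = total_len;              // set by the added movl(result, len)
      return result_reg + len_reg;             // addptr(result, len)
    }
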
*** 10781,10797 ****
  }
  #endif // LP64
  
  #undef BIND
  #undef BLOCK_COMMENT
  
- 
  // Compress char[] array to byte[].
  void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
    XMMRegister tmp1Reg, XMMRegister tmp2Reg, XMMRegister tmp3Reg,
    XMMRegister tmp4Reg, Register tmp5, Register result) {
!   Label copy_chars_loop, return_length, return_zero, done;
  
    // rsi: src
    // rdi: dst
    // rdx: len
    // rcx: tmp5
--- 10885,10912 ----
  }
  #endif // LP64
  
  #undef BIND
  #undef BLOCK_COMMENT
  
  // Compress char[] array to byte[].
+ // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
+ // @HotSpotIntrinsicCandidate
+ // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
+ //   for (int i = 0; i < len; i++) {
+ //     int c = src[srcOff++];
+ //     if (c >>> 8 != 0) {
+ //       return 0;
+ //     }
+ //     dst[dstOff++] = (byte)c;
+ //   }
+ //   return len;
+ // }
  void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
    XMMRegister tmp1Reg, XMMRegister tmp2Reg, XMMRegister tmp3Reg,
    XMMRegister tmp4Reg, Register tmp5, Register result) {
!   Label copy_chars_loop, return_length, return_zero, done, below_threshold;
  
    // rsi: src
    // rdi: dst
    // rdx: len
    // rcx: tmp5
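
Note (not part of the webrev): the AVX-512 loop added in the next hunk applies the compress() contract a chunk at a time — a 32-char block is stored only if every char in it fits in a byte, otherwise control falls into the partial-store path. A scalar C++ sketch of that per-chunk test (illustrative only):

    #include <cstdint>

    // True when all 32 chars of the chunk are <= 0xFF, i.e. the whole chunk
    // can be truncated to bytes; mirrors the evpcmpuw + kortest check below.
    bool chunk_is_latin1(const uint16_t* chunk) {
      for (int i = 0; i < 32; i++) {
        if (chunk[i] > 0xFF) {
          return false;
        }
      }
      return true;
    }
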
*** 10804,10818 ****
--- 10919,11063 ----
    assert(len != result, "");
  
    // save length for return
    push(len);
  
+   if ((UseAVX > 2) && // AVX512
+       VM_Version::supports_avx512vlbw() &&
+       VM_Version::supports_bmi2()) {
+ 
+     set_vector_masking();  // opening of the stub context for programming mask registers
+ 
+     Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates;
+ 
+     // alignment
+     Label post_alignement;
+ 
+     // if length of the string is less than 32, handle it the old-fashioned way
+     testl(len, -32);
+     jcc(Assembler::zero, below_threshold);
+ 
+     // First check whether a character is compressible (<= 0xFF).
+     // Create mask to test for Unicode chars inside zmm vector
+     movl(result, 0x00FF);
+     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
+ 
+     testl(len, -64);
+     jcc(Assembler::zero, post_alignement);
+ 
+     // Save k1
+     kmovql(k3, k1);
+ 
+     movl(tmp5, dst);
+     andl(tmp5, (64 - 1));
+     negl(tmp5);
+     andl(tmp5, (64 - 1));
+ 
+     // bail out when there is nothing to be done
+     testl(tmp5, 0xFFFFFFFF);
+     jcc(Assembler::zero, post_alignement);
+ 
+     // ~(~0 << len), where len is the # of remaining elements to process
+     movl(result, 0xFFFFFFFF);
+     shlxl(result, result, tmp5);
+     notl(result);
+ 
+     kmovdl(k1, result);
+ 
+     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
+     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
+     ktestd(k2, k1);
+     jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+ 
+     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+ 
+     addptr(src, tmp5);
+     addptr(src, tmp5);   // src advances by two bytes per processed char
+     addptr(dst, tmp5);
+     subl(len, tmp5);
+ 
+     bind(post_alignement);
+     // end of alignment
+ 
+     movl(tmp5, len);
+     andl(tmp5, (32 - 1));   // tail count (in chars)
+     andl(len, ~(32 - 1));   // vector count (in chars)
+     jcc(Assembler::zero, copy_loop_tail);
+ 
+     lea(src, Address(src, len, Address::times_2));
+     lea(dst, Address(dst, len, Address::times_1));
+     negptr(len);
+ 
+     bind(copy_32_loop);
+     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
+     evpcmpuw(k2, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
+     kortestdl(k2, k2);
+     jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+ 
+     // All elements in the current chunk are valid candidates for
+     // compression. Write truncated byte elements to memory.
+     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
+     addptr(len, 32);
+     jcc(Assembler::notZero, copy_32_loop);
+ 
+     bind(copy_loop_tail);
+     // bail out when there is nothing to be done
+     testl(tmp5, 0xFFFFFFFF);
+     jcc(Assembler::zero, return_length);
+ 
+     // Save k1
+     kmovql(k3, k1);
+ 
+     movl(len, tmp5);
+ 
+     // ~(~0 << len), where len is the # of remaining elements to process
+     movl(result, 0xFFFFFFFF);
+     shlxl(result, result, len);
+     notl(result);
+ 
+     kmovdl(k1, result);
+ 
+     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
+     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
+     ktestd(k2, k1);
+     jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+ 
+     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+     // Restore k1
+     kmovql(k1, k3);
+ 
+     jmp(return_length);
+ 
+     bind(copy_just_portion_of_candidates);
+     kmovdl(tmp5, k2);
+     tzcntl(tmp5, tmp5);
+ 
+     // ~(~0 << tmp5), where tmp5 is the number of elements in the array
+     // before the first element larger than 0xFF
+     movl(result, 0xFFFFFFFF);
+     shlxl(result, result, tmp5);
+     notl(result);
+ 
+     kmovdl(k1, result);
+ 
+     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+     // Restore k1
+     kmovql(k1, k3);
+ 
+     jmp(return_zero);
+ 
+     clear_vector_masking();  // closing of the stub context for programming mask registers
+   }
    if (UseSSE42Intrinsics) {
      assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
      Label copy_32_loop, copy_16, copy_tail;
  
+     bind(below_threshold);
+     movl(result, len);
+     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
  
      // vectored compression
      andl(len, 0xfffffff0);    // vector count (in chars)
      andl(result, 0x0000000f); // tail count (in chars)
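Note (not part of the webrev): the alignment prologue at the top of the AVX-512 block (the movl/andl/negl/andl sequence on tmp5) computes how many chars must be handled under mask before dst reaches a 64-byte boundary. A minimal sketch of that arithmetic, assuming dst is taken as a raw address (helper name illustrative):

    #include <cstdint>

    // Number of elements to process before 'dst' is 64-byte aligned:
    // mis = dst & 63; distance = (-mis) & 63, which is 0 when already aligned.
    unsigned distance_to_64b_boundary(uintptr_t dst) {
      unsigned mis = static_cast<unsigned>(dst) & (64 - 1);  // andl(tmp5, 64 - 1)
      return (0u - mis) & (64 - 1);                          // negl(tmp5); andl(tmp5, 64 - 1)
    }
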
*** 10890,10927 ****
    bind(done);
  }
  
  // Inflate byte[] array to char[].
  void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
    XMMRegister tmp1, Register tmp2) {
!   Label copy_chars_loop, done;
! 
    // rsi: src
    // rdi: dst
    // rdx: len
    // rcx: tmp2
  
    // rsi holds start addr of source byte[] to be inflated
    // rdi holds start addr of destination char[]
    // rdx holds length
    assert_different_registers(src, dst, len, tmp2);
  
    if (UseSSE42Intrinsics) {
      assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
!     Label copy_8_loop, copy_bytes, copy_tail;
  
      movl(tmp2, len);
      andl(tmp2, 0x00000007);   // tail count (in chars)
      andl(len, 0xfffffff8);    // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
  
      // vectored inflation
      lea(src, Address(src, len, Address::times_1));
      lea(dst, Address(dst, len, Address::times_2));
      negptr(len);
  
      // inflate 8 chars per iter
      bind(copy_8_loop);
      pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
      movdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 8);
--- 11135,11267 ----
    bind(done);
  }
  
  // Inflate byte[] array to char[].
+ // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
+ // @HotSpotIntrinsicCandidate
+ // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
+ //   for (int i = 0; i < len; i++) {
+ //     dst[dstOff++] = (char)(src[srcOff++] & 0xff);
+ //   }
+ // }
  void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
    XMMRegister tmp1, Register tmp2) {
!   Label copy_chars_loop, done, below_threshold;
    // rsi: src
    // rdi: dst
    // rdx: len
    // rcx: tmp2
  
    // rsi holds start addr of source byte[] to be inflated
    // rdi holds start addr of destination char[]
    // rdx holds length
    assert_different_registers(src, dst, len, tmp2);
  
+   if ((UseAVX > 2) && // AVX512
+       VM_Version::supports_avx512vlbw() &&
+       VM_Version::supports_bmi2()) {
+ 
+     set_vector_masking();  // opening of the stub context for programming mask registers
+ 
+     Label copy_32_loop, copy_tail;
+     Register tmp3_aliased = len;
+ 
+     // if length of the string is less than 16, handle it the old-fashioned way
+     testl(len, -16);
+     jcc(Assembler::zero, below_threshold);
+ 
+     // In order to use only one arithmetic operation for the main loop we use
+     // this pre-calculation
+     movl(tmp2, len);
+     andl(tmp2, (32 - 1));   // tail count (in chars), 32 element wide loop
+     andl(len, -32);         // vector count
+     jccb(Assembler::zero, copy_tail);
+ 
+     lea(src, Address(src, len, Address::times_1));
+     lea(dst, Address(dst, len, Address::times_2));
+     negptr(len);
+ 
+     // inflate 32 chars per iter
+     bind(copy_32_loop);
+     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
+     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
+     addptr(len, 32);
+     jcc(Assembler::notZero, copy_32_loop);
+ 
+     bind(copy_tail);
+     // bail out when there is nothing to be done
+     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
+     jcc(Assembler::zero, done);
+ 
+     // Save k1
+     kmovql(k2, k1);
+ 
+     // ~(~0 << length), where length is the # of remaining elements to process
+     movl(tmp3_aliased, -1);
+     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
+     notl(tmp3_aliased);
+     kmovdl(k1, tmp3_aliased);
+     evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
+     evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
+ 
+     // Restore k1
+     kmovql(k1, k2);
+     jmp(done);
+ 
+     clear_vector_masking();  // closing of the stub context for programming mask registers
+   }
    if (UseSSE42Intrinsics) {
      assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
!     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
  
      movl(tmp2, len);
+ 
+     if (UseAVX > 1) {
+       andl(tmp2, (16 - 1));
+       andl(len, -16);
+       jccb(Assembler::zero, copy_new_tail);
+     } else {
      andl(tmp2, 0x00000007);   // tail count (in chars)
      andl(len, 0xfffffff8);    // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
+     }
  
      // vectored inflation
      lea(src, Address(src, len, Address::times_1));
      lea(dst, Address(dst, len, Address::times_2));
      negptr(len);
  
+     if (UseAVX > 1) {
+       bind(copy_16_loop);
+       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
+       vmovdqu(Address(dst, len, Address::times_2), tmp1);
+       addptr(len, 16);
+       jcc(Assembler::notZero, copy_16_loop);
+ 
+       bind(below_threshold);
+       bind(copy_new_tail);
+       if (UseAVX > 2) {
+         movl(tmp2, len);
+       } else {
+         movl(len, tmp2);
+       }
+       andl(tmp2, 0x00000007);
+       andl(len, 0xFFFFFFF8);
+       jccb(Assembler::zero, copy_tail);
+ 
+       pmovzxbw(tmp1, Address(src, 0));
+       movdqu(Address(dst, 0), tmp1);
+       addptr(src, 8);
+       addptr(dst, 2 * 8);
+ 
+       jmp(copy_tail, true);
+     }
+ 
      // inflate 8 chars per iter
      bind(copy_8_loop);
      pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
      movdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 8);
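
Note (not part of the webrev): every vector iteration of the inflate path is a plain zero-extension of Latin-1 bytes to UTF-16 code units. A scalar C++ sketch of what one pmovzxbw/vpmovzxbw step stores (names illustrative):

    #include <cstddef>
    #include <cstdint>

    // Zero-extend 'count' Latin-1 bytes into UTF-16 code units; the SSE loop
    // does this 8 at a time, the AVX2 loop 16, and the AVX-512 loop 32.
    void inflate_ref(const uint8_t* src, uint16_t* dst, size_t count) {
      for (size_t i = 0; i < count; i++) {
        dst[i] = static_cast<uint16_t>(src[i]);   // (char)(b & 0xff)
      }
    }
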
*** 10956,10966 ****
      jcc(Assembler::notZero, copy_chars_loop);
  
    bind(done);
  }
  
- 
  Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
    switch (cond) {
      // Note some conditions are synonyms for others
      case Assembler::zero:         return Assembler::notZero;
      case Assembler::notZero:      return Assembler::zero;
--- 11296,11305 ----