< prev index next >
src/cpu/x86/vm/macroAssembler_x86.cpp
Print this page
*** 8253,8266 ****
}
// Search for Non-ASCII character (Negative byte value) in a byte array,
// return true if it has any and false otherwise.
void MacroAssembler::has_negatives(Register ary1, Register len,
Register result, Register tmp1,
XMMRegister vec1, XMMRegister vec2) {
-
// rsi: byte array
// rcx: len
// rax: result
ShortBranchVerifier sbv(this);
assert_different_registers(ary1, len, result, tmp1);
--- 8253,8275 ----
}
// Search for Non-ASCII character (Negative byte value) in a byte array,
// return true if it has any and false otherwise.
+ // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
+ // @HotSpotIntrinsicCandidate
+ // private static boolean hasNegatives(byte[] ba, int off, int len) {
+ // for (int i = off; i < off + len; i++) {
+ // if (ba[i] < 0) {
+ // return true;
+ // }
+ // }
+ // return false;
+ // }
void MacroAssembler::has_negatives(Register ary1, Register len,
Register result, Register tmp1,
XMMRegister vec1, XMMRegister vec2) {
// rsi: byte array
// rcx: len
// rax: result
ShortBranchVerifier sbv(this);
assert_different_registers(ary1, len, result, tmp1);
*** 8269,8281 ****
// len == 0
testl(len, len);
jcc(Assembler::zero, FALSE_LABEL);
movl(result, len); // copy
! if (UseAVX >= 2 && UseSSE >= 2) {
// With AVX2, use 32-byte vector compare
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
// Compare 32-byte vectors
andl(result, 0x0000001f); // tail count (in bytes)
--- 8278,8370 ----
// len == 0
testl(len, len);
jcc(Assembler::zero, FALSE_LABEL);
+ if ((UseAVX > 2) && // AVX512
+ VM_Version::supports_avx512vlbw() &&
+ VM_Version::supports_bmi2()) {
+
+ set_vector_masking(); // opening of the stub context for programming mask registers
+
+ Label test_64_loop, test_tail;
+ Register tmp3_aliased = len;
+
+ movl(tmp1, len);
+ vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
+
+ andl(tmp1, 64 - 1); // tail count (in bytes) 0x3F
+ andl(len, ~(64 - 1)); // vector count (in bytes)
+ jccb(Assembler::zero, test_tail);
+
+ lea(ary1, Address(ary1, len, Address::times_1));
+ negptr(len);
+
+ bind(test_64_loop);
+ // Check whether our 64 elements of size byte contain negatives
+ evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
+ kortestql(k2, k2);
+ jcc(Assembler::notZero, TRUE_LABEL);
+
+ addptr(len, 64);
+ jccb(Assembler::notZero, test_64_loop);
+
+
+ bind(test_tail);
+ // bail out when there is nothing to be done
+ testl(tmp1, -1);
+ jcc(Assembler::zero, FALSE_LABEL);
+
+ // Save k1
+ kmovql(k3, k1);
+
+ // ~(~0 << len) applied up to two times (for 32-bit scenario)
+ #ifdef _LP64
+ mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
+ shlxq(tmp3_aliased, tmp3_aliased, tmp1);
+ notq(tmp3_aliased);
+ kmovql(k1, tmp3_aliased);
+ #else
+ Label k_init;
+ jmp(k_init);
+
+ // A 64-bit value cannot be moved from a general purpose register here, so
+ // the data used to compose a mask of up to 64 ones is placed in the instruction stream.
+ // We emit a 64-byte sequence holding the values 0..63, which is later
+ // compared against the tail count held in the tmp1 register.
+ // The result is a k1 register with tmp1 consecutive 1 bits,
+ // counting from the least significant bit.
+ address tmp = pc();
+ emit_int64(0x0706050403020100);
+ emit_int64(0x0F0E0D0C0B0A0908);
+ emit_int64(0x1716151413121110);
+ emit_int64(0x1F1E1D1C1B1A1918);
+ emit_int64(0x2726252423222120);
+ emit_int64(0x2F2E2D2C2B2A2928);
+ emit_int64(0x3736353433323130);
+ emit_int64(0x3F3E3D3C3B3A3938);
+
+ bind(k_init);
+ lea(len, InternalAddress(tmp));
+ // create mask to test for negative byte inside a vector
+ evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
+ evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
+
+ #endif
+ evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
+ ktestq(k2, k1);
+ // Restore k1
+ kmovql(k1, k3);
+ jcc(Assembler::notZero, TRUE_LABEL);
+
+ jmp(FALSE_LABEL);
+
+ clear_vector_masking(); // closing of the stub context for programming mask registers
+ } else {
movl(result, len); // copy
! if (UseAVX == 2 && UseSSE >= 2) {
// With AVX2, use 32-byte vector compare
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
// Compare 32-byte vectors
andl(result, 0x0000001f); // tail count (in bytes)
*** 8341,8351 ****
bind(COMPARE_TAIL); // len is zero
movl(len, result);
// Fallthru to tail compare
}
!
// Compare 4-byte vectors
andl(len, 0xfffffffc); // vector count (in bytes)
jccb(Assembler::zero, COMPARE_CHAR);
lea(ary1, Address(ary1, len, Address::times_1));
--- 8430,8440 ----
bind(COMPARE_TAIL); // len is zero
movl(len, result);
// Fallthru to tail compare
}
! }
// Compare 4-byte vectors
andl(len, 0xfffffffc); // vector count (in bytes)
jccb(Assembler::zero, COMPARE_CHAR);
lea(ary1, Address(ary1, len, Address::times_1));
*** 8835,8848 ****
--- 8924,8950 ----
}
BIND(L_exit);
}
// encode char[] to byte[] in ISO_8859_1
+ //@HotSpotIntrinsicCandidate
+ //private static int implEncodeISOArray(byte[] sa, int sp,
+ //byte[] da, int dp, int len) {
+ // int i = 0;
+ // for (; i < len; i++) {
+ // char c = StringUTF16.getChar(sa, sp++);
+ // if (c > '\u00FF')
+ // break;
+ // da[dp++] = (byte)c;
+ // }
+ // return i;
+ //}
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
Register tmp5, Register result) {
+
// rsi: src
// rdi: dst
// rdx: len
// rcx: tmp5
// rax: result
*** 8853,8862 ****
--- 8955,8965 ----
// set result
xorl(result, result);
// check for zero length
testl(len, len);
jcc(Assembler::zero, L_done);
+
movl(result, len);
// Setup pointers
lea(src, Address(src, len, Address::times_2)); // char[]
lea(dst, Address(dst, len, Address::times_1)); // byte[]
*** 8961,8970 ****
--- 9064,9074 ----
addptr(len, 1);
jccb(Assembler::less, L_copy_1_char);
bind(L_copy_1_char_exit);
addptr(result, len); // len is negative count of not processed elements
+
bind(L_done);
}
#ifdef _LP64
/**
*** 10781,10797 ****
}
#endif // LP64
#undef BIND
#undef BLOCK_COMMENT
-
// Compress char[] array to byte[].
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
Register tmp5, Register result) {
! Label copy_chars_loop, return_length, return_zero, done;
// rsi: src
// rdi: dst
// rdx: len
// rcx: tmp5
--- 10885,10912 ----
}
#endif // LP64
#undef BIND
#undef BLOCK_COMMENT
// Compress char[] array to byte[].
+ // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
+ // @HotSpotIntrinsicCandidate
+ // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
+ // for (int i = 0; i < len; i++) {
+ // int c = src[srcOff++];
+ // if (c >>> 8 != 0) {
+ // return 0;
+ // }
+ // dst[dstOff++] = (byte)c;
+ // }
+ // return len;
+ // }
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
Register tmp5, Register result) {
! Label copy_chars_loop, return_length, return_zero, done, below_threshold;
// rsi: src
// rdi: dst
// rdx: len
// rcx: tmp5
*** 10804,10818 ****
--- 10919,11063 ----
assert(len != result, "");
// save length for return
push(len);
+ if ((UseAVX > 2) && // AVX512
+ VM_Version::supports_avx512vlbw() &&
+ VM_Version::supports_bmi2()) {
+
+ set_vector_masking(); // opening of the stub context for programming mask registers
+
+ Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates;
+
+ // alignment
+ Label post_alignement;
+
+ // if the length of the string is less than 32, handle it in the old-fashioned
+ // way
+ testl(len, -32);
+ jcc(Assembler::zero, below_threshold);
+
+ // First check whether a character is compressible (<= 0xFF).
+ // Create mask to test for Unicode chars inside zmm vector
+ movl(result, 0x00FF);
+ evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
+
+ testl(len, -64);
+ jcc(Assembler::zero, post_alignement);
+
+ // Save k1
+ kmovql(k3, k1);
+
+ movl(tmp5, dst);
+ andl(tmp5, (64 - 1));
+ negl(tmp5);
+ andl(tmp5, (64 - 1));
+
+ // bail out when there is nothing to be done
+ testl(tmp5, 0xFFFFFFFF);
+ jcc(Assembler::zero, post_alignement);
+
+ // ~(~0 << tmp5), where tmp5 is the # of elements to process before dst is 64-byte aligned
+ movl(result, 0xFFFFFFFF);
+ shlxl(result, result, tmp5);
+ notl(result);
+
+ kmovdl(k1, result);
+
+ evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
+ evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
+ ktestd(k2, k1);
+ jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+
+ evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+
+ addptr(src, tmp5);
+ addptr(src, tmp5);
+ addptr(dst, tmp5);
+ subl(len, tmp5);
+
+ bind(post_alignement);
+ // end of alignment
+
+ movl(tmp5, len);
+ andl(tmp5, (32 - 1)); // tail count (in chars)
+ andl(len, ~(32 - 1)); // vector count (in chars)
+ jcc(Assembler::zero, copy_loop_tail);
+
+ lea(src, Address(src, len, Address::times_2));
+ lea(dst, Address(dst, len, Address::times_1));
+ negptr(len);
+
+ bind(copy_32_loop);
+ evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
+ evpcmpuw(k2, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
+ kortestdl(k2, k2);
+ jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+
+ // All elements in the currently processed chunk are valid candidates for
+ // compression. Write the truncated byte elements to memory.
+ evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
+ addptr(len, 32);
+ jcc(Assembler::notZero, copy_32_loop);
+
+ bind(copy_loop_tail);
+ // bail out when there is nothing to be done
+ testl(tmp5, 0xFFFFFFFF);
+ jcc(Assembler::zero, return_length);
+
+ // Save k1
+ kmovql(k3, k1);
+
+ movl(len, tmp5);
+
+ // ~(~0 << len), where len is the # of remaining elements to process
+ movl(result, 0xFFFFFFFF);
+ shlxl(result, result, len);
+ notl(result);
+
+ kmovdl(k1, result);
+
+ evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
+ evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
+ ktestd(k2, k1);
+ jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+
+ evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+ // Restore k1
+ kmovql(k1, k3);
+
+ jmp(return_length);
+
+ bind(copy_just_portion_of_candidates);
+ kmovdl(tmp5, k2);
+ tzcntl(tmp5, tmp5);
+
+ // ~(~0 << tmp5), where tmp5 is a number of elements in an array from the
+ // result to the first element larger than 0xFF
+ movl(result, 0xFFFFFFFF);
+ shlxl(result, result, tmp5);
+ notl(result);
+
+ kmovdl(k1, result);
+
+ evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+ // Restore k1
+ kmovql(k1, k3);
+
+ jmp(return_zero);
+
+ clear_vector_masking(); // closing of the stub context for programming mask registers
+ }
if (UseSSE42Intrinsics) {
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
Label copy_32_loop, copy_16, copy_tail;
+ bind(below_threshold);
+
movl(result, len);
+
movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
// vectored compression
andl(len, 0xfffffff0); // vector count (in chars)
andl(result, 0x0000000f); // tail count (in chars)
*** 10890,10927 ****
bind(done);
}
// Inflate byte[] array to char[].
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
XMMRegister tmp1, Register tmp2) {
! Label copy_chars_loop, done;
!
// rsi: src
// rdi: dst
// rdx: len
// rcx: tmp2
// rsi holds start addr of source byte[] to be inflated
// rdi holds start addr of destination char[]
// rdx holds length
assert_different_registers(src, dst, len, tmp2);
if (UseSSE42Intrinsics) {
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
! Label copy_8_loop, copy_bytes, copy_tail;
movl(tmp2, len);
andl(tmp2, 0x00000007); // tail count (in chars)
andl(len, 0xfffffff8); // vector count (in chars)
jccb(Assembler::zero, copy_tail);
// vectored inflation
lea(src, Address(src, len, Address::times_1));
lea(dst, Address(dst, len, Address::times_2));
negptr(len);
// inflate 8 chars per iter
bind(copy_8_loop);
pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
movdqu(Address(dst, len, Address::times_2), tmp1);
addptr(len, 8);
--- 11135,11267 ----
bind(done);
}
// Inflate byte[] array to char[].
+ // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
+ // @HotSpotIntrinsicCandidate
+ // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
+ // for (int i = 0; i < len; i++) {
+ // dst[dstOff++] = (char)(src[srcOff++] & 0xff);
+ // }
+ // }
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
XMMRegister tmp1, Register tmp2) {
! Label copy_chars_loop, done, below_threshold;
// rsi: src
// rdi: dst
// rdx: len
// rcx: tmp2
// rsi holds start addr of source byte[] to be inflated
// rdi holds start addr of destination char[]
// rdx holds length
assert_different_registers(src, dst, len, tmp2);
+ if ((UseAVX > 2) && // AVX512
+ VM_Version::supports_avx512vlbw() &&
+ VM_Version::supports_bmi2()) {
+
+ set_vector_masking(); // opening of the stub context for programming mask registers
+
+ Label copy_32_loop, copy_tail;
+ Register tmp3_aliased = len;
+
+ // if length of the string is less than 16, handle it in an old fashioned
+ // way
+ testl(len, -16);
+ jcc(Assembler::zero, below_threshold);
+
+ // In order to use only one arithmetic operation for the main loop we use
+ // this pre-calculation
+ movl(tmp2, len);
+ andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
+ andl(len, -32); // vector count
+ jccb(Assembler::zero, copy_tail);
+
+ lea(src, Address(src, len, Address::times_1));
+ lea(dst, Address(dst, len, Address::times_2));
+ negptr(len);
+
+
+ // inflate 32 chars per iter
+ bind(copy_32_loop);
+ vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
+ evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
+ addptr(len, 32);
+ jcc(Assembler::notZero, copy_32_loop);
+
+ bind(copy_tail);
+ // bail out when there is nothing to be done
+ testl(tmp2, -1); // we don't destroy the contents of tmp2 here
+ jcc(Assembler::zero, done);
+
+ // Save k1
+ kmovql(k2, k1);
+
+ // ~(~0 << length), where length is the # of remaining elements to process
+ movl(tmp3_aliased, -1);
+ shlxl(tmp3_aliased, tmp3_aliased, tmp2);
+ notl(tmp3_aliased);
+ kmovdl(k1, tmp3_aliased);
+ evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
+ evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
+
+ // Restore k1
+ kmovql(k1, k2);
+ jmp(done);
+
+ clear_vector_masking(); // closing of the stub context for programming mask registers
+ }
if (UseSSE42Intrinsics) {
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
! Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
movl(tmp2, len);
+
+ if (UseAVX > 1) {
+ andl(tmp2, (16 - 1));
+ andl(len, -16);
+ jccb(Assembler::zero, copy_new_tail);
+ } else {
andl(tmp2, 0x00000007); // tail count (in chars)
andl(len, 0xfffffff8); // vector count (in chars)
jccb(Assembler::zero, copy_tail);
+ }
// vectored inflation
lea(src, Address(src, len, Address::times_1));
lea(dst, Address(dst, len, Address::times_2));
negptr(len);
+ if (UseAVX > 1) {
+ bind(copy_16_loop);
+ vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
+ vmovdqu(Address(dst, len, Address::times_2), tmp1);
+ addptr(len, 16);
+ jcc(Assembler::notZero, copy_16_loop);
+
+ bind(below_threshold);
+ bind(copy_new_tail);
+ if (UseAVX > 2) {
+ movl(tmp2, len);
+ }
+ else {
+ movl(len, tmp2);
+ }
+ andl(tmp2, 0x00000007);
+ andl(len, 0xFFFFFFF8);
+ jccb(Assembler::zero, copy_tail);
+
+ pmovzxbw(tmp1, Address(src, 0));
+ movdqu(Address(dst, 0), tmp1);
+ addptr(src, 8);
+ addptr(dst, 2 * 8);
+
+ jmp(copy_tail, true);
+ }
+
// inflate 8 chars per iter
bind(copy_8_loop);
pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
movdqu(Address(dst, len, Address::times_2), tmp1);
addptr(len, 8);
*** 10956,10966 ****
jcc(Assembler::notZero, copy_chars_loop);
bind(done);
}
-
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
switch (cond) {
// Note some conditions are synonyms for others
case Assembler::zero: return Assembler::notZero;
case Assembler::notZero: return Assembler::zero;
--- 11296,11305 ----
< prev index next >