
src/cpu/x86/vm/macroAssembler_x86.cpp


        

*** 8253,8266 ****
  }
  
  // Search for Non-ASCII character (Negative byte value) in a byte array,
  // return true if it has any and false otherwise.
  void MacroAssembler::has_negatives(Register ary1, Register len,
    Register result, Register tmp1,
    XMMRegister vec1, XMMRegister vec2) {
- 
    // rsi: byte array
    // rcx: len
    // rax: result
    ShortBranchVerifier sbv(this);
    assert_different_registers(ary1, len, result, tmp1);
--- 8253,8275 ----
  }
  
  // Search for Non-ASCII character (Negative byte value) in a byte array,
  // return true if it has any and false otherwise.
+ // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
+ // @HotSpotIntrinsicCandidate
+ // private static boolean hasNegatives(byte[] ba, int off, int len) {
+ //   for (int i = off; i < off + len; i++) {
+ //     if (ba[i] < 0) {
+ //       return true;
+ //     }
+ //   }
+ //   return false;
+ // }
  void MacroAssembler::has_negatives(Register ary1, Register len,
    Register result, Register tmp1,
    XMMRegister vec1, XMMRegister vec2) {
    // rsi: byte array
    // rcx: len
    // rax: result
    ShortBranchVerifier sbv(this);
    assert_different_registers(ary1, len, result, tmp1);
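
Note (not part of the webrev): a standalone C++ sketch of the predicate this stub computes. A byte is "negative" exactly when its sign bit is set, so testing eight bytes at a time against an 0x80 mask mirrors what the vectorized compare against a zero vector does 64 bytes at a time. The name has_negatives_ref is illustrative only.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Returns true if any byte in ba[0..len) is negative (high bit set).
    bool has_negatives_ref(const int8_t* ba, size_t len) {
      const uint64_t sign_bits = 0x8080808080808080ULL;
      size_t i = 0;
      for (; i + 8 <= len; i += 8) {
        uint64_t word;
        std::memcpy(&word, ba + i, sizeof(word));  // unaligned-safe 8-byte load
        if (word & sign_bits) {
          return true;                             // some byte has its sign bit set
        }
      }
      for (; i < len; i++) {                       // scalar tail
        if (ba[i] < 0) return true;
      }
      return false;
    }
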
*** 8269,8281 ****
  
    // len == 0
    testl(len, len);
    jcc(Assembler::zero, FALSE_LABEL);
  
    movl(result, len); // copy
  
!   if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
  
      // Compare 32-byte vectors
      andl(result, 0x0000001f);   //   tail count (in bytes)
--- 8278,8370 ----
  
    // len == 0
    testl(len, len);
    jcc(Assembler::zero, FALSE_LABEL);
  
+   if ((UseAVX > 2) && // AVX512
+       VM_Version::supports_avx512vlbw() &&
+       VM_Version::supports_bmi2()) {
+ 
+     set_vector_masking();  // opening of the stub context for programming mask registers
+ 
+     Label test_64_loop, test_tail;
+     Register tmp3_aliased = len;
+ 
+     movl(tmp1, len);
+     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
+ 
+     andl(tmp1, 64 - 1);    // tail count (in chars) 0x3F
+     andl(len, ~(64 - 1));  // vector count (in chars)
+     jccb(Assembler::zero, test_tail);
+ 
+     lea(ary1, Address(ary1, len, Address::times_1));
+     negptr(len);
+ 
+     bind(test_64_loop);
+     // Check whether our 64 byte-sized elements contain negatives
+     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
+     kortestql(k2, k2);
+     jcc(Assembler::notZero, TRUE_LABEL);
+ 
+     addptr(len, 64);
+     jccb(Assembler::notZero, test_64_loop);
+ 
+     bind(test_tail);
+     // bail out when there is nothing to be done
+     testl(tmp1, -1);
+     jcc(Assembler::zero, FALSE_LABEL);
+ 
+     // Save k1
+     kmovql(k3, k1);
+ 
+     // ~(~0 << len) applied up to two times (for 32-bit scenario)
+ #ifdef _LP64
+     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
+     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
+     notq(tmp3_aliased);
+     kmovql(k1, tmp3_aliased);
+ #else
+     Label k_init;
+     jmp(k_init);
+ 
+     // We cannot read 64 bits from a general purpose register here, thus we
+     // move the data required to compose 64 1's into the instruction stream:
+     // a 64-byte-wide series of elements 0..63 that is later used as the
+     // compare target for the tail count contained in the tmp1 register.
+     // The result is the k1 register holding tmp1 consecutive 1's, counting
+     // from the least significant bit.
+     address tmp = pc();
+     emit_int64(0x0706050403020100);
+     emit_int64(0x0F0E0D0C0B0A0908);
+     emit_int64(0x1716151413121110);
+     emit_int64(0x1F1E1D1C1B1A1918);
+     emit_int64(0x2726252423222120);
+     emit_int64(0x2F2E2D2C2B2A2928);
+     emit_int64(0x3736353433323130);
+     emit_int64(0x3F3E3D3C3B3A3938);
+ 
+     bind(k_init);
+     lea(len, InternalAddress(tmp));
+     // create mask to test for negative byte inside a vector
+     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
+     evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
+ 
+ #endif
+     evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
+     ktestq(k2, k1);
+     // Restore k1
+     kmovql(k1, k3);
+     jcc(Assembler::notZero, TRUE_LABEL);
+ 
+     jmp(FALSE_LABEL);
+ 
+     clear_vector_masking();  // closing of the stub context for programming mask registers
+   } else {
      movl(result, len); // copy
  
!     if (UseAVX == 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
  
      // Compare 32-byte vectors
      andl(result, 0x0000001f);   //   tail count (in bytes)
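
Note (not part of the webrev): the tail handling above builds a k-mask with exactly the tail count's worth of low bits set, so the final masked compare only inspects the remaining elements. A minimal C++ sketch of both constructions, assuming the tail count is in [0, 63] as in the stub (helper names are illustrative):

    #include <cstdint>

    // 64-bit path: the shlxq/notq sequence, i.e. ~(~0 << n).
    uint64_t tail_mask_shift(unsigned n) {   // n in [0, 63]
      return ~(~0ULL << n);
    }

    // 32-bit path: compare a 0..63 byte ramp (emitted into the code stream)
    // against the broadcast tail count; lane i is set exactly when i < n.
    uint64_t tail_mask_ramp(unsigned n) {    // n in [0, 63]
      uint64_t mask = 0;
      for (unsigned i = 0; i < 64; i++) {
        if (i < n) {
          mask |= 1ULL << i;                 // what evpcmpgtb(k1, bcast(n), ramp) produces
        }
      }
      return mask;
    }

Both helpers produce the same value; the stub picks one or the other depending on whether a 64-bit general purpose register is available.
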
*** 8341,8351 ****
  
      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
! 
    // Compare 4-byte vectors
    andl(len, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  
    lea(ary1, Address(ary1, len, Address::times_1));
--- 8430,8440 ----
  
      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
!   }
    // Compare 4-byte vectors
    andl(len, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  
    lea(ary1, Address(ary1, len, Address::times_1));
*** 8835,8848 ****
--- 8924,8950 ----
    }
    BIND(L_exit);
  }
  
  // encode char[] to byte[] in ISO_8859_1
+ //@HotSpotIntrinsicCandidate
+ //private static int implEncodeISOArray(byte[] sa, int sp,
+ //                                      byte[] da, int dp, int len) {
+ //  int i = 0;
+ //  for (; i < len; i++) {
+ //    char c = StringUTF16.getChar(sa, sp++);
+ //    if (c > '\u00FF')
+ //      break;
+ //    da[dp++] = (byte)c;
+ //  }
+ //  return i;
+ //}
  void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
    XMMRegister tmp1Reg, XMMRegister tmp2Reg, XMMRegister tmp3Reg,
    XMMRegister tmp4Reg, Register tmp5, Register result) {
+   // rsi: src
    // rdi: dst
    // rdx: len
    // rcx: tmp5
    // rax: result
*** 8853,8862 ****
--- 8955,8965 ----
    // set result
    xorl(result, result);
    // check for zero length
    testl(len, len);
    jcc(Assembler::zero, L_done);
+   movl(result, len);
  
    // Setup pointers
    lea(src, Address(src, len, Address::times_2)); // char[]
    lea(dst, Address(dst, len, Address::times_1)); // byte[]
*** 8961,8970 ****
--- 9064,9074 ----
      addptr(len, 1);
      jccb(Assembler::less, L_copy_1_char);
  
    bind(L_copy_1_char_exit);
    addptr(result, len); // len is negative count of not processed elements
+   bind(L_done);
  }
  
  #ifdef _LP64
  /**
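
Note (not part of the webrev): the three encode_iso_array hunks above pre-load result with the total length and add an L_done label so the zero-length path can skip the copy entirely. Assuming result still holds the total length when the loops finish, the final addptr(result, len) reduces to the arithmetic sketched below (names illustrative):

    // After the copy loops, 'len' has counted up from -(remaining) toward 0,
    // i.e. it holds the negative count of not-processed chars. Adding it to
    // the total length gives the number of chars actually encoded.
    int encoded_chars(int total_len, int remaining_not_processed) {
      int len_reg = -remaining_not_processed;  // value left in the len register
      int result_reg = total_len;              // set by the added movl(result, len)
      return result_reg + len_reg;             // addptr(result, len)
    }
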
*** 10781,10797 ****
  }
  #endif // LP64
  
  #undef BIND
  #undef BLOCK_COMMENT
  
- 
  // Compress char[] array to byte[].
  void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
    XMMRegister tmp1Reg, XMMRegister tmp2Reg, XMMRegister tmp3Reg,
    XMMRegister tmp4Reg, Register tmp5, Register result) {
!   Label copy_chars_loop, return_length, return_zero, done;
  
    // rsi: src
    // rdi: dst
    // rdx: len
    // rcx: tmp5
--- 10885,10912 ----
  }
  #endif // LP64
  
  #undef BIND
  #undef BLOCK_COMMENT
  
  // Compress char[] array to byte[].
+ // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
+ // @HotSpotIntrinsicCandidate
+ // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
+ //   for (int i = 0; i < len; i++) {
+ //     int c = src[srcOff++];
+ //     if (c >>> 8 != 0) {
+ //       return 0;
+ //     }
+ //     dst[dstOff++] = (byte)c;
+ //   }
+ //   return len;
+ // }
  void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
    XMMRegister tmp1Reg, XMMRegister tmp2Reg, XMMRegister tmp3Reg,
    XMMRegister tmp4Reg, Register tmp5, Register result) {
!   Label copy_chars_loop, return_length, return_zero, done, below_threshold;
  
    // rsi: src
    // rdi: dst
    // rdx: len
    // rcx: tmp5
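
Note (not part of the webrev): the AVX-512 loop added in the next hunk applies the compress() contract a chunk at a time — a 32-char block is stored only if every char in it fits in a byte, otherwise control falls into the partial-store path. A scalar C++ sketch of that per-chunk test (illustrative only):

    #include <cstdint>

    // True when all 32 chars of the chunk are <= 0xFF, i.e. the whole chunk
    // can be truncated to bytes; mirrors the evpcmpuw + kortest check below.
    bool chunk_is_latin1(const uint16_t* chunk) {
      for (int i = 0; i < 32; i++) {
        if (chunk[i] > 0xFF) {
          return false;
        }
      }
      return true;
    }
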
*** 10804,10818 ****
--- 10919,11063 ----
    assert(len != result, "");
  
    // save length for return
    push(len);
  
+   if ((UseAVX > 2) && // AVX512
+       VM_Version::supports_avx512vlbw() &&
+       VM_Version::supports_bmi2()) {
+ 
+     set_vector_masking();  // opening of the stub context for programming mask registers
+ 
+     Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates;
+ 
+     // alignment
+     Label post_alignement;
+ 
+     // if length of the string is less than 32, handle it the old-fashioned way
+     testl(len, -32);
+     jcc(Assembler::zero, below_threshold);
+ 
+     // First check whether a character is compressible (<= 0xFF).
+     // Create mask to test for Unicode chars inside zmm vector
+     movl(result, 0x00FF);
+     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
+ 
+     testl(len, -64);
+     jcc(Assembler::zero, post_alignement);
+ 
+     // Save k1
+     kmovql(k3, k1);
+ 
+     movl(tmp5, dst);
+     andl(tmp5, (64 - 1));
+     negl(tmp5);
+     andl(tmp5, (64 - 1));
+ 
+     // bail out when there is nothing to be done
+     testl(tmp5, 0xFFFFFFFF);
+     jcc(Assembler::zero, post_alignement);
+ 
+     // ~(~0 << len), where len is the # of remaining elements to process
+     movl(result, 0xFFFFFFFF);
+     shlxl(result, result, tmp5);
+     notl(result);
+ 
+     kmovdl(k1, result);
+ 
+     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
+     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
+     ktestd(k2, k1);
+     jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+ 
+     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+ 
+     addptr(src, tmp5);
+     addptr(src, tmp5);   // src advances by two bytes per processed char
+     addptr(dst, tmp5);
+     subl(len, tmp5);
+ 
+     bind(post_alignement);
+     // end of alignment
+ 
+     movl(tmp5, len);
+     andl(tmp5, (32 - 1));   // tail count (in chars)
+     andl(len, ~(32 - 1));   // vector count (in chars)
+     jcc(Assembler::zero, copy_loop_tail);
+ 
+     lea(src, Address(src, len, Address::times_2));
+     lea(dst, Address(dst, len, Address::times_1));
+     negptr(len);
+ 
+     bind(copy_32_loop);
+     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
+     evpcmpuw(k2, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
+     kortestdl(k2, k2);
+     jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+ 
+     // All elements in the current chunk are valid candidates for
+     // compression. Write truncated byte elements to memory.
+     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
+     addptr(len, 32);
+     jcc(Assembler::notZero, copy_32_loop);
+ 
+     bind(copy_loop_tail);
+     // bail out when there is nothing to be done
+     testl(tmp5, 0xFFFFFFFF);
+     jcc(Assembler::zero, return_length);
+ 
+     // Save k1
+     kmovql(k3, k1);
+ 
+     movl(len, tmp5);
+ 
+     // ~(~0 << len), where len is the # of remaining elements to process
+     movl(result, 0xFFFFFFFF);
+     shlxl(result, result, len);
+     notl(result);
+ 
+     kmovdl(k1, result);
+ 
+     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
+     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit);
+     ktestd(k2, k1);
+     jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+ 
+     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+     // Restore k1
+     kmovql(k1, k3);
+ 
+     jmp(return_length);
+ 
+     bind(copy_just_portion_of_candidates);
+     kmovdl(tmp5, k2);
+     tzcntl(tmp5, tmp5);
+ 
+     // ~(~0 << tmp5), where tmp5 is the number of elements in the array
+     // before the first element larger than 0xFF
+     movl(result, 0xFFFFFFFF);
+     shlxl(result, result, tmp5);
+     notl(result);
+ 
+     kmovdl(k1, result);
+ 
+     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+     // Restore k1
+     kmovql(k1, k3);
+ 
+     jmp(return_zero);
+ 
+     clear_vector_masking();  // closing of the stub context for programming mask registers
+   }
    if (UseSSE42Intrinsics) {
      assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
      Label copy_32_loop, copy_16, copy_tail;
  
+     bind(below_threshold);
+     movl(result, len);
+     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
  
      // vectored compression
      andl(len, 0xfffffff0);    // vector count (in chars)
      andl(result, 0x0000000f); // tail count (in chars)
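Note (not part of the webrev): the alignment prologue at the top of the AVX-512 block (the movl/andl/negl/andl sequence on tmp5) computes how many chars must be handled under mask before dst reaches a 64-byte boundary. A minimal sketch of that arithmetic, assuming dst is taken as a raw address (helper name illustrative):

    #include <cstdint>

    // Number of elements to process before 'dst' is 64-byte aligned:
    // mis = dst & 63; distance = (-mis) & 63, which is 0 when already aligned.
    unsigned distance_to_64b_boundary(uintptr_t dst) {
      unsigned mis = static_cast<unsigned>(dst) & (64 - 1);  // andl(tmp5, 64 - 1)
      return (0u - mis) & (64 - 1);                          // negl(tmp5); andl(tmp5, 64 - 1)
    }
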
*** 10890,10927 ****
    bind(done);
  }
  
  // Inflate byte[] array to char[].
  void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
    XMMRegister tmp1, Register tmp2) {
!   Label copy_chars_loop, done;
! 
    // rsi: src
    // rdi: dst
    // rdx: len
    // rcx: tmp2
  
    // rsi holds start addr of source byte[] to be inflated
    // rdi holds start addr of destination char[]
    // rdx holds length
    assert_different_registers(src, dst, len, tmp2);
  
    if (UseSSE42Intrinsics) {
      assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
!     Label copy_8_loop, copy_bytes, copy_tail;
  
      movl(tmp2, len);
      andl(tmp2, 0x00000007);   // tail count (in chars)
      andl(len, 0xfffffff8);    // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
  
      // vectored inflation
      lea(src, Address(src, len, Address::times_1));
      lea(dst, Address(dst, len, Address::times_2));
      negptr(len);
  
      // inflate 8 chars per iter
      bind(copy_8_loop);
      pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
      movdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 8);
--- 11135,11267 ----
    bind(done);
  }
  
  // Inflate byte[] array to char[].
+ // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
+ // @HotSpotIntrinsicCandidate
+ // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
+ //   for (int i = 0; i < len; i++) {
+ //     dst[dstOff++] = (char)(src[srcOff++] & 0xff);
+ //   }
+ // }
  void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
    XMMRegister tmp1, Register tmp2) {
!   Label copy_chars_loop, done, below_threshold;
    // rsi: src
    // rdi: dst
    // rdx: len
    // rcx: tmp2
  
    // rsi holds start addr of source byte[] to be inflated
    // rdi holds start addr of destination char[]
    // rdx holds length
    assert_different_registers(src, dst, len, tmp2);
  
+   if ((UseAVX > 2) && // AVX512
+       VM_Version::supports_avx512vlbw() &&
+       VM_Version::supports_bmi2()) {
+ 
+     set_vector_masking();  // opening of the stub context for programming mask registers
+ 
+     Label copy_32_loop, copy_tail;
+     Register tmp3_aliased = len;
+ 
+     // if length of the string is less than 16, handle it the old-fashioned way
+     testl(len, -16);
+     jcc(Assembler::zero, below_threshold);
+ 
+     // In order to use only one arithmetic operation for the main loop we use
+     // this pre-calculation
+     movl(tmp2, len);
+     andl(tmp2, (32 - 1));   // tail count (in chars), 32 element wide loop
+     andl(len, -32);         // vector count
+     jccb(Assembler::zero, copy_tail);
+ 
+     lea(src, Address(src, len, Address::times_1));
+     lea(dst, Address(dst, len, Address::times_2));
+     negptr(len);
+ 
+     // inflate 32 chars per iter
+     bind(copy_32_loop);
+     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
+     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
+     addptr(len, 32);
+     jcc(Assembler::notZero, copy_32_loop);
+ 
+     bind(copy_tail);
+     // bail out when there is nothing to be done
+     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
+     jcc(Assembler::zero, done);
+ 
+     // Save k1
+     kmovql(k2, k1);
+ 
+     // ~(~0 << length), where length is the # of remaining elements to process
+     movl(tmp3_aliased, -1);
+     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
+     notl(tmp3_aliased);
+     kmovdl(k1, tmp3_aliased);
+     evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
+     evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
+ 
+     // Restore k1
+     kmovql(k1, k2);
+     jmp(done);
+ 
+     clear_vector_masking();  // closing of the stub context for programming mask registers
+   }
    if (UseSSE42Intrinsics) {
      assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
!     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
  
      movl(tmp2, len);
+ 
+     if (UseAVX > 1) {
+       andl(tmp2, (16 - 1));
+       andl(len, -16);
+       jccb(Assembler::zero, copy_new_tail);
+     } else {
      andl(tmp2, 0x00000007);   // tail count (in chars)
      andl(len, 0xfffffff8);    // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
+     }
  
      // vectored inflation
      lea(src, Address(src, len, Address::times_1));
      lea(dst, Address(dst, len, Address::times_2));
      negptr(len);
  
+     if (UseAVX > 1) {
+       bind(copy_16_loop);
+       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
+       vmovdqu(Address(dst, len, Address::times_2), tmp1);
+       addptr(len, 16);
+       jcc(Assembler::notZero, copy_16_loop);
+ 
+       bind(below_threshold);
+       bind(copy_new_tail);
+       if (UseAVX > 2) {
+         movl(tmp2, len);
+       } else {
+         movl(len, tmp2);
+       }
+       andl(tmp2, 0x00000007);
+       andl(len, 0xFFFFFFF8);
+       jccb(Assembler::zero, copy_tail);
+ 
+       pmovzxbw(tmp1, Address(src, 0));
+       movdqu(Address(dst, 0), tmp1);
+       addptr(src, 8);
+       addptr(dst, 2 * 8);
+ 
+       jmp(copy_tail, true);
+     }
+ 
      // inflate 8 chars per iter
      bind(copy_8_loop);
      pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
      movdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 8);
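
Note (not part of the webrev): every vector iteration of the inflate path is a plain zero-extension of Latin-1 bytes to UTF-16 code units. A scalar C++ sketch of what one pmovzxbw/vpmovzxbw step stores (names illustrative):

    #include <cstddef>
    #include <cstdint>

    // Zero-extend 'count' Latin-1 bytes into UTF-16 code units; the SSE loop
    // does this 8 at a time, the AVX2 loop 16, and the AVX-512 loop 32.
    void inflate_ref(const uint8_t* src, uint16_t* dst, size_t count) {
      for (size_t i = 0; i < count; i++) {
        dst[i] = static_cast<uint16_t>(src[i]);   // (char)(b & 0xff)
      }
    }
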
*** 10956,10966 ****
      jcc(Assembler::notZero, copy_chars_loop);
  
    bind(done);
  }
  
- 
  Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
    switch (cond) {
      // Note some conditions are synonyms for others
      case Assembler::zero:         return Assembler::notZero;
      case Assembler::notZero:      return Assembler::zero;
--- 11296,11305 ----