--- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2016-04-29 15:56:26.142272900 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2016-04-29 15:56:25.898107200 -0700 @@ -8255,10 +8255,19 @@ // Search for Non-ASCII character (Negative byte value) in a byte array, // return true if it has any and false otherwise. +// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java +// @HotSpotIntrinsicCandidate +// private static boolean hasNegatives(byte[] ba, int off, int len) { +// for (int i = off; i < off + len; i++) { +// if (ba[i] < 0) { +// return true; +// } +// } +// return false; +// } void MacroAssembler::has_negatives(Register ary1, Register len, - Register result, Register tmp1, - XMMRegister vec1, XMMRegister vec2) { - + Register result, Register tmp1, + XMMRegister vec1, XMMRegister vec2) { // rsi: byte array // rcx: len // rax: result @@ -8271,79 +8280,159 @@ testl(len, len); jcc(Assembler::zero, FALSE_LABEL); - movl(result, len); // copy + if ((UseAVX > 2) && // AVX512 + VM_Version::supports_avx512vlbw() && + VM_Version::supports_bmi2()) { - if (UseAVX >= 2 && UseSSE >= 2) { - // With AVX2, use 32-byte vector compare - Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; + set_vector_masking(); // opening of the stub context for programming mask registers - // Compare 32-byte vectors - andl(result, 0x0000001f); // tail count (in bytes) - andl(len, 0xffffffe0); // vector count (in bytes) - jccb(Assembler::zero, COMPARE_TAIL); + Label test_64_loop, test_tail; + Register tmp3_aliased = len; + + movl(tmp1, len); + vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); + + andl(tmp1, 64 - 1); // tail count (in chars) 0x3F + andl(len, ~(64 - 1)); // vector count (in chars) + jccb(Assembler::zero, test_tail); lea(ary1, Address(ary1, len, Address::times_1)); negptr(len); - movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector - movdl(vec2, tmp1); - vpbroadcastd(vec2, vec2); + bind(test_64_loop); + // Check whether our 64 elements of size byte contain 
negatives + evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); + kortestql(k2, k2); + jcc(Assembler::notZero, TRUE_LABEL); + + addptr(len, 64); + jccb(Assembler::notZero, test_64_loop); - bind(COMPARE_WIDE_VECTORS); - vmovdqu(vec1, Address(ary1, len, Address::times_1)); - vptest(vec1, vec2); - jccb(Assembler::notZero, TRUE_LABEL); - addptr(len, 32); - jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); - testl(result, result); - jccb(Assembler::zero, FALSE_LABEL); - - vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); - vptest(vec1, vec2); - jccb(Assembler::notZero, TRUE_LABEL); - jmpb(FALSE_LABEL); + bind(test_tail); + // bail out when there is nothing to be done + testl(tmp1, -1); + jcc(Assembler::zero, FALSE_LABEL); - bind(COMPARE_TAIL); // len is zero - movl(len, result); - // Fallthru to tail compare - } else if (UseSSE42Intrinsics) { - assert(UseSSE >= 4, "SSE4 must be for SSE4.2 intrinsics to be available"); - // With SSE4.2, use double quad vector compare - Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; + // Save k1 + kmovql(k3, k1); - // Compare 16-byte vectors - andl(result, 0x0000000f); // tail count (in bytes) - andl(len, 0xfffffff0); // vector count (in bytes) - jccb(Assembler::zero, COMPARE_TAIL); + // ~(~0 << len) applied up to two times (for 32-bit scenario) + #ifdef _LP64 + mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); + shlxq(tmp3_aliased, tmp3_aliased, tmp1); + notq(tmp3_aliased); + kmovql(k1, tmp3_aliased); + #else + Label k_init; + jmp(k_init); + + // We could not read 64-bits from a general purpose register thus we move + // data required to compose 64 1's to the instruction stream + // We emit 64 byte wide series of elements from 0..63 which later on would + // be used as compare targets with tail count contained in tmp1 register. + // Result would be a k1 register having tmp1 consecutive 1 bits, + // counting from least significant bit. 
+ address tmp = pc(); + emit_int64(0x0706050403020100); + emit_int64(0x0F0E0D0C0B0A0908); + emit_int64(0x1716151413121110); + emit_int64(0x1F1E1D1C1B1A1918); + emit_int64(0x2726252423222120); + emit_int64(0x2F2E2D2C2B2A2928); + emit_int64(0x3736353433323130); + emit_int64(0x3F3E3D3C3B3A3938); + + bind(k_init); + lea(len, InternalAddress(tmp)); + // create mask to test for negative byte inside a vector + evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); + evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit); - lea(ary1, Address(ary1, len, Address::times_1)); - negptr(len); +#endif + evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit); + ktestq(k2, k1); + // Restore k1 + kmovql(k1, k3); + jcc(Assembler::notZero, TRUE_LABEL); - movl(tmp1, 0x80808080); - movdl(vec2, tmp1); - pshufd(vec2, vec2, 0); + jmp(FALSE_LABEL); - bind(COMPARE_WIDE_VECTORS); - movdqu(vec1, Address(ary1, len, Address::times_1)); - ptest(vec1, vec2); - jccb(Assembler::notZero, TRUE_LABEL); - addptr(len, 16); - jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); + clear_vector_masking(); // closing of the stub context for programming mask registers + } else { + movl(result, len); // copy - testl(result, result); - jccb(Assembler::zero, FALSE_LABEL); + if (UseAVX == 2 && UseSSE >= 2) { + // With AVX2, use 32-byte vector compare + Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; + + // Compare 32-byte vectors + andl(result, 0x0000001f); // tail count (in bytes) + andl(len, 0xffffffe0); // vector count (in bytes) + jccb(Assembler::zero, COMPARE_TAIL); + + lea(ary1, Address(ary1, len, Address::times_1)); + negptr(len); + + movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector + movdl(vec2, tmp1); + vpbroadcastd(vec2, vec2); + + bind(COMPARE_WIDE_VECTORS); + vmovdqu(vec1, Address(ary1, len, Address::times_1)); + vptest(vec1, vec2); + jccb(Assembler::notZero, TRUE_LABEL); + addptr(len, 32); + jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); - movdqu(vec1, Address(ary1, result, 
Address::times_1, -16)); - ptest(vec1, vec2); - jccb(Assembler::notZero, TRUE_LABEL); - jmpb(FALSE_LABEL); + testl(result, result); + jccb(Assembler::zero, FALSE_LABEL); - bind(COMPARE_TAIL); // len is zero - movl(len, result); - // Fallthru to tail compare + vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); + vptest(vec1, vec2); + jccb(Assembler::notZero, TRUE_LABEL); + jmpb(FALSE_LABEL); + + bind(COMPARE_TAIL); // len is zero + movl(len, result); + // Fallthru to tail compare + } else if (UseSSE42Intrinsics) { + assert(UseSSE >= 4, "SSE4 must be for SSE4.2 intrinsics to be available"); + // With SSE4.2, use double quad vector compare + Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; + + // Compare 16-byte vectors + andl(result, 0x0000000f); // tail count (in bytes) + andl(len, 0xfffffff0); // vector count (in bytes) + jccb(Assembler::zero, COMPARE_TAIL); + + lea(ary1, Address(ary1, len, Address::times_1)); + negptr(len); + + movl(tmp1, 0x80808080); + movdl(vec2, tmp1); + pshufd(vec2, vec2, 0); + + bind(COMPARE_WIDE_VECTORS); + movdqu(vec1, Address(ary1, len, Address::times_1)); + ptest(vec1, vec2); + jccb(Assembler::notZero, TRUE_LABEL); + addptr(len, 16); + jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); + + testl(result, result); + jccb(Assembler::zero, FALSE_LABEL); + + movdqu(vec1, Address(ary1, result, Address::times_1, -16)); + ptest(vec1, vec2); + jccb(Assembler::notZero, TRUE_LABEL); + jmpb(FALSE_LABEL); + + bind(COMPARE_TAIL); // len is zero + movl(len, result); + // Fallthru to tail compare + } } - // Compare 4-byte vectors andl(len, 0xfffffffc); // vector count (in bytes) jccb(Assembler::zero, COMPARE_CHAR); @@ -8837,10 +8926,23 @@ } // encode char[] to byte[] in ISO_8859_1 + //@HotSpotIntrinsicCandidate + //private static int implEncodeISOArray(byte[] sa, int sp, + //byte[] da, int dp, int len) { + // int i = 0; + // for (; i < len; i++) { + // char c = StringUTF16.getChar(sa, sp++); + // if (c > '\u00FF') + // break; + // da[dp++] = (byte)c; + // 
} + // return i; + //} void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, - XMMRegister tmp1Reg, XMMRegister tmp2Reg, - XMMRegister tmp3Reg, XMMRegister tmp4Reg, - Register tmp5, Register result) { + XMMRegister tmp1Reg, XMMRegister tmp2Reg, + XMMRegister tmp3Reg, XMMRegister tmp4Reg, + Register tmp5, Register result) { + // rsi: src // rdi: dst // rdx: len @@ -8855,6 +8957,7 @@ // check for zero length testl(len, len); jcc(Assembler::zero, L_done); + movl(result, len); // Setup pointers @@ -8963,6 +9066,7 @@ bind(L_copy_1_char_exit); addptr(result, len); // len is negative count of not processed elements + bind(L_done); } @@ -10783,13 +10887,24 @@ #undef BIND #undef BLOCK_COMMENT - // Compress char[] array to byte[]. +// ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java +// @HotSpotIntrinsicCandidate +// private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { +// for (int i = 0; i < len; i++) { +// int c = src[srcOff++]; +// if (c >>> 8 != 0) { +// return 0; +// } +// dst[dstOff++] = (byte)c; +// } +// return len; +// } void MacroAssembler::char_array_compress(Register src, Register dst, Register len, - XMMRegister tmp1Reg, XMMRegister tmp2Reg, - XMMRegister tmp3Reg, XMMRegister tmp4Reg, - Register tmp5, Register result) { - Label copy_chars_loop, return_length, return_zero, done; + XMMRegister tmp1Reg, XMMRegister tmp2Reg, + XMMRegister tmp3Reg, XMMRegister tmp4Reg, + Register tmp5, Register result) { + Label copy_chars_loop, return_length, return_zero, done, below_threshold; // rsi: src // rdi: dst @@ -10806,11 +10921,141 @@ // save length for return push(len); + if ((UseAVX > 2) && // AVX512 + VM_Version::supports_avx512vlbw() && + VM_Version::supports_bmi2()) { + + set_vector_masking(); // opening of the stub context for programming mask registers + + Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates; + + // alignement + Label post_alignement; + + // if length of the 
string is less than 32, handle it in an old fashioned + // way + testl(len, -32); + jcc(Assembler::zero, below_threshold); + + // First check whether a character is compressible ( <= 0xFF). + // Create mask to test for Unicode chars inside zmm vector + movl(result, 0x00FF); + evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit); + + testl(len, -64); + jcc(Assembler::zero, post_alignement); + + // Save k1 + kmovql(k3, k1); + + movl(tmp5, dst); + andl(tmp5, (64 - 1)); + negl(tmp5); + andl(tmp5, (64 - 1)); + + // bail out when there is nothing to be done + testl(tmp5, 0xFFFFFFFF); + jcc(Assembler::zero, post_alignement); + + // ~(~0 << len), where len is the # of remaining elements to process + movl(result, 0xFFFFFFFF); + shlxl(result, result, tmp5); + notl(result); + + kmovdl(k1, result); + + evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit); + evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit); + ktestd(k2, k1); + jcc(Assembler::carryClear, copy_just_portion_of_candidates); + + evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit); + + addptr(src, tmp5); + addptr(src, tmp5); + addptr(dst, tmp5); + subl(len, tmp5); + + bind(post_alignement); + // end of alignement + + movl(tmp5, len); + andl(tmp5, (32 - 1)); // tail count (in chars) + andl(len, ~(32 - 1)); // vector count (in chars) + jcc(Assembler::zero, copy_loop_tail); + + lea(src, Address(src, len, Address::times_2)); + lea(dst, Address(dst, len, Address::times_1)); + negptr(len); + + bind(copy_32_loop); + evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit); + evpcmpuw(k2, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit); + kortestdl(k2, k2); + jcc(Assembler::carryClear, copy_just_portion_of_candidates); + + // All elements in current processed chunk are valid candidates for + // compression. Write the truncated byte elements to memory. 
+ evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit); + addptr(len, 32); + jcc(Assembler::notZero, copy_32_loop); + + bind(copy_loop_tail); + // bail out when there is nothing to be done + testl(tmp5, 0xFFFFFFFF); + jcc(Assembler::zero, return_length); + + // Save k1 + kmovql(k3, k1); + + movl(len, tmp5); + + // ~(~0 << len), where len is the # of remaining elements to process + movl(result, 0xFFFFFFFF); + shlxl(result, result, len); + notl(result); + + kmovdl(k1, result); + + evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit); + evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, ComparisonPredicate::le, Assembler::AVX_512bit); + ktestd(k2, k1); + jcc(Assembler::carryClear, copy_just_portion_of_candidates); + + evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit); + // Restore k1 + kmovql(k1, k3); + + jmp(return_length); + + bind(copy_just_portion_of_candidates); + kmovdl(tmp5, k2); + tzcntl(tmp5, tmp5); + + // ~(~0 << tmp5), where tmp5 is a number of elements in an array from the + // result to the first element larger than 0xFF + movl(result, 0xFFFFFFFF); + shlxl(result, result, tmp5); + notl(result); + + kmovdl(k1, result); + + evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit); + // Restore k1 + kmovql(k1, k3); + + jmp(return_zero); + + clear_vector_masking(); // closing of the stub context for programming mask registers + } if (UseSSE42Intrinsics) { assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available"); Label copy_32_loop, copy_16, copy_tail; + bind(below_threshold); + movl(result, len); + movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors // vectored compression @@ -10892,10 +11137,16 @@ } // Inflate byte[] array to char[]. 
+// ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java +// @HotSpotIntrinsicCandidate +// private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) { +// for (int i = 0; i < len; i++) { +// dst[dstOff++] = (char)(src[srcOff++] & 0xff); +// } +// } void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, - XMMRegister tmp1, Register tmp2) { - Label copy_chars_loop, done; - + XMMRegister tmp1, Register tmp2) { + Label copy_chars_loop, done, below_threshold; // rsi: src // rdi: dst // rdx: len @@ -10906,20 +11157,109 @@ // rdx holds length assert_different_registers(src, dst, len, tmp2); + if ((UseAVX > 2) && // AVX512 + VM_Version::supports_avx512vlbw() && + VM_Version::supports_bmi2()) { + + set_vector_masking(); // opening of the stub context for programming mask registers + + Label copy_32_loop, copy_tail; + Register tmp3_aliased = len; + + // if length of the string is less than 16, handle it in an old fashioned + // way + testl(len, -16); + jcc(Assembler::zero, below_threshold); + + // In order to use only one arithmetic operation for the main loop we use + // this pre-calculation + movl(tmp2, len); + andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop + andl(len, -32); // vector count + jccb(Assembler::zero, copy_tail); + + lea(src, Address(src, len, Address::times_1)); + lea(dst, Address(dst, len, Address::times_2)); + negptr(len); + + + // inflate 32 chars per iter + bind(copy_32_loop); + vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit); + evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit); + addptr(len, 32); + jcc(Assembler::notZero, copy_32_loop); + + bind(copy_tail); + // bail out when there is nothing to be done + testl(tmp2, -1); // we don't destroy the contents of tmp2 here + jcc(Assembler::zero, done); + + // Save k1 + kmovql(k2, k1); + + // ~(~0 << length), where length is the # of remaining elements to process + 
movl(tmp3_aliased, -1); + shlxl(tmp3_aliased, tmp3_aliased, tmp2); + notl(tmp3_aliased); + kmovdl(k1, tmp3_aliased); + evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit); + evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit); + + // Restore k1 + kmovql(k1, k2); + jmp(done); + + clear_vector_masking(); // closing of the stub context for programming mask registers + } if (UseSSE42Intrinsics) { assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available"); - Label copy_8_loop, copy_bytes, copy_tail; + Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail; movl(tmp2, len); - andl(tmp2, 0x00000007); // tail count (in chars) - andl(len, 0xfffffff8); // vector count (in chars) - jccb(Assembler::zero, copy_tail); + + if (UseAVX > 1) { + andl(tmp2, (16 - 1)); + andl(len, -16); + jccb(Assembler::zero, copy_new_tail); + } else { + andl(tmp2, 0x00000007); // tail count (in chars) + andl(len, 0xfffffff8); // vector count (in chars) + jccb(Assembler::zero, copy_tail); + } // vectored inflation lea(src, Address(src, len, Address::times_1)); lea(dst, Address(dst, len, Address::times_2)); negptr(len); + if (UseAVX > 1) { + bind(copy_16_loop); + vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit); + vmovdqu(Address(dst, len, Address::times_2), tmp1); + addptr(len, 16); + jcc(Assembler::notZero, copy_16_loop); + + bind(below_threshold); + bind(copy_new_tail); + if (UseAVX > 2) { + movl(tmp2, len); + } + else { + movl(len, tmp2); + } + andl(tmp2, 0x00000007); + andl(len, 0xFFFFFFF8); + jccb(Assembler::zero, copy_tail); + + pmovzxbw(tmp1, Address(src, 0)); + movdqu(Address(dst, 0), tmp1); + addptr(src, 8); + addptr(dst, 2 * 8); + + jmp(copy_tail, true); + } + // inflate 8 chars per iter bind(copy_8_loop); pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words @@ -10958,7 +11298,6 @@ bind(done); } - Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { switch 
(cond) { // Note some conditions are synonyms for others