--- old/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp 2019-02-21 16:08:13.046971067 +0300 +++ new/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp 2019-02-21 16:08:12.830972662 +0300 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -4861,7 +4861,331 @@ BIND(DONE); } -// Compare strings. +// Summary: Compare strings intrinsic implementation. All combinations of UTF-16 +// and Latin1 encodings for both strings are considered. Comparison +// is performed in lexical order. +// +// Input: str1: pointer to 1st string +// str2: pointer to 2nd string +// cnt1: number of bytes in 1st string +// cnt2: number of bytes in 2nd string +// +// Algorithm parameter: +// ae: encodings used in 1st and 2nd strings +// +// Temporary registers: +// tmp1, tmp2, rscratch1, rscratch2: always used +// vtmp1, vtmp2, vtmp3: used in case encodings are different +// +// Output: result - returns 0 if strings are equal. Returns positive value +// if 1st string > 2nd string in lexical order. Returns +// negative value if 1st string < 2nd string. +// +// Side effects: str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1, rscratch2: clobbered. +// vtmp1, vtmp2, vtmp3: clobbered if encodings are different +// +// Additional data: boolean values: isLL, isLU, isUL, str1_isL, str2_isL and +// int minCharInWords are derived from ae parameter based on encodings used +// in strings. 
Different code is generated depending on these values: +// +// isLL = both strings are Latin1 +// isLU = 1st string is Latin1, 2nd string is UTF-16 +// isUL = 1st string is UTF-16, 2nd string is Latin1 +// str1_isL = 1st string is Latin1 +// str2_isL = 2nd string is Latin1 +// str1_chr_shift = shift value to convert a character count to a byte count for 1st string +// str2_chr_shift = shift value to convert a character count to a byte count for 2nd string +// minCharInWords = minimum number of characters that fit in register (8 for LL case, 4 otherwise) +// +// +// PSEUDO CODE: +// +// // N.B.: this pseudo-code doesn't strictly follow implementation details. +// // It is here to help understand the basics. Detailed implementation +// // description is listed after this code. +// +// cnt1 >>= str1_chr_shift; cnt2 >>= str2_chr_shift; // convert byte counters to character counters +// result = cnt1 - cnt2; // length difference. Used as the result if all min(cnt1, cnt2) characters are the same +// cnt2 = min(cnt1, cnt2); // amount of characters to check +// if (cnt2 <= minCharInWords) { // <= wordSize bytes should be loaded for comparison +// if (cnt2 == 0) return result; +// while (cnt2 != 0) { +// char str1char = str1[0]; +// char str2char = str2[0]; +// str1 += 1 << str1_chr_shift; // advance pointer by size of str1 character +// str2 += 1 << str2_chr_shift; // advance pointer by size of str2 character +// if (str1char != str2char) return str1char - str2char; +// cnt2--; +// } +// } else { // > wordSize bytes should be loaded for comparison +// // This code checks strings in 8-byte blocks. If encodings are +// // different, Latin1 string will be loaded via 4-byte blocks and then +// // each block will be converted to 8-byte UTF-16 equivalent. Then 8 byte +// // blocks are compared. Each load is 8 characters for LL case and 4 +// // characters for LU/UL/UU. +// // This set of instructions (load 8 Latin1 characters OR load 4 Latin1 +// // characters and convert them to 4 UTF-16 characters OR load 4 UTF-16 +// // characters) is referred to as a "block load" below. 
+// +// // First iteration in the loop is unrolled to add initialization. +// +// // The code below calculates the address of each string's last load: addresses +// // of last 8 characters for LL case and last 4 characters otherwise. +// // Then offsets from the addresses to the beginning of the strings are +// // calculated. The offset is then used as the loop counter. When offset is >= 0, then +// // only last loads (possibly overlapping) are left to be checked. +// // N.B.: in case of same encodings, offsets are the same for both strings. +// // Then offset for 2nd string is used for both strings. +// +// tmp1 = BLOCK_LOAD(str1); // load (and convert if needed) first block of 1st string +// if (str1 == str2) return result; +// tmp2 = BLOCK_LOAD(str2); // load (and convert if needed) first block of 2nd string +// +// // use special implementation optimized for large strings. See detailed code and stub comments. +// if (cnt2 >= 72) return compare_long_string_implementation(); +// +// cnt2 -= minCharInWords; // 8 for isLL case. 4 otherwise. +// +// if (str1_isL == str2_isL) { +// // Optional optimization for same encoding cases. Can be applied for all +// // cases, but is faster in same encoding cases only. Without this branch +// // the shortest strings (8 characters for LL and 4 characters for others) would +// // be checked twice. +// if (cnt2 == 0) goto TAIL_CHECK; // no more characters to be loaded. Just check already loaded data. +// } +// +// // calculate addresses of last loads. use str1 and str2 pointers for that +// str1 = str1 + cnt2 << str1_chr_shift; +// str2 = str2 + cnt2 << str2_chr_shift; +// +// // calculate offsets for both strings. cnt1 and cnt2 can be reused +// if (str1_isL != str2_isL) cnt1 = - (cnt2 << str1_chr_shift); +// cnt2 = - (cnt2 << str2_chr_shift); +// +// // increment calculated offsets by the number of already loaded bytes +// if (isLU) cnt1 += 4; +// if (isUL) cnt1 += 8; +// cnt2 += isUL ? 4 : 8; +// +// if (cnt2 >= 0) goto TAIL; // only last loads remain. Still need to check currently loaded data. +// +// rscratch2 = tmp1 BIT_XOR tmp2; +// if (rscratch2 != 0) goto DIFFERENCE; +// +// // main loop. 
Label = NEXT_WORD +// do { +// tmp1 = BLOCK_LOAD(str1 + (str1_isL == str2_isL ? cnt2 : cnt1)); // load (and convert if needed) next block of 1st string at its current offset +// tmp2 = BLOCK_LOAD(str2 + cnt2); // load (and convert if needed) next block of 2nd string at its current offset +// +// // update offsets by the number of loaded bytes +// cnt2 += isUL ? 4 : 8; +// if (isLU) cnt1 += 4; +// if (isUL) cnt1 += 8; +// +// if (cnt2 >= 0) goto TAIL; // last block left to be loaded. Still need to check currently loaded block. +// rscratch2 = tmp1 BIT_XOR tmp2; +// } while (rscratch2 == 0); +// goto DIFFERENCE; +// +// TAIL: // last block left to be loaded. Still need to check currently loaded block. +// rscratch2 = tmp1 BIT_XOR tmp2; +// if (rscratch2 != 0) goto DIFFERENCE; +// tmp1 = BLOCK_LOAD(str1); // last block of 1st string (str1 holds the address of the last load) +// tmp2 = BLOCK_LOAD(str2); // last block of 2nd string (str2 holds the address of the last load) +// // fallthrough to TAIL_CHECK +// TAIL_CHECK: +// rscratch2 = tmp1 BIT_XOR tmp2; +// if (rscratch2 == 0) return result; +// DIFFERENCE: // different character found. Find it and compute difference +// // tmp1 and tmp2 have current data with at least 1 different character. +// // Find index of first such character. +// rscratch2 = REVERSE_BITS(rscratch2); +// rscratch2 = COUNT_LEADING_ZEROES(rscratch2); // position of different bit in current 8 bytes +// rscratch2 = rscratch2 & (isLL ? -8 : -16); // number of bits until (possibly converted) different characters in tmp1 and tmp2 +// tmp1 = tmp1 >> rscratch2; // now first character in tmp1 is the one sought for +// tmp1 = tmp1 & (isLL ? 0xFF : 0xFFFF); // only first different character left +// tmp2 = tmp2 >> rscratch2; // now first character in tmp2 is the one sought for +// tmp2 = tmp2 & (isLL ? 
0xFF : 0xFFFF); // only first different character left +// result = tmp1 - tmp2; +// } +// return result; +// +// +// +// DETAILED CODE: +// +// if (!str1_isL) cnt1 = cnt1 >> 1; // counter for 1st string (in characters) +// if (!str2_isL) cnt2 = cnt2 >> 1; // counter for 2nd string (in characters) +// result = cnt1 - cnt2; // keep in flags the result of operation +// cnt2 = min(cnt1, cnt2); // implemented as csel instruction using stored flag value above +// bool shortStringsCase = cnt2 <= minCharInWords; // kept in flags +// if (shortStringsCase) goto SHORT_STRING; // separate code for short strings +// if (str1_isL == str2_isL) { // same encoding case +// tmp1 = LOAD8BYTES(str1); +// bool sameString = str1 == str2; // kept in flags +// if (sameString) goto DONE; // the string is the same, return +// tmp2 = LOAD8BYTES(str2); +// bool largeStrings = cnt2 >= 72; // kept in flags +// if (largeStrings) goto STUB; // handled in separate stub implementation for large strings +// cnt2 = cnt2 - minCharInWords; // decrement counter by the number of loaded characters +// bool noMoreLoadsAvailable = cnt2 == 0; // kept in flags +// if (noMoreLoadsAvailable) goto TAIL_CHECK; +// str2 = str2 + cnt2 << str2_chr_shift; // address of str2 last load +// str1 = str1 + cnt2 << str1_chr_shift; // address of str1 last load +// cnt2 = -(cnt2 << str2_chr_shift); // byte offset to 1st character in each string +// } else if (isLU) { +// vtmp = LOAD4BYTES(str1); +// bool sameString = str1 == str2; // kept in flags +// if (sameString) goto DONE; // return +// tmp2 = LOAD8BYTES(str2); +// bool largeStrings = cnt2 >= 72; // kept in flags +// if (largeStrings) goto STUB; // handled in separate stub implementation for large strings +// cnt2 = cnt2 - 4; // decrement counter by the number of loaded characters +// vtmpz = 0; // implemented as eor +// str1 = str1 + cnt2 << str1_chr_shift; // address of str1 last load +// str2 = str2 + cnt2 << str2_chr_shift; // address of str2 last load +// vtmp = 
CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); // convert Latin1 to UTF16 because it'll be compared with UTF16. Implemented as zip instruction +// cnt1 = -(cnt2 << str1_chr_shift); // byte offset to 1st character in 1st string +// cnt2 = -(cnt2 << str2_chr_shift); // byte offset to 1st character in 2nd string +// cnt1 = cnt1 + 4; // advance 1st string offset by the number of loaded bytes +// tmp1 = vtmp; // move converted characters from FPU register to GPR +// } else { // UL +// tmp1 = LOAD8BYTES(str1); +// bool sameString = str1 == str2; // kept in flags +// if (sameString) goto DONE; // return +// vtmp = LOAD4BYTES(str2); +// bool largeStrings = cnt2 >= 72; // kept in flags +// if (largeStrings) goto STUB; // separate stub implementation for large strings +// cnt2 = cnt2 - 4; // update counter by the number of loaded characters +// str1 = str1 + cnt2 << str1_chr_shift; // address of str1 last load +// vtmpz = 0; // implemented as eor +// str2 = str2 + cnt2 << str2_chr_shift; // address of str2 last load +// cnt1 = -(cnt2 << str1_chr_shift); // byte offset to 1st character in 1st string +// vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); // convert Latin1 to UTF16 because it'll be compared with UTF16. implemented as zip instruction +// cnt2 = -(cnt2 << str2_chr_shift); // byte offset to 1st character in 2nd string +// cnt1 = cnt1 + 8; // advance 1st string offset by the number of loaded bytes +// tmp2 = vtmp; // move converted characters from FPU register to GPR +// } +// cnt2 = cnt2 + (isUL ? 4 : 8); // update offset by the number of loaded bytes +// bool onlyLastLoadRemains = cnt2 >= 0; // kept in flags +// if (onlyLastLoadRemains) goto TAIL; +// rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result +// if (rscratch2 != 0) goto DIFFERENCE; // found different characters in current block +// NEXT_WORD: // main loop +// // implementation for each encoding loads 4 or 8 characters at calculated +// // offsets from each string and convert encodings if necessary. 
Then offsets +// // are updated. +// if (str1_isL == str2_isL) { +// tmp1 = LOAD8BYTES(str1, cnt2); +// tmp2 = LOAD8BYTES(str2, cnt2); +// cnt2 = cnt2 + 8; // update counter by the number of loaded bytes +// onlyLastLoadRemains = cnt2 >= 0; // kept in flags +// } else if (isLU) { +// vtmp = LOAD4BYTES(str1, cnt1); +// tmp2 = LOAD8BYTES(str2, cnt2); +// cnt1 = cnt1 + 4; +// vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); +// tmp1 = vtmp; +// cnt2 = cnt2 + 8; +// onlyLastLoadRemains = cnt2 >= 0; // kept in flags +// } else { // UL +// vtmp = LOAD4BYTES(str2, cnt2); +// tmp1 = LOAD8BYTES(str1, cnt1); +// vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); +// cnt1 = cnt1 + 8; +// tmp2 = vtmp; +// cnt2 = cnt2 + 4; +// onlyLastLoadRemains = cnt2 >= 0; // kept in flags +// } +// if (onlyLastLoadRemains) goto TAIL; +// rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result +// if (rscratch2 == 0) goto NEXT_WORD; +// goto DIFFERENCE; +// TAIL: // check already loaded data and last load +// rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result +// if (rscratch2 != 0) goto DIFFERENCE; +// +// // last load (and convert if needed) from each string +// if (str1_isL == str2_isL) { +// tmp1 = LOAD8BYTES(str1); +// tmp2 = LOAD8BYTES(str2); +// } else if (isLU) { +// vtmp = LOAD4BYTES(str1); +// tmp2 = LOAD8BYTES(str2); +// vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); +// tmp1 = vtmp; +// } else { // UL +// vtmp = LOAD4BYTES(str2); +// tmp1 = LOAD8BYTES(str1); +// vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); +// tmp2 = vtmp; +// } +// TAIL_CHECK: // last check +// rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result +// if (rscratch2 == 0) goto DONE; // return +// DIFFERENCE: +// rscratch2 = REVERSE_BITS(rscratch2); // It's not possible to count trailing zeroes. Reverse bits and then count leading zeroes instead. +// rscratch2 = COUNT_LEADING_ZEROES(rscratch2); // position of different bit in current 8 bytes +// rscratch2 = rscratch2 & (isLL ? 
-8 : -16); // number of bits until (possibly converted) different characters in tmp1 and tmp2 +// tmp1 = tmp1 >> rscratch2; // first character in tmp1 is the one sought for +// tmp1 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp1) : UNSIGNED_EXTEND_SHORT2INT(tmp1); // only first different character left +// tmp2 = tmp2 >> rscratch2; // first character in tmp2 is the one sought for +// tmp2 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp2) : UNSIGNED_EXTEND_SHORT2INT(tmp2); // only first different character left +// result = tmp1 - tmp2; +// goto DONE; +// } +// +// STUB: +// result = compare_long_string_implementation(); // call the separate stub implementation for large strings +// goto DONE; +// +// // Short strings comparison code. Instead of simple per-character loop with +// // load-and-compare code it uses a loop that issues 2 per-character loads from +// // each string per iteration. Different registers are used for that to +// // remove dependencies: (tmp1, cnt1) and (tmp2, rscratch1) pairs. +// // First character loads are issued in pre-loop. +// SHORT_STRING: +// if (cnt2 == 0) goto DONE; // no characters to compare. 
Length difference (already calculated) should be used as result +// tmp1 = LOAD_STR1_CHAR(str1); +// str1 = str1 + str1_chr_size; // merged with load above as post-increment +// cnt2 = cnt2 - 1; // calculate remaining length after first character is loaded +// bool endReached = cnt2 == 0; // kept in flags +// if (endReached) goto SHORT_LAST_INIT; // load 1 character from 2nd string to complete init and compare it with 1st string character +// cnt1 = LOAD_STR2_CHAR(str2); +// str2 = str2 + str2_chr_size; // merged with load above as post-increment +// goto SHORT_LOOP_START; // per-character loop entry point +// SHORT_LOOP: // per-character loop +// cnt2 = cnt2 - 1; // calculate remaining length +// endReached = cnt2 == 0; +// if (endReached) goto SHORT_LAST; // first pair of registers (tmp1, cnt1) is already loaded, only its comparison is left +// SHORT_LOOP_START: // per-character loop entry point +// tmp2 = LOAD_STR1_CHAR(str1); +// rscratch1 = LOAD_STR2_CHAR(str2); +// bool differentResult = tmp1 != cnt1; // check difference of previously loaded pair of registers while last pair is still loading. Kept in flags +// if (differentResult) goto SHORT_LOOP_TAIL; // calculate character difference and return +// cnt2 = cnt2 - 1; // calculate remaining length +// endReached = cnt2 == 0; +// if (endReached) goto SHORT_LAST2; // last comparison of second pair of registers (tmp2, rscratch1) is left +// tmp1 = LOAD_STR1_CHAR(str1); +// cnt1 = LOAD_STR2_CHAR(str2); +// bool sameResult = tmp2 == rscratch1; // check difference of previously loaded pair of registers while last pair is still loading. 
Kept in flags +// if (sameResult) goto SHORT_LOOP; +// result = tmp2 - rscratch1; +// goto DONE; +// SHORT_LOOP_TAIL: // first pair of registers (tmp1, cnt1) differs: calculate character difference and return +// result = tmp1 - cnt1; +// goto DONE; +// SHORT_LAST2: // last comparison is left: (tmp2, rscratch1) +// sameResult = tmp2 == rscratch1; +// if (sameResult) goto DONE; +// result = tmp2 - rscratch1; +// goto DONE; +// SHORT_LAST_INIT: +// cnt1 = LOAD_STR2_CHAR(str2); +// SHORT_LAST: // last comparison of first pair of registers (tmp1, cnt1) is left +// sameResult = tmp1 == cnt1; +// if (sameResult) goto DONE; +// result = tmp1 - cnt1; +// DONE: +// return; // result + void MacroAssembler::string_compare(Register str1, Register str2, Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {