--- old/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp 2019-02-21 16:08:13.046971067 +0300 +++ new/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp 2019-02-21 16:08:12.830972662 +0300 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -4861,7 +4861,331 @@ BIND(DONE); } -// Compare strings. +// Summary: Compare strings intrinsic implementation. All combinations of UTF-16 +// and Latin1 encodings for both strings are considered. Comparison +// is performed in lexical order. +// +// Input: str1: pointer to 1st string +// str2: pointer to 2nd string +// cnt1: number of bytes in 1st string +// cnt2: number of bytes in 2nd string +// +// Algorithm parameter: +// ae: encodings used in 1st and 2nd strings +// +// Temporary registers: +// tmp1, tmp2, rscratch1, rscratch2: always used +// vtmp1, vtmp2, vtmp3: used in case encodings are different +// +// Output: result - return 0 if strings are equal. Returns positive value +// if 1st string > 2nd string in lexical order. Returns +// negative value if 1st string < 2nd string. +// +// Side effects: str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1, rscratch2: clobbered. +// vtmp1, vtmp2, vtmp3: clobbered if encodings are different +// +// Additional data: boolean values: isLL, isLU, isUL, str1_isL, str2_isL and +// int minCharInWords are derived from ae parameter based on encodings used +// in strings.
Different code is generated depending on these values: +// +// isLL = both strings are Latin1 +// isLU = 1st string is Latin1, 2nd string is UTF-16 +// isUL = 1st string is UTF-16, 2nd string is Latin1 +// str1_isL = 1st string is Latin1 +// str2_isL = 2nd string is Latin1 +// str1_chr_shift = shift value to convert between characters counter to byte counter for 1st string +// str2_chr_shift = shift value to convert between characters counter to byte counter for 2nd string +// minCharInWords = minimum number of characters that fit in register (8 for LL case, 4 otherwise) +// +// +// PSEUDO CODE: +// +// // N.B.: this pseudo-code doesn't strictly follow implementation details. +// // It is here to help understand the basics. Detailed implementation +// // description is listed after this code. +// +// ; +// result = cnt1 - cnt2; // length difference. Used if all min(cnt1, cnt2) characters are the same +// cnt2 = min(cnt1, cnt2); // amount of characters to check +// if (cnt2 <= minCharInWords) { // <= wordSize bytes should be loaded for comparison +// if (cnt2 == 0) return result; +// while (cnt2 != 0) { +// char str1char = str1[0]; +// char str2char = str2[0]; +// str1 += 1 << str1_chr_shift; // advance pointer by size of str1 character +// str2 += 1 << str2_chr_shift; // advance pointer by size of str2 character +// if (str1char != str2char) return str1char - str2char; +// cnt2--; +// } +// } else { // > wordSize bytes should be loaded for comparison +// // This code checks string in 8-byte blocks. If encodings are +// // different, Latin1 string will be loaded via 4-byte blocks and then +// // each block will be converted to 8-byte UTF-16 equivalent. Then 8 byte +// // blocks are compared. Each load is 8 characters for LL case and 4 +// // characters for LU/UL/UU. +// // This set of instructions (load 8 Latin1 character OR load 4 Latin1 +// // characters and convert it to 4 UTF-16 character OR load 4 UTF-16 +// // character) is referred as below.
+// +// // First iteration in the loop is unrolled to add initialization. +// +// // The code below calculates addresses of each string last load: addresses +// // of last 8 characters for LL case and last 4 characters otherwise. +// // Then offsets from the addresses to the beginning of the strings are +// // calculated. Offset is then use as loop counter. When offset is >= 0, then +// // only last loads (possible overlapped) are left to be checked. +// // N.B.: in case of same encodings, offsets are the same for both strings. +// // Then offset for 2nd string is used for both strings. +// +// tmp1 = ; +// if (str1 == str2) return result; +// tmp2 = ; +// +// // use special implementation optimized for large strings. See detailed code and stub comments. +// if (cnt2 >= 72) return compare_long_string_implementation(); +// +// cnt2 -= ; // 8 for isLL case. 4 otherwise. +// +// if (str1_isL == str2_isL) { +// // Optional optimization for same encoding cases. Can be applied for all +// // cases, but is faster in same encoding cases only. Without this branch +// // smallest string (8 character for LL and 4 characters for others) would +// // be checked twice. +// if (cnt2 == 0) goto TAIL_CHECK; // no more characters to be loaded. Just check already loaded data. +// } +// +// // calculate addresses of last loads. use str1 and str2 pointers for that +// str1 = str1 + cnt2 << str1_chr_shift; +// str2 = str2 + cnt2 << str2_chr_shift; +// +// // calculate offsets for both strings. cnt1 and cnt2 can be reused +// if (str1_isL != str2_isL) cnt1 = - (cnt2 << str1_chr_shift); +// cnt2 = - (cnt2 << str2_chr_shift); +// +// // increment calculated offsets by the number of already loaded bytes +// if (isLU) cnt1 += 4; +// if (isUL) cnt1 += 8; +// cnt2 += isUL ? 4 : 8; +// +// if (cnt2 >= 0) goto TAIL; // only last loads remains. Still need to check currently loaded data. +// +// rscratch2 = tmp1 BIT_XOR tmp2; +// if (rscratch2 != 0) goto DIFFERENCE; +// +// // main loop. 
Label = NEXT_WORD +// do { +// tmp1 = ; +// tmp2 = ; +// +// // update offsets by the number of loaded bytes +// cnt2 += isUL ? 4 : 8; +// if (isLU) cnt1 += 4; +// if (isUL) cnt1 += 8; +// +// if (cnt2 >= 0) goto TAIL; // last block left to be loaded. Still need to check currently loaded block. +// rscratch2 = tmp1 BIT_XOR tmp2; +// } while (rscratch2 == 0); +// goto DIFFERENCE: +// +// TAIL: // last block left to be loaded. Still need to check currently loaded block. +// rscratch2 = tmp1 BIT_XOR tmp2; +// if (rscratch2 != 0) goto DIFFERENCE; +// tmp1 = ; +// tmp2 = ; +// // fallthrough to TAIL_CHECK +// TAIL_CHECK: +// rscratch2 = tmp1 BIT_XOR tmp2; +// if (rscratch2 == 0) return result; +// DIFFERENCE: // different character found. Find it and compute difference +// // tmp1 and tmp2 have current data with at least 1 different character. +// // Find index of first such character. +// rscratch2 = REVERSE_BITS(rscratch2); +// rscratch2 = COUNT_LEADING_ZEROES(rscratch2); // position of different bit in current 8 bytes +// rscratch2 = rscratch2 & (isLL ? -8 : -16); // number of bits until (possibly converted) different characters in tmp1 and tmp2 +// tmp1 = tmp1 >> rscratch2; // now first character in tmp1 is the one sought for +// tmp1 = tmp1 & (isLL ? 0xFF : 0xFFFF); // only first different character left +// tmp2 = tmp2 >> rscratch2; // now first character in tmp2 is the one sought for +// tmp2 = tmp2 & (isLL ? 
0xFF : 0xFFFF); // only first different character left +// result = tmp1 - tmp2; +// } +// return result; +// +// +// +// DETAILED CODE: +// +// if (!str1_isL) cnt1 = cnt1 >> 1; // counter for 1st string (in characters) +// if (!str2_isL) cnt2 = cnt2 >> 1; // counter for 2nd string (in characters) +// result = cnt1 - cnt2; // keep in flags the result of operation +// cnt2 = min(cnt1, cnt2); // implemented as csel instruction using stored flag value above +// bool shortStringsCase = cnt2 <= minCharInWords; // kept in flag +// if (shortStringsCase) goto SHORT_STRING; // separate code for short strings +// if (str1_isL == str2_isL) { // same encoding case +// tmp1 = LOAD8BYTES(str1); +// bool sameString = str1 == str2; // kept in flags +// if (sameString) goto DONE; // the string is the same, return +// tmp2 = LOAD8BYTES(str2); +// bool largeStrings = cnt2 >= 72; // kept in flags +// if (largeStrings) goto STUB; // handled in separate stub implementation for large strings +// cnt2 = cnt2 - minCharsInWord; // decrement counter by the number of loaded characters +// bool noMoreLoadsAvailable = cnt2 == 0; // kept in flags +// if (noMoreLoadsAvailable) goto TAIL_CHECK; +// str2 = str2 + cnt2 << str2_chr_shift; // address of str2 last load +// str1 = str1 + cnt2 << str1_chr_shift; // address of str1 last load +// cnt2 = -(cnt2 << str2_chr_shift); // byte offset to 1st character in each string +// } else if (isLU) { +// vtmp = LOAD4BYTES(str1); +// bool sameString = str1 == str2; // kept in flags +// if (sameString) goto DONE; // return +// tmp2 = LOAD8BYTES(str2); +// bool largeStrings = cnt2 >= 72; // kept in flags +// if (largeStrings) goto STUB; // handled in separate stub implementation for large strings +// cnt2 = cnt2 - 4; // decrement counter by the number of loaded characters +// vtmpz = 0; // implemented as eor +// str1 = str1 + cnt2 << str1_chr_shift; // address of str1 last load +// str2 = str2 + cnt2 << str2_chr_shift; // address of str2 last load +// vtmp = 
CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); // convert Latin1 to UTF16 because it'll be compared with UTF16. Implemented as zip instruction +// cnt1 = -(cnt2 << str1_chr_shift); // byte offset to 1st character in 1st string +// cnt2 = -(cnt2 << str2_chr_shift); // byte offset to 1st character in 2nd string +// cnt1 = cnt1 + 4; // advance 1st string offset by the number of loaded bytes +// tmp1 = vtmp; // move converted characters from FPU register to GPR +// } else { // UL +// tmp1 = LOAD8BYTES(str1); +// bool sameString = str1 == str2; // kept in flags +// if (sameString) goto DONE; // return +// vtmp = LOAD4BYTES(str2); +// bool largeStrings = cnt2 >= 72; // kept in flags +// if (largeStrings) goto STUB; // separate stub implementation for large strings +// cnt2 = cnt2 - 4; // update counter by the number of loaded characters +// str1 = str1 + cnt2 << str1_chr_shift; // address of str1 last load +// vtmpz = 0; // implemented as eor +// str2 = str2 + cnt2 << str2_chr_shift; // address of str2 last load +// cnt1 = -(cnt2 << str1_chr_shift); // byte offset to 1st character in 1st string +// vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); // convert Latin1 to UTF16 because it'll be compared with UTF16. implemented as zip instruction +// cnt2 = -(cnt2 << str2_chr_shift); // byte offset to 1st character in 2nd string +// cnt1 = cnt1 + 8; // advance 1st string offset by the number of loaded bytes +// tmp2 = vtmp; // move converted characters from FPU register to GPR +// } +// cnt2 = cnt2 + (isUL ? 4 : 8); // update offset by the number of loaded bytes +// bool onlyLastLoadRemains = cnt2 >= 0; // kept in flags +// if (onlyLastLoadRemains) goto TAIL; +// rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result +// if (rscratch2 != 0) goto DIFFERENCE; // found different characters in current block +// NEXT_WORD: // main loop +// // implementation for each encoding loads 4 or 8 characters at calculated +// // offsets from each string and convert encodings if necessary. 
Then offsets +// // are updated. +// if (str1_isL == str2_isL) { +// tmp1 = LOAD8BYTES(str1, cnt2); +// tmp2 = LOAD8BYTES(str2, cnt2); +// cnt2 = cnt2 + 8; // update counter by the number of loaded bytes +// onlyLastLoadRemains = cnt2 >= 0; // kept in flags +// } else if (isLU) { +// vtmp = LOAD4BYTES(str1, cnt1); +// tmp2 = LOAD8BYTES(str2, cnt2); +// cnt1 = cnt1 + 4; +// vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); +// tmp1 = vtmp; +// cnt2 = cnt2 + 8; +// onlyLastLoadRemains = cnt2 >= 0; // kept in flags +// } else { // UL +// vtmp = LOAD4BYTES(str2, cnt2); +// tmp1 = LOAD8BYTES(str1, cnt1); +// vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); +// cnt1 = cnt1 + 8; +// tmp2 = vtmp; +// cnt2 = cnt2 + 4; +// onlyLastLoadRemains = cnt2 >= 0; // kept in flags +// } +// if (onlyLastLoadRemains) goto TAIL; +// rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result +// if (rscratch2 == 0) goto NEXT_WORD; +// goto DIFFERENCE; +// TAIL: // check already loaded data and last load +// rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result +// if (rscratch2 != 0) goto DIFFERENCE; +// +// // last load (and convert if needed) from each string +// if (str1_isL == str2_isL) { +// tmp1 = LOAD8BYTES(str1); +// tmp2 = LOAD8BYTES(str2); +// } else if (isLU) { +// vtmp = LOAD4BYTES(str1); +// tmp2 = LOAD8BYTES(str2); +// vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); +// tmp1 = vtmp; +// } else { // UL +// vtmp = LOAD4BYTES(str2); +// tmp1 = LOAD8BYTES(str1); +// vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz); +// tmp2 = vtmp; +// } +// TAIL_CHECK: // last check +// rscratch2 = BIT_XOR(tmp1, tmp2); // current block comparison result +// if (rscratch2 == 0) goto DONE; // return +// DIFFERENCE: +// rscratch2 = REVERSE_BITS(rscratch2); // It's not possible to count trailing zeroes. Reverse bits and then count leading zeroes instead. +// rscratch2 = COUNT_LEADING_ZEROES(rscratch2); // position of different bit in current 8 bytes +// rscratch2 = rscratch2 & (isLL ? 
-8 : -16); // number of bits until (possibly converted) different characters in tmp1 and tmp2 +// tmp1 = tmp1 >> rscratch2; // first character in tmp1 is the one sought for +// tmp1 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp1) : UNSIGNED_EXTEND_SHORT2INT(tmp1); // only first different character left +// tmp2 = tmp2 >> rscratch2; // first character in tmp2 is the one sought for +// tmp2 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp2) : UNSIGNED_EXTEND_SHORT2INT(tmp2); // only first different character left +// result = tmp1 - tmp2; +// goto DONE; +// } +// +// STUB: +// +// goto DONE; +// +// // Short strings comparison code. Instead of simple per-character loop with +// // load-and-compare code it uses loop that issues 2 per-character loads from +// // each string per iteration. Different registers are used for that to +// // remove dependencies: (tmp1, cnt1) and (tmp2, rscratch1) pairs. +// // First characters loads are issued in pre-loop. +// SHORT_STRING: +// if (cnt2 == 0) goto DONE; // no characters to compare.
Length difference (already calculated) should be used as result +// tmp1 = LOAD_STR1_CHAR(str1); +// str1 = str1 + str1_chr_size; // merged with load above as post-increment +// cnt2 = cnt2 - 1; // calculate remaining length after first character is loaded +// bool endReached = cnt2 == 0; // kept in flags +// if (endReached) goto SHORT_LAST_INIT; // load 1 character from 2nd string to complete init and compare it with 1st string character +// cnt1 = LOAD_STR2_CHAR(str2); +// str2 = str2 + str2_chr_size; // merged with load above as post-increment +// goto SHORT_LOOP_START; // per-character loop entry point +// SHORT_LOOP: // per-character loop +// cnt2 = cnt2 - 1; // calculate remaining length +// endReached = cnt2 == 0; +// if (endReached) goto SHORT_LAST_INIT; +// SHORT_LOOP_START: // per-character loop entry point +// tmp2 = LOAD_STR1_CHAR(str1); +// rscratch1 = LOAD_STR2_CHAR(str2); +// bool differentResult = tmp1 != cnt1; // check difference of previously loaded pair of registers while last pair is still loading. Kept in flags +// if (differentResult) goto SHORT_LOOP_TAIL; // calculate character difference and return +// cnt2 = cnt2 - 1; // calculate remaining length +// endReached = cnt2 == 0; +// if (endReached) goto SHORT_LAST2; // last comparison of second pair of registers (tmp2, rscratch1) is left +// tmp1 = LOAD_STR1_CHAR(str1); +// cnt1 = LOAD_STR2_CHAR(str2); +// bool sameResult = tmp2 == rscratch1; // check difference of previously loaded pair of registers while last pair is still loading.
Kept in flags +// if (sameResult) goto SHORT_LOOP; +// result = tmp2 - rscratch1; +// goto DONE; +// SHORT_LAST2: // last comparison is left: (tmp2, rscratch1) +// sameResult = tmp2 == rscratch1; +// if (sameResult) goto DONE; +// result = tmp2 - rscratch1; +// goto DONE; +// SHORT_LAST_INIT: +// cnt1 = LOAD_STR2_CHAR(str2); +// SHORT_LAST: // last comparison of first pair of registers (tmp1, cnt1) is left +// sameResult = tmp1 == cnt1; +// if (sameResult) goto DONE; +// result = tmp1 - cnt1; +// DONE: +// return; // result + void MacroAssembler::string_compare(Register str1, Register str2, Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) { --- old/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp 2019-02-21 16:08:13.518967582 +0300 +++ new/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp 2019-02-21 16:08:13.278969355 +0300 @@ -4026,9 +4026,26 @@ return entry; } - // code for comparing 16 bytes of strings with same encoding + // Summary: part of string compareTo implementation. Called for code generation in multiple points. + // 1) load 8 bytes and advance pointers of both strings and compare + // previously loaded 8 bytes. jump to DIFF1 if different characters found + // 2) load 8 bytes and advance pointers of both strings and compare + // previously loaded 8 bytes. jump to DIFF2 if different characters found + // + // Input: + // str1 (r1): pointer for next load from 1st string + // cnt1 (r2): register to use for loading data from 2nd string + // str2 (r3): pointer for next load from 2nd string + // tmp1 (r10): already loaded 8 bytes of 1st string. + // tmp2 (r11): already loaded 8 bytes of 2nd string. + // + // Output: + // rscratch2: result of last comparison + // tmp1, tmp2: contains different parts of 1st and 2nd strings if exit via DIFF1 label.
Not used for normal and DIFF2 exits + // rscratch1, cnt1: contains different parts of 1st and 2nd strings if exit via DIFF2 label. Not used for normal and DIFF1 exits + // void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { - Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; + Register str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; __ ldr(rscratch1, Address(__ post(str1, 8))); __ eor(rscratch2, tmp1, tmp2); __ ldr(cnt1, Address(__ post(str2, 8))); @@ -4039,14 +4056,49 @@ __ cbnz(rscratch2, DIFF2); } - // code for comparing 16 characters of strings with Latin1 and Utf16 encoding + // Summary: part of string compare implementation. Called for code generation in multiple points. + // - expecting 4 UTF-16 string characters preloaded into tmp3 + // - load 16 characters from each string + // - convert Latin1 characters to UTF-16 + // - compare preloaded 4 characters with 4 first converted Latin1 characters + // - compare next 12 loaded and converted characters from each string + // - compared data is in tmpU and tmpL registers or in tmp3 and tmpL + // - in case different characters are found while comparing tmpU and + // tmpL, jumps to DIFF1. 
Jumps to DIFF2 in case different character + // was found while comparing tmp3 and tmpL + // - string pointers are increased by amount of loaded bytes + // + // Input: + // strUnext (r2): pointer for next load from UTF-16 string + // strLnext (r11): pointer for next load from Latin1 string + // tmp3 (r12): used to store parts of UTF-16 string + // vtmpZ (v0): zeroed register for conversion from Latin1 to UTF-16 + // + // Temporary registers: + // vtmp (v1): used to load 16 Latin1 characters and part of converted Latin1 string + // vtmp3 (v2): used for part of converted Latin1 string + // + // Output: rscratch2: result of last comparison + // tmpL: last compared part of converted Latin1 string + // tmpU: in case of exit via DIFF1 or normal exit: contains last compared part of UTF-16 string. + // Contains part of UTF-16 string compared before last comparison otherwise. + // tmp3: in case of exit via DIFF2: contains last compared part of UTF-16 string. + // in case of exit via DIFF1: contains part of UTF-16 string compared before last comparison.
+ // in case of normal exit: contains preloaded 8 bytes of UTF-16 string for next comparisons + // + // Parameters: + // tmpL: holds parts of converted Latin1 string + // tmpU: holds parts of UTF-16 string + // DIFF1: label to jump to if different characters are found in tmpU and tmpL + // DIFF2: label to jump to if different characters are found in tmp3 and tmpL + // void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, Label &DIFF2) { - Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; + Register strUnext = r2, tmp1 = r10, strLnext = r11, tmp3 = r12; FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; - __ ldrq(vtmp, Address(__ post(tmp2, 16))); - __ ldr(tmpU, Address(__ post(cnt1, 8))); + __ ldrq(vtmp, Address(__ post(strLnext, 16))); + __ ldr(tmpU, Address(__ post(strUnext, 8))); __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 @@ -4054,30 +4106,224 @@ __ eor(rscratch2, tmp3, tmpL); __ cbnz(rscratch2, DIFF2); - __ ldr(tmp3, Address(__ post(cnt1, 8))); + __ ldr(tmp3, Address(__ post(strUnext, 8))); __ umov(tmpL, vtmp3, __ D, 1); __ eor(rscratch2, tmpU, tmpL); __ cbnz(rscratch2, DIFF1); __ zip2(vtmp, __ T16B, vtmp, vtmpZ); - __ ldr(tmpU, Address(__ post(cnt1, 8))); + __ ldr(tmpU, Address(__ post(strUnext, 8))); __ fmovd(tmpL, vtmp); __ eor(rscratch2, tmp3, tmpL); __ cbnz(rscratch2, DIFF2); - __ ldr(tmp3, Address(__ post(cnt1, 8))); + __ ldr(tmp3, Address(__ post(strUnext, 8))); __ umov(tmpL, vtmp, __ D, 1); __ eor(rscratch2, tmpU, tmpL); __ cbnz(rscratch2, DIFF1); } - // r0 = result - // r1 = str1 - // r2 = cnt1 - // r3 = str2 - // r4 = cnt2 - // r10 = tmp1 - // r11 = tmp2 + // Summary: Compare long strings intrinsic implementation for different encodings. + // Comparison is performed in lexical order. 
+ // + // Prerequisites: string length >= 72 characters + // + // Input: result (r0): length difference + // str1 (r1): pointer to 1st string + // str2 (r3): pointer to 2nd string + // cnt1 (r2): number of characters in 1st string + // cnt2 (r4): minimum of str1 and str2 length. Used as counter + // tmp1 (r10): starting 8 bytes of 1st string for UTF-16 string + // tmp2 (r11): starting 8 bytes of 2nd string for UTF-16 string + // vtmpZ (v0): used to convert encodings by providing zero values + // vtmp (v1): starting bytes of Latin1 string. Also used as temporary register + // vtmp3 (v2): temporary register + // + // Temporary registers: + // rscratch1, rscratch2: clobbered on exit + // preloadedChunk (r12), smallLoopCounter (r14): pushed on stack, then restored on exit + // + // + // Output: result - return 0 if strings are equal. Returns positive value + // if 1st string > 2nd string in lexical order. Returns + // negative value if 1st string < 2nd string. + // + // Side effects: str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1, rscratch2: clobbered. + // + // Algorithm parameters: + // isLU: true if 1st string is Latin1. + // + // Calculated constants: + // largeLoopExitCondition: Exit condition for loop with prefetch. + // + // + // PSEUDO CODE: + // // Code below uses code block which: + // // - loads 16 Latin1 characters at once.
Then converts it to UTF-16 and move to GPR + // // - issues 4 smaller loads of 4 UTF-16 characters and for each load compare it with converted Latin1 characters + // // - smaller loads are using 2 different registers to break register dependencies + // // - jump to DIFF1 or DIFF2 label depending on which register has a character different from converted Latin1 character + // + // ; + // ; + // ; + // ; + // ; + // cnt2 = cnt2 - 4; // keep characters counter reduced by 4, because last 4 characters are compared separately + // if (SoftwarePrefetchHintDistance >= 0) { // need prefetch + // if (cnt2 < largeLoopExitCondition) goto NO_PREFETCH; // don't use loop with prefetch in case prefetch distance is too far away + // do { // 64-characters loop with prefetch. + // // Each iteration has 2 prefetch instructions for UTF-16 string and 1 for Latin1 string + // // contains 2-iterations loops (16 characters each) between prefetch instructions + // // to avoid huge code generation + // ; + // ; + // for (smallLoopCounter = 0; smallLoopCounter < 2; smallLoopCounter++) { + // ; + // } + // ; + // for (smallLoopCounter = 0; smallLoopCounter < 2; smallLoopCounter++) { + // ; + // } + // cnt2 = cnt2 - 64; // update counter by the number of loaded characters + // } while(cnt2 >= largeLoopExitCondition); + // } + // if (cnt2 == 0) goto LOAD_LAST; // load and compare last 4 characters + // NO_PREFETCH: + // if (; + // } while(); + // + // if (cnt2 == 0) goto LOAD_LAST; + // TAIL: + // ; + // + // ; + // goto LOAD_LAST; + // DIFF1: + // ; + // // fallthrough + // DIFF2: + // ; + // goto CALCULATE_DIFFERENCE; + // LOAD_LAST: + // ; + // ; + // if () return; // result = already calculated length difference + // CALCULATE_DIFFERENCE: + // result = ; + // DONE: + // return; + // + // + // + // + // + // DETAILED CODE: + // vtmpZ = 0; // used to convert encodings + // vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpZ); // implemented as zip1 instruction + // + // // update string pointers by the
number of loaded bytes + // str1 = str1 + (isLU ? wordSize/2 : wordSize); + // str2 = str2 + (isLU ? wordSize : wordSize/2); + // + // // copy converted string into GPR + // if (isLU) tmp1 = vtmp; + // else tmp2 = vtmp; + // + // cnt2 = cnt2 - 8; // reduce cnt2 by the number of already loaded characters. And reduce by 4 more characters + // str1 = str1 + cnt2 << (isLU ? 0 : 1); // address of 1st string last 4 characters + // rscratch2 = BIT_XOR(tmp1, tmp2); // begin loaded chunks comparison + // str2 = str2 + cnt2 << (isLU ? 1 : 0); // address of 2nd string last 4 characters + // rscratch1 = tmp2; // copy 2nd string chunk + // if (rscratch2 != 0) goto CALCULATE_DIFFERENCE; // found different character + // + // // several redefinitions below to have meaningful names + // void* strU = isLU ? str2 : str1; // UTF-16 string pointer to last 4 characters + // void* strL = isLU ? str1 : str2; // Latin1 string pointer to last 4 characters + // long tmpU = isLU ? rscratch1 : tmp1; // UTF-16 characters holder + // long tmpL = isLU ?
tmp1 : rscratch1; // Latin1 characters holder + // void* strLnext = tmp2; // Latin1 string pointer to load next character(s) + // void* strUnext = cnt1; // UTF-16 string pointer to load next character(s) + // + // PUSH_ON_STACK(preloadedChunk, smallLoopCounter); + // strLnext = strL - cnt2; // initialize pointer to Latin1 string next load + // strUnext = strU - cnt2 << 1; // initialize pointer to UTF-16 string next load + // + // preloadedChunk = LOAD8BYTES(strUnext, 8); // pre-load next 8 bytes of UTF-16 string + // strUnext = strUnext + 8; // merged with load above as post-increment + // + // if (SoftwarePrefetchHintDistance >= 0) { + // rscratch2 = cnt2 - prefetchLoopExitCondition; + // if (rscratch2 < 0) goto NO_PREFETCH; + // LARGE_LOOP_PREFETCH: // 64-characters loop + // PREFETCH(strLnext, SoftwarePrefetchHintDistance); + // smallLoopCounter = 2; // initialize inner loop counter + // PREFETCH(strUnext, SoftwarePrefetchHintDistance); + // LARGE_LOOP_PREFETCH_REPEAT1: { // 16 characters inner loop with 2 iterations + // compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // see compare_string_16_x_LU comments + // smallLoopCounter--; + // bool smallLoopRepeat = (smallLoopCounter > 0); // kept in flags + // if (smallLoopRepeat) goto LARGE_LOOP_PREFETCH_REPEAT1; + // } + // PREFETCH(strUnext, SoftwarePrefetchHintDistance); + // smallLoopCounter = 2; // initialize inner loop counter + // LARGE_LOOP_PREFETCH_REPEAT2: { // one more 16 characters inner loop with 2 iterations + // compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // see compare_string_16_x_LU comments + // smallLoopCounter--; + // bool smallLoopRepeat = (smallLoopCounter > 0); // kept in flags + // if (smallLoopRepeat) goto LARGE_LOOP_PREFETCH_REPEAT2; + // } + // cnt2 = cnt2 - 64; + // rscratch2 = cnt2 - prefetchLoopExitCondition; + // if (rscratch2 >= 0) goto LARGE_LOOP_PREFETCH; + // } // end of 64-characters loop + // + // if (cnt2 == 0) goto LOAD_LAST; // no more characters left except last 4 
characters reserved earlier + // NO_PREFETCH: // all further loads don't require prefetch instruction + // cnt2 = cnt2 - 16; // keep cnt2 counter reduced by 16 + // if (cnt2 < 0) goto TAIL; // less than 16 characters left to load until last 4 reserved characters + // SMALL_LOOP: // 16-characters loop + // compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); + // cnt2 = cnt2 - 16; // decrement counter by previously loaded 16 characters + // bool repeatSmallLoop = cnt2 >= 0; // kept in flags + // if (repeatSmallLoop) goto SMALL_LOOP; + // if (cnt2 == -16) goto LOAD_LAST; + // TAIL: + // strUnext = strUnext + cnt2 << 1; // pointer to UTF-16 last 16 characters + 8 bytes + // strLnext = strLnext + cnt2; // pointer to Latin1 last 16 characters + // preloadedChunk = LOAD8BYTES(strUnext, -8); + // compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); + // goto LOAD_LAST; + // DIFF2: + // tmpU = preloadedChunk; + // DIFF1: + // pop(preloadedChunk, smallLoopCounter); + // goto CALCULATE_DIFFERENCE; + // LOAD_LAST: + // tmpU = preloadedChunk; // already loaded last 4 UTF-16 characters. Just copy to required register + // pop(preloadedChunk, smallLoopCounter); + // vtmp = LOAD4BYTES(strL); + // vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpZ); + // tmpL = vtmp; + // rscratch2 = BIT_XOR(tmpU, tmpL); + // if (rscratch2 == 0) goto DONE; + // CALCULATE_DIFFERENCE: + // // No count trailing zeroes instruction is available. Reverse bits and count leading zeroes instead.
+ // rscratch2 = REVERSE_BITS(rscratch2); + // rscratch2 = COUNT_LEADING_ZEROES(rscratch2); + // rscratch2 = rscratch2 & -16; // clear lowest 4 bits to have number of bits until different character + // tmp1 = tmp1 >> rscratch2; // shift off same symbols from 1st string data + // tmp1 = UNSIGNED_EXTEND_SHORT2INT(tmp1); // only first different symbol remains in 1st string data + // rscratch1 = rscratch1 >> rscratch2; // shift off same symbols from 2nd string data + // rscratch1 = UNSIGNED_EXTEND_SHORT2INT(rscratch1); // only first different symbol remains in 2nd string data + // result = tmp1 - rscratch1; // character difference + // DONE: + // return; address generate_compare_long_string_different_encoding(bool isLU) { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", isLU @@ -4088,9 +4334,9 @@ DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, - tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; + tmp1 = r10, tmp2 = r11, preloadedChunk = r12, smallLoopCounter = r14; FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; - RegSet spilled_regs = RegSet::of(tmp3, tmp4); + RegSet spilled_regs = RegSet::of(preloadedChunk, smallLoopCounter); int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2); @@ -4110,29 +4356,32 @@ Register strU = isLU ? str2 : str1, strL = isLU ? str1 : str2, tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison - tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison + tmpL = isLU ? 
tmp1 : rscratch1, // where to keep L for comparison + strLnext = tmp2, + strUnext = cnt1; __ push(spilled_regs, sp); - __ sub(tmp2, strL, cnt2); // strL pointer to load from - __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from + __ sub(strLnext, strL, cnt2); // strL pointer to load from + __ sub(strUnext, strU, cnt2, __ LSL, 1); // strU pointer to load from - __ ldr(tmp3, Address(__ post(cnt1, 8))); + // safe to read ahead 4 characters, because string length >= 72 characters + __ ldr(preloadedChunk, Address(__ post(strUnext, 8))); if (SoftwarePrefetchHintDistance >= 0) { __ subs(rscratch2, cnt2, prefetchLoopExitCondition); __ br(__ LT, NO_PREFETCH); __ bind(LARGE_LOOP_PREFETCH); - __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); - __ mov(tmp4, 2); - __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); + __ prfm(Address(strLnext, SoftwarePrefetchHintDistance)); + __ mov(smallLoopCounter, 2); + __ prfm(Address(strUnext, SoftwarePrefetchHintDistance)); __ bind(LARGE_LOOP_PREFETCH_REPEAT1); compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); - __ subs(tmp4, tmp4, 1); + __ subs(smallLoopCounter, smallLoopCounter, 1); __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); - __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); - __ mov(tmp4, 2); + __ prfm(Address(strUnext, SoftwarePrefetchHintDistance)); + __ mov(smallLoopCounter, 2); __ bind(LARGE_LOOP_PREFETCH_REPEAT2); compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); - __ subs(tmp4, tmp4, 1); + __ subs(smallLoopCounter, smallLoopCounter, 1); __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); __ sub(cnt2, cnt2, 64); __ subs(rscratch2, cnt2, prefetchLoopExitCondition); @@ -4140,6 +4389,9 @@ } __ cbz(cnt2, LOAD_LAST); // no characters left except last load __ bind(NO_PREFETCH); + // Load and compare cnt2 characters using 16 characters loop with + // compare_string_16_x_LU primitive. 
In case 1..15 characters left: + // use same compare_string_16_x_LU primitive with partial overlapping __ subs(cnt2, cnt2, 16); __ br(__ LT, TAIL); __ bind(SMALL_LOOP); // smaller loop @@ -4149,20 +4401,20 @@ __ cmn(cnt2, (u1)16); __ br(__ EQ, LOAD_LAST); __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) - __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string - __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string - __ ldr(tmp3, Address(cnt1, -8)); + __ add(strUnext, strUnext, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string + __ add(strLnext, strLnext, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string + __ ldr(preloadedChunk, Address(strUnext, -8)); compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load __ b(LOAD_LAST); __ bind(DIFF2); - __ mov(tmpU, tmp3); + __ mov(tmpU, preloadedChunk); __ bind(DIFF1); __ pop(spilled_regs, sp); __ b(CALCULATE_DIFFERENCE); __ bind(LOAD_LAST); - // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. + // Last 4 UTF-16 characters are already pre-loaded into preloadedChunk by compare_string_16_x_LU. // No need to load it again - __ mov(tmpU, tmp3); + __ mov(tmpU, preloadedChunk); __ pop(spilled_regs, sp); __ ldrs(vtmp, Address(strL)); @@ -4188,14 +4440,171 @@ return entry; } - // r0 = result - // r1 = str1 - // r2 = cnt1 - // r3 = str2 - // r4 = cnt2 - // r10 = tmp1 - // r11 = tmp2 + // Summary: Compare long strings intrinsic implementation for same encodings. + // Comparison is performed in lexical order. + // + // Prerequisites: string length >= 72 characters + // + // Input: result (r0): length difference + // str1 (r1): pointer to 1st string + // str2 (r3): pointer to 2nd string + // cnt1 (r2): amount of characters in 1st string + // cnt2 (r4): minimum of str1 and str2 length.
Used as counter + // tmp1 (r10): starting 8 bytes of 1st string + // tmp2 (r11): starting 8 bytes of 2nd string + // + // Temporary registers: + // rscratch1, rscratch2 + // + // + // Output: result - return 0 if strings are equal. Returns positive value + // if 1st string > 2nd string in lexical order. Returns + // negative value if 1st string < 2nd string. + // + // Side effects: str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1, rscratch2: clobbered. + // + // Algorithm parameters: + // isLL: true if both strings are Latin1. false if both are UTF-16. + // Used to generate code for both Latin1 - Latin1 (LL) case and + // UTF-16 - UTF-16 (UU) case. + // Calculated constants: + // largeLoopExitCondition: MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); + // Exit condition for loop with prefetch. + // characters_in_word: isLL ? 8 : 4 + // Number of characters that fit in a word (1 register) + // characters_in_dword: 2 * characters_in_word + // Number of characters that fit in a double word (2 registers) + // byte_to_char_shift: isLL ? 0 : 1 + // Shift value to convert between byte and character counters + // + // PSEUDO CODE: + // // Code below uses a code block (compare_string_16_bytes_same in DETAILED CODE), which: + // // - compares already loaded tmp1 and tmp2. goto DIFF label if it's not equal + // // - loads and compares next 8 bytes of both strings (stored in rscratch1 and cnt1).
goto DIFF2 label if it's not equal + // // - loads next 8 bytes of both strings into tmp1 and tmp2 + // // - on each load string pointers are updated to point at character after loaded block + // + // account for the 8 bytes already loaded: advance str1 and str2, reduce cnt2; + // if (SoftwarePrefetchHintDistance >= 0) { // need prefetch + // do { // 64-byte loop with prefetch + // prefetch str1 data; + // prefetch str2 data; + // compare 16 bytes using the code block above; + // compare 16 bytes using the code block above; + // cnt2 = cnt2 - (8 * characters_in_word); // update counter by amount of characters compared per iteration + // compare 16 bytes using the code block above; + // compare 16 bytes using the code block above; + // } while(cnt2 >= largeLoopExitCondition); + // } + // if (cnt2 == 0) goto LAST_CHECK; + // while(cnt2 > characters_in_dword) { + // compare 16 bytes using the code block above; + // cnt2 = cnt2 - characters_in_dword; + // } + // move str1 and str2 to point at the last 16 bytes to compare; + // preload the 8 bytes before these pointers into tmp1 and tmp2; + // compare 16 bytes using the code block above; + // goto LAST_CHECK; + // DIFF2: + // move chunks loaded into rscratch1 and cnt1 to tmp1 and tmp2; + // // fallthrough to DIFF + // DIFF: + // calculate difference of first different characters in tmp1 and tmp2 into result; + // goto DONE; + // LAST_CHECK: // label to jump to when last chunk of data has to be checked. + // // Return character difference if different characters are found. + // // Return length difference (already calculated) otherwise. + // if (tmp1 != tmp2) goto DIFF; + // DONE: + // return; + // + // + // DETAILED CODE: + // // N.B.: compare_string_16_bytes_same and call below is not + // // an actual call at runtime. It is called at code generation time. + // + // cnt2 = cnt2 - characters_in_word; + // str1 = str1 + wordSize; + // str2 = str2 + wordSize; + // if (SoftwarePrefetchHintDistance >= 0) { + // LARGE_LOOP_PREFETCH: + // LOAD_PREFETCH(str1, SoftwarePrefetchHintDistance); + // LOAD_PREFETCH(str2, SoftwarePrefetchHintDistance); + // compare_string_16_bytes_same(DIFF, DIFF2); + // compare_string_16_bytes_same(DIFF, DIFF2); + // cnt2 = cnt2 - 8 * characters_in_word; + // compare_string_16_bytes_same(DIFF, DIFF2); + // rscratch2 = cnt2 - largeLoopExitCondition; // rscratch2 is not used.
Use subs instead of cmp in case of potentially large constants + // bool canLoop = rscratch2 > 0; // kept in flags + // compare_string_16_bytes_same(DIFF, DIFF2); + // if (canLoop) goto LARGE_LOOP_PREFETCH; + // if (cnt2 == 0) goto LAST_CHECK; // no more loads left + // } + // + // cnt2 = cnt2 - characters_in_dword; // keep cnt2 counter reduced by 16 (LL) or 8 (UU) + // bool lessThan16bytesLeft = cnt2 < 0; // kept in flags + // if (lessThan16bytesLeft) goto TAIL; + // SMALL_LOOP: // 16 byte loop + // compare_string_16_bytes_same(DIFF, DIFF2); + // cnt2 = cnt2 - characters_in_dword; + // bool canLoop = cnt2 >= 0; + // if (canLoop) goto SMALL_LOOP; + // bool lastCheckLeft = cnt2 == -characters_in_dword; + // if (lastCheckLeft) goto LAST_CHECK; + // TAIL: // less than 16 bytes left to load. And 8 bytes were loaded but not + // // compared. Reuse compare_string_16_bytes_same primitive. Handle last + // // 24 string bytes by preloading first 8 of these 24 bytes, then use + // // this primitive. And then compare last 8 bytes loaded by it at LAST_CHECK. + // // This will partially overlap with previous load and comparison, but + // // makes code simpler + // str1 = str1 + (cnt2 << byte_to_char_shift); + // str2 = str2 + (cnt2 << byte_to_char_shift); + // tmp1 = LOAD8BYTES(str1, -8); + // tmp2 = LOAD8BYTES(str2, -8); + // compare_string_16_bytes_same(DIFF, DIFF2); + // goto LAST_CHECK; + // DIFF2: // calculate character difference, when data stored in rscratch1 and cnt1 + // // move loaded chunks to tmp1 and tmp2 registers to use in DIFF block + // tmp1 = rscratch1; + // tmp2 = cnt1; + // // fallthrough to DIFF + // DIFF: // calculate character difference, when data stored in tmp1 and tmp2 + // // and find different characters. rscratch2 contains zeroes at positions with + // // same characters. Find index of first different bit (== amount of + // // trailing zeroes), which is: (number of same characters) * (bits per character) + // // + (index of first different bit within character).
Then, clearing bits within character + // // (3 lowest bits for Latin1 case and 4 lowest bits for UTF-16 case) + // // will result in the number of bits until different character in current chunks. + // + // // As it's not possible to count trailing zeroes, reverse bits and count leading zeroes + // rscratch2 = REVERSE_BITS(rscratch2); + // rscratch2 = COUNT_LEADING_ZEROES(rscratch2); + // rscratch2 = rscratch2 & (isLL ? -8 : -16); // clear lowest 3 (Latin1) or 4 (UTF-16) bits + // tmp1 = tmp1 >> rscratch2; // shift off same characters from 1st string chunk + // tmp2 = tmp2 >> rscratch2; // shift off same characters from 2nd string chunk + // + // // Only first character should be left for comparison. Use unsigned extend instruction for that + // if (isLL) { + // tmp1 = UNSIGNED_EXTEND_BYTE2INT(tmp1); + // tmp2 = UNSIGNED_EXTEND_BYTE2INT(tmp2); + // } else { + // tmp1 = UNSIGNED_EXTEND_SHORT2INT(tmp1); + // tmp2 = UNSIGNED_EXTEND_SHORT2INT(tmp2); + // } + // + // result = tmp1 - tmp2; + // goto DONE; + // LAST_CHECK: + // rscratch2 = BIT_XOR(tmp1, tmp2); + // if (rscratch2 != 0) goto DIFF; + // DONE: + // return result; + address generate_compare_long_string_same_encoding(bool isLL) { + const int characters_in_word = isLL ? 8 : 4; + const int characters_in_dword = 2 * characters_in_word; + const int byte_to_char_shift = isLL ? 0 : 1; __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", isLL ?
"compare_long_string_same_encoding LL" @@ -4203,15 +4612,13 @@ address entry = __ pc(); Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, tmp1 = r10, tmp2 = r11; - Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, - LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, - DIFF_LAST_POSITION, DIFF_LAST_POSITION2; + Label SMALL_LOOP, LARGE_LOOP_PREFETCH, DIFF2, TAIL, DONE, DIFF, LAST_CHECK; // exit from large loop when less than 64 bytes left to read or we're about // to prefetch memory behind array border int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used // update cnt2 counter with already loaded 8 bytes - __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); + __ sub(cnt2, cnt2, characters_in_word); // update pointers, because of previous read __ add(str1, str1, wordSize); __ add(str2, str2, wordSize); @@ -4221,58 +4628,32 @@ __ prfm(Address(str2, SoftwarePrefetchHintDistance)); compare_string_16_bytes_same(DIFF, DIFF2); compare_string_16_bytes_same(DIFF, DIFF2); - __ sub(cnt2, cnt2, isLL ? 64 : 32); + __ sub(cnt2, cnt2, 8 * characters_in_word); compare_string_16_bytes_same(DIFF, DIFF2); __ subs(rscratch2, cnt2, largeLoopExitCondition); compare_string_16_bytes_same(DIFF, DIFF2); __ br(__ GT, LARGE_LOOP_PREFETCH); - __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? + __ cbz(cnt2, LAST_CHECK); // Check if no more chars left } - // less than 16 bytes left? - __ subs(cnt2, cnt2, isLL ? 16 : 8); + __ subs(cnt2, cnt2, characters_in_dword); // keep number of characters reduced by 16 (LL) or 8 (UU) __ br(__ LT, TAIL); __ bind(SMALL_LOOP); compare_string_16_bytes_same(DIFF, DIFF2); - __ subs(cnt2, cnt2, isLL ? 16 : 8); + __ subs(cnt2, cnt2, characters_in_dword); __ br(__ GE, SMALL_LOOP); + __ cmn(cnt2, (u1)(characters_in_dword)); + __ br(__ EQ, LAST_CHECK); __ bind(TAIL); - __ adds(cnt2, cnt2, isLL ? 
16 : 8); - __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); - __ subs(cnt2, cnt2, isLL ? 8 : 4); - __ br(__ LE, CHECK_LAST); - __ eor(rscratch2, tmp1, tmp2); - __ cbnz(rscratch2, DIFF); - __ ldr(tmp1, Address(__ post(str1, 8))); - __ ldr(tmp2, Address(__ post(str2, 8))); - __ sub(cnt2, cnt2, isLL ? 8 : 4); - __ bind(CHECK_LAST); - if (!isLL) { - __ add(cnt2, cnt2, cnt2); // now in bytes - } - __ eor(rscratch2, tmp1, tmp2); - __ cbnz(rscratch2, DIFF); - __ ldr(rscratch1, Address(str1, cnt2)); - __ ldr(cnt1, Address(str2, cnt2)); - __ eor(rscratch2, rscratch1, cnt1); - __ cbz(rscratch2, LENGTH_DIFF); - // Find the first different characters in the longwords and - // compute their difference. + __ add(str1, str1, cnt2, __ LSL, byte_to_char_shift); // points to last 16 bytes to compare + __ add(str2, str2, cnt2, __ LSL, byte_to_char_shift); // points to last 16 bytes to compare + __ ldr(tmp1, Address(str1, -8)); // preload 8 bytes before current pointer + __ ldr(tmp2, Address(str2, -8)); // preload 8 bytes before current pointer + compare_string_16_bytes_same(DIFF, DIFF2); + __ b(LAST_CHECK); __ bind(DIFF2); - __ rev(rscratch2, rscratch2); - __ clz(rscratch2, rscratch2); - __ andr(rscratch2, rscratch2, isLL ? -8 : -16); - __ lsrv(rscratch1, rscratch1, rscratch2); - if (isLL) { - __ lsrv(cnt1, cnt1, rscratch2); - __ uxtbw(rscratch1, rscratch1); - __ uxtbw(cnt1, cnt1); - } else { - __ lsrv(cnt1, cnt1, rscratch2); - __ uxthw(rscratch1, rscratch1); - __ uxthw(cnt1, cnt1); - } - __ subw(result, rscratch1, cnt1); - __ b(LENGTH_DIFF); + __ mov(tmp1, rscratch1); + __ mov(tmp2, cnt1); + // fallthrough to DIFF __ bind(DIFF); __ rev(rscratch2, rscratch2); __ clz(rscratch2, rscratch2); @@ -4288,11 +4669,11 @@ __ uxthw(tmp2, tmp2); } __ subw(result, tmp1, tmp2); - __ b(LENGTH_DIFF); - __ bind(LAST_CHECK_AND_LENGTH_DIFF); + __ b(DONE); + __ bind(LAST_CHECK); __ eor(rscratch2, tmp1, tmp2); __ cbnz(rscratch2, DIFF); - __ bind(LENGTH_DIFF); + __ bind(DONE); __ ret(lr); return entry; }