--- old/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	2019-02-21 16:08:13.046971067 +0300
+++ new/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	2019-02-21 16:08:12.830972662 +0300
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -4861,7 +4861,331 @@
   BIND(DONE);
 }
 
-// Compare strings.
+// Summary: Compare strings intrinsic implementation. All combinations of UTF-16
+//          and Latin1 encodings for both strings are considered. Comparison
+//          is performed in lexical order.
+//
+// Input:   str1: pointer to 1st string
+//          str2: pointer to 2nd string
+//          cnt1: number of bytes in 1st string
+//          cnt2: number of bytes in 2nd string
+//
+// Algorithm parameter:
+//          ae: encodings used in 1st and 2nd strings
+//
+// Temporary registers:
+//          tmp1, tmp2, rscratch1, rscratch2: always used
+//          vtmp1, vtmp2, vtmp3: used in case encodings are different
+//
+// Output:  result - return 0 if strings are equal. Returns positive value
+//          if 1st string > 2nd string in lexical order. Returns
+//          negative value if 1st string < 2nd string.
+//
+// Side effects: str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1, rscratch2: clobbered.
+//               vtmp1, vtmp2, vtmp3: clobbered if encodings are different
+//
+// Additional data: boolean values: isLL, isLU, isUL, str1_isL, str2_isL and
+// int minCharInWords are derived from ae parameter based on encodings used
+// in strings. Different code is generated depending on these values:
+//
+// isLL = both strings are Latin1
+// isLU = 1st string is Latin1, 2nd string is UTF-16
+// isUL = 1st string is UTF-16, 2nd string is Latin1
+// str1_isL = 1st string is Latin1
+// str2_isL = 2nd string is Latin1
+// str1_chr_shift = shift value to convert a character counter to a byte counter for 1st string
+// str2_chr_shift = shift value to convert a character counter to a byte counter for 2nd string
+// minCharInWords = minimum number of characters that fit in register (8 for LL case, 4 otherwise)
+//
+//
+// PSEUDO CODE:
+//
+// // N.B.: this pseudo-code doesn't strictly follow implementation details.
+// // It is here to help understand the basics. Detailed implementation
+// // description is listed after this code.
+//
+// <convert byte counters cnt1, cnt2 into character counters if UTF-16 encoding is used>;
+// result = cnt1 - cnt2; // length difference. Used if all min(cnt1, cnt2) characters are the same
+// cnt2 = min(cnt1, cnt2); // amount of characters to check
+// if (cnt2 <= minCharInWords) { // <= wordSize bytes should be loaded for comparison
+//   if (cnt2 == 0) return result;
+//   while (cnt2 != 0) {
+//     char str1char = str1[0];
+//     char str2char = str2[0];
+//     str1 += 1 << str1_chr_shift; // advance pointer by size of str1 character
+//     str2 += 1 << str2_chr_shift; // advance pointer by size of str2 character
+//     if (str1char != str2char) return str1char - str2char;
+//     cnt2--;
+//   }
+// } else { // > wordSize bytes should be loaded for comparison
+//     // This code checks string in 8-byte blocks. If encodings are
+//     // different, Latin1 string will be loaded via 4-byte blocks and then
+//     // each block will be converted to 8-byte UTF-16 equivalent. Then 8 byte
+//     // blocks are compared. Each load is 8 characters for LL case and 4
+//     // characters for LU/UL/UU.
+//     // This set of instructions (load 8 Latin1 characters OR load 4 Latin1
+//     // characters and convert them to 4 UTF-16 characters OR load 4 UTF-16
+//     // characters) is referred to as <load-and-probably-convert ...> below.
+//
+//     // First iteration in the loop is unrolled to add initialization.
+//
+//     // The code below calculates addresses of each string last load: addresses
+//     // of last 8 characters for LL case and last 4 characters otherwise.
+//     // Then offsets from the addresses to the beginning of the strings are
+//     // calculated. Offset is then used as loop counter. When offset is >= 0, then
+//     // only last loads (possibly overlapping) are left to be checked.
+//     // N.B.: in case of same encodings, offsets are the same for both strings.
+//     // Then offset for 2nd string is used for both strings.
+//
+//     tmp1 = <load-and-probably-convert str1>;
+//     if (str1 == str2) return result;
+//     tmp2 = <load-and-probably-convert str2>;
+//
+//     // use special implementation optimized for large strings. See detailed code and stub comments.
+//     if (cnt2 >= 72) return compare_long_string_implementation(<args>);
+//
+//     cnt2 -= <amount of loaded characters>; // 8 for isLL case. 4 otherwise.
+//
+//     if (str1_isL == str2_isL) {
+//       // Optional optimization for same encoding cases. Can be applied for all
+//       // cases, but is faster in same encoding cases only. Without this branch
+//       // smallest string (8 characters for LL and 4 characters for others) would
+//       // be checked twice.
+//       if (cnt2 == 0) goto TAIL_CHECK; // no more characters to be loaded. Just check already loaded data.
+//     }
+//
+//     // calculate addresses of last loads. use str1 and str2 pointers for that
+//     str1 = str1 + (cnt2 << str1_chr_shift);
+//     str2 = str2 + (cnt2 << str2_chr_shift);
+//
+//     // calculate offsets for both strings. cnt1 and cnt2 can be reused
+//     if (str1_isL != str2_isL) cnt1 = - (cnt2 << str1_chr_shift);
+//     cnt2 = - (cnt2 << str2_chr_shift);
+//
+//     // increment calculated offsets by the number of already loaded bytes
+//     if (isLU) cnt1 += 4;
+//     if (isUL) cnt1 += 8;
+//     cnt2 += isUL ? 4 : 8;
+//
+//     if (cnt2 >= 0) goto TAIL; // only last loads remain. Still need to check currently loaded data.
+//
+//     rscratch2 = tmp1 BIT_XOR tmp2;
+//     if (rscratch2 != 0) goto DIFFERENCE;
+//
+//     // main loop. Label = NEXT_WORD
+//     do {
+//       tmp1 = <load-and-probably-convert str1 at offset of (str1_isL == str2_isL ? cnt2 : cnt1)>;
+//       tmp2 = <load-and-probably-convert str2 at offset of cnt2>;
+//
+//       // update offsets by the number of loaded bytes
+//       cnt2 += isUL ? 4 : 8;
+//       if (isLU) cnt1 += 4;
+//       if (isUL) cnt1 += 8;
+//
+//       if (cnt2 >= 0) goto TAIL; // last block left to be loaded. Still need to check currently loaded block.
+//       rscratch2 = tmp1 BIT_XOR tmp2;
+//     } while (rscratch2 == 0);
+//     goto DIFFERENCE;
+//
+//   TAIL: // last block left to be loaded. Still need to check currently loaded block.
+//     rscratch2 = tmp1 BIT_XOR tmp2;
+//     if (rscratch2 != 0) goto DIFFERENCE;
+//     tmp1 = <load-and-probably-convert str1>;
+//     tmp2 = <load-and-probably-convert str2>;
+//     // fallthrough to TAIL_CHECK
+//   TAIL_CHECK:
+//     rscratch2 = tmp1 BIT_XOR tmp2;
+//     if (rscratch2 == 0) return result;
+//   DIFFERENCE: // different character found. Find it and compute difference
+//     // tmp1 and tmp2 have current data with at least 1 different character.
+//     // Find index of first such character.
+//     rscratch2 = REVERSE_BITS(rscratch2);
+//     rscratch2 = COUNT_LEADING_ZEROES(rscratch2); // position of different bit in current 8 bytes
+//     rscratch2 = rscratch2 & (isLL ? -8 : -16); // number of bits until (possibly converted) different characters in tmp1 and tmp2
+//     tmp1 = tmp1 >> rscratch2; // now first character in tmp1 is the one sought for
+//     tmp1 = tmp1 & (isLL ? 0xFF : 0xFFFF); // only first different character left
+//     tmp2 = tmp2 >> rscratch2; // now first character in tmp2 is the one sought for
+//     tmp2 = tmp2 & (isLL ? 0xFF : 0xFFFF); // only first different character left
+//     result = tmp1 - tmp2;
+// }
+// return result;
+//
+//
+//
+// DETAILED CODE:
+//
+//  if (!str1_isL) cnt1 = cnt1 >> 1;                // counter for 1st string (in characters)
+//  if (!str2_isL) cnt2 = cnt2 >> 1;                // counter for 2nd string (in characters)
+//  result = cnt1 - cnt2;                           // keep in flags the result of operation
+//  cnt2 = min(cnt1, cnt2);                         // implemented as csel instruction using stored flag value above
+//  bool shortStringsCase = cnt2 <= minCharInWords; // kept in flag
+//  if (shortStringsCase) goto SHORT_STRING;        // separate code for short strings
+//  if (str1_isL == str2_isL) {                     // same encoding case
+//    tmp1 = LOAD8BYTES(str1);
+//    bool sameString = str1 == str2;               // kept in flags
+//    if (sameString) goto DONE;                    // the string is the same, return
+//    tmp2 = LOAD8BYTES(str2);
+//    bool largeStrings = cnt2 >= 72;               // kept in flags
+//    if (largeStrings) goto STUB;                  // handled in separate stub implementation for large strings
+//    cnt2 = cnt2 - minCharsInWord;                 // decrement counter by the number of loaded characters
+//    bool noMoreLoadsAvailable = cnt2 == 0;        // kept in flags
+//    if (noMoreLoadsAvailable) goto TAIL_CHECK;
+//    str2 = str2 + (cnt2 << str2_chr_shift);       // address of str2 last load
+//    str1 = str1 + (cnt2 << str1_chr_shift);       // address of str1 last load
+//    cnt2 = -(cnt2 << str2_chr_shift);             // byte offset to 1st character in each string
+//  } else if (isLU) {
+//    vtmp = LOAD4BYTES(str1);
+//    bool sameString = str1 == str2;               // kept in flags
+//    if (sameString) goto DONE;                    // return
+//    tmp2 = LOAD8BYTES(str2);
+//    bool largeStrings = cnt2 >= 72;               // kept in flags
+//    if (largeStrings) goto STUB;                  // handled in separate stub implementation for large strings
+//    cnt2 = cnt2 - 4;                              // decrement counter by the number of loaded characters
+//    vtmpz = 0;                                    // implemented as eor
+//    str1 = str1 + (cnt2 << str1_chr_shift);       // address of str1 last load
+//    str2 = str2 + (cnt2 << str2_chr_shift);       // address of str2 last load
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);  // convert Latin1 to UTF16 because it'll be compared with UTF16. Implemented as zip instruction
+//    cnt1 = -(cnt2 << str1_chr_shift);             // byte offset to 1st character in 1st string
+//    cnt2 = -(cnt2 << str2_chr_shift);             // byte offset to 1st character in 2nd string
+//    cnt1 = cnt1 + 4;                              // advance 1st string offset by the number of loaded bytes
+//    tmp1 = vtmp;                                  // move converted characters from FPU register to GPR
+//  } else { // UL
+//    tmp1 = LOAD8BYTES(str1);
+//    bool sameString = str1 == str2;               // kept in flags
+//    if (sameString) goto DONE;                    // return
+//    vtmp = LOAD4BYTES(str2);
+//    bool largeStrings = cnt2 >= 72;               // kept in flags
+//    if (largeStrings) goto STUB;                  // separate stub implementation for large strings
+//    cnt2 = cnt2 - 4;                              // update counter by the number of loaded characters
+//    str1 = str1 + (cnt2 << str1_chr_shift);       // address of str1 last load
+//    vtmpz = 0;                                    // implemented as eor
+//    str2 = str2 + (cnt2 << str2_chr_shift);       // address of str2 last load
+//    cnt1 = -(cnt2 << str1_chr_shift);             // byte offset to 1st character in 1st string
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);  // convert Latin1 to UTF16 because it'll be compared with UTF16. implemented as zip instruction
+//    cnt2 = -(cnt2 << str2_chr_shift);             // byte offset to 1st character in 2nd string
+//    cnt1 = cnt1 + 8;                              // advance 1st string offset by the number of loaded bytes
+//    tmp2 = vtmp;                                  // move converted characters from FPU register to GPR
+//  }
+//  cnt2 = cnt2 + (isUL ? 4 : 8);                   // update offset by the number of loaded bytes
+//  bool onlyLastLoadRemains = cnt2 >= 0;           // kept in flags
+//  if (onlyLastLoadRemains) goto TAIL;
+//  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
+//  if (rscratch2 != 0) goto DIFFERENCE;            // found different characters in current block
+// NEXT_WORD:                                       // main loop
+//  // implementation for each encoding loads 4 or 8 characters at calculated
+//  // offsets from each string and convert encodings if necessary. Then offsets
+//  // are updated.
+//  if (str1_isL == str2_isL) {
+//    tmp1 = LOAD8BYTES(str1, cnt2);
+//    tmp2 = LOAD8BYTES(str2, cnt2);
+//    cnt2 = cnt2 + 8;                              // update counter by the number of loaded bytes
+//    onlyLastLoadRemains = cnt2 >= 0;              // kept in flags
+//  } else if (isLU) {
+//    vtmp = LOAD4BYTES(str1, cnt1);
+//    tmp2 = LOAD8BYTES(str2, cnt2);
+//    cnt1 = cnt1 + 4;
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
+//    tmp1 = vtmp;
+//    cnt2 = cnt2 + 8;
+//    onlyLastLoadRemains = cnt2 >= 0;              // kept in flags
+//  } else { // UL
+//    vtmp = LOAD4BYTES(str2, cnt2);
+//    tmp1 = LOAD8BYTES(str1, cnt1);
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
+//    cnt1 = cnt1 + 8;
+//    tmp2 = vtmp;
+//    cnt2 = cnt2 + 4;
+//    onlyLastLoadRemains = cnt2 >= 0;              // kept in flags
+//  }
+//  if (onlyLastLoadRemains) goto TAIL;
+//  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
+//  if (rscratch2 == 0) goto NEXT_WORD;
+//  goto DIFFERENCE;
+// TAIL: // check already loaded data and last load
+//  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
+//  if (rscratch2 != 0) goto DIFFERENCE;
+//
+//  // last load (and convert if needed) from each string
+//  if (str1_isL == str2_isL) {
+//    tmp1 = LOAD8BYTES(str1);
+//    tmp2 = LOAD8BYTES(str2);
+//  } else if (isLU) {
+//    vtmp = LOAD4BYTES(str1);
+//    tmp2 = LOAD8BYTES(str2);
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
+//    tmp1 = vtmp;
+//  } else { // UL
+//    vtmp = LOAD4BYTES(str2);
+//    tmp1 = LOAD8BYTES(str1);
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
+//    tmp2 = vtmp;
+//  }
+// TAIL_CHECK:                                      // last check
+//  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
+//  if (rscratch2 == 0) goto DONE;                  // return
+// DIFFERENCE:
+//  rscratch2 = REVERSE_BITS(rscratch2);            // It's not possible to count trailing zeroes. Reverse bits and then count leading zeroes instead.
+//  rscratch2 = COUNT_LEADING_ZEROES(rscratch2);    // position of different bit in current 8 bytes
+//  rscratch2 = rscratch2 & (isLL ? -8 : -16);      // number of bits until (possibly converted) different characters in tmp1 and tmp2
+//  tmp1 = tmp1 >> rscratch2;                       // first character in tmp1 is the one sought for
+//  tmp1 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp1) : UNSIGNED_EXTEND_SHORT2INT(tmp1); // only first different character left
+//  tmp2 = tmp2 >> rscratch2;                       // first character in tmp2 is the one sought for
+//  tmp2 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp2) : UNSIGNED_EXTEND_SHORT2INT(tmp2); // only first different character left
+//  result = tmp1 - tmp2;
+//  goto DONE;
+//
+//
+// STUB:
+//  <get address of compare_long_string_[LL|UU|LU|UL] stub routine and call it>
+//  goto DONE;
+//
+// // Short strings comparison code. Instead of simple per-character loop with
+// // load-and-compare code it uses a loop that issues 2 per-character loads from
+// // each string per iteration. Different registers are used for that to
+// // remove dependencies: (tmp1, cnt1) and (tmp2, rscratch1) pairs.
+// // First characters loads are issued in pre-loop.
+// SHORT_STRING:
+//  if (cnt2 == 0) goto DONE;                       // no characters to compare. Length difference (already calculated) should be used as result
+//  tmp1 = LOAD_STR1_CHAR(str1);
+//  str1 = str1 + str1_chr_size;                    // merged with load above as post-increment
+//  cnt2 = cnt2 - 1;                                // calculate remaining length after first character is loaded
+//  bool endReached = cnt2 == 0;                    // kept in flags
+//  if (endReached) goto SHORT_LAST_INIT;           // load 1 character from 2nd string to complete init and compare it with 1st string character
+//  cnt1 = LOAD_STR2_CHAR(str2);
+//  str2 = str2 + str2_chr_size;                    // merged with load above as post-increment
+//  goto SHORT_LOOP_START;                          // per-character loop entry point
+// SHORT_LOOP:                                      // per-character loop
+//  cnt2 = cnt2 - 1;                                // calculate remaining length
+//  endReached = cnt2 == 0;
+//  if (endReached) goto SHORT_LAST_INIT;
+// SHORT_LOOP_START:                                // per-character loop entry point
+//  tmp2 = LOAD_STR1_CHAR(str1);
+//  rscratch1 = LOAD_STR2_CHAR(str2);
+//  bool differentResult = tmp1 != cnt1;            // check difference of previously loaded pair of registers while last pair is still loading. Kept in flags
+//  if (differentResult) goto SHORT_LOOP_TAIL;      // calculate character difference and return
+//  cnt2 = cnt2 - 1;                                // calculate remaining length
+//  endReached = cnt2 == 0;
+//  if (endReached) goto SHORT_LAST2;               // last comparison of second pair of registers (tmp2, rscratch1) is left
+//  tmp1 = LOAD_STR1_CHAR(str1);
+//  cnt1 = LOAD_STR2_CHAR(str2);
+//  bool sameResult = tmp2 == rscratch1;            // check difference of previously loaded pair of registers while last pair is still loading. Kept in flags
+//  if (sameResult) goto SHORT_LOOP;
+//  result = tmp2 - rscratch1;
+//  goto DONE;
+// SHORT_LAST2:                                     // last comparison is left: (tmp2, rscratch1)
+//  sameResult = tmp2 == rscratch1;
+//  if (sameResult) goto DONE;
+//  result = tmp2 - rscratch1;
+//  goto DONE;
+// SHORT_LAST_INIT:
+//  cnt1 = LOAD_STR2_CHAR(str2);
+// SHORT_LAST:                                      // last comparison of first pair of registers (tmp1, cnt1) is left
+//  sameResult = tmp1 == cnt1;
+//  if (sameResult) goto DONE;
+//  result = tmp1 - cnt1;
+// DONE:
+//  return;                                         // result
+
 void MacroAssembler::string_compare(Register str1, Register str2,
     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {