--- old/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	2019-02-21 16:08:13.046971067 +0300
+++ new/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	2019-02-21 16:08:12.830972662 +0300
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -4861,7 +4861,331 @@
   BIND(DONE);
 }
 
-// Compare strings.
+// Summary: Compare strings intrinsic implementation. All combinations of UTF-16
+//          and Latin1 encodings for both strings are considered. Comparison
+//          is performed in lexical order.
+//
+// Input:   str1: pointer to 1st string
+//          str2: pointer to 2nd string
+//          cnt1: number of bytes in 1st string
+//          cnt2: number of bytes in 2nd string
+//
+// Algorithm parameter:
+//          ae: encodings used in 1st and 2nd strings
+//
+// Temporary registers:
+//          tmp1, tmp2, rscratch1, rscratch2: always used
+//          vtmp1, vtmp2, vtmp3: used in case encodings are different
+//
+// Output:  result - return 0 if strings are equal. Returns positive value
+//          if 1st string > 2nd string in lexical order. Returns
+//          negative value if 1st string < 2nd string.
+//
+// Side effects: str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1, rscratch2: clobbered.
+//               vtmp1, vtmp2, vtmp3: clobbered if encodings are different
+//
+// Additional data: boolean values: isLL, isLU, isUL, str1_isL, str2_isL and
+// int minCharInWords are derived from ae parameter based on encodings used
+// in strings. Different code is generated depending on these values:
+//
+// isLL = both strings are Latin1
+// isLU = 1st string is Latin1, 2nd string is UTF-16
+// isUL = 1st string is UTF-16, 2nd string is Latin1
+// str1_isL = 1st string is Latin1
+// str2_isL = 2nd string is Latin1
+// str1_chr_shift = shift value to convert a character counter to a byte counter for 1st string
+// str2_chr_shift = shift value to convert a character counter to a byte counter for 2nd string
+// minCharInWords = minimum number of characters that fit in register (8 for LL case, 4 otherwise)
+//
+//
+// PSEUDO CODE:
+//
+// // N.B.: this pseudo-code doesn't strictly follow implementation details.
+// // It is here to help understand the basics. Detailed implementation
+// // description is listed after this code.
+//
+// <convert byte counters cnt1, cnt2 into character counters if UTF-16 encoding is used>;
+// result = cnt1 - cnt2; // length difference. Used if all min(cnt1, cnt2) characters are the same
+// cnt2 = min(cnt1, cnt2); // amount of characters to check
+// if (cnt2 <= minCharInWords) { // <= wordSize bytes should be loaded for comparison
+//   if (cnt2 == 0) return result;
+//   while (cnt2 != 0) {
+//     char str1char = str1[0];
+//     char str2char = str2[0];
+//     str1 += 1 << str1_chr_shift; // advance pointer by size of str1 character
+//     str2 += 1 << str2_chr_shift; // advance pointer by size of str2 character
+//     if (str1char != str2char) return str1char - str2char;
+//     cnt2--;
+//   }
+// } else { // > wordSize bytes should be loaded for comparison
+//     // This code checks string in 8-byte blocks. If encodings are
+//     // different, Latin1 string will be loaded via 4-byte blocks and then
+//     // each block will be converted to 8-byte UTF-16 equivalent. Then 8 byte
+//     // blocks are compared. Each load is 8 characters for LL case and 4
+//     // characters for LU/UL/UU.
+//     // This set of instructions (load 8 Latin1 character OR load 4 Latin1
+//     // characters and convert it to 4 UTF-16 character OR load 4 UTF-16
+//     // character) is referred as <load-and-probably-convert ...> below.
+//
+//     // First iteration in the loop is unrolled to add initialization.
+//
+//     // The code below calculates addresses of each string last load: addresses
+//     // of last 8 characters for LL case and last 4 characters otherwise.
+//     // Then offsets from the addresses to the beginning of the strings are
+//     // calculated. Offset is then used as loop counter. When offset is >= 0, then
+//     // only last loads (possibly overlapping) are left to be checked.
+//     // N.B.: in case of same encodings, offsets are the same for both strings.
+//     // Then offset for 2nd string is used for both strings.
+//
+//     tmp1 = <load-and-probably-convert str1>;
+//     if (str1 == str2) return result;
+//     tmp2 = <load-and-probably-convert str2>;
+//
+//     // use special implementation optimized for large strings. See detailed code and stub comments.
+//     if (cnt2 >= 72) return compare_long_string_implementation(<args>);
+//
+//     cnt2 -= <amount of loaded characters>; // 8 for isLL case. 4 otherwise.
+//
+//     if (str1_isL == str2_isL) {
+//       // Optional optimization for same encoding cases. Can be applied for all
+//       // cases, but is faster in same encoding cases only. Without this branch
+//       // smallest string (8 character for LL and 4 characters for others) would
+//       // be checked twice.
+//       if (cnt2 == 0) goto TAIL_CHECK; // no more characters to be loaded. Just check already loaded data.
+//     }
+//
+//     // calculate addresses of last loads. use str1 and str2 pointers for that
+//     str1 = str1 + cnt2 << str1_chr_shift;
+//     str2 = str2 + cnt2 << str2_chr_shift;
+//
+//     // calculate offsets for both strings. cnt1 and cnt2 can be reused
+//     if (str1_isL != str2_isL) cnt1 = - (cnt2 << str1_chr_shift);
+//     cnt2 = - (cnt2 << str2_chr_shift);
+//
+//     // increment calculated offsets by the number of already loaded bytes
+//     if (isLU) cnt1 += 4;
+//     if (isUL) cnt1 += 8;
+//     cnt2 += isUL ? 4 : 8;
+//
+//     if (cnt2 >= 0) goto TAIL; // only last loads remains. Still need to check currently loaded data.
+//
+//     rscratch2 = tmp1 BIT_XOR tmp2;
+//     if (rscratch2 != 0) goto DIFFERENCE;
+//
+//     // main loop. Label = NEXT_WORD
+//     do {
+//       tmp1 = <load-and-probably-convert str1 at offset of (str1_isL == str2_isL ? cnt2 : cnt1)>;
+//       tmp2 = <load-and-probably-convert str2 at offset of cnt2>;
+//
+//       // update offsets by the number of loaded bytes
+//       cnt2 += isUL ? 4 : 8;
+//       if (isLU) cnt1 += 4;
+//       if (isUL) cnt1 += 8;
+//
+//       if (cnt2 >= 0) goto TAIL; // last block left to be loaded. Still need to check currently loaded block.
+//       rscratch2 = tmp1 BIT_XOR tmp2;
+//     } while (rscratch2 == 0);
+//     goto DIFFERENCE;
+//
+//   TAIL: // last block left to be loaded. Still need to check currently loaded block.
+//     rscratch2 = tmp1 BIT_XOR tmp2;
+//     if (rscratch2 != 0) goto DIFFERENCE;
+//     tmp1 = <load-and-probably-convert str1>;
+//     tmp2 = <load-and-probably-convert str2>;
+//     // fallthrough to TAIL_CHECK
+//   TAIL_CHECK:
+//     rscratch2 = tmp1 BIT_XOR tmp2;
+//     if (rscratch2 == 0) return result;
+//   DIFFERENCE: // different character found. Find it and compute difference
+//     // tmp1 and tmp2 have current data with at least 1 different character.
+//     // Find index of first such character.
+//     rscratch2 = REVERSE_BITS(rscratch2);
+//     rscratch2 = COUNT_LEADING_ZEROES(rscratch2); // position of different bit in current 8 bytes
+//     rscratch2 = rscratch2 & (isLL ? -8 : -16); // number of bits until (possibly converted) different characters in tmp1 and tmp2
+//     tmp1 = tmp1 >> rscratch2; // now first character in tmp1 is the one sought for
+//     tmp1 = tmp1 & (isLL ? 0xFF : 0xFFFF); // only first different character left
+//     tmp2 = tmp2 >> rscratch2; // now first character in tmp2 is the one sought for
+//     tmp2 = tmp2 & (isLL ? 0xFF : 0xFFFF); // only first different character left
+//     result = tmp1 - tmp2;
+// }
+// return result;
+//
+//
+//
+// DETAILED CODE:
+//
+//  if (!str1_isL) cnt1 = cnt1 >> 1;                // counter for 1st string (in characters)
+//  if (!str2_isL) cnt2 = cnt2 >> 1;                // counter for 2nd string (in characters)
+//  result = cnt1 - cnt2;                           // keep in flags the result of operation
+//  cnt2 = min(cnt1, cnt2);                         // implemented as csel instruction using stored flag value above
+//  bool shortStringsCase = cnt2 <= minCharInWords; // kept in flag
+//  if (shortStringsCase) goto SHORT_STRING;        // separate code for short strings
+//  if (str1_isL == str2_isL) {                     // same encoding case
+//    tmp1 = LOAD8BYTES(str1);
+//    bool sameString = str1 == str2;               // kept in flags
+//    if (sameString) goto DONE;                    // the string is the same, return
+//    tmp2 = LOAD8BYTES(str2);
+//    bool largeStrings = cnt2 >= 72;               // kept in flags
+//    if (largeStrings) goto STUB;                  // handled in separate stub implementation for large strings
+//    cnt2 = cnt2 - minCharInWords;                 // decrement counter by the number of loaded characters
+//    bool noMoreLoadsAvailable = cnt2 == 0;        // kept in flags
+//    if (noMoreLoadsAvailable) goto TAIL_CHECK;
+//    str2 = str2 + cnt2 << str2_chr_shift;         // address of str2 last load
+//    str1 = str1 + cnt2 << str1_chr_shift;         // address of str1 last load
+//    cnt2 = -(cnt2 << str2_chr_shift);             // byte offset to 1st character in each string
+//  } else if (isLU) {
+//    vtmp = LOAD4BYTES(str1);
+//    bool sameString = str1 == str2;               // kept in flags
+//    if (sameString) goto DONE;                    // return
+//    tmp2 = LOAD8BYTES(str2);
+//    bool largeStrings = cnt2 >= 72;               // kept in flags
+//    if (largeStrings) goto STUB;                  // handled in separate stub implementation for large strings
+//    cnt2 = cnt2 - 4;                              // decrement counter by the number  of loaded characters
+//    vtmpz = 0;                                    // implemented as eor
+//    str1 = str1 + cnt2 << str1_chr_shift;         // address of str1 last load
+//    str2 = str2 + cnt2 << str2_chr_shift;         // address of str2 last load
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);  // convert Latin1 to UTF16 because it'll be compared with UTF16. Implemented as zip instruction
+//    cnt1 = -(cnt2 << str1_chr_shift);             // byte offset to 1st character in 1st string
+//    cnt2 = -(cnt2 << str2_chr_shift);             // byte offset to 1st character in 2nd string
+//    cnt1 = cnt1 + 4;                              // advance 1st string offset by the number of loaded bytes
+//    tmp1 = vtmp;                                  // move converted characters from FPU register to GPR
+//  } else { // UL
+//    tmp1 = LOAD8BYTES(str1);
+//    bool sameString = str1 == str2;               // kept in flags
+//    if (sameString) goto DONE;                    // return
+//    vtmp = LOAD4BYTES(str2);
+//    bool largeStrings = cnt2 >= 72;               // kept in flags
+//    if (largeStrings) goto STUB;                  // separate stub implementation for large strings
+//    cnt2 = cnt2 - 4;                              // update counter by the number of loaded characters
+//    str1 = str1 + cnt2 << str1_chr_shift;         // address of str1 last load
+//    vtmpz = 0;                                    // implemented as eor
+//    str2 = str2 + cnt2 << str2_chr_shift;         // address of str2 last load
+//    cnt1 = -(cnt2 << str1_chr_shift);             // byte offset to 1st character in 1st string
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);  // convert Latin1 to UTF16 because it'll be compared with UTF16. implemented as zip instruction
+//    cnt2 = -(cnt2 << str2_chr_shift);             // byte offset to 1st character in 2nd string
+//    cnt1 = cnt1 + 8;                              // advance 1st string offset by the number of loaded bytes
+//    tmp2 = vtmp;                                  // move converted characters from FPU register to GPR
+//  }
+//  cnt2 = cnt2 + (isUL ? 4 : 8);                   // update offset by the number of loaded bytes
+//  bool onlyLastLoadRemains = cnt2 >= 0;           // kept in flags
+//  if (onlyLastLoadRemains) goto TAIL;
+//  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
+//  if (rscratch2 != 0) goto DIFFERENCE;            // found different characters in current block
+// NEXT_WORD:                                       // main loop
+//  // implementation for each encoding loads 4 or 8 characters at calculated
+//  // offsets from each string and convert encodings if necessary. Then offsets
+//  // are updated.
+//  if (str1_isL == str2_isL) {
+//    tmp1 = LOAD8BYTES(str1, cnt2);
+//    tmp2 = LOAD8BYTES(str2, cnt2);
+//    cnt2 = cnt2 + 8;                              // update counter by the number of loaded bytes
+//    onlyLastLoadRemains = cnt2 >= 0;              // kept in flags
+//  } else if (isLU) {
+//    vtmp = LOAD4BYTES(str1, cnt1);
+//    tmp2 = LOAD8BYTES(str2, cnt2);
+//    cnt1 = cnt1 + 4;
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
+//    tmp1 = vtmp;
+//    cnt2 = cnt2 + 8;
+//    onlyLastLoadRemains = cnt2 >= 0;              // kept in flags
+//  } else { // UL
+//    vtmp = LOAD4BYTES(str2, cnt2);
+//    tmp1 = LOAD8BYTES(str1, cnt1);
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
+//    cnt1 = cnt1 + 8;
+//    tmp2 = vtmp;
+//    cnt2 = cnt2 + 4;
+//    onlyLastLoadRemains = cnt2 >= 0;              // kept in flags
+//  }
+//  if (onlyLastLoadRemains) goto TAIL;
+//  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
+//  if (rscratch2 == 0) goto NEXT_WORD;
+//  goto DIFFERENCE;
+// TAIL: // check already loaded data and last load
+//  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
+//  if (rscratch2 != 0) goto DIFFERENCE;
+//
+//  // last load (and convert if needed) from each string
+//  if (str1_isL == str2_isL) {
+//    tmp1 = LOAD8BYTES(str1);
+//    tmp2 = LOAD8BYTES(str2);
+//  } else if (isLU) {
+//    vtmp = LOAD4BYTES(str1);
+//    tmp2 = LOAD8BYTES(str2);
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
+//    tmp1 = vtmp;
+//  } else { // UL
+//    vtmp = LOAD4BYTES(str2);
+//    tmp1 = LOAD8BYTES(str1);
+//    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
+//    tmp2 = vtmp;
+//  }
+// TAIL_CHECK:                                      // last check
+//  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
+//  if (rscratch2 == 0) goto DONE;                  // return
+// DIFFERENCE:
+//  rscratch2 = REVERSE_BITS(rscratch2);            // It's not possible to count trailing zeroes. Reverse bits and then count leading zeroes instead.
+//  rscratch2 = COUNT_LEADING_ZEROES(rscratch2);    // position of different bit in current 8 bytes
+//  rscratch2 = rscratch2 & (isLL ? -8 : -16);      // number of bits until (possibly converted) different characters in tmp1 and tmp2
+//  tmp1 = tmp1 >> rscratch2;                       // first character in tmp1 is the one sought for
+//  tmp1 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp1) : UNSIGNED_EXTEND_SHORT2INT(tmp1); // only first different character left
+//  tmp2 = tmp2 >> rscratch2;                       // first character in tmp2 is the one sought for
+//  tmp2 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp2) : UNSIGNED_EXTEND_SHORT2INT(tmp2); // only first different character left
+//  result = tmp1 - tmp2;
+//  goto DONE;
+// }
+//
+// STUB:
+//  <get address of compare_long_string_[LL|UU|LU|UL] stub routine and call it>
+//  goto DONE;
+//
+// // Short strings comparison code. Instead of simple per-character loop with
+// // load-and-compare code it uses a loop that issues 2 per-character loads from
+// // each string per iteration. Different registers are used for that to
+// // remove dependencies: (tmp1, cnt1) and (tmp2, rscratch1) pairs.
+// // First characters loads are issued in pre-loop.
+// SHORT_STRING:
+//  if (cnt2 == 0) goto DONE;                       // no characters to compare. Length difference (already calculated) should be used as result
+//  tmp1 = LOAD_STR1_CHAR(str1);
+//  str1 = str1 + str1_chr_size;                    // merged with load above as post-increment
+//  cnt2 = cnt2 - 1;                                // calculate remaining length after first character is loaded
+//  bool endReached = cnt2 == 0;                    // kept in flags
+//  if (endReached) goto SHORT_LAST_INIT;           // load 1 character from 2nd string to complete init and compare it with 1st string character
+//  cnt1 = LOAD_STR2_CHAR(str2);
+//  str2 = str2 + str2_chr_size;                    // merged with load above as post-increment
+//  goto SHORT_LOOP_START;                          // per-character loop entry point
+// SHORT_LOOP:                                      // per-character loop
+//  cnt2 = cnt2 - 1;                                // calculate remaining length
+//  endReached = cnt2 == 0;
+//  if (endReached) goto SHORT_LAST_INIT;
+// SHORT_LOOP_START:                                // per-character loop entry point
+//  tmp2 = LOAD_STR1_CHAR(str1);
+//  rscratch1 = LOAD_STR2_CHAR(str2);
+//  bool differentResult = tmp1 != cnt1;            // check difference of previously loaded pair of registers while last pair is still loading. Kept in flags
+//  if (differentResult) goto SHORT_LOOP_TAIL;      // calculate character difference and return
+//  cnt2 = cnt2 - 1;                                // calculate remaining length
+//  endReached = cnt2 == 0;
+//  if (endReached) goto SHORT_LAST2;               // last comparison of second pair of registers (tmp2, rscratch1) is left
+//  tmp1 = LOAD_STR1_CHAR(str1);
+//  cnt1 = LOAD_STR2_CHAR(str2);
+//  bool sameResult = tmp2 == rscratch1;            // check difference of previously loaded pair of registers while last pair is still loading. Kept in flags
+//  if (sameResult) goto SHORT_LOOP;
+//  result = tmp2 - rscratch1;
+//  goto DONE;
+// SHORT_LAST2:                                     // last comparison is left: (tmp2, rscratch1)
+//  sameResult = tmp2 == rscratch1;
+//  if (sameResult) goto DONE;
+//  result = tmp2 - rscratch1;
+//  goto DONE;
+// SHORT_LAST_INIT:
+//  cnt1 = LOAD_STR2_CHAR(str2);
+// SHORT_LAST:                                      // last comparison of first pair of registers (tmp1, cnt1) is left
+//  sameResult = tmp1 == cnt1;
+//  if (sameResult) goto DONE;
+//  result = tmp1 - cnt1;
+// DONE:
+//  return;                                         // result
+
 void MacroAssembler::string_compare(Register str1, Register str2,
     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
--- old/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	2019-02-21 16:08:13.518967582 +0300
+++ new/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	2019-02-21 16:08:13.278969355 +0300
@@ -4026,9 +4026,26 @@
     return entry;
   }
 
-  // code for comparing 16 bytes of strings with same encoding
+  // Summary: part of string compareTo implementation. Called for code generation in multiple points.
+  //          1) load 8 bytes and advance pointers of both strings and compare
+  //             previously loaded 8 bytes. jump to DIFF1 if different characters found
+  //          2) load 8 bytes and advance pointers of both strings and compare
+  //             previously loaded 8 bytes. jump to DIFF2 if different characters found
+  //
+  // Input:
+  //          str1  (r1): pointer for next load from 1st string
+  //          cnt1  (r2): register to use for loading data from 2nd string
+  //          str2  (r3): pointer for next load from 2nd string
+  //          tmp1 (r10): already loaded 8 bytes of 1st string.
+  //          tmp2 (r11): already loaded 8 bytes of 2nd string.
+  //
+  // Output:
+  //          rscratch2: result of last comparison
+  //          tmp1, tmp2: contains different parts of 1st and 2nd strings if exit via DIFF1 label. Not used for normal and DIFF2 exits
+  //          rscratch1, cnt1: contains different parts of 1st and 2nd strings if exit via DIFF2 label. Not used for normal and DIFF1 exits
+  //
   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
-    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
+    Register str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
     __ ldr(rscratch1, Address(__ post(str1, 8)));
     __ eor(rscratch2, tmp1, tmp2);
     __ ldr(cnt1, Address(__ post(str2, 8)));
@@ -4039,14 +4056,49 @@
     __ cbnz(rscratch2, DIFF2);
   }
 
-  // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
+  // Summary: part of string compare implementation. Called for code generation in multiple points.
+  //          - expecting 4 UTF-16 string characters preloaded into tmp3
+  //          - load 16 characters from each string
+  //          - convert Latin1 characters to UTF-16
+  //          - compare preloaded 4 characters with 4 first converted Latin1 characters
+  //          - compare next 12 loaded and converted characters from each string
+  //          - compared data is in tmpU and tmpL registers or in tmp3 and tmpL
+  //          - in case different characters are found while comparing tmpU and
+  //            tmpL, jumps to DIFF1. Jumps to DIFF2 in case different character
+  //            was found while comparing tmp3 and tmpL
+  //          - string pointers are increased by amount of loaded bytes
+  //
+  // Input:
+  //          strUnext  (r2): pointer for next load from UTF-16 string
+  //          strLnext (r11): pointer for next load from Latin1 string
+  //          tmp3     (r12): used to store parts of UTF-16 string
+  //          vtmpZ     (v0): zeroed register for conversion from Latin1 to UTF-16
+  //
+  // Temporary registers:
+  //          vtmp      (v1): used to load 16 Latin1 characters and part of converted Latin1 string
+  //          vtmp3     (v2): used for part of converted Latin1 string
+  //
+  // Output: rscratch2: result of last comparison
+  //         tmpL: last compared part of converted Latin1 string
+  //         tmpU: in case of exit via DIFF1 or normal exit: contains last compared part of UTF-16 string.
+  //               Contains part of UTF-16 string compared before last comparison otherwise.
+  //         tmp3: in case of exit via DIFF2: contains last compared part of UTF-16 string.
+  //               in case of exit via DIFF1: contains part of UTF-16 string compared before last comparison.
+  //               in case of normal exit: contains preloaded 8 bytes of UTF-16 string for next comparisons
+  //
+  // Parameters:
+  //          tmpL: holds parts of converted Latin1 string
+  //          tmpU: holds parts of UTF-16 string
+  //          DIFF1: label to jump to if different characters are found in tmpU and tmpL
+  //          DIFF2: label to jump to if different characters are found in tmp3 and tmpL
+  //
   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
       Label &DIFF2) {
-    Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
+    Register strUnext = r2, tmp1 = r10, strLnext = r11, tmp3 = r12;
     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 
-    __ ldrq(vtmp, Address(__ post(tmp2, 16)));
-    __ ldr(tmpU, Address(__ post(cnt1, 8)));
+    __ ldrq(vtmp, Address(__ post(strLnext, 16)));
+    __ ldr(tmpU, Address(__ post(strUnext, 8)));
     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 
@@ -4054,30 +4106,224 @@
     __ eor(rscratch2, tmp3, tmpL);
     __ cbnz(rscratch2, DIFF2);
 
-    __ ldr(tmp3, Address(__ post(cnt1, 8)));
+    __ ldr(tmp3, Address(__ post(strUnext, 8)));
     __ umov(tmpL, vtmp3, __ D, 1);
     __ eor(rscratch2, tmpU, tmpL);
     __ cbnz(rscratch2, DIFF1);
 
     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
-    __ ldr(tmpU, Address(__ post(cnt1, 8)));
+    __ ldr(tmpU, Address(__ post(strUnext, 8)));
     __ fmovd(tmpL, vtmp);
     __ eor(rscratch2, tmp3, tmpL);
     __ cbnz(rscratch2, DIFF2);
 
-    __ ldr(tmp3, Address(__ post(cnt1, 8)));
+    __ ldr(tmp3, Address(__ post(strUnext, 8)));
     __ umov(tmpL, vtmp, __ D, 1);
     __ eor(rscratch2, tmpU, tmpL);
     __ cbnz(rscratch2, DIFF1);
   }
 
-  // r0  = result
-  // r1  = str1
-  // r2  = cnt1
-  // r3  = str2
-  // r4  = cnt2
-  // r10 = tmp1
-  // r11 = tmp2
+  // Summary: Compare long strings intrinsic implementation for different encodings.
+  //          Comparison is performed in lexical order.
+  //
+  // Prerequisites: string length >= 72 characters
+  //
+  // Input: result (r0): length difference
+  //          str1 (r1): pointer to 1st string
+  //          str2 (r3): pointer to 2nd string
+  //          cnt1 (r2): number of characters in 1st string
+  //          cnt2 (r4): minimum of str1 and str2 length. Used as counter
+  //          tmp1 (r10): starting 8 bytes of 1st string for UTF-16 string
+  //          tmp2 (r11): starting 8 bytes of 2nd string for UTF-16 string
+  //          vtmpZ (v0): used to convert encodings by providing zero values
+  //          vtmp  (v1): starting bytes of Latin1 string. Also used as temporary register
+  //          vtmp3 (v2): temporary register
+  //
+  // Temporary registers:
+  //          rscratch1, rscratch2: clobbered on exit
+  //          preloadedChunk (r12), smallLoopCounter (r14): pushed on stack, then restored on exit
+  //
+  //
+  // Output:  result - return 0 if strings are equal. Returns positive value
+  //          if 1st string > 2nd string in lexical order. Return
+  //          negative value if 1st string < 2nd string.
+  //
+  // Side effects: str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1, rscratch2: clobbered.
+  //
+  // Algorithm parameters:
+  //          isLU: true if 1st string is Latin1.
+  //
+  // Calculated constants:
+  //          largeLoopExitCondition: Exit condition for loop with prefetch.
+  //
+  //
+  // PSEUDO CODE:
+  //   // Code below uses <compare_string_16_x_LU> code block which:
+  //   // - loads 16 Latin1 characters at once. Then converts it to UTF-16 and move to GPR
+  //   // - issues 4 smaller loads of 4 UTF-16 characters and for each load compare it with converted Latin1 characters
+  //   // - smaller loads are using 2 different registers to break register dependencies
+  //   // - jump to DIFF1 or DIFF2 label depending on which register has a character different from converted Latin1 character
+  //
+  //   <push preloadedChunk and smallLoopCounter on stack>;
+  //   <convert already loaded Latin1 characters to UTF-16 and compare it>;
+  //   <advance string pointers by the number of loaded bytes>;
+  //   <calculate strUnext and strLnext == pointers to load next chunks from UTF-16 and Latin1 strings>;
+  //   <preload first 4 UTF characters>;
+  //   cnt2 = cnt2 - 4;                              // keep characters counter reduced by 4, because last 4 characters are compared separately
+  //   if (SoftwarePrefetchHintDistance >= 0) {      // need prefetch
+  //     if (cnt2 < largeLoopExitCondition) goto NO_PREFETCH; // don't use loop with prefetch in case prefetch distance is too far away
+  //     do {                                        // 64-characters loop with prefetch.
+  //       // Each iteration has 2 prefetch instructions for UTF-16 string and 1 for Latin1 string
+  //       // contains 2-iterations loops (16 characters each) between prefetch instructions
+  //       // to avoid huge code generation
+  //       <prefetch strLnext at SoftwarePrefetchHintDistance>;
+  //       <prefetch strUnext at SoftwarePrefetchHintDistance>;
+  //       for (smallLoopCounter = 0; smallLoopCounter < 2; smallLoopCounter++) {
+  //         <compare_string_16_x_LU>;
+  //       }
+  //       <prefetch strUnext at SoftwarePrefetchHintDistance>;
+  //       for (smallLoopCounter = 0; smallLoopCounter < 2; smallLoopCounter++) {
+  //         <compare_string_16_x_LU>;
+  //       }
+  //       cnt2 = cnt2 - 64;                          // update counter by the number of loaded characters
+  //     } while(cnt2 >= largeLoopExitCondition);
+  //   }
+  //   if (cnt2 == 0) goto LOAD_LAST;                 // load and compare last 4 characters
+  //  NO_PREFETCH:
+  //   if (<less than 16 characters left to load>) goto TAIL;
+  //
+  //   // smaller by-16-characters loop
+  //   do {
+  //     cnt2 = cnt2 - 16;
+  //     <compare_string_16_x_LU>;
+  //   } while(<has at least 16 characters to load>);
+  //
+  //   if (cnt2 == 0) goto LOAD_LAST;
+  // TAIL:
+  //   <adjust pointers to point to 16 characters before last load>;
+  //   <preload first 4 UTF characters>
+  //   <load and compare 16 characters before last load>;
+  //   goto LOAD_LAST;
+  // DIFF1:
+  //   <move utf string data to same comparison register as for DIFF2>;
+  //   // fallthrough
+  // DIFF2:
+  //   <pop preloadedChunk and smallLoopCounter from stack>;
+  //   goto CALCULATE_DIFFERENCE;
+  // LOAD_LAST:
+  //   <pop preloadedChunk and smallLoopCounter from stack>;
+  //   <load and compare last 4 characters>;
+  //   if (<difference not found>) return;           // result = already calculated length difference
+  // CALCULATE_DIFFERENCE:
+  //   result = <calculate character difference>;
+  // DONE:
+  //   return;
+  //
+  //
+  //
+  //
+  //
+  // DETAILED CODE:
+  //  vtmpZ = 0;                                     // used to convert encodings
+  //  vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpZ);   // implemented as zip1 instruction
+  //
+  //  // update string pointers by the number of loaded bytes
+  //  str1 = str1 + (isLU ? wordSize/2 : wordSize);
+  //  str2 = str2 + (isLU ? wordSize : wordSize/2);
+  //
+  //  // copy converted string into GPR
+  //  if (isLU) tmp1 = vtmp;
+  //  else tmp2 = vtmp;
+  //
+  //  cnt2 = cnt2 - 8;                                // reduce cnt2 by the number of already loaded characters. And reduce by 4 more characters
+  //  str1 = str1 + cnt2 << (isLU ? 0 : 1);           // address of 1st string last 4 characters
+  //  rscratch2 = BIT_XOR(tmp1, tmp2);                // begin loaded chunks comparison
+  //  str2 = str2 + cnt2 << (isLU ? 1 : 0);           // address of 2nd string last 4 characters
+  //  rscratch1 = tmp2;                               // copy 2nd string chunk
+  //  if (rscratch2 != 0) goto CALCULATE_DIFFERENCE;  // found different character
+  //
+  //  // several redefinitions below to have meaningful names
+  //  void* strU = isLU ? str2 : str1;                // UTF-16 string pointer to last 4 characters
+  //  void* strL = isLU ? str1 : str2;                // Latin1 string pointer to last 4 characters
+  //  long tmpU = isLU ? rscratch1 : tmp1;            // UTF-16 characters holder
+  //  long tmpL = isLU ? tmp1 : rscratch1;            // Latin1 characters holder
+  //  void* strLnext = tmp2;                          // Latin1 string pointer to load next character(s)
+  //  void* strUnext = cnt1;                          // UTF-16 string pointer to load next character(s)
+  //
+  //  PUSH_ON_STACK(preloadedChunk, smallLoopCounter);
+  //  strLnext = strL - cnt2;                         // initialize pointer to Latin1 string next load
+  //  strUnext = strU - cnt2 << 1;                    // initialize pointer to UTF-16 string next load
+  //
+  //  preloadedChunk = LOAD8BYTES(strUnext, 8);       // pre-load next 8 bytes of UTF-16 string
+  //  strUnext = strUnext + 8;                        // merged with load above as post-increment
+  //
+  //  if (SoftwarePrefetchHintDistance >= 0) {
+  //    rscratch2 = cnt2 - largeLoopExitCondition;
+  //    if (rscratch2 < 0) goto NO_PREFETCH;
+  //   LARGE_LOOP_PREFETCH:                           // 64-characters loop
+  //    PREFETCH(strLnext, SoftwarePrefetchHintDistance);
+  //    smallLoopCounter = 2;                         // initialize inner loop counter
+  //    PREFETCH(strUnext, SoftwarePrefetchHintDistance);
+  //   LARGE_LOOP_PREFETCH_REPEAT1: {                 // 16 characters inner loop with 2 iterations
+  //      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // see compare_string_16_x_LU comments
+  //      smallLoopCounter--;
+  //      bool smallLoopRepeat = (smallLoopCounter > 0); // kept in flags
+  //      if (smallLoopRepeat) goto LARGE_LOOP_PREFETCH_REPEAT1;
+  //   }
+  //    PREFETCH(strUnext, SoftwarePrefetchHintDistance);
+  //    smallLoopCounter = 2;                         // initialize inner loop counter
+  //   LARGE_LOOP_PREFETCH_REPEAT2: {                 // one more 16 characters inner loop with 2 iterations
+  //      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // see compare_string_16_x_LU comments
+  //      smallLoopCounter--;
+  //      bool smallLoopRepeat = (smallLoopCounter > 0); // kept in flags
+  //      if (smallLoopRepeat) goto LARGE_LOOP_PREFETCH_REPEAT2;
+  //   }
+  //    cnt2 = cnt2 - 64;
+  //    rscratch2 = cnt2 - prefetchLoopExitCondition;
+  //    if (rscratch2 >= 0) goto LARGE_LOOP_PREFETCH;
+  //  } // end of 64-characters loop
+  //
+  //  if (cnt2 == 0) goto LOAD_LAST;                  // no more characters left except last 4 characters reserved earlier
+  // NO_PREFETCH:                                     // all further loads do not require a prefetch instruction
+  //  cnt2 = cnt2 - 16;                               // keep cnt2 counter reduced by 16
+  //  if (cnt2 < 0) goto TAIL;                        // less than 16 characters left to load until last 4 reserved characters
+  // SMALL_LOOP:                                      // 16-characters loop
+  //  compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
+  //  cnt2 = cnt2 - 16;                               // decrement counter by previously loaded 16 characters
+  //  bool repeatSmallLoop = cnt2 >= 0;               // kept in flags
+  //  if (repeatSmallLoop) goto SMALL_LOOP;
+  //  if (cnt2 == -16) goto LOAD_LAST;
+  // TAIL:
+  //  strUnext = strUnext + (cnt2 << 1);              // pointer to UTF-16 last 16 characters + 8 bytes
+  //  strLnext = strLnext + cnt2;                     // pointer to Latin1 last 16 characters
+  //  preloadedChunk = LOAD8BYTES(strUnext, -8);
+  //  compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
+  //  goto LOAD_LAST;
+  // DIFF2:
+  //  tmpU = preloadedChunk;
+  // DIFF1:
+  //  pop(preloadedChunk, smallLoopCounter);
+  //  goto CALCULATE_DIFFERENCE;
+  // LOAD_LAST:
+  //  tmpU = preloadedChunk;                         // already loaded last 4 UTF-16 characters. Just copy to required register
+  //  pop(preloadedChunk, smallLoopCounter);
+  //  vtmp = LOAD4BYTES(strL);
+  //  vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpZ);
+  //  tmpL = vtmp;
+  //  rscratch2 = BIT_XOR(tmpU, tmpL);
+  //  if (rscratch2 == 0) goto DONE;
+  // CALCULATE_DIFFERENCE:
+  //  // No count trailing zeroes instruction is available. Reverse bits and count leading zeroes instead.
+  //  rscratch2 = REVERSE_BITS(rscratch2);
+  //  rscratch2 = COUNT_LEADING_ZEROES(rscratch2);
+  //  rscratch2 = rscratch2 & -16;                   // clear lowest 4 bits to have number of bits until different character
+  //  tmp1 = tmp1 >> rscratch2;                      // shift off same symbols from 1st string data
+  //  tmp1 = UNSIGNED_EXTEND_SHORT2INT(tmp1);        // only first different symbol remains in 1st string data
+  //  rscratch1 = rscratch1 >> rscratch2;            // shift off same symbols from 2nd string data
+  //  rscratch1 = UNSIGNED_EXTEND_SHORT2INT(rscratch1); // only first different symbol remains in 2nd string data
+  //  result = tmp1 - rscratch1;                     // character difference
+  // DONE:
+  //  return;
   address generate_compare_long_string_different_encoding(bool isLU) {
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", isLU
@@ -4088,9 +4334,9 @@
         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
-        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
+        tmp1 = r10, tmp2 = r11, preloadedChunk = r12, smallLoopCounter = r14;
     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
-    RegSet spilled_regs = RegSet::of(tmp3, tmp4);
+    RegSet spilled_regs = RegSet::of(preloadedChunk, smallLoopCounter);
 
     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
 
@@ -4110,29 +4356,32 @@
     Register strU = isLU ? str2 : str1,
              strL = isLU ? str1 : str2,
              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
-             tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
+             tmpL = isLU ? tmp1 : rscratch1, // where to keep L for comparison
+             strLnext = tmp2,
+             strUnext = cnt1;
     __ push(spilled_regs, sp);
-    __ sub(tmp2, strL, cnt2); // strL pointer to load from
-    __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
+    __ sub(strLnext, strL, cnt2);            // strL pointer to load from
+    __ sub(strUnext, strU, cnt2, __ LSL, 1); // strU pointer to load from
 
-    __ ldr(tmp3, Address(__ post(cnt1, 8)));
+    // safe to read ahead 4 characters, because string length >= 72 characters
+    __ ldr(preloadedChunk, Address(__ post(strUnext, 8)));
 
     if (SoftwarePrefetchHintDistance >= 0) {
       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
       __ br(__ LT, NO_PREFETCH);
       __ bind(LARGE_LOOP_PREFETCH);
-        __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
-        __ mov(tmp4, 2);
-        __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
+        __ prfm(Address(strLnext, SoftwarePrefetchHintDistance));
+        __ mov(smallLoopCounter, 2);
+        __ prfm(Address(strUnext, SoftwarePrefetchHintDistance));
         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
-          __ subs(tmp4, tmp4, 1);
+          __ subs(smallLoopCounter, smallLoopCounter, 1);
           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
-          __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
-          __ mov(tmp4, 2);
+          __ prfm(Address(strUnext, SoftwarePrefetchHintDistance));
+          __ mov(smallLoopCounter, 2);
         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
-          __ subs(tmp4, tmp4, 1);
+          __ subs(smallLoopCounter, smallLoopCounter, 1);
           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
           __ sub(cnt2, cnt2, 64);
           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
@@ -4140,6 +4389,9 @@
     }
     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
     __ bind(NO_PREFETCH);
+    // Load and compare cnt2 characters using 16 characters loop with
+    // compare_string_16_x_LU primitive. In case 1..15 characters left:
+    // use same compare_string_16_x_LU primitive with partial overlapping
     __ subs(cnt2, cnt2, 16);
     __ br(__ LT, TAIL);
     __ bind(SMALL_LOOP); // smaller loop
@@ -4149,20 +4401,20 @@
       __ cmn(cnt2, (u1)16);
       __ br(__ EQ, LOAD_LAST);
     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
-      __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
-      __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
-      __ ldr(tmp3, Address(cnt1, -8));
+      __ add(strUnext, strUnext, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
+      __ add(strLnext, strLnext, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
+      __ ldr(preloadedChunk, Address(strUnext, -8));
       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
       __ b(LOAD_LAST);
     __ bind(DIFF2);
-      __ mov(tmpU, tmp3);
+      __ mov(tmpU, preloadedChunk);
     __ bind(DIFF1);
       __ pop(spilled_regs, sp);
       __ b(CALCULATE_DIFFERENCE);
     __ bind(LOAD_LAST);
-      // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
+      // Last 4 UTF-16 characters are already pre-loaded into preloadedChunk by compare_string_16_x_LU.
       // No need to load it again
-      __ mov(tmpU, tmp3);
+      __ mov(tmpU, preloadedChunk);
       __ pop(spilled_regs, sp);
 
       __ ldrs(vtmp, Address(strL));
@@ -4188,14 +4440,171 @@
     return entry;
   }
 
-  // r0  = result
-  // r1  = str1
-  // r2  = cnt1
-  // r3  = str2
-  // r4  = cnt2
-  // r10 = tmp1
-  // r11 = tmp2
+  // Summary: Compare long strings intrinsic implementation for same encodings.
+  //          Comparison is performed in lexical order.
+  //
+  // Prerequisites: string length >= 72 characters
+  //
+  // Input: result (r0): length difference
+  //          str1 (r1): pointer to 1st string
+  //          str2 (r3): pointer to 2nd string
+  //          cnt1 (r2): amount of characters in 1st string
+  //          cnt2 (r4): minimum of str1 and str2 length. Used as counter
+  //          tmp1 (r10): starting 8 bytes of 1st string
+  //          tmp2 (r11): starting 8 bytes of 2nd string
+  //
+  // Temporary registers:
+  //          rscratch1, rscratch2
+  //
+  //
+  // Output:  result - return 0 if strings are equal. Returns positive value
+  //          if 1st string > 2nd string in lexical order. Returns
+  //          negative value if 1st string < 2nd string.
+  //
+  // Side effects: str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1, rscratch2: clobbered.
+  //
+  // Algorithm parameters:
+  //          isLL: true if both strings are Latin1, false if both are UTF-16.
+  //                Used to generate code for both Latin1 - Latin1 (LL) case and
+  //                UTF-16 - UTF-16 (UU) case.
+  // Calculated constants:
+  //          largeLoopExitCondition: MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
+  //                Exit condition for loop with prefetch.
+  //          characters_in_word: isLL ? 8 : 4
+  //                Number of characters that fit in a word (1 register)
+  //          characters_in_dword: 2 * characters_in_word
+  //                Number of characters that fit in a double word (2 registers)
+  //          byte_to_char_shift: isLL ? 0 : 1
+  //                Shift value to convert between byte and character counters
+  //
+  // PSEUDO CODE:
+  //   // Code below uses <compare_string_16_bytes_same> code block, which:
+  //   // - compares already loaded tmp1 and tmp2. goto DIFF label if it's not equal
+  //   // - loads and compares next 8 bytes of both strings (stored in rscratch1 and cnt1). goto DIFF2 label if it's not equal
+  //   // - loads next 8 bytes of both strings into tmp1 and tmp2
+  //   // - on each load string pointers are updated to point at character after loaded block
+  //
+  //   <update cnt2 counter and str1, str2 pointers by already loaded characters>;
+  //   if (SoftwarePrefetchHintDistance >= 0) {     // need prefetch
+  //     do { // 64-byte loop with prefetch
+  //       <prefetch str1 at SoftwarePrefetchHintDistance>;
+  //       <prefetch str2 at SoftwarePrefetchHintDistance>;
+  //       <compare_string_16_bytes_same>;
+  //       <compare_string_16_bytes_same>;
+  //       cnt2 = cnt2 - (8 * characters_in_word);            // update counter by amount
+  //       <compare_string_16_bytes_same>;
+  //       <compare_string_16_bytes_same>;
+  //     } while(cnt2 >= largeLoopExitCondition);
+  //   }
+  //   if (cnt2 == 0) goto LAST_CHECK;
+  //   while(cnt2 > characters_in_dword) {
+  //     <compare_string_16_bytes_same>;
+  //     cnt2 = cnt2 - characters_in_dword;
+  //   }
+  //   <adjust pointer to last 16 bytes for each string>;
+  //   <preload 8 bytes before current pointers into tmp1 and tmp2>;
+  //   <compare_string_16_bytes_same>;
+  //   goto LAST_CHECK;
+  // DIFF2:
+  //   <copy string chunks to tmp1 and tmp2 for further analysis>;
+  //   // fallthrough to DIFF
+  // DIFF:
+  //   <get different characters from tmp1 and tmp2. Then calculate characters difference>
+  //   goto DONE;
+  // LAST_CHECK: // label to jump to when last chunk of data has to be checked.
+  //   // Return character difference if different characters are found.
+  //   // Return length difference (already calculated) otherwise.
+  //   <check last loaded chunks and return result>
+  // DONE:
+  //   return;
+  //
+  //
+  // DETAILED CODE:
+  //  // N.B.: compare_string_16_bytes_same below is not an actual call
+  //  // at runtime. It is invoked at code generation time.
+  //
+  //  cnt2 = cnt2 - characters_in_word;
+  //  str1 = str1 + wordSize;
+  //  str2 = str2 + wordSize;
+  //  if (SoftwarePrefetchHintDistance >= 0) {
+  //    LARGE_LOOP_PREFETCH:
+  //      LOAD_PREFETCH(str1, SoftwarePrefetchHintDistance);
+  //      LOAD_PREFETCH(str2, SoftwarePrefetchHintDistance);
+  //      compare_string_16_bytes_same(DIFF, DIFF2);
+  //      compare_string_16_bytes_same(DIFF, DIFF2);
+  //      cnt2 = cnt2 - 8 * characters_in_word;
+  //      compare_string_16_bytes_same(DIFF, DIFF2);
+  //      rscratch2 = cnt2 - largeLoopExitCondition; // rscratch2 is not used. Use subs instead of cmp in case of potentially large constants
+  //      bool canLoop = rscratch2 > 0;              // kept in flags
+  //      compare_string_16_bytes_same(DIFF, DIFF2);
+  //      if (canLoop) goto LARGE_LOOP_PREFETCH;
+  //      if (cnt2 == 0) goto LAST_CHECK;            // no more loads left
+  //  }
+  //
+  //  cnt2 = cnt2 - characters_in_dword;             // keep cnt2 counter reduced by 16 (LL) or 8 (UU)
+  //  bool lessThan16bytesLeft = cnt2 < 0;           // kept in flags
+  //  if (lessThan16bytesLeft) goto TAIL;
+  //  SMALL_LOOP:                                // 16 byte loop
+  //    compare_string_16_bytes_same(DIFF, DIFF2);
+  //    cnt2 = cnt2 - characters_in_dword;
+  //    bool canLoop = cnt2 >= 0;                // kept in flags
+  //    if (canLoop) goto SMALL_LOOP;
+  //    bool lastCheckLeft = cnt2 == -characters_in_dword;
+  //    if (lastCheckLeft) goto LAST_CHECK;
+  //  TAIL: // less than 16 bytes left to load. And 8 bytes were loaded but not
+  //    // compared. Reuse <compare_string_16_bytes_same> primitive. Handle last
+  //    // 24 string bytes by preloading first 8 of these 24 bytes, then use
+  //    // <compare_string_16_bytes_same>. And then compare last 8 bytes loaded
+  //    // by <compare_string_16_bytes_same>
+  //    // This will partially overlap with previous load and comparison, but
+  //    // makes code more simple
+  //    str1 = str1 + (cnt2 << byte_to_char_shift);
+  //    str2 = str2 + (cnt2 << byte_to_char_shift);
+  //    tmp1 = LOAD8BYTES(str1, -8);
+  //    tmp2 = LOAD8BYTES(str2, -8);
+  //    compare_string_16_bytes_same(DIFF, DIFF2);
+  //    goto LAST_CHECK;
+  //  DIFF2: // calculate character difference, when data stored in rscratch1 and cnt1
+  //    // move loaded chunks to tmp1 and tmp2 registers to use in DIFF block
+  //    tmp1 = rscratch1;
+  //    tmp2 = cnt1;
+  //    // fallthrough to DIFF
+  //  DIFF: // calculate character difference, when data stored in tmp1 and tmp2
+  //    // and find different characters. rscratch2 contains zeroes at positions with
+  //    // same characters. Find index of first different bit (== amount of
+  //    // trailing zeroes), which is: <different character index> * <bits in character>
+  //    // + <some bits within characters>. Then, clearing bits within character
+  //    // (3 lowest bits for Latin1 case and 4 lowest bits for UTF-16 case)
+  //    // will result in the number of bits until different character in current chunks.
+  //
+  //    // As it's not possible to count trailing zeroes, reverse bits and count leading zeroes
+  //    rscratch2 = REVERSE_BITS(rscratch2);
+  //    rscratch2 = COUNT_LEADING_ZEROES(rscratch2);
+  //    rscratch2 = rscratch2 & (isLL ? -8 : -16);   // clear lowest 3 (Latin1) or 4 (UTF-16) bits
+  //    tmp1 = tmp1 >> rscratch2;                    // shift off same characters from 1st string chunk
+  //    tmp2 = tmp2 >> rscratch2;                    // shift off same characters from 2nd string chunk
+  //
+  //    // Only first character should be left for comparison. Use unsigned extend instruction for that
+  //    if (isLL) {
+  //      tmp1 = UNSIGNED_EXTEND_BYTE2INT(tmp1);
+  //      tmp2 = UNSIGNED_EXTEND_BYTE2INT(tmp2);
+  //    } else {
+  //      tmp1 = UNSIGNED_EXTEND_SHORT2INT(tmp1);
+  //      tmp2 = UNSIGNED_EXTEND_SHORT2INT(tmp2);
+  //    }
+  //
+  //    result = tmp1 - tmp2;
+  //    goto DONE;
+  //  LAST_CHECK:
+  //    rscratch2 = BIT_XOR(tmp1, tmp2);
+  //    if (rscratch2 != 0) goto DIFF;
+  //  DONE:
+  //    return result;
+
   address generate_compare_long_string_same_encoding(bool isLL) {
+    const int characters_in_word = isLL ? 8 : 4;
+    const int characters_in_dword = 2 * characters_in_word;
+    const int byte_to_char_shift = isLL ? 0 : 1;
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", isLL
         ? "compare_long_string_same_encoding LL"
@@ -4203,15 +4612,13 @@
     address entry = __ pc();
     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
         tmp1 = r10, tmp2 = r11;
-    Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
-        LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
-        DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
+    Label SMALL_LOOP, LARGE_LOOP_PREFETCH, DIFF2, TAIL, DONE, DIFF, LAST_CHECK;
     // exit from large loop when less than 64 bytes left to read or we're about
     // to prefetch memory behind array border
     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
     // update cnt2 counter with already loaded 8 bytes
-    __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
+    __ sub(cnt2, cnt2, characters_in_word);
     // update pointers, because of previous read
     __ add(str1, str1, wordSize);
     __ add(str2, str2, wordSize);
@@ -4221,58 +4628,32 @@
         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
         compare_string_16_bytes_same(DIFF, DIFF2);
         compare_string_16_bytes_same(DIFF, DIFF2);
-        __ sub(cnt2, cnt2, isLL ? 64 : 32);
+        __ sub(cnt2, cnt2, 8 * characters_in_word);
         compare_string_16_bytes_same(DIFF, DIFF2);
         __ subs(rscratch2, cnt2, largeLoopExitCondition);
         compare_string_16_bytes_same(DIFF, DIFF2);
         __ br(__ GT, LARGE_LOOP_PREFETCH);
-        __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
+        __ cbz(cnt2, LAST_CHECK);                     // Check if no more chars left
     }
-    // less than 16 bytes left?
-    __ subs(cnt2, cnt2, isLL ? 16 : 8);
+    __ subs(cnt2, cnt2, characters_in_dword);         // keep number of characters reduced by 16 (LL) or 8 (UU)
     __ br(__ LT, TAIL);
     __ bind(SMALL_LOOP);
       compare_string_16_bytes_same(DIFF, DIFF2);
-      __ subs(cnt2, cnt2, isLL ? 16 : 8);
+      __ subs(cnt2, cnt2, characters_in_dword);
       __ br(__ GE, SMALL_LOOP);
+      __ cmn(cnt2, (u1)(characters_in_dword));
+      __ br(__ EQ, LAST_CHECK);
     __ bind(TAIL);
-      __ adds(cnt2, cnt2, isLL ? 16 : 8);
-      __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
-      __ subs(cnt2, cnt2, isLL ? 8 : 4);
-      __ br(__ LE, CHECK_LAST);
-      __ eor(rscratch2, tmp1, tmp2);
-      __ cbnz(rscratch2, DIFF);
-      __ ldr(tmp1, Address(__ post(str1, 8)));
-      __ ldr(tmp2, Address(__ post(str2, 8)));
-      __ sub(cnt2, cnt2, isLL ? 8 : 4);
-    __ bind(CHECK_LAST);
-      if (!isLL) {
-        __ add(cnt2, cnt2, cnt2); // now in bytes
-      }
-      __ eor(rscratch2, tmp1, tmp2);
-      __ cbnz(rscratch2, DIFF);
-      __ ldr(rscratch1, Address(str1, cnt2));
-      __ ldr(cnt1, Address(str2, cnt2));
-      __ eor(rscratch2, rscratch1, cnt1);
-      __ cbz(rscratch2, LENGTH_DIFF);
-      // Find the first different characters in the longwords and
-      // compute their difference.
+      __ add(str1, str1, cnt2, __ LSL, byte_to_char_shift); // points to last 16 bytes to compare
+      __ add(str2, str2, cnt2, __ LSL, byte_to_char_shift); // points to last 16 bytes to compare
+      __ ldr(tmp1, Address(str1, -8));                // preload 8 bytes before current pointer
+      __ ldr(tmp2, Address(str2, -8));                // preload 8 bytes before current pointer
+      compare_string_16_bytes_same(DIFF, DIFF2);
+      __ b(LAST_CHECK);
     __ bind(DIFF2);
-      __ rev(rscratch2, rscratch2);
-      __ clz(rscratch2, rscratch2);
-      __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
-      __ lsrv(rscratch1, rscratch1, rscratch2);
-      if (isLL) {
-        __ lsrv(cnt1, cnt1, rscratch2);
-        __ uxtbw(rscratch1, rscratch1);
-        __ uxtbw(cnt1, cnt1);
-      } else {
-        __ lsrv(cnt1, cnt1, rscratch2);
-        __ uxthw(rscratch1, rscratch1);
-        __ uxthw(cnt1, cnt1);
-      }
-      __ subw(result, rscratch1, cnt1);
-      __ b(LENGTH_DIFF);
+      __ mov(tmp1, rscratch1);
+      __ mov(tmp2, cnt1);
+      // fallthrough to DIFF
     __ bind(DIFF);
       __ rev(rscratch2, rscratch2);
       __ clz(rscratch2, rscratch2);
@@ -4288,11 +4669,11 @@
         __ uxthw(tmp2, tmp2);
       }
       __ subw(result, tmp1, tmp2);
-      __ b(LENGTH_DIFF);
-    __ bind(LAST_CHECK_AND_LENGTH_DIFF);
+      __ b(DONE);
+    __ bind(LAST_CHECK);
       __ eor(rscratch2, tmp1, tmp2);
       __ cbnz(rscratch2, DIFF);
-    __ bind(LENGTH_DIFF);
+    __ bind(DONE);
       __ ret(lr);
     return entry;
   }