< prev index next >

src/cpu/x86/vm/macroAssembler_x86.cpp

Print this page

        

@@ -44,10 +44,13 @@
 #include "gc/g1/g1CollectedHeap.inline.hpp"
 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
 #include "gc/g1/heapRegion.hpp"
 #endif // INCLUDE_ALL_GCS
 #include "crc32c.h"
+#ifdef COMPILER2
+#include "opto/intrinsicnode.hpp"
+#endif
 
 #ifdef PRODUCT
 #define BLOCK_COMMENT(str) /* nothing */
 #define STOP(error) stop(error)
 #else

@@ -6297,51 +6300,68 @@
     NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
     rep_stos();
   }
 }
 
+#ifdef COMPILER2
+
 // IndexOf for constant substrings with size >= 8 chars
 // which don't need to be loaded through stack.
 void MacroAssembler::string_indexofC8(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2,  Register result,
-                                      XMMRegister vec, Register tmp) {
+                                      XMMRegister vec, Register tmp,
+                                      int ae) {
   ShortBranchVerifier sbv(this);
   assert(UseSSE42Intrinsics, "SSE4.2 is required");
+  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 
-  // This method uses pcmpestri instruction with bound registers
+  // This method uses the pcmpestri instruction with bound registers
   //   inputs:
   //     xmm - substring
   //     rax - substring length (elements count)
   //     mem - scanned string
   //     rdx - string length (elements count)
   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
+  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
   //   outputs:
   //     rcx - matched index in string
   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
+  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
+  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
+  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
+  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
 
   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
 
   // Note, inline_string_indexOf() generates checks:
   // if (substr.count > string.count) return -1;
   // if (substr.count == 0) return 0;
-  assert(int_cnt2 >= 8, "this code isused only for cnt2 >= 8 chars");
+  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
 
   // Load substring.
+  if (ae == StrIntrinsicNode::UL) {
+    pmovzxbw(vec, Address(str2, 0));
+  } else {
   movdqu(vec, Address(str2, 0));
+  }
   movl(cnt2, int_cnt2);
   movptr(result, str1); // string addr
 
-  if (int_cnt2 > 8) {
+  if (int_cnt2 > stride) {
     jmpb(SCAN_TO_SUBSTR);
 
     // Reload substr for rescan, this code
     // is executed only for large substrings (> 8 chars)
     bind(RELOAD_SUBSTR);
+    if (ae == StrIntrinsicNode::UL) {
+      pmovzxbw(vec, Address(str2, 0));
+    } else {
     movdqu(vec, Address(str2, 0));
+    }
     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
 
     bind(RELOAD_STR);
     // We came here after the beginning of the substring was
     // matched but the rest of it was not so we need to search

@@ -6356,131 +6376,150 @@
 
     decrementl(cnt1);     // Shift to next element
     cmpl(cnt1, cnt2);
     jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
 
-    addptr(result, 2);
+    addptr(result, (1<<scale1));
 
   } // (int_cnt2 > 8)
 
   // Scan string for start of substr in 16-byte vectors
   bind(SCAN_TO_SUBSTR);
-  pcmpestri(vec, Address(result, 0), 0x0d);
+  pcmpestri(vec, Address(result, 0), mode);
   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
-  subl(cnt1, 8);
+  subl(cnt1, stride);
   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
   cmpl(cnt1, cnt2);
   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
   addptr(result, 16);
   jmpb(SCAN_TO_SUBSTR);
 
   // Found a potential substr
   bind(FOUND_CANDIDATE);
   // Matched whole vector if first element matched (tmp(rcx) == 0).
-  if (int_cnt2 == 8) {
+  if (int_cnt2 == stride) {
     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
   } else { // int_cnt2 > 8
     jccb(Assembler::overflow, FOUND_SUBSTR);
   }
   // After pcmpestri tmp(rcx) contains matched element index
   // Compute start addr of substr
-  lea(result, Address(result, tmp, Address::times_2));
+  lea(result, Address(result, tmp, scale1));
 
   // Make sure string is still long enough
   subl(cnt1, tmp);
   cmpl(cnt1, cnt2);
-  if (int_cnt2 == 8) {
+  if (int_cnt2 == stride) {
     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
   } else { // int_cnt2 > 8
     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
   }
   // Left less then substring.
 
   bind(RET_NOT_FOUND);
   movl(result, -1);
   jmpb(EXIT);
 
-  if (int_cnt2 > 8) {
+  if (int_cnt2 > stride) {
     // This code is optimized for the case when whole substring
     // is matched if its head is matched.
     bind(MATCH_SUBSTR_HEAD);
-    pcmpestri(vec, Address(result, 0), 0x0d);
+    pcmpestri(vec, Address(result, 0), mode);
     // Reload only string if does not match
     jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0
 
     Label CONT_SCAN_SUBSTR;
     // Compare the rest of substring (> 8 chars).
     bind(FOUND_SUBSTR);
     // First 8 chars are already matched.
     negptr(cnt2);
-    addptr(cnt2, 8);
+    addptr(cnt2, stride);
 
     bind(SCAN_SUBSTR);
-    subl(cnt1, 8);
-    cmpl(cnt2, -8); // Do not read beyond substring
+    subl(cnt1, stride);
+    cmpl(cnt2, -stride); // Do not read beyond substring
     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
     // Back-up strings to avoid reading beyond substring:
     // cnt1 = cnt1 - cnt2 + 8
     addl(cnt1, cnt2); // cnt2 is negative
-    addl(cnt1, 8);
-    movl(cnt2, 8); negptr(cnt2);
+    addl(cnt1, stride);
+    movl(cnt2, stride); negptr(cnt2);
     bind(CONT_SCAN_SUBSTR);
     if (int_cnt2 < (int)G) {
-      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
-      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
+      int tail_off1 = int_cnt2<<scale1;
+      int tail_off2 = int_cnt2<<scale2;
+      if (ae == StrIntrinsicNode::UL) {
+        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
+      } else {
+        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
+      }
+      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
     } else {
       // calculate index in register to avoid integer overflow (int_cnt2*2)
       movl(tmp, int_cnt2);
       addptr(tmp, cnt2);
-      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
-      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
+      if (ae == StrIntrinsicNode::UL) {
+        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
+      } else {
+        movdqu(vec, Address(str2, tmp, scale2, 0));
+      }
+      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
     }
     // Need to reload strings pointers if not matched whole vector
     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
-    addptr(cnt2, 8);
+    addptr(cnt2, stride);
     jcc(Assembler::negative, SCAN_SUBSTR);
     // Fall through if found full substring
 
   } // (int_cnt2 > 8)
 
   bind(RET_FOUND);
   // Found result if we matched full small substring.
   // Compute substr offset
   subptr(result, str1);
+  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
   shrl(result, 1); // index
+  }
   bind(EXIT);
 
 } // string_indexofC8
 
 // Small strings are loaded through stack if they cross page boundary.
 void MacroAssembler::string_indexof(Register str1, Register str2,
                                     Register cnt1, Register cnt2,
                                     int int_cnt2,  Register result,
-                                    XMMRegister vec, Register tmp) {
+                                    XMMRegister vec, Register tmp,
+                                    int ae) {
   ShortBranchVerifier sbv(this);
   assert(UseSSE42Intrinsics, "SSE4.2 is required");
+  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
+
   //
   // int_cnt2 is length of small (< 8 chars) constant substring
   // or (-1) for non constant substring in which case its length
   // is in cnt2 register.
   //
   // Note, inline_string_indexOf() generates checks:
   // if (substr.count > string.count) return -1;
   // if (substr.count == 0) return 0;
   //
-  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");
-
-  // This method uses pcmpestri instruction with bound registers
+  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
+  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
+  // This method uses the pcmpestri instruction with bound registers
   //   inputs:
   //     xmm - substring
   //     rax - substring length (elements count)
   //     mem - scanned string
   //     rdx - string length (elements count)
   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
+  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
   //   outputs:
   //     rcx - matched index in string
   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
+  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
+  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
+  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
 
   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
         FOUND_CANDIDATE;
 

@@ -6490,27 +6529,44 @@
     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
 
     movptr(tmp, rsp); // save old SP
 
     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
-      if (int_cnt2 == 1) {  // One char
+      if (int_cnt2 == (1>>scale2)) { // One byte
+        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
+        load_unsigned_byte(result, Address(str2, 0));
+        movdl(vec, result); // move 32 bits
+      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
+        // Not enough header space in 32-bit VM: 12+3 = 15.
+        movl(result, Address(str2, -1));
+        shrl(result, 8);
+        movdl(vec, result); // move 32 bits
+      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
         load_unsigned_short(result, Address(str2, 0));
         movdl(vec, result); // move 32 bits
-      } else if (int_cnt2 == 2) { // Two chars
+      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
         movdl(vec, Address(str2, 0)); // move 32 bits
-      } else if (int_cnt2 == 4) { // Four chars
+      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
         movq(vec, Address(str2, 0));  // move 64 bits
-      } else { // cnt2 = { 3, 5, 6, 7 }
+      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
         // Array header size is 12 bytes in 32-bit VM
         // + 6 bytes for 3 chars == 18 bytes,
         // enough space to load vec and shift.
         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
-        movdqu(vec, Address(str2, (int_cnt2*2)-16));
-        psrldq(vec, 16-(int_cnt2*2));
+        if (ae == StrIntrinsicNode::UL) {
+          int tail_off = int_cnt2-8;
+          pmovzxbw(vec, Address(str2, tail_off));
+          psrldq(vec, -2*tail_off);
+        }
+        else {
+          int tail_off = int_cnt2*(1<<scale2);
+          movdqu(vec, Address(str2, tail_off-16));
+          psrldq(vec, 16-tail_off);
+        }
       }
     } else { // not constant substring
-      cmpl(cnt2, 8);
+      cmpl(cnt2, stride);
       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
 
       // We can read beyond string if srt+16 does not cross page boundary
       // since heaps are aligned and mapped by pages.
       assert(os::vm_page_size() < (int)G, "default page should be small");

@@ -6519,44 +6575,54 @@
       cmpl(result, (os::vm_page_size()-16));
       jccb(Assembler::belowEqual, CHECK_STR);
 
       // Move small strings to stack to allow load 16 bytes into vec.
       subptr(rsp, 16);
-      int stk_offset = wordSize-2;
+      int stk_offset = wordSize-(1<<scale2);
       push(cnt2);
 
       bind(COPY_SUBSTR);
-      load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
-      movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
+      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
+        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
+        movb(Address(rsp, cnt2, scale2, stk_offset), result);
+      } else if (ae == StrIntrinsicNode::UU) {
+        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
+        movw(Address(rsp, cnt2, scale2, stk_offset), result);
+      }
       decrement(cnt2);
       jccb(Assembler::notZero, COPY_SUBSTR);
 
       pop(cnt2);
       movptr(str2, rsp);  // New substring address
     } // non constant
 
     bind(CHECK_STR);
-    cmpl(cnt1, 8);
+    cmpl(cnt1, stride);
     jccb(Assembler::aboveEqual, BIG_STRINGS);
 
     // Check cross page boundary.
     movl(result, str1); // We need only low 32 bits
     andl(result, (os::vm_page_size()-1));
     cmpl(result, (os::vm_page_size()-16));
     jccb(Assembler::belowEqual, BIG_STRINGS);
 
     subptr(rsp, 16);
-    int stk_offset = -2;
+    int stk_offset = -(1<<scale1);
     if (int_cnt2 < 0) { // not constant
       push(cnt2);
       stk_offset += wordSize;
     }
     movl(cnt2, cnt1);
 
     bind(COPY_STR);
-    load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
-    movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
+    if (ae == StrIntrinsicNode::LL) {
+      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
+      movb(Address(rsp, cnt2, scale1, stk_offset), result);
+    } else {
+      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
+      movw(Address(rsp, cnt2, scale1, stk_offset), result);
+    }
     decrement(cnt2);
     jccb(Assembler::notZero, COPY_STR);
 
     if (int_cnt2 < 0) { // not constant
       pop(cnt2);

@@ -6564,11 +6630,15 @@
     movptr(str1, rsp);  // New string address
 
     bind(BIG_STRINGS);
     // Load substring.
     if (int_cnt2 < 0) { // -1
+      if (ae == StrIntrinsicNode::UL) {
+        pmovzxbw(vec, Address(str2, 0));
+      } else {
       movdqu(vec, Address(str2, 0));
+      }
       push(cnt2);       // substr count
       push(str2);       // substr addr
       push(str1);       // string addr
     } else {
       // Small (< 8 chars) constant substrings are loaded already.

@@ -6595,41 +6665,47 @@
     // Reload substr for rescan, this code
     // is executed only for large substrings (> 8 chars)
     bind(RELOAD_SUBSTR);
     movptr(str2, Address(rsp, 2*wordSize));
     movl(cnt2, Address(rsp, 3*wordSize));
+    if (ae == StrIntrinsicNode::UL) {
+      pmovzxbw(vec, Address(str2, 0));
+    } else {
     movdqu(vec, Address(str2, 0));
+    }
     // We came here after the beginning of the substring was
     // matched but the rest of it was not so we need to search
     // again. Start from the next element after the previous match.
     subptr(str1, result); // Restore counter
+    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
     shrl(str1, 1);
+    }
     addl(cnt1, str1);
     decrementl(cnt1);   // Shift to next element
     cmpl(cnt1, cnt2);
     jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
 
-    addptr(result, 2);
+    addptr(result, (1<<scale1));
   } // non constant
 
   // Scan string for start of substr in 16-byte vectors
   bind(SCAN_TO_SUBSTR);
   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
-  pcmpestri(vec, Address(result, 0), 0x0d);
+  pcmpestri(vec, Address(result, 0), mode);
   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
-  subl(cnt1, 8);
+  subl(cnt1, stride);
   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
   cmpl(cnt1, cnt2);
   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
   addptr(result, 16);
 
   bind(ADJUST_STR);
-  cmpl(cnt1, 8); // Do not read beyond string
+  cmpl(cnt1, stride); // Do not read beyond string
   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
   // Back-up string to avoid reading beyond string.
-  lea(result, Address(result, cnt1, Address::times_2, -16));
-  movl(cnt1, 8);
+  lea(result, Address(result, cnt1, scale1, -16));
+  movl(cnt1, stride);
   jmpb(SCAN_TO_SUBSTR);
 
   // Found a potential substr
   bind(FOUND_CANDIDATE);
   // After pcmpestri tmp(rcx) contains matched element index

@@ -6644,30 +6720,29 @@
   movl(result, -1);
   jmpb(CLEANUP);
 
   bind(FOUND_SUBSTR);
   // Compute start addr of substr
-  lea(result, Address(result, tmp, Address::times_2));
-
+  lea(result, Address(result, tmp, scale1));
   if (int_cnt2 > 0) { // Constant substring
     // Repeat search for small substring (< 8 chars)
     // from new point without reloading substring.
     // Have to check that we don't read beyond string.
-    cmpl(tmp, 8-int_cnt2);
+    cmpl(tmp, stride-int_cnt2);
     jccb(Assembler::greater, ADJUST_STR);
     // Fall through if matched whole substring.
   } else { // non constant
     assert(int_cnt2 == -1, "should be != 0");
 
     addl(tmp, cnt2);
     // Found result if we matched whole substring.
-    cmpl(tmp, 8);
+    cmpl(tmp, stride);
     jccb(Assembler::lessEqual, RET_FOUND);
 
     // Repeat search for small substring (<= 8 chars)
     // from new point 'str1' without reloading substring.
-    cmpl(cnt2, 8);
+    cmpl(cnt2, stride);
     // Have to check that we don't read beyond string.
     jccb(Assembler::lessEqual, ADJUST_STR);
 
     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
     // Compare the rest of substring (> 8 chars).

@@ -6676,53 +6751,190 @@
     cmpl(tmp, cnt2);
     // First 8 chars are already matched.
     jccb(Assembler::equal, CHECK_NEXT);
 
     bind(SCAN_SUBSTR);
-    pcmpestri(vec, Address(str1, 0), 0x0d);
+    pcmpestri(vec, Address(str1, 0), mode);
     // Need to reload strings pointers if not matched whole vector
     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
 
     bind(CHECK_NEXT);
-    subl(cnt2, 8);
+    subl(cnt2, stride);
     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
     addptr(str1, 16);
+    if (ae == StrIntrinsicNode::UL) {
+      addptr(str2, 8);
+    } else {
     addptr(str2, 16);
-    subl(cnt1, 8);
-    cmpl(cnt2, 8); // Do not read beyond substring
+    }
+    subl(cnt1, stride);
+    cmpl(cnt2, stride); // Do not read beyond substring
     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
     // Back-up strings to avoid reading beyond substring.
-    lea(str2, Address(str2, cnt2, Address::times_2, -16));
-    lea(str1, Address(str1, cnt2, Address::times_2, -16));
+
+    if (ae == StrIntrinsicNode::UL) {
+      lea(str2, Address(str2, cnt2, scale2, -8));
+      lea(str1, Address(str1, cnt2, scale1, -16));
+    } else {
+      lea(str2, Address(str2, cnt2, scale2, -16));
+      lea(str1, Address(str1, cnt2, scale1, -16));
+    }
     subl(cnt1, cnt2);
-    movl(cnt2, 8);
-    addl(cnt1, 8);
+    movl(cnt2, stride);
+    addl(cnt1, stride);
     bind(CONT_SCAN_SUBSTR);
+    if (ae == StrIntrinsicNode::UL) {
+      pmovzxbw(vec, Address(str2, 0));
+    } else {
     movdqu(vec, Address(str2, 0));
+    }
     jmpb(SCAN_SUBSTR);
 
     bind(RET_FOUND_LONG);
     movptr(str1, Address(rsp, wordSize));
   } // non constant
 
   bind(RET_FOUND);
   // Compute substr offset
   subptr(result, str1);
+  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
   shrl(result, 1); // index
-
+  }
   bind(CLEANUP);
   pop(rsp); // restore SP
 
 } // string_indexof
 
-// Compare strings.
+void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
+                                         XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
+  ShortBranchVerifier sbv(this);
+  assert(UseSSE42Intrinsics, "SSE4.2 is required");
+
+  int stride = 8;
+
+  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
+        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
+        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
+        FOUND_SEQ_CHAR, DONE_LABEL;
+
+  movptr(result, str1);
+  if (UseAVX >= 2) {
+    cmpl(cnt1, stride);
+    jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
+    cmpl(cnt1, 2*stride);
+    jccb(Assembler::less, SCAN_TO_8_CHAR_INIT);
+    movdl(vec1, ch);
+    vpbroadcastw(vec1, vec1);
+    vpxor(vec2, vec2);
+    movl(tmp, cnt1);
+    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
+    andl(cnt1,0x0000000F);  //tail count (in chars)
+
+    bind(SCAN_TO_16_CHAR_LOOP);
+    vmovdqu(vec3, Address(result, 0));
+    vpcmpeqw(vec3, vec3, vec1, true);
+    vptest(vec2, vec3);
+    jcc(Assembler::carryClear, FOUND_CHAR);
+    addptr(result, 32);
+    subl(tmp, 2*stride);
+    jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
+    jmp(SCAN_TO_8_CHAR);
+    bind(SCAN_TO_8_CHAR_INIT);
+    movdl(vec1, ch);
+    pshuflw(vec1, vec1, 0x00);
+    pshufd(vec1, vec1, 0);
+    pxor(vec2, vec2);
+  }
+  if (UseAVX >= 2 || UseSSE42Intrinsics) {
+    bind(SCAN_TO_8_CHAR);
+    cmpl(cnt1, stride);
+    if (UseAVX >= 2) {
+      jccb(Assembler::less, SCAN_TO_CHAR);
+    }
+    if (!(UseAVX >= 2)) {
+      jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
+      movdl(vec1, ch);
+      pshuflw(vec1, vec1, 0x00);
+      pshufd(vec1, vec1, 0);
+      pxor(vec2, vec2);
+    }
+    movl(tmp, cnt1);
+    andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
+    andl(cnt1,0x00000007);  //tail count (in chars)
+
+    bind(SCAN_TO_8_CHAR_LOOP);
+    movdqu(vec3, Address(result, 0));
+    pcmpeqw(vec3, vec1);
+    ptest(vec2, vec3);
+    jcc(Assembler::carryClear, FOUND_CHAR);
+    addptr(result, 16);
+    subl(tmp, stride);
+    jccb(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
+  }
+  bind(SCAN_TO_CHAR);
+  testl(cnt1, cnt1);
+  jcc(Assembler::zero, RET_NOT_FOUND);
+
+  bind(SCAN_TO_CHAR_LOOP);
+  load_unsigned_short(tmp, Address(result, 0));
+  cmpl(ch, tmp);
+  jccb(Assembler::equal, FOUND_SEQ_CHAR);
+  addptr(result, 2);
+  subl(cnt1, 1);
+  jccb(Assembler::zero, RET_NOT_FOUND);
+  jmp(SCAN_TO_CHAR_LOOP);
+
+  bind(RET_NOT_FOUND);
+  movl(result, -1);
+  jmpb(DONE_LABEL);
+
+  if (UseAVX >= 2 || UseSSE42Intrinsics) {
+    bind(FOUND_CHAR);
+    if (UseAVX >= 2) {
+      vpmovmskb(tmp, vec3);
+    } else {
+      pmovmskb(tmp, vec3);
+    }
+    bsfl(ch, tmp);
+    addl(result, ch);
+  }
+
+  bind(FOUND_SEQ_CHAR);
+  subptr(result, str1);
+  shrl(result, 1);
+
+  bind(DONE_LABEL);
+} // string_indexof_char
+
+// helper function for string_compare
+void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
+                                        Address::ScaleFactor scale, Address::ScaleFactor scale1,
+                                        Address::ScaleFactor scale2, Register index, int ae) {
+  if (ae == StrIntrinsicNode::LL) {
+    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
+    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
+  } else if (ae == StrIntrinsicNode::UU) {
+    load_unsigned_short(elem1, Address(str1, index, scale, 0));
+    load_unsigned_short(elem2, Address(str2, index, scale, 0));
+  } else {
+    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
+    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
+  }
+}
+
+// Compare strings, used for char[] and byte[].
 void MacroAssembler::string_compare(Register str1, Register str2,
                                     Register cnt1, Register cnt2, Register result,
-                                    XMMRegister vec1) {
+                                    XMMRegister vec1, int ae) {
   ShortBranchVerifier sbv(this);
   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
+  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
+  Address::ScaleFactor scale, scale1, scale2;
 
+  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
+    shrl(cnt2, 1);
+  }
   // Compute the minimum of the string lengths and the
   // difference of the string lengths (stack).
   // Do the conditional move stuff
   movl(result, cnt1);
   subl(cnt1, cnt2);

@@ -6730,74 +6942,127 @@
   cmov32(Assembler::lessEqual, cnt2, result);
 
   // Is the minimum length zero?
   testl(cnt2, cnt2);
   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
-
-  // Compare first characters
+  if (ae == StrIntrinsicNode::LL) {
+    // Load first bytes
+    load_unsigned_byte(result, Address(str1, 0));
+    load_unsigned_byte(cnt1, Address(str2, 0));
+  } else if (ae == StrIntrinsicNode::UU) {
+    // Load first characters
   load_unsigned_short(result, Address(str1, 0));
   load_unsigned_short(cnt1, Address(str2, 0));
+  } else {
+    load_unsigned_byte(result, Address(str1, 0));
+    load_unsigned_short(cnt1, Address(str2, 0));
+  }
   subl(result, cnt1);
   jcc(Assembler::notZero,  POP_LABEL);
+
+  if (ae == StrIntrinsicNode::UU) {
+    // Divide length by 2 to get number of chars
+    shrl(cnt2, 1);
+  }
   cmpl(cnt2, 1);
   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
 
-  // Check if the strings start at the same location.
+  // Check if the strings start at the same location and setup scale and stride
+  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
   cmpptr(str1, str2);
   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
-
-  Address::ScaleFactor scale = Address::times_2;
-  int stride = 8;
+    if (ae == StrIntrinsicNode::LL) {
+      scale = Address::times_1;
+      stride = 16;
+    } else {
+      scale = Address::times_2;
+      stride = 8;
+    }
+  } else {
+    scale1 = Address::times_1;
+    scale2 = Address::times_2;
+    stride = 8;
+  }
 
   if (UseAVX >= 2 && UseSSE42Intrinsics) {
     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
     Label COMPARE_TAIL_LONG;
     int pcmpmask = 0x19;
+    if (ae == StrIntrinsicNode::LL) {
+      pcmpmask &= ~0x01;
+    }
 
     // Setup to compare 16-chars (32-bytes) vectors,
     // start from first character again because it has aligned address.
-    int stride2 = 16;
-    int adr_stride  = stride  << scale;
+    if (ae == StrIntrinsicNode::LL) {
+      stride2 = 32;
+    } else {
+      stride2 = 16;
+    }
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
+      adr_stride = stride << scale;
+    } else {
+      adr_stride1 = 8;  //stride << scale1;
+      adr_stride2 = 16; //stride << scale2;
+    }
 
     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
     // rax and rdx are used by pcmpestri as elements counters
     movl(result, cnt2);
     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
     jcc(Assembler::zero, COMPARE_TAIL_LONG);
 
     // fast path : compare first 2 8-char vectors.
     bind(COMPARE_16_CHARS);
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
     movdqu(vec1, Address(str1, 0));
+    } else {
+      pmovzxbw(vec1, Address(str1, 0));
+    }
     pcmpestri(vec1, Address(str2, 0), pcmpmask);
     jccb(Assembler::below, COMPARE_INDEX_CHAR);
 
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
     movdqu(vec1, Address(str1, adr_stride));
     pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
+    } else {
+      pmovzxbw(vec1, Address(str1, adr_stride1));
+      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
+    }
     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
     addl(cnt1, stride);
 
     // Compare the characters at index in cnt1
-    bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
-    load_unsigned_short(result, Address(str1, cnt1, scale));
-    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
+    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
+    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
     subl(result, cnt2);
     jmp(POP_LABEL);
 
     // Setup the registers to start vector comparison loop
     bind(COMPARE_WIDE_VECTORS);
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
     lea(str1, Address(str1, result, scale));
     lea(str2, Address(str2, result, scale));
+    } else {
+      lea(str1, Address(str1, result, scale1));
+      lea(str2, Address(str2, result, scale2));
+    }
     subl(result, stride2);
     subl(cnt2, stride2);
     jccb(Assembler::zero, COMPARE_WIDE_TAIL);
     negptr(result);
 
     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
     bind(COMPARE_WIDE_VECTORS_LOOP);
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
     vmovdqu(vec1, Address(str1, result, scale));
     vpxor(vec1, Address(str2, result, scale));
+    } else {
+      vpmovzxbw(vec1, Address(str1, result, scale1));
+      vpxor(vec1, Address(str2, result, scale2));
+    }
     vptest(vec1, vec1);
     jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
     addptr(result, stride2);
     subl(cnt2, stride2);
     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);

@@ -6816,27 +7081,41 @@
 
     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
     bind(VECTOR_NOT_EQUAL);
     // clean upper bits of YMM registers
     vpxor(vec1, vec1);
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
     lea(str1, Address(str1, result, scale));
     lea(str2, Address(str2, result, scale));
+    } else {
+      lea(str1, Address(str1, result, scale1));
+      lea(str2, Address(str2, result, scale2));
+    }
     jmp(COMPARE_16_CHARS);
 
     // Compare tail chars, length between 1 to 15 chars
     bind(COMPARE_TAIL_LONG);
     movl(cnt2, result);
     cmpl(cnt2, stride);
     jccb(Assembler::less, COMPARE_SMALL_STR);
 
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
     movdqu(vec1, Address(str1, 0));
+    } else {
+      pmovzxbw(vec1, Address(str1, 0));
+    }
     pcmpestri(vec1, Address(str2, 0), pcmpmask);
     jcc(Assembler::below, COMPARE_INDEX_CHAR);
     subptr(cnt2, stride);
     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
     lea(str1, Address(str1, result, scale));
     lea(str2, Address(str2, result, scale));
+    } else {
+      lea(str1, Address(str1, result, scale1));
+      lea(str2, Address(str2, result, scale2));
+    }
     negptr(cnt2);
     jmpb(WHILE_HEAD_LABEL);
 
     bind(COMPARE_SMALL_STR);
   } else if (UseSSE42Intrinsics) {

@@ -6844,14 +7123,21 @@
     int pcmpmask = 0x19;
     // Setup to compare 8-char (16-byte) vectors,
     // start from first character again because it has aligned address.
     movl(result, cnt2);
     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
+    if (ae == StrIntrinsicNode::LL) {
+      pcmpmask &= ~0x01;
+    }
     jccb(Assembler::zero, COMPARE_TAIL);
-
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
     lea(str1, Address(str1, result, scale));
     lea(str2, Address(str2, result, scale));
+    } else {
+      lea(str1, Address(str1, result, scale1));
+      lea(str2, Address(str2, result, scale2));
+    }
     negptr(result);
 
     // pcmpestri
     //   inputs:
     //     vec1- substring

@@ -6863,12 +7149,17 @@
     //   outputs:
     //     rcx - first mismatched element index
     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
 
     bind(COMPARE_WIDE_VECTORS);
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
     movdqu(vec1, Address(str1, result, scale));
     pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
+    } else {
+      pmovzxbw(vec1, Address(str1, result, scale1));
+      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
+    }
     // After pcmpestri cnt1(rcx) contains mismatched element index
 
     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
     addptr(result, stride);
     subptr(cnt2, stride);

@@ -6879,69 +7170,222 @@
     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
 
     movl(cnt2, stride);
     movl(result, stride);
     negptr(result);
+    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
     movdqu(vec1, Address(str1, result, scale));
     pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
+    } else {
+      pmovzxbw(vec1, Address(str1, result, scale1));
+      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
+    }
     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
 
     // Mismatched characters in the vectors
     bind(VECTOR_NOT_EQUAL);
     addptr(cnt1, result);
-    load_unsigned_short(result, Address(str1, cnt1, scale));
-    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
+    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
     subl(result, cnt2);
     jmpb(POP_LABEL);
 
     bind(COMPARE_TAIL); // limit is zero
     movl(cnt2, result);
     // Fallthru to tail compare
   }
   // Shift str2 and str1 to the end of the arrays, negate min
+  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
   lea(str1, Address(str1, cnt2, scale));
   lea(str2, Address(str2, cnt2, scale));
+  } else {
+    lea(str1, Address(str1, cnt2, scale1));
+    lea(str2, Address(str2, cnt2, scale2));
+  }
   decrementl(cnt2);  // first character was compared already
   negptr(cnt2);
 
   // Compare the rest of the elements
   bind(WHILE_HEAD_LABEL);
-  load_unsigned_short(result, Address(str1, cnt2, scale, 0));
-  load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
+  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
   subl(result, cnt1);
   jccb(Assembler::notZero, POP_LABEL);
   increment(cnt2);
   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
 
   // Strings are equal up to min length.  Return the length difference.
   bind(LENGTH_DIFF_LABEL);
   pop(result);
+  if (ae == StrIntrinsicNode::UU) {
+    // Divide diff by 2 to get number of chars
+    sarl(result, 1);
+  }
   jmpb(DONE_LABEL);
 
   // Discard the stored length difference
   bind(POP_LABEL);
   pop(cnt1);
 
   // That's it
   bind(DONE_LABEL);
+  if(ae == StrIntrinsicNode::UL) {
+    negl(result);
+  }
 }
 
-// Compare char[] arrays aligned to 4 bytes or substrings.
-void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
-                                        Register limit, Register result, Register chr,
+// Search for Non-ASCII character (Negative byte value) in a byte array,
+// return true if it has any and false otherwise.
+void MacroAssembler::has_negatives(Register ary1, Register len,
+                                   Register result, Register tmp1,
                                         XMMRegister vec1, XMMRegister vec2) {
+
+  // rsi: byte array
+  // rcx: len
+  // rax: result
+  ShortBranchVerifier sbv(this);
+  assert_different_registers(ary1, len, result, tmp1);
+  assert_different_registers(vec1, vec2);
+  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
+
+  // len == 0
+  testl(len, len);
+  jcc(Assembler::zero, FALSE_LABEL);
+
+  movl(result, len); // copy
+
+  if (UseAVX >= 2) {
+    // With AVX2, use 32-byte vector compare
+    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+
+    // Compare 32-byte vectors
+    andl(result, 0x0000001f);  //   tail count (in bytes)
+    andl(len, 0xffffffe0);   // vector count (in bytes)
+    jccb(Assembler::zero, COMPARE_TAIL);
+
+    lea(ary1, Address(ary1, len, Address::times_1));
+    negptr(len);
+
+    movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
+    movdl(vec2, tmp1);
+    vpbroadcastd(vec2, vec2);
+
+    bind(COMPARE_WIDE_VECTORS);
+    vmovdqu(vec1, Address(ary1, len, Address::times_1));
+    vptest(vec1, vec2);
+    jccb(Assembler::notZero, TRUE_LABEL);
+    addptr(len, 32);
+    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+
+    testl(result, result);
+    jccb(Assembler::zero, FALSE_LABEL);
+
+    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
+    vptest(vec1, vec2);
+    jccb(Assembler::notZero, TRUE_LABEL);
+    jmpb(FALSE_LABEL);
+
+    bind(COMPARE_TAIL); // len is zero
+    movl(len, result);
+    // Fallthru to tail compare
+  } else if (UseSSE42Intrinsics) {
+    // With SSE4.2, use double quad vector compare
+    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+
+    // Compare 16-byte vectors
+    andl(result, 0x0000000f);  //   tail count (in bytes)
+    andl(len, 0xfffffff0);   // vector count (in bytes)
+    jccb(Assembler::zero, COMPARE_TAIL);
+
+    lea(ary1, Address(ary1, len, Address::times_1));
+    negptr(len);
+
+    movl(tmp1, 0x80808080);
+    movdl(vec2, tmp1);
+    pshufd(vec2, vec2, 0);
+
+    bind(COMPARE_WIDE_VECTORS);
+    movdqu(vec1, Address(ary1, len, Address::times_1));
+    ptest(vec1, vec2);
+    jccb(Assembler::notZero, TRUE_LABEL);
+    addptr(len, 16);
+    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+
+    testl(result, result);
+    jccb(Assembler::zero, FALSE_LABEL);
+
+    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
+    ptest(vec1, vec2);
+    jccb(Assembler::notZero, TRUE_LABEL);
+    jmpb(FALSE_LABEL);
+
+    bind(COMPARE_TAIL); // len is zero
+    movl(len, result);
+    // Fallthru to tail compare
+  }
+
+  // Compare 4-byte vectors
+  andl(len, 0xfffffffc); // vector count (in bytes)
+  jccb(Assembler::zero, COMPARE_CHAR);
+
+  lea(ary1, Address(ary1, len, Address::times_1));
+  negptr(len);
+
+  bind(COMPARE_VECTORS);
+  movl(tmp1, Address(ary1, len, Address::times_1));
+  andl(tmp1, 0x80808080);
+  jccb(Assembler::notZero, TRUE_LABEL);
+  addptr(len, 4);
+  jcc(Assembler::notZero, COMPARE_VECTORS);
+
+  // Compare trailing char (final 2 bytes), if any
+  bind(COMPARE_CHAR);
+  testl(result, 0x2);   // tail  char
+  jccb(Assembler::zero, COMPARE_BYTE);
+  load_unsigned_short(tmp1, Address(ary1, 0));
+  andl(tmp1, 0x00008080);
+  jccb(Assembler::notZero, TRUE_LABEL);
+  subptr(result, 2);
+  lea(ary1, Address(ary1, 2));
+
+  bind(COMPARE_BYTE);
+  testl(result, 0x1);   // tail  byte
+  jccb(Assembler::zero, FALSE_LABEL);
+  load_unsigned_byte(tmp1, Address(ary1, 0));
+  andl(tmp1, 0x00000080);
+  jccb(Assembler::notEqual, TRUE_LABEL);
+  jmpb(FALSE_LABEL);
+
+  bind(TRUE_LABEL);
+  movl(result, 1);   // return true
+  jmpb(DONE);
+
+  bind(FALSE_LABEL);
+  xorl(result, result); // return false
+
+  // That's it
+  bind(DONE);
+  if (UseAVX >= 2) {
+    // clean upper bits of YMM registers
+    vpxor(vec1, vec1);
+    vpxor(vec2, vec2);
+  }
+}
+
+// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
+void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
+                                   Register limit, Register result, Register chr,
+                                   XMMRegister vec1, XMMRegister vec2, bool is_char) {
   ShortBranchVerifier sbv(this);
-  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
+  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
 
   int length_offset  = arrayOopDesc::length_offset_in_bytes();
-  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
 
+  if (is_array_equ) {
   // Check the input args
   cmpptr(ary1, ary2);
   jcc(Assembler::equal, TRUE_LABEL);
 
-  if (is_array_equ) {
     // Need additional checks for arrays_equals.
     testptr(ary1, ary1);
     jcc(Assembler::zero, FALSE_LABEL);
     testptr(ary2, ary2);
     jcc(Assembler::zero, FALSE_LABEL);

@@ -6960,19 +7404,22 @@
     // Load array address
     lea(ary1, Address(ary1, base_offset));
     lea(ary2, Address(ary2, base_offset));
   }
 
+  if (is_array_equ && is_char) {
+    // arrays_equals when used for char[].
   shll(limit, 1);      // byte count != 0
+  }
   movl(result, limit); // copy
 
   if (UseAVX >= 2) {
     // With AVX2, use 32-byte vector compare
     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
 
     // Compare 32-byte vectors
-    andl(result, 0x0000001e);  //   tail count (in bytes)
+    andl(result, 0x0000001f);  //   tail count (in bytes)
     andl(limit, 0xffffffe0);   // vector count (in bytes)
     jccb(Assembler::zero, COMPARE_TAIL);
 
     lea(ary1, Address(ary1, limit, Address::times_1));
     lea(ary2, Address(ary2, limit, Address::times_1));

@@ -7005,11 +7452,11 @@
   } else if (UseSSE42Intrinsics) {
     // With SSE4.2, use double quad vector compare
     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
 
     // Compare 16-byte vectors
-    andl(result, 0x0000000e);  //   tail count (in bytes)
+    andl(result, 0x0000000f);  //   tail count (in bytes)
     andl(limit, 0xfffffff0);   // vector count (in bytes)
     jccb(Assembler::zero, COMPARE_TAIL);
 
     lea(ary1, Address(ary1, limit, Address::times_1));
     lea(ary2, Address(ary2, limit, Address::times_1));

@@ -7057,16 +7504,30 @@
   jcc(Assembler::notZero, COMPARE_VECTORS);
 
   // Compare trailing char (final 2 bytes), if any
   bind(COMPARE_CHAR);
   testl(result, 0x2);   // tail  char
-  jccb(Assembler::zero, TRUE_LABEL);
+  jccb(Assembler::zero, COMPARE_BYTE);
   load_unsigned_short(chr, Address(ary1, 0));
   load_unsigned_short(limit, Address(ary2, 0));
   cmpl(chr, limit);
   jccb(Assembler::notEqual, FALSE_LABEL);
 
+  if (is_array_equ && is_char) {
+    bind(COMPARE_BYTE);
+  } else {
+    lea(ary1, Address(ary1, 2));
+    lea(ary2, Address(ary2, 2));
+
+    bind(COMPARE_BYTE);
+    testl(result, 0x1);   // tail  byte
+    jccb(Assembler::zero, TRUE_LABEL);
+    load_unsigned_byte(chr, Address(ary1, 0));
+    load_unsigned_byte(limit, Address(ary2, 0));
+    cmpl(chr, limit);
+    jccb(Assembler::notEqual, FALSE_LABEL);
+  }
   bind(TRUE_LABEL);
   movl(result, 1);   // return true
   jmpb(DONE);
 
   bind(FALSE_LABEL);

@@ -7079,10 +7540,12 @@
     vpxor(vec1, vec1);
     vpxor(vec2, vec2);
   }
 }
 
+#endif
+
 void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                    Register to, Register value, Register count,
                                    Register rtmp, XMMRegister xtmp) {
   ShortBranchVerifier sbv(this);
   assert_different_registers(to, value, count, rtmp);

@@ -9083,10 +9546,183 @@
 #endif // LP64
 #undef BIND
 #undef BLOCK_COMMENT
 
 
+// Compress char[] array to byte[].
+void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
+                                         XMMRegister tmp1Reg, XMMRegister tmp2Reg,
+                                         XMMRegister tmp3Reg, XMMRegister tmp4Reg,
+                                         Register tmp5, Register result) {
+  Label copy_chars_loop, return_length, return_zero, done;
+
+  // rsi: src
+  // rdi: dst
+  // rdx: len
+  // rcx: tmp5
+  // rax: result
+
+  // rsi holds start addr of source char[] to be compressed
+  // rdi holds start addr of destination byte[]
+  // rdx holds length
+
+  assert(len != result, "");
+
+  // save length for return
+  push(len);
+
+  if (UseSSE42Intrinsics) {
+    Label copy_32_loop, copy_16, copy_tail;
+
+    movl(result, len);
+    movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
+
+    // vectored compression
+    andl(len, 0xfffffff0);    // vector count (in chars)
+    andl(result, 0x0000000f);    // tail count (in chars)
+    testl(len, len);
+    jccb(Assembler::zero, copy_16);
+
+    // compress 16 chars per iter
+    movdl(tmp1Reg, tmp5);
+    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
+    pxor(tmp4Reg, tmp4Reg);
+
+    lea(src, Address(src, len, Address::times_2));
+    lea(dst, Address(dst, len, Address::times_1));
+    negptr(len);
+
+    bind(copy_32_loop);
+    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
+    por(tmp4Reg, tmp2Reg);
+    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
+    por(tmp4Reg, tmp3Reg);
+    ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
+    jcc(Assembler::notZero, return_zero);
+    packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
+    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
+    addptr(len, 16);
+    jcc(Assembler::notZero, copy_32_loop);
+
+    // compress next vector of 8 chars (if any)
+    bind(copy_16);
+    movl(len, result);
+    andl(len, 0xfffffff8);    // vector count (in chars)
+    andl(result, 0x00000007);    // tail count (in chars)
+    testl(len, len);
+    jccb(Assembler::zero, copy_tail);
+
+    movdl(tmp1Reg, tmp5);
+    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
+    pxor(tmp3Reg, tmp3Reg);
+
+    movdqu(tmp2Reg, Address(src, 0));
+    ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
+    jccb(Assembler::notZero, return_zero);
+    packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
+    movq(Address(dst, 0), tmp2Reg);
+    addptr(src, 16);
+    addptr(dst, 8);
+
+    bind(copy_tail);
+    movl(len, result);
+  }
+  // compress 1 char per iter
+  testl(len, len);
+  jccb(Assembler::zero, return_length);
+  lea(src, Address(src, len, Address::times_2));
+  lea(dst, Address(dst, len, Address::times_1));
+  negptr(len);
+
+  bind(copy_chars_loop);
+  load_unsigned_short(result, Address(src, len, Address::times_2));
+  testl(result, 0xff00);      // check if Unicode char
+  jccb(Assembler::notZero, return_zero);
+  movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
+  increment(len);
+  jcc(Assembler::notZero, copy_chars_loop);
+
+  // if compression succeeded, return length
+  bind(return_length);
+  pop(result);
+  jmpb(done);
+
+  // if compression failed, return 0
+  bind(return_zero);
+  xorl(result, result);
+  addptr(rsp, wordSize);
+
+  bind(done);
+}
+
+// Inflate byte[] array to char[].
+void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
+                                        XMMRegister tmp1, Register tmp2) {
+  Label copy_chars_loop, done;
+
+  // rsi: src
+  // rdi: dst
+  // rdx: len
+  // rcx: tmp2
+
+  // rsi holds start addr of source byte[] to be inflated
+  // rdi holds start addr of destination char[]
+  // rdx holds length
+  assert_different_registers(src, dst, len, tmp2);
+
+  if (UseSSE42Intrinsics) {
+    Label copy_8_loop, copy_bytes, copy_tail;
+
+    movl(tmp2, len);
+    andl(tmp2, 0x00000007);   // tail count (in chars)
+    andl(len, 0xfffffff8);    // vector count (in chars)
+    jccb(Assembler::zero, copy_tail);
+
+    // vectored inflation
+    lea(src, Address(src, len, Address::times_1));
+    lea(dst, Address(dst, len, Address::times_2));
+    negptr(len);
+
+    // inflate 8 chars per iter
+    bind(copy_8_loop);
+    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
+    movdqu(Address(dst, len, Address::times_2), tmp1);
+    addptr(len, 8);
+    jcc(Assembler::notZero, copy_8_loop);
+
+    bind(copy_tail);
+    movl(len, tmp2);
+
+    cmpl(len, 4);
+    jccb(Assembler::less, copy_bytes);
+
+    movdl(tmp1, Address(src, 0));  // load 4 byte chars
+    pmovzxbw(tmp1, tmp1);
+    movq(Address(dst, 0), tmp1);
+    subptr(len, 4);
+    addptr(src, 4);
+    addptr(dst, 8);
+
+    bind(copy_bytes);
+  }
+  testl(len, len);
+  jccb(Assembler::zero, done);
+  lea(src, Address(src, len, Address::times_1));
+  lea(dst, Address(dst, len, Address::times_2));
+  negptr(len);
+
+  // inflate 1 char per iter
+  bind(copy_chars_loop);
+  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
+  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
+  increment(len);
+  jcc(Assembler::notZero, copy_chars_loop);
+
+  bind(done);
+}
+
+
 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
   switch (cond) {
     // Note some conditions are synonyms for others
     case Assembler::zero:         return Assembler::notZero;
     case Assembler::notZero:      return Assembler::zero;
< prev index next >