--- old/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp 2018-04-20 22:35:26.545483345 +0300
+++ new/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp 2018-04-20 22:35:26.277487523 +0300
@@ -3988,6 +3988,304 @@
     return entry;
   }
 
+  // R0 = result
+  // R1 = str2
+  // R2 = cnt1
+  // R3 = str1
+  // R4 = cnt2
+  // This generic linear code uses a few additional ideas, which make it faster:
+  // 1) we can safely keep at least the 1st register of the pattern (since
+  //    length >= 8) in order to skip the initial load (helps on systems with
+  //    1 ld pipeline)
+  // 2) we can use a "fast" algorithm for finding the first symbol with fewer
+  //    branches (1 branch per loaded register instead of a branch per symbol);
+  //    this is where constants like 0x0101...01, 0x00010001...0001,
+  //    0x7f7f...7f, 0x7fff7fff...7fff come from
+  // 3) after loading and analyzing the 1st register of the source string, it
+  //    can be re-used to search for every occurrence of the 1st character,
+  //    saving a few loads compared to a "simpler-but-slower" implementation
+  // 4) in order to avoid lots of push/pop operations, the code below heavily
+  //    re-uses/re-initializes/compresses register values, which makes the code
+  //    larger and a bit less readable; however, most of the extra operations
+  //    are issued during loads or branches, so the penalty is minimal
+  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
+    const char* stubName = str1_isL
+        ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
+        : "indexof_linear_uu";
+    StubCodeMark mark(this, "StubRoutines", stubName);
+    __ align(CodeEntryAlignment);
+    address entry = __ pc();
+
+    int str1_chr_size = str1_isL ? 1 : 2;
+    int str2_chr_size = str2_isL ? 1 : 2;
+    int str1_chr_shift = str1_isL ? 0 : 1;
+    int str2_chr_shift = str2_isL ? 0 : 1;
+    bool isL = str1_isL && str2_isL;
+    // parameters
+    Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
+    // temporary registers
+    Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
+    RegSet spilled_regs = RegSet::range(tmp1, tmp4);
+    // redefinitions
+    Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
+
+    __ push(spilled_regs, sp);
+    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, L_SMALL_MATCH_LOOP,
+          L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
+          L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
+          L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
+          L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
+          L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
+    // Read whole register from str1. It is safe, because length >= 8 here
+    __ ldr(ch1, Address(str1));
+    // Read whole register from str2. It is safe, because length >= 8 here
+    __ ldr(ch2, Address(str2));
+    __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
+    if (str1_isL != str2_isL) {
+      __ eor(v0, __ T16B, v0, v0);
+    }
+    __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
+    __ mul(first, first, tmp1);
+    // check if we have less than 1 register to check
+    __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
+    if (str1_isL != str2_isL) {
+      __ fmovd(v1, ch1);
+    }
+    __ br(__ LE, L_SMALL);
+    __ eor(ch2, first, ch2);
+    if (str1_isL != str2_isL) {
+      __ zip1(v1, __ T16B, v1, v0);
+    }
+    __ sub(tmp2, ch2, tmp1);
+    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+    __ bics(tmp2, tmp2, ch2);
+    if (str1_isL != str2_isL) {
+      __ fmovd(ch1, v1);
+    }
+    __ br(__ NE, L_HAS_ZERO);
+    __ subs(cnt2, cnt2, wordSize/str2_chr_size);
+    __ add(result, result, wordSize/str2_chr_size);
+    __ add(str2, str2, wordSize);
+    __ br(__ LT, L_POST_LOOP);
+    __ BIND(L_LOOP);
+      __ ldr(ch2, Address(str2));
+      __ eor(ch2, first, ch2);
+      __ sub(tmp2, ch2, tmp1);
+      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+      __ bics(tmp2, tmp2, ch2);
+      __ br(__ NE, L_HAS_ZERO);
+    __ BIND(L_LOOP_PROCEED);
+      __ subs(cnt2, cnt2, wordSize/str2_chr_size);
+      __ add(str2, str2, wordSize);
+      __ add(result, result, wordSize/str2_chr_size);
+      __ br(__ GE, L_LOOP);
+    __ BIND(L_POST_LOOP);
+      __ cmp(cnt2, -wordSize/str2_chr_size); // no extra characters to check
+      __ br(__ LE, NOMATCH);
+      __ ldr(ch2, Address(str2));
+      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
+      __ eor(ch2, first, ch2);
+      __ sub(tmp2, ch2, tmp1);
+      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+      __ mov(tmp4, -1); // all bits set
+      __ b(L_SMALL_PROCEED);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_SMALL);
+      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
+      __ eor(ch2, first, ch2);
+      if (str1_isL != str2_isL) {
+        __ zip1(v1, __ T16B, v1, v0);
+      }
+      __ sub(tmp2, ch2, tmp1);
+      __ mov(tmp4, -1); // all bits set
+      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+      if (str1_isL != str2_isL) {
+        __ fmovd(ch1, v1); // move converted 4 symbols
+      }
+    __ BIND(L_SMALL_PROCEED);
+      __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
+      __ bic(tmp2, tmp2, ch2);
+      __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
+      __ rbit(tmp2, tmp2);
+      __ br(__ EQ, NOMATCH);
+    __ BIND(L_SMALL_HAS_ZERO_LOOP);
+      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
+      __ cmp(cnt1, wordSize/str2_chr_size);
+      __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
+      if (str2_isL) { // LL
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
+        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+      } else {
+        __ mov(ch2, 0xE); // all bits in byte set except last one
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+      }
+      __ cmp(ch1, ch2);
+      __ mov(tmp4, wordSize/str2_chr_size);
+      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
+    __ BIND(L_SMALL_CMP_LOOP);
+      str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
+               : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
+      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
+               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
+      __ add(tmp4, tmp4, 1);
+      __ cmp(tmp4, cnt1);
+      __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
+      __ cmp(first, ch2);
+      __ br(__ EQ, L_SMALL_CMP_LOOP);
+    __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
+      __ cbz(tmp2, NOMATCH); // no more matches. exit
+      __ clz(tmp4, tmp2);
+      __ add(result, result, 1); // advance index
+      __ add(str2, str2, str2_chr_size); // advance pointer
+      __ b(L_SMALL_HAS_ZERO_LOOP);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
+      __ cmp(first, ch2);
+      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
+      if (str2_isL) { // LL
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
+        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+      } else {
+        __ mov(ch2, 0xE); // all bits in byte set except last one
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+      }
+      __ cmp(ch1, ch2);
+      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_HAS_ZERO);
+      __ rbit(tmp2, tmp2);
+      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
+      // Now, perform compression of counters (cnt2 and cnt1) into one register.
+      // It's fine because both counters are 32bit and are not changed in this
+      // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
+      __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
+      __ sub(result, result, 1);
+    __ BIND(L_HAS_ZERO_LOOP);
+      __ mov(cnt1, wordSize/str2_chr_size);
+      __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
+      __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
+      if (str2_isL) {
+        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1);
+        __ mov(tmp4, wordSize/str2_chr_size);
+      } else {
+        __ mov(ch2, 0xE);
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1);
+        __ mov(tmp4, wordSize/str2_chr_size);
+        __ sub(str2, str2, str2_chr_size);
+      }
+      __ cmp(ch1, ch2);
+      __ mov(tmp4, wordSize/str2_chr_size);
+      __ br(__ NE, L_CMP_LOOP_NOMATCH);
+    __ BIND(L_CMP_LOOP);
+      str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
+               : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
+      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
+               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
+      __ add(tmp4, tmp4, 1);
+      __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
+      __ br(__ GE, L_CMP_LOOP_LAST_CMP);
+      __ cmp(cnt1, ch2);
+      __ br(__ EQ, L_CMP_LOOP);
+    __ BIND(L_CMP_LOOP_NOMATCH);
+      // no match here
+      __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
+      __ clz(tmp4, tmp2);
+      __ add(str2, str2, str2_chr_size); // advance pointer
+      __ b(L_HAS_ZERO_LOOP);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_CMP_LOOP_LAST_CMP);
+      __ cmp(cnt1, ch2);
+      __ br(__ NE, L_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_CMP_LOOP_LAST_CMP2);
+      if (str2_isL) {
+        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1);
+      } else {
+        __ mov(ch2, 0xE);
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1);
+        __ sub(str2, str2, str2_chr_size);
+      }
+      __ cmp(ch1, ch2);
+      __ br(__ NE, L_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
+      // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
+      //    the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
+      //    so result was increased by at most wordSize/str2_chr_size - 1 and the
+      //    respective high bit wasn't changed. L_LOOP_PROCEED will increase
+      //    result by the number of analyzed characters, so we can just reset the
+      //    lower bits of result here: clear 2 lower bits for UU/UL and 3 for LL.
+      // 2) restore cnt1 and cnt2 values from "compressed" cnt2
+      // 3) advance str2 to the next str2 octet. result & 7/3 is the index of the
+      //    last analyzed substring inside the current octet, so str2 is at the
+      //    respective start address. We need to advance it to the next octet.
+      __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
+      __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
+      __ bfm(result, zr, 0, 2 - str2_chr_shift);
+      __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
+      __ movw(cnt2, cnt2);
+      __ b(L_LOOP_PROCEED);
+    __ align(OptoLoopAlignment);
+    __ BIND(NOMATCH);
+      __ mov(result, -1);
+    __ BIND(DONE);
+      __ pop(spilled_regs, sp);
+      __ ret(lr);
+    return entry;
+  }
+
+  void generate_string_indexof_stubs() {
+    StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
+    StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
+    StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
+  }
 
   /**
    * Arguments:
@@ -5076,6 +5374,8 @@
       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
     }
 
+    generate_string_indexof_stubs();
+
     if (UseMultiplyToLenIntrinsic) {
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
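
Note (not part of the patch): idea 2) in the header comment refers to the classic SWAR zero-byte
detection trick. The sketch below is a minimal, standalone C++ illustration of that trick for the
Latin-1 case; the UTF-16 variants simply use the 16-bit analogues (0x0001...0001 and
0x7fff...7fff). The function name find_first_byte and the GCC/Clang builtin __builtin_ctzll are
assumptions for the sketch only; the stub implements the same idea with eor/sub/orr/bics followed
by rbit/clz, and always re-verifies every candidate position with a full comparison loop.

    // Illustrative sketch only: locate the first byte equal to 'target' within
    // 8 bytes. Broadcast the target with a 0x0101...01 multiply, XOR so that
    // matching bytes become zero, then detect zero bytes via
    // (x - 0x01..01) & ~x & 0x80..80.
    #include <cstdint>
    #include <cstring>

    static int find_first_byte(const uint8_t* s, uint8_t target) {
      uint64_t word;
      std::memcpy(&word, s, sizeof(word));               // load 8 bytes at once
      uint64_t pattern = target * 0x0101010101010101ULL; // replicate target byte
      uint64_t x = word ^ pattern;                       // matching bytes -> 0x00
      uint64_t hit = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
      if (hit == 0) {
        return -1;                                       // no candidate in this word
      }
      // The lowest set bit marks the first matching byte (little-endian).
      // Higher bytes may carry borrow artifacts, which is harmless because,
      // like the stub, the caller re-verifies each candidate position.
      return __builtin_ctzll(hit) >> 3;
    }

This is why a single branch per loaded register suffices: one 64-bit test answers "is the first
pattern character anywhere in these 8 bytes (or 4 chars)", and only candidate positions fall
through to the character-by-character comparison loops.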