1 /*
   2  * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 
  31 #ifdef PRODUCT
  32 #define BLOCK_COMMENT(str) /* nothing */
  33 #define STOP(error) stop(error)
  34 #else
  35 #define BLOCK_COMMENT(str) block_comment(str)
  36 #define STOP(error) block_comment(error); stop(error)
  37 #endif
  38 
  39 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  40 
  41 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  42 
  43 // Search for str1 in str2 and return index or -1
  44 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
  45                                        Register cnt2, Register cnt1,
  46                                        Register tmp1, Register tmp2,
  47                                        Register tmp3, Register tmp4,
  48                                        Register tmp5, Register tmp6,
  49                                        int icnt1, Register result, int ae) {
  50   // NOTE: tmp5, tmp6 can be zr depending on specific method version
  51   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
  52 
  53   Register ch1 = rscratch1;
  54   Register ch2 = rscratch2;
  55   Register cnt1tmp = tmp1;
  56   Register cnt2tmp = tmp2;
  57   Register cnt1_neg = cnt1;
  58   Register cnt2_neg = cnt2;
  59   Register result_tmp = tmp4;
  60 
  61   bool isL = ae == StrIntrinsicNode::LL;
  62 
  63   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  64   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  65   int str1_chr_shift = str1_isL ? 0:1;
  66   int str2_chr_shift = str2_isL ? 0:1;
  67   int str1_chr_size = str1_isL ? 1:2;
  68   int str2_chr_size = str2_isL ? 1:2;
  69   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
  70                                       (chr_insn)&MacroAssembler::ldrh;
  71   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
  72                                       (chr_insn)&MacroAssembler::ldrh;
  73   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  74   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
  75 
  76   // Note, inline_string_indexOf() generates checks:
  77   // if (substr.count > string.count) return -1;
  78   // if (substr.count == 0) return 0;
  79 
  80   // We have two strings, a source string in str2, cnt2 and a pattern string
  81   // in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
  82 
  83   // For larger pattern and source we use a simplified Boyer Moore algorithm.
  84   // With a small pattern and source we use linear scan.
  85 
  86   if (icnt1 == -1) {
  87     sub(result_tmp, cnt2, cnt1);
  88     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
  89     br(LT, LINEARSEARCH);
  90     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
  91     subs(zr, cnt1, 256);
  92     lsr(tmp1, cnt2, 2);
  93     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
  94     br(GE, LINEARSTUB);
  95   }
  96 
  97 // The Boyer Moore alogorithm is based on the description here:-
  98 //
  99 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 100 //
 101 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 102 // and the 'Good Suffix' rule.
 103 //
 104 // These rules are essentially heuristics for how far we can shift the
 105 // pattern along the search string.
 106 //
 107 // The implementation here uses the 'Bad Character' rule only because of the
 108 // complexity of initialisation for the 'Good Suffix' rule.
 109 //
 110 // This is also known as the Boyer-Moore-Horspool algorithm:-
 111 //
 112 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 113 //
 114 // This particular implementation has few java-specific optimizations.
 115 //
 116 // #define ASIZE 256
 117 //
 118 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 119 //       int i, j;
 120 //       unsigned c;
 121 //       unsigned char bc[ASIZE];
 122 //
 123 //       /* Preprocessing */
 124 //       for (i = 0; i < ASIZE; ++i)
 125 //          bc[i] = m;
 126 //       for (i = 0; i < m - 1; ) {
 127 //          c = x[i];
 128 //          ++i;
 129 //          // c < 256 for Latin1 string, so, no need for branch
 130 //          #ifdef PATTERN_STRING_IS_LATIN1
 131 //          bc[c] = m - i;
 132 //          #else
 133 //          if (c < ASIZE) bc[c] = m - i;
 134 //          #endif
 135 //       }
 136 //
 137 //       /* Searching */
 138 //       j = 0;
 139 //       while (j <= n - m) {
 140 //          c = y[i+j];
 141 //          if (x[m-1] == c)
 142 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 143 //          if (i < 0) return j;
 144 //          // c < 256 for Latin1 string, so, no need for branch
 145 //          #ifdef SOURCE_STRING_IS_LATIN1
 146 //          // LL case: (c< 256) always true. Remove branch
 147 //          j += bc[y[j+m-1]];
 148 //          #endif
 149 //          #ifndef PATTERN_STRING_IS_UTF
 150 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 151 //          if (c < ASIZE)
 152 //            j += bc[y[j+m-1]];
 153 //          else
 154 //            j += 1
 155 //          #endif
 156 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 157 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 158 //          if (c < ASIZE)
 159 //            j += bc[y[j+m-1]];
 160 //          else
 161 //            j += m
 162 //          #endif
 163 //       }
 164 //    }
 165 
 166   if (icnt1 == -1) {
 167     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 168         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 169     Register cnt1end = tmp2;
 170     Register str2end = cnt2;
 171     Register skipch = tmp2;
 172 
 173     // str1 length is >=8, so, we can read at least 1 register for cases when
 174     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 175     // UL case. We'll re-read last character in inner pre-loop code to have
 176     // single outer pre-loop load
 177     const int firstStep = isL ? 7 : 3;
 178 
 179     const int ASIZE = 256;
 180     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 181     sub(sp, sp, ASIZE);
 182     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 183     mov(ch1, sp);
 184     BIND(BM_INIT_LOOP);
 185       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 186       subs(tmp5, tmp5, 1);
 187       br(GT, BM_INIT_LOOP);
 188 
 189       sub(cnt1tmp, cnt1, 1);
 190       mov(tmp5, str2);
 191       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 192       sub(ch2, cnt1, 1);
 193       mov(tmp3, str1);
 194     BIND(BCLOOP);
 195       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 196       if (!str1_isL) {
 197         subs(zr, ch1, ASIZE);
 198         br(HS, BCSKIP);
 199       }
 200       strb(ch2, Address(sp, ch1));
 201     BIND(BCSKIP);
 202       subs(ch2, ch2, 1);
 203       br(GT, BCLOOP);
 204 
 205       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 206       if (str1_isL == str2_isL) {
 207         // load last 8 bytes (8LL/4UU symbols)
 208         ldr(tmp6, Address(tmp6, -wordSize));
 209       } else {
 210         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 211         // convert Latin1 to UTF. We'll have to wait until load completed, but
 212         // it's still faster than per-character loads+checks
 213         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 214         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 215         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 216         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 217         orr(ch2, ch1, ch2, LSL, 16);
 218         orr(tmp6, tmp6, tmp3, LSL, 48);
 219         orr(tmp6, tmp6, ch2, LSL, 16);
 220       }
 221     BIND(BMLOOPSTR2);
 222       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 223       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 224       if (str1_isL == str2_isL) {
 225         // re-init tmp3. It's for free because it's executed in parallel with
 226         // load above. Alternative is to initialize it before loop, but it'll
 227         // affect performance on in-order systems with 2 or more ld/st pipelines
 228         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 229       }
 230       if (!isL) { // UU/UL case
 231         lsl(ch2, cnt1tmp, 1); // offset in bytes
 232       }
 233       cmp(tmp3, skipch);
 234       br(NE, BMSKIP);
 235       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 236       mov(ch1, tmp6);
 237       if (isL) {
 238         b(BMLOOPSTR1_AFTER_LOAD);
 239       } else {
 240         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 241         b(BMLOOPSTR1_CMP);
 242       }
 243     BIND(BMLOOPSTR1);
 244       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 245       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 246     BIND(BMLOOPSTR1_AFTER_LOAD);
 247       subs(cnt1tmp, cnt1tmp, 1);
 248       br(LT, BMLOOPSTR1_LASTCMP);
 249     BIND(BMLOOPSTR1_CMP);
 250       cmp(ch1, ch2);
 251       br(EQ, BMLOOPSTR1);
 252     BIND(BMSKIP);
 253       if (!isL) {
 254         // if we've met UTF symbol while searching Latin1 pattern, then we can
 255         // skip cnt1 symbols
 256         if (str1_isL != str2_isL) {
 257           mov(result_tmp, cnt1);
 258         } else {
 259           mov(result_tmp, 1);
 260         }
 261         subs(zr, skipch, ASIZE);
 262         br(HS, BMADV);
 263       }
 264       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 265     BIND(BMADV);
 266       sub(cnt1tmp, cnt1, 1);
 267       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 268       cmp(str2, str2end);
 269       br(LE, BMLOOPSTR2);
 270       add(sp, sp, ASIZE);
 271       b(NOMATCH);
 272     BIND(BMLOOPSTR1_LASTCMP);
 273       cmp(ch1, ch2);
 274       br(NE, BMSKIP);
 275     BIND(BMMATCH);
 276       sub(result, str2, tmp5);
 277       if (!str2_isL) lsr(result, result, 1);
 278       add(sp, sp, ASIZE);
 279       b(DONE);
 280 
 281     BIND(LINEARSTUB);
 282     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 283     br(LT, LINEAR_MEDIUM);
 284     mov(result, zr);
 285     RuntimeAddress stub = NULL;
 286     if (isL) {
 287       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 288       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
 289     } else if (str1_isL) {
 290       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 291        assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
 292     } else {
 293       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 294       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
 295     }
 296     trampoline_call(stub);
 297     b(DONE);
 298   }
 299 
 300   BIND(LINEARSEARCH);
 301   {
 302     Label DO1, DO2, DO3;
 303 
 304     Register str2tmp = tmp2;
 305     Register first = tmp3;
 306 
 307     if (icnt1 == -1)
 308     {
 309         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 310 
 311         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 312         br(LT, DOSHORT);
 313       BIND(LINEAR_MEDIUM);
 314         (this->*str1_load_1chr)(first, Address(str1));
 315         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 316         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 317         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 318         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 319 
 320       BIND(FIRST_LOOP);
 321         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 322         cmp(first, ch2);
 323         br(EQ, STR1_LOOP);
 324       BIND(STR2_NEXT);
 325         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 326         br(LE, FIRST_LOOP);
 327         b(NOMATCH);
 328 
 329       BIND(STR1_LOOP);
 330         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 331         add(cnt2tmp, cnt2_neg, str2_chr_size);
 332         br(GE, MATCH);
 333 
 334       BIND(STR1_NEXT);
 335         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 336         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 337         cmp(ch1, ch2);
 338         br(NE, STR2_NEXT);
 339         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 340         add(cnt2tmp, cnt2tmp, str2_chr_size);
 341         br(LT, STR1_NEXT);
 342         b(MATCH);
 343 
 344       BIND(DOSHORT);
 345       if (str1_isL == str2_isL) {
 346         cmp(cnt1, (u1)2);
 347         br(LT, DO1);
 348         br(GT, DO3);
 349       }
 350     }
 351 
 352     if (icnt1 == 4) {
 353       Label CH1_LOOP;
 354 
 355         (this->*load_4chr)(ch1, str1);
 356         sub(result_tmp, cnt2, 4);
 357         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 358         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 359 
 360       BIND(CH1_LOOP);
 361         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 362         cmp(ch1, ch2);
 363         br(EQ, MATCH);
 364         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 365         br(LE, CH1_LOOP);
 366         b(NOMATCH);
 367       }
 368 
 369     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 370       Label CH1_LOOP;
 371 
 372       BIND(DO2);
 373         (this->*load_2chr)(ch1, str1);
 374         if (icnt1 == 2) {
 375           sub(result_tmp, cnt2, 2);
 376         }
 377         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 378         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 379       BIND(CH1_LOOP);
 380         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 381         cmp(ch1, ch2);
 382         br(EQ, MATCH);
 383         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 384         br(LE, CH1_LOOP);
 385         b(NOMATCH);
 386     }
 387 
 388     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 389       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 390 
 391       BIND(DO3);
 392         (this->*load_2chr)(first, str1);
 393         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 394         if (icnt1 == 3) {
 395           sub(result_tmp, cnt2, 3);
 396         }
 397         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 398         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 399       BIND(FIRST_LOOP);
 400         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 401         cmpw(first, ch2);
 402         br(EQ, STR1_LOOP);
 403       BIND(STR2_NEXT);
 404         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 405         br(LE, FIRST_LOOP);
 406         b(NOMATCH);
 407 
 408       BIND(STR1_LOOP);
 409         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 410         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 411         cmp(ch1, ch2);
 412         br(NE, STR2_NEXT);
 413         b(MATCH);
 414     }
 415 
 416     if (icnt1 == -1 || icnt1 == 1) {
 417       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 418 
 419       BIND(DO1);
 420         (this->*str1_load_1chr)(ch1, str1);
 421         cmp(cnt2, (u1)8);
 422         br(LT, DO1_SHORT);
 423 
 424         sub(result_tmp, cnt2, 8/str2_chr_size);
 425         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 426         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 427         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 428 
 429         if (str2_isL) {
 430           orr(ch1, ch1, ch1, LSL, 8);
 431         }
 432         orr(ch1, ch1, ch1, LSL, 16);
 433         orr(ch1, ch1, ch1, LSL, 32);
 434       BIND(CH1_LOOP);
 435         ldr(ch2, Address(str2, cnt2_neg));
 436         eor(ch2, ch1, ch2);
 437         sub(tmp1, ch2, tmp3);
 438         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 439         bics(tmp1, tmp1, tmp2);
 440         br(NE, HAS_ZERO);
 441         adds(cnt2_neg, cnt2_neg, 8);
 442         br(LT, CH1_LOOP);
 443 
 444         cmp(cnt2_neg, (u1)8);
 445         mov(cnt2_neg, 0);
 446         br(LT, CH1_LOOP);
 447         b(NOMATCH);
 448 
 449       BIND(HAS_ZERO);
 450         rev(tmp1, tmp1);
 451         clz(tmp1, tmp1);
 452         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 453         b(MATCH);
 454 
 455       BIND(DO1_SHORT);
 456         mov(result_tmp, cnt2);
 457         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 458         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 459       BIND(DO1_LOOP);
 460         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 461         cmpw(ch1, ch2);
 462         br(EQ, MATCH);
 463         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 464         br(LT, DO1_LOOP);
 465     }
 466   }
 467   BIND(NOMATCH);
 468     mov(result, -1);
 469     b(DONE);
 470   BIND(MATCH);
 471     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 472   BIND(DONE);
 473 }
 474 
 475 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
 476 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 477 
 478 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 479                                             Register ch, Register result,
 480                                             Register tmp1, Register tmp2, Register tmp3)
 481 {
 482   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 483   Register cnt1_neg = cnt1;
 484   Register ch1 = rscratch1;
 485   Register result_tmp = rscratch2;
 486 
 487   cbz(cnt1, NOMATCH);
 488 
 489   cmp(cnt1, (u1)4);
 490   br(LT, DO1_SHORT);
 491 
 492   orr(ch, ch, ch, LSL, 16);
 493   orr(ch, ch, ch, LSL, 32);
 494 
 495   sub(cnt1, cnt1, 4);
 496   mov(result_tmp, cnt1);
 497   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 498   sub(cnt1_neg, zr, cnt1, LSL, 1);
 499 
 500   mov(tmp3, 0x0001000100010001);
 501 
 502   BIND(CH1_LOOP);
 503     ldr(ch1, Address(str1, cnt1_neg));
 504     eor(ch1, ch, ch1);
 505     sub(tmp1, ch1, tmp3);
 506     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 507     bics(tmp1, tmp1, tmp2);
 508     br(NE, HAS_ZERO);
 509     adds(cnt1_neg, cnt1_neg, 8);
 510     br(LT, CH1_LOOP);
 511 
 512     cmp(cnt1_neg, (u1)8);
 513     mov(cnt1_neg, 0);
 514     br(LT, CH1_LOOP);
 515     b(NOMATCH);
 516 
 517   BIND(HAS_ZERO);
 518     rev(tmp1, tmp1);
 519     clz(tmp1, tmp1);
 520     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 521     b(MATCH);
 522 
 523   BIND(DO1_SHORT);
 524     mov(result_tmp, cnt1);
 525     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 526     sub(cnt1_neg, zr, cnt1, LSL, 1);
 527   BIND(DO1_LOOP);
 528     ldrh(ch1, Address(str1, cnt1_neg));
 529     cmpw(ch, ch1);
 530     br(EQ, MATCH);
 531     adds(cnt1_neg, cnt1_neg, 2);
 532     br(LT, DO1_LOOP);
 533   BIND(NOMATCH);
 534     mov(result, -1);
 535     b(DONE);
 536   BIND(MATCH);
 537     add(result, result_tmp, cnt1_neg, ASR, 1);
 538   BIND(DONE);
 539 }
 540 
 541 // Compare strings.
 542 void C2_MacroAssembler::string_compare(Register str1, Register str2,
 543     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
 544     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
 545   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
 546       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
 547       SHORT_LOOP_START, TAIL_CHECK;
 548 
 549   bool isLL = ae == StrIntrinsicNode::LL;
 550   bool isLU = ae == StrIntrinsicNode::LU;
 551   bool isUL = ae == StrIntrinsicNode::UL;
 552 
 553   // The stub threshold for LL strings is: 72 (64 + 8) chars
 554   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
 555   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
 556   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
 557 
 558   bool str1_isL = isLL || isLU;
 559   bool str2_isL = isLL || isUL;
 560 
 561   int str1_chr_shift = str1_isL ? 0 : 1;
 562   int str2_chr_shift = str2_isL ? 0 : 1;
 563   int str1_chr_size = str1_isL ? 1 : 2;
 564   int str2_chr_size = str2_isL ? 1 : 2;
 565   int minCharsInWord = isLL ? wordSize : wordSize/2;
 566 
 567   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
 568   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 569                                       (chr_insn)&MacroAssembler::ldrh;
 570   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 571                                       (chr_insn)&MacroAssembler::ldrh;
 572   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
 573                             (uxt_insn)&MacroAssembler::uxthw;
 574 
 575   BLOCK_COMMENT("string_compare {");
 576 
 577   // Bizzarely, the counts are passed in bytes, regardless of whether they
 578   // are L or U strings, however the result is always in characters.
 579   if (!str1_isL) asrw(cnt1, cnt1, 1);
 580   if (!str2_isL) asrw(cnt2, cnt2, 1);
 581 
 582   // Compute the minimum of the string lengths and save the difference.
 583   subsw(result, cnt1, cnt2);
 584   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
 585 
 586   // A very short string
 587   cmpw(cnt2, minCharsInWord);
 588   br(Assembler::LE, SHORT_STRING);
 589 
 590   // Compare longwords
 591   // load first parts of strings and finish initialization while loading
 592   {
 593     if (str1_isL == str2_isL) { // LL or UU
 594       ldr(tmp1, Address(str1));
 595       cmp(str1, str2);
 596       br(Assembler::EQ, DONE);
 597       ldr(tmp2, Address(str2));
 598       cmp(cnt2, stub_threshold);
 599       br(GE, STUB);
 600       subsw(cnt2, cnt2, minCharsInWord);
 601       br(EQ, TAIL_CHECK);
 602       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
 603       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
 604       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
 605     } else if (isLU) {
 606       ldrs(vtmp, Address(str1));
 607       ldr(tmp2, Address(str2));
 608       cmp(cnt2, stub_threshold);
 609       br(GE, STUB);
 610       subw(cnt2, cnt2, 4);
 611       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
 612       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
 613       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
 614       zip1(vtmp, T8B, vtmp, vtmpZ);
 615       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
 616       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
 617       add(cnt1, cnt1, 4);
 618       fmovd(tmp1, vtmp);
 619     } else { // UL case
 620       ldr(tmp1, Address(str1));
 621       ldrs(vtmp, Address(str2));
 622       cmp(cnt2, stub_threshold);
 623       br(GE, STUB);
 624       subw(cnt2, cnt2, 4);
 625       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
 626       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
 627       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
 628       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
 629       zip1(vtmp, T8B, vtmp, vtmpZ);
 630       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
 631       add(cnt1, cnt1, 8);
 632       fmovd(tmp2, vtmp);
 633     }
 634     adds(cnt2, cnt2, isUL ? 4 : 8);
 635     br(GE, TAIL);
 636     eor(rscratch2, tmp1, tmp2);
 637     cbnz(rscratch2, DIFF);
 638     // main loop
 639     bind(NEXT_WORD);
 640     if (str1_isL == str2_isL) {
 641       ldr(tmp1, Address(str1, cnt2));
 642       ldr(tmp2, Address(str2, cnt2));
 643       adds(cnt2, cnt2, 8);
 644     } else if (isLU) {
 645       ldrs(vtmp, Address(str1, cnt1));
 646       ldr(tmp2, Address(str2, cnt2));
 647       add(cnt1, cnt1, 4);
 648       zip1(vtmp, T8B, vtmp, vtmpZ);
 649       fmovd(tmp1, vtmp);
 650       adds(cnt2, cnt2, 8);
 651     } else { // UL
 652       ldrs(vtmp, Address(str2, cnt2));
 653       ldr(tmp1, Address(str1, cnt1));
 654       zip1(vtmp, T8B, vtmp, vtmpZ);
 655       add(cnt1, cnt1, 8);
 656       fmovd(tmp2, vtmp);
 657       adds(cnt2, cnt2, 4);
 658     }
 659     br(GE, TAIL);
 660 
 661     eor(rscratch2, tmp1, tmp2);
 662     cbz(rscratch2, NEXT_WORD);
 663     b(DIFF);
 664     bind(TAIL);
 665     eor(rscratch2, tmp1, tmp2);
 666     cbnz(rscratch2, DIFF);
 667     // Last longword.  In the case where length == 4 we compare the
 668     // same longword twice, but that's still faster than another
 669     // conditional branch.
 670     if (str1_isL == str2_isL) {
 671       ldr(tmp1, Address(str1));
 672       ldr(tmp2, Address(str2));
 673     } else if (isLU) {
 674       ldrs(vtmp, Address(str1));
 675       ldr(tmp2, Address(str2));
 676       zip1(vtmp, T8B, vtmp, vtmpZ);
 677       fmovd(tmp1, vtmp);
 678     } else { // UL
 679       ldrs(vtmp, Address(str2));
 680       ldr(tmp1, Address(str1));
 681       zip1(vtmp, T8B, vtmp, vtmpZ);
 682       fmovd(tmp2, vtmp);
 683     }
 684     bind(TAIL_CHECK);
 685     eor(rscratch2, tmp1, tmp2);
 686     cbz(rscratch2, DONE);
 687 
 688     // Find the first different characters in the longwords and
 689     // compute their difference.
 690     bind(DIFF);
 691     rev(rscratch2, rscratch2);
 692     clz(rscratch2, rscratch2);
 693     andr(rscratch2, rscratch2, isLL ? -8 : -16);
 694     lsrv(tmp1, tmp1, rscratch2);
 695     (this->*ext_chr)(tmp1, tmp1);
 696     lsrv(tmp2, tmp2, rscratch2);
 697     (this->*ext_chr)(tmp2, tmp2);
 698     subw(result, tmp1, tmp2);
 699     b(DONE);
 700   }
 701 
 702   bind(STUB);
 703     RuntimeAddress stub = NULL;
 704     switch(ae) {
 705       case StrIntrinsicNode::LL:
 706         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
 707         break;
 708       case StrIntrinsicNode::UU:
 709         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
 710         break;
 711       case StrIntrinsicNode::LU:
 712         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
 713         break;
 714       case StrIntrinsicNode::UL:
 715         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
 716         break;
 717       default:
 718         ShouldNotReachHere();
 719      }
 720     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
 721     trampoline_call(stub);
 722     b(DONE);
 723 
 724   bind(SHORT_STRING);
 725   // Is the minimum length zero?
 726   cbz(cnt2, DONE);
 727   // arrange code to do most branches while loading and loading next characters
 728   // while comparing previous
 729   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
 730   subs(cnt2, cnt2, 1);
 731   br(EQ, SHORT_LAST_INIT);
 732   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
 733   b(SHORT_LOOP_START);
 734   bind(SHORT_LOOP);
 735   subs(cnt2, cnt2, 1);
 736   br(EQ, SHORT_LAST);
 737   bind(SHORT_LOOP_START);
 738   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
 739   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
 740   cmp(tmp1, cnt1);
 741   br(NE, SHORT_LOOP_TAIL);
 742   subs(cnt2, cnt2, 1);
 743   br(EQ, SHORT_LAST2);
 744   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
 745   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
 746   cmp(tmp2, rscratch1);
 747   br(EQ, SHORT_LOOP);
 748   sub(result, tmp2, rscratch1);
 749   b(DONE);
 750   bind(SHORT_LOOP_TAIL);
 751   sub(result, tmp1, cnt1);
 752   b(DONE);
 753   bind(SHORT_LAST2);
 754   cmp(tmp2, rscratch1);
 755   br(EQ, DONE);
 756   sub(result, tmp2, rscratch1);
 757 
 758   b(DONE);
 759   bind(SHORT_LAST_INIT);
 760   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
 761   bind(SHORT_LAST);
 762   cmp(tmp1, cnt1);
 763   br(EQ, DONE);
 764   sub(result, tmp1, cnt1);
 765 
 766   bind(DONE);
 767 
 768   BLOCK_COMMENT("} string_compare");
 769 }