1 /*
   2  * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 package org.graalvm.compiler.asm.amd64;
  24 
  25 import static jdk.vm.ci.amd64.AMD64.rax;
  26 import static jdk.vm.ci.amd64.AMD64.rcx;
  27 import static jdk.vm.ci.amd64.AMD64.rdx;
  28 import static jdk.vm.ci.amd64.AMD64.rsp;
  29 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseIncDec;
  30 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmLoadAndClearUpper;
  31 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmRegToRegMoveAll;
  32 
  33 import org.graalvm.compiler.asm.Label;
  34 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  35 import org.graalvm.compiler.core.common.NumUtil;
  36 
  37 import jdk.vm.ci.amd64.AMD64;
  38 import jdk.vm.ci.amd64.AMD64Kind;
  39 import jdk.vm.ci.code.Register;
  40 import jdk.vm.ci.code.TargetDescription;
  41 
  42 /**
  43  * This class implements commonly used X86 code patterns.
  44  */
  45 public class AMD64MacroAssembler extends AMD64Assembler {
  46 
  47     public AMD64MacroAssembler(TargetDescription target) {
  48         super(target);
  49     }
  50 
  51     public final void decrementq(Register reg, int value) {
  52         if (value == Integer.MIN_VALUE) {
  53             subq(reg, value);
  54             return;
  55         }
  56         if (value < 0) {
  57             incrementq(reg, -value);
  58             return;
  59         }
  60         if (value == 0) {
  61             return;
  62         }
  63         if (value == 1 && UseIncDec) {
  64             decq(reg);
  65         } else {
  66             subq(reg, value);
  67         }
  68     }
  69 
  70     public final void decrementq(AMD64Address dst, int value) {
  71         if (value == Integer.MIN_VALUE) {
  72             subq(dst, value);
  73             return;
  74         }
  75         if (value < 0) {
  76             incrementq(dst, -value);
  77             return;
  78         }
  79         if (value == 0) {
  80             return;
  81         }
  82         if (value == 1 && UseIncDec) {
  83             decq(dst);
  84         } else {
  85             subq(dst, value);
  86         }
  87     }
  88 
  89     public void incrementq(Register reg, int value) {
  90         if (value == Integer.MIN_VALUE) {
  91             addq(reg, value);
  92             return;
  93         }
  94         if (value < 0) {
  95             decrementq(reg, -value);
  96             return;
  97         }
  98         if (value == 0) {
  99             return;
 100         }
 101         if (value == 1 && UseIncDec) {
 102             incq(reg);
 103         } else {
 104             addq(reg, value);
 105         }
 106     }
 107 
 108     public final void incrementq(AMD64Address dst, int value) {
 109         if (value == Integer.MIN_VALUE) {
 110             addq(dst, value);
 111             return;
 112         }
 113         if (value < 0) {
 114             decrementq(dst, -value);
 115             return;
 116         }
 117         if (value == 0) {
 118             return;
 119         }
 120         if (value == 1 && UseIncDec) {
 121             incq(dst);
 122         } else {
 123             addq(dst, value);
 124         }
 125     }
 126 
 127     public final void movptr(Register dst, AMD64Address src) {
 128         movq(dst, src);
 129     }
 130 
 131     public final void movptr(AMD64Address dst, Register src) {
 132         movq(dst, src);
 133     }
 134 
 135     public final void movptr(AMD64Address dst, int src) {
 136         movslq(dst, src);
 137     }
 138 
 139     public final void cmpptr(Register src1, Register src2) {
 140         cmpq(src1, src2);
 141     }
 142 
 143     public final void cmpptr(Register src1, AMD64Address src2) {
 144         cmpq(src1, src2);
 145     }
 146 
 147     public final void decrementl(Register reg) {
 148         decrementl(reg, 1);
 149     }
 150 
 151     public final void decrementl(Register reg, int value) {
 152         if (value == Integer.MIN_VALUE) {
 153             subl(reg, value);
 154             return;
 155         }
 156         if (value < 0) {
 157             incrementl(reg, -value);
 158             return;
 159         }
 160         if (value == 0) {
 161             return;
 162         }
 163         if (value == 1 && UseIncDec) {
 164             decl(reg);
 165         } else {
 166             subl(reg, value);
 167         }
 168     }
 169 
 170     public final void decrementl(AMD64Address dst, int value) {
 171         if (value == Integer.MIN_VALUE) {
 172             subl(dst, value);
 173             return;
 174         }
 175         if (value < 0) {
 176             incrementl(dst, -value);
 177             return;
 178         }
 179         if (value == 0) {
 180             return;
 181         }
 182         if (value == 1 && UseIncDec) {
 183             decl(dst);
 184         } else {
 185             subl(dst, value);
 186         }
 187     }
 188 
 189     public final void incrementl(Register reg, int value) {
 190         if (value == Integer.MIN_VALUE) {
 191             addl(reg, value);
 192             return;
 193         }
 194         if (value < 0) {
 195             decrementl(reg, -value);
 196             return;
 197         }
 198         if (value == 0) {
 199             return;
 200         }
 201         if (value == 1 && UseIncDec) {
 202             incl(reg);
 203         } else {
 204             addl(reg, value);
 205         }
 206     }
 207 
 208     public final void incrementl(AMD64Address dst, int value) {
 209         if (value == Integer.MIN_VALUE) {
 210             addl(dst, value);
 211             return;
 212         }
 213         if (value < 0) {
 214             decrementl(dst, -value);
 215             return;
 216         }
 217         if (value == 0) {
 218             return;
 219         }
 220         if (value == 1 && UseIncDec) {
 221             incl(dst);
 222         } else {
 223             addl(dst, value);
 224         }
 225     }
 226 
 227     public void movflt(Register dst, Register src) {
 228         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
 229         if (UseXmmRegToRegMoveAll) {
 230             movaps(dst, src);
 231         } else {
 232             movss(dst, src);
 233         }
 234     }
 235 
 236     public void movflt(Register dst, AMD64Address src) {
 237         assert dst.getRegisterCategory().equals(AMD64.XMM);
 238         movss(dst, src);
 239     }
 240 
 241     public void movflt(AMD64Address dst, Register src) {
 242         assert src.getRegisterCategory().equals(AMD64.XMM);
 243         movss(dst, src);
 244     }
 245 
 246     public void movdbl(Register dst, Register src) {
 247         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
 248         if (UseXmmRegToRegMoveAll) {
 249             movapd(dst, src);
 250         } else {
 251             movsd(dst, src);
 252         }
 253     }
 254 
 255     public void movdbl(Register dst, AMD64Address src) {
 256         assert dst.getRegisterCategory().equals(AMD64.XMM);
 257         if (UseXmmLoadAndClearUpper) {
 258             movsd(dst, src);
 259         } else {
 260             movlpd(dst, src);
 261         }
 262     }
 263 
 264     public void movdbl(AMD64Address dst, Register src) {
 265         assert src.getRegisterCategory().equals(AMD64.XMM);
 266         movsd(dst, src);
 267     }
 268 
 269     /**
 270      * Non-atomic write of a 64-bit constant to memory. Do not use if the address might be a
 271      * volatile field!
 272      */
 273     public final void movlong(AMD64Address dst, long src) {
 274         if (NumUtil.isInt(src)) {
 275             AMD64MIOp.MOV.emit(this, OperandSize.QWORD, dst, (int) src);
 276         } else {
 277             AMD64Address high = new AMD64Address(dst.getBase(), dst.getIndex(), dst.getScale(), dst.getDisplacement() + 4);
 278             movl(dst, (int) (src & 0xFFFFFFFF));
 279             movl(high, (int) (src >> 32));
 280         }
 281 
 282     }
 283 
 284     public final void setl(ConditionFlag cc, Register dst) {
 285         setb(cc, dst);
 286         movzbl(dst, dst);
 287     }
 288 
 289     public final void setq(ConditionFlag cc, Register dst) {
 290         setb(cc, dst);
 291         movzbq(dst, dst);
 292     }
 293 
 294     public final void flog(Register dest, Register value, boolean base10) {
 295         if (base10) {
 296             fldlg2();
 297         } else {
 298             fldln2();
 299         }
 300         AMD64Address tmp = trigPrologue(value);
 301         fyl2x();
 302         trigEpilogue(dest, tmp);
 303     }
 304 
 305     public final void fsin(Register dest, Register value) {
 306         AMD64Address tmp = trigPrologue(value);
 307         fsin();
 308         trigEpilogue(dest, tmp);
 309     }
 310 
 311     public final void fcos(Register dest, Register value) {
 312         AMD64Address tmp = trigPrologue(value);
 313         fcos();
 314         trigEpilogue(dest, tmp);
 315     }
 316 
 317     public final void ftan(Register dest, Register value) {
 318         AMD64Address tmp = trigPrologue(value);
 319         fptan();
 320         fstp(0); // ftan pushes 1.0 in addition to the actual result, pop
 321         trigEpilogue(dest, tmp);
 322     }
 323 
 324     public final void fpop() {
 325         ffree(0);
 326         fincstp();
 327     }
 328 
 329     private AMD64Address trigPrologue(Register value) {
 330         assert value.getRegisterCategory().equals(AMD64.XMM);
 331         AMD64Address tmp = new AMD64Address(AMD64.rsp);
 332         subq(AMD64.rsp, AMD64Kind.DOUBLE.getSizeInBytes());
 333         movdbl(tmp, value);
 334         fldd(tmp);
 335         return tmp;
 336     }
 337 
 338     private void trigEpilogue(Register dest, AMD64Address tmp) {
 339         assert dest.getRegisterCategory().equals(AMD64.XMM);
 340         fstpd(tmp);
 341         movdbl(dest, tmp);
 342         addq(AMD64.rsp, AMD64Kind.DOUBLE.getSizeInBytes());
 343     }
 344 
 345     // IndexOf for constant substrings with size >= 8 chars
 346     // which don't need to be loaded through stack.
 347     public void stringIndexofC8(Register str1, Register str2,
 348                     Register cnt1, Register cnt2,
 349                     int intCnt2, Register result,
 350                     Register vec, Register tmp) {
 351         // assert(UseSSE42Intrinsics, "SSE4.2 is required");
 352 
 353         // This method uses pcmpestri inxtruction with bound registers
 354         // inputs:
 355         // xmm - substring
 356         // rax - substring length (elements count)
 357         // mem - scanned string
 358         // rdx - string length (elements count)
 359         // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
 360         // outputs:
 361         // rcx - matched index in string
 362         assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
 363 
 364         Label reloadSubstr = new Label();
 365         Label scanToSubstr = new Label();
 366         Label scanSubstr = new Label();
 367         Label retFound = new Label();
 368         Label retNotFound = new Label();
 369         Label exit = new Label();
 370         Label foundSubstr = new Label();
 371         Label matchSubstrHead = new Label();
 372         Label reloadStr = new Label();
 373         Label foundCandidate = new Label();
 374 
 375         // Note, inline_string_indexOf() generates checks:
 376         // if (substr.count > string.count) return -1;
 377         // if (substr.count == 0) return 0;
 378         assert intCnt2 >= 8 : "this code isused only for cnt2 >= 8 chars";
 379 
 380         // Load substring.
 381         movdqu(vec, new AMD64Address(str2, 0));
 382         movl(cnt2, intCnt2);
 383         movq(result, str1); // string addr
 384 
 385         if (intCnt2 > 8) {
 386             jmpb(scanToSubstr);
 387 
 388             // Reload substr for rescan, this code
 389             // is executed only for large substrings (> 8 chars)
 390             bind(reloadSubstr);
 391             movdqu(vec, new AMD64Address(str2, 0));
 392             negq(cnt2); // Jumped here with negative cnt2, convert to positive
 393 
 394             bind(reloadStr);
 395             // We came here after the beginning of the substring was
 396             // matched but the rest of it was not so we need to search
 397             // again. Start from the next element after the previous match.
 398 
 399             // cnt2 is number of substring reminding elements and
 400             // cnt1 is number of string reminding elements when cmp failed.
 401             // Restored cnt1 = cnt1 - cnt2 + int_cnt2
 402             subl(cnt1, cnt2);
 403             addl(cnt1, intCnt2);
 404             movl(cnt2, intCnt2); // Now restore cnt2
 405 
 406             decrementl(cnt1, 1);     // Shift to next element
 407             cmpl(cnt1, cnt2);
 408             jccb(ConditionFlag.Negative, retNotFound);  // Left less then substring
 409 
 410             addq(result, 2);
 411 
 412         } // (int_cnt2 > 8)
 413 
 414         // Scan string for start of substr in 16-byte vectors
 415         bind(scanToSubstr);
 416         pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
 417         jccb(ConditionFlag.Below, foundCandidate);   // CF == 1
 418         subl(cnt1, 8);
 419         jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
 420         cmpl(cnt1, cnt2);
 421         jccb(ConditionFlag.Negative, retNotFound);  // Left less then substring
 422         addq(result, 16);
 423         jmpb(scanToSubstr);
 424 
 425         // Found a potential substr
 426         bind(foundCandidate);
 427         // Matched whole vector if first element matched (tmp(rcx) == 0).
 428         if (intCnt2 == 8) {
 429             jccb(ConditionFlag.Overflow, retFound);    // OF == 1
 430         } else { // int_cnt2 > 8
 431             jccb(ConditionFlag.Overflow, foundSubstr);
 432         }
 433         // After pcmpestri tmp(rcx) contains matched element index
 434         // Compute start addr of substr
 435         leaq(result, new AMD64Address(result, tmp, Scale.Times2, 0));
 436 
 437         // Make sure string is still long enough
 438         subl(cnt1, tmp);
 439         cmpl(cnt1, cnt2);
 440         if (intCnt2 == 8) {
 441             jccb(ConditionFlag.GreaterEqual, scanToSubstr);
 442         } else { // int_cnt2 > 8
 443             jccb(ConditionFlag.GreaterEqual, matchSubstrHead);
 444         }
 445         // Left less then substring.
 446 
 447         bind(retNotFound);
 448         movl(result, -1);
 449         jmpb(exit);
 450 
 451         if (intCnt2 > 8) {
 452             // This code is optimized for the case when whole substring
 453             // is matched if its head is matched.
 454             bind(matchSubstrHead);
 455             pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
 456             // Reload only string if does not match
 457             jccb(ConditionFlag.NoOverflow, reloadStr); // OF == 0
 458 
 459             Label contScanSubstr = new Label();
 460             // Compare the rest of substring (> 8 chars).
 461             bind(foundSubstr);
 462             // First 8 chars are already matched.
 463             negq(cnt2);
 464             addq(cnt2, 8);
 465 
 466             bind(scanSubstr);
 467             subl(cnt1, 8);
 468             cmpl(cnt2, -8); // Do not read beyond substring
 469             jccb(ConditionFlag.LessEqual, contScanSubstr);
 470             // Back-up strings to avoid reading beyond substring:
 471             // cnt1 = cnt1 - cnt2 + 8
 472             addl(cnt1, cnt2); // cnt2 is negative
 473             addl(cnt1, 8);
 474             movl(cnt2, 8);
 475             negq(cnt2);
 476             bind(contScanSubstr);
 477             if (intCnt2 < 1024 * 1024 * 1024) {
 478                 movdqu(vec, new AMD64Address(str2, cnt2, Scale.Times2, intCnt2 * 2));
 479                 pcmpestri(vec, new AMD64Address(result, cnt2, Scale.Times2, intCnt2 * 2), 0x0d);
 480             } else {
 481                 // calculate index in register to avoid integer overflow (int_cnt2*2)
 482                 movl(tmp, intCnt2);
 483                 addq(tmp, cnt2);
 484                 movdqu(vec, new AMD64Address(str2, tmp, Scale.Times2, 0));
 485                 pcmpestri(vec, new AMD64Address(result, tmp, Scale.Times2, 0), 0x0d);
 486             }
 487             // Need to reload strings pointers if not matched whole vector
 488             jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0
 489             addq(cnt2, 8);
 490             jcc(ConditionFlag.Negative, scanSubstr);
 491             // Fall through if found full substring
 492 
 493         } // (int_cnt2 > 8)
 494 
 495         bind(retFound);
 496         // Found result if we matched full small substring.
 497         // Compute substr offset
 498         subq(result, str1);
 499         shrl(result, 1); // index
 500         bind(exit);
 501 
 502     } // string_indexofC8
 503 
 504     // Small strings are loaded through stack if they cross page boundary.
 505     public void stringIndexOf(Register str1, Register str2,
 506                     Register cnt1, Register cnt2,
 507                     int intCnt2, Register result,
 508                     Register vec, Register tmp, int vmPageSize) {
 509         //
 510         // int_cnt2 is length of small (< 8 chars) constant substring
 511         // or (-1) for non constant substring in which case its length
 512         // is in cnt2 register.
 513         //
 514         // Note, inline_string_indexOf() generates checks:
 515         // if (substr.count > string.count) return -1;
 516         // if (substr.count == 0) return 0;
 517         //
 518         assert intCnt2 == -1 || (0 < intCnt2 && intCnt2 < 8) : "should be != 0";
 519 
 520         // This method uses pcmpestri instruction with bound registers
 521         // inputs:
 522         // xmm - substring
 523         // rax - substring length (elements count)
 524         // mem - scanned string
 525         // rdx - string length (elements count)
 526         // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
 527         // outputs:
 528         // rcx - matched index in string
 529         assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
 530 
 531         Label reloadSubstr = new Label();
 532         Label scanToSubstr = new Label();
 533         Label scanSubstr = new Label();
 534         Label adjustStr = new Label();
 535         Label retFound = new Label();
 536         Label retNotFound = new Label();
 537         Label cleanup = new Label();
 538         Label foundSubstr = new Label();
 539         Label foundCandidate = new Label();
 540 
 541         int wordSize = 8;
 542         // We don't know where these strings are located
 543         // and we can't read beyond them. Load them through stack.
 544         Label bigStrings = new Label();
 545         Label checkStr = new Label();
 546         Label copySubstr = new Label();
 547         Label copyStr = new Label();
 548 
 549         movq(tmp, rsp); // save old SP
 550 
 551         if (intCnt2 > 0) {     // small (< 8 chars) constant substring
 552             if (intCnt2 == 1) {  // One char
 553                 movzwl(result, new AMD64Address(str2, 0));
 554                 movdl(vec, result); // move 32 bits
 555             } else if (intCnt2 == 2) { // Two chars
 556                 movdl(vec, new AMD64Address(str2, 0)); // move 32 bits
 557             } else if (intCnt2 == 4) { // Four chars
 558                 movq(vec, new AMD64Address(str2, 0));  // move 64 bits
 559             } else { // cnt2 = { 3, 5, 6, 7 }
 560                 // Array header size is 12 bytes in 32-bit VM
 561                 // + 6 bytes for 3 chars == 18 bytes,
 562                 // enough space to load vec and shift.
 563                 movdqu(vec, new AMD64Address(str2, (intCnt2 * 2) - 16));
 564                 psrldq(vec, 16 - (intCnt2 * 2));
 565             }
 566         } else { // not constant substring
 567             cmpl(cnt2, 8);
 568             jccb(ConditionFlag.AboveEqual, bigStrings); // Both strings are big enough
 569 
 570             // We can read beyond string if str+16 does not cross page boundary
 571             // since heaps are aligned and mapped by pages.
 572             assert vmPageSize < 1024 * 1024 * 1024 : "default page should be small";
 573             movl(result, str2); // We need only low 32 bits
 574             andl(result, (vmPageSize - 1));
 575             cmpl(result, (vmPageSize - 16));
 576             jccb(ConditionFlag.BelowEqual, checkStr);
 577 
 578             // Move small strings to stack to allow load 16 bytes into vec.
 579             subq(rsp, 16);
 580             int stackOffset = wordSize - 2;
 581             push(cnt2);
 582 
 583             bind(copySubstr);
 584             movzwl(result, new AMD64Address(str2, cnt2, Scale.Times2, -2));
 585             movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
 586             decrementl(cnt2, 1);
 587             jccb(ConditionFlag.NotZero, copySubstr);
 588 
 589             pop(cnt2);
 590             movq(str2, rsp);  // New substring address
 591         } // non constant
 592 
 593         bind(checkStr);
 594         cmpl(cnt1, 8);
 595         jccb(ConditionFlag.AboveEqual, bigStrings);
 596 
 597         // Check cross page boundary.
 598         movl(result, str1); // We need only low 32 bits
 599         andl(result, (vmPageSize - 1));
 600         cmpl(result, (vmPageSize - 16));
 601         jccb(ConditionFlag.BelowEqual, bigStrings);
 602 
 603         subq(rsp, 16);
 604         int stackOffset = -2;
 605         if (intCnt2 < 0) { // not constant
 606             push(cnt2);
 607             stackOffset += wordSize;
 608         }
 609         movl(cnt2, cnt1);
 610 
 611         bind(copyStr);
 612         movzwl(result, new AMD64Address(str1, cnt2, Scale.Times2, -2));
 613         movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
 614         decrementl(cnt2, 1);
 615         jccb(ConditionFlag.NotZero, copyStr);
 616 
 617         if (intCnt2 < 0) { // not constant
 618             pop(cnt2);
 619         }
 620         movq(str1, rsp);  // New string address
 621 
 622         bind(bigStrings);
 623         // Load substring.
 624         if (intCnt2 < 0) { // -1
 625             movdqu(vec, new AMD64Address(str2, 0));
 626             push(cnt2);       // substr count
 627             push(str2);       // substr addr
 628             push(str1);       // string addr
 629         } else {
 630             // Small (< 8 chars) constant substrings are loaded already.
 631             movl(cnt2, intCnt2);
 632         }
 633         push(tmp);  // original SP
 634         // Finished loading
 635 
 636         // ========================================================
 637         // Start search
 638         //
 639 
 640         movq(result, str1); // string addr
 641 
 642         if (intCnt2 < 0) {  // Only for non constant substring
 643             jmpb(scanToSubstr);
 644 
 645             // SP saved at sp+0
 646             // String saved at sp+1*wordSize
 647             // Substr saved at sp+2*wordSize
 648             // Substr count saved at sp+3*wordSize
 649 
 650             // Reload substr for rescan, this code
 651             // is executed only for large substrings (> 8 chars)
 652             bind(reloadSubstr);
 653             movq(str2, new AMD64Address(rsp, 2 * wordSize));
 654             movl(cnt2, new AMD64Address(rsp, 3 * wordSize));
 655             movdqu(vec, new AMD64Address(str2, 0));
 656             // We came here after the beginning of the substring was
 657             // matched but the rest of it was not so we need to search
 658             // again. Start from the next element after the previous match.
 659             subq(str1, result); // Restore counter
 660             shrl(str1, 1);
 661             addl(cnt1, str1);
 662             decrementl(cnt1);   // Shift to next element
 663             cmpl(cnt1, cnt2);
 664             jccb(ConditionFlag.Negative, retNotFound);  // Left less then substring
 665 
 666             addq(result, 2);
 667         } // non constant
 668 
 669         // Scan string for start of substr in 16-byte vectors
 670         bind(scanToSubstr);
 671         assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
 672         pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
 673         jccb(ConditionFlag.Below, foundCandidate);   // CF == 1
 674         subl(cnt1, 8);
 675         jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
 676         cmpl(cnt1, cnt2);
 677         jccb(ConditionFlag.Negative, retNotFound);  // Left less then substring
 678         addq(result, 16);
 679 
 680         bind(adjustStr);
 681         cmpl(cnt1, 8); // Do not read beyond string
 682         jccb(ConditionFlag.GreaterEqual, scanToSubstr);
 683         // Back-up string to avoid reading beyond string.
 684         leaq(result, new AMD64Address(result, cnt1, Scale.Times2, -16));
 685         movl(cnt1, 8);
 686         jmpb(scanToSubstr);
 687 
 688         // Found a potential substr
 689         bind(foundCandidate);
 690         // After pcmpestri tmp(rcx) contains matched element index
 691 
 692         // Make sure string is still long enough
 693         subl(cnt1, tmp);
 694         cmpl(cnt1, cnt2);
 695         jccb(ConditionFlag.GreaterEqual, foundSubstr);
 696         // Left less then substring.
 697 
 698         bind(retNotFound);
 699         movl(result, -1);
 700         jmpb(cleanup);
 701 
 702         bind(foundSubstr);
 703         // Compute start addr of substr
 704         leaq(result, new AMD64Address(result, tmp, Scale.Times2));
 705 
 706         if (intCnt2 > 0) { // Constant substring
 707             // Repeat search for small substring (< 8 chars)
 708             // from new point without reloading substring.
 709             // Have to check that we don't read beyond string.
 710             cmpl(tmp, 8 - intCnt2);
 711             jccb(ConditionFlag.Greater, adjustStr);
 712             // Fall through if matched whole substring.
 713         } else { // non constant
 714             assert intCnt2 == -1 : "should be != 0";
 715 
 716             addl(tmp, cnt2);
 717             // Found result if we matched whole substring.
 718             cmpl(tmp, 8);
 719             jccb(ConditionFlag.LessEqual, retFound);
 720 
 721             // Repeat search for small substring (<= 8 chars)
 722             // from new point 'str1' without reloading substring.
 723             cmpl(cnt2, 8);
 724             // Have to check that we don't read beyond string.
 725             jccb(ConditionFlag.LessEqual, adjustStr);
 726 
 727             Label checkNext = new Label();
 728             Label contScanSubstr = new Label();
 729             Label retFoundLong = new Label();
 730             // Compare the rest of substring (> 8 chars).
 731             movq(str1, result);
 732 
 733             cmpl(tmp, cnt2);
 734             // First 8 chars are already matched.
 735             jccb(ConditionFlag.Equal, checkNext);
 736 
 737             bind(scanSubstr);
 738             pcmpestri(vec, new AMD64Address(str1, 0), 0x0d);
 739             // Need to reload strings pointers if not matched whole vector
 740             jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0
 741 
 742             bind(checkNext);
 743             subl(cnt2, 8);
 744             jccb(ConditionFlag.LessEqual, retFoundLong); // Found full substring
 745             addq(str1, 16);
 746             addq(str2, 16);
 747             subl(cnt1, 8);
 748             cmpl(cnt2, 8); // Do not read beyond substring
 749             jccb(ConditionFlag.GreaterEqual, contScanSubstr);
 750             // Back-up strings to avoid reading beyond substring.
 751             leaq(str2, new AMD64Address(str2, cnt2, Scale.Times2, -16));
 752             leaq(str1, new AMD64Address(str1, cnt2, Scale.Times2, -16));
 753             subl(cnt1, cnt2);
 754             movl(cnt2, 8);
 755             addl(cnt1, 8);
 756             bind(contScanSubstr);
 757             movdqu(vec, new AMD64Address(str2, 0));
 758             jmpb(scanSubstr);
 759 
 760             bind(retFoundLong);
 761             movq(str1, new AMD64Address(rsp, wordSize));
 762         } // non constant
 763 
 764         bind(retFound);
 765         // Compute substr offset
 766         subq(result, str1);
 767         shrl(result, 1); // index
 768 
 769         bind(cleanup);
 770         pop(rsp); // restore SP
 771 
 772     }
 773 
 774 }