1 /*
   2  * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 package org.graalvm.compiler.asm.amd64;
  24 
  25 import static jdk.vm.ci.amd64.AMD64.rax;
  26 import static jdk.vm.ci.amd64.AMD64.rcx;
  27 import static jdk.vm.ci.amd64.AMD64.rdx;
  28 import static jdk.vm.ci.amd64.AMD64.rsp;
  29 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseIncDec;
  30 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmLoadAndClearUpper;
  31 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmRegToRegMoveAll;
  32 
  33 import org.graalvm.compiler.asm.Label;
  34 import org.graalvm.compiler.core.common.NumUtil;
  35 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  36 
  37 import jdk.vm.ci.amd64.AMD64;
  38 import jdk.vm.ci.amd64.AMD64Kind;
  39 import jdk.vm.ci.code.Register;
  40 import jdk.vm.ci.code.TargetDescription;
  41 
  42 /**
  43  * This class implements commonly used X86 code patterns.
  44  */
  45 public class AMD64MacroAssembler extends AMD64Assembler {
  46 
    /**
     * Creates a macro assembler for the given target by delegating to the {@link AMD64Assembler}
     * constructor.
     *
     * @param target the target description forwarded unchanged to the superclass
     */
    public AMD64MacroAssembler(TargetDescription target) {
        super(target);
    }
  50 
  51     public final void decrementq(Register reg, int value) {
  52         if (value == Integer.MIN_VALUE) {
  53             subq(reg, value);
  54             return;
  55         }
  56         if (value < 0) {
  57             incrementq(reg, -value);
  58             return;
  59         }
  60         if (value == 0) {
  61             return;
  62         }
  63         if (value == 1 && UseIncDec) {
  64             decq(reg);
  65         } else {
  66             subq(reg, value);
  67         }
  68     }
  69 
  70     public final void decrementq(AMD64Address dst, int value) {
  71         if (value == Integer.MIN_VALUE) {
  72             subq(dst, value);
  73             return;
  74         }
  75         if (value < 0) {
  76             incrementq(dst, -value);
  77             return;
  78         }
  79         if (value == 0) {
  80             return;
  81         }
  82         if (value == 1 && UseIncDec) {
  83             decq(dst);
  84         } else {
  85             subq(dst, value);
  86         }
  87     }
  88 
  89     public void incrementq(Register reg, int value) {
  90         if (value == Integer.MIN_VALUE) {
  91             addq(reg, value);
  92             return;
  93         }
  94         if (value < 0) {
  95             decrementq(reg, -value);
  96             return;
  97         }
  98         if (value == 0) {
  99             return;
 100         }
 101         if (value == 1 && UseIncDec) {
 102             incq(reg);
 103         } else {
 104             addq(reg, value);
 105         }
 106     }
 107 
 108     public final void incrementq(AMD64Address dst, int value) {
 109         if (value == Integer.MIN_VALUE) {
 110             addq(dst, value);
 111             return;
 112         }
 113         if (value < 0) {
 114             decrementq(dst, -value);
 115             return;
 116         }
 117         if (value == 0) {
 118             return;
 119         }
 120         if (value == 1 && UseIncDec) {
 121             incq(dst);
 122         } else {
 123             addq(dst, value);
 124         }
 125     }
 126 
    /**
     * Pointer-sized load: emits {@code movq} from the memory operand {@code src} into {@code dst}.
     *
     * @param dst the destination register
     * @param src the memory operand to read
     */
    public final void movptr(Register dst, AMD64Address src) {
        movq(dst, src);
    }
 130 
    /**
     * Pointer-sized store: emits {@code movq} from the register {@code src} to the memory operand
     * {@code dst}.
     *
     * @param dst the memory operand to write
     * @param src the source register
     */
    public final void movptr(AMD64Address dst, Register src) {
        movq(dst, src);
    }
 134 
    /**
     * Stores the {@code int} constant {@code src} to the memory operand {@code dst} by emitting
     * {@code movslq}.
     *
     * @param dst the memory operand to write
     * @param src the constant to store
     */
    public final void movptr(AMD64Address dst, int src) {
        // NOTE(review): presumably movslq sign-extends the 32-bit immediate to 64 bits — confirm in
        // AMD64Assembler.
        movslq(dst, src);
    }
 138 
    /**
     * Pointer-sized compare of two registers: emits {@code cmpq(src1, src2)}.
     *
     * @param src1 the first operand
     * @param src2 the second operand
     */
    public final void cmpptr(Register src1, Register src2) {
        cmpq(src1, src2);
    }
 142 
    /**
     * Pointer-sized compare of a register with a memory operand: emits {@code cmpq(src1, src2)}.
     *
     * @param src1 the register operand
     * @param src2 the memory operand
     */
    public final void cmpptr(Register src1, AMD64Address src2) {
        cmpq(src1, src2);
    }
 146 
    /**
     * Decrements {@code reg} by one; shorthand for {@code decrementl(reg, 1)}.
     *
     * @param reg the register to decrement
     */
    public final void decrementl(Register reg) {
        decrementl(reg, 1);
    }
 150 
 151     public final void decrementl(Register reg, int value) {
 152         if (value == Integer.MIN_VALUE) {
 153             subl(reg, value);
 154             return;
 155         }
 156         if (value < 0) {
 157             incrementl(reg, -value);
 158             return;
 159         }
 160         if (value == 0) {
 161             return;
 162         }
 163         if (value == 1 && UseIncDec) {
 164             decl(reg);
 165         } else {
 166             subl(reg, value);
 167         }
 168     }
 169 
 170     public final void decrementl(AMD64Address dst, int value) {
 171         if (value == Integer.MIN_VALUE) {
 172             subl(dst, value);
 173             return;
 174         }
 175         if (value < 0) {
 176             incrementl(dst, -value);
 177             return;
 178         }
 179         if (value == 0) {
 180             return;
 181         }
 182         if (value == 1 && UseIncDec) {
 183             decl(dst);
 184         } else {
 185             subl(dst, value);
 186         }
 187     }
 188 
 189     public final void incrementl(Register reg, int value) {
 190         if (value == Integer.MIN_VALUE) {
 191             addl(reg, value);
 192             return;
 193         }
 194         if (value < 0) {
 195             decrementl(reg, -value);
 196             return;
 197         }
 198         if (value == 0) {
 199             return;
 200         }
 201         if (value == 1 && UseIncDec) {
 202             incl(reg);
 203         } else {
 204             addl(reg, value);
 205         }
 206     }
 207 
 208     public final void incrementl(AMD64Address dst, int value) {
 209         if (value == Integer.MIN_VALUE) {
 210             addl(dst, value);
 211             return;
 212         }
 213         if (value < 0) {
 214             decrementl(dst, -value);
 215             return;
 216         }
 217         if (value == 0) {
 218             return;
 219         }
 220         if (value == 1 && UseIncDec) {
 221             incl(dst);
 222         } else {
 223             addl(dst, value);
 224         }
 225     }
 226 
 227     public void movflt(Register dst, Register src) {
 228         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
 229         if (UseXmmRegToRegMoveAll) {
 230             movaps(dst, src);
 231         } else {
 232             movss(dst, src);
 233         }
 234     }
 235 
    /**
     * Loads from the memory operand {@code src} into the XMM register {@code dst} by emitting
     * {@code movss}.
     *
     * @param dst the destination register; asserted to be in the XMM category
     * @param src the memory operand to read
     */
    public void movflt(Register dst, AMD64Address src) {
        assert dst.getRegisterCategory().equals(AMD64.XMM);
        movss(dst, src);
    }
 240 
    /**
     * Stores the XMM register {@code src} to the memory operand {@code dst} by emitting
     * {@code movss}.
     *
     * @param dst the memory operand to write
     * @param src the source register; asserted to be in the XMM category
     */
    public void movflt(AMD64Address dst, Register src) {
        assert src.getRegisterCategory().equals(AMD64.XMM);
        movss(dst, src);
    }
 245 
 246     public void movdbl(Register dst, Register src) {
 247         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
 248         if (UseXmmRegToRegMoveAll) {
 249             movapd(dst, src);
 250         } else {
 251             movsd(dst, src);
 252         }
 253     }
 254 
 255     public void movdbl(Register dst, AMD64Address src) {
 256         assert dst.getRegisterCategory().equals(AMD64.XMM);
 257         if (UseXmmLoadAndClearUpper) {
 258             movsd(dst, src);
 259         } else {
 260             movlpd(dst, src);
 261         }
 262     }
 263 
    /**
     * Stores the XMM register {@code src} to the memory operand {@code dst} by emitting
     * {@code movsd}.
     *
     * @param dst the memory operand to write
     * @param src the source register; asserted to be in the XMM category
     */
    public void movdbl(AMD64Address dst, Register src) {
        assert src.getRegisterCategory().equals(AMD64.XMM);
        movsd(dst, src);
    }
 268 
 269     /**
 270      * Non-atomic write of a 64-bit constant to memory. Do not use if the address might be a
 271      * volatile field!
 272      */
 273     public final void movlong(AMD64Address dst, long src) {
 274         if (NumUtil.isInt(src)) {
 275             AMD64MIOp.MOV.emit(this, OperandSize.QWORD, dst, (int) src);
 276         } else {
 277             AMD64Address high = new AMD64Address(dst.getBase(), dst.getIndex(), dst.getScale(), dst.getDisplacement() + 4);
 278             movl(dst, (int) (src & 0xFFFFFFFF));
 279             movl(high, (int) (src >> 32));
 280         }
 281 
 282     }
 283 
 284     public final void flog(Register dest, Register value, boolean base10) {
 285         if (base10) {
 286             fldlg2();
 287         } else {
 288             fldln2();
 289         }
 290         AMD64Address tmp = trigPrologue(value);
 291         fyl2x();
 292         trigEpilogue(dest, tmp);
 293     }
 294 
 295     public final void fsin(Register dest, Register value) {
 296         AMD64Address tmp = trigPrologue(value);
 297         fsin();
 298         trigEpilogue(dest, tmp);
 299     }
 300 
 301     public final void fcos(Register dest, Register value) {
 302         AMD64Address tmp = trigPrologue(value);
 303         fcos();
 304         trigEpilogue(dest, tmp);
 305     }
 306 
 307     public final void ftan(Register dest, Register value) {
 308         AMD64Address tmp = trigPrologue(value);
 309         fptan();
 310         fstp(0); // ftan pushes 1.0 in addition to the actual result, pop
 311         trigEpilogue(dest, tmp);
 312     }
 313 
    /**
     * Emits {@code ffree(0)} followed by {@code fincstp()}.
     */
    public final void fpop() {
        // NOTE(review): presumably this discards the top x87 stack entry without storing it —
        // confirm against the ffree/fincstp encodings in AMD64Assembler.
        ffree(0);
        fincstp();
    }
 318 
 319     private AMD64Address trigPrologue(Register value) {
 320         assert value.getRegisterCategory().equals(AMD64.XMM);
 321         AMD64Address tmp = new AMD64Address(AMD64.rsp);
 322         subq(AMD64.rsp, AMD64Kind.DOUBLE.getSizeInBytes());
 323         movdbl(tmp, value);
 324         fldd(tmp);
 325         return tmp;
 326     }
 327 
 328     private void trigEpilogue(Register dest, AMD64Address tmp) {
 329         assert dest.getRegisterCategory().equals(AMD64.XMM);
 330         fstpd(tmp);
 331         movdbl(dest, tmp);
 332         addq(AMD64.rsp, AMD64Kind.DOUBLE.getSizeInBytes());
 333     }
 334 
    /**
     * Emits an {@code indexOf} search for a constant substring of at least 8 chars, which does not
     * need to be loaded through the stack. The search is built around {@code pcmpestri}, which
     * binds several operands to fixed registers (asserted below).
     *
     * @param str1 register holding the address of the string to scan; it is copied into
     *            {@code result}, which serves as the scan cursor
     * @param str2 register holding the address of the substring
     * @param cnt1 string length in elements; must be {@code rdx}; modified during the search
     * @param cnt2 substring length in elements; must be {@code rax}; loaded from {@code intCnt2}
     *            and modified during the search
     * @param intCnt2 the constant substring length; asserted to be at least 8
     * @param result receives the element index of the match, or {@code -1} when nothing matches
     * @param vec vector register that holds the current 16-byte piece of the substring
     * @param tmp must be {@code rcx}; holds the matched element index after {@code pcmpestri}
     */
    public void stringIndexofC8(Register str1, Register str2,
                    Register cnt1, Register cnt2,
                    int intCnt2, Register result,
                    Register vec, Register tmp) {
        // assert(UseSSE42Intrinsics, "SSE4.2 is required");

        // This method uses pcmpestri instruction with bound registers
        // inputs:
        // xmm - substring
        // rax - substring length (elements count)
        // mem - scanned string
        // rdx - string length (elements count)
        // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
        // outputs:
        // rcx - matched index in string
        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";

        Label reloadSubstr = new Label();
        Label scanToSubstr = new Label();
        Label scanSubstr = new Label();
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label exit = new Label();
        Label foundSubstr = new Label();
        Label matchSubstrHead = new Label();
        Label reloadStr = new Label();
        Label foundCandidate = new Label();

        // Note, inline_string_indexOf() generates checks:
        // if (substr.count > string.count) return -1;
        // if (substr.count == 0) return 0;
        assert intCnt2 >= 8 : "this code isused only for cnt2 >= 8 chars";

        // Load substring.
        movdqu(vec, new AMD64Address(str2, 0));
        movl(cnt2, intCnt2);
        movq(result, str1); // string addr

        if (intCnt2 > 8) {
            // Skip over the rescan entry points below on the first pass.
            jmpb(scanToSubstr);

            // Reload substr for rescan, this code
            // is executed only for large substrings (> 8 chars)
            bind(reloadSubstr);
            movdqu(vec, new AMD64Address(str2, 0));
            negq(cnt2); // Jumped here with negative cnt2, convert to positive

            bind(reloadStr);
            // We came here after the beginning of the substring was
            // matched but the rest of it was not so we need to search
            // again. Start from the next element after the previous match.

            // cnt2 is number of substring remaining elements and
            // cnt1 is number of string remaining elements when cmp failed.
            // Restored cnt1 = cnt1 - cnt2 + int_cnt2
            subl(cnt1, cnt2);
            addl(cnt1, intCnt2);
            movl(cnt2, intCnt2); // Now restore cnt2

            decrementl(cnt1, 1);     // Shift to next element
            cmpl(cnt1, cnt2);
            jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring

            addq(result, 2); // advance the cursor by one 2-byte element

        } // (int_cnt2 > 8)

        // Scan string for start of substr in 16-byte vectors
        bind(scanToSubstr);
        pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
        jccb(ConditionFlag.Below, foundCandidate);   // CF == 1
        subl(cnt1, 8);
        jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
        cmpl(cnt1, cnt2);
        jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring
        addq(result, 16);
        jmpb(scanToSubstr);

        // Found a potential substr
        bind(foundCandidate);
        // Matched whole vector if first element matched (tmp(rcx) == 0).
        if (intCnt2 == 8) {
            jccb(ConditionFlag.Overflow, retFound);    // OF == 1
        } else { // int_cnt2 > 8
            jccb(ConditionFlag.Overflow, foundSubstr);
        }
        // After pcmpestri tmp(rcx) contains matched element index
        // Compute start addr of substr
        leaq(result, new AMD64Address(result, tmp, Scale.Times2, 0));

        // Make sure string is still long enough
        subl(cnt1, tmp);
        cmpl(cnt1, cnt2);
        if (intCnt2 == 8) {
            jccb(ConditionFlag.GreaterEqual, scanToSubstr);
        } else { // int_cnt2 > 8
            jccb(ConditionFlag.GreaterEqual, matchSubstrHead);
        }
        // Left less than substring.

        bind(retNotFound);
        movl(result, -1);
        jmpb(exit);

        if (intCnt2 > 8) {
            // This code is optimized for the case when whole substring
            // is matched if its head is matched.
            bind(matchSubstrHead);
            pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
            // Reload only string if does not match
            jccb(ConditionFlag.NoOverflow, reloadStr); // OF == 0

            Label contScanSubstr = new Label();
            // Compare the rest of substring (> 8 chars).
            bind(foundSubstr);
            // First 8 chars are already matched.
            // From here on cnt2 is kept negative and counts up towards zero.
            negq(cnt2);
            addq(cnt2, 8);

            bind(scanSubstr);
            subl(cnt1, 8);
            cmpl(cnt2, -8); // Do not read beyond substring
            jccb(ConditionFlag.LessEqual, contScanSubstr);
            // Back-up strings to avoid reading beyond substring:
            // cnt1 = cnt1 - cnt2 + 8
            addl(cnt1, cnt2); // cnt2 is negative
            addl(cnt1, 8);
            movl(cnt2, 8);
            negq(cnt2);
            bind(contScanSubstr);
            if (intCnt2 < 1024 * 1024 * 1024) {
                // intCnt2 * 2 fits into the displacement here, so it is folded into the address.
                movdqu(vec, new AMD64Address(str2, cnt2, Scale.Times2, intCnt2 * 2));
                pcmpestri(vec, new AMD64Address(result, cnt2, Scale.Times2, intCnt2 * 2), 0x0d);
            } else {
                // calculate index in register to avoid integer overflow (int_cnt2*2)
                movl(tmp, intCnt2);
                addq(tmp, cnt2);
                movdqu(vec, new AMD64Address(str2, tmp, Scale.Times2, 0));
                pcmpestri(vec, new AMD64Address(result, tmp, Scale.Times2, 0), 0x0d);
            }
            // Need to reload strings pointers if not matched whole vector
            jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0
            addq(cnt2, 8);
            jcc(ConditionFlag.Negative, scanSubstr);
            // Fall through if found full substring

        } // (int_cnt2 > 8)

        bind(retFound);
        // Found result if we matched full small substring.
        // Compute substr offset
        subq(result, str1);
        shrl(result, 1); // index
        bind(exit);

    } // string_indexofC8
 493 
    /**
     * Emits an {@code indexOf} search for either a small constant substring (fewer than 8 chars)
     * or a non-constant substring whose length is in {@code cnt2}. Small strings are loaded through
     * the stack if they cross a page boundary. The search is built around {@code pcmpestri}, which
     * binds several operands to fixed registers (asserted below).
     * <p>
     * The incoming stack pointer is saved in {@code tmp} on entry, pushed once loading is finished,
     * and restored with {@code pop(rsp)} at the end, which also releases any stack copies made in
     * between.
     *
     * @param str1 register holding the address of the string to scan; may be overwritten
     * @param str2 register holding the address of the substring; may be overwritten
     * @param cnt1 string length in elements; must be {@code rdx}; modified during the search
     * @param cnt2 substring length in elements; must be {@code rax}; modified during the search
     * @param intCnt2 length of the constant substring (1 to 7), or {@code -1} when the substring
     *            is not constant and its length is in {@code cnt2}
     * @param result receives the element index of the match, or {@code -1} when nothing matches;
     *            also used as scratch while loading
     * @param vec vector register that holds the current 16-byte piece of the substring
     * @param tmp must be {@code rcx}; holds the saved stack pointer until it is pushed and the
     *            matched element index after {@code pcmpestri}
     * @param vmPageSize the page size used for the page-crossing checks; asserted to be below 1 GiB
     *            on the non-constant path
     */
    public void stringIndexOf(Register str1, Register str2,
                    Register cnt1, Register cnt2,
                    int intCnt2, Register result,
                    Register vec, Register tmp, int vmPageSize) {
        //
        // int_cnt2 is length of small (< 8 chars) constant substring
        // or (-1) for non constant substring in which case its length
        // is in cnt2 register.
        //
        // Note, inline_string_indexOf() generates checks:
        // if (substr.count > string.count) return -1;
        // if (substr.count == 0) return 0;
        //
        assert intCnt2 == -1 || (0 < intCnt2 && intCnt2 < 8) : "should be != 0";

        // This method uses pcmpestri instruction with bound registers
        // inputs:
        // xmm - substring
        // rax - substring length (elements count)
        // mem - scanned string
        // rdx - string length (elements count)
        // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
        // outputs:
        // rcx - matched index in string
        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";

        Label reloadSubstr = new Label();
        Label scanToSubstr = new Label();
        Label scanSubstr = new Label();
        Label adjustStr = new Label();
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label cleanup = new Label();
        Label foundSubstr = new Label();
        Label foundCandidate = new Label();

        int wordSize = 8;
        // We don't know where these strings are located
        // and we can't read beyond them. Load them through stack.
        Label bigStrings = new Label();
        Label checkStr = new Label();
        Label copySubstr = new Label();
        Label copyStr = new Label();

        movq(tmp, rsp); // save old SP

        if (intCnt2 > 0) {     // small (< 8 chars) constant substring
            if (intCnt2 == 1) {  // One char
                movzwl(result, new AMD64Address(str2, 0));
                movdl(vec, result); // move 32 bits
            } else if (intCnt2 == 2) { // Two chars
                movdl(vec, new AMD64Address(str2, 0)); // move 32 bits
            } else if (intCnt2 == 4) { // Four chars
                movq(vec, new AMD64Address(str2, 0));  // move 64 bits
            } else { // cnt2 = { 3, 5, 6, 7 }
                // Array header size is 12 bytes in 32-bit VM
                // + 6 bytes for 3 chars == 18 bytes,
                // enough space to load vec and shift.
                // The 16-byte load ends exactly at the end of the substring; the byte shift
                // then moves the substring down to the low end of vec.
                movdqu(vec, new AMD64Address(str2, (intCnt2 * 2) - 16));
                psrldq(vec, 16 - (intCnt2 * 2));
            }
        } else { // not constant substring
            cmpl(cnt2, 8);
            jccb(ConditionFlag.AboveEqual, bigStrings); // Both strings are big enough

            // We can read beyond string if str+16 does not cross page boundary
            // since heaps are aligned and mapped by pages.
            assert vmPageSize < 1024 * 1024 * 1024 : "default page should be small";
            movl(result, str2); // We need only low 32 bits
            andl(result, (vmPageSize - 1));
            cmpl(result, (vmPageSize - 16));
            jccb(ConditionFlag.BelowEqual, checkStr);

            // Move small strings to stack to allow load 16 bytes into vec.
            subq(rsp, 16);
            // The copy is indexed by the 1-based cnt2 (hence -2) and sits above the cnt2 word
            // pushed next (hence + wordSize).
            int stackOffset = wordSize - 2;
            push(cnt2);

            bind(copySubstr);
            movzwl(result, new AMD64Address(str2, cnt2, Scale.Times2, -2));
            movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
            decrementl(cnt2, 1);
            jccb(ConditionFlag.NotZero, copySubstr);

            pop(cnt2);
            movq(str2, rsp);  // New substring address
        } // non constant

        bind(checkStr);
        cmpl(cnt1, 8);
        jccb(ConditionFlag.AboveEqual, bigStrings);

        // Check cross page boundary.
        movl(result, str1); // We need only low 32 bits
        andl(result, (vmPageSize - 1));
        cmpl(result, (vmPageSize - 16));
        jccb(ConditionFlag.BelowEqual, bigStrings);

        subq(rsp, 16);
        int stackOffset = -2;
        if (intCnt2 < 0) { // not constant
            // cnt2 is borrowed as the copy-loop counter below, so preserve it on the stack.
            push(cnt2);
            stackOffset += wordSize;
        }
        movl(cnt2, cnt1);

        bind(copyStr);
        movzwl(result, new AMD64Address(str1, cnt2, Scale.Times2, -2));
        movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
        decrementl(cnt2, 1);
        jccb(ConditionFlag.NotZero, copyStr);

        if (intCnt2 < 0) { // not constant
            pop(cnt2);
        }
        movq(str1, rsp);  // New string address

        bind(bigStrings);
        // Load substring.
        if (intCnt2 < 0) { // -1
            movdqu(vec, new AMD64Address(str2, 0));
            push(cnt2);       // substr count
            push(str2);       // substr addr
            push(str1);       // string addr
        } else {
            // Small (< 8 chars) constant substrings are loaded already.
            movl(cnt2, intCnt2);
        }
        push(tmp);  // original SP
        // Finished loading

        // ========================================================
        // Start search
        //

        movq(result, str1); // string addr

        if (intCnt2 < 0) {  // Only for non constant substring
            // Skip over the rescan entry point below on the first pass.
            jmpb(scanToSubstr);

            // SP saved at sp+0
            // String saved at sp+1*wordSize
            // Substr saved at sp+2*wordSize
            // Substr count saved at sp+3*wordSize

            // Reload substr for rescan, this code
            // is executed only for large substrings (> 8 chars)
            bind(reloadSubstr);
            movq(str2, new AMD64Address(rsp, 2 * wordSize));
            movl(cnt2, new AMD64Address(rsp, 3 * wordSize));
            movdqu(vec, new AMD64Address(str2, 0));
            // We came here after the beginning of the substring was
            // matched but the rest of it was not so we need to search
            // again. Start from the next element after the previous match.
            subq(str1, result); // Restore counter
            shrl(str1, 1);
            addl(cnt1, str1);
            decrementl(cnt1);   // Shift to next element
            cmpl(cnt1, cnt2);
            jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring

            addq(result, 2); // advance the cursor by one 2-byte element
        } // non constant

        // Scan string for start of substr in 16-byte vectors
        bind(scanToSubstr);
        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
        pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
        jccb(ConditionFlag.Below, foundCandidate);   // CF == 1
        subl(cnt1, 8);
        jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
        cmpl(cnt1, cnt2);
        jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring
        addq(result, 16);

        bind(adjustStr);
        cmpl(cnt1, 8); // Do not read beyond string
        jccb(ConditionFlag.GreaterEqual, scanToSubstr);
        // Back-up string to avoid reading beyond string.
        leaq(result, new AMD64Address(result, cnt1, Scale.Times2, -16));
        movl(cnt1, 8);
        jmpb(scanToSubstr);

        // Found a potential substr
        bind(foundCandidate);
        // After pcmpestri tmp(rcx) contains matched element index

        // Make sure string is still long enough
        subl(cnt1, tmp);
        cmpl(cnt1, cnt2);
        jccb(ConditionFlag.GreaterEqual, foundSubstr);
        // Left less than substring.

        bind(retNotFound);
        movl(result, -1);
        jmpb(cleanup);

        bind(foundSubstr);
        // Compute start addr of substr
        leaq(result, new AMD64Address(result, tmp, Scale.Times2));

        if (intCnt2 > 0) { // Constant substring
            // Repeat search for small substring (< 8 chars)
            // from new point without reloading substring.
            // Have to check that we don't read beyond string.
            cmpl(tmp, 8 - intCnt2);
            jccb(ConditionFlag.Greater, adjustStr);
            // Fall through if matched whole substring.
        } else { // non constant
            assert intCnt2 == -1 : "should be != 0";

            addl(tmp, cnt2);
            // Found result if we matched whole substring.
            cmpl(tmp, 8);
            jccb(ConditionFlag.LessEqual, retFound);

            // Repeat search for small substring (<= 8 chars)
            // from new point 'str1' without reloading substring.
            cmpl(cnt2, 8);
            // Have to check that we don't read beyond string.
            jccb(ConditionFlag.LessEqual, adjustStr);

            Label checkNext = new Label();
            Label contScanSubstr = new Label();
            Label retFoundLong = new Label();
            // Compare the rest of substring (> 8 chars).
            movq(str1, result);

            cmpl(tmp, cnt2);
            // First 8 chars are already matched.
            jccb(ConditionFlag.Equal, checkNext);

            bind(scanSubstr);
            pcmpestri(vec, new AMD64Address(str1, 0), 0x0d);
            // Need to reload strings pointers if not matched whole vector
            jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0

            bind(checkNext);
            subl(cnt2, 8);
            jccb(ConditionFlag.LessEqual, retFoundLong); // Found full substring
            addq(str1, 16);
            addq(str2, 16);
            subl(cnt1, 8);
            cmpl(cnt2, 8); // Do not read beyond substring
            jccb(ConditionFlag.GreaterEqual, contScanSubstr);
            // Back-up strings to avoid reading beyond substring.
            leaq(str2, new AMD64Address(str2, cnt2, Scale.Times2, -16));
            leaq(str1, new AMD64Address(str1, cnt2, Scale.Times2, -16));
            subl(cnt1, cnt2);
            movl(cnt2, 8);
            addl(cnt1, 8);
            bind(contScanSubstr);
            movdqu(vec, new AMD64Address(str2, 0));
            jmpb(scanSubstr);

            bind(retFoundLong);
            // str1 was used as a cursor above; reload the string address saved at sp+1*wordSize.
            movq(str1, new AMD64Address(rsp, wordSize));
        } // non constant

        bind(retFound);
        // Compute substr offset
        subq(result, str1);
        shrl(result, 1); // index

        bind(cleanup);
        pop(rsp); // restore SP

    }
 763 
 764 }