1 /*
   2  * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.lir.amd64;
  26 
  27 import static jdk.vm.ci.amd64.AMD64.k7;
  28 import static jdk.vm.ci.amd64.AMD64.rax;
  29 import static jdk.vm.ci.amd64.AMD64.rcx;
  30 import static jdk.vm.ci.amd64.AMD64.rdx;
  31 import static jdk.vm.ci.code.ValueUtil.asRegister;
  32 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
  33 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
  34 
  35 import java.util.EnumSet;
  36 
  37 import org.graalvm.compiler.asm.Label;
  38 import org.graalvm.compiler.asm.amd64.AMD64Address;
  39 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  40 import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
  41 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
  42 import org.graalvm.compiler.core.common.LIRKind;
  43 import org.graalvm.compiler.lir.LIRInstructionClass;
  44 import org.graalvm.compiler.lir.Opcode;
  45 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
  46 import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
  47 
  48 import jdk.vm.ci.amd64.AMD64;
  49 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  50 import jdk.vm.ci.amd64.AMD64Kind;
  51 import jdk.vm.ci.code.Register;
  52 import jdk.vm.ci.code.TargetDescription;
  53 import jdk.vm.ci.meta.JavaKind;
  54 import jdk.vm.ci.meta.Value;
  55 
  56 /**
  57  * Emits code which compares two arrays lexicographically. If the CPU supports any vector
  58  * instructions specialized code is emitted to leverage these instructions.
  59  */
  60 @Opcode("ARRAY_COMPARE_TO")
  61 public final class AMD64ArrayCompareToOp extends AMD64LIRInstruction {
  62     public static final LIRInstructionClass<AMD64ArrayCompareToOp> TYPE = LIRInstructionClass.create(AMD64ArrayCompareToOp.class);
  63 
  64     private final JavaKind kind1;
  65     private final JavaKind kind2;
  66     private final int array1BaseOffset;
  67     private final int array2BaseOffset;
  68 
  69     @Def({REG}) protected Value resultValue;
  70     @Alive({REG}) protected Value array1Value;
  71     @Alive({REG}) protected Value array2Value;
  72     @Use({REG}) protected Value length1Value;
  73     @Use({REG}) protected Value length2Value;
  74     @Temp({REG}) protected Value length1ValueTemp;
  75     @Temp({REG}) protected Value length2ValueTemp;
  76     @Temp({REG}) protected Value temp1;
  77     @Temp({REG}) protected Value temp2;
  78 
  79     @Temp({REG, ILLEGAL}) protected Value vectorTemp1;
  80 
  81     public AMD64ArrayCompareToOp(LIRGeneratorTool tool, JavaKind kind1, JavaKind kind2, Value result, Value array1, Value array2, Value length1, Value length2) {
  82         super(TYPE);
  83         this.kind1 = kind1;
  84         this.kind2 = kind2;
  85 
  86         // Both offsets should be the same but better be safe than sorry.
  87         this.array1BaseOffset = tool.getProviders().getArrayOffsetProvider().arrayBaseOffset(kind1);
  88         this.array2BaseOffset = tool.getProviders().getArrayOffsetProvider().arrayBaseOffset(kind2);
  89 
  90         this.resultValue = result;
  91         this.array1Value = array1;
  92         this.array2Value = array2;
  93         /*
  94          * The length values are inputs but are also killed like temporaries so need both Use and
  95          * Temp annotations, which will only work with fixed registers.
  96          */
  97         this.length1Value = length1;
  98         this.length2Value = length2;
  99         this.length1ValueTemp = length1;
 100         this.length2ValueTemp = length2;
 101 
 102         // Allocate some temporaries.
 103         this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
 104         this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
 105 
 106         // We only need the vector temporaries if we generate SSE code.
 107         if (supportsSSE42(tool.target())) {
 108             this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 109         } else {
 110             this.vectorTemp1 = Value.ILLEGAL;
 111         }
 112     }
 113 
 114     private static boolean supportsSSE42(TargetDescription target) {
 115         AMD64 arch = (AMD64) target.arch;
 116         return arch.getFeatures().contains(CPUFeature.SSE4_2);
 117     }
 118 
 119     private static boolean supportsAVX2(TargetDescription target) {
 120         AMD64 arch = (AMD64) target.arch;
 121         return arch.getFeatures().contains(CPUFeature.AVX2);
 122     }
 123 
 124     private static boolean supportsAVX512VLBW(TargetDescription target) {
 125         EnumSet<CPUFeature> features = ((AMD64) target.arch).getFeatures();
 126         return features.contains(CPUFeature.AVX512BW) && features.contains(CPUFeature.AVX512VL);
 127     }
 128 
 129     @Override
 130     public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
 131         Register result = asRegister(resultValue);
 132         Register str1 = asRegister(temp1);
 133         Register str2 = asRegister(temp2);
 134 
 135         // Load array base addresses.
 136         masm.leaq(str1, new AMD64Address(asRegister(array1Value), array1BaseOffset));
 137         masm.leaq(str2, new AMD64Address(asRegister(array2Value), array2BaseOffset));
 138         Register cnt1 = asRegister(length1Value);
 139         Register cnt2 = asRegister(length2Value);
 140 
 141         // Checkstyle: stop
 142         Label LENGTH_DIFF_LABEL = new Label();
 143         Label POP_LABEL = new Label();
 144         Label DONE_LABEL = new Label();
 145         Label WHILE_HEAD_LABEL = new Label();
 146         Label COMPARE_WIDE_VECTORS_LOOP_FAILED = new Label(); // used only _LP64 && AVX3
 147         int stride, stride2;
 148         int adr_stride = -1;
 149         int adr_stride1 = -1;
 150         int adr_stride2 = -1;
 151         // Checkstyle: resume
 152         int stride2x2 = 0x40;
 153         AMD64Address.Scale scale = null;
 154         AMD64Address.Scale scale1 = null;
 155         AMD64Address.Scale scale2 = null;
 156 
 157         // if (ae != StrIntrinsicNode::LL) {
 158         if (kind1 == JavaKind.Byte && kind2 == JavaKind.Byte) {
 159             stride2x2 = 0x20;
 160         }
 161 
 162         // if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
 163         if (kind1 != kind2) {
 164             masm.shrl(cnt2, 1);
 165         }
 166         // Compute the minimum of the string lengths and the
 167         // difference of the string lengths (stack).
 168         // Do the conditional move stuff
 169         masm.movl(result, cnt1);
 170         masm.subl(cnt1, cnt2);
 171         masm.push(cnt1);
 172         masm.cmovl(ConditionFlag.LessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
 173 
 174         // Is the minimum length zero?
 175         masm.testl(cnt2, cnt2);
 176         masm.jcc(ConditionFlag.Zero, LENGTH_DIFF_LABEL);
 177         // if (ae == StrIntrinsicNode::LL) {
 178         if (kind1 == JavaKind.Byte && kind2 == JavaKind.Byte) {
 179             // Load first bytes
 180             masm.movzbl(result, new AMD64Address(str1, 0));  // result = str1[0]
 181             masm.movzbl(cnt1, new AMD64Address(str2, 0));    // cnt1 = str2[0]
 182             // } else if (ae == StrIntrinsicNode::UU) {
 183         } else if (kind1 == JavaKind.Char && kind2 == JavaKind.Char) {
 184             // Load first characters
 185             masm.movzwl(result, new AMD64Address(str1, 0));
 186             masm.movzwl(cnt1, new AMD64Address(str2, 0));
 187         } else {
 188             masm.movzbl(result, new AMD64Address(str1, 0));
 189             masm.movzwl(cnt1, new AMD64Address(str2, 0));
 190         }
 191         masm.subl(result, cnt1);
 192         masm.jcc(ConditionFlag.NotZero, POP_LABEL);
 193 
 194         // if (ae == StrIntrinsicNode::UU) {
 195         if (kind1 == JavaKind.Char && kind2 == JavaKind.Char) {
 196             // Divide length by 2 to get number of chars
 197             masm.shrl(cnt2, 1);
 198         }
 199         masm.cmpl(cnt2, 1);
 200         masm.jcc(ConditionFlag.Equal, LENGTH_DIFF_LABEL);
 201 
 202         // Check if the strings start at the same location and setup scale and stride
 203         // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 204         if (kind1 == kind2) {
 205             masm.cmpptr(str1, str2);
 206             masm.jcc(ConditionFlag.Equal, LENGTH_DIFF_LABEL);
 207             // if (ae == StrIntrinsicNode::LL) {
 208             if (kind1 == JavaKind.Byte && kind2 == JavaKind.Byte) {
 209                 scale = AMD64Address.Scale.Times1;
 210                 stride = 16;
 211             } else {
 212                 scale = AMD64Address.Scale.Times2;
 213                 stride = 8;
 214             }
 215         } else {
 216             scale1 = AMD64Address.Scale.Times1;
 217             scale2 = AMD64Address.Scale.Times2;
 218             // scale not used
 219             stride = 8;
 220         }
 221 
 222         // if (UseAVX >= 2 && UseSSE42Intrinsics) {
 223         if (supportsAVX2(crb.target) && supportsSSE42(crb.target)) {
 224             Register vec1 = asRegister(vectorTemp1, AMD64Kind.DOUBLE);
 225 
 226             // Checkstyle: stop
 227             Label COMPARE_WIDE_VECTORS = new Label();
 228             Label VECTOR_NOT_EQUAL = new Label();
 229             Label COMPARE_WIDE_TAIL = new Label();
 230             Label COMPARE_SMALL_STR = new Label();
 231             Label COMPARE_WIDE_VECTORS_LOOP = new Label();
 232             Label COMPARE_16_CHARS = new Label();
 233             Label COMPARE_INDEX_CHAR = new Label();
 234             Label COMPARE_WIDE_VECTORS_LOOP_AVX2 = new Label();
 235             Label COMPARE_TAIL_LONG = new Label();
 236             Label COMPARE_WIDE_VECTORS_LOOP_AVX3 = new Label();  // used only _LP64 && AVX3
 237             // Checkstyle: resume
 238 
 239             int pcmpmask = 0x19;
 240             // if (ae == StrIntrinsicNode::LL) {
 241             if (kind1 == JavaKind.Byte && kind2 == JavaKind.Byte) {
 242                 pcmpmask &= ~0x01;
 243             }
 244 
 245             // Setup to compare 16-chars (32-bytes) vectors,
 246             // start from first character again because it has aligned address.
 247             // if (ae == StrIntrinsicNode::LL) {
 248             if (kind1 == JavaKind.Byte && kind2 == JavaKind.Byte) {
 249                 stride2 = 32;
 250             } else {
 251                 stride2 = 16;
 252             }
 253             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 254             if (kind1 == kind2) {
 255                 adr_stride = stride << scale.log2;
 256             } else {
 257                 adr_stride1 = 8;  // stride << scale1;
 258                 adr_stride2 = 16; // stride << scale2;
 259             }
 260 
 261             assert result.equals(rax) && cnt2.equals(rdx) && cnt1.equals(rcx) : "pcmpestri";
 262             // rax and rdx are used by pcmpestri as elements counters
 263             masm.movl(result, cnt2);
 264             masm.andl(cnt2, ~(stride2 - 1));   // cnt2 holds the vector count
 265             masm.jcc(ConditionFlag.Zero, COMPARE_TAIL_LONG);
 266 
 267             // fast path : compare first 2 8-char vectors.
 268             masm.bind(COMPARE_16_CHARS);
 269             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 270             if (kind1 == kind2) {
 271                 masm.movdqu(vec1, new AMD64Address(str1, 0));
 272             } else {
 273                 masm.pmovzxbw(vec1, new AMD64Address(str1, 0));
 274             }
 275             masm.pcmpestri(vec1, new AMD64Address(str2, 0), pcmpmask);
 276             masm.jccb(ConditionFlag.Below, COMPARE_INDEX_CHAR);
 277 
 278             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 279             if (kind1 == kind2) {
 280                 masm.movdqu(vec1, new AMD64Address(str1, adr_stride));
 281                 masm.pcmpestri(vec1, new AMD64Address(str2, adr_stride), pcmpmask);
 282             } else {
 283                 masm.pmovzxbw(vec1, new AMD64Address(str1, adr_stride1));
 284                 masm.pcmpestri(vec1, new AMD64Address(str2, adr_stride2), pcmpmask);
 285             }
 286             masm.jccb(ConditionFlag.AboveEqual, COMPARE_WIDE_VECTORS);
 287             masm.addl(cnt1, stride);
 288 
 289             // Compare the characters at index in cnt1
 290             masm.bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
 291             loadNextElements(masm, result, cnt2, str1, str2, scale, scale1, scale2, cnt1);
 292             masm.subl(result, cnt2);
 293             masm.jmp(POP_LABEL);
 294 
 295             // Setup the registers to start vector comparison loop
 296             masm.bind(COMPARE_WIDE_VECTORS);
 297             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 298             if (kind1 == kind2) {
 299                 masm.leaq(str1, new AMD64Address(str1, result, scale));
 300                 masm.leaq(str2, new AMD64Address(str2, result, scale));
 301             } else {
 302                 masm.leaq(str1, new AMD64Address(str1, result, scale1));
 303                 masm.leaq(str2, new AMD64Address(str2, result, scale2));
 304             }
 305             masm.subl(result, stride2);
 306             masm.subl(cnt2, stride2);
 307             masm.jcc(ConditionFlag.Zero, COMPARE_WIDE_TAIL);
 308             masm.negq(result);
 309 
 310             // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
 311             masm.bind(COMPARE_WIDE_VECTORS_LOOP);
 312 
 313             // if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
 314             if (supportsAVX512VLBW(crb.target)) {
 315                 masm.cmpl(cnt2, stride2x2);
 316                 masm.jccb(ConditionFlag.Below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
 317                 masm.testl(cnt2, stride2x2 - 1);   // cnt2 holds the vector count
 318                 // means we cannot subtract by 0x40
 319                 masm.jccb(ConditionFlag.NotZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);
 320 
 321                 masm.bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
 322                 // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 323                 if (kind1 == kind2) {
 324                     masm.evmovdqu64(vec1, new AMD64Address(str1, result, scale));
 325                     // k7 == 11..11, if operands equal, otherwise k7 has some 0
 326                     masm.evpcmpeqb(k7, vec1, new AMD64Address(str2, result, scale));
 327                 } else {
 328                     masm.evpmovzxbw(vec1, new AMD64Address(str1, result, scale1));
 329                     // k7 == 11..11, if operands equal, otherwise k7 has some 0
 330                     masm.evpcmpeqb(k7, vec1, new AMD64Address(str2, result, scale2));
 331                 }
 332                 masm.kortestq(k7, k7);
 333                 masm.jcc(ConditionFlag.AboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
 334                 masm.addq(result, stride2x2);  // update since we already compared at this addr
 335                 masm.subl(cnt2, stride2x2);      // and sub the size too
 336                 masm.jccb(ConditionFlag.NotZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
 337 
 338                 masm.vpxor(vec1, vec1, vec1);
 339                 masm.jmpb(COMPARE_WIDE_TAIL);
 340             }
 341 
 342             masm.bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
 343             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 344             if (kind1 == kind2) {
 345                 masm.vmovdqu(vec1, new AMD64Address(str1, result, scale));
 346                 masm.vpxor(vec1, vec1, new AMD64Address(str2, result, scale));
 347             } else {
 348                 masm.vpmovzxbw(vec1, new AMD64Address(str1, result, scale1));
 349                 masm.vpxor(vec1, vec1, new AMD64Address(str2, result, scale2));
 350             }
 351             masm.vptest(vec1, vec1);
 352             masm.jcc(ConditionFlag.NotZero, VECTOR_NOT_EQUAL);
 353             masm.addq(result, stride2);
 354             masm.subl(cnt2, stride2);
 355             masm.jcc(ConditionFlag.NotZero, COMPARE_WIDE_VECTORS_LOOP);
 356             // clean upper bits of YMM registers
 357             masm.vpxor(vec1, vec1, vec1);
 358 
 359             // compare wide vectors tail
 360             masm.bind(COMPARE_WIDE_TAIL);
 361             masm.testq(result, result);
 362             masm.jcc(ConditionFlag.Zero, LENGTH_DIFF_LABEL);
 363 
 364             masm.movl(result, stride2);
 365             masm.movl(cnt2, result);
 366             masm.negq(result);
 367             masm.jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
 368 
 369             // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
 370             masm.bind(VECTOR_NOT_EQUAL);
 371             // clean upper bits of YMM registers
 372             masm.vpxor(vec1, vec1, vec1);
 373             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 374             if (kind1 == kind2) {
 375                 masm.leaq(str1, new AMD64Address(str1, result, scale));
 376                 masm.leaq(str2, new AMD64Address(str2, result, scale));
 377             } else {
 378                 masm.leaq(str1, new AMD64Address(str1, result, scale1));
 379                 masm.leaq(str2, new AMD64Address(str2, result, scale2));
 380             }
 381             masm.jmp(COMPARE_16_CHARS);
 382 
 383             // Compare tail chars, length between 1 to 15 chars
 384             masm.bind(COMPARE_TAIL_LONG);
 385             masm.movl(cnt2, result);
 386             masm.cmpl(cnt2, stride);
 387             masm.jcc(ConditionFlag.Less, COMPARE_SMALL_STR);
 388 
 389             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 390             if (kind1 == kind2) {
 391                 masm.movdqu(vec1, new AMD64Address(str1, 0));
 392             } else {
 393                 masm.pmovzxbw(vec1, new AMD64Address(str1, 0));
 394             }
 395             masm.pcmpestri(vec1, new AMD64Address(str2, 0), pcmpmask);
 396             masm.jcc(ConditionFlag.Below, COMPARE_INDEX_CHAR);
 397             masm.subq(cnt2, stride);
 398             masm.jcc(ConditionFlag.Zero, LENGTH_DIFF_LABEL);
 399             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 400             if (kind1 == kind2) {
 401                 masm.leaq(str1, new AMD64Address(str1, result, scale));
 402                 masm.leaq(str2, new AMD64Address(str2, result, scale));
 403             } else {
 404                 masm.leaq(str1, new AMD64Address(str1, result, scale1));
 405                 masm.leaq(str2, new AMD64Address(str2, result, scale2));
 406             }
 407             masm.negq(cnt2);
 408             masm.jmpb(WHILE_HEAD_LABEL);
 409 
 410             masm.bind(COMPARE_SMALL_STR);
 411         } else if (supportsSSE42(crb.target)) {
 412             Register vec1 = asRegister(vectorTemp1, AMD64Kind.DOUBLE);
 413 
 414             // Checkstyle: stop
 415             Label COMPARE_WIDE_VECTORS = new Label();
 416             Label VECTOR_NOT_EQUAL = new Label();
 417             Label COMPARE_TAIL = new Label();
 418             // Checkstyle: resume
 419             int pcmpmask = 0x19;
 420             // Setup to compare 8-char (16-byte) vectors,
 421             // start from first character again because it has aligned address.
 422             masm.movl(result, cnt2);
 423             masm.andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
 424             // if (ae == StrIntrinsicNode::LL) {
 425             if (kind1 == JavaKind.Byte && kind2 == JavaKind.Byte) {
 426                 pcmpmask &= ~0x01;
 427             }
 428             masm.jcc(ConditionFlag.Zero, COMPARE_TAIL);
 429             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 430             if (kind1 == kind2) {
 431                 masm.leaq(str1, new AMD64Address(str1, result, scale));
 432                 masm.leaq(str2, new AMD64Address(str2, result, scale));
 433             } else {
 434                 masm.leaq(str1, new AMD64Address(str1, result, scale1));
 435                 masm.leaq(str2, new AMD64Address(str2, result, scale2));
 436             }
 437             masm.negq(result);
 438 
 439             // pcmpestri
 440             // inputs:
 441             // vec1- substring
 442             // rax - negative string length (elements count)
 443             // mem - scanned string
 444             // rdx - string length (elements count)
 445             // pcmpmask - cmp mode: 11000 (string compare with negated result)
 446             // + 00 (unsigned bytes) or + 01 (unsigned shorts)
 447             // outputs:
 448             // rcx - first mismatched element index
 449             assert result.equals(rax) && cnt2.equals(rdx) && cnt1.equals(rcx) : "pcmpestri";
 450 
 451             masm.bind(COMPARE_WIDE_VECTORS);
 452             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 453             if (kind1 == kind2) {
 454                 masm.movdqu(vec1, new AMD64Address(str1, result, scale));
 455                 masm.pcmpestri(vec1, new AMD64Address(str2, result, scale), pcmpmask);
 456             } else {
 457                 masm.pmovzxbw(vec1, new AMD64Address(str1, result, scale1));
 458                 masm.pcmpestri(vec1, new AMD64Address(str2, result, scale2), pcmpmask);
 459             }
 460             // After pcmpestri cnt1(rcx) contains mismatched element index
 461 
 462             masm.jccb(ConditionFlag.Below, VECTOR_NOT_EQUAL);  // CF==1
 463             masm.addq(result, stride);
 464             masm.subq(cnt2, stride);
 465             masm.jccb(ConditionFlag.NotZero, COMPARE_WIDE_VECTORS);
 466 
 467             // compare wide vectors tail
 468             masm.testq(result, result);
 469             masm.jcc(ConditionFlag.Zero, LENGTH_DIFF_LABEL);
 470 
 471             masm.movl(cnt2, stride);
 472             masm.movl(result, stride);
 473             masm.negq(result);
 474             // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 475             if (kind1 == kind2) {
 476                 masm.movdqu(vec1, new AMD64Address(str1, result, scale));
 477                 masm.pcmpestri(vec1, new AMD64Address(str2, result, scale), pcmpmask);
 478             } else {
 479                 masm.pmovzxbw(vec1, new AMD64Address(str1, result, scale1));
 480                 masm.pcmpestri(vec1, new AMD64Address(str2, result, scale2), pcmpmask);
 481             }
 482             masm.jccb(ConditionFlag.AboveEqual, LENGTH_DIFF_LABEL);
 483 
 484             // Mismatched characters in the vectors
 485             masm.bind(VECTOR_NOT_EQUAL);
 486             masm.addq(cnt1, result);
 487             loadNextElements(masm, result, cnt2, str1, str2, scale, scale1, scale2, cnt1);
 488             masm.subl(result, cnt2);
 489             masm.jmpb(POP_LABEL);
 490 
 491             masm.bind(COMPARE_TAIL); // limit is zero
 492             masm.movl(cnt2, result);
 493             // Fallthru to tail compare
 494         }
 495 
 496         // Shift str2 and str1 to the end of the arrays, negate min
 497         // if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 498         if (kind1 == kind2) {
 499             masm.leaq(str1, new AMD64Address(str1, cnt2, scale));
 500             masm.leaq(str2, new AMD64Address(str2, cnt2, scale));
 501         } else {
 502             masm.leaq(str1, new AMD64Address(str1, cnt2, scale1));
 503             masm.leaq(str2, new AMD64Address(str2, cnt2, scale2));
 504         }
 505         masm.decrementl(cnt2);  // first character was compared already
 506         masm.negq(cnt2);
 507 
 508         // Compare the rest of the elements
 509         masm.bind(WHILE_HEAD_LABEL);
 510         loadNextElements(masm, result, cnt1, str1, str2, scale, scale1, scale2, cnt2);
 511         masm.subl(result, cnt1);
 512         masm.jccb(ConditionFlag.NotZero, POP_LABEL);
 513         masm.incrementq(cnt2, 1);
 514         masm.jccb(ConditionFlag.NotZero, WHILE_HEAD_LABEL);
 515 
 516         // Strings are equal up to min length. Return the length difference.
 517         masm.bind(LENGTH_DIFF_LABEL);
 518         masm.pop(result);
 519         // if (ae == StrIntrinsicNode::UU) {
 520         if (kind1 == JavaKind.Char && kind2 == JavaKind.Char) {
 521             // Divide diff by 2 to get number of chars
 522             masm.sarl(result, 1);
 523         }
 524         masm.jmpb(DONE_LABEL);
 525 
 526         // if (VM_Version::supports_avx512vlbw()) {
 527         if (supportsAVX512VLBW(crb.target)) {
 528             masm.bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
 529 
 530             masm.kmovq(cnt1, k7);
 531             masm.notq(cnt1);
 532             masm.bsfq(cnt2, cnt1);
 533             // if (ae != StrIntrinsicNode::LL) {
 534             if (kind1 != JavaKind.Byte && kind2 != JavaKind.Byte) {
 535                 // Divide diff by 2 to get number of chars
 536                 masm.sarl(cnt2, 1);
 537             }
 538             masm.addq(result, cnt2);
 539             // if (ae == StrIntrinsicNode::LL) {
 540             if (kind1 == JavaKind.Byte && kind2 == JavaKind.Byte) {
 541                 masm.movzbl(cnt1, new AMD64Address(str2, result, Scale.Times1));
 542                 masm.movzbl(result, new AMD64Address(str1, result, Scale.Times1));
 543             } else if (kind1 == JavaKind.Char && kind2 == JavaKind.Char) {
 544                 masm.movzwl(cnt1, new AMD64Address(str2, result, scale));
 545                 masm.movzwl(result, new AMD64Address(str1, result, scale));
 546             } else {
 547                 masm.movzwl(cnt1, new AMD64Address(str2, result, scale2));
 548                 masm.movzbl(result, new AMD64Address(str1, result, scale1));
 549             }
 550             masm.subl(result, cnt1);
 551             masm.jmpb(POP_LABEL);
 552         }
 553 
 554         // Discard the stored length difference
 555         masm.bind(POP_LABEL);
 556         masm.pop(cnt1);
 557 
 558         // That's it
 559         masm.bind(DONE_LABEL);
 560         // if (ae == StrIntrinsicNode::UL) {
 561         if (kind1 == JavaKind.Char && kind2 == JavaKind.Byte) {
 562             masm.negl(result);
 563         }
 564     }
 565 
 566     private void loadNextElements(AMD64MacroAssembler masm, Register elem1, Register elem2, Register str1, Register str2,
 567                     AMD64Address.Scale scale, AMD64Address.Scale scale1,
 568                     AMD64Address.Scale scale2, Register index) {
 569         // if (ae == StrIntrinsicNode::LL) {
 570         if (kind1 == JavaKind.Byte && kind2 == JavaKind.Byte) {
 571             masm.movzbl(elem1, new AMD64Address(str1, index, scale, 0));
 572             masm.movzbl(elem2, new AMD64Address(str2, index, scale, 0));
 573             // } else if (ae == StrIntrinsicNode::UU) {
 574         } else if (kind1 == JavaKind.Char && kind2 == JavaKind.Char) {
 575             masm.movzwl(elem1, new AMD64Address(str1, index, scale, 0));
 576             masm.movzwl(elem2, new AMD64Address(str2, index, scale, 0));
 577         } else {
 578             masm.movzbl(elem1, new AMD64Address(str1, index, scale1, 0));
 579             masm.movzwl(elem2, new AMD64Address(str2, index, scale2, 0));
 580         }
 581     }
 582 }