/*
 * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;

/**
 * Searches a byte or char array for a given value and produces the index of the first match
 * relative to the array pointer, or -1 if the value is not found.
 */
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);

    private final JavaKind kind;
    private final int vmPageSize;

    @Def({REG}) protected Value resultValue;
    @Alive({REG}) protected Value charArrayPtrValue;
    @Use({REG}) protected Value charArrayLengthValue;
    @Alive({REG}) protected Value searchCharValue;
    @Temp({REG}) protected Value arraySlotsRemaining;
    @Temp({REG}) protected Value comparisonResult1;
    @Temp({REG}) protected Value comparisonResult2;
    @Temp({REG}) protected Value comparisonResult3;
    @Temp({REG}) protected Value comparisonResult4;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal;
    @Temp({REG, ILLEGAL}) protected Value vectorArray1;
    @Temp({REG, ILLEGAL}) protected Value vectorArray2;
    @Temp({REG, ILLEGAL}) protected Value vectorArray3;
    @Temp({REG, ILLEGAL}) protected Value vectorArray4;

    public AMD64ArrayIndexOfOp(
                    JavaKind kind,
                    int vmPageSize, LIRGeneratorTool tool,
                    Value result,
                    Value arrayPtr,
                    Value arrayLength,
                    Value searchChar) {
        super(TYPE);
        this.kind = kind;
        this.vmPageSize = vmPageSize;
        assert byteMode() || charMode();
        assert supports(tool, CPUFeature.SSSE3) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
        resultValue = result;
        charArrayPtrValue = arrayPtr;
        charArrayLengthValue = arrayLength;
        searchCharValue = searchChar;

        this.arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        this.comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        this.comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        this.comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        this.comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        AMD64Kind vectorKind = byteMode()
                        ? (supportsAVX2(tool) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE)
                        : (supportsAVX2(tool) ? AMD64Kind.V256_WORD : AMD64Kind.V128_WORD);
        this.vectorCompareVal = tool.newVariable(LIRKind.value(vectorKind));
        this.vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
        this.vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
        this.vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
        this.vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
    }

    private boolean byteMode() {
        return kind == JavaKind.Byte;
    }

    private boolean charMode() {
        return kind == JavaKind.Char;
    }

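    /**
     * Emits the search loop: after an initial unaligned pass, the array is scanned in chunks of
     * four vectors per iteration from a 64-byte-aligned pointer, then one vector at a time, and
     * finally byte/char-wise once fewer than one vector's worth of slots remain and a full vector
     * load could cross a page boundary.
     */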
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        Register arrayPtr = asRegister(charArrayPtrValue);
        Register arrayLength = asRegister(charArrayLengthValue);
        Register searchValue = asRegister(searchCharValue);
        Register result = asRegister(resultValue);
        Register vecCmp = asRegister(vectorCompareVal);
        Register vecArray1 = asRegister(vectorArray1);
        Register vecArray2 = asRegister(vectorArray2);
        Register vecArray3 = asRegister(vectorArray3);
        Register vecArray4 = asRegister(vectorArray4);
        Register slotsRemaining = asRegister(arraySlotsRemaining);
        Register cmpResult1 = asRegister(comparisonResult1);
        Register cmpResult2 = asRegister(comparisonResult2);
        Register cmpResult3 = asRegister(comparisonResult3);
        Register cmpResult4 = asRegister(comparisonResult4);

        Label bulkVectorLoop = new Label();
        Label singleVectorLoop = new Label();
        Label vectorFound1 = new Label();
        Label vectorFound2 = new Label();
        Label vectorFound3 = new Label();
        Label vectorFound4 = new Label();
        Label lessThanVectorSizeRemaining = new Label();
        Label lessThanVectorSizeRemainingLoop = new Label();
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label end = new Label();

        AVXKind.AVXSize vectorSize = asm.supports(CPUFeature.AVX2) ? AVXKind.AVXSize.YMM : AVXKind.AVXSize.XMM;
        int nVectors = 4;
        int bytesPerVector = vectorSize.getBytes();
        int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
        int bulkSize = arraySlotsPerVector * nVectors;
        int bulkSizeBytes = bytesPerVector * nVectors;
        assert bulkSizeBytes >= 64;

        // load array length
        // important: this must be the first register manipulation, since charArrayLengthValue is
        // annotated with @Use
        asm.movl(slotsRemaining, arrayLength);
        // move search value to vector
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp, searchValue);
        } else {
            asm.movdl(vecCmp, searchValue);
        }
        // load array pointer
        asm.movq(result, arrayPtr);
        // load copy of low part of array pointer
        asm.movl(cmpResult1, arrayPtr);
        // fill comparison vector with copies of the search value
        emitBroadcast(asm, vecCmp, vecArray1, vectorSize);

        // check if bulk vector load is in bounds
        asm.cmpl(slotsRemaining, bulkSize);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);

        // check if array pointer is 64-byte aligned
        asm.andl(cmpResult1, 63);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);

        // do one unaligned bulk comparison pass and adjust alignment afterwards
        emitBulkCompare(asm, vectorSize, bytesPerVector, result, vecCmp, vecArray1, vecArray2, vecArray3, vecArray4, cmpResult1, cmpResult2, cmpResult3, cmpResult4,
                        vectorFound1, vectorFound2, vectorFound3, vectorFound4, false);
        // load copy of low part of array pointer
        asm.movl(cmpResult1, arrayPtr);
        // adjust array pointer
        asm.addq(result, bulkSizeBytes);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // get offset to 64-byte alignment
        asm.andl(cmpResult1, 63);
        emitBytesToArraySlots(asm, cmpResult1);
        // adjust array pointer to 64-byte alignment
        asm.andq(result, ~63);
        // adjust number of array slots remaining
        asm.addl(slotsRemaining, cmpResult1);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkSize);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);

        emitAlign(crb, asm);
        asm.bind(bulkVectorLoop);
        // memory-aligned bulk comparison
        emitBulkCompare(asm, vectorSize, bytesPerVector, result, vecCmp, vecArray1, vecArray2, vecArray3, vecArray4, cmpResult1, cmpResult2, cmpResult3, cmpResult4,
                        vectorFound1, vectorFound2, vectorFound3, vectorFound4, true);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // adjust array pointer
        asm.addq(result, bulkSizeBytes);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkSize);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
        // continue loop
        asm.jmpb(bulkVectorLoop);

        emitAlign(crb, asm);
        // same loop as bulkVectorLoop, with only one vector
        asm.bind(singleVectorLoop);
        // check if single vector load is in bounds
        asm.cmpl(slotsRemaining, arraySlotsPerVector);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
        // compare
        emitSingleVectorCompare(asm, vectorSize, result, vecCmp, vecArray1, cmpResult1);

        // check if a match was found
        asm.testl(cmpResult1, cmpResult1);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound1);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, arraySlotsPerVector);
        // adjust array pointer
        asm.addq(result, bytesPerVector);
        // continue loop
        asm.jmpb(singleVectorLoop);

        asm.bind(lessThanVectorSizeRemaining);
        // check if any array slots remain
        asm.testl(slotsRemaining, slotsRemaining);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);

        // a vector compare will read out of bounds of the input array.
        // check if the out-of-bounds read would cross a memory page boundary.
        // load copy of low part of array pointer
        asm.movl(cmpResult1, result);
        // check if pointer + vector size would cross the page boundary
        asm.andl(cmpResult1, (vmPageSize - 1));
        asm.cmpl(cmpResult1, (vmPageSize - bytesPerVector));
        // if the page boundary would be crossed, do byte/character-wise comparison instead.
        asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);
        // otherwise, do a vector compare that reads beyond array bounds
        emitSingleVectorCompare(asm, vectorSize, result, vecCmp, vecArray1, cmpResult1);
        // check if a match was found
        asm.testl(cmpResult1, cmpResult1);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
        // find match offset
        asm.bsfq(cmpResult1, cmpResult1);
        if (charMode()) {
            // convert number of remaining characters to bytes
            asm.shll(slotsRemaining, 1);
        }
        // adjust array pointer for match result
        asm.addq(result, cmpResult1);
        // check if the offset of the matched value is greater than the number of bytes remaining,
        // i.e. out of array bounds
        asm.cmpl(cmpResult1, slotsRemaining);
        // match is out of bounds, return no match
        asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
        // match is in bounds, return offset
        asm.jmpb(retFound);

        // compare remaining slots in the array one-by-one
        asm.bind(lessThanVectorSizeRemainingLoop);
        // check if any array slots remain
        asm.testl(slotsRemaining, slotsRemaining);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
        // load char / byte
        AMD64Assembler.OperandSize operandSize = byteMode() ? AMD64Assembler.OperandSize.BYTE : AMD64Assembler.OperandSize.WORD;
        if (byteMode()) {
            AMD64Assembler.AMD64RMOp.MOVB.emit(asm, operandSize, cmpResult1, new AMD64Address(result));
        } else {
            AMD64Assembler.AMD64RMOp.MOV.emit(asm, operandSize, cmpResult1, new AMD64Address(result));
        }
        // check for match
        AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(operandSize).emit(asm, operandSize, cmpResult1, searchValue);
        asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
        // adjust number of array slots remaining
        asm.decrementl(slotsRemaining);
        // adjust array pointer
        asm.addq(result, kind.getByteCount());
        // continue loop
        asm.jmpb(lessThanVectorSizeRemainingLoop);

        // return -1 (no match)
        asm.bind(retNotFound);
        asm.movl(result, -1);
        asm.jmpb(end);

        emitVectorFoundWithOffset(asm, bytesPerVector, result, cmpResult2, vectorFound2, retFound);
        emitVectorFoundWithOffset(asm, bytesPerVector * 2, result, cmpResult3, vectorFound3, retFound);
        emitVectorFoundWithOffset(asm, bytesPerVector * 3, result, cmpResult4, vectorFound4, retFound);

        asm.bind(vectorFound1);
        // find index of first set bit in bit mask
        asm.bsfq(cmpResult1, cmpResult1);
        // add offset to array pointer
        asm.addq(result, cmpResult1);

        asm.bind(retFound);
        // convert array pointer to offset
        asm.subq(result, arrayPtr);
        emitBytesToArraySlots(asm, result);
        asm.bind(end);
    }

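    /**
     * Aligns the next instruction to twice the target word size (16 bytes on AMD64); used for the
     * loop headers emitted above.
     */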
    private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        asm.align(crb.target.wordSize * 2);
    }

    /**
     * Fills {@code vecDst} with copies of its lowest byte or word.
     */
    private void emitBroadcast(AMD64MacroAssembler asm, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
        if (asm.supports(CPUFeature.AVX2)) {
            if (byteMode()) {
                VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
            } else {
                VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
            }
        } else if (asm.supports(CPUFeature.AVX)) {
            if (byteMode()) {
                // fill vecTmp with zeroes
                VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
                // broadcast loaded search value
                VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
            } else {
                // fill low qword
                VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
                // copy low qword to high qword
                VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
            }
        } else {
            // SSE version
            if (byteMode()) {
                // fill vecTmp with zeroes
                asm.pxor(vecTmp, vecTmp);
                // broadcast loaded search value
                asm.pshufb(vecDst, vecTmp);
            } else {
                // fill low qword
                asm.pshuflw(vecDst, vecDst, 0);
                // copy low qword to high qword
                asm.pshufd(vecDst, vecDst, 0);
            }
        }
    }

    /**
     * Loads {@code vectorSize} bytes from the position pointed to by {@code arrayPtr} and compares
     * them to the search value stored in {@code vecCmp}. {@code vecArray} is overwritten by this
     * operation. The comparison result is stored in {@code cmpResult}.
     */
    private void emitSingleVectorCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize,
                    Register arrayPtr, Register vecCmp, Register vecArray, Register cmpResult) {
        // load array contents into vector
        emitArrayLoad(asm, vectorSize, vecArray, arrayPtr, 0, false);
        // compare all loaded bytes to the search value.
        emitVectorCompare(asm, vectorSize, vecArray, vecCmp);
        // create a 32-bit-mask from the most significant bit of every byte in the comparison
        // result.
        emitMOVMSK(asm, vectorSize, cmpResult, vecArray);
    }

    /**
     * Convert a byte offset stored in {@code bytes} to an array index offset.
     */
    private void emitBytesToArraySlots(AMD64MacroAssembler asm, Register bytes) {
        if (charMode()) {
            asm.shrl(bytes, 1);
        } else {
            assert byteMode();
        }
    }

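    /**
     * Loads four consecutive vectors starting at {@code arrayPtr} and compares each of them to the
     * search value in {@code vecCmp}. If a match is found, control branches to the corresponding
     * {@code vectorFound} label; the match masks are left in {@code cmpResult1}-{@code cmpResult4}.
     */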
    private void emitBulkCompare(AMD64MacroAssembler asm,
                    AVXKind.AVXSize vectorSize,
                    int bytesPerVector,
                    Register arrayPtr,
                    Register vecCmp,
                    Register vecArray1,
                    Register vecArray2,
                    Register vecArray3,
                    Register vecArray4,
                    Register cmpResult1,
                    Register cmpResult2,
                    Register cmpResult3,
                    Register cmpResult4,
                    Label vectorFound1,
                    Label vectorFound2,
                    Label vectorFound3,
                    Label vectorFound4,
                    boolean alignedLoad) {
        // load array contents into vectors
        emitArrayLoad(asm, vectorSize, vecArray1, arrayPtr, 0, alignedLoad);
        emitArrayLoad(asm, vectorSize, vecArray2, arrayPtr, bytesPerVector, alignedLoad);
        emitArrayLoad(asm, vectorSize, vecArray3, arrayPtr, bytesPerVector * 2, alignedLoad);
        emitArrayLoad(asm, vectorSize, vecArray4, arrayPtr, bytesPerVector * 3, alignedLoad);
        // compare all loaded bytes to the search value.
        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
        emitVectorCompare(asm, vectorSize, vecArray1, vecCmp);
        emitVectorCompare(asm, vectorSize, vecArray2, vecCmp);
        emitVectorCompare(asm, vectorSize, vecArray3, vecCmp);
        emitVectorCompare(asm, vectorSize, vecArray4, vecCmp);
        // create 32-bit-masks from the most significant bit of every byte in the comparison
        // results.
        emitMOVMSK(asm, vectorSize, cmpResult1, vecArray1);
        emitMOVMSK(asm, vectorSize, cmpResult2, vecArray2);
        emitMOVMSK(asm, vectorSize, cmpResult3, vecArray3);
        emitMOVMSK(asm, vectorSize, cmpResult4, vecArray4);
        // check if a match was found
        asm.testl(cmpResult1, cmpResult1);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound1);
        asm.testl(cmpResult2, cmpResult2);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound2);
        asm.testl(cmpResult3, cmpResult3);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound3);
        asm.testl(cmpResult4, cmpResult4);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound4);
    }

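    /**
     * Emits the found-handler for one of the bulk comparison vectors: adds {@code resultOffset}
     * plus the index of the first set bit in {@code cmpResult} to the array pointer in
     * {@code result}, then jumps to {@code ret}.
     */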
    private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm, int resultOffset, Register result, Register cmpResult, Label entry, Label ret) {
        asm.bind(entry);
        if (resultOffset > 0) {
            // adjust array pointer
            asm.addq(result, resultOffset);
        }
        // find index of first set bit in bit mask
        asm.bsfq(cmpResult, cmpResult);
        // add offset to array pointer
        asm.addq(result, cmpResult);
        asm.jmpb(ret);
    }

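    /**
     * Loads {@code vectorSize} bytes from {@code arrayPtr + offset} into {@code vecDst}, using an
     * aligned move if {@code alignedLoad} is set (AVX only; the SSE path always uses an unaligned
     * move).
     */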
    private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
        AMD64Address src = new AMD64Address(arrayPtr, offset);
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp loadOp = alignedLoad ? VexMoveOp.VMOVDQA : VexMoveOp.VMOVDQU;
            loadOp.emit(asm, vectorSize, vecDst, src);
        } else {
            // SSE
            asm.movdqu(vecDst, src);
        }
    }

    private void emitVectorCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
        // compare all loaded bytes to the search value.
        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
        if (asm.supports(CPUFeature.AVX)) {
            if (byteMode()) {
                VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
            } else {
                VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
            }
        } else {
            // SSE
            if (byteMode()) {
                asm.pcmpeqb(vecArray, vecCmp);
            } else {
                asm.pcmpeqw(vecArray, vecCmp);
            }
        }
    }

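    /**
     * Creates a 32-bit mask in {@code dst} from the most significant bit of every byte in
     * {@code vecSrc}.
     */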
    private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
        if (asm.supports(CPUFeature.AVX)) {
            VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc);
        } else {
            // SSE
            asm.pmovmskb(dst, vecSrc);
        }
    }

    private static boolean supportsAVX2(LIRGeneratorTool tool) {
        return supports(tool, CPUFeature.AVX2);
    }

    private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) {
        return ((AMD64) tool.target().arch).getFeatures().contains(cpuFeature);
    }
}