/*
 * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;

/**
 * Finds the index of the first occurrence of any of up to four search values in a byte or char
 * array, using SIMD comparisons (SSE2/AVX/AVX2). In {@code findTwoConsecutive} mode, the single
 * search value holds two consecutive array elements packed into one wider value, and the op finds
 * the first occurrence of that two-element sequence. The result is the array index of the first
 * match, or -1 if there is no match.
 */
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);

    private final JavaKind kind;
    private final int vmPageSize;
    private final int nValues;
    private final boolean findTwoConsecutive;
    private final AMD64Kind vectorKind;

    @Def({REG}) protected Value resultValue;
    @Alive({REG}) protected Value arrayPtrValue;
    @Use({REG}) protected Value arrayLengthValue;
    @Alive({REG}) protected Value searchValue1;
    @Alive({REG, ILLEGAL}) protected Value searchValue2;
    @Alive({REG, ILLEGAL}) protected Value searchValue3;
    @Alive({REG, ILLEGAL}) protected Value searchValue4;
    @Temp({REG}) protected Value arraySlotsRemaining;
    @Temp({REG}) protected Value comparisonResult1;
    @Temp({REG}) protected Value comparisonResult2;
    @Temp({REG}) protected Value comparisonResult3;
    @Temp({REG}) protected Value comparisonResult4;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
    @Temp({REG, ILLEGAL}) protected Value vectorArray1;
    @Temp({REG, ILLEGAL}) protected Value vectorArray2;
    @Temp({REG, ILLEGAL}) protected Value vectorArray3;
    @Temp({REG, ILLEGAL}) protected Value vectorArray4;

    public AMD64ArrayIndexOfOp(JavaKind kind, boolean findTwoConsecutive, int vmPageSize, int maxVectorSize, LIRGeneratorTool tool, Value result, Value arrayPtr, Value arrayLength,
                    Value... searchValues) {
        super(TYPE);
        this.kind = kind;
        this.findTwoConsecutive = findTwoConsecutive;
        this.vmPageSize = vmPageSize;
        assert 0 < searchValues.length && searchValues.length <= 4;
        assert byteMode(kind) || charMode(kind);
        assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
        nValues = searchValues.length;
        assert !findTwoConsecutive || nValues == 1;
        resultValue = result;
        arrayPtrValue = arrayPtr;
        arrayLengthValue = arrayLength;
        searchValue1 = searchValues[0];
        searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
        searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
        searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
        arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
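        // use 256-bit (YMM) vectors if AVX2 is available and maxVectorSize does not restrict
        // vectors to less than 32 bytes (a negative maxVectorSize means no limit);
        // otherwise, use 128-bit (XMM) vectors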
        vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(kind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
                        : byteMode(kind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
        vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
    }

    private static boolean byteMode(JavaKind kind) {
        return kind == JavaKind.Byte;
    }

    private static boolean charMode(JavaKind kind) {
        return kind == JavaKind.Char;
    }

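    /**
     * In {@code findTwoConsecutive} mode, two adjacent array elements are compared as a single
     * value of the next wider kind (two bytes as a char, two chars as an int).
     */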
    private JavaKind getComparisonKind() {
        return findTwoConsecutive ? (byteMode(kind) ? JavaKind.Char : JavaKind.Int) : kind;
    }

    private AVXKind.AVXSize getVectorSize() {
        return AVXKind.getDataSize(vectorKind);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        Register arrayPtr = asRegister(arrayPtrValue);
        Register arrayLength = asRegister(arrayLengthValue);
        Register result = asRegister(resultValue);
        Register slotsRemaining = asRegister(arraySlotsRemaining);
        Register[] searchValue = {
                        nValues > 0 ? asRegister(searchValue1) : null,
                        nValues > 1 ? asRegister(searchValue2) : null,
                        nValues > 2 ? asRegister(searchValue3) : null,
                        nValues > 3 ? asRegister(searchValue4) : null,
        };
        Register[] vecCmp = {
                        nValues > 0 ? asRegister(vectorCompareVal1) : null,
                        nValues > 1 ? asRegister(vectorCompareVal2) : null,
                        nValues > 2 ? asRegister(vectorCompareVal3) : null,
                        nValues > 3 ? asRegister(vectorCompareVal4) : null,
        };
        Register[] vecArray = {
                        asRegister(vectorArray1),
                        asRegister(vectorArray2),
                        asRegister(vectorArray3),
                        asRegister(vectorArray4),
        };
        Register[] cmpResult = {
                        asRegister(comparisonResult1),
                        asRegister(comparisonResult2),
                        asRegister(comparisonResult3),
                        asRegister(comparisonResult4),
        };
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label end = new Label();

        // load array length
        // important: this must be the first register manipulation, since arrayLengthValue is
        // annotated with @Use
        asm.movl(slotsRemaining, arrayLength);
        // load array pointer
        asm.movq(result, arrayPtr);
        // move search values to vectors
        for (int i = 0; i < nValues; i++) {
            if (asm.supports(CPUFeature.AVX)) {
                VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp[i], searchValue[i]);
            } else {
                asm.movdl(vecCmp[i], searchValue[i]);
            }
        }
        // fill the comparison vectors with copies of their search value
        for (int i = 0; i < nValues; i++) {
            emitBroadcast(asm, getComparisonKind(), vecCmp[i], vecArray[0], getVectorSize());
        }

        emitArrayIndexOfChars(crb, asm, result, slotsRemaining, searchValue, vecCmp, vecArray, cmpResult, retFound, retNotFound);

        // return -1 (no match)
        asm.bind(retNotFound);
        asm.movq(result, -1);
        asm.jmpb(end);

        asm.bind(retFound);
        // convert array pointer to offset
        asm.subq(result, arrayPtr);
        if (charMode(kind)) {
            asm.shrq(result, 1);
        }
        asm.bind(end);
    }

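    /**
     * Emits the main search loop structure: a bulk loop comparing {@code nVectors} vectors per
     * iteration (aligned to the bulk size after one initial unaligned pass), a single-vector loop
     * for the remainder, and a tail that either does one vector read past the end of the array
     * (only when that read cannot cross a memory page boundary) or falls back to element-wise
     * comparison.
     */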
    private void emitArrayIndexOfChars(CompilationResultBuilder crb, AMD64MacroAssembler asm,
                    Register arrayPtr,
                    Register slotsRemaining,
                    Register[] searchValue,
                    Register[] vecCmp,
                    Register[] vecArray,
                    Register[] cmpResult,
                    Label retFound,
                    Label retNotFound) {
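        // with fewer search values, more vectors can be processed per bulk loop iteration;
        // nValues * nVectors must not exceed the four available vecArray/cmpResult temporaries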
        int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
        AVXKind.AVXSize vectorSize = getVectorSize();

        Label bulkVectorLoop = new Label();
        Label singleVectorLoop = new Label();
        Label[] vectorFound = {
                        new Label(),
                        new Label(),
                        new Label(),
                        new Label(),
        };
        Label lessThanVectorSizeRemaining = new Label();
        Label lessThanVectorSizeRemainingLoop = new Label();
        Label bulkVectorLoopExit = nVectors == 1 ? lessThanVectorSizeRemaining : singleVectorLoop;
        int bytesPerVector = vectorSize.getBytes();
        int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
        int singleVectorLoopCondition = arraySlotsPerVector;
        int bulkSize = arraySlotsPerVector * nVectors;
        int bulkSizeBytes = bytesPerVector * nVectors;
        int bulkLoopCondition = bulkSize;
        int[] vectorOffsets;
        JavaKind vectorCompareKind = kind;
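        // when searching for two consecutive values, every vector is also loaded at an offset of
        // one element, so that matching pairs are found at both alignments; the packed comparisons
        // use the next wider kind to match two adjacent elements at once, and the loop conditions
        // are raised by one since a matching pair needs one trailing element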
        if (findTwoConsecutive) {
            singleVectorLoopCondition++;
            bulkLoopCondition++;
            bulkSize /= 2;
            bulkSizeBytes /= 2;
            vectorOffsets = new int[]{0, kind.getByteCount(), bytesPerVector, bytesPerVector + kind.getByteCount()};
            vectorCompareKind = byteMode(kind) ? JavaKind.Char : JavaKind.Int;
        } else {
            vectorOffsets = new int[]{0, bytesPerVector, bytesPerVector * 2, bytesPerVector * 3};
        }

        // load copy of low part of array pointer
        Register tmpArrayPtrLow = cmpResult[0];
        asm.movl(tmpArrayPtrLow, arrayPtr);

        // check if bulk vector load is in bounds
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);

        // check if array pointer is aligned to bulkSize
        asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);

        // do one unaligned bulk comparison pass and adjust alignment afterwards
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
        // load copy of low part of array pointer
        asm.movl(tmpArrayPtrLow, arrayPtr);
        // adjust array pointer
        asm.addq(arrayPtr, bulkSizeBytes);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // get offset to bulk size alignment
        asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
        emitBytesToArraySlots(asm, kind, tmpArrayPtrLow);
        // adjust array pointer to bulk size alignment
        asm.andq(arrayPtr, ~(bulkSizeBytes - 1));
        // adjust number of array slots remaining
        asm.addl(slotsRemaining, tmpArrayPtrLow);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);

        emitAlign(crb, asm);
        asm.bind(bulkVectorLoop);
        // memory-aligned bulk comparison
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, !findTwoConsecutive);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // adjust array pointer
        asm.addq(arrayPtr, bulkSizeBytes);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
        // continue loop
        asm.jmp(bulkVectorLoop);

        if (nVectors > 1) {
            emitAlign(crb, asm);
            // same loop as bulkVectorLoop, with only one vector
            asm.bind(singleVectorLoop);
            // check if single vector load is in bounds
            asm.cmpl(slotsRemaining, singleVectorLoopCondition);
            asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
            // compare
            emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoConsecutive ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, arraySlotsPerVector);
            // adjust array pointer
            asm.addq(arrayPtr, bytesPerVector);
            // continue loop
            asm.jmpb(singleVectorLoop);
        }

        asm.bind(lessThanVectorSizeRemaining);
        // check if any array slots remain
        asm.testl(slotsRemaining, slotsRemaining);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);

        // a vector compare will read out of bounds of the input array.
        // check if the out-of-bounds read would cross a memory page boundary.
        // load copy of low part of array pointer
        asm.movl(tmpArrayPtrLow, arrayPtr);
        // check if pointer + vector size would cross the page boundary
        asm.andl(tmpArrayPtrLow, (vmPageSize - 1));
        asm.cmpl(tmpArrayPtrLow, (vmPageSize - (findTwoConsecutive ? bytesPerVector + kind.getByteCount() : bytesPerVector)));
        // if the page boundary would be crossed, do byte/character-wise comparison instead.
        asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);

        Label[] overBoundsMatch = {new Label(), new Label()};
        // otherwise, do a vector compare that reads beyond array bounds
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoConsecutive ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, overBoundsMatch, false);
        // no match
        asm.jmp(retNotFound);
        if (findTwoConsecutive) {
            Label overBoundsFinish = new Label();
            asm.bind(overBoundsMatch[1]);
            // get match offset of second result
            asm.bsfq(cmpResult[1], cmpResult[1]);
            asm.addl(cmpResult[1], kind.getByteCount());
            // replace first result with second and continue
            asm.movl(cmpResult[0], cmpResult[1]);
            asm.jmpb(overBoundsFinish);

            asm.bind(overBoundsMatch[0]);
            emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, overBoundsFinish);
        } else {
            asm.bind(overBoundsMatch[0]);
            // find match offset
            asm.bsfq(cmpResult[0], cmpResult[0]);
        }

        // adjust array pointer for match result
        asm.addq(arrayPtr, cmpResult[0]);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult[0], 1);
        }
        // check if the offset of the matched value is greater than the number of array slots
        // remaining, i.e. the match is out of array bounds
        if (findTwoConsecutive) {
            asm.decrementl(slotsRemaining, 1);
        }
        asm.cmpl(cmpResult[0], slotsRemaining);
        // match is out of bounds, return no match
        asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
        // adjust number of array slots remaining
        if (findTwoConsecutive) {
            asm.incrementl(slotsRemaining, 1);
        }
        asm.subl(slotsRemaining, cmpResult[0]);
        // match is in bounds, return offset
        asm.jmp(retFound);

        // compare remaining slots in the array one-by-one
        asm.bind(lessThanVectorSizeRemainingLoop);
        // check if enough array slots remain
        asm.cmpl(slotsRemaining, findTwoConsecutive ? 1 : 0);
        asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, retNotFound);
        // load char / byte
        if (byteMode(kind)) {
            if (findTwoConsecutive) {
                asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
            } else {
                asm.movzbl(cmpResult[0], new AMD64Address(arrayPtr));
            }
        } else {
            if (findTwoConsecutive) {
                asm.movl(cmpResult[0], new AMD64Address(arrayPtr));
            } else {
                asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
            }
        }
        // check for match
        for (int i = 0; i < nValues; i++) {
            emitCompareInst(asm, getComparisonKind(), cmpResult[0], searchValue[i]);
            asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
        }
        // adjust number of array slots remaining
        asm.decrementl(slotsRemaining, 1);
        // adjust array pointer
        asm.addq(arrayPtr, kind.getByteCount());
        // continue loop
        asm.jmpb(lessThanVectorSizeRemainingLoop);

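        // handlers for matches found by the vector comparisons above: compute the match offset
        // from the comparison result's bit mask and adjust the array pointer and slot count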
        for (int i = 1; i < nVectors; i += (findTwoConsecutive ? 2 : 1)) {
            emitVectorFoundWithOffset(asm, kind, vectorOffsets[i], arrayPtr, cmpResult[i], slotsRemaining, vectorFound[i], retFound);
        }

        if (findTwoConsecutive) {
            asm.bind(vectorFound[2]);
            asm.addq(arrayPtr, vectorOffsets[2]);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, charMode(kind) ? vectorOffsets[2] / 2 : vectorOffsets[2]);
            asm.movl(cmpResult[0], cmpResult[2]);
            asm.movl(cmpResult[1], cmpResult[3]);
            asm.bind(vectorFound[0]);
            emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, new Label());
        } else {
            asm.bind(vectorFound[0]);
            // find index of first set bit in bit mask
            asm.bsfq(cmpResult[0], cmpResult[0]);
        }
        // add offset to array pointer
        asm.addq(arrayPtr, cmpResult[0]);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult[0], 1);
        }
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, cmpResult[0]);
        asm.jmpb(retFound);
    }

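    /**
     * Given the two comparison bit masks produced in {@code findTwoConsecutive} mode (one for the
     * base offset and one shifted by a single element), leaves the smaller of the two match
     * offsets in {@code cmpResult[0]}.
     */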
    private static void emitFindTwoCharPrefixMinResult(AMD64MacroAssembler asm, JavaKind kind, Register[] cmpResult, Label done) {
        // find match offset
        asm.bsfq(cmpResult[0], cmpResult[0]);
        // check if second result is also a match
        asm.testl(cmpResult[1], cmpResult[1]);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, done);
        // get match offset of second result
        asm.bsfq(cmpResult[1], cmpResult[1]);
        asm.addl(cmpResult[1], kind.getByteCount());
        // check if first result is less than second
        asm.cmpl(cmpResult[0], cmpResult[1]);
        asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, done);
        // first result is greater than second, replace it with the second result
        asm.movl(cmpResult[0], cmpResult[1]);
        asm.bind(done);
    }

    private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        asm.align(crb.target.wordSize * 2);
    }

    /**
     * Fills {@code vecDst} with copies of its lowest byte, word or dword.
     */
    private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
                    VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
                } else if (asm.supports(CPUFeature.SSSE3)) {
                    asm.pxor(vecTmp, vecTmp);
                    asm.pshufb(vecDst, vecTmp);
                } else { // SSE2
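                    // duplicate the lowest byte into the lowest dword with two interleaving
                    // unpacks, then broadcast that dword to the whole vector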
                    asm.punpcklbw(vecDst, vecDst);
                    asm.punpcklbw(vecDst, vecDst);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshuflw(vecDst, vecDst, 0);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }

    /**
     * Convert a byte offset stored in {@code bytes} to an array index offset.
     */
    private static void emitBytesToArraySlots(AMD64MacroAssembler asm, JavaKind kind, Register bytes) {
        if (charMode(kind)) {
            asm.shrl(bytes, 1);
        } else {
            assert byteMode(kind);
        }
    }

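    /**
     * Loads the array region at the given vector offsets (once per search value, since the packed
     * comparison overwrites the loaded vector), compares against each search value vector,
     * extracts a bit mask from every comparison result, ORs the masks belonging to different
     * search values together, and branches to {@code vectorFound[j]} if the vector at offset
     * {@code j} contains a match.
     */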
    private static void emitVectorCompare(AMD64MacroAssembler asm,
                    JavaKind kind,
                    AVXKind.AVXSize vectorSize,
                    int nValues,
                    int nVectors,
                    int[] vectorOffsets,
                    Register arrayPtr,
                    Register[] vecCmp,
                    Register[] vecArray,
                    Register[] cmpResult,
                    Label[] vectorFound,
                    boolean alignedLoad) {
        // load array contents into vectors
        for (int i = 0; i < nValues; i++) {
            for (int j = 0; j < nVectors; j++) {
                emitArrayLoad(asm, vectorSize, vecArray[(i * nVectors) + j], arrayPtr, vectorOffsets[j], alignedLoad);
            }
        }
        // compare all loaded bytes to the search value.
        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
        for (int i = 0; i < nValues; i++) {
            for (int j = 0; j < nVectors; j++) {
                emitVectorCompareInst(asm, kind, vectorSize, vecArray[(i * nVectors) + j], vecCmp[i]);
            }
        }
        // create 32-bit-masks from the most significant bit of every byte in the comparison
        // results.
        for (int i = 0; i < nValues * nVectors; i++) {
            emitMOVMSK(asm, vectorSize, cmpResult[i], vecArray[i]);
        }
        // join results of comparisons against multiple values
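        // (pairwise OR reduction: the mask for search value j + stride is folded into the mask
        // for value j, leaving the combined masks in cmpResult[0..nVectors-1])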
        for (int stride = 1; stride < nValues; stride *= 2) {
            for (int i = 0; i < nVectors; i++) {
                for (int j = 0; j + stride < nValues; j += stride * 2) {
                    asm.orl(cmpResult[i + (j * nVectors)], cmpResult[i + ((j + stride) * nVectors)]);
                }
            }
        }
        // check if a match was found
        for (int i = 0; i < nVectors; i++) {
            asm.testl(cmpResult[i], cmpResult[i]);
            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound[i]);
        }
    }

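    /**
     * Emits a handler for a match found at a non-zero vector offset: adjusts the array pointer
     * and the remaining slot count by the offset, then computes the in-vector match position from
     * the bit mask and jumps to {@code ret}.
     */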
    private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm,
                    JavaKind kind,
                    int resultOffset,
                    Register result,
                    Register cmpResult,
                    Register slotsRemaining,
                    Label entry,
                    Label ret) {
        asm.bind(entry);
        if (resultOffset > 0) {
            // adjust array pointer
            asm.addq(result, resultOffset);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, charMode(kind) ? resultOffset / 2 : resultOffset);
        }
        // find index of first set bit in bit mask
        asm.bsfq(cmpResult, cmpResult);
        // add offset to array pointer
        asm.addq(result, cmpResult);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult, 1);
        }
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, cmpResult);
        asm.jmpb(ret);
    }

    private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
        AMD64Address src = new AMD64Address(arrayPtr, offset);
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp loadOp = alignedLoad ? VexMoveOp.VMOVDQA : VexMoveOp.VMOVDQU;
            loadOp.emit(asm, vectorSize, vecDst, src);
        } else {
            // SSE
            asm.movdqu(vecDst, src);
        }
    }

    /**
     * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching values
     * are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero.
     */
    private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqb(vecArray, vecCmp);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqw(vecArray, vecCmp);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqd(vecArray, vecCmp);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }

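    /**
     * Creates a bit mask in {@code dst} from the most significant bit of every byte in
     * {@code vecSrc}.
     */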
    private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
        if (asm.supports(CPUFeature.AVX)) {
            VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc);
        } else {
            // SSE
            asm.pmovmskb(dst, vecSrc);
        }
    }

    private static void emitCompareInst(AMD64MacroAssembler asm, JavaKind kind, Register dst, Register src) {
        switch (kind) {
            case Byte:
                asm.cmpb(dst, src);
                break;
            case Short:
            case Char:
                asm.cmpw(dst, src);
                break;
            case Int:
                asm.cmpl(dst, src);
                break;
            default:
                asm.cmpq(dst, src);
        }
    }

    private static boolean supportsAVX2(LIRGeneratorTool tool) {
        return supports(tool, CPUFeature.AVX2);
    }

    private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) {
        return ((AMD64) tool.target().arch).getFeatures().contains(cpuFeature);
    }
}