1 /*
   2  * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.lir.amd64;
  26 
  27 import jdk.vm.ci.amd64.AMD64;
  28 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  29 import jdk.vm.ci.amd64.AMD64Kind;
  30 import jdk.vm.ci.code.Register;
  31 import jdk.vm.ci.meta.JavaKind;
  32 import jdk.vm.ci.meta.Value;
  33 import org.graalvm.compiler.asm.Label;
  34 import org.graalvm.compiler.asm.amd64.AMD64Address;
  35 import org.graalvm.compiler.asm.amd64.AMD64Assembler;
  36 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
  37 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
  38 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp;
  39 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
  40 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
  41 import org.graalvm.compiler.asm.amd64.AVXKind;
  42 import org.graalvm.compiler.core.common.LIRKind;
  43 import org.graalvm.compiler.lir.LIRInstructionClass;
  44 import org.graalvm.compiler.lir.Opcode;
  45 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
  46 import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
  47 
  48 import static jdk.vm.ci.code.ValueUtil.asRegister;
  49 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
  50 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
  51 
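/**
 * LIR instruction that emits a vectorized (SSE/AVX) AMD64 code sequence which searches a byte or
 * char array for the first occurrence of one of up to four search values or, in "two consecutive"
 * mode, of a single pair of adjacent elements. The result is the index of the first match, or -1
 * if there is none.
 */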
  54 @Opcode("AMD64_ARRAY_INDEX_OF")
  55 public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
  56     public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);
  57 
  58     private final JavaKind kind;
  59     private final int vmPageSize;
  60     private final int nValues;
  61     private final boolean findTwoConsecutive;
  62     private final AMD64Kind vectorKind;
  63 
  64     @Def({REG}) protected Value resultValue;
  65     @Alive({REG}) protected Value arrayPtrValue;
  66     @Use({REG}) protected Value arrayLengthValue;
  67     @Alive({REG}) protected Value searchValue1;
  68     @Alive({REG, ILLEGAL}) protected Value searchValue2;
  69     @Alive({REG, ILLEGAL}) protected Value searchValue3;
  70     @Alive({REG, ILLEGAL}) protected Value searchValue4;
  71     @Temp({REG}) protected Value arraySlotsRemaining;
  72     @Temp({REG}) protected Value comparisonResult1;
  73     @Temp({REG}) protected Value comparisonResult2;
  74     @Temp({REG}) protected Value comparisonResult3;
  75     @Temp({REG}) protected Value comparisonResult4;
  76     @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
  77     @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
  78     @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
  79     @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
  80     @Temp({REG, ILLEGAL}) protected Value vectorArray1;
  81     @Temp({REG, ILLEGAL}) protected Value vectorArray2;
  82     @Temp({REG, ILLEGAL}) protected Value vectorArray3;
  83     @Temp({REG, ILLEGAL}) protected Value vectorArray4;
  84 
  85     public AMD64ArrayIndexOfOp(JavaKind kind, boolean findTwoConsecutive, int vmPageSize, int maxVectorSize, LIRGeneratorTool tool, Value result, Value arrayPtr, Value arrayLength,
  86                     Value... searchValues) {
  87         super(TYPE);
  88         this.kind = kind;
  89         this.findTwoConsecutive = findTwoConsecutive;
  90         this.vmPageSize = vmPageSize;
  91         assert 0 < searchValues.length && searchValues.length <= 4;
  92         assert byteMode(kind) || charMode(kind);
  93         assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
  94         nValues = searchValues.length;
  95         assert !findTwoConsecutive || nValues == 1;
  96         resultValue = result;
  97         arrayPtrValue = arrayPtr;
  98         arrayLengthValue = arrayLength;
  99         searchValue1 = searchValues[0];
 100         searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
 101         searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
 102         searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
 103         arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
 104         comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
 105         comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
 106         comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
 107         comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
 108         vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(kind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
 109                         : byteMode(kind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
 110         vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
 111         vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
 112         vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
 113         vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
 114         vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
 115         vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
 116         vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
 117         vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
 118     }
 119 
 120     private static boolean byteMode(JavaKind kind) {
 121         return kind == JavaKind.Byte;
 122     }
 123 
 124     private static boolean charMode(JavaKind kind) {
 125         return kind == JavaKind.Char;
 126     }
 127 
 128     @Override
 129     public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
 130         Register arrayPtr = asRegister(arrayPtrValue);
 131         Register arrayLength = asRegister(arrayLengthValue);
 132         Register result = asRegister(resultValue);
 133         Register slotsRemaining = asRegister(arraySlotsRemaining);
 134         Register[] searchValue = {
 135                         nValues > 0 ? asRegister(searchValue1) : null,
 136                         nValues > 1 ? asRegister(searchValue2) : null,
 137                         nValues > 2 ? asRegister(searchValue3) : null,
 138                         nValues > 3 ? asRegister(searchValue4) : null,
 139         };
 140         Register[] vecCmp = {
 141                         nValues > 0 ? asRegister(vectorCompareVal1) : null,
 142                         nValues > 1 ? asRegister(vectorCompareVal2) : null,
 143                         nValues > 2 ? asRegister(vectorCompareVal3) : null,
 144                         nValues > 3 ? asRegister(vectorCompareVal4) : null,
 145         };
 146         Register[] vecArray = {
 147                         asRegister(vectorArray1),
 148                         asRegister(vectorArray2),
 149                         asRegister(vectorArray3),
 150                         asRegister(vectorArray4),
 151         };
 152         Register[] cmpResult = {
 153                         asRegister(comparisonResult1),
 154                         asRegister(comparisonResult2),
 155                         asRegister(comparisonResult3),
 156                         asRegister(comparisonResult4),
 157         };
 158         Label retFound = new Label();
 159         Label retNotFound = new Label();
 160         Label end = new Label();
 161 
 162         AVXKind.AVXSize vectorSize = AVXKind.getDataSize(vectorKind);
 163         int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
 164 
 165         // load array length
 166         // important: this must be the first register manipulation, since arrayLengthValue is
 167         // annotated with @Use
 168         asm.movl(slotsRemaining, arrayLength);
 169         // load array pointer
 170         asm.movq(result, arrayPtr);
 171         // move search values to vectors
 172         for (int i = 0; i < nValues; i++) {
 173             if (asm.supports(CPUFeature.AVX)) {
 174                 VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp[i], searchValue[i]);
 175             } else {
 176                 asm.movdl(vecCmp[i], searchValue[i]);
 177             }
 178         }
 179         // fill comparison vector with copies of the search value
 180         for (int i = 0; i < nValues; i++) {
 181             emitBroadcast(asm, findTwoConsecutive ? (byteMode(kind) ? JavaKind.Char : JavaKind.Int) : kind, vecCmp[i], vecArray[0], vectorSize);
 182         }
 183 
 184         emitArrayIndexOfChars(crb, asm, kind, vectorSize, result, slotsRemaining, searchValue, vecCmp, vecArray, cmpResult, retFound, retNotFound, vmPageSize, nValues, nVectors, findTwoConsecutive);
 185 
 186         // return -1 (no match)
 187         asm.bind(retNotFound);
 188         asm.movq(result, -1);
 189         asm.jmpb(end);
 190 
 191         asm.bind(retFound);
 192         // convert array pointer to offset
 193         asm.subq(result, arrayPtr);
 194         if (charMode(kind)) {
 195             asm.shrq(result, 1);
 196         }
 197         asm.bind(end);
 198     }
 199 
 200     private static void emitArrayIndexOfChars(CompilationResultBuilder crb, AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize,
 201                     Register arrayPtr,
 202                     Register slotsRemaining,
 203                     Register[] searchValue,
 204                     Register[] vecCmp,
 205                     Register[] vecArray,
 206                     Register[] cmpResult,
 207                     Label retFound,
 208                     Label retNotFound,
 209                     int vmPageSize,
 210                     int nValues,
 211                     int nVectors,
 212                     boolean findTwoCharPrefix) {
 213         Label bulkVectorLoop = new Label();
 214         Label singleVectorLoop = new Label();
 215         Label[] vectorFound = {
 216                         new Label(),
 217                         new Label(),
 218                         new Label(),
 219                         new Label(),
 220         };
 221         Label lessThanVectorSizeRemaining = new Label();
 222         Label lessThanVectorSizeRemainingLoop = new Label();
 223         Label bulkVectorLoopExit = nVectors == 1 ? lessThanVectorSizeRemaining : singleVectorLoop;
 224         int bytesPerVector = vectorSize.getBytes();
 225         int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
 226         int singleVectorLoopCondition = arraySlotsPerVector;
 227         int bulkSize = arraySlotsPerVector * nVectors;
 228         int bulkSizeBytes = bytesPerVector * nVectors;
 229         int bulkLoopCondition = bulkSize;
 230         int[] vectorOffsets;
 231         JavaKind vectorCompareKind = kind;
 232         if (findTwoCharPrefix) {
 233             singleVectorLoopCondition++;
 234             bulkLoopCondition++;
 235             bulkSize /= 2;
 236             bulkSizeBytes /= 2;
 237             vectorOffsets = new int[]{0, kind.getByteCount(), bytesPerVector, bytesPerVector + kind.getByteCount()};
 238             vectorCompareKind = byteMode(kind) ? JavaKind.Char : JavaKind.Int;
 239         } else {
 240             vectorOffsets = new int[]{0, bytesPerVector, bytesPerVector * 2, bytesPerVector * 3};
 241         }
 242 
 243         // load copy of low part of array pointer
 244         Register tmpArrayPtrLow = cmpResult[0];
 245         asm.movl(tmpArrayPtrLow, arrayPtr);
 246 
 247         // check if bulk vector load is in bounds
 248         asm.cmpl(slotsRemaining, bulkLoopCondition);
 249         asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
 250 
 251         // check if array pointer is aligned to bulkSize
 252         asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
 253         asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);
 254 
 255         // do one unaligned bulk comparison pass and adjust alignment afterwards
 256         emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
 257         // load copy of low part of array pointer
 258         asm.movl(tmpArrayPtrLow, arrayPtr);
 259         // adjust array pointer
 260         asm.addq(arrayPtr, bulkSizeBytes);
 261         // adjust number of array slots remaining
 262         asm.subl(slotsRemaining, bulkSize);
 263         // get offset to bulk size alignment
 264         asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
 265         emitBytesToArraySlots(asm, kind, tmpArrayPtrLow);
 266         // adjust array pointer to bulk size alignment
 267         asm.andq(arrayPtr, ~(bulkSizeBytes - 1));
 268         // adjust number of array slots remaining
 269         asm.addl(slotsRemaining, tmpArrayPtrLow);
 270         // check if there are enough array slots remaining for the bulk loop
 271         asm.cmpl(slotsRemaining, bulkLoopCondition);
 272         asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
 273 
 274         emitAlign(crb, asm);
 275         asm.bind(bulkVectorLoop);
 276         // memory-aligned bulk comparison
 277         emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, !findTwoCharPrefix);
 278         // adjust number of array slots remaining
 279         asm.subl(slotsRemaining, bulkSize);
 280         // adjust array pointer
 281         asm.addq(arrayPtr, bulkSizeBytes);
 282         // check if there are enough array slots remaining for the bulk loop
 283         asm.cmpl(slotsRemaining, bulkLoopCondition);
 284         asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
 285         // continue loop
 286         asm.jmp(bulkVectorLoop);
 287 
 288         if (nVectors > 1) {
 289             emitAlign(crb, asm);
 290             // same loop as bulkVectorLoop, with only one vector
 291             asm.bind(singleVectorLoop);
 292             // check if single vector load is in bounds
 293             asm.cmpl(slotsRemaining, singleVectorLoopCondition);
 294             asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
 295             // compare
 296             emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoCharPrefix ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
 297             // adjust number of array slots remaining
 298             asm.subl(slotsRemaining, arraySlotsPerVector);
 299             // adjust array pointer
 300             asm.addq(arrayPtr, bytesPerVector);
 301             // continue loop
 302             asm.jmpb(singleVectorLoop);
 303         }
 304 
 305         asm.bind(lessThanVectorSizeRemaining);
 306         // check if any array slots remain
 307         asm.testl(slotsRemaining, slotsRemaining);
 308         asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
 309 
 310         // a vector compare will read out of bounds of the input array.
 311         // check if the out-of-bounds read would cross a memory page boundary.
 312         // load copy of low part of array pointer
 313         asm.movl(tmpArrayPtrLow, arrayPtr);
 314         // check if pointer + vector size would cross the page boundary
 315         asm.andl(tmpArrayPtrLow, (vmPageSize - 1));
 316         asm.cmpl(tmpArrayPtrLow, (vmPageSize - (findTwoCharPrefix ? bytesPerVector + kind.getByteCount() : bytesPerVector)));
 317         // if the page boundary would be crossed, do byte/character-wise comparison instead.
 318         asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);
 319 
 320         Label[] overBoundsMatch = {new Label(), new Label()};
 321         // otherwise, do a vector compare that reads beyond array bounds
 322         emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoCharPrefix ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, overBoundsMatch, false);
 323         // no match
 324         asm.jmp(retNotFound);
 325         if (findTwoCharPrefix) {
 326             Label overBoundsFinish = new Label();
 327             asm.bind(overBoundsMatch[1]);
 328             // get match offset of second result
 329             asm.bsfq(cmpResult[1], cmpResult[1]);
 330             asm.addl(cmpResult[1], kind.getByteCount());
 331             // replace first result with second and continue
 332             asm.movl(cmpResult[0], cmpResult[1]);
 333             asm.jmpb(overBoundsFinish);
 334 
 335             asm.bind(overBoundsMatch[0]);
 336             emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, overBoundsFinish);
 337         } else {
 338             asm.bind(overBoundsMatch[0]);
 339             // find match offset
 340             asm.bsfq(cmpResult[0], cmpResult[0]);
 341         }
 342 
 343         // adjust array pointer for match result
 344         asm.addq(arrayPtr, cmpResult[0]);
 345         if (charMode(kind)) {
 346             // convert byte offset to chars
 347             asm.shrl(cmpResult[0], 1);
 348         }
 349         // check if offset of matched value is greater than number of bytes remaining / out of array
 350         // bounds
 351         if (findTwoCharPrefix) {
 352             asm.decrementl(slotsRemaining);
 353         }
 354         asm.cmpl(cmpResult[0], slotsRemaining);
 355         // match is out of bounds, return no match
 356         asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
 357         // adjust number of array slots remaining
 358         if (findTwoCharPrefix) {
 359             asm.incrementl(slotsRemaining, 1);
 360         }
 361         asm.subl(slotsRemaining, cmpResult[0]);
 362         // match is in bounds, return offset
 363         asm.jmp(retFound);
 364 
 365         // compare remaining slots in the array one-by-one
 366         asm.bind(lessThanVectorSizeRemainingLoop);
 367         // check if enough array slots remain
 368         asm.cmpl(slotsRemaining, findTwoCharPrefix ? 1 : 0);
 369         asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, retNotFound);
 370         // load char / byte
 371         if (byteMode(kind)) {
 372             if (findTwoCharPrefix) {
 373                 asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
 374             } else {
 375                 asm.movzbl(cmpResult[0], new AMD64Address(arrayPtr));
 376             }
 377         } else {
 378             if (findTwoCharPrefix) {
 379                 asm.movl(cmpResult[0], new AMD64Address(arrayPtr));
 380             } else {
 381                 asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
 382             }
 383         }
 384         // check for match
 385         for (int i = 0; i < nValues; i++) {
 386             asm.cmpl(cmpResult[0], searchValue[i]);
 387             asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
 388         }
 389         // adjust number of array slots remaining
 390         asm.decrementl(slotsRemaining);
 391         // adjust array pointer
 392         asm.addq(arrayPtr, kind.getByteCount());
 393         // continue loop
 394         asm.jmpb(lessThanVectorSizeRemainingLoop);
 395 
 396         for (int i = 1; i < nVectors; i += (findTwoCharPrefix ? 2 : 1)) {
 397             emitVectorFoundWithOffset(asm, kind, vectorOffsets[i], arrayPtr, cmpResult[i], slotsRemaining, vectorFound[i], retFound);
 398         }
 399 
 400         if (findTwoCharPrefix) {
 401             asm.bind(vectorFound[2]);
 402             asm.addq(arrayPtr, vectorOffsets[2]);
 403             // adjust number of array slots remaining
 404             asm.subl(slotsRemaining, charMode(kind) ? vectorOffsets[2] / 2 : vectorOffsets[2]);
 405             asm.movl(cmpResult[0], cmpResult[2]);
 406             asm.movl(cmpResult[1], cmpResult[3]);
 407             asm.bind(vectorFound[0]);
 408             emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, new Label());
 409         } else {
 410             asm.bind(vectorFound[0]);
 411             // find index of first set bit in bit mask
 412             asm.bsfq(cmpResult[0], cmpResult[0]);
 413         }
 414         // add offset to array pointer
 415         asm.addq(arrayPtr, cmpResult[0]);
 416         if (charMode(kind)) {
 417             // convert byte offset to chars
 418             asm.shrl(cmpResult[0], 1);
 419         }
 420         // adjust number of array slots remaining
 421         asm.subl(slotsRemaining, cmpResult[0]);
 422         asm.jmpb(retFound);
 423     }
 424 
 425     private static void emitFindTwoCharPrefixMinResult(AMD64MacroAssembler asm, JavaKind kind, Register[] cmpResult, Label done) {
 426         // find match offset
 427         asm.bsfq(cmpResult[0], cmpResult[0]);
 428         // check if second result is also a match
 429         asm.testl(cmpResult[1], cmpResult[1]);
 430         asm.jcc(AMD64Assembler.ConditionFlag.Zero, done);
 431         // get match offset of second result
 432         asm.bsfq(cmpResult[1], cmpResult[1]);
 433         asm.addl(cmpResult[1], kind.getByteCount());
 434         // check if first result is less than second
 435         asm.cmpl(cmpResult[0], cmpResult[1]);
 436         asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, done);
 437         // first result is greater than second, replace it with the second result
 438         asm.movl(cmpResult[0], cmpResult[1]);
 439         asm.bind(done);
 440     }
 441 
 442     private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
 443         asm.align(crb.target.wordSize * 2);
 444     }
 445 
 446     /**
 447      * Fills {@code vecDst} with copies of its lowest byte, word or dword.
 448      */
 449     private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
 450         switch (kind) {
 451             case Byte:
 452                 if (asm.supports(CPUFeature.AVX2)) {
 453                     VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
 454                 } else if (asm.supports(CPUFeature.AVX)) {
 455                     VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
 456                     VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
 457                 } else if (asm.supports(CPUFeature.SSSE3)) {
 458                     asm.pxor(vecTmp, vecTmp);
 459                     asm.pshufb(vecDst, vecTmp);
 460                 } else { // SSE2
 461                     asm.punpcklbw(vecDst, vecDst);
 462                     asm.punpcklbw(vecDst, vecDst);
 463                     asm.pshufd(vecDst, vecDst, 0);
 464                 }
 465                 break;
 466             case Short:
 467             case Char:
 468                 if (asm.supports(CPUFeature.AVX2)) {
 469                     VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
 470                 } else if (asm.supports(CPUFeature.AVX)) {
 471                     VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
 472                     VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
 473                 } else { // SSE
 474                     asm.pshuflw(vecDst, vecDst, 0);
 475                     asm.pshufd(vecDst, vecDst, 0);
 476                 }
 477                 break;
 478             case Int:
 479                 if (asm.supports(CPUFeature.AVX2)) {
 480                     VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
 481                 } else if (asm.supports(CPUFeature.AVX)) {
 482                     VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
 483                 } else { // SSE
 484                     asm.pshufd(vecDst, vecDst, 0);
 485                 }
 486                 break;
 487             default:
 488                 throw new UnsupportedOperationException();
 489         }
 490     }
 491 
 492     /**
 493      * Convert a byte offset stored in {@code bytes} to an array index offset.
 494      */
 495     private static void emitBytesToArraySlots(AMD64MacroAssembler asm, JavaKind kind, Register bytes) {
 496         if (charMode(kind)) {
 497             asm.shrl(bytes, 1);
 498         } else {
 499             assert byteMode(kind);
 500         }
 501     }
 502 
 503     private static void emitVectorCompare(AMD64MacroAssembler asm,
 504                     JavaKind kind,
 505                     AVXKind.AVXSize vectorSize,
 506                     int nValues,
 507                     int nVectors,
 508                     int[] vectorOffsets,
 509                     Register arrayPtr,
 510                     Register[] vecCmp,
 511                     Register[] vecArray,
 512                     Register[] cmpResult,
 513                     Label[] vectorFound,
 514                     boolean alignedLoad) {
 515         // load array contents into vectors
 516         for (int i = 0; i < nValues; i++) {
 517             for (int j = 0; j < nVectors; j++) {
 518                 emitArrayLoad(asm, vectorSize, vecArray[(i * nVectors) + j], arrayPtr, vectorOffsets[j], alignedLoad);
 519             }
 520         }
 521         // compare all loaded bytes to the search value.
 522         // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
 523         for (int i = 0; i < nValues; i++) {
 524             for (int j = 0; j < nVectors; j++) {
 525                 emitVectorCompareInst(asm, kind, vectorSize, vecArray[(i * nVectors) + j], vecCmp[i]);
 526             }
 527         }
 528         // create 32-bit-masks from the most significant bit of every byte in the comparison
 529         // results.
 530         for (int i = 0; i < nValues * nVectors; i++) {
 531             emitMOVMSK(asm, vectorSize, cmpResult[i], vecArray[i]);
 532         }
 533         // join results of comparisons against multiple values
 534         for (int stride = 1; stride < nValues; stride *= 2) {
 535             for (int i = 0; i < nVectors; i++) {
 536                 for (int j = 0; j + stride < nValues; j += stride * 2) {
 537                     asm.orl(cmpResult[i + (j * nVectors)], cmpResult[i + ((j + stride) * nVectors)]);
 538                 }
 539             }
 540         }
 541         // check if a match was found
 542         for (int i = 0; i < nVectors; i++) {
 543             asm.testl(cmpResult[i], cmpResult[i]);
 544             asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound[i]);
 545         }
 546     }
 547 
 548     private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm,
 549                     JavaKind kind,
 550                     int resultOffset,
 551                     Register result,
 552                     Register cmpResult,
 553                     Register slotsRemaining,
 554                     Label entry,
 555                     Label ret) {
 556         asm.bind(entry);
 557         if (resultOffset > 0) {
 558             // adjust array pointer
 559             asm.addq(result, resultOffset);
 560             // adjust number of array slots remaining
 561             asm.subl(slotsRemaining, charMode(kind) ? resultOffset / 2 : resultOffset);
 562         }
 563         // find index of first set bit in bit mask
 564         asm.bsfq(cmpResult, cmpResult);
 565         // add offset to array pointer
 566         asm.addq(result, cmpResult);
 567         if (charMode(kind)) {
 568             // convert byte offset to chars
 569             asm.shrl(cmpResult, 1);
 570         }
 571         // adjust number of array slots remaining
 572         asm.subl(slotsRemaining, cmpResult);
 573         asm.jmpb(ret);
 574     }
 575 
 576     private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
 577         AMD64Address src = new AMD64Address(arrayPtr, offset);
 578         if (asm.supports(CPUFeature.AVX)) {
 579             VexMoveOp loadOp = alignedLoad ? VexMoveOp.VMOVDQA : VexMoveOp.VMOVDQU;
 580             loadOp.emit(asm, vectorSize, vecDst, src);
 581         } else {
 582             // SSE
 583             asm.movdqu(vecDst, src);
 584         }
 585     }
 586 
 587     /**
 588      * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching values
 589      * are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero.
 590      */
 591     private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
 592         switch (kind) {
 593             case Byte:
 594                 if (asm.supports(CPUFeature.AVX)) {
 595                     VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
 596                 } else { // SSE
 597                     asm.pcmpeqb(vecArray, vecCmp);
 598                 }
 599                 break;
 600             case Short:
 601             case Char:
 602                 if (asm.supports(CPUFeature.AVX)) {
 603                     VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
 604                 } else { // SSE
 605                     asm.pcmpeqw(vecArray, vecCmp);
 606                 }
 607                 break;
 608             case Int:
 609                 if (asm.supports(CPUFeature.AVX)) {
 610                     VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
 611                 } else { // SSE
 612                     asm.pcmpeqd(vecArray, vecCmp);
 613                 }
 614                 break;
 615             default:
 616                 throw new UnsupportedOperationException();
 617         }
 618     }
 619 
 620     private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
 621         if (asm.supports(CPUFeature.AVX)) {
 622             VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc);
 623         } else {
 624             // SSE
 625             asm.pmovmskb(dst, vecSrc);
 626         }
 627     }
 628 
 629     private static boolean supportsAVX2(LIRGeneratorTool tool) {
 630         return supports(tool, CPUFeature.AVX2);
 631     }
 632 
 633     private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) {
 634         return ((AMD64) tool.target().arch).getFeatures().contains(cpuFeature);
 635     }
 636 }