/*
 * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static jdk.vm.ci.code.ValueUtil.isRegister;
import static jdk.vm.ci.code.ValueUtil.isStackSlot;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.CONST;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.STACK;

import java.util.Objects;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64RMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.lir.ConstantValue;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.JavaConstant;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;

/**
 * Returns the index of the first occurrence of any of up to four search values in a byte or char
 * array, or -1 if none of them occurs in the searched range. If {@code findTwoConsecutive} is set,
 * the op instead searches for two consecutive matching elements (both packed into the single
 * search value) and returns the index of the first element of the pair. The searched range is
 * compared vector-wise with SSE2/AVX instructions; ranges shorter than one vector are compared
 * element by element.
 */
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);

    private final JavaKind valueKind;
    private final int nValues;
    private final boolean findTwoConsecutive;
    private final AMD64Kind vectorKind;
    private final int arrayBaseOffset;
    private final Scale arrayIndexScale;

    @Def({REG}) protected Value resultValue;
    @Alive({REG}) protected Value arrayPtrValue;
    @Alive({REG}) protected Value arrayLengthValue;
    @Use({REG}) protected Value fromIndexValue;
    @Alive({REG, STACK, CONST}) protected Value searchValue1;
    @Alive({REG, STACK, CONST, ILLEGAL}) protected Value searchValue2;
    @Alive({REG, STACK, CONST, ILLEGAL}) protected Value searchValue3;
    @Alive({REG, STACK, CONST, ILLEGAL}) protected Value searchValue4;
    @Temp({REG}) protected Value comparisonResult1;
    @Temp({REG, ILLEGAL}) protected Value comparisonResult2;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
    @Temp({REG, ILLEGAL}) protected Value vectorArray1;
    @Temp({REG, ILLEGAL}) protected Value vectorArray2;
    @Temp({REG, ILLEGAL}) protected Value vectorArray3;
    @Temp({REG, ILLEGAL}) protected Value vectorArray4;

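    /**
     * @param arrayKind kind of the array's elements, used to compute the array base offset.
     * @param valueKind kind of the values to search for; must be {@link JavaKind#Byte} or
     *            {@link JavaKind#Char}.
     * @param findTwoConsecutive if {@code true}, search for two consecutive matching elements; in
     *            this mode, exactly one search value (holding both elements) is expected.
     * @param maxVectorSize maximum vector size in bytes; 32-byte AVX2 vectors are used only if
     *            this is negative or at least 32.
     */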
    public AMD64ArrayIndexOfOp(JavaKind arrayKind, JavaKind valueKind, boolean findTwoConsecutive, int maxVectorSize, LIRGeneratorTool tool,
                    Value result, Value arrayPtr, Value arrayLength, Value fromIndex, Value... searchValues) {
        super(TYPE);
        this.valueKind = valueKind;
        this.arrayBaseOffset = tool.getProviders().getMetaAccess().getArrayBaseOffset(arrayKind);
        this.arrayIndexScale = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(valueKind)));
        this.findTwoConsecutive = findTwoConsecutive;
        assert 0 < searchValues.length && searchValues.length <= 4;
        assert byteMode(valueKind) || charMode(valueKind);
        assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
        nValues = searchValues.length;
        assert !findTwoConsecutive || nValues == 1;
        resultValue = result;
        arrayPtrValue = arrayPtr;
        arrayLengthValue = arrayLength;
        fromIndexValue = fromIndex;
        searchValue1 = searchValues[0];
        searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
        searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
        searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
        comparisonResult1 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        comparisonResult2 = findTwoConsecutive ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(valueKind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
                        : byteMode(valueKind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
        vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
    }

    private static boolean byteMode(JavaKind kind) {
        return kind == JavaKind.Byte;
    }

    private static boolean charMode(JavaKind kind) {
        return kind == JavaKind.Char;
    }

    private JavaKind getComparisonKind() {
        return findTwoConsecutive ? (byteMode(valueKind) ? JavaKind.Char : JavaKind.Int) : valueKind;
    }

    private AVXKind.AVXSize getVectorSize() {
        return AVXKind.getDataSize(vectorKind);
    }

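    /*
     * Overview of the emitted code:
     *
     * 1. If fewer than vectorSize elements remain from fromIndex, compare element by element and
     * jump to the result handling at "ret".
     *
     * 2. Otherwise, run one unaligned vector comparison pass, align the index downwards to the
     * vector size and enter the bulk loop, which compares nVectors vectors per iteration from
     * aligned addresses.
     *
     * 3. Compare the remaining tail with single vector passes; the last load is placed so that it
     * ends exactly at the end of the array and may therefore overlap already-checked elements.
     */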
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
        Register arrayPtr = asRegister(arrayPtrValue);
        Register arrayLength = asRegister(arrayLengthValue);
        Register fromIndex = asRegister(fromIndexValue);
        Register index = asRegister(resultValue);
        Value[] searchValue = {
                        nValues > 0 ? searchValue1 : null,
                        nValues > 1 ? searchValue2 : null,
                        nValues > 2 ? searchValue3 : null,
                        nValues > 3 ? searchValue4 : null,
        };
        Register[] vecCmp = {
                        nValues > 0 ? asRegister(vectorCompareVal1) : null,
                        nValues > 1 ? asRegister(vectorCompareVal2) : null,
                        nValues > 2 ? asRegister(vectorCompareVal3) : null,
                        nValues > 3 ? asRegister(vectorCompareVal4) : null,
        };
        Register[] vecArray = {
                        asRegister(vectorArray1),
                        asRegister(vectorArray2),
                        asRegister(vectorArray3),
                        asRegister(vectorArray4),
        };
        Register[] cmpResult = {
                        asRegister(comparisonResult1),
                        findTwoConsecutive ? asRegister(comparisonResult2) : null,
        };
        Label ret = new Label();

        Label bulkVectorLoop = new Label();
        Label singleVectorLoop = new Label();
        Label[] vectorFound = {
                        new Label(),
                        new Label(),
                        new Label(),
                        new Label(),
        };
        Label runVectorized = new Label();
        Label elementWiseLoop = new Label();
        Label elementWiseFound = new Label();
        Label elementWiseNotFound = new Label();
        Label skipBulkVectorLoop = new Label();
        int vectorSize = getVectorSize().getBytes() / valueKind.getByteCount();
        int bulkSize = vectorSize * nVectors;
        JavaKind vectorCompareKind = valueKind;
        if (findTwoConsecutive) {
            bulkSize /= 2;
            vectorCompareKind = byteMode(valueKind) ? JavaKind.Char : JavaKind.Int;
        }
        // index = fromIndex + vectorSize (+1 if findTwoConsecutive)
        // important: this must be the first register manipulation, since fromIndex is
        // annotated with @Use
        asm.leaq(index, new AMD64Address(fromIndex, vectorSize + (findTwoConsecutive ? 1 : 0)));

        // check if the vector load is in bounds
        asm.cmpq(index, arrayLength);
        asm.jccb(AMD64Assembler.ConditionFlag.LessEqual, runVectorized);

        // search range is smaller than vector size, do element-wise comparison
        // index = fromIndex (+ 1 if findTwoConsecutive)
        asm.subq(index, vectorSize);
        // check if enough array slots remain
        asm.cmpq(index, arrayLength);
        asm.jccb(AMD64Assembler.ConditionFlag.GreaterEqual, elementWiseNotFound);
        // compare one-by-one
        asm.bind(elementWiseLoop);
        // check for match
        OperandSize cmpSize = getOpSize(getComparisonKind());
        // address = findTwoConsecutive ? array[index - 1] : array[index]
        AMD64Address arrayAddr = new AMD64Address(arrayPtr, index, arrayIndexScale, arrayBaseOffset - (findTwoConsecutive ? valueKind.getByteCount() : 0));
        boolean valuesOnStack = searchValuesOnStack(searchValue);
        if (valuesOnStack) {
            (cmpSize == OperandSize.BYTE ? AMD64RMOp.MOVB : AMD64RMOp.MOV).emit(asm, cmpSize, cmpResult[0], arrayAddr);
            for (int i = 0; i < nValues; i++) {
                if (isConstant(searchValue[i])) {
                    int imm = asConstant(searchValue[i]).asInt();
                    AMD64Assembler.AMD64BinaryArithmetic.CMP.getMIOpcode(cmpSize, NumUtil.isByte(imm)).emit(asm, cmpSize, cmpResult[0], imm);
                } else if (isStackSlot(searchValue[i])) {
                    AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(cmpSize).emit(asm, cmpSize, cmpResult[0], (AMD64Address) crb.asAddress(searchValue[i]));
                } else {
                    AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(cmpSize).emit(asm, cmpSize, cmpResult[0], asRegister(searchValue[i]));
                }
                asm.jccb(AMD64Assembler.ConditionFlag.Equal, elementWiseFound);
            }
        } else {
            for (int i = 0; i < nValues; i++) {
                if (isConstant(searchValue[i])) {
                    int imm = asConstant(searchValue[i]).asInt();
                    AMD64Assembler.AMD64BinaryArithmetic.CMP.getMIOpcode(cmpSize, NumUtil.isByte(imm)).emit(asm, cmpSize, arrayAddr, imm);
                } else {
                    AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(cmpSize).emit(asm, cmpSize, asRegister(searchValue[i]), arrayAddr);
                }
                asm.jccb(AMD64Assembler.ConditionFlag.Equal, elementWiseFound);
            }
        }
        // adjust index
        asm.incrementq(index, 1);
        // continue loop
        asm.cmpq(index, arrayLength);
        asm.jccb(AMD64Assembler.ConditionFlag.Less, elementWiseLoop);

        asm.bind(elementWiseNotFound);
        asm.xorq(index, index);

        if (findTwoConsecutive) {
            asm.bind(elementWiseFound);
            asm.decrementq(index, 1);
        } else {
            asm.decrementq(index, 1);
            asm.bind(elementWiseFound);
        }
        asm.jmp(ret);

        // vectorized implementation
        asm.bind(runVectorized);

        // move search values to vectors
        for (int i = 0; i < nValues; i++) {
            // fill comparison vector with copies of the search value
            broadcastSearchValue(crb, asm, vecCmp[i], searchValue[i], cmpResult[0], vecArray[0]);
        }

        // do one unaligned vector comparison pass and adjust alignment afterwards
        emitVectorCompare(asm, vectorCompareKind, findTwoConsecutive ? 2 : 1, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, false, false);

        // adjust index to vector size alignment
        asm.leaq(cmpResult[0], new AMD64Address(arrayPtr, arrayBaseOffset));
        if (charMode(valueKind)) {
            asm.shrq(cmpResult[0], 1);
        }
        asm.addq(index, cmpResult[0]);
        // adjust to next lower multiple of vector size
        asm.andq(index, ~(vectorSize - 1));
        asm.subq(index, cmpResult[0]);
        // add bulk size
        asm.addq(index, bulkSize);

        // check if there are enough array slots remaining for the bulk loop
        asm.cmpq(index, arrayLength);
        asm.jccb(AMD64Assembler.ConditionFlag.Greater, skipBulkVectorLoop);

        emitAlign(crb, asm);
        asm.bind(bulkVectorLoop);
        // memory-aligned bulk comparison
        emitVectorCompare(asm, vectorCompareKind, nVectors, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, false, !findTwoConsecutive);
        // adjust index
        asm.addq(index, bulkSize);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpq(index, arrayLength);
        asm.jccb(AMD64Assembler.ConditionFlag.LessEqual, bulkVectorLoop);

        asm.bind(skipBulkVectorLoop);
        if ((findTwoConsecutive && nVectors == 2) || nVectors == 1) {
            // do last load from end of array
            asm.movq(index, arrayLength);
            // compare
            emitVectorCompare(asm, vectorCompareKind, findTwoConsecutive ? 2 : 1, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, true, false);
        } else {
            // remove bulk offset
            asm.subq(index, bulkSize);
            emitAlign(crb, asm);
            // same loop as bulkVectorLoop, with only one vector
            asm.bind(singleVectorLoop);
            // add vector size
            asm.addq(index, vectorSize);
            // check if vector load is in bounds
            asm.cmpq(index, arrayLength);
            // if the load would be out of bounds, move it back so that it ends at the end of
            // the array
            asm.cmovq(AMD64Assembler.ConditionFlag.Greater, index, arrayLength);
            // compare
            emitVectorCompare(asm, vectorCompareKind, findTwoConsecutive ? 2 : 1, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, true, false);
            // check if there are enough array slots remaining for the loop
            asm.cmpq(index, arrayLength);
            asm.jccb(AMD64Assembler.ConditionFlag.Less, singleVectorLoop);
        }

        asm.movl(index, -1);
        asm.jmpb(ret);

        if (findTwoConsecutive) {
            Label vectorFound2Done = new Label();

            // vectorFound[0] and vectorFound[2] behave like the single-char case
            asm.bind(vectorFound[2]);
            // subtract static offset
            asm.subq(index, getResultIndexDelta(2));
            asm.jmpb(vectorFound2Done);

            asm.bind(vectorFound[0]);
            // subtract static offset
            asm.subq(index, getResultIndexDelta(0));
            asm.bind(vectorFound2Done);
            // find offset
            asm.bsfq(cmpResult[0], cmpResult[0]);
            if (charMode(valueKind)) {
                // convert byte offset to chars
                asm.shrl(cmpResult[0], 1);
            }
            // add offset to index
            asm.addq(index, cmpResult[0]);
            asm.jmpb(ret);

            Label minResult = new Label();
            Label minResultDone = new Label();

            // in vectorFound[1] and vectorFound[3], we have to check the results 0 and 2 as well
            if (nVectors > 2) {
                asm.bind(vectorFound[3]);
                // subtract offset
                asm.subq(index, getResultIndexDelta(3));
                asm.jmpb(minResult);
            }

            asm.bind(vectorFound[1]);
            // subtract offset
            asm.subq(index, getResultIndexDelta(1));

            asm.bind(minResult);
            // find offset 0
            asm.bsfq(cmpResult[1], cmpResult[1]);
            // check if second result is also a match
            asm.testq(cmpResult[0], cmpResult[0]);
            asm.jccb(AMD64Assembler.ConditionFlag.Zero, minResultDone);
            // find offset 1
            asm.bsfq(cmpResult[0], cmpResult[0]);
            asm.addq(cmpResult[0], valueKind.getByteCount());
            // if first result is greater than second, replace it with the second result
            asm.cmpq(cmpResult[1], cmpResult[0]);
            asm.cmovq(AMD64Assembler.ConditionFlag.Greater, cmpResult[1], cmpResult[0]);
            asm.bind(minResultDone);
            if (charMode(valueKind)) {
                // convert byte offset to chars
                asm.shrl(cmpResult[1], 1);
            }
            // add offset to index
            asm.addq(index, cmpResult[1]);
        } else {
            Label end = new Label();
            for (int i = 0; i < nVectors; i++) {
                asm.bind(vectorFound[i]);
                // subtract static offset
                asm.subq(index, getResultIndexDelta(i));
                if (i < nVectors - 1) {
                    asm.jmpb(end);
                }
            }
            asm.bind(end);
            // find offset
            asm.bsfq(cmpResult[0], cmpResult[0]);
            if (charMode(valueKind)) {
                // convert byte offset to chars
                asm.shrl(cmpResult[0], 1);
            }
            // add offset to index
            asm.addq(index, cmpResult[0]);
        }
        asm.bind(ret);
    }

    private boolean searchValuesOnStack(Value[] searchValue) {
        for (int i = 0; i < nValues; i++) {
            if (isStackSlot(searchValue[i])) {
                return true;
            }
        }
        return false;
    }

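    /**
     * Number of array elements by which {@code index} is ahead of the region loaded for match
     * label {@code i} in {@link #emitVectorCompare}; label 0 corresponds to the last loaded
     * vector. In {@code findTwoConsecutive} mode, labels come in pairs whose loads are shifted
     * against each other by one element, hence the additional {@code (i & 1)} element.
     */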
    private int getResultIndexDelta(int i) {
        return (((findTwoConsecutive ? i / 2 : i) + 1) * (getVectorSize().getBytes() / valueKind.getByteCount())) + (findTwoConsecutive ? (i & 1) : 0);
    }

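    /**
     * Displacement of the vector load for match label {@code i}: the array base offset minus the
     * label's index delta, in bytes.
     */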
    private int getVectorOffset(int i) {
        return arrayBaseOffset - getResultIndexDelta(i) * valueKind.getByteCount();
    }

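    /**
     * Moves the search value into the lowest dword of {@code dst} and fills all of {@code dst}
     * with copies of it (see {@link #emitBroadcast}). Stack and constant values are first
     * materialized into {@code tmpReg}.
     */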
    private void broadcastSearchValue(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register dst, Value srcVal, Register tmpReg, Register tmpVector) {
        Register src = asRegOrTmpReg(crb, asm, srcVal, tmpReg);
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, dst, src);
        } else {
            asm.movdl(dst, src);
        }
        emitBroadcast(asm, getComparisonKind(), dst, tmpVector, getVectorSize());
    }

    private static boolean isConstant(Value val) {
        assert !(val instanceof ConstantValue) || ((ConstantValue) val).isJavaConstant();
        return val instanceof ConstantValue;
    }

    private static JavaConstant asConstant(Value val) {
        return ((ConstantValue) val).getJavaConstant();
    }

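    /**
     * Returns the register holding {@code val}, moving stack-slot and constant values into
     * {@code tmpReg} first.
     */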
    private static Register asRegOrTmpReg(CompilationResultBuilder crb, AMD64MacroAssembler asm, Value val, Register tmpReg) {
        if (isRegister(val)) {
            return asRegister(val);
        } else if (isStackSlot(val)) {
            asm.movl(tmpReg, (AMD64Address) crb.asAddress(val));
            return tmpReg;
        } else {
            assert isConstant(val);
            asm.movl(tmpReg, asConstant(val).asInt());
            return tmpReg;
        }
    }

    private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        asm.align(crb.target.wordSize * 2);
    }

    /**
     * Fills {@code vecDst} with copies of its lowest byte, word or dword.
     */
    private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
                    VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
                } else if (asm.supports(CPUFeature.SSSE3)) {
                    asm.pxor(vecTmp, vecTmp);
                    asm.pshufb(vecDst, vecTmp);
                } else { // SSE2
                    asm.punpcklbw(vecDst, vecDst);
                    asm.punpcklbw(vecDst, vecDst);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshuflw(vecDst, vecDst, 0);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }

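    /**
     * Loads {@code nVectors} vectors from below the current {@code index}, compares each of them
     * against all broadcast search values, ORs the per-value results together, extracts a match
     * bitmask with (V)PMOVMSKB and branches to the matching entry of {@code vectorFound} if the
     * mask is non-zero. In {@code findTwoConsecutive} mode, the vectors are processed in pairs
     * whose loads are shifted against each other by one array element.
     */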
    private void emitVectorCompare(AMD64MacroAssembler asm,
                    JavaKind kind,
                    int nVectors,
                    Register arrayPtr,
                    Register index,
                    Register[] vecCmp,
                    Register[] vecArray,
                    Register[] cmpResult,
                    Label[] vectorFound,
                    boolean shortJmp,
                    boolean alignedLoad) {
        // load array contents into vectors
        for (int i = 0; i < nVectors; i++) {
            int base = i * nValues;
            for (int j = 0; j < nValues; j++) {
                emitArrayLoad(asm, getVectorSize(), vecArray[base + j], arrayPtr, index, getVectorOffset(nVectors - (i + 1)), alignedLoad);
            }
        }
        // compare all loaded bytes to the search value.
        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
        if (!findTwoConsecutive) {
            for (int i = 0; i < nVectors; i++) {
                int base = i * nValues;
                for (int j = 0; j < nValues; j++) {
                    emitVectorCompareInst(asm, kind, getVectorSize(), vecArray[base + j], vecCmp[j]);
                    if ((j & 1) == 1) {
                        emitPOR(asm, getVectorSize(), vecArray[base + j - 1], vecArray[base + j]);
                    }
                }
                if (nValues > 2) {
                    emitPOR(asm, getVectorSize(), vecArray[base], vecArray[base + 2]);
                }
                emitMOVMSK(asm, getVectorSize(), cmpResult[0], vecArray[base]);
                emitJnz(asm, cmpResult[0], vectorFound[nVectors - (i + 1)], shortJmp);
            }
        } else {
            for (int i = 0; i < nVectors; i += 2) {
                emitVectorCompareInst(asm, kind, getVectorSize(), vecArray[i], vecCmp[0]);
                emitVectorCompareInst(asm, kind, getVectorSize(), vecArray[i + 1], vecCmp[0]);
                emitMOVMSK(asm, getVectorSize(), cmpResult[1], vecArray[i]);
                emitMOVMSK(asm, getVectorSize(), cmpResult[0], vecArray[i + 1]);
                emitJnz(asm, cmpResult[1], vectorFound[nVectors - (i + 1)], shortJmp);
                emitJnz(asm, cmpResult[0], vectorFound[nVectors - (i + 2)], shortJmp);
            }
        }
    }

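    /**
     * Tests {@code cond} and jumps to {@code tgt} if it is non-zero, using the short jump
     * encoding if {@code shortJmp} is set.
     */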
    private static void emitJnz(AMD64MacroAssembler asm, Register cond, Label tgt, boolean shortJmp) {
        asm.testl(cond, cond);
        if (shortJmp) {
            asm.jccb(AMD64Assembler.ConditionFlag.NotZero, tgt);
        } else {
            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, tgt);
        }
    }

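    /**
     * Loads one vector from {@code arrayPtr + index * arrayIndexScale + offset}. An aligned move
     * is emitted only on AVX with {@code alignedLoad} set; the SSE fallback always uses the
     * unaligned MOVDQU.
     */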
    private void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, Register index, int offset, boolean alignedLoad) {
        AMD64Address src = new AMD64Address(arrayPtr, index, arrayIndexScale, offset);
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp loadOp = alignedLoad ? VexMoveOp.VMOVDQA : VexMoveOp.VMOVDQU;
            loadOp.emit(asm, vectorSize, vecDst, src);
        } else {
            // SSE
            asm.movdqu(vecDst, src);
        }
    }

    /**
     * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching values
     * are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero.
     */
    private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqb(vecArray, vecCmp);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqw(vecArray, vecCmp);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqd(vecArray, vecCmp);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }

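    /**
     * Emits {@code dst |= vecSrc}, used to combine the match results of multiple search values.
     */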
    private static void emitPOR(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
        if (asm.supports(CPUFeature.AVX)) {
            VexRVMOp.VPOR.emit(asm, vectorSize, dst, dst, vecSrc);
        } else {
            // SSE
            asm.por(dst, vecSrc);
        }
    }

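    /**
     * Extracts the most significant bit of each byte in {@code vecSrc} into the general-purpose
     * register {@code dst} ((V)PMOVMSKB).
     */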
    private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
        if (asm.supports(CPUFeature.AVX)) {
            VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc);
        } else {
            // SSE
            asm.pmovmskb(dst, vecSrc);
        }
    }

    private static OperandSize getOpSize(JavaKind kind) {
        switch (kind) {
            case Byte:
                return OperandSize.BYTE;
            case Short:
            case Char:
                return OperandSize.WORD;
            case Int:
                return OperandSize.DWORD;
            default:
                return OperandSize.QWORD;
        }
    }

    private static boolean supportsAVX2(LIRGeneratorTool tool) {
        return supports(tool, CPUFeature.AVX2);
    }

    private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) {
        return ((AMD64) tool.target().arch).getFeatures().contains(cpuFeature);
    }
}