
src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.amd64/src/org/graalvm/compiler/lir/amd64/AMD64ArrayIndexOfOp.java

rev 52509 : [mq]: graal

@@ -22,14 +22,16 @@
  */
 
 
 package org.graalvm.compiler.lir.amd64;
 
-import static jdk.vm.ci.code.ValueUtil.asRegister;
-import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
-import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
-
+import jdk.vm.ci.amd64.AMD64;
+import jdk.vm.ci.amd64.AMD64.CPUFeature;
+import jdk.vm.ci.amd64.AMD64Kind;
+import jdk.vm.ci.code.Register;
+import jdk.vm.ci.meta.JavaKind;
+import jdk.vm.ci.meta.Value;
 import org.graalvm.compiler.asm.Label;
 import org.graalvm.compiler.asm.amd64.AMD64Address;
 import org.graalvm.compiler.asm.amd64.AMD64Assembler;
 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;

@@ -41,399 +43,535 @@
 import org.graalvm.compiler.lir.LIRInstructionClass;
 import org.graalvm.compiler.lir.Opcode;
 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
 import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
 
-import jdk.vm.ci.amd64.AMD64;
-import jdk.vm.ci.amd64.AMD64.CPUFeature;
-import jdk.vm.ci.amd64.AMD64Kind;
-import jdk.vm.ci.code.Register;
-import jdk.vm.ci.meta.JavaKind;
-import jdk.vm.ci.meta.Value;
+import static jdk.vm.ci.code.ValueUtil.asRegister;
+import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
+import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
 
 /**
  * Emits code that finds the index of the first occurrence of any of up to four search values in
  * a byte or char array, or of two consecutive search values packed into a single value, and
  * returns -1 if there is no match.
  */
 @Opcode("AMD64_ARRAY_INDEX_OF")
 public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
     public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);
 
     private final JavaKind kind;
     private final int vmPageSize;
+    private final int nValues;
+    private final boolean findTwoConsecutive;
+    private final AMD64Kind vectorKind;
 
     @Def({REG}) protected Value resultValue;
-    @Alive({REG}) protected Value charArrayPtrValue;
-    @Use({REG}) protected Value charArrayLengthValue;
-    @Alive({REG}) protected Value searchCharValue;
+    @Alive({REG}) protected Value arrayPtrValue;
+    @Use({REG}) protected Value arrayLengthValue;
+    @Alive({REG}) protected Value searchValue1;
+    @Alive({REG, ILLEGAL}) protected Value searchValue2;
+    @Alive({REG, ILLEGAL}) protected Value searchValue3;
+    @Alive({REG, ILLEGAL}) protected Value searchValue4;
     @Temp({REG}) protected Value arraySlotsRemaining;
     @Temp({REG}) protected Value comparisonResult1;
     @Temp({REG}) protected Value comparisonResult2;
     @Temp({REG}) protected Value comparisonResult3;
     @Temp({REG}) protected Value comparisonResult4;
-    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal;
+    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
+    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
+    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
+    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
     @Temp({REG, ILLEGAL}) protected Value vectorArray1;
     @Temp({REG, ILLEGAL}) protected Value vectorArray2;
     @Temp({REG, ILLEGAL}) protected Value vectorArray3;
     @Temp({REG, ILLEGAL}) protected Value vectorArray4;
 
-    public AMD64ArrayIndexOfOp(
-                    JavaKind kind,
-                    int vmPageSize, LIRGeneratorTool tool,
-                    Value result,
-                    Value arrayPtr,
-                    Value arrayLength,
-                    Value searchChar) {
+    public AMD64ArrayIndexOfOp(JavaKind kind, boolean findTwoConsecutive, int vmPageSize, int maxVectorSize, LIRGeneratorTool tool, Value result, Value arrayPtr, Value arrayLength,
+                    Value... searchValues) {
         super(TYPE);
         this.kind = kind;
+        this.findTwoConsecutive = findTwoConsecutive;
         this.vmPageSize = vmPageSize;
-        assert byteMode() || charMode();
-        assert supports(tool, CPUFeature.SSSE3) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
+        assert 0 < searchValues.length && searchValues.length <= 4;
+        assert byteMode(kind) || charMode(kind);
+        assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
+        nValues = searchValues.length;
+        assert !findTwoConsecutive || nValues == 1;
         resultValue = result;
-        charArrayPtrValue = arrayPtr;
-        charArrayLengthValue = arrayLength;
-        searchCharValue = searchChar;
-
-        this.arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
-        this.comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
-        this.comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
-        this.comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
-        this.comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
-        AMD64Kind vectorKind = byteMode() ? supportsAVX2(tool) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE : supportsAVX2(tool) ? AMD64Kind.V256_WORD : AMD64Kind.V128_WORD;
-        this.vectorCompareVal = tool.newVariable(LIRKind.value(vectorKind));
-        this.vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
-        this.vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
-        this.vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
-        this.vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
+        arrayPtrValue = arrayPtr;
+        arrayLengthValue = arrayLength;
+        searchValue1 = searchValues[0];
+        searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
+        searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
+        searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
+        arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
+        comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
+        comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
+        comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
+        comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
+        vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(kind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
+                        : byteMode(kind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
+        vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
+        vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
+        vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
+        vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
+        vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
+        vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
+        vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
+        vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
     }
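+    // A minimal construction sketch (illustrative only, not part of this change): `tool` is the
+    // LIRGeneratorTool of the enclosing LIRGenerator, and arrayPtr, arrayLength and searchChar
+    // are assumed to be already-allocated Values; emitting the op into the LIR is not shown.
+    //
+    //   Value result = tool.newVariable(LIRKind.value(AMD64Kind.QWORD));
+    //   AMD64ArrayIndexOfOp op = new AMD64ArrayIndexOfOp(JavaKind.Char, false, 4096, -1, tool,
+    //                   result, arrayPtr, arrayLength, searchChar);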
 
-    private boolean byteMode() {
+    private static boolean byteMode(JavaKind kind) {
         return kind == JavaKind.Byte;
     }
 
-    private boolean charMode() {
+    private static boolean charMode(JavaKind kind) {
         return kind == JavaKind.Char;
     }
 
     @Override
     public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
-        Register arrayPtr = asRegister(charArrayPtrValue);
-        Register arrayLength = asRegister(charArrayLengthValue);
-        Register searchValue = asRegister(searchCharValue);
+        Register arrayPtr = asRegister(arrayPtrValue);
+        Register arrayLength = asRegister(arrayLengthValue);
         Register result = asRegister(resultValue);
-        Register vecCmp = asRegister(vectorCompareVal);
-        Register vecArray1 = asRegister(vectorArray1);
-        Register vecArray2 = asRegister(vectorArray2);
-        Register vecArray3 = asRegister(vectorArray3);
-        Register vecArray4 = asRegister(vectorArray4);
         Register slotsRemaining = asRegister(arraySlotsRemaining);
-        Register cmpResult1 = asRegister(comparisonResult1);
-        Register cmpResult2 = asRegister(comparisonResult2);
-        Register cmpResult3 = asRegister(comparisonResult3);
-        Register cmpResult4 = asRegister(comparisonResult4);
-
-        Label bulkVectorLoop = new Label();
-        Label singleVectorLoop = new Label();
-        Label vectorFound1 = new Label();
-        Label vectorFound2 = new Label();
-        Label vectorFound3 = new Label();
-        Label vectorFound4 = new Label();
-        Label lessThanVectorSizeRemaining = new Label();
-        Label lessThanVectorSizeRemainingLoop = new Label();
+        Register[] searchValue = {
+                        nValues > 0 ? asRegister(searchValue1) : null,
+                        nValues > 1 ? asRegister(searchValue2) : null,
+                        nValues > 2 ? asRegister(searchValue3) : null,
+                        nValues > 3 ? asRegister(searchValue4) : null,
+        };
+        Register[] vecCmp = {
+                        nValues > 0 ? asRegister(vectorCompareVal1) : null,
+                        nValues > 1 ? asRegister(vectorCompareVal2) : null,
+                        nValues > 2 ? asRegister(vectorCompareVal3) : null,
+                        nValues > 3 ? asRegister(vectorCompareVal4) : null,
+        };
+        Register[] vecArray = {
+                        asRegister(vectorArray1),
+                        asRegister(vectorArray2),
+                        asRegister(vectorArray3),
+                        asRegister(vectorArray4),
+        };
+        Register[] cmpResult = {
+                        asRegister(comparisonResult1),
+                        asRegister(comparisonResult2),
+                        asRegister(comparisonResult3),
+                        asRegister(comparisonResult4),
+        };
         Label retFound = new Label();
         Label retNotFound = new Label();
         Label end = new Label();
 
-        AVXKind.AVXSize vectorSize = asm.supports(CPUFeature.AVX2) ? AVXKind.AVXSize.YMM : AVXKind.AVXSize.XMM;
-        int nVectors = 4;
-        int bytesPerVector = vectorSize.getBytes();
-        int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
-        int bulkSize = arraySlotsPerVector * nVectors;
-        int bulkSizeBytes = bytesPerVector * nVectors;
-        assert bulkSizeBytes >= 64;
+        AVXKind.AVXSize vectorSize = AVXKind.getDataSize(vectorKind);
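+        // each search value occupies one comparison vector and one result register, so fewer
+        // array vectors are processed per bulk iteration as the number of search values grows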
+        int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
 
         // load array length
-        // important: this must be the first register manipulation, since charArrayLengthValue is
+        // important: this must be the first register manipulation, since arrayLengthValue is
         // annotated with @Use
         asm.movl(slotsRemaining, arrayLength);
-        // move search value to vector
+        // load array pointer
+        asm.movq(result, arrayPtr);
+        // move search values to vectors
+        for (int i = 0; i < nValues; i++) {
         if (asm.supports(CPUFeature.AVX)) {
-            VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp, searchValue);
+                VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp[i], searchValue[i]);
         } else {
-            asm.movdl(vecCmp, searchValue);
+                asm.movdl(vecCmp[i], searchValue[i]);
+            }
         }
-        // load array pointer
-        asm.movq(result, arrayPtr);
-        // load copy of low part of array pointer
-        asm.movl(cmpResult1, arrayPtr);
         // fill the comparison vectors with copies of the search values
-        emitBroadcast(asm, vecCmp, vecArray1, vectorSize);
+        for (int i = 0; i < nValues; i++) {
+            emitBroadcast(asm, findTwoConsecutive ? (byteMode(kind) ? JavaKind.Char : JavaKind.Int) : kind, vecCmp[i], vecArray[0], vectorSize);
+        }
+
+        emitArrayIndexOfChars(crb, asm, kind, vectorSize, result, slotsRemaining, searchValue, vecCmp, vecArray, cmpResult, retFound, retNotFound, vmPageSize, nValues, nVectors, findTwoConsecutive);
+
+        // return -1 (no match)
+        asm.bind(retNotFound);
+        asm.movq(result, -1);
+        asm.jmpb(end);
+
+        asm.bind(retFound);
+        // convert array pointer to offset
+        asm.subq(result, arrayPtr);
+        if (charMode(kind)) {
+            asm.shrq(result, 1);
+        }
+        asm.bind(end);
+    }
+
+    private static void emitArrayIndexOfChars(CompilationResultBuilder crb, AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize,
+                    Register arrayPtr,
+                    Register slotsRemaining,
+                    Register[] searchValue,
+                    Register[] vecCmp,
+                    Register[] vecArray,
+                    Register[] cmpResult,
+                    Label retFound,
+                    Label retNotFound,
+                    int vmPageSize,
+                    int nValues,
+                    int nVectors,
+                    boolean findTwoCharPrefix) {
+        Label bulkVectorLoop = new Label();
+        Label singleVectorLoop = new Label();
+        Label[] vectorFound = {
+                        new Label(),
+                        new Label(),
+                        new Label(),
+                        new Label(),
+        };
+        Label lessThanVectorSizeRemaining = new Label();
+        Label lessThanVectorSizeRemainingLoop = new Label();
+        Label bulkVectorLoopExit = nVectors == 1 ? lessThanVectorSizeRemaining : singleVectorLoop;
+        int bytesPerVector = vectorSize.getBytes();
+        int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
+        int singleVectorLoopCondition = arraySlotsPerVector;
+        int bulkSize = arraySlotsPerVector * nVectors;
+        int bulkSizeBytes = bytesPerVector * nVectors;
+        int bulkLoopCondition = bulkSize;
+        int[] vectorOffsets;
+        JavaKind vectorCompareKind = kind;
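+        // two-consecutive-value mode (summary of the setup below): the single search value packs
+        // two adjacent elements, each vector of the array is loaded a second time shifted by one
+        // element, and the comparison is done at the next wider element size, so that between the
+        // two result masks every possible start position of a matching pair is covered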
+        if (findTwoCharPrefix) {
+            singleVectorLoopCondition++;
+            bulkLoopCondition++;
+            bulkSize /= 2;
+            bulkSizeBytes /= 2;
+            vectorOffsets = new int[]{0, kind.getByteCount(), bytesPerVector, bytesPerVector + kind.getByteCount()};
+            vectorCompareKind = byteMode(kind) ? JavaKind.Char : JavaKind.Int;
+        } else {
+            vectorOffsets = new int[]{0, bytesPerVector, bytesPerVector * 2, bytesPerVector * 3};
+        }
+
+        // load copy of low part of array pointer
+        Register tmpArrayPtrLow = cmpResult[0];
+        asm.movl(tmpArrayPtrLow, arrayPtr);
 
         // check if bulk vector load is in bounds
-        asm.cmpl(slotsRemaining, bulkSize);
-        asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
+        asm.cmpl(slotsRemaining, bulkLoopCondition);
+        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
 
-        // check if array pointer is 64-byte aligned
-        asm.andl(cmpResult1, 63);
+        // check if array pointer is aligned to bulkSize
+        asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
         asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);
 
         // do one unaligned bulk comparison pass and adjust alignment afterwards
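+        // alignment strategy: after one unaligned bulk pass, the pointer is rounded down to a
+        // bulk-size boundary; the overlap is re-scanned (it cannot contain a match, otherwise the
+        // code would have jumped out already), which keeps the loads in the main loop aligned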
-        emitBulkCompare(asm, vectorSize, bytesPerVector, result, vecCmp, vecArray1, vecArray2, vecArray3, vecArray4, cmpResult1, cmpResult2, cmpResult3, cmpResult4,
-                        vectorFound1, vectorFound2, vectorFound3, vectorFound4, false);
+        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
         // load copy of low part of array pointer
-        asm.movl(cmpResult1, arrayPtr);
+        asm.movl(tmpArrayPtrLow, arrayPtr);
         // adjust array pointer
-        asm.addq(result, bulkSizeBytes);
+        asm.addq(arrayPtr, bulkSizeBytes);
         // adjust number of array slots remaining
         asm.subl(slotsRemaining, bulkSize);
-        // get offset to 64-byte alignment
-        asm.andl(cmpResult1, 63);
-        emitBytesToArraySlots(asm, cmpResult1);
-        // adjust array pointer to 64-byte alignment
-        asm.andq(result, ~63);
+        // get offset to bulk size alignment
+        asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
+        emitBytesToArraySlots(asm, kind, tmpArrayPtrLow);
+        // adjust array pointer to bulk size alignment
+        asm.andq(arrayPtr, ~(bulkSizeBytes - 1));
         // adjust number of array slots remaining
-        asm.addl(slotsRemaining, cmpResult1);
+        asm.addl(slotsRemaining, tmpArrayPtrLow);
         // check if there are enough array slots remaining for the bulk loop
-        asm.cmpl(slotsRemaining, bulkSize);
-        asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
+        asm.cmpl(slotsRemaining, bulkLoopCondition);
+        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
 
         emitAlign(crb, asm);
         asm.bind(bulkVectorLoop);
         // memory-aligned bulk comparison
-        emitBulkCompare(asm, vectorSize, bytesPerVector, result, vecCmp, vecArray1, vecArray2, vecArray3, vecArray4, cmpResult1, cmpResult2, cmpResult3, cmpResult4,
-                        vectorFound1, vectorFound2, vectorFound3, vectorFound4, true);
+        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, !findTwoCharPrefix);
         // adjust number of array slots remaining
         asm.subl(slotsRemaining, bulkSize);
         // adjust array pointer
-        asm.addq(result, bulkSizeBytes);
+        asm.addq(arrayPtr, bulkSizeBytes);
         // check if there are enough array slots remaining for the bulk loop
-        asm.cmpl(slotsRemaining, bulkSize);
-        asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
+        asm.cmpl(slotsRemaining, bulkLoopCondition);
+        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
         // continue loop
-        asm.jmpb(bulkVectorLoop);
+        asm.jmp(bulkVectorLoop);
 
+        if (nVectors > 1) {
         emitAlign(crb, asm);
         // same loop as bulkVectorLoop, with only one vector
         asm.bind(singleVectorLoop);
         // check if single vector load is in bounds
-        asm.cmpl(slotsRemaining, arraySlotsPerVector);
+            asm.cmpl(slotsRemaining, singleVectorLoopCondition);
         asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
         // compare
-        emitSingleVectorCompare(asm, vectorSize, result, vecCmp, vecArray1, cmpResult1);
-
-        // check if a match was found
-        asm.testl(cmpResult1, cmpResult1);
-        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound1);
+            emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoCharPrefix ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
         // adjust number of array slots remaining
         asm.subl(slotsRemaining, arraySlotsPerVector);
         // adjust array pointer
-        asm.addq(result, bytesPerVector);
+            asm.addq(arrayPtr, bytesPerVector);
         // continue loop
         asm.jmpb(singleVectorLoop);
+        }
 
         asm.bind(lessThanVectorSizeRemaining);
         // check if any array slots remain
         asm.testl(slotsRemaining, slotsRemaining);
         asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
 
         // a vector compare will read out of bounds of the input array.
         // check if the out-of-bounds read would cross a memory page boundary.
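+        // (e.g. with a 4096-byte page and a 32-byte vector load, the over-reading load is only
+        // done if (ptr & 4095) <= 4064, so it cannot touch the following page)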
         // load copy of low part of array pointer
-        asm.movl(cmpResult1, result);
+        asm.movl(tmpArrayPtrLow, arrayPtr);
         // check if pointer + vector size would cross the page boundary
-        asm.andl(cmpResult1, (vmPageSize - 1));
-        asm.cmpl(cmpResult1, (vmPageSize - bytesPerVector));
+        asm.andl(tmpArrayPtrLow, (vmPageSize - 1));
+        asm.cmpl(tmpArrayPtrLow, (vmPageSize - (findTwoCharPrefix ? bytesPerVector + kind.getByteCount() : bytesPerVector)));
         // if the page boundary would be crossed, do byte/character-wise comparison instead.
         asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);
+
+        Label[] overBoundsMatch = {new Label(), new Label()};
         // otherwise, do a vector compare that reads beyond array bounds
-        emitSingleVectorCompare(asm, vectorSize, result, vecCmp, vecArray1, cmpResult1);
-        // check if a match was found
-        asm.testl(cmpResult1, cmpResult1);
-        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
+        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoCharPrefix ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, overBoundsMatch, false);
+        // no match
+        asm.jmp(retNotFound);
+        if (findTwoCharPrefix) {
+            Label overBoundsFinish = new Label();
+            asm.bind(overBoundsMatch[1]);
+            // get match offset of second result
+            asm.bsfq(cmpResult[1], cmpResult[1]);
+            asm.addl(cmpResult[1], kind.getByteCount());
+            // replace first result with second and continue
+            asm.movl(cmpResult[0], cmpResult[1]);
+            asm.jmpb(overBoundsFinish);
+
+            asm.bind(overBoundsMatch[0]);
+            emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, overBoundsFinish);
+        } else {
+            asm.bind(overBoundsMatch[0]);
         // find match offset
-        asm.bsfq(cmpResult1, cmpResult1);
-        if (charMode()) {
-            // convert number of remaining characters to bytes
-            asm.shll(slotsRemaining, 1);
+            asm.bsfq(cmpResult[0], cmpResult[0]);
         }
+
         // adjust array pointer for match result
-        asm.addq(result, cmpResult1);
+        asm.addq(arrayPtr, cmpResult[0]);
+        if (charMode(kind)) {
+            // convert byte offset to chars
+            asm.shrl(cmpResult[0], 1);
+        }
         // check if the offset of the matched value is greater than the number of array slots
         // remaining, i.e. out of array bounds
-        asm.cmpl(cmpResult1, slotsRemaining);
+        if (findTwoCharPrefix) {
+            asm.decrementl(slotsRemaining);
+        }
+        asm.cmpl(cmpResult[0], slotsRemaining);
         // match is out of bounds, return no match
         asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
+        // adjust number of array slots remaining
+        if (findTwoCharPrefix) {
+            asm.incrementl(slotsRemaining, 1);
+        }
+        asm.subl(slotsRemaining, cmpResult[0]);
         // match is in bounds, return offset
-        asm.jmpb(retFound);
+        asm.jmp(retFound);
 
         // compare remaining slots in the array one-by-one
         asm.bind(lessThanVectorSizeRemainingLoop);
-        // check if any array slots remain
-        asm.testl(slotsRemaining, slotsRemaining);
-        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
+        // check if enough array slots remain
+        asm.cmpl(slotsRemaining, findTwoCharPrefix ? 1 : 0);
+        asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, retNotFound);
         // load char / byte
-        AMD64Assembler.OperandSize operandSize = byteMode() ? AMD64Assembler.OperandSize.BYTE : AMD64Assembler.OperandSize.WORD;
-        if (byteMode()) {
-            AMD64Assembler.AMD64RMOp.MOVB.emit(asm, operandSize, cmpResult1, new AMD64Address(result));
+        if (byteMode(kind)) {
+            if (findTwoCharPrefix) {
+                asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
+            } else {
+                asm.movzbl(cmpResult[0], new AMD64Address(arrayPtr));
+            }
         } else {
-            AMD64Assembler.AMD64RMOp.MOV.emit(asm, operandSize, cmpResult1, new AMD64Address(result));
+            if (findTwoCharPrefix) {
+                asm.movl(cmpResult[0], new AMD64Address(arrayPtr));
+            } else {
+                asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
+            }
         }
         // check for match
-        AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(operandSize).emit(asm, operandSize, cmpResult1, searchValue);
+        for (int i = 0; i < nValues; i++) {
+            asm.cmpl(cmpResult[0], searchValue[i]);
         asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
+        }
         // adjust number of array slots remaining
         asm.decrementl(slotsRemaining);
         // adjust array pointer
-        asm.addq(result, kind.getByteCount());
+        asm.addq(arrayPtr, kind.getByteCount());
         // continue loop
         asm.jmpb(lessThanVectorSizeRemainingLoop);
 
-        // return -1 (no match)
-        asm.bind(retNotFound);
-        asm.movl(result, -1);
-        asm.jmpb(end);
-
-        emitVectorFoundWithOffset(asm, bytesPerVector, result, cmpResult2, vectorFound2, retFound);
-        emitVectorFoundWithOffset(asm, bytesPerVector * 2, result, cmpResult3, vectorFound3, retFound);
-        emitVectorFoundWithOffset(asm, bytesPerVector * 3, result, cmpResult4, vectorFound4, retFound);
+        for (int i = 1; i < nVectors; i += (findTwoCharPrefix ? 2 : 1)) {
+            emitVectorFoundWithOffset(asm, kind, vectorOffsets[i], arrayPtr, cmpResult[i], slotsRemaining, vectorFound[i], retFound);
+        }
 
-        asm.bind(vectorFound1);
+        if (findTwoCharPrefix) {
+            asm.bind(vectorFound[2]);
+            asm.addq(arrayPtr, vectorOffsets[2]);
+            // adjust number of array slots remaining
+            asm.subl(slotsRemaining, charMode(kind) ? vectorOffsets[2] / 2 : vectorOffsets[2]);
+            asm.movl(cmpResult[0], cmpResult[2]);
+            asm.movl(cmpResult[1], cmpResult[3]);
+            asm.bind(vectorFound[0]);
+            emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, new Label());
+        } else {
+            asm.bind(vectorFound[0]);
         // find index of first set bit in bit mask
-        asm.bsfq(cmpResult1, cmpResult1);
+            asm.bsfq(cmpResult[0], cmpResult[0]);
+        }
         // add offset to array pointer
-        asm.addq(result, cmpResult1);
+        asm.addq(arrayPtr, cmpResult[0]);
+        if (charMode(kind)) {
+            // convert byte offset to chars
+            asm.shrl(cmpResult[0], 1);
+        }
+        // adjust number of array slots remaining
+        asm.subl(slotsRemaining, cmpResult[0]);
+        asm.jmpb(retFound);
+    }
 
-        asm.bind(retFound);
-        // convert array pointer to offset
-        asm.subq(result, arrayPtr);
-        emitBytesToArraySlots(asm, result);
-        asm.bind(end);
+    private static void emitFindTwoCharPrefixMinResult(AMD64MacroAssembler asm, JavaKind kind, Register[] cmpResult, Label done) {
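+        // computes min(bsf(cmpResult[0]), bsf(cmpResult[1]) + byteCount) into cmpResult[0];
+        // e.g. for chars, a first hit at byte 6 in mask 0 and at byte 2 in mask 1 yields
+        // min(6, 2 + 2) = 4, i.e. the shifted load found the earlier matching pair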
+        // find match offset
+        asm.bsfq(cmpResult[0], cmpResult[0]);
+        // check if second result is also a match
+        asm.testl(cmpResult[1], cmpResult[1]);
+        asm.jcc(AMD64Assembler.ConditionFlag.Zero, done);
+        // get match offset of second result
+        asm.bsfq(cmpResult[1], cmpResult[1]);
+        asm.addl(cmpResult[1], kind.getByteCount());
+        // check if the first result is less than or equal to the second
+        asm.cmpl(cmpResult[0], cmpResult[1]);
+        asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, done);
+        // first result is greater than second, replace it with the second result
+        asm.movl(cmpResult[0], cmpResult[1]);
+        asm.bind(done);
     }
 
     private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
         asm.align(crb.target.wordSize * 2);
     }
 
     /**
-     * Fills {@code vecDst} with copies of its lowest byte or word.
+     * Fills {@code vecDst} with copies of its lowest byte, word or dword.
      */
-    private void emitBroadcast(AMD64MacroAssembler asm, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
+    private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
+        switch (kind) {
+            case Byte:
         if (asm.supports(CPUFeature.AVX2)) {
-            if (byteMode()) {
                 VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
-            } else {
-                VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
-            }
         } else if (asm.supports(CPUFeature.AVX)) {
-            if (byteMode()) {
-                // fill vecTmp with zeroes
                 VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
-                // broadcast loaded search value
                 VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
-            } else {
-                // fill low qword
-                VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
-                // copy low qword to high qword
-                VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
-            }
-        } else {
-            // SSE version
-            if (byteMode()) {
-                // fill vecTmp with zeroes
+                } else if (asm.supports(CPUFeature.SSSE3)) {
                 asm.pxor(vecTmp, vecTmp);
-                // broadcast loaded search value
                 asm.pshufb(vecDst, vecTmp);
-            } else {
-                // fill low qword
+                } else { // SSE2
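+                    // without SSSE3's PSHUFB, interleave the low byte with itself twice to fill
+                    // the low dword with copies, then replicate that dword across the register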
+                    asm.punpcklbw(vecDst, vecDst);
+                    asm.punpcklbw(vecDst, vecDst);
+                    asm.pshufd(vecDst, vecDst, 0);
+                }
+                break;
+            case Short:
+            case Char:
+                if (asm.supports(CPUFeature.AVX2)) {
+                    VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
+                } else if (asm.supports(CPUFeature.AVX)) {
+                    VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
+                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
+                } else { // SSE
                 asm.pshuflw(vecDst, vecDst, 0);
-                // copy low qword to high qword
                 asm.pshufd(vecDst, vecDst, 0);
             }
+                break;
+            case Int:
+                if (asm.supports(CPUFeature.AVX2)) {
+                    VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
+                } else if (asm.supports(CPUFeature.AVX)) {
+                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
+                } else { // SSE
+                    asm.pshufd(vecDst, vecDst, 0);
         }
+                break;
+            default:
+                throw new UnsupportedOperationException();
     }
-
-    /**
-     * Loads {@code vectorSize} bytes from the position pointed to by {@code arrayPtr} and compares
-     * them to the search value stored in {@code vecCmp}. {@code vecArray} is overwritten by this
-     * operation. The comparison result is stored in {@code cmpResult}.
-     */
-    private void emitSingleVectorCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize,
-                    Register arrayPtr, Register vecCmp, Register vecArray, Register cmpResult) {
-        // load array contents into vector
-        emitArrayLoad(asm, vectorSize, vecArray, arrayPtr, 0, false);
-        // compare all loaded bytes to the search value.
-        emitVectorCompare(asm, vectorSize, vecArray, vecCmp);
-        // create a 32-bit-mask from the most significant bit of every byte in the comparison
-        // result.
-        emitMOVMSK(asm, vectorSize, cmpResult, vecArray);
     }
 
     /**
      * Convert a byte offset stored in {@code bytes} to an array index offset.
      */
-    private void emitBytesToArraySlots(AMD64MacroAssembler asm, Register bytes) {
-        if (charMode()) {
+    private static void emitBytesToArraySlots(AMD64MacroAssembler asm, JavaKind kind, Register bytes) {
+        if (charMode(kind)) {
             asm.shrl(bytes, 1);
         } else {
-            assert byteMode();
+            assert byteMode(kind);
         }
     }
 
-    private void emitBulkCompare(AMD64MacroAssembler asm,
+    private static void emitVectorCompare(AMD64MacroAssembler asm,
+                    JavaKind kind,
                     AVXKind.AVXSize vectorSize,
-                    int bytesPerVector,
+                    int nValues,
+                    int nVectors,
+                    int[] vectorOffsets,
                     Register arrayPtr,
-                    Register vecCmp,
-                    Register vecArray1,
-                    Register vecArray2,
-                    Register vecArray3,
-                    Register vecArray4,
-                    Register cmpResult1,
-                    Register cmpResult2,
-                    Register cmpResult3,
-                    Register cmpResult4,
-                    Label vectorFound1,
-                    Label vectorFound2,
-                    Label vectorFound3,
-                    Label vectorFound4,
+                    Register[] vecCmp,
+                    Register[] vecArray,
+                    Register[] cmpResult,
+                    Label[] vectorFound,
                     boolean alignedLoad) {
         // load array contents into vectors
-        emitArrayLoad(asm, vectorSize, vecArray1, arrayPtr, 0, alignedLoad);
-        emitArrayLoad(asm, vectorSize, vecArray2, arrayPtr, bytesPerVector, alignedLoad);
-        emitArrayLoad(asm, vectorSize, vecArray3, arrayPtr, bytesPerVector * 2, alignedLoad);
-        emitArrayLoad(asm, vectorSize, vecArray4, arrayPtr, bytesPerVector * 3, alignedLoad);
+        for (int i = 0; i < nValues; i++) {
+            for (int j = 0; j < nVectors; j++) {
+                emitArrayLoad(asm, vectorSize, vecArray[(i * nVectors) + j], arrayPtr, vectorOffsets[j], alignedLoad);
+            }
+        }
         // compare all loaded bytes to the search values.
         // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
-        emitVectorCompare(asm, vectorSize, vecArray1, vecCmp);
-        emitVectorCompare(asm, vectorSize, vecArray2, vecCmp);
-        emitVectorCompare(asm, vectorSize, vecArray3, vecCmp);
-        emitVectorCompare(asm, vectorSize, vecArray4, vecCmp);
+        for (int i = 0; i < nValues; i++) {
+            for (int j = 0; j < nVectors; j++) {
+                emitVectorCompareInst(asm, kind, vectorSize, vecArray[(i * nVectors) + j], vecCmp[i]);
+            }
+        }
         // create 32-bit-masks from the most significant bit of every byte in the comparison
         // results.
-        emitMOVMSK(asm, vectorSize, cmpResult1, vecArray1);
-        emitMOVMSK(asm, vectorSize, cmpResult2, vecArray2);
-        emitMOVMSK(asm, vectorSize, cmpResult3, vecArray3);
-        emitMOVMSK(asm, vectorSize, cmpResult4, vecArray4);
+        for (int i = 0; i < nValues * nVectors; i++) {
+            emitMOVMSK(asm, vectorSize, cmpResult[i], vecArray[i]);
+        }
+        // join results of comparisons against multiple values
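+        // (e.g. with nValues == 4 and nVectors == 1: cmpResult[0] |= cmpResult[1],
+        // cmpResult[2] |= cmpResult[3], then cmpResult[0] |= cmpResult[2])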
+        for (int stride = 1; stride < nValues; stride *= 2) {
+            for (int i = 0; i < nVectors; i++) {
+                for (int j = 0; j + stride < nValues; j += stride * 2) {
+                    asm.orl(cmpResult[i + (j * nVectors)], cmpResult[i + ((j + stride) * nVectors)]);
+                }
+            }
+        }
         // check if a match was found
-        asm.testl(cmpResult1, cmpResult1);
-        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound1);
-        asm.testl(cmpResult2, cmpResult2);
-        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound2);
-        asm.testl(cmpResult3, cmpResult3);
-        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound3);
-        asm.testl(cmpResult4, cmpResult4);
-        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound4);
+        for (int i = 0; i < nVectors; i++) {
+            asm.testl(cmpResult[i], cmpResult[i]);
+            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound[i]);
+        }
     }
 
-    private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm, int resultOffset, Register result, Register cmpResult, Label entry, Label ret) {
+    private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm,
+                    JavaKind kind,
+                    int resultOffset,
+                    Register result,
+                    Register cmpResult,
+                    Register slotsRemaining,
+                    Label entry,
+                    Label ret) {
         asm.bind(entry);
         if (resultOffset > 0) {
             // adjust array pointer
             asm.addq(result, resultOffset);
+            // adjust number of array slots remaining
+            asm.subl(slotsRemaining, charMode(kind) ? resultOffset / 2 : resultOffset);
         }
         // find index of first set bit in bit mask
         asm.bsfq(cmpResult, cmpResult);
         // add offset to array pointer
         asm.addq(result, cmpResult);
+        if (charMode(kind)) {
+            // convert byte offset to chars
+            asm.shrl(cmpResult, 1);
+        }
+        // adjust number of array slots remaining
+        asm.subl(slotsRemaining, cmpResult);
         asm.jmpb(ret);
     }
 
     private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
         AMD64Address src = new AMD64Address(arrayPtr, offset);

@@ -444,26 +582,40 @@
             // SSE
             asm.movdqu(vecDst, src);
         }
     }
 
-    private void emitVectorCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
-        // compare all loaded bytes to the search value.
-        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
+    /**
+     * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching values
+     * are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero.
+     */
+    private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
+        switch (kind) {
+            case Byte:
         if (asm.supports(CPUFeature.AVX)) {
-            if (byteMode()) {
                 VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
-            } else {
-                VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
-            }
-        } else {
-            // SSE
-            if (byteMode()) {
+                } else { // SSE
                 asm.pcmpeqb(vecArray, vecCmp);
-            } else {
+                }
+                break;
+            case Short:
+            case Char:
+                if (asm.supports(CPUFeature.AVX)) {
+                    VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
+                } else { // SSE
                 asm.pcmpeqw(vecArray, vecCmp);
             }
+                break;
+            case Int:
+                if (asm.supports(CPUFeature.AVX)) {
+                    VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
+                } else { // SSE
+                    asm.pcmpeqd(vecArray, vecCmp);
+                }
+                break;
+            default:
+                throw new UnsupportedOperationException();
         }
     }
 
     private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
         if (asm.supports(CPUFeature.AVX)) {