< prev index next >
src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.amd64/src/org/graalvm/compiler/lir/amd64/AMD64ArrayIndexOfOp.java
Print this page
rev 52509 : [mq]: graal2
*** 22,35 ****
*/
package org.graalvm.compiler.lir.amd64;
! import static jdk.vm.ci.code.ValueUtil.asRegister;
! import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
! import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
!
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
--- 22,37 ----
*/
package org.graalvm.compiler.lir.amd64;
! import jdk.vm.ci.amd64.AMD64;
! import jdk.vm.ci.amd64.AMD64.CPUFeature;
! import jdk.vm.ci.amd64.AMD64Kind;
! import jdk.vm.ci.code.Register;
! import jdk.vm.ci.meta.JavaKind;
! import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
*** 41,439 ****
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
! import jdk.vm.ci.amd64.AMD64;
! import jdk.vm.ci.amd64.AMD64.CPUFeature;
! import jdk.vm.ci.amd64.AMD64Kind;
! import jdk.vm.ci.code.Register;
! import jdk.vm.ci.meta.JavaKind;
! import jdk.vm.ci.meta.Value;
/**
*/
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);
private final JavaKind kind;
private final int vmPageSize;
@Def({REG}) protected Value resultValue;
! @Alive({REG}) protected Value charArrayPtrValue;
! @Use({REG}) protected Value charArrayLengthValue;
! @Alive({REG}) protected Value searchCharValue;
@Temp({REG}) protected Value arraySlotsRemaining;
@Temp({REG}) protected Value comparisonResult1;
@Temp({REG}) protected Value comparisonResult2;
@Temp({REG}) protected Value comparisonResult3;
@Temp({REG}) protected Value comparisonResult4;
! @Temp({REG, ILLEGAL}) protected Value vectorCompareVal;
@Temp({REG, ILLEGAL}) protected Value vectorArray1;
@Temp({REG, ILLEGAL}) protected Value vectorArray2;
@Temp({REG, ILLEGAL}) protected Value vectorArray3;
@Temp({REG, ILLEGAL}) protected Value vectorArray4;
! public AMD64ArrayIndexOfOp(
! JavaKind kind,
! int vmPageSize, LIRGeneratorTool tool,
! Value result,
! Value arrayPtr,
! Value arrayLength,
! Value searchChar) {
super(TYPE);
this.kind = kind;
this.vmPageSize = vmPageSize;
! assert byteMode() || charMode();
! assert supports(tool, CPUFeature.SSSE3) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
resultValue = result;
! charArrayPtrValue = arrayPtr;
! charArrayLengthValue = arrayLength;
! searchCharValue = searchChar;
!
! this.arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
! this.comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
! this.comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
! this.comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
! this.comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
! AMD64Kind vectorKind = byteMode() ? supportsAVX2(tool) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE : supportsAVX2(tool) ? AMD64Kind.V256_WORD : AMD64Kind.V128_WORD;
! this.vectorCompareVal = tool.newVariable(LIRKind.value(vectorKind));
! this.vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
! this.vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
! this.vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
! this.vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
}
! private boolean byteMode() {
return kind == JavaKind.Byte;
}
! private boolean charMode() {
return kind == JavaKind.Char;
}
@Override
public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
! Register arrayPtr = asRegister(charArrayPtrValue);
! Register arrayLength = asRegister(charArrayLengthValue);
! Register searchValue = asRegister(searchCharValue);
Register result = asRegister(resultValue);
- Register vecCmp = asRegister(vectorCompareVal);
- Register vecArray1 = asRegister(vectorArray1);
- Register vecArray2 = asRegister(vectorArray2);
- Register vecArray3 = asRegister(vectorArray3);
- Register vecArray4 = asRegister(vectorArray4);
Register slotsRemaining = asRegister(arraySlotsRemaining);
! Register cmpResult1 = asRegister(comparisonResult1);
! Register cmpResult2 = asRegister(comparisonResult2);
! Register cmpResult3 = asRegister(comparisonResult3);
! Register cmpResult4 = asRegister(comparisonResult4);
!
! Label bulkVectorLoop = new Label();
! Label singleVectorLoop = new Label();
! Label vectorFound1 = new Label();
! Label vectorFound2 = new Label();
! Label vectorFound3 = new Label();
! Label vectorFound4 = new Label();
! Label lessThanVectorSizeRemaining = new Label();
! Label lessThanVectorSizeRemainingLoop = new Label();
Label retFound = new Label();
Label retNotFound = new Label();
Label end = new Label();
! AVXKind.AVXSize vectorSize = asm.supports(CPUFeature.AVX2) ? AVXKind.AVXSize.YMM : AVXKind.AVXSize.XMM;
! int nVectors = 4;
! int bytesPerVector = vectorSize.getBytes();
! int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
! int bulkSize = arraySlotsPerVector * nVectors;
! int bulkSizeBytes = bytesPerVector * nVectors;
! assert bulkSizeBytes >= 64;
// load array length
! // important: this must be the first register manipulation, since charArrayLengthValue is
// annotated with @Use
asm.movl(slotsRemaining, arrayLength);
! // move search value to vector
if (asm.supports(CPUFeature.AVX)) {
! VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp, searchValue);
} else {
! asm.movdl(vecCmp, searchValue);
}
- // load array pointer
- asm.movq(result, arrayPtr);
- // load copy of low part of array pointer
- asm.movl(cmpResult1, arrayPtr);
// fill comparison vector with copies of the search value
! emitBroadcast(asm, vecCmp, vecArray1, vectorSize);
// check if bulk vector load is in bounds
! asm.cmpl(slotsRemaining, bulkSize);
! asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
! // check if array pointer is 64-byte aligned
! asm.andl(cmpResult1, 63);
asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);
// do one unaligned bulk comparison pass and adjust alignment afterwards
! emitBulkCompare(asm, vectorSize, bytesPerVector, result, vecCmp, vecArray1, vecArray2, vecArray3, vecArray4, cmpResult1, cmpResult2, cmpResult3, cmpResult4,
! vectorFound1, vectorFound2, vectorFound3, vectorFound4, false);
// load copy of low part of array pointer
! asm.movl(cmpResult1, arrayPtr);
// adjust array pointer
! asm.addq(result, bulkSizeBytes);
// adjust number of array slots remaining
asm.subl(slotsRemaining, bulkSize);
! // get offset to 64-byte alignment
! asm.andl(cmpResult1, 63);
! emitBytesToArraySlots(asm, cmpResult1);
! // adjust array pointer to 64-byte alignment
! asm.andq(result, ~63);
// adjust number of array slots remaining
! asm.addl(slotsRemaining, cmpResult1);
// check if there are enough array slots remaining for the bulk loop
! asm.cmpl(slotsRemaining, bulkSize);
! asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
emitAlign(crb, asm);
asm.bind(bulkVectorLoop);
// memory-aligned bulk comparison
! emitBulkCompare(asm, vectorSize, bytesPerVector, result, vecCmp, vecArray1, vecArray2, vecArray3, vecArray4, cmpResult1, cmpResult2, cmpResult3, cmpResult4,
! vectorFound1, vectorFound2, vectorFound3, vectorFound4, true);
// adjust number of array slots remaining
asm.subl(slotsRemaining, bulkSize);
// adjust array pointer
! asm.addq(result, bulkSizeBytes);
// check if there are enough array slots remaining for the bulk loop
! asm.cmpl(slotsRemaining, bulkSize);
! asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
// continue loop
! asm.jmpb(bulkVectorLoop);
emitAlign(crb, asm);
// same loop as bulkVectorLoop, with only one vector
asm.bind(singleVectorLoop);
// check if single vector load is in bounds
! asm.cmpl(slotsRemaining, arraySlotsPerVector);
asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
// compare
! emitSingleVectorCompare(asm, vectorSize, result, vecCmp, vecArray1, cmpResult1);
!
! // check if a match was found
! asm.testl(cmpResult1, cmpResult1);
! asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound1);
// adjust number of array slots remaining
asm.subl(slotsRemaining, arraySlotsPerVector);
// adjust array pointer
! asm.addq(result, bytesPerVector);
// continue loop
asm.jmpb(singleVectorLoop);
asm.bind(lessThanVectorSizeRemaining);
// check if any array slots remain
asm.testl(slotsRemaining, slotsRemaining);
asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
// a vector compare will read out of bounds of the input array.
// check if the out-of-bounds read would cross a memory page boundary.
// load copy of low part of array pointer
! asm.movl(cmpResult1, result);
// check if pointer + vector size would cross the page boundary
! asm.andl(cmpResult1, (vmPageSize - 1));
! asm.cmpl(cmpResult1, (vmPageSize - bytesPerVector));
// if the page boundary would be crossed, do byte/character-wise comparison instead.
asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);
// otherwise, do a vector compare that reads beyond array bounds
! emitSingleVectorCompare(asm, vectorSize, result, vecCmp, vecArray1, cmpResult1);
! // check if a match was found
! asm.testl(cmpResult1, cmpResult1);
! asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
// find match offset
! asm.bsfq(cmpResult1, cmpResult1);
! if (charMode()) {
! // convert number of remaining characters to bytes
! asm.shll(slotsRemaining, 1);
}
// adjust array pointer for match result
! asm.addq(result, cmpResult1);
// check if offset of matched value is greater than number of bytes remaining / out of array
// bounds
! asm.cmpl(cmpResult1, slotsRemaining);
// match is out of bounds, return no match
asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
// match is in bounds, return offset
! asm.jmpb(retFound);
// compare remaining slots in the array one-by-one
asm.bind(lessThanVectorSizeRemainingLoop);
! // check if any array slots remain
! asm.testl(slotsRemaining, slotsRemaining);
! asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
// load char / byte
! AMD64Assembler.OperandSize operandSize = byteMode() ? AMD64Assembler.OperandSize.BYTE : AMD64Assembler.OperandSize.WORD;
! if (byteMode()) {
! AMD64Assembler.AMD64RMOp.MOVB.emit(asm, operandSize, cmpResult1, new AMD64Address(result));
} else {
! AMD64Assembler.AMD64RMOp.MOV.emit(asm, operandSize, cmpResult1, new AMD64Address(result));
}
// check for match
! AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(operandSize).emit(asm, operandSize, cmpResult1, searchValue);
asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
// adjust number of array slots remaining
asm.decrementl(slotsRemaining);
// adjust array pointer
! asm.addq(result, kind.getByteCount());
// continue loop
asm.jmpb(lessThanVectorSizeRemainingLoop);
! // return -1 (no match)
! asm.bind(retNotFound);
! asm.movl(result, -1);
! asm.jmpb(end);
!
! emitVectorFoundWithOffset(asm, bytesPerVector, result, cmpResult2, vectorFound2, retFound);
! emitVectorFoundWithOffset(asm, bytesPerVector * 2, result, cmpResult3, vectorFound3, retFound);
! emitVectorFoundWithOffset(asm, bytesPerVector * 3, result, cmpResult4, vectorFound4, retFound);
! asm.bind(vectorFound1);
// find index of first set bit in bit mask
! asm.bsfq(cmpResult1, cmpResult1);
// add offset to array pointer
! asm.addq(result, cmpResult1);
! asm.bind(retFound);
! // convert array pointer to offset
! asm.subq(result, arrayPtr);
! emitBytesToArraySlots(asm, result);
! asm.bind(end);
}
private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
asm.align(crb.target.wordSize * 2);
}
/**
! * Fills {@code vecDst} with copies of its lowest byte or word.
*/
! private void emitBroadcast(AMD64MacroAssembler asm, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
if (asm.supports(CPUFeature.AVX2)) {
- if (byteMode()) {
VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
- } else {
- VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
- }
} else if (asm.supports(CPUFeature.AVX)) {
- if (byteMode()) {
- // fill vecTmp with zeroes
VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
- // broadcast loaded search value
VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
! } else {
! // fill low qword
! VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
! // copy low qword to high qword
! VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
! }
! } else {
! // SSE version
! if (byteMode()) {
! // fill vecTmp with zeroes
asm.pxor(vecTmp, vecTmp);
- // broadcast loaded search value
asm.pshufb(vecDst, vecTmp);
! } else {
! // fill low qword
asm.pshuflw(vecDst, vecDst, 0);
- // copy low qword to high qword
asm.pshufd(vecDst, vecDst, 0);
}
}
}
-
- /**
- * Loads {@code vectorSize} bytes from the position pointed to by {@code arrayPtr} and compares
- * them to the search value stored in {@code vecCmp}. {@code vecArray} is overwritten by this
- * operation. The comparison result is stored in {@code cmpResult}.
- */
- private void emitSingleVectorCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize,
- Register arrayPtr, Register vecCmp, Register vecArray, Register cmpResult) {
- // load array contents into vector
- emitArrayLoad(asm, vectorSize, vecArray, arrayPtr, 0, false);
- // compare all loaded bytes to the search value.
- emitVectorCompare(asm, vectorSize, vecArray, vecCmp);
- // create a 32-bit-mask from the most significant bit of every byte in the comparison
- // result.
- emitMOVMSK(asm, vectorSize, cmpResult, vecArray);
}
/**
* Convert a byte offset stored in {@code bytes} to an array index offset.
*/
! private void emitBytesToArraySlots(AMD64MacroAssembler asm, Register bytes) {
! if (charMode()) {
asm.shrl(bytes, 1);
} else {
! assert byteMode();
}
}
! private void emitBulkCompare(AMD64MacroAssembler asm,
AVXKind.AVXSize vectorSize,
! int bytesPerVector,
Register arrayPtr,
! Register vecCmp,
! Register vecArray1,
! Register vecArray2,
! Register vecArray3,
! Register vecArray4,
! Register cmpResult1,
! Register cmpResult2,
! Register cmpResult3,
! Register cmpResult4,
! Label vectorFound1,
! Label vectorFound2,
! Label vectorFound3,
! Label vectorFound4,
boolean alignedLoad) {
// load array contents into vectors
! emitArrayLoad(asm, vectorSize, vecArray1, arrayPtr, 0, alignedLoad);
! emitArrayLoad(asm, vectorSize, vecArray2, arrayPtr, bytesPerVector, alignedLoad);
! emitArrayLoad(asm, vectorSize, vecArray3, arrayPtr, bytesPerVector * 2, alignedLoad);
! emitArrayLoad(asm, vectorSize, vecArray4, arrayPtr, bytesPerVector * 3, alignedLoad);
// compare all loaded bytes to the search value.
// matching bytes are set to 0xff, non-matching bytes are set to 0x00.
! emitVectorCompare(asm, vectorSize, vecArray1, vecCmp);
! emitVectorCompare(asm, vectorSize, vecArray2, vecCmp);
! emitVectorCompare(asm, vectorSize, vecArray3, vecCmp);
! emitVectorCompare(asm, vectorSize, vecArray4, vecCmp);
// create 32-bit-masks from the most significant bit of every byte in the comparison
// results.
! emitMOVMSK(asm, vectorSize, cmpResult1, vecArray1);
! emitMOVMSK(asm, vectorSize, cmpResult2, vecArray2);
! emitMOVMSK(asm, vectorSize, cmpResult3, vecArray3);
! emitMOVMSK(asm, vectorSize, cmpResult4, vecArray4);
// check if a match was found
! asm.testl(cmpResult1, cmpResult1);
! asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound1);
! asm.testl(cmpResult2, cmpResult2);
! asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound2);
! asm.testl(cmpResult3, cmpResult3);
! asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound3);
! asm.testl(cmpResult4, cmpResult4);
! asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound4);
}
! private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm, int resultOffset, Register result, Register cmpResult, Label entry, Label ret) {
asm.bind(entry);
if (resultOffset > 0) {
// adjust array pointer
asm.addq(result, resultOffset);
}
// find index of first set bit in bit mask
asm.bsfq(cmpResult, cmpResult);
// add offset to array pointer
asm.addq(result, cmpResult);
asm.jmpb(ret);
}
private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
AMD64Address src = new AMD64Address(arrayPtr, offset);
--- 43,577 ----
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
! import static jdk.vm.ci.code.ValueUtil.asRegister;
! import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
! import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
/**
*/
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);
private final JavaKind kind;
private final int vmPageSize;
+ private final int nValues;
+ private final boolean findTwoConsecutive;
+ private final AMD64Kind vectorKind;
@Def({REG}) protected Value resultValue;
! @Alive({REG}) protected Value arrayPtrValue;
! @Use({REG}) protected Value arrayLengthValue;
! @Alive({REG}) protected Value searchValue1;
! @Alive({REG, ILLEGAL}) protected Value searchValue2;
! @Alive({REG, ILLEGAL}) protected Value searchValue3;
! @Alive({REG, ILLEGAL}) protected Value searchValue4;
@Temp({REG}) protected Value arraySlotsRemaining;
@Temp({REG}) protected Value comparisonResult1;
@Temp({REG}) protected Value comparisonResult2;
@Temp({REG}) protected Value comparisonResult3;
@Temp({REG}) protected Value comparisonResult4;
! @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
! @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
! @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
! @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
@Temp({REG, ILLEGAL}) protected Value vectorArray1;
@Temp({REG, ILLEGAL}) protected Value vectorArray2;
@Temp({REG, ILLEGAL}) protected Value vectorArray3;
@Temp({REG, ILLEGAL}) protected Value vectorArray4;
! public AMD64ArrayIndexOfOp(JavaKind kind, boolean findTwoConsecutive, int vmPageSize, int maxVectorSize, LIRGeneratorTool tool, Value result, Value arrayPtr, Value arrayLength,
! Value... searchValues) {
super(TYPE);
this.kind = kind;
+ this.findTwoConsecutive = findTwoConsecutive;
this.vmPageSize = vmPageSize;
! assert 0 < searchValues.length && searchValues.length <= 4;
! assert byteMode(kind) || charMode(kind);
! assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
! nValues = searchValues.length;
! assert !findTwoConsecutive || nValues == 1;
resultValue = result;
! arrayPtrValue = arrayPtr;
! arrayLengthValue = arrayLength;
! searchValue1 = searchValues[0];
! searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
! searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
! searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
! arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
! comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
! comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
! comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
! comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
! vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(kind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
! : byteMode(kind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
! vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
! vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
! vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
! vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
! vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
! vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
! vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
! vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
}
! private static boolean byteMode(JavaKind kind) {
return kind == JavaKind.Byte;
}
! private static boolean charMode(JavaKind kind) {
return kind == JavaKind.Char;
}
@Override
public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
! Register arrayPtr = asRegister(arrayPtrValue);
! Register arrayLength = asRegister(arrayLengthValue);
Register result = asRegister(resultValue);
Register slotsRemaining = asRegister(arraySlotsRemaining);
! Register[] searchValue = {
! nValues > 0 ? asRegister(searchValue1) : null,
! nValues > 1 ? asRegister(searchValue2) : null,
! nValues > 2 ? asRegister(searchValue3) : null,
! nValues > 3 ? asRegister(searchValue4) : null,
! };
! Register[] vecCmp = {
! nValues > 0 ? asRegister(vectorCompareVal1) : null,
! nValues > 1 ? asRegister(vectorCompareVal2) : null,
! nValues > 2 ? asRegister(vectorCompareVal3) : null,
! nValues > 3 ? asRegister(vectorCompareVal4) : null,
! };
! Register[] vecArray = {
! asRegister(vectorArray1),
! asRegister(vectorArray2),
! asRegister(vectorArray3),
! asRegister(vectorArray4),
! };
! Register[] cmpResult = {
! asRegister(comparisonResult1),
! asRegister(comparisonResult2),
! asRegister(comparisonResult3),
! asRegister(comparisonResult4),
! };
Label retFound = new Label();
Label retNotFound = new Label();
Label end = new Label();
! AVXKind.AVXSize vectorSize = AVXKind.getDataSize(vectorKind);
! int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
// load array length
! // important: this must be the first register manipulation, since arrayLengthValue is
// annotated with @Use
asm.movl(slotsRemaining, arrayLength);
! // load array pointer
! asm.movq(result, arrayPtr);
! // move search values to vectors
! for (int i = 0; i < nValues; i++) {
if (asm.supports(CPUFeature.AVX)) {
! VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp[i], searchValue[i]);
} else {
! asm.movdl(vecCmp[i], searchValue[i]);
! }
}
// fill comparison vector with copies of the search value
! for (int i = 0; i < nValues; i++) {
! emitBroadcast(asm, findTwoConsecutive ? (byteMode(kind) ? JavaKind.Char : JavaKind.Int) : kind, vecCmp[i], vecArray[0], vectorSize);
! }
!
! emitArrayIndexOfChars(crb, asm, kind, vectorSize, result, slotsRemaining, searchValue, vecCmp, vecArray, cmpResult, retFound, retNotFound, vmPageSize, nValues, nVectors, findTwoConsecutive);
!
! // return -1 (no match)
! asm.bind(retNotFound);
! asm.movq(result, -1);
! asm.jmpb(end);
!
! asm.bind(retFound);
! // convert array pointer to offset
! asm.subq(result, arrayPtr);
! if (charMode(kind)) {
! asm.shrq(result, 1);
! }
! asm.bind(end);
! }
!
! private static void emitArrayIndexOfChars(CompilationResultBuilder crb, AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize,
! Register arrayPtr,
! Register slotsRemaining,
! Register[] searchValue,
! Register[] vecCmp,
! Register[] vecArray,
! Register[] cmpResult,
! Label retFound,
! Label retNotFound,
! int vmPageSize,
! int nValues,
! int nVectors,
! boolean findTwoCharPrefix) {
! Label bulkVectorLoop = new Label();
! Label singleVectorLoop = new Label();
! Label[] vectorFound = {
! new Label(),
! new Label(),
! new Label(),
! new Label(),
! };
! Label lessThanVectorSizeRemaining = new Label();
! Label lessThanVectorSizeRemainingLoop = new Label();
! Label bulkVectorLoopExit = nVectors == 1 ? lessThanVectorSizeRemaining : singleVectorLoop;
! int bytesPerVector = vectorSize.getBytes();
! int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
! int singleVectorLoopCondition = arraySlotsPerVector;
! int bulkSize = arraySlotsPerVector * nVectors;
! int bulkSizeBytes = bytesPerVector * nVectors;
! int bulkLoopCondition = bulkSize;
! int[] vectorOffsets;
! JavaKind vectorCompareKind = kind;
! if (findTwoCharPrefix) {
! singleVectorLoopCondition++;
! bulkLoopCondition++;
! bulkSize /= 2;
! bulkSizeBytes /= 2;
! vectorOffsets = new int[]{0, kind.getByteCount(), bytesPerVector, bytesPerVector + kind.getByteCount()};
! vectorCompareKind = byteMode(kind) ? JavaKind.Char : JavaKind.Int;
! } else {
! vectorOffsets = new int[]{0, bytesPerVector, bytesPerVector * 2, bytesPerVector * 3};
! }
!
! // load copy of low part of array pointer
! Register tmpArrayPtrLow = cmpResult[0];
! asm.movl(tmpArrayPtrLow, arrayPtr);
// check if bulk vector load is in bounds
! asm.cmpl(slotsRemaining, bulkLoopCondition);
! asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
! // check if array pointer is aligned to bulkSize
! asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);
// do one unaligned bulk comparison pass and adjust alignment afterwards
! emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
// load copy of low part of array pointer
! asm.movl(tmpArrayPtrLow, arrayPtr);
// adjust array pointer
! asm.addq(arrayPtr, bulkSizeBytes);
// adjust number of array slots remaining
asm.subl(slotsRemaining, bulkSize);
! // get offset to bulk size alignment
! asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
! emitBytesToArraySlots(asm, kind, tmpArrayPtrLow);
! // adjust array pointer to bulk size alignment
! asm.andq(arrayPtr, ~(bulkSizeBytes - 1));
// adjust number of array slots remaining
! asm.addl(slotsRemaining, tmpArrayPtrLow);
// check if there are enough array slots remaining for the bulk loop
! asm.cmpl(slotsRemaining, bulkLoopCondition);
! asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
emitAlign(crb, asm);
asm.bind(bulkVectorLoop);
// memory-aligned bulk comparison
! emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, !findTwoCharPrefix);
// adjust number of array slots remaining
asm.subl(slotsRemaining, bulkSize);
// adjust array pointer
! asm.addq(arrayPtr, bulkSizeBytes);
// check if there are enough array slots remaining for the bulk loop
! asm.cmpl(slotsRemaining, bulkLoopCondition);
! asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
// continue loop
! asm.jmp(bulkVectorLoop);
+ if (nVectors > 1) {
emitAlign(crb, asm);
// same loop as bulkVectorLoop, with only one vector
asm.bind(singleVectorLoop);
// check if single vector load is in bounds
! asm.cmpl(slotsRemaining, singleVectorLoopCondition);
asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
// compare
! emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoCharPrefix ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
// adjust number of array slots remaining
asm.subl(slotsRemaining, arraySlotsPerVector);
// adjust array pointer
! asm.addq(arrayPtr, bytesPerVector);
// continue loop
asm.jmpb(singleVectorLoop);
+ }
asm.bind(lessThanVectorSizeRemaining);
// check if any array slots remain
asm.testl(slotsRemaining, slotsRemaining);
asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
// a vector compare will read out of bounds of the input array.
// check if the out-of-bounds read would cross a memory page boundary.
// load copy of low part of array pointer
! asm.movl(tmpArrayPtrLow, arrayPtr);
// check if pointer + vector size would cross the page boundary
! asm.andl(tmpArrayPtrLow, (vmPageSize - 1));
! asm.cmpl(tmpArrayPtrLow, (vmPageSize - (findTwoCharPrefix ? bytesPerVector + kind.getByteCount() : bytesPerVector)));
// if the page boundary would be crossed, do byte/character-wise comparison instead.
asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);
+
+ Label[] overBoundsMatch = {new Label(), new Label()};
// otherwise, do a vector compare that reads beyond array bounds
! emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoCharPrefix ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, overBoundsMatch, false);
! // no match
! asm.jmp(retNotFound);
! if (findTwoCharPrefix) {
! Label overBoundsFinish = new Label();
! asm.bind(overBoundsMatch[1]);
! // get match offset of second result
! asm.bsfq(cmpResult[1], cmpResult[1]);
! asm.addl(cmpResult[1], kind.getByteCount());
! // replace first result with second and continue
! asm.movl(cmpResult[0], cmpResult[1]);
! asm.jmpb(overBoundsFinish);
!
! asm.bind(overBoundsMatch[0]);
! emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, overBoundsFinish);
! } else {
! asm.bind(overBoundsMatch[0]);
// find match offset
! asm.bsfq(cmpResult[0], cmpResult[0]);
}
+
// adjust array pointer for match result
! asm.addq(arrayPtr, cmpResult[0]);
! if (charMode(kind)) {
! // convert byte offset to chars
! asm.shrl(cmpResult[0], 1);
! }
// check if offset of matched value is greater than number of bytes remaining / out of array
// bounds
! if (findTwoCharPrefix) {
! asm.decrementl(slotsRemaining);
! }
! asm.cmpl(cmpResult[0], slotsRemaining);
// match is out of bounds, return no match
asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
+ // adjust number of array slots remaining
+ if (findTwoCharPrefix) {
+ asm.incrementl(slotsRemaining, 1);
+ }
+ asm.subl(slotsRemaining, cmpResult[0]);
// match is in bounds, return offset
! asm.jmp(retFound);
// compare remaining slots in the array one-by-one
asm.bind(lessThanVectorSizeRemainingLoop);
! // check if enough array slots remain
! asm.cmpl(slotsRemaining, findTwoCharPrefix ? 1 : 0);
! asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, retNotFound);
// load char / byte
! if (byteMode(kind)) {
! if (findTwoCharPrefix) {
! asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
! } else {
! asm.movzbl(cmpResult[0], new AMD64Address(arrayPtr));
! }
} else {
! if (findTwoCharPrefix) {
! asm.movl(cmpResult[0], new AMD64Address(arrayPtr));
! } else {
! asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
! }
}
// check for match
! for (int i = 0; i < nValues; i++) {
! asm.cmpl(cmpResult[0], searchValue[i]);
asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
+ }
// adjust number of array slots remaining
asm.decrementl(slotsRemaining);
// adjust array pointer
! asm.addq(arrayPtr, kind.getByteCount());
// continue loop
asm.jmpb(lessThanVectorSizeRemainingLoop);
! for (int i = 1; i < nVectors; i += (findTwoCharPrefix ? 2 : 1)) {
! emitVectorFoundWithOffset(asm, kind, vectorOffsets[i], arrayPtr, cmpResult[i], slotsRemaining, vectorFound[i], retFound);
! }
! if (findTwoCharPrefix) {
! asm.bind(vectorFound[2]);
! asm.addq(arrayPtr, vectorOffsets[2]);
! // adjust number of array slots remaining
! asm.subl(slotsRemaining, charMode(kind) ? vectorOffsets[2] / 2 : vectorOffsets[2]);
! asm.movl(cmpResult[0], cmpResult[2]);
! asm.movl(cmpResult[1], cmpResult[3]);
! asm.bind(vectorFound[0]);
! emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, new Label());
! } else {
! asm.bind(vectorFound[0]);
// find index of first set bit in bit mask
! asm.bsfq(cmpResult[0], cmpResult[0]);
! }
// add offset to array pointer
! asm.addq(arrayPtr, cmpResult[0]);
! if (charMode(kind)) {
! // convert byte offset to chars
! asm.shrl(cmpResult[0], 1);
! }
! // adjust number of array slots remaining
! asm.subl(slotsRemaining, cmpResult[0]);
! asm.jmpb(retFound);
! }
! // Selects the smaller of the two candidate match offsets produced by a two-char-prefix
! // search and leaves it in cmpResult[0]; cmpResult[1] is clobbered.
! private static void emitFindTwoCharPrefixMinResult(AMD64MacroAssembler asm, JavaKind kind, Register[] cmpResult, Label done) {
! // find match offset
! asm.bsfq(cmpResult[0], cmpResult[0]);
! // check if second result is also a match
! asm.testl(cmpResult[1], cmpResult[1]);
! asm.jcc(AMD64Assembler.ConditionFlag.Zero, done);
! // get match offset of second result
! asm.bsfq(cmpResult[1], cmpResult[1]);
! // NOTE(review): the second mask appears to correspond to data shifted by one array
! // element, hence the getByteCount() bias — confirm against the vector-load offsets.
! asm.addl(cmpResult[1], kind.getByteCount());
! // check if first result is less than second
! asm.cmpl(cmpResult[0], cmpResult[1]);
! asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, done);
! // first result is greater than second, replace it with the second result
! asm.movl(cmpResult[0], cmpResult[1]);
! asm.bind(done);
! }
/**
 * Pads the instruction stream so the next emitted instruction starts on a two-word boundary.
 */
private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
    final int alignmentBytes = crb.target.wordSize * 2;
    asm.align(alignmentBytes);
}
/**
! * Fills {@code vecDst} with copies of its lowest byte, word or dword. {@code vecTmp} is
! * clobbered on the AVX(1) and SSSE3 byte paths. Throws {@link UnsupportedOperationException}
! * for kinds other than Byte, Short, Char and Int.
*/
! private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
! switch (kind) {
! case Byte:
if (asm.supports(CPUFeature.AVX2)) {
VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
} else if (asm.supports(CPUFeature.AVX)) {
// no VPBROADCASTB before AVX2: shuffling with an all-zero index vector replicates byte 0
VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
! } else if (asm.supports(CPUFeature.SSSE3)) {
asm.pxor(vecTmp, vecTmp);
asm.pshufb(vecDst, vecTmp);
! } else { // SSE2
! // widen byte 0 to a full dword via two interleaves, then splat that dword
! asm.punpcklbw(vecDst, vecDst);
! asm.punpcklbw(vecDst, vecDst);
! asm.pshufd(vecDst, vecDst, 0);
! }
! break;
! case Short:
! case Char:
! if (asm.supports(CPUFeature.AVX2)) {
! VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
! } else if (asm.supports(CPUFeature.AVX)) {
! // splat word 0 across the low qword, then splat that dword lane-wide
! VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
! VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
! } else { // SSE
asm.pshuflw(vecDst, vecDst, 0);
asm.pshufd(vecDst, vecDst, 0);
}
+ break;
+ case Int:
+ if (asm.supports(CPUFeature.AVX2)) {
+ VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
+ } else if (asm.supports(CPUFeature.AVX)) {
+ VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
+ } else { // SSE
+ asm.pshufd(vecDst, vecDst, 0);
}
+ break;
+ default:
+ throw new UnsupportedOperationException();
}
}
/**
* Convert a byte offset stored in {@code bytes} to an array index offset.
*/
! private static void emitBytesToArraySlots(AMD64MacroAssembler asm, JavaKind kind, Register bytes) {
! if (charMode(kind)) {
// 2 bytes per char: halve the byte offset
asm.shrl(bytes, 1);
} else {
! assert byteMode(kind);
// byte mode: byte offset already equals the slot offset, nothing to emit
}
}
/**
 * Loads {@code nVectors} vectors from {@code arrayPtr} at the given {@code vectorOffsets},
 * compares each of them against all {@code nValues} search values in {@code vecCmp}, ORs the
 * resulting comparison bit masks across values, and branches to {@code vectorFound[i]} if
 * vector {@code i} contains a match for any search value. {@code vecArray} and
 * {@code cmpResult} are clobbered.
 */
! private static void emitVectorCompare(AMD64MacroAssembler asm,
! JavaKind kind,
AVXKind.AVXSize vectorSize,
! int nValues,
! int nVectors,
! int[] vectorOffsets,
Register arrayPtr,
! Register[] vecCmp,
! Register[] vecArray,
! Register[] cmpResult,
! Label[] vectorFound,
boolean alignedLoad) {
// load array contents into vectors
! for (int i = 0; i < nValues; i++) {
! for (int j = 0; j < nVectors; j++) {
! emitArrayLoad(asm, vectorSize, vecArray[(i * nVectors) + j], arrayPtr, vectorOffsets[j], alignedLoad);
! }
! }
// compare all loaded bytes to the search value.
// matching bytes are set to 0xff, non-matching bytes are set to 0x00.
! for (int i = 0; i < nValues; i++) {
! for (int j = 0; j < nVectors; j++) {
! emitVectorCompareInst(asm, kind, vectorSize, vecArray[(i * nVectors) + j], vecCmp[i]);
! }
! }
// create 32-bit-masks from the most significant bit of every byte in the comparison
// results.
! for (int i = 0; i < nValues * nVectors; i++) {
! emitMOVMSK(asm, vectorSize, cmpResult[i], vecArray[i]);
! }
! // join results of comparisons against multiple values (pairwise OR-reduction over values,
! // log2(nValues) rounds, result accumulates into the first nVectors mask registers)
! for (int stride = 1; stride < nValues; stride *= 2) {
! for (int i = 0; i < nVectors; i++) {
! for (int j = 0; j + stride < nValues; j += stride * 2) {
! asm.orl(cmpResult[i + (j * nVectors)], cmpResult[i + ((j + stride) * nVectors)]);
! }
! }
! }
// check if a match was found
! for (int i = 0; i < nVectors; i++) {
! asm.testl(cmpResult[i], cmpResult[i]);
! asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound[i]);
! }
}
/**
 * Branch target (bound at {@code entry}) for "match found in the vector loaded at
 * {@code resultOffset}": advances the array pointer by the vector's offset plus the index of
 * the first set bit in {@code cmpResult}, updates {@code slotsRemaining} to match, and jumps
 * to {@code ret}. {@code cmpResult} is clobbered.
 */
! private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm,
! JavaKind kind,
! int resultOffset,
! Register result,
! Register cmpResult,
! Register slotsRemaining,
! Label entry,
! Label ret) {
asm.bind(entry);
if (resultOffset > 0) {
// adjust array pointer
asm.addq(result, resultOffset);
+ // adjust number of array slots remaining (resultOffset is in bytes; chars are 2 bytes)
+ asm.subl(slotsRemaining, charMode(kind) ? resultOffset / 2 : resultOffset);
}
// find index of first set bit in bit mask
asm.bsfq(cmpResult, cmpResult);
// add offset to array pointer (cmpResult still holds the byte offset here)
asm.addq(result, cmpResult);
+ if (charMode(kind)) {
+ // convert byte offset to chars
+ asm.shrl(cmpResult, 1);
+ }
+ // adjust number of array slots remaining
+ asm.subl(slotsRemaining, cmpResult);
asm.jmpb(ret);
}
private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
AMD64Address src = new AMD64Address(arrayPtr, offset);
*** 444,469 ****
// SSE
asm.movdqu(vecDst, src);
}
}
! // NOTE(review): pre-patch (old hunk) form of the vector compare shown by this webrev;
! // the new revision replaces it with the static, kind-parameterized emitVectorCompareInst.
! private void emitVectorCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
! // compare all loaded bytes to the search value.
! // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
if (asm.supports(CPUFeature.AVX)) {
- if (byteMode()) {
VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
! } else {
! VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
! }
! } else {
! // SSE
! if (byteMode()) {
asm.pcmpeqb(vecArray, vecCmp);
! } else {
asm.pcmpeqw(vecArray, vecCmp);
}
}
}
private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
if (asm.supports(CPUFeature.AVX)) {
--- 582,621 ----
// SSE
asm.movdqu(vecDst, src);
}
}
! /**
! * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching values
! * are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero. The result is
! * written back into {@code vecArray}.
! *
! * @throws UnsupportedOperationException for kinds other than Byte, Short, Char and Int
! */
! private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
! switch (kind) {
! case Byte:
if (asm.supports(CPUFeature.AVX)) {
VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
! } else { // SSE
asm.pcmpeqb(vecArray, vecCmp);
! }
! break;
! case Short:
! case Char:
! if (asm.supports(CPUFeature.AVX)) {
! VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
! } else { // SSE
asm.pcmpeqw(vecArray, vecCmp);
}
+ break;
+ case Int:
+ if (asm.supports(CPUFeature.AVX)) {
+ VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
+ } else { // SSE
+ asm.pcmpeqd(vecArray, vecCmp);
+ }
+ break;
+ default:
+ throw new UnsupportedOperationException();
}
}
private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
if (asm.supports(CPUFeature.AVX)) {
< prev index next >