
src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.amd64/src/org/graalvm/compiler/lir/amd64/AMD64ArrayIndexOfOp.java

rev 52509 : [mq]: graal2

*** 22,35 ****
   */
  package org.graalvm.compiler.lir.amd64;
  
! import static jdk.vm.ci.code.ValueUtil.asRegister;
! import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
! import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
! 
  import org.graalvm.compiler.asm.Label;
  import org.graalvm.compiler.asm.amd64.AMD64Address;
  import org.graalvm.compiler.asm.amd64.AMD64Assembler;
  import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
  import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
--- 22,37 ----
   */
  package org.graalvm.compiler.lir.amd64;
  
! import jdk.vm.ci.amd64.AMD64;
! import jdk.vm.ci.amd64.AMD64.CPUFeature;
! import jdk.vm.ci.amd64.AMD64Kind;
! import jdk.vm.ci.code.Register;
! import jdk.vm.ci.meta.JavaKind;
! import jdk.vm.ci.meta.Value;
  import org.graalvm.compiler.asm.Label;
  import org.graalvm.compiler.asm.amd64.AMD64Address;
  import org.graalvm.compiler.asm.amd64.AMD64Assembler;
  import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
  import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
*** 41,439 ****
  import org.graalvm.compiler.lir.LIRInstructionClass;
  import org.graalvm.compiler.lir.Opcode;
  import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
  import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
  
! import jdk.vm.ci.amd64.AMD64;
! import jdk.vm.ci.amd64.AMD64.CPUFeature;
! import jdk.vm.ci.amd64.AMD64Kind;
! import jdk.vm.ci.code.Register;
! import jdk.vm.ci.meta.JavaKind;
! import jdk.vm.ci.meta.Value;
  
  /**
   */
  @Opcode("AMD64_ARRAY_INDEX_OF")
  public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
      public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);
  
      private final JavaKind kind;
      private final int vmPageSize;
  
      @Def({REG}) protected Value resultValue;
!     @Alive({REG}) protected Value charArrayPtrValue;
!     @Use({REG}) protected Value charArrayLengthValue;
!     @Alive({REG}) protected Value searchCharValue;
      @Temp({REG}) protected Value arraySlotsRemaining;
      @Temp({REG}) protected Value comparisonResult1;
      @Temp({REG}) protected Value comparisonResult2;
      @Temp({REG}) protected Value comparisonResult3;
      @Temp({REG}) protected Value comparisonResult4;
!     @Temp({REG, ILLEGAL}) protected Value vectorCompareVal;
      @Temp({REG, ILLEGAL}) protected Value vectorArray1;
      @Temp({REG, ILLEGAL}) protected Value vectorArray2;
      @Temp({REG, ILLEGAL}) protected Value vectorArray3;
      @Temp({REG, ILLEGAL}) protected Value vectorArray4;
  
!     public AMD64ArrayIndexOfOp(
!                     JavaKind kind,
!                     int vmPageSize, LIRGeneratorTool tool,
!                     Value result,
!                     Value arrayPtr,
!                     Value arrayLength,
!                     Value searchChar) {
          super(TYPE);
          this.kind = kind;
          this.vmPageSize = vmPageSize;
!         assert byteMode() || charMode();
!         assert supports(tool, CPUFeature.SSSE3) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
          resultValue = result;
!         charArrayPtrValue = arrayPtr;
!         charArrayLengthValue = arrayLength;
!         searchCharValue = searchChar;
! 
!         this.arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
!         this.comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
!         this.comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
!         this.comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
!         this.comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
!         AMD64Kind vectorKind = byteMode() ? supportsAVX2(tool) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE : supportsAVX2(tool) ? AMD64Kind.V256_WORD : AMD64Kind.V128_WORD;
!         this.vectorCompareVal = tool.newVariable(LIRKind.value(vectorKind));
!         this.vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
!         this.vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
!         this.vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
!         this.vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
      }
  
!     private boolean byteMode() {
          return kind == JavaKind.Byte;
      }
  
!     private boolean charMode() {
          return kind == JavaKind.Char;
      }
  
      @Override
      public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
!         Register arrayPtr = asRegister(charArrayPtrValue);
!         Register arrayLength = asRegister(charArrayLengthValue);
!         Register searchValue = asRegister(searchCharValue);
          Register result = asRegister(resultValue);
-         Register vecCmp = asRegister(vectorCompareVal);
-         Register vecArray1 = asRegister(vectorArray1);
-         Register vecArray2 = asRegister(vectorArray2);
-         Register vecArray3 = asRegister(vectorArray3);
-         Register vecArray4 = asRegister(vectorArray4);
          Register slotsRemaining = asRegister(arraySlotsRemaining);
!         Register cmpResult1 = asRegister(comparisonResult1);
!         Register cmpResult2 = asRegister(comparisonResult2);
!         Register cmpResult3 = asRegister(comparisonResult3);
!         Register cmpResult4 = asRegister(comparisonResult4);
! 
!         Label bulkVectorLoop = new Label();
!         Label singleVectorLoop = new Label();
!         Label vectorFound1 = new Label();
!         Label vectorFound2 = new Label();
!         Label vectorFound3 = new Label();
!         Label vectorFound4 = new Label();
!         Label lessThanVectorSizeRemaining = new Label();
!         Label lessThanVectorSizeRemainingLoop = new Label();
          Label retFound = new Label();
          Label retNotFound = new Label();
          Label end = new Label();
  
!         AVXKind.AVXSize vectorSize = asm.supports(CPUFeature.AVX2) ? AVXKind.AVXSize.YMM : AVXKind.AVXSize.XMM;
!         int nVectors = 4;
!         int bytesPerVector = vectorSize.getBytes();
!         int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
!         int bulkSize = arraySlotsPerVector * nVectors;
!         int bulkSizeBytes = bytesPerVector * nVectors;
!         assert bulkSizeBytes >= 64;
  
          // load array length
!         // important: this must be the first register manipulation, since charArrayLengthValue is
          // annotated with @Use
          asm.movl(slotsRemaining, arrayLength);
!         // move search value to vector
          if (asm.supports(CPUFeature.AVX)) {
!             VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp, searchValue);
          } else {
!             asm.movdl(vecCmp, searchValue);
          }
-         // load array pointer
-         asm.movq(result, arrayPtr);
-         // load copy of low part of array pointer
-         asm.movl(cmpResult1, arrayPtr);
          // fill comparison vector with copies of the search value
!         emitBroadcast(asm, vecCmp, vecArray1, vectorSize);
          // check if bulk vector load is in bounds
!         asm.cmpl(slotsRemaining, bulkSize);
!         asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
!         // check if array pointer is 64-byte aligned
!         asm.andl(cmpResult1, 63);
          asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);
          // do one unaligned bulk comparison pass and adjust alignment afterwards
!         emitBulkCompare(asm, vectorSize, bytesPerVector, result, vecCmp, vecArray1, vecArray2, vecArray3, vecArray4, cmpResult1, cmpResult2, cmpResult3, cmpResult4,
!                         vectorFound1, vectorFound2, vectorFound3, vectorFound4, false);
          // load copy of low part of array pointer
!         asm.movl(cmpResult1, arrayPtr);
          // adjust array pointer
!         asm.addq(result, bulkSizeBytes);
          // adjust number of array slots remaining
          asm.subl(slotsRemaining, bulkSize);
!         // get offset to 64-byte alignment
!         asm.andl(cmpResult1, 63);
!         emitBytesToArraySlots(asm, cmpResult1);
!         // adjust array pointer to 64-byte alignment
!         asm.andq(result, ~63);
          // adjust number of array slots remaining
!         asm.addl(slotsRemaining, cmpResult1);
          // check if there are enough array slots remaining for the bulk loop
!         asm.cmpl(slotsRemaining, bulkSize);
!         asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
  
          emitAlign(crb, asm);
          asm.bind(bulkVectorLoop);
          // memory-aligned bulk comparison
!         emitBulkCompare(asm, vectorSize, bytesPerVector, result, vecCmp, vecArray1, vecArray2, vecArray3, vecArray4, cmpResult1, cmpResult2, cmpResult3, cmpResult4,
!                         vectorFound1, vectorFound2, vectorFound3, vectorFound4, true);
          // adjust number of array slots remaining
          asm.subl(slotsRemaining, bulkSize);
          // adjust array pointer
!         asm.addq(result, bulkSizeBytes);
          // check if there are enough array slots remaining for the bulk loop
!         asm.cmpl(slotsRemaining, bulkSize);
!         asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
          // continue loop
!         asm.jmpb(bulkVectorLoop);
  
          emitAlign(crb, asm);
          // same loop as bulkVectorLoop, with only one vector
          asm.bind(singleVectorLoop);
          // check if single vector load is in bounds
!         asm.cmpl(slotsRemaining, arraySlotsPerVector);
          asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
          // compare
!         emitSingleVectorCompare(asm, vectorSize, result, vecCmp, vecArray1, cmpResult1);
! 
!         // check if a match was found
!         asm.testl(cmpResult1, cmpResult1);
!         asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound1);
          // adjust number of array slots remaining
          asm.subl(slotsRemaining, arraySlotsPerVector);
          // adjust array pointer
!         asm.addq(result, bytesPerVector);
          // continue loop
          asm.jmpb(singleVectorLoop);
  
          asm.bind(lessThanVectorSizeRemaining);
          // check if any array slots remain
          asm.testl(slotsRemaining, slotsRemaining);
          asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
          // a vector compare will read out of bounds of the input array.
          // check if the out-of-bounds read would cross a memory page boundary.
          // load copy of low part of array pointer
!         asm.movl(cmpResult1, result);
          // check if pointer + vector size would cross the page boundary
!         asm.andl(cmpResult1, (vmPageSize - 1));
!         asm.cmpl(cmpResult1, (vmPageSize - bytesPerVector));
          // if the page boundary would be crossed, do byte/character-wise comparison instead.
          asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);
          // otherwise, do a vector compare that reads beyond array bounds
!         emitSingleVectorCompare(asm, vectorSize, result, vecCmp, vecArray1, cmpResult1);
!         // check if a match was found
!         asm.testl(cmpResult1, cmpResult1);
!         asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
          // find match offset
!         asm.bsfq(cmpResult1, cmpResult1);
!         if (charMode()) {
!             // convert number of remaining characters to bytes
!             asm.shll(slotsRemaining, 1);
          }
          // adjust array pointer for match result
!         asm.addq(result, cmpResult1);
          // check if offset of matched value is greater than number of bytes remaining / out of array
          // bounds
!         asm.cmpl(cmpResult1, slotsRemaining);
          // match is out of bounds, return no match
          asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
          // match is in bounds, return offset
!         asm.jmpb(retFound);
  
          // compare remaining slots in the array one-by-one
          asm.bind(lessThanVectorSizeRemainingLoop);
!         // check if any array slots remain
!         asm.testl(slotsRemaining, slotsRemaining);
!         asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
          // load char / byte
!         AMD64Assembler.OperandSize operandSize = byteMode() ? AMD64Assembler.OperandSize.BYTE : AMD64Assembler.OperandSize.WORD;
!         if (byteMode()) {
!             AMD64Assembler.AMD64RMOp.MOVB.emit(asm, operandSize, cmpResult1, new AMD64Address(result));
          } else {
!             AMD64Assembler.AMD64RMOp.MOV.emit(asm, operandSize, cmpResult1, new AMD64Address(result));
          }
          // check for match
!         AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(operandSize).emit(asm, operandSize, cmpResult1, searchValue);
          asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
          // adjust number of array slots remaining
          asm.decrementl(slotsRemaining);
          // adjust array pointer
!         asm.addq(result, kind.getByteCount());
          // continue loop
          asm.jmpb(lessThanVectorSizeRemainingLoop);
  
!         // return -1 (no match)
!         asm.bind(retNotFound);
!         asm.movl(result, -1);
!         asm.jmpb(end);
! 
!         emitVectorFoundWithOffset(asm, bytesPerVector, result, cmpResult2, vectorFound2, retFound);
!         emitVectorFoundWithOffset(asm, bytesPerVector * 2, result, cmpResult3, vectorFound3, retFound);
!         emitVectorFoundWithOffset(asm, bytesPerVector * 3, result, cmpResult4, vectorFound4, retFound);
!         asm.bind(vectorFound1);
          // find index of first set bit in bit mask
!         asm.bsfq(cmpResult1, cmpResult1);
          // add offset to array pointer
!         asm.addq(result, cmpResult1);
!         asm.bind(retFound);
!         // convert array pointer to offset
!         asm.subq(result, arrayPtr);
!         emitBytesToArraySlots(asm, result);
!         asm.bind(end);
      }
  
      private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
          asm.align(crb.target.wordSize * 2);
      }
  
      /**
!      * Fills {@code vecDst} with copies of its lowest byte or word.
       */
!     private void emitBroadcast(AMD64MacroAssembler asm, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
          if (asm.supports(CPUFeature.AVX2)) {
-             if (byteMode()) {
                  VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
-             } else {
-                 VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
-             }
          } else if (asm.supports(CPUFeature.AVX)) {
-             if (byteMode()) {
-                 // fill vecTmp with zeroes
                  VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
-                 // broadcast loaded search value
                  VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
!             } else {
!                 // fill low qword
!                 VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
!                 // copy low qword to high qword
!                 VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
!             }
!         } else {
!             // SSE version
!             if (byteMode()) {
!                 // fill vecTmp with zeroes
                  asm.pxor(vecTmp, vecTmp);
-                 // broadcast loaded search value
                  asm.pshufb(vecDst, vecTmp);
!             } else {
!                 // fill low qword
                  asm.pshuflw(vecDst, vecDst, 0);
-                 // copy low qword to high qword
                  asm.pshufd(vecDst, vecDst, 0);
              }
          }
      }
- 
-     /**
-      * Loads {@code vectorSize} bytes from the position pointed to by {@code arrayPtr} and compares
-      * them to the search value stored in {@code vecCmp}. {@code vecArray} is overwritten by this
-      * operation. The comparison result is stored in {@code cmpResult}.
-      */
-     private void emitSingleVectorCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize,
-                     Register arrayPtr, Register vecCmp, Register vecArray, Register cmpResult) {
-         // load array contents into vector
-         emitArrayLoad(asm, vectorSize, vecArray, arrayPtr, 0, false);
-         // compare all loaded bytes to the search value.
-         emitVectorCompare(asm, vectorSize, vecArray, vecCmp);
-         // create a 32-bit-mask from the most significant bit of every byte in the comparison
-         // result.
-         emitMOVMSK(asm, vectorSize, cmpResult, vecArray);
      }
  
      /**
       * Convert a byte offset stored in {@code bytes} to an array index offset.
       */
!     private void emitBytesToArraySlots(AMD64MacroAssembler asm, Register bytes) {
!         if (charMode()) {
              asm.shrl(bytes, 1);
          } else {
!             assert byteMode();
          }
      }
  
!     private void emitBulkCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize,
!                     int bytesPerVector, Register arrayPtr,
!                     Register vecCmp,
!                     Register vecArray1,
!                     Register vecArray2,
!                     Register vecArray3,
!                     Register vecArray4,
!                     Register cmpResult1,
!                     Register cmpResult2,
!                     Register cmpResult3,
!                     Register cmpResult4,
!                     Label vectorFound1,
!                     Label vectorFound2,
!                     Label vectorFound3,
!                     Label vectorFound4, boolean alignedLoad) {
          // load array contents into vectors
!         emitArrayLoad(asm, vectorSize, vecArray1, arrayPtr, 0, alignedLoad);
!         emitArrayLoad(asm, vectorSize, vecArray2, arrayPtr, bytesPerVector, alignedLoad);
!         emitArrayLoad(asm, vectorSize, vecArray3, arrayPtr, bytesPerVector * 2, alignedLoad);
!         emitArrayLoad(asm, vectorSize, vecArray4, arrayPtr, bytesPerVector * 3, alignedLoad);
          // compare all loaded bytes to the search value.
          // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
!         emitVectorCompare(asm, vectorSize, vecArray1, vecCmp);
!         emitVectorCompare(asm, vectorSize, vecArray2, vecCmp);
!         emitVectorCompare(asm, vectorSize, vecArray3, vecCmp);
!         emitVectorCompare(asm, vectorSize, vecArray4, vecCmp);
          // create 32-bit-masks from the most significant bit of every byte in the comparison
          // results.
!         emitMOVMSK(asm, vectorSize, cmpResult1, vecArray1);
!         emitMOVMSK(asm, vectorSize, cmpResult2, vecArray2);
!         emitMOVMSK(asm, vectorSize, cmpResult3, vecArray3);
!         emitMOVMSK(asm, vectorSize, cmpResult4, vecArray4);
          // check if a match was found
!         asm.testl(cmpResult1, cmpResult1);
!         asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound1);
!         asm.testl(cmpResult2, cmpResult2);
!         asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound2);
!         asm.testl(cmpResult3, cmpResult3);
!         asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound3);
!         asm.testl(cmpResult4, cmpResult4);
!         asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound4);
      }
  
!     private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm, int resultOffset, Register result, Register cmpResult, Label entry, Label ret) {
          asm.bind(entry);
          if (resultOffset > 0) {
              // adjust array pointer
              asm.addq(result, resultOffset);
          }
          // find index of first set bit in bit mask
          asm.bsfq(cmpResult, cmpResult);
          // add offset to array pointer
          asm.addq(result, cmpResult);
          asm.jmpb(ret);
      }
  
      private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
          AMD64Address src = new AMD64Address(arrayPtr, offset);
--- 43,577 ----
  import org.graalvm.compiler.lir.LIRInstructionClass;
  import org.graalvm.compiler.lir.Opcode;
  import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
  import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
  
! import static jdk.vm.ci.code.ValueUtil.asRegister;
! import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
! import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
  
  /**
   */
  @Opcode("AMD64_ARRAY_INDEX_OF")
  public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
      public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);
  
      private final JavaKind kind;
      private final int vmPageSize;
+     private final int nValues;
+     private final boolean findTwoConsecutive;
+     private final AMD64Kind vectorKind;
  
      @Def({REG}) protected Value resultValue;
!     @Alive({REG}) protected Value arrayPtrValue;
!     @Use({REG}) protected Value arrayLengthValue;
!     @Alive({REG}) protected Value searchValue1;
!     @Alive({REG, ILLEGAL}) protected Value searchValue2;
!     @Alive({REG, ILLEGAL}) protected Value searchValue3;
!     @Alive({REG, ILLEGAL}) protected Value searchValue4;
      @Temp({REG}) protected Value arraySlotsRemaining;
      @Temp({REG}) protected Value comparisonResult1;
      @Temp({REG}) protected Value comparisonResult2;
      @Temp({REG}) protected Value comparisonResult3;
      @Temp({REG}) protected Value comparisonResult4;
!     @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
!     @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
!     @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
!     @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
      @Temp({REG, ILLEGAL}) protected Value vectorArray1;
      @Temp({REG, ILLEGAL}) protected Value vectorArray2;
      @Temp({REG, ILLEGAL}) protected Value vectorArray3;
      @Temp({REG, ILLEGAL}) protected Value vectorArray4;
  
!     public AMD64ArrayIndexOfOp(JavaKind kind, boolean findTwoConsecutive, int vmPageSize, int maxVectorSize, LIRGeneratorTool tool, Value result, Value arrayPtr, Value arrayLength,
!                     Value... searchValues) {
          super(TYPE);
          this.kind = kind;
+         this.findTwoConsecutive = findTwoConsecutive;
          this.vmPageSize = vmPageSize;
!         assert 0 < searchValues.length && searchValues.length <= 4;
!         assert byteMode(kind) || charMode(kind);
!         assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
!         nValues = searchValues.length;
!         assert !findTwoConsecutive || nValues == 1;
          resultValue = result;
!         arrayPtrValue = arrayPtr;
!         arrayLengthValue = arrayLength;
!         searchValue1 = searchValues[0];
!         searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
!         searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
!         searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
!         arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
!         comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
!         comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
!         comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
!         comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
!         vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(kind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
!                         : byteMode(kind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
!         vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
!         vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
!         vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
!         vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
!         vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
!         vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
!         vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
!         vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
      }
  
!     private static boolean byteMode(JavaKind kind) {
          return kind == JavaKind.Byte;
      }
  
!     private static boolean charMode(JavaKind kind) {
          return kind == JavaKind.Char;
      }
  
      @Override
      public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
!         Register arrayPtr = asRegister(arrayPtrValue);
!         Register arrayLength = asRegister(arrayLengthValue);
          Register result = asRegister(resultValue);
          Register slotsRemaining = asRegister(arraySlotsRemaining);
!         Register[] searchValue = {
!                         nValues > 0 ? asRegister(searchValue1) : null,
!                         nValues > 1 ? asRegister(searchValue2) : null,
!                         nValues > 2 ? asRegister(searchValue3) : null,
!                         nValues > 3 ? asRegister(searchValue4) : null,
!         };
!         Register[] vecCmp = {
!                         nValues > 0 ? asRegister(vectorCompareVal1) : null,
!                         nValues > 1 ? asRegister(vectorCompareVal2) : null,
!                         nValues > 2 ? asRegister(vectorCompareVal3) : null,
!                         nValues > 3 ? asRegister(vectorCompareVal4) : null,
!         };
!         Register[] vecArray = {
!                         asRegister(vectorArray1),
!                         asRegister(vectorArray2),
!                         asRegister(vectorArray3),
!                         asRegister(vectorArray4),
!         };
!         Register[] cmpResult = {
!                         asRegister(comparisonResult1),
!                         asRegister(comparisonResult2),
!                         asRegister(comparisonResult3),
!                         asRegister(comparisonResult4),
!         };
  
          Label retFound = new Label();
          Label retNotFound = new Label();
          Label end = new Label();
  
!         AVXKind.AVXSize vectorSize = AVXKind.getDataSize(vectorKind);
!         int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
  
          // load array length
!         // important: this must be the first register manipulation, since arrayLengthValue is
          // annotated with @Use
          asm.movl(slotsRemaining, arrayLength);
!         // load array pointer
!         asm.movq(result, arrayPtr);
!         // move search values to vectors
!         for (int i = 0; i < nValues; i++) {
              if (asm.supports(CPUFeature.AVX)) {
!                 VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp[i], searchValue[i]);
              } else {
!                 asm.movdl(vecCmp[i], searchValue[i]);
!             }
          }
          // fill comparison vector with copies of the search value
!         for (int i = 0; i < nValues; i++) {
!             emitBroadcast(asm, findTwoConsecutive ? (byteMode(kind) ? JavaKind.Char : JavaKind.Int) : kind, vecCmp[i], vecArray[0], vectorSize);
!         }
! 
!         emitArrayIndexOfChars(crb, asm, kind, vectorSize, result, slotsRemaining, searchValue, vecCmp, vecArray, cmpResult, retFound, retNotFound, vmPageSize, nValues, nVectors, findTwoConsecutive);
! 
!         // return -1 (no match)
!         asm.bind(retNotFound);
!         asm.movq(result, -1);
!         asm.jmpb(end);
! 
!         asm.bind(retFound);
!         // convert array pointer to offset
!         asm.subq(result, arrayPtr);
!         if (charMode(kind)) {
!             asm.shrq(result, 1);
!         }
!         asm.bind(end);
!     }
! 
!     private static void emitArrayIndexOfChars(CompilationResultBuilder crb, AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize,
!                     Register arrayPtr,
!                     Register slotsRemaining,
!                     Register[] searchValue,
!                     Register[] vecCmp,
!                     Register[] vecArray,
!                     Register[] cmpResult,
!                     Label retFound,
!                     Label retNotFound,
!                     int vmPageSize,
!                     int nValues,
!                     int nVectors,
!                     boolean findTwoCharPrefix) {
!         Label bulkVectorLoop = new Label();
!         Label singleVectorLoop = new Label();
!         Label[] vectorFound = {
!                         new Label(),
!                         new Label(),
!                         new Label(),
!                         new Label(),
!         };
!         Label lessThanVectorSizeRemaining = new Label();
!         Label lessThanVectorSizeRemainingLoop = new Label();
!         Label bulkVectorLoopExit = nVectors == 1 ? lessThanVectorSizeRemaining : singleVectorLoop;
!         int bytesPerVector = vectorSize.getBytes();
!         int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
!         int singleVectorLoopCondition = arraySlotsPerVector;
!         int bulkSize = arraySlotsPerVector * nVectors;
!         int bulkSizeBytes = bytesPerVector * nVectors;
!         int bulkLoopCondition = bulkSize;
!         int[] vectorOffsets;
!         JavaKind vectorCompareKind = kind;
!         if (findTwoCharPrefix) {
!             singleVectorLoopCondition++;
!             bulkLoopCondition++;
!             bulkSize /= 2;
!             bulkSizeBytes /= 2;
!             vectorOffsets = new int[]{0, kind.getByteCount(), bytesPerVector, bytesPerVector + kind.getByteCount()};
!             vectorCompareKind = byteMode(kind) ? JavaKind.Char : JavaKind.Int;
!         } else {
!             vectorOffsets = new int[]{0, bytesPerVector, bytesPerVector * 2, bytesPerVector * 3};
!         }
! 
!         // load copy of low part of array pointer
!         Register tmpArrayPtrLow = cmpResult[0];
!         asm.movl(tmpArrayPtrLow, arrayPtr);
          // check if bulk vector load is in bounds
!         asm.cmpl(slotsRemaining, bulkLoopCondition);
!         asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
!         // check if array pointer is aligned to bulkSize
!         asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
          asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);
          // do one unaligned bulk comparison pass and adjust alignment afterwards
!         emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
          // load copy of low part of array pointer
!         asm.movl(tmpArrayPtrLow, arrayPtr);
          // adjust array pointer
!         asm.addq(arrayPtr, bulkSizeBytes);
          // adjust number of array slots remaining
          asm.subl(slotsRemaining, bulkSize);
!         // get offset to bulk size alignment
!         asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
!         emitBytesToArraySlots(asm, kind, tmpArrayPtrLow);
!         // adjust array pointer to bulk size alignment
!         asm.andq(arrayPtr, ~(bulkSizeBytes - 1));
          // adjust number of array slots remaining
!         asm.addl(slotsRemaining, tmpArrayPtrLow);
          // check if there are enough array slots remaining for the bulk loop
!         asm.cmpl(slotsRemaining, bulkLoopCondition);
!         asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
  
          emitAlign(crb, asm);
          asm.bind(bulkVectorLoop);
          // memory-aligned bulk comparison
!         emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, !findTwoCharPrefix);
          // adjust number of array slots remaining
          asm.subl(slotsRemaining, bulkSize);
          // adjust array pointer
!         asm.addq(arrayPtr, bulkSizeBytes);
          // check if there are enough array slots remaining for the bulk loop
!         asm.cmpl(slotsRemaining, bulkLoopCondition);
!         asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
          // continue loop
!         asm.jmp(bulkVectorLoop);
  
+         if (nVectors > 1) {
          emitAlign(crb, asm);
          // same loop as bulkVectorLoop, with only one vector
          asm.bind(singleVectorLoop);
          // check if single vector load is in bounds
!         asm.cmpl(slotsRemaining, singleVectorLoopCondition);
          asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
          // compare
!         emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoCharPrefix ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
          // adjust number of array slots remaining
          asm.subl(slotsRemaining, arraySlotsPerVector);
          // adjust array pointer
!         asm.addq(arrayPtr, bytesPerVector);
          // continue loop
          asm.jmpb(singleVectorLoop);
+         }
  
          asm.bind(lessThanVectorSizeRemaining);
          // check if any array slots remain
          asm.testl(slotsRemaining, slotsRemaining);
          asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
          // a vector compare will read out of bounds of the input array.
          // check if the out-of-bounds read would cross a memory page boundary.
          // load copy of low part of array pointer
!         asm.movl(tmpArrayPtrLow, arrayPtr);
          // check if pointer + vector size would cross the page boundary
!         asm.andl(tmpArrayPtrLow, (vmPageSize - 1));
!         asm.cmpl(tmpArrayPtrLow, (vmPageSize - (findTwoCharPrefix ? bytesPerVector + kind.getByteCount() : bytesPerVector)));
          // if the page boundary would be crossed, do byte/character-wise comparison instead.
          asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);
+ 
+         Label[] overBoundsMatch = {new Label(), new Label()};
          // otherwise, do a vector compare that reads beyond array bounds
!         emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoCharPrefix ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, overBoundsMatch, false);
!         // no match
!         asm.jmp(retNotFound);
!         if (findTwoCharPrefix) {
!             Label overBoundsFinish = new Label();
!             asm.bind(overBoundsMatch[1]);
!             // get match offset of second result
!             asm.bsfq(cmpResult[1], cmpResult[1]);
!             asm.addl(cmpResult[1], kind.getByteCount());
!             // replace first result with second and continue
!             asm.movl(cmpResult[0], cmpResult[1]);
!             asm.jmpb(overBoundsFinish);
! 
!             asm.bind(overBoundsMatch[0]);
!             emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, overBoundsFinish);
!         } else {
!             asm.bind(overBoundsMatch[0]);
              // find match offset
!             asm.bsfq(cmpResult[0], cmpResult[0]);
          }
+ 
          // adjust array pointer for match result
!         asm.addq(arrayPtr, cmpResult[0]);
!         if (charMode(kind)) {
!             // convert byte offset to chars
!             asm.shrl(cmpResult[0], 1);
!         }
          // check if offset of matched value is greater than number of bytes remaining / out of array
          // bounds
!         if (findTwoCharPrefix) {
!             asm.decrementl(slotsRemaining);
!         }
!         asm.cmpl(cmpResult[0], slotsRemaining);
          // match is out of bounds, return no match
          asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
+         // adjust number of array slots remaining
+         if (findTwoCharPrefix) {
+             asm.incrementl(slotsRemaining, 1);
+         }
+         asm.subl(slotsRemaining, cmpResult[0]);
          // match is in bounds, return offset
!         asm.jmp(retFound);
  
          // compare remaining slots in the array one-by-one
          asm.bind(lessThanVectorSizeRemainingLoop);
!         // check if enough array slots remain
!         asm.cmpl(slotsRemaining, findTwoCharPrefix ? 1 : 0);
!         asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, retNotFound);
          // load char / byte
!         if (byteMode(kind)) {
!             if (findTwoCharPrefix) {
!                 asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
!             } else {
!                 asm.movzbl(cmpResult[0], new AMD64Address(arrayPtr));
!             }
          } else {
!             if (findTwoCharPrefix) {
!                 asm.movl(cmpResult[0], new AMD64Address(arrayPtr));
!             } else {
!                 asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
!             }
          }
          // check for match
!         for (int i = 0; i < nValues; i++) {
!             asm.cmpl(cmpResult[0], searchValue[i]);
              asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
+         }
          // adjust number of array slots remaining
          asm.decrementl(slotsRemaining);
          // adjust array pointer
!         asm.addq(arrayPtr, kind.getByteCount());
          // continue loop
          asm.jmpb(lessThanVectorSizeRemainingLoop);
  
!         for (int i = 1; i < nVectors; i += (findTwoCharPrefix ? 2 : 1)) {
!             emitVectorFoundWithOffset(asm, kind, vectorOffsets[i], arrayPtr, cmpResult[i], slotsRemaining, vectorFound[i], retFound);
!         }
!         if (findTwoCharPrefix) {
!             asm.bind(vectorFound[2]);
!             asm.addq(arrayPtr, vectorOffsets[2]);
!             // adjust number of array slots remaining
!             asm.subl(slotsRemaining, charMode(kind) ? vectorOffsets[2] / 2 : vectorOffsets[2]);
!             asm.movl(cmpResult[0], cmpResult[2]);
!             asm.movl(cmpResult[1], cmpResult[3]);
!             asm.bind(vectorFound[0]);
!             emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, new Label());
!         } else {
!             asm.bind(vectorFound[0]);
              // find index of first set bit in bit mask
!             asm.bsfq(cmpResult[0], cmpResult[0]);
!         }
          // add offset to array pointer
!         asm.addq(arrayPtr, cmpResult[0]);
!         if (charMode(kind)) {
!             // convert byte offset to chars
!             asm.shrl(cmpResult[0], 1);
!         }
!         // adjust number of array slots remaining
!         asm.subl(slotsRemaining, cmpResult[0]);
!         asm.jmpb(retFound);
!     }
  
!     private static void emitFindTwoCharPrefixMinResult(AMD64MacroAssembler asm, JavaKind kind, Register[] cmpResult, Label done) {
!         // find match offset
!         asm.bsfq(cmpResult[0], cmpResult[0]);
!         // check if second result is also a match
!         asm.testl(cmpResult[1], cmpResult[1]);
!         asm.jcc(AMD64Assembler.ConditionFlag.Zero, done);
!         // get match offset of second result
!         asm.bsfq(cmpResult[1], cmpResult[1]);
!         asm.addl(cmpResult[1], kind.getByteCount());
!         // check if first result is less than second
!         asm.cmpl(cmpResult[0], cmpResult[1]);
!         asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, done);
!         // first result is greater than second, replace it with the second result
!         asm.movl(cmpResult[0], cmpResult[1]);
!         asm.bind(done);
      }
  
      private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
          asm.align(crb.target.wordSize * 2);
      }
  
      /**
!      * Fills {@code vecDst} with copies of its lowest byte, word or dword.
       */
!     private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
!         switch (kind) {
!             case Byte:
                  if (asm.supports(CPUFeature.AVX2)) {
                      VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
                  } else if (asm.supports(CPUFeature.AVX)) {
                      VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
                      VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
!                 } else if (asm.supports(CPUFeature.SSSE3)) {
                      asm.pxor(vecTmp, vecTmp);
                      asm.pshufb(vecDst, vecTmp);
!                 } else { // SSE2
!                     asm.punpcklbw(vecDst, vecDst);
!                     asm.punpcklbw(vecDst, vecDst);
!                     asm.pshufd(vecDst, vecDst, 0);
!                 }
!                 break;
!             case Short:
!             case Char:
!                 if (asm.supports(CPUFeature.AVX2)) {
!                     VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
!                 } else if (asm.supports(CPUFeature.AVX)) {
!                     VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
!                     VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
!                 } else { // SSE
                      asm.pshuflw(vecDst, vecDst, 0);
                      asm.pshufd(vecDst, vecDst, 0);
                  }
+                 break;
+             case Int:
+                 if (asm.supports(CPUFeature.AVX2)) {
+                     VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
+                 } else if (asm.supports(CPUFeature.AVX)) {
+                     VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
+                 } else { // SSE
+                     asm.pshufd(vecDst, vecDst, 0);
                  }
+                 break;
+             default:
+                 throw new UnsupportedOperationException();
          }
      }
  
      /**
       * Convert a byte offset stored in {@code bytes} to an array index offset.
       */
!     private static void emitBytesToArraySlots(AMD64MacroAssembler asm, JavaKind kind, Register bytes) {
!         if (charMode(kind)) {
              asm.shrl(bytes, 1);
          } else {
!             assert byteMode(kind);
          }
      }
  
!     private static void emitVectorCompare(AMD64MacroAssembler asm,
!                     JavaKind kind, AVXKind.AVXSize vectorSize,
!                     int nValues,
!                     int nVectors,
!                     int[] vectorOffsets,
                      Register arrayPtr,
!                     Register[] vecCmp,
!                     Register[] vecArray,
!                     Register[] cmpResult,
!                     Label[] vectorFound,
                      boolean alignedLoad) {
          // load array contents into vectors
!         for (int i = 0; i < nValues; i++) {
!             for (int j = 0; j < nVectors; j++) {
!                 emitArrayLoad(asm, vectorSize, vecArray[(i * nVectors) + j], arrayPtr, vectorOffsets[j], alignedLoad);
!             }
!         }
          // compare all loaded bytes to the search value.
          // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
!         for (int i = 0; i < nValues; i++) {
!             for (int j = 0; j < nVectors; j++) {
!                 emitVectorCompareInst(asm, kind, vectorSize, vecArray[(i * nVectors) + j], vecCmp[i]);
!             }
!         }
          // create 32-bit-masks from the most significant bit of every byte in the comparison
          // results.
!         for (int i = 0; i < nValues * nVectors; i++) {
!             emitMOVMSK(asm, vectorSize, cmpResult[i], vecArray[i]);
!         }
!         // join results of comparisons against multiple values
!         for (int stride = 1; stride < nValues; stride *= 2) {
!             for (int i = 0; i < nVectors; i++) {
!                 for (int j = 0; j + stride < nValues; j += stride * 2) {
!                     asm.orl(cmpResult[i + (j * nVectors)], cmpResult[i + ((j + stride) * nVectors)]);
!                 }
!             }
!         }
          // check if a match was found
!         for (int i = 0; i < nVectors; i++) {
!             asm.testl(cmpResult[i], cmpResult[i]);
!             asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound[i]);
!         }
      }
!     private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm,
!                     JavaKind kind,
!                     int resultOffset,
!                     Register result,
!                     Register cmpResult,
!                     Register slotsRemaining,
!                     Label entry,
!                     Label ret) {
          asm.bind(entry);
          if (resultOffset > 0) {
              // adjust array pointer
              asm.addq(result, resultOffset);
+             // adjust number of array slots remaining
+             asm.subl(slotsRemaining, charMode(kind) ? resultOffset / 2 : resultOffset);
          }
          // find index of first set bit in bit mask
          asm.bsfq(cmpResult, cmpResult);
          // add offset to array pointer
          asm.addq(result, cmpResult);
+         if (charMode(kind)) {
+             // convert byte offset to chars
+             asm.shrl(cmpResult, 1);
+         }
+         // adjust number of array slots remaining
+         asm.subl(slotsRemaining, cmpResult);
          asm.jmpb(ret);
      }
  
      private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
          AMD64Address src = new AMD64Address(arrayPtr, offset);
*** 444,469 ****
              // SSE
              asm.movdqu(vecDst, src);
          }
      }
  
!     private void emitVectorCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
!         // compare all loaded bytes to the search value.
!         // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
          if (asm.supports(CPUFeature.AVX)) {
-             if (byteMode()) {
                  VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
!             } else {
!                 VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
!             }
!         } else {
!             // SSE
!             if (byteMode()) {
                  asm.pcmpeqb(vecArray, vecCmp);
!             } else {
                  asm.pcmpeqw(vecArray, vecCmp);
              }
          }
      }
  
      private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
          if (asm.supports(CPUFeature.AVX)) {
--- 582,621 ----
              // SSE
              asm.movdqu(vecDst, src);
          }
      }
  
!     /**
!      * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching values
!      * are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero.
!      */
!     private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
!         switch (kind) {
!             case Byte:
                  if (asm.supports(CPUFeature.AVX)) {
                      VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
!                 } else { // SSE
                      asm.pcmpeqb(vecArray, vecCmp);
!                 }
!                 break;
!             case Short:
!             case Char:
!                 if (asm.supports(CPUFeature.AVX)) {
!                     VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
!                 } else { // SSE
                      asm.pcmpeqw(vecArray, vecCmp);
                  }
+                 break;
+             case Int:
+                 if (asm.supports(CPUFeature.AVX)) {
+                     VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
+                 } else { // SSE
+                     asm.pcmpeqd(vecArray, vecCmp);
+                 }
+                 break;
+             default:
+                 throw new UnsupportedOperationException();
          }
      }
  
      private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
          if (asm.supports(CPUFeature.AVX)) {
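A rough usage sketch (not part of this webrev), based only on the new constructor signature shown above
(kind, findTwoConsecutive, vmPageSize, maxVectorSize, tool, result, arrayPtr, arrayLength, searchValues...).
The helper name, the choice of result kind and the surrounding LIRGenerator plumbing are illustrative
assumptions, not code from this change.

    import org.graalvm.compiler.core.common.LIRKind;
    import org.graalvm.compiler.lir.Variable;
    import org.graalvm.compiler.lir.amd64.AMD64ArrayIndexOfOp;
    import org.graalvm.compiler.lir.gen.LIRGenerator;

    import jdk.vm.ci.amd64.AMD64Kind;
    import jdk.vm.ci.meta.JavaKind;
    import jdk.vm.ci.meta.Value;

    class AMD64ArrayIndexOfUsageSketch {

        /** Emits an indexOf over a char array for up to four candidate characters. */
        static Variable emitIndexOfAnyChar(LIRGenerator gen, int vmPageSize, int maxVectorSize,
                        Value arrayPtr, Value arrayLength, Value... searchChars) {
            // result kind is an assumption for this sketch; the op also uses the result
            // register as its running array pointer before converting it to an index
            Variable result = gen.newVariable(LIRKind.value(AMD64Kind.QWORD));
            // findTwoConsecutive == false: search for any one of up to four values
            gen.append(new AMD64ArrayIndexOfOp(JavaKind.Char, false, vmPageSize, maxVectorSize,
                            gen, result, arrayPtr, arrayLength, searchChars));
            return result;
        }
    }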