/*
 * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static jdk.vm.ci.code.ValueUtil.isRegister;
import static jdk.vm.ci.code.ValueUtil.isStackSlot;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.CONST;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.STACK;

import java.util.Objects;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64RMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.lir.ConstantValue;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.JavaConstant;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;

/**
 * LIR instruction that emits AMD64 code searching an array for the first element, at or after a
 * given start index, that equals one of up to four search values. The element kind that is
 * compared ({@code valueKind}) must be {@link JavaKind#Byte} or {@link JavaKind#Char}.
 *
 * When {@code findTwoConsecutive} is set, exactly one search value is allowed and the comparison
 * is performed with a kind twice as wide as {@code valueKind} (see {@link #getComparisonKind()}),
 * i.e. two adjacent array elements are compared at once.
 *
 * The emitted code has two paths: an element-by-element loop for search ranges shorter than one
 * vector, and a vectorized path (SSE/AVX/AVX2, selected from the target's CPU features) for
 * everything else. In both cases the result register holds the index of the match, or {@code -1}
 * if nothing was found.
 *
 * NOTE(review): for {@code findTwoConsecutive}, {@code searchValue1} presumably carries both
 * elements packed into one value of the wider comparison kind; the packing is done by the caller
 * and is not visible in this class - confirm against the LIR generator.
 */
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);

    /** Kind of the elements that are compared; either {@code Byte} or {@code Char}. */
    private final JavaKind valueKind;
    /** Number of search values actually supplied (1 to 4). */
    private final int nValues;
    /** If set, pairs of adjacent elements are compared using the next wider kind. */
    private final boolean findTwoConsecutive;
    /** Vector register kind used by the vectorized path (128 or 256 bit, byte or word lanes). */
    private final AMD64Kind vectorKind;
    /** Offset of the first array element, taken from the array's kind. */
    private final int arrayBaseOffset;
    /** Addressing scale for one element, taken from {@link #valueKind}. */
    private final Scale arrayIndexScale;

    // Result register; also used as the running index while searching.
    @Def({REG}) protected Value resultValue;
    @Alive({REG}) protected Value arrayPtrValue;
    @Alive({REG}) protected Value arrayLengthValue;
    // Only read by the very first emitted instruction, see emitCode.
    @Use({REG}) protected Value fromIndexValue;
    // Search values 2-4 are ILLEGAL when fewer values were supplied.
    @Alive({REG, STACK, CONST}) protected Value searchValue1;
    @Alive({REG, STACK, CONST, ILLEGAL}) protected Value searchValue2;
    @Alive({REG, STACK, CONST, ILLEGAL}) protected Value searchValue3;
    @Alive({REG, STACK, CONST, ILLEGAL}) protected Value searchValue4;
    // General purpose temporaries receiving the vector comparison masks.
    // The second one is only allocated when findTwoConsecutive is set.
    @Temp({REG}) protected Value comparisonResult1;
    @Temp({REG, ILLEGAL}) protected Value comparisonResult2;
    // Vector temporaries holding the broadcast search values (one per search value).
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
    // Vector temporaries holding loaded array contents.
    @Temp({REG, ILLEGAL}) protected Value vectorArray1;
    @Temp({REG, ILLEGAL}) protected Value vectorArray2;
    @Temp({REG, ILLEGAL}) protected Value vectorArray3;
    @Temp({REG, ILLEGAL}) protected Value vectorArray4;

    /**
     * Creates the instruction and allocates all temporaries it needs.
     *
     * @param arrayKind kind used to look up the array base offset
     * @param valueKind kind of the compared elements ({@code Byte} or {@code Char}); also used to
     *            look up the array index scale
     * @param findTwoConsecutive compare pairs of adjacent elements; requires exactly one search
     *            value
     * @param maxVectorSize upper bound for the vector size in bytes; 256-bit vectors are only used
     *            if this is negative or at least 32 and the target supports AVX2
     * @param tool generator tool used to query the target and to create temporaries
     * @param result value receiving the resulting index
     * @param arrayPtr pointer to the array
     * @param arrayLength exclusive upper bound of the search, in elements
     * @param fromIndex index at which the search starts
     * @param searchValues one to four values to search for
     */
    public AMD64ArrayIndexOfOp(JavaKind arrayKind, JavaKind valueKind, boolean findTwoConsecutive, int maxVectorSize, LIRGeneratorTool tool,
                    Value result, Value arrayPtr, Value arrayLength, Value fromIndex, Value... searchValues) {
        super(TYPE);
        this.valueKind = valueKind;
        // the base offset depends on the array's kind, the scale on the kind being compared
        this.arrayBaseOffset = tool.getProviders().getMetaAccess().getArrayBaseOffset(arrayKind);
        this.arrayIndexScale = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(valueKind)));
        this.findTwoConsecutive = findTwoConsecutive;
        assert 0 < searchValues.length && searchValues.length <= 4;
        assert byteMode(valueKind) || charMode(valueKind);
        assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
        nValues = searchValues.length;
        assert !findTwoConsecutive || nValues == 1;
        resultValue = result;
        arrayPtrValue = arrayPtr;
        arrayLengthValue = arrayLength;
        fromIndexValue = fromIndex;
        // unused search value slots are marked ILLEGAL
        searchValue1 = searchValues[0];
        searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
        searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
        searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
        comparisonResult1 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        comparisonResult2 = findTwoConsecutive ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        // 256-bit vectors need AVX2 and must be permitted by maxVectorSize, otherwise use 128 bit
        vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(valueKind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
                        : byteMode(valueKind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
        // one broadcast vector per search value
        vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        // all four array vectors are always allocated
        vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
    }

    /** Returns {@code true} if {@code kind} is {@link JavaKind#Byte}. */
    private static boolean byteMode(JavaKind kind) {
        return kind == JavaKind.Byte;
    }

    /** Returns {@code true} if {@code kind} is {@link JavaKind#Char}. */
    private static boolean charMode(JavaKind kind) {
        return kind == JavaKind.Char;
    }

    /**
     * Returns the kind used for comparisons: {@link #valueKind} itself, or the next wider kind
     * ({@code Byte -> Char}, otherwise {@code Int}) when searching for two consecutive elements.
     */
    private JavaKind getComparisonKind() {
        return findTwoConsecutive ? (byteMode(valueKind) ? JavaKind.Char : JavaKind.Int) : valueKind;
    }

    /** Returns the AVX size corresponding to {@link #vectorKind}. */
    private AVXKind.AVXSize getVectorSize() {
        return AVXKind.getDataSize(vectorKind);
    }

    /**
     * Emits the search code. While searching, the result register ({@code index}) holds the element
     * index just past the current load; vector loads therefore use negative displacements relative
     * to {@code index} (see {@link #getVectorOffset(int)}), and the "found" handlers subtract the
     * same delta again to obtain the index of the first element of the matching vector.
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        // number of vectors compared per bulk loop iteration; fewer search values leave more
        // array vector registers available per iteration
        int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
        Register arrayPtr = asRegister(arrayPtrValue);
        Register arrayLength = asRegister(arrayLengthValue);
        Register fromIndex = asRegister(fromIndexValue);
        Register index = asRegister(resultValue);
        Value[] searchValue = {
                        nValues > 0 ? searchValue1 : null,
                        nValues > 1 ? searchValue2 : null,
                        nValues > 2 ? searchValue3 : null,
                        nValues > 3 ? searchValue4 : null,
        };
        Register[] vecCmp = {
                        nValues > 0 ? asRegister(vectorCompareVal1) : null,
                        nValues > 1 ? asRegister(vectorCompareVal2) : null,
                        nValues > 2 ? asRegister(vectorCompareVal3) : null,
                        nValues > 3 ? asRegister(vectorCompareVal4) : null,
        };
        Register[] vecArray = {
                        asRegister(vectorArray1),
                        asRegister(vectorArray2),
                        asRegister(vectorArray3),
                        asRegister(vectorArray4),
        };
        Register[] cmpResult = {
                        asRegister(comparisonResult1),
                        findTwoConsecutive ? asRegister(comparisonResult2) : null,
        };
        Label ret = new Label();

        Label bulkVectorLoop = new Label();
        Label singleVectorLoop = new Label();
        // one jump target per vector of a bulk iteration
        Label[] vectorFound = {
                        new Label(),
                        new Label(),
                        new Label(),
                        new Label(),
        };
        Label runVectorized = new Label();
        Label elementWiseLoop = new Label();
        Label elementWiseFound = new Label();
        Label elementWiseNotFound = new Label();
        Label skipBulkVectorLoop = new Label();
        // number of array elements per vector
        int vectorSize = getVectorSize().getBytes() / valueKind.getByteCount();
        // number of array elements covered by one bulk loop iteration
        int bulkSize = vectorSize * nVectors;
        JavaKind vectorCompareKind = valueKind;
        if (findTwoConsecutive) {
            // every position is loaded twice (shifted by one element), so only half of the
            // vectors advance the search
            bulkSize /= 2;
            vectorCompareKind = byteMode(valueKind) ? JavaKind.Char : JavaKind.Int;
        }
        // index = fromIndex + vectorSize (+1 if findTwoConsecutive)
        // important: this must be the first register manipulation, since fromIndex is
        // annotated with @Use
        asm.leaq(index, new AMD64Address(fromIndex, vectorSize + (findTwoConsecutive ? 1 : 0)));

        // check if the first vector load is in bounds
        asm.cmpq(index, arrayLength);
        asm.jccb(AMD64Assembler.ConditionFlag.LessEqual, runVectorized);

        // search range is smaller than vector size, do element-wise comparison

        // index = fromIndex (+ 1 if findTwoConsecutive)
        asm.subq(index, vectorSize);
        // check if enough array slots remain
        asm.cmpq(index, arrayLength);
        asm.jccb(AMD64Assembler.ConditionFlag.GreaterEqual, elementWiseNotFound);
        // compare one-by-one
        asm.bind(elementWiseLoop);
        // check for match
        OperandSize cmpSize = getOpSize(getComparisonKind());
        // address = findTwoConsecutive ? array[index - 1] : array[index]
        AMD64Address arrayAddr = new AMD64Address(arrayPtr, index, arrayIndexScale, arrayBaseOffset - (findTwoConsecutive ? valueKind.getByteCount() : 0));
        boolean valuesOnStack = searchValuesOnStack(searchValue);
        if (valuesOnStack) {
            // at least one search value lives in a stack slot: load the array element into a
            // register first and compare that register against every search value
            (cmpSize == OperandSize.BYTE ? AMD64RMOp.MOVB : AMD64RMOp.MOV).emit(asm, cmpSize, cmpResult[0], arrayAddr);
            for (int i = 0; i < nValues; i++) {
                if (isConstant(searchValue[i])) {
                    int imm = asConstant(searchValue[i]).asInt();
                    AMD64Assembler.AMD64BinaryArithmetic.CMP.getMIOpcode(cmpSize, NumUtil.isByte(imm)).emit(asm, cmpSize, cmpResult[0], imm);
                } else if (isStackSlot(searchValue[i])) {
                    AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(cmpSize).emit(asm, cmpSize, cmpResult[0], (AMD64Address) crb.asAddress(searchValue[i]));
                } else {
                    AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(cmpSize).emit(asm, cmpSize, cmpResult[0], asRegister(searchValue[i]));
                }
                asm.jccb(AMD64Assembler.ConditionFlag.Equal, elementWiseFound);
            }
        } else {
            // all search values are constants or registers: compare directly against memory
            for (int i = 0; i < nValues; i++) {
                if (isConstant(searchValue[i])) {
                    int imm = asConstant(searchValue[i]).asInt();
                    AMD64Assembler.AMD64BinaryArithmetic.CMP.getMIOpcode(cmpSize, NumUtil.isByte(imm)).emit(asm, cmpSize, arrayAddr, imm);
                } else {
                    AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(cmpSize).emit(asm, cmpSize, asRegister(searchValue[i]), arrayAddr);
                }
                asm.jccb(AMD64Assembler.ConditionFlag.Equal, elementWiseFound);
            }
        }
        // adjust index
        asm.incrementq(index, 1);
        // continue loop
        asm.cmpq(index, arrayLength);
        asm.jccb(AMD64Assembler.ConditionFlag.Less, elementWiseLoop);

        asm.bind(elementWiseNotFound);
        // index = 0; the decrement below turns this into the "not found" result -1
        asm.xorq(index, index);

        if (findTwoConsecutive) {
            // in this mode index points at the second element of the pair, so a match also
            // needs the decrement to yield the index of the first element
            asm.bind(elementWiseFound);
            asm.decrementq(index, 1);
        } else {
            // a match already has the correct index and must skip the decrement
            asm.decrementq(index, 1);
            asm.bind(elementWiseFound);
        }
        asm.jmp(ret);

        // vectorized implementation
        asm.bind(runVectorized);

        // move search values to vectors
        for (int i = 0; i < nValues; i++) {
            // fill comparison vector with copies of the search value
            // (cmpResult[0] and vecArray[0] are only used as scratch registers here)
            broadcastSearchValue(crb, asm, vecCmp[i], searchValue[i], cmpResult[0], vecArray[0]);
        }

        // do one unaligned vector comparison pass and adjust alignment afterwards
        emitVectorCompare(asm, vectorCompareKind, findTwoConsecutive ? 2 : 1, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, false, false);

        // adjust index to vector size alignment
        // cmpResult[0] = address of the first array element, converted to element units
        asm.leaq(cmpResult[0], new AMD64Address(arrayPtr, arrayBaseOffset));
        if (charMode(valueKind)) {
            asm.shrq(cmpResult[0], 1);
        }
        asm.addq(index, cmpResult[0]);
        // adjust to next lower multiple of vector size
        asm.andq(index, ~(vectorSize - 1));
        asm.subq(index, cmpResult[0]);
        // add bulk size
        asm.addq(index, bulkSize);

        // check if there are enough array slots remaining for the bulk loop
        asm.cmpq(index, arrayLength);
        asm.jccb(AMD64Assembler.ConditionFlag.Greater, skipBulkVectorLoop);

        emitAlign(crb, asm);
        asm.bind(bulkVectorLoop);
        // memory-aligned bulk comparison
        // (aligned loads are not requested for findTwoConsecutive, where every second load is
        // shifted by one element)
        emitVectorCompare(asm, vectorCompareKind, nVectors, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, false, !findTwoConsecutive);
        // adjust index
        asm.addq(index, bulkSize);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpq(index, arrayLength);
        asm.jccb(AMD64Assembler.ConditionFlag.LessEqual, bulkVectorLoop);

        asm.bind(skipBulkVectorLoop);
        if ((findTwoConsecutive && nVectors == 2) || nVectors == 1) {
            // the bulk loop already advances by a single vector, so at most one vector remains
            // do last load from end of array
            asm.movq(index, arrayLength);
            // compare
            emitVectorCompare(asm, vectorCompareKind, findTwoConsecutive ? 2 : 1, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, true, false);
        } else {
            // remove bulk offset
            asm.subq(index, bulkSize);
            emitAlign(crb, asm);
            // same loop as bulkVectorLoop, with only one vector
            asm.bind(singleVectorLoop);
            // add vector size
            asm.addq(index, vectorSize);
            // check if vector load is in bounds
            asm.cmpq(index, arrayLength);
            // if load would be over bounds, set the load to the end of the array
            asm.cmovq(AMD64Assembler.ConditionFlag.Greater, index, arrayLength);
            // compare
            emitVectorCompare(asm, vectorCompareKind, findTwoConsecutive ? 2 : 1, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, true, false);
            // check if there are enough array slots remaining for the loop
            asm.cmpq(index, arrayLength);
            asm.jccb(AMD64Assembler.ConditionFlag.Less, singleVectorLoop);
        }

        // nothing found: result is -1
        asm.movl(index, -1);
        asm.jmpb(ret);

        if (findTwoConsecutive) {
            Label vectorFound2Done = new Label();

            // vectorFound[0] and vectorFound[2] behave like the single-char case
            asm.bind(vectorFound[2]);
            // subtract static offset: index now refers to the first element of the matching vector
            asm.subq(index, getResultIndexDelta(2));
            asm.jmpb(vectorFound2Done);

            asm.bind(vectorFound[0]);
            // subtract static offset: index now refers to the first element of the matching vector
            asm.subq(index, getResultIndexDelta(0));
            asm.bind(vectorFound2Done);
            // find offset
            asm.bsfq(cmpResult[0], cmpResult[0]);
            if (charMode(valueKind)) {
                // convert byte offset to chars
                asm.shrl(cmpResult[0], 1);
            }
            // add offset to index
            asm.addq(index, cmpResult[0]);
            asm.jmpb(ret);

            Label minResult = new Label();
            Label minResultDone = new Label();

            // in vectorFound[1] and vectorFound[3], we have to check the results 0 and 2 as well
            if (nVectors > 2) {
                asm.bind(vectorFound[3]);
                // subtract static offset
                asm.subq(index, getResultIndexDelta(3));
                asm.jmpb(minResult);
            }

            asm.bind(vectorFound[1]);
            // subtract static offset
            asm.subq(index, getResultIndexDelta(1));

            asm.bind(minResult);
            // find offset 0
            asm.bsfq(cmpResult[1], cmpResult[1]);
            // check if second result is also a match
            asm.testq(cmpResult[0], cmpResult[0]);
            asm.jccb(AMD64Assembler.ConditionFlag.Zero, minResultDone);
            // find offset 1
            asm.bsfq(cmpResult[0], cmpResult[0]);
            // the second vector was loaded one element later than the first one
            asm.addq(cmpResult[0], valueKind.getByteCount());
            // if first result is greater than second, replace it with the second result
            asm.cmpq(cmpResult[1], cmpResult[0]);
            asm.cmovq(AMD64Assembler.ConditionFlag.Greater, cmpResult[1], cmpResult[0]);
            asm.bind(minResultDone);
            if (charMode(valueKind)) {
                // convert byte offset to chars
                asm.shrl(cmpResult[1], 1);
            }
            // add offset to index
            asm.addq(index, cmpResult[1]);
        } else {
            Label end = new Label();
            for (int i = 0; i < nVectors; i++) {
                asm.bind(vectorFound[i]);
                // subtract static offset: index now refers to the first element of the matching
                // vector
                asm.subq(index, getResultIndexDelta(i));
                if (i < nVectors - 1) {
                    // the last handler falls through to the shared tail
                    asm.jmpb(end);
                }
            }
            asm.bind(end);
            // find offset
            asm.bsfq(cmpResult[0], cmpResult[0]);
            if (charMode(valueKind)) {
                // convert byte offset to chars
                asm.shrl(cmpResult[0], 1);
            }
            // add offset to index
            asm.addq(index, cmpResult[0]);
        }
        asm.bind(ret);
    }

    /**
     * Returns {@code true} if at least one of the first {@link #nValues} entries of
     * {@code searchValue} is a stack slot.
     */
    private boolean searchValuesOnStack(Value[] searchValue) {
        for (int i = 0; i < nValues; i++) {
            if (isStackSlot(searchValue[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the distance, in elements, between the running index and the first element of the
     * vector with number {@code i}. Without {@code findTwoConsecutive} this is {@code (i + 1)}
     * vector lengths. With {@code findTwoConsecutive}, vectors {@code 2k} and {@code 2k + 1} cover
     * the same position, with the odd one starting one element earlier.
     */
    private int getResultIndexDelta(int i) {
        return (((findTwoConsecutive ? i / 2 : i) + 1) * (getVectorSize().getBytes() / valueKind.getByteCount())) + (findTwoConsecutive ? (i & 1) : 0);
    }

    /**
     * Returns the address displacement, in bytes and relative to the scaled running index, at
     * which the vector with number {@code i} is loaded.
     */
    private int getVectorOffset(int i) {
        return arrayBaseOffset - getResultIndexDelta(i) * valueKind.getByteCount();
    }

    /**
     * Moves the search value {@code srcVal} into the vector register {@code dst} and replicates it
     * across the whole vector.
     *
     * @param tmpReg general purpose scratch register, used if {@code srcVal} is not in a register
     * @param tmpVector vector scratch register passed on to
     *            {@link #emitBroadcast(AMD64MacroAssembler, JavaKind, Register, Register, AVXKind.AVXSize)}
     */
    private void broadcastSearchValue(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register dst, Value srcVal, Register tmpReg, Register tmpVector) {
        Register src = asRegOrTmpReg(crb, asm, srcVal, tmpReg);
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, dst, src);
        } else {
            asm.movdl(dst, src);
        }
        emitBroadcast(asm, getComparisonKind(), dst, tmpVector, getVectorSize());
    }

    /**
     * Returns {@code true} if {@code val} is a {@link ConstantValue}; asserts that such a value
     * wraps a Java constant.
     */
    private static boolean isConstant(Value val) {
        assert !(val instanceof ConstantValue) || ((ConstantValue) val).isJavaConstant();
        return val instanceof ConstantValue;
    }

    /** Returns the Java constant wrapped by {@code val}, which must be a {@link ConstantValue}. */
    private static JavaConstant asConstant(Value val) {
        return ((ConstantValue) val).getJavaConstant();
    }

    /**
     * Returns a register holding {@code val}: the value's own register if it has one, otherwise
     * {@code tmpReg} after emitting a {@code movl} that loads the stack slot or the int constant
     * into it.
     */
    private static Register asRegOrTmpReg(CompilationResultBuilder crb, AMD64MacroAssembler asm, Value val, Register tmpReg) {
        if (isRegister(val)) {
            return asRegister(val);
        } else if (isStackSlot(val)) {
            asm.movl(tmpReg, (AMD64Address) crb.asAddress(val));
            return tmpReg;
        } else {
            assert isConstant(val);
            asm.movl(tmpReg, asConstant(val).asInt());
            return tmpReg;
        }
    }

    /** Requests code alignment of twice the target's word size; used in front of loop heads. */
    private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        asm.align(crb.target.wordSize * 2);
    }

    /**
     * Fills {@code vecDst} with copies of its lowest byte, word or dword.
     *
     * @param kind element kind selecting byte, word or dword replication
     * @param vecTmp scratch vector register; only written by the AVX (non-AVX2) and SSSE3 byte
     *            variants
     * @throws UnsupportedOperationException if {@code kind} is not Byte, Short, Char or Int
     */
    private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    // shuffle with an all-zero mask
                    VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
                    VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
                } else if (asm.supports(CPUFeature.SSSE3)) {
                    // shuffle with an all-zero mask
                    asm.pxor(vecTmp, vecTmp);
                    asm.pshufb(vecDst, vecTmp);
                } else { // SSE2
                    asm.punpcklbw(vecDst, vecDst);
                    asm.punpcklbw(vecDst, vecDst);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshuflw(vecDst, vecDst, 0);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }

    /**
     * Loads {@code nVectors} vectors of array content ending at {@code index}, compares them with
     * the broadcast search values and jumps to the matching {@code vectorFound} label if any
     * element matched. On such a jump the comparison mask is left in {@code cmpResult[0]} (and, for
     * {@code findTwoConsecutive}, {@code cmpResult[1]}).
     *
     * @param kind lane kind used by the vector compare instruction
     * @param nVectors number of vectors to process; must be even if {@code findTwoConsecutive}
     * @param shortJmp emit short ({@code jccb}) instead of regular jumps to the found labels
     * @param alignedLoad request aligned vector loads (only honored on the AVX path, see
     *            {@link #emitArrayLoad})
     */
    private void emitVectorCompare(AMD64MacroAssembler asm,
                    JavaKind kind,
                    int nVectors,
                    Register arrayPtr,
                    Register index,
                    Register[] vecCmp,
                    Register[] vecArray,
                    Register[] cmpResult,
                    Label[] vectorFound,
                    boolean shortJmp,
                    boolean alignedLoad) {
        // load array contents into vectors
        // (with several search values, the same array vector is loaded once per search value,
        // since the compare instruction overwrites its array operand)
        for (int i = 0; i < nVectors; i++) {
            int base = i * nValues;
            for (int j = 0; j < nValues; j++) {
                emitArrayLoad(asm, getVectorSize(), vecArray[base + j], arrayPtr, index, getVectorOffset(nVectors - (i + 1)), alignedLoad);
            }
        }
        // compare all loaded bytes to the search value.
        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
        if (!findTwoConsecutive) {
            for (int i = 0; i < nVectors; i++) {
                int base = i * nValues;
                for (int j = 0; j < nValues; j++) {
                    emitVectorCompareInst(asm, kind, getVectorSize(), vecArray[base + j], vecCmp[j]);
                    if ((j & 1) == 1) {
                        // merge the results of search values (0, 1) and (2, 3)
                        emitPOR(asm, getVectorSize(), vecArray[base + j - 1], vecArray[base + j]);
                    }
                }
                if (nValues > 2) {
                    // merge the result of search value 2 (and 3) into the combined result
                    emitPOR(asm, getVectorSize(), vecArray[base], vecArray[base + 2]);
                }
                emitMOVMSK(asm, getVectorSize(), cmpResult[0], vecArray[base]);
                emitJnz(asm, cmpResult[0], vectorFound[nVectors - (i + 1)], shortJmp);
            }
        } else {
            // vectors are processed in pairs covering the same position, shifted by one element
            for (int i = 0; i < nVectors; i += 2) {
                emitVectorCompareInst(asm, kind, getVectorSize(), vecArray[i], vecCmp[0]);
                emitVectorCompareInst(asm, kind, getVectorSize(), vecArray[i + 1], vecCmp[0]);
                emitMOVMSK(asm, getVectorSize(), cmpResult[1], vecArray[i]);
                emitMOVMSK(asm, getVectorSize(), cmpResult[0], vecArray[i + 1]);
                emitJnz(asm, cmpResult[1], vectorFound[nVectors - (i + 1)], shortJmp);
                emitJnz(asm, cmpResult[0], vectorFound[nVectors - (i + 2)], shortJmp);
            }
        }
    }

    /**
     * Emits a test of {@code cond} against itself followed by a jump to {@code tgt} that is taken
     * if the register is not zero.
     *
     * @param shortJmp emit {@code jccb} instead of {@code jcc}
     */
    private static void emitJnz(AMD64MacroAssembler asm, Register cond, Label tgt, boolean shortJmp) {
        asm.testl(cond, cond);
        if (shortJmp) {
            asm.jccb(AMD64Assembler.ConditionFlag.NotZero, tgt);
        } else {
            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, tgt);
        }
    }

    /**
     * Loads one vector from {@code arrayPtr + index * arrayIndexScale + offset} into
     * {@code vecDst}.
     *
     * @param alignedLoad use {@code VMOVDQA} instead of {@code VMOVDQU} on the AVX path; ignored on
     *            the SSE path, which always uses {@code movdqu}
     */
    private void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, Register index, int offset, boolean alignedLoad) {
        AMD64Address src = new AMD64Address(arrayPtr, index, arrayIndexScale, offset);
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp loadOp = alignedLoad ? VexMoveOp.VMOVDQA : VexMoveOp.VMOVDQU;
            loadOp.emit(asm, vectorSize, vecDst, src);
        } else {
            // SSE
            asm.movdqu(vecDst, src);
        }
    }

    /**
     * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching values
     * are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero. The result
     * overwrites {@code vecArray}.
     *
     * @throws UnsupportedOperationException if {@code kind} is not Byte, Short, Char or Int
     */
    private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqb(vecArray, vecCmp);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqw(vecArray, vecCmp);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqd(vecArray, vecCmp);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }

    /** Emits a bitwise OR of {@code vecSrc} into {@code dst}, using the VEX form if available. */
    private static void emitPOR(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
        if (asm.supports(CPUFeature.AVX)) {
            VexRVMOp.VPOR.emit(asm, vectorSize, dst, dst, vecSrc);
        } else {
            // SSE
            asm.por(dst, vecSrc);
        }
    }

    /**
     * Emits a byte mask extraction ({@code VPMOVMSKB} / {@code pmovmskb}) from {@code vecSrc} into
     * the general purpose register {@code dst}.
     */
    private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
        if (asm.supports(CPUFeature.AVX)) {
            VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc);
        } else {
            // SSE
            asm.pmovmskb(dst, vecSrc);
        }
    }

    /**
     * Maps a Java kind to the operand size used for scalar compares: BYTE for Byte, WORD for
     * Short/Char, DWORD for Int and QWORD for everything else.
     */
    private static OperandSize getOpSize(JavaKind kind) {
        switch (kind) {
            case Byte:
                return OperandSize.BYTE;
            case Short:
            case Char:
                return OperandSize.WORD;
            case Int:
                return OperandSize.DWORD;
            default:
                return OperandSize.QWORD;
        }
    }

    /** Returns {@code true} if the target architecture reports the AVX2 feature. */
    private static boolean supportsAVX2(LIRGeneratorTool tool) {
        return supports(tool, CPUFeature.AVX2);
    }

    /** Returns {@code true} if the target architecture's feature set contains {@code cpuFeature}. */
    private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) {
        return ((AMD64) tool.target().arch).getFeatures().contains(cpuFeature);
    }
}