/*
 * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

/**
 * Emits code that searches an array of bytes or chars for up to four search values and returns
 * the index of the first match, or {@code -1} if none of the values occurs in the array. With
 * {@code findTwoConsecutive}, the single search value holds two consecutive array elements packed
 * into one register, and the index of the first such pair is returned instead.
 */
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);

    private final JavaKind kind;
    private final int vmPageSize;
    private final int nValues;
    private final boolean findTwoConsecutive;
    private final AMD64Kind vectorKind;

    @Def({REG}) protected Value resultValue;
    @Alive({REG}) protected Value arrayPtrValue;
    @Use({REG}) protected Value arrayLengthValue;
    @Alive({REG}) protected Value searchValue1;
    @Alive({REG, ILLEGAL}) protected Value searchValue2;
    @Alive({REG, ILLEGAL}) protected Value searchValue3;
    @Alive({REG, ILLEGAL}) protected Value searchValue4;
    @Temp({REG}) protected Value arraySlotsRemaining;
    @Temp({REG}) protected Value comparisonResult1;
    @Temp({REG}) protected Value comparisonResult2;
    @Temp({REG}) protected Value comparisonResult3;
    @Temp({REG}) protected Value comparisonResult4;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
    @Temp({REG, ILLEGAL}) protected Value vectorArray1;
    @Temp({REG, ILLEGAL}) protected Value vectorArray2;
    @Temp({REG, ILLEGAL}) protected Value vectorArray3;
    @Temp({REG, ILLEGAL}) protected Value vectorArray4;

    public AMD64ArrayIndexOfOp(JavaKind kind, boolean findTwoConsecutive, int vmPageSize, int maxVectorSize, LIRGeneratorTool tool, Value result, Value arrayPtr, Value arrayLength,
                    Value... searchValues) {
        super(TYPE);
        this.kind = kind;
        this.findTwoConsecutive = findTwoConsecutive;
        this.vmPageSize = vmPageSize;
        assert 0 < searchValues.length && searchValues.length <= 4;
        assert byteMode(kind) || charMode(kind);
        assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
        nValues = searchValues.length;
        assert !findTwoConsecutive || nValues == 1;
        resultValue = result;
        arrayPtrValue = arrayPtr;
        arrayLengthValue = arrayLength;
        searchValue1 = searchValues[0];
        searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
        searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
        searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
        arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(kind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
                        : byteMode(kind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
        vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
    }

    private static boolean byteMode(JavaKind kind) {
        return kind == JavaKind.Byte;
    }

    private static boolean charMode(JavaKind kind) {
        return kind == JavaKind.Char;
    }

    private JavaKind getComparisonKind() {
        return findTwoConsecutive ? (byteMode(kind) ? JavaKind.Char : JavaKind.Int) : kind;
    }

    private AVXKind.AVXSize getVectorSize() {
        return AVXKind.getDataSize(vectorKind);
    }

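    // Illustrative (hypothetical) construction of this op by an LIR generator; vmPageSize = 4096
    // and maxVectorSize = -1 (i.e. no limit on the vector size) are assumed example values:
    //
    //   new AMD64ArrayIndexOfOp(JavaKind.Char, false, 4096, -1, tool, result, arrayPointer,
    //                   arrayLength, searchChar);
    //
    // result, arrayPointer, arrayLength and searchChar are register values supplied by the
    // generator; arrayPointer points at the first array slot to search and arrayLength is the
    // number of slots. The created op is then appended to the LIR of the current block.
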
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        Register arrayPtr = asRegister(arrayPtrValue);
        Register arrayLength = asRegister(arrayLengthValue);
        Register result = asRegister(resultValue);
        Register slotsRemaining = asRegister(arraySlotsRemaining);
        Register[] searchValue = {
                        nValues > 0 ? asRegister(searchValue1) : null,
                        nValues > 1 ? asRegister(searchValue2) : null,
                        nValues > 2 ? asRegister(searchValue3) : null,
                        nValues > 3 ? asRegister(searchValue4) : null,
        };
        Register[] vecCmp = {
                        nValues > 0 ? asRegister(vectorCompareVal1) : null,
                        nValues > 1 ? asRegister(vectorCompareVal2) : null,
                        nValues > 2 ? asRegister(vectorCompareVal3) : null,
                        nValues > 3 ? asRegister(vectorCompareVal4) : null,
        };
        Register[] vecArray = {
                        asRegister(vectorArray1),
                        asRegister(vectorArray2),
                        asRegister(vectorArray3),
                        asRegister(vectorArray4),
        };
        Register[] cmpResult = {
                        asRegister(comparisonResult1),
                        asRegister(comparisonResult2),
                        asRegister(comparisonResult3),
                        asRegister(comparisonResult4),
        };
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label end = new Label();

        // load array length
        // important: this must be the first register manipulation, since arrayLengthValue is
        // annotated with @Use
        asm.movl(slotsRemaining, arrayLength);
        // load array pointer
        asm.movq(result, arrayPtr);
        // move search values to vectors
        for (int i = 0; i < nValues; i++) {
            if (asm.supports(CPUFeature.AVX)) {
                VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp[i], searchValue[i]);
            } else {
                asm.movdl(vecCmp[i], searchValue[i]);
            }
        }
        // fill comparison vector with copies of the search value
        for (int i = 0; i < nValues; i++) {
            emitBroadcast(asm, getComparisonKind(), vecCmp[i], vecArray[0], getVectorSize());
        }

        emitArrayIndexOfChars(crb, asm, result, slotsRemaining, searchValue, vecCmp, vecArray, cmpResult, retFound, retNotFound);

        // return -1 (no match)
        asm.bind(retNotFound);
        asm.movq(result, -1);
        asm.jmpb(end);

        asm.bind(retFound);
        // convert array pointer to offset
        asm.subq(result, arrayPtr);
        if (charMode(kind)) {
            asm.shrq(result, 1);
        }
        asm.bind(end);
    }

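    /**
     * Emits the main search: a bulk loop that compares {@code nVectors} vectors per iteration on
     * memory aligned to the bulk size, a single-vector loop for smaller remainders (only emitted
     * when more than one vector is used per bulk iteration), and a tail that either issues one
     * over-reading vector compare guarded by a page-boundary check or compares the remaining
     * slots one by one.
     */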
    private void emitArrayIndexOfChars(CompilationResultBuilder crb, AMD64MacroAssembler asm,
                    Register arrayPtr,
                    Register slotsRemaining,
                    Register[] searchValue,
                    Register[] vecCmp,
                    Register[] vecArray,
                    Register[] cmpResult,
                    Label retFound,
                    Label retNotFound) {
        int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
        AVXKind.AVXSize vectorSize = getVectorSize();

        Label bulkVectorLoop = new Label();
        Label singleVectorLoop = new Label();
        Label[] vectorFound = {
                        new Label(),
                        new Label(),
                        new Label(),
                        new Label(),
        };
        Label lessThanVectorSizeRemaining = new Label();
        Label lessThanVectorSizeRemainingLoop = new Label();
        Label bulkVectorLoopExit = nVectors == 1 ? lessThanVectorSizeRemaining : singleVectorLoop;
        int bytesPerVector = vectorSize.getBytes();
        int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
        int singleVectorLoopCondition = arraySlotsPerVector;
        int bulkSize = arraySlotsPerVector * nVectors;
        int bulkSizeBytes = bytesPerVector * nVectors;
        int bulkLoopCondition = bulkSize;
        int[] vectorOffsets;
        JavaKind vectorCompareKind = kind;
        if (findTwoConsecutive) {
            singleVectorLoopCondition++;
            bulkLoopCondition++;
            bulkSize /= 2;
            bulkSizeBytes /= 2;
            vectorOffsets = new int[]{0, kind.getByteCount(), bytesPerVector, bytesPerVector + kind.getByteCount()};
            vectorCompareKind = byteMode(kind) ? JavaKind.Char : JavaKind.Int;
        } else {
            vectorOffsets = new int[]{0, bytesPerVector, bytesPerVector * 2, bytesPerVector * 3};
        }

        // load copy of low part of array pointer
        Register tmpArrayPtrLow = cmpResult[0];
        asm.movl(tmpArrayPtrLow, arrayPtr);

        // check if bulk vector load is in bounds
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);

        // check if array pointer is aligned to bulkSize
        asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);

        // do one unaligned bulk comparison pass and adjust alignment afterwards
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
        // load copy of low part of array pointer
        asm.movl(tmpArrayPtrLow, arrayPtr);
        // adjust array pointer
        asm.addq(arrayPtr, bulkSizeBytes);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // get offset to bulk size alignment
        asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
        emitBytesToArraySlots(asm, kind, tmpArrayPtrLow);
        // adjust array pointer to bulk size alignment
        asm.andq(arrayPtr, ~(bulkSizeBytes - 1));
        // adjust number of array slots remaining
        asm.addl(slotsRemaining, tmpArrayPtrLow);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);

        emitAlign(crb, asm);
        asm.bind(bulkVectorLoop);
        // memory-aligned bulk comparison
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, !findTwoConsecutive);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // adjust array pointer
        asm.addq(arrayPtr, bulkSizeBytes);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
        // continue loop
        asm.jmp(bulkVectorLoop);

        if (nVectors > 1) {
            emitAlign(crb, asm);
            // same loop as bulkVectorLoop, with only one vector
            asm.bind(singleVectorLoop);
            // check if single vector load is in bounds
            asm.cmpl(slotsRemaining, singleVectorLoopCondition);
            asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
            // compare
            emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoConsecutive ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, arraySlotsPerVector);
            // adjust array pointer
            asm.addq(arrayPtr, bytesPerVector);
            // continue loop
            asm.jmpb(singleVectorLoop);
        }

        asm.bind(lessThanVectorSizeRemaining);
        // check if any array slots remain
        asm.testl(slotsRemaining, slotsRemaining);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);

        // a vector compare will read out of bounds of the input array.
        // check if the out-of-bounds read would cross a memory page boundary.
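        // Illustration of the check below, with an assumed 4 KiB page (vmPageSize = 4096) and a
        // 32-byte vector: the tail load stays within the current page only while
        // (arrayPtr & 4095) <= 4096 - 32, i.e. while the low bits are at most 4064; otherwise the
        // over-reading load could touch the next, possibly unmapped, page and the code falls back
        // to the scalar loop instead.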
        // load copy of low part of array pointer
        asm.movl(tmpArrayPtrLow, arrayPtr);
        // check if pointer + vector size would cross the page boundary
        asm.andl(tmpArrayPtrLow, (vmPageSize - 1));
        asm.cmpl(tmpArrayPtrLow, (vmPageSize - (findTwoConsecutive ? bytesPerVector + kind.getByteCount() : bytesPerVector)));
        // if the page boundary would be crossed, do byte/character-wise comparison instead.
        asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);

        Label[] overBoundsMatch = {new Label(), new Label()};
        // otherwise, do a vector compare that reads beyond array bounds
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoConsecutive ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, overBoundsMatch, false);
        // no match
        asm.jmp(retNotFound);
        if (findTwoConsecutive) {
            Label overBoundsFinish = new Label();
            asm.bind(overBoundsMatch[1]);
            // get match offset of second result
            asm.bsfq(cmpResult[1], cmpResult[1]);
            asm.addl(cmpResult[1], kind.getByteCount());
            // replace first result with second and continue
            asm.movl(cmpResult[0], cmpResult[1]);
            asm.jmpb(overBoundsFinish);

            asm.bind(overBoundsMatch[0]);
            emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, overBoundsFinish);
        } else {
            asm.bind(overBoundsMatch[0]);
            // find match offset
            asm.bsfq(cmpResult[0], cmpResult[0]);
        }

        // adjust array pointer for match result
        asm.addq(arrayPtr, cmpResult[0]);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult[0], 1);
        }
        // check if offset of matched value is greater than number of bytes remaining / out of
        // array bounds
        if (findTwoConsecutive) {
            asm.decrementl(slotsRemaining);
        }
        asm.cmpl(cmpResult[0], slotsRemaining);
        // match is out of bounds, return no match
        asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
        // adjust number of array slots remaining
        if (findTwoConsecutive) {
            asm.incrementl(slotsRemaining, 1);
        }
        asm.subl(slotsRemaining, cmpResult[0]);
        // match is in bounds, return offset
        asm.jmp(retFound);

        // compare remaining slots in the array one-by-one
        asm.bind(lessThanVectorSizeRemainingLoop);
        // check if enough array slots remain
        asm.cmpl(slotsRemaining, findTwoConsecutive ? 1 : 0);
        asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, retNotFound);
        // load char / byte
        if (byteMode(kind)) {
            if (findTwoConsecutive) {
                asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
            } else {
                asm.movzbl(cmpResult[0], new AMD64Address(arrayPtr));
            }
        } else {
            if (findTwoConsecutive) {
                asm.movl(cmpResult[0], new AMD64Address(arrayPtr));
            } else {
                asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
            }
        }
        // check for match
        for (int i = 0; i < nValues; i++) {
            emitCompareInst(asm, getComparisonKind(), cmpResult[0], searchValue[i]);
            asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
        }
        // adjust number of array slots remaining
        asm.decrementl(slotsRemaining);
        // adjust array pointer
        asm.addq(arrayPtr, kind.getByteCount());
        // continue loop
        asm.jmpb(lessThanVectorSizeRemainingLoop);

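        // Handlers for the vectorFound[i] labels targeted by emitVectorCompare: each label
        // corresponds to a match in the vector loaded at vectorOffsets[i], so the array pointer
        // and the remaining slot count are adjusted by that offset before the result is computed.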
        for (int i = 1; i < nVectors; i += (findTwoConsecutive ? 2 : 1)) {
            emitVectorFoundWithOffset(asm, kind, vectorOffsets[i], arrayPtr, cmpResult[i], slotsRemaining, vectorFound[i], retFound);
        }

        if (findTwoConsecutive) {
            asm.bind(vectorFound[2]);
            asm.addq(arrayPtr, vectorOffsets[2]);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, charMode(kind) ? vectorOffsets[2] / 2 : vectorOffsets[2]);
            asm.movl(cmpResult[0], cmpResult[2]);
            asm.movl(cmpResult[1], cmpResult[3]);
            asm.bind(vectorFound[0]);
            emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, new Label());
        } else {
            asm.bind(vectorFound[0]);
            // find index of first set bit in bit mask
            asm.bsfq(cmpResult[0], cmpResult[0]);
        }
        // add offset to array pointer
        asm.addq(arrayPtr, cmpResult[0]);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult[0], 1);
        }
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, cmpResult[0]);
        asm.jmpb(retFound);
    }

    private static void emitFindTwoCharPrefixMinResult(AMD64MacroAssembler asm, JavaKind kind, Register[] cmpResult, Label done) {
        // find match offset
        asm.bsfq(cmpResult[0], cmpResult[0]);
        // check if second result is also a match
        asm.testl(cmpResult[1], cmpResult[1]);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, done);
        // get match offset of second result
        asm.bsfq(cmpResult[1], cmpResult[1]);
        asm.addl(cmpResult[1], kind.getByteCount());
        // check if first result is less than second
        asm.cmpl(cmpResult[0], cmpResult[1]);
        asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, done);
        // first result is greater than second, replace it with the second result
        asm.movl(cmpResult[0], cmpResult[1]);
        asm.bind(done);
    }

    private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        asm.align(crb.target.wordSize * 2);
    }

    /**
     * Fills {@code vecDst} with copies of its lowest byte, word or dword.
     */
    private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
                    VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
                } else if (asm.supports(CPUFeature.SSSE3)) {
                    asm.pxor(vecTmp, vecTmp);
                    asm.pshufb(vecDst, vecTmp);
                } else { // SSE2
                    asm.punpcklbw(vecDst, vecDst);
                    asm.punpcklbw(vecDst, vecDst);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshuflw(vecDst, vecDst, 0);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }

    /**
     * Convert a byte offset stored in {@code bytes} to an array index offset.
     */
    private static void emitBytesToArraySlots(AMD64MacroAssembler asm, JavaKind kind, Register bytes) {
        if (charMode(kind)) {
            asm.shrl(bytes, 1);
        } else {
            assert byteMode(kind);
        }
    }

    private static void emitVectorCompare(AMD64MacroAssembler asm,
                    JavaKind kind,
                    AVXKind.AVXSize vectorSize,
                    int nValues,
                    int nVectors,
                    int[] vectorOffsets,
                    Register arrayPtr,
                    Register[] vecCmp,
                    Register[] vecArray,
                    Register[] cmpResult,
                    Label[] vectorFound,
                    boolean alignedLoad) {
        // load array contents into vectors
        for (int i = 0; i < nValues; i++) {
            for (int j = 0; j < nVectors; j++) {
                emitArrayLoad(asm, vectorSize, vecArray[(i * nVectors) + j], arrayPtr, vectorOffsets[j], alignedLoad);
            }
        }
        // compare all loaded bytes to the search value.
        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
        for (int i = 0; i < nValues; i++) {
            for (int j = 0; j < nVectors; j++) {
                emitVectorCompareInst(asm, kind, vectorSize, vecArray[(i * nVectors) + j], vecCmp[i]);
            }
        }
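        // At this point vecArray[i * nVectors + j] holds the comparison result of search value i
        // against the j-th loaded vector; the bit masks extracted below use the same layout, and
        // the subsequent reduction folds all search values belonging to the same vector into
        // cmpResult[j] (e.g., for nValues = 2 and nVectors = 2: cmpResult[0] |= cmpResult[2] and
        // cmpResult[1] |= cmpResult[3]), so a single test per vector detects a match of any value.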
        // create 32-bit-masks from the most significant bit of every byte in the comparison
        // results.
        for (int i = 0; i < nValues * nVectors; i++) {
            emitMOVMSK(asm, vectorSize, cmpResult[i], vecArray[i]);
        }
        // join results of comparisons against multiple values
        for (int stride = 1; stride < nValues; stride *= 2) {
            for (int i = 0; i < nVectors; i++) {
                for (int j = 0; j + stride < nValues; j += stride * 2) {
                    asm.orl(cmpResult[i + (j * nVectors)], cmpResult[i + ((j + stride) * nVectors)]);
                }
            }
        }
        // check if a match was found
        for (int i = 0; i < nVectors; i++) {
            asm.testl(cmpResult[i], cmpResult[i]);
            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound[i]);
        }
    }

    private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm,
                    JavaKind kind,
                    int resultOffset,
                    Register result,
                    Register cmpResult,
                    Register slotsRemaining,
                    Label entry,
                    Label ret) {
        asm.bind(entry);
        if (resultOffset > 0) {
            // adjust array pointer
            asm.addq(result, resultOffset);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, charMode(kind) ? resultOffset / 2 : resultOffset);
        }
        // find index of first set bit in bit mask
        asm.bsfq(cmpResult, cmpResult);
        // add offset to array pointer
        asm.addq(result, cmpResult);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult, 1);
        }
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, cmpResult);
        asm.jmpb(ret);
    }

    private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
        AMD64Address src = new AMD64Address(arrayPtr, offset);
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp loadOp = alignedLoad ? VexMoveOp.VMOVDQA : VexMoveOp.VMOVDQU;
            loadOp.emit(asm, vectorSize, vecDst, src);
        } else {
            // SSE
            asm.movdqu(vecDst, src);
        }
    }

    /**
     * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching
     * values are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero.
     */
    private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqb(vecArray, vecCmp);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqw(vecArray, vecCmp);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqd(vecArray, vecCmp);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }

    private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
        if (asm.supports(CPUFeature.AVX)) {
            VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc);
        } else {
            // SSE
            asm.pmovmskb(dst, vecSrc);
        }
    }

    private static void emitCompareInst(AMD64MacroAssembler asm, JavaKind kind, Register dst, Register src) {
        switch (kind) {
            case Byte:
                asm.cmpb(dst, src);
                break;
            case Short:
            case Char:
                asm.cmpw(dst, src);
                break;
            case Int:
                asm.cmpl(dst, src);
                break;
            default:
                asm.cmpq(dst, src);
        }
    }

    private static boolean supportsAVX2(LIRGeneratorTool tool) {
        return supports(tool, CPUFeature.AVX2);
    }

    private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) {
        return ((AMD64) tool.target().arch).getFeatures().contains(cpuFeature);
    }
}