1 /* 2 * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 */ 23 24 25 package org.graalvm.compiler.lir.amd64; 26 27 import jdk.vm.ci.amd64.AMD64; 28 import jdk.vm.ci.amd64.AMD64.CPUFeature; 29 import jdk.vm.ci.amd64.AMD64Kind; 30 import jdk.vm.ci.code.Register; 31 import jdk.vm.ci.meta.JavaKind; 32 import jdk.vm.ci.meta.Value; 33 import org.graalvm.compiler.asm.Label; 34 import org.graalvm.compiler.asm.amd64.AMD64Address; 35 import org.graalvm.compiler.asm.amd64.AMD64Assembler; 36 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp; 37 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp; 38 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp; 39 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp; 40 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler; 41 import org.graalvm.compiler.asm.amd64.AVXKind; 42 import org.graalvm.compiler.core.common.LIRKind; 43 import org.graalvm.compiler.lir.LIRInstructionClass; 44 import org.graalvm.compiler.lir.Opcode; 45 import org.graalvm.compiler.lir.asm.CompilationResultBuilder; 46 import org.graalvm.compiler.lir.gen.LIRGeneratorTool; 47 48 import static jdk.vm.ci.code.ValueUtil.asRegister; 49 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL; 50 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG; 51 52 /** 53 */ 54 @Opcode("AMD64_ARRAY_INDEX_OF") 55 public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction { 56 public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class); 57 58 private final JavaKind kind; 59 private final int vmPageSize; 60 private final int nValues; 61 private final boolean findTwoConsecutive; 62 private final AMD64Kind vectorKind; 63 64 @Def({REG}) protected Value resultValue; 65 @Alive({REG}) protected Value arrayPtrValue; 66 @Use({REG}) protected Value arrayLengthValue; 67 @Alive({REG}) protected Value searchValue1; 68 @Alive({REG, ILLEGAL}) protected Value searchValue2; 69 @Alive({REG, ILLEGAL}) protected Value searchValue3; 70 @Alive({REG, ILLEGAL}) protected Value searchValue4; 71 @Temp({REG}) protected Value arraySlotsRemaining; 72 @Temp({REG}) protected Value comparisonResult1; 73 @Temp({REG}) protected Value comparisonResult2; 74 @Temp({REG}) protected Value comparisonResult3; 75 @Temp({REG}) protected Value comparisonResult4; 76 @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1; 77 @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2; 78 @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3; 79 @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4; 80 @Temp({REG, ILLEGAL}) protected Value vectorArray1; 81 @Temp({REG, ILLEGAL}) protected Value vectorArray2; 82 @Temp({REG, ILLEGAL}) protected Value vectorArray3; 83 @Temp({REG, ILLEGAL}) protected Value vectorArray4; 84 85 public AMD64ArrayIndexOfOp(JavaKind kind, boolean findTwoConsecutive, int vmPageSize, int maxVectorSize, LIRGeneratorTool tool, Value result, Value arrayPtr, Value arrayLength, 86 Value... searchValues) { 87 super(TYPE); 88 this.kind = kind; 89 this.findTwoConsecutive = findTwoConsecutive; 90 this.vmPageSize = vmPageSize; 91 assert 0 < searchValues.length && searchValues.length <= 4; 92 assert byteMode(kind) || charMode(kind); 93 assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool); 94 nValues = searchValues.length; 95 assert !findTwoConsecutive || nValues == 1; 96 resultValue = result; 97 arrayPtrValue = arrayPtr; 98 arrayLengthValue = arrayLength; 99 searchValue1 = searchValues[0]; 100 searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL; 101 searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL; 102 searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL; 103 arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD)); 104 comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD)); 105 comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD)); 106 comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD)); 107 comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD)); 108 vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(kind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD 109 : byteMode(kind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD; 110 vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind)); 111 vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL; 112 vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL; 113 vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL; 114 vectorArray1 = tool.newVariable(LIRKind.value(vectorKind)); 115 vectorArray2 = tool.newVariable(LIRKind.value(vectorKind)); 116 vectorArray3 = tool.newVariable(LIRKind.value(vectorKind)); 117 vectorArray4 = tool.newVariable(LIRKind.value(vectorKind)); 118 } 119 120 private static boolean byteMode(JavaKind kind) { 121 return kind == JavaKind.Byte; 122 } 123 124 private static boolean charMode(JavaKind kind) { 125 return kind == JavaKind.Char; 126 } 127 128 @Override 129 public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) { 130 Register arrayPtr = asRegister(arrayPtrValue); 131 Register arrayLength = asRegister(arrayLengthValue); 132 Register result = asRegister(resultValue); 133 Register slotsRemaining = asRegister(arraySlotsRemaining); 134 Register[] searchValue = { 135 nValues > 0 ? asRegister(searchValue1) : null, 136 nValues > 1 ? asRegister(searchValue2) : null, 137 nValues > 2 ? asRegister(searchValue3) : null, 138 nValues > 3 ? asRegister(searchValue4) : null, 139 }; 140 Register[] vecCmp = { 141 nValues > 0 ? asRegister(vectorCompareVal1) : null, 142 nValues > 1 ? asRegister(vectorCompareVal2) : null, 143 nValues > 2 ? asRegister(vectorCompareVal3) : null, 144 nValues > 3 ? asRegister(vectorCompareVal4) : null, 145 }; 146 Register[] vecArray = { 147 asRegister(vectorArray1), 148 asRegister(vectorArray2), 149 asRegister(vectorArray3), 150 asRegister(vectorArray4), 151 }; 152 Register[] cmpResult = { 153 asRegister(comparisonResult1), 154 asRegister(comparisonResult2), 155 asRegister(comparisonResult3), 156 asRegister(comparisonResult4), 157 }; 158 Label retFound = new Label(); 159 Label retNotFound = new Label(); 160 Label end = new Label(); 161 162 AVXKind.AVXSize vectorSize = AVXKind.getDataSize(vectorKind); 163 int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1; 164 165 // load array length 166 // important: this must be the first register manipulation, since arrayLengthValue is 167 // annotated with @Use 168 asm.movl(slotsRemaining, arrayLength); 169 // load array pointer 170 asm.movq(result, arrayPtr); 171 // move search values to vectors 172 for (int i = 0; i < nValues; i++) { 173 if (asm.supports(CPUFeature.AVX)) { 174 VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp[i], searchValue[i]); 175 } else { 176 asm.movdl(vecCmp[i], searchValue[i]); 177 } 178 } 179 // fill comparison vector with copies of the search value 180 for (int i = 0; i < nValues; i++) { 181 emitBroadcast(asm, findTwoConsecutive ? (byteMode(kind) ? JavaKind.Char : JavaKind.Int) : kind, vecCmp[i], vecArray[0], vectorSize); 182 } 183 184 emitArrayIndexOfChars(crb, asm, kind, vectorSize, result, slotsRemaining, searchValue, vecCmp, vecArray, cmpResult, retFound, retNotFound, vmPageSize, nValues, nVectors, findTwoConsecutive); 185 186 // return -1 (no match) 187 asm.bind(retNotFound); 188 asm.movq(result, -1); 189 asm.jmpb(end); 190 191 asm.bind(retFound); 192 // convert array pointer to offset 193 asm.subq(result, arrayPtr); 194 if (charMode(kind)) { 195 asm.shrq(result, 1); 196 } 197 asm.bind(end); 198 } 199 200 private static void emitArrayIndexOfChars(CompilationResultBuilder crb, AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, 201 Register arrayPtr, 202 Register slotsRemaining, 203 Register[] searchValue, 204 Register[] vecCmp, 205 Register[] vecArray, 206 Register[] cmpResult, 207 Label retFound, 208 Label retNotFound, 209 int vmPageSize, 210 int nValues, 211 int nVectors, 212 boolean findTwoCharPrefix) { 213 Label bulkVectorLoop = new Label(); 214 Label singleVectorLoop = new Label(); 215 Label[] vectorFound = { 216 new Label(), 217 new Label(), 218 new Label(), 219 new Label(), 220 }; 221 Label lessThanVectorSizeRemaining = new Label(); 222 Label lessThanVectorSizeRemainingLoop = new Label(); 223 Label bulkVectorLoopExit = nVectors == 1 ? lessThanVectorSizeRemaining : singleVectorLoop; 224 int bytesPerVector = vectorSize.getBytes(); 225 int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount(); 226 int singleVectorLoopCondition = arraySlotsPerVector; 227 int bulkSize = arraySlotsPerVector * nVectors; 228 int bulkSizeBytes = bytesPerVector * nVectors; 229 int bulkLoopCondition = bulkSize; 230 int[] vectorOffsets; 231 JavaKind vectorCompareKind = kind; 232 if (findTwoCharPrefix) { 233 singleVectorLoopCondition++; 234 bulkLoopCondition++; 235 bulkSize /= 2; 236 bulkSizeBytes /= 2; 237 vectorOffsets = new int[]{0, kind.getByteCount(), bytesPerVector, bytesPerVector + kind.getByteCount()}; 238 vectorCompareKind = byteMode(kind) ? JavaKind.Char : JavaKind.Int; 239 } else { 240 vectorOffsets = new int[]{0, bytesPerVector, bytesPerVector * 2, bytesPerVector * 3}; 241 } 242 243 // load copy of low part of array pointer 244 Register tmpArrayPtrLow = cmpResult[0]; 245 asm.movl(tmpArrayPtrLow, arrayPtr); 246 247 // check if bulk vector load is in bounds 248 asm.cmpl(slotsRemaining, bulkLoopCondition); 249 asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit); 250 251 // check if array pointer is aligned to bulkSize 252 asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1); 253 asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop); 254 255 // do one unaligned bulk comparison pass and adjust alignment afterwards 256 emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false); 257 // load copy of low part of array pointer 258 asm.movl(tmpArrayPtrLow, arrayPtr); 259 // adjust array pointer 260 asm.addq(arrayPtr, bulkSizeBytes); 261 // adjust number of array slots remaining 262 asm.subl(slotsRemaining, bulkSize); 263 // get offset to bulk size alignment 264 asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1); 265 emitBytesToArraySlots(asm, kind, tmpArrayPtrLow); 266 // adjust array pointer to bulk size alignment 267 asm.andq(arrayPtr, ~(bulkSizeBytes - 1)); 268 // adjust number of array slots remaining 269 asm.addl(slotsRemaining, tmpArrayPtrLow); 270 // check if there are enough array slots remaining for the bulk loop 271 asm.cmpl(slotsRemaining, bulkLoopCondition); 272 asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit); 273 274 emitAlign(crb, asm); 275 asm.bind(bulkVectorLoop); 276 // memory-aligned bulk comparison 277 emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, !findTwoCharPrefix); 278 // adjust number of array slots remaining 279 asm.subl(slotsRemaining, bulkSize); 280 // adjust array pointer 281 asm.addq(arrayPtr, bulkSizeBytes); 282 // check if there are enough array slots remaining for the bulk loop 283 asm.cmpl(slotsRemaining, bulkLoopCondition); 284 asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit); 285 // continue loop 286 asm.jmp(bulkVectorLoop); 287 288 if (nVectors > 1) { 289 emitAlign(crb, asm); 290 // same loop as bulkVectorLoop, with only one vector 291 asm.bind(singleVectorLoop); 292 // check if single vector load is in bounds 293 asm.cmpl(slotsRemaining, singleVectorLoopCondition); 294 asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining); 295 // compare 296 emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoCharPrefix ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false); 297 // adjust number of array slots remaining 298 asm.subl(slotsRemaining, arraySlotsPerVector); 299 // adjust array pointer 300 asm.addq(arrayPtr, bytesPerVector); 301 // continue loop 302 asm.jmpb(singleVectorLoop); 303 } 304 305 asm.bind(lessThanVectorSizeRemaining); 306 // check if any array slots remain 307 asm.testl(slotsRemaining, slotsRemaining); 308 asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound); 309 310 // a vector compare will read out of bounds of the input array. 311 // check if the out-of-bounds read would cross a memory page boundary. 312 // load copy of low part of array pointer 313 asm.movl(tmpArrayPtrLow, arrayPtr); 314 // check if pointer + vector size would cross the page boundary 315 asm.andl(tmpArrayPtrLow, (vmPageSize - 1)); 316 asm.cmpl(tmpArrayPtrLow, (vmPageSize - (findTwoCharPrefix ? bytesPerVector + kind.getByteCount() : bytesPerVector))); 317 // if the page boundary would be crossed, do byte/character-wise comparison instead. 318 asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop); 319 320 Label[] overBoundsMatch = {new Label(), new Label()}; 321 // otherwise, do a vector compare that reads beyond array bounds 322 emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoCharPrefix ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, overBoundsMatch, false); 323 // no match 324 asm.jmp(retNotFound); 325 if (findTwoCharPrefix) { 326 Label overBoundsFinish = new Label(); 327 asm.bind(overBoundsMatch[1]); 328 // get match offset of second result 329 asm.bsfq(cmpResult[1], cmpResult[1]); 330 asm.addl(cmpResult[1], kind.getByteCount()); 331 // replace first result with second and continue 332 asm.movl(cmpResult[0], cmpResult[1]); 333 asm.jmpb(overBoundsFinish); 334 335 asm.bind(overBoundsMatch[0]); 336 emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, overBoundsFinish); 337 } else { 338 asm.bind(overBoundsMatch[0]); 339 // find match offset 340 asm.bsfq(cmpResult[0], cmpResult[0]); 341 } 342 343 // adjust array pointer for match result 344 asm.addq(arrayPtr, cmpResult[0]); 345 if (charMode(kind)) { 346 // convert byte offset to chars 347 asm.shrl(cmpResult[0], 1); 348 } 349 // check if offset of matched value is greater than number of bytes remaining / out of array 350 // bounds 351 if (findTwoCharPrefix) { 352 asm.decrementl(slotsRemaining); 353 } 354 asm.cmpl(cmpResult[0], slotsRemaining); 355 // match is out of bounds, return no match 356 asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound); 357 // adjust number of array slots remaining 358 if (findTwoCharPrefix) { 359 asm.incrementl(slotsRemaining, 1); 360 } 361 asm.subl(slotsRemaining, cmpResult[0]); 362 // match is in bounds, return offset 363 asm.jmp(retFound); 364 365 // compare remaining slots in the array one-by-one 366 asm.bind(lessThanVectorSizeRemainingLoop); 367 // check if enough array slots remain 368 asm.cmpl(slotsRemaining, findTwoCharPrefix ? 1 : 0); 369 asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, retNotFound); 370 // load char / byte 371 if (byteMode(kind)) { 372 if (findTwoCharPrefix) { 373 asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr)); 374 } else { 375 asm.movzbl(cmpResult[0], new AMD64Address(arrayPtr)); 376 } 377 } else { 378 if (findTwoCharPrefix) { 379 asm.movl(cmpResult[0], new AMD64Address(arrayPtr)); 380 } else { 381 asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr)); 382 } 383 } 384 // check for match 385 for (int i = 0; i < nValues; i++) { 386 asm.cmpl(cmpResult[0], searchValue[i]); 387 asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound); 388 } 389 // adjust number of array slots remaining 390 asm.decrementl(slotsRemaining); 391 // adjust array pointer 392 asm.addq(arrayPtr, kind.getByteCount()); 393 // continue loop 394 asm.jmpb(lessThanVectorSizeRemainingLoop); 395 396 for (int i = 1; i < nVectors; i += (findTwoCharPrefix ? 2 : 1)) { 397 emitVectorFoundWithOffset(asm, kind, vectorOffsets[i], arrayPtr, cmpResult[i], slotsRemaining, vectorFound[i], retFound); 398 } 399 400 if (findTwoCharPrefix) { 401 asm.bind(vectorFound[2]); 402 asm.addq(arrayPtr, vectorOffsets[2]); 403 // adjust number of array slots remaining 404 asm.subl(slotsRemaining, charMode(kind) ? vectorOffsets[2] / 2 : vectorOffsets[2]); 405 asm.movl(cmpResult[0], cmpResult[2]); 406 asm.movl(cmpResult[1], cmpResult[3]); 407 asm.bind(vectorFound[0]); 408 emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, new Label()); 409 } else { 410 asm.bind(vectorFound[0]); 411 // find index of first set bit in bit mask 412 asm.bsfq(cmpResult[0], cmpResult[0]); 413 } 414 // add offset to array pointer 415 asm.addq(arrayPtr, cmpResult[0]); 416 if (charMode(kind)) { 417 // convert byte offset to chars 418 asm.shrl(cmpResult[0], 1); 419 } 420 // adjust number of array slots remaining 421 asm.subl(slotsRemaining, cmpResult[0]); 422 asm.jmpb(retFound); 423 } 424 425 private static void emitFindTwoCharPrefixMinResult(AMD64MacroAssembler asm, JavaKind kind, Register[] cmpResult, Label done) { 426 // find match offset 427 asm.bsfq(cmpResult[0], cmpResult[0]); 428 // check if second result is also a match 429 asm.testl(cmpResult[1], cmpResult[1]); 430 asm.jcc(AMD64Assembler.ConditionFlag.Zero, done); 431 // get match offset of second result 432 asm.bsfq(cmpResult[1], cmpResult[1]); 433 asm.addl(cmpResult[1], kind.getByteCount()); 434 // check if first result is less than second 435 asm.cmpl(cmpResult[0], cmpResult[1]); 436 asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, done); 437 // first result is greater than second, replace it with the second result 438 asm.movl(cmpResult[0], cmpResult[1]); 439 asm.bind(done); 440 } 441 442 private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) { 443 asm.align(crb.target.wordSize * 2); 444 } 445 446 /** 447 * Fills {@code vecDst} with copies of its lowest byte, word or dword. 448 */ 449 private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) { 450 switch (kind) { 451 case Byte: 452 if (asm.supports(CPUFeature.AVX2)) { 453 VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst); 454 } else if (asm.supports(CPUFeature.AVX)) { 455 VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp); 456 VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp); 457 } else if (asm.supports(CPUFeature.SSSE3)) { 458 asm.pxor(vecTmp, vecTmp); 459 asm.pshufb(vecDst, vecTmp); 460 } else { // SSE2 461 asm.punpcklbw(vecDst, vecDst); 462 asm.punpcklbw(vecDst, vecDst); 463 asm.pshufd(vecDst, vecDst, 0); 464 } 465 break; 466 case Short: 467 case Char: 468 if (asm.supports(CPUFeature.AVX2)) { 469 VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst); 470 } else if (asm.supports(CPUFeature.AVX)) { 471 VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0); 472 VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0); 473 } else { // SSE 474 asm.pshuflw(vecDst, vecDst, 0); 475 asm.pshufd(vecDst, vecDst, 0); 476 } 477 break; 478 case Int: 479 if (asm.supports(CPUFeature.AVX2)) { 480 VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst); 481 } else if (asm.supports(CPUFeature.AVX)) { 482 VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0); 483 } else { // SSE 484 asm.pshufd(vecDst, vecDst, 0); 485 } 486 break; 487 default: 488 throw new UnsupportedOperationException(); 489 } 490 } 491 492 /** 493 * Convert a byte offset stored in {@code bytes} to an array index offset. 494 */ 495 private static void emitBytesToArraySlots(AMD64MacroAssembler asm, JavaKind kind, Register bytes) { 496 if (charMode(kind)) { 497 asm.shrl(bytes, 1); 498 } else { 499 assert byteMode(kind); 500 } 501 } 502 503 private static void emitVectorCompare(AMD64MacroAssembler asm, 504 JavaKind kind, 505 AVXKind.AVXSize vectorSize, 506 int nValues, 507 int nVectors, 508 int[] vectorOffsets, 509 Register arrayPtr, 510 Register[] vecCmp, 511 Register[] vecArray, 512 Register[] cmpResult, 513 Label[] vectorFound, 514 boolean alignedLoad) { 515 // load array contents into vectors 516 for (int i = 0; i < nValues; i++) { 517 for (int j = 0; j < nVectors; j++) { 518 emitArrayLoad(asm, vectorSize, vecArray[(i * nVectors) + j], arrayPtr, vectorOffsets[j], alignedLoad); 519 } 520 } 521 // compare all loaded bytes to the search value. 522 // matching bytes are set to 0xff, non-matching bytes are set to 0x00. 523 for (int i = 0; i < nValues; i++) { 524 for (int j = 0; j < nVectors; j++) { 525 emitVectorCompareInst(asm, kind, vectorSize, vecArray[(i * nVectors) + j], vecCmp[i]); 526 } 527 } 528 // create 32-bit-masks from the most significant bit of every byte in the comparison 529 // results. 530 for (int i = 0; i < nValues * nVectors; i++) { 531 emitMOVMSK(asm, vectorSize, cmpResult[i], vecArray[i]); 532 } 533 // join results of comparisons against multiple values 534 for (int stride = 1; stride < nValues; stride *= 2) { 535 for (int i = 0; i < nVectors; i++) { 536 for (int j = 0; j + stride < nValues; j += stride * 2) { 537 asm.orl(cmpResult[i + (j * nVectors)], cmpResult[i + ((j + stride) * nVectors)]); 538 } 539 } 540 } 541 // check if a match was found 542 for (int i = 0; i < nVectors; i++) { 543 asm.testl(cmpResult[i], cmpResult[i]); 544 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound[i]); 545 } 546 } 547 548 private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm, 549 JavaKind kind, 550 int resultOffset, 551 Register result, 552 Register cmpResult, 553 Register slotsRemaining, 554 Label entry, 555 Label ret) { 556 asm.bind(entry); 557 if (resultOffset > 0) { 558 // adjust array pointer 559 asm.addq(result, resultOffset); 560 // adjust number of array slots remaining 561 asm.subl(slotsRemaining, charMode(kind) ? resultOffset / 2 : resultOffset); 562 } 563 // find index of first set bit in bit mask 564 asm.bsfq(cmpResult, cmpResult); 565 // add offset to array pointer 566 asm.addq(result, cmpResult); 567 if (charMode(kind)) { 568 // convert byte offset to chars 569 asm.shrl(cmpResult, 1); 570 } 571 // adjust number of array slots remaining 572 asm.subl(slotsRemaining, cmpResult); 573 asm.jmpb(ret); 574 } 575 576 private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) { 577 AMD64Address src = new AMD64Address(arrayPtr, offset); 578 if (asm.supports(CPUFeature.AVX)) { 579 VexMoveOp loadOp = alignedLoad ? VexMoveOp.VMOVDQA : VexMoveOp.VMOVDQU; 580 loadOp.emit(asm, vectorSize, vecDst, src); 581 } else { 582 // SSE 583 asm.movdqu(vecDst, src); 584 } 585 } 586 587 /** 588 * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching values 589 * are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero. 590 */ 591 private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) { 592 switch (kind) { 593 case Byte: 594 if (asm.supports(CPUFeature.AVX)) { 595 VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray); 596 } else { // SSE 597 asm.pcmpeqb(vecArray, vecCmp); 598 } 599 break; 600 case Short: 601 case Char: 602 if (asm.supports(CPUFeature.AVX)) { 603 VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray); 604 } else { // SSE 605 asm.pcmpeqw(vecArray, vecCmp); 606 } 607 break; 608 case Int: 609 if (asm.supports(CPUFeature.AVX)) { 610 VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray); 611 } else { // SSE 612 asm.pcmpeqd(vecArray, vecCmp); 613 } 614 break; 615 default: 616 throw new UnsupportedOperationException(); 617 } 618 } 619 620 private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) { 621 if (asm.supports(CPUFeature.AVX)) { 622 VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc); 623 } else { 624 // SSE 625 asm.pmovmskb(dst, vecSrc); 626 } 627 } 628 629 private static boolean supportsAVX2(LIRGeneratorTool tool) { 630 return supports(tool, CPUFeature.AVX2); 631 } 632 633 private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) { 634 return ((AMD64) tool.target().arch).getFeatures().contains(cpuFeature); 635 } 636 }