/*
 * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;

/**
 * Finds the index of the first occurrence of a byte or char value in an array, using SSE/AVX
 * vector compares to examine multiple array slots per iteration. The result is the index of the
 * first match, or -1 if the value does not occur in the array.
 */
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);

    private final JavaKind kind;
    private final int vmPageSize;

    @Def({REG}) protected Value resultValue;
    @Alive({REG}) protected Value charArrayPtrValue;
    @Use({REG}) protected Value charArrayLengthValue;
    @Alive({REG}) protected Value searchCharValue;
    @Temp({REG}) protected Value arraySlotsRemaining;
    @Temp({REG}) protected Value comparisonResult1;
    @Temp({REG}) protected Value comparisonResult2;
    @Temp({REG}) protected Value comparisonResult3;
    @Temp({REG}) protected Value comparisonResult4;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal;
    @Temp({REG, ILLEGAL}) protected Value vectorArray1;
    @Temp({REG, ILLEGAL}) protected Value vectorArray2;
    @Temp({REG, ILLEGAL}) protected Value vectorArray3;
    @Temp({REG, ILLEGAL}) protected Value vectorArray4;

    public AMD64ArrayIndexOfOp(JavaKind kind, int vmPageSize, LIRGeneratorTool tool, Value result, Value arrayPtr, Value arrayLength, Value searchChar) {
        super(TYPE);
        this.kind = kind;
        this.vmPageSize = vmPageSize;
        assert byteMode() || charMode();
        assert supports(tool, CPUFeature.SSSE3) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
        resultValue = result;
        charArrayPtrValue = arrayPtr;
        charArrayLengthValue = arrayLength;
        searchCharValue = searchChar;

        this.arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        this.comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        this.comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        this.comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        this.comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        AMD64Kind vectorKind = byteMode()
                        ? (supportsAVX2(tool) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE)
                        : (supportsAVX2(tool) ? AMD64Kind.V256_WORD : AMD64Kind.V128_WORD);
        this.vectorCompareVal = tool.newVariable(LIRKind.value(vectorKind));
        this.vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
        this.vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
        this.vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
        this.vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
    }

    private boolean byteMode() {
        return kind == JavaKind.Byte;
    }

    private boolean charMode() {
        return kind == JavaKind.Char;
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        Register arrayPtr = asRegister(charArrayPtrValue);
        Register arrayLength = asRegister(charArrayLengthValue);
        Register searchValue = asRegister(searchCharValue);
        Register result = asRegister(resultValue);
        Register vecCmp = asRegister(vectorCompareVal);
        Register vecArray1 = asRegister(vectorArray1);
        Register vecArray2 = asRegister(vectorArray2);
        Register vecArray3 = asRegister(vectorArray3);
        Register vecArray4 = asRegister(vectorArray4);
        Register slotsRemaining = asRegister(arraySlotsRemaining);
        Register cmpResult1 = asRegister(comparisonResult1);
        Register cmpResult2 = asRegister(comparisonResult2);
        Register cmpResult3 = asRegister(comparisonResult3);
        Register cmpResult4 = asRegister(comparisonResult4);

        Label bulkVectorLoop = new Label();
        Label singleVectorLoop = new Label();
        Label vectorFound1 = new Label();
        Label vectorFound2 = new Label();
        Label vectorFound3 = new Label();
        Label vectorFound4 = new Label();
        Label lessThanVectorSizeRemaining = new Label();
        Label lessThanVectorSizeRemainingLoop = new Label();
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label end = new Label();

        AVXKind.AVXSize vectorSize = asm.supports(CPUFeature.AVX2) ? AVXKind.AVXSize.YMM : AVXKind.AVXSize.XMM;
        int nVectors = 4;
        int bytesPerVector = vectorSize.getBytes();
        int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
        int bulkSize = arraySlotsPerVector * nVectors;
        int bulkSizeBytes = bytesPerVector * nVectors;
        assert bulkSizeBytes >= 64;
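        // Illustrative size arithmetic (not emitted as code): with AVX2 in char mode, vectorSize
        // is YMM, so bytesPerVector = 32, arraySlotsPerVector = 32 / 2 = 16 chars, and one
        // bulk-loop iteration covers bulkSize = 64 chars (bulkSizeBytes = 128 bytes). Without
        // AVX2 (XMM vectors), the bulk loop covers 32 chars / 64 bytes per iteration.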

        // load array length
        // important: this must be the first register manipulation, since charArrayLengthValue is
        // annotated with @Use
        asm.movl(slotsRemaining, arrayLength);
        // move search value to vector
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp, searchValue);
        } else {
            asm.movdl(vecCmp, searchValue);
        }
        // load array pointer
        asm.movq(result, arrayPtr);
        // load copy of low part of array pointer
        asm.movl(cmpResult1, arrayPtr);
        // fill comparison vector with copies of the search value
        emitBroadcast(asm, vecCmp, vecArray1, vectorSize);

        // check if bulk vector load is in bounds
        asm.cmpl(slotsRemaining, bulkSize);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);

        // check if array pointer is 64-byte aligned
        asm.andl(cmpResult1, 63);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);

        // do one unaligned bulk comparison pass and adjust alignment afterwards
        emitBulkCompare(asm, vectorSize, bytesPerVector, result, vecCmp, vecArray1, vecArray2, vecArray3, vecArray4, cmpResult1, cmpResult2, cmpResult3, cmpResult4,
                        vectorFound1, vectorFound2, vectorFound3, vectorFound4, false);
        // load copy of low part of array pointer
        asm.movl(cmpResult1, arrayPtr);
        // adjust array pointer
        asm.addq(result, bulkSizeBytes);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // get offset to 64-byte alignment
        asm.andl(cmpResult1, 63);
        emitBytesToArraySlots(asm, cmpResult1);
        // adjust array pointer to 64-byte alignment
        asm.andq(result, ~63);
        // adjust number of array slots remaining
        asm.addl(slotsRemaining, cmpResult1);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkSize);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
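        // Example of the alignment fix-up above (illustrative numbers): if arrayPtr is misaligned
        // by 16 bytes, the unaligned pass advances result by bulkSizeBytes (a multiple of 64) and
        // andq(result, ~63) then moves it back by those 16 bytes, so the pointer is 64-byte
        // aligned; the 16 re-exposed bytes' worth of slots are added back to slotsRemaining and
        // are scanned a second time by the aligned loop below.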

        emitAlign(crb, asm);
        asm.bind(bulkVectorLoop);
        // memory-aligned bulk comparison
        emitBulkCompare(asm, vectorSize, bytesPerVector, result, vecCmp, vecArray1, vecArray2, vecArray3, vecArray4, cmpResult1, cmpResult2, cmpResult3, cmpResult4,
                        vectorFound1, vectorFound2, vectorFound3, vectorFound4, true);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // adjust array pointer
        asm.addq(result, bulkSizeBytes);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkSize);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, singleVectorLoop);
        // continue loop
        asm.jmpb(bulkVectorLoop);

        emitAlign(crb, asm);
        // same loop as bulkVectorLoop, with only one vector
        asm.bind(singleVectorLoop);
        // check if single vector load is in bounds
        asm.cmpl(slotsRemaining, arraySlotsPerVector);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
        // compare
        emitSingleVectorCompare(asm, vectorSize, result, vecCmp, vecArray1, cmpResult1);

        // check if a match was found
        asm.testl(cmpResult1, cmpResult1);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound1);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, arraySlotsPerVector);
        // adjust array pointer
        asm.addq(result, bytesPerVector);
        // continue loop
        asm.jmpb(singleVectorLoop);

        asm.bind(lessThanVectorSizeRemaining);
        // check if any array slots remain
        asm.testl(slotsRemaining, slotsRemaining);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);

        // a vector compare will read out of bounds of the input array.
        // check if the out-of-bounds read would cross a memory page boundary.
        // load copy of low part of array pointer
        asm.movl(cmpResult1, result);
        // check if pointer + vector size would cross the page boundary
        asm.andl(cmpResult1, (vmPageSize - 1));
        asm.cmpl(cmpResult1, (vmPageSize - bytesPerVector));
        // if the page boundary would be crossed, do byte/character-wise comparison instead.
        asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);
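        // Illustrative numbers for the page-boundary check above: with vmPageSize = 4096 and
        // 32-byte vectors, the over-reading vector load is treated as safe iff
        // (result & 4095) <= 4064, i.e. the load cannot touch the next, possibly unmapped, page.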
        // otherwise, do a vector compare that reads beyond array bounds
        emitSingleVectorCompare(asm, vectorSize, result, vecCmp, vecArray1, cmpResult1);
        // check if a match was found
        asm.testl(cmpResult1, cmpResult1);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
        // find match offset
        asm.bsfq(cmpResult1, cmpResult1);
        if (charMode()) {
            // convert number of remaining characters to bytes
            asm.shll(slotsRemaining, 1);
        }
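        // Note on units (illustrative): the mask produced by (V)PMOVMSKB has one bit per byte, so
        // in char mode a match at char index 3 sets mask bits 6 and 7, and BSF yields byte offset
        // 6. The bounds check below therefore compares byte offsets, which is why slotsRemaining
        // was doubled above; the conversion back to a char index happens at retFound.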
        // adjust array pointer for match result
        asm.addq(result, cmpResult1);
        // check if offset of matched value is greater than number of bytes remaining / out of
        // array bounds
        asm.cmpl(cmpResult1, slotsRemaining);
        // match is out of bounds, return no match
        asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
        // match is in bounds, return offset
        asm.jmpb(retFound);

        // compare remaining slots in the array one-by-one
        asm.bind(lessThanVectorSizeRemainingLoop);
        // check if any array slots remain
        asm.testl(slotsRemaining, slotsRemaining);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);
        // load char / byte
        AMD64Assembler.OperandSize operandSize = byteMode() ? AMD64Assembler.OperandSize.BYTE : AMD64Assembler.OperandSize.WORD;
        if (byteMode()) {
            AMD64Assembler.AMD64RMOp.MOVB.emit(asm, operandSize, cmpResult1, new AMD64Address(result));
        } else {
            AMD64Assembler.AMD64RMOp.MOV.emit(asm, operandSize, cmpResult1, new AMD64Address(result));
        }
        // check for match
        AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(operandSize).emit(asm, operandSize, cmpResult1, searchValue);
        asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
        // adjust number of array slots remaining
        asm.decrementl(slotsRemaining);
        // adjust array pointer
        asm.addq(result, kind.getByteCount());
        // continue loop
        asm.jmpb(lessThanVectorSizeRemainingLoop);

        // return -1 (no match)
        asm.bind(retNotFound);
        asm.movl(result, -1);
        asm.jmpb(end);

        emitVectorFoundWithOffset(asm, bytesPerVector, result, cmpResult2, vectorFound2, retFound);
        emitVectorFoundWithOffset(asm, bytesPerVector * 2, result, cmpResult3, vectorFound3, retFound);
        emitVectorFoundWithOffset(asm, bytesPerVector * 3, result, cmpResult4, vectorFound4, retFound);

        asm.bind(vectorFound1);
        // find index of first set bit in bit mask
        asm.bsfq(cmpResult1, cmpResult1);
        // add offset to array pointer
        asm.addq(result, cmpResult1);

        asm.bind(retFound);
        // convert array pointer to offset
        asm.subq(result, arrayPtr);
        emitBytesToArraySlots(asm, result);
        asm.bind(end);
    }

    private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        asm.align(crb.target.wordSize * 2);
    }

    /**
     * Fills {@code vecDst} with copies of its lowest byte or word.
     */
    private void emitBroadcast(AMD64MacroAssembler asm, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
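        // Illustrative effect: in char mode, if the lowest word of vecDst holds 0x0041 ('A'),
        // every word lane of vecDst holds 0x0041 afterwards.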
        if (asm.supports(CPUFeature.AVX2)) {
            if (byteMode()) {
                VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
            } else {
                VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
            }
        } else if (asm.supports(CPUFeature.AVX)) {
            if (byteMode()) {
                // fill vecTmp with zeroes
                VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
                // broadcast loaded search value
                VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
            } else {
                // fill low qword
                VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
                // copy low qword to high qword
                VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
            }
        } else {
            // SSE version
            if (byteMode()) {
                // fill vecTmp with zeroes
                asm.pxor(vecTmp, vecTmp);
                // broadcast loaded search value
                asm.pshufb(vecDst, vecTmp);
            } else {
                // fill low qword
                asm.pshuflw(vecDst, vecDst, 0);
                // copy low qword to high qword
                asm.pshufd(vecDst, vecDst, 0);
            }
        }
    }

    /**
     * Loads {@code vectorSize} bytes from the position pointed to by {@code arrayPtr} and compares
     * them to the search value stored in {@code vecCmp}. {@code vecArray} is overwritten by this
     * operation. The comparison result is stored in {@code cmpResult}.
     */
    private void emitSingleVectorCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize,
                    Register arrayPtr, Register vecCmp, Register vecArray, Register cmpResult) {
        // load array contents into vector
        emitArrayLoad(asm, vectorSize, vecArray, arrayPtr, 0, false);
        // compare all loaded bytes to the search value.
        emitVectorCompare(asm, vectorSize, vecArray, vecCmp);
        // create a 32-bit-mask from the most significant bit of every byte in the comparison
        // result.
        emitMOVMSK(asm, vectorSize, cmpResult, vecArray);
    }

    /**
     * Converts a byte offset stored in {@code bytes} to an array index offset.
     */
    private void emitBytesToArraySlots(AMD64MacroAssembler asm, Register bytes) {
        if (charMode()) {
            asm.shrl(bytes, 1);
        } else {
            assert byteMode();
        }
    }

    private void emitBulkCompare(AMD64MacroAssembler asm,
                    AVXKind.AVXSize vectorSize,
                    int bytesPerVector,
                    Register arrayPtr,
                    Register vecCmp,
                    Register vecArray1,
                    Register vecArray2,
                    Register vecArray3,
                    Register vecArray4,
                    Register cmpResult1,
                    Register cmpResult2,
                    Register cmpResult3,
                    Register cmpResult4,
                    Label vectorFound1,
                    Label vectorFound2,
                    Label vectorFound3,
                    Label vectorFound4,
                    boolean alignedLoad) {
        // load array contents into vectors
        emitArrayLoad(asm, vectorSize, vecArray1, arrayPtr, 0, alignedLoad);
        emitArrayLoad(asm, vectorSize, vecArray2, arrayPtr, bytesPerVector, alignedLoad);
        emitArrayLoad(asm, vectorSize, vecArray3, arrayPtr, bytesPerVector * 2, alignedLoad);
        emitArrayLoad(asm, vectorSize, vecArray4, arrayPtr, bytesPerVector * 3, alignedLoad);
        // compare all loaded bytes to the search value.
        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
        emitVectorCompare(asm, vectorSize, vecArray1, vecCmp);
        emitVectorCompare(asm, vectorSize, vecArray2, vecCmp);
        emitVectorCompare(asm, vectorSize, vecArray3, vecCmp);
        emitVectorCompare(asm, vectorSize, vecArray4, vecCmp);
        // create 32-bit-masks from the most significant bit of every byte in the comparison
        // results.
        emitMOVMSK(asm, vectorSize, cmpResult1, vecArray1);
        emitMOVMSK(asm, vectorSize, cmpResult2, vecArray2);
        emitMOVMSK(asm, vectorSize, cmpResult3, vecArray3);
        emitMOVMSK(asm, vectorSize, cmpResult4, vecArray4);
        // check if a match was found
        asm.testl(cmpResult1, cmpResult1);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound1);
        asm.testl(cmpResult2, cmpResult2);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound2);
        asm.testl(cmpResult3, cmpResult3);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound3);
        asm.testl(cmpResult4, cmpResult4);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound4);
    }

    private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm, int resultOffset, Register result, Register cmpResult, Label entry, Label ret) {
        asm.bind(entry);
        if (resultOffset > 0) {
            // adjust array pointer
            asm.addq(result, resultOffset);
        }
        // find index of first set bit in bit mask
        asm.bsfq(cmpResult, cmpResult);
        // add offset to array pointer
        asm.addq(result, cmpResult);
        asm.jmpb(ret);
    }

    private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
        AMD64Address src = new AMD64Address(arrayPtr, offset);
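        // Note: the aligned variant (VMOVDQA) requires its address to be aligned to the vector
        // size and faults otherwise; the bulk loop only requests aligned loads after the array
        // pointer has been rounded down to a 64-byte boundary, presumably because aligned loads
        // can be faster on some CPUs.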
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp loadOp = alignedLoad ? VexMoveOp.VMOVDQA : VexMoveOp.VMOVDQU;
            loadOp.emit(asm, vectorSize, vecDst, src);
        } else {
            // SSE
            asm.movdqu(vecDst, src);
        }
    }

    private void emitVectorCompare(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
        // compare all loaded bytes to the search value.
        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
        if (asm.supports(CPUFeature.AVX)) {
            if (byteMode()) {
                VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
            } else {
                VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
            }
        } else {
            // SSE
            if (byteMode()) {
                asm.pcmpeqb(vecArray, vecCmp);
            } else {
                asm.pcmpeqw(vecArray, vecCmp);
            }
        }
    }

    private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
        if (asm.supports(CPUFeature.AVX)) {
            VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc);
        } else {
            // SSE
            asm.pmovmskb(dst, vecSrc);
        }
    }

    private static boolean supportsAVX2(LIRGeneratorTool tool) {
        return supports(tool, CPUFeature.AVX2);
    }

    private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) {
        return ((AMD64) tool.target().arch).getFeatures().contains(cpuFeature);
    }
}