package org.graalvm.compiler.lir.amd64;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.TargetDescription;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

/**
 * Emits code which compares two arrays of the same length. If the CPU supports any vector
 * instructions specialized code is emitted to leverage these instructions.
 */
@Opcode("ARRAY_EQUALS")
public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);

    // Element kind of both arrays.
    private final JavaKind kind;
    // Byte offset from the array base pointer to the first element; 0 for direct (raw) pointers.
    private final int arrayBaseOffset;
    // Size in bytes of one array element.
    private final int arrayIndexScale;
    // Length of the compared region in BYTES when known at compile time, otherwise negative.
    private final int constantByteLength;

    @Def({REG}) private Value resultValue;
    @Alive({REG}) private Value array1Value;
    @Alive({REG}) private Value array2Value;
    @Alive({REG}) private Value lengthValue;
    @Temp({REG}) private Value temp1;
    @Temp({REG}) private Value temp2;
    @Temp({REG}) private Value temp3;
    @Temp({REG}) private Value temp4;

    // Only allocated for float/double kinds (NaN-aware slow path); ILLEGAL otherwise.
    @Temp({REG, ILLEGAL}) private Value temp5;
    @Temp({REG, ILLEGAL}) private Value tempXMM;

    // Vector temporaries; only allocated when SSE4.1 is available, ILLEGAL otherwise.
    @Temp({REG, ILLEGAL}) private Value vectorTemp1;
    @Temp({REG, ILLEGAL}) private Value vectorTemp2;
    @Temp({REG, ILLEGAL}) private Value vectorTemp3;
    @Temp({REG, ILLEGAL}) private Value vectorTemp4;

    /**
     * @param kind element kind of both arrays
     * @param result value receiving 1 if the arrays are equal, 0 otherwise
     * @param length number of elements to compare (not bytes)
     * @param constantLength compile-time constant element count, or negative if unknown
     * @param directPointers if true, array1/array2 are raw pointers with no array header
     * @param maxVectorSize maximum vector size in bytes to use; negative means no limit
     */
    public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind, Value result, Value array1, Value array2, Value length,
                    int constantLength, boolean directPointers, int maxVectorSize) {
        super(TYPE);
        this.kind = kind;

        this.arrayBaseOffset = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind);
        this.arrayIndexScale = tool.getProviders().getMetaAccess().getArrayIndexScale(kind);

        if (constantLength >= 0 && arrayIndexScale > 1) {
            // scale length (element count -> byte count; the index scale is a power of two)
            this.constantByteLength = constantLength << NumUtil.log2Ceil(arrayIndexScale);
        } else {
            this.constantByteLength = constantLength;
        }

        this.resultValue = result;
        this.array1Value = array1;
        this.array2Value = array2;
        this.lengthValue = length;

        // Allocate some temporaries.
        // temp1/temp2 hold derived pointers into the arrays, hence unknownReference kinds.
        this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));

        // temp5 is only used as a loop counter by emitFloatCompareWithinRange (NaN slow path).
        this.temp5 = kind.isNumericFloat() ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        if (kind == JavaKind.Float) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
        } else if (kind == JavaKind.Double) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.tempXMM = Value.ILLEGAL;
        }

        // We only need the vector temporaries if we generate SSE code.
        if (supportsSSE41(tool.target())) {
            if (canGenerateConstantLengthCompare(tool.target())) {
                // Use 32-byte YMM vectors when AVX2 is available and not ruled out by maxVectorSize.
                LIRKind lirKind = LIRKind.value(supportsAVX2(tool.target()) && (maxVectorSize < 0 || maxVectorSize >= 32) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE);
                this.vectorTemp1 = tool.newVariable(lirKind);
                this.vectorTemp2 = tool.newVariable(lirKind);
                this.vectorTemp3 = tool.newVariable(lirKind);
                this.vectorTemp4 = tool.newVariable(lirKind);
            } else {
                // The generic vector compare loops only need two vector registers.
                this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp3 = Value.ILLEGAL;
                this.vectorTemp4 = Value.ILLEGAL;
            }
        } else {
            this.vectorTemp1 = Value.ILLEGAL;
            this.vectorTemp2 = Value.ILLEGAL;
            this.vectorTemp3 = Value.ILLEGAL;
            this.vectorTemp4 = Value.ILLEGAL;
        }
    }

    /**
     * The fully unrolled constant-length compare is only emitted for integer kinds (no NaN
     * handling required) when the byte length is a compile-time constant and SSE4.1 (PTEST)
     * is available.
     */
    private boolean canGenerateConstantLengthCompare(TargetDescription target) {
        return constantByteLength >= 0 && kind.isNumericInteger() && supportsSSE41(target);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Register result = asRegister(resultValue);
        Register array1 = asRegister(temp1);
        Register array2 = asRegister(temp2);

        Label trueLabel = new Label();
        Label falseLabel = new Label();
        Label done = new Label();

        // Load array base addresses.
        masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset));
        masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset));

        if (canGenerateConstantLengthCompare(crb.target)) {
            // Fully unrolled comparison for a compile-time constant byte length.
            emitConstantLengthArrayCompareBytes(masm, array1, array2, asRegister(temp3), asRegister(temp4),
                            new Register[]{asRegister(vectorTemp1), asRegister(vectorTemp2), asRegister(vectorTemp3), asRegister(vectorTemp4)},
                            falseLabel, constantByteLength, AVXKind.getRegisterSize(vectorTemp1).getBytes());
        } else {
            Register length = asRegister(temp3);

            // Get array length in bytes.
            masm.movl(length, asRegister(lengthValue));

            if (arrayIndexScale > 1) {
                masm.shll(length, NumUtil.log2Ceil(arrayIndexScale)); // scale length
            }

            masm.movl(result, length); // copy

            emitArrayCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        }

        // Return true
        masm.bind(trueLabel);
        masm.movl(result, 1);
        masm.jmpb(done);

        // Return false
        masm.bind(falseLabel);
        masm.xorl(result, result);

        // That's it
        masm.bind(done);
    }

    /**
     * Dispatches to the widest compare loop the CPU supports; the vector loop falls through to
     * the 8-byte loop, which in turn falls through to the 1-4 byte tail compares.
     */
    private static void emitArrayCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
                    Register result, Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        if (supportsAVX2(crb.target)) {
            emitAVXCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        } else if (supportsSSE41(crb.target)) {
            // this code is used for AVX as well because our backend correctly ensures that
            // VEX-prefixed instructions are emitted if AVX is supported
            emitSSE41Compare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        }
        emit8ByteCompare(crb, masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
        emitTailCompares(masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
    }

    /**
     * Returns if the underlying AMD64 architecture supports SSE 4.1 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports SSE 4.1
     */
    private static boolean supportsSSE41(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.SSE4_1);
    }

    /**
     * Vector size used in {@link #emitSSE41Compare}.
     */
    private static final int SSE4_1_VECTOR_SIZE = 16;

    /**
     * Emits code that uses SSE4.1 128-bit (16-byte) vector compares.
     */
    private static void emitSSE41Compare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
                    Register result, Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        assert supportsSSE41(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        Label loop = new Label();
        Label compareTail = new Label();

        // Float kinds need a slow path that treats two NaNs as equal (see emitFloatCompare).
        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 16-byte vectors
        masm.andl(result, SSE4_1_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(SSE4_1_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        // Point at the end of the vectorized region and run a negative index up to zero.
        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.movdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        // XOR + PTEST: the zero flag is set iff the two vectors are bitwise identical.
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ?
nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, SSE4_1_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        // All full vectors matched; the arrays are equal if there is no tail.
        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // NaN re-check is a slow path, so keep it outside of the main loop body.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, SSE4_1_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.movdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -SSE4_1_VECTOR_SIZE, falseLabel, SSE4_1_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        // Less than one full vector: fall through to the scalar compares with the tail count.
        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Returns if the underlying AMD64 architecture supports AVX instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports AVX
     */
    private static boolean supportsAVX2(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.AVX2);
    }

    /**
     * Vector size used in {@link #emitAVXCompare}.
     */
    private static final int AVX_VECTOR_SIZE = 32;

    /**
     * Emits code that uses AVX2 256-bit (32-byte) vector compares; same structure as
     * {@link #emitSSE41Compare} but with VEX-encoded instructions and wider vectors.
     */
    private static void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result,
                    Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        assert supportsAVX2(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        Label loop = new Label();
        Label compareTail = new Label();

        // Float kinds need a slow path that treats two NaNs as equal (see emitFloatCompare).
        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 32-byte vectors
        masm.andl(result, AVX_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(AVX_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        // Point at the end of the vectorized region and run a negative index up to zero.
        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.vmovdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.vmovdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        // XOR + VPTEST: the zero flag is set iff the two vectors are bitwise identical.
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ?
nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, AVX_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        // All full vectors matched; the arrays are equal if there is no tail.
        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // NaN re-check is a slow path, so keep it outside of the main loop body.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, AVX_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.vmovdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vmovdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        // Less than one full vector: fall through to the narrower compares with the tail count.
        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Vector size used in {@link #emit8ByteCompare}.
     */
    private static final int VECTOR_SIZE = 8;

    /**
     * Emits code that uses 8-byte vector compares.
     */
    private static void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4,
                    Value tempXMM, Label trueLabel, Label falseLabel) {
        Label loop = new Label();
        Label compareTail = new Label();

        // Float kinds need a slow path that treats two NaNs as equal (see emitFloatCompare).
        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        Register temp = asRegister(temp4);

        masm.andl(result, VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        // Point at the end of the 8-byte-aligned region and run a negative index up to zero.
        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movq(temp, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.cmpq(temp, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, VECTOR_SIZE);
        masm.jccb(ConditionFlag.NotZero, loop);

        // All 8-byte chunks matched; the arrays are equal if there is no tail.
        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // NaN check is slow path and hence placed outside of the main loop.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, kind, array1, array2, length, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movq(temp, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
        masm.cmpq(temp, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Equal, trueLabel);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, kind, array1, array2, result, temp4, tempXMM, -VECTOR_SIZE + offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
        } else {
            masm.jccb(ConditionFlag.NotEqual, falseLabel);
        }
        masm.jmpb(trueLabel);

        // Less than 8 bytes: fall through to the tail compares with the tail count.
        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Emits code to compare the remaining 1 to 4 bytes.
     */
    private static void emitTailCompares(AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4, Value tempXMM,
                    Label trueLabel, Label falseLabel) {
        Label compare2Bytes = new Label();
        Label compare1Byte = new Label();

        Register temp = asRegister(temp4);

        if (kind.getByteCount() <= 4) {
            // Compare trailing 4 bytes, if any.
            masm.testl(result, 4);
            masm.jccb(ConditionFlag.Zero, compare2Bytes);
            masm.movl(temp, new AMD64Address(array1, 0));
            masm.cmpl(temp, new AMD64Address(array2, 0));
            if (kind == JavaKind.Float) {
                // Bitwise mismatch may still be two NaNs, which count as equal here.
                masm.jccb(ConditionFlag.Equal, trueLabel);
                emitFloatCompare(masm, kind, array1, array2, Register.None, temp4, tempXMM, 0, falseLabel, true);
                masm.jmpb(trueLabel);
            } else {
                masm.jccb(ConditionFlag.NotEqual, falseLabel);
            }
            if (kind.getByteCount() <= 2) {
                // Move array pointers forward.
                masm.leaq(array1, new AMD64Address(array1, 4));
                masm.leaq(array2, new AMD64Address(array2, 4));

                // Compare trailing 2 bytes, if any.
                masm.bind(compare2Bytes);
                masm.testl(result, 2);
                masm.jccb(ConditionFlag.Zero, compare1Byte);
                masm.movzwl(temp, new AMD64Address(array1, 0));
                masm.movzwl(length, new AMD64Address(array2, 0));
                masm.cmpl(temp, length);
                masm.jccb(ConditionFlag.NotEqual, falseLabel);

                // The one-byte tail compare is only required for boolean and byte arrays.
                if (kind.getByteCount() <= 1) {
                    // Move array pointers forward before we compare the last trailing byte.
                    masm.leaq(array1, new AMD64Address(array1, 2));
                    masm.leaq(array2, new AMD64Address(array2, 2));

                    // Compare trailing byte, if any.
                    masm.bind(compare1Byte);
                    masm.testl(result, 1);
                    masm.jccb(ConditionFlag.Zero, trueLabel);
                    masm.movzbl(temp, new AMD64Address(array1, 0));
                    masm.movzbl(length, new AMD64Address(array2, 0));
                    masm.cmpl(temp, length);
                    masm.jccb(ConditionFlag.NotEqual, falseLabel);
                } else {
                    masm.bind(compare1Byte);
                }
            } else {
                masm.bind(compare2Bytes);
            }
        }
    }

    /**
     * Emits code to fall through if {@code src} is NaN, otherwise jump to
     * {@code branchIfNonNaN}.
     */
    private static void emitNaNCheck(AMD64MacroAssembler masm, JavaKind kind, Value tempXMM, AMD64Address src, Label branchIfNonNaN) {
        assert kind.isNumericFloat();
        Register tempXMMReg = asRegister(tempXMM);
        if (kind == JavaKind.Float) {
            masm.movflt(tempXMMReg, src);
        } else {
            masm.movdbl(tempXMMReg, src);
        }
        // Comparing the value with itself via UCOMIS sets the parity flag exactly when the
        // value is NaN (unordered compare), so NoParity means "not NaN".
        SSEOp.UCOMIS.emit(masm, kind == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
        masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
    }

    /**
     * Emits code to compare if two floats are bitwise equal or both NaN.
     */
    private static void emitFloatCompare(AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value tempXMM, int offset, Label falseLabel,
                    boolean skipBitwiseCompare) {
        AMD64Address address1 = new AMD64Address(base1, index, Scale.Times1, offset);
        AMD64Address address2 = new AMD64Address(base2, index, Scale.Times1, offset);

        Label bitwiseEqual = new Label();

        if (!skipBitwiseCompare) {
            // Bitwise compare
            Register temp = asRegister(temp4);

            if (kind == JavaKind.Float) {
                masm.movl(temp, address1);
                masm.cmpl(temp, address2);
            } else {
                masm.movq(temp, address1);
                masm.cmpq(temp, address2);
            }
            masm.jccb(ConditionFlag.Equal, bitwiseEqual);
        }

        // The values differ bitwise; they still count as equal if both are NaN.
        emitNaNCheck(masm, kind, tempXMM, address1, falseLabel);
        emitNaNCheck(masm, kind, tempXMM, address2, falseLabel);

        masm.bind(bitwiseEqual);
    }

    /**
     * Emits code to compare float equality within a range.
     */
    private static void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value temp5,
                    Value tempXMM, int offset, Label falseLabel, int range) {
        assert kind.isNumericFloat();
        Label loop = new Label();
        // Loop counter: runs from -range up to 0 in element-size steps.
        Register i = asRegister(temp5);

        masm.movq(i, range);
        masm.negq(i);
        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitFloatCompare(masm, kind, base1, base2, index, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == range);
        masm.addq(index, kind.getByteCount());
        masm.addq(i, kind.getByteCount());
        masm.jccb(ConditionFlag.NotZero, loop);
        // Floats within the range are equal, revert change to the register index
        masm.subq(index, range);
    }

    /**
     * Emits specialized assembly for checking equality of memory regions
     * {@code arrayPtr1[0..nBytes]} and {@code arrayPtr2[0..nBytes]}. If they match, execution
     * continues directly after the emitted code block, otherwise we jump to {@code noMatch}.
     */
    private static void emitConstantLengthArrayCompareBytes(
                    AMD64MacroAssembler asm,
                    Register arrayPtr1,
                    Register arrayPtr2,
                    Register tmp1,
                    Register tmp2,
                    Register[] tmpVectors,
                    Label noMatch,
                    int nBytes,
                    int bytesPerVector) {
        assert bytesPerVector >= 16;
        if (nBytes == 0) {
            // do nothing
            return;
        }
        if (nBytes < 16) {
            // array is shorter than any vector register, use regular CMP instructions
            int movSize = (nBytes < 2) ? 1 : ((nBytes < 4) ? 2 : ((nBytes < 8) ? 4 : 8));
            emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1), movSize);
            emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2), movSize);
            emitCmpBytes(asm, tmp1, tmp2, movSize);
            asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            if (nBytes > movSize) {
                // Cover the remainder with a second, overlapping load anchored at the end.
                emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1, nBytes - movSize), movSize);
                emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2, nBytes - movSize), movSize);
                emitCmpBytes(asm, tmp1, tmp2, movSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            }
        } else if (nBytes < 32 && bytesPerVector >= 32) {
            // we could use YMM registers, but the array is too short, force XMM registers
            int bytesPerXMMVector = AVXKind.AVXSize.XMM.getBytes();
            AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], new AMD64Address(arrayPtr1));
            AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[1], new AMD64Address(arrayPtr2));
            AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0], tmpVectors[1]);
            if (nBytes > bytesPerXMMVector) {
                // Second, overlapping 16-byte compare anchored at the end of the region.
                AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], new AMD64Address(arrayPtr1, nBytes - bytesPerXMMVector));
                AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[3], new AMD64Address(arrayPtr2, nBytes - bytesPerXMMVector));
                AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
            AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0]);
            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
        } else if (bytesPerVector >= 32) {
            // AVX2 supported, use YMM vectors
            assert asm.supports(CPUFeature.AVX2);
            // Main loop processes two vectors (64 bytes) per iteration.
            int loopCount = nBytes / (bytesPerVector * 2);
            int rest = nBytes % (bytesPerVector * 2);
            if (loopCount > 0) {
                if (0 < rest && rest < bytesPerVector) {
                    // Fold one loop iteration into the tail handling below.
                    loopCount--;
                }
                if (loopCount > 0) {
                    if (loopCount > 1) {
                        asm.movl(tmp1, loopCount);
                    }
                    Label loopBegin = new Label();
                    asm.bind(loopBegin);
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.addq(arrayPtr1, bytesPerVector * 2);
                    asm.addq(arrayPtr2, bytesPerVector * 2);
                    if (loopCount > 1) {
                        asm.decrementl(tmp1);
                        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
                    }
                }
                if (0 < rest && rest < bytesPerVector) {
                    // Re-emit the folded iteration plus an overlapping load covering the rest.
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
                    asm.vpxor(tmpVectors[0], tmpVectors[0],
tmpVectors[1]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
            }
            if (rest >= bytesPerVector) {
                // One full vector of rest, plus an overlapping load if rest is not vector-sized.
                asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                if (rest > bytesPerVector) {
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                asm.vptest(tmpVectors[0], tmpVectors[0]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        } else {
            // on AVX or SSE, use XMM vectors
            // Same structure as the YMM branch above, with legacy-encoded XMM instructions.
            int loopCount = nBytes / (bytesPerVector * 2);
            int rest = nBytes % (bytesPerVector * 2);
            if (loopCount > 0) {
                if (0 < rest && rest < bytesPerVector) {
                    // Fold one loop iteration into the tail handling below.
                    loopCount--;
                }
                if (loopCount > 0) {
                    if (loopCount > 1) {
                        asm.movl(tmp1, loopCount);
                    }
                    Label loopBegin = new Label();
                    asm.bind(loopBegin);
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.addq(arrayPtr1, bytesPerVector * 2);
                    asm.addq(arrayPtr2, bytesPerVector * 2);
                    if (loopCount > 1) {
                        asm.decrementl(tmp1);
                        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
                    }
                }
                if (0 < rest && rest < bytesPerVector) {
                    // Re-emit the folded iteration plus an overlapping load covering the rest.
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
            }
            if (rest >= bytesPerVector) {
                // One full vector of rest, plus an overlapping load if rest is not vector-sized.
                asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                asm.pxor(tmpVectors[0], tmpVectors[1]);
                if (rest > bytesPerVector) {
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                asm.ptest(tmpVectors[0], tmpVectors[0]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        }
    }

    /**
     * Emits a load of {@code size} bytes (1, 2, 4 or 8) from {@code src} into {@code dst};
     * sub-4-byte loads are zero-extended.
     */
    private static void emitMovBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
        switch (size) {
            case 1:
                asm.movzbl(dst, src);
                break;
            case 2:
                asm.movzwl(dst, src);
                break;
            case 4:
                asm.movl(dst, src);
                break;
            case 8:
                asm.movq(dst, src);
                break;
            default:
                throw new IllegalStateException();
777 } 778 } 779 780 private static void emitCmpBytes(AMD64MacroAssembler asm, Register dst, Register src, int size) { 781 if (size < 8) { 782 asm.cmpl(dst, src); 783 } else { 784 asm.cmpq(dst, src); 785 } 786 } 787 } | 24 25 package org.graalvm.compiler.lir.amd64; 26 27 import jdk.vm.ci.amd64.AMD64; 28 import jdk.vm.ci.amd64.AMD64.CPUFeature; 29 import jdk.vm.ci.amd64.AMD64Kind; 30 import jdk.vm.ci.code.Register; 31 import jdk.vm.ci.code.TargetDescription; 32 import jdk.vm.ci.meta.JavaKind; 33 import jdk.vm.ci.meta.Value; 34 import org.graalvm.compiler.asm.Label; 35 import org.graalvm.compiler.asm.amd64.AMD64Address; 36 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale; 37 import org.graalvm.compiler.asm.amd64.AMD64Assembler; 38 import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag; 39 import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp; 40 import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize; 41 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler; 42 import org.graalvm.compiler.asm.amd64.AVXKind; 43 import org.graalvm.compiler.core.common.LIRKind; 44 import org.graalvm.compiler.debug.GraalError; 45 import org.graalvm.compiler.lir.LIRInstructionClass; 46 import org.graalvm.compiler.lir.Opcode; 47 import org.graalvm.compiler.lir.asm.CompilationResultBuilder; 48 import org.graalvm.compiler.lir.gen.LIRGeneratorTool; 49 50 import static jdk.vm.ci.code.ValueUtil.asRegister; 51 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL; 52 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG; 53 54 import java.util.Objects; 55 56 /** 57 * Emits code which compares two arrays of the same length. If the CPU supports any vector 58 * instructions specialized code is emitted to leverage these instructions. 59 * 60 * This op can also compare arrays of different integer types (e.g. {@code byte[]} and 61 * {@code char[]}) with on-the-fly sign- or zero-extension. 
If one of the given arrays is a
 * {@code char[]} array, the smaller elements are zero-extended, otherwise they are sign-extended.
 */
@Opcode("ARRAY_EQUALS")
public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);

    // Element kinds of the two arrays; they may differ, but then both must be numeric integers
    // (asserted in the constructor).
    private final JavaKind kind1;
    private final JavaKind kind2;
    // Byte offset of the first element; 0 when comparing raw (direct) pointers.
    private final int arrayBaseOffset1;
    private final int arrayBaseOffset2;
    // Address scale corresponding to each element size (Times1..Times8).
    private final Scale arrayIndexScale1;
    private final Scale arrayIndexScale2;
    // YMM (32 byte) when AVX2 is available and not capped by maxVectorSize, otherwise XMM.
    private final AVXKind.AVXSize vectorSize;
    // Element count known at compile time, or negative if the length is only known at run time.
    private final int constantLength;
    // false if either array is a char[] (chars are unsigned), true otherwise.
    private final boolean signExtend;

    @Def({REG}) private Value resultValue;
    @Alive({REG}) private Value array1Value;
    @Alive({REG}) private Value array2Value;
    @Alive({REG}) private Value lengthValue;
    @Temp({REG}) private Value temp1;
    @Temp({REG}) private Value temp2;
    @Temp({REG}) private Value temp3;
    @Temp({REG}) private Value temp4;

    @Temp({REG, ILLEGAL}) private Value temp5;
    @Temp({REG, ILLEGAL}) private Value tempXMM;

    @Temp({REG, ILLEGAL}) private Value vectorTemp1;
    @Temp({REG, ILLEGAL}) private Value vectorTemp2;
    @Temp({REG, ILLEGAL}) private Value vectorTemp3;
    @Temp({REG, ILLEGAL}) private Value vectorTemp4;

    /**
     * @param kind1 element kind of {@code array1}
     * @param kind2 element kind of {@code array2}; if different from {@code kind1}, both must be
     *            numeric integer kinds
     * @param constantLength element count if known at compile time, negative otherwise
     * @param directPointers if true the arrays are raw pointers without an array header
     * @param maxVectorSize maximum vector register size in bytes, or negative for no limit
     */
    public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind1, JavaKind kind2, Value result, Value array1, Value array2, Value length,
                    int constantLength, boolean directPointers, int maxVectorSize) {
        super(TYPE);
        this.kind1 = kind1;
        this.kind2 = kind2;
        // char is the only unsigned kind handled here; mixed compares involving char[] must
        // zero-extend the narrower elements.
        this.signExtend = kind1 != JavaKind.Char && kind2 != JavaKind.Char;

        assert kind1.isNumericInteger() && kind2.isNumericInteger() || kind1 == kind2;

        this.arrayBaseOffset1 = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind1);
        this.arrayBaseOffset2 = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind2);
        this.arrayIndexScale1 = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(kind1)));
        this.arrayIndexScale2 = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(kind2)));
        this.vectorSize = ((AMD64) tool.target().arch).getFeatures().contains(CPUFeature.AVX2) && (maxVectorSize < 0 || maxVectorSize >= 32) ? AVXKind.AVXSize.YMM : AVXKind.AVXSize.XMM;
        this.constantLength = constantLength;

        this.resultValue = result;
        this.array1Value = array1;
        this.array2Value = array2;
        this.lengthValue = length;

        // Allocate some temporaries.
        this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));

        // temp5 is only needed by the float/NaN paths and the mixed-kind element-wise loop.
        this.temp5 = kind1.isNumericFloat() || kind1 != kind2 ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        if (kind1 == JavaKind.Float) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
        } else if (kind1 == JavaKind.Double) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.tempXMM = Value.ILLEGAL;
        }

        // We only need the vector temporaries if we generate SSE code.
        if (supportsSSE41(tool.target())) {
            if (canGenerateConstantLengthCompare(tool.target())) {
                // The constant-length fast path uses 4 full-width vector registers.
                LIRKind lirKind = LIRKind.value(vectorSize == AVXKind.AVXSize.YMM ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE);
                this.vectorTemp1 = tool.newVariable(lirKind);
                this.vectorTemp2 = tool.newVariable(lirKind);
                this.vectorTemp3 = tool.newVariable(lirKind);
                this.vectorTemp4 = tool.newVariable(lirKind);
            } else {
                this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp3 = Value.ILLEGAL;
                this.vectorTemp4 = Value.ILLEGAL;
            }
        } else {
            this.vectorTemp1 = Value.ILLEGAL;
            this.vectorTemp2 = Value.ILLEGAL;
            this.vectorTemp3 = Value.ILLEGAL;
            this.vectorTemp4 = Value.ILLEGAL;
        }
    }

    /**
     * The fully-unrolled constant-length compare is only generated for integer kinds on SSE4.1+
     * hardware, and for mixed kinds only when at least one XMM vector's worth of elements is
     * compared.
     */
    private boolean canGenerateConstantLengthCompare(TargetDescription target) {
        return constantLength >= 0 && kind1.isNumericInteger() && (kind1 == kind2 || getElementsPerVector(AVXKind.AVXSize.XMM) <= constantLength) && supportsSSE41(target);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Register result = asRegister(resultValue);
        Register array1 = asRegister(temp1);
        Register array2 = asRegister(temp2);

        Label trueLabel = new Label();
        Label falseLabel = new Label();
        Label done = new Label();

        // Load array base addresses.
        masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset1));
        masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset2));

        if (canGenerateConstantLengthCompare(crb.target)) {
            // Fully unrolled compare; falls through on equality, jumps to falseLabel otherwise.
            emitConstantLengthArrayCompareBytes(crb, masm, array1, array2, asRegister(temp3), asRegister(temp4),
                            new Register[]{asRegister(vectorTemp1), asRegister(vectorTemp2), asRegister(vectorTemp3), asRegister(vectorTemp4)}, falseLabel);
        } else {
            Register length = asRegister(temp3);
            // Get array length.
            masm.movl(length, asRegister(lengthValue));
            // copy
            masm.movl(result, length);
            emitArrayCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }

        // Return true
        masm.bind(trueLabel);
        masm.movl(result, 1);
        masm.jmpb(done);

        // Return false
        masm.bind(falseLabel);
        masm.xorl(result, result);

        // That's it
        masm.bind(done);
    }

    /**
     * Dispatches to the runtime-length compare strategies: a vectorized main loop (if SSE4.1 is
     * available), then either the 8-byte scalar loop plus tail compares (same kinds) or the
     * element-wise loop with on-the-fly extension (mixed kinds).
     */
    private void emitArrayCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length,
                    Label trueLabel, Label falseLabel) {
        if (supportsSSE41(crb.target)) {
            // Handles the bulk of the array; leaves the remainder count in 'length'.
            emitVectorCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }
        if (kind1 == kind2) {
            emit8ByteCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
            emitTailCompares(masm, result, array1, array2, length, trueLabel, falseLabel);
        } else {
            emitDifferentKindsElementWiseCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }
    }

    /**
     * Returns if the underlying AMD64 architecture supports SSE 4.1 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports SSE 4.1
     */
    private static boolean supportsSSE41(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.SSE4_1);
    }

    /**
     * Emits code that uses SSE4.1/AVX1 128-bit (16-byte) or AVX2 256-bit (32-byte) vector compares.
     */
    private void emitVectorCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length,
                    Label trueLabel, Label falseLabel) {
        assert supportsSSE41(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        int elementsPerVector = getElementsPerVector(vectorSize);

        Label loop = new Label();
        Label compareTail = new Label();

        // Float kinds need a bitwise-mismatch fallback that treats NaN == NaN as equal.
        boolean requiresNaNCheck = kind1.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 16-byte vectors
        masm.andl(result, elementsPerVector - 1); // tail count
        masm.andl(length, ~(elementsPerVector - 1)); // vector count
        masm.jcc(ConditionFlag.Zero, compareTail);

        // Point past the vectorized region and iterate with a negative index up to zero.
        masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitVectorLoad1(masm, vector1, array1, length, 0, vectorSize);
        emitVectorLoad2(masm, vector2, array2, length, 0, vectorSize);
        emitVectorCmp(masm, vector1, vector2, vectorSize);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, elementsPerVector);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // NaN handling is the slow path; keep it out of the straight-line code.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, elementsPerVector);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        emitVectorLoad1(masm, vector1, array1, result, scaleDisplacement1(-vectorSize.getBytes()), vectorSize);
        emitVectorLoad2(masm, vector2, array2, result, scaleDisplacement2(-vectorSize.getBytes()), vectorSize);
        emitVectorCmp(masm, vector1, vector2, vectorSize);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, array1, array2, result, -vectorSize.getBytes(), falseLabel, elementsPerVector);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Number of elements that fit into one vector of the given size, governed by the LARGER of the
     * two element sizes (the smaller elements are widened on load).
     */
    private int getElementsPerVector(AVXKind.AVXSize vSize) {
        return vSize.getBytes() >> Math.max(arrayIndexScale1.log2, arrayIndexScale2.log2);
    }

    private void emitVectorLoad1(AMD64MacroAssembler asm, Register dst, Register src, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad1(asm, dst, src, Register.None, displacement, size);
    }

    private void emitVectorLoad2(AMD64MacroAssembler asm, Register dst, Register src, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad2(asm, dst, src, Register.None, displacement, size);
    }

    // Loads from array 1, widening its elements if array 2's elements are larger.
    private void emitVectorLoad1(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad(asm, dst, src, index, displacement, arrayIndexScale1, arrayIndexScale2, size);
    }

    // Loads from array 2, widening its elements if array 1's elements are larger.
    private void emitVectorLoad2(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad(asm, dst, src, index, displacement, arrayIndexScale2, arrayIndexScale1, size);
    }

    /**
     * Loads one vector. If this array's elements are narrower than the other array's, they are
     * sign- or zero-extended to the wider element size during the load so both vectors can be
     * compared directly.
     */
    private void emitVectorLoad(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, Scale ownScale, Scale otherScale, AVXKind.AVXSize size) {
        AMD64Address address = new AMD64Address(src, index, ownScale, displacement);
        if (ownScale.value < otherScale.value) {
            if (size == AVXKind.AVXSize.YMM) {
                getAVX2LoadAndExtendOp(ownScale, otherScale, signExtend).emit(asm, size, dst, address);
            } else {
                loadAndExtendSSE(asm, dst, address, ownScale, otherScale, signExtend);
            }
        } else {
            // Plain full-width load; no widening needed.
            if (size == AVXKind.AVXSize.YMM) {
                asm.vmovdqu(dst, address);
            } else {
                asm.movdqu(dst, address);
            }
        }
    }

    private int scaleDisplacement1(int displacement) {
        return scaleDisplacement(displacement, arrayIndexScale1, arrayIndexScale2);
    }

    private int scaleDisplacement2(int displacement) {
        return scaleDisplacement(displacement, arrayIndexScale2, arrayIndexScale1);
    }

    /**
     * A widening load consumes fewer source bytes than it produces, so byte displacements must be
     * scaled down by the size ratio for the narrower array.
     */
    private static int scaleDisplacement(int displacement, Scale ownScale, Scale otherScale) {
        if (ownScale.value < otherScale.value) {
            return displacement >> (otherScale.log2 - ownScale.log2);
        }
        return displacement;
    }

    /**
     * Maps (source element size, target element size, signedness) to the matching AVX2
     * VPMOVSX/VPMOVZX load-and-extend instruction.
     */
    private static AMD64Assembler.VexRMOp getAVX2LoadAndExtendOp(Scale ownScale, Scale otherScale, boolean signExtend) {
        switch (ownScale) {
            case Times1:
                switch (otherScale) {
                    case Times2:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBW : AMD64Assembler.VexRMOp.VPMOVZXBW;
                    case Times4:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBD : AMD64Assembler.VexRMOp.VPMOVZXBD;
                    case Times8:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBQ : AMD64Assembler.VexRMOp.VPMOVZXBQ;
                }
                throw GraalError.shouldNotReachHere();
            case Times2:
                switch (otherScale) {
                    case Times4:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXWD : AMD64Assembler.VexRMOp.VPMOVZXWD;
                    case Times8:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXWQ : AMD64Assembler.VexRMOp.VPMOVZXWQ;
                }
                throw GraalError.shouldNotReachHere();
            case Times4:
                return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXDQ : AMD64Assembler.VexRMOp.VPMOVZXDQ;
        }
        throw GraalError.shouldNotReachHere();
    }

    /**
     * SSE4.1 counterpart of {@link #getAVX2LoadAndExtendOp}: emits the matching PMOVSX/PMOVZX
     * load-and-extend instruction.
     */
    private static void loadAndExtendSSE(AMD64MacroAssembler asm, Register dst, AMD64Address src, Scale ownScale, Scale otherScale, boolean signExtend) {
        switch (ownScale) {
            case Times1:
                switch (otherScale) {
                    case Times2:
                        if (signExtend) {
                            asm.pmovsxbw(dst, src);
                        } else {
                            asm.pmovzxbw(dst, src);
                        }
                        return;
                    case Times4:
                        if (signExtend) {
                            asm.pmovsxbd(dst, src);
                        } else {
                            asm.pmovzxbd(dst, src);
                        }
                        return;
                    case Times8:
                        if (signExtend) {
                            asm.pmovsxbq(dst, src);
                        } else {
                            asm.pmovzxbq(dst, src);
                        }
                        return;
                }
                throw GraalError.shouldNotReachHere();
            case Times2:
                switch (otherScale) {
                    case Times4:
                        if (signExtend) {
                            asm.pmovsxwd(dst, src);
                        } else {
                            asm.pmovzxwd(dst, src);
                        }
                        return;
                    case Times8:
                        if (signExtend) {
                            asm.pmovsxwq(dst, src);
                        } else {
                            asm.pmovzxwq(dst, src);
                        }
                        return;
                }
                throw GraalError.shouldNotReachHere();
            case Times4:
                if (signExtend) {
                    asm.pmovsxdq(dst, src);
                } else {
                    asm.pmovzxdq(dst, src);
                }
                return;
        }
        throw GraalError.shouldNotReachHere();
    }

    // XORs the two vectors and tests the result; ZF is set iff the vectors were equal.
    private static void emitVectorCmp(AMD64MacroAssembler masm, Register vector1, Register vector2, AVXKind.AVXSize size) {
        emitVectorXor(masm, vector1, vector2, size);
        emitVectorTest(masm, vector1, size);
    }

    private static void emitVectorXor(AMD64MacroAssembler masm, Register vector1, Register vector2, AVXKind.AVXSize size) {
        if (size == AVXKind.AVXSize.YMM) {
            masm.vpxor(vector1, vector1, vector2);
        } else {
            masm.pxor(vector1, vector2);
        }
    }

    // PTEST/VPTEST sets ZF iff the register is all zeros.
    private static void emitVectorTest(AMD64MacroAssembler masm, Register vector1, AVXKind.AVXSize size) {
        if (size == AVXKind.AVXSize.YMM) {
            masm.vptest(vector1, vector1);
        } else {
            masm.ptest(vector1, vector1);
        }
    }

    /**
     * Vector size used in {@link #emit8ByteCompare}.
     */
    private static final int VECTOR_SIZE = 8;

    /**
     * Emits code that uses 8-byte vector compares.
     */
    private void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert kind1 == kind2;
        Label loop = new Label();
        Label compareTail = new Label();

        int elementsPerVector = 8 >> arrayIndexScale1.log2;

        // Float kinds need a fallback that treats NaN == NaN as equal on bitwise mismatch.
        boolean requiresNaNCheck = kind1.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        Register temp = asRegister(temp4);

        masm.andl(result, elementsPerVector - 1); // tail count
        masm.andl(length, ~(elementsPerVector - 1)); // vector count
        masm.jcc(ConditionFlag.Zero, compareTail);

        // Point past the vectorized region and iterate with a negative index up to zero.
        masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movq(temp, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.cmpq(temp, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, elementsPerVector);
        masm.jccb(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // NaN check is slow path and hence placed outside of the main loop.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind1.getByteCount()) {
                emitFloatCompare(masm, array1, array2, length, offset, falseLabel, kind1.getByteCount() == VECTOR_SIZE);
            }
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movq(temp, new AMD64Address(array1, result, arrayIndexScale1, -VECTOR_SIZE));
        masm.cmpq(temp, new AMD64Address(array2, result, arrayIndexScale2, -VECTOR_SIZE));
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Equal, trueLabel);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind1.getByteCount()) {
                emitFloatCompare(masm, array1, array2, result, -VECTOR_SIZE + offset, falseLabel, kind1.getByteCount() == VECTOR_SIZE);
            }
        } else {
            masm.jccb(ConditionFlag.NotEqual, falseLabel);
        }
        masm.jmpb(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Emits code to compare the remaining 1 to 4 bytes.
     */
    private void emitTailCompares(AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert kind1 == kind2;
        Label compare2Bytes = new Label();
        Label compare1Byte = new Label();

        Register temp = asRegister(temp4);

        if (kind1.getByteCount() <= 4) {
            // Compare trailing 4 bytes, if any.
            masm.testl(result, arrayIndexScale1.log2 == 0 ? 4 : 4 >> arrayIndexScale1.log2);
            masm.jccb(ConditionFlag.Zero, compare2Bytes);
            masm.movl(temp, new AMD64Address(array1, 0));
            masm.cmpl(temp, new AMD64Address(array2, 0));
            if (kind1 == JavaKind.Float) {
                // Bitwise mismatch may still be two NaNs, which compare as equal here.
                masm.jccb(ConditionFlag.Equal, trueLabel);
                emitFloatCompare(masm, array1, array2, Register.None, 0, falseLabel, true);
                masm.jmpb(trueLabel);
            } else {
                masm.jccb(ConditionFlag.NotEqual, falseLabel);
            }
            if (kind1.getByteCount() <= 2) {
                // Move array pointers forward.
                masm.leaq(array1, new AMD64Address(array1, 4));
                masm.leaq(array2, new AMD64Address(array2, 4));

                // Compare trailing 2 bytes, if any.
                masm.bind(compare2Bytes);
                masm.testl(result, arrayIndexScale1.log2 == 0 ? 2 : 2 >> arrayIndexScale1.log2);
                masm.jccb(ConditionFlag.Zero, compare1Byte);
                masm.movzwl(temp, new AMD64Address(array1, 0));
                masm.movzwl(length, new AMD64Address(array2, 0));
                masm.cmpl(temp, length);
                masm.jccb(ConditionFlag.NotEqual, falseLabel);

                // The one-byte tail compare is only required for boolean and byte arrays.
                if (kind1.getByteCount() <= 1) {
                    // Move array pointers forward before we compare the last trailing byte.
                    masm.leaq(array1, new AMD64Address(array1, 2));
                    masm.leaq(array2, new AMD64Address(array2, 2));

                    // Compare trailing byte, if any.
                    masm.bind(compare1Byte);
                    masm.testl(result, 1);
                    masm.jccb(ConditionFlag.Zero, trueLabel);
                    masm.movzbl(temp, new AMD64Address(array1, 0));
                    masm.movzbl(length, new AMD64Address(array2, 0));
                    masm.cmpl(temp, length);
                    masm.jccb(ConditionFlag.NotEqual, falseLabel);
                } else {
                    masm.bind(compare1Byte);
                }
            } else {
                masm.bind(compare2Bytes);
            }
        }
    }

    /**
     * Scalar loop for arrays of two different integer kinds: each element pair is loaded with
     * sign/zero extension into word-sized registers and compared.
     */
    private void emitDifferentKindsElementWiseCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert kind1 != kind2;
        assert kind1.isNumericInteger() && kind2.isNumericInteger();
        Label loop = new Label();
        Label compareTail = new Label();

        int elementsPerLoopIteration = 4;

        Register tmp1 = asRegister(temp4);
        Register tmp2 = asRegister(temp5);

        masm.andl(result, elementsPerLoopIteration - 1); // tail count
        masm.andl(length, ~(elementsPerLoopIteration - 1)); // bulk loop count
        masm.jcc(ConditionFlag.Zero, compareTail);

        // Point past the bulk region and iterate with a negative index up to zero.
        masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.negq(length);

        // clear comparison registers because of the missing movzlq instruction
        masm.xorq(tmp1, tmp1);
        masm.xorq(tmp2, tmp2);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        // Unrolled 4x; each kind uses its own scale so the two indices stay in sync.
        for (int i = 0; i < elementsPerLoopIteration; i++) {
            emitMovBytes(masm, tmp1, new AMD64Address(array1, length, arrayIndexScale1, i << arrayIndexScale1.log2), kind1.getByteCount());
            emitMovBytes(masm, tmp2, new AMD64Address(array2, length, arrayIndexScale2, i << arrayIndexScale2.log2), kind2.getByteCount());
            masm.cmpq(tmp1, tmp2);
            masm.jcc(ConditionFlag.NotEqual, falseLabel);
        }
        masm.addq(length, elementsPerLoopIteration);
        masm.jccb(ConditionFlag.NotZero,
                        loop);

        // Tail: up to elementsPerLoopIteration - 1 remaining element pairs.
        masm.bind(compareTail);
        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);
        for (int i = 0; i < elementsPerLoopIteration - 1; i++) {
            emitMovBytes(masm, tmp1, new AMD64Address(array1, length, arrayIndexScale1, 0), kind1.getByteCount());
            emitMovBytes(masm, tmp2, new AMD64Address(array2, length, arrayIndexScale2, 0), kind2.getByteCount());
            masm.cmpq(tmp1, tmp2);
            masm.jcc(ConditionFlag.NotEqual, falseLabel);
            if (i < elementsPerLoopIteration - 2) {
                // Advance the index and stop early once the tail count is exhausted.
                masm.incrementq(length, 1);
                masm.decrementq(result, 1);
                masm.jcc(ConditionFlag.Zero, trueLabel);
            } else {
                // Last possible tail element has been compared.
                masm.jmpb(trueLabel);
            }
        }
    }

    /**
     * Emits code to fall through if {@code src} is NaN, otherwise jump to {@code branchIfNonNaN}.
     */
    private void emitNaNCheck(AMD64MacroAssembler masm, AMD64Address src, Label branchIfNonNaN) {
        assert kind1.isNumericFloat();
        Register tempXMMReg = asRegister(tempXMM);
        if (kind1 == JavaKind.Float) {
            masm.movflt(tempXMMReg, src);
        } else {
            masm.movdbl(tempXMMReg, src);
        }
        // UCOMIS sets the parity flag iff one operand is NaN; comparing the value against
        // itself therefore detects NaN.
        SSEOp.UCOMIS.emit(masm, kind1 == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
        masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
    }

    /**
     * Emits code to compare if two floats are bitwise equal or both NaN.
     */
    private void emitFloatCompare(AMD64MacroAssembler masm, Register base1, Register base2, Register index, int offset, Label falseLabel,
                    boolean skipBitwiseCompare) {
        AMD64Address address1 = new AMD64Address(base1, index, arrayIndexScale1, offset);
        AMD64Address address2 = new AMD64Address(base2, index, arrayIndexScale2, offset);

        Label bitwiseEqual = new Label();

        if (!skipBitwiseCompare) {
            // Bitwise compare
            Register temp = asRegister(temp4);

            if (kind1 == JavaKind.Float) {
                masm.movl(temp, address1);
                masm.cmpl(temp, address2);
            } else {
                masm.movq(temp, address1);
                masm.cmpq(temp, address2);
            }
            masm.jccb(ConditionFlag.Equal, bitwiseEqual);
        }

        // Bitwise mismatch: only equal if BOTH values are NaN.
        emitNaNCheck(masm, address1, falseLabel);
        emitNaNCheck(masm, address2, falseLabel);

        masm.bind(bitwiseEqual);
    }

    /**
     * Emits code to compare float equality within a range.
     */
    private void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register base1, Register base2, Register index, int offset, Label falseLabel, int range) {
        assert kind1.isNumericFloat();
        Label loop = new Label();
        Register i = asRegister(temp5);

        masm.movq(i, range);
        masm.negq(i);
        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitFloatCompare(masm, base1, base2, index, offset, falseLabel, range == 1);
        masm.incrementq(index, 1);
        masm.incrementq(i, 1);
        masm.jccb(ConditionFlag.NotZero, loop);
        // Floats within the range are equal, revert change to the register index
        masm.subq(index, range);
    }

    /**
     * Emits specialized assembly for checking equality of memory regions
     * {@code arrayPtr1[0..nBytes]} and {@code arrayPtr2[0..nBytes]}. If they match, execution
     * continues directly after the emitted code block, otherwise we jump to {@code noMatch}.
     */
    private void emitConstantLengthArrayCompareBytes(
                    CompilationResultBuilder crb,
                    AMD64MacroAssembler asm,
                    Register arrayPtr1,
                    Register arrayPtr2,
                    Register tmp1,
                    Register tmp2,
                    Register[] tmpVectors,
                    Label noMatch) {
        if (constantLength == 0) {
            // do nothing
            return;
        }
        AVXKind.AVXSize vSize = vectorSize;
        if (constantLength < getElementsPerVector(vectorSize)) {
            // Fall back to XMM when the array does not fill a YMM register.
            vSize = AVXKind.AVXSize.XMM;
        }
        int elementsPerVector = getElementsPerVector(vSize);
        if (elementsPerVector > constantLength) {
            assert kind1 == kind2;
            int byteLength = constantLength << arrayIndexScale1.log2;
            // array is shorter than any vector register, use regular CMP instructions
            int movSize = (byteLength < 2) ? 1 : ((byteLength < 4) ? 2 : ((byteLength < 8) ? 4 : 8));
            emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1), movSize);
            emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2), movSize);
            emitCmpBytes(asm, tmp1, tmp2, movSize);
            asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            if (byteLength > movSize) {
                // Second, overlapping load aligned to the end covers the remaining bytes.
                emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1, byteLength - movSize), movSize);
                emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2, byteLength - movSize), movSize);
                emitCmpBytes(asm, tmp1, tmp2, movSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            }
        } else {
            // The loop compares two vectors per iteration.
            int elementsPerVectorLoop = 2 * elementsPerVector;
            int tailCount = constantLength & (elementsPerVectorLoop - 1);
            int vectorCount = constantLength & ~(elementsPerVectorLoop - 1);
            int bytesPerVector = vSize.getBytes();
            if (vectorCount > 0) {
                Label loopBegin = new Label();
                // Point past the vectorized region and iterate with a negative index up to zero.
                asm.leaq(arrayPtr1, new AMD64Address(arrayPtr1, vectorCount << arrayIndexScale1.log2));
                asm.leaq(arrayPtr2, new AMD64Address(arrayPtr2, vectorCount << arrayIndexScale2.log2));
                asm.movq(tmp1, -vectorCount);
                asm.align(crb.target.wordSize * 2);
                asm.bind(loopBegin);
                // Two vector pairs per iteration, XORed and tested separately.
                emitVectorLoad1(asm, tmpVectors[0], arrayPtr1, tmp1, 0, vSize);
                emitVectorLoad2(asm, tmpVectors[1], arrayPtr2, tmp1, 0, vSize);
                emitVectorLoad1(asm, tmpVectors[2], arrayPtr1, tmp1, scaleDisplacement1(bytesPerVector), vSize);
                emitVectorLoad2(asm, tmpVectors[3], arrayPtr2, tmp1, scaleDisplacement2(bytesPerVector), vSize);
                emitVectorXor(asm, tmpVectors[0], tmpVectors[1], vSize);
                emitVectorXor(asm, tmpVectors[2], tmpVectors[3], vSize);
                emitVectorTest(asm, tmpVectors[0], vSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                emitVectorTest(asm, tmpVectors[2], vSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                asm.addq(tmp1, elementsPerVectorLoop);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
            }
            if (tailCount > 0) {
                // Tail: overlapping load(s) aligned to the end of the compared region.
                emitVectorLoad1(asm, tmpVectors[0], arrayPtr1, (tailCount << arrayIndexScale1.log2) - scaleDisplacement1(bytesPerVector), vSize);
                emitVectorLoad2(asm, tmpVectors[1], arrayPtr2, (tailCount << arrayIndexScale2.log2) - scaleDisplacement2(bytesPerVector), vSize);
                emitVectorXor(asm, tmpVectors[0], tmpVectors[1], vSize);
                if (tailCount > elementsPerVector) {
                    emitVectorLoad1(asm, tmpVectors[2], arrayPtr1, 0, vSize);
                    emitVectorLoad2(asm, tmpVectors[3], arrayPtr2, 0, vSize);
                    emitVectorXor(asm, tmpVectors[2], tmpVectors[3], vSize);
                    emitVectorTest(asm, tmpVectors[2], vSize);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                emitVectorTest(asm, tmpVectors[0], vSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        }
    }

    /**
     * Loads {@code size} bytes from {@code src} into the word-sized register {@code dst},
     * sign- or zero-extending according to {@link #signExtend}.
     */
    private void emitMovBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
        switch (size) {
            case 1:
                if (signExtend) {
                    asm.movsbq(dst, src);
                } else {
                    asm.movzbq(dst, src);
                }
                break;
            case 2:
                if (signExtend) {
                    asm.movswq(dst, src);
                } else {
                    asm.movzwq(dst, src);
                }
                break;
            case 4:
                if (signExtend) {
                    asm.movslq(dst, src);
                } else {
                    // there is no movzlq
                    asm.movl(dst, src);
                }
                break;
            case 8:
                asm.movq(dst, src);
                break;
            default:
                throw new IllegalStateException();
        }
    }

    // Compares two registers that were loaded with emitMovBytes; 32-bit compare suffices
    // for sub-quadword sizes because the upper bits are consistent extensions.
    private static void emitCmpBytes(AMD64MacroAssembler asm, Register dst, Register src, int size) {
        if (size < 8) {
            asm.cmpl(dst, src);
        } else {
            asm.cmpq(dst, src);
        }
    }
}