/*
 * Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64BinaryArithmetic.XOR;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

import java.util.Objects;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.debug.GraalError;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.TargetDescription;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;

/**
 * Emits code which compares two arrays of the same length. If the CPU supports any vector
 * instructions, specialized code is emitted to leverage these instructions.
 *
 * This op can also compare arrays of different integer types (e.g. {@code byte[]} and
 * {@code char[]}) with on-the-fly sign- or zero-extension. If one of the given arrays is a
 * {@code char[]} array, the smaller elements are zero-extended, otherwise they are sign-extended.
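 *
 * <p>
 * For example, when comparing a {@code byte[] {97, 98}} with a {@code char[] {'a', 'b'}}, each
 * byte is zero-extended to a 16-bit char before comparison, so the two arrays are considered
 * equal.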
 */
@Opcode("ARRAY_EQUALS")
public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);

    private final JavaKind kind1;
    private final JavaKind kind2;
    private final int arrayBaseOffset1;
    private final int arrayBaseOffset2;
    private final Scale arrayIndexScale1;
    private final Scale arrayIndexScale2;
    private final AVXKind.AVXSize vectorSize;
    private final int constantLength;
    private final boolean signExtend;

    @Def({REG}) private Value resultValue;
    @Alive({REG}) private Value array1Value;
    @Alive({REG}) private Value array2Value;
    @Alive({REG}) private Value lengthValue;
    @Temp({REG, ILLEGAL}) private Value temp1;
    @Temp({REG, ILLEGAL}) private Value temp2;
    @Temp({REG}) private Value temp3;
    @Temp({REG, ILLEGAL}) private Value temp4;

    @Temp({REG, ILLEGAL}) private Value temp5;
    @Temp({REG, ILLEGAL}) private Value tempXMM;

    @Temp({REG, ILLEGAL}) private Value vectorTemp1;
    @Temp({REG, ILLEGAL}) private Value vectorTemp2;
    @Temp({REG, ILLEGAL}) private Value vectorTemp3;
    @Temp({REG, ILLEGAL}) private Value vectorTemp4;

    public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind1, JavaKind kind2, Value result, Value array1, Value array2, Value length,
                    int constantLength, boolean directPointers, int maxVectorSize) {
        super(TYPE);
        this.kind1 = kind1;
        this.kind2 = kind2;
        this.signExtend = kind1 != JavaKind.Char && kind2 != JavaKind.Char;

        assert kind1.isNumericInteger() && kind2.isNumericInteger() || kind1 == kind2;

        this.arrayBaseOffset1 = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind1);
        this.arrayBaseOffset2 = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind2);
        this.arrayIndexScale1 = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(kind1)));
        this.arrayIndexScale2 = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(kind2)));
        this.vectorSize = ((AMD64) tool.target().arch).getFeatures().contains(CPUFeature.AVX2) && (maxVectorSize < 0 || maxVectorSize >= 32) ? AVXKind.AVXSize.YMM : AVXKind.AVXSize.XMM;
        this.constantLength = constantLength;

        this.resultValue = result;
        this.array1Value = array1;
        this.array2Value = array2;
        this.lengthValue = length;

        // Allocate some temporaries.
        if (supportsSSE41(tool.target()) && canGenerateConstantLengthCompare(tool.target()) && !constantLengthCompareNeedsTmpArrayPointers()) {
            this.temp1 = Value.ILLEGAL;
            this.temp2 = Value.ILLEGAL;
        } else {
            this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
            this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        }
        this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        if (supportsSSE41(tool.target()) && canGenerateConstantLengthCompare(tool.target())) {
            this.temp4 = Value.ILLEGAL;
            this.temp5 = Value.ILLEGAL;
        } else {
            this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
            this.temp5 = kind1.isNumericFloat() || kind1 != kind2 ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        }

        if (kind1 == JavaKind.Float) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
        } else if (kind1 == JavaKind.Double) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.tempXMM = Value.ILLEGAL;
        }

        // We only need the vector temporaries if we generate SSE code.
        if (supportsSSE41(tool.target())) {
            if (canGenerateConstantLengthCompare(tool.target())) {
                LIRKind lirKind = LIRKind.value(vectorSize == AVXKind.AVXSize.YMM ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE);
                this.vectorTemp1 = tool.newVariable(lirKind);
                this.vectorTemp2 = tool.newVariable(lirKind);
                this.vectorTemp3 = tool.newVariable(lirKind);
                this.vectorTemp4 = tool.newVariable(lirKind);
            } else {
                this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp3 = Value.ILLEGAL;
                this.vectorTemp4 = Value.ILLEGAL;
            }
        } else {
            this.vectorTemp1 = Value.ILLEGAL;
            this.vectorTemp2 = Value.ILLEGAL;
            this.vectorTemp3 = Value.ILLEGAL;
            this.vectorTemp4 = Value.ILLEGAL;
        }
    }

    private boolean canGenerateConstantLengthCompare(TargetDescription target) {
        return constantLength >= 0 && kind1.isNumericInteger() && (kind1 == kind2 || getElementsPerVector(AVXKind.AVXSize.XMM) <= constantLength) && supportsSSE41(target);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Register result = asRegister(resultValue);

        Label trueLabel = new Label();
        Label falseLabel = new Label();
        Label done = new Label();

        if (canGenerateConstantLengthCompare(crb.target)) {
            emitConstantLengthArrayCompareBytes(crb, masm, new Register[]{asRegister(vectorTemp1), asRegister(vectorTemp2), asRegister(vectorTemp3), asRegister(vectorTemp4)}, falseLabel);
        } else {
            Register array1 = asRegister(temp1);
            Register array2 = asRegister(temp2);
            // Load array base addresses.
            masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset1));
            masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset2));
            Register length = asRegister(temp3);
            // Get array length.
            masm.movl(length, asRegister(lengthValue));
            // Copy the length into 'result'; the compare code splits it into a vector count and a
            // tail count.
            masm.movl(result, length);
            emitArrayCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }

        // Return true
        masm.bind(trueLabel);
        masm.movl(result, 1);
        masm.jmpb(done);

        // Return false
        masm.bind(falseLabel);
        masm.xorl(result, result);

        // That's it
        masm.bind(done);
    }

    private void emitArrayCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length,
                    Label trueLabel, Label falseLabel) {
        if (supportsSSE41(crb.target)) {
            emitVectorCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }
        if (kind1 == kind2) {
            emit8ByteCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
            emitTailCompares(masm, result, array1, array2, length, trueLabel, falseLabel);
        } else {
            emitDifferentKindsElementWiseCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }
    }

    /**
     * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports SSE 4.1
     */
    private static boolean supportsSSE41(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.SSE4_1);
    }

    /**
     * Emits code that uses SSE4.1/AVX1 128-bit (16-byte) or AVX2 256-bit (32-byte) vector compares.
     */
    private void emitVectorCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length,
                    Label trueLabel, Label falseLabel) {
        assert supportsSSE41(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        int elementsPerVector = getElementsPerVector(vectorSize);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind1.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Split the length into a vector loop count and a tail count.
        masm.andl(result, elementsPerVector - 1); // tail count
        masm.andl(length, ~(elementsPerVector - 1)); // vector count
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitVectorLoad1(masm, vector1, array1, length, 0, vectorSize);
        emitVectorLoad2(masm, vector2, array2, length, 0, vectorSize);
        emitVectorCmp(masm, vector1, vector2, vectorSize);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
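
        // Advance the negated element index; once it reaches zero, the whole vectorized region
        // has been compared.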
        masm.bind(loopCheck);
        masm.addq(length, elementsPerVector);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, elementsPerVector);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        emitVectorLoad1(masm, vector1, array1, result, scaleDisplacement1(-vectorSize.getBytes()), vectorSize);
        emitVectorLoad2(masm, vector2, array2, result, scaleDisplacement2(-vectorSize.getBytes()), vectorSize);
        emitVectorCmp(masm, vector1, vector2, vectorSize);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, array1, array2, result, -vectorSize.getBytes(), falseLabel, elementsPerVector);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    private int getElementsPerVector(AVXKind.AVXSize vSize) {
        return vSize.getBytes() >> Math.max(arrayIndexScale1.log2, arrayIndexScale2.log2);
    }

    private void emitVectorLoad1(AMD64MacroAssembler asm, Register dst, Register src, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad1(asm, dst, src, Register.None, displacement, size);
    }

    private void emitVectorLoad2(AMD64MacroAssembler asm, Register dst, Register src, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad2(asm, dst, src, Register.None, displacement, size);
    }

    private void emitVectorLoad1(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad(asm, dst, src, index, displacement, arrayIndexScale1, arrayIndexScale2, size);
    }

    private void emitVectorLoad2(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad(asm, dst, src, index, displacement, arrayIndexScale2, arrayIndexScale1, size);
    }

    private void emitVectorLoad(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, Scale ownScale, Scale otherScale, AVXKind.AVXSize size) {
        AMD64Address address = new AMD64Address(src, index, ownScale, displacement);
        if (ownScale.value < otherScale.value) {
            // This array's elements are narrower than the other array's: load them with a
            // widening (sign- or zero-extending) vector move.
            if (size == AVXKind.AVXSize.YMM) {
                getAVX2LoadAndExtendOp(ownScale, otherScale, signExtend).emit(asm, size, dst, address);
            } else {
                loadAndExtendSSE(asm, dst, address, ownScale, otherScale, signExtend);
            }
        } else {
            if (size == AVXKind.AVXSize.YMM) {
                asm.vmovdqu(dst, address);
            } else {
                asm.movdqu(dst, address);
            }
        }
    }

    private int scaleDisplacement1(int displacement) {
        return scaleDisplacement(displacement, arrayIndexScale1, arrayIndexScale2);
    }

    private int scaleDisplacement2(int displacement) {
        return scaleDisplacement(displacement, arrayIndexScale2, arrayIndexScale1);
    }

    /**
     * Scales a byte displacement to the narrower array: a widening vector load fills a full
     * vector but reads correspondingly fewer source bytes, so the displacement shrinks by the
     * ratio of the two element widths.
     */
    private static int scaleDisplacement(int displacement, Scale ownScale, Scale otherScale) {
        if (ownScale.value < otherScale.value) {
            return displacement >> (otherScale.log2 - ownScale.log2);
        }
        return displacement;
    }
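
    /**
     * Selects the AVX2 instruction that loads elements of {@code ownScale} width and sign- or
     * zero-extends them to {@code otherScale} width, e.g. {@code VPMOVZXBW} for a zero-extending
     * byte-to-char load.
     */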
    private static AMD64Assembler.VexRMOp getAVX2LoadAndExtendOp(Scale ownScale, Scale otherScale, boolean signExtend) {
        switch (ownScale) {
            case Times1:
                switch (otherScale) {
                    case Times2:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBW : AMD64Assembler.VexRMOp.VPMOVZXBW;
                    case Times4:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBD : AMD64Assembler.VexRMOp.VPMOVZXBD;
                    case Times8:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBQ : AMD64Assembler.VexRMOp.VPMOVZXBQ;
                }
                throw GraalError.shouldNotReachHere();
            case Times2:
                switch (otherScale) {
                    case Times4:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXWD : AMD64Assembler.VexRMOp.VPMOVZXWD;
                    case Times8:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXWQ : AMD64Assembler.VexRMOp.VPMOVZXWQ;
                }
                throw GraalError.shouldNotReachHere();
            case Times4:
                return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXDQ : AMD64Assembler.VexRMOp.VPMOVZXDQ;
        }
        throw GraalError.shouldNotReachHere();
    }

    private static void loadAndExtendSSE(AMD64MacroAssembler asm, Register dst, AMD64Address src, Scale ownScale, Scale otherScale, boolean signExtend) {
        switch (ownScale) {
            case Times1:
                switch (otherScale) {
                    case Times2:
                        if (signExtend) {
                            asm.pmovsxbw(dst, src);
                        } else {
                            asm.pmovzxbw(dst, src);
                        }
                        return;
                    case Times4:
                        if (signExtend) {
                            asm.pmovsxbd(dst, src);
                        } else {
                            asm.pmovzxbd(dst, src);
                        }
                        return;
                    case Times8:
                        if (signExtend) {
                            asm.pmovsxbq(dst, src);
                        } else {
                            asm.pmovzxbq(dst, src);
                        }
                        return;
                }
                throw GraalError.shouldNotReachHere();
            case Times2:
                switch (otherScale) {
                    case Times4:
                        if (signExtend) {
                            asm.pmovsxwd(dst, src);
                        } else {
                            asm.pmovzxwd(dst, src);
                        }
                        return;
                    case Times8:
                        if (signExtend) {
                            asm.pmovsxwq(dst, src);
                        } else {
                            asm.pmovzxwq(dst, src);
                        }
                        return;
                }
                throw GraalError.shouldNotReachHere();
            case Times4:
                if (signExtend) {
                    asm.pmovsxdq(dst, src);
                } else {
                    asm.pmovzxdq(dst, src);
                }
                return;
        }
        throw GraalError.shouldNotReachHere();
    }

    private static void emitVectorCmp(AMD64MacroAssembler masm, Register vector1, Register vector2, AVXKind.AVXSize size) {
        // XOR the two vectors and test the result for all-zero bits: equal inputs XOR to zero.
        emitVectorXor(masm, vector1, vector2, size);
        emitVectorTest(masm, vector1, size);
    }

    private static void emitVectorXor(AMD64MacroAssembler masm, Register vector1, Register vector2, AVXKind.AVXSize size) {
        if (size == AVXKind.AVXSize.YMM) {
            masm.vpxor(vector1, vector1, vector2);
        } else {
            masm.pxor(vector1, vector2);
        }
    }

    private static void emitVectorTest(AMD64MacroAssembler masm, Register vector1, AVXKind.AVXSize size) {
        if (size == AVXKind.AVXSize.YMM) {
            masm.vptest(vector1, vector1);
        } else {
            masm.ptest(vector1, vector1);
        }
    }

    /**
     * Vector size used in {@link #emit8ByteCompare}.
     */
    private static final int VECTOR_SIZE = 8;

    /**
     * Emits code that uses 8-byte vector compares.
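     * Both arrays must have the same element kind. Each loop iteration compares one 8-byte chunk
     * with {@code cmpq}, i.e. {@code 8 >> arrayIndexScale1.log2} elements at a time (4 for
     * {@code char[]}, 8 for {@code byte[]}).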
     */
    private void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert kind1 == kind2;
        Label loop = new Label();
        Label compareTail = new Label();

        int elementsPerVector = 8 >> arrayIndexScale1.log2;

        boolean requiresNaNCheck = kind1.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        Register temp = asRegister(temp4);

        masm.andl(result, elementsPerVector - 1); // tail count
        masm.andl(length, ~(elementsPerVector - 1)); // vector count
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movq(temp, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.cmpq(temp, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, elementsPerVector);
        masm.jccb(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // The NaN check is a slow path and hence placed outside of the main loop.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind1.getByteCount()) {
                emitFloatCompare(masm, array1, array2, length, offset, falseLabel, kind1.getByteCount() == VECTOR_SIZE);
            }
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movq(temp, new AMD64Address(array1, result, arrayIndexScale1, -VECTOR_SIZE));
        masm.cmpq(temp, new AMD64Address(array2, result, arrayIndexScale2, -VECTOR_SIZE));
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Equal, trueLabel);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind1.getByteCount()) {
                emitFloatCompare(masm, array1, array2, result, -VECTOR_SIZE + offset, falseLabel, kind1.getByteCount() == VECTOR_SIZE);
            }
        } else {
            masm.jccb(ConditionFlag.NotEqual, falseLabel);
        }
        masm.jmpb(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Emits code to compare the remaining tail of 1 to 7 bytes, using 4-, 2- and single-byte
     * compares.
     */
    private void emitTailCompares(AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert kind1 == kind2;
        Label compare2Bytes = new Label();
        Label compare1Byte = new Label();

        Register temp = asRegister(temp4);

        if (kind1.getByteCount() <= 4) {
            // Compare trailing 4 bytes, if any.
            masm.testl(result, arrayIndexScale1.log2 == 0 ? 4 : 4 >> arrayIndexScale1.log2);
            masm.jccb(ConditionFlag.Zero, compare2Bytes);
            masm.movl(temp, new AMD64Address(array1, 0));
            masm.cmpl(temp, new AMD64Address(array2, 0));
            if (kind1 == JavaKind.Float) {
                masm.jccb(ConditionFlag.Equal, trueLabel);
                emitFloatCompare(masm, array1, array2, Register.None, 0, falseLabel, true);
                masm.jmpb(trueLabel);
            } else {
                masm.jccb(ConditionFlag.NotEqual, falseLabel);
            }
            if (kind1.getByteCount() <= 2) {
                // Move array pointers forward.
                masm.leaq(array1, new AMD64Address(array1, 4));
                masm.leaq(array2, new AMD64Address(array2, 4));

                // Compare trailing 2 bytes, if any.
                masm.bind(compare2Bytes);
                masm.testl(result, arrayIndexScale1.log2 == 0 ? 2 : 2 >> arrayIndexScale1.log2);
                masm.jccb(ConditionFlag.Zero, compare1Byte);
                masm.movzwl(temp, new AMD64Address(array1, 0));
                masm.movzwl(length, new AMD64Address(array2, 0));
                masm.cmpl(temp, length);
                masm.jccb(ConditionFlag.NotEqual, falseLabel);

                // The one-byte tail compare is only required for boolean and byte arrays.
                if (kind1.getByteCount() <= 1) {
                    // Move array pointers forward before we compare the last trailing byte.
                    masm.leaq(array1, new AMD64Address(array1, 2));
                    masm.leaq(array2, new AMD64Address(array2, 2));

                    // Compare trailing byte, if any.
                    masm.bind(compare1Byte);
                    masm.testl(result, 1);
                    masm.jccb(ConditionFlag.Zero, trueLabel);
                    masm.movzbl(temp, new AMD64Address(array1, 0));
                    masm.movzbl(length, new AMD64Address(array2, 0));
                    masm.cmpl(temp, length);
                    masm.jccb(ConditionFlag.NotEqual, falseLabel);
                } else {
                    masm.bind(compare1Byte);
                }
            } else {
                masm.bind(compare2Bytes);
            }
        }
    }

    private void emitDifferentKindsElementWiseCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert kind1 != kind2;
        assert kind1.isNumericInteger() && kind2.isNumericInteger();
        Label loop = new Label();
        Label compareTail = new Label();

        int elementsPerLoopIteration = 4;

        Register tmp1 = asRegister(temp4);
        Register tmp2 = asRegister(temp5);

        masm.andl(result, elementsPerLoopIteration - 1); // tail count
        masm.andl(length, ~(elementsPerLoopIteration - 1)); // bulk loop count
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.negq(length);

        // Clear the comparison registers because of the missing movzlq instruction.
        masm.xorq(tmp1, tmp1);
        masm.xorq(tmp2, tmp2);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        for (int i = 0; i < elementsPerLoopIteration; i++) {
            emitMovBytes(masm, tmp1, new AMD64Address(array1, length, arrayIndexScale1, i << arrayIndexScale1.log2), kind1.getByteCount());
            emitMovBytes(masm, tmp2, new AMD64Address(array2, length, arrayIndexScale2, i << arrayIndexScale2.log2), kind2.getByteCount());
            masm.cmpq(tmp1, tmp2);
            masm.jcc(ConditionFlag.NotEqual, falseLabel);
        }
        masm.addq(length, elementsPerLoopIteration);
        masm.jccb(ConditionFlag.NotZero, loop);

        masm.bind(compareTail);
        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);
        for (int i = 0; i < elementsPerLoopIteration - 1; i++) {
            emitMovBytes(masm, tmp1, new AMD64Address(array1, length, arrayIndexScale1, 0), kind1.getByteCount());
            emitMovBytes(masm, tmp2, new AMD64Address(array2, length, arrayIndexScale2, 0), kind2.getByteCount());
            masm.cmpq(tmp1, tmp2);
            masm.jcc(ConditionFlag.NotEqual, falseLabel);
            if (i < elementsPerLoopIteration - 2) {
                masm.incrementq(length, 1);
                masm.decrementq(result, 1);
                masm.jcc(ConditionFlag.Zero, trueLabel);
            } else {
                masm.jmpb(trueLabel);
            }
        }
    }

    /**
     * Emits code that falls through if the value at {@code src} is NaN and otherwise jumps to
     * {@code branchIfNonNaN}.
     */
    private void emitNaNCheck(AMD64MacroAssembler masm, AMD64Address src, Label branchIfNonNaN) {
        assert kind1.isNumericFloat();
        Register tempXMMReg = asRegister(tempXMM);
        if (kind1 == JavaKind.Float) {
            masm.movflt(tempXMMReg, src);
        } else {
            masm.movdbl(tempXMMReg, src);
        }
        // UCOMIS sets the parity flag iff the comparison is unordered, i.e. an operand is NaN.
        SSEOp.UCOMIS.emit(masm, kind1 == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
        masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
    }

    /**
     * Emits code to check whether two floats are bitwise equal or both NaN.
     */
    private void emitFloatCompare(AMD64MacroAssembler masm, Register base1, Register base2, Register index, int offset, Label falseLabel,
                    boolean skipBitwiseCompare) {
        AMD64Address address1 = new AMD64Address(base1, index, arrayIndexScale1, offset);
        AMD64Address address2 = new AMD64Address(base2, index, arrayIndexScale2, offset);

        Label bitwiseEqual = new Label();

        if (!skipBitwiseCompare) {
            // Bitwise compare
            Register temp = asRegister(temp4);

            if (kind1 == JavaKind.Float) {
                masm.movl(temp, address1);
                masm.cmpl(temp, address2);
            } else {
                masm.movq(temp, address1);
                masm.cmpq(temp, address2);
            }
            masm.jccb(ConditionFlag.Equal, bitwiseEqual);
        }

        emitNaNCheck(masm, address1, falseLabel);
        emitNaNCheck(masm, address2, falseLabel);

        masm.bind(bitwiseEqual);
    }

    /**
     * Emits code to compare a range of float elements for equality, where two NaN values are
     * considered equal.
     */
    private void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register base1, Register base2, Register index, int offset, Label falseLabel, int range) {
        assert kind1.isNumericFloat();
        Label loop = new Label();
        Register i = asRegister(temp5);

        masm.movq(i, range);
        masm.negq(i);
        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitFloatCompare(masm, base1, base2, index, offset, falseLabel, range == 1);
        masm.incrementq(index, 1);
        masm.incrementq(i, 1);
        masm.jccb(ConditionFlag.NotZero, loop);
        // All floats within the range are equal; revert the change to the index register.
        masm.subq(index, range);
    }

    private boolean constantLengthCompareNeedsTmpArrayPointers() {
        AVXKind.AVXSize vSize = vectorSize;
        if (constantLength < getElementsPerVector(vectorSize)) {
            vSize = AVXKind.AVXSize.XMM;
        }
        int vectorCount = constantLength & ~(2 * getElementsPerVector(vSize) - 1);
        return vectorCount > 0;
    }

    /**
     * Emits specialized assembly for checking equality of the memory regions of
     * {@code constantLength} elements starting at {@code array1Value} and {@code array2Value}. If
     * they match, execution continues directly after the emitted code block, otherwise we jump to
     * {@code noMatch}.
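     *
     * <p>
     * For example, a 17-byte {@code byte[]} region compared with XMM vectors is checked by two
     * overlapping 16-byte compares, one aligned to the start and one aligned to the end of the
     * region.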
     */
    private void emitConstantLengthArrayCompareBytes(
                    CompilationResultBuilder crb,
                    AMD64MacroAssembler asm,
                    Register[] tmpVectors,
                    Label noMatch) {
        if (constantLength == 0) {
            // Zero elements to compare: the regions are trivially equal, emit nothing.
            return;
        }
        Register arrayPtr1 = asRegister(array1Value);
        Register arrayPtr2 = asRegister(array2Value);
        Register tmp = asRegister(temp3);
        AVXKind.AVXSize vSize = vectorSize;
        if (constantLength < getElementsPerVector(vectorSize)) {
            vSize = AVXKind.AVXSize.XMM;
        }
        int elementsPerVector = getElementsPerVector(vSize);
        if (elementsPerVector > constantLength) {
            assert kind1 == kind2;
            int byteLength = constantLength << arrayIndexScale1.log2;
            // The array is shorter than any vector register; use regular XOR instructions.
            int movSize = (byteLength < 2) ? 1 : ((byteLength < 4) ? 2 : ((byteLength < 8) ? 4 : 8));
            emitMovBytes(asm, tmp, new AMD64Address(arrayPtr1, arrayBaseOffset1), movSize);
            emitXorBytes(asm, tmp, new AMD64Address(arrayPtr2, arrayBaseOffset2), movSize);
            asm.jccb(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            if (byteLength > movSize) {
                // Cover the remainder with a second, overlapping load aligned to the end of the
                // region.
                emitMovBytes(asm, tmp, new AMD64Address(arrayPtr1, arrayBaseOffset1 + byteLength - movSize), movSize);
                emitXorBytes(asm, tmp, new AMD64Address(arrayPtr2, arrayBaseOffset2 + byteLength - movSize), movSize);
                asm.jccb(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        } else {
            int elementsPerVectorLoop = 2 * elementsPerVector;
            int tailCount = constantLength & (elementsPerVectorLoop - 1);
            int vectorCount = constantLength & ~(elementsPerVectorLoop - 1);
            int bytesPerVector = vSize.getBytes();
            if (vectorCount > 0) {
                Label loopBegin = new Label();
                Register tmpArrayPtr1 = asRegister(temp1);
                Register tmpArrayPtr2 = asRegister(temp2);
                asm.leaq(tmpArrayPtr1, new AMD64Address(arrayPtr1, vectorCount << arrayIndexScale1.log2));
                asm.leaq(tmpArrayPtr2, new AMD64Address(arrayPtr2, vectorCount << arrayIndexScale2.log2));
                arrayPtr1 = tmpArrayPtr1;
                arrayPtr2 = tmpArrayPtr2;
                asm.movq(tmp, -vectorCount);
                asm.align(crb.target.wordSize * 2);
                asm.bind(loopBegin);
                emitVectorLoad1(asm, tmpVectors[0], arrayPtr1, tmp, arrayBaseOffset1, vSize);
                emitVectorLoad2(asm, tmpVectors[1], arrayPtr2, tmp, arrayBaseOffset2, vSize);
                emitVectorLoad1(asm, tmpVectors[2], arrayPtr1, tmp, arrayBaseOffset1 + scaleDisplacement1(bytesPerVector), vSize);
                emitVectorLoad2(asm, tmpVectors[3], arrayPtr2, tmp, arrayBaseOffset2 + scaleDisplacement2(bytesPerVector), vSize);
                emitVectorXor(asm, tmpVectors[0], tmpVectors[1], vSize);
                emitVectorXor(asm, tmpVectors[2], tmpVectors[3], vSize);
                emitVectorTest(asm, tmpVectors[0], vSize);
                asm.jccb(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                emitVectorTest(asm, tmpVectors[2], vSize);
                asm.jccb(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                asm.addq(tmp, elementsPerVectorLoop);
                asm.jccb(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
            }
            if (tailCount > 0) {
                // Compare the tail with a vector load aligned to the end of the region.
                emitVectorLoad1(asm, tmpVectors[0], arrayPtr1, arrayBaseOffset1 + (tailCount << arrayIndexScale1.log2) - scaleDisplacement1(bytesPerVector), vSize);
                emitVectorLoad2(asm, tmpVectors[1], arrayPtr2, arrayBaseOffset2 + (tailCount << arrayIndexScale2.log2) - scaleDisplacement2(bytesPerVector), vSize);
                emitVectorXor(asm, tmpVectors[0], tmpVectors[1], vSize);
                if (tailCount > elementsPerVector) {
                    // The tail is longer than one vector; cover the rest with an overlapping load
                    // from the start of the remaining region.
                    emitVectorLoad1(asm, tmpVectors[2], arrayPtr1, arrayBaseOffset1, vSize);
                    emitVectorLoad2(asm, tmpVectors[3], arrayPtr2, arrayBaseOffset2, vSize);
                    emitVectorXor(asm, tmpVectors[2], tmpVectors[3], vSize);
                    emitVectorTest(asm, tmpVectors[2], vSize);
                    asm.jccb(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                emitVectorTest(asm, tmpVectors[0], vSize);
                asm.jccb(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        }
    }

    private void emitMovBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
        switch (size) {
            case 1:
                if (signExtend) {
                    asm.movsbq(dst, src);
                } else {
                    asm.movzbq(dst, src);
                }
                break;
            case 2:
                if (signExtend) {
                    asm.movswq(dst, src);
                } else {
                    asm.movzwq(dst, src);
                }
                break;
            case 4:
                if (signExtend) {
                    asm.movslq(dst, src);
                } else {
                    // There is no movzlq; movl implicitly zero-extends to 64 bits.
                    asm.movl(dst, src);
                }
                break;
            case 8:
                asm.movq(dst, src);
                break;
            default:
                throw new IllegalStateException();
        }
    }

    private static void emitXorBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
        OperandSize opSize = getOperandSize(size);
        XOR.getRMOpcode(opSize).emit(asm, opSize, dst, src);
    }

    private static OperandSize getOperandSize(int size) {
        switch (size) {
            case 1:
                return OperandSize.BYTE;
            case 2:
                return OperandSize.WORD;
            case 4:
                return OperandSize.DWORD;
            case 8:
                return OperandSize.QWORD;
            default:
                throw new IllegalStateException();
        }
    }
}