src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.amd64/src/org/graalvm/compiler/lir/amd64/AMD64ArrayEqualsOp.java

  24 
  25 package org.graalvm.compiler.lir.amd64;
  26 
  27 import jdk.vm.ci.amd64.AMD64;
  28 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  29 import jdk.vm.ci.amd64.AMD64Kind;
  30 import jdk.vm.ci.code.Register;
  31 import jdk.vm.ci.code.TargetDescription;
  32 import jdk.vm.ci.meta.JavaKind;
  33 import jdk.vm.ci.meta.Value;
  34 import org.graalvm.compiler.asm.Label;
  35 import org.graalvm.compiler.asm.amd64.AMD64Address;
  36 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  37 import org.graalvm.compiler.asm.amd64.AMD64Assembler;
  38 import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
  39 import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
  40 import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
  41 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
  42 import org.graalvm.compiler.asm.amd64.AVXKind;
  43 import org.graalvm.compiler.core.common.LIRKind;
  44 import org.graalvm.compiler.core.common.NumUtil;
  45 import org.graalvm.compiler.lir.LIRInstructionClass;
  46 import org.graalvm.compiler.lir.Opcode;
  47 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
  48 import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
  49 
  50 import static jdk.vm.ci.code.ValueUtil.asRegister;
  51 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
  52 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
  53 


  54 /**
  55  * Emits code which compares two arrays of the same length. If the CPU supports any vector
  56  * instructions, specialized code is emitted to leverage these instructions.
  57  */
  58 @Opcode("ARRAY_EQUALS")
  59 public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
  60     public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);
  61 
  62     private final JavaKind kind;
  63     private final int arrayBaseOffset;
  64     private final int arrayIndexScale;
  65     private final int constantByteLength;
  66 
  67     @Def({REG}) private Value resultValue;
  68     @Alive({REG}) private Value array1Value;
  69     @Alive({REG}) private Value array2Value;
  70     @Alive({REG}) private Value lengthValue;
  71     @Temp({REG}) private Value temp1;
  72     @Temp({REG}) private Value temp2;
  73     @Temp({REG}) private Value temp3;
  74     @Temp({REG}) private Value temp4;
  75 
  76     @Temp({REG, ILLEGAL}) private Value temp5;
  77     @Temp({REG, ILLEGAL}) private Value tempXMM;
  78 
  79     @Temp({REG, ILLEGAL}) private Value vectorTemp1;
  80     @Temp({REG, ILLEGAL}) private Value vectorTemp2;
  81     @Temp({REG, ILLEGAL}) private Value vectorTemp3;
  82     @Temp({REG, ILLEGAL}) private Value vectorTemp4;
  83 
  84     public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind, Value result, Value array1, Value array2, Value length,
  85                     int constantLength, boolean directPointers, int maxVectorSize) {
  86         super(TYPE);
  87         this.kind = kind;
  88 
  89         this.arrayBaseOffset = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind);
  90         this.arrayIndexScale = tool.getProviders().getMetaAccess().getArrayIndexScale(kind);
  91 
  92         if (constantLength >= 0 && arrayIndexScale > 1) {
  93             // scale length
  94             this.constantByteLength = constantLength << NumUtil.log2Ceil(arrayIndexScale);
  95         } else {
  96             this.constantByteLength = constantLength;
  97         }

  98 
  99         this.resultValue = result;
 100         this.array1Value = array1;
 101         this.array2Value = array2;
 102         this.lengthValue = length;
 103 
 104         // Allocate some temporaries.
 105         this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
 106         this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
 107         this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
 108         this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
 109 
 110         this.temp5 = kind.isNumericFloat() ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
 111         if (kind == JavaKind.Float) {
 112             this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
 113         } else if (kind == JavaKind.Double) {
 114             this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 115         } else {
 116             this.tempXMM = Value.ILLEGAL;
 117         }
 118 
 119         // We only need the vector temporaries if we generate SSE code.
 120         if (supportsSSE41(tool.target())) {
 121             if (canGenerateConstantLengthCompare(tool.target())) {
 122                 LIRKind lirKind = LIRKind.value(supportsAVX2(tool.target()) && (maxVectorSize < 0 || maxVectorSize >= 32) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE);
 123                 this.vectorTemp1 = tool.newVariable(lirKind);
 124                 this.vectorTemp2 = tool.newVariable(lirKind);
 125                 this.vectorTemp3 = tool.newVariable(lirKind);
 126                 this.vectorTemp4 = tool.newVariable(lirKind);
 127             } else {
 128                 this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 129                 this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 130                 this.vectorTemp3 = Value.ILLEGAL;
 131                 this.vectorTemp4 = Value.ILLEGAL;
 132             }
 133         } else {
 134             this.vectorTemp1 = Value.ILLEGAL;
 135             this.vectorTemp2 = Value.ILLEGAL;
 136             this.vectorTemp3 = Value.ILLEGAL;
 137             this.vectorTemp4 = Value.ILLEGAL;
 138         }
 139     }
 140 
 141     private boolean canGenerateConstantLengthCompare(TargetDescription target) {
 142         return constantByteLength >= 0 && kind.isNumericInteger() && supportsSSE41(target);
 143     }
 144 
 145     @Override
 146     public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
 147         Register result = asRegister(resultValue);
 148         Register array1 = asRegister(temp1);
 149         Register array2 = asRegister(temp2);
 150 
 151         Label trueLabel = new Label();
 152         Label falseLabel = new Label();
 153         Label done = new Label();
 154 
 155         // Load array base addresses.
 156         masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset));
 157         masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset));
 158 
 159         if (canGenerateConstantLengthCompare(crb.target)) {
 160             emitConstantLengthArrayCompareBytes(masm, array1, array2, asRegister(temp3), asRegister(temp4),
 161                             new Register[]{asRegister(vectorTemp1), asRegister(vectorTemp2), asRegister(vectorTemp3), asRegister(vectorTemp4)},
 162                             falseLabel, constantByteLength, AVXKind.getRegisterSize(vectorTemp1).getBytes());
 163         } else {
 164             Register length = asRegister(temp3);
 165 
 166             // Get array length in bytes.
 167             masm.movl(length, asRegister(lengthValue));
 168 
 169             if (arrayIndexScale > 1) {
 170                 masm.shll(length, NumUtil.log2Ceil(arrayIndexScale)); // scale length
 171             }
 172 
 173             masm.movl(result, length); // copy
 174 
 175             emitArrayCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
 176         }
 177 
 178         // Return true
 179         masm.bind(trueLabel);
 180         masm.movl(result, 1);
 181         masm.jmpb(done);
 182 
 183         // Return false
 184         masm.bind(falseLabel);
 185         masm.xorl(result, result);
 186 
 187         // That's it
 188         masm.bind(done);
 189     }
 190 
 191     private static void emitArrayCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
 192                     Register result, Register array1, Register array2, Register length,
 193                     Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
 194                     Label trueLabel, Label falseLabel) {
 195         if (supportsAVX2(crb.target)) {
 196             emitAVXCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
 197         } else if (supportsSSE41(crb.target)) {
 198             // this code is used for AVX as well because our backend correctly ensures that
 199             // VEX-prefixed instructions are emitted if AVX is supported
 200             emitSSE41Compare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);


 201         }
 202         emit8ByteCompare(crb, masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
 203         emitTailCompares(masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
 204     }
 205 
 206     /**
 207      * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
 208      *
 209      * @param target target description of the underlying architecture
 210      * @return true if the underlying architecture supports SSE 4.1
 211      */
 212     private static boolean supportsSSE41(TargetDescription target) {
 213         AMD64 arch = (AMD64) target.arch;
 214         return arch.getFeatures().contains(CPUFeature.SSE4_1);
 215     }
 216 
 217     /**
 218      * Vector size used in {@link #emitSSE41Compare}.
 219      */
 220     private static final int SSE4_1_VECTOR_SIZE = 16;
 221 
 222     /**
 223      * Emits code that uses SSE4.1 128-bit (16-byte) vector compares.
 224      */
 225     private static void emitSSE41Compare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
 226                     Register result, Register array1, Register array2, Register length,
 227                     Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
 228                     Label trueLabel, Label falseLabel) {
 229         assert supportsSSE41(crb.target);
 230 
 231         Register vector1 = asRegister(vectorTemp1);
 232         Register vector2 = asRegister(vectorTemp2);
 233 


 234         Label loop = new Label();
 235         Label compareTail = new Label();
 236 
 237         boolean requiresNaNCheck = kind.isNumericFloat();
 238         Label loopCheck = new Label();
 239         Label nanCheck = new Label();
 240 
 241         // Compare 16-byte vectors
 242         masm.andl(result, SSE4_1_VECTOR_SIZE - 1); // tail count (in bytes)
 243         masm.andl(length, ~(SSE4_1_VECTOR_SIZE - 1)); // vector count (in bytes)
 244         masm.jcc(ConditionFlag.Zero, compareTail);
 245 
 246         masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
 247         masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
 248         masm.negq(length);
 249 
 250         // Align the main loop
 251         masm.align(crb.target.wordSize * 2);
 252         masm.bind(loop);
 253         masm.movdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
 254         masm.movdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
 255         masm.pxor(vector1, vector2);
 256         masm.ptest(vector1, vector1);
 257         masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
 258 
 259         masm.bind(loopCheck);
 260         masm.addq(length, SSE4_1_VECTOR_SIZE);
 261         masm.jcc(ConditionFlag.NotZero, loop);
 262 
 263         masm.testl(result, result);
 264         masm.jcc(ConditionFlag.Zero, trueLabel);
 265 
 266         if (requiresNaNCheck) {
 267             Label unalignedCheck = new Label();
 268             masm.jmpb(unalignedCheck);
 269             masm.bind(nanCheck);
 270             emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, SSE4_1_VECTOR_SIZE);
 271             masm.jmpb(loopCheck);
 272             masm.bind(unalignedCheck);
 273         }
 274 
 275         /*
 276          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 277          * array.
 278          */
 279         masm.movdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
 280         masm.movdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
 281         masm.pxor(vector1, vector2);
 282         masm.ptest(vector1, vector1);
 283         if (requiresNaNCheck) {
 284             masm.jcc(ConditionFlag.Zero, trueLabel);
 285             emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -SSE4_1_VECTOR_SIZE, falseLabel, SSE4_1_VECTOR_SIZE);
 286         } else {
 287             masm.jcc(ConditionFlag.NotZero, falseLabel);
 288         }
 289         masm.jmp(trueLabel);
 290 
 291         masm.bind(compareTail);
 292         masm.movl(length, result);
 293     }
 294 
 295     /**
 296      * Returns whether the underlying AMD64 architecture supports AVX2 instructions.
 297      *
 298      * @param target target description of the underlying architecture
 299      * @return true if the underlying architecture supports AVX2
 300      */
 301     private static boolean supportsAVX2(TargetDescription target) {
 302         AMD64 arch = (AMD64) target.arch;
 303         return arch.getFeatures().contains(CPUFeature.AVX2);
 304     }
 305 
 306     /**
 307      * Vector size used in {@link #emitAVXCompare}.
 308      */
 309     private static final int AVX_VECTOR_SIZE = 32;
 310 
 311     private static void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result,
 312                     Register array1, Register array2, Register length,
 313                     Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
 314                     Label trueLabel, Label falseLabel) {
 315         assert supportsAVX2(crb.target);
 316 
 317         Register vector1 = asRegister(vectorTemp1);
 318         Register vector2 = asRegister(vectorTemp2);

 319 
 320         Label loop = new Label();
 321         Label compareTail = new Label();

 322 
 323         boolean requiresNaNCheck = kind.isNumericFloat();
 324         Label loopCheck = new Label();
 325         Label nanCheck = new Label();
 326 
 327         // Compare 32-byte vectors
 328         masm.andl(result, AVX_VECTOR_SIZE - 1); // tail count (in bytes)
 329         masm.andl(length, ~(AVX_VECTOR_SIZE - 1)); // vector count (in bytes)
 330         masm.jcc(ConditionFlag.Zero, compareTail);
 331 
 332         masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
 333         masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
 334         masm.negq(length);
 335 
 336         // Align the main loop
 337         masm.align(crb.target.wordSize * 2);
 338         masm.bind(loop);
 339         masm.vmovdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
 340         masm.vmovdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
 341         masm.vpxor(vector1, vector1, vector2);
 342         masm.vptest(vector1, vector1);
 343         masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
 344 
 345         masm.bind(loopCheck);
 346         masm.addq(length, AVX_VECTOR_SIZE);
 347         masm.jcc(ConditionFlag.NotZero, loop);
 348 
 349         masm.testl(result, result);
 350         masm.jcc(ConditionFlag.Zero, trueLabel);


 351 
 352         if (requiresNaNCheck) {
 353             Label unalignedCheck = new Label();
 354             masm.jmpb(unalignedCheck);
 355             masm.bind(nanCheck);
 356             emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, AVX_VECTOR_SIZE);
 357             masm.jmpb(loopCheck);
 358             masm.bind(unalignedCheck);
 359         }

 360 
 361         /*
 362          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 363          * array.
 364          */
 365         masm.vmovdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
 366         masm.vmovdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
 367         masm.vpxor(vector1, vector1, vector2);
 368         masm.vptest(vector1, vector1);
 369         if (requiresNaNCheck) {
 370             masm.jcc(ConditionFlag.Zero, trueLabel);
 371             emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
 372         } else {
 373             masm.jcc(ConditionFlag.NotZero, falseLabel);
 374         }
 375         masm.jmp(trueLabel);
 376 
 377         masm.bind(compareTail);
 378         masm.movl(length, result);
 379     }
 380 
 381     /**
 382      * Vector size used in {@link #emit8ByteCompare}.
 383      */
 384     private static final int VECTOR_SIZE = 8;
 385 
 386     /**
 387      * Emits code that uses 8-byte vector compares.
 388      */
 389     private static void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4,
 390                     Value tempXMM, Label trueLabel, Label falseLabel) {

 391         Label loop = new Label();
 392         Label compareTail = new Label();
 393 
 394         boolean requiresNaNCheck = kind.isNumericFloat();


 395         Label loopCheck = new Label();
 396         Label nanCheck = new Label();
 397 
 398         Register temp = asRegister(temp4);
 399 
 400         masm.andl(result, VECTOR_SIZE - 1); // tail count (in bytes)
 401         masm.andl(length, ~(VECTOR_SIZE - 1));  // vector count (in bytes)
 402         masm.jcc(ConditionFlag.Zero, compareTail);
 403 
 404         masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
 405         masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
 406         masm.negq(length);
 407 
 408         // Align the main loop
 409         masm.align(crb.target.wordSize * 2);
 410         masm.bind(loop);
 411         masm.movq(temp, new AMD64Address(array1, length, Scale.Times1, 0));
 412         masm.cmpq(temp, new AMD64Address(array2, length, Scale.Times1, 0));
 413         masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);
 414 
 415         masm.bind(loopCheck);
 416         masm.addq(length, VECTOR_SIZE);
 417         masm.jccb(ConditionFlag.NotZero, loop);
 418 
 419         masm.testl(result, result);
 420         masm.jcc(ConditionFlag.Zero, trueLabel);
 421 
 422         if (requiresNaNCheck) {
 423             // NaN check is slow path and hence placed outside of the main loop.
 424             Label unalignedCheck = new Label();
 425             masm.jmpb(unalignedCheck);
 426             masm.bind(nanCheck);
 427             // At most two iterations, unroll in the emitted code.
 428             for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
 429                 emitFloatCompare(masm, kind, array1, array2, length, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
 430             }
 431             masm.jmpb(loopCheck);
 432             masm.bind(unalignedCheck);
 433         }
 434 
 435         /*
 436          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 437          * array.
 438          */
 439         masm.movq(temp, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
 440         masm.cmpq(temp, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
 441         if (requiresNaNCheck) {
 442             masm.jcc(ConditionFlag.Equal, trueLabel);
 443             // At most two iterations, unroll in the emitted code.
 444             for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
 445                 emitFloatCompare(masm, kind, array1, array2, result, temp4, tempXMM, -VECTOR_SIZE + offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
 446             }
 447         } else {
 448             masm.jccb(ConditionFlag.NotEqual, falseLabel);
 449         }
 450         masm.jmpb(trueLabel);
 451 
 452         masm.bind(compareTail);
 453         masm.movl(length, result);
 454     }
 455 
 456     /**
 457      * Emits code to compare the remaining tail of fewer than 8 bytes, in 4-, 2- and 1-byte steps.
 458      */
 459     private static void emitTailCompares(AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4, Value tempXMM,
 460                     Label trueLabel, Label falseLabel) {

 461         Label compare2Bytes = new Label();
 462         Label compare1Byte = new Label();
 463 
 464         Register temp = asRegister(temp4);
 465 
 466         if (kind.getByteCount() <= 4) {
 467             // Compare trailing 4 bytes, if any.
 468             masm.testl(result, 4);
 469             masm.jccb(ConditionFlag.Zero, compare2Bytes);
 470             masm.movl(temp, new AMD64Address(array1, 0));
 471             masm.cmpl(temp, new AMD64Address(array2, 0));
 472             if (kind == JavaKind.Float) {
 473                 masm.jccb(ConditionFlag.Equal, trueLabel);
 474                 emitFloatCompare(masm, kind, array1, array2, Register.None, temp4, tempXMM, 0, falseLabel, true);
 475                 masm.jmpb(trueLabel);
 476             } else {
 477                 masm.jccb(ConditionFlag.NotEqual, falseLabel);
 478             }
 479             if (kind.getByteCount() <= 2) {
 480                 // Move array pointers forward.
 481                 masm.leaq(array1, new AMD64Address(array1, 4));
 482                 masm.leaq(array2, new AMD64Address(array2, 4));
 483 
 484                 // Compare trailing 2 bytes, if any.
 485                 masm.bind(compare2Bytes);
 486                 masm.testl(result, 2);
 487                 masm.jccb(ConditionFlag.Zero, compare1Byte);
 488                 masm.movzwl(temp, new AMD64Address(array1, 0));
 489                 masm.movzwl(length, new AMD64Address(array2, 0));
 490                 masm.cmpl(temp, length);
 491                 masm.jccb(ConditionFlag.NotEqual, falseLabel);
 492 
 493                 // The one-byte tail compare is only required for boolean and byte arrays.
 494                 if (kind.getByteCount() <= 1) {
 495                     // Move array pointers forward before we compare the last trailing byte.
 496                     masm.leaq(array1, new AMD64Address(array1, 2));
 497                     masm.leaq(array2, new AMD64Address(array2, 2));
 498 
 499                     // Compare trailing byte, if any.
 500                     masm.bind(compare1Byte);
 501                     masm.testl(result, 1);
 502                     masm.jccb(ConditionFlag.Zero, trueLabel);
 503                     masm.movzbl(temp, new AMD64Address(array1, 0));
 504                     masm.movzbl(length, new AMD64Address(array2, 0));
 505                     masm.cmpl(temp, length);
 506                     masm.jccb(ConditionFlag.NotEqual, falseLabel);
 507                 } else {
 508                     masm.bind(compare1Byte);
 509                 }
 510             } else {
 511                 masm.bind(compare2Bytes);
 512             }
 513         }
 514     }
 515 
 516     /**
 517      * Emits code to fall through if {@code src} is NaN, otherwise jump to {@code branchIfNonNaN}.
 518      */
 519     private static void emitNaNCheck(AMD64MacroAssembler masm, JavaKind kind, Value tempXMM, AMD64Address src, Label branchIfNonNaN) {
 520         assert kind.isNumericFloat();
 521         Register tempXMMReg = asRegister(tempXMM);
 522         if (kind == JavaKind.Float) {
 523             masm.movflt(tempXMMReg, src);
 524         } else {
 525             masm.movdbl(tempXMMReg, src);
 526         }
 527         SSEOp.UCOMIS.emit(masm, kind == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
 528         masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
 529     }
 530 
 531     /**
 532      * Emits code to check whether two floating-point values are bitwise equal or both NaN.
 533      */
 534     private static void emitFloatCompare(AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value tempXMM, int offset, Label falseLabel,
 535                     boolean skipBitwiseCompare) {
 536         AMD64Address address1 = new AMD64Address(base1, index, Scale.Times1, offset);
 537         AMD64Address address2 = new AMD64Address(base2, index, Scale.Times1, offset);
 538 
 539         Label bitwiseEqual = new Label();
 540 
 541         if (!skipBitwiseCompare) {
 542             // Bitwise compare
 543             Register temp = asRegister(temp4);
 544 
 545             if (kind == JavaKind.Float) {
 546                 masm.movl(temp, address1);
 547                 masm.cmpl(temp, address2);
 548             } else {
 549                 masm.movq(temp, address1);
 550                 masm.cmpq(temp, address2);
 551             }
 552             masm.jccb(ConditionFlag.Equal, bitwiseEqual);
 553         }
 554 
 555         emitNaNCheck(masm, kind, tempXMM, address1, falseLabel);
 556         emitNaNCheck(masm, kind, tempXMM, address2, falseLabel);
 557 
 558         masm.bind(bitwiseEqual);
 559     }
 560 
 561     /**
 562      * Emits code to compare float equality within a range.
 563      */
 564     private static void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value temp5,
 565                     Value tempXMM, int offset, Label falseLabel, int range) {
 566         assert kind.isNumericFloat();
 567         Label loop = new Label();
 568         Register i = asRegister(temp5);
 569 
 570         masm.movq(i, range);
 571         masm.negq(i);
 572         // Align the main loop
 573         masm.align(crb.target.wordSize * 2);
 574         masm.bind(loop);
 575         emitFloatCompare(masm, kind, base1, base2, index, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == range);
 576         masm.addq(index, kind.getByteCount());
 577         masm.addq(i, kind.getByteCount());
 578         masm.jccb(ConditionFlag.NotZero, loop);
 579         // Floats within the range are equal; revert the change to the index register
 580         masm.subq(index, range);
 581     }
 582 
 583     /**
 584      * Emits specialized assembly for checking equality of memory regions
 585      * {@code arrayPtr1[0..nBytes]} and {@code arrayPtr2[0..nBytes]}. If they match, execution
 586      * continues directly after the emitted code block, otherwise we jump to {@code noMatch}.
 587      */
 588     private static void emitConstantLengthArrayCompareBytes(

 589                     AMD64MacroAssembler asm,
 590                     Register arrayPtr1,
 591                     Register arrayPtr2,
 592                     Register tmp1,
 593                     Register tmp2,
 594                     Register[] tmpVectors,
 595                     Label noMatch,
 596                     int nBytes,
 597                     int bytesPerVector) {
 598         assert bytesPerVector >= 16;
 599         if (nBytes == 0) {
 600             // do nothing
 601             return;
 602         }
 603         if (nBytes < 16) {
 604             // array is shorter than any vector register, use regular CMP instructions
 605             int movSize = (nBytes < 2) ? 1 : ((nBytes < 4) ? 2 : ((nBytes < 8) ? 4 : 8));
 606             emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1), movSize);
 607             emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2), movSize);
 608             emitCmpBytes(asm, tmp1, tmp2, movSize);
 609             asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
 610             if (nBytes > movSize) {
 611                 emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1, nBytes - movSize), movSize);
 612                 emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2, nBytes - movSize), movSize);
 613                 emitCmpBytes(asm, tmp1, tmp2, movSize);
 614                 asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
 615             }
 616         } else if (nBytes < 32 && bytesPerVector >= 32) {
 617             // we could use YMM registers, but the array is too short, force XMM registers
 618             int bytesPerXMMVector = AVXKind.AVXSize.XMM.getBytes();
 619             AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], new AMD64Address(arrayPtr1));
 620             AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[1], new AMD64Address(arrayPtr2));
 621             AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0], tmpVectors[1]);
 622             if (nBytes > bytesPerXMMVector) {
 623                 AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], new AMD64Address(arrayPtr1, nBytes - bytesPerXMMVector));
 624                 AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[3], new AMD64Address(arrayPtr2, nBytes - bytesPerXMMVector));
 625                 AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2], tmpVectors[3]);
 626                 AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2]);
 627                 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 628             }
 629             AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0]);
 630             asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 631         } else if (bytesPerVector >= 32) {
 632             // AVX2 supported, use YMM vectors
 633             assert asm.supports(CPUFeature.AVX2);
 634             int loopCount = nBytes / (bytesPerVector * 2);
 635             int rest = nBytes % (bytesPerVector * 2);
 636             if (loopCount > 0) {
 637                 if (0 < rest && rest < bytesPerVector) {
 638                     loopCount--;
 639                 }
 640                 if (loopCount > 0) {
 641                     if (loopCount > 1) {
 642                         asm.movl(tmp1, loopCount);
 643                     }
 644                     Label loopBegin = new Label();
 645                     asm.bind(loopBegin);
 646                     asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 647                     asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 648                     asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
 649                     asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
 650                     asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
 651                     asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
 652                     asm.vptest(tmpVectors[0], tmpVectors[0]);
 653                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 654                     asm.vptest(tmpVectors[2], tmpVectors[2]);
 655                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 656                     asm.addq(arrayPtr1, bytesPerVector * 2);
 657                     asm.addq(arrayPtr2, bytesPerVector * 2);
 658                     if (loopCount > 1) {
 659                         asm.decrementl(tmp1);
 660                         asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
 661                     }
 662                 }
 663                 if (0 < rest && rest < bytesPerVector) {
 664                     asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 665                     asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 666                     asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
 667                     asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
 668                     asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
 669                     asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
 670                     asm.vptest(tmpVectors[0], tmpVectors[0]);
 671                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 672                     asm.vptest(tmpVectors[2], tmpVectors[2]);
 673                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 674                     asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
 675                     asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
 676                     asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
 677                     asm.vptest(tmpVectors[0], tmpVectors[0]);
 678                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 679                 }
 680             }
 681             if (rest >= bytesPerVector) {
 682                 asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 683                 asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 684                 asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
 685                 if (rest > bytesPerVector) {
 686                     asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
 687                     asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
 688                     asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
 689                     asm.vptest(tmpVectors[2], tmpVectors[2]);
 690                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 691                 }
 692                 asm.vptest(tmpVectors[0], tmpVectors[0]);
 693                 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);


 694             }
 695         } else {
 696             // on AVX or SSE, use XMM vectors
 697             int loopCount = nBytes / (bytesPerVector * 2);
 698             int rest = nBytes % (bytesPerVector * 2);
 699             if (loopCount > 0) {
 700                 if (0 < rest && rest < bytesPerVector) {
 701                     loopCount--;
 702                 }
 703                 if (loopCount > 0) {
 704                     if (loopCount > 1) {
 705                         asm.movl(tmp1, loopCount);
 706                     }
 707                     Label loopBegin = new Label();
 708                     asm.bind(loopBegin);
 709                     asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 710                     asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 711                     asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
 712                     asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
 713                     asm.pxor(tmpVectors[0], tmpVectors[1]);
 714                     asm.pxor(tmpVectors[2], tmpVectors[3]);
 715                     asm.ptest(tmpVectors[0], tmpVectors[0]);
 716                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 717                     asm.ptest(tmpVectors[2], tmpVectors[2]);
 718                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 719                     asm.addq(arrayPtr1, bytesPerVector * 2);
 720                     asm.addq(arrayPtr2, bytesPerVector * 2);
 721                     if (loopCount > 1) {
 722                         asm.decrementl(tmp1);
 723                         asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
 724                     }
 725                 }
 726                 if (0 < rest && rest < bytesPerVector) {
 727                     asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 728                     asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 729                     asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
 730                     asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
 731                     asm.pxor(tmpVectors[0], tmpVectors[1]);
 732                     asm.pxor(tmpVectors[2], tmpVectors[3]);
 733                     asm.ptest(tmpVectors[0], tmpVectors[0]);
 734                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 735                     asm.ptest(tmpVectors[2], tmpVectors[2]);
 736                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 737                     asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
 738                     asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
 739                     asm.pxor(tmpVectors[0], tmpVectors[1]);
 740                     asm.ptest(tmpVectors[0], tmpVectors[0]);
 741                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 742                 }
 743             }
 744             if (rest >= bytesPerVector) {
 745                 asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 746                 asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 747                 asm.pxor(tmpVectors[0], tmpVectors[1]);
 748                 if (rest > bytesPerVector) {
 749                     asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
 750                     asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
 751                     asm.pxor(tmpVectors[2], tmpVectors[3]);
 752                     asm.ptest(tmpVectors[2], tmpVectors[2]);
 753                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 754                 }
 755                 asm.ptest(tmpVectors[0], tmpVectors[0]);
 756                 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 757             }
 758         }
 759     }
 760 
 761     private static void emitMovBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
 762         switch (size) {
 763             case 1:
 764                 asm.movzbl(dst, src);
 765                 break;
 766             case 2:
 767                 asm.movzwl(dst, src);
 768                 break;
 769             case 4:
 770                 asm.movl(dst, src);
 771                 break;
 772             case 8:
 773                 asm.movq(dst, src);
 774                 break;
 775             default:
 776                 throw new IllegalStateException();
 777         }
 778     }
 779 
 780     private static void emitCmpBytes(AMD64MacroAssembler asm, Register dst, Register src, int size) {
 781         if (size < 8) {
 782             asm.cmpl(dst, src);
 783         } else {
 784             asm.cmpq(dst, src);
 785         }
 786     }
 787 }
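
For orientation, the code emitted by both versions of this op follows the same overall strategy: compare a full vector register's worth of data per loop iteration, then handle any remainder with a single unaligned load anchored at the end of the arrays, which may overlap bytes already checked. The Java sketch below models only that control flow; the class name, the fixed 8-byte chunk (a stand-in for the 16-byte SSE4.1 / 32-byte AVX2 vectors), and the ByteBuffer accesses are illustrative assumptions, not part of this file.

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

// Illustrative model only; not part of AMD64ArrayEqualsOp.
final class ArrayEqualsSketch {
    private static final int CHUNK = 8; // stand-in for the 16/32-byte vector width

    static boolean regionEquals(byte[] a, byte[] b, int length) {
        ByteBuffer bufA = ByteBuffer.wrap(a).order(ByteOrder.LITTLE_ENDIAN);
        ByteBuffer bufB = ByteBuffer.wrap(b).order(ByteOrder.LITTLE_ENDIAN);
        int i = 0;
        // Main loop: one chunk per iteration, like the movdqu/pxor/ptest loop above.
        for (; i + CHUNK <= length; i += CHUNK) {
            if (bufA.getLong(i) != bufB.getLong(i)) {
                return false;
            }
        }
        if (i == length) {
            return true;
        }
        if (length >= CHUNK) {
            // Tail: one unaligned chunk anchored at the end of the arrays,
            // possibly overlapping bytes the main loop already compared.
            return bufA.getLong(length - CHUNK) == bufB.getLong(length - CHUNK);
        }
        // Arrays shorter than one chunk: plain scalar compares.
        for (; i < length; i++) {
            if (a[i] != b[i]) {
                return false;
            }
        }
        return true;
    }
}

The NaN re-check for float/double arrays and the sign-/zero-extending loads for mixed-kind comparisons in the second version below have no counterpart in this simplified sketch.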


  24 
  25 package org.graalvm.compiler.lir.amd64;
  26 
  27 import jdk.vm.ci.amd64.AMD64;
  28 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  29 import jdk.vm.ci.amd64.AMD64Kind;
  30 import jdk.vm.ci.code.Register;
  31 import jdk.vm.ci.code.TargetDescription;
  32 import jdk.vm.ci.meta.JavaKind;
  33 import jdk.vm.ci.meta.Value;
  34 import org.graalvm.compiler.asm.Label;
  35 import org.graalvm.compiler.asm.amd64.AMD64Address;
  36 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  37 import org.graalvm.compiler.asm.amd64.AMD64Assembler;
  38 import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
  39 import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
  40 import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
  41 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
  42 import org.graalvm.compiler.asm.amd64.AVXKind;
  43 import org.graalvm.compiler.core.common.LIRKind;
  44 import org.graalvm.compiler.debug.GraalError;
  45 import org.graalvm.compiler.lir.LIRInstructionClass;
  46 import org.graalvm.compiler.lir.Opcode;
  47 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
  48 import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
  49 
  50 import static jdk.vm.ci.code.ValueUtil.asRegister;
  51 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
  52 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
  53 
  54 import java.util.Objects;
  55 
  56 /**
  57  * Emits code which compares two arrays of the same length. If the CPU supports any vector
  58  * instructions, specialized code is emitted to leverage these instructions.
  59  *
  60  * This op can also compare arrays of different integer types (e.g. {@code byte[]} and
  61  * {@code char[]}) with on-the-fly sign- or zero-extension. If one of the given arrays is a
  62  * {@code char[]} array, the smaller elements are zero-extended, otherwise they are sign-extended.
  63  */
  64 @Opcode("ARRAY_EQUALS")
  65 public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
  66     public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);
  67 
  68     private final JavaKind kind1;
  69     private final JavaKind kind2;
  70     private final int arrayBaseOffset1;
  71     private final int arrayBaseOffset2;
  72     private final Scale arrayIndexScale1;
  73     private final Scale arrayIndexScale2;
  74     private final AVXKind.AVXSize vectorSize;
  75     private final int constantLength;
  76     private final boolean signExtend;
  77 
  78     @Def({REG}) private Value resultValue;
  79     @Alive({REG}) private Value array1Value;
  80     @Alive({REG}) private Value array2Value;
  81     @Alive({REG}) private Value lengthValue;
  82     @Temp({REG}) private Value temp1;
  83     @Temp({REG}) private Value temp2;
  84     @Temp({REG}) private Value temp3;
  85     @Temp({REG}) private Value temp4;
  86 
  87     @Temp({REG, ILLEGAL}) private Value temp5;
  88     @Temp({REG, ILLEGAL}) private Value tempXMM;
  89 
  90     @Temp({REG, ILLEGAL}) private Value vectorTemp1;
  91     @Temp({REG, ILLEGAL}) private Value vectorTemp2;
  92     @Temp({REG, ILLEGAL}) private Value vectorTemp3;
  93     @Temp({REG, ILLEGAL}) private Value vectorTemp4;
  94 
  95     public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind1, JavaKind kind2, Value result, Value array1, Value array2, Value length,
  96                     int constantLength, boolean directPointers, int maxVectorSize) {
  97         super(TYPE);
  98         this.kind1 = kind1;
  99         this.kind2 = kind2;
 100         this.signExtend = kind1 != JavaKind.Char && kind2 != JavaKind.Char;
 101 
 102         assert kind1.isNumericInteger() && kind2.isNumericInteger() || kind1 == kind2;
 103 
 104         this.arrayBaseOffset1 = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind1);
 105         this.arrayBaseOffset2 = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind2);
 106         this.arrayIndexScale1 = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(kind1)));
 107         this.arrayIndexScale2 = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(kind2)));
 108         this.vectorSize = ((AMD64) tool.target().arch).getFeatures().contains(CPUFeature.AVX2) && (maxVectorSize < 0 || maxVectorSize >= 32) ? AVXKind.AVXSize.YMM : AVXKind.AVXSize.XMM;
 109         this.constantLength = constantLength;
 110 
 111         this.resultValue = result;
 112         this.array1Value = array1;
 113         this.array2Value = array2;
 114         this.lengthValue = length;
 115 
 116         // Allocate some temporaries.
 117         this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
 118         this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
 119         this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
 120         this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
 121 
 122         this.temp5 = kind1.isNumericFloat() || kind1 != kind2 ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
 123         if (kind1 == JavaKind.Float) {
 124             this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
 125         } else if (kind1 == JavaKind.Double) {
 126             this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 127         } else {
 128             this.tempXMM = Value.ILLEGAL;
 129         }
 130 
 131         // We only need the vector temporaries if we generate SSE code.
 132         if (supportsSSE41(tool.target())) {
 133             if (canGenerateConstantLengthCompare(tool.target())) {
 134                 LIRKind lirKind = LIRKind.value(vectorSize == AVXKind.AVXSize.YMM ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE);
 135                 this.vectorTemp1 = tool.newVariable(lirKind);
 136                 this.vectorTemp2 = tool.newVariable(lirKind);
 137                 this.vectorTemp3 = tool.newVariable(lirKind);
 138                 this.vectorTemp4 = tool.newVariable(lirKind);
 139             } else {
 140                 this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 141                 this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 142                 this.vectorTemp3 = Value.ILLEGAL;
 143                 this.vectorTemp4 = Value.ILLEGAL;
 144             }
 145         } else {
 146             this.vectorTemp1 = Value.ILLEGAL;
 147             this.vectorTemp2 = Value.ILLEGAL;
 148             this.vectorTemp3 = Value.ILLEGAL;
 149             this.vectorTemp4 = Value.ILLEGAL;
 150         }
 151     }
 152 
 153     private boolean canGenerateConstantLengthCompare(TargetDescription target) {
 154         return constantLength >= 0 && kind1.isNumericInteger() && (kind1 == kind2 || getElementsPerVector(AVXKind.AVXSize.XMM) <= constantLength) && supportsSSE41(target);
 155     }
 156 
 157     @Override
 158     public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
 159         Register result = asRegister(resultValue);
 160         Register array1 = asRegister(temp1);
 161         Register array2 = asRegister(temp2);
 162 
 163         Label trueLabel = new Label();
 164         Label falseLabel = new Label();
 165         Label done = new Label();
 166 
 167         // Load array base addresses.
 168         masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset1));
 169         masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset2));
 170 
 171         if (canGenerateConstantLengthCompare(crb.target)) {
 172             emitConstantLengthArrayCompareBytes(crb, masm, array1, array2, asRegister(temp3), asRegister(temp4),
 173                             new Register[]{asRegister(vectorTemp1), asRegister(vectorTemp2), asRegister(vectorTemp3), asRegister(vectorTemp4)}, falseLabel);

 174         } else {
 175             Register length = asRegister(temp3);
 176             // Get array length.

 177             masm.movl(length, asRegister(lengthValue));
 178             // copy
 179             masm.movl(result, length);
 180             emitArrayCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
 181         }
 182 
 183         // Return true
 184         masm.bind(trueLabel);
 185         masm.movl(result, 1);
 186         masm.jmpb(done);
 187 
 188         // Return false
 189         masm.bind(falseLabel);
 190         masm.xorl(result, result);
 191 
 192         // That's it
 193         masm.bind(done);
 194     }
 195 
 196     private void emitArrayCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
 197                     Register result, Register array1, Register array2, Register length,

 198                     Label trueLabel, Label falseLabel) {
 199         if (supportsSSE41(crb.target)) {
 200             emitVectorCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
 201         }
 202         if (kind1 == kind2) {
 203             emit8ByteCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
 204             emitTailCompares(masm, result, array1, array2, length, trueLabel, falseLabel);
 205         } else {
 206             emitDifferentKindsElementWiseCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
 207         }


 208     }
 209 
 210     /**
 211      * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
 212      *
 213      * @param target target description of the underlying architecture
 214      * @return true if the underlying architecture supports SSE 4.1
 215      */
 216     private static boolean supportsSSE41(TargetDescription target) {
 217         AMD64 arch = (AMD64) target.arch;
 218         return arch.getFeatures().contains(CPUFeature.SSE4_1);
 219     }
 220 
 221     /**
 222      * Emits code that uses SSE4.1/AVX1 128-bit (16-byte) or AVX2 256-bit (32-byte) vector compares.
 223      */
 224     private void emitVectorCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
 225                     Register result, Register array1, Register array2, Register length,

 226                     Label trueLabel, Label falseLabel) {
 227         assert supportsSSE41(crb.target);
 228 
 229         Register vector1 = asRegister(vectorTemp1);
 230         Register vector2 = asRegister(vectorTemp2);
 231 
 232         int elementsPerVector = getElementsPerVector(vectorSize);
 233 
 234         Label loop = new Label();
 235         Label compareTail = new Label();
 236 
 237         boolean requiresNaNCheck = kind1.isNumericFloat();
 238         Label loopCheck = new Label();
 239         Label nanCheck = new Label();
 240 
 241         // Compare vector-sized chunks (16 or 32 bytes, depending on vectorSize)
 242         masm.andl(result, elementsPerVector - 1); // tail count
 243         masm.andl(length, ~(elementsPerVector - 1)); // vector count
 244         masm.jcc(ConditionFlag.Zero, compareTail);
 245 
 246         masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
 247         masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
 248         masm.negq(length);
 249 
 250         // Align the main loop
 251         masm.align(crb.target.wordSize * 2);
 252         masm.bind(loop);
 253         emitVectorLoad1(masm, vector1, array1, length, 0, vectorSize);
 254         emitVectorLoad2(masm, vector2, array2, length, 0, vectorSize);
 255         emitVectorCmp(masm, vector1, vector2, vectorSize);

 256         masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
 257 
 258         masm.bind(loopCheck);
 259         masm.addq(length, elementsPerVector);
 260         masm.jcc(ConditionFlag.NotZero, loop);
 261 
 262         masm.testl(result, result);
 263         masm.jcc(ConditionFlag.Zero, trueLabel);
 264 
 265         if (requiresNaNCheck) {
 266             Label unalignedCheck = new Label();
 267             masm.jmpb(unalignedCheck);
 268             masm.bind(nanCheck);
 269             emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, elementsPerVector);
 270             masm.jmpb(loopCheck);
 271             masm.bind(unalignedCheck);
 272         }
 273 
 274         /*
 275          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 276          * array.
 277          */
 278         emitVectorLoad1(masm, vector1, array1, result, scaleDisplacement1(-vectorSize.getBytes()), vectorSize);
 279         emitVectorLoad2(masm, vector2, array2, result, scaleDisplacement2(-vectorSize.getBytes()), vectorSize);
 280         emitVectorCmp(masm, vector1, vector2, vectorSize);

 281         if (requiresNaNCheck) {
 282             masm.jcc(ConditionFlag.Zero, trueLabel);
 283             emitFloatCompareWithinRange(crb, masm, array1, array2, result, -vectorSize.getBytes(), falseLabel, elementsPerVector);
 284         } else {
 285             masm.jcc(ConditionFlag.NotZero, falseLabel);
 286         }
 287         masm.jmp(trueLabel);
 288 
 289         masm.bind(compareTail);
 290         masm.movl(length, result);
 291     }
 292 
 293     private int getElementsPerVector(AVXKind.AVXSize vSize) {
 294         return vSize.getBytes() >> Math.max(arrayIndexScale1.log2, arrayIndexScale2.log2);
 295     }
 296 
 297     private void emitVectorLoad1(AMD64MacroAssembler asm, Register dst, Register src, int displacement, AVXKind.AVXSize size) {
 298         emitVectorLoad1(asm, dst, src, Register.None, displacement, size);
 299     }

 300 
 301     private void emitVectorLoad2(AMD64MacroAssembler asm, Register dst, Register src, int displacement, AVXKind.AVXSize size) {
 302         emitVectorLoad2(asm, dst, src, Register.None, displacement, size);
 303     }


 304 
 305     private void emitVectorLoad1(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, AVXKind.AVXSize size) {
 306         emitVectorLoad(asm, dst, src, index, displacement, arrayIndexScale1, arrayIndexScale2, size);
 307     }
 308 
 309     private void emitVectorLoad2(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, AVXKind.AVXSize size) {
 310         emitVectorLoad(asm, dst, src, index, displacement, arrayIndexScale2, arrayIndexScale1, size);
 311     }
 312 
 313     private void emitVectorLoad(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, Scale ownScale, Scale otherScale, AVXKind.AVXSize size) {
 314         AMD64Address address = new AMD64Address(src, index, ownScale, displacement);
 315         if (ownScale.value < otherScale.value) {
 316             if (size == AVXKind.AVXSize.YMM) {
 317                 getAVX2LoadAndExtendOp(ownScale, otherScale, signExtend).emit(asm, size, dst, address);
 318             } else {
 319                 loadAndExtendSSE(asm, dst, address, ownScale, otherScale, signExtend);
 320             }
 321         } else {
 322             if (size == AVXKind.AVXSize.YMM) {
 323                 asm.vmovdqu(dst, address);
 324             } else {
 325                 asm.movdqu(dst, address);
 326             }
 327         }
 328     }
 329 
 330     private int scaleDisplacement1(int displacement) {
 331         return scaleDisplacement(displacement, arrayIndexScale1, arrayIndexScale2);
 332     }

 333 
 334     private int scaleDisplacement2(int displacement) {
 335         return scaleDisplacement(displacement, arrayIndexScale2, arrayIndexScale1);
 336     }
 337 
 338     private static int scaleDisplacement(int displacement, Scale ownScale, Scale otherScale) {
 339         if (ownScale.value < otherScale.value) {
 340             return displacement >> (otherScale.log2 - ownScale.log2);
 341         }
 342         return displacement;
 343     }


 344 
 345     private static AMD64Assembler.VexRMOp getAVX2LoadAndExtendOp(Scale ownScale, Scale otherScale, boolean signExtend) {
 346         switch (ownScale) {
 347             case Times1:
 348                 switch (otherScale) {
 349                     case Times2:
 350                         return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBW : AMD64Assembler.VexRMOp.VPMOVZXBW;
 351                     case Times4:
 352                         return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBD : AMD64Assembler.VexRMOp.VPMOVZXBD;
 353                     case Times8:
 354                         return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBQ : AMD64Assembler.VexRMOp.VPMOVZXBQ;
 355                 }
 356                 throw GraalError.shouldNotReachHere();
 357             case Times2:
 358                 switch (otherScale) {
 359                     case Times4:
 360                         return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXWD : AMD64Assembler.VexRMOp.VPMOVZXWD;
 361                     case Times8:
 362                         return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXWQ : AMD64Assembler.VexRMOp.VPMOVZXWQ;
 363                 }
 364                 throw GraalError.shouldNotReachHere();
 365             case Times4:
 366                 return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXDQ : AMD64Assembler.VexRMOp.VPMOVZXDQ;
 367         }
 368         throw GraalError.shouldNotReachHere();
 369     }
 370 
 371     private static void loadAndExtendSSE(AMD64MacroAssembler asm, Register dst, AMD64Address src, Scale ownScale, Scale otherScale, boolean signExtend) {
 372         switch (ownScale) {
 373             case Times1:
 374                 switch (otherScale) {
 375                     case Times2:
 376                         if (signExtend) {
 377                             asm.pmovsxbw(dst, src);
 378                         } else {
 379                             asm.pmovzxbw(dst, src);
 380                         }
 381                         return;
 382                     case Times4:
 383                         if (signExtend) {
 384                             asm.pmovsxbd(dst, src);
 385                         } else {
 386                             asm.pmovzxbd(dst, src);
 387                         }
 388                         return;
 389                     case Times8:
 390                         if (signExtend) {
 391                             asm.pmovsxbq(dst, src);
 392                         } else {
 393                             asm.pmovzxbq(dst, src);
 394                         }
 395                         return;
 396                 }
 397                 throw GraalError.shouldNotReachHere();
 398             case Times2:
 399                 switch (otherScale) {
 400                     case Times4:
 401                         if (signExtend) {
 402                             asm.pmovsxwd(dst, src);
 403                         } else {
 404                             asm.pmovzxwd(dst, src);
 405                         }
 406                         return;
 407                     case Times8:
 408                         if (signExtend) {
 409                             asm.pmovsxwq(dst, src);
 410                         } else {
 411                             asm.pmovzxwq(dst, src);
 412                         }
 413                         return;
 414                 }
 415                 throw GraalError.shouldNotReachHere();
 416             case Times4:
 417                 if (signExtend) {
 418                     asm.pmovsxdq(dst, src);
 419                 } else {
 420                     asm.pmovzxdq(dst, src);
 421                 }
 422                 return;
 423         }
 424         throw GraalError.shouldNotReachHere();
 425     }
 426 
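    // Equality idiom: XORing the two vectors yields all-zero bits iff they are identical, and
    // (v)ptest of the result with itself sets the zero flag exactly when all bits are zero, so a
    // subsequent NotZero branch targets the mismatch path.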
 427     private static void emitVectorCmp(AMD64MacroAssembler masm, Register vector1, Register vector2, AVXKind.AVXSize size) {
 428         emitVectorXor(masm, vector1, vector2, size);
 429         emitVectorTest(masm, vector1, size);
 430     }
 431 
 432     private static void emitVectorXor(AMD64MacroAssembler masm, Register vector1, Register vector2, AVXKind.AVXSize size) {
 433         if (size == AVXKind.AVXSize.YMM) {
 434             masm.vpxor(vector1, vector1, vector2);
 435         } else {
 436             masm.pxor(vector1, vector2);
 437         }
 438     }
 439 
 440     private static void emitVectorTest(AMD64MacroAssembler masm, Register vector1, AVXKind.AVXSize size) {
 441         if (size == AVXKind.AVXSize.YMM) {
 442             masm.vptest(vector1, vector1);
 443         } else {
 444             masm.ptest(vector1, vector1);
 445         }
 446     }
 447 
 448     /**
 449      * Vector size used in {@link #emit8ByteCompare}.
 450      */
 451     private static final int VECTOR_SIZE = 8;
 452 
 453     /**
 454      * Emits code that uses 8-byte vector compares.
 455      */
 456     private void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
 457                     Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
 458         assert kind1 == kind2;
 459         Label loop = new Label();
 460         Label compareTail = new Label();
 461 
 462         int elementsPerVector = 8 >> arrayIndexScale1.log2;
 463 
 464         boolean requiresNaNCheck = kind1.isNumericFloat();
 465         Label loopCheck = new Label();
 466         Label nanCheck = new Label();
 467 
 468         Register temp = asRegister(temp4);
 469 
 470         masm.andl(result, elementsPerVector - 1); // tail count
 471         masm.andl(length, ~(elementsPerVector - 1));  // vector count
 472         masm.jcc(ConditionFlag.Zero, compareTail);
 473 
 474         masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
 475         masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
 476         masm.negq(length);
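        // Both array pointers now point at the end of the vectorized region and length holds a
        // negative element count, so the loop below indexes with length counting up towards zero;
        // the addq/jcc pair at the loop end doubles as the exit test.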
 477 
 478         // Align the main loop
 479         masm.align(crb.target.wordSize * 2);
 480         masm.bind(loop);
 481         masm.movq(temp, new AMD64Address(array1, length, arrayIndexScale1, 0));
 482         masm.cmpq(temp, new AMD64Address(array2, length, arrayIndexScale2, 0));
 483         masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);
 484 
 485         masm.bind(loopCheck);
 486         masm.addq(length, elementsPerVector);
 487         masm.jccb(ConditionFlag.NotZero, loop);
 488 
 489         masm.testl(result, result);
 490         masm.jcc(ConditionFlag.Zero, trueLabel);
 491 
 492         if (requiresNaNCheck) {
 493             // The NaN check is a slow path and is hence placed outside of the main loop.
 494             Label unalignedCheck = new Label();
 495             masm.jmpb(unalignedCheck);
 496             masm.bind(nanCheck);
 497             // At most two iterations, unroll in the emitted code.
 498             for (int offset = 0; offset < VECTOR_SIZE; offset += kind1.getByteCount()) {
 499                 emitFloatCompare(masm, array1, array2, length, offset, falseLabel, kind1.getByteCount() == VECTOR_SIZE);
 500             }
 501             masm.jmpb(loopCheck);
 502             masm.bind(unalignedCheck);
 503         }
 504 
 505         /*
 506          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 507          * array.
 508          */
 509         masm.movq(temp, new AMD64Address(array1, result, arrayIndexScale1, -VECTOR_SIZE));
 510         masm.cmpq(temp, new AMD64Address(array2, result, arrayIndexScale2, -VECTOR_SIZE));
 511         if (requiresNaNCheck) {
 512             masm.jcc(ConditionFlag.Equal, trueLabel);
 513             // At most two iterations, unroll in the emitted code.
 514             for (int offset = 0; offset < VECTOR_SIZE; offset += kind1.getByteCount()) {
 515                 emitFloatCompare(masm, array1, array2, result, -VECTOR_SIZE + offset, falseLabel, kind1.getByteCount() == VECTOR_SIZE);
 516             }
 517         } else {
 518             masm.jccb(ConditionFlag.NotEqual, falseLabel);
 519         }
 520         masm.jmpb(trueLabel);
 521 
 522         masm.bind(compareTail);
 523         masm.movl(length, result);
 524     }
 525 
 526     /**
 527      * Emits code to compare the remaining tail (at most 7 bytes).
 528      */
 529     private void emitTailCompares(AMD64MacroAssembler masm,
 530                     Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
 531         assert kind1 == kind2;
 532         Label compare2Bytes = new Label();
 533         Label compare1Byte = new Label();
 534 
 535         Register temp = asRegister(temp4);
 536 
 537         if (kind1.getByteCount() <= 4) {
 538             // Compare trailing 4 bytes, if any.
 539             masm.testl(result, arrayIndexScale1.log2 == 0 ? 4 : 4 >> arrayIndexScale1.log2);
 540             masm.jccb(ConditionFlag.Zero, compare2Bytes);
 541             masm.movl(temp, new AMD64Address(array1, 0));
 542             masm.cmpl(temp, new AMD64Address(array2, 0));
 543             if (kind1 == JavaKind.Float) {
 544                 masm.jccb(ConditionFlag.Equal, trueLabel);
 545                 emitFloatCompare(masm, array1, array2, Register.None, 0, falseLabel, true);
 546                 masm.jmpb(trueLabel);
 547             } else {
 548                 masm.jccb(ConditionFlag.NotEqual, falseLabel);
 549             }
 550             if (kind1.getByteCount() <= 2) {
 551                 // Move array pointers forward.
 552                 masm.leaq(array1, new AMD64Address(array1, 4));
 553                 masm.leaq(array2, new AMD64Address(array2, 4));
 554 
 555                 // Compare trailing 2 bytes, if any.
 556                 masm.bind(compare2Bytes);
 557                 masm.testl(result, arrayIndexScale1.log2 == 0 ? 2 : 2 >> arrayIndexScale1.log2);
 558                 masm.jccb(ConditionFlag.Zero, compare1Byte);
 559                 masm.movzwl(temp, new AMD64Address(array1, 0));
 560                 masm.movzwl(length, new AMD64Address(array2, 0));
 561                 masm.cmpl(temp, length);
 562                 masm.jccb(ConditionFlag.NotEqual, falseLabel);
 563 
 564                 // The one-byte tail compare is only required for boolean and byte arrays.
 565                 if (kind1.getByteCount() <= 1) {
 566                     // Move array pointers forward before we compare the last trailing byte.
 567                     masm.leaq(array1, new AMD64Address(array1, 2));
 568                     masm.leaq(array2, new AMD64Address(array2, 2));
 569 
 570                     // Compare trailing byte, if any.
 571                     masm.bind(compare1Byte);
 572                     masm.testl(result, 1);
 573                     masm.jccb(ConditionFlag.Zero, trueLabel);
 574                     masm.movzbl(temp, new AMD64Address(array1, 0));
 575                     masm.movzbl(length, new AMD64Address(array2, 0));
 576                     masm.cmpl(temp, length);
 577                     masm.jccb(ConditionFlag.NotEqual, falseLabel);
 578                 } else {
 579                     masm.bind(compare1Byte);
 580                 }
 581             } else {
 582                 masm.bind(compare2Bytes);
 583             }
 584         }
 585     }
 586 
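    /**
     * Compares arrays of two different integer element kinds element by element: each element is
     * loaded into a 64-bit register, sign- or zero-extended according to {@code signExtend}, and
     * compared with a single {@code cmpq}. The main loop is unrolled by
     * {@code elementsPerLoopIteration}; the remaining tail elements are compared one at a time.
     */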
 587     private void emitDifferentKindsElementWiseCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
 588                     Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
 589         assert kind1 != kind2;
 590         assert kind1.isNumericInteger() && kind2.isNumericInteger();
 591         Label loop = new Label();
 592         Label compareTail = new Label();
 593 
 594         int elementsPerLoopIteration = 4;
 595 
 596         Register tmp1 = asRegister(temp4);
 597         Register tmp2 = asRegister(temp5);
 598 
 599         masm.andl(result, elementsPerLoopIteration - 1); // tail count
 600         masm.andl(length, ~(elementsPerLoopIteration - 1));  // bulk loop count
 601         masm.jcc(ConditionFlag.Zero, compareTail);
 602 
 603         masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
 604         masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
 605         masm.negq(length);
 606 
 607         // clear comparison registers because of the missing movzlq instruction
 608         masm.xorq(tmp1, tmp1);
 609         masm.xorq(tmp2, tmp2);
 610 
 611         // Align the main loop
 612         masm.align(crb.target.wordSize * 2);
 613         masm.bind(loop);
 614         for (int i = 0; i < elementsPerLoopIteration; i++) {
 615             emitMovBytes(masm, tmp1, new AMD64Address(array1, length, arrayIndexScale1, i << arrayIndexScale1.log2), kind1.getByteCount());
 616             emitMovBytes(masm, tmp2, new AMD64Address(array2, length, arrayIndexScale2, i << arrayIndexScale2.log2), kind2.getByteCount());
 617             masm.cmpq(tmp1, tmp2);
 618             masm.jcc(ConditionFlag.NotEqual, falseLabel);
 619         }
 620         masm.addq(length, elementsPerLoopIteration);
 621         masm.jccb(ConditionFlag.NotZero, loop);
 622 
 623         masm.bind(compareTail);
 624         masm.testl(result, result);
 625         masm.jcc(ConditionFlag.Zero, trueLabel);
 626         for (int i = 0; i < elementsPerLoopIteration - 1; i++) {
 627             emitMovBytes(masm, tmp1, new AMD64Address(array1, length, arrayIndexScale1, 0), kind1.getByteCount());
 628             emitMovBytes(masm, tmp2, new AMD64Address(array2, length, arrayIndexScale2, 0), kind2.getByteCount());
 629             masm.cmpq(tmp1, tmp2);
 630             masm.jcc(ConditionFlag.NotEqual, falseLabel);
 631             if (i < elementsPerLoopIteration - 2) {
 632                 masm.incrementq(length, 1);
 633                 masm.decrementq(result, 1);
 634                 masm.jcc(ConditionFlag.Zero, trueLabel);
 635             } else {
 636                 masm.jmpb(trueLabel);
 637             }
 638         }
 639     }
 640 
 641     /**
 642      * Emits code to fall through if {@code src} is NaN, otherwise jump to {@code branchIfNonNaN}.
 643      */
 644     private void emitNaNCheck(AMD64MacroAssembler masm, AMD64Address src, Label branchIfNonNaN) {
 645         assert kind1.isNumericFloat();
 646         Register tempXMMReg = asRegister(tempXMM);
 647         if (kind1 == JavaKind.Float) {
 648             masm.movflt(tempXMMReg, src);
 649         } else {
 650             masm.movdbl(tempXMMReg, src);
 651         }
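        // An unordered compare (UCOMIS) of a value with itself sets the parity flag iff the value
        // is NaN, so the NoParity branch below is taken for every non-NaN value and NaN falls
        // through.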
 652         SSEOp.UCOMIS.emit(masm, kind1 == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
 653         masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
 654     }
 655 
 656     /**
 657      * Emits code that checks whether two float or double values are bitwise equal or both NaN, and jumps to {@code falseLabel} otherwise.
 658      */
 659     private void emitFloatCompare(AMD64MacroAssembler masm, Register base1, Register base2, Register index, int offset, Label falseLabel,
 660                     boolean skipBitwiseCompare) {
 661         AMD64Address address1 = new AMD64Address(base1, index, arrayIndexScale1, offset);
 662         AMD64Address address2 = new AMD64Address(base2, index, arrayIndexScale2, offset);
 663 
 664         Label bitwiseEqual = new Label();
 665 
 666         if (!skipBitwiseCompare) {
 667             // Bitwise compare
 668             Register temp = asRegister(temp4);
 669 
 670             if (kind1 == JavaKind.Float) {
 671                 masm.movl(temp, address1);
 672                 masm.cmpl(temp, address2);
 673             } else {
 674                 masm.movq(temp, address1);
 675                 masm.cmpq(temp, address2);
 676             }
 677             masm.jccb(ConditionFlag.Equal, bitwiseEqual);
 678         }
 679 
 680         emitNaNCheck(masm, address1, falseLabel);
 681         emitNaNCheck(masm, address2, falseLabel);
 682 
 683         masm.bind(bitwiseEqual);
 684     }
 685 
 686     /**
 687      * Emits code that compares {@code range} consecutive float or double elements for equality (bitwise equal or both NaN), jumping to {@code falseLabel} on the first mismatch.
 688      */
 689     private void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm,
 690                     Register base1, Register base2, Register index, int offset, Label falseLabel, int range) {
 691         assert kind1.isNumericFloat();
 692         Label loop = new Label();
 693         Register i = asRegister(temp5);
 694 
 695         masm.movq(i, range);
 696         masm.negq(i);
 697         // Align the main loop
 698         masm.align(crb.target.wordSize * 2);
 699         masm.bind(loop);
 700         emitFloatCompare(masm, base1, base2, index, offset, falseLabel, range == 1);
 701         masm.incrementq(index, 1);
 702         masm.incrementq(i, 1);
 703         masm.jccb(ConditionFlag.NotZero, loop);
 704         // Floats within the range are equal, revert change to the register index
 705         masm.subq(index, range);
 706     }
 707 
 708     /**
 709      * Emits specialized assembly for checking equality of memory regions
 710      * {@code arrayPtr1[0..nBytes]} and {@code arrayPtr2[0..nBytes]}. If they match, execution
 711      * continues directly after the emitted code block, otherwise we jump to {@code noMatch}.
 712      */
 713     private void emitConstantLengthArrayCompareBytes(
 714                     CompilationResultBuilder crb,
 715                     AMD64MacroAssembler asm,
 716                     Register arrayPtr1,
 717                     Register arrayPtr2,
 718                     Register tmp1,
 719                     Register tmp2,
 720                     Register[] tmpVectors,
 721                     Label noMatch) {
 722         if (constantLength == 0) {
 723             // do nothing
 724             return;
 725         }
 726         AVXKind.AVXSize vSize = vectorSize;
 727         if (constantLength < getElementsPerVector(vectorSize)) {
 728             vSize = AVXKind.AVXSize.XMM;
 729         }
 730         int elementsPerVector = getElementsPerVector(vSize);
 731         if (elementsPerVector > constantLength) {
 732             assert kind1 == kind2;
 733             int byteLength = constantLength << arrayIndexScale1.log2;
 734             // array is shorter than any vector register, use regular CMP instructions
 735             int movSize = (byteLength < 2) ? 1 : ((byteLength < 4) ? 2 : ((byteLength < 8) ? 4 : 8));
 736             emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1), movSize);
 737             emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2), movSize);
 738             emitCmpBytes(asm, tmp1, tmp2, movSize);
 739             asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
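            // If a single load does not cover the whole region, a second load anchored at the end
            // of the region compares the remaining bytes; the two loads may overlap, which is
            // harmless because the overlapping bytes have already been found equal.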
 740             if (byteLength > movSize) {
 741                 emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1, byteLength - movSize), movSize);
 742                 emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2, byteLength - movSize), movSize);
 743                 emitCmpBytes(asm, tmp1, tmp2, movSize);
 744                 asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
 745             }
 746         } else {
 747             int elementsPerVectorLoop = 2 * elementsPerVector;
 748             int tailCount = constantLength & (elementsPerVectorLoop - 1);
 749             int vectorCount = constantLength & ~(elementsPerVectorLoop - 1);
 750             int bytesPerVector = vSize.getBytes();
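            // Main vector loop, unrolled 2x: each iteration loads and checks two vectors from
            // each array; tmp1 holds a negative element index that is incremented towards zero.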
 751             if (vectorCount > 0) {
 752                 Label loopBegin = new Label();
 753                 asm.leaq(arrayPtr1, new AMD64Address(arrayPtr1, vectorCount << arrayIndexScale1.log2));
 754                 asm.leaq(arrayPtr2, new AMD64Address(arrayPtr2, vectorCount << arrayIndexScale2.log2));
 755                 asm.movq(tmp1, -vectorCount);
 756                 asm.align(crb.target.wordSize * 2);
 757                 asm.bind(loopBegin);
 758                 emitVectorLoad1(asm, tmpVectors[0], arrayPtr1, tmp1, 0, vSize);
 759                 emitVectorLoad2(asm, tmpVectors[1], arrayPtr2, tmp1, 0, vSize);
 760                 emitVectorLoad1(asm, tmpVectors[2], arrayPtr1, tmp1, scaleDisplacement1(bytesPerVector), vSize);
 761                 emitVectorLoad2(asm, tmpVectors[3], arrayPtr2, tmp1, scaleDisplacement2(bytesPerVector), vSize);
 762                 emitVectorXor(asm, tmpVectors[0], tmpVectors[1], vSize);
 763                 emitVectorXor(asm, tmpVectors[2], tmpVectors[3], vSize);
 764                 emitVectorTest(asm, tmpVectors[0], vSize);
 765                 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 766                 emitVectorTest(asm, tmpVectors[2], vSize);
 767                 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 768                 asm.addq(tmp1, elementsPerVectorLoop);
 769                 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
 770             }
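            // Tail: the remaining elements are checked with vector loads anchored at the end of
            // the region; these loads may overlap bytes covered by other loads, in which case those
            // bytes are simply compared twice.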
 771             if (tailCount > 0) {
 772                 emitVectorLoad1(asm, tmpVectors[0], arrayPtr1, (tailCount << arrayIndexScale1.log2) - scaleDisplacement1(bytesPerVector), vSize);
 773                 emitVectorLoad2(asm, tmpVectors[1], arrayPtr2, (tailCount << arrayIndexScale2.log2) - scaleDisplacement2(bytesPerVector), vSize);
 774                 emitVectorXor(asm, tmpVectors[0], tmpVectors[1], vSize);
 775                 if (tailCount > elementsPerVector) {
 776                     emitVectorLoad1(asm, tmpVectors[2], arrayPtr1, 0, vSize);
 777                     emitVectorLoad2(asm, tmpVectors[3], arrayPtr2, 0, vSize);
 778                     emitVectorXor(asm, tmpVectors[2], tmpVectors[3], vSize);
 779                     emitVectorTest(asm, tmpVectors[2], vSize);
 780                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 781                 }
 782                 emitVectorTest(asm, tmpVectors[0], vSize);
 783                 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 784             }
 785         }
 786     }
 787 
 788     private void emitMovBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
 789         switch (size) {
 790             case 1:
 791                 if (signExtend) {
 792                     asm.movsbq(dst, src);
 793                 } else {
 794                     asm.movzbq(dst, src);
 795                 }
 796                 break;
 797             case 2:
 798                 if (signExtend) {
 799                     asm.movswq(dst, src);
 800                 } else {
 801                     asm.movzwq(dst, src);
 802                 }
 803                 break;
 804             case 4:
 805                 if (signExtend) {
 806                     asm.movslq(dst, src);
 807                 } else {
 808                     // there is no movzlq
 809                     asm.movl(dst, src);
 810                 }
 811                 break;
 812             case 8:
 813                 asm.movq(dst, src);
 814                 break;
 815             default:
 816                 throw new IllegalStateException();
 817         }
 818     }
 819 
 820     private static void emitCmpBytes(AMD64MacroAssembler asm, Register dst, Register src, int size) {
 821         if (size < 8) {
 822             asm.cmpl(dst, src);
 823         } else {
 824             asm.cmpq(dst, src);
 825         }
 826     }
 827 }