/*
 * Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.TargetDescription;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

/**
 * Emits code which compares two arrays of the same length. If the CPU supports vector
 * instructions, specialized code is emitted to leverage them.
 */
@Opcode("ARRAY_EQUALS")
public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);

    private final JavaKind kind;
    private final int arrayBaseOffset;
    private final int arrayIndexScale;
    private final int constantByteLength;

    @Def({REG}) private Value resultValue;
    @Alive({REG}) private Value array1Value;
    @Alive({REG}) private Value array2Value;
    @Alive({REG}) private Value lengthValue;
    @Temp({REG}) private Value temp1;
    @Temp({REG}) private Value temp2;
    @Temp({REG}) private Value temp3;
    @Temp({REG}) private Value temp4;

    @Temp({REG, ILLEGAL}) private Value temp5;
    @Temp({REG, ILLEGAL}) private Value tempXMM;

    @Temp({REG, ILLEGAL}) private Value vectorTemp1;
    @Temp({REG, ILLEGAL}) private Value vectorTemp2;
    @Temp({REG, ILLEGAL}) private Value vectorTemp3;
    @Temp({REG, ILLEGAL}) private Value vectorTemp4;

    public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind, Value result, Value array1, Value array2, Value length,
                    int constantLength, boolean directPointers, int maxVectorSize) {
        super(TYPE);
        this.kind = kind;

        this.arrayBaseOffset = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind);
        this.arrayIndexScale = tool.getProviders().getMetaAccess().getArrayIndexScale(kind);

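        // A negative constantLength means the array length is not known at compile time.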
        if (constantLength >= 0 && arrayIndexScale > 1) {
            // scale length
            this.constantByteLength = constantLength << NumUtil.log2Ceil(arrayIndexScale);
        } else {
            this.constantByteLength = constantLength;
        }

        this.resultValue = result;
        this.array1Value = array1;
        this.array2Value = array2;
        this.lengthValue = length;

        // Allocate some temporaries.
        this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));

        this.temp5 = kind.isNumericFloat() ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        if (kind == JavaKind.Float) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
        } else if (kind == JavaKind.Double) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.tempXMM = Value.ILLEGAL;
        }

        // We only need the vector temporaries if we generate SSE code.
        if (supportsSSE41(tool.target())) {
            if (canGenerateConstantLengthCompare(tool.target())) {
                LIRKind lirKind = LIRKind.value(supportsAVX2(tool.target()) && (maxVectorSize < 0 || maxVectorSize >= 32) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE);
                this.vectorTemp1 = tool.newVariable(lirKind);
                this.vectorTemp2 = tool.newVariable(lirKind);
                this.vectorTemp3 = tool.newVariable(lirKind);
                this.vectorTemp4 = tool.newVariable(lirKind);
            } else {
                this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp3 = Value.ILLEGAL;
                this.vectorTemp4 = Value.ILLEGAL;
            }
        } else {
            this.vectorTemp1 = Value.ILLEGAL;
            this.vectorTemp2 = Value.ILLEGAL;
            this.vectorTemp3 = Value.ILLEGAL;
            this.vectorTemp4 = Value.ILLEGAL;
        }
    }

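    /**
     * The constant-length fast path compares raw bytes only, so it is limited to integer kinds;
     * float and double arrays require the NaN-aware comparison of the generic path.
     */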
    private boolean canGenerateConstantLengthCompare(TargetDescription target) {
        return constantByteLength >= 0 && kind.isNumericInteger() && supportsSSE41(target);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Register result = asRegister(resultValue);
        Register array1 = asRegister(temp1);
        Register array2 = asRegister(temp2);

        Label trueLabel = new Label();
        Label falseLabel = new Label();
        Label done = new Label();

        // Load array base addresses.
        masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset));
        masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset));

        if (canGenerateConstantLengthCompare(crb.target)) {
            emitConstantLengthArrayCompareBytes(masm, array1, array2, asRegister(temp3), asRegister(temp4),
                            new Register[]{asRegister(vectorTemp1), asRegister(vectorTemp2), asRegister(vectorTemp3), asRegister(vectorTemp4)},
                            falseLabel, constantByteLength, AVXKind.getRegisterSize(vectorTemp1).getBytes());
        } else {
            Register length = asRegister(temp3);

            // Get array length in bytes.
            masm.movl(length, asRegister(lengthValue));

            if (arrayIndexScale > 1) {
                masm.shll(length, NumUtil.log2Ceil(arrayIndexScale)); // scale length
            }

            masm.movl(result, length); // copy

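            // 'result' holds a copy of the byte length here; the compare stages below reduce it
            // to the remaining tail count before the final true/false value is written.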
            emitArrayCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        }

        // Return true
        masm.bind(trueLabel);
        masm.movl(result, 1);
        masm.jmpb(done);

        // Return false
        masm.bind(falseLabel);
        masm.xorl(result, result);

        // That's it
        masm.bind(done);
    }

    private static void emitArrayCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
                    Register result, Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        if (supportsAVX2(crb.target)) {
            emitAVXCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        } else if (supportsSSE41(crb.target)) {
            // this code is used for AVX as well because our backend correctly ensures that
            // VEX-prefixed instructions are emitted if AVX is supported
            emitSSE41Compare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        }
        emit8ByteCompare(crb, masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
        emitTailCompares(masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
    }

    /**
     * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports SSE 4.1
     */
    private static boolean supportsSSE41(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.SSE4_1);
    }

    /**
     * Vector size used in {@link #emitSSE41Compare}.
     */
    private static final int SSE4_1_VECTOR_SIZE = 16;

    /**
     * Emits code that uses SSE4.1 128-bit (16-byte) vector compares.
     */
    private static void emitSSE41Compare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
                    Register result, Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        assert supportsSSE41(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 16-byte vectors
        masm.andl(result, SSE4_1_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(SSE4_1_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);
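        // The array pointers now point to the end of the vectorized region and 'length' is
        // negative, counting up to zero, so the loop below walks both arrays forward without a
        // separate induction variable.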

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.movdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, SSE4_1_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, SSE4_1_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.movdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -SSE4_1_VECTOR_SIZE, falseLabel, SSE4_1_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Returns whether the underlying AMD64 architecture supports AVX2 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports AVX2
     */
    private static boolean supportsAVX2(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.AVX2);
    }

    /**
     * Vector size used in {@link #emitAVXCompare}.
     */
    private static final int AVX_VECTOR_SIZE = 32;

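    /**
     * Emits code that uses AVX2 256-bit (32-byte) vector compares.
     */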
    private static void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result,
                    Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        assert supportsAVX2(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 32-byte vectors
        masm.andl(result, AVX_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(AVX_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.vmovdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.vmovdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, AVX_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, AVX_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.vmovdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vmovdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Vector size used in {@link #emit8ByteCompare}.
     */
    private static final int VECTOR_SIZE = 8;

    /**
     * Emits code that uses 8-byte vector compares.
     */
    private static void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4,
                    Value tempXMM, Label trueLabel, Label falseLabel) {
        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        Register temp = asRegister(temp4);

        masm.andl(result, VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);
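        // Same negative-index loop structure as in the vector variants above, using 8-byte
        // general-purpose loads instead of SIMD registers.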

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movq(temp, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.cmpq(temp, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, VECTOR_SIZE);
        masm.jccb(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // The NaN check is the slow path and is hence placed outside of the main loop.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            // At most two iterations; unrolled in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, kind, array1, array2, length, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movq(temp, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
        masm.cmpq(temp, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Equal, trueLabel);
            // At most two iterations; unrolled in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, kind, array1, array2, result, temp4, tempXMM, -VECTOR_SIZE + offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
        } else {
            masm.jccb(ConditionFlag.NotEqual, falseLabel);
        }
        masm.jmpb(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Emits code to compare the remaining tail of fewer than 8 bytes.
     */
    private static void emitTailCompares(AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4, Value tempXMM,
                    Label trueLabel, Label falseLabel) {
        Label compare2Bytes = new Label();
        Label compare1Byte = new Label();

        Register temp = asRegister(temp4);

        if (kind.getByteCount() <= 4) {
            // Compare trailing 4 bytes, if any.
            masm.testl(result, 4);
            masm.jccb(ConditionFlag.Zero, compare2Bytes);
            masm.movl(temp, new AMD64Address(array1, 0));
            masm.cmpl(temp, new AMD64Address(array2, 0));
            if (kind == JavaKind.Float) {
                masm.jccb(ConditionFlag.Equal, trueLabel);
                emitFloatCompare(masm, kind, array1, array2, Register.None, temp4, tempXMM, 0, falseLabel, true);
                masm.jmpb(trueLabel);
            } else {
                masm.jccb(ConditionFlag.NotEqual, falseLabel);
            }
            if (kind.getByteCount() <= 2) {
                // Move array pointers forward.
                masm.leaq(array1, new AMD64Address(array1, 4));
                masm.leaq(array2, new AMD64Address(array2, 4));

                // Compare trailing 2 bytes, if any.
                masm.bind(compare2Bytes);
                masm.testl(result, 2);
                masm.jccb(ConditionFlag.Zero, compare1Byte);
                masm.movzwl(temp, new AMD64Address(array1, 0));
                masm.movzwl(length, new AMD64Address(array2, 0));
                masm.cmpl(temp, length);
                masm.jccb(ConditionFlag.NotEqual, falseLabel);

                // The one-byte tail compare is only required for boolean and byte arrays.
                if (kind.getByteCount() <= 1) {
                    // Move array pointers forward before we compare the last trailing byte.
                    masm.leaq(array1, new AMD64Address(array1, 2));
                    masm.leaq(array2, new AMD64Address(array2, 2));

                    // Compare trailing byte, if any.
                    masm.bind(compare1Byte);
                    masm.testl(result, 1);
                    masm.jccb(ConditionFlag.Zero, trueLabel);
                    masm.movzbl(temp, new AMD64Address(array1, 0));
                    masm.movzbl(length, new AMD64Address(array2, 0));
                    masm.cmpl(temp, length);
                    masm.jccb(ConditionFlag.NotEqual, falseLabel);
                } else {
                    masm.bind(compare1Byte);
                }
            } else {
                masm.bind(compare2Bytes);
            }
        }
    }

    /**
     * Emits code to fall through if {@code src} is NaN, otherwise jump to {@code branchIfNonNaN}.
     */
    private static void emitNaNCheck(AMD64MacroAssembler masm, JavaKind kind, Value tempXMM, AMD64Address src, Label branchIfNonNaN) {
        assert kind.isNumericFloat();
        Register tempXMMReg = asRegister(tempXMM);
        if (kind == JavaKind.Float) {
            masm.movflt(tempXMMReg, src);
        } else {
            masm.movdbl(tempXMMReg, src);
        }
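        // Comparing the value with itself via UCOMIS sets the parity flag iff the value is NaN
        // (unordered); NoParity therefore means "not NaN".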
        SSEOp.UCOMIS.emit(masm, kind == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
        masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
    }

    /**
     * Emits code that considers two float values equal if they are bitwise equal or both are NaN;
     * otherwise it jumps to {@code falseLabel}.
     */
    private static void emitFloatCompare(AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value tempXMM, int offset, Label falseLabel,
                    boolean skipBitwiseCompare) {
        AMD64Address address1 = new AMD64Address(base1, index, Scale.Times1, offset);
        AMD64Address address2 = new AMD64Address(base2, index, Scale.Times1, offset);

        Label bitwiseEqual = new Label();

        if (!skipBitwiseCompare) {
            // Bitwise compare
            Register temp = asRegister(temp4);

            if (kind == JavaKind.Float) {
                masm.movl(temp, address1);
                masm.cmpl(temp, address2);
            } else {
                masm.movq(temp, address1);
                masm.cmpq(temp, address2);
            }
            masm.jccb(ConditionFlag.Equal, bitwiseEqual);
        }

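        // The bit patterns differ (checked either above or by the caller); the two elements only
        // compare equal if both are NaN.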
        emitNaNCheck(masm, kind, tempXMM, address1, falseLabel);
        emitNaNCheck(masm, kind, tempXMM, address2, falseLabel);

        masm.bind(bitwiseEqual);
    }

    /**
     * Emits code to compare float equality within a range.
     */
    private static void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value temp5,
                    Value tempXMM, int offset, Label falseLabel, int range) {
        assert kind.isNumericFloat();
        Label loop = new Label();
        Register i = asRegister(temp5);

        masm.movq(i, range);
        masm.negq(i);
        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitFloatCompare(masm, kind, base1, base2, index, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == range);
        masm.addq(index, kind.getByteCount());
        masm.addq(i, kind.getByteCount());
        masm.jccb(ConditionFlag.NotZero, loop);
        // Floats within the range are equal; revert the change to the index register.
        masm.subq(index, range);
    }

    /**
     * Emits specialized assembly for checking equality of memory regions
     * {@code arrayPtr1[0..nBytes]} and {@code arrayPtr2[0..nBytes]}. If they match, execution
     * continues directly after the emitted code block, otherwise we jump to {@code noMatch}.
     */
    private static void emitConstantLengthArrayCompareBytes(
                    AMD64MacroAssembler asm,
                    Register arrayPtr1,
                    Register arrayPtr2,
                    Register tmp1,
                    Register tmp2,
                    Register[] tmpVectors,
                    Label noMatch,
                    int nBytes,
                    int bytesPerVector) {
        assert bytesPerVector >= 16;
        if (nBytes == 0) {
            // do nothing
            return;
        }
        if (nBytes < 16) {
            // array is shorter than any vector register, use regular CMP instructions
            int movSize = (nBytes < 2) ? 1 : ((nBytes < 4) ? 2 : ((nBytes < 8) ? 4 : 8));
            emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1), movSize);
            emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2), movSize);
            emitCmpBytes(asm, tmp1, tmp2, movSize);
            asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
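            // movSize is the largest power of two not exceeding nBytes; if it does not cover the
            // whole region, compare a second, overlapping chunk that ends at the last byte.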
            if (nBytes > movSize) {
                emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1, nBytes - movSize), movSize);
                emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2, nBytes - movSize), movSize);
                emitCmpBytes(asm, tmp1, tmp2, movSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            }
        } else if (nBytes < 32 && bytesPerVector >= 32) {
            // we could use YMM registers, but the array is too short, force XMM registers
            int bytesPerXMMVector = AVXKind.AVXSize.XMM.getBytes();
            AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], new AMD64Address(arrayPtr1));
            AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[1], new AMD64Address(arrayPtr2));
            AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0], tmpVectors[1]);
            if (nBytes > bytesPerXMMVector) {
                AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], new AMD64Address(arrayPtr1, nBytes - bytesPerXMMVector));
                AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[3], new AMD64Address(arrayPtr2, nBytes - bytesPerXMMVector));
                AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
            AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0]);
            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
        } else if (bytesPerVector >= 32) {
            // AVX2 supported, use YMM vectors
            assert asm.supports(CPUFeature.AVX2);
            int loopCount = nBytes / (bytesPerVector * 2);
            int rest = nBytes % (bytesPerVector * 2);
            if (loopCount > 0) {
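                // If the remainder is smaller than one vector, peel one loop iteration so that
                // the remainder can be covered by an overlapping vector load after the loop.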
                if (0 < rest && rest < bytesPerVector) {
                    loopCount--;
                }
                if (loopCount > 0) {
                    if (loopCount > 1) {
                        asm.movl(tmp1, loopCount);
                    }
                    Label loopBegin = new Label();
                    asm.bind(loopBegin);
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.addq(arrayPtr1, bytesPerVector * 2);
                    asm.addq(arrayPtr2, bytesPerVector * 2);
                    if (loopCount > 1) {
                        asm.decrementl(tmp1);
                        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
                    }
                }
                if (0 < rest && rest < bytesPerVector) {
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
            }
            if (rest >= bytesPerVector) {
                asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                if (rest > bytesPerVector) {
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                asm.vptest(tmpVectors[0], tmpVectors[0]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        } else {
            // on AVX or SSE, use XMM vectors
            int loopCount = nBytes / (bytesPerVector * 2);
            int rest = nBytes % (bytesPerVector * 2);
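            // Same structure as the YMM branch above, but with 16-byte vectors.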
            if (loopCount > 0) {
                if (0 < rest && rest < bytesPerVector) {
                    loopCount--;
                }
                if (loopCount > 0) {
                    if (loopCount > 1) {
                        asm.movl(tmp1, loopCount);
                    }
                    Label loopBegin = new Label();
                    asm.bind(loopBegin);
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.addq(arrayPtr1, bytesPerVector * 2);
                    asm.addq(arrayPtr2, bytesPerVector * 2);
                    if (loopCount > 1) {
                        asm.decrementl(tmp1);
                        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
                    }
                }
                if (0 < rest && rest < bytesPerVector) {
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
            }
            if (rest >= bytesPerVector) {
                asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                asm.pxor(tmpVectors[0], tmpVectors[1]);
                if (rest > bytesPerVector) {
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                asm.ptest(tmpVectors[0], tmpVectors[0]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        }
    }

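    /**
     * Loads {@code size} bytes (1, 2, 4, or 8) from {@code src} into {@code dst}; sub-word loads
     * are zero-extended.
     */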
    private static void emitMovBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
        switch (size) {
            case 1:
                asm.movzbl(dst, src);
                break;
            case 2:
                asm.movzwl(dst, src);
                break;
            case 4:
                asm.movl(dst, src);
                break;
            case 8:
                asm.movq(dst, src);
                break;
            default:
                throw new IllegalStateException();
        }
    }

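    /**
     * Compares the values loaded by {@link #emitMovBytes}; since sub-word loads are zero-extended,
     * a 32-bit compare suffices for all sizes below 8 bytes.
     */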
    private static void emitCmpBytes(AMD64MacroAssembler asm, Register dst, Register src, int size) {
        if (size < 8) {
            asm.cmpl(dst, src);
        } else {
            asm.cmpq(dst, src);
        }
    }
}