/*
 * Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.TargetDescription;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.debug.GraalError;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

import java.util.Objects;

/**
 * Emits code which compares two arrays of the same length. If the CPU supports any vector
 * instructions, specialized code is emitted to leverage these instructions.
 *
 * This op can also compare arrays of different integer types (e.g. {@code byte[]} and
 * {@code char[]}) with on-the-fly sign- or zero-extension. If one of the given arrays is a
 * {@code char[]} array, the smaller elements are zero-extended, otherwise they are sign-extended.
 */
@Opcode("ARRAY_EQUALS")
public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);
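
    // For orientation, the code emitted by this op is semantically equivalent to the following
    // scalar loop (an illustrative sketch only; the generated code uses vectorized comparisons
    // where possible, and the two arrays may have any supported integer element types):
    //
    //   static boolean arraysEqual(char[] a, byte[] b, int length) {
    //       for (int i = 0; i < length; i++) {
    //           if (a[i] != (b[i] & 0xFF)) { // zero-extend, because one of the arrays is a char[]
    //               return false;
    //           }
    //       }
    //       return true;
    //   }
    //
    // For floating-point arrays, two elements are also considered equal if both are NaN (see
    // emitFloatCompare).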

    private final JavaKind kind1;
    private final JavaKind kind2;
    private final int arrayBaseOffset1;
    private final int arrayBaseOffset2;
    private final Scale arrayIndexScale1;
    private final Scale arrayIndexScale2;
    private final AVXKind.AVXSize vectorSize;
    private final int constantLength;
    private final boolean signExtend;

    @Def({REG}) private Value resultValue;
    @Alive({REG}) private Value array1Value;
    @Alive({REG}) private Value array2Value;
    @Alive({REG}) private Value lengthValue;
    @Temp({REG}) private Value temp1;
    @Temp({REG}) private Value temp2;
    @Temp({REG}) private Value temp3;
    @Temp({REG}) private Value temp4;

    @Temp({REG, ILLEGAL}) private Value temp5;
    @Temp({REG, ILLEGAL}) private Value tempXMM;

    @Temp({REG, ILLEGAL}) private Value vectorTemp1;
    @Temp({REG, ILLEGAL}) private Value vectorTemp2;
    @Temp({REG, ILLEGAL}) private Value vectorTemp3;
    @Temp({REG, ILLEGAL}) private Value vectorTemp4;

    public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind1, JavaKind kind2, Value result, Value array1, Value array2, Value length,
                    int constantLength, boolean directPointers, int maxVectorSize) {
        super(TYPE);
        this.kind1 = kind1;
        this.kind2 = kind2;
        this.signExtend = kind1 != JavaKind.Char && kind2 != JavaKind.Char;

        assert kind1.isNumericInteger() && kind2.isNumericInteger() || kind1 == kind2;

        this.arrayBaseOffset1 = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind1);
        this.arrayBaseOffset2 = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind2);
        this.arrayIndexScale1 = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(kind1)));
        this.arrayIndexScale2 = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(kind2)));
        this.vectorSize = ((AMD64) tool.target().arch).getFeatures().contains(CPUFeature.AVX2) && (maxVectorSize < 0 || maxVectorSize >= 32) ? AVXKind.AVXSize.YMM : AVXKind.AVXSize.XMM;
        this.constantLength = constantLength;

        this.resultValue = result;
        this.array1Value = array1;
        this.array2Value = array2;
        this.lengthValue = length;

        // Allocate some temporaries.
        this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));

        this.temp5 = kind1.isNumericFloat() || kind1 != kind2 ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        if (kind1 == JavaKind.Float) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
        } else if (kind1 == JavaKind.Double) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.tempXMM = Value.ILLEGAL;
        }

        // We only need the vector temporaries if we generate SSE code.
        if (supportsSSE41(tool.target())) {
            if (canGenerateConstantLengthCompare(tool.target())) {
                LIRKind lirKind = LIRKind.value(vectorSize == AVXKind.AVXSize.YMM ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE);
                this.vectorTemp1 = tool.newVariable(lirKind);
                this.vectorTemp2 = tool.newVariable(lirKind);
                this.vectorTemp3 = tool.newVariable(lirKind);
                this.vectorTemp4 = tool.newVariable(lirKind);
            } else {
                this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp3 = Value.ILLEGAL;
                this.vectorTemp4 = Value.ILLEGAL;
            }
        } else {
            this.vectorTemp1 = Value.ILLEGAL;
            this.vectorTemp2 = Value.ILLEGAL;
            this.vectorTemp3 = Value.ILLEGAL;
            this.vectorTemp4 = Value.ILLEGAL;
        }
    }

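    /**
     * The constant-length fast path ({@link #emitConstantLengthArrayCompareBytes}) is only used
     * when the length is known at compile time, the elements are numeric integers, SSE4.1 is
     * available, and, if the two element types differ, the arrays are at least one XMM vector
     * long.
     */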
    private boolean canGenerateConstantLengthCompare(TargetDescription target) {
        return constantLength >= 0 && kind1.isNumericInteger() && (kind1 == kind2 || getElementsPerVector(AVXKind.AVXSize.XMM) <= constantLength) && supportsSSE41(target);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Register result = asRegister(resultValue);
        Register array1 = asRegister(temp1);
        Register array2 = asRegister(temp2);

        Label trueLabel = new Label();
        Label falseLabel = new Label();
        Label done = new Label();

        // Load array base addresses.
        masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset1));
        masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset2));

        if (canGenerateConstantLengthCompare(crb.target)) {
            emitConstantLengthArrayCompareBytes(crb, masm, array1, array2, asRegister(temp3), asRegister(temp4),
                            new Register[]{asRegister(vectorTemp1), asRegister(vectorTemp2), asRegister(vectorTemp3), asRegister(vectorTemp4)}, falseLabel);
        } else {
            Register length = asRegister(temp3);
            // Get array length.
            masm.movl(length, asRegister(lengthValue));
            // copy
            masm.movl(result, length);
            emitArrayCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }

        // Return true
        masm.bind(trueLabel);
        masm.movl(result, 1);
        masm.jmpb(done);

        // Return false
        masm.bind(falseLabel);
        masm.xorl(result, result);

        // That's it
        masm.bind(done);
    }

    private void emitArrayCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length,
                    Label trueLabel, Label falseLabel) {
        if (supportsSSE41(crb.target)) {
            emitVectorCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }
        if (kind1 == kind2) {
            emit8ByteCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
            emitTailCompares(masm, result, array1, array2, length, trueLabel, falseLabel);
        } else {
            emitDifferentKindsElementWiseCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }
    }

    /**
     * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports SSE 4.1
     */
    private static boolean supportsSSE41(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.SSE4_1);
    }

    /**
     * Emits code that uses SSE4.1/AVX1 128-bit (16-byte) or AVX2 256-bit (32-byte) vector compares.
     */
    private void emitVectorCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length,
                    Label trueLabel, Label falseLabel) {
        assert supportsSSE41(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        int elementsPerVector = getElementsPerVector(vectorSize);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind1.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare vector-sized chunks (16 or 32 bytes, depending on the selected vector size)
        masm.andl(result, elementsPerVector - 1); // tail count
        masm.andl(length, ~(elementsPerVector - 1)); // vector count
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.negq(length);
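        /*
         * The array pointers now point to the end of the vectorizable region and length has been
         * negated, so the loop below indexes with a negative offset that counts up towards zero; a
         * single add then serves as both induction variable update and loop exit test.
         */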

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitVectorLoad1(masm, vector1, array1, length, 0, vectorSize);
        emitVectorLoad2(masm, vector2, array2, length, 0, vectorSize);
        emitVectorCmp(masm, vector1, vector2, vectorSize);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, elementsPerVector);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, elementsPerVector);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        emitVectorLoad1(masm, vector1, array1, result, scaleDisplacement1(-vectorSize.getBytes()), vectorSize);
        emitVectorLoad2(masm, vector2, array2, result, scaleDisplacement2(-vectorSize.getBytes()), vectorSize);
        emitVectorCmp(masm, vector1, vector2, vectorSize);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, array1, array2, result, -vectorSize.getBytes(), falseLabel, elementsPerVector);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

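    /**
     * Returns the number of array elements that fit into a vector register of the given size,
     * based on the larger of the two element types; e.g. a 32-byte YMM register holds 16
     * {@code char} elements.
     */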
    private int getElementsPerVector(AVXKind.AVXSize vSize) {
        return vSize.getBytes() >> Math.max(arrayIndexScale1.log2, arrayIndexScale2.log2);
    }

    private void emitVectorLoad1(AMD64MacroAssembler asm, Register dst, Register src, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad1(asm, dst, src, Register.None, displacement, size);
    }

    private void emitVectorLoad2(AMD64MacroAssembler asm, Register dst, Register src, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad2(asm, dst, src, Register.None, displacement, size);
    }

    private void emitVectorLoad1(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad(asm, dst, src, index, displacement, arrayIndexScale1, arrayIndexScale2, size);
    }

    private void emitVectorLoad2(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, AVXKind.AVXSize size) {
        emitVectorLoad(asm, dst, src, index, displacement, arrayIndexScale2, arrayIndexScale1, size);
    }

    private void emitVectorLoad(AMD64MacroAssembler asm, Register dst, Register src, Register index, int displacement, Scale ownScale, Scale otherScale, AVXKind.AVXSize size) {
        AMD64Address address = new AMD64Address(src, index, ownScale, displacement);
        if (ownScale.value < otherScale.value) {
            if (size == AVXKind.AVXSize.YMM) {
                getAVX2LoadAndExtendOp(ownScale, otherScale, signExtend).emit(asm, size, dst, address);
            } else {
                loadAndExtendSSE(asm, dst, address, ownScale, otherScale, signExtend);
            }
        } else {
            if (size == AVXKind.AVXSize.YMM) {
                asm.vmovdqu(dst, address);
            } else {
                asm.movdqu(dst, address);
            }
        }
    }

    private int scaleDisplacement1(int displacement) {
        return scaleDisplacement(displacement, arrayIndexScale1, arrayIndexScale2);
    }

    private int scaleDisplacement2(int displacement) {
        return scaleDisplacement(displacement, arrayIndexScale2, arrayIndexScale1);
    }

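    /**
     * Scales a displacement given in bytes of the wider element type down to the element size of
     * the narrower array. Loads of the smaller-element array are widened on the fly, so one vector
     * load of that array covers proportionally fewer bytes; e.g. a 32-byte displacement for a
     * {@code char[]} corresponds to a 16-byte displacement for a matching {@code byte[]}.
     */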
    private static int scaleDisplacement(int displacement, Scale ownScale, Scale otherScale) {
        if (ownScale.value < otherScale.value) {
            return displacement >> (otherScale.log2 - ownScale.log2);
        }
        return displacement;
    }

    private static AMD64Assembler.VexRMOp getAVX2LoadAndExtendOp(Scale ownScale, Scale otherScale, boolean signExtend) {
        switch (ownScale) {
            case Times1:
                switch (otherScale) {
                    case Times2:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBW : AMD64Assembler.VexRMOp.VPMOVZXBW;
                    case Times4:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBD : AMD64Assembler.VexRMOp.VPMOVZXBD;
                    case Times8:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXBQ : AMD64Assembler.VexRMOp.VPMOVZXBQ;
                }
                throw GraalError.shouldNotReachHere();
            case Times2:
                switch (otherScale) {
                    case Times4:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXWD : AMD64Assembler.VexRMOp.VPMOVZXWD;
                    case Times8:
                        return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXWQ : AMD64Assembler.VexRMOp.VPMOVZXWQ;
                }
                throw GraalError.shouldNotReachHere();
            case Times4:
                return signExtend ? AMD64Assembler.VexRMOp.VPMOVSXDQ : AMD64Assembler.VexRMOp.VPMOVZXDQ;
        }
        throw GraalError.shouldNotReachHere();
    }

    private static void loadAndExtendSSE(AMD64MacroAssembler asm, Register dst, AMD64Address src, Scale ownScale, Scale otherScale, boolean signExtend) {
        switch (ownScale) {
            case Times1:
                switch (otherScale) {
                    case Times2:
                        if (signExtend) {
                            asm.pmovsxbw(dst, src);
                        } else {
                            asm.pmovzxbw(dst, src);
                        }
                        return;
                    case Times4:
                        if (signExtend) {
                            asm.pmovsxbd(dst, src);
                        } else {
                            asm.pmovzxbd(dst, src);
                        }
                        return;
                    case Times8:
                        if (signExtend) {
                            asm.pmovsxbq(dst, src);
                        } else {
                            asm.pmovzxbq(dst, src);
                        }
                        return;
                }
                throw GraalError.shouldNotReachHere();
            case Times2:
                switch (otherScale) {
                    case Times4:
                        if (signExtend) {
                            asm.pmovsxwd(dst, src);
                        } else {
                            asm.pmovzxwd(dst, src);
                        }
                        return;
                    case Times8:
                        if (signExtend) {
                            asm.pmovsxwq(dst, src);
                        } else {
                            asm.pmovzxwq(dst, src);
                        }
                        return;
                }
                throw GraalError.shouldNotReachHere();
            case Times4:
                if (signExtend) {
                    asm.pmovsxdq(dst, src);
                } else {
                    asm.pmovzxdq(dst, src);
                }
                return;
        }
        throw GraalError.shouldNotReachHere();
    }

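    /**
     * Emits a vector equality check: XOR-ing the two vectors yields all zeros iff they are equal,
     * and (V)PTEST sets the zero flag iff its operand is all zeros.
     */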
    private static void emitVectorCmp(AMD64MacroAssembler masm, Register vector1, Register vector2, AVXKind.AVXSize size) {
        emitVectorXor(masm, vector1, vector2, size);
        emitVectorTest(masm, vector1, size);
    }

    private static void emitVectorXor(AMD64MacroAssembler masm, Register vector1, Register vector2, AVXKind.AVXSize size) {
        if (size == AVXKind.AVXSize.YMM) {
            masm.vpxor(vector1, vector1, vector2);
        } else {
            masm.pxor(vector1, vector2);
        }
    }

    private static void emitVectorTest(AMD64MacroAssembler masm, Register vector1, AVXKind.AVXSize size) {
        if (size == AVXKind.AVXSize.YMM) {
            masm.vptest(vector1, vector1);
        } else {
            masm.ptest(vector1, vector1);
        }
    }

    /**
     * Vector size used in {@link #emit8ByteCompare}.
     */
    private static final int VECTOR_SIZE = 8;

    /**
     * Emits code that uses 8-byte vector compares.
     */
    private void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert kind1 == kind2;
        Label loop = new Label();
        Label compareTail = new Label();

        int elementsPerVector = 8 >> arrayIndexScale1.log2;

        boolean requiresNaNCheck = kind1.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        Register temp = asRegister(temp4);

        masm.andl(result, elementsPerVector - 1); // tail count
        masm.andl(length, ~(elementsPerVector - 1));  // vector count
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movq(temp, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.cmpq(temp, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, elementsPerVector);
        masm.jccb(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // The NaN check is the slow path and is hence placed outside of the main loop.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind1.getByteCount()) {
                emitFloatCompare(masm, array1, array2, length, offset, falseLabel, kind1.getByteCount() == VECTOR_SIZE);
            }
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movq(temp, new AMD64Address(array1, result, arrayIndexScale1, -VECTOR_SIZE));
        masm.cmpq(temp, new AMD64Address(array2, result, arrayIndexScale2, -VECTOR_SIZE));
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Equal, trueLabel);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind1.getByteCount()) {
                emitFloatCompare(masm, array1, array2, result, -VECTOR_SIZE + offset, falseLabel, kind1.getByteCount() == VECTOR_SIZE);
            }
        } else {
            masm.jccb(ConditionFlag.NotEqual, falseLabel);
        }
        masm.jmpb(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Emits code to compare the remaining 1 to 4 bytes.
     */
    private void emitTailCompares(AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert kind1 == kind2;
        Label compare2Bytes = new Label();
        Label compare1Byte = new Label();

        Register temp = asRegister(temp4);

        if (kind1.getByteCount() <= 4) {
            // Compare trailing 4 bytes, if any.
            masm.testl(result, arrayIndexScale1.log2 == 0 ? 4 : 4 >> arrayIndexScale1.log2);
            masm.jccb(ConditionFlag.Zero, compare2Bytes);
            masm.movl(temp, new AMD64Address(array1, 0));
            masm.cmpl(temp, new AMD64Address(array2, 0));
            if (kind1 == JavaKind.Float) {
                masm.jccb(ConditionFlag.Equal, trueLabel);
                emitFloatCompare(masm, array1, array2, Register.None, 0, falseLabel, true);
                masm.jmpb(trueLabel);
            } else {
                masm.jccb(ConditionFlag.NotEqual, falseLabel);
            }
            if (kind1.getByteCount() <= 2) {
                // Move array pointers forward.
                masm.leaq(array1, new AMD64Address(array1, 4));
                masm.leaq(array2, new AMD64Address(array2, 4));

                // Compare trailing 2 bytes, if any.
                masm.bind(compare2Bytes);
                masm.testl(result, arrayIndexScale1.log2 == 0 ? 2 : 2 >> arrayIndexScale1.log2);
                masm.jccb(ConditionFlag.Zero, compare1Byte);
                masm.movzwl(temp, new AMD64Address(array1, 0));
                masm.movzwl(length, new AMD64Address(array2, 0));
                masm.cmpl(temp, length);
                masm.jccb(ConditionFlag.NotEqual, falseLabel);

                // The one-byte tail compare is only required for boolean and byte arrays.
                if (kind1.getByteCount() <= 1) {
                    // Move array pointers forward before we compare the last trailing byte.
                    masm.leaq(array1, new AMD64Address(array1, 2));
                    masm.leaq(array2, new AMD64Address(array2, 2));

                    // Compare trailing byte, if any.
                    masm.bind(compare1Byte);
                    masm.testl(result, 1);
                    masm.jccb(ConditionFlag.Zero, trueLabel);
                    masm.movzbl(temp, new AMD64Address(array1, 0));
                    masm.movzbl(length, new AMD64Address(array2, 0));
                    masm.cmpl(temp, length);
                    masm.jccb(ConditionFlag.NotEqual, falseLabel);
                } else {
                    masm.bind(compare1Byte);
                }
            } else {
                masm.bind(compare2Bytes);
            }
        }
    }

    private void emitDifferentKindsElementWiseCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert kind1 != kind2;
        assert kind1.isNumericInteger() && kind2.isNumericInteger();
        Label loop = new Label();
        Label compareTail = new Label();

        int elementsPerLoopIteration = 4;
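        // The main loop compares 4 elements per iteration; the (at most 3) remaining elements are
        // handled by the unrolled sequence after the compareTail label.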

        Register tmp1 = asRegister(temp4);
        Register tmp2 = asRegister(temp5);

        masm.andl(result, elementsPerLoopIteration - 1); // tail count
        masm.andl(length, ~(elementsPerLoopIteration - 1));  // bulk loop count
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, arrayIndexScale1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, arrayIndexScale2, 0));
        masm.negq(length);

        // clear comparison registers because of the missing movzlq instruction
        masm.xorq(tmp1, tmp1);
        masm.xorq(tmp2, tmp2);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        for (int i = 0; i < elementsPerLoopIteration; i++) {
            emitMovBytes(masm, tmp1, new AMD64Address(array1, length, arrayIndexScale1, i << arrayIndexScale1.log2), kind1.getByteCount());
            emitMovBytes(masm, tmp2, new AMD64Address(array2, length, arrayIndexScale2, i << arrayIndexScale2.log2), kind2.getByteCount());
            masm.cmpq(tmp1, tmp2);
            masm.jcc(ConditionFlag.NotEqual, falseLabel);
        }
        masm.addq(length, elementsPerLoopIteration);
        masm.jccb(ConditionFlag.NotZero, loop);

        masm.bind(compareTail);
        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);
        for (int i = 0; i < elementsPerLoopIteration - 1; i++) {
            emitMovBytes(masm, tmp1, new AMD64Address(array1, length, arrayIndexScale1, 0), kind1.getByteCount());
            emitMovBytes(masm, tmp2, new AMD64Address(array2, length, arrayIndexScale2, 0), kind2.getByteCount());
            masm.cmpq(tmp1, tmp2);
            masm.jcc(ConditionFlag.NotEqual, falseLabel);
            if (i < elementsPerLoopIteration - 2) {
                masm.incrementq(length, 1);
                masm.decrementq(result, 1);
                masm.jcc(ConditionFlag.Zero, trueLabel);
            } else {
                masm.jmpb(trueLabel);
            }
        }
    }

    /**
     * Emits code to fall through if {@code src} is NaN, otherwise jump to {@code branchIfNonNaN}.
     */
    private void emitNaNCheck(AMD64MacroAssembler masm, AMD64Address src, Label branchIfNonNaN) {
        assert kind1.isNumericFloat();
        Register tempXMMReg = asRegister(tempXMM);
        if (kind1 == JavaKind.Float) {
            masm.movflt(tempXMMReg, src);
        } else {
            masm.movdbl(tempXMMReg, src);
        }
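        // Comparing a value with itself via UCOMIS is unordered (parity flag set) iff it is NaN.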
        SSEOp.UCOMIS.emit(masm, kind1 == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
        masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
    }

    /**
     * Emits code to check whether two float values are bitwise equal or both NaN.
     */
    private void emitFloatCompare(AMD64MacroAssembler masm, Register base1, Register base2, Register index, int offset, Label falseLabel,
                    boolean skipBitwiseCompare) {
        AMD64Address address1 = new AMD64Address(base1, index, arrayIndexScale1, offset);
        AMD64Address address2 = new AMD64Address(base2, index, arrayIndexScale2, offset);

        Label bitwiseEqual = new Label();

        if (!skipBitwiseCompare) {
            // Bitwise compare
            Register temp = asRegister(temp4);

            if (kind1 == JavaKind.Float) {
                masm.movl(temp, address1);
                masm.cmpl(temp, address2);
            } else {
                masm.movq(temp, address1);
                masm.cmpq(temp, address2);
            }
            masm.jccb(ConditionFlag.Equal, bitwiseEqual);
        }

        emitNaNCheck(masm, address1, falseLabel);
        emitNaNCheck(masm, address2, falseLabel);

        masm.bind(bitwiseEqual);
    }

    /**
     * Emits code that compares {@code range} consecutive float elements, treating two NaN values
     * at the same position as equal.
     */
    private void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm,
                    Register base1, Register base2, Register index, int offset, Label falseLabel, int range) {
        assert kind1.isNumericFloat();
        Label loop = new Label();
        Register i = asRegister(temp5);

        masm.movq(i, range);
        masm.negq(i);
        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitFloatCompare(masm, base1, base2, index, offset, falseLabel, range == 1);
        masm.incrementq(index, 1);
        masm.incrementq(i, 1);
        masm.jccb(ConditionFlag.NotZero, loop);
        // Floats within the range are equal, revert the change to the index register
        masm.subq(index, range);
    }

    /**
     * Emits specialized assembly for checking equality of the memory regions holding the first
     * {@code constantLength} elements of the arrays pointed to by {@code arrayPtr1} and
     * {@code arrayPtr2}. If they match, execution continues directly after the emitted code block,
     * otherwise we jump to {@code noMatch}.
     */
    private void emitConstantLengthArrayCompareBytes(
                    CompilationResultBuilder crb,
                    AMD64MacroAssembler asm,
                    Register arrayPtr1,
                    Register arrayPtr2,
                    Register tmp1,
                    Register tmp2,
                    Register[] tmpVectors,
                    Label noMatch) {
        if (constantLength == 0) {
            // do nothing
            return;
        }
        AVXKind.AVXSize vSize = vectorSize;
        if (constantLength < getElementsPerVector(vectorSize)) {
            vSize = AVXKind.AVXSize.XMM;
        }
        int elementsPerVector = getElementsPerVector(vSize);
        if (elementsPerVector > constantLength) {
            assert kind1 == kind2;
            int byteLength = constantLength << arrayIndexScale1.log2;
            // array is shorter than any vector register, use regular CMP instructions
            int movSize = (byteLength < 2) ? 1 : ((byteLength < 4) ? 2 : ((byteLength < 8) ? 4 : 8));
            emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1), movSize);
            emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2), movSize);
            emitCmpBytes(asm, tmp1, tmp2, movSize);
            asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            if (byteLength > movSize) {
                emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1, byteLength - movSize), movSize);
                emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2, byteLength - movSize), movSize);
                emitCmpBytes(asm, tmp1, tmp2, movSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            }
        } else {
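            /*
             * The arrays span at least one full vector: the main loop compares two vectors per
             * iteration, and the remaining elements are handled afterwards with vector loads
             * anchored at the end of the arrays.
             */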
            int elementsPerVectorLoop = 2 * elementsPerVector;
            int tailCount = constantLength & (elementsPerVectorLoop - 1);
            int vectorCount = constantLength & ~(elementsPerVectorLoop - 1);
            int bytesPerVector = vSize.getBytes();
            if (vectorCount > 0) {
                Label loopBegin = new Label();
                asm.leaq(arrayPtr1, new AMD64Address(arrayPtr1, vectorCount << arrayIndexScale1.log2));
                asm.leaq(arrayPtr2, new AMD64Address(arrayPtr2, vectorCount << arrayIndexScale2.log2));
                asm.movq(tmp1, -vectorCount);
                asm.align(crb.target.wordSize * 2);
                asm.bind(loopBegin);
                emitVectorLoad1(asm, tmpVectors[0], arrayPtr1, tmp1, 0, vSize);
                emitVectorLoad2(asm, tmpVectors[1], arrayPtr2, tmp1, 0, vSize);
                emitVectorLoad1(asm, tmpVectors[2], arrayPtr1, tmp1, scaleDisplacement1(bytesPerVector), vSize);
                emitVectorLoad2(asm, tmpVectors[3], arrayPtr2, tmp1, scaleDisplacement2(bytesPerVector), vSize);
                emitVectorXor(asm, tmpVectors[0], tmpVectors[1], vSize);
                emitVectorXor(asm, tmpVectors[2], tmpVectors[3], vSize);
                emitVectorTest(asm, tmpVectors[0], vSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                emitVectorTest(asm, tmpVectors[2], vSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                asm.addq(tmp1, elementsPerVectorLoop);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
            }
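            /*
             * Compare the remaining elements with vector loads placed so that the last load ends
             * exactly at the end of the arrays; they may overlap elements that were already
             * compared by the loop above.
             */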
            if (tailCount > 0) {
                emitVectorLoad1(asm, tmpVectors[0], arrayPtr1, (tailCount << arrayIndexScale1.log2) - scaleDisplacement1(bytesPerVector), vSize);
                emitVectorLoad2(asm, tmpVectors[1], arrayPtr2, (tailCount << arrayIndexScale2.log2) - scaleDisplacement2(bytesPerVector), vSize);
                emitVectorXor(asm, tmpVectors[0], tmpVectors[1], vSize);
                if (tailCount > elementsPerVector) {
                    emitVectorLoad1(asm, tmpVectors[2], arrayPtr1, 0, vSize);
                    emitVectorLoad2(asm, tmpVectors[3], arrayPtr2, 0, vSize);
                    emitVectorXor(asm, tmpVectors[2], tmpVectors[3], vSize);
                    emitVectorTest(asm, tmpVectors[2], vSize);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                emitVectorTest(asm, tmpVectors[0], vSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        }
    }

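    /**
     * Loads {@code size} bytes from {@code src} into {@code dst}, sign- or zero-extending them to
     * 64 bits according to {@link #signExtend} (8-byte loads need no extension).
     */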
    private void emitMovBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
        switch (size) {
            case 1:
                if (signExtend) {
                    asm.movsbq(dst, src);
                } else {
                    asm.movzbq(dst, src);
                }
                break;
            case 2:
                if (signExtend) {
                    asm.movswq(dst, src);
                } else {
                    asm.movzwq(dst, src);
                }
                break;
            case 4:
                if (signExtend) {
                    asm.movslq(dst, src);
                } else {
                    // there is no movzlq
                    asm.movl(dst, src);
                }
                break;
            case 8:
                asm.movq(dst, src);
                break;
            default:
                throw new IllegalStateException();
        }
    }

    private static void emitCmpBytes(AMD64MacroAssembler asm, Register dst, Register src, int size) {
        if (size < 8) {
            asm.cmpl(dst, src);
        } else {
            asm.cmpq(dst, src);
        }
    }
}