
src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.amd64/src/org/graalvm/compiler/lir/amd64/AMD64ArrayEqualsOp.java

rev 52509 : [mq]: graal


   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.lir.amd64;
  26 
  27 import static jdk.vm.ci.code.ValueUtil.asRegister;
  28 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
  29 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
  30 
  31 import org.graalvm.compiler.asm.Label;
  32 import org.graalvm.compiler.asm.amd64.AMD64Address;
  33 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  34 import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
  35 import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
  36 import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
  37 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
  38 import org.graalvm.compiler.core.common.LIRKind;
  39 import org.graalvm.compiler.core.common.NumUtil;
  40 import org.graalvm.compiler.lir.LIRInstructionClass;
  41 import org.graalvm.compiler.lir.Opcode;
  42 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
  43 import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
  44 
  45 import jdk.vm.ci.amd64.AMD64;
  46 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  47 import jdk.vm.ci.amd64.AMD64Kind;
  48 import jdk.vm.ci.code.Register;
  49 import jdk.vm.ci.code.TargetDescription;
  50 import jdk.vm.ci.meta.JavaKind;
  51 import jdk.vm.ci.meta.Value;
  52 
  53 /**
  54  * Emits code that compares two arrays of the same length. If the CPU supports vector
  55  * instructions, specialized code is emitted to leverage them.
  56  */
  57 @Opcode("ARRAY_EQUALS")
  58 public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
  59     public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);
  60 
  61     private final JavaKind kind;
  62     private final int arrayBaseOffset;
  63     private final int arrayIndexScale;
  64 
  65     @Def({REG}) protected Value resultValue;
  66     @Alive({REG}) protected Value array1Value;
  67     @Alive({REG}) protected Value array2Value;
  68     @Alive({REG}) protected Value lengthValue;
  69     @Temp({REG}) protected Value temp1;
  70     @Temp({REG}) protected Value temp2;
  71     @Temp({REG}) protected Value temp3;
  72     @Temp({REG}) protected Value temp4;
  73 
  74     @Temp({REG, ILLEGAL}) protected Value temp5;
  75     @Temp({REG, ILLEGAL}) protected Value tempXMM;
  76 
  77     @Temp({REG, ILLEGAL}) protected Value vectorTemp1;
  78     @Temp({REG, ILLEGAL}) protected Value vectorTemp2;
  79 
  80     public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind, Value result, Value array1, Value array2, Value length) {
  81         super(TYPE);
  82         this.kind = kind;
  83 
  84         this.arrayBaseOffset = tool.getProviders().getArrayOffsetProvider().arrayBaseOffset(kind);
  85         this.arrayIndexScale = tool.getProviders().getArrayOffsetProvider().arrayScalingFactor(kind);
  86 
  87         this.resultValue = result;
  88         this.array1Value = array1;
  89         this.array2Value = array2;
  90         this.lengthValue = length;
  91 
  92         // Allocate some temporaries.
  93         this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
  94         this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
  95         this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
  96         this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
  97 
  98         this.temp5 = kind.isNumericFloat() ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
  99         if (kind == JavaKind.Float) {
 100             this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
 101         } else if (kind == JavaKind.Double) {
 102             this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 103         } else {
 104             this.tempXMM = Value.ILLEGAL;
 105         }
 106 
 107         // We only need the vector temporaries if we generate SSE code.
 108         if (supportsSSE41(tool.target())) {
 109             this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 110             this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 111         } else {
 112             this.vectorTemp1 = Value.ILLEGAL;
 113             this.vectorTemp2 = Value.ILLEGAL;
 114         }
 115     }
 116 
 117     @Override
 118     public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
 119         Register result = asRegister(resultValue);
 120         Register array1 = asRegister(temp1);
 121         Register array2 = asRegister(temp2);
 122         Register length = asRegister(temp3);
 123 
 124         Label trueLabel = new Label();
 125         Label falseLabel = new Label();
 126         Label done = new Label();
 127 
 128         // Load array base addresses.
 129         masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset));
 130         masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset));
 131 
 132         // Get array length in bytes.
 133         masm.movl(length, asRegister(lengthValue));
 134 
 135         if (arrayIndexScale > 1) {
 136             masm.shll(length, NumUtil.log2Ceil(arrayIndexScale)); // scale length
 137         }
 138 
 139         masm.movl(result, length); // copy
 140 
 141         if (supportsAVX2(crb.target)) {
 142             emitAVXCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
 143         } else if (supportsSSE41(crb.target)) {
 144             // this code is used for AVX as well because our backend correctly ensures that
 145             // VEX-prefixed instructions are emitted if AVX is supported
 146             emitSSE41Compare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
 147         }
 148 
 149         emit8ByteCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
 150         emitTailCompares(masm, result, array1, array2, length, trueLabel, falseLabel);
 151 
 152         // Return true
 153         masm.bind(trueLabel);
 154         masm.movl(result, 1);
 155         masm.jmpb(done);
 156 
 157         // Return false
 158         masm.bind(falseLabel);
 159         masm.xorl(result, result);
 160 
 161         // That's it
 162         masm.bind(done);
 163     }
 164 
 165     /**
 166      * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
 167      *
 168      * @param target target description of the underlying architecture
 169      * @return true if the underlying architecture supports SSE 4.1
 170      */
 171     private static boolean supportsSSE41(TargetDescription target) {
 172         AMD64 arch = (AMD64) target.arch;
 173         return arch.getFeatures().contains(CPUFeature.SSE4_1);
 174     }
 175 
 176     /**
 177      * Vector size used in {@link #emitSSE41Compare}.
 178      */
 179     private static final int SSE4_1_VECTOR_SIZE = 16;
 180 
 181     /**
 182      * Emits code that uses SSE4.1 128-bit (16-byte) vector compares.
 183      */
 184     private void emitSSE41Compare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
 185         assert supportsSSE41(crb.target);
 186 
 187         Register vector1 = asRegister(vectorTemp1, AMD64Kind.DOUBLE);
 188         Register vector2 = asRegister(vectorTemp2, AMD64Kind.DOUBLE);
 189 
 190         Label loop = new Label();
 191         Label compareTail = new Label();
 192 
 193         boolean requiresNaNCheck = kind.isNumericFloat();
 194         Label loopCheck = new Label();
 195         Label nanCheck = new Label();
 196 
 197         // Compare 16-byte vectors
 198         masm.andl(result, SSE4_1_VECTOR_SIZE - 1); // tail count (in bytes)
 199         masm.andl(length, ~(SSE4_1_VECTOR_SIZE - 1)); // vector count (in bytes)
 200         masm.jcc(ConditionFlag.Zero, compareTail);
 201 
 202         masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
 203         masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
 204         masm.negq(length);
 205 
 206         // Align the main loop
 207         masm.align(crb.target.wordSize * 2);
 208         masm.bind(loop);
 209         masm.movdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
 210         masm.movdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
 211         masm.pxor(vector1, vector2);
 212         masm.ptest(vector1, vector1);
 213         masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
 214 
 215         masm.bind(loopCheck);
 216         masm.addq(length, SSE4_1_VECTOR_SIZE);
 217         masm.jcc(ConditionFlag.NotZero, loop);
 218 
 219         masm.testl(result, result);
 220         masm.jcc(ConditionFlag.Zero, trueLabel);
 221 
 222         if (requiresNaNCheck) {
 223             Label unalignedCheck = new Label();
 224             masm.jmpb(unalignedCheck);
 225             masm.bind(nanCheck);
 226             emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, SSE4_1_VECTOR_SIZE);
 227             masm.jmpb(loopCheck);
 228             masm.bind(unalignedCheck);
 229         }
 230 
 231         /*
 232          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 233          * array.
 234          */
 235         masm.movdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
 236         masm.movdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
 237         masm.pxor(vector1, vector2);
 238         masm.ptest(vector1, vector1);
 239         if (requiresNaNCheck) {
 240             masm.jcc(ConditionFlag.Zero, trueLabel);
 241             emitFloatCompareWithinRange(crb, masm, array1, array2, result, -SSE4_1_VECTOR_SIZE, falseLabel, SSE4_1_VECTOR_SIZE);
 242         } else {
 243             masm.jcc(ConditionFlag.NotZero, falseLabel);
 244         }
 245         masm.jmp(trueLabel);
 246 
 247         masm.bind(compareTail);
 248         masm.movl(length, result);
 249     }
 250 
 251     /**
 252      * Returns whether the underlying AMD64 architecture supports AVX2 instructions.
 253      *
 254      * @param target target description of the underlying architecture
 255      * @return true if the underlying architecture supports AVX2
 256      */
 257     private static boolean supportsAVX2(TargetDescription target) {
 258         AMD64 arch = (AMD64) target.arch;
 259         return arch.getFeatures().contains(CPUFeature.AVX2);
 260     }
 261 
 262     /**
 263      * Vector size used in {@link #emitAVXCompare}.
 264      */
 265     private static final int AVX_VECTOR_SIZE = 32;
 266 
 267     private void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
 268         assert supportsAVX2(crb.target);
 269 
 270         Register vector1 = asRegister(vectorTemp1, AMD64Kind.DOUBLE);
 271         Register vector2 = asRegister(vectorTemp2, AMD64Kind.DOUBLE);
 272 
 273         Label loop = new Label();
 274         Label compareTail = new Label();
 275 
 276         boolean requiresNaNCheck = kind.isNumericFloat();
 277         Label loopCheck = new Label();
 278         Label nanCheck = new Label();
 279 
 280         // Compare 32-byte vectors
 281         masm.andl(result, AVX_VECTOR_SIZE - 1); // tail count (in bytes)
 282         masm.andl(length, ~(AVX_VECTOR_SIZE - 1)); // vector count (in bytes)
 283         masm.jcc(ConditionFlag.Zero, compareTail);
 284 
 285         masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
 286         masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
 287         masm.negq(length);
 288 
 289         // Align the main loop
 290         masm.align(crb.target.wordSize * 2);
 291         masm.bind(loop);
 292         masm.vmovdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
 293         masm.vmovdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
 294         masm.vpxor(vector1, vector1, vector2);
 295         masm.vptest(vector1, vector1);
 296         masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
 297 
 298         masm.bind(loopCheck);
 299         masm.addq(length, AVX_VECTOR_SIZE);
 300         masm.jcc(ConditionFlag.NotZero, loop);
 301 
 302         masm.testl(result, result);
 303         masm.jcc(ConditionFlag.Zero, trueLabel);
 304 
 305         if (requiresNaNCheck) {
 306             Label unalignedCheck = new Label();
 307             masm.jmpb(unalignedCheck);
 308             masm.bind(nanCheck);
 309             emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, AVX_VECTOR_SIZE);
 310             masm.jmpb(loopCheck);
 311             masm.bind(unalignedCheck);
 312         }
 313 
 314         /*
 315          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 316          * array.
 317          */
 318         masm.vmovdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
 319         masm.vmovdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
 320         masm.vpxor(vector1, vector1, vector2);
 321         masm.vptest(vector1, vector1);
 322         if (requiresNaNCheck) {
 323             masm.jcc(ConditionFlag.Zero, trueLabel);
 324             emitFloatCompareWithinRange(crb, masm, array1, array2, result, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
 325         } else {
 326             masm.jcc(ConditionFlag.NotZero, falseLabel);
 327         }
 328         masm.jmp(trueLabel);
 329 
 330         masm.bind(compareTail);
 331         masm.movl(length, result);
 332     }
 333 
 334     /**
 335      * Vector size used in {@link #emit8ByteCompare}.
 336      */
 337     private static final int VECTOR_SIZE = 8;
 338 
 339     /**
 340      * Emits code that uses 8-byte vector compares.
 341      */
 342     private void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
 343         Label loop = new Label();
 344         Label compareTail = new Label();
 345 
 346         boolean requiresNaNCheck = kind.isNumericFloat();
 347         Label loopCheck = new Label();
 348         Label nanCheck = new Label();
 349 
 350         Register temp = asRegister(temp4);
 351 
 352         masm.andl(result, VECTOR_SIZE - 1); // tail count (in bytes)
 353         masm.andl(length, ~(VECTOR_SIZE - 1));  // vector count (in bytes)
 354         masm.jcc(ConditionFlag.Zero, compareTail);
 355 
 356         masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
 357         masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
 358         masm.negq(length);
 359 
 360         // Align the main loop
 361         masm.align(crb.target.wordSize * 2);
 362         masm.bind(loop);
 363         masm.movq(temp, new AMD64Address(array1, length, Scale.Times1, 0));
 364         masm.cmpq(temp, new AMD64Address(array2, length, Scale.Times1, 0));
 365         masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);
 366 
 367         masm.bind(loopCheck);
 368         masm.addq(length, VECTOR_SIZE);
 369         masm.jccb(ConditionFlag.NotZero, loop);
 370 
 371         masm.testl(result, result);
 372         masm.jcc(ConditionFlag.Zero, trueLabel);
 373 
 374         if (requiresNaNCheck) {
 375             // The NaN check is the slow path and is hence placed outside of the main loop.
 376             Label unalignedCheck = new Label();
 377             masm.jmpb(unalignedCheck);
 378             masm.bind(nanCheck);
 379             // At most two iterations, unroll in the emitted code.
 380             for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
 381                 emitFloatCompare(masm, array1, array2, length, offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
 382             }
 383             masm.jmpb(loopCheck);
 384             masm.bind(unalignedCheck);
 385         }
 386 
 387         /*
 388          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 389          * array.
 390          */
 391         masm.movq(temp, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
 392         masm.cmpq(temp, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
 393         if (requiresNaNCheck) {
 394             masm.jcc(ConditionFlag.Equal, trueLabel);
 395             // At most two iterations, unroll in the emitted code.
 396             for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
 397                 emitFloatCompare(masm, array1, array2, result, -VECTOR_SIZE + offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
 398             }
 399         } else {
 400             masm.jccb(ConditionFlag.NotEqual, falseLabel);
 401         }
 402         masm.jmpb(trueLabel);
 403 
 404         masm.bind(compareTail);
 405         masm.movl(length, result);
 406     }
 407 
 408     /**
 409      * Emits code to compare the remaining 1 to 7 bytes.
 410      */
 411     private void emitTailCompares(AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
 412         Label compare2Bytes = new Label();
 413         Label compare1Byte = new Label();
 414 
 415         Register temp = asRegister(temp4);
 416 
 417         if (kind.getByteCount() <= 4) {
 418             // Compare trailing 4 bytes, if any.
 419             masm.testl(result, 4);
 420             masm.jccb(ConditionFlag.Zero, compare2Bytes);
 421             masm.movl(temp, new AMD64Address(array1, 0));
 422             masm.cmpl(temp, new AMD64Address(array2, 0));
 423             if (kind == JavaKind.Float) {
 424                 masm.jccb(ConditionFlag.Equal, trueLabel);
 425                 emitFloatCompare(masm, array1, array2, Register.None, 0, falseLabel, true);
 426                 masm.jmpb(trueLabel);
 427             } else {
 428                 masm.jccb(ConditionFlag.NotEqual, falseLabel);
 429             }
 430             if (kind.getByteCount() <= 2) {
 431                 // Move array pointers forward.
 432                 masm.leaq(array1, new AMD64Address(array1, 4));
 433                 masm.leaq(array2, new AMD64Address(array2, 4));
 434 
 435                 // Compare trailing 2 bytes, if any.
 436                 masm.bind(compare2Bytes);
 437                 masm.testl(result, 2);
 438                 masm.jccb(ConditionFlag.Zero, compare1Byte);
 439                 masm.movzwl(temp, new AMD64Address(array1, 0));
 440                 masm.movzwl(length, new AMD64Address(array2, 0));
 441                 masm.cmpl(temp, length);
 442                 masm.jccb(ConditionFlag.NotEqual, falseLabel);
 443 
 444                 // The one-byte tail compare is only required for boolean and byte arrays.
 445                 if (kind.getByteCount() <= 1) {
 450                     // Compare trailing byte, if any.
 451                     masm.bind(compare1Byte);
 452                     masm.testl(result, 1);
 453                     masm.jccb(ConditionFlag.Zero, trueLabel);
 454                     masm.movzbl(temp, new AMD64Address(array1, 0));
 455                     masm.movzbl(length, new AMD64Address(array2, 0));
 456                     masm.cmpl(temp, length);
 457                     masm.jccb(ConditionFlag.NotEqual, falseLabel);
 458                 } else {
 459                     masm.bind(compare1Byte);
 460                 }
 461             } else {
 462                 masm.bind(compare2Bytes);
 463             }
 464         }
 465     }
 466 
 467     /**
 468      * Emits code to fall through if {@code src} is NaN, otherwise jump to {@code branchIfNonNaN}.
 469      */
 470     private void emitNaNCheck(AMD64MacroAssembler masm, AMD64Address src, Label branchIfNonNaN) {
 471         assert kind.isNumericFloat();
 472         Register tempXMMReg = asRegister(tempXMM);
 473         if (kind == JavaKind.Float) {
 474             masm.movflt(tempXMMReg, src);
 475         } else {
 476             masm.movdbl(tempXMMReg, src);
 477         }
 478         SSEOp.UCOMIS.emit(masm, kind == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
 479         masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
 480     }
 481 
 482     /**
 483      * Emits code that checks whether two floating-point values are bitwise equal or are both NaN.
 484      */
 485     private void emitFloatCompare(AMD64MacroAssembler masm, Register base1, Register base2, Register index, int offset, Label falseLabel, boolean skipBitwiseCompare) {
 486         AMD64Address address1 = new AMD64Address(base1, index, Scale.Times1, offset);
 487         AMD64Address address2 = new AMD64Address(base2, index, Scale.Times1, offset);
 488 
 489         Label bitwiseEqual = new Label();
 490 
 491         if (!skipBitwiseCompare) {
 492             // Bitwise compare
 493             Register temp = asRegister(temp4);
 494 
 495             if (kind == JavaKind.Float) {
 496                 masm.movl(temp, address1);
 497                 masm.cmpl(temp, address2);
 498             } else {
 499                 masm.movq(temp, address1);
 500                 masm.cmpq(temp, address2);
 501             }
 502             masm.jccb(ConditionFlag.Equal, bitwiseEqual);
 503         }
 504 
 505         emitNaNCheck(masm, address1, falseLabel);
 506         emitNaNCheck(masm, address2, falseLabel);
 507 
 508         masm.bind(bitwiseEqual);
 509     }
 510 
 511     /**
 512      * Emits code that compares floating-point values for equality over a range of bytes.
 513      */
 514     private void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register base1, Register base2, Register index, int offset, Label falseLabel, int range) {
 515         assert kind.isNumericFloat();
 516         Label loop = new Label();
 517         Register i = asRegister(temp5);
 518 
 519         masm.movq(i, range);
 520         masm.negq(i);
 521         // Align the main loop
 522         masm.align(crb.target.wordSize * 2);
 523         masm.bind(loop);
 524         emitFloatCompare(masm, base1, base2, index, offset, falseLabel, kind.getByteCount() == range);
 525         masm.addq(index, kind.getByteCount());
 526         masm.addq(i, kind.getByteCount());
 527         masm.jccb(ConditionFlag.NotZero, loop);
 528         // Floats within the range are equal; revert the change to the index register
 529         masm.subq(index, range);
 530     }
 531 }


   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.lir.amd64;
  26 
  27 import jdk.vm.ci.amd64.AMD64;
  28 import jdk.vm.ci.amd64.AMD64.CPUFeature;
  29 import jdk.vm.ci.amd64.AMD64Kind;
  30 import jdk.vm.ci.code.Register;
  31 import jdk.vm.ci.code.TargetDescription;
  32 import jdk.vm.ci.meta.JavaKind;
  33 import jdk.vm.ci.meta.Value;
  34 import org.graalvm.compiler.asm.Label;
  35 import org.graalvm.compiler.asm.amd64.AMD64Address;
  36 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
  37 import org.graalvm.compiler.asm.amd64.AMD64Assembler;
  38 import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
  39 import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
  40 import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
  41 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
  42 import org.graalvm.compiler.asm.amd64.AVXKind;
  43 import org.graalvm.compiler.core.common.LIRKind;
  44 import org.graalvm.compiler.core.common.NumUtil;
  45 import org.graalvm.compiler.lir.LIRInstructionClass;
  46 import org.graalvm.compiler.lir.Opcode;
  47 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
  48 import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
  49 
  50 import static jdk.vm.ci.code.ValueUtil.asRegister;
  51 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
  52 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
  53 
  54 /**
  55  * Emits code that compares two arrays of the same length. If the CPU supports vector
  56  * instructions, specialized code is emitted to leverage them.
  57  */
  58 @Opcode("ARRAY_EQUALS")
  59 public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
  60     public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);
  61 
  62     private final JavaKind kind;
  63     private final int arrayBaseOffset;
  64     private final int arrayIndexScale;
  65     private final int constantByteLength;
  66 
  67     @Def({REG}) private Value resultValue;
  68     @Alive({REG}) private Value array1Value;
  69     @Alive({REG}) private Value array2Value;
  70     @Alive({REG}) private Value lengthValue;
  71     @Temp({REG}) private Value temp1;
  72     @Temp({REG}) private Value temp2;
  73     @Temp({REG}) private Value temp3;
  74     @Temp({REG}) private Value temp4;
  75 
  76     @Temp({REG, ILLEGAL}) private Value temp5;
  77     @Temp({REG, ILLEGAL}) private Value tempXMM;
  78 
  79     @Temp({REG, ILLEGAL}) private Value vectorTemp1;
  80     @Temp({REG, ILLEGAL}) private Value vectorTemp2;
  81     @Temp({REG, ILLEGAL}) private Value vectorTemp3;
  82     @Temp({REG, ILLEGAL}) private Value vectorTemp4;
  83 
  84     public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind, Value result, Value array1, Value array2, Value length,
  85                     int constantLength, boolean directPointers, int maxVectorSize) {
  86         super(TYPE);
  87         this.kind = kind;
  88 
  89         this.arrayBaseOffset = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind);
  90         this.arrayIndexScale = tool.getProviders().getMetaAccess().getArrayIndexScale(kind);
  91 
  92         if (constantLength >= 0 && arrayIndexScale > 1) {
  93             // scale length
  94             this.constantByteLength = constantLength << NumUtil.log2Ceil(arrayIndexScale);
  95         } else {
  96             this.constantByteLength = constantLength;
  97         }
  98 
  99         this.resultValue = result;
 100         this.array1Value = array1;
 101         this.array2Value = array2;
 102         this.lengthValue = length;
 103 
 104         // Allocate some temporaries.
 105         this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
 106         this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
 107         this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
 108         this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
 109 
 110         this.temp5 = kind.isNumericFloat() ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
 111         if (kind == JavaKind.Float) {
 112             this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
 113         } else if (kind == JavaKind.Double) {
 114             this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 115         } else {
 116             this.tempXMM = Value.ILLEGAL;
 117         }
 118 
 119         // We only need the vector temporaries if we generate SSE code.
 120         if (supportsSSE41(tool.target())) {
 121             if (canGenerateConstantLengthCompare(tool.target())) {
 122                 LIRKind lirKind = LIRKind.value(supportsAVX2(tool.target()) && (maxVectorSize < 0 || maxVectorSize >= 32) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE);
 123                 this.vectorTemp1 = tool.newVariable(lirKind);
 124                 this.vectorTemp2 = tool.newVariable(lirKind);
 125                 this.vectorTemp3 = tool.newVariable(lirKind);
 126                 this.vectorTemp4 = tool.newVariable(lirKind);
 127             } else {
 128                 this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 129                 this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
 130                 this.vectorTemp3 = Value.ILLEGAL;
 131                 this.vectorTemp4 = Value.ILLEGAL;
 132             }
 133         } else {
 134             this.vectorTemp1 = Value.ILLEGAL;
 135             this.vectorTemp2 = Value.ILLEGAL;
 136             this.vectorTemp3 = Value.ILLEGAL;
 137             this.vectorTemp4 = Value.ILLEGAL;
 138         }
 139     }
 140 
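         // Note: the constant-length fast path below does plain bitwise compares only, so it is
         // limited to integer kinds; float/double arrays need the NaN-aware comparison path.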
 141     private boolean canGenerateConstantLengthCompare(TargetDescription target) {
 142         return constantByteLength >= 0 && kind.isNumericInteger() && supportsSSE41(target);
 143     }
 144 
 145     @Override
 146     public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
 147         Register result = asRegister(resultValue);
 148         Register array1 = asRegister(temp1);
 149         Register array2 = asRegister(temp2);
 150 
 151         Label trueLabel = new Label();
 152         Label falseLabel = new Label();
 153         Label done = new Label();
 154 
 155         // Load array base addresses.
 156         masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset));
 157         masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset));
 158 
 159         if (canGenerateConstantLengthCompare(crb.target)) {
 160             emitConstantLengthArrayCompareBytes(masm, array1, array2, asRegister(temp3), asRegister(temp4),
 161                             new Register[]{asRegister(vectorTemp1), asRegister(vectorTemp2), asRegister(vectorTemp3), asRegister(vectorTemp4)},
 162                             falseLabel, constantByteLength, AVXKind.getRegisterSize(vectorTemp1).getBytes());
 163         } else {
 164             Register length = asRegister(temp3);
 165 
 166             // Get array length in bytes.
 167             masm.movl(length, asRegister(lengthValue));
 168 
 169             if (arrayIndexScale > 1) {
 170                 masm.shll(length, NumUtil.log2Ceil(arrayIndexScale)); // scale length
 171             }
 172 
 173             masm.movl(result, length); // copy
 174 
 175             emitArrayCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
 176         }
 177 
 178         // Return true
 179         masm.bind(trueLabel);
 180         masm.movl(result, 1);
 181         masm.jmpb(done);
 182 
 183         // Return false
 184         masm.bind(falseLabel);
 185         masm.xorl(result, result);
 186 
 187         // That's it
 188         masm.bind(done);
 189     }
 190 
 191     private static void emitArrayCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
 192                     Register result, Register array1, Register array2, Register length,
 193                     Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
 194                     Label trueLabel, Label falseLabel) {
 195         if (supportsAVX2(crb.target)) {
 196             emitAVXCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
 197         } else if (supportsSSE41(crb.target)) {
 198             // this code is used for AVX as well because our backend correctly ensures that
 199             // VEX-prefixed instructions are emitted if AVX is supported
 200             emitSSE41Compare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
 201         }
 202         emit8ByteCompare(crb, masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
 203         emitTailCompares(masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
 204     }
 205 
 206     /**
 207      * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
 208      *
 209      * @param target target description of the underlying architecture
 210      * @return true if the underlying architecture supports SSE 4.1
 211      */
 212     private static boolean supportsSSE41(TargetDescription target) {
 213         AMD64 arch = (AMD64) target.arch;
 214         return arch.getFeatures().contains(CPUFeature.SSE4_1);
 215     }
 216 
 217     /**
 218      * Vector size used in {@link #emitSSE41Compare}.
 219      */
 220     private static final int SSE4_1_VECTOR_SIZE = 16;
 221 
 222     /**
 223      * Emits code that uses SSE4.1 128-bit (16-byte) vector compares.
 224      */
 225     private static void emitSSE41Compare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
 226                     Register result, Register array1, Register array2, Register length,
 227                     Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
 228                     Label trueLabel, Label falseLabel) {
 229         assert supportsSSE41(crb.target);
 230 
 231         Register vector1 = asRegister(vectorTemp1);
 232         Register vector2 = asRegister(vectorTemp2);
 233 
 234         Label loop = new Label();
 235         Label compareTail = new Label();
 236 
 237         boolean requiresNaNCheck = kind.isNumericFloat();
 238         Label loopCheck = new Label();
 239         Label nanCheck = new Label();
 240 
 241         // Compare 16-byte vectors
 242         masm.andl(result, SSE4_1_VECTOR_SIZE - 1); // tail count (in bytes)
 243         masm.andl(length, ~(SSE4_1_VECTOR_SIZE - 1)); // vector count (in bytes)
 244         masm.jcc(ConditionFlag.Zero, compareTail);
 245 
 246         masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
 247         masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
 248         masm.negq(length);
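             // array1 and array2 now point at the end of the vectorized region; the loop below
             // runs the negative byte index up towards zero, so the addq that advances the index
             // also serves as the exit test.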
 249 
 250         // Align the main loop
 251         masm.align(crb.target.wordSize * 2);
 252         masm.bind(loop);
 253         masm.movdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
 254         masm.movdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
 255         masm.pxor(vector1, vector2);
 256         masm.ptest(vector1, vector1);
 257         masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
 258 
 259         masm.bind(loopCheck);
 260         masm.addq(length, SSE4_1_VECTOR_SIZE);
 261         masm.jcc(ConditionFlag.NotZero, loop);
 262 
 263         masm.testl(result, result);
 264         masm.jcc(ConditionFlag.Zero, trueLabel);
 265 
 266         if (requiresNaNCheck) {
 267             Label unalignedCheck = new Label();
 268             masm.jmpb(unalignedCheck);
 269             masm.bind(nanCheck);
 270             emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, SSE4_1_VECTOR_SIZE);
 271             masm.jmpb(loopCheck);
 272             masm.bind(unalignedCheck);
 273         }
 274 
 275         /*
 276          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 277          * array.
 278          */
 279         masm.movdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
 280         masm.movdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
 281         masm.pxor(vector1, vector2);
 282         masm.ptest(vector1, vector1);
 283         if (requiresNaNCheck) {
 284             masm.jcc(ConditionFlag.Zero, trueLabel);
 285             emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -SSE4_1_VECTOR_SIZE, falseLabel, SSE4_1_VECTOR_SIZE);
 286         } else {
 287             masm.jcc(ConditionFlag.NotZero, falseLabel);
 288         }
 289         masm.jmp(trueLabel);
 290 
 291         masm.bind(compareTail);
 292         masm.movl(length, result);
 293     }
 294 
 295     /**
 296      * Returns whether the underlying AMD64 architecture supports AVX2 instructions.
 297      *
 298      * @param target target description of the underlying architecture
 299      * @return true if the underlying architecture supports AVX2
 300      */
 301     private static boolean supportsAVX2(TargetDescription target) {
 302         AMD64 arch = (AMD64) target.arch;
 303         return arch.getFeatures().contains(CPUFeature.AVX2);
 304     }
 305 
 306     /**
 307      * Vector size used in {@link #emitAVXCompare}.
 308      */
 309     private static final int AVX_VECTOR_SIZE = 32;
 310 
 311     private static void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result,
 312                     Register array1, Register array2, Register length,
 313                     Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
 314                     Label trueLabel, Label falseLabel) {
 315         assert supportsAVX2(crb.target);
 316 
 317         Register vector1 = asRegister(vectorTemp1);
 318         Register vector2 = asRegister(vectorTemp2);
 319 
 320         Label loop = new Label();
 321         Label compareTail = new Label();
 322 
 323         boolean requiresNaNCheck = kind.isNumericFloat();
 324         Label loopCheck = new Label();
 325         Label nanCheck = new Label();
 326 
 327         // Compare 32-byte vectors
 328         masm.andl(result, AVX_VECTOR_SIZE - 1); // tail count (in bytes)
 329         masm.andl(length, ~(AVX_VECTOR_SIZE - 1)); // vector count (in bytes)
 330         masm.jcc(ConditionFlag.Zero, compareTail);
 331 
 332         masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
 333         masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
 334         masm.negq(length);
 335 
 336         // Align the main loop
 337         masm.align(crb.target.wordSize * 2);
 338         masm.bind(loop);
 339         masm.vmovdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
 340         masm.vmovdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
 341         masm.vpxor(vector1, vector1, vector2);
 342         masm.vptest(vector1, vector1);
 343         masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
 344 
 345         masm.bind(loopCheck);
 346         masm.addq(length, AVX_VECTOR_SIZE);
 347         masm.jcc(ConditionFlag.NotZero, loop);
 348 
 349         masm.testl(result, result);
 350         masm.jcc(ConditionFlag.Zero, trueLabel);
 351 
 352         if (requiresNaNCheck) {
 353             Label unalignedCheck = new Label();
 354             masm.jmpb(unalignedCheck);
 355             masm.bind(nanCheck);
 356             emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, AVX_VECTOR_SIZE);
 357             masm.jmpb(loopCheck);
 358             masm.bind(unalignedCheck);
 359         }
 360 
 361         /*
 362          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 363          * array.
 364          */
 365         masm.vmovdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
 366         masm.vmovdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
 367         masm.vpxor(vector1, vector1, vector2);
 368         masm.vptest(vector1, vector1);
 369         if (requiresNaNCheck) {
 370             masm.jcc(ConditionFlag.Zero, trueLabel);
 371             emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
 372         } else {
 373             masm.jcc(ConditionFlag.NotZero, falseLabel);
 374         }
 375         masm.jmp(trueLabel);
 376 
 377         masm.bind(compareTail);
 378         masm.movl(length, result);
 379     }
 380 
 381     /**
 382      * Vector size used in {@link #emit8ByteCompare}.
 383      */
 384     private static final int VECTOR_SIZE = 8;
 385 
 386     /**
 387      * Emits code that uses 8-byte vector compares.
 388      */
 389     private static void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4,
 390                     Value tempXMM, Label trueLabel, Label falseLabel) {
 391         Label loop = new Label();
 392         Label compareTail = new Label();
 393 
 394         boolean requiresNaNCheck = kind.isNumericFloat();
 395         Label loopCheck = new Label();
 396         Label nanCheck = new Label();
 397 
 398         Register temp = asRegister(temp4);
 399 
 400         masm.andl(result, VECTOR_SIZE - 1); // tail count (in bytes)
 401         masm.andl(length, ~(VECTOR_SIZE - 1));  // vector count (in bytes)
 402         masm.jcc(ConditionFlag.Zero, compareTail);
 403 
 404         masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
 405         masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
 406         masm.negq(length);
 407 
 408         // Align the main loop
 409         masm.align(crb.target.wordSize * 2);
 410         masm.bind(loop);
 411         masm.movq(temp, new AMD64Address(array1, length, Scale.Times1, 0));
 412         masm.cmpq(temp, new AMD64Address(array2, length, Scale.Times1, 0));
 413         masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);
 414 
 415         masm.bind(loopCheck);
 416         masm.addq(length, VECTOR_SIZE);
 417         masm.jccb(ConditionFlag.NotZero, loop);
 418 
 419         masm.testl(result, result);
 420         masm.jcc(ConditionFlag.Zero, trueLabel);
 421 
 422         if (requiresNaNCheck) {
 423             // The NaN check is the slow path and is hence placed outside of the main loop.
 424             Label unalignedCheck = new Label();
 425             masm.jmpb(unalignedCheck);
 426             masm.bind(nanCheck);
 427             // At most two iterations, unroll in the emitted code.
 428             for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
 429                 emitFloatCompare(masm, kind, array1, array2, length, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
 430             }
 431             masm.jmpb(loopCheck);
 432             masm.bind(unalignedCheck);
 433         }
 434 
 435         /*
 436          * Compare the remaining bytes with an unaligned memory load aligned to the end of the
 437          * array.
 438          */
 439         masm.movq(temp, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
 440         masm.cmpq(temp, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
 441         if (requiresNaNCheck) {
 442             masm.jcc(ConditionFlag.Equal, trueLabel);
 443             // At most two iterations, unroll in the emitted code.
 444             for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
 445                 emitFloatCompare(masm, kind, array1, array2, result, temp4, tempXMM, -VECTOR_SIZE + offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
 446             }
 447         } else {
 448             masm.jccb(ConditionFlag.NotEqual, falseLabel);
 449         }
 450         masm.jmpb(trueLabel);
 451 
 452         masm.bind(compareTail);
 453         masm.movl(length, result);
 454     }
 455 
 456     /**
 457      * Emits code to compare the remaining 1 to 7 bytes.
 458      */
 459     private static void emitTailCompares(AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4, Value tempXMM,
 460                     Label trueLabel, Label falseLabel) {
 461         Label compare2Bytes = new Label();
 462         Label compare1Byte = new Label();
 463 
 464         Register temp = asRegister(temp4);
 465 
 466         if (kind.getByteCount() <= 4) {
 467             // Compare trailing 4 bytes, if any.
 468             masm.testl(result, 4);
 469             masm.jccb(ConditionFlag.Zero, compare2Bytes);
 470             masm.movl(temp, new AMD64Address(array1, 0));
 471             masm.cmpl(temp, new AMD64Address(array2, 0));
 472             if (kind == JavaKind.Float) {
 473                 masm.jccb(ConditionFlag.Equal, trueLabel);
 474                 emitFloatCompare(masm, kind, array1, array2, Register.None, temp4, tempXMM, 0, falseLabel, true);
 475                 masm.jmpb(trueLabel);
 476             } else {
 477                 masm.jccb(ConditionFlag.NotEqual, falseLabel);
 478             }
 479             if (kind.getByteCount() <= 2) {
 480                 // Move array pointers forward.
 481                 masm.leaq(array1, new AMD64Address(array1, 4));
 482                 masm.leaq(array2, new AMD64Address(array2, 4));
 483 
 484                 // Compare trailing 2 bytes, if any.
 485                 masm.bind(compare2Bytes);
 486                 masm.testl(result, 2);
 487                 masm.jccb(ConditionFlag.Zero, compare1Byte);
 488                 masm.movzwl(temp, new AMD64Address(array1, 0));
 489                 masm.movzwl(length, new AMD64Address(array2, 0));
 490                 masm.cmpl(temp, length);
 491                 masm.jccb(ConditionFlag.NotEqual, falseLabel);
 492 
 493                 // The one-byte tail compare is only required for boolean and byte arrays.
 494                 if (kind.getByteCount() <= 1) {
 499                     // Compare trailing byte, if any.
 500                     masm.bind(compare1Byte);
 501                     masm.testl(result, 1);
 502                     masm.jccb(ConditionFlag.Zero, trueLabel);
 503                     masm.movzbl(temp, new AMD64Address(array1, 0));
 504                     masm.movzbl(length, new AMD64Address(array2, 0));
 505                     masm.cmpl(temp, length);
 506                     masm.jccb(ConditionFlag.NotEqual, falseLabel);
 507                 } else {
 508                     masm.bind(compare1Byte);
 509                 }
 510             } else {
 511                 masm.bind(compare2Bytes);
 512             }
 513         }
 514     }
 515 
 516     /**
 517      * Emits code to fall through if {@code src} is NaN, otherwise jump to {@code branchIfNonNaN}.
 518      */
 519     private static void emitNaNCheck(AMD64MacroAssembler masm, JavaKind kind, Value tempXMM, AMD64Address src, Label branchIfNonNaN) {
 520         assert kind.isNumericFloat();
 521         Register tempXMMReg = asRegister(tempXMM);
 522         if (kind == JavaKind.Float) {
 523             masm.movflt(tempXMMReg, src);
 524         } else {
 525             masm.movdbl(tempXMMReg, src);
 526         }
 527         SSEOp.UCOMIS.emit(masm, kind == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
 528         masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
 529     }
 530 
 531     /**
 532      * Emits code that checks whether two floating-point values are bitwise equal or are both NaN.
 533      */
 534     private static void emitFloatCompare(AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value tempXMM, int offset, Label falseLabel,
 535                     boolean skipBitwiseCompare) {
 536         AMD64Address address1 = new AMD64Address(base1, index, Scale.Times1, offset);
 537         AMD64Address address2 = new AMD64Address(base2, index, Scale.Times1, offset);
 538 
 539         Label bitwiseEqual = new Label();
 540 
 541         if (!skipBitwiseCompare) {
 542             // Bitwise compare
 543             Register temp = asRegister(temp4);
 544 
 545             if (kind == JavaKind.Float) {
 546                 masm.movl(temp, address1);
 547                 masm.cmpl(temp, address2);
 548             } else {
 549                 masm.movq(temp, address1);
 550                 masm.cmpq(temp, address2);
 551             }
 552             masm.jccb(ConditionFlag.Equal, bitwiseEqual);
 553         }
 554 
 555         emitNaNCheck(masm, kind, tempXMM, address1, falseLabel);
 556         emitNaNCheck(masm, kind, tempXMM, address2, falseLabel);
 557 
 558         masm.bind(bitwiseEqual);
 559     }
 560 
 561     /**
 562      * Emits code that compares floating-point values for equality over a range of bytes.
 563      */
 564     private static void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value temp5,
 565                     Value tempXMM, int offset, Label falseLabel, int range) {
 566         assert kind.isNumericFloat();
 567         Label loop = new Label();
 568         Register i = asRegister(temp5);
 569 
 570         masm.movq(i, range);
 571         masm.negq(i);
 572         // Align the main loop
 573         masm.align(crb.target.wordSize * 2);
 574         masm.bind(loop);
 575         emitFloatCompare(masm, kind, base1, base2, index, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == range);
 576         masm.addq(index, kind.getByteCount());
 577         masm.addq(i, kind.getByteCount());
 578         masm.jccb(ConditionFlag.NotZero, loop);
 579         // Floats within the range are equal, revert change to the register index
 580         masm.subq(index, range);
 581     }
 582 
 583     /**
 584      * Emits specialized assembly for checking equality of memory regions
 585      * {@code arrayPtr1[0..nBytes]} and {@code arrayPtr2[0..nBytes]}. If they match, execution
 586      * continues directly after the emitted code block, otherwise we jump to {@code noMatch}.
 587      */
 588     private static void emitConstantLengthArrayCompareBytes(
 589                     AMD64MacroAssembler asm,
 590                     Register arrayPtr1,
 591                     Register arrayPtr2,
 592                     Register tmp1,
 593                     Register tmp2,
 594                     Register[] tmpVectors,
 595                     Label noMatch,
 596                     int nBytes,
 597                     int bytesPerVector) {
 598         assert bytesPerVector >= 16;
 599         if (nBytes == 0) {
 600             // do nothing
 601             return;
 602         }
 603         if (nBytes < 16) {
 604             // array is shorter than any vector register, use regular CMP instructions
 605             int movSize = (nBytes < 2) ? 1 : ((nBytes < 4) ? 2 : ((nBytes < 8) ? 4 : 8));
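                 // Two overlapping movSize-byte compares, one at offset 0 and one at
                 // nBytes - movSize, cover the whole region, since movSize is always at
                 // least half of nBytes.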
 606             emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1), movSize);
 607             emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2), movSize);
 608             emitCmpBytes(asm, tmp1, tmp2, movSize);
 609             asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
 610             if (nBytes > movSize) {
 611                 emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1, nBytes - movSize), movSize);
 612                 emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2, nBytes - movSize), movSize);
 613                 emitCmpBytes(asm, tmp1, tmp2, movSize);
 614                 asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
 615             }
 616         } else if (nBytes < 32 && bytesPerVector >= 32) {
 617             // we could use YMM registers, but the array is too short, force XMM registers
 618             int bytesPerXMMVector = AVXKind.AVXSize.XMM.getBytes();
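                 // The same overlapping-load trick with 16-byte XMM registers: compare the first
                 // 16 bytes and, if nBytes > 16, also the last 16 bytes of the region.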
 619             AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], new AMD64Address(arrayPtr1));
 620             AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[1], new AMD64Address(arrayPtr2));
 621             AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0], tmpVectors[1]);
 622             if (nBytes > bytesPerXMMVector) {
 623                 AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], new AMD64Address(arrayPtr1, nBytes - bytesPerXMMVector));
 624                 AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[3], new AMD64Address(arrayPtr2, nBytes - bytesPerXMMVector));
 625                 AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2], tmpVectors[3]);
 626                 AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2]);
 627                 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 628             }
 629             AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0]);
 630             asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 631         } else if (bytesPerVector >= 32) {
 632             // AVX2 supported, use YMM vectors
 633             assert asm.supports(CPUFeature.AVX2);
 634             int loopCount = nBytes / (bytesPerVector * 2);
 635             int rest = nBytes % (bytesPerVector * 2);
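                 // The main loop compares two vectors (2 * bytesPerVector bytes) per iteration. If
                 // the remainder is smaller than one vector, the last full iteration is peeled off
                 // and the remainder is handled with an extra, overlapping vector load.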
 636             if (loopCount > 0) {
 637                 if (0 < rest && rest < bytesPerVector) {
 638                     loopCount--;
 639                 }
 640                 if (loopCount > 0) {
 641                     if (loopCount > 1) {
 642                         asm.movl(tmp1, loopCount);
 643                     }
 644                     Label loopBegin = new Label();
 645                     asm.bind(loopBegin);
 646                     asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 647                     asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 648                     asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
 649                     asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
 650                     asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
 651                     asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
 652                     asm.vptest(tmpVectors[0], tmpVectors[0]);
 653                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 654                     asm.vptest(tmpVectors[2], tmpVectors[2]);
 655                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 656                     asm.addq(arrayPtr1, bytesPerVector * 2);
 657                     asm.addq(arrayPtr2, bytesPerVector * 2);
 658                     if (loopCount > 1) {
 659                         asm.decrementl(tmp1);
 660                         asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
 661                     }
 662                 }
 663                 if (0 < rest && rest < bytesPerVector) {
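                          // re-emit the peeled iteration, then cover the remaining rest bytes with a
                          // final load at offset bytesPerVector + rest that overlaps the peeled iteration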
 664                     asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 665                     asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 666                     asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
 667                     asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
 668                     asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
 669                     asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
 670                     asm.vptest(tmpVectors[0], tmpVectors[0]);
 671                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 672                     asm.vptest(tmpVectors[2], tmpVectors[2]);
 673                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 674                     asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
 675                     asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
 676                     asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
 677                     asm.vptest(tmpVectors[0], tmpVectors[0]);
 678                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 679                 }
 680             }
 681             if (rest >= bytesPerVector) {
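                      // the remainder is at least one full vector; if it is larger, a second, possibly
                      // overlapping vector load at offset rest - bytesPerVector covers what is left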
 682                 asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 683                 asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 684                 asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
 685                 if (rest > bytesPerVector) {
 686                     asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
 687                     asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
 688                     asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
 689                     asm.vptest(tmpVectors[2], tmpVectors[2]);
 690                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 691                 }
 692                 asm.vptest(tmpVectors[0], tmpVectors[0]);
 693                 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 694             }
 695         } else {
 696             // on AVX or SSE, use XMM vectors
 697             int loopCount = nBytes / (bytesPerVector * 2);
 698             int rest = nBytes % (bytesPerVector * 2);
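                  // same structure as the YMM path above, but with 16-byte XMM loads (movdqu/pxor/ptest)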
 699             if (loopCount > 0) {
 700                 if (0 < rest && rest < bytesPerVector) {
 701                     loopCount--;
 702                 }
 703                 if (loopCount > 0) {
 704                     if (loopCount > 1) {
 705                         asm.movl(tmp1, loopCount);
 706                     }
 707                     Label loopBegin = new Label();
 708                     asm.bind(loopBegin);
 709                     asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 710                     asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 711                     asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
 712                     asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
 713                     asm.pxor(tmpVectors[0], tmpVectors[1]);
 714                     asm.pxor(tmpVectors[2], tmpVectors[3]);
 715                     asm.ptest(tmpVectors[0], tmpVectors[0]);
 716                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 717                     asm.ptest(tmpVectors[2], tmpVectors[2]);
 718                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 719                     asm.addq(arrayPtr1, bytesPerVector * 2);
 720                     asm.addq(arrayPtr2, bytesPerVector * 2);
 721                     if (loopCount > 1) {
 722                         asm.decrementl(tmp1);
 723                         asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
 724                     }
 725                 }
 726                 if (0 < rest && rest < bytesPerVector) {
 727                     asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 728                     asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 729                     asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
 730                     asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
 731                     asm.pxor(tmpVectors[0], tmpVectors[1]);
 732                     asm.pxor(tmpVectors[2], tmpVectors[3]);
 733                     asm.ptest(tmpVectors[0], tmpVectors[0]);
 734                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 735                     asm.ptest(tmpVectors[2], tmpVectors[2]);
 736                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 737                     asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
 738                     asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
 739                     asm.pxor(tmpVectors[0], tmpVectors[1]);
 740                     asm.ptest(tmpVectors[0], tmpVectors[0]);
 741                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 742                 }
 743             }
 744             if (rest >= bytesPerVector) {
 745                 asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
 746                 asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
 747                 asm.pxor(tmpVectors[0], tmpVectors[1]);
 748                 if (rest > bytesPerVector) {
 749                     asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
 750                     asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
 751                     asm.pxor(tmpVectors[2], tmpVectors[3]);
 752                     asm.ptest(tmpVectors[2], tmpVectors[2]);
 753                     asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 754                 }
 755                 asm.ptest(tmpVectors[0], tmpVectors[0]);
 756                 asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
 757             }
 758         }
 759     }
 760 
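          /**
           * Emits a load of {@code size} bytes from {@code src} into {@code dst}; 1- and 2-byte loads
           * are zero-extended to 32 bits.
           */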
 761     private static void emitMovBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
 762         switch (size) {
 763             case 1:
 764                 asm.movzbl(dst, src);
 765                 break;
 766             case 2:
 767                 asm.movzwl(dst, src);
 768                 break;
 769             case 4:
 770                 asm.movl(dst, src);
 771                 break;
 772             case 8:
 773                 asm.movq(dst, src);
 774                 break;
 775             default:
 776                 throw new IllegalStateException();
 777         }
 778     }
 779 
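          /**
           * Emits a register-register comparison: a 32-bit cmp for sizes below 8 bytes, a 64-bit cmp
           * for a size of 8 bytes.
           */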
 780     private static void emitCmpBytes(AMD64MacroAssembler asm, Register dst, Register src, int size) {
 781         if (size < 8) {
 782             asm.cmpl(dst, src);
 783         } else {
 784             asm.cmpq(dst, src);
 785         }
 786     }
 787 }