/*
 * Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package org.graalvm.compiler.lir.amd64;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.TargetDescription;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

/**
 * Emits code which compares two arrays of the same length. If the CPU supports any vector
 * instructions, specialized code is emitted to leverage these instructions.
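 * <p>
 * A minimal sketch of how a backend might create this operation. This is an illustrative
 * example only; the {@code tool}, {@code array1}, {@code array2} and {@code length} values
 * are assumed to come from the surrounding LIR generator and are not part of this class:
 *
 * <pre>
 * Variable result = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
 * // -1: element count not known at compile time; false: operands are object pointers, so
 * // the array base offset is applied; -1: no MaxVectorSize limit
 * tool.append(new AMD64ArrayEqualsOp(tool, JavaKind.Byte, result, array1, array2, length,
 *                 -1, false, -1));
 * </pre>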
 */
@Opcode("ARRAY_EQUALS")
public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);

    private final JavaKind kind;
    private final int arrayBaseOffset;
    private final int arrayIndexScale;
    private final int constantByteLength;

    @Def({REG}) private Value resultValue;
    @Alive({REG}) private Value array1Value;
    @Alive({REG}) private Value array2Value;
    @Alive({REG}) private Value lengthValue;
    @Temp({REG}) private Value temp1;
    @Temp({REG}) private Value temp2;
    @Temp({REG}) private Value temp3;
    @Temp({REG}) private Value temp4;

    @Temp({REG, ILLEGAL}) private Value temp5;
    @Temp({REG, ILLEGAL}) private Value tempXMM;

    @Temp({REG, ILLEGAL}) private Value vectorTemp1;
    @Temp({REG, ILLEGAL}) private Value vectorTemp2;
    @Temp({REG, ILLEGAL}) private Value vectorTemp3;
    @Temp({REG, ILLEGAL}) private Value vectorTemp4;

    public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind, Value result, Value array1, Value array2, Value length,
                    int constantLength, boolean directPointers, int maxVectorSize) {
        super(TYPE);
        this.kind = kind;

        this.arrayBaseOffset = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind);
        this.arrayIndexScale = tool.getProviders().getMetaAccess().getArrayIndexScale(kind);

        if (constantLength >= 0 && arrayIndexScale > 1) {
            // scale length
            this.constantByteLength = constantLength << NumUtil.log2Ceil(arrayIndexScale);
        } else {
            this.constantByteLength = constantLength;
        }

        this.resultValue = result;
        this.array1Value = array1;
        this.array2Value = array2;
        this.lengthValue = length;

        // Allocate some temporaries.
        this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));

        this.temp5 = kind.isNumericFloat() ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        if (kind == JavaKind.Float) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
        } else if (kind == JavaKind.Double) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.tempXMM = Value.ILLEGAL;
        }

        // We only need the vector temporaries if we generate SSE code.
        if (supportsSSE41(tool.target())) {
            if (canGenerateConstantLengthCompare(tool.target())) {
                LIRKind lirKind = LIRKind.value(supportsAVX2(tool.target()) && (maxVectorSize < 0 || maxVectorSize >= 32) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE);
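                // Four vector temporaries allow the constant-length compare to check two
                // vectors per loop iteration (see emitConstantLengthArrayCompareBytes).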
                this.vectorTemp1 = tool.newVariable(lirKind);
                this.vectorTemp2 = tool.newVariable(lirKind);
                this.vectorTemp3 = tool.newVariable(lirKind);
                this.vectorTemp4 = tool.newVariable(lirKind);
            } else {
                this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp3 = Value.ILLEGAL;
                this.vectorTemp4 = Value.ILLEGAL;
            }
        } else {
            this.vectorTemp1 = Value.ILLEGAL;
            this.vectorTemp2 = Value.ILLEGAL;
            this.vectorTemp3 = Value.ILLEGAL;
            this.vectorTemp4 = Value.ILLEGAL;
        }
    }

    private boolean canGenerateConstantLengthCompare(TargetDescription target) {
        return constantByteLength >= 0 && kind.isNumericInteger() && supportsSSE41(target);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Register result = asRegister(resultValue);
        Register array1 = asRegister(temp1);
        Register array2 = asRegister(temp2);

        Label trueLabel = new Label();
        Label falseLabel = new Label();
        Label done = new Label();

        // Load array base addresses.
        masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset));
        masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset));

        if (canGenerateConstantLengthCompare(crb.target)) {
            emitConstantLengthArrayCompareBytes(masm, array1, array2, asRegister(temp3), asRegister(temp4),
                            new Register[]{asRegister(vectorTemp1), asRegister(vectorTemp2), asRegister(vectorTemp3), asRegister(vectorTemp4)},
                            falseLabel, constantByteLength, AVXKind.getRegisterSize(vectorTemp1).getBytes());
        } else {
            Register length = asRegister(temp3);

            // Get array length in bytes.
            masm.movl(length, asRegister(lengthValue));

            if (arrayIndexScale > 1) {
                masm.shll(length, NumUtil.log2Ceil(arrayIndexScale)); // scale length
            }

            masm.movl(result, length); // copy

            emitArrayCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        }

        // Return true
        masm.bind(trueLabel);
        masm.movl(result, 1);
        masm.jmpb(done);

        // Return false
        masm.bind(falseLabel);
        masm.xorl(result, result);

        // That's it
        masm.bind(done);
    }

    private static void emitArrayCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
                    Register result, Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        if (supportsAVX2(crb.target)) {
            emitAVXCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        } else if (supportsSSE41(crb.target)) {
            // this code is used for AVX as well because our backend correctly ensures that
            // VEX-prefixed instructions are emitted if AVX is supported
            emitSSE41Compare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        }
        emit8ByteCompare(crb, masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
        emitTailCompares(masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
    }

    /**
     * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports SSE 4.1
     */
    private static boolean supportsSSE41(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.SSE4_1);
    }

    /**
     * Vector size used in {@link #emitSSE41Compare}.
     */
    private static final int SSE4_1_VECTOR_SIZE = 16;

    /**
     * Emits code that uses SSE4.1 128-bit (16-byte) vector compares.
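     * <p>
     * The emitted loop first advances both array pointers past the vectorized region and then
     * counts a negative byte index up to zero, so a single {@code add} both advances the index
     * and produces the zero flag that terminates the loop.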
     */
    private static void emitSSE41Compare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
                    Register result, Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        assert supportsSSE41(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 16-byte vectors
        masm.andl(result, SSE4_1_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(SSE4_1_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.movdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, SSE4_1_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, SSE4_1_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.movdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -SSE4_1_VECTOR_SIZE, falseLabel, SSE4_1_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Returns whether the underlying AMD64 architecture supports AVX2 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports AVX2
     */
    private static boolean supportsAVX2(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.AVX2);
    }

    /**
     * Vector size used in {@link #emitAVXCompare}.
     */
    private static final int AVX_VECTOR_SIZE = 32;

    private static void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result,
                    Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        assert supportsAVX2(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 32-byte vectors
        masm.andl(result, AVX_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(AVX_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.vmovdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.vmovdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, AVX_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, AVX_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.vmovdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vmovdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Vector size used in {@link #emit8ByteCompare}.
     */
    private static final int VECTOR_SIZE = 8;

    /**
     * Emits code that uses 8-byte vector compares.
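     * <p>
     * For float and double arrays, a mismatching 8-byte block is re-examined element-wise so
     * that two NaN values at the same index still compare as equal; that re-check is kept out
     * of the main loop because it is the slow path.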
     */
    private static void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4,
                    Value tempXMM, Label trueLabel, Label falseLabel) {
        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        Register temp = asRegister(temp4);

        masm.andl(result, VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movq(temp, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.cmpq(temp, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, VECTOR_SIZE);
        masm.jccb(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // NaN check is slow path and hence placed outside of the main loop.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, kind, array1, array2, length, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movq(temp, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
        masm.cmpq(temp, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Equal, trueLabel);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, kind, array1, array2, result, temp4, tempXMM, -VECTOR_SIZE + offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
        } else {
            masm.jccb(ConditionFlag.NotEqual, falseLabel);
        }
        masm.jmpb(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Emits code to compare the remaining 1 to 7 tail bytes: a trailing 4-byte, 2-byte and
     * 1-byte chunk, as applicable for the element kind.
     */
    private static void emitTailCompares(AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4, Value tempXMM,
                    Label trueLabel, Label falseLabel) {
        Label compare2Bytes = new Label();
        Label compare1Byte = new Label();

        Register temp = asRegister(temp4);

        if (kind.getByteCount() <= 4) {
            // Compare trailing 4 bytes, if any.
            masm.testl(result, 4);
            masm.jccb(ConditionFlag.Zero, compare2Bytes);
            masm.movl(temp, new AMD64Address(array1, 0));
            masm.cmpl(temp, new AMD64Address(array2, 0));
            if (kind == JavaKind.Float) {
                masm.jccb(ConditionFlag.Equal, trueLabel);
                emitFloatCompare(masm, kind, array1, array2, Register.None, temp4, tempXMM, 0, falseLabel, true);
                masm.jmpb(trueLabel);
            } else {
                masm.jccb(ConditionFlag.NotEqual, falseLabel);
            }
            if (kind.getByteCount() <= 2) {
                // Move array pointers forward.
                masm.leaq(array1, new AMD64Address(array1, 4));
                masm.leaq(array2, new AMD64Address(array2, 4));

                // Compare trailing 2 bytes, if any.
                masm.bind(compare2Bytes);
                masm.testl(result, 2);
                masm.jccb(ConditionFlag.Zero, compare1Byte);
                masm.movzwl(temp, new AMD64Address(array1, 0));
                masm.movzwl(length, new AMD64Address(array2, 0));
                masm.cmpl(temp, length);
                masm.jccb(ConditionFlag.NotEqual, falseLabel);

                // The one-byte tail compare is only required for boolean and byte arrays.
                if (kind.getByteCount() <= 1) {
                    // Move array pointers forward before we compare the last trailing byte.
                    masm.leaq(array1, new AMD64Address(array1, 2));
                    masm.leaq(array2, new AMD64Address(array2, 2));

                    // Compare trailing byte, if any.
                    masm.bind(compare1Byte);
                    masm.testl(result, 1);
                    masm.jccb(ConditionFlag.Zero, trueLabel);
                    masm.movzbl(temp, new AMD64Address(array1, 0));
                    masm.movzbl(length, new AMD64Address(array2, 0));
                    masm.cmpl(temp, length);
                    masm.jccb(ConditionFlag.NotEqual, falseLabel);
                } else {
                    masm.bind(compare1Byte);
                }
            } else {
                masm.bind(compare2Bytes);
            }
        }
    }

    /**
     * Emits code to fall through if {@code src} is NaN, otherwise jumps to
     * {@code branchIfNonNaN}.
     */
    private static void emitNaNCheck(AMD64MacroAssembler masm, JavaKind kind, Value tempXMM, AMD64Address src, Label branchIfNonNaN) {
        assert kind.isNumericFloat();
        Register tempXMMReg = asRegister(tempXMM);
        if (kind == JavaKind.Float) {
            masm.movflt(tempXMMReg, src);
        } else {
            masm.movdbl(tempXMMReg, src);
        }
        SSEOp.UCOMIS.emit(masm, kind == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
        masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
    }

    /**
     * Emits code to compare if two floats are bitwise equal or both NaN.
     */
    private static void emitFloatCompare(AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value tempXMM, int offset, Label falseLabel,
                    boolean skipBitwiseCompare) {
        AMD64Address address1 = new AMD64Address(base1, index, Scale.Times1, offset);
        AMD64Address address2 = new AMD64Address(base2, index, Scale.Times1, offset);

        Label bitwiseEqual = new Label();

        if (!skipBitwiseCompare) {
            // Bitwise compare
            Register temp = asRegister(temp4);

            if (kind == JavaKind.Float) {
                masm.movl(temp, address1);
                masm.cmpl(temp, address2);
            } else {
                masm.movq(temp, address1);
                masm.cmpq(temp, address2);
            }
            masm.jccb(ConditionFlag.Equal, bitwiseEqual);
        }

        emitNaNCheck(masm, kind, tempXMM, address1, falseLabel);
        emitNaNCheck(masm, kind, tempXMM, address2, falseLabel);

        masm.bind(bitwiseEqual);
    }

    /**
     * Emits code to compare float equality within a range.
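     * <p>
     * Two elements are considered equal if they are bitwise equal or if both are NaN (see
     * {@link #emitFloatCompare}).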
     */
    private static void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value temp5,
                    Value tempXMM, int offset, Label falseLabel, int range) {
        assert kind.isNumericFloat();
        Label loop = new Label();
        Register i = asRegister(temp5);

        masm.movq(i, range);
        masm.negq(i);
        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitFloatCompare(masm, kind, base1, base2, index, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == range);
        masm.addq(index, kind.getByteCount());
        masm.addq(i, kind.getByteCount());
        masm.jccb(ConditionFlag.NotZero, loop);
        // Floats within the range are equal; revert the change to the index register.
        masm.subq(index, range);
    }

    /**
     * Emits specialized assembly for checking equality of memory regions
     * {@code arrayPtr1[0..nBytes]} and {@code arrayPtr2[0..nBytes]}. If they match, execution
     * continues directly after the emitted code block, otherwise we jump to {@code noMatch}.
     */
    private static void emitConstantLengthArrayCompareBytes(
                    AMD64MacroAssembler asm,
                    Register arrayPtr1,
                    Register arrayPtr2,
                    Register tmp1,
                    Register tmp2,
                    Register[] tmpVectors,
                    Label noMatch,
                    int nBytes,
                    int bytesPerVector) {
        assert bytesPerVector >= 16;
        if (nBytes == 0) {
            // do nothing
            return;
        }
        if (nBytes < 16) {
            // array is shorter than any vector register, use regular CMP instructions
            int movSize = (nBytes < 2) ? 1 : ((nBytes < 4) ? 2 : ((nBytes < 8) ? 4 : 8));
            emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1), movSize);
            emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2), movSize);
            emitCmpBytes(asm, tmp1, tmp2, movSize);
            asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            if (nBytes > movSize) {
                emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1, nBytes - movSize), movSize);
                emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2, nBytes - movSize), movSize);
                emitCmpBytes(asm, tmp1, tmp2, movSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            }
        } else if (nBytes < 32 && bytesPerVector >= 32) {
            // we could use YMM registers, but the array is too short, force XMM registers
            int bytesPerXMMVector = AVXKind.AVXSize.XMM.getBytes();
            AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], new AMD64Address(arrayPtr1));
            AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[1], new AMD64Address(arrayPtr2));
            AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0], tmpVectors[1]);
            if (nBytes > bytesPerXMMVector) {
                AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], new AMD64Address(arrayPtr1, nBytes - bytesPerXMMVector));
                AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[3], new AMD64Address(arrayPtr2, nBytes - bytesPerXMMVector));
                AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
            AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0]);
            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
        } else if (bytesPerVector >= 32) {
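            // Both vectorized paths below unroll the comparison: each loop iteration checks
            // two vectors, and any remainder is handled with (possibly overlapping) loads
            // aligned to the end of the arrays.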
            // AVX2 supported, use YMM vectors
            assert asm.supports(CPUFeature.AVX2);
            int loopCount = nBytes / (bytesPerVector * 2);
            int rest = nBytes % (bytesPerVector * 2);
            if (loopCount > 0) {
                if (0 < rest && rest < bytesPerVector) {
                    loopCount--;
                }
                if (loopCount > 0) {
                    if (loopCount > 1) {
                        asm.movl(tmp1, loopCount);
                    }
                    Label loopBegin = new Label();
                    asm.bind(loopBegin);
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.addq(arrayPtr1, bytesPerVector * 2);
                    asm.addq(arrayPtr2, bytesPerVector * 2);
                    if (loopCount > 1) {
                        asm.decrementl(tmp1);
                        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
                    }
                }
                if (0 < rest && rest < bytesPerVector) {
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
            }
            if (rest >= bytesPerVector) {
                asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                if (rest > bytesPerVector) {
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                asm.vptest(tmpVectors[0], tmpVectors[0]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        } else {
            // on AVX or SSE, use XMM vectors
            int loopCount = nBytes / (bytesPerVector * 2);
            int rest = nBytes % (bytesPerVector * 2);
            if (loopCount > 0) {
                if (0 < rest && rest < bytesPerVector) {
                    loopCount--;
                }
                if (loopCount > 0) {
                    if (loopCount > 1) {
                        asm.movl(tmp1, loopCount);
                    }
                    Label loopBegin = new Label();
                    asm.bind(loopBegin);
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.addq(arrayPtr1, bytesPerVector * 2);
                    asm.addq(arrayPtr2, bytesPerVector * 2);
                    if (loopCount > 1) {
                        asm.decrementl(tmp1);
                        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
                    }
                }
                if (0 < rest && rest < bytesPerVector) {
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
            }
            if (rest >= bytesPerVector) {
                asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                asm.pxor(tmpVectors[0], tmpVectors[1]);
                if (rest > bytesPerVector) {
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                asm.ptest(tmpVectors[0], tmpVectors[0]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        }
    }

    private static void emitMovBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
        switch (size) {
            case 1:
                asm.movzbl(dst, src);
                break;
            case 2:
                asm.movzwl(dst, src);
                break;
            case 4:
                asm.movl(dst, src);
                break;
            case 8:
                asm.movq(dst, src);
                break;
            default:
                throw new IllegalStateException();
        }
    }

    private static void emitCmpBytes(AMD64MacroAssembler asm, Register dst, Register src, int size) {
        if (size < 8) {
            asm.cmpl(dst, src);
        } else {
            asm.cmpq(dst, src);
        }
    }
}