/*
 * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.TargetDescription;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;

/**
 * Emits code which compares two arrays of the same length. If the CPU supports any vector
 * instructions, specialized code is emitted to leverage these instructions.
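 * <p>
 * Semantically, the emitted code computes the equivalent of the following scalar loop (a
 * sketch for illustration; the generated code compares the arrays in vector-sized chunks,
 * and for float/double it additionally treats two NaN elements as equal):
 *
 * <pre>{@code
 * boolean result = true;
 * for (int i = 0; i < length; i++) {
 *     if (array1[i] != array2[i]) { // bitwise element comparison
 *         result = false;
 *         break;
 *     }
 * }
 * }</pre>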
 */
@Opcode("ARRAY_EQUALS")
public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);

    private final JavaKind kind;
    private final int arrayBaseOffset;
    private final int arrayIndexScale;

    @Def({REG}) protected Value resultValue;
    @Alive({REG}) protected Value array1Value;
    @Alive({REG}) protected Value array2Value;
    @Alive({REG}) protected Value lengthValue;
    @Temp({REG}) protected Value temp1;
    @Temp({REG}) protected Value temp2;
    @Temp({REG}) protected Value temp3;
    @Temp({REG}) protected Value temp4;

    @Temp({REG, ILLEGAL}) protected Value temp5;
    @Temp({REG, ILLEGAL}) protected Value tempXMM;

    @Temp({REG, ILLEGAL}) protected Value vectorTemp1;
    @Temp({REG, ILLEGAL}) protected Value vectorTemp2;

    public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind, Value result, Value array1, Value array2, Value length) {
        super(TYPE);
        this.kind = kind;

        this.arrayBaseOffset = tool.getProviders().getArrayOffsetProvider().arrayBaseOffset(kind);
        this.arrayIndexScale = tool.getProviders().getArrayOffsetProvider().arrayScalingFactor(kind);

        this.resultValue = result;
        this.array1Value = array1;
        this.array2Value = array2;
        this.lengthValue = length;

        // Allocate some temporaries.
        this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));

        this.temp5 = kind.isNumericFloat() ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        if (kind == JavaKind.Float) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
        } else if (kind == JavaKind.Double) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.tempXMM = Value.ILLEGAL;
        }

        // We only need the vector temporaries if we generate SSE code.
        if (supportsSSE41(tool.target())) {
            this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
            this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.vectorTemp1 = Value.ILLEGAL;
            this.vectorTemp2 = Value.ILLEGAL;
        }
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Register result = asRegister(resultValue);
        Register array1 = asRegister(temp1);
        Register array2 = asRegister(temp2);
        Register length = asRegister(temp3);

        Label trueLabel = new Label();
        Label falseLabel = new Label();
        Label done = new Label();

        // Load array base addresses.
        masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset));
        masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset));

        // Get array length in bytes.
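        // The incoming length is in array elements; the shll below converts it to bytes by
        // shifting by log2(arrayIndexScale). For example, for a char[] (arrayIndexScale == 2),
        // a length of 10 elements becomes 20 bytes.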
        masm.movl(length, asRegister(lengthValue));

        if (arrayIndexScale > 1) {
            masm.shll(length, NumUtil.log2Ceil(arrayIndexScale)); // scale length
        }

        masm.movl(result, length); // copy

        if (supportsAVX2(crb.target)) {
            emitAVXCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        } else if (supportsSSE41(crb.target)) {
            // this code is used for AVX as well because our backend correctly ensures that
            // VEX-prefixed instructions are emitted if AVX is supported
            emitSSE41Compare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }

        emit8ByteCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        emitTailCompares(masm, result, array1, array2, length, trueLabel, falseLabel);

        // Return true
        masm.bind(trueLabel);
        masm.movl(result, 1);
        masm.jmpb(done);

        // Return false
        masm.bind(falseLabel);
        masm.xorl(result, result);

        // That's it
        masm.bind(done);
    }

    /**
     * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports SSE 4.1
     */
    private static boolean supportsSSE41(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.SSE4_1);
    }

    /**
     * Vector size used in {@link #emitSSE41Compare}.
     */
    private static final int SSE4_1_VECTOR_SIZE = 16;

    /**
     * Emits code that uses SSE4.1 128-bit (16-byte) vector compares.
     */
    private void emitSSE41Compare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert supportsSSE41(crb.target);

        Register vector1 = asRegister(vectorTemp1, AMD64Kind.DOUBLE);
        Register vector2 = asRegister(vectorTemp2, AMD64Kind.DOUBLE);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 16-byte vectors
        masm.andl(result, SSE4_1_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(SSE4_1_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.movdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
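        // pxor leaves the bitwise difference of the two 16-byte chunks in vector1; ptest
        // sets ZF if and only if that difference is all zeros, i.e. the chunks are identical.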
        masm.bind(loopCheck);
        masm.addq(length, SSE4_1_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, SSE4_1_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.movdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, array1, array2, result, -SSE4_1_VECTOR_SIZE, falseLabel, SSE4_1_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Returns whether the underlying AMD64 architecture supports AVX2 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports AVX2
     */
    private static boolean supportsAVX2(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.AVX2);
    }

    /**
     * Vector size used in {@link #emitAVXCompare}.
     */
    private static final int AVX_VECTOR_SIZE = 32;

    /**
     * Emits code that uses AVX2 256-bit (32-byte) vector compares.
     */
    private void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert supportsAVX2(crb.target);

        Register vector1 = asRegister(vectorTemp1, AMD64Kind.DOUBLE);
        Register vector2 = asRegister(vectorTemp2, AMD64Kind.DOUBLE);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 32-byte vectors
        masm.andl(result, AVX_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(AVX_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.vmovdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.vmovdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
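        // The loop index runs from -length up to zero, so the addq below both advances the
        // index and sets ZF when the end of the arrays is reached, avoiding a separate cmp.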
        masm.bind(loopCheck);
        masm.addq(length, AVX_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, AVX_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.vmovdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vmovdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, array1, array2, result, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Vector size used in {@link #emit8ByteCompare}.
     */
    private static final int VECTOR_SIZE = 8;

    /**
     * Emits code that uses 8-byte vector compares.
     */
    private void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        Register temp = asRegister(temp4);

        masm.andl(result, VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movq(temp, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.cmpq(temp, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, VECTOR_SIZE);
        masm.jccb(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // NaN check is slow path and hence placed outside of the main loop.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, array1, array2, length, offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
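         *
         * For example, with 8-byte chunks and a 20-byte comparison, the loop covers the first
         * 16 bytes, and the load below re-checks bytes 12..19, harmlessly overlapping 4 bytes
         * that were already compared.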
         */
        masm.movq(temp, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
        masm.cmpq(temp, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Equal, trueLabel);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, array1, array2, result, -VECTOR_SIZE + offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
        } else {
            masm.jccb(ConditionFlag.NotEqual, falseLabel);
        }
        masm.jmpb(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Emits code to compare the remaining 1 to 4 bytes.
     */
    private void emitTailCompares(AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        Label compare2Bytes = new Label();
        Label compare1Byte = new Label();

        Register temp = asRegister(temp4);

        if (kind.getByteCount() <= 4) {
            // Compare trailing 4 bytes, if any.
            masm.testl(result, 4);
            masm.jccb(ConditionFlag.Zero, compare2Bytes);
            masm.movl(temp, new AMD64Address(array1, 0));
            masm.cmpl(temp, new AMD64Address(array2, 0));
            if (kind == JavaKind.Float) {
                masm.jccb(ConditionFlag.Equal, trueLabel);
                emitFloatCompare(masm, array1, array2, Register.None, 0, falseLabel, true);
                masm.jmpb(trueLabel);
            } else {
                masm.jccb(ConditionFlag.NotEqual, falseLabel);
            }
            if (kind.getByteCount() <= 2) {
                // Move array pointers forward.
                masm.leaq(array1, new AMD64Address(array1, 4));
                masm.leaq(array2, new AMD64Address(array2, 4));

                // Compare trailing 2 bytes, if any.
                masm.bind(compare2Bytes);
                masm.testl(result, 2);
                masm.jccb(ConditionFlag.Zero, compare1Byte);
                masm.movzwl(temp, new AMD64Address(array1, 0));
                masm.movzwl(length, new AMD64Address(array2, 0));
                masm.cmpl(temp, length);
                masm.jccb(ConditionFlag.NotEqual, falseLabel);

                // The one-byte tail compare is only required for boolean and byte arrays.
                if (kind.getByteCount() <= 1) {
                    // Move array pointers forward before we compare the last trailing byte.
                    masm.leaq(array1, new AMD64Address(array1, 2));
                    masm.leaq(array2, new AMD64Address(array2, 2));

                    // Compare trailing byte, if any.
                    masm.bind(compare1Byte);
                    masm.testl(result, 1);
                    masm.jccb(ConditionFlag.Zero, trueLabel);
                    masm.movzbl(temp, new AMD64Address(array1, 0));
                    masm.movzbl(length, new AMD64Address(array2, 0));
                    masm.cmpl(temp, length);
                    masm.jccb(ConditionFlag.NotEqual, falseLabel);
                } else {
                    masm.bind(compare1Byte);
                }
            } else {
                masm.bind(compare2Bytes);
            }
        }
    }

    /**
     * Emits code to fall through if {@code src} is NaN, otherwise jump to {@code branchIfNonNaN}.
     */
    private void emitNaNCheck(AMD64MacroAssembler masm, AMD64Address src, Label branchIfNonNaN) {
        assert kind.isNumericFloat();
        Register tempXMMReg = asRegister(tempXMM);
        if (kind == JavaKind.Float) {
            masm.movflt(tempXMMReg, src);
        } else {
            masm.movdbl(tempXMMReg, src);
        }
        SSEOp.UCOMIS.emit(masm, kind == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
        masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
    }

    /**
     * Emits code to check whether two float values are bitwise equal or both NaN.
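     * <p>
     * This matches the semantics of {@code Arrays.equals} for {@code float[]} and
     * {@code double[]}, which compares elements via {@code floatToIntBits} and
     * {@code doubleToLongBits} and therefore treats any two NaN values as equal.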
     */
    private void emitFloatCompare(AMD64MacroAssembler masm, Register base1, Register base2, Register index, int offset, Label falseLabel, boolean skipBitwiseCompare) {
        AMD64Address address1 = new AMD64Address(base1, index, Scale.Times1, offset);
        AMD64Address address2 = new AMD64Address(base2, index, Scale.Times1, offset);

        Label bitwiseEqual = new Label();

        if (!skipBitwiseCompare) {
            // Bitwise compare
            Register temp = asRegister(temp4);

            if (kind == JavaKind.Float) {
                masm.movl(temp, address1);
                masm.cmpl(temp, address2);
            } else {
                masm.movq(temp, address1);
                masm.cmpq(temp, address2);
            }
            masm.jccb(ConditionFlag.Equal, bitwiseEqual);
        }

        emitNaNCheck(masm, address1, falseLabel);
        emitNaNCheck(masm, address2, falseLabel);

        masm.bind(bitwiseEqual);
    }

    /**
     * Emits code to compare float equality within a range.
     */
    private void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register base1, Register base2, Register index, int offset, Label falseLabel, int range) {
        assert kind.isNumericFloat();
        Label loop = new Label();
        Register i = asRegister(temp5);

        masm.movq(i, range);
        masm.negq(i);
        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitFloatCompare(masm, base1, base2, index, offset, falseLabel, kind.getByteCount() == range);
        masm.addq(index, kind.getByteCount());
        masm.addq(i, kind.getByteCount());
        masm.jccb(ConditionFlag.NotZero, loop);
        // Floats within the range are equal, revert the change to the index register.
        masm.subq(index, range);
    }
}
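// A hypothetical usage sketch from an LIR generator (the variable names and the surrounding
// generator are illustrative assumptions, not part of this file):
//
//   Variable result = gen.newVariable(LIRKind.value(AMD64Kind.DWORD));
//   gen.append(new AMD64ArrayEqualsOp(gen, JavaKind.Byte, result, array1Value, array2Value, lengthValue));
//   // 'result' now holds 1 if the arrays are equal, 0 otherwise.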