 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.TargetDescription;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;

/**
 * Emits code which compares two arrays of the same length. If the CPU supports vector
 * instructions, specialized code is emitted to leverage them.
 */
@Opcode("ARRAY_EQUALS")
public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);

    private final JavaKind kind;
    private final int arrayBaseOffset;
    private final int arrayIndexScale;

    @Def({REG}) protected Value resultValue;
    @Alive({REG}) protected Value array1Value;
    @Alive({REG}) protected Value array2Value;
    @Alive({REG}) protected Value lengthValue;
    @Temp({REG}) protected Value temp1;
    @Temp({REG}) protected Value temp2;
    @Temp({REG}) protected Value temp3;
    @Temp({REG}) protected Value temp4;

    @Temp({REG, ILLEGAL}) protected Value temp5;
    @Temp({REG, ILLEGAL}) protected Value tempXMM;

    @Temp({REG, ILLEGAL}) protected Value vectorTemp1;
    @Temp({REG, ILLEGAL}) protected Value vectorTemp2;

    public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind, Value result, Value array1, Value array2, Value length) {
        super(TYPE);
        this.kind = kind;

        this.arrayBaseOffset = tool.getProviders().getArrayOffsetProvider().arrayBaseOffset(kind);
        this.arrayIndexScale = tool.getProviders().getArrayOffsetProvider().arrayScalingFactor(kind);

        this.resultValue = result;
        this.array1Value = array1;
        this.array2Value = array2;
        this.lengthValue = length;

        // Allocate some temporaries.
        this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));

        this.temp5 = kind.isNumericFloat() ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        if (kind == JavaKind.Float) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
        } else if (kind == JavaKind.Double) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.tempXMM = Value.ILLEGAL;
        }

        // We only need the vector temporaries if we generate SSE code.
        if (supportsSSE41(tool.target())) {
            this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
            this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.vectorTemp1 = Value.ILLEGAL;
            this.vectorTemp2 = Value.ILLEGAL;
        }
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Register result = asRegister(resultValue);
        Register array1 = asRegister(temp1);
        Register array2 = asRegister(temp2);
        Register length = asRegister(temp3);

        Label trueLabel = new Label();
        Label falseLabel = new Label();
        Label done = new Label();

        // Load array base addresses.
        masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset));
        masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset));

        // Get array length in bytes.
        masm.movl(length, asRegister(lengthValue));

        if (arrayIndexScale > 1) {
            masm.shll(length, NumUtil.log2Ceil(arrayIndexScale)); // scale length
        }

        masm.movl(result, length); // copy

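        // result and length both hold the total byte count at this point. Each routine
        // below consumes the part it can compare with its vector width and hands the
        // remaining tail count on to the next, smaller-grained routine.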
        if (supportsAVX2(crb.target)) {
            emitAVXCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        } else if (supportsSSE41(crb.target)) {
            // this code is used for AVX as well because our backend correctly ensures that
            // VEX-prefixed instructions are emitted if AVX is supported
            emitSSE41Compare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        }

        emit8ByteCompare(crb, masm, result, array1, array2, length, trueLabel, falseLabel);
        emitTailCompares(masm, result, array1, array2, length, trueLabel, falseLabel);

        // Return true
        masm.bind(trueLabel);
        masm.movl(result, 1);
        masm.jmpb(done);

        // Return false
        masm.bind(falseLabel);
        masm.xorl(result, result);

        // That's it
        masm.bind(done);
    }

    /**
     * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports SSE 4.1
     */
    private static boolean supportsSSE41(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.SSE4_1);
    }

    /**
     * Vector size used in {@link #emitSSE41Compare}.
     */
    private static final int SSE4_1_VECTOR_SIZE = 16;

    /**
     * Emits code that uses SSE4.1 128-bit (16-byte) vector compares.
     */
    private void emitSSE41Compare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert supportsSSE41(crb.target);

        Register vector1 = asRegister(vectorTemp1, AMD64Kind.DOUBLE);
        Register vector2 = asRegister(vectorTemp2, AMD64Kind.DOUBLE);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 16-byte vectors
        masm.andl(result, SSE4_1_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(SSE4_1_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);
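        // Loop with a negative index: array1 and array2 now point just past the
        // vector-aligned region, and length counts up from -(aligned byte count) to 0.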

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.movdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
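        // XOR the two chunks; PTEST sets ZF iff the result is all zeros, i.e. iff the
        // 16-byte chunks are equal.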
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, SSE4_1_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, SSE4_1_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.movdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, array1, array2, result, -SSE4_1_VECTOR_SIZE, falseLabel, SSE4_1_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Returns whether the underlying AMD64 architecture supports AVX2 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports AVX2
     */
    private static boolean supportsAVX2(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.AVX2);
    }

    /**
     * Vector size used in {@link #emitAVXCompare}.
     */
    private static final int AVX_VECTOR_SIZE = 32;

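    /**
     * Emits code that uses AVX2 256-bit (32-byte) vector compares.
     */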
    private void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        assert supportsAVX2(crb.target);

        Register vector1 = asRegister(vectorTemp1, AMD64Kind.DOUBLE);
        Register vector2 = asRegister(vectorTemp2, AMD64Kind.DOUBLE);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 32-byte vectors
        masm.andl(result, AVX_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(AVX_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.vmovdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.vmovdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, AVX_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, AVX_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.vmovdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vmovdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, array1, array2, result, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Vector size used in {@link #emit8ByteCompare}.
     */
    private static final int VECTOR_SIZE = 8;

    /**
     * Emits code that uses 8-byte vector compares.
     */
    private void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        Register temp = asRegister(temp4);

        masm.andl(result, VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movq(temp, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.cmpq(temp, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, VECTOR_SIZE);
        masm.jccb(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // NaN check is slow path and hence placed outside of the main loop.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, array1, array2, length, offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movq(temp, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
        masm.cmpq(temp, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Equal, trueLabel);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, array1, array2, result, -VECTOR_SIZE + offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
        } else {
            masm.jccb(ConditionFlag.NotEqual, falseLabel);
        }
        masm.jmpb(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Emits code to compare the remaining 1 to 4 bytes.
     */
    private void emitTailCompares(AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
        Label compare2Bytes = new Label();
        Label compare1Byte = new Label();

        Register temp = asRegister(temp4);

        if (kind.getByteCount() <= 4) {
            // Compare trailing 4 bytes, if any.
            masm.testl(result, 4);
            masm.jccb(ConditionFlag.Zero, compare2Bytes);
            masm.movl(temp, new AMD64Address(array1, 0));
            masm.cmpl(temp, new AMD64Address(array2, 0));
            if (kind == JavaKind.Float) {
                masm.jccb(ConditionFlag.Equal, trueLabel);
                emitFloatCompare(masm, array1, array2, Register.None, 0, falseLabel, true);
                masm.jmpb(trueLabel);
            } else {
                masm.jccb(ConditionFlag.NotEqual, falseLabel);
            }
            if (kind.getByteCount() <= 2) {
                // Move array pointers forward.
                masm.leaq(array1, new AMD64Address(array1, 4));
                masm.leaq(array2, new AMD64Address(array2, 4));

                // Compare trailing 2 bytes, if any.
                masm.bind(compare2Bytes);
                masm.testl(result, 2);
                masm.jccb(ConditionFlag.Zero, compare1Byte);
                masm.movzwl(temp, new AMD64Address(array1, 0));
                masm.movzwl(length, new AMD64Address(array2, 0));
                masm.cmpl(temp, length);
                masm.jccb(ConditionFlag.NotEqual, falseLabel);

                // The one-byte tail compare is only required for boolean and byte arrays.
                if (kind.getByteCount() <= 1) {
                    // Move array pointers forward before we compare the last trailing byte.
                    masm.leaq(array1, new AMD64Address(array1, 2));
                    masm.leaq(array2, new AMD64Address(array2, 2));

                    // Compare trailing byte, if any.
                    masm.bind(compare1Byte);
                    masm.testl(result, 1);
                    masm.jccb(ConditionFlag.Zero, trueLabel);
                    masm.movzbl(temp, new AMD64Address(array1, 0));
                    masm.movzbl(length, new AMD64Address(array2, 0));
                    masm.cmpl(temp, length);
                    masm.jccb(ConditionFlag.NotEqual, falseLabel);
                } else {
                    masm.bind(compare1Byte);
                }
            } else {
                masm.bind(compare2Bytes);
            }
        }
    }

    /**
     * Emits code to fall through if {@code src} is NaN, otherwise jump to {@code branchIfNonNaN}.
     */
    private void emitNaNCheck(AMD64MacroAssembler masm, AMD64Address src, Label branchIfNonNaN) {
        assert kind.isNumericFloat();
        Register tempXMMReg = asRegister(tempXMM);
        if (kind == JavaKind.Float) {
            masm.movflt(tempXMMReg, src);
        } else {
            masm.movdbl(tempXMMReg, src);
        }
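        // An unordered compare of a value with itself sets the parity flag iff the
        // value is NaN.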
        SSEOp.UCOMIS.emit(masm, kind == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
        masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
    }

    /**
     * Emits code to compare if two floats are bitwise equal or both NaN.
     */
    private void emitFloatCompare(AMD64MacroAssembler masm, Register base1, Register base2, Register index, int offset, Label falseLabel, boolean skipBitwiseCompare) {
        AMD64Address address1 = new AMD64Address(base1, index, Scale.Times1, offset);
        AMD64Address address2 = new AMD64Address(base2, index, Scale.Times1, offset);

        Label bitwiseEqual = new Label();

        if (!skipBitwiseCompare) {
            // Bitwise compare
            Register temp = asRegister(temp4);

            if (kind == JavaKind.Float) {
                masm.movl(temp, address1);
                masm.cmpl(temp, address2);
            } else {
                masm.movq(temp, address1);
                masm.cmpq(temp, address2);
            }
            masm.jccb(ConditionFlag.Equal, bitwiseEqual);
        }

        emitNaNCheck(masm, address1, falseLabel);
        emitNaNCheck(masm, address2, falseLabel);
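        // Fall through: both values are NaN and are therefore considered equal.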

        masm.bind(bitwiseEqual);
    }

    /**
     * Emits code to compare float equality within a range.
     */
    private void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register base1, Register base2, Register index, int offset, Label falseLabel, int range) {
        assert kind.isNumericFloat();
        Label loop = new Label();
        Register i = asRegister(temp5);

        masm.movq(i, range);
        masm.negq(i);
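        // i counts up from -range to 0 while index walks forward over the elements.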
        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitFloatCompare(masm, base1, base2, index, offset, falseLabel, kind.getByteCount() == range);
        masm.addq(index, kind.getByteCount());
        masm.addq(i, kind.getByteCount());
        masm.jccb(ConditionFlag.NotZero, loop);
        // Floats within the range are equal, revert change to the register index
        masm.subq(index, range);
    }
}
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.TargetDescription;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.SSEOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

/**
 * Emits code which compares two arrays of the same length. If the CPU supports vector
 * instructions, specialized code is emitted to leverage them.
 */
@Opcode("ARRAY_EQUALS")
public final class AMD64ArrayEqualsOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AMD64ArrayEqualsOp.class);

    private final JavaKind kind;
    private final int arrayBaseOffset;
    private final int arrayIndexScale;
    private final int constantByteLength;

    @Def({REG}) private Value resultValue;
    @Alive({REG}) private Value array1Value;
    @Alive({REG}) private Value array2Value;
    @Alive({REG}) private Value lengthValue;
    @Temp({REG}) private Value temp1;
    @Temp({REG}) private Value temp2;
    @Temp({REG}) private Value temp3;
    @Temp({REG}) private Value temp4;

    @Temp({REG, ILLEGAL}) private Value temp5;
    @Temp({REG, ILLEGAL}) private Value tempXMM;

    @Temp({REG, ILLEGAL}) private Value vectorTemp1;
    @Temp({REG, ILLEGAL}) private Value vectorTemp2;
    @Temp({REG, ILLEGAL}) private Value vectorTemp3;
    @Temp({REG, ILLEGAL}) private Value vectorTemp4;

    public AMD64ArrayEqualsOp(LIRGeneratorTool tool, JavaKind kind, Value result, Value array1, Value array2, Value length,
                    int constantLength, boolean directPointers, int maxVectorSize) {
        super(TYPE);
        this.kind = kind;

        this.arrayBaseOffset = directPointers ? 0 : tool.getProviders().getMetaAccess().getArrayBaseOffset(kind);
        this.arrayIndexScale = tool.getProviders().getMetaAccess().getArrayIndexScale(kind);

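        // A negative constantLength means the array length is not a compile-time
        // constant; it is kept negative so that the constant-length fast path below
        // stays disabled.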
        if (constantLength >= 0 && arrayIndexScale > 1) {
            // scale length
            this.constantByteLength = constantLength << NumUtil.log2Ceil(arrayIndexScale);
        } else {
            this.constantByteLength = constantLength;
        }

        this.resultValue = result;
        this.array1Value = array1;
        this.array2Value = array2;
        this.lengthValue = length;

        // Allocate some temporaries.
        this.temp1 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp2 = tool.newVariable(LIRKind.unknownReference(tool.target().arch.getWordKind()));
        this.temp3 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
        this.temp4 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));

        this.temp5 = kind.isNumericFloat() ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
        if (kind == JavaKind.Float) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.SINGLE));
        } else if (kind == JavaKind.Double) {
            this.tempXMM = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
        } else {
            this.tempXMM = Value.ILLEGAL;
        }

        // We only need the vector temporaries if we generate SSE code.
        if (supportsSSE41(tool.target())) {
            if (canGenerateConstantLengthCompare(tool.target())) {
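                // Use the widest vector kind permitted by the CPU and by maxVectorSize
                // (a negative maxVectorSize means no limit): 32-byte YMM vectors with
                // AVX2, 16-byte XMM vectors otherwise.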
                LIRKind lirKind = LIRKind.value(supportsAVX2(tool.target()) && (maxVectorSize < 0 || maxVectorSize >= 32) ? AMD64Kind.V256_BYTE : AMD64Kind.V128_BYTE);
                this.vectorTemp1 = tool.newVariable(lirKind);
                this.vectorTemp2 = tool.newVariable(lirKind);
                this.vectorTemp3 = tool.newVariable(lirKind);
                this.vectorTemp4 = tool.newVariable(lirKind);
            } else {
                this.vectorTemp1 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp2 = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE));
                this.vectorTemp3 = Value.ILLEGAL;
                this.vectorTemp4 = Value.ILLEGAL;
            }
        } else {
            this.vectorTemp1 = Value.ILLEGAL;
            this.vectorTemp2 = Value.ILLEGAL;
            this.vectorTemp3 = Value.ILLEGAL;
            this.vectorTemp4 = Value.ILLEGAL;
        }
    }

    private boolean canGenerateConstantLengthCompare(TargetDescription target) {
        return constantByteLength >= 0 && kind.isNumericInteger() && supportsSSE41(target);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Register result = asRegister(resultValue);
        Register array1 = asRegister(temp1);
        Register array2 = asRegister(temp2);

        Label trueLabel = new Label();
        Label falseLabel = new Label();
        Label done = new Label();

        // Load array base addresses.
        masm.leaq(array1, new AMD64Address(asRegister(array1Value), arrayBaseOffset));
        masm.leaq(array2, new AMD64Address(asRegister(array2Value), arrayBaseOffset));

        if (canGenerateConstantLengthCompare(crb.target)) {
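            // The byte length is a compile-time constant, so a fully specialized
            // comparison can be emitted instead of the generic loops below.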
            emitConstantLengthArrayCompareBytes(masm, array1, array2, asRegister(temp3), asRegister(temp4),
                            new Register[]{asRegister(vectorTemp1), asRegister(vectorTemp2), asRegister(vectorTemp3), asRegister(vectorTemp4)},
                            falseLabel, constantByteLength, AVXKind.getRegisterSize(vectorTemp1).getBytes());
        } else {
            Register length = asRegister(temp3);

            // Get array length in bytes.
            masm.movl(length, asRegister(lengthValue));

            if (arrayIndexScale > 1) {
                masm.shll(length, NumUtil.log2Ceil(arrayIndexScale)); // scale length
            }

            masm.movl(result, length); // copy

            emitArrayCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        }

        // Return true
        masm.bind(trueLabel);
        masm.movl(result, 1);
        masm.jmpb(done);

        // Return false
        masm.bind(falseLabel);
        masm.xorl(result, result);

        // That's it
        masm.bind(done);
    }

    private static void emitArrayCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
                    Register result, Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        if (supportsAVX2(crb.target)) {
            emitAVXCompare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        } else if (supportsSSE41(crb.target)) {
            // this code is used for AVX as well because our backend correctly ensures that
            // VEX-prefixed instructions are emitted if AVX is supported
            emitSSE41Compare(crb, masm, kind, result, array1, array2, length, temp4, temp5, tempXMM, vectorTemp1, vectorTemp2, trueLabel, falseLabel);
        }
        emit8ByteCompare(crb, masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
        emitTailCompares(masm, kind, result, array1, array2, length, temp4, tempXMM, trueLabel, falseLabel);
    }

    /**
     * Returns whether the underlying AMD64 architecture supports SSE 4.1 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports SSE 4.1
     */
    private static boolean supportsSSE41(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.SSE4_1);
    }

    /**
     * Vector size used in {@link #emitSSE41Compare}.
     */
    private static final int SSE4_1_VECTOR_SIZE = 16;

    /**
     * Emits code that uses SSE4.1 128-bit (16-byte) vector compares.
     */
    private static void emitSSE41Compare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind,
                    Register result, Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        assert supportsSSE41(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 16-byte vectors
        masm.andl(result, SSE4_1_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(SSE4_1_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.movdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, SSE4_1_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, SSE4_1_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.movdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -SSE4_1_VECTOR_SIZE));
        masm.pxor(vector1, vector2);
        masm.ptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -SSE4_1_VECTOR_SIZE, falseLabel, SSE4_1_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Returns whether the underlying AMD64 architecture supports AVX2 instructions.
     *
     * @param target target description of the underlying architecture
     * @return true if the underlying architecture supports AVX2
     */
    private static boolean supportsAVX2(TargetDescription target) {
        AMD64 arch = (AMD64) target.arch;
        return arch.getFeatures().contains(CPUFeature.AVX2);
    }

    /**
     * Vector size used in {@link #emitAVXCompare}.
     */
    private static final int AVX_VECTOR_SIZE = 32;

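    /**
     * Emits code that uses AVX2 256-bit (32-byte) vector compares.
     */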
    private static void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result,
                    Register array1, Register array2, Register length,
                    Value temp4, Value temp5, Value tempXMM, Value vectorTemp1, Value vectorTemp2,
                    Label trueLabel, Label falseLabel) {
        assert supportsAVX2(crb.target);

        Register vector1 = asRegister(vectorTemp1);
        Register vector2 = asRegister(vectorTemp2);

        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        // Compare 32-byte vectors
        masm.andl(result, AVX_VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(AVX_VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.vmovdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.vmovdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, AVX_VECTOR_SIZE);
        masm.jcc(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, length, temp4, temp5, tempXMM, 0, falseLabel, AVX_VECTOR_SIZE);
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.vmovdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vmovdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
        masm.vpxor(vector1, vector1, vector2);
        masm.vptest(vector1, vector1);
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Zero, trueLabel);
            emitFloatCompareWithinRange(crb, masm, kind, array1, array2, result, temp4, temp5, tempXMM, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
        } else {
            masm.jcc(ConditionFlag.NotZero, falseLabel);
        }
        masm.jmp(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Vector size used in {@link #emit8ByteCompare}.
     */
    private static final int VECTOR_SIZE = 8;

    /**
     * Emits code that uses 8-byte vector compares.
     */
    private static void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4,
                    Value tempXMM, Label trueLabel, Label falseLabel) {
        Label loop = new Label();
        Label compareTail = new Label();

        boolean requiresNaNCheck = kind.isNumericFloat();
        Label loopCheck = new Label();
        Label nanCheck = new Label();

        Register temp = asRegister(temp4);

        masm.andl(result, VECTOR_SIZE - 1); // tail count (in bytes)
        masm.andl(length, ~(VECTOR_SIZE - 1)); // vector count (in bytes)
        masm.jcc(ConditionFlag.Zero, compareTail);

        masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.negq(length);

        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        masm.movq(temp, new AMD64Address(array1, length, Scale.Times1, 0));
        masm.cmpq(temp, new AMD64Address(array2, length, Scale.Times1, 0));
        masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);

        masm.bind(loopCheck);
        masm.addq(length, VECTOR_SIZE);
        masm.jccb(ConditionFlag.NotZero, loop);

        masm.testl(result, result);
        masm.jcc(ConditionFlag.Zero, trueLabel);

        if (requiresNaNCheck) {
            // NaN check is slow path and hence placed outside of the main loop.
            Label unalignedCheck = new Label();
            masm.jmpb(unalignedCheck);
            masm.bind(nanCheck);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, kind, array1, array2, length, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
            masm.jmpb(loopCheck);
            masm.bind(unalignedCheck);
        }

        /*
         * Compare the remaining bytes with an unaligned memory load aligned to the end of the
         * array.
         */
        masm.movq(temp, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
        masm.cmpq(temp, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
        if (requiresNaNCheck) {
            masm.jcc(ConditionFlag.Equal, trueLabel);
            // At most two iterations, unroll in the emitted code.
            for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
                emitFloatCompare(masm, kind, array1, array2, result, temp4, tempXMM, -VECTOR_SIZE + offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
            }
        } else {
            masm.jccb(ConditionFlag.NotEqual, falseLabel);
        }
        masm.jmpb(trueLabel);

        masm.bind(compareTail);
        masm.movl(length, result);
    }

    /**
     * Emits code to compare the remaining 1 to 4 bytes.
     */
    private static void emitTailCompares(AMD64MacroAssembler masm, JavaKind kind, Register result, Register array1, Register array2, Register length, Value temp4, Value tempXMM,
                    Label trueLabel, Label falseLabel) {
        Label compare2Bytes = new Label();
        Label compare1Byte = new Label();

        Register temp = asRegister(temp4);

        if (kind.getByteCount() <= 4) {
            // Compare trailing 4 bytes, if any.
            masm.testl(result, 4);
            masm.jccb(ConditionFlag.Zero, compare2Bytes);
            masm.movl(temp, new AMD64Address(array1, 0));
            masm.cmpl(temp, new AMD64Address(array2, 0));
            if (kind == JavaKind.Float) {
                masm.jccb(ConditionFlag.Equal, trueLabel);
                emitFloatCompare(masm, kind, array1, array2, Register.None, temp4, tempXMM, 0, falseLabel, true);
                masm.jmpb(trueLabel);
            } else {
                masm.jccb(ConditionFlag.NotEqual, falseLabel);
            }
            if (kind.getByteCount() <= 2) {
                // Move array pointers forward.
                masm.leaq(array1, new AMD64Address(array1, 4));
                masm.leaq(array2, new AMD64Address(array2, 4));

                // Compare trailing 2 bytes, if any.
                masm.bind(compare2Bytes);
                masm.testl(result, 2);
                masm.jccb(ConditionFlag.Zero, compare1Byte);
                masm.movzwl(temp, new AMD64Address(array1, 0));
                masm.movzwl(length, new AMD64Address(array2, 0));
                masm.cmpl(temp, length);
                masm.jccb(ConditionFlag.NotEqual, falseLabel);

                // The one-byte tail compare is only required for boolean and byte arrays.
                if (kind.getByteCount() <= 1) {
                    // Move array pointers forward before we compare the last trailing byte.
                    masm.leaq(array1, new AMD64Address(array1, 2));
                    masm.leaq(array2, new AMD64Address(array2, 2));

                    // Compare trailing byte, if any.
                    masm.bind(compare1Byte);
                    masm.testl(result, 1);
                    masm.jccb(ConditionFlag.Zero, trueLabel);
                    masm.movzbl(temp, new AMD64Address(array1, 0));
                    masm.movzbl(length, new AMD64Address(array2, 0));
                    masm.cmpl(temp, length);
                    masm.jccb(ConditionFlag.NotEqual, falseLabel);
                } else {
                    masm.bind(compare1Byte);
                }
            } else {
                masm.bind(compare2Bytes);
            }
        }
    }

    /**
     * Emits code to fall through if {@code src} is NaN, otherwise jump to {@code branchIfNonNaN}.
     */
    private static void emitNaNCheck(AMD64MacroAssembler masm, JavaKind kind, Value tempXMM, AMD64Address src, Label branchIfNonNaN) {
        assert kind.isNumericFloat();
        Register tempXMMReg = asRegister(tempXMM);
        if (kind == JavaKind.Float) {
            masm.movflt(tempXMMReg, src);
        } else {
            masm.movdbl(tempXMMReg, src);
        }
        SSEOp.UCOMIS.emit(masm, kind == JavaKind.Float ? OperandSize.PS : OperandSize.PD, tempXMMReg, tempXMMReg);
        masm.jcc(ConditionFlag.NoParity, branchIfNonNaN);
    }

    /**
     * Emits code to compare if two floats are bitwise equal or both NaN.
     */
    private static void emitFloatCompare(AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value tempXMM, int offset, Label falseLabel,
                    boolean skipBitwiseCompare) {
        AMD64Address address1 = new AMD64Address(base1, index, Scale.Times1, offset);
        AMD64Address address2 = new AMD64Address(base2, index, Scale.Times1, offset);

        Label bitwiseEqual = new Label();

        if (!skipBitwiseCompare) {
            // Bitwise compare
            Register temp = asRegister(temp4);

            if (kind == JavaKind.Float) {
                masm.movl(temp, address1);
                masm.cmpl(temp, address2);
            } else {
                masm.movq(temp, address1);
                masm.cmpq(temp, address2);
            }
            masm.jccb(ConditionFlag.Equal, bitwiseEqual);
        }

        emitNaNCheck(masm, kind, tempXMM, address1, falseLabel);
        emitNaNCheck(masm, kind, tempXMM, address2, falseLabel);

        masm.bind(bitwiseEqual);
    }

    /**
     * Emits code to compare float equality within a range.
     */
    private static void emitFloatCompareWithinRange(CompilationResultBuilder crb, AMD64MacroAssembler masm, JavaKind kind, Register base1, Register base2, Register index, Value temp4, Value temp5,
                    Value tempXMM, int offset, Label falseLabel, int range) {
        assert kind.isNumericFloat();
        Label loop = new Label();
        Register i = asRegister(temp5);

        masm.movq(i, range);
        masm.negq(i);
        // Align the main loop
        masm.align(crb.target.wordSize * 2);
        masm.bind(loop);
        emitFloatCompare(masm, kind, base1, base2, index, temp4, tempXMM, offset, falseLabel, kind.getByteCount() == range);
        masm.addq(index, kind.getByteCount());
        masm.addq(i, kind.getByteCount());
        masm.jccb(ConditionFlag.NotZero, loop);
        // Floats within the range are equal, revert change to the register index
        masm.subq(index, range);
    }

    /**
     * Emits specialized assembly for checking equality of memory regions
     * {@code arrayPtr1[0..nBytes]} and {@code arrayPtr2[0..nBytes]}. If they match, execution
     * continues directly after the emitted code block, otherwise we jump to {@code noMatch}.
     */
    private static void emitConstantLengthArrayCompareBytes(
                    AMD64MacroAssembler asm,
                    Register arrayPtr1,
                    Register arrayPtr2,
                    Register tmp1,
                    Register tmp2,
                    Register[] tmpVectors,
                    Label noMatch,
                    int nBytes,
                    int bytesPerVector) {
        assert bytesPerVector >= 16;
        if (nBytes == 0) {
            // do nothing
            return;
        }
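        /*
         * Strategy: XOR vector-sized chunks of both regions and use PTEST (which sets ZF
         * iff its operand is all zeros) to detect a difference. Regions that do not fill
         * a whole number of vectors are finished with scalar loads or with a second,
         * possibly overlapping load aligned to the end of the region.
         */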
        if (nBytes < 16) {
            // array is shorter than any vector register, use regular CMP instructions
            int movSize = (nBytes < 2) ? 1 : ((nBytes < 4) ? 2 : ((nBytes < 8) ? 4 : 8));
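            // movSize is the widest power-of-two load that fits into nBytes; the second,
            // possibly overlapping load below covers the remaining bytes by aligning to
            // the end of the region.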
            emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1), movSize);
            emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2), movSize);
            emitCmpBytes(asm, tmp1, tmp2, movSize);
            asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            if (nBytes > movSize) {
                emitMovBytes(asm, tmp1, new AMD64Address(arrayPtr1, nBytes - movSize), movSize);
                emitMovBytes(asm, tmp2, new AMD64Address(arrayPtr2, nBytes - movSize), movSize);
                emitCmpBytes(asm, tmp1, tmp2, movSize);
                asm.jcc(AMD64Assembler.ConditionFlag.NotEqual, noMatch);
            }
        } else if (nBytes < 32 && bytesPerVector >= 32) {
            // we could use YMM registers, but the array is too short, force XMM registers
            int bytesPerXMMVector = AVXKind.AVXSize.XMM.getBytes();
            AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], new AMD64Address(arrayPtr1));
            AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[1], new AMD64Address(arrayPtr2));
            AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0], tmpVectors[1]);
            if (nBytes > bytesPerXMMVector) {
                AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], new AMD64Address(arrayPtr1, nBytes - bytesPerXMMVector));
                AMD64Assembler.VexMoveOp.VMOVDQU.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[3], new AMD64Address(arrayPtr2, nBytes - bytesPerXMMVector));
                AMD64Assembler.VexRVMOp.VPXOR.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[2], tmpVectors[2]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
            AMD64Assembler.VexRMOp.VPTEST.emit(asm, AVXKind.AVXSize.XMM, tmpVectors[0], tmpVectors[0]);
            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
        } else if (bytesPerVector >= 32) {
            // AVX2 supported, use YMM vectors
            assert asm.supports(CPUFeature.AVX2);
            int loopCount = nBytes / (bytesPerVector * 2);
            int rest = nBytes % (bytesPerVector * 2);
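            // The main loop compares two vectors per iteration. If the remainder is a
            // partial vector, one iteration is peeled off below and merged with the
            // overlapping tail loads.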
            if (loopCount > 0) {
                if (0 < rest && rest < bytesPerVector) {
                    loopCount--;
                }
                if (loopCount > 0) {
                    if (loopCount > 1) {
                        asm.movl(tmp1, loopCount);
                    }
                    Label loopBegin = new Label();
                    asm.bind(loopBegin);
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.addq(arrayPtr1, bytesPerVector * 2);
                    asm.addq(arrayPtr2, bytesPerVector * 2);
                    if (loopCount > 1) {
                        asm.decrementl(tmp1);
                        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
                    }
                }
                if (0 < rest && rest < bytesPerVector) {
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
                    asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
                    asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                    asm.vptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
            }
            if (rest >= bytesPerVector) {
                asm.vmovdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                asm.vmovdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                asm.vpxor(tmpVectors[0], tmpVectors[0], tmpVectors[1]);
                if (rest > bytesPerVector) {
                    asm.vmovdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
                    asm.vmovdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
                    asm.vpxor(tmpVectors[2], tmpVectors[2], tmpVectors[3]);
                    asm.vptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                asm.vptest(tmpVectors[0], tmpVectors[0]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        } else {
            // on AVX or SSE, use XMM vectors
            int loopCount = nBytes / (bytesPerVector * 2);
            int rest = nBytes % (bytesPerVector * 2);
            if (loopCount > 0) {
                if (0 < rest && rest < bytesPerVector) {
                    loopCount--;
                }
                if (loopCount > 0) {
                    if (loopCount > 1) {
                        asm.movl(tmp1, loopCount);
                    }
                    Label loopBegin = new Label();
                    asm.bind(loopBegin);
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.addq(arrayPtr1, bytesPerVector * 2);
                    asm.addq(arrayPtr2, bytesPerVector * 2);
                    if (loopCount > 1) {
                        asm.decrementl(tmp1);
                        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, loopBegin);
                    }
                }
                if (0 < rest && rest < bytesPerVector) {
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, bytesPerVector));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                    asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1, bytesPerVector + rest));
                    asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2, bytesPerVector + rest));
                    asm.pxor(tmpVectors[0], tmpVectors[1]);
                    asm.ptest(tmpVectors[0], tmpVectors[0]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
            }
            if (rest >= bytesPerVector) {
                asm.movdqu(tmpVectors[0], new AMD64Address(arrayPtr1));
                asm.movdqu(tmpVectors[1], new AMD64Address(arrayPtr2));
                asm.pxor(tmpVectors[0], tmpVectors[1]);
                if (rest > bytesPerVector) {
                    asm.movdqu(tmpVectors[2], new AMD64Address(arrayPtr1, rest - bytesPerVector));
                    asm.movdqu(tmpVectors[3], new AMD64Address(arrayPtr2, rest - bytesPerVector));
                    asm.pxor(tmpVectors[2], tmpVectors[3]);
                    asm.ptest(tmpVectors[2], tmpVectors[2]);
                    asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
                }
                asm.ptest(tmpVectors[0], tmpVectors[0]);
                asm.jcc(AMD64Assembler.ConditionFlag.NotZero, noMatch);
            }
        }
    }

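    /**
     * Emits a load of {@code size} bytes from {@code src} into {@code dst}, zero-extending
     * sub-word loads.
     */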
    private static void emitMovBytes(AMD64MacroAssembler asm, Register dst, AMD64Address src, int size) {
        switch (size) {
            case 1:
                asm.movzbl(dst, src);
                break;
            case 2:
                asm.movzwl(dst, src);
                break;
            case 4:
                asm.movl(dst, src);
                break;
            case 8:
                asm.movq(dst, src);
                break;
            default:
                throw new IllegalStateException();
        }
    }

    private static void emitCmpBytes(AMD64MacroAssembler asm, Register dst, Register src, int size) {
        if (size < 8) {
            asm.cmpl(dst, src);
        } else {
            asm.cmpq(dst, src);
        }
    }
}