--- old/src/hotspot/cpu/aarch64/aarch64.ad 2017-11-10 17:48:55.574507864 +0300
+++ new/src/hotspot/cpu/aarch64/aarch64.ad 2017-11-10 17:48:55.438510177 +0300
@@ -16154,7 +16154,7 @@
   ins_pipe(pipe_class_memory);
 %}

-instruct string_equalsL(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
+instruct string_equalsL(iRegP_R1 str1, iRegP_R3 str2, iRegP_R4 cnt,
                         iRegI_R0 result, rFlagsReg cr)
 %{
   predicate(((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::LL);
@@ -16190,7 +16190,7 @@
 %}

 instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
-                       iRegP_R10 tmp, rFlagsReg cr)
+                       iRegP_R4 tmp, rFlagsReg cr)
 %{
   predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (AryEq ary1 ary2));
@@ -16206,7 +16206,7 @@
 %}

 instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
-                       iRegP_R10 tmp, rFlagsReg cr)
+                       iRegP_R4 tmp, rFlagsReg cr)
 %{
   predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (AryEq ary1 ary2));
--- old/src/hotspot/cpu/aarch64/globals_aarch64.hpp 2017-11-10 17:48:56.062499565 +0300
+++ new/src/hotspot/cpu/aarch64/globals_aarch64.hpp 2017-11-10 17:48:55.914502082 +0300
@@ -145,6 +145,8 @@
           "Use CRC32 instructions for CRC32 computation") \
   product(bool, UseSIMDForMemoryOps, false, \
           "Use SIMD instructions in generated memory move code") \
+  product(bool, UseSIMDForArrayEquals, true, \
+          "Use SIMD instructions in generated array equals code") \
   product(bool, AvoidUnalignedAccesses, false, \
           "Avoid generating unaligned memory accesses") \
   product(bool, UseLSE, false, \
--- old/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp 2017-11-10 17:48:56.382494123 +0300
+++ new/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp 2017-11-10 17:48:56.246496435 +0300
@@ -5068,6 +5068,16 @@
   // Check for short strings, i.e. smaller than wordSize.
   subs(cnt1, cnt1, elem_per_word);
   br(Assembler::LT, SHORT);
+  // large loop algo should be used for >= 64(large loop) + 16(post loop) bytes
+  cmp(cnt1, 80/elem_size);
+  br(Assembler::LT, NEXT_WORD);
+  RuntimeAddress stub = elem_size == 1
+      ? RuntimeAddress(StubRoutines::aarch64::large_array_equals_byte())
+      : RuntimeAddress(StubRoutines::aarch64::large_array_equals_char());
+  assert(stub.target() != NULL, "array_equals_long_* stub has not been generated");
+  trampoline_call(stub);
+  cbz(result, DONE);
+  mov(result, false); // reset result
   // Main 8 byte comparison loop.
   bind(NEXT_WORD); {
     ldr(tmp1, Address(post(a1, wordSize)));
--- old/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp 2017-11-10 17:48:56.742488002 +0300
+++ new/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp 2017-11-10 17:48:56.606490314 +0300
@@ -3888,6 +3888,128 @@
     __ ret(lr);
     return entry;
   }
+
+  address generate_large_array_equals_byte() {
+    return generate_large_array_equals(1);
+  }
+
+  address generate_large_array_equals_char() {
+    return generate_large_array_equals(2);
+  }
+
+  // a1 = r1 - array1 address
+  // a2 = r2 - array2 address
+  // result = r0 - return value. Already contains "false"
+  // cnt1 = r4 - amount of elements left to check, reduced by elem_per_word
+  address generate_large_array_equals(int elem_size) {
+    StubCodeMark mark(this, "StubRoutines", elem_size == 1
+        ? "large_array_equals_byte"
"large_array_equals_byte" + : "large_array_equals_char"); + Register a1 = r1, a2 = r2, result = r0, cnt1 = r4, tmp1 = rscratch1, + tmp2 = rscratch2, tmp3 = r6, tmp4 = r7; + Label LARGE_LOOP, NOT_EQUAL; + int elem_per_word = wordSize/elem_size; + int branchThreshold = MAX(80, SoftwarePrefetchHintDistance)/elem_size - elem_per_word; + RegSet spilled_regs = RegSet::of(tmp3, tmp4); + + assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4); + + __ align(CodeEntryAlignment); + address entry = __ pc(); + __ enter(); + + if (!UseSIMDForArrayEquals) { + // pre-loop + __ push(spilled_regs, sp); + __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); + __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); + } + __ bind(LARGE_LOOP); // unrolled to 64 bytes loop with possible prefetching + if (SoftwarePrefetchHintDistance >= 0) { + __ prfm(Address(a1, SoftwarePrefetchHintDistance)); + __ prfm(Address(a2, SoftwarePrefetchHintDistance)); + } + if (UseSIMDForArrayEquals) { + __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); + __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); + __ eor(v0, __ T2D, v0, v4); + __ eor(v1, __ T2D, v1, v5); + __ eor(v2, __ T2D, v2, v6); + __ eor(v3, __ T2D, v3, v7); + + __ orr(v0, __ T2D, v0, v1); + __ orr(v1, __ T2D, v2, v3); + __ orr(v0, __ T2D, v0, v1); + + __ umov(tmp1, v0, __ D, 0); + __ cbnz(tmp1, NOT_EQUAL); + __ umov(tmp1, v0, __ D, 1); + __ cbnz(tmp1, NOT_EQUAL); + __ sub(cnt1, cnt1, 64/elem_size); + __ cmp(cnt1, branchThreshold); + __ br(__ GT, LARGE_LOOP); + } else { + __ eor(tmp1, tmp1, tmp2); + __ ldr(tmp2, Address(__ post(a2, wordSize))); + __ cbnz(tmp1, NOT_EQUAL); + __ ldr(tmp1, Address(__ post(a1, wordSize))); + __ eor(tmp3, tmp3, tmp4); + __ ldr(tmp4, Address(__ post(a2, wordSize))); + __ cbnz(tmp3, NOT_EQUAL); + __ ldr(tmp3, Address(__ post(a1, wordSize))); + + __ eor(tmp1, tmp1, tmp2); + __ ldr(tmp2, Address(__ post(a2, wordSize))); + __ cbnz(tmp1, NOT_EQUAL); + __ ldr(tmp1, Address(__ post(a1, wordSize))); + __ eor(tmp3, tmp3, tmp4); + __ ldr(tmp4, Address(__ post(a2, wordSize))); + __ cbnz(tmp3, NOT_EQUAL); + __ ldr(tmp3, Address(__ post(a1, wordSize))); + + __ eor(tmp1, tmp1, tmp2); + __ ldr(tmp2, Address(__ post(a2, wordSize))); + __ cbnz(tmp1, NOT_EQUAL); + __ ldr(tmp1, Address(__ post(a1, wordSize))); + __ eor(tmp3, tmp3, tmp4); + __ ldr(tmp4, Address(__ post(a2, wordSize))); + __ cbnz(tmp3, NOT_EQUAL); + __ ldr(tmp3, Address(__ post(a1, wordSize))); + + // loads below are for next loop iteration + __ eor(tmp1, tmp1, tmp2); + __ ldr(tmp2, Address(__ post(a2, wordSize))); + __ cbnz(tmp1, NOT_EQUAL); + __ ldr(tmp1, Address(__ post(a1, wordSize))); + __ eor(tmp3, tmp3, tmp4); + __ ldr(tmp4, Address(__ post(a2, wordSize))); + __ cbnz(tmp3, NOT_EQUAL); + __ ldr(tmp3, Address(__ post(a1, wordSize))); + + __ sub(cnt1, cnt1, 8 * elem_per_word); + // run this loop until we have memory to prefetch(but at least 64+16 bytes). + __ cmp(cnt1, branchThreshold); + __ br(Assembler::GT, LARGE_LOOP); + // both a1 and a2 are shifted more than needed by wordSize and tmp1-tmp4 + // contains still-not-checked value. 
+      __ eor(tmp1, tmp1, tmp2);
+      __ cbnz(tmp1, NOT_EQUAL);
+      __ eor(tmp3, tmp3, tmp4);
+      __ cbnz(tmp3, NOT_EQUAL);
+      __ sub(cnt1, cnt1, 2 * elem_per_word);
+    }
+
+    __ mov(result, true);
+    __ bind(NOT_EQUAL);
+    if (!UseSIMDForArrayEquals) {
+      __ pop(spilled_regs, sp);
+    }
+    __ leave();
+    __ ret(lr);
+    return entry;
+  }
+
   /**
    * Arguments:
    *
@@ -4970,6 +5092,10 @@
     // has negatives stub for large arrays.
     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

+    // array equals stub for large arrays.
+    StubRoutines::aarch64::_large_array_equals_byte = generate_large_array_equals_byte();
+    StubRoutines::aarch64::_large_array_equals_char = generate_large_array_equals_char();
+
     if (UseMultiplyToLenIntrinsic) {
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
     }
--- old/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp 2017-11-10 17:48:57.122481541 +0300
+++ new/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp 2017-11-10 17:48:56.990483785 +0300
@@ -46,6 +46,8 @@
 address StubRoutines::aarch64::_zero_blocks = NULL;
 address StubRoutines::aarch64::_has_negatives = NULL;
 address StubRoutines::aarch64::_has_negatives_long = NULL;
+address StubRoutines::aarch64::_large_array_equals_byte = NULL;
+address StubRoutines::aarch64::_large_array_equals_char = NULL;
 bool StubRoutines::aarch64::_completed = false;

 /**
--- old/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp 2017-11-10 17:48:57.474475556 +0300
+++ new/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp 2017-11-10 17:48:57.330478004 +0300
@@ -65,6 +65,8 @@

   static address _has_negatives;
   static address _has_negatives_long;
+  static address _large_array_equals_byte;
+  static address _large_array_equals_char;
   static bool _completed;

 public:
@@ -131,6 +133,14 @@
     return _has_negatives_long;
   }

+  static address large_array_equals_byte() {
+    return _large_array_equals_byte;
+  }
+
+  static address large_array_equals_char() {
+    return _large_array_equals_char;
+  }
+
   static bool complete() {
     return _completed;
   }
--- old/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp 2017-11-10 17:48:57.826469571 +0300
+++ new/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp 2017-11-10 17:48:57.682472020 +0300
@@ -191,6 +191,9 @@
     if (FLAG_IS_DEFAULT(UseSIMDForMemoryOps)) {
       FLAG_SET_DEFAULT(UseSIMDForMemoryOps, (_variant > 0));
     }
+    if ((_model == 0x0a1 || _model2 == 0x0a1) && FLAG_IS_DEFAULT(UseSIMDForArrayEquals)) {
+      UseSIMDForArrayEquals = false; // ThunderX T88 is slow with SIMD
+    }
   }
   if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) _features |= CPU_A53MAC;
   if (_cpu == CPU_ARM && (_model == 0xd07 || _model2 == 0xd07)) _features |= CPU_STXR_PREFETCH;
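
Note for reviewers, not part of the patch: a rough C++ model of the strategy the new stub implements. The helper name large_array_equals_model, the prefetch_distance parameter, and the byte-wise tail are illustrative assumptions only; in the generated code the counters are kept in elements rather than bytes and the tail is handled by the existing word/short loops in MacroAssembler::arrays_equals.

// Sketch only: compare 64 bytes (8 words) per iteration while at least
// max(80, prefetch distance) bytes remain, then fall back to a simple tail.
#include <algorithm>
#include <cstdint>
#include <cstring>

static bool large_array_equals_model(const uint8_t* a1, const uint8_t* a2,
                                     size_t len, size_t prefetch_distance) {
  const size_t word = sizeof(uint64_t);                  // 8-byte compare unit
  const size_t threshold = std::max<size_t>(80, prefetch_distance);
  size_t i = 0;
  while (len - i >= threshold) {                         // "large loop"
    for (int w = 0; w < 8; w++, i += word) {             // unrolled to 64 bytes
      uint64_t v1, v2;
      std::memcpy(&v1, a1 + i, word);                    // word-sized loads
      std::memcpy(&v2, a2 + i, word);
      if (v1 != v2) return false;                        // mismatch found
    }
  }
  for (; i < len; i++) {                                 // leftover tail
    if (a1[i] != a2[i]) return false;
  }
  return true;
}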