3834 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3835 __ orr(tmp1, tmp1, tmp3);
3836 __ cbnz(tmp1, NOT_EQUAL);
3837 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3838 __ eor(tmp5, tmp5, tmp6);
3839 __ eor(tmp7, tmp7, tmp8);
3840 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3841 __ orr(tmp5, tmp5, tmp7);
3842 __ cbnz(tmp5, NOT_EQUAL);
3843 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3844 __ eor(tmp1, tmp1, tmp2);
3845 __ eor(tmp3, tmp3, tmp4);
3846 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3847 __ orr(tmp1, tmp1, tmp3);
3848 __ cbnz(tmp1, NOT_EQUAL);
3849 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3850 __ eor(tmp5, tmp5, tmp6);
3851 __ sub(cnt1, cnt1, 8 * wordSize);
3852 __ eor(tmp7, tmp7, tmp8);
3853 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3854 // tmp6 is not used. MacroAssembler::subs is used here (rather than
3855 // cmp) because subs allows an unlimited range of immediate operand.
3856 __ subs(tmp6, cnt1, loopThreshold);
3857 __ orr(tmp5, tmp5, tmp7);
3858 __ cbnz(tmp5, NOT_EQUAL);
3859 __ br(__ GE, LOOP);
3860 // post-loop
3861 __ eor(tmp1, tmp1, tmp2);
3862 __ eor(tmp3, tmp3, tmp4);
3863 __ orr(tmp1, tmp1, tmp3);
3864 __ sub(cnt1, cnt1, 2 * wordSize);
3865 __ cbnz(tmp1, NOT_EQUAL);
3866 }
3867
3868 void generate_large_array_equals_loop_simd(int loopThreshold,
3869 bool usePrefetch, Label &NOT_EQUAL) {
3870 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3871 tmp2 = rscratch2;
3872 Label LOOP;
3873
3874 __ bind(LOOP);
3875 if (usePrefetch) {
3876 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3877 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3878 }
3879 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3880 __ sub(cnt1, cnt1, 8 * wordSize);
3881 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3882 __ subs(tmp1, cnt1, loopThreshold);
3883 __ eor(v0, __ T16B, v0, v4);
3884 __ eor(v1, __ T16B, v1, v5);
3885 __ eor(v2, __ T16B, v2, v6);
3886 __ eor(v3, __ T16B, v3, v7);
3887 __ orr(v0, __ T16B, v0, v1);
3888 __ orr(v1, __ T16B, v2, v3);
3889 __ orr(v0, __ T16B, v0, v1);
3890 __ umov(tmp1, v0, __ D, 0);
3891 __ umov(tmp2, v0, __ D, 1);
3892 __ orr(tmp1, tmp1, tmp2);
3893 __ cbnz(tmp1, NOT_EQUAL);
3894 __ br(__ GE, LOOP);
3895 }
3896
3897 // a1 = r1 - array1 address
3898 // a2 = r2 - array2 address
3899 // result = r0 - return value. Already contains "false"
3900 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3901 // r3-r5 are reserved temporary registers
3902 address generate_large_array_equals() {
3921 // also advance pointers to use post-increment instead of pre-increment
3922 __ add(a1, a1, wordSize);
3923 __ add(a2, a2, wordSize);
3924 if (AvoidUnalignedAccesses) {
3925 // both implementations (SIMD/nonSIMD) are using relatively large load
3926 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
3927 // on some CPUs in case the address is not at least 16-byte aligned.
3928 // Arrays are 8-byte aligned currently, so we can make an additional 8-byte
3929 // load if needed at least for the 1st address and make it 16-byte aligned.
3930 Label ALIGNED16;
3931 __ tbz(a1, 3, ALIGNED16);
3932 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3933 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3934 __ sub(cnt1, cnt1, wordSize);
3935 __ eor(tmp1, tmp1, tmp2);
3936 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3937 __ bind(ALIGNED16);
3938 }
3939 if (UseSIMDForArrayEquals) {
3940 if (SoftwarePrefetchHintDistance >= 0) {
3941 __ subs(tmp1, cnt1, prefetchLoopThreshold);
3942 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3943 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3944 /* prfm = */ true, NOT_EQUAL);
3945 __ cmp(cnt1, nonPrefetchLoopThreshold);
3946 __ br(__ LT, TAIL);
3947 }
3948 __ bind(NO_PREFETCH_LARGE_LOOP);
3949 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3950 /* prfm = */ false, NOT_EQUAL);
3951 } else {
3952 __ push(spilled_regs, sp);
3953 if (SoftwarePrefetchHintDistance >= 0) {
3954 __ subs(tmp1, cnt1, prefetchLoopThreshold);
3955 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3956 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3957 /* prfm = */ true, NOT_EQUAL);
3958 __ cmp(cnt1, nonPrefetchLoopThreshold);
3959 __ br(__ LT, TAIL);
3960 }
3961 __ bind(NO_PREFETCH_LARGE_LOOP);
3962 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3963 /* prfm = */ false, NOT_EQUAL);
3964 }
3965 __ bind(TAIL);
3966 __ cbz(cnt1, EQUAL);
3967 __ subs(cnt1, cnt1, wordSize);
3968 __ br(__ LE, POST_LOOP);
3969 __ bind(SMALL_LOOP);
3970 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3971 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3972 __ subs(cnt1, cnt1, wordSize);
3973 __ eor(tmp1, tmp1, tmp2);
3974 __ cbnz(tmp1, NOT_EQUAL);
4089 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4090 __ add(str2, str2, isLU ? wordSize : wordSize/2);
4091 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4092 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4093 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4094 __ eor(rscratch2, tmp1, tmp2);
4095 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4096 __ mov(rscratch1, tmp2);
4097 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4098 Register strU = isLU ? str2 : str1,
4099 strL = isLU ? str1 : str2,
4100 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4101 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4102 __ push(spilled_regs, sp);
4103 __ sub(tmp2, strL, cnt2); // strL pointer to load from
4104 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4105
4106 __ ldr(tmp3, Address(__ post(cnt1, 8)));
4107
4108 if (SoftwarePrefetchHintDistance >= 0) {
4109 __ cmp(cnt2, prefetchLoopExitCondition);
4110 __ br(__ LT, SMALL_LOOP);
4111 __ bind(LARGE_LOOP_PREFETCH);
4112 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4113 __ mov(tmp4, 2);
4114 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4115 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4116 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4117 __ subs(tmp4, tmp4, 1);
4118 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4119 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4120 __ mov(tmp4, 2);
4121 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4122 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4123 __ subs(tmp4, tmp4, 1);
4124 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4125 __ sub(cnt2, cnt2, 64);
4126 __ cmp(cnt2, prefetchLoopExitCondition);
4127 __ br(__ GE, LARGE_LOOP_PREFETCH);
4128 }
4129 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4130 __ subs(cnt2, cnt2, 16);
4131 __ br(__ LT, TAIL);
4132 __ b(SMALL_LOOP_ENTER);
4133 __ bind(SMALL_LOOP); // smaller loop
4134 __ subs(cnt2, cnt2, 16);
4135 __ bind(SMALL_LOOP_ENTER);
4136 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4137 __ br(__ GE, SMALL_LOOP);
4138 __ cbz(cnt2, LOAD_LAST);
4139 __ bind(TAIL); // 1..15 characters left
4140 __ cmp(cnt2, -8);
4141 __ br(__ GT, TAIL_LOAD_16);
4142 __ ldrd(vtmp, Address(tmp2));
4143 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4144
4145 __ ldr(tmpU, Address(__ post(cnt1, 8)));
4146 __ fmovd(tmpL, vtmp3);
4223 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4224 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4225 DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4226 // exit from large loop when less than 64 bytes left to read or we're about
4227 // to prefetch memory behind array border
4228 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4229 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4230 // update cnt2 counter with already loaded 8 bytes
4231 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4232 // update pointers, because of previous read
4233 __ add(str1, str1, wordSize);
4234 __ add(str2, str2, wordSize);
4235 if (SoftwarePrefetchHintDistance >= 0) {
4236 __ bind(LARGE_LOOP_PREFETCH);
4237 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4238 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4239 compare_string_16_bytes_same(DIFF, DIFF2);
4240 compare_string_16_bytes_same(DIFF, DIFF2);
4241 __ sub(cnt2, cnt2, isLL ? 64 : 32);
4242 compare_string_16_bytes_same(DIFF, DIFF2);
4243 __ cmp(cnt2, largeLoopExitCondition);
4244 compare_string_16_bytes_same(DIFF, DIFF2);
4245 __ br(__ GT, LARGE_LOOP_PREFETCH);
4246 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4247 // less than 16 bytes left?
4248 __ subs(cnt2, cnt2, isLL ? 16 : 8);
4249 __ br(__ LT, TAIL);
4250 }
4251 __ bind(SMALL_LOOP);
4252 compare_string_16_bytes_same(DIFF, DIFF2);
4253 __ subs(cnt2, cnt2, isLL ? 16 : 8);
4254 __ br(__ GE, SMALL_LOOP);
4255 __ bind(TAIL);
4256 __ adds(cnt2, cnt2, isLL ? 16 : 8);
4257 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4258 __ subs(cnt2, cnt2, isLL ? 8 : 4);
4259 __ br(__ LE, CHECK_LAST);
4260 __ eor(rscratch2, tmp1, tmp2);
4261 __ cbnz(rscratch2, DIFF);
4262 __ ldr(tmp1, Address(__ post(str1, 8)));
4263 __ ldr(tmp2, Address(__ post(str2, 8)));
4642 // R2 = len
4643 // R3 = len >> 3
4644 // V0 = 0
4645 // v1 = loaded 8 bytes
4646 address generate_large_byte_array_inflate() {
4647 __ align(CodeEntryAlignment);
4648 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4649 address entry = __ pc();
4650 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4651 Register src = r0, dst = r1, len = r2, octetCounter = r3;
4652 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4653
4654 // do one more 8-byte read to have address 16-byte aligned in most cases
4655 // also use single store instruction
4656 __ ldrd(v2, __ post(src, 8));
4657 __ sub(octetCounter, octetCounter, 2);
4658 __ zip1(v1, __ T16B, v1, v0);
4659 __ zip1(v2, __ T16B, v2, v0);
4660 __ st1(v1, v2, __ T16B, __ post(dst, 32));
4661 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4662 __ cmp(octetCounter, large_loop_threshold);
4663 __ br(__ LE, LOOP_START);
4664 __ b(LOOP_PRFM_START);
4665 __ bind(LOOP_PRFM);
4666 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4667 __ bind(LOOP_PRFM_START);
4668 __ prfm(Address(src, SoftwarePrefetchHintDistance));
4669 __ sub(octetCounter, octetCounter, 8);
4670 __ cmp(octetCounter, large_loop_threshold);
4671 inflate_and_store_2_fp_registers(true, v3, v4);
4672 inflate_and_store_2_fp_registers(true, v5, v6);
4673 __ br(__ GT, LOOP_PRFM);
4674 __ cmp(octetCounter, 8);
4675 __ br(__ LT, DONE);
4676 __ bind(LOOP);
4677 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4678 __ bind(LOOP_START);
4679 __ sub(octetCounter, octetCounter, 8);
4680 __ cmp(octetCounter, 8);
4681 inflate_and_store_2_fp_registers(false, v3, v4);
4682 inflate_and_store_2_fp_registers(false, v5, v6);
4683 __ br(__ GE, LOOP);
4684 __ bind(DONE);
4685 __ ret(lr);
4686 return entry;
4687 }
4688
4689 /**
4690 * Arguments:
|
3834 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3835 __ orr(tmp1, tmp1, tmp3);
3836 __ cbnz(tmp1, NOT_EQUAL);
3837 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3838 __ eor(tmp5, tmp5, tmp6);
3839 __ eor(tmp7, tmp7, tmp8);
3840 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3841 __ orr(tmp5, tmp5, tmp7);
3842 __ cbnz(tmp5, NOT_EQUAL);
3843 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3844 __ eor(tmp1, tmp1, tmp2);
3845 __ eor(tmp3, tmp3, tmp4);
3846 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3847 __ orr(tmp1, tmp1, tmp3);
3848 __ cbnz(tmp1, NOT_EQUAL);
3849 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3850 __ eor(tmp5, tmp5, tmp6);
3851 __ sub(cnt1, cnt1, 8 * wordSize);
3852 __ eor(tmp7, tmp7, tmp8);
3853 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3854 __ cmp(tmp6, cnt1, loopThreshold);
3855 __ orr(tmp5, tmp5, tmp7);
3856 __ cbnz(tmp5, NOT_EQUAL);
3857 __ br(__ GE, LOOP);
3858 // post-loop
3859 __ eor(tmp1, tmp1, tmp2);
3860 __ eor(tmp3, tmp3, tmp4);
3861 __ orr(tmp1, tmp1, tmp3);
3862 __ sub(cnt1, cnt1, 2 * wordSize);
3863 __ cbnz(tmp1, NOT_EQUAL);
3864 }
3865
3866 void generate_large_array_equals_loop_simd(int loopThreshold,
3867 bool usePrefetch, Label &NOT_EQUAL) {
3868 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3869 tmp2 = rscratch2;
3870 Label LOOP;
3871
3872 __ bind(LOOP);
3873 if (usePrefetch) {
3874 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3875 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3876 }
3877 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3878 __ sub(cnt1, cnt1, 8 * wordSize);
3879 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3880 __ cmp(tmp1, cnt1, loopThreshold);
3881 __ eor(v0, __ T16B, v0, v4);
3882 __ eor(v1, __ T16B, v1, v5);
3883 __ eor(v2, __ T16B, v2, v6);
3884 __ eor(v3, __ T16B, v3, v7);
3885 __ orr(v0, __ T16B, v0, v1);
3886 __ orr(v1, __ T16B, v2, v3);
3887 __ orr(v0, __ T16B, v0, v1);
3888 __ umov(tmp1, v0, __ D, 0);
3889 __ umov(tmp2, v0, __ D, 1);
3890 __ orr(tmp1, tmp1, tmp2);
3891 __ cbnz(tmp1, NOT_EQUAL);
3892 __ br(__ GE, LOOP);
3893 }
3894
3895 // a1 = r1 - array1 address
3896 // a2 = r2 - array2 address
3897 // result = r0 - return value. Already contains "false"
3898 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3899 // r3-r5 are reserved temporary registers
3900 address generate_large_array_equals() {
3919 // also advance pointers to use post-increment instead of pre-increment
3920 __ add(a1, a1, wordSize);
3921 __ add(a2, a2, wordSize);
3922 if (AvoidUnalignedAccesses) {
3923 // both implementations (SIMD/nonSIMD) are using relatively large load
3924 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
3925 // on some CPUs in case the address is not at least 16-byte aligned.
3926 // Arrays are 8-byte aligned currently, so we can make an additional 8-byte
3927 // load if needed at least for the 1st address and make it 16-byte aligned.
3928 Label ALIGNED16;
3929 __ tbz(a1, 3, ALIGNED16);
3930 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3931 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3932 __ sub(cnt1, cnt1, wordSize);
3933 __ eor(tmp1, tmp1, tmp2);
3934 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3935 __ bind(ALIGNED16);
3936 }
3937 if (UseSIMDForArrayEquals) {
3938 if (SoftwarePrefetchHintDistance >= 0) {
3939 __ cmp(tmp1, cnt1, prefetchLoopThreshold);
3940 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3941 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3942 /* prfm = */ true, NOT_EQUAL);
3943 __ cmp(cnt1, nonPrefetchLoopThreshold);
3944 __ br(__ LT, TAIL);
3945 }
3946 __ bind(NO_PREFETCH_LARGE_LOOP);
3947 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3948 /* prfm = */ false, NOT_EQUAL);
3949 } else {
3950 __ push(spilled_regs, sp);
3951 if (SoftwarePrefetchHintDistance >= 0) {
3952 __ cmp(tmp1, cnt1, prefetchLoopThreshold);
3953 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3954 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3955 /* prfm = */ true, NOT_EQUAL);
3956 __ cmp(cnt1, nonPrefetchLoopThreshold);
3957 __ br(__ LT, TAIL);
3958 }
3959 __ bind(NO_PREFETCH_LARGE_LOOP);
3960 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3961 /* prfm = */ false, NOT_EQUAL);
3962 }
3963 __ bind(TAIL);
3964 __ cbz(cnt1, EQUAL);
3965 __ subs(cnt1, cnt1, wordSize);
3966 __ br(__ LE, POST_LOOP);
3967 __ bind(SMALL_LOOP);
3968 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3969 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3970 __ subs(cnt1, cnt1, wordSize);
3971 __ eor(tmp1, tmp1, tmp2);
3972 __ cbnz(tmp1, NOT_EQUAL);
4087 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4088 __ add(str2, str2, isLU ? wordSize : wordSize/2);
4089 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4090 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4091 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4092 __ eor(rscratch2, tmp1, tmp2);
4093 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4094 __ mov(rscratch1, tmp2);
4095 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4096 Register strU = isLU ? str2 : str1,
4097 strL = isLU ? str1 : str2,
4098 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4099 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4100 __ push(spilled_regs, sp);
4101 __ sub(tmp2, strL, cnt2); // strL pointer to load from
4102 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4103
4104 __ ldr(tmp3, Address(__ post(cnt1, 8)));
4105
4106 if (SoftwarePrefetchHintDistance >= 0) {
4107 __ cmp(rscratch2, cnt2, prefetchLoopExitCondition);
4108 __ br(__ LT, SMALL_LOOP);
4109 __ bind(LARGE_LOOP_PREFETCH);
4110 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4111 __ mov(tmp4, 2);
4112 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4113 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4114 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4115 __ subs(tmp4, tmp4, 1);
4116 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4117 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4118 __ mov(tmp4, 2);
4119 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4120 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4121 __ subs(tmp4, tmp4, 1);
4122 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4123 __ sub(cnt2, cnt2, 64);
4124 __ cmp(rscratch2, cnt2, prefetchLoopExitCondition);
4125 __ br(__ GE, LARGE_LOOP_PREFETCH);
4126 }
4127 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4128 __ subs(cnt2, cnt2, 16);
4129 __ br(__ LT, TAIL);
4130 __ b(SMALL_LOOP_ENTER);
4131 __ bind(SMALL_LOOP); // smaller loop
4132 __ subs(cnt2, cnt2, 16);
4133 __ bind(SMALL_LOOP_ENTER);
4134 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4135 __ br(__ GE, SMALL_LOOP);
4136 __ cbz(cnt2, LOAD_LAST);
4137 __ bind(TAIL); // 1..15 characters left
4138 __ cmp(cnt2, -8);
4139 __ br(__ GT, TAIL_LOAD_16);
4140 __ ldrd(vtmp, Address(tmp2));
4141 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4142
4143 __ ldr(tmpU, Address(__ post(cnt1, 8)));
4144 __ fmovd(tmpL, vtmp3);
4221 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4222 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4223 DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4224 // exit from large loop when less than 64 bytes left to read or we're about
4225 // to prefetch memory behind array border
4226 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4227 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4228 // update cnt2 counter with already loaded 8 bytes
4229 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4230 // update pointers, because of previous read
4231 __ add(str1, str1, wordSize);
4232 __ add(str2, str2, wordSize);
4233 if (SoftwarePrefetchHintDistance >= 0) {
4234 __ bind(LARGE_LOOP_PREFETCH);
4235 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4236 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4237 compare_string_16_bytes_same(DIFF, DIFF2);
4238 compare_string_16_bytes_same(DIFF, DIFF2);
4239 __ sub(cnt2, cnt2, isLL ? 64 : 32);
4240 compare_string_16_bytes_same(DIFF, DIFF2);
4241 __ cmp(rscratch2, cnt2, largeLoopExitCondition);
4242 compare_string_16_bytes_same(DIFF, DIFF2);
4243 __ br(__ GT, LARGE_LOOP_PREFETCH);
4244 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4245 // less than 16 bytes left?
4246 __ subs(cnt2, cnt2, isLL ? 16 : 8);
4247 __ br(__ LT, TAIL);
4248 }
4249 __ bind(SMALL_LOOP);
4250 compare_string_16_bytes_same(DIFF, DIFF2);
4251 __ subs(cnt2, cnt2, isLL ? 16 : 8);
4252 __ br(__ GE, SMALL_LOOP);
4253 __ bind(TAIL);
4254 __ adds(cnt2, cnt2, isLL ? 16 : 8);
4255 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4256 __ subs(cnt2, cnt2, isLL ? 8 : 4);
4257 __ br(__ LE, CHECK_LAST);
4258 __ eor(rscratch2, tmp1, tmp2);
4259 __ cbnz(rscratch2, DIFF);
4260 __ ldr(tmp1, Address(__ post(str1, 8)));
4261 __ ldr(tmp2, Address(__ post(str2, 8)));
4640 // R2 = len
4641 // R3 = len >> 3
4642 // V0 = 0
4643 // v1 = loaded 8 bytes
4644 address generate_large_byte_array_inflate() {
4645 __ align(CodeEntryAlignment);
4646 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4647 address entry = __ pc();
4648 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4649 Register src = r0, dst = r1, len = r2, octetCounter = r3;
4650 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4651
4652 // do one more 8-byte read to have address 16-byte aligned in most cases
4653 // also use single store instruction
4654 __ ldrd(v2, __ post(src, 8));
4655 __ sub(octetCounter, octetCounter, 2);
4656 __ zip1(v1, __ T16B, v1, v0);
4657 __ zip1(v2, __ T16B, v2, v0);
4658 __ st1(v1, v2, __ T16B, __ post(dst, 32));
4659 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4660 __ cmp(rscratch1, octetCounter, large_loop_threshold);
4661 __ br(__ LE, LOOP_START);
4662 __ b(LOOP_PRFM_START);
4663 __ bind(LOOP_PRFM);
4664 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4665 __ bind(LOOP_PRFM_START);
4666 __ prfm(Address(src, SoftwarePrefetchHintDistance));
4667 __ sub(octetCounter, octetCounter, 8);
4668 __ cmp(rscratch1, octetCounter, large_loop_threshold);
4669 inflate_and_store_2_fp_registers(true, v3, v4);
4670 inflate_and_store_2_fp_registers(true, v5, v6);
4671 __ br(__ GT, LOOP_PRFM);
4672 __ cmp(octetCounter, 8);
4673 __ br(__ LT, DONE);
4674 __ bind(LOOP);
4675 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4676 __ bind(LOOP_START);
4677 __ sub(octetCounter, octetCounter, 8);
4678 __ cmp(octetCounter, 8);
4679 inflate_and_store_2_fp_registers(false, v3, v4);
4680 inflate_and_store_2_fp_registers(false, v5, v6);
4681 __ br(__ GE, LOOP);
4682 __ bind(DONE);
4683 __ ret(lr);
4684 return entry;
4685 }
4686
4687 /**
4688 * Arguments:
|