< prev index next >

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Print this page




3834     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3835     __ orr(tmp1, tmp1, tmp3);
3836     __ cbnz(tmp1, NOT_EQUAL);
3837     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3838     __ eor(tmp5, tmp5, tmp6);
3839     __ eor(tmp7, tmp7, tmp8);
3840     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3841     __ orr(tmp5, tmp5, tmp7);
3842     __ cbnz(tmp5, NOT_EQUAL);
3843     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3844     __ eor(tmp1, tmp1, tmp2);
3845     __ eor(tmp3, tmp3, tmp4);
3846     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3847     __ orr(tmp1, tmp1, tmp3);
3848     __ cbnz(tmp1, NOT_EQUAL);
3849     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3850     __ eor(tmp5, tmp5, tmp6);
3851     __ sub(cnt1, cnt1, 8 * wordSize);
3852     __ eor(tmp7, tmp7, tmp8);
3853     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3854     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3855     // cmp) because subs allows an unlimited range of immediate operand.
3856     __ subs(tmp6, cnt1, loopThreshold);
3857     __ orr(tmp5, tmp5, tmp7);
3858     __ cbnz(tmp5, NOT_EQUAL);
3859     __ br(__ GE, LOOP);
3860     // post-loop
3861     __ eor(tmp1, tmp1, tmp2);
3862     __ eor(tmp3, tmp3, tmp4);
3863     __ orr(tmp1, tmp1, tmp3);
3864     __ sub(cnt1, cnt1, 2 * wordSize);
3865     __ cbnz(tmp1, NOT_EQUAL);
3866   }
3867 
// Emits the SIMD flavour of the large-array-equals inner loop.
// Per iteration: load 64 bytes from each array (a1/a2, post-incremented),
// XOR the corresponding 16-byte registers and OR all differences down into
// v0; any non-zero bit means a mismatch -> branch to NOT_EQUAL.
// Loops while cnt1 (bytes left to compare) >= loopThreshold.
3868   void generate_large_array_equals_loop_simd(int loopThreshold,
3869         bool usePrefetch, Label &NOT_EQUAL) {
3870     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3871         tmp2 = rscratch2;
3872     Label LOOP;
3873 
3874     __ bind(LOOP);
3875     if (usePrefetch) {
       // hint-prefetch data SoftwarePrefetchHintDistance bytes ahead of both arrays
3876       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3877       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3878     }
3879     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3880     __ sub(cnt1, cnt1, 8 * wordSize);
3881     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
       // subs (rather than cmp) allows an unlimited range of immediate operand
       // (same rationale as in the non-SIMD loop above); the tmp1 result is
       // discarded — only the flags are consumed, by br(GE, LOOP) below.
3882     __ subs(tmp1, cnt1, loopThreshold);
3883     __ eor(v0, __ T16B, v0, v4);
3884     __ eor(v1, __ T16B, v1, v5);
3885     __ eor(v2, __ T16B, v2, v6);
3886     __ eor(v3, __ T16B, v3, v7);
       // reduce the four difference vectors into v0
3887     __ orr(v0, __ T16B, v0, v1);
3888     __ orr(v1, __ T16B, v2, v3);
3889     __ orr(v0, __ T16B, v0, v1);
       // move both 64-bit halves of v0 to GPRs to test for any set bit
3890     __ umov(tmp1, v0, __ D, 0);
3891     __ umov(tmp2, v0, __ D, 1);
3892     __ orr(tmp1, tmp1, tmp2);
3893     __ cbnz(tmp1, NOT_EQUAL);
       // flags from the subs above are still live (SIMD ops, umov, orr and
       // cbnz do not set flags)
3894     __ br(__ GE, LOOP);
3895   }
3896 
3897   // a1 = r1 - array1 address
3898   // a2 = r2 - array2 address
3899   // result = r0 - return value. Already contains "false"
3900   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3901   // r3-r5 are reserved temporary registers
3902   address generate_large_array_equals() {


3921     // also advance pointers to use post-increment instead of pre-increment
3922     __ add(a1, a1, wordSize);
3923     __ add(a2, a2, wordSize);
3924     if (AvoidUnalignedAccesses) {
3925       // both implementations (SIMD/nonSIMD) are using relatively large load
3926       // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
3927       // on some CPUs in case of address is not at least 16-byte aligned.
3928       // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
3929       // load if needed at least for 1st address and make if 16-byte aligned.
3930       Label ALIGNED16;
3931       __ tbz(a1, 3, ALIGNED16);
3932       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3933       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3934       __ sub(cnt1, cnt1, wordSize);
3935       __ eor(tmp1, tmp1, tmp2);
3936       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3937       __ bind(ALIGNED16);
3938     }
3939     if (UseSIMDForArrayEquals) {
3940       if (SoftwarePrefetchHintDistance >= 0) {
3941         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3942         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3943         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3944             /* prfm = */ true, NOT_EQUAL);
3945         __ cmp(cnt1, nonPrefetchLoopThreshold);
3946         __ br(__ LT, TAIL);
3947       }
3948       __ bind(NO_PREFETCH_LARGE_LOOP);
3949       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3950           /* prfm = */ false, NOT_EQUAL);
3951     } else {
3952       __ push(spilled_regs, sp);
3953       if (SoftwarePrefetchHintDistance >= 0) {
3954         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3955         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3956         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3957             /* prfm = */ true, NOT_EQUAL);
3958         __ cmp(cnt1, nonPrefetchLoopThreshold);
3959         __ br(__ LT, TAIL);
3960       }
3961       __ bind(NO_PREFETCH_LARGE_LOOP);
3962       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3963           /* prfm = */ false, NOT_EQUAL);
3964     }
3965     __ bind(TAIL);
3966       __ cbz(cnt1, EQUAL);
3967       __ subs(cnt1, cnt1, wordSize);
3968       __ br(__ LE, POST_LOOP);
3969     __ bind(SMALL_LOOP);
3970       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3971       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3972       __ subs(cnt1, cnt1, wordSize);
3973       __ eor(tmp1, tmp1, tmp2);
3974       __ cbnz(tmp1, NOT_EQUAL);


4089     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4090     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4091     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4092     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4093     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4094     __ eor(rscratch2, tmp1, tmp2);
4095     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4096     __ mov(rscratch1, tmp2);
4097     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4098     Register strU = isLU ? str2 : str1,
4099              strL = isLU ? str1 : str2,
4100              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4101              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4102     __ push(spilled_regs, sp);
4103     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4104     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4105 
4106     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4107 
4108     if (SoftwarePrefetchHintDistance >= 0) {
4109       __ cmp(cnt2, prefetchLoopExitCondition);
4110       __ br(__ LT, SMALL_LOOP);
4111       __ bind(LARGE_LOOP_PREFETCH);
4112         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4113         __ mov(tmp4, 2);
4114         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4115         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4116           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4117           __ subs(tmp4, tmp4, 1);
4118           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4119           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4120           __ mov(tmp4, 2);
4121         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4122           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4123           __ subs(tmp4, tmp4, 1);
4124           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4125           __ sub(cnt2, cnt2, 64);
4126           __ cmp(cnt2, prefetchLoopExitCondition);
4127           __ br(__ GE, LARGE_LOOP_PREFETCH);
4128     }
4129     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4130     __ subs(cnt2, cnt2, 16);
4131     __ br(__ LT, TAIL);
4132     __ b(SMALL_LOOP_ENTER);
4133     __ bind(SMALL_LOOP); // smaller loop
4134       __ subs(cnt2, cnt2, 16);
4135     __ bind(SMALL_LOOP_ENTER);
4136       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4137       __ br(__ GE, SMALL_LOOP);
4138       __ cbz(cnt2, LOAD_LAST);
4139     __ bind(TAIL); // 1..15 characters left
4140       __ cmp(cnt2, -8);
4141       __ br(__ GT, TAIL_LOAD_16);
4142       __ ldrd(vtmp, Address(tmp2));
4143       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4144 
4145       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4146       __ fmovd(tmpL, vtmp3);


4223     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4224         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4225         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4226     // exit from large loop when less than 64 bytes left to read or we're about
4227     // to prefetch memory behind array border
4228     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4229     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4230     // update cnt2 counter with already loaded 8 bytes
4231     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4232     // update pointers, because of previous read
4233     __ add(str1, str1, wordSize);
4234     __ add(str2, str2, wordSize);
4235     if (SoftwarePrefetchHintDistance >= 0) {
4236       __ bind(LARGE_LOOP_PREFETCH);
4237         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4238         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4239         compare_string_16_bytes_same(DIFF, DIFF2);
4240         compare_string_16_bytes_same(DIFF, DIFF2);
4241         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4242         compare_string_16_bytes_same(DIFF, DIFF2);
4243         __ cmp(cnt2, largeLoopExitCondition);
4244         compare_string_16_bytes_same(DIFF, DIFF2);
4245         __ br(__ GT, LARGE_LOOP_PREFETCH);
4246         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4247         // less than 16 bytes left?
4248         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4249         __ br(__ LT, TAIL);
4250     }
4251     __ bind(SMALL_LOOP);
4252       compare_string_16_bytes_same(DIFF, DIFF2);
4253       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4254       __ br(__ GE, SMALL_LOOP);
4255     __ bind(TAIL);
4256       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4257       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4258       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4259       __ br(__ LE, CHECK_LAST);
4260       __ eor(rscratch2, tmp1, tmp2);
4261       __ cbnz(rscratch2, DIFF);
4262       __ ldr(tmp1, Address(__ post(str1, 8)));
4263       __ ldr(tmp2, Address(__ post(str2, 8)));


4642   // R2 = len
4643   // R3 = len >> 3
4644   // V0 = 0
4645   // v1 = loaded 8 bytes
// Generates StubRoutines stub that inflates a byte array into a char array
// by zero-extending each byte. Entry state is documented above: R0 = src,
// R1 = dst, R2 = len, R3 = len >> 3 (octet counter), V0 = 0, v1 holds the
// first 8 already-loaded bytes. zip1 with the all-zero v0 interleaves data
// bytes with zero bytes, i.e. 8 bytes -> 8 little-endian chars.
4646   address generate_large_byte_array_inflate() {
4647     __ align(CodeEntryAlignment);
4648     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4649     address entry = __ pc();
4650     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4651     Register src = r0, dst = r1, len = r2, octetCounter = r3;
       // NOTE(review): this threshold scales with SoftwarePrefetchHintDistance
       // and may exceed the range cmp() accepts as an immediate — verify the
       // cmp uses below for large hint distances.
4652     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4653 
4654     // do one more 8-byte read to have address 16-byte aligned in most cases
4655     // also use single store instruction
4656     __ ldrd(v2, __ post(src, 8));
       // two octets consumed: v1 (pre-loaded by caller) and v2 (just loaded)
4657     __ sub(octetCounter, octetCounter, 2);
4658     __ zip1(v1, __ T16B, v1, v0);
4659     __ zip1(v2, __ T16B, v2, v0);
4660     __ st1(v1, v2, __ T16B, __ post(dst, 32));
       // pre-load the next 64 bytes so both loops can be entered at their
       // *_START labels, skipping the reload on the first iteration
4661     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4662     __ cmp(octetCounter, large_loop_threshold);
4663     __ br(__ LE, LOOP_START);
4664     __ b(LOOP_PRFM_START);
       // prefetching main loop: inflates 64 src bytes (8 octets) per iteration
4665     __ bind(LOOP_PRFM);
4666       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4667     __ bind(LOOP_PRFM_START);
4668       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4669       __ sub(octetCounter, octetCounter, 8);
4670       __ cmp(octetCounter, large_loop_threshold);
4671       inflate_and_store_2_fp_registers(true, v3, v4);
4672       inflate_and_store_2_fp_registers(true, v5, v6);
4673       __ br(__ GT, LOOP_PRFM);
       // fewer than 8 octets left? then done; otherwise fall into plain loop
4674       __ cmp(octetCounter, 8);
4675       __ br(__ LT, DONE);
       // non-prefetching loop for the remaining data, 64 bytes per iteration
4676     __ bind(LOOP);
4677       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4678       __ bind(LOOP_START);
4679       __ sub(octetCounter, octetCounter, 8);
4680       __ cmp(octetCounter, 8);
4681       inflate_and_store_2_fp_registers(false, v3, v4);
4682       inflate_and_store_2_fp_registers(false, v5, v6);
4683       __ br(__ GE, LOOP);
       // NOTE(review): any tail of < 8 octets is presumably handled by the
       // caller of this stub — confirm against the call site.
4684     __ bind(DONE);
4685       __ ret(lr);
4686     return entry;
4687   }
4688 
4689   /**
4690    *  Arguments:




3834     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3835     __ orr(tmp1, tmp1, tmp3);
3836     __ cbnz(tmp1, NOT_EQUAL);
3837     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3838     __ eor(tmp5, tmp5, tmp6);
3839     __ eor(tmp7, tmp7, tmp8);
3840     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3841     __ orr(tmp5, tmp5, tmp7);
3842     __ cbnz(tmp5, NOT_EQUAL);
3843     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3844     __ eor(tmp1, tmp1, tmp2);
3845     __ eor(tmp3, tmp3, tmp4);
3846     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3847     __ orr(tmp1, tmp1, tmp3);
3848     __ cbnz(tmp1, NOT_EQUAL);
3849     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3850     __ eor(tmp5, tmp5, tmp6);
3851     __ sub(cnt1, cnt1, 8 * wordSize);
3852     __ eor(tmp7, tmp7, tmp8);
3853     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3854     __ cmp(tmp6, cnt1, loopThreshold);


3855     __ orr(tmp5, tmp5, tmp7);
3856     __ cbnz(tmp5, NOT_EQUAL);
3857     __ br(__ GE, LOOP);
3858     // post-loop
3859     __ eor(tmp1, tmp1, tmp2);
3860     __ eor(tmp3, tmp3, tmp4);
3861     __ orr(tmp1, tmp1, tmp3);
3862     __ sub(cnt1, cnt1, 2 * wordSize);
3863     __ cbnz(tmp1, NOT_EQUAL);
3864   }
3865 
// Emits the SIMD flavour of the large-array-equals inner loop.
// Per iteration: load 64 bytes from each array (a1/a2, post-incremented),
// XOR the corresponding 16-byte registers and OR all differences down into
// v0; any non-zero bit means a mismatch -> branch to NOT_EQUAL.
// Loops while cnt1 (bytes left to compare) >= loopThreshold.
3866   void generate_large_array_equals_loop_simd(int loopThreshold,
3867         bool usePrefetch, Label &NOT_EQUAL) {
3868     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3869         tmp2 = rscratch2;
3870     Label LOOP;
3871 
3872     __ bind(LOOP);
3873     if (usePrefetch) {
       // hint-prefetch data SoftwarePrefetchHintDistance bytes ahead of both arrays
3874       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3875       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3876     }
3877     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3878     __ sub(cnt1, cnt1, 8 * wordSize);
3879     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
       // three-operand cmp: tmp1 is a scratch register so loopThreshold is
       // not limited to the plain cmp immediate range; only the flags are
       // consumed, by br(GE, LOOP) below.
3880     __ cmp(tmp1, cnt1, loopThreshold);
3881     __ eor(v0, __ T16B, v0, v4);
3882     __ eor(v1, __ T16B, v1, v5);
3883     __ eor(v2, __ T16B, v2, v6);
3884     __ eor(v3, __ T16B, v3, v7);
       // reduce the four difference vectors into v0
3885     __ orr(v0, __ T16B, v0, v1);
3886     __ orr(v1, __ T16B, v2, v3);
3887     __ orr(v0, __ T16B, v0, v1);
       // move both 64-bit halves of v0 to GPRs to test for any set bit
3888     __ umov(tmp1, v0, __ D, 0);
3889     __ umov(tmp2, v0, __ D, 1);
3890     __ orr(tmp1, tmp1, tmp2);
3891     __ cbnz(tmp1, NOT_EQUAL);
       // flags from the cmp above are still live (SIMD ops, umov, orr and
       // cbnz do not set flags)
3892     __ br(__ GE, LOOP);
3893   }
3894 
3895   // a1 = r1 - array1 address
3896   // a2 = r2 - array2 address
3897   // result = r0 - return value. Already contains "false"
3898   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3899   // r3-r5 are reserved temporary registers
3900   address generate_large_array_equals() {


3919     // also advance pointers to use post-increment instead of pre-increment
3920     __ add(a1, a1, wordSize);
3921     __ add(a2, a2, wordSize);
3922     if (AvoidUnalignedAccesses) {
3923       // both implementations (SIMD/nonSIMD) are using relatively large load
3924       // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
3925       // on some CPUs in case of address is not at least 16-byte aligned.
3926       // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
3927       // load if needed at least for 1st address and make if 16-byte aligned.
3928       Label ALIGNED16;
3929       __ tbz(a1, 3, ALIGNED16);
3930       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3931       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3932       __ sub(cnt1, cnt1, wordSize);
3933       __ eor(tmp1, tmp1, tmp2);
3934       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3935       __ bind(ALIGNED16);
3936     }
3937     if (UseSIMDForArrayEquals) {
3938       if (SoftwarePrefetchHintDistance >= 0) {
3939         __ cmp(tmp1, cnt1, prefetchLoopThreshold);
3940         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3941         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3942             /* prfm = */ true, NOT_EQUAL);
3943         __ cmp(cnt1, nonPrefetchLoopThreshold);
3944         __ br(__ LT, TAIL);
3945       }
3946       __ bind(NO_PREFETCH_LARGE_LOOP);
3947       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3948           /* prfm = */ false, NOT_EQUAL);
3949     } else {
3950       __ push(spilled_regs, sp);
3951       if (SoftwarePrefetchHintDistance >= 0) {
3952         __ cmp(tmp1, cnt1, prefetchLoopThreshold);
3953         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3954         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3955             /* prfm = */ true, NOT_EQUAL);
3956         __ cmp(cnt1, nonPrefetchLoopThreshold);
3957         __ br(__ LT, TAIL);
3958       }
3959       __ bind(NO_PREFETCH_LARGE_LOOP);
3960       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3961           /* prfm = */ false, NOT_EQUAL);
3962     }
3963     __ bind(TAIL);
3964       __ cbz(cnt1, EQUAL);
3965       __ subs(cnt1, cnt1, wordSize);
3966       __ br(__ LE, POST_LOOP);
3967     __ bind(SMALL_LOOP);
3968       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3969       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3970       __ subs(cnt1, cnt1, wordSize);
3971       __ eor(tmp1, tmp1, tmp2);
3972       __ cbnz(tmp1, NOT_EQUAL);


4087     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4088     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4089     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4090     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4091     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4092     __ eor(rscratch2, tmp1, tmp2);
4093     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4094     __ mov(rscratch1, tmp2);
4095     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4096     Register strU = isLU ? str2 : str1,
4097              strL = isLU ? str1 : str2,
4098              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4099              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4100     __ push(spilled_regs, sp);
4101     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4102     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4103 
4104     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4105 
4106     if (SoftwarePrefetchHintDistance >= 0) {
4107       __ cmp(rscratch2, cnt2, prefetchLoopExitCondition);
4108       __ br(__ LT, SMALL_LOOP);
4109       __ bind(LARGE_LOOP_PREFETCH);
4110         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4111         __ mov(tmp4, 2);
4112         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4113         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4114           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4115           __ subs(tmp4, tmp4, 1);
4116           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4117           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4118           __ mov(tmp4, 2);
4119         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4120           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4121           __ subs(tmp4, tmp4, 1);
4122           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4123           __ sub(cnt2, cnt2, 64);
4124           __ cmp(rscratch2, cnt2, prefetchLoopExitCondition); 
4125           __ br(__ GE, LARGE_LOOP_PREFETCH);
4126     }
4127     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4128     __ subs(cnt2, cnt2, 16);
4129     __ br(__ LT, TAIL);
4130     __ b(SMALL_LOOP_ENTER);
4131     __ bind(SMALL_LOOP); // smaller loop
4132       __ subs(cnt2, cnt2, 16);
4133     __ bind(SMALL_LOOP_ENTER);
4134       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4135       __ br(__ GE, SMALL_LOOP);
4136       __ cbz(cnt2, LOAD_LAST);
4137     __ bind(TAIL); // 1..15 characters left
4138       __ cmp(cnt2, -8);
4139       __ br(__ GT, TAIL_LOAD_16);
4140       __ ldrd(vtmp, Address(tmp2));
4141       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4142 
4143       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4144       __ fmovd(tmpL, vtmp3);


4221     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4222         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4223         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4224     // exit from large loop when less than 64 bytes left to read or we're about
4225     // to prefetch memory behind array border
4226     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4227     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4228     // update cnt2 counter with already loaded 8 bytes
4229     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4230     // update pointers, because of previous read
4231     __ add(str1, str1, wordSize);
4232     __ add(str2, str2, wordSize);
4233     if (SoftwarePrefetchHintDistance >= 0) {
4234       __ bind(LARGE_LOOP_PREFETCH);
4235         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4236         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4237         compare_string_16_bytes_same(DIFF, DIFF2);
4238         compare_string_16_bytes_same(DIFF, DIFF2);
4239         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4240         compare_string_16_bytes_same(DIFF, DIFF2);
4241         __ cmp(rscratch2, cnt2, largeLoopExitCondition);
4242         compare_string_16_bytes_same(DIFF, DIFF2);
4243         __ br(__ GT, LARGE_LOOP_PREFETCH);
4244         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4245         // less than 16 bytes left?
4246         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4247         __ br(__ LT, TAIL);
4248     }
4249     __ bind(SMALL_LOOP);
4250       compare_string_16_bytes_same(DIFF, DIFF2);
4251       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4252       __ br(__ GE, SMALL_LOOP);
4253     __ bind(TAIL);
4254       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4255       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4256       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4257       __ br(__ LE, CHECK_LAST);
4258       __ eor(rscratch2, tmp1, tmp2);
4259       __ cbnz(rscratch2, DIFF);
4260       __ ldr(tmp1, Address(__ post(str1, 8)));
4261       __ ldr(tmp2, Address(__ post(str2, 8)));


4640   // R2 = len
4641   // R3 = len >> 3
4642   // V0 = 0
4643   // v1 = loaded 8 bytes
// Generates StubRoutines stub that inflates a byte array into a char array
// by zero-extending each byte. Entry state is documented above: R0 = src,
// R1 = dst, R2 = len, R3 = len >> 3 (octet counter), V0 = 0, v1 holds the
// first 8 already-loaded bytes. zip1 with the all-zero v0 interleaves data
// bytes with zero bytes, i.e. 8 bytes -> 8 little-endian chars.
4644   address generate_large_byte_array_inflate() {
4645     __ align(CodeEntryAlignment);
4646     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4647     address entry = __ pc();
4648     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4649     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4650     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4651 
4652     // do one more 8-byte read to have address 16-byte aligned in most cases
4653     // also use single store instruction
4654     __ ldrd(v2, __ post(src, 8));
       // two octets consumed: v1 (pre-loaded by caller) and v2 (just loaded)
4655     __ sub(octetCounter, octetCounter, 2);
4656     __ zip1(v1, __ T16B, v1, v0);
4657     __ zip1(v2, __ T16B, v2, v0);
4658     __ st1(v1, v2, __ T16B, __ post(dst, 32));
       // pre-load the next 64 bytes so both loops can be entered at their
       // *_START labels, skipping the reload on the first iteration
4659     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
       // three-operand cmp: rscratch1 is a scratch register so the comparison
       // works for thresholds beyond the plain cmp immediate range
4660     __ cmp(rscratch1, octetCounter, large_loop_threshold);
4661     __ br(__ LE, LOOP_START);
4662     __ b(LOOP_PRFM_START);
       // prefetching main loop: inflates 64 src bytes (8 octets) per iteration
4663     __ bind(LOOP_PRFM);
4664       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4665     __ bind(LOOP_PRFM_START);
4666       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4667       __ sub(octetCounter, octetCounter, 8);
4668       __ cmp(rscratch1, octetCounter, large_loop_threshold);
4669       inflate_and_store_2_fp_registers(true, v3, v4);
4670       inflate_and_store_2_fp_registers(true, v5, v6);
4671       __ br(__ GT, LOOP_PRFM);
       // fewer than 8 octets left? then done; otherwise fall into plain loop
4672       __ cmp(octetCounter, 8);
4673       __ br(__ LT, DONE);
       // non-prefetching loop for the remaining data, 64 bytes per iteration
4674     __ bind(LOOP);
4675       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4676       __ bind(LOOP_START);
4677       __ sub(octetCounter, octetCounter, 8);
4678       __ cmp(octetCounter, 8);
4679       inflate_and_store_2_fp_registers(false, v3, v4);
4680       inflate_and_store_2_fp_registers(false, v5, v6);
4681       __ br(__ GE, LOOP);
       // NOTE(review): any tail of < 8 octets is presumably handled by the
       // caller of this stub — confirm against the call site.
4682     __ bind(DONE);
4683       __ ret(lr);
4684     return entry;
4685   }
4686 
4687   /**
4688    *  Arguments:


< prev index next >