    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words(). This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it. The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  //   r10: the HeapWord-aligned base address of an array to zero.
  //   r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  //   r10: the base address of the tail of words left to clear.
  //   r11: the number of words in the tail.
  //        r11 < MacroAssembler::zero_words_block_size.

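  // A rough sketch of the contract in C-like pseudocode (illustrative
  // only; this is not the generated code):
  //
  //   // on entry: base = r10, cnt = r11, cnt > 0
  //   while (cnt >= zero_words_block_size) {
  //     zero one block at base, using DC ZVA where profitable;
  //     base += block; cnt -= block;
  //   }
  //   // on exit: r10/r11 describe the tail still to be cleared,
  //   //          r11 < zero_words_block_size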
  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure the ZVA length is a multiple of 16; the subsequent
      // operations rely on this.
      assert(zva_length % 16 == 0, "Unexpected ZVA Length");

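      // Bit 3 of the address is clear iff the (already 8-byte-aligned)
      // base is 16-byte aligned. If it is not, zero one word by hand
      // and adjust the count before entering the block loop.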
      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // ... (intervening code elided) ...

      __ add(d, d, 8);
    }

  }

    __ ret(lr);
  }
}
  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

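  // Equivalent C sketch for byte granularity (illustrative only):
  //
  //   if (count & 8) { copy 8 bytes; }
  //   if (count & 4) { copy 4 bytes; }
  //   if (count & 2) { copy 2 bytes; }
  //   if (count & 1) { copy 1 byte;  }
  //
  // For wider elements the same chunks are selected by correspondingly
  // lower bits of count, which is expressed in element units.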
  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.

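    // count is in units of 'granularity' bytes, so bit
    // (3 - log2(granularity)) is set iff an 8-byte chunk remains, bit
    // (2 - log2(granularity)) iff a 4-byte chunk remains, and so on.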
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      // ... (intervening code elided) ...
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d. The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy. If is_aligned is false, we align the source address.
  //

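  // For example, a forward byte copy passes step == 1 and a backward,
  // overlap-safe byte copy passes step == -1; element copies pass the
  // element size, with the sign giving the direction.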
  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // Copies of <= 96 bytes are done inline. Direction doesn't matter,
    // because we always load all the data before writing any of it.
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
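    // Size-class dispatch (counts are scaled by the unit size): more
    // than 96 bytes (80 without SIMD) goes to the out-of-line bulk
    // loop; otherwise <= 16 bytes branches to copy16, more than 64 to
    // copy80, <= 32 to copy32, and the remaining 33..64-byte cases
    // fall through to the code below (elided here).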
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16 / granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64 / granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32 / granularity));
    __ br(Assembler::LS, copy32);

    // ... (intervening code elided) ...

  //
  // Generate generic array copy stubs
  //
  // Input:
  //   c_rarg0 - src oop
  //   c_rarg1 - src_pos (32-bits)
  //   c_rarg2 - dst oop
  //   c_rarg3 - dst_pos (32-bits)
  //   c_rarg4 - element count (32-bits)
  //
  // Output:
  //   r0 ==  0  - success
  //   r0 == -1^K - failure, where K is the partial transfer count
  //                (i.e. r0 is the bitwise complement of K)
  //
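  // A caller would decode the result roughly like this (a sketch, not
  // code from this file):
  //
  //   intptr_t r = call_generic_copy_stub(...);
  //   if (r == 0) { /* all elements copied */ }
  //   else        { size_t copied = ~(size_t)r; /* partial transfer */ }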
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src     = c_rarg0;  // source array oop
    const Register src_pos = c_rarg1;  // source position
    const Register dst     = c_rarg2;  // destination array oop
    const Register dst_pos = c_rarg3;  // destination position
    const Register length  = c_rarg4;

    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------

    // ... (intervening code elided) ...

                         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
                         : "indexof_linear_uu";
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stubName);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ? 0 : 1;
    bool isL = str1_isL && str2_isL;
    // parameters
    Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
    // temporary registers
    Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
    RegSet spilled_regs = RegSet::range(tmp1, tmp4);
    // redefinitions
    Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;

    __ push(spilled_regs, sp);
    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
          L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
          L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
          L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
          L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
          L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1; this is safe because length >= 8 here.
    __ ldr(ch1, Address(str1));
    // Read a whole register from str2; this is safe because length >= 8 here.
    __ ldr(ch2, Address(str2));
    __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
    if (str1_isL != str2_isL) {
      __ eor(v0, __ T16B, v0, v0);
    }
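    // SWAR broadcast: multiplying the first character of the pattern by
    // 0x0101010101010101 (0x0001000100010001 for 16-bit characters)
    // replicates it into every byte (halfword) lane of 'first', so a
    // single 64-bit comparison can test eight (four) positions at once.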
    __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
    __ mul(first, first, tmp1);
    // Check whether less than one register's worth of characters remains.
    __ subs(cnt2, cnt2, wordSize / str2_chr_size - 1);
    if (str1_isL != str2_isL) {
      __ fmovd(v1, ch1);
    }