    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words(). This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it. The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  //   r10: the HeapWord-aligned base address of an array to zero.
  //   r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  //   r10: the base address of the tail of words left to clear.
  //   r11: the number of words in the tail.
  //        r11 < MacroAssembler::zero_words_block_size.

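  // A rough sketch of the contract in C-like pseudocode (illustrative
  // only; this is not the generated code):
  //
  //   // on entry: base = r10, cnt = r11, cnt > 0
  //   while (cnt >= zero_words_block_size) {
  //     zero one block at base, using DC ZVA where profitable;
  //     base += block; cnt -= block;
  //   }
  //   // on exit: r10/r11 describe the tail still to be cleared,
  //   //          r11 < zero_words_block_size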
  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure the ZVA length is a multiple of 16; the subsequent
      // operations rely on this.
      assert(zva_length % 16 == 0, "Unexpected ZVA Length");

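      // Bit 3 of the address is clear iff the (already 8-byte-aligned)
      // base is 16-byte aligned. If it is not, zero one word by hand
      // and adjust the count before entering the block loop.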
      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // ... (intervening code elided) ...

      __ add(d, d, 8);
    }

  }

    __ ret(lr);
  }
}
  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

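  // Equivalent C sketch for byte granularity (illustrative only):
  //
  //   if (count & 8) { copy 8 bytes; }
  //   if (count & 4) { copy 4 bytes; }
  //   if (count & 2) { copy 2 bytes; }
  //   if (count & 1) { copy 1 byte;  }
  //
  // For wider elements the same chunks are selected by correspondingly
  // lower bits of count, which is expressed in element units.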
  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.

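    // count is in units of 'granularity' bytes, so bit
    // (3 - log2(granularity)) is set iff an 8-byte chunk remains, bit
    // (2 - log2(granularity)) iff a 4-byte chunk remains, and so on.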
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      // ... (intervening code elided) ...
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d. The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy. If is_aligned is false, we align the source address.
  //

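  // For example, a forward byte copy passes step == 1 and a backward,
  // overlap-safe byte copy passes step == -1; element copies pass the
  // element size, with the sign giving the direction.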
  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // Copies of <= 96 bytes are done inline. Direction doesn't matter,
    // because we always load all the data before writing any of it.
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
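    // Size-class dispatch (counts are scaled by the unit size): more
    // than 96 bytes (80 without SIMD) goes to the out-of-line bulk
    // loop; otherwise <= 16 bytes branches to copy16, more than 64 to
    // copy80, <= 32 to copy32, and the remaining 33..64-byte cases
    // fall through to the code below (elided here).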
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16 / granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64 / granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32 / granularity));
    __ br(Assembler::LS, copy32);

    // ... (intervening code elided) ...

  //
  // Generate generic array copy stubs
  //
  // Input:
  //   c_rarg0 - src oop
  //   c_rarg1 - src_pos (32-bits)
  //   c_rarg2 - dst oop
  //   c_rarg3 - dst_pos (32-bits)
  //   c_rarg4 - element count (32-bits)
  //
  // Output:
  //   r0 ==  0  - success
  //   r0 == -1^K - failure, where K is the partial transfer count
  //                (i.e. r0 is the bitwise complement of K)
  //
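  // A caller would decode the result roughly like this (a sketch, not
  // code from this file):
  //
  //   intptr_t r = call_generic_copy_stub(...);
  //   if (r == 0) { /* all elements copied */ }
  //   else        { size_t copied = ~(size_t)r; /* partial transfer */ }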
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src     = c_rarg0;  // source array oop
    const Register src_pos = c_rarg1;  // source position
    const Register dst     = c_rarg2;  // destination array oop
    const Register dst_pos = c_rarg3;  // destination position
    const Register length  = c_rarg4;

    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------

    // ... (intervening code elided) ...

                         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
                         : "indexof_linear_uu";
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stubName);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ? 0 : 1;
    bool isL = str1_isL && str2_isL;
    // parameters
    Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
    // temporary registers
    Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
    RegSet spilled_regs = RegSet::range(tmp1, tmp4);
    // redefinitions
    Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;

    __ push(spilled_regs, sp);
    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
          L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
          L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
          L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
          L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
          L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1; this is safe because length >= 8 here.
    __ ldr(ch1, Address(str1));
    // Read a whole register from str2; this is safe because length >= 8 here.
    __ ldr(ch2, Address(str2));
    __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
    if (str1_isL != str2_isL) {
      __ eor(v0, __ T16B, v0, v0);
    }
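    // SWAR broadcast: multiplying the first character of the pattern by
    // 0x0101010101010101 (0x0001000100010001 for 16-bit characters)
    // replicates it into every byte (halfword) lane of 'first', so a
    // single 64-bit comparison can test eight (four) positions at once.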
    __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
    __ mul(first, first, tmp1);
    // Check whether less than one register's worth of characters remains.
    __ subs(cnt2, cnt2, wordSize / str2_chr_size - 1);
    if (str1_isL != str2_isL) {
      __ fmovd(v1, ch1);
    }