    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    // Make sure we cast to `address` or it ends up calling the wrong `mov`
    // with MSVC, leading to a crash.
    __ mov(c_rarg3, (address) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (address) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

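    // In C, approximately, the mask/bits test above (a sketch; the accessors
    // are the real VM ones, the helper itself is illustrative):

    // bool looks_like_oop(uintptr_t obj) {
    //   // The stub computes (obj & mask) ^ bits and branches on non-zero
    //   // (eor + cbnz) rather than using cmp, because cmp would clobber
    //   // the caller's live flags.
    //   return (obj & (uintptr_t)Universe::verify_oop_mask())
    //          == (uintptr_t)Universe::verify_oop_bits();
    // }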
  // ...
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);
  // ...

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r16;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
  // ...
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }
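
  // In C, approximately, the size-class dispatch at the top of copy_memory
  // (a sketch; the helper and its return codes are illustrative, and
  // `bytes` stands for count * granularity):

  // int size_class(size_t bytes, bool use_simd) {
  //   if (bytes > (use_simd ? 96u : 80u)) return 5; // copy_big: bulk loop
  //   if (bytes <= 16) return 1;                    // copy16
  //   if (bytes > 64)  return 4;                    // copy80
  //   if (bytes <= 32) return 2;                    // copy32
  //   return 3;                                     // 33..64 bytes inline
  // }

  // send/dend are one-past-the-end pointers: the small cases load the head
  // from s and the tail from send - N, so the two loads may overlap, which
  // is safe because all loads happen before any store.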


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= NOT_WIN64(r18) WIN64_ONLY(r17); r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif

  }

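  // The mov/orr pair above builds the 64-bit fill pattern in two steps,
  // approximately (in C):

  //   uint64_t pat = 0xdeadbeef;   // mov: low 32 bits
  //   pat |= pat << 32;            // orr ... LSL 32: 0xdeadbeefdeadbeef
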
  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
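
    // In C, approximately (a sketch; decode_heap_oop stands in for the
    // narrow-oop decode the macro assembler emits):

    // for (size_t i = 0; i < count; i++) {
    //   if (size == sizeof(void*))
    //     verify_oop(((oop*)a)[i]);                        // full-width oop
    //   else
    //     verify_oop(decode_heap_oop(((narrowOop*)a)[i])); // compressed oop
    // }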
  // ...
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
    const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);
    __ push(RegSet::of(r19, r20, r21, r22), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    bool is_oop = true;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
  // ...
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r19, r20, r21, r22), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }
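
  // The failure protocol, approximately (in C).  K is the number of elements
  // already copied when the type check fails; the caller-side decode shown
  // here is illustrative:

  //   return failed ? ~K : 0;        // eon with zr computes ~K == -1^K
  //
  //   // caller:
  //   intptr_t ret = checkcast_copy(...);
  //   if (ret != 0) {
  //     size_t copied = ~ret;        // recover K; finish the rest slowly
  //   }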

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
  // ...
    //  (6) src and dst should be arrays.
    //  (7) src_pos + length must not exceed length of src.
    //  (8) dst_pos + length must not exceed length of dst.
    //

    //  if (src == NULL) return -1;
    __ cbz(src, L_failed);

    //  if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    //  if (dst == NULL) return -1;
    __ cbz(dst, L_failed);

    //  if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

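    // In C, approximately, the checks above (a sketch; tbnz on bit 31
    // tests the sign of the 32-bit value):

    // if (src == NULL || dst == NULL) return -1;
    // if ((int32_t)src_pos < 0) return -1;
    // if ((int32_t)dst_pos < 0) return -1;
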
    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r15; // layout helper

    //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif
  // ...
      Label L;
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    const Register r15_elsize = lh;                 // element size

    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
    __ add(src, src, rscratch1_offset);           // src array offset
    __ add(dst, dst, rscratch1_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
    __ BIND(L_copy_bytes);
    __ tbnz(r15_elsize, 1, L_copy_ints);
    __ tbnz(r15_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));// src_addr
    __ lea(to,   Address(dst, dst_pos));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(byte_copy_entry));

    __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(short_copy_entry));

    __ BIND(L_copy_ints);
    __ tbnz(r15_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(int_copy_entry));

    __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
      __ cmpw(r15_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(long_copy_entry));

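    // The bitwise binary search above, in C, approximately (elsize is
    // log2 of the element size, 0..3):

    // if (elsize & 2) {                // tbnz bit 1
    //   if (elsize & 1) long_copy();   // 3: 8-byte elements
    //   else            int_copy();    // 2: 4-byte elements
    // } else {
    //   if (elsize & 1) short_copy();  // 1: 2-byte elements
    //   else            byte_copy();   // 0: 1-byte elements
    // }
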
    // ObjArrayKlass
    __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    // test array classes for subtyping
    __ load_klass(r15, dst);
    __ cmp(scratch_src_klass, r15); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
    __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

    __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r15, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r15, L_failed);

      __ load_klass(dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(dst_klass, sco_offset));

      // Smashes rscratch1, rscratch2
      generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
  // ...
    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler  = c_rarg0;
    Register s1     = c_rarg0;
    Register s2     = c_rarg3;
    Register buff   = c_rarg1;
    Register len    = c_rarg2;
    Register nmax   = r4;
    Register base   = r5;
    Register count  = r6;
    Register temp0  = rscratch1;
    Register temp1  = rscratch2;
    FloatRegister vbytes = v0;
    FloatRegister vs1acc = v1;
    FloatRegister vs2acc = v2;
    FloatRegister vtable = v3;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    uint64_t BASE = 0xfff1;
    uint64_t NMAX = 0x15B0;

    __ mov(base, BASE);
    __ mov(nmax, NMAX);

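    // In C, approximately, the scalar algorithm that the vector loop below
    // accelerates (a sketch; BASE/NMAX as above):

    // uint32_t adler32(uint32_t adler, const uint8_t *buff, size_t len) {
    //   uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
    //   while (len > 0) {
    //     size_t n = len < 5552 ? len : 5552;   // NMAX: no overflow before mod
    //     len -= n;
    //     while (n--) { s1 += *buff++; s2 += s1; }
    //     s1 %= 65521; s2 %= 65521;             // BASE
    //   }
    //   return (s2 << 16) | s1;
    // }
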
    // Load accumulation coefficients for the upper 16 bits
    __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
    __ ld1(vtable, __ T16B, Address(temp0));

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
    __ uxth(s1, adler);          // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more efficient to skip to the cleanup loop
    __ cmp(len, (u1)16);
    __ br(Assembler::HS, L_nmax);
    __ cbz(len, L_combine);

    __ bind(L_simple_by1_loop);
  // ...
  // r1 = str1
  // r2 = cnt1
  // r3 = str2
  // r4 = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_different_encoding(bool isLU) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLU
        ? "compare_long_string_different_encoding LU"
        : "compare_long_string_different_encoding UL");
    address entry = __ pc();
    Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
        DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
        LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
    FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
    RegSet spilled_regs = RegSet::of(tmp3, tmp4);

    int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);

    __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ add(str1, str1, isLU ? wordSize/2 : wordSize);
    __ add(str2, str2, isLU ? wordSize : wordSize/2);
    __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
    __ eor(rscratch2, tmp1, tmp2);
    __ mov(rscratch1, tmp2);
    __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
    Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
             tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
    __ push(spilled_regs, sp);
    __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
    __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load

    __ ldr(tmp3, Address(__ post(cnt1, 8)));

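  // In C, approximately, what the LU/UL comparison computes (a sketch;
  // zip1 with the zero vector vtmpZ is the vector form of the zero-extend):

  // int compare_LU(const uint8_t *L, const uint16_t *U, size_t n) {
  //   for (size_t i = 0; i < n; i++) {
  //     int diff = (int)L[i] - (int)U[i];  // Latin-1 byte vs UTF-16 char
  //     if (diff != 0) return diff;
  //   }
  //   return 0;  // equal prefix; caller resolves by length difference
  // }
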
  // ...
  // r0 = result
  // r1 = str1
  // r2 = cnt1
  // r3 = str2
  // r4 = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_same_encoding(bool isLL) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLL
        ? "compare_long_string_same_encoding LL"
        : "compare_long_string_same_encoding UU");
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11;
    Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
        LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
        DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
    // exit from large loop when less than 64 bytes left to read or we're about
    // to prefetch memory behind array border
    int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
    // cnt1/cnt2 contain the number of characters to compare. cnt1 can be re-used
    // update cnt2 counter with already loaded 8 bytes
    __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // update pointers, because of previous read
    __ add(str1, str1, wordSize);
    __ add(str2, str2, wordSize);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(str1, SoftwarePrefetchHintDistance));
        __ prfm(Address(str2, SoftwarePrefetchHintDistance));
        compare_string_16_bytes_same(DIFF, DIFF2);
        compare_string_16_bytes_same(DIFF, DIFF2);
        __ sub(cnt2, cnt2, isLL ? 64 : 32);
        compare_string_16_bytes_same(DIFF, DIFF2);
        __ subs(rscratch2, cnt2, largeLoopExitCondition);
        compare_string_16_bytes_same(DIFF, DIFF2);
        __ br(__ GT, LARGE_LOOP_PREFETCH);
        __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
    }
    // less than 16 bytes left?
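
    // The exit test above keeps the prfm instructions safe, approximately
    // (in C; charsize is 1 for LL, 2 for UU):

    //   // Stay in the prefetching loop only while more than
    //   // max(64, SoftwarePrefetchHintDistance) bytes remain, so that
    //   // str + SoftwarePrefetchHintDistance never points past the arrays.
    //   while (cnt2 > MAX2(64, SoftwarePrefetchHintDistance) / charsize) {
    //     /* prefetch both strings, compare 64 bytes, cnt2 -= 64/charsize */
    //   }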
  // ...
    if (generatePrfm) {
      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
    }
    __ zip1(v3, __ T16B, src2, v0);
    __ zip2(v4, __ T16B, src2, v0);
    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
  }

  // R0 = src
  // R1 = dst
  // R2 = len
  // R3 = len >> 3
  // V0 = 0
  // v1 = loaded 8 bytes
  address generate_large_byte_array_inflate() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
    address entry = __ pc();
    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
    Register src = r0, dst = r1, len = r2, octetCounter = r3;
    const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

    // do one more 8-byte read to have address 16-byte aligned in most cases
    // also use single store instruction
    __ ldrd(v2, __ post(src, 8));
    __ sub(octetCounter, octetCounter, 2);
    __ zip1(v1, __ T16B, v1, v0);
    __ zip1(v2, __ T16B, v2, v0);
    __ st1(v1, v2, __ T16B, __ post(dst, 32));
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    __ br(__ LE, LOOP_START);
    __ b(LOOP_PRFM_START);
    __ bind(LOOP_PRFM);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_PRFM_START);
      __ prfm(Address(src, SoftwarePrefetchHintDistance));
      __ sub(octetCounter, octetCounter, 8);
      __ subs(rscratch1, octetCounter, large_loop_threshold);
      inflate_and_store_2_fp_registers(true, v3, v4);
      inflate_and_store_2_fp_registers(true, v5, v6);
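
    // In C, approximately, what each zip1/zip2 pair performs on 16 bytes
    // at a time (a sketch):

    // void inflate(const uint8_t *src, uint16_t *dst, size_t len) {
    //   for (size_t i = 0; i < len; i++)
    //     dst[i] = src[i];  // zero-extend byte to char: interleave with v0 == 0
    // }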
  // ...

  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      Register reg = c_rarg0;
      Pa_base = reg;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = next_reg(reg);
      Pn_base = next_reg(reg);
      Rlen = next_reg(reg);
      inv = next_reg(reg);
      Pm_base = next_reg(reg);

      // Working registers:
      Ra = next_reg(reg);   // The current digit of a, b, n, and m.
      Rb = next_reg(reg);
      Rm = next_reg(reg);
      Rn = next_reg(reg);

      Pa = next_reg(reg);   // Pointers to the current/next digit of a, b, n, and m.
      Pb = next_reg(reg);
      Pm = next_reg(reg);
      Pn = next_reg(reg);

      t0 = next_reg(reg);   // Three registers which form a
      t1 = next_reg(reg);   // triple-precision accumulator.
      t2 = next_reg(reg);

      Ri = next_reg(reg);   // Inner and outer loop indexes.
      Rj = next_reg(reg);

      Rhi_ab = next_reg(reg);  // Product registers: low and high parts
      Rlo_ab = next_reg(reg);  // of a*b and m*n.
      Rhi_mn = next_reg(reg);
      Rlo_mn = next_reg(reg);

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    Register next_reg(Register &reg) {
#ifdef _WIN64
      // skip r18 on Windows, it's used by native TLS
      return ++reg == r18 ? ++reg : reg;
#else
      return ++reg;
#endif
    }

    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
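
      // In C, approximately (a sketch): an odd count enters the unrolled
      // loop at its midpoint, so `block` still runs exactly `count` times.

      //   if (count & 1) goto odd;
      //   if (count == 0) goto end;
      // loop:
      //   block();
      // odd:
      //   block();
      //   count -= 2;
      //   if ((intptr_t)count > 0) goto loop;
      // end: ;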
  // ...
    }
    block_comment("} // i");

    normalize(Rlen);

    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    bind(nothing);
    ret(lr);

    return entry;
  }
  // In C, approximately:

  // void
  // montgomery_multiply(uint64_t Pa_base[], uint64_t Pb_base[],
  //                     uint64_t Pn_base[], uint64_t Pm_base[],
  //                     uint64_t inv, int len) {
  //   uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  //   uint64_t *Pa, *Pb, *Pn, *Pm;
  //   uint64_t Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pb_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = i;
  //     for (j = 0; iters--; j++) {
  // ...
      bind(end);
      block_comment("} // i");
    }

    normalize(Rlen);

    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    ret(lr);

    return entry;
  }
  // In C, approximately:

  // void
  // montgomery_square(uint64_t Pa_base[], uint64_t Pn_base[],
  //                   uint64_t Pm_base[], uint64_t inv, int len) {
  //   uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  //   uint64_t *Pa, *Pb, *Pn, *Pm;
  //   uint64_t Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pa_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = (i+1)/2;
  //     for (j = 0; iters--; j++) {