
src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

8248238: Adding Windows support to OpenJDK on AArch64

Summary: Adding Windows support for AArch64 (LP64 vs LLP64 changes)

Contributed-by: Monica Beckwith <monica.beckwith@microsoft.com>, Ludovic Henry <luhenry@microsoft.com>
Reviewed-by:


 548     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 549     __ ldr(c_rarg3, Address(c_rarg2));
 550     __ add(c_rarg3, c_rarg3, 1);
 551     __ str(c_rarg3, Address(c_rarg2));
 552 
 553     // object is in r0
 554     // make sure object is 'reasonable'
 555     __ cbz(r0, exit); // if obj is NULL it is OK
 556 
 557 #if INCLUDE_ZGC
 558     if (UseZGC) {
 559       // Check if mask is good.
 560       // verifies that ZAddressBadMask & r0 == 0
 561       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 562       __ andr(c_rarg2, r0, c_rarg3);
 563       __ cbnz(c_rarg2, error);
 564     }
 565 #endif
 566 
 567     // Check if the oop is in the right area of memory
 568     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());


 569     __ andr(c_rarg2, r0, c_rarg3);
 570     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 571 
 572     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 573     // instruction here because the flags register is live.
 574     __ eor(c_rarg2, c_rarg2, c_rarg3);
 575     __ cbnz(c_rarg2, error);
 576 
 577     // make sure klass is 'reasonable', which is not zero.
 578     __ load_klass(r0, r0);  // get klass
 579     __ cbz(r0, error);      // if klass is NULL it is broken
 580 
 581     // return if everything seems ok
 582     __ bind(exit);
 583 
 584     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 585     __ ret(lr);
 586 
 587     // handle errors
 588     __ bind(error);
 589     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 590 


 680   } copy_direction;
 681 
 682   // Bulk copy of blocks of 8 words.
 683   //
 684   // count is a count of words.
 685   //
 686   // Precondition: count >= 8
 687   //
 688   // Postconditions:
 689   //
 690   // The least significant bit of count contains the remaining count
 691   // of words to copy.  The rest of count is trash.
 692   //
 693   // s and d are adjusted to point to the remaining words to copy
 694   //
 695   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 696                            copy_direction direction) {
 697     int unit = wordSize * direction;
 698     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 699 
 700     int offset;
 701     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 702       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 703     const Register stride = r13;
 704 
 705     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 706     assert_different_registers(s, d, count, rscratch1);
 707 
 708     Label again, drain;
 709     const char *stub_name;
 710     if (direction == copy_forwards)
 711       stub_name = "forward_copy_longs";
 712     else
 713       stub_name = "backward_copy_longs";
 714 
 715     __ align(CodeEntryAlignment);
 716 
 717     StubCodeMark mark(this, "StubRoutines", stub_name);
 718 
 719     __ bind(start);
 720 


1071 
1072   // All-singing all-dancing memory copy.
1073   //
1074   // Copy count units of memory from s to d.  The size of a unit is
1075   // step, which can be positive or negative depending on the direction
1076   // of copy.  If is_aligned is false, we align the source address.
1077   //
1078 
1079   void copy_memory(bool is_aligned, Register s, Register d,
1080                    Register count, Register tmp, int step) {
1081     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1082     bool is_backwards = step < 0;
1083     int granularity = uabs(step);
1084     const Register t0 = r3, t1 = r4;
1085 
1086     // <= 96 bytes do inline. Direction doesn't matter because we always
1087     // load all the data before writing anything
1088     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1089     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1090     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1091     const Register send = r17, dend = r18;
1092 
1093     if (PrefetchCopyIntervalInBytes > 0)
1094       __ prfm(Address(s, 0), PLDL1KEEP);
1095     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1096     __ br(Assembler::HI, copy_big);
1097 
1098     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1099     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1100 
1101     __ cmp(count, u1(16/granularity));
1102     __ br(Assembler::LS, copy16);
1103 
1104     __ cmp(count, u1(64/granularity));
1105     __ br(Assembler::HI, copy80);
1106 
1107     __ cmp(count, u1(32/granularity));
1108     __ br(Assembler::LS, copy32);
1109 
1110     // 33..64 bytes
1111     if (UseSIMDForMemoryOps) {


1263     // count and do a bulk copy of words.
1264     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1265     if (direction == copy_forwards)
1266       __ bl(copy_f);
1267     else
1268       __ bl(copy_b);
1269 
1270     // And the tail.
1271     copy_memory_small(s, d, count, tmp, step);
1272 
1273     if (granularity >= 8) __ bind(copy8);
1274     if (granularity >= 4) __ bind(copy4);
1275     __ bind(finish);
1276   }
1277 
1278 
1279   void clobber_registers() {
1280 #ifdef ASSERT
1281     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1282     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1283     for (Register r = r3; r <= r18; r++)
1284       if (r != rscratch1) __ mov(r, rscratch1);
1285 #endif

1286   }
1287 
1288   // Scan over array at a for count oops, verifying each one.
1289   // Preserves a and count, clobbers rscratch1 and rscratch2.
1290   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1291     Label loop, end;
1292     __ mov(rscratch1, a);
1293     __ mov(rscratch2, zr);
1294     __ bind(loop);
1295     __ cmp(rscratch2, count);
1296     __ br(Assembler::HS, end);
1297     if (size == (size_t)wordSize) {
1298       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1299       __ verify_oop(temp);
1300     } else {
1301       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1302       __ decode_heap_oop(temp); // calls verify_oop
1303     }
1304     __ add(rscratch2, rscratch2, size);
1305     __ b(loop);


1698   //
1699   //  Output:
1700   //    r0 ==  0  -  success
1701   //    r0 == -1^K - failure, where K is partial transfer count
1702   //
1703   address generate_checkcast_copy(const char *name, address *entry,
1704                                   bool dest_uninitialized = false) {
1705 
1706     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1707 
1708     // Input registers (after setup_arg_regs)
1709     const Register from        = c_rarg0;   // source array address
1710     const Register to          = c_rarg1;   // destination array address
 1711     const Register count       = c_rarg2;   // elements count
1712     const Register ckoff       = c_rarg3;   // super_check_offset
1713     const Register ckval       = c_rarg4;   // super_klass
1714 
1715     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1716     RegSet wb_post_saved_regs = RegSet::of(count);
1717 
1718     // Registers used as temps (r18, r19, r20 are save-on-entry)

 1719     const Register count_save  = r21;       // orig elements count
1720     const Register start_to    = r20;       // destination array start address
1721     const Register copied_oop  = r18;       // actual oop copied
1722     const Register r19_klass   = r19;       // oop._klass
1723 
1724     //---------------------------------------------------------------
1725     // Assembler stub will be used for this call to arraycopy
1726     // if the two arrays are subtypes of Object[] but the
1727     // destination array type is not equal to or a supertype
1728     // of the source type.  Each element must be separately
1729     // checked.
1730 
1731     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1732                                copied_oop, r19_klass, count_save);
1733 
1734     __ align(CodeEntryAlignment);
1735     StubCodeMark mark(this, "StubRoutines", name);
1736     address start = __ pc();
1737 
1738     __ enter(); // required for proper stackwalking of RuntimeStub frame
1739 
1740 #ifdef ASSERT
1741     // caller guarantees that the arrays really are different
1742     // otherwise, we would have to make conjoint checks
1743     { Label L;
1744       array_overlap_test(L, TIMES_OOP);
1745       __ stop("checkcast_copy within a single array");
1746       __ bind(L);
1747     }
1748 #endif //ASSERT
1749 
1750     // Caller of this entry point must set up the argument registers.
1751     if (entry != NULL) {
1752       *entry = __ pc();
1753       BLOCK_COMMENT("Entry:");
1754     }
1755 
1756      // Empty array:  Nothing to do.
1757     __ cbz(count, L_done);
1758 
1759     __ push(RegSet::of(r18, r19, r20, r21), sp);
1760 
1761 #ifdef ASSERT
1762     BLOCK_COMMENT("assert consistent ckoff/ckval");
1763     // The ckoff and ckval must be mutually consistent,
1764     // even though caller generates both.
1765     { Label L;
1766       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1767       __ ldrw(start_to, Address(ckval, sco_offset));
1768       __ cmpw(ckoff, start_to);
1769       __ br(Assembler::EQ, L);
1770       __ stop("super_check_offset inconsistent");
1771       __ bind(L);
1772     }
1773 #endif //ASSERT
1774 
1775     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1776     bool is_oop = true;
1777     if (dest_uninitialized) {
1778       decorators |= IS_DEST_UNINITIALIZED;
1779     }


1808     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1809     __ cbz(copied_oop, L_store_element);
1810 
1811     __ load_klass(r19_klass, copied_oop);// query the object klass
1812     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1813     // ======== end loop ========
1814 
1815     // It was a real error; we must depend on the caller to finish the job.
1816     // Register count = remaining oops, count_orig = total oops.
1817     // Emit GC store barriers for the oops we have copied and report
1818     // their number to the caller.
1819 
1820     __ subs(count, count_save, count);     // K = partially copied oop count
1821     __ eon(count, count, zr);                   // report (-1^K) to caller
1822     __ br(Assembler::EQ, L_done_pop);
1823 
1824     __ BIND(L_do_card_marks);
1825     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1826 
1827     __ bind(L_done_pop);
1828     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1829     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1830 
1831     __ bind(L_done);
1832     __ mov(r0, count);
1833     __ leave();
1834     __ ret(lr);
1835 
1836     return start;
1837   }
1838 
1839   // Perform range checks on the proposed arraycopy.
1840   // Kills temp, but nothing else.
1841   // Also, clean the sign bits of src_pos and dst_pos.
1842   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1843                               Register src_pos, // source position (c_rarg1)
 1844                               Register dst,     // destination array oop (c_rarg2)
1845                               Register dst_pos, // destination position (c_rarg3)
1846                               Register length,
1847                               Register temp,
1848                               Label& L_failed) {


1985     // (6) src and dst should be arrays.
1986     // (7) src_pos + length must not exceed length of src.
1987     // (8) dst_pos + length must not exceed length of dst.
1988     //
1989 
1990     //  if (src == NULL) return -1;
1991     __ cbz(src, L_failed);
1992 
1993     //  if (src_pos < 0) return -1;
1994     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1995 
1996     //  if (dst == NULL) return -1;
1997     __ cbz(dst, L_failed);
1998 
1999     //  if (dst_pos < 0) return -1;
2000     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2001 
2002     // registers used as temp
2003     const Register scratch_length    = r16; // elements count to copy
2004     const Register scratch_src_klass = r17; // array klass
2005     const Register lh                = r18; // layout helper
2006 
2007     //  if (length < 0) return -1;
2008     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2009     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2010 
2011     __ load_klass(scratch_src_klass, src);
2012 #ifdef ASSERT
2013     //  assert(src->klass() != NULL);
2014     {
2015       BLOCK_COMMENT("assert klasses not null {");
2016       Label L1, L2;
2017       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2018       __ bind(L1);
2019       __ stop("broken null klass");
2020       __ bind(L2);
2021       __ load_klass(rscratch1, dst);
2022       __ cbz(rscratch1, L1);     // this would be broken also
2023       BLOCK_COMMENT("} assert klasses not null done");
2024     }
2025 #endif


2056       Label L;
2057       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2058       __ cmpw(lh, rscratch2);
2059       __ br(Assembler::GE, L);
2060       __ stop("must be a primitive array");
2061       __ bind(L);
2062       BLOCK_COMMENT("} assert primitive array done");
2063     }
2064 #endif
2065 
2066     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2067                            rscratch2, L_failed);
2068 
2069     // TypeArrayKlass
2070     //
2071     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2072     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2073     //
2074 
2075     const Register rscratch1_offset = rscratch1;    // array offset
2076     const Register r18_elsize = lh; // element size
2077 
2078     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2079            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2080     __ add(src, src, rscratch1_offset);           // src array offset
2081     __ add(dst, dst, rscratch1_offset);           // dst array offset
2082     BLOCK_COMMENT("choose copy loop based on element size");
2083 
2084     // next registers should be set before the jump to corresponding stub
2085     const Register from     = c_rarg0;  // source array address
2086     const Register to       = c_rarg1;  // destination array address
2087     const Register count    = c_rarg2;  // elements count
2088 
2089     // 'from', 'to', 'count' registers should be set in such order
2090     // since they are the same as 'src', 'src_pos', 'dst'.
2091 
2092     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2093 
2094     // The possible values of elsize are 0-3, i.e. exact_log2(element
2095     // size in bytes).  We do a simple bitwise binary search.
2096   __ BIND(L_copy_bytes);
2097     __ tbnz(r18_elsize, 1, L_copy_ints);
2098     __ tbnz(r18_elsize, 0, L_copy_shorts);
2099     __ lea(from, Address(src, src_pos));// src_addr
2100     __ lea(to,   Address(dst, dst_pos));// dst_addr
2101     __ movw(count, scratch_length); // length
2102     __ b(RuntimeAddress(byte_copy_entry));
2103 
2104   __ BIND(L_copy_shorts);
2105     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2106     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2107     __ movw(count, scratch_length); // length
2108     __ b(RuntimeAddress(short_copy_entry));
2109 
2110   __ BIND(L_copy_ints);
2111     __ tbnz(r18_elsize, 0, L_copy_longs);
2112     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2113     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2114     __ movw(count, scratch_length); // length
2115     __ b(RuntimeAddress(int_copy_entry));
2116 
2117   __ BIND(L_copy_longs);
2118 #ifdef ASSERT
2119     {
2120       BLOCK_COMMENT("assert long copy {");
2121       Label L;
2122       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2123       __ cmpw(r18_elsize, LogBytesPerLong);
2124       __ br(Assembler::EQ, L);
2125       __ stop("must be long copy, but elsize is wrong");
2126       __ bind(L);
2127       BLOCK_COMMENT("} assert long copy done");
2128     }
2129 #endif
2130     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2131     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2132     __ movw(count, scratch_length); // length
2133     __ b(RuntimeAddress(long_copy_entry));
2134 
2135     // ObjArrayKlass
2136   __ BIND(L_objArray);
2137     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2138 
2139     Label L_plain_copy, L_checkcast_copy;
2140     //  test array classes for subtyping
2141     __ load_klass(r18, dst);
2142     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2143     __ br(Assembler::NE, L_checkcast_copy);
2144 
2145     // Identically typed arrays can be copied without element-wise checks.
2146     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2147                            rscratch2, L_failed);
2148 
2149     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2150     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2151     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2152     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2153     __ movw(count, scratch_length); // length
2154   __ BIND(L_plain_copy);
2155     __ b(RuntimeAddress(oop_copy_entry));
2156 
2157   __ BIND(L_checkcast_copy);
2158     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2159     {
2160       // Before looking at dst.length, make sure dst is also an objArray.
2161       __ ldrw(rscratch1, Address(r18, lh_offset));
2162       __ movw(rscratch2, objArray_lh);
2163       __ eorw(rscratch1, rscratch1, rscratch2);
2164       __ cbnzw(rscratch1, L_failed);
2165 
2166       // It is safe to examine both src.length and dst.length.
2167       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2168                              r18, L_failed);
2169 
2170       __ load_klass(dst_klass, dst); // reload
2171 
2172       // Marshal the base address arguments now, freeing registers.
2173       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2174       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2175       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2176       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2177       __ movw(count, length);           // length (reloaded)
2178       Register sco_temp = c_rarg3;      // this register is free now
2179       assert_different_registers(from, to, count, sco_temp,
2180                                  dst_klass, scratch_src_klass);
2181       // assert_clean_int(count, sco_temp);
2182 
2183       // Generate the type check.
2184       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2185       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2186 
2187       // Smashes rscratch1, rscratch2
2188       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);


3266     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3267 
3268     // Aliases
3269     Register adler  = c_rarg0;
3270     Register s1     = c_rarg0;
3271     Register s2     = c_rarg3;
3272     Register buff   = c_rarg1;
3273     Register len    = c_rarg2;
3274     Register nmax  = r4;
3275     Register base  = r5;
3276     Register count = r6;
3277     Register temp0 = rscratch1;
3278     Register temp1 = rscratch2;
3279     FloatRegister vbytes = v0;
3280     FloatRegister vs1acc = v1;
3281     FloatRegister vs2acc = v2;
3282     FloatRegister vtable = v3;
3283 
3284     // Max number of bytes we can process before having to take the mod
3285     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3286     unsigned long BASE = 0xfff1;
3287     unsigned long NMAX = 0x15B0;
3288 
3289     __ mov(base, BASE);
3290     __ mov(nmax, NMAX);
3291 
3292     // Load accumulation coefficients for the upper 16 bits
3293     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3294     __ ld1(vtable, __ T16B, Address(temp0));
3295 
3296     // s1 is initialized to the lower 16 bits of adler
3297     // s2 is initialized to the upper 16 bits of adler
3298     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3299     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3300 
3301     // The pipelined loop needs at least 16 elements for 1 iteration
3302     // It does check this, but it is more effective to skip to the cleanup loop
3303     __ cmp(len, (u1)16);
3304     __ br(Assembler::HS, L_nmax);
3305     __ cbz(len, L_combine);
3306 
3307     __ bind(L_simple_by1_loop);


4044   // r1  = str1
4045   // r2  = cnt1
4046   // r3  = str2
4047   // r4  = cnt2
4048   // r10 = tmp1
4049   // r11 = tmp2
4050   address generate_compare_long_string_different_encoding(bool isLU) {
4051     __ align(CodeEntryAlignment);
4052     StubCodeMark mark(this, "StubRoutines", isLU
4053         ? "compare_long_string_different_encoding LU"
4054         : "compare_long_string_different_encoding UL");
4055     address entry = __ pc();
4056     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4057         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4058         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4059     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4060         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4061     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4062     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4063 
4064     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4065 
4066     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4067     // cnt2 == amount of characters left to compare
 4068     // Check already loaded first 4 symbols (vtmp and tmp2(LU)/tmp1(UL))
4069     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4070     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4071     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4072     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4073     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4074     __ eor(rscratch2, tmp1, tmp2);
4075     __ mov(rscratch1, tmp2);
4076     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4077     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4078              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4079     __ push(spilled_regs, sp);
4080     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4081     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4082 
4083     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4084 


4202   // r0  = result
4203   // r1  = str1
4204   // r2  = cnt1
4205   // r3  = str2
4206   // r4  = cnt2
4207   // r10 = tmp1
4208   // r11 = tmp2
4209   address generate_compare_long_string_same_encoding(bool isLL) {
4210     __ align(CodeEntryAlignment);
4211     StubCodeMark mark(this, "StubRoutines", isLL
4212         ? "compare_long_string_same_encoding LL"
4213         : "compare_long_string_same_encoding UU");
4214     address entry = __ pc();
4215     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4216         tmp1 = r10, tmp2 = r11;
4217     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4218         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4219         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4220     // exit from large loop when less than 64 bytes left to read or we're about
4221     // to prefetch memory behind array border
4222     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4223     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4224     // update cnt2 counter with already loaded 8 bytes
4225     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4226     // update pointers, because of previous read
4227     __ add(str1, str1, wordSize);
4228     __ add(str2, str2, wordSize);
4229     if (SoftwarePrefetchHintDistance >= 0) {
4230       __ bind(LARGE_LOOP_PREFETCH);
4231         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4232         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4233         compare_string_16_bytes_same(DIFF, DIFF2);
4234         compare_string_16_bytes_same(DIFF, DIFF2);
4235         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4236         compare_string_16_bytes_same(DIFF, DIFF2);
4237         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4238         compare_string_16_bytes_same(DIFF, DIFF2);
4239         __ br(__ GT, LARGE_LOOP_PREFETCH);
4240         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4241     }
4242     // less than 16 bytes left?


4628     if (generatePrfm) {
4629       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4630     }
4631     __ zip1(v3, __ T16B, src2, v0);
4632     __ zip2(v4, __ T16B, src2, v0);
4633     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4634   }
4635 
4636   // R0 = src
4637   // R1 = dst
4638   // R2 = len
4639   // R3 = len >> 3
4640   // V0 = 0
4641   // v1 = loaded 8 bytes
4642   address generate_large_byte_array_inflate() {
4643     __ align(CodeEntryAlignment);
4644     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4645     address entry = __ pc();
4646     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4647     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4648     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4649 
4650     // do one more 8-byte read to have address 16-byte aligned in most cases
4651     // also use single store instruction
4652     __ ldrd(v2, __ post(src, 8));
4653     __ sub(octetCounter, octetCounter, 2);
4654     __ zip1(v1, __ T16B, v1, v0);
4655     __ zip1(v2, __ T16B, v2, v0);
4656     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4657     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4658     __ subs(rscratch1, octetCounter, large_loop_threshold);
4659     __ br(__ LE, LOOP_START);
4660     __ b(LOOP_PRFM_START);
4661     __ bind(LOOP_PRFM);
4662       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4663     __ bind(LOOP_PRFM_START);
4664       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4665       __ sub(octetCounter, octetCounter, 8);
4666       __ subs(rscratch1, octetCounter, large_loop_threshold);
4667       inflate_and_store_2_fp_registers(true, v3, v4);
4668       inflate_and_store_2_fp_registers(true, v5, v6);


4878 
4879   class MontgomeryMultiplyGenerator : public MacroAssembler {
4880 
4881     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4882       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4883 
4884     RegSet _toSave;
4885     bool _squaring;
4886 
4887   public:
4888     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4889       : MacroAssembler(as->code()), _squaring(squaring) {
4890 
4891       // Register allocation
4892 
4893       Register reg = c_rarg0;
4894       Pa_base = reg;       // Argument registers
4895       if (squaring)
4896         Pb_base = Pa_base;
4897       else
4898         Pb_base = ++reg;
4899       Pn_base = ++reg;
4900       Rlen= ++reg;
4901       inv = ++reg;
4902       Pm_base = ++reg;
4903 
4904                           // Working registers:
4905       Ra =  ++reg;        // The current digit of a, b, n, and m.
4906       Rb =  ++reg;
4907       Rm =  ++reg;
4908       Rn =  ++reg;
4909 
4910       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4911       Pb =  ++reg;
4912       Pm =  ++reg;
4913       Pn =  ++reg;
4914 
4915       t0 =  ++reg;        // Three registers which form a
 4916       t1 =  ++reg;        // triple-precision accumulator.
4917       t2 =  ++reg;
4918 
4919       Ri =  ++reg;        // Inner and outer loop indexes.
4920       Rj =  ++reg;
4921 
4922       Rhi_ab = ++reg;     // Product registers: low and high parts
4923       Rlo_ab = ++reg;     // of a*b and m*n.
4924       Rhi_mn = ++reg;
4925       Rlo_mn = ++reg;
4926 
4927       // r19 and up are callee-saved.
4928       _toSave = RegSet::range(r19, reg) + Pm_base;
4929     }
4930 
 4931   private:


4932     void save_regs() {
4933       push(_toSave, sp);
4934     }
4935 
4936     void restore_regs() {
4937       pop(_toSave, sp);
4938     }
4939 
4940     template <typename T>
4941     void unroll_2(Register count, T block) {
4942       Label loop, end, odd;
4943       tbnz(count, 0, odd);
4944       cbz(count, end);
4945       align(16);
4946       bind(loop);
4947       (this->*block)();
4948       bind(odd);
4949       (this->*block)();
4950       subs(count, count, 2);
4951       br(Assembler::GT, loop);
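
unroll_2 emits the block twice per iteration and, for an odd count, enters the loop at the second copy. Behaviourally it matches this C++ model (my sketch of the emitted control flow, not HotSpot code):

    template <typename F>
    void unroll_2_model(int64_t count, F block) {
      if (count & 1) { block(); count--; }                 // odd: one leading copy
      while (count > 0) { block(); block(); count -= 2; }  // then whole pairs
    }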


5364       }
5365       block_comment("} // i");
5366 
5367       normalize(Rlen);
5368 
5369       mov(Ra, Pm_base);  // Save Pm_base in Ra
5370       restore_regs();  // Restore caller's Pm_base
5371 
5372       // Copy our result into caller's Pm_base
5373       reverse(Pm_base, Ra, Rlen, t0, t1);
5374 
5375       leave();
5376       bind(nothing);
5377       ret(lr);
5378 
5379       return entry;
5380     }
5381     // In C, approximately:
5382 
5383     // void
5384     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5385     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5386     //                     unsigned long inv, int len) {
5387     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5388     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5389     //   unsigned long Ra, Rb, Rn, Rm;
5390 
5391     //   int i;
5392 
5393     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5394 
5395     //   for (i = 0; i < len; i++) {
5396     //     int j;
5397 
5398     //     Pa = Pa_base;
5399     //     Pb = Pb_base + i;
5400     //     Pm = Pm_base;
5401     //     Pn = Pn_base + i;
5402 
5403     //     Ra = *Pa;
5404     //     Rb = *Pb;
5405     //     Rm = *Pm;
5406     //     Rn = *Pn;
5407 
5408     //     int iters = i;
5409     //     for (j = 0; iters--; j++) {


5577         bind(end);
5578         block_comment("} // i");
5579       }
5580 
5581       normalize(Rlen);
5582 
5583       mov(Ra, Pm_base);  // Save Pm_base in Ra
5584       restore_regs();  // Restore caller's Pm_base
5585 
5586       // Copy our result into caller's Pm_base
5587       reverse(Pm_base, Ra, Rlen, t0, t1);
5588 
5589       leave();
5590       ret(lr);
5591 
5592       return entry;
5593     }
5594     // In C, approximately:
5595 
5596     // void
5597     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5598     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5599     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5600     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5601     //   unsigned long Ra, Rb, Rn, Rm;
5602 
5603     //   int i;
5604 
5605     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5606 
5607     //   for (i = 0; i < len; i++) {
5608     //     int j;
5609 
5610     //     Pa = Pa_base;
5611     //     Pb = Pa_base + i;
5612     //     Pm = Pm_base;
5613     //     Pn = Pn_base + i;
5614 
5615     //     Ra = *Pa;
5616     //     Rb = *Pb;
5617     //     Rm = *Pm;
5618     //     Rn = *Pn;
5619 
5620     //     int iters = (i+1)/2;
5621     //     for (j = 0; iters--; j++) {
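
One portability note on the C models here: they are written with unsigned long, which is 64 bits on LP64 targets but only 32 bits on LLP64 Windows, so every unsigned long should be read as a 64-bit value (julong/uint64_t). The core multiply-accumulate step, restated with fixed-width types (my sketch, using GCC's unsigned __int128, which MSVC itself lacks):

    #include <stdint.h>

    // Multiply-accumulate a*b into the triple-precision accumulator t2:t1:t0.
    static inline void macc(uint64_t a, uint64_t b,
                            uint64_t* t0, uint64_t* t1, uint64_t* t2) {
      unsigned __int128 p = (unsigned __int128)a * b;
      unsigned __int128 s = (unsigned __int128)*t0 + (uint64_t)p;  // add low word
      *t0 = (uint64_t)s;
      s = (s >> 64) + *t1 + (uint64_t)(p >> 64);                   // carry + high word
      *t1 = (uint64_t)s;
      *t2 += (uint64_t)(s >> 64);                                  // final carry
    }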




 548     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 549     __ ldr(c_rarg3, Address(c_rarg2));
 550     __ add(c_rarg3, c_rarg3, 1);
 551     __ str(c_rarg3, Address(c_rarg2));
 552 
 553     // object is in r0
 554     // make sure object is 'reasonable'
 555     __ cbz(r0, exit); // if obj is NULL it is OK
 556 
 557 #if INCLUDE_ZGC
 558     if (UseZGC) {
 559       // Check if mask is good.
 560       // verifies that ZAddressBadMask & r0 == 0
 561       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 562       __ andr(c_rarg2, r0, c_rarg3);
 563       __ cbnz(c_rarg2, error);
 564     }
 565 #endif
 566 
 567     // Check if the oop is in the right area of memory
 568     // Make sure we cast to `address` or it ends up calling the wrong `mov`
 569     // with MSVC, leading to a crash.
 570     __ mov(c_rarg3, (address) Universe::verify_oop_mask());
 571     __ andr(c_rarg2, r0, c_rarg3);
 572     __ mov(c_rarg3, (address) Universe::verify_oop_bits());
 573 
 574     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 575     // instruction here because the flags register is live.
 576     __ eor(c_rarg2, c_rarg2, c_rarg3);
 577     __ cbnz(c_rarg2, error);
 578 
 579     // make sure klass is 'reasonable', which is not zero.
 580     __ load_klass(r0, r0);  // get klass
 581     __ cbz(r0, error);      // if klass is NULL it is broken
 582 
 583     // return if everything seems ok
 584     __ bind(exit);
 585 
 586     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 587     __ ret(lr);
 588 
 589     // handle errors
 590     __ bind(error);
 591     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 592 
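
The cast to address at new lines 570/572 matters because MacroAssembler::mov is overloaded, and under MSVC's LLP64 model (where long is 32 bits) an integral argument can silently select a narrower overload. A minimal standalone sketch of the failure mode, with a hypothetical two-overload set rather than HotSpot's actual one:

    #include <cstdint>
    #include <cstdio>

    typedef unsigned char* address;

    // Two overloads standing in for MacroAssembler::mov(Register, ...).
    void mov(long imm)    { std::printf("mov(long):    0x%lx\n", (unsigned long)imm); }
    void mov(address ptr) { std::printf("mov(address): %p\n", (void*)ptr); }

    int main() {
      uintptr_t mask = 0xfffffffffffffff8ULL;
      mov((intptr_t)mask); // resolves to mov(long); truncates where long is 32-bit
      mov((address)mask);  // the pointer overload keeps all 64 bits
    }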


 682   } copy_direction;
 683 
 684   // Bulk copy of blocks of 8 words.
 685   //
 686   // count is a count of words.
 687   //
 688   // Precondition: count >= 8
 689   //
 690   // Postconditions:
 691   //
 692   // The least significant bit of count contains the remaining count
 693   // of words to copy.  The rest of count is trash.
 694   //
 695   // s and d are adjusted to point to the remaining words to copy
 696   //
 697   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 698                            copy_direction direction) {
 699     int unit = wordSize * direction;
 700     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 701 

 702     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 703       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 704     const Register stride = r13;
 705 
 706     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 707     assert_different_registers(s, d, count, rscratch1);
 708 
 709     Label again, drain;
 710     const char *stub_name;
 711     if (direction == copy_forwards)
 712       stub_name = "forward_copy_longs";
 713     else
 714       stub_name = "backward_copy_longs";
 715 
 716     __ align(CodeEntryAlignment);
 717 
 718     StubCodeMark mark(this, "StubRoutines", stub_name);
 719 
 720     __ bind(start);
 721 
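
A scalar restatement (my sketch, not the generated stub) of the contract documented above: blocks of 8 words are copied while at least 8 remain, a drain phase handles smaller runs, and on exit only the least significant bit of count still means anything:

    // direction is +1 (copy_forwards) or -1 (copy_backwards); s and d already
    // point at the first (or last) word, matching the stub's bias setup.
    static void copy_longs_model(int64_t*& s, int64_t*& d, int64_t& count, int direction) {
      while (count >= 8) {   // bulk: blocks of 8 words
        for (int i = 0; i < 8; i++) { *d = *s; s += direction; d += direction; }
        count -= 8;
      }
      while (count >= 2) {   // drain: pairs of words
        for (int i = 0; i < 2; i++) { *d = *s; s += direction; d += direction; }
        count -= 2;
      }
      // Postcondition: count & 1 is the number of words left (0 or 1);
      // the higher bits of count are trash as far as the caller is concerned.
    }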


1072 
1073   // All-singing all-dancing memory copy.
1074   //
1075   // Copy count units of memory from s to d.  The size of a unit is
1076   // step, which can be positive or negative depending on the direction
1077   // of copy.  If is_aligned is false, we align the source address.
1078   //
1079 
1080   void copy_memory(bool is_aligned, Register s, Register d,
1081                    Register count, Register tmp, int step) {
1082     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1083     bool is_backwards = step < 0;
1084     int granularity = uabs(step);
1085     const Register t0 = r3, t1 = r4;
1086 
1087     // <= 96 bytes do inline. Direction doesn't matter because we always
1088     // load all the data before writing anything
1089     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1090     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1091     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1092     const Register send = r17, dend = r16;
1093 
1094     if (PrefetchCopyIntervalInBytes > 0)
1095       __ prfm(Address(s, 0), PLDL1KEEP);
1096     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1097     __ br(Assembler::HI, copy_big);
1098 
1099     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1100     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1101 
1102     __ cmp(count, u1(16/granularity));
1103     __ br(Assembler::LS, copy16);
1104 
1105     __ cmp(count, u1(64/granularity));
1106     __ br(Assembler::HI, copy80);
1107 
1108     __ cmp(count, u1(32/granularity));
1109     __ br(Assembler::LS, copy32);
1110 
1111     // 33..64 bytes
1112     if (UseSIMDForMemoryOps) {


1264     // count and do a bulk copy of words.
1265     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1266     if (direction == copy_forwards)
1267       __ bl(copy_f);
1268     else
1269       __ bl(copy_b);
1270 
1271     // And the tail.
1272     copy_memory_small(s, d, count, tmp, step);
1273 
1274     if (granularity >= 8) __ bind(copy8);
1275     if (granularity >= 4) __ bind(copy4);
1276     __ bind(finish);
1277   }
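
The overall shape of copy_memory, as a sketch of the dispatch above (placeholder helpers stand in for the stub's labels; the inline cases are direction-agnostic because every load is issued before any store):

    void copy_big(); void copy16(); void copy32(); void copy64(); void copy80();

    void copy_memory_model(size_t byte_count, bool use_simd) {
      size_t inline_limit = use_simd ? 96 : 80;
      if (byte_count > inline_limit) { copy_big(); return; } // bulk copy + small tail
      if (byte_count <= 16) { copy16(); return; }
      if (byte_count <= 32) { copy32(); return; }
      if (byte_count <= 64) { copy64(); return; }            // the 33..64-byte path
      copy80();                                              // 65..80 (or ..96 with SIMD)
    }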
1278 
1279 
1280   void clobber_registers() {
1281 #ifdef ASSERT
1282     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1283     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1284     for (Register r = r3; r <= NOT_WIN64(r18) WIN64_ONLY(r17); r++)
1285       if (r != rscratch1) __ mov(r, rscratch1);
1286 #endif
1287 
1288   }
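
The NOT_WIN64/WIN64_ONLY pair stops the clobber loop at r17 on Windows, where r18 is the reserved platform register (it holds the TEB pointer) and must never be scribbled on. The macros follow HotSpot's usual conditional-code pattern; a sketch of their definitions (from memory, see globalDefinitions for the authoritative ones):

    #ifdef _WIN64
    #define WIN64_ONLY(code) code
    #define NOT_WIN64(code)
    #else
    #define WIN64_ONLY(code)
    #define NOT_WIN64(code) code
    #endif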
1289 
1290   // Scan over array at a for count oops, verifying each one.
1291   // Preserves a and count, clobbers rscratch1 and rscratch2.
1292   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1293     Label loop, end;
1294     __ mov(rscratch1, a);
1295     __ mov(rscratch2, zr);
1296     __ bind(loop);
1297     __ cmp(rscratch2, count);
1298     __ br(Assembler::HS, end);
1299     if (size == (size_t)wordSize) {
1300       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1301       __ verify_oop(temp);
1302     } else {
1303       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1304       __ decode_heap_oop(temp); // calls verify_oop
1305     }
1306     __ add(rscratch2, rscratch2, size);
1307     __ b(loop);


1700   //
1701   //  Output:
1702   //    r0 ==  0  -  success
1703   //    r0 == -1^K - failure, where K is partial transfer count
1704   //
1705   address generate_checkcast_copy(const char *name, address *entry,
1706                                   bool dest_uninitialized = false) {
1707 
1708     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1709 
1710     // Input registers (after setup_arg_regs)
1711     const Register from        = c_rarg0;   // source array address
1712     const Register to          = c_rarg1;   // destination array address
 1713     const Register count       = c_rarg2;   // elements count
1714     const Register ckoff       = c_rarg3;   // super_check_offset
1715     const Register ckval       = c_rarg4;   // super_klass
1716 
1717     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1718     RegSet wb_post_saved_regs = RegSet::of(count);
1719 
1720     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1721     const Register copied_oop  = r22;       // actual oop copied
 1722     const Register count_save  = r21;       // orig elements count
1723     const Register start_to    = r20;       // destination array start address

1724     const Register r19_klass   = r19;       // oop._klass
1725 
1726     //---------------------------------------------------------------
1727     // Assembler stub will be used for this call to arraycopy
1728     // if the two arrays are subtypes of Object[] but the
1729     // destination array type is not equal to or a supertype
1730     // of the source type.  Each element must be separately
1731     // checked.
1732 
1733     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1734                                copied_oop, r19_klass, count_save);
1735 
1736     __ align(CodeEntryAlignment);
1737     StubCodeMark mark(this, "StubRoutines", name);
1738     address start = __ pc();
1739 
1740     __ enter(); // required for proper stackwalking of RuntimeStub frame
1741 
1742 #ifdef ASSERT
1743     // caller guarantees that the arrays really are different
1744     // otherwise, we would have to make conjoint checks
1745     { Label L;
1746       array_overlap_test(L, TIMES_OOP);
1747       __ stop("checkcast_copy within a single array");
1748       __ bind(L);
1749     }
1750 #endif //ASSERT
1751 
1752     // Caller of this entry point must set up the argument registers.
1753     if (entry != NULL) {
1754       *entry = __ pc();
1755       BLOCK_COMMENT("Entry:");
1756     }
1757 
1758      // Empty array:  Nothing to do.
1759     __ cbz(count, L_done);
1760     __ push(RegSet::of(r19, r20, r21, r22), sp);

1761 
1762 #ifdef ASSERT
1763     BLOCK_COMMENT("assert consistent ckoff/ckval");
1764     // The ckoff and ckval must be mutually consistent,
1765     // even though caller generates both.
1766     { Label L;
1767       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1768       __ ldrw(start_to, Address(ckval, sco_offset));
1769       __ cmpw(ckoff, start_to);
1770       __ br(Assembler::EQ, L);
1771       __ stop("super_check_offset inconsistent");
1772       __ bind(L);
1773     }
1774 #endif //ASSERT
1775 
1776     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1777     bool is_oop = true;
1778     if (dest_uninitialized) {
1779       decorators |= IS_DEST_UNINITIALIZED;
1780     }


1809     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1810     __ cbz(copied_oop, L_store_element);
1811 
1812     __ load_klass(r19_klass, copied_oop);// query the object klass
1813     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1814     // ======== end loop ========
1815 
1816     // It was a real error; we must depend on the caller to finish the job.
1817     // Register count = remaining oops, count_orig = total oops.
1818     // Emit GC store barriers for the oops we have copied and report
1819     // their number to the caller.
1820 
1821     __ subs(count, count_save, count);     // K = partially copied oop count
1822     __ eon(count, count, zr);                   // report (-1^K) to caller
1823     __ br(Assembler::EQ, L_done_pop);
1824 
1825     __ BIND(L_do_card_marks);
1826     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1827 
1828     __ bind(L_done_pop);
1829     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1830     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1831 
1832     __ bind(L_done);
1833     __ mov(r0, count);
1834     __ leave();
1835     __ ret(lr);
1836 
1837     return start;
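
The success/failure protocol of the checkcast copy, modelled in plain C++ (my sketch; element_ok stands in for the stub's load_klass plus generate_type_check):

    #include <cstdint>

    // Returns 0 on success, or ~K (i.e. -1 ^ K) where K is the number of
    // elements already copied when a type check failed.
    template <typename Oop, typename TypeCheck>
    int64_t checkcast_copy_model(Oop* from, Oop* to, int64_t count, TypeCheck element_ok) {
      for (int64_t K = 0; K < count; K++) {
        Oop o = from[K];
        if (o && !element_ok(o)) return ~K;  // failed check: report ~K to the caller
        to[K] = o;                           // NULLs are stored unchecked (L_store_element)
      }
      return 0;                              // r0 == 0: every element copied
    }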
1838   }
1839 
1840   // Perform range checks on the proposed arraycopy.
1841   // Kills temp, but nothing else.
1842   // Also, clean the sign bits of src_pos and dst_pos.
1843   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1844                               Register src_pos, // source position (c_rarg1)
 1845                               Register dst,     // destination array oop (c_rarg2)
1846                               Register dst_pos, // destination position (c_rarg3)
1847                               Register length,
1848                               Register temp,
1849                               Label& L_failed) {


1986     // (6) src and dst should be arrays.
1987     // (7) src_pos + length must not exceed length of src.
1988     // (8) dst_pos + length must not exceed length of dst.
1989     //
1990 
1991     //  if (src == NULL) return -1;
1992     __ cbz(src, L_failed);
1993 
1994     //  if (src_pos < 0) return -1;
1995     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1996 
1997     //  if (dst == NULL) return -1;
1998     __ cbz(dst, L_failed);
1999 
2000     //  if (dst_pos < 0) return -1;
2001     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2002 
2003     // registers used as temp
2004     const Register scratch_length    = r16; // elements count to copy
2005     const Register scratch_src_klass = r17; // array klass
2006     const Register lh                = r15; // layout helper
2007 
2008     //  if (length < 0) return -1;
2009     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2010     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2011 
2012     __ load_klass(scratch_src_klass, src);
2013 #ifdef ASSERT
2014     //  assert(src->klass() != NULL);
2015     {
2016       BLOCK_COMMENT("assert klasses not null {");
2017       Label L1, L2;
2018       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2019       __ bind(L1);
2020       __ stop("broken null klass");
2021       __ bind(L2);
2022       __ load_klass(rscratch1, dst);
2023       __ cbz(rscratch1, L1);     // this would be broken also
2024       BLOCK_COMMENT("} assert klasses not null done");
2025     }
2026 #endif


2057       Label L;
2058       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2059       __ cmpw(lh, rscratch2);
2060       __ br(Assembler::GE, L);
2061       __ stop("must be a primitive array");
2062       __ bind(L);
2063       BLOCK_COMMENT("} assert primitive array done");
2064     }
2065 #endif
2066 
2067     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2068                            rscratch2, L_failed);
2069 
2070     // TypeArrayKlass
2071     //
2072     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2073     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2074     //
2075 
2076     const Register rscratch1_offset = rscratch1;    // array offset
2077     const Register r15_elsize = lh; // element size
2078 
2079     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2080            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2081     __ add(src, src, rscratch1_offset);           // src array offset
2082     __ add(dst, dst, rscratch1_offset);           // dst array offset
2083     BLOCK_COMMENT("choose copy loop based on element size");
2084 
2085     // next registers should be set before the jump to corresponding stub
2086     const Register from     = c_rarg0;  // source array address
2087     const Register to       = c_rarg1;  // destination array address
2088     const Register count    = c_rarg2;  // elements count
2089 
2090     // 'from', 'to', 'count' registers should be set in such order
2091     // since they are the same as 'src', 'src_pos', 'dst'.
2092 
2093     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2094 
2095     // The possible values of elsize are 0-3, i.e. exact_log2(element
2096     // size in bytes).  We do a simple bitwise binary search.
2097   __ BIND(L_copy_bytes);
2098     __ tbnz(r15_elsize, 1, L_copy_ints);
2099     __ tbnz(r15_elsize, 0, L_copy_shorts);
2100     __ lea(from, Address(src, src_pos));// src_addr
2101     __ lea(to,   Address(dst, dst_pos));// dst_addr
2102     __ movw(count, scratch_length); // length
2103     __ b(RuntimeAddress(byte_copy_entry));
2104 
2105   __ BIND(L_copy_shorts);
2106     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2107     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2108     __ movw(count, scratch_length); // length
2109     __ b(RuntimeAddress(short_copy_entry));
2110 
2111   __ BIND(L_copy_ints);
2112     __ tbnz(r15_elsize, 0, L_copy_longs);
2113     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2114     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2115     __ movw(count, scratch_length); // length
2116     __ b(RuntimeAddress(int_copy_entry));
2117 
2118   __ BIND(L_copy_longs);
2119 #ifdef ASSERT
2120     {
2121       BLOCK_COMMENT("assert long copy {");
2122       Label L;
2123       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2124       __ cmpw(r15_elsize, LogBytesPerLong);
2125       __ br(Assembler::EQ, L);
2126       __ stop("must be long copy, but elsize is wrong");
2127       __ bind(L);
2128       BLOCK_COMMENT("} assert long copy done");
2129     }
2130 #endif
2131     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2132     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2133     __ movw(count, scratch_length); // length
2134     __ b(RuntimeAddress(long_copy_entry));
2135 
2136     // ObjArrayKlass
2137   __ BIND(L_objArray);
2138     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2139 
2140     Label L_plain_copy, L_checkcast_copy;
2141     //  test array classes for subtyping
2142     __ load_klass(r15, dst);
2143     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2144     __ br(Assembler::NE, L_checkcast_copy);
2145 
2146     // Identically typed arrays can be copied without element-wise checks.
2147     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2148                            rscratch2, L_failed);
2149 
2150     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2151     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2152     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2153     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2154     __ movw(count, scratch_length); // length
2155   __ BIND(L_plain_copy);
2156     __ b(RuntimeAddress(oop_copy_entry));
2157 
2158   __ BIND(L_checkcast_copy);
2159     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2160     {
2161       // Before looking at dst.length, make sure dst is also an objArray.
2162       __ ldrw(rscratch1, Address(r15, lh_offset));
2163       __ movw(rscratch2, objArray_lh);
2164       __ eorw(rscratch1, rscratch1, rscratch2);
2165       __ cbnzw(rscratch1, L_failed);
2166 
2167       // It is safe to examine both src.length and dst.length.
2168       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2169                              r15, L_failed);
2170 
2171       __ load_klass(dst_klass, dst); // reload
2172 
2173       // Marshal the base address arguments now, freeing registers.
2174       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2175       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2176       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2177       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2178       __ movw(count, length);           // length (reloaded)
2179       Register sco_temp = c_rarg3;      // this register is free now
2180       assert_different_registers(from, to, count, sco_temp,
2181                                  dst_klass, scratch_src_klass);
2182       // assert_clean_int(count, sco_temp);
2183 
2184       // Generate the type check.
2185       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2186       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2187 
2188       // Smashes rscratch1, rscratch2
2189       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
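
The bitwise binary search on the element size (L_copy_bytes through L_copy_longs above), in scalar form (a sketch; the copy functions are placeholders for the stub entry points):

    void byte_copy(); void short_copy(); void int_copy(); void long_copy();

    // elsize = exact_log2(element size in bytes), so 0..3.
    void dispatch_copy(int elsize) {
      if (elsize & 2) {                                 // tbnz bit 1: ints or longs
        if (elsize & 1) long_copy(); else int_copy();
      } else {                                          // bytes or shorts
        if (elsize & 1) short_copy(); else byte_copy();
      }
    }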


3267     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3268 
3269     // Aliases
3270     Register adler  = c_rarg0;
3271     Register s1     = c_rarg0;
3272     Register s2     = c_rarg3;
3273     Register buff   = c_rarg1;
3274     Register len    = c_rarg2;
3275     Register nmax  = r4;
3276     Register base  = r5;
3277     Register count = r6;
3278     Register temp0 = rscratch1;
3279     Register temp1 = rscratch2;
3280     FloatRegister vbytes = v0;
3281     FloatRegister vs1acc = v1;
3282     FloatRegister vs2acc = v2;
3283     FloatRegister vtable = v3;
3284 
3285     // Max number of bytes we can process before having to take the mod
3286     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3287     uint64_t BASE = 0xfff1;
3288     uint64_t NMAX = 0x15B0;
3289 
3290     __ mov(base, BASE);
3291     __ mov(nmax, NMAX);
3292 
3293     // Load accumulation coefficients for the upper 16 bits
3294     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3295     __ ld1(vtable, __ T16B, Address(temp0));
3296 
3297     // s1 is initialized to the lower 16 bits of adler
3298     // s2 is initialized to the upper 16 bits of adler
3299     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3300     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3301 
3302     // The pipelined loop needs at least 16 elements for 1 iteration
3303     // It does check this, but it is more effective to skip to the cleanup loop
3304     __ cmp(len, (u1)16);
3305     __ br(Assembler::HS, L_nmax);
3306     __ cbz(len, L_combine);
3307 
3308     __ bind(L_simple_by1_loop);
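
A quick standalone check (my arithmetic, written to confirm the comment above) that 5552 really is the largest n for which the unreduced Adler-32 sums still fit in 32 bits:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t BASE = 0xfff1;  // 65521
      uint64_t n = 5552;             // NMAX
      assert(255*n*(n+1)/2 + (n+1)*(BASE-1) <= UINT32_MAX);  // fits
      n = 5553;
      assert(255*n*(n+1)/2 + (n+1)*(BASE-1) >  UINT32_MAX);  // one more overflows
    }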


4045   // r1  = str1
4046   // r2  = cnt1
4047   // r3  = str2
4048   // r4  = cnt2
4049   // r10 = tmp1
4050   // r11 = tmp2
4051   address generate_compare_long_string_different_encoding(bool isLU) {
4052     __ align(CodeEntryAlignment);
4053     StubCodeMark mark(this, "StubRoutines", isLU
4054         ? "compare_long_string_different_encoding LU"
4055         : "compare_long_string_different_encoding UL");
4056     address entry = __ pc();
4057     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4058         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4059         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4060     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4061         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4062     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4063     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4064 
4065     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4066 
4067     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4068     // cnt2 == amount of characters left to compare
 4069     // Check already loaded first 4 symbols (vtmp and tmp2(LU)/tmp1(UL))
4070     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4071     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4072     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4073     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4074     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4075     __ eor(rscratch2, tmp1, tmp2);
4076     __ mov(rscratch1, tmp2);
4077     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4078     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4079              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4080     __ push(spilled_regs, sp);
4081     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4082     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4083 
4084     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4085 
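
What the zip1 against the zero vector is doing for the LU/UL paths, as a scalar model (my sketch): each Latin-1 byte is interleaved with a zero byte, i.e. zero-extended to a UTF-16 code unit, so the two encodings can then be compared 64 bits at a time.

    // zip1(vtmp, T8B, vtmp, vtmpZ): bytes b0 b1 b2 b3 interleaved with zeros
    // -> b0 00 b1 00 b2 00 b3 00, i.e. four little-endian UTF-16 units.
    static void inflate4(const unsigned char* latin1, unsigned short* utf16) {
      for (int i = 0; i < 4; i++) utf16[i] = latin1[i];  // zero-extend each byte
    }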


4203   // r0  = result
4204   // r1  = str1
4205   // r2  = cnt1
4206   // r3  = str2
4207   // r4  = cnt2
4208   // r10 = tmp1
4209   // r11 = tmp2
4210   address generate_compare_long_string_same_encoding(bool isLL) {
4211     __ align(CodeEntryAlignment);
4212     StubCodeMark mark(this, "StubRoutines", isLL
4213         ? "compare_long_string_same_encoding LL"
4214         : "compare_long_string_same_encoding UU");
4215     address entry = __ pc();
4216     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4217         tmp1 = r10, tmp2 = r11;
4218     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4219         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4220         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4221     // exit the large loop when fewer than 64 bytes are left to read, or when
4222     // we are about to prefetch memory beyond the array bounds
4223     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4224     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used
4225     // update the cnt2 counter for the 8 bytes already loaded
4226     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4227     // update the pointers to account for the previous read
4228     __ add(str1, str1, wordSize);
4229     __ add(str2, str2, wordSize);
4230     if (SoftwarePrefetchHintDistance >= 0) {
4231       __ bind(LARGE_LOOP_PREFETCH);
4232         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4233         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4234         compare_string_16_bytes_same(DIFF, DIFF2);
4235         compare_string_16_bytes_same(DIFF, DIFF2);
4236         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4237         compare_string_16_bytes_same(DIFF, DIFF2);
4238         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4239         compare_string_16_bytes_same(DIFF, DIFF2);
4240         __ br(__ GT, LARGE_LOOP_PREFETCH);
4241         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4242     }
4243     // less than 16 bytes left?
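
    Editor's note: compare_string_16_bytes_same compares two 16-byte chunks
    with a pair of 8-byte loads and eor, branching to the DIFF handlers on a
    mismatch; the handlers then locate the first differing character inside
    the word. A scalar sketch of that locate step, assuming a GCC-style
    __builtin_ctzll and a little-endian layout (illustrative names; editor's
    sketch, not the stub's code):

    #include <stdint.h>

    /* Editor's sketch: given two 8-byte chunks, return the byte index of
     * the first difference, or -1 if the chunks are identical. */
    static int first_diff_byte(uint64_t a, uint64_t b) {
      uint64_t x = a ^ b;             /* nonzero iff the chunks differ */
      if (x == 0)
        return -1;
      return __builtin_ctzll(x) >> 3; /* little-endian: lowest differing byte */
    }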


4629     if (generatePrfm) {
4630       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4631     }
4632     __ zip1(v3, __ T16B, src2, v0);
4633     __ zip2(v4, __ T16B, src2, v0);
4634     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4635   }
4636 
4637   // r0  = src
4638   // r1  = dst
4639   // r2  = len
4640   // r3  = len >> 3
4641   // v0  = 0
4642   // v1  = loaded 8 bytes
4643   address generate_large_byte_array_inflate() {
4644     __ align(CodeEntryAlignment);
4645     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4646     address entry = __ pc();
4647     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4648     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4649     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
4650 
4651     // do one more 8-byte read so that the address is 16-byte aligned in most
4652     // cases, and a single store instruction can be used
4653     __ ldrd(v2, __ post(src, 8));
4654     __ sub(octetCounter, octetCounter, 2);
4655     __ zip1(v1, __ T16B, v1, v0);
4656     __ zip1(v2, __ T16B, v2, v0);
4657     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4658     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4659     __ subs(rscratch1, octetCounter, large_loop_threshold);
4660     __ br(__ LE, LOOP_START);
4661     __ b(LOOP_PRFM_START);
4662     __ bind(LOOP_PRFM);
4663       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4664     __ bind(LOOP_PRFM_START);
4665       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4666       __ sub(octetCounter, octetCounter, 8);
4667       __ subs(rscratch1, octetCounter, large_loop_threshold);
4668       inflate_and_store_2_fp_registers(true, v3, v4);
4669       inflate_and_store_2_fp_registers(true, v5, v6);
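
    Editor's note: the inflate loop is the vector form of zero-extending
    Latin-1 bytes to UTF-16 code units: zip1 interleaves the low 8 bytes of
    a source vector with the zero vector v0, zip2 the high 8 bytes, so each
    st1 writes 16 inflated characters. The scalar equivalent (editor's
    sketch, illustrative names):

    #include <stddef.h>
    #include <stdint.h>

    /* Editor's sketch: scalar byte-array inflation, Latin-1 -> UTF-16. */
    static void inflate_bytes(const uint8_t *src, uint16_t *dst, size_t len) {
      for (size_t i = 0; i < len; i++)
        dst[i] = src[i];  /* zero-extend 8 bits to 16; zip1/zip2 do 16 at a time */
    }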


4879 
4880   class MontgomeryMultiplyGenerator : public MacroAssembler {
4881 
4882     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4883       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4884 
4885     RegSet _toSave;
4886     bool _squaring;
4887 
4888   public:
4889     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4890       : MacroAssembler(as->code()), _squaring(squaring) {
4891 
4892       // Register allocation
4893 
4894       Register reg = c_rarg0;
4895       Pa_base = reg;       // Argument registers
4896       if (squaring)
4897         Pb_base = Pa_base;
4898       else
4899         Pb_base = next_reg(reg);
4900       Pn_base = next_reg(reg);
4901       Rlen = next_reg(reg);
4902       inv = next_reg(reg);
4903       Pm_base = next_reg(reg);
4904 
4905                           // Working registers:
4906       Ra =  next_reg(reg); // The current digit of a, b, n, and m.
4907       Rb =  next_reg(reg);
4908       Rm =  next_reg(reg);
4909       Rn =  next_reg(reg);
4910 
4911       Pa =  next_reg(reg); // Pointers to the current/next digit of a, b, n, and m.
4912       Pb =  next_reg(reg);
4913       Pm =  next_reg(reg);
4914       Pn =  next_reg(reg);
4915 
4916       t0 =  next_reg(reg); // Three registers which form a
4917       t1 =  next_reg(reg); // triple-precision accumulator.
4918       t2 =  next_reg(reg);
4919 
4920       Ri =  next_reg(reg); // Inner and outer loop indexes.
4921       Rj =  next_reg(reg);
4922 
4923       Rhi_ab = next_reg(reg); // Product registers: low and high parts
4924       Rlo_ab = next_reg(reg); // of a*b and m*n.
4925       Rhi_mn = next_reg(reg);
4926       Rlo_mn = next_reg(reg);
4927 
4928       // r19 and up are callee-saved.
4929       _toSave = RegSet::range(r19, reg) + Pm_base;
4930     }
4931 
4932   private:
4933     Register next_reg(Register &reg) {
4934 #ifdef _WIN64
4935       // skip r18 on Windows; the platform ABI reserves it for native TLS (the TEB pointer)
4936       return ++reg == r18 ? ++reg : reg;
4937 #else
4938       return ++reg;
4939 #endif
4940     }
4941 
4942     void save_regs() {
4943       push(_toSave, sp);
4944     }
4945 
4946     void restore_regs() {
4947       pop(_toSave, sp);
4948     }
4949 
4950     template <typename T>
4951     void unroll_2(Register count, T block) {
4952       Label loop, end, odd;
4953       tbnz(count, 0, odd);
4954       cbz(count, end);
4955       align(16);
4956       bind(loop);
4957       (this->*block)();
4958       bind(odd);
4959       (this->*block)();
4960       subs(count, count, 2);
4961       br(Assembler::GT, loop);


5374       }
5375       block_comment("} // i");
5376 
5377       normalize(Rlen);
5378 
5379       mov(Ra, Pm_base);  // Save Pm_base in Ra
5380       restore_regs();  // Restore caller's Pm_base
5381 
5382       // Copy our result into caller's Pm_base
5383       reverse(Pm_base, Ra, Rlen, t0, t1);
5384 
5385       leave();
5386       bind(nothing);
5387       ret(lr);
5388 
5389       return entry;
5390     }
5391     // In C, approximately:
5392 
5393     // void
5394     // montgomery_multiply(uint64_t Pa_base[], uint64_t Pb_base[],
5395     //                     uint64_t Pn_base[], uint64_t Pm_base[],
5396     //                     uint64_t inv, int len) {
5397     //   uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5398     //   uint64_t *Pa, *Pb, *Pn, *Pm;
5399     //   uint64_t Ra, Rb, Rn, Rm;
5400 
5401     //   int i;
5402 
5403     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5404 
5405     //   for (i = 0; i < len; i++) {
5406     //     int j;
5407 
5408     //     Pa = Pa_base;
5409     //     Pb = Pb_base + i;
5410     //     Pm = Pm_base;
5411     //     Pn = Pn_base + i;
5412 
5413     //     Ra = *Pa;
5414     //     Rb = *Pb;
5415     //     Rm = *Pm;
5416     //     Rn = *Pn;
5417 
5418     //     int iters = i;
5419     //     for (j = 0; iters--; j++) {


5587         bind(end);
5588         block_comment("} // i");
5589       }
5590 
5591       normalize(Rlen);
5592 
5593       mov(Ra, Pm_base);  // Save Pm_base in Ra
5594       restore_regs();  // Restore caller's Pm_base
5595 
5596       // Copy our result into caller's Pm_base
5597       reverse(Pm_base, Ra, Rlen, t0, t1);
5598 
5599       leave();
5600       ret(lr);
5601 
5602       return entry;
5603     }
5604     // In C, approximately:
5605 
5606     // void
5607     // montgomery_square(uint64_t Pa_base[], uint64_t Pn_base[],
5608     //                   uint64_t Pm_base[], uint64_t inv, int len) {
5609     //   uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5610     //   uint64_t *Pa, *Pb, *Pn, *Pm;
5611     //   uint64_t Ra, Rb, Rn, Rm;
5612 
5613     //   int i;
5614 
5615     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5616 
5617     //   for (i = 0; i < len; i++) {
5618     //     int j;
5619 
5620     //     Pa = Pa_base;
5621     //     Pb = Pa_base + i;
5622     //     Pm = Pm_base;
5623     //     Pn = Pn_base + i;
5624 
5625     //     Ra = *Pa;
5626     //     Rb = *Pb;
5627     //     Rm = *Pm;
5628     //     Rn = *Pn;
5629 
5630     //     int iters = (i+1)/2;
5631     //     for (j = 0; iters--; j++) {
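
    Editor's note: the halved inner-loop count, iters = (i+1)/2, is what
    makes squaring cheaper than a general multiply: column i of the square
    is the sum of a[j]*a[i-j], the off-diagonal pairs are symmetric, so each
    is computed once and counted twice, and the diagonal term a[i/2]^2 is
    added only when i is even. A sketch of one column under that symmetry,
    reusing the hypothetical macc sketched earlier (editor's sketch, not the
    patch's code):

    /* Editor's sketch: accumulate column i of the square of a[] using the
     * symmetry a[j]*a[i-j] == a[i-j]*a[j]. */
    static void square_column(const uint64_t *a, int i,
                              uint64_t *t0, uint64_t *t1, uint64_t *t2) {
      for (int j = 0; j < (i + 1) / 2; j++) {
        macc(a[j], a[i - j], t0, t1, t2);  /* off-diagonal pair, counted... */
        macc(a[j], a[i - j], t0, t1, t2);  /* ...twice */
      }
      if ((i & 1) == 0)
        macc(a[i / 2], a[i / 2], t0, t1, t2);  /* diagonal term for even i */
    }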

