1107 // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
1108 __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
1109 __ srl_ptr(count, CardTableModRefBS::card_shift, count);
1110 __ sub(count, addr, count);
1111 AddressLiteral rs(ct->byte_map_base);
1112 __ set(rs, tmp);
1113 __ BIND(L_loop);
1114 __ stb(G0, tmp, addr);
1115 __ subcc(count, 1, count);
1116 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1117 __ delayed()->add(addr, 1, addr);
1118 }
1119 break;
1120 case BarrierSet::ModRef:
1121 break;
1122 default:
1123 ShouldNotReachHere();
1124 }
1125 }
1126
1127
1128 // Copy big chunks forward with shift
1129 //
1130 // Inputs:
1131 // from - source arrays
1132 // to - destination array aligned to 8-bytes
1133 // count - elements count to copy >= the count equivalent to 16 bytes
1134 // count_dec - elements count's decrement equivalent to 16 bytes
1135 // L_copy_bytes - copy exit label
1136 //
1137 void copy_16_bytes_forward_with_shift(Register from, Register to,
1138 Register count, int count_dec, Label& L_copy_bytes) {
1139 Label L_loop, L_aligned_copy, L_copy_last_bytes;
1140
1141 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
// G1 receives the byte misalignment of 'from' (0..7). Zero means 'from'
// shares the 8-byte alignment that 'to' already has (see contract above),
// so we can skip straight to the plain aligned path.
1142 __ andcc(from, 7, G1); // misaligned bytes
1143 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1144 __ delayed()->nop();
1145
1146 const Register left_shift = G1; // left shift bit counter
1147 const Register right_shift = G5; // right shift bit counter
1148
// Convert the byte misalignment into bit-shift counts:
// left_shift = misalignment * 8, right_shift = 64 - left_shift.
1149 __ sll(G1, LogBitsPerByte, left_shift);
1150 __ mov(64, right_shift);
1151 __ sub(right_shift, left_shift, right_shift);
1152
1153 //
1154 // Load 2 aligned 8-bytes chunks and use one from previous iteration
1155 // to form 2 aligned 8-bytes chunks to store.
1156 //
// Prime the pipeline: align 'from' down to an 8-byte boundary and load
// the first aligned chunk into O3. Each loop iteration combines the
// previous chunk (O3) with the next loaded chunks to produce two
// aligned 8-byte stores.
1157 __ deccc(count, count_dec); // Pre-decrement 'count'
1158 __ andn(from, 7, from); // Align address
1159 __ ldx(from, 0, O3);
1160 __ inc(from, 8);
1161 __ align(OptoLoopAlignment);
1162 __ BIND(L_loop);
1163 __ ldx(from, 0, O4);
1164 __ deccc(count, count_dec); // Can we do next iteration after this one?
1165 __ ldx(from, 8, G4);
1166 __ inc(to, 16);
1167 __ inc(from, 16);
// First store: high part from O3, low part from O4.
1168 __ sllx(O3, left_shift, O3);
1169 __ srlx(O4, right_shift, G3);
1170 __ bset(G3, O3);
1171 __ stx(O3, to, -16);
// Second store: high part from O4, low part from G4.
1172 __ sllx(O4, left_shift, O4);
1173 __ srlx(G4, right_shift, G3);
1174 __ bset(G3, O4);
1175 __ stx(O4, to, -8);
1176 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
// Delay slot: carry the last loaded chunk into the next iteration.
1177 __ delayed()->mov(G4, O3);
1178
// The loop copies 16 bytes per iteration; check whether one more 8-byte
// (half-iteration) chunk remains before falling back to the byte copy.
1179 __ inccc(count, count_dec>>1 ); // + 8 bytes
// Annulled delay slot: the 'count' restore executes only on the taken
// (negative) path; the fall-through keeps the half-decremented count.
1180 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1181 __ delayed()->inc(count, count_dec>>1); // restore 'count'
1182
1183 // copy 8 bytes, part of them already loaded in O3
1184 __ ldx(from, 0, O4);
1185 __ inc(to, 8);
1186 __ inc(from, 8);
1187 __ sllx(O3, left_shift, O3);
1188 __ srlx(O4, right_shift, G3);
1189 __ bset(O3, G3);
1190 __ stx(G3, to, -8);
1191
1192 __ BIND(L_copy_last_bytes);
// Convert right_shift from bits back to bytes and move 'from' back by
// that amount; this undoes the earlier align-down so the byte-tail copy
// at L_copy_bytes resumes at the first uncopied byte.
1193 __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1194 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1195 __ delayed()->sub(from, right_shift, from); // restore address
1196
1197 __ BIND(L_aligned_copy);
1198 }
1199
1200 // Copy big chunks backward with shift
1201 //
1202 // Inputs:
1203 // end_from - source arrays end address
1204 // end_to - destination array end address aligned to 8-bytes
1205 // count - elements count to copy >= the count equivalent to 16 bytes
1206 // count_dec - elements count's decrement equivalent to 16 bytes
1207 // L_aligned_copy - aligned copy exit label
1331 __ sub(count, G1, count);
1332 __ BIND(L_align);
1333 __ ldub(from, 0, O3);
1334 __ deccc(G1);
1335 __ inc(from);
1336 __ stb(O3, to, 0);
1337 __ br(Assembler::notZero, false, Assembler::pt, L_align);
1338 __ delayed()->inc(to);
1339 __ BIND(L_skip_alignment);
1340 }
1341 #ifdef _LP64
1342 if (!aligned)
1343 #endif
1344 {
1345 // Copy with shift 16 bytes per iteration if arrays do not have
1346 // the same alignment mod 8, otherwise fall through to the next
1347 // code for aligned copy.
1348 // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1349 // Also jump over aligned copy after the copy with shift completed.
1350
1351 copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
1352 }
1353
1354 // Both array are 8 bytes aligned, copy 16 bytes at a time
1355 __ and3(count, 7, G4); // Save count
1356 __ srl(count, 3, count);
1357 generate_disjoint_long_copy_core(aligned);
1358 __ mov(G4, count); // Restore count
1359
1360 // copy tailing bytes
1361 __ BIND(L_copy_byte);
1362 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1363 __ align(OptoLoopAlignment);
1364 __ BIND(L_copy_byte_loop);
1365 __ ldub(from, offset, O3);
1366 __ deccc(count);
1367 __ stb(O3, to, offset);
1368 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1369 __ delayed()->inc(offset);
1370
1371 __ BIND(L_exit);
1559 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1560 __ delayed()->lduh(from, 0, O3);
1561 __ dec(count, 2);
1562 __ lduh(from, 2, O4);
1563 __ inc(from, 4);
1564 __ inc(to, 4);
1565 __ sth(O3, to, -4);
1566 __ sth(O4, to, -2);
1567 __ BIND(L_skip_alignment2);
1568 }
1569 #ifdef _LP64
1570 if (!aligned)
1571 #endif
1572 {
1573 // Copy with shift 16 bytes per iteration if arrays do not have
1574 // the same alignment mod 8, otherwise fall through to the next
1575 // code for aligned copy.
1576 // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1577 // Also jump over aligned copy after the copy with shift completed.
1578
1579 copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
1580 }
1581
1582 // Both array are 8 bytes aligned, copy 16 bytes at a time
1583 __ and3(count, 3, G4); // Save
1584 __ srl(count, 2, count);
1585 generate_disjoint_long_copy_core(aligned);
1586 __ mov(G4, count); // restore
1587
1588 // copy 1 element at a time
1589 __ BIND(L_copy_2_bytes);
1590 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1591 __ align(OptoLoopAlignment);
1592 __ BIND(L_copy_2_bytes_loop);
1593 __ lduh(from, offset, O3);
1594 __ deccc(count);
1595 __ sth(O3, to, offset);
1596 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1597 __ delayed()->inc(offset, 2);
1598
1599 __ BIND(L_exit);
1933 // copy 1 element (2 bytes) at a time
1934 __ BIND(L_copy_2_bytes);
1935 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1936 __ BIND(L_copy_2_bytes_loop);
1937 __ dec(end_from, 2);
1938 __ dec(end_to, 2);
1939 __ lduh(end_from, 0, O4);
1940 __ deccc(count);
1941 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1942 __ delayed()->sth(O4, end_to, 0);
1943
1944 __ BIND(L_exit);
1945 // O3, O4 are used as temp registers
1946 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1947 __ retl();
1948 __ delayed()->mov(G0, O0); // return 0
1949 return start;
1950 }
1951
1952 //
1953 // Generate core code for disjoint int copy (and oop copy on 32-bit).
1954 // If "aligned" is true, the "from" and "to" addresses are assumed
1955 // to be heapword aligned.
1956 //
1957 // Arguments:
1958 // from: O0
1959 // to: O1
1960 // count: O2 treated as signed
1961 //
1962 void generate_disjoint_int_copy_core(bool aligned) {
1963
1964 Label L_skip_alignment, L_aligned_copy;
1965 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1966
1967 const Register from = O0; // source array address
1968 const Register to = O1; // destination array address
1969 const Register count = O2; // elements count
1970 const Register offset = O5; // offset from start of arrays
1971 // O3, O4, G3, G4 are used as temp registers
1972
1973 // 'aligned' == true when it is known statically during compilation
1974 // of this arraycopy call site that both 'from' and 'to' addresses
1975 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1976 //
1977 // Aligned arrays have 4 bytes alignment in 32-bits VM
1978 // and 8 bytes - in 64-bits VM.
1979 //
1980 #ifdef _LP64
1981 if (!aligned)
1982 #endif
1983 {
1984 // The next check could be put under 'ifndef' since the code in
1985 // generate_disjoint_long_copy_core() has own checks and set 'offset'.
1996 __ inc(from, 4);
1997 __ inc(to, 4);
1998 __ dec(count);
1999 __ st(O3, to, -4);
2000 __ BIND(L_skip_alignment);
2001
2002 // if arrays have same alignment mod 8, do 4 elements copy
2003 __ andcc(from, 7, G0);
2004 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2005 __ delayed()->ld(from, 0, O3);
2006
2007 //
2008 // Load 2 aligned 8-bytes chunks and use one from previous iteration
2009 // to form 2 aligned 8-bytes chunks to store.
2010 //
2011 // copy_16_bytes_forward_with_shift() is not used here since this
2012 // code is more optimal.
2013
2014 // copy with shift 4 elements (16 bytes) at a time
2015 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4
2016
2017 __ align(OptoLoopAlignment);
2018 __ BIND(L_copy_16_bytes);
2019 __ ldx(from, 4, O4);
2020 __ deccc(count, 4); // Can we do next iteration after this one?
2021 __ ldx(from, 12, G4);
2022 __ inc(to, 16);
2023 __ inc(from, 16);
2024 __ sllx(O3, 32, O3);
2025 __ srlx(O4, 32, G3);
2026 __ bset(G3, O3);
2027 __ stx(O3, to, -16);
2028 __ sllx(O4, 32, O4);
2029 __ srlx(G4, 32, G3);
2030 __ bset(G3, O4);
2031 __ stx(O4, to, -8);
2032 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2033 __ delayed()->mov(G4, O3);
2034
2035 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2036 __ delayed()->inc(count, 4); // restore 'count'
2037
2038 __ BIND(L_aligned_copy);
2039 }
2040 // copy 4 elements (16 bytes) at a time
2041 __ and3(count, 1, G4); // Save
2042 __ srl(count, 1, count);
2043 generate_disjoint_long_copy_core(aligned);
2044 __ mov(G4, count); // Restore
2045
2046 // copy 1 element at a time
2047 __ BIND(L_copy_4_bytes);
2048 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2049 __ BIND(L_copy_4_bytes_loop);
2050 __ ld(from, offset, O3);
2051 __ deccc(count);
2052 __ st(O3, to, offset);
2053 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2054 __ delayed()->inc(offset, 4);
2055 __ BIND(L_exit);
2056 }
2057
2058 //
2059 // Generate stub for disjoint int copy. If "aligned" is true, the
2206 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2207
2208 if (entry != NULL) {
2209 *entry = __ pc();
2210 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2211 BLOCK_COMMENT("Entry:");
2212 }
2213
2214 array_overlap_test(nooverlap_target, 2);
2215
2216 generate_conjoint_int_copy_core(aligned);
2217
2218 // O3, O4 are used as temp registers
2219 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2220 __ retl();
2221 __ delayed()->mov(G0, O0); // return 0
2222 return start;
2223 }
2224
2225 //
2226 // Generate core code for disjoint long copy (and oop copy on 64-bit).
2227 // "aligned" is ignored, because we must make the stronger
2228 // assumption that both addresses are always 64-bit aligned.
2229 //
2230 // Arguments:
2231 // from: O0
2232 // to: O1
2233 // count: O2 treated as signed
2234 //
2235 // count -= 2;
2236 // if ( count >= 0 ) { // >= 2 elements
2237 // if ( count > 6) { // >= 8 elements
2238 // count -= 6; // original count - 8
2239 // do {
2240 // copy_8_elements;
2241 // count -= 8;
2242 // } while ( count >= 0 );
2243 // count += 6;
2244 // }
2245 // if ( count >= 0 ) { // >= 2 elements
2250 // }
2251 // count += 2;
2252 // if ( count != 0 ) { // 1 element left
2253 // copy_1_element;
2254 // }
2255 //
2256 void generate_disjoint_long_copy_core(bool aligned) {
2257 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2258 const Register from = O0; // source array address
2259 const Register to = O1; // destination array address
2260 const Register count = O2; // elements count
2261 const Register offset0 = O4; // element offset
2262 const Register offset8 = O5; // next element offset
2263
2264 __ deccc(count, 2);
2265 __ mov(G0, offset0); // offset from start of arrays (0)
2266 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2267 __ delayed()->add(offset0, 8, offset8);
2268
2269 // Copy by 64 bytes chunks
2270 Label L_copy_64_bytes;
2271 const Register from64 = O3; // source address
2272 const Register to64 = G3; // destination address
2273 __ subcc(count, 6, O3);
2274 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2275 __ delayed()->mov(to, to64);
2276 // Now we can use O4(offset0), O5(offset8) as temps
2277 __ mov(O3, count);
2278 __ mov(from, from64);
2279
2280 __ align(OptoLoopAlignment);
2281 __ BIND(L_copy_64_bytes);
2282 for( int off = 0; off < 64; off += 16 ) {
2283 __ ldx(from64, off+0, O4);
2284 __ ldx(from64, off+8, O5);
2285 __ stx(O4, to64, off+0);
2286 __ stx(O5, to64, off+8);
2287 }
2288 __ deccc(count, 8);
2289 __ inc(from64, 64);
2290 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
2291 __ delayed()->inc(to64, 64);
2292
2293 // Restore O4(offset0), O5(offset8)
2294 __ sub(from64, from, offset0);
2295 __ inccc(count, 6);
2296 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2297 __ delayed()->add(offset0, 8, offset8);
2298
2299 // Copy by 16 bytes chunks
2300 __ align(OptoLoopAlignment);
2301 __ BIND(L_copy_16_bytes);
2302 __ ldx(from, offset0, O3);
2303 __ ldx(from, offset8, G3);
2304 __ deccc(count, 2);
2305 __ stx(O3, to, offset0);
2306 __ inc(offset0, 16);
2307 __ stx(G3, to, offset8);
2308 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2309 __ delayed()->inc(offset8, 16);
2310
2311 // Copy last 8 bytes
2312 __ BIND(L_copy_8_bytes);
2313 __ inccc(count, 2);
2314 __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2315 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
|
1107 // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
1108 __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
1109 __ srl_ptr(count, CardTableModRefBS::card_shift, count);
1110 __ sub(count, addr, count);
1111 AddressLiteral rs(ct->byte_map_base);
1112 __ set(rs, tmp);
1113 __ BIND(L_loop);
1114 __ stb(G0, tmp, addr);
1115 __ subcc(count, 1, count);
1116 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1117 __ delayed()->add(addr, 1, addr);
1118 }
1119 break;
1120 case BarrierSet::ModRef:
1121 break;
1122 default:
1123 ShouldNotReachHere();
1124 }
1125 }
1126
1127 //
1128 // Generate main code for disjoint arraycopy
1129 //
1130 typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
1131 Label& L_loop, bool use_prefetch, bool use_bis);
1132
// Emit the bulk portion of a disjoint arraycopy. The inner copy loop
// itself is produced by 'copy_loop_func'; this routine decides, from
// UseBlockCopy and the prefetch distances, which variants of that loop
// to emit (with/without prefetch, with/without BIS block-init stores)
// and guards each variant with the appropriate 'count' checks.
//
// from, to, count - registers set up by the caller
// log2_elem_size - log2 of the element size in bytes (must be <= 3)
// iter_size - bytes copied per inner-loop iteration
// copy_loop_func - member function emitting one inner-loop variant
1133 void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
1134 int iter_size, CopyLoopFunc copy_loop_func) {
1135 Label L_copy;
1136
1137 assert(log2_elem_size <= 3, "the following code should be changed");
// Elements per 16 bytes - the unit the inner loops decrement 'count' by.
1138 int count_dec = 16>>log2_elem_size;
1139
1140 int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
1141 assert(prefetch_dist < 4096, "invalid value");
1142 prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
1143 int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
1144
1145 if (UseBlockCopy) {
1146 Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
1147
1148 // 64 bytes tail + bytes copied in one loop iteration
1149 int tail_size = 64 + iter_size;
1150 int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
1151 // Use BIS copy only for big arrays since it requires membar.
1152 __ set(block_copy_count, O4);
1153 __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
1154 // This code is for disjoint source and destination:
1155 // to <= from || to >= from+count
1156 // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
1157 __ sub(from, to, O4);
1158 __ srax(O4, 4, O4); // divide by 16 since following short branch have only 5 bits for imm.
1159 __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
1160
// Switch stores to the block-init ASI for the BIS loop variants below.
1161 __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
1162 // BIS should not be used to copy tail (64 bytes+iter_size)
1163 // to avoid zeroing of following values.
1164 __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
1165
1166 if (prefetch_count > 0) { // rounded up to one iteration count
1167 // Do prefetching only if copy size is bigger
1168 // than prefetch distance.
1169 __ set(prefetch_count, O4);
1170 __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
1171 __ sub(count, prefetch_count, count);
1172
// BIS + prefetch variant of the inner loop.
1173 (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
1174 __ add(count, prefetch_count, count); // restore count
1175
1176 } // prefetch_count > 0
1177
// BIS variant without prefetch (also the landing loop for the
// prefetching variant's final iterations).
1178 (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
1179 __ add(count, (tail_size>>log2_elem_size), count); // restore count
1180
// Back to normal stores for the remaining (tail) copy.
1181 __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
1182 // BIS needs membar.
1183 __ membar(Assembler::StoreLoad);
1184 // Copy tail
1185 __ ba_short(L_copy);
1186
1187 __ BIND(L_skip_block_copy);
1188 } // UseBlockCopy
1189
1190 if (prefetch_count > 0) { // rounded up to one iteration count
1191 // Do prefetching only if copy size is bigger
1192 // than prefetch distance.
1193 __ set(prefetch_count, O4);
1194 __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
1195 __ sub(count, prefetch_count, count);
1196
1197 Label L_copy_prefetch;
// Non-BIS, prefetching variant of the inner loop.
1198 (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
1199 __ add(count, prefetch_count, count); // restore count
1200
1201 } // prefetch_count > 0
1202
// Plain variant: no prefetch, no BIS; copies whatever remains.
1203 (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
1204 }
1205
1206
1207
1208 //
1209 // Helper methods for copy_16_bytes_forward_with_shift()
1210 //
// Inner-loop generator for copy_16_bytes_forward_with_shift(), invoked
// through disjoint_copy_core() in up to three variants. On entry O3 must
// hold the previous 8-byte chunk already shifted left by 'left_shift'
// (the caller pre-shifts it), and G1/G5 hold the shift bit counts.
// 'use_prefetch' emits prefetch instructions for src/dst; 'use_bis'
// selects block-init (stxa) stores instead of plain stx.
1211 void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
1212 Label& L_loop, bool use_prefetch, bool use_bis) {
1213
1214 const Register left_shift = G1; // left shift bit counter
1215 const Register right_shift = G5; // right shift bit counter
1216
1217 __ align(OptoLoopAlignment);
1218 __ BIND(L_loop);
1219 if (use_prefetch) {
1220 if (ArraycopySrcPrefetchDistance > 0) {
1221 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1222 }
1223 if (ArraycopyDstPrefetchDistance > 0) {
1224 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1225 }
1226 }
1227 __ ldx(from, 0, O4);
1228 __ ldx(from, 8, G4);
1229 __ inc(to, 16);
1230 __ inc(from, 16);
1231 __ deccc(count, count_dec); // Can we do next iteration after this one?
// First store: pre-shifted O3 combined with the high bits of O4.
1232 __ srlx(O4, right_shift, G3);
1233 __ bset(G3, O3);
// Second store: shifted O4 combined with the high bits of G4.
1234 __ sllx(O4, left_shift, O4);
1235 __ srlx(G4, right_shift, G3);
1236 __ bset(G3, O4);
1237 if (use_bis) {
1238 __ stxa(O3, to, -16);
1239 __ stxa(O4, to, -8);
1240 } else {
1241 __ stx(O3, to, -16);
1242 __ stx(O4, to, -8);
1243 }
1244 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
// Delay slot: pre-shift the carried chunk for the next iteration,
// preserving the loop's invariant that O3 arrives already shifted.
1245 __ delayed()->sllx(G4, left_shift, O3);
1246 }
1247
1248 // Copy big chunks forward with shift
1249 //
1250 // Inputs:
1251 // from - source arrays
1252 // to - destination array aligned to 8-bytes
1253 // count - elements count to copy >= the count equivalent to 16 bytes
1254 // count_dec - elements count's decrement equivalent to 16 bytes
1255 // L_copy_bytes - copy exit label
1256 //
1257 void copy_16_bytes_forward_with_shift(Register from, Register to,
1258 Register count, int log2_elem_size, Label& L_copy_bytes) {
1259 Label L_aligned_copy, L_copy_last_bytes;
1260 assert(log2_elem_size <= 3, "the following code should be changed");
// Elements per 16 bytes (one inner-loop iteration).
1261 int count_dec = 16>>log2_elem_size;
1262
1263 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
// G1 receives the byte misalignment of 'from' (0..7); zero means both
// arrays share 8-byte alignment, so skip to the aligned path.
1264 __ andcc(from, 7, G1); // misaligned bytes
1265 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1266 __ delayed()->nop();
1267
1268 const Register left_shift = G1; // left shift bit counter
1269 const Register right_shift = G5; // right shift bit counter
1270
// left_shift = misalignment * 8 bits; right_shift = 64 - left_shift.
1271 __ sll(G1, LogBitsPerByte, left_shift);
1272 __ mov(64, right_shift);
1273 __ sub(right_shift, left_shift, right_shift);
1274
1275 //
1276 // Load 2 aligned 8-bytes chunks and use one from previous iteration
1277 // to form 2 aligned 8-bytes chunks to store.
1278 //
// Prime the loop: align 'from' down, load the first aligned chunk and
// pre-shift it, establishing the invariant copy_16_bytes_shift_loop()
// expects (O3 holds the previous chunk already shifted left).
1279 __ dec(count, count_dec); // Pre-decrement 'count'
1280 __ andn(from, 7, from); // Align address
1281 __ ldx(from, 0, O3);
1282 __ inc(from, 8);
1283 __ sllx(O3, left_shift, O3);
1284
// Emit the bulk loop(s): prefetch/BIS variants chosen by the core.
1285 disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop);
1286
// Check whether one more 8-byte (half-iteration) chunk remains before
// falling back to the element-tail copy.
1287 __ inccc(count, count_dec>>1 ); // + 8 bytes
// Annulled delay slot: 'count' is restored only on the taken
// (negative) path; the fall-through keeps the half-decremented count.
1288 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1289 __ delayed()->inc(count, count_dec>>1); // restore 'count'
1290
1291 // copy 8 bytes, part of them already loaded in O3
1292 __ ldx(from, 0, O4);
1293 __ inc(to, 8);
1294 __ inc(from, 8);
1295 __ srlx(O4, right_shift, G3);
1296 __ bset(O3, G3);
1297 __ stx(G3, to, -8);
1298
1299 __ BIND(L_copy_last_bytes);
// Convert right_shift from bits to bytes and move 'from' back by that
// amount, undoing the align-down so the tail copy at L_copy_bytes
// resumes at the first uncopied byte.
1300 __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1301 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1302 __ delayed()->sub(from, right_shift, from); // restore address
1303
1304 __ BIND(L_aligned_copy);
1305 }
1306
1307 // Copy big chunks backward with shift
1308 //
1309 // Inputs:
1310 // end_from - source arrays end address
1311 // end_to - destination array end address aligned to 8-bytes
1312 // count - elements count to copy >= the count equivalent to 16 bytes
1313 // count_dec - elements count's decrement equivalent to 16 bytes
1314 // L_aligned_copy - aligned copy exit label
1438 __ sub(count, G1, count);
1439 __ BIND(L_align);
1440 __ ldub(from, 0, O3);
1441 __ deccc(G1);
1442 __ inc(from);
1443 __ stb(O3, to, 0);
1444 __ br(Assembler::notZero, false, Assembler::pt, L_align);
1445 __ delayed()->inc(to);
1446 __ BIND(L_skip_alignment);
1447 }
1448 #ifdef _LP64
1449 if (!aligned)
1450 #endif
1451 {
1452 // Copy with shift 16 bytes per iteration if arrays do not have
1453 // the same alignment mod 8, otherwise fall through to the next
1454 // code for aligned copy.
1455 // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1456 // Also jump over aligned copy after the copy with shift completed.
1457
1458 copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
1459 }
1460
1461 // Both array are 8 bytes aligned, copy 16 bytes at a time
1462 __ and3(count, 7, G4); // Save count
1463 __ srl(count, 3, count);
1464 generate_disjoint_long_copy_core(aligned);
1465 __ mov(G4, count); // Restore count
1466
1467 // copy tailing bytes
1468 __ BIND(L_copy_byte);
1469 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1470 __ align(OptoLoopAlignment);
1471 __ BIND(L_copy_byte_loop);
1472 __ ldub(from, offset, O3);
1473 __ deccc(count);
1474 __ stb(O3, to, offset);
1475 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1476 __ delayed()->inc(offset);
1477
1478 __ BIND(L_exit);
1666 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1667 __ delayed()->lduh(from, 0, O3);
1668 __ dec(count, 2);
1669 __ lduh(from, 2, O4);
1670 __ inc(from, 4);
1671 __ inc(to, 4);
1672 __ sth(O3, to, -4);
1673 __ sth(O4, to, -2);
1674 __ BIND(L_skip_alignment2);
1675 }
1676 #ifdef _LP64
1677 if (!aligned)
1678 #endif
1679 {
1680 // Copy with shift 16 bytes per iteration if arrays do not have
1681 // the same alignment mod 8, otherwise fall through to the next
1682 // code for aligned copy.
1683 // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1684 // Also jump over aligned copy after the copy with shift completed.
1685
1686 copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1687 }
1688
1689 // Both array are 8 bytes aligned, copy 16 bytes at a time
1690 __ and3(count, 3, G4); // Save
1691 __ srl(count, 2, count);
1692 generate_disjoint_long_copy_core(aligned);
1693 __ mov(G4, count); // restore
1694
1695 // copy 1 element at a time
1696 __ BIND(L_copy_2_bytes);
1697 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1698 __ align(OptoLoopAlignment);
1699 __ BIND(L_copy_2_bytes_loop);
1700 __ lduh(from, offset, O3);
1701 __ deccc(count);
1702 __ sth(O3, to, offset);
1703 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1704 __ delayed()->inc(offset, 2);
1705
1706 __ BIND(L_exit);
2040 // copy 1 element (2 bytes) at a time
2041 __ BIND(L_copy_2_bytes);
2042 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2043 __ BIND(L_copy_2_bytes_loop);
2044 __ dec(end_from, 2);
2045 __ dec(end_to, 2);
2046 __ lduh(end_from, 0, O4);
2047 __ deccc(count);
2048 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
2049 __ delayed()->sth(O4, end_to, 0);
2050
2051 __ BIND(L_exit);
2052 // O3, O4 are used as temp registers
2053 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
2054 __ retl();
2055 __ delayed()->mov(G0, O0); // return 0
2056 return start;
2057 }
2058
2059 //
2060 // Helper methods for generate_disjoint_int_copy_core()
2061 //
// Inner-loop generator for generate_disjoint_int_copy_core(), invoked
// through disjoint_copy_core(). Specialization of the shift loop for
// int elements with 'from' 4 (mod 8) aligned: the shift counts are the
// constant 32 bits. On entry O3 must hold the previous 8-byte chunk
// already shifted left by 32 (the caller pre-shifts it).
// 'use_prefetch' emits src/dst prefetches; 'use_bis' selects
// block-init (stxa) stores instead of plain stx.
2062 void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
2063 Label& L_loop, bool use_prefetch, bool use_bis) {
2064
2065 __ align(OptoLoopAlignment);
2066 __ BIND(L_loop);
2067 if (use_prefetch) {
2068 if (ArraycopySrcPrefetchDistance > 0) {
2069 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
2070 }
2071 if (ArraycopyDstPrefetchDistance > 0) {
2072 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
2073 }
2074 }
// Loads at offsets 4 and 12 are 8-byte aligned because 'from' is
// 4 (mod 8) aligned on this path.
2075 __ ldx(from, 4, O4);
2076 __ ldx(from, 12, G4);
2077 __ inc(to, 16);
2078 __ inc(from, 16);
2079 __ deccc(count, 4); // Can we do next iteration after this one?
2080
// First store: pre-shifted O3 combined with the high half of O4;
// second store: shifted O4 combined with the high half of G4.
2081 __ srlx(O4, 32, G3);
2082 __ bset(G3, O3);
2083 __ sllx(O4, 32, O4);
2084 __ srlx(G4, 32, G3);
2085 __ bset(G3, O4);
2086 if (use_bis) {
2087 __ stxa(O3, to, -16);
2088 __ stxa(O4, to, -8);
2089 } else {
2090 __ stx(O3, to, -16);
2091 __ stx(O4, to, -8);
2092 }
2093 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
// Delay slot: pre-shift the carried chunk for the next iteration.
2094 __ delayed()->sllx(G4, 32, O3);
2095
2096 }
2097
2098 //
2099 // Generate core code for disjoint int copy (and oop copy on 32-bit).
2100 // If "aligned" is true, the "from" and "to" addresses are assumed
2101 // to be heapword aligned.
2102 //
2103 // Arguments:
2104 // from: O0
2105 // to: O1
2106 // count: O2 treated as signed
2107 //
2108 void generate_disjoint_int_copy_core(bool aligned) {
2109
2110 Label L_skip_alignment, L_aligned_copy;
2111 Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2112
2113 const Register from = O0; // source array address
2114 const Register to = O1; // destination array address
2115 const Register count = O2; // elements count
2116 const Register offset = O5; // offset from start of arrays
2117 // O3, O4, G3, G4 are used as temp registers
2118
2119 // 'aligned' == true when it is known statically during compilation
2120 // of this arraycopy call site that both 'from' and 'to' addresses
2121 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
2122 //
2123 // Aligned arrays have 4 bytes alignment in 32-bits VM
2124 // and 8 bytes - in 64-bits VM.
2125 //
2126 #ifdef _LP64
2127 if (!aligned)
2128 #endif
2129 {
2130 // The next check could be put under 'ifndef' since the code in
2131 // generate_disjoint_long_copy_core() has own checks and set 'offset'.
2142 __ inc(from, 4);
2143 __ inc(to, 4);
2144 __ dec(count);
2145 __ st(O3, to, -4);
2146 __ BIND(L_skip_alignment);
2147
2148 // if arrays have same alignment mod 8, do 4 elements copy
2149 __ andcc(from, 7, G0);
2150 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2151 __ delayed()->ld(from, 0, O3);
2152
2153 //
2154 // Load 2 aligned 8-bytes chunks and use one from previous iteration
2155 // to form 2 aligned 8-bytes chunks to store.
2156 //
2157 // copy_16_bytes_forward_with_shift() is not used here since this
2158 // code is more optimal.
2159
2160 // copy with shift 4 elements (16 bytes) at a time
2161 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4
2162 __ sllx(O3, 32, O3);
2163
2164 disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop);
2165
2166 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2167 __ delayed()->inc(count, 4); // restore 'count'
2168
2169 __ BIND(L_aligned_copy);
2170 } // !aligned
2171
2172 // copy 4 elements (16 bytes) at a time
2173 __ and3(count, 1, G4); // Save
2174 __ srl(count, 1, count);
2175 generate_disjoint_long_copy_core(aligned);
2176 __ mov(G4, count); // Restore
2177
2178 // copy 1 element at a time
2179 __ BIND(L_copy_4_bytes);
2180 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2181 __ BIND(L_copy_4_bytes_loop);
2182 __ ld(from, offset, O3);
2183 __ deccc(count);
2184 __ st(O3, to, offset);
2185 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2186 __ delayed()->inc(offset, 4);
2187 __ BIND(L_exit);
2188 }
2189
2190 //
2191 // Generate stub for disjoint int copy. If "aligned" is true, the
2338 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2339
2340 if (entry != NULL) {
2341 *entry = __ pc();
2342 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2343 BLOCK_COMMENT("Entry:");
2344 }
2345
2346 array_overlap_test(nooverlap_target, 2);
2347
2348 generate_conjoint_int_copy_core(aligned);
2349
2350 // O3, O4 are used as temp registers
2351 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2352 __ retl();
2353 __ delayed()->mov(G0, O0); // return 0
2354 return start;
2355 }
2356
2357 //
2358 // Helper methods for generate_disjoint_long_copy_core()
2359 //
// Inner-loop generator for generate_disjoint_long_copy_core(), invoked
// through disjoint_copy_core(). Copies 64 bytes (8 longs) per
// iteration, fully unrolled in 16-byte groups using O4/O5 as temps.
// 'use_prefetch' emits src/dst prefetches every 32 bytes; 'use_bis'
// selects block-init (stxa) stores instead of plain stx.
2360 void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2361 Label& L_loop, bool use_prefetch, bool use_bis) {
2362 __ align(OptoLoopAlignment);
2363 __ BIND(L_loop);
2364 for (int off = 0; off < 64; off += 16) {
// Prefetch only at off == 0 and off == 32 (twice per 64-byte iteration).
2365 if (use_prefetch && (off & 31) == 0) {
2366 if (ArraycopySrcPrefetchDistance > 0) {
2367 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
2368 }
2369 if (ArraycopyDstPrefetchDistance > 0) {
2370 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
2371 }
2372 }
2373 __ ldx(from, off+0, O4);
2374 __ ldx(from, off+8, O5);
2375 if (use_bis) {
2376 __ stxa(O4, to, off+0);
2377 __ stxa(O5, to, off+8);
2378 } else {
2379 __ stx(O4, to, off+0);
2380 __ stx(O5, to, off+8);
2381 }
2382 }
// 'count' is in elements (longs): 8 per 64-byte iteration.
2383 __ deccc(count, 8);
2384 __ inc(from, 64);
2385 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
// Delay slot: advance 'to' whether or not the loop continues.
2386 __ delayed()->inc(to, 64);
2387 }
2388
2389 //
2390 // Generate core code for disjoint long copy (and oop copy on 64-bit).
2391 // "aligned" is ignored, because we must make the stronger
2392 // assumption that both addresses are always 64-bit aligned.
2393 //
2394 // Arguments:
2395 // from: O0
2396 // to: O1
2397 // count: O2 treated as signed
2398 //
2399 // count -= 2;
2400 // if ( count >= 0 ) { // >= 2 elements
2401 // if ( count > 6) { // >= 8 elements
2402 // count -= 6; // original count - 8
2403 // do {
2404 // copy_8_elements;
2405 // count -= 8;
2406 // } while ( count >= 0 );
2407 // count += 6;
2408 // }
2409 // if ( count >= 0 ) { // >= 2 elements
2414 // }
2415 // count += 2;
2416 // if ( count != 0 ) { // 1 element left
2417 // copy_1_element;
2418 // }
2419 //
2420 void generate_disjoint_long_copy_core(bool aligned) {
2421 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2422 const Register from = O0; // source array address
2423 const Register to = O1; // destination array address
2424 const Register count = O2; // elements count
2425 const Register offset0 = O4; // element offset
2426 const Register offset8 = O5; // next element offset
2427
2428 __ deccc(count, 2);
2429 __ mov(G0, offset0); // offset from start of arrays (0)
2430 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2431 __ delayed()->add(offset0, 8, offset8);
2432
2433 // Copy by 64 bytes chunks
2434
2435 const Register from64 = O3; // source address
2436 const Register to64 = G3; // destination address
2437 __ subcc(count, 6, O3);
2438 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2439 __ delayed()->mov(to, to64);
2440 // Now we can use O4(offset0), O5(offset8) as temps
2441 __ mov(O3, count);
2442 // count >= 0 (original count - 8)
2443 __ mov(from, from64);
2444
2445 disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop);
2446
2447 // Restore O4(offset0), O5(offset8)
2448 __ sub(from64, from, offset0);
2449 __ inccc(count, 6); // restore count
2450 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2451 __ delayed()->add(offset0, 8, offset8);
2452
2453 // Copy by 16 bytes chunks
2454 __ align(OptoLoopAlignment);
2455 __ BIND(L_copy_16_bytes);
2456 __ ldx(from, offset0, O3);
2457 __ ldx(from, offset8, G3);
2458 __ deccc(count, 2);
2459 __ stx(O3, to, offset0);
2460 __ inc(offset0, 16);
2461 __ stx(G3, to, offset8);
2462 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2463 __ delayed()->inc(offset8, 16);
2464
2465 // Copy last 8 bytes
2466 __ BIND(L_copy_8_bytes);
2467 __ inccc(count, 2);
2468 __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2469 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
|