src/cpu/sparc/vm/stubGenerator_sparc.cpp

Old version:

1107           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
1108           __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
1109           __ srl_ptr(count, CardTableModRefBS::card_shift, count);
1110           __ sub(count, addr, count);
1111           AddressLiteral rs(ct->byte_map_base);
1112           __ set(rs, tmp);
1113         __ BIND(L_loop);
1114           __ stb(G0, tmp, addr);
1115           __ subcc(count, 1, count);
1116           __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1117           __ delayed()->add(addr, 1, addr);
1118         }
1119         break;
1120       case BarrierSet::ModRef:
1121         break;
1122       default:
1123         ShouldNotReachHere();
1124     }
1125   }
1126 
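For reference, here is the card-marking loop above expressed in C. This is a minimal sketch: it assumes HotSpot's 512-byte cards (card_shift == 9), a dirty-card value of 0, and that at this point the 'count' register already holds the last address of the written span (computed in code not shown), which is why the loop is inclusive.

    #include <stddef.h>
    #include <stdint.h>

    static const int card_shift = 9;     // log2 of the 512-byte card size
    static uint8_t* byte_map_base;       // set up by the collector elsewhere

    // Dirty every card covered by [start, last], matching the stb/subcc/brx loop.
    static void post_barrier_sketch(uintptr_t start, uintptr_t last) {
      uintptr_t first_card = start >> card_shift;   // srl_ptr(addr, card_shift, addr)
      uintptr_t last_card  = last  >> card_shift;   // srl_ptr(count, card_shift, count)
      for (uintptr_t c = first_card; c <= last_card; c++) {
        byte_map_base[c] = 0;            // stb(G0, tmp, addr); 0 == dirty
      }
    }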
1127 
1128   // Copy big chunks forward with shift
1129   //
1130   // Inputs:
1131   //   from      - source array address
1132   //   to        - destination array address, 8-byte aligned
1133   //   count     - element count to copy, at least the element count equivalent to 16 bytes
1134   //   count_dec - count decrement equivalent to 16 bytes
1135   //   L_copy_bytes - copy exit label
1136   //
1137   void copy_16_bytes_forward_with_shift(Register from, Register to,
1138                      Register count, int count_dec, Label& L_copy_bytes) {
1139     Label L_loop, L_aligned_copy, L_copy_last_bytes;


1140 
1141     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1142       __ andcc(from, 7, G1); // misaligned bytes
1143       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1144       __ delayed()->nop();
1145 
1146     const Register left_shift  = G1; // left  shift bit counter
1147     const Register right_shift = G5; // right shift bit counter
1148 
1149       __ sll(G1, LogBitsPerByte, left_shift);
1150       __ mov(64, right_shift);
1151       __ sub(right_shift, left_shift, right_shift);
1152 
1153     //
1154     // Load 2 aligned 8-bytes chunks and use one from previous iteration
1155     // to form 2 aligned 8-bytes chunks to store.
1156     //
1157       __ deccc(count, count_dec); // Pre-decrement 'count'
1158       __ andn(from, 7, from);     // Align address
1159       __ ldx(from, 0, O3);
1160       __ inc(from, 8);
1161       __ align(OptoLoopAlignment);
1162     __ BIND(L_loop);
1163       __ ldx(from, 0, O4);
1164       __ deccc(count, count_dec); // Can we do next iteration after this one?
1165       __ ldx(from, 8, G4);
1166       __ inc(to, 16);
1167       __ inc(from, 16);
1168       __ sllx(O3, left_shift,  O3);
1169       __ srlx(O4, right_shift, G3);
1170       __ bset(G3, O3);
1171       __ stx(O3, to, -16);
1172       __ sllx(O4, left_shift,  O4);
1173       __ srlx(G4, right_shift, G3);
1174       __ bset(G3, O4);
1175       __ stx(O4, to, -8);
1176       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1177       __ delayed()->mov(G4, O3);
1178 


1179       __ inccc(count, count_dec>>1 ); // + 8 bytes
1180       __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1181       __ delayed()->inc(count, count_dec>>1); // restore 'count'
1182 
1183       // copy 8 bytes, part of them already loaded in O3
1184       __ ldx(from, 0, O4);
1185       __ inc(to, 8);
1186       __ inc(from, 8);
1187       __ sllx(O3, left_shift,  O3);
1188       __ srlx(O4, right_shift, G3);
1189       __ bset(O3, G3);
1190       __ stx(G3, to, -8);
1191 
1192     __ BIND(L_copy_last_bytes);
1193       __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1194       __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1195       __ delayed()->sub(from, right_shift, from);       // restore address
1196 
1197     __ BIND(L_aligned_copy);
1198   }
1199 
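The same shift-merge technique in portable C, as a sketch only: the source is aligned down, 8-byte words are loaded with aligned accesses, and each output word is assembled from two neighbours. The shift directions match big-endian SPARC, and the tail handling is simplified relative to the stub's L_copy_last_bytes path.

    #include <stdint.h>
    #include <string.h>

    // Copy n bytes (dst 8-byte aligned, src misaligned mod 8) using only
    // aligned 64-bit loads.
    static void shift_merge_copy(uint8_t* dst, const uint8_t* src, size_t n) {
      unsigned mis = (uintptr_t)src & 7;              // misaligned bytes (G1)
      if (mis == 0) { memcpy(dst, src, n); return; }  // L_aligned_copy case
      unsigned lsh = mis * 8;                         // left  shift bit counter
      unsigned rsh = 64 - lsh;                        // right shift bit counter
      const uint64_t* p = (const uint64_t*)(src - mis);  // andn(from, 7, from)
      uint64_t prev = *p++;        // word from the "previous" iteration (O3)
      size_t done = 0;
      for (; done + 8 <= n; done += 8) {
        uint64_t cur = *p++;                          // next aligned word
        uint64_t w = (prev << lsh) | (cur >> rsh);    // sllx / srlx / bset
        memcpy(dst + done, &w, 8);
        prev = cur;
      }
      memcpy(dst + done, src + done, n - done);       // remaining bytes
      // Every aligned load above contains at least one live source byte,
      // so no read can stray into an unmapped page.
    }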
1200   // Copy big chunks backward with shift
1201   //
1202   // Inputs:
1203   //   end_from  - source array end address
1204   //   end_to    - destination array end address, 8-byte aligned
1205   //   count     - element count to copy, at least the element count equivalent to 16 bytes
1206   //   count_dec - count decrement equivalent to 16 bytes
1207   //   L_aligned_copy - aligned copy exit label


1331       __ sub(count, G1, count);
1332     __ BIND(L_align);
1333       __ ldub(from, 0, O3);
1334       __ deccc(G1);
1335       __ inc(from);
1336       __ stb(O3, to, 0);
1337       __ br(Assembler::notZero, false, Assembler::pt, L_align);
1338       __ delayed()->inc(to);
1339     __ BIND(L_skip_alignment);
1340     }
1341 #ifdef _LP64
1342     if (!aligned)
1343 #endif
1344     {
1345       // Copy with shift 16 bytes per iteration if arrays do not have
1346       // the same alignment mod 8, otherwise fall through to the next
1347       // code for aligned copy.
1348       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1349       // Also jump over aligned copy after the copy with shift completed.
1350 
1351       copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
1352     }
1353 
1354     // Both arrays are 8-byte aligned; copy 16 bytes at a time
1355       __ and3(count, 7, G4); // Save count
1356       __ srl(count, 3, count);
1357      generate_disjoint_long_copy_core(aligned);
1358       __ mov(G4, count);     // Restore count
1359 
1360     // copy trailing bytes
1361     __ BIND(L_copy_byte);
1362       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1363       __ align(OptoLoopAlignment);
1364     __ BIND(L_copy_byte_loop);
1365       __ ldub(from, offset, O3);
1366       __ deccc(count);
1367       __ stb(O3, to, offset);
1368       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1369       __ delayed()->inc(offset);
1370 
1371     __ BIND(L_exit);
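Taken together, the disjoint byte-copy frames above follow the control flow below, sketched in C with the bulk step collapsed into a plain 8-byte loop (the real stub dispatches between the shift-merge path and the aligned long-copy core).

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void disjoint_byte_copy_sketch(uint8_t* to, const uint8_t* from, size_t count) {
      // 1. L_align: byte-copy until 'to' is 8-byte aligned.
      while (((uintptr_t)to & 7) != 0 && count > 0) {
        *to++ = *from++; count--;
      }
      // 2. Bulk copy 8 bytes at a time; save the tail count first,
      //    as and3(count, 7, G4) / srl(count, 3, count) do above.
      size_t tail  = count & 7;
      size_t longs = count >> 3;
      for (size_t i = 0; i < longs; i++) {
        uint64_t v;
        memcpy(&v, from + 8 * i, 8);
        memcpy(to + 8 * i, &v, 8);
      }
      to += longs * 8; from += longs * 8;
      // 3. L_copy_byte_loop: trailing bytes.
      while (tail-- > 0) { *to++ = *from++; }
    }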


1559       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1560       __ delayed()->lduh(from, 0, O3);
1561       __ dec(count, 2);
1562       __ lduh(from, 2, O4);
1563       __ inc(from, 4);
1564       __ inc(to, 4);
1565       __ sth(O3, to, -4);
1566       __ sth(O4, to, -2);
1567     __ BIND(L_skip_alignment2);
1568     }
1569 #ifdef _LP64
1570     if (!aligned)
1571 #endif
1572     {
1573       // Copy with shift 16 bytes per iteration if arrays do not have
1574       // the same alignment mod 8, otherwise fall through to the next
1575       // code for aligned copy.
1576       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1577       // Also jump over aligned copy after the copy with shift completed.
1578 
1579       copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
1580     }
1581 
1582     // Both arrays are 8-byte aligned; copy 16 bytes at a time
1583       __ and3(count, 3, G4); // Save
1584       __ srl(count, 2, count);
1585      generate_disjoint_long_copy_core(aligned);
1586       __ mov(G4, count); // restore
1587 
1588     // copy 1 element at a time
1589     __ BIND(L_copy_2_bytes);
1590       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1591       __ align(OptoLoopAlignment);
1592     __ BIND(L_copy_2_bytes_loop);
1593       __ lduh(from, offset, O3);
1594       __ deccc(count);
1595       __ sth(O3, to, offset);
1596       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1597       __ delayed()->inc(offset, 2);
1598 
1599     __ BIND(L_exit);


1933     // copy 1 element (2 bytes) at a time
1934     __ BIND(L_copy_2_bytes);
1935       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1936     __ BIND(L_copy_2_bytes_loop);
1937       __ dec(end_from, 2);
1938       __ dec(end_to, 2);
1939       __ lduh(end_from, 0, O4);
1940       __ deccc(count);
1941       __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1942       __ delayed()->sth(O4, end_to, 0);
1943 
1944     __ BIND(L_exit);
1945     // O3, O4 are used as temp registers
1946     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1947     __ retl();
1948     __ delayed()->mov(G0, O0); // return 0
1949     return start;
1950   }
1951 
1952   //
1953   //  Generate core code for disjoint int copy (and oop copy on 32-bit).
1954   //  If "aligned" is true, the "from" and "to" addresses are assumed
1955   //  to be heapword aligned.
1956   //
1957   // Arguments:
1958   //      from:  O0
1959   //      to:    O1
1960   //      count: O2 treated as signed
1961   //
1962   void generate_disjoint_int_copy_core(bool aligned) {
1963 
1964     Label L_skip_alignment, L_aligned_copy;
1965     Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1966 
1967     const Register from      = O0;   // source array address
1968     const Register to        = O1;   // destination array address
1969     const Register count     = O2;   // elements count
1970     const Register offset    = O5;   // offset from start of arrays
1971     // O3, O4, G3, G4 are used as temp registers
1972 
1973     // 'aligned' == true when it is known statically during compilation
1974     // of this arraycopy call site that both 'from' and 'to' addresses
1975     // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1976     //
1977     // Aligned arrays have 4-byte alignment in the 32-bit VM
1978     // and 8-byte alignment in the 64-bit VM.
1979     //
1980 #ifdef _LP64
1981     if (!aligned)
1982 #endif
1983     {
1984       // The next check could be put under 'ifndef' since the code in
1985       // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.


1996       __ inc(from, 4);
1997       __ inc(to, 4);
1998       __ dec(count);
1999       __ st(O3, to, -4);
2000     __ BIND(L_skip_alignment);
2001 
2002     // if arrays have same alignment mod 8, do 4 elements copy
2003       __ andcc(from, 7, G0);
2004       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2005       __ delayed()->ld(from, 0, O3);
2006 
2007     //
2008     // Load 2 aligned 8-bytes chunks and use one from previous iteration
2009     // to form 2 aligned 8-bytes chunks to store.
2010     //
2011     // copy_16_bytes_forward_with_shift() is not used here since this
2012     // code is more efficient.
2013 
2014     // copy with shift 4 elements (16 bytes) at a time
2015       __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
2016 
2017       __ align(OptoLoopAlignment);
2018     __ BIND(L_copy_16_bytes);
2019       __ ldx(from, 4, O4);
2020       __ deccc(count, 4); // Can we do next iteration after this one?
2021       __ ldx(from, 12, G4);
2022       __ inc(to, 16);
2023       __ inc(from, 16);
2024       __ sllx(O3, 32, O3);
2025       __ srlx(O4, 32, G3);
2026       __ bset(G3, O3);
2027       __ stx(O3, to, -16);
2028       __ sllx(O4, 32, O4);
2029       __ srlx(G4, 32, G3);
2030       __ bset(G3, O4);
2031       __ stx(O4, to, -8);
2032       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2033       __ delayed()->mov(G4, O3);
2034 


2035       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2036       __ delayed()->inc(count, 4); // restore 'count'
2037 
2038     __ BIND(L_aligned_copy);
2039     }

2040     // copy 4 elements (16 bytes) at a time
2041       __ and3(count, 1, G4); // Save
2042       __ srl(count, 1, count);
2043      generate_disjoint_long_copy_core(aligned);
2044       __ mov(G4, count);     // Restore
2045 
2046     // copy 1 element at a time
2047     __ BIND(L_copy_4_bytes);
2048       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2049     __ BIND(L_copy_4_bytes_loop);
2050       __ ld(from, offset, O3);
2051       __ deccc(count);
2052       __ st(O3, to, offset);
2053       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2054       __ delayed()->inc(offset, 4);
2055     __ BIND(L_exit);
2056   }
2057 
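The 16-byte loop above is the shift-merge scheme with the shift fixed at 32 bits: once 'from' is 4-byte but not 8-byte aligned, from + 4 is 8-byte aligned, each ldx there straddles two ints, and the sllx/srlx/bset sequence repacks them. One iteration in C, assuming big-endian layout as on SPARC:

    #include <stdint.h>
    #include <string.h>

    // 'o3' plays the role of register O3: the pair left over from the
    // previous iteration. from + 4 and from + 12 are 8-byte aligned here.
    static void merge_16_bytes_step(uint64_t* o3, const uint8_t* from, uint8_t* to) {
      uint64_t o4, g4;
      memcpy(&o4, from + 4, 8);                  // ldx(from,  4, O4)
      memcpy(&g4, from + 12, 8);                 // ldx(from, 12, G4)
      uint64_t w0 = (*o3 << 32) | (o4 >> 32);    // sllx / srlx / bset
      uint64_t w1 = ( o4 << 32) | (g4 >> 32);
      memcpy(to,     &w0, 8);                    // stx(O3, to, -16)
      memcpy(to + 8, &w1, 8);                    // stx(O4, to,  -8)
      *o3 = g4;                                  // delayed()->mov(G4, O3)
    }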
2058   //
2059   //  Generate stub for disjoint int copy.  If "aligned" is true, the


2206     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2207 
2208     if (entry != NULL) {
2209       *entry = __ pc();
2210       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2211       BLOCK_COMMENT("Entry:");
2212     }
2213 
2214     array_overlap_test(nooverlap_target, 2);
2215 
2216     generate_conjoint_int_copy_core(aligned);
2217 
2218     // O3, O4 are used as temp registers
2219     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2220     __ retl();
2221     __ delayed()->mov(G0, O0); // return 0
2222     return start;
2223   }
2224 
2225   //
2226   //  Generate core code for disjoint long copy (and oop copy on 64-bit).
2227   //  "aligned" is ignored, because we must make the stronger
2228   //  assumption that both addresses are always 64-bit aligned.
2229   //
2230   // Arguments:
2231   //      from:  O0
2232   //      to:    O1
2233   //      count: O2 treated as signed
2234   //
2235   // count -= 2;
2236   // if ( count >= 0 ) { // >= 2 elements
2237   //   if ( count > 6) { // >= 8 elements
2238   //     count -= 6; // original count - 8
2239   //     do {
2240   //       copy_8_elements;
2241   //       count -= 8;
2242   //     } while ( count >= 0 );
2243   //     count += 6;
2244   //   }
2245   //   if ( count >= 0 ) { // >= 2 elements


2250   // }
2251   // count += 2;
2252   // if ( count != 0 ) { // 1 element left
2253   //   copy_1_element;
2254   // }
2255   //
2256   void generate_disjoint_long_copy_core(bool aligned) {
2257     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2258     const Register from    = O0;  // source array address
2259     const Register to      = O1;  // destination array address
2260     const Register count   = O2;  // elements count
2261     const Register offset0 = O4;  // element offset
2262     const Register offset8 = O5;  // next element offset
2263 
2264       __ deccc(count, 2);
2265       __ mov(G0, offset0);   // offset from start of arrays (0)
2266       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2267       __ delayed()->add(offset0, 8, offset8);
2268 
2269     // Copy by 64 bytes chunks
2270     Label L_copy_64_bytes;
2271     const Register from64 = O3;  // source address
2272     const Register to64   = G3;  // destination address
2273       __ subcc(count, 6, O3);
2274       __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2275       __ delayed()->mov(to,   to64);
2276       // Now we can use O4(offset0), O5(offset8) as temps
2277       __ mov(O3, count);

2278       __ mov(from, from64);
2279 
2280       __ align(OptoLoopAlignment);
2281     __ BIND(L_copy_64_bytes);
2282       for( int off = 0; off < 64; off += 16 ) {
2283         __ ldx(from64,  off+0, O4);
2284         __ ldx(from64,  off+8, O5);
2285         __ stx(O4, to64,  off+0);
2286         __ stx(O5, to64,  off+8);
2287       }
2288       __ deccc(count, 8);
2289       __ inc(from64, 64);
2290       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
2291       __ delayed()->inc(to64, 64);
2292 
2293       // Restore O4(offset0), O5(offset8)
2294       __ sub(from64, from, offset0);
2295       __ inccc(count, 6);
2296       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2297       __ delayed()->add(offset0, 8, offset8);
2298 
2299       // Copy by 16 bytes chunks
2300       __ align(OptoLoopAlignment);
2301     __ BIND(L_copy_16_bytes);
2302       __ ldx(from, offset0, O3);
2303       __ ldx(from, offset8, G3);
2304       __ deccc(count, 2);
2305       __ stx(O3, to, offset0);
2306       __ inc(offset0, 16);
2307       __ stx(G3, to, offset8);
2308       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2309       __ delayed()->inc(offset8, 16);
2310 
2311       // Copy last 8 bytes
2312     __ BIND(L_copy_8_bytes);
2313       __ inccc(count, 2);
2314       __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2315       __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs

New version:

1107           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
1108           __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
1109           __ srl_ptr(count, CardTableModRefBS::card_shift, count);
1110           __ sub(count, addr, count);
1111           AddressLiteral rs(ct->byte_map_base);
1112           __ set(rs, tmp);
1113         __ BIND(L_loop);
1114           __ stb(G0, tmp, addr);
1115           __ subcc(count, 1, count);
1116           __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1117           __ delayed()->add(addr, 1, addr);
1118         }
1119         break;
1120       case BarrierSet::ModRef:
1121         break;
1122       default:
1123         ShouldNotReachHere();
1124     }
1125   }
1126 
1127   //
1128   // Generate main code for disjoint arraycopy
1129   //
1130   typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
1131                                               Label& L_loop, bool use_prefetch, bool use_bis);
1132 
1133   void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
1134                           int iter_size, CopyLoopFunc copy_loop_func) {
1135     Label L_copy;
1136 
1137     assert(log2_elem_size <= 3, "the following code should be changed");
1138     int count_dec = 16>>log2_elem_size;
1139 
1140     int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
1141     assert(prefetch_dist < 4096, "invalid value");
1142     prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
1143     int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
1144 
1145     if (UseBlockCopy) {
1146       Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
1147 
1148       // 64 bytes tail + bytes copied in one loop iteration
1149       int tail_size = 64 + iter_size;
1150       int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
1151       // Use BIS copy only for big arrays since it requires membar.
1152       __ set(block_copy_count, O4);
1153       __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
1154       // This code is for disjoint source and destination:
1155       //   to <= from || to >= from+count
1156       // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
1157       __ sub(from, to, O4);
1158       __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for imm.
1159       __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
1160 
1161       __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
1162       // BIS should not be used to copy the tail (64 bytes + iter_size)
1163       // to avoid zeroing values that follow.
1164       __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
1165 
1166       if (prefetch_count > 0) { // rounded up to one iteration count
1167         // Do prefetching only if copy size is bigger
1168         // than prefetch distance.
1169         __ set(prefetch_count, O4);
1170         __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
1171         __ sub(count, prefetch_count, count);
1172 
1173         (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
1174         __ add(count, prefetch_count, count); // restore count
1175 
1176       } // prefetch_count > 0
1177 
1178       (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
1179       __ add(count, (tail_size>>log2_elem_size), count); // restore count
1180 
1181       __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
1182       // BIS needs membar.
1183       __ membar(Assembler::StoreLoad);
1184       // Copy tail
1185       __ ba_short(L_copy);
1186 
1187       __ BIND(L_skip_block_copy);
1188     } // UseBlockCopy
1189 
1190     if (prefetch_count > 0) { // rounded up to one iteration count
1191       // Do prefetching only if copy size is bigger
1192       // than prefetch distance.
1193       __ set(prefetch_count, O4);
1194       __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
1195       __ sub(count, prefetch_count, count);
1196  
1197       Label L_copy_prefetch;
1198       (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
1199       __ add(count, prefetch_count, count); // restore count
1200 
1201     } // prefetch_count > 0
1202 
1203     (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
1204   }
1205 
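The driver above picks one of three bulk strategies; note that prefetch_dist is rounded up to a whole iteration (for example, a 100-byte distance with a 64-byte iteration becomes 128). The BIS guard is the subtle part: block-init stores may clear a whole 64-byte line before filling it, so they must not run when 'from' sits within tail_size bytes above 'to'. A C sketch of the dispatch, with the copy loops elided:

    #include <stddef.h>
    #include <stdint.h>

    static void disjoint_copy_dispatch(const uint8_t* from, uint8_t* to,
                                       size_t count_bytes, size_t iter_size,
                                       size_t prefetch_dist, size_t block_low) {
      size_t tail_size = 64 + iter_size;       // bytes BIS must leave alone
      // Unsigned compare, as in the stub (which compares in 16-byte units):
      // if to > from this wraps to a huge value and BIS stays enabled,
      // because the destination then lies entirely above the source.
      bool bis_safe = ((uintptr_t)from - (uintptr_t)to) > tail_size;
      if (count_bytes >= block_low && bis_safe) {
        // wrasi(ASI_ST_BLKINIT_PRIMARY); copy everything but tail_size with
        // block-init stores (prefetching variant if the copy is big enough);
        // then wrasi(ASI_PRIMARY_NOFAULT) and membar(StoreLoad).
      } else if (count_bytes >= prefetch_dist) {
        // prefetching copy loop for the bulk, plain loop for the rest.
      } else {
        // plain copy loop.
      }
      // The tail is always copied without BIS.
    }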
1206 
1207 
1208   // 
1209   // Helper methods for copy_16_bytes_forward_with_shift()
1210   // 
1211   void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
1212                                 Label& L_loop, bool use_prefetch, bool use_bis) {
1213 
1214     const Register left_shift  = G1; // left  shift bit counter
1215     const Register right_shift = G5; // right shift bit counter
1216 
1217     __ align(OptoLoopAlignment);
1218     __ BIND(L_loop);
1219     if (use_prefetch) {
1220       if (ArraycopySrcPrefetchDistance > 0) {
1221         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1222       }
1223       if (ArraycopyDstPrefetchDistance > 0) {
1224         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1225       }
1226     }
1227     __ ldx(from, 0, O4);
1228     __ ldx(from, 8, G4);
1229     __ inc(to, 16);
1230     __ inc(from, 16);
1231     __ deccc(count, count_dec); // Can we do next iteration after this one?
1232     __ srlx(O4, right_shift, G3);
1233     __ bset(G3, O3);
1234     __ sllx(O4, left_shift,  O4);
1235     __ srlx(G4, right_shift, G3);
1236     __ bset(G3, O4);
1237     if (use_bis) {
1238       __ stxa(O3, to, -16);
1239       __ stxa(O4, to, -8);
1240     } else {
1241       __ stx(O3, to, -16);
1242       __ stx(O4, to, -8);
1243     }
1244     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1245     __ delayed()->sllx(G4, left_shift,  O3);
1246   }
1247 
1248   // Copy big chunks forward with shift
1249   //
1250   // Inputs:
1251   //   from      - source array address
1252   //   to        - destination array address, 8-byte aligned
1253   //   count     - element count to copy, at least the element count equivalent to 16 bytes
1254   //   log2_elem_size - log2 of the element size
1255   //   L_copy_bytes - copy exit label
1256   //
1257   void copy_16_bytes_forward_with_shift(Register from, Register to,
1258                      Register count, int log2_elem_size, Label& L_copy_bytes) {
1259     Label L_aligned_copy, L_copy_last_bytes;
1260     assert(log2_elem_size <= 3, "the following code should be changed");
1261     int count_dec = 16>>log2_elem_size;
1262 
1263     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1264     __ andcc(from, 7, G1); // misaligned bytes
1265     __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1266     __ delayed()->nop();
1267 
1268     const Register left_shift  = G1; // left  shift bit counter
1269     const Register right_shift = G5; // right shift bit counter
1270 
1271     __ sll(G1, LogBitsPerByte, left_shift);
1272     __ mov(64, right_shift);
1273     __ sub(right_shift, left_shift, right_shift);
1274 
1275     //
1276     // Load 2 aligned 8-bytes chunks and use one from previous iteration
1277     // to form 2 aligned 8-bytes chunks to store.
1278     //
1279     __ dec(count, count_dec);   // Pre-decrement 'count'
1280     __ andn(from, 7, from);     // Align address
1281     __ ldx(from, 0, O3);
1282     __ inc(from, 8);
1283     __ sllx(O3, left_shift,  O3);
1284 
1285     disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop);
1286 
1287     __ inccc(count, count_dec>>1 ); // + 8 bytes
1288     __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1289     __ delayed()->inc(count, count_dec>>1); // restore 'count'
1290 
1291     // copy 8 bytes, part of them already loaded in O3
1292     __ ldx(from, 0, O4);
1293     __ inc(to, 8);
1294     __ inc(from, 8);

1295     __ srlx(O4, right_shift, G3);
1296     __ bset(O3, G3);
1297     __ stx(G3, to, -8);
1298 
1299     __ BIND(L_copy_last_bytes);
1300     __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1301     __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1302     __ delayed()->sub(from, right_shift, from);       // restore address
1303 
1304     __ BIND(L_aligned_copy);
1305   }
1306 
1307   // Copy big chunks backward with shift
1308   //
1309   // Inputs:
1310   //   end_from  - source array end address
1311   //   end_to    - destination array end address, 8-byte aligned
1312   //   count     - element count to copy, at least the element count equivalent to 16 bytes
1313   //   count_dec - count decrement equivalent to 16 bytes
1314   //   L_aligned_copy - aligned copy exit label


1438       __ sub(count, G1, count);
1439     __ BIND(L_align);
1440       __ ldub(from, 0, O3);
1441       __ deccc(G1);
1442       __ inc(from);
1443       __ stb(O3, to, 0);
1444       __ br(Assembler::notZero, false, Assembler::pt, L_align);
1445       __ delayed()->inc(to);
1446     __ BIND(L_skip_alignment);
1447     }
1448 #ifdef _LP64
1449     if (!aligned)
1450 #endif
1451     {
1452       // Copy with shift 16 bytes per iteration if arrays do not have
1453       // the same alignment mod 8, otherwise fall through to the next
1454       // code for aligned copy.
1455       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1456       // Also jump over aligned copy after the copy with shift completed.
1457 
1458       copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
1459     }
1460 
1461     // Both arrays are 8-byte aligned; copy 16 bytes at a time
1462       __ and3(count, 7, G4); // Save count
1463       __ srl(count, 3, count);
1464      generate_disjoint_long_copy_core(aligned);
1465       __ mov(G4, count);     // Restore count
1466 
1467     // copy trailing bytes
1468     __ BIND(L_copy_byte);
1469       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1470       __ align(OptoLoopAlignment);
1471     __ BIND(L_copy_byte_loop);
1472       __ ldub(from, offset, O3);
1473       __ deccc(count);
1474       __ stb(O3, to, offset);
1475       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1476       __ delayed()->inc(offset);
1477 
1478     __ BIND(L_exit);


1666       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1667       __ delayed()->lduh(from, 0, O3);
1668       __ dec(count, 2);
1669       __ lduh(from, 2, O4);
1670       __ inc(from, 4);
1671       __ inc(to, 4);
1672       __ sth(O3, to, -4);
1673       __ sth(O4, to, -2);
1674     __ BIND(L_skip_alignment2);
1675     }
1676 #ifdef _LP64
1677     if (!aligned)
1678 #endif
1679     {
1680       // Copy with shift 16 bytes per iteration if arrays do not have
1681       // the same alignment mod 8, otherwise fall through to the next
1682       // code for aligned copy.
1683       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1684       // Also jump over aligned copy after the copy with shift completed.
1685 
1686       copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1687     }
1688 
1689     // Both arrays are 8-byte aligned; copy 16 bytes at a time
1690       __ and3(count, 3, G4); // Save
1691       __ srl(count, 2, count);
1692      generate_disjoint_long_copy_core(aligned);
1693       __ mov(G4, count); // restore
1694 
1695     // copy 1 element at a time
1696     __ BIND(L_copy_2_bytes);
1697       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1698       __ align(OptoLoopAlignment);
1699     __ BIND(L_copy_2_bytes_loop);
1700       __ lduh(from, offset, O3);
1701       __ deccc(count);
1702       __ sth(O3, to, offset);
1703       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1704       __ delayed()->inc(offset, 2);
1705 
1706     __ BIND(L_exit);


2040     // copy 1 element (2 bytes) at a time
2041     __ BIND(L_copy_2_bytes);
2042       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2043     __ BIND(L_copy_2_bytes_loop);
2044       __ dec(end_from, 2);
2045       __ dec(end_to, 2);
2046       __ lduh(end_from, 0, O4);
2047       __ deccc(count);
2048       __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
2049       __ delayed()->sth(O4, end_to, 0);
2050 
2051     __ BIND(L_exit);
2052     // O3, O4 are used as temp registers
2053     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
2054     __ retl();
2055     __ delayed()->mov(G0, O0); // return 0
2056     return start;
2057   }
2058 
2059   // 
2060   // Helper methods for generate_disjoint_int_copy_core()
2061   // 
2062   void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
2063                           Label& L_loop, bool use_prefetch, bool use_bis) {
2064 
2065     __ align(OptoLoopAlignment);
2066     __ BIND(L_loop);
2067     if (use_prefetch) {
2068       if (ArraycopySrcPrefetchDistance > 0) {
2069         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
2070       }
2071       if (ArraycopyDstPrefetchDistance > 0) {
2072         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
2073       }
2074     }
2075     __ ldx(from, 4, O4);
2076     __ ldx(from, 12, G4);
2077     __ inc(to, 16);
2078     __ inc(from, 16);
2079     __ deccc(count, 4); // Can we do next iteration after this one?
2080 
2081     __ srlx(O4, 32, G3);
2082     __ bset(G3, O3);
2083     __ sllx(O4, 32, O4);
2084     __ srlx(G4, 32, G3);
2085     __ bset(G3, O4);
2086     if (use_bis) {
2087       __ stxa(O3, to, -16);
2088       __ stxa(O4, to, -8);
2089     } else {
2090       __ stx(O3, to, -16);
2091       __ stx(O4, to, -8);
2092     }
2093     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2094     __ delayed()->sllx(G4, 32,  O3);
2095 
2096   }
2097 
2098   //
2099   //  Generate core code for disjoint int copy (and oop copy on 32-bit).
2100   //  If "aligned" is true, the "from" and "to" addresses are assumed
2101   //  to be heapword aligned.
2102   //
2103   // Arguments:
2104   //      from:  O0
2105   //      to:    O1
2106   //      count: O2 treated as signed
2107   //
2108   void generate_disjoint_int_copy_core(bool aligned) {
2109 
2110     Label L_skip_alignment, L_aligned_copy;
2111     Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2112 
2113     const Register from      = O0;   // source array address
2114     const Register to        = O1;   // destination array address
2115     const Register count     = O2;   // elements count
2116     const Register offset    = O5;   // offset from start of arrays
2117     // O3, O4, G3, G4 are used as temp registers
2118 
2119     // 'aligned' == true when it is known statically during compilation
2120     // of this arraycopy call site that both 'from' and 'to' addresses
2121     // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
2122     //
2123     // Aligned arrays have 4-byte alignment in the 32-bit VM
2124     // and 8-byte alignment in the 64-bit VM.
2125     //
2126 #ifdef _LP64
2127     if (!aligned)
2128 #endif
2129     {
2130       // The next check could be put under 'ifndef' since the code in
2131       // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.


2142       __ inc(from, 4);
2143       __ inc(to, 4);
2144       __ dec(count);
2145       __ st(O3, to, -4);
2146     __ BIND(L_skip_alignment);
2147 
2148     // if arrays have same alignment mod 8, do 4 elements copy
2149       __ andcc(from, 7, G0);
2150       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2151       __ delayed()->ld(from, 0, O3);
2152 
2153     //
2154     // Load 2 aligned 8-bytes chunks and use one from previous iteration
2155     // to form 2 aligned 8-bytes chunks to store.
2156     //
2157     // copy_16_bytes_forward_with_shift() is not used here since this
2158     // code is more efficient.
2159 
2160     // copy with shift 4 elements (16 bytes) at a time
2161       __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
2162       __ sllx(O3, 32,  O3);
2163 
2164       disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop);
2165 
2166       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2167       __ delayed()->inc(count, 4); // restore 'count'
2168 
2169     __ BIND(L_aligned_copy);
2170     } // !aligned
2171 
2172     // copy 4 elements (16 bytes) at a time
2173       __ and3(count, 1, G4); // Save
2174       __ srl(count, 1, count);
2175      generate_disjoint_long_copy_core(aligned);
2176       __ mov(G4, count);     // Restore
2177 
2178     // copy 1 element at a time
2179     __ BIND(L_copy_4_bytes);
2180       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2181     __ BIND(L_copy_4_bytes_loop);
2182       __ ld(from, offset, O3);
2183       __ deccc(count);
2184       __ st(O3, to, offset);
2185       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2186       __ delayed()->inc(offset, 4);
2187     __ BIND(L_exit);
2188   }
2189 
2190   //
2191   //  Generate stub for disjoint int copy.  If "aligned" is true, the


2338     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2339 
2340     if (entry != NULL) {
2341       *entry = __ pc();
2342       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2343       BLOCK_COMMENT("Entry:");
2344     }
2345 
2346     array_overlap_test(nooverlap_target, 2);
2347 
2348     generate_conjoint_int_copy_core(aligned);
2349 
2350     // O3, O4 are used as temp registers
2351     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2352     __ retl();
2353     __ delayed()->mov(G0, O0); // return 0
2354     return start;
2355   }
2356 
2357   // 
2358   // Helper methods for generate_disjoint_long_copy_core()
2359   // 
2360   void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2361                           Label& L_loop, bool use_prefetch, bool use_bis) {
2362     __ align(OptoLoopAlignment);
2363     __ BIND(L_loop);
2364     for (int off = 0; off < 64; off += 16) {
2365       if (use_prefetch && (off & 31) == 0) {
2366         if (ArraycopySrcPrefetchDistance > 0) {
2367           __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
2368         }
2369         if (ArraycopyDstPrefetchDistance > 0) {
2370           __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
2371         }
2372       } 
2373       __ ldx(from,  off+0, O4);
2374       __ ldx(from,  off+8, O5);
2375       if (use_bis) {
2376         __ stxa(O4, to,  off+0);
2377         __ stxa(O5, to,  off+8);
2378       } else {
2379         __ stx(O4, to,  off+0);
2380         __ stx(O5, to,  off+8);
2381       }
2382     }
2383     __ deccc(count, 8);
2384     __ inc(from, 64);
2385     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2386     __ delayed()->inc(to, 64);
2387   }
2388 
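In C, the helper above has the shape below. The prefetch is shown with GCC's __builtin_prefetch as a stand-in for the SPARC prefetch instruction (the stub issues one every 32 bytes of the unrolled body); use_bis has no C equivalent, since it selects stxa with ASI_ST_BLKINIT_PRIMARY, which initializes each cache line without reading it first and is the reason the caller must issue membar(StoreLoad) afterwards.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    // count is in 8-byte elements and is >= 8 on entry, matching the
    // deccc/brx(greaterEqual) structure of the emitted loop.
    static void copy_64_bytes_sketch(const uint8_t* from, uint8_t* to,
                                     ptrdiff_t count, bool use_prefetch,
                                     long src_dist, long dst_dist) {
      do {
        if (use_prefetch) {
          if (src_dist > 0) __builtin_prefetch(from + src_dist, /*write=*/0);
          if (dst_dist > 0) __builtin_prefetch(to   + dst_dist, /*write=*/1);
        }
        memcpy(to, from, 64);   // four ldx/ldx/stx/stx groups in the stub
        from += 64;
        to   += 64;
        count -= 8;             // deccc(count, 8)
      } while (count >= 0);     // brx(greaterEqual, false, pt, L_loop)
    }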
2389   //
2390   //  Generate core code for disjoint long copy (and oop copy on 64-bit).
2391   //  "aligned" is ignored, because we must make the stronger
2392   //  assumption that both addresses are always 64-bit aligned.
2393   //
2394   // Arguments:
2395   //      from:  O0
2396   //      to:    O1
2397   //      count: O2 treated as signed
2398   //
2399   // count -= 2;
2400   // if ( count >= 0 ) { // >= 2 elements
2401   //   if ( count > 6) { // >= 8 elements
2402   //     count -= 6; // original count - 8
2403   //     do {
2404   //       copy_8_elements;
2405   //       count -= 8;
2406   //     } while ( count >= 0 );
2407   //     count += 6;
2408   //   }
2409   //   if ( count >= 0 ) { // >= 2 elements


2414   // }
2415   // count += 2;
2416   // if ( count != 0 ) { // 1 element left
2417   //   copy_1_element;
2418   // }
2419   //
2420   void generate_disjoint_long_copy_core(bool aligned) {
2421     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2422     const Register from    = O0;  // source array address
2423     const Register to      = O1;  // destination array address
2424     const Register count   = O2;  // elements count
2425     const Register offset0 = O4;  // element offset
2426     const Register offset8 = O5;  // next element offset
2427 
2428     __ deccc(count, 2);
2429     __ mov(G0, offset0);   // offset from start of arrays (0)
2430     __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2431     __ delayed()->add(offset0, 8, offset8);
2432 
2433     // Copy by 64 bytes chunks
2434 
2435     const Register from64 = O3;  // source address
2436     const Register to64   = G3;  // destination address
2437     __ subcc(count, 6, O3);
2438     __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2439     __ delayed()->mov(to,   to64);
2440     // Now we can use O4(offset0), O5(offset8) as temps
2441     __ mov(O3, count);
2442     // count >= 0 (original count - 8)
2443     __ mov(from, from64);
2444 
2445     disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop);
2446 
2447       // Restore O4(offset0), O5(offset8)
2448       __ sub(from64, from, offset0);
2449       __ inccc(count, 6); // restore count
2450       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2451       __ delayed()->add(offset0, 8, offset8);
2452 
2453       // Copy by 16 bytes chunks
2454       __ align(OptoLoopAlignment);
2455     __ BIND(L_copy_16_bytes);
2456       __ ldx(from, offset0, O3);
2457       __ ldx(from, offset8, G3);
2458       __ deccc(count, 2);
2459       __ stx(O3, to, offset0);
2460       __ inc(offset0, 16);
2461       __ stx(G3, to, offset8);
2462       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2463       __ delayed()->inc(offset8, 16);
2464 
2465       // Copy last 8 bytes
2466     __ BIND(L_copy_8_bytes);
2467       __ inccc(count, 2);
2468       __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2469       __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs

