src/cpu/x86/vm/stubGenerator_x86_64.cpp
Index Unified diffs Context diffs Sdiffs Wdiffs Patch New Old Previous File Next File 8005544 Sdiff src/cpu/x86/vm

src/cpu/x86/vm/stubGenerator_x86_64.cpp

Print this page




1269         __ BIND(L_loop);
1270           __ movb(Address(start, count, Address::times_1), 0);
1271           __ decrement(count);
1272           __ jcc(Assembler::greaterEqual, L_loop);
1273         }
1274         break;
1275       default:
1276         ShouldNotReachHere();
1277 
1278     }
1279   }
1280 
1281 
1282   // Copy big chunks forward
1283   //
1284   // Inputs:
1285   //   end_from     - source arrays end address
1286   //   end_to       - destination array end address
1287   //   qword_count  - 64-bits element count, negative
1288   //   to           - scratch
1289   //   L_copy_32_bytes - entry label
1290   //   L_copy_8_bytes  - exit  label
1291   //
1292   void copy_32_bytes_forward(Register end_from, Register end_to,
1293                              Register qword_count, Register to,
1294                              Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
1295     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1296     Label L_loop;
1297     __ align(OptoLoopAlignment);
    // Main 32-bytes-per-iteration copy loop.  Callers must jump to the
    // L_copy_32_bytes label below; qword_count is negative (set up by the
    // caller) and is walked up toward zero, 4 qwords per iteration.
1298   __ BIND(L_loop);
1299     if(UseUnalignedLoadStores) {
      // SSE2 path: two unaligned 16-byte loads/stores per 32-byte chunk.
1300       __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1301       __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1302       __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1303       __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1304 
1305     } else {
      // Scalar path: four 8-byte copies bounced through the 'to' scratch
      // register.
1306       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1307       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1308       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1309       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1310       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1311       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1312       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1313       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1314     }
    // Loop entry point: advance the negative count by 4 qwords; while the
    // result is still <= 0, at least one full 32-byte chunk remains.
1315   __ BIND(L_copy_32_bytes);
1316     __ addptr(qword_count, 4);
1317     __ jcc(Assembler::lessEqual, L_loop);
    // Undo the last advance; if the (negative) count is nonzero, 1..3
    // trailing qwords remain -- exit to the caller's L_copy_8_bytes label.
1318     __ subptr(qword_count, 4);
1319     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1320   }
1321 
1322 
1323   // Copy big chunks backward
1324   //
1325   // Inputs:
1326   //   from         - source arrays address
1327   //   dest         - destination array address
1328   //   qword_count  - 64-bits element count
1329   //   to           - scratch
1330   //   L_copy_32_bytes - entry label
1331   //   L_copy_8_bytes  - exit  label
1332   //
1333   void copy_32_bytes_backward(Register from, Register dest,
1334                               Register qword_count, Register to,
1335                               Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
1336     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1337     Label L_loop;
1338     __ align(OptoLoopAlignment);
    // Main 32-bytes-per-iteration backward copy loop.  Callers must jump
    // to the L_copy_32_bytes label below; qword_count is positive and is
    // walked down toward zero, 4 qwords per iteration, copying from high
    // addresses to low so overlapping regions are handled correctly.
1339   __ BIND(L_loop);
1340     if(UseUnalignedLoadStores) {
      // SSE2 path: two unaligned 16-byte loads/stores per 32-byte chunk.
1341       __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1342       __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1343       __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1344       __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1345 
1346     } else {
      // Scalar path: four 8-byte copies bounced through the 'to' scratch
      // register.
1347       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1348       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1349       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1350       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1351       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1352       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1353       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1354       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1355     }
    // Loop entry point: take 4 qwords off the count; while the result is
    // still >= 0, at least one full 32-byte chunk remains.
1356   __ BIND(L_copy_32_bytes);
1357     __ subptr(qword_count, 4);
1358     __ jcc(Assembler::greaterEqual, L_loop);
    // Undo the last decrement; if the count is still positive, 1..3
    // trailing qwords remain -- exit to the caller's L_copy_8_bytes label.
1359     __ addptr(qword_count, 4);
1360     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1361   }
1362 
1363 
1364   // Arguments:
1365   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1366   //             ignored
1367   //   name    - stub name string
1368   //
1369   // Inputs:
1370   //   c_rarg0   - source array address
1371   //   c_rarg1   - destination array address
1372   //   c_rarg2   - element count, treated as ssize_t, can be zero
1373   //
1374   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1375   // we let the hardware handle it.  The one to eight bytes within words,
1376   // dwords or qwords that span cache line boundaries will still be loaded
1377   // and stored atomically.
1378   //
1379   // Side Effects:
1380   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1381   //   used by generate_conjoint_byte_copy().
1382   //
1383   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    // Emits the non-overlapping byte arraycopy stub: bulk 32-byte chunks
    // (emitted below the epilogue, entered via jmp), then trailing qword /
    // dword / word / byte fix-ups driven by the low bits of byte_count.
1384     __ align(CodeEntryAlignment);
1385     StubCodeMark mark(this, "StubRoutines", name);
1386     address start = __ pc();
1387 
1388     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1389     Label L_copy_byte, L_exit;
1390     const Register from        = rdi;  // source array address
1391     const Register to          = rsi;  // destination array address
1392     const Register count       = rdx;  // elements count
1393     const Register byte_count  = rcx;
1394     const Register qword_count = count;
1395     const Register end_from    = from; // source array end address
1396     const Register end_to      = to;   // destination array end address
1397     // End pointers are inclusive, and if count is not zero they point
1398     // to the last unit copied:  end_to[0] := end_from[0]
1399 
1400     __ enter(); // required for proper stackwalking of RuntimeStub frame
1401     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1402 
1403     if (entry != NULL) {
1404       *entry = __ pc();
1405        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1406       BLOCK_COMMENT("Entry:");
1407     }
1408 
1409     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1410                       // r9 and r10 may be used to save non-volatile registers
1411 
1412     // 'from', 'to' and 'count' are now valid
1413     __ movptr(byte_count, count);
1414     __ shrptr(count, 3); // count => qword_count
1415 
1416     // Copy from low to high addresses.  Use 'to' as scratch.
1417     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1418     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1419     __ negptr(qword_count); // make the count negative
1420     __ jmp(L_copy_32_bytes);
1421 
1422     // Copy trailing qwords
1423   __ BIND(L_copy_8_bytes);
1424     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1425     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1426     __ increment(qword_count);
1427     __ jcc(Assembler::notZero, L_copy_8_bytes);
1428 
1429     // Check for and copy trailing dword
1430   __ BIND(L_copy_4_bytes);
1431     __ testl(byte_count, 4);
1432     __ jccb(Assembler::zero, L_copy_2_bytes);
1433     __ movl(rax, Address(end_from, 8));
1434     __ movl(Address(end_to, 8), rax);
1435 
1436     __ addptr(end_from, 4);
1437     __ addptr(end_to, 4);
1438 
1439     // Check for and copy trailing word
1440   __ BIND(L_copy_2_bytes);
    // NOTE(review): source lines 1441-1442 are elided in this diff view --
    // presumably a "testl(byte_count, 2); jccb(zero, L_copy_byte);" guard
    // that skips the word copy below; confirm against the full file.
1443     __ movw(rax, Address(end_from, 8));
1444     __ movw(Address(end_to, 8), rax);
1445 
1446     __ addptr(end_from, 2);
1447     __ addptr(end_to, 2);
1448 
1449     // Check for and copy trailing byte
1450   __ BIND(L_copy_byte);
1451     __ testl(byte_count, 1);
1452     __ jccb(Assembler::zero, L_exit);
1453     __ movb(rax, Address(end_from, 8));
1454     __ movb(Address(end_to, 8), rax);
1455 
1456   __ BIND(L_exit);
1457     restore_arg_regs();
1458     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1459     __ xorptr(rax, rax); // return 0
1460     __ leave(); // required for proper stackwalking of RuntimeStub frame
1461     __ ret(0);
1462 
    // The bulk-copy loop is emitted here, after the epilogue; control
    // enters it via the jmp(L_copy_32_bytes) above and leaves through
    // L_copy_8_bytes, which falls through to the trailing-dword code.
1463     // Copy in 32-bytes chunks
1464     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1465     __ jmp(L_copy_4_bytes);
1466 
1467     return start;
1468   }
1469 
1470   // Arguments:
1471   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1472   //             ignored
1473   //   name    - stub name string
1474   //
1475   // Inputs:
1476   //   c_rarg0   - source array address
1477   //   c_rarg1   - destination array address
1478   //   c_rarg2   - element count, treated as ssize_t, can be zero
1479   //
1480   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1481   // we let the hardware handle it.  The one to eight bytes within words,
1482   // dwords or qwords that span cache line boundaries will still be loaded
1483   // and stored atomically.
1484   //
1485   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1486                                       address* entry, const char *name) {
    // Emits the possibly-overlapping byte arraycopy stub.  Dispatches to
    // the disjoint (forward) stub when the ranges do not overlap; otherwise
    // copies from high to low addresses: trailing byte / word / dword
    // fix-ups first, then bulk 32-byte chunks via copy_32_bytes_backward.
1487     __ align(CodeEntryAlignment);
1488     StubCodeMark mark(this, "StubRoutines", name);
1489     address start = __ pc();
1490 
1491     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1492     const Register from        = rdi;  // source array address
1493     const Register to          = rsi;  // destination array address
1494     const Register count       = rdx;  // elements count
1495     const Register byte_count  = rcx;
1496     const Register qword_count = count;
1497 
1498     __ enter(); // required for proper stackwalking of RuntimeStub frame
1499     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1500 
1501     if (entry != NULL) {
1502       *entry = __ pc();
1503       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1504       BLOCK_COMMENT("Entry:");
1505     }
1506 
1507     array_overlap_test(nooverlap_target, Address::times_1);
1508     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1509                       // r9 and r10 may be used to save non-volatile registers
1510 
1511     // 'from', 'to' and 'count' are now valid
    // NOTE(review): source lines 1512-1513 are elided in this diff view --
    // presumably "movptr(byte_count, count); shrptr(count, 3);" to derive
    // byte_count and qword_count; confirm against the full file.
1514 
1515     // Copy from high to low addresses.
1516 
1517     // Check for and copy trailing byte
1518     __ testl(byte_count, 1);
1519     __ jcc(Assembler::zero, L_copy_2_bytes);
1520     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1521     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1522     __ decrement(byte_count); // Adjust for possible trailing word
1523 
1524     // Check for and copy trailing word
1525   __ BIND(L_copy_2_bytes);
1526     __ testl(byte_count, 2);
1527     __ jcc(Assembler::zero, L_copy_4_bytes);
1528     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1529     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1530 
1531     // Check for and copy trailing dword
1532   __ BIND(L_copy_4_bytes);
1533     __ testl(byte_count, 4);
1534     __ jcc(Assembler::zero, L_copy_32_bytes);
1535     __ movl(rax, Address(from, qword_count, Address::times_8));
1536     __ movl(Address(to, qword_count, Address::times_8), rax);
1537     __ jmp(L_copy_32_bytes);
1538 
1539     // Copy trailing qwords
1540   __ BIND(L_copy_8_bytes);
1541     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1542     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1543     __ decrement(qword_count);
1544     __ jcc(Assembler::notZero, L_copy_8_bytes);
1545 
    // Fast exit when the trailing-qword loop finished the copy.
1546     restore_arg_regs();
1547     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1548     __ xorptr(rax, rax); // return 0
1549     __ leave(); // required for proper stackwalking of RuntimeStub frame
1550     __ ret(0);
1551 
    // Bulk loop emitted after the epilogue; entered via the jmp above and
    // exited through L_copy_8_bytes.  Falls through here when no trailing
    // qwords remain, hence the second epilogue below.
1552     // Copy in 32-bytes chunks
1553     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1554 
1555     restore_arg_regs();
1556     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1557     __ xorptr(rax, rax); // return 0
1558     __ leave(); // required for proper stackwalking of RuntimeStub frame
1559     __ ret(0);
1560 
1561     return start;
1562   }
1563 
1564   // Arguments:
1565   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1566   //             ignored
1567   //   name    - stub name string
1568   //
1569   // Inputs:
1570   //   c_rarg0   - source array address
1571   //   c_rarg1   - destination array address
1572   //   c_rarg2   - element count, treated as ssize_t, can be zero
1573   //
1574   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1575   // let the hardware handle it.  The two or four words within dwords
1576   // or qwords that span cache line boundaries will still be loaded
1577   // and stored atomically.
1578   //
1579   // Side Effects:
1580   //   disjoint_short_copy_entry is set to the no-overlap entry point
1581   //   used by generate_conjoint_short_copy().
1582   //
1583   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
    // Emits the non-overlapping jshort arraycopy stub: bulk 32-byte chunks
    // (emitted below the epilogue, entered via jmp), then trailing qword /
    // dword / word fix-ups driven by the low bits of word_count.
1584     __ align(CodeEntryAlignment);
1585     StubCodeMark mark(this, "StubRoutines", name);
1586     address start = __ pc();
1587 
1588     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
1589     const Register from        = rdi;  // source array address
1590     const Register to          = rsi;  // destination array address
1591     const Register count       = rdx;  // elements count
1592     const Register word_count  = rcx;
1593     const Register qword_count = count;
1594     const Register end_from    = from; // source array end address
1595     const Register end_to      = to;   // destination array end address
1596     // End pointers are inclusive, and if count is not zero they point
1597     // to the last unit copied:  end_to[0] := end_from[0]
1598 
1599     __ enter(); // required for proper stackwalking of RuntimeStub frame
1600     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1601 
1602     if (entry != NULL) {
1603       *entry = __ pc();
1604       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1605       BLOCK_COMMENT("Entry:");
1606     }
1607 
1608     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1609                       // r9 and r10 may be used to save non-volatile registers
1610 
1611     // 'from', 'to' and 'count' are now valid
1612     __ movptr(word_count, count);
1613     __ shrptr(count, 2); // count => qword_count
1614 
1615     // Copy from low to high addresses.  Use 'to' as scratch.
1616     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1617     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1618     __ negptr(qword_count);
1619     __ jmp(L_copy_32_bytes);
1620 
1621     // Copy trailing qwords
1622   __ BIND(L_copy_8_bytes);
1623     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1624     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1625     __ increment(qword_count);
1626     __ jcc(Assembler::notZero, L_copy_8_bytes);
1627 
1628     // Original 'dest' is trashed, so we can't use it as a
1629     // base register for a possible trailing word copy
1630 
1631     // Check for and copy trailing dword
1632   __ BIND(L_copy_4_bytes);
1633     __ testl(word_count, 2);
1634     __ jccb(Assembler::zero, L_copy_2_bytes);
1635     __ movl(rax, Address(end_from, 8));
1636     __ movl(Address(end_to, 8), rax);
1637 
1638     __ addptr(end_from, 4);
1639     __ addptr(end_to, 4);
1640 
1641     // Check for and copy trailing word
1642   __ BIND(L_copy_2_bytes);
1643     __ testl(word_count, 1);
1644     __ jccb(Assembler::zero, L_exit);
1645     __ movw(rax, Address(end_from, 8));
1646     __ movw(Address(end_to, 8), rax);
1647 
1648   __ BIND(L_exit);
1649     restore_arg_regs();
1650     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1651     __ xorptr(rax, rax); // return 0
1652     __ leave(); // required for proper stackwalking of RuntimeStub frame
1653     __ ret(0);
1654 
    // Bulk loop emitted after the epilogue; entered via the jmp above and
    // exited through L_copy_8_bytes into the trailing-qword code.
1655     // Copy in 32-bytes chunks
1656     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1657     __ jmp(L_copy_4_bytes);
1658 
1659     return start;
1660   }
1661 
1662   address generate_fill(BasicType t, bool aligned, const char *name) {
1663     __ align(CodeEntryAlignment);
1664     StubCodeMark mark(this, "StubRoutines", name);
1665     address start = __ pc();
1666 
1667     BLOCK_COMMENT("Entry:");
1668 
1669     const Register to       = c_rarg0;  // source array address
1670     const Register value    = c_rarg1;  // value
1671     const Register count    = c_rarg2;  // elements count
1672 
1673     __ enter(); // required for proper stackwalking of RuntimeStub frame
1674 
1675     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1676 


1683   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1684   //             ignored
1685   //   name    - stub name string
1686   //
1687   // Inputs:
1688   //   c_rarg0   - source array address
1689   //   c_rarg1   - destination array address
1690   //   c_rarg2   - element count, treated as ssize_t, can be zero
1691   //
1692   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1693   // let the hardware handle it.  The two or four words within dwords
1694   // or qwords that span cache line boundaries will still be loaded
1695   // and stored atomically.
1696   //
1697   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1698                                        address *entry, const char *name) {
    // Emits the possibly-overlapping jshort arraycopy stub.  Dispatches to
    // the disjoint (forward) stub when the ranges do not overlap; otherwise
    // copies from high to low: trailing word / dword fix-ups first, then
    // bulk 32-byte chunks via copy_32_bytes_backward.
1699     __ align(CodeEntryAlignment);
1700     StubCodeMark mark(this, "StubRoutines", name);
1701     address start = __ pc();
1702 
1703     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes;
1704     const Register from        = rdi;  // source array address
1705     const Register to          = rsi;  // destination array address
1706     const Register count       = rdx;  // elements count
1707     const Register word_count  = rcx;
1708     const Register qword_count = count;
1709 
1710     __ enter(); // required for proper stackwalking of RuntimeStub frame
1711     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1712 
1713     if (entry != NULL) {
1714       *entry = __ pc();
1715       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1716       BLOCK_COMMENT("Entry:");
1717     }
1718 
1719     array_overlap_test(nooverlap_target, Address::times_2);
1720     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1721                       // r9 and r10 may be used to save non-volatile registers
1722 
1723     // 'from', 'to' and 'count' are now valid
1724     __ movptr(word_count, count);
1725     __ shrptr(count, 2); // count => qword_count
1726 
1727     // Copy from high to low addresses.  Use 'to' as scratch.
1728 
1729     // Check for and copy trailing word
1730     __ testl(word_count, 1);
1731     __ jccb(Assembler::zero, L_copy_4_bytes);
1732     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1733     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1734 
1735     // Check for and copy trailing dword
1736   __ BIND(L_copy_4_bytes);
1737     __ testl(word_count, 2);
1738     __ jcc(Assembler::zero, L_copy_32_bytes);
1739     __ movl(rax, Address(from, qword_count, Address::times_8));
1740     __ movl(Address(to, qword_count, Address::times_8), rax);
1741     __ jmp(L_copy_32_bytes);
1742 
1743     // Copy trailing qwords
1744   __ BIND(L_copy_8_bytes);
1745     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1746     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1747     __ decrement(qword_count);
1748     __ jcc(Assembler::notZero, L_copy_8_bytes);
1749 
    // Fast exit when the trailing-qword loop finished the copy.
1750     restore_arg_regs();
1751     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1752     __ xorptr(rax, rax); // return 0
1753     __ leave(); // required for proper stackwalking of RuntimeStub frame
1754     __ ret(0);
1755 
    // Bulk loop emitted after the epilogue; entered via the jmp above and
    // exited through L_copy_8_bytes.  Falls through here when no trailing
    // qwords remain, hence the second epilogue below.
1756     // Copy in 32-bytes chunks
1757     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1758 
1759     restore_arg_regs();
1760     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1761     __ xorptr(rax, rax); // return 0
1762     __ leave(); // required for proper stackwalking of RuntimeStub frame
1763     __ ret(0);
1764 
1765     return start;
1766   }
1767 
1768   // Arguments:
1769   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1770   //             ignored
1771   //   is_oop  - true => oop array, so generate store check code
1772   //   name    - stub name string
1773   //
1774   // Inputs:
1775   //   c_rarg0   - source array address
1776   //   c_rarg1   - destination array address
1777   //   c_rarg2   - element count, treated as ssize_t, can be zero
1778   //
1779   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1780   // the hardware handle it.  The two dwords within qwords that span
1781   // cache line boundaries will still be loaded and stored atomicly.
1782   //
1783   // Side Effects:
1784   //   disjoint_int_copy_entry is set to the no-overlap entry point
1785   //   used by generate_conjoint_int_oop_copy().
1786   //
1787   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1788                                          const char *name, bool dest_uninitialized = false) {
    // Emits the non-overlapping jint (or compressed-oop, when is_oop)
    // arraycopy stub: bulk 32-byte chunks, then trailing qword / dword
    // fix-ups.  For oop arrays, GC write barriers bracket the copy.
1789     __ align(CodeEntryAlignment);
1790     StubCodeMark mark(this, "StubRoutines", name);
1791     address start = __ pc();
1792 
1793     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1794     const Register from        = rdi;  // source array address
1795     const Register to          = rsi;  // destination array address
1796     const Register count       = rdx;  // elements count
1797     const Register dword_count = rcx;
1798     const Register qword_count = count;
1799     const Register end_from    = from; // source array end address
1800     const Register end_to      = to;   // destination array end address
1801     const Register saved_to    = r11;  // saved destination array address
1802     // End pointers are inclusive, and if count is not zero they point
1803     // to the last unit copied:  end_to[0] := end_from[0]
1804 
1805     __ enter(); // required for proper stackwalking of RuntimeStub frame
1806     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1807 
1808     if (entry != NULL) {
1809       *entry = __ pc();
1810       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1811       BLOCK_COMMENT("Entry:");
1812     }
1813 
1814     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1815                       // r9 and r10 may be used to save non-volatile registers
1816     if (is_oop) {
      // 'to' (rsi) doubles as the end pointer below, so preserve the
      // original destination in r11 for the post barrier.
1817       __ movq(saved_to, to);
1818       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1819     }
1820 
1821     // 'from', 'to' and 'count' are now valid
1822     __ movptr(dword_count, count);
1823     __ shrptr(count, 1); // count => qword_count
1824 
1825     // Copy from low to high addresses.  Use 'to' as scratch.
1826     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1827     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1828     __ negptr(qword_count);
1829     __ jmp(L_copy_32_bytes);
1830 
1831     // Copy trailing qwords
1832   __ BIND(L_copy_8_bytes);
1833     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1834     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1835     __ increment(qword_count);
1836     __ jcc(Assembler::notZero, L_copy_8_bytes);
1837 
1838     // Check for and copy trailing dword
1839   __ BIND(L_copy_4_bytes);
1840     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1841     __ jccb(Assembler::zero, L_exit);
1842     __ movl(rax, Address(end_from, 8));
1843     __ movl(Address(end_to, 8), rax);
1844 
1845   __ BIND(L_exit);
1846     if (is_oop) {
      // Card-mark the copied region [saved_to, end_to] inclusive.
1847       __ leaq(end_to, Address(saved_to, dword_count, Address::times_4, -4));
1848       gen_write_ref_array_post_barrier(saved_to, end_to, rax);
1849     }
1850     restore_arg_regs();
1851     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1852     __ xorptr(rax, rax); // return 0
1853     __ leave(); // required for proper stackwalking of RuntimeStub frame
1854     __ ret(0);
1855 
    // Bulk loop emitted after the epilogue; entered via the jmp above and
    // exited through L_copy_8_bytes into the trailing-qword code.
1856     // Copy 32-bytes chunks
1857     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1858     __ jmp(L_copy_4_bytes);
1859 
1860     return start;
1861   }
1862 
1863   // Arguments:
1864   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1865   //             ignored
1866   //   is_oop  - true => oop array, so generate store check code
1867   //   name    - stub name string
1868   //
1869   // Inputs:
1870   //   c_rarg0   - source array address
1871   //   c_rarg1   - destination array address
1872   //   c_rarg2   - element count, treated as ssize_t, can be zero
1873   //
1874   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1875   // the hardware handle it.  The two dwords within qwords that span
1876   // cache line boundaries will still be loaded and stored atomicly.
1877   //
1878   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1879                                          address *entry, const char *name,
1880                                          bool dest_uninitialized = false) {
    // Emits the possibly-overlapping jint (or compressed-oop, when is_oop)
    // arraycopy stub.  Dispatches to the disjoint stub when the ranges do
    // not overlap; otherwise copies from high to low: a possible odd
    // trailing dword first, then bulk 32-byte chunks backward.  For oop
    // arrays, GC write barriers bracket the copy.
1881     __ align(CodeEntryAlignment);
1882     StubCodeMark mark(this, "StubRoutines", name);
1883     address start = __ pc();
1884 
1885     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
1886     const Register from        = rdi;  // source array address
1887     const Register to          = rsi;  // destination array address
1888     const Register count       = rdx;  // elements count
1889     const Register dword_count = rcx;
1890     const Register qword_count = count;
1891 
1892     __ enter(); // required for proper stackwalking of RuntimeStub frame
1893     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1894 
1895     if (entry != NULL) {
1896       *entry = __ pc();
1897        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1898       BLOCK_COMMENT("Entry:");
1899     }
1900 
1901     array_overlap_test(nooverlap_target, Address::times_4);
1902     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1903                       // r9 and r10 may be used to save non-volatile registers
1904 
1905     if (is_oop) {
1906       // no registers are destroyed by this call
1907       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1908     }
1909 
1910     assert_clean_int(count, rax); // Make sure 'count' is clean int.
1911     // 'from', 'to' and 'count' are now valid
1912     __ movptr(dword_count, count);
1913     __ shrptr(count, 1); // count => qword_count
1914 
1915     // Copy from high to low addresses.  Use 'to' as scratch.
1916 
1917     // Check for and copy trailing dword
1918     __ testl(dword_count, 1);
1919     __ jcc(Assembler::zero, L_copy_32_bytes);
1920     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
1921     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
1922     __ jmp(L_copy_32_bytes);
1923 
1924     // Copy trailing qwords
1925   __ BIND(L_copy_8_bytes);
1926     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1927     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1928     __ decrement(qword_count);
1929     __ jcc(Assembler::notZero, L_copy_8_bytes);
1930 
1931     if (is_oop) {
      // Oop arrays must run the post barrier before returning.
1932       __ jmp(L_exit);
1933     }
1934     restore_arg_regs();
1935     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1936     __ xorptr(rax, rax); // return 0
1937     __ leave(); // required for proper stackwalking of RuntimeStub frame
1938     __ ret(0);
1939 
    // Bulk loop emitted after the epilogue; entered via the jmp above and
    // exited through L_copy_8_bytes.  Falls through into L_exit when no
    // trailing qwords remain.
1940     // Copy in 32-bytes chunks
1941     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
1942 
1943    __ bind(L_exit);
1944      if (is_oop) {
       // Card-mark the copied region [to, end_to] inclusive.
1945        Register end_to = rdx;
1946        __ leaq(end_to, Address(to, dword_count, Address::times_4, -4));
1947        gen_write_ref_array_post_barrier(to, end_to, rax);
1948      }
1949     restore_arg_regs();
1950     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1951     __ xorptr(rax, rax); // return 0
1952     __ leave(); // required for proper stackwalking of RuntimeStub frame
1953     __ ret(0);
1954 
1955     return start;
1956   }
1957 
1958   // Arguments:
1959   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1960   //             ignored
1961   //   is_oop  - true => oop array, so generate store check code
1962   //   name    - stub name string
1963   //
1964   // Inputs:
1965   //   c_rarg0   - source array address
1966   //   c_rarg1   - destination array address
1967   //   c_rarg2   - element count, treated as ssize_t, can be zero
1968   //
1969  // Side Effects:
1970   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1971   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1972   //
1973   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
1974                                           const char *name, bool dest_uninitialized = false) {
1975     __ align(CodeEntryAlignment);
1976     StubCodeMark mark(this, "StubRoutines", name);
1977     address start = __ pc();
1978 
1979     Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
1980     const Register from        = rdi;  // source array address
1981     const Register to          = rsi;  // destination array address
1982     const Register qword_count = rdx;  // elements count
1983     const Register end_from    = from; // source array end address
1984     const Register end_to      = rcx;  // destination array end address
1985     const Register saved_to    = to;
1986     // End pointers are inclusive, and if count is not zero they point
1987     // to the last unit copied:  end_to[0] := end_from[0]
1988 
1989     __ enter(); // required for proper stackwalking of RuntimeStub frame
1990     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
1991     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1992 
1993     if (entry != NULL) {
1994       *entry = __ pc();
1995       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1996       BLOCK_COMMENT("Entry:");
1997     }
1998 
1999     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2000                       // r9 and r10 may be used to save non-volatile registers
2001     // 'from', 'to' and 'qword_count' are now valid
2002     if (is_oop) {
2003       // no registers are destroyed by this call
2004       gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized);
2005     }
2006 
2007     // Copy from low to high addresses.  Use 'to' as scratch.
2008     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2009     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2010     __ negptr(qword_count);
2011     __ jmp(L_copy_32_bytes);
2012 
2013     // Copy trailing qwords
2014   __ BIND(L_copy_8_bytes);
2015     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2016     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2017     __ increment(qword_count);
2018     __ jcc(Assembler::notZero, L_copy_8_bytes);
2019 
2020     if (is_oop) {
2021       __ jmp(L_exit);
2022     } else {
2023       restore_arg_regs();
2024       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2025       __ xorptr(rax, rax); // return 0
2026       __ leave(); // required for proper stackwalking of RuntimeStub frame
2027       __ ret(0);
2028     }
2029 
2030     // Copy 64-byte chunks
2031     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
2032 
2033     if (is_oop) {
2034     __ BIND(L_exit);
2035       gen_write_ref_array_post_barrier(saved_to, end_to, rax);
2036     }
2037     restore_arg_regs();
2038     if (is_oop) {
2039       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2040     } else {
2041       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2042     }
2043     __ xorptr(rax, rax); // return 0
2044     __ leave(); // required for proper stackwalking of RuntimeStub frame
2045     __ ret(0);
2046 
2047     return start;
2048   }
2049 
2050   // Arguments:
2051   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2052   //             ignored
2053   //   is_oop  - true => oop array, so generate store check code
2054   //   name    - stub name string
2055   //
2056   // Inputs:
2057   //   c_rarg0   - source array address
2058   //   c_rarg1   - destination array address
2059   //   c_rarg2   - element count, treated as ssize_t, can be zero
2060   //
2061   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2062                                           address nooverlap_target, address *entry,
2063                                           const char *name, bool dest_uninitialized = false) {
2064     __ align(CodeEntryAlignment);
2065     StubCodeMark mark(this, "StubRoutines", name);
2066     address start = __ pc();
2067 
2068     Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
2069     const Register from        = rdi;  // source array address
2070     const Register to          = rsi;  // destination array address
2071     const Register qword_count = rdx;  // elements count
2072     const Register saved_count = rcx;
2073 
2074     __ enter(); // required for proper stackwalking of RuntimeStub frame
2075     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2076 
2077     if (entry != NULL) {
2078       *entry = __ pc();
2079       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2080       BLOCK_COMMENT("Entry:");
2081     }
2082 
2083     array_overlap_test(nooverlap_target, Address::times_8);
2084     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2085                       // r9 and r10 may be used to save non-volatile registers
2086     // 'from', 'to' and 'qword_count' are now valid
2087     if (is_oop) {
2088       // Save to and count for store barrier
2089       __ movptr(saved_count, qword_count);
2090       // No registers are destroyed by this call
2091       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2092     }
2093 
2094     __ jmp(L_copy_32_bytes);
2095 
2096     // Copy trailing qwords
2097   __ BIND(L_copy_8_bytes);
2098     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2099     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2100     __ decrement(qword_count);
2101     __ jcc(Assembler::notZero, L_copy_8_bytes);
2102 
2103     if (is_oop) {
2104       __ jmp(L_exit);
2105     } else {
2106       restore_arg_regs();
2107       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2108       __ xorptr(rax, rax); // return 0
2109       __ leave(); // required for proper stackwalking of RuntimeStub frame
2110       __ ret(0);
2111     }
2112 
2113     // Copy in 32-bytes chunks
2114     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
2115 
2116     if (is_oop) {
2117     __ BIND(L_exit);
2118       __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
2119       gen_write_ref_array_post_barrier(to, rcx, rax);
2120     }
2121     restore_arg_regs();
2122     if (is_oop) {
2123       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2124     } else {
2125       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2126     }
2127     __ xorptr(rax, rax); // return 0
2128     __ leave(); // required for proper stackwalking of RuntimeStub frame
2129     __ ret(0);
2130 
2131     return start;
2132   }
2133 
2134 




1269         __ BIND(L_loop);
1270           __ movb(Address(start, count, Address::times_1), 0);
1271           __ decrement(count);
1272           __ jcc(Assembler::greaterEqual, L_loop);
1273         }
1274         break;
1275       default:
1276         ShouldNotReachHere();
1277 
1278     }
1279   }
1280 
1281 
  // Copy big chunks forward
  //
  // Emits the bulk-copy loop shared by the forward arraycopy stubs.  Callers
  // jump to L_copy_bytes (never fall into this code — see the DEBUG_ONLY stop)
  // and exit through L_copy_8_bytes with up to 3 trailing qwords left to copy.
  //
  // Inputs:
  //   end_from     - source array's end address
  //   end_to       - destination array's end address
  //   qword_count  - 64-bits element count, negative
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes  - exit label
  //
  void copy_bytes_forward(Register end_from, Register end_to,
                             Register qword_count, Register to,
                             Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      __ BIND(L_loop);
      if (UseAVX >= 2) {
        // Two 32-byte (256-bit) moves cover the 64-byte chunk.
        __ vmovdqu(xmm0,Address(end_from, qword_count, Address::times_8, -56));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ vmovdqu(xmm1,Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
      } else {
        // Four 16-byte SSE moves cover the 64-byte chunk.
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
        __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
        __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
      }
      __ BIND(L_copy_bytes);
      // qword_count is negative: advance by 8 qwords; loop while <= 0,
      // i.e. while a full 64-byte chunk remains.
      __ addptr(qword_count, 8);
      __ jcc(Assembler::lessEqual, L_loop);
      __ subptr(qword_count, 4);  // sub(8) and add(4)
      __ jccb(Assembler::greater, L_end);
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0,Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
      }
      __ addptr(qword_count, 4);
      __ BIND(L_end);
    } else {
      // Copy 32-bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);

      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 4);
      __ jcc(Assembler::lessEqual, L_loop);
    }
    // Undo the last advance; if qwords remain (count still negative), let the
    // caller's L_copy_8_bytes loop finish them one at a time.
    __ subptr(qword_count, 4);
    __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
  }
1353 

  // Copy big chunks backward
  //
  // Emits the bulk-copy loop shared by the backward (conjoint) arraycopy
  // stubs.  Callers jump to L_copy_bytes (never fall into this code — see the
  // DEBUG_ONLY stop) and exit through L_copy_8_bytes with up to 3 trailing
  // qwords left to copy.
  //
  // Inputs:
  //   from         - source array's address
  //   dest         - destination array's address
  //   qword_count  - 64-bits element count
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes  - exit label
  //
  void copy_bytes_backward(Register from, Register dest,
                              Register qword_count, Register to,
                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      __ BIND(L_loop);
      if (UseAVX >= 2) {
        // Two 32-byte (256-bit) moves cover the 64-byte chunk.
        __ vmovdqu(xmm0,Address(from, qword_count, Address::times_8, 32));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
        __ vmovdqu(xmm1,Address(from, qword_count, Address::times_8,  0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
      } else {
        // Four 16-byte SSE moves cover the 64-byte chunk.
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
        __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
        __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
        __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
        __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
      }
      __ BIND(L_copy_bytes);
      // Descend by 8 qwords; loop while >= 0, i.e. while a full 64-byte
      // chunk remains below the current position.
      __ subptr(qword_count, 8);
      __ jcc(Assembler::greaterEqual, L_loop);

      __ addptr(qword_count, 4);  // add(8) and sub(4)
      __ jccb(Assembler::less, L_end);
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0,Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
      }
      __ subptr(qword_count, 4);
      __ BIND(L_end);
    } else {
      // Copy 32-bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(from, qword_count, Address::times_8, 24));
      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
      __ movq(to, Address(from, qword_count, Address::times_8, 16));
      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  8));
      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  0));
      __ movq(Address(dest, qword_count, Address::times_8,  0), to);

      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 4);
      __ jcc(Assembler::greaterEqual, L_loop);
    }
    // Undo the last descent; if qwords remain (count still positive), let the
    // caller's L_copy_8_bytes loop finish them one at a time.
    __ addptr(qword_count, 4);
    __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
  }
1426 
1427 
1428   // Arguments:
1429   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1430   //             ignored
1431   //   name    - stub name string
1432   //
1433   // Inputs:
1434   //   c_rarg0   - source array address
1435   //   c_rarg1   - destination array address
1436   //   c_rarg2   - element count, treated as ssize_t, can be zero
1437   //
1438   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1439   // we let the hardware handle it.  The one to eight bytes within words,
1440   // dwords or qwords that span cache line boundaries will still be loaded
1441   // and stored atomically.
1442   //
1443   // Side Effects:
1444   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1445   //   used by generate_conjoint_byte_copy().
1446   //
1447   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1448     __ align(CodeEntryAlignment);
1449     StubCodeMark mark(this, "StubRoutines", name);
1450     address start = __ pc();
1451 
1452     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1453     Label L_copy_byte, L_exit;
1454     const Register from        = rdi;  // source array address
1455     const Register to          = rsi;  // destination array address
1456     const Register count       = rdx;  // elements count
1457     const Register byte_count  = rcx;
1458     const Register qword_count = count;
1459     const Register end_from    = from; // source array end address
1460     const Register end_to      = to;   // destination array end address
1461     // End pointers are inclusive, and if count is not zero they point
1462     // to the last unit copied:  end_to[0] := end_from[0]
1463 
1464     __ enter(); // required for proper stackwalking of RuntimeStub frame
1465     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1466 
1467     if (entry != NULL) {
1468       *entry = __ pc();
1469        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1470       BLOCK_COMMENT("Entry:");
1471     }
1472 
1473     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1474                       // r9 and r10 may be used to save non-volatile registers
1475 
1476     // 'from', 'to' and 'count' are now valid
1477     __ movptr(byte_count, count);
1478     __ shrptr(count, 3); // count => qword_count
1479 
1480     // Copy from low to high addresses.  Use 'to' as scratch.
1481     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1482     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1483     __ negptr(qword_count); // make the count negative
1484     __ jmp(L_copy_bytes);
1485 
1486     // Copy trailing qwords
1487   __ BIND(L_copy_8_bytes);
1488     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1489     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1490     __ increment(qword_count);
1491     __ jcc(Assembler::notZero, L_copy_8_bytes);
1492 
1493     // Check for and copy trailing dword
1494   __ BIND(L_copy_4_bytes);
1495     __ testl(byte_count, 4);
1496     __ jccb(Assembler::zero, L_copy_2_bytes);
1497     __ movl(rax, Address(end_from, 8));
1498     __ movl(Address(end_to, 8), rax);
1499 
1500     __ addptr(end_from, 4);
1501     __ addptr(end_to, 4);
1502 
1503     // Check for and copy trailing word
1504   __ BIND(L_copy_2_bytes);


1507     __ movw(rax, Address(end_from, 8));
1508     __ movw(Address(end_to, 8), rax);
1509 
1510     __ addptr(end_from, 2);
1511     __ addptr(end_to, 2);
1512 
1513     // Check for and copy trailing byte
1514   __ BIND(L_copy_byte);
1515     __ testl(byte_count, 1);
1516     __ jccb(Assembler::zero, L_exit);
1517     __ movb(rax, Address(end_from, 8));
1518     __ movb(Address(end_to, 8), rax);
1519 
1520   __ BIND(L_exit);
1521     restore_arg_regs();
1522     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1523     __ xorptr(rax, rax); // return 0
1524     __ leave(); // required for proper stackwalking of RuntimeStub frame
1525     __ ret(0);
1526 
1527     // Copy in multi-bytes chunks
1528     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1529     __ jmp(L_copy_4_bytes);
1530 
1531     return start;
1532   }
1533 
1534   // Arguments:
1535   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1536   //             ignored
1537   //   name    - stub name string
1538   //
1539   // Inputs:
1540   //   c_rarg0   - source array address
1541   //   c_rarg1   - destination array address
1542   //   c_rarg2   - element count, treated as ssize_t, can be zero
1543   //
1544   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1545   // we let the hardware handle it.  The one to eight bytes within words,
1546   // dwords or qwords that span cache line boundaries will still be loaded
1547   // and stored atomically.
1548   //
1549   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1550                                       address* entry, const char *name) {
1551     __ align(CodeEntryAlignment);
1552     StubCodeMark mark(this, "StubRoutines", name);
1553     address start = __ pc();
1554 
1555     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1556     const Register from        = rdi;  // source array address
1557     const Register to          = rsi;  // destination array address
1558     const Register count       = rdx;  // elements count
1559     const Register byte_count  = rcx;
1560     const Register qword_count = count;
1561 
1562     __ enter(); // required for proper stackwalking of RuntimeStub frame
1563     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1564 
1565     if (entry != NULL) {
1566       *entry = __ pc();
1567       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1568       BLOCK_COMMENT("Entry:");
1569     }
1570 
1571     array_overlap_test(nooverlap_target, Address::times_1);
1572     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1573                       // r9 and r10 may be used to save non-volatile registers
1574 
1575     // 'from', 'to' and 'count' are now valid


1578 
1579     // Copy from high to low addresses.
1580 
1581     // Check for and copy trailing byte
1582     __ testl(byte_count, 1);
1583     __ jcc(Assembler::zero, L_copy_2_bytes);
1584     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1585     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1586     __ decrement(byte_count); // Adjust for possible trailing word
1587 
1588     // Check for and copy trailing word
1589   __ BIND(L_copy_2_bytes);
1590     __ testl(byte_count, 2);
1591     __ jcc(Assembler::zero, L_copy_4_bytes);
1592     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1593     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1594 
1595     // Check for and copy trailing dword
1596   __ BIND(L_copy_4_bytes);
1597     __ testl(byte_count, 4);
1598     __ jcc(Assembler::zero, L_copy_bytes);
1599     __ movl(rax, Address(from, qword_count, Address::times_8));
1600     __ movl(Address(to, qword_count, Address::times_8), rax);
1601     __ jmp(L_copy_bytes);
1602 
1603     // Copy trailing qwords
1604   __ BIND(L_copy_8_bytes);
1605     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1606     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1607     __ decrement(qword_count);
1608     __ jcc(Assembler::notZero, L_copy_8_bytes);
1609 
1610     restore_arg_regs();
1611     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1612     __ xorptr(rax, rax); // return 0
1613     __ leave(); // required for proper stackwalking of RuntimeStub frame
1614     __ ret(0);
1615 
1616     // Copy in multi-bytes chunks
1617     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1618 
1619     restore_arg_regs();
1620     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1621     __ xorptr(rax, rax); // return 0
1622     __ leave(); // required for proper stackwalking of RuntimeStub frame
1623     __ ret(0);
1624 
1625     return start;
1626   }
1627 
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register word_count  = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    // 'from', 'to' and 'count' are now valid
    // Keep the original word count; its low two bits select the trailing
    // dword/word copies after the qword bulk copy.
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count); // negative count runs up toward zero
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Original 'dest' is trashed, so we can't use it as a
    // base register for a possible trailing word copy

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
  __ BIND(L_copy_2_bytes);
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);

  __ BIND(L_exit);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    __ jmp(L_copy_4_bytes);

    return start;
  }
1725 
1726   address generate_fill(BasicType t, bool aligned, const char *name) {
1727     __ align(CodeEntryAlignment);
1728     StubCodeMark mark(this, "StubRoutines", name);
1729     address start = __ pc();
1730 
1731     BLOCK_COMMENT("Entry:");
1732 
1733     const Register to       = c_rarg0;  // source array address
1734     const Register value    = c_rarg1;  // value
1735     const Register count    = c_rarg2;  // elements count
1736 
1737     __ enter(); // required for proper stackwalking of RuntimeStub frame
1738 
1739     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1740 


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register word_count  = rcx;
    const Register qword_count = count;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // Overlapping arrays branch to the disjoint stub when copying forward is safe.
    array_overlap_test(nooverlap_target, Address::times_2);
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    // 'from', 'to' and 'count' are now valid
    // Keep the original word count; its low two bits select the trailing
    // word/dword copies before the qword bulk copy.
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count

    // Copy from high to low addresses.  Use 'to' as scratch.

    // Check for and copy trailing word
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, word_count, Address::times_2, -2));
    __ movw(Address(to, word_count, Address::times_2, -2), rax);

    // Check for and copy trailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    // Copy in multi-bytes chunks
    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);

    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
1831 
1832   // Arguments:
1833   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1834   //             ignored
1835   //   is_oop  - true => oop array, so generate store check code
1836   //   name    - stub name string
1837   //
1838   // Inputs:
1839   //   c_rarg0   - source array address
1840   //   c_rarg1   - destination array address
1841   //   c_rarg2   - element count, treated as ssize_t, can be zero
1842   //
1843   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1844   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1846   //
1847   // Side Effects:
1848   //   disjoint_int_copy_entry is set to the no-overlap entry point
1849   //   used by generate_conjoint_int_oop_copy().
1850   //
1851   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1852                                          const char *name, bool dest_uninitialized = false) {
1853     __ align(CodeEntryAlignment);
1854     StubCodeMark mark(this, "StubRoutines", name);
1855     address start = __ pc();
1856 
1857     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1858     const Register from        = rdi;  // source array address
1859     const Register to          = rsi;  // destination array address
1860     const Register count       = rdx;  // elements count
1861     const Register dword_count = rcx;
1862     const Register qword_count = count;
1863     const Register end_from    = from; // source array end address
1864     const Register end_to      = to;   // destination array end address
1865     const Register saved_to    = r11;  // saved destination array address
1866     // End pointers are inclusive, and if count is not zero they point
1867     // to the last unit copied:  end_to[0] := end_from[0]
1868 
1869     __ enter(); // required for proper stackwalking of RuntimeStub frame
1870     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1871 
1872     if (entry != NULL) {
1873       *entry = __ pc();
1874       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1875       BLOCK_COMMENT("Entry:");
1876     }
1877 
1878     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1879                       // r9 and r10 may be used to save non-volatile registers
1880     if (is_oop) {
1881       __ movq(saved_to, to);
1882       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1883     }
1884 
1885     // 'from', 'to' and 'count' are now valid
         // Keep the original 32-bit element count in dword_count: its low bit
         // selects the trailing-dword copy at L_copy_4_bytes, and its full
         // value sizes the oop post-barrier region at L_exit.
1886     __ movptr(dword_count, count);
1887     __ shrptr(count, 1); // count => qword_count
1888 
1889     // Copy from low to high addresses.  Use 'to' as scratch.
1890     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1891     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1892     __ negptr(qword_count);
1893     __ jmp(L_copy_bytes);
         // NOTE: emitted order != execution order.  The bulk copy loop is
         // emitted by copy_bytes_forward() after the epilogue below; the jmp
         // above enters it at L_copy_bytes and it exits via L_copy_8_bytes.
1894 
1895     // Copy trailing qwords
1896   __ BIND(L_copy_8_bytes);
1897     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1898     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1899     __ increment(qword_count);
1900     __ jcc(Assembler::notZero, L_copy_8_bytes);
1901 
1902     // Check for and copy trailing dword
1903   __ BIND(L_copy_4_bytes);
1904     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1905     __ jccb(Assembler::zero, L_exit);
1906     __ movl(rax, Address(end_from, 8));
1907     __ movl(Address(end_to, 8), rax);
1908 
1909   __ BIND(L_exit);
1910     if (is_oop) {
           // end_to := address of the last dword stored; per the inclusive
           // end-pointer convention above, the post-barrier presumably covers
           // [saved_to, end_to] inclusive.
1911       __ leaq(end_to, Address(saved_to, dword_count, Address::times_4, -4));
1912       gen_write_ref_array_post_barrier(saved_to, end_to, rax);
1913     }
1914     restore_arg_regs();
1915     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1916     __ xorptr(rax, rax); // return 0
1917     __ leave(); // required for proper stackwalking of RuntimeStub frame
1918     __ ret(0);
1919 
1920     // Copy in multi-bytes chunks
         // (bulk loop body lands here; reached only through L_copy_bytes)
1921     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1922     __ jmp(L_copy_4_bytes);
1923 
1924     return start;
1925   }
1926 
1927   // Arguments:
1928   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1929   //             ignored
1930   //   is_oop  - true => oop array, so generate store check code
1931   //   name    - stub name string
1932   //
1933   // Inputs:
1934   //   c_rarg0   - source array address
1935   //   c_rarg1   - destination array address
1936   //   c_rarg2   - element count, treated as ssize_t, can be zero
1937   //
1938   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1939   // the hardware handle it.  The two dwords within qwords that span
1940   // cache line boundaries will still be loaded and stored atomically.
1941   //
1942   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1943                                          address *entry, const char *name,
1944                                          bool dest_uninitialized = false) {
1945     __ align(CodeEntryAlignment);
1946     StubCodeMark mark(this, "StubRoutines", name);
1947     address start = __ pc();
1948 
1949     Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
1950     const Register from        = rdi;  // source array address
1951     const Register to          = rsi;  // destination array address
1952     const Register count       = rdx;  // elements count
1953     const Register dword_count = rcx;
1954     const Register qword_count = count;
1955 
1956     __ enter(); // required for proper stackwalking of RuntimeStub frame
1957     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1958 
1959     if (entry != NULL) {
1960       *entry = __ pc();
1961       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1962       BLOCK_COMMENT("Entry:");
1963     }
1964 
1965     array_overlap_test(nooverlap_target, Address::times_4);
1966     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1967                       // r9 and r10 may be used to save non-volatile registers
1968 
1969     if (is_oop) {
1970       // no registers are destroyed by this call
1971       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1972     }
1973 
1974     assert_clean_int(count, rax); // Make sure 'count' is clean int.
1975     // 'from', 'to' and 'count' are now valid
         // dword_count keeps the original 32-bit element count; its low bit
         // drives the trailing-dword copy and its full value sizes the oop
         // post-barrier at L_exit.
1976     __ movptr(dword_count, count);
1977     __ shrptr(count, 1); // count => qword_count
1978 
1979     // Copy from high to low addresses.  Use 'to' as scratch.
1980 
1981     // Check for and copy trailing dword
         // In a backward (high-to-low) copy the odd trailing dword sits at the
         // highest address, so it must be copied first, before the qword loop.
1982     __ testl(dword_count, 1);
1983     __ jcc(Assembler::zero, L_copy_bytes);
1984     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
1985     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
1986     __ jmp(L_copy_bytes);
         // NOTE: emitted order != execution order.  The bulk copy loop is
         // emitted by copy_bytes_backward() below; the jmps above enter it at
         // L_copy_bytes and it exits via L_copy_8_bytes.
1987 
1988     // Copy trailing qwords
1989   __ BIND(L_copy_8_bytes);
1990     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1991     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1992     __ decrement(qword_count);
1993     __ jcc(Assembler::notZero, L_copy_8_bytes);
1994 
         // Two epilogues: the oop path jumps to L_exit to issue the store
         // barrier; the plain-int path returns directly here.
1995     if (is_oop) {
1996       __ jmp(L_exit);
1997     }
1998     restore_arg_regs();
1999     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2000     __ xorptr(rax, rax); // return 0
2001     __ leave(); // required for proper stackwalking of RuntimeStub frame
2002     __ ret(0);
2003 
2004     // Copy in multi-bytes chunks
2005     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2006 
2007    __ bind(L_exit);
2008      if (is_oop) {
2009        Register end_to = rdx;
           // end_to := address of the last dword stored; the post-barrier
           // presumably covers the inclusive range [to, end_to].
2010        __ leaq(end_to, Address(to, dword_count, Address::times_4, -4));
2011        gen_write_ref_array_post_barrier(to, end_to, rax);
2012      }
2013     restore_arg_regs();
2014     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2015     __ xorptr(rax, rax); // return 0
2016     __ leave(); // required for proper stackwalking of RuntimeStub frame
2017     __ ret(0);
2018 
2019     return start;
2020   }
2021 
2022   // Arguments:
2023   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2024   //             ignored
2025   //   is_oop  - true => oop array, so generate store check code
2026   //   name    - stub name string
2027   //
2028   // Inputs:
2029   //   c_rarg0   - source array address
2030   //   c_rarg1   - destination array address
2031   //   c_rarg2   - element count, treated as ssize_t, can be zero
2032   //
2033   // Side Effects:
2034   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2035   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2036   //
2037   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2038                                           const char *name, bool dest_uninitialized = false) {
2039     __ align(CodeEntryAlignment);
2040     StubCodeMark mark(this, "StubRoutines", name);
2041     address start = __ pc();
2042 
2043     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2044     const Register from        = rdi;  // source array address
2045     const Register to          = rsi;  // destination array address
2046     const Register qword_count = rdx;  // elements count
2047     const Register end_from    = from; // source array end address
2048     const Register end_to      = rcx;  // destination array end address
2049     const Register saved_to    = to;   // 'to' doubles as the saved dest base for the post-barrier
2050     // End pointers are inclusive, and if count is not zero they point
2051     // to the last unit copied:  end_to[0] := end_from[0]
2052 
2053     __ enter(); // required for proper stackwalking of RuntimeStub frame
2054     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2055     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2056 
2057     if (entry != NULL) {
2058       *entry = __ pc();
2059       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2060       BLOCK_COMMENT("Entry:");
2061     }
2062 
2063     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2064                       // r9 and r10 may be used to save non-volatile registers
2065     // 'from', 'to' and 'qword_count' are now valid
         // Elements are qwords here, so qword_count is the element count
         // directly -- no shift and no trailing sub-qword copy is needed.
2066     if (is_oop) {
2067       // no registers are destroyed by this call
2068       gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized);
2069     }
2070 
2071     // Copy from low to high addresses.  Use 'to' as scratch.
         // NOTE(review): "Use 'to' as scratch" looks stale -- rax is the
         // scratch actually passed to copy_bytes_forward() below, and
         // 'to' (== saved_to) must stay intact for the post-barrier; confirm.
2072     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2073     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2074     __ negptr(qword_count);
2075     __ jmp(L_copy_bytes);
         // NOTE: emitted order != execution order.  The bulk copy loop is
         // emitted by copy_bytes_forward() after the epilogue below; the jmp
         // above enters it at L_copy_bytes and it exits via L_copy_8_bytes.
2076 
2077     // Copy trailing qwords
2078   __ BIND(L_copy_8_bytes);
2079     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2080     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2081     __ increment(qword_count);
2082     __ jcc(Assembler::notZero, L_copy_8_bytes);
2083 
2084     if (is_oop) {
2085       __ jmp(L_exit);
2086     } else {
2087       restore_arg_regs();
2088       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2089       __ xorptr(rax, rax); // return 0
2090       __ leave(); // required for proper stackwalking of RuntimeStub frame
2091       __ ret(0);
2092     }
2093 
2094     // Copy in multi-bytes chunks
2095     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2096 
2097     if (is_oop) {
2098     __ BIND(L_exit);
          // end_to still holds the inclusive last-element address set by the
          // lea above; the post-barrier presumably covers [saved_to, end_to].
2099       gen_write_ref_array_post_barrier(saved_to, end_to, rax);
2100     }
2101     restore_arg_regs();
2102     if (is_oop) {
2103       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2104     } else {
2105       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2106     }
2107     __ xorptr(rax, rax); // return 0
2108     __ leave(); // required for proper stackwalking of RuntimeStub frame
2109     __ ret(0);
2110 
2111     return start;
2112   }
2113 
2114   // Arguments:
2115   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2116   //             ignored
2117   //   is_oop  - true => oop array, so generate store check code
2118   //   name    - stub name string
2119   //
2120   // Inputs:
2121   //   c_rarg0   - source array address
2122   //   c_rarg1   - destination array address
2123   //   c_rarg2   - element count, treated as ssize_t, can be zero
2124   //
2125   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2126                                           address nooverlap_target, address *entry,
2127                                           const char *name, bool dest_uninitialized = false) {
2128     __ align(CodeEntryAlignment);
2129     StubCodeMark mark(this, "StubRoutines", name);
2130     address start = __ pc();
2131 
2132     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2133     const Register from        = rdi;  // source array address
2134     const Register to          = rsi;  // destination array address
2135     const Register qword_count = rdx;  // elements count
2136     const Register saved_count = rcx;  // original count, kept for the oop post-barrier
2137 
2138     __ enter(); // required for proper stackwalking of RuntimeStub frame
2139     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2140 
2141     if (entry != NULL) {
2142       *entry = __ pc();
2143       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2144       BLOCK_COMMENT("Entry:");
2145     }
2146 
2147     array_overlap_test(nooverlap_target, Address::times_8);
2148     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2149                       // r9 and r10 may be used to save non-volatile registers
2150     // 'from', 'to' and 'qword_count' are now valid
         // Elements are qwords, so qword_count is the element count directly;
         // qword_count is consumed by the copy loops, hence the saved copy.
2151     if (is_oop) {
2152       // Save to and count for store barrier
2153       __ movptr(saved_count, qword_count);
2154       // No registers are destroyed by this call
2155       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2156     }
2157 
2158     __ jmp(L_copy_bytes);
         // NOTE: emitted order != execution order.  The bulk copy loop is
         // emitted by copy_bytes_backward() below; the jmp above enters it at
         // L_copy_bytes and it exits via L_copy_8_bytes.
2159 
2160     // Copy trailing qwords
2161   __ BIND(L_copy_8_bytes);
2162     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2163     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2164     __ decrement(qword_count);
2165     __ jcc(Assembler::notZero, L_copy_8_bytes);
2166 
2167     if (is_oop) {
2168       __ jmp(L_exit);
2169     } else {
2170       restore_arg_regs();
2171       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2172       __ xorptr(rax, rax); // return 0
2173       __ leave(); // required for proper stackwalking of RuntimeStub frame
2174       __ ret(0);
2175     }
2176 
2177     // Copy in multi-bytes chunks
2178     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2179 
2180     if (is_oop) {
2181     __ BIND(L_exit);
          // rcx (== saved_count) is reused: the lea turns the saved element
          // count into the inclusive last-element address for the
          // post-barrier, which presumably covers [to, rcx].
2182       __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
2183       gen_write_ref_array_post_barrier(to, rcx, rax);
2184     }
2185     restore_arg_regs();
2186     if (is_oop) {
2187       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2188     } else {
2189       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2190     }
2191     __ xorptr(rax, rax); // return 0
2192     __ leave(); // required for proper stackwalking of RuntimeStub frame
2193     __ ret(0);
2194 
2195     return start;
2196   }
2197 
2198 


src/cpu/x86/vm/stubGenerator_x86_64.cpp
Index Unified diffs Context diffs Sdiffs Wdiffs Patch New Old Previous File Next File