385 }
386 }
387 #endif
388 __ movptr(r15, r15_save);
389 __ movptr(r14, r14_save);
390 __ movptr(r13, r13_save);
391 __ movptr(r12, r12_save);
392 __ movptr(rbx, rbx_save);
393
394 #ifdef _WIN64
395 __ movptr(rdi, rdi_save);
396 __ movptr(rsi, rsi_save);
397 #else
398 __ ldmxcsr(mxcsr_save);
399 #endif
400
401 // restore rsp
402 __ addptr(rsp, -rsp_after_call_off * wordSize);
403
404 // return
405 __ pop(rbp);
406 __ ret(0);
407
408 // handle return types different from T_INT
409 __ BIND(is_long);
410 __ movq(Address(c_rarg0, 0), rax);
411 __ jmp(exit);
412
413 __ BIND(is_float);
414 __ movflt(Address(c_rarg0, 0), xmm0);
415 __ jmp(exit);
416
417 __ BIND(is_double);
418 __ movdbl(Address(c_rarg0, 0), xmm0);
419 __ jmp(exit);
420
421 return start;
422 }
423
424 // Return point for a Java call if there's an exception thrown in
1537 __ BIND(L_copy_2_bytes);
1538 __ testl(byte_count, 2);
1539 __ jccb(Assembler::zero, L_copy_byte);
1540 __ movw(rax, Address(end_from, 8));
1541 __ movw(Address(end_to, 8), rax);
1542
1543 __ addptr(end_from, 2);
1544 __ addptr(end_to, 2);
1545
1546 // Check for and copy trailing byte
1547 __ BIND(L_copy_byte);
1548 __ testl(byte_count, 1);
1549 __ jccb(Assembler::zero, L_exit);
1550 __ movb(rax, Address(end_from, 8));
1551 __ movb(Address(end_to, 8), rax);
1552
1553 __ BIND(L_exit);
1554 restore_arg_regs();
1555 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1556 __ xorptr(rax, rax); // return 0
1557 __ leave(); // required for proper stackwalking of RuntimeStub frame
1558 __ ret(0);
1559
1560 // Copy in multi-bytes chunks
1561 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1562 __ jmp(L_copy_4_bytes);
1563
1564 return start;
1565 }
1566
1567 // Arguments:
1568 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1569 // ignored
1570 // name - stub name string
1571 //
1572 // Inputs:
1573 // c_rarg0 - source array address
1574 // c_rarg1 - destination array address
1575 // c_rarg2 - element count, treated as ssize_t, can be zero
1576 //
1626 __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1627
1628 // Check for and copy trailing dword
1629 __ BIND(L_copy_4_bytes);
1630 __ testl(byte_count, 4);
1631 __ jcc(Assembler::zero, L_copy_bytes);
1632 __ movl(rax, Address(from, qword_count, Address::times_8));
1633 __ movl(Address(to, qword_count, Address::times_8), rax);
1634 __ jmp(L_copy_bytes);
1635
1636 // Copy trailing qwords
1637 __ BIND(L_copy_8_bytes);
1638 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1639 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1640 __ decrement(qword_count);
1641 __ jcc(Assembler::notZero, L_copy_8_bytes);
1642
1643 restore_arg_regs();
1644 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1645 __ xorptr(rax, rax); // return 0
1646 __ leave(); // required for proper stackwalking of RuntimeStub frame
1647 __ ret(0);
1648
1649 // Copy in multi-bytes chunks
1650 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1651
1652 restore_arg_regs();
1653 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1654 __ xorptr(rax, rax); // return 0
1655 __ leave(); // required for proper stackwalking of RuntimeStub frame
1656 __ ret(0);
1657
1658 return start;
1659 }
1660
1661 // Arguments:
1662 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1663 // ignored
1664 // name - stub name string
1665 //
1666 // Inputs:
1667 // c_rarg0 - source array address
1668 // c_rarg1 - destination array address
1669 // c_rarg2 - element count, treated as ssize_t, can be zero
1670 //
1671 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1672 // let the hardware handle it. The two or four words within dwords
1673 // or qwords that span cache line boundaries will still be loaded
1674 // and stored atomically.
1729 __ BIND(L_copy_4_bytes);
1730 __ testl(word_count, 2);
1731 __ jccb(Assembler::zero, L_copy_2_bytes);
1732 __ movl(rax, Address(end_from, 8));
1733 __ movl(Address(end_to, 8), rax);
1734
1735 __ addptr(end_from, 4);
1736 __ addptr(end_to, 4);
1737
1738 // Check for and copy trailing word
1739 __ BIND(L_copy_2_bytes);
1740 __ testl(word_count, 1);
1741 __ jccb(Assembler::zero, L_exit);
1742 __ movw(rax, Address(end_from, 8));
1743 __ movw(Address(end_to, 8), rax);
1744
1745 __ BIND(L_exit);
1746 restore_arg_regs();
1747 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1748 __ xorptr(rax, rax); // return 0
1749 __ leave(); // required for proper stackwalking of RuntimeStub frame
1750 __ ret(0);
1751
1752 // Copy in multi-bytes chunks
1753 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1754 __ jmp(L_copy_4_bytes);
1755
1756 return start;
1757 }
1758
1759 address generate_fill(BasicType t, bool aligned, const char *name) {
1760 __ align(CodeEntryAlignment);
1761 StubCodeMark mark(this, "StubRoutines", name);
1762 address start = __ pc();
1763
1764 BLOCK_COMMENT("Entry:");
1765
1766 const Register to = c_rarg0; // source array address
1767 const Register value = c_rarg1; // value
1768 const Register count = c_rarg2; // elements count
1769
1770 __ enter(); // required for proper stackwalking of RuntimeStub frame
1771
1772 __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1773
1774 __ leave(); // required for proper stackwalking of RuntimeStub frame
1775 __ ret(0);
1776 return start;
1777 }
1778
1779 // Arguments:
1780 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1781 // ignored
1782 // name - stub name string
1783 //
1784 // Inputs:
1785 // c_rarg0 - source array address
1786 // c_rarg1 - destination array address
1787 // c_rarg2 - element count, treated as ssize_t, can be zero
1788 //
1789 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1790 // let the hardware handle it. The two or four words within dwords
1791 // or qwords that span cache line boundaries will still be loaded
1792 // and stored atomically.
1793 //
1830 __ movw(Address(to, word_count, Address::times_2, -2), rax);
1831
1832 // Check for and copy trailing dword
1833 __ BIND(L_copy_4_bytes);
1834 __ testl(word_count, 2);
1835 __ jcc(Assembler::zero, L_copy_bytes);
1836 __ movl(rax, Address(from, qword_count, Address::times_8));
1837 __ movl(Address(to, qword_count, Address::times_8), rax);
1838 __ jmp(L_copy_bytes);
1839
1840 // Copy trailing qwords
1841 __ BIND(L_copy_8_bytes);
1842 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1843 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1844 __ decrement(qword_count);
1845 __ jcc(Assembler::notZero, L_copy_8_bytes);
1846
1847 restore_arg_regs();
1848 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1849 __ xorptr(rax, rax); // return 0
1850 __ leave(); // required for proper stackwalking of RuntimeStub frame
1851 __ ret(0);
1852
1853 // Copy in multi-bytes chunks
1854 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1855
1856 restore_arg_regs();
1857 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1858 __ xorptr(rax, rax); // return 0
1859 __ leave(); // required for proper stackwalking of RuntimeStub frame
1860 __ ret(0);
1861
1862 return start;
1863 }
1864
1865 // Arguments:
1866 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1867 // ignored
1868 // is_oop - true => oop array, so generate store check code
1869 // name - stub name string
1870 //
1871 // Inputs:
1872 // c_rarg0 - source array address
1873 // c_rarg1 - destination array address
1874 // c_rarg2 - element count, treated as ssize_t, can be zero
1875 //
1876 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1877 // the hardware handle it. The two dwords within qwords that span
1878 // cache line boundaries will still be loaded and stored atomically.
1928 // Copy trailing qwords
1929 __ BIND(L_copy_8_bytes);
1930 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1931 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1932 __ increment(qword_count);
1933 __ jcc(Assembler::notZero, L_copy_8_bytes);
1934
1935 // Check for and copy trailing dword
1936 __ BIND(L_copy_4_bytes);
1937 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1938 __ jccb(Assembler::zero, L_exit);
1939 __ movl(rax, Address(end_from, 8));
1940 __ movl(Address(end_to, 8), rax);
1941
1942 __ BIND(L_exit);
1943 if (is_oop) {
1944 gen_write_ref_array_post_barrier(saved_to, dword_count, rax);
1945 }
1946 restore_arg_regs();
1947 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1948 __ xorptr(rax, rax); // return 0
1949 __ leave(); // required for proper stackwalking of RuntimeStub frame
1950 __ ret(0);
1951
1952 // Copy in multi-bytes chunks
1953 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1954 __ jmp(L_copy_4_bytes);
1955
1956 return start;
1957 }
1958
1959 // Arguments:
1960 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1961 // ignored
1962 // is_oop - true => oop array, so generate store check code
1963 // name - stub name string
1964 //
1965 // Inputs:
1966 // c_rarg0 - source array address
1967 // c_rarg1 - destination array address
2013 // Check for and copy trailing dword
2014 __ testl(dword_count, 1);
2015 __ jcc(Assembler::zero, L_copy_bytes);
2016 __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2017 __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2018 __ jmp(L_copy_bytes);
2019
2020 // Copy trailing qwords
2021 __ BIND(L_copy_8_bytes);
2022 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2023 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2024 __ decrement(qword_count);
2025 __ jcc(Assembler::notZero, L_copy_8_bytes);
2026
2027 if (is_oop) {
2028 __ jmp(L_exit);
2029 }
2030 restore_arg_regs();
2031 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2032 __ xorptr(rax, rax); // return 0
2033 __ leave(); // required for proper stackwalking of RuntimeStub frame
2034 __ ret(0);
2035
2036 // Copy in multi-bytes chunks
2037 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2038
2039 __ BIND(L_exit);
2040 if (is_oop) {
2041 gen_write_ref_array_post_barrier(to, dword_count, rax);
2042 }
2043 restore_arg_regs();
2044 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2045 __ xorptr(rax, rax); // return 0
2046 __ leave(); // required for proper stackwalking of RuntimeStub frame
2047 __ ret(0);
2048
2049 return start;
2050 }
2051
2052 // Arguments:
2053 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2054 // ignored
2055 // is_oop - true => oop array, so generate store check code
2056 // name - stub name string
2057 //
2058 // Inputs:
2059 // c_rarg0 - source array address
2060 // c_rarg1 - destination array address
2061 // c_rarg2 - element count, treated as ssize_t, can be zero
2062 //
2063 // Side Effects:
2064 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2065 // no-overlap entry point used by generate_conjoint_long_oop_copy().
2103
2104 // Copy from low to high addresses. Use 'to' as scratch.
2105 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2106 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2107 __ negptr(qword_count);
2108 __ jmp(L_copy_bytes);
2109
2110 // Copy trailing qwords
2111 __ BIND(L_copy_8_bytes);
2112 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2113 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2114 __ increment(qword_count);
2115 __ jcc(Assembler::notZero, L_copy_8_bytes);
2116
2117 if (is_oop) {
2118 __ jmp(L_exit);
2119 } else {
2120 restore_arg_regs();
2121 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2122 __ xorptr(rax, rax); // return 0
2123 __ leave(); // required for proper stackwalking of RuntimeStub frame
2124 __ ret(0);
2125 }
2126
2127 // Copy in multi-bytes chunks
2128 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2129
2130 if (is_oop) {
2131 __ BIND(L_exit);
2132 gen_write_ref_array_post_barrier(saved_to, saved_count, rax);
2133 }
2134 restore_arg_regs();
2135 if (is_oop) {
2136 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2137 } else {
2138 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2139 }
2140 __ xorptr(rax, rax); // return 0
2141 __ leave(); // required for proper stackwalking of RuntimeStub frame
2142 __ ret(0);
2143
2144 return start;
2145 }
2146
2147 // Arguments:
2148 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2149 // ignored
2150 // is_oop - true => oop array, so generate store check code
2151 // name - stub name string
2152 //
2153 // Inputs:
2154 // c_rarg0 - source array address
2155 // c_rarg1 - destination array address
2156 // c_rarg2 - element count, treated as ssize_t, can be zero
2157 //
2158 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2159 address nooverlap_target, address *entry,
2186 __ movptr(saved_count, qword_count);
2187 // No registers are destroyed by this call
2188 gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2189 }
2190
2191 __ jmp(L_copy_bytes);
2192
2193 // Copy trailing qwords
2194 __ BIND(L_copy_8_bytes);
2195 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2196 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2197 __ decrement(qword_count);
2198 __ jcc(Assembler::notZero, L_copy_8_bytes);
2199
2200 if (is_oop) {
2201 __ jmp(L_exit);
2202 } else {
2203 restore_arg_regs();
2204 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2205 __ xorptr(rax, rax); // return 0
2206 __ leave(); // required for proper stackwalking of RuntimeStub frame
2207 __ ret(0);
2208 }
2209
2210 // Copy in multi-bytes chunks
2211 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2212
2213 if (is_oop) {
2214 __ BIND(L_exit);
2215 gen_write_ref_array_post_barrier(to, saved_count, rax);
2216 }
2217 restore_arg_regs();
2218 if (is_oop) {
2219 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2220 } else {
2221 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2222 }
2223 __ xorptr(rax, rax); // return 0
2224 __ leave(); // required for proper stackwalking of RuntimeStub frame
2225 __ ret(0);
2226
2227 return start;
2228 }
2229
2230
2231 // Helper for generating a dynamic type check.
2232 // Smashes no registers.
2233 void generate_type_check(Register sub_klass,
2234 Register super_check_offset,
2235 Register super_klass,
2236 Label& L_success) {
2237 assert_different_registers(sub_klass, super_check_offset, super_klass);
2238
2239 BLOCK_COMMENT("type_check:");
2240
2241 Label L_miss;
2242
3757
3758 const XMMRegister msgtmp1 = xmm4;
3759 const XMMRegister msgtmp2 = xmm5;
3760 const XMMRegister msgtmp3 = xmm6;
3761 const XMMRegister msgtmp4 = xmm7;
3762
3763 const XMMRegister shuf_mask = xmm8;
3764
3765 __ enter();
3766
3767 __ subptr(rsp, 4 * wordSize);
3768
3769 if (VM_Version::supports_sha()) {
3770 __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3771 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3772 } else if (VM_Version::supports_avx2()) {
3773 __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3774 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3775 }
3776 __ addptr(rsp, 4 * wordSize);
3777
3778 __ leave();
3779 __ ret(0);
3780 return start;
3781 }
3782
// Generate the SHA-512 compression stub. Requires AVX2 and BMI2 (asserted
// below); emits a single call into MacroAssembler::sha512_AVX2 which
// consumes buf/state/ofs/limit per the C calling convention.
3783 address generate_sha512_implCompress(bool multi_block, const char *name) {
3784 assert(VM_Version::supports_avx2(), "");
3785 assert(VM_Version::supports_bmi2(), "");
3786 __ align(CodeEntryAlignment);
3787 StubCodeMark mark(this, "StubRoutines", name);
3788 address start = __ pc();
3789
3790 Register buf = c_rarg0;    // message block address
3791 Register state = c_rarg1;  // SHA-512 state (8 x 64-bit words)
3792 Register ofs = c_rarg2;    // current offset (multi-block mode)
3793 Register limit = c_rarg3;  // end offset (multi-block mode)
3794
3795 const XMMRegister msg = xmm0;
3796 const XMMRegister state0 = xmm1;
3797 const XMMRegister state1 = xmm2;
3798 const XMMRegister msgtmp0 = xmm3;
3799 const XMMRegister msgtmp1 = xmm4;
3800 const XMMRegister msgtmp2 = xmm5;
3801 const XMMRegister msgtmp3 = xmm6;
3802 const XMMRegister msgtmp4 = xmm7;
3803
3804 const XMMRegister shuf_mask = xmm8;
3805
3806 __ enter();
3807
3808 __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3809 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3810
3811 __ leave();
3812 __ ret(0);
3813 return start;
3814 }
3815
3816 // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
3817 // to hide instruction latency
3818 //
3819 // Arguments:
3820 //
3821 // Inputs:
3822 // c_rarg0 - source byte array address
3823 // c_rarg1 - destination byte array address
3824 // c_rarg2 - K (key) in little endian int array
3825 // c_rarg3 - counter vector byte array address
3826 // Linux
3827 // c_rarg4 - input length
3828 // c_rarg5 - saved encryptedCounter start
3829 // rbp + 6 * wordSize - saved used length
3830 // Windows
4264 __ movdqu(xmm_temp4, xmm_temp3);
4265 __ movdqu(xmm_temp5, xmm_temp3);
4266 __ psrld(xmm_temp2, 1); // packed right shift >> 1
4267 __ psrld(xmm_temp4, 2); // packed right shift >> 2
4268 __ psrld(xmm_temp5, 7); // packed right shift >> 7
4269 __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
4270 __ pxor(xmm_temp2, xmm_temp5);
4271 __ pxor(xmm_temp2, xmm_temp8);
4272 __ pxor(xmm_temp3, xmm_temp2);
4273 __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
4274
4275 __ decrement(blocks);
4276 __ jcc(Assembler::zero, L_exit);
4277 __ movdqu(xmm_temp0, xmm_temp6);
4278 __ addptr(data, 16);
4279 __ jmp(L_ghash_loop);
4280
4281 __ BIND(L_exit);
4282 __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result
4283 __ movdqu(Address(state, 0), xmm_temp6); // store the result
4284
4285 __ leave();
4286 __ ret(0);
4287 return start;
4288 }
4289
4290 /**
4291 * Arguments:
4292 *
4293 * Inputs:
4294 * c_rarg0 - int crc
4295 * c_rarg1 - byte* buf
4296 * c_rarg2 - int length
4297 *
4298 * Output:
4299 * rax - int crc result
4300 */
// Generate the CRC32 update stub used by java.util.zip.CRC32.
// Delegates to MacroAssembler::kernel_crc32 (CLMUL-based) and returns
// the updated crc in rax.
4301 address generate_updateBytesCRC32() {
4302 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4303
4304 __ align(CodeEntryAlignment);
4305 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4306
4307 address start = __ pc();
4308 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4309 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4310 // rscratch1: r10
4311 const Register crc = c_rarg0; // crc
4312 const Register buf = c_rarg1; // source java byte array address
4313 const Register len = c_rarg2; // length
4314 const Register table = c_rarg3; // crc_table address (reuse register)
4315 const Register tmp = r11;
4316 assert_different_registers(crc, buf, len, table, tmp, rax);
4317
4318 BLOCK_COMMENT("Entry:");
4319 __ enter(); // required for proper stackwalking of RuntimeStub frame
4320
4321 __ kernel_crc32(crc, buf, len, table, tmp);
4322
4323 __ movl(rax, crc); // int crc result returned in rax
4324 __ leave(); // required for proper stackwalking of RuntimeStub frame
4325 __ ret(0);
4326
4327 return start;
4328 }
4329
4330 /**
4331 * Arguments:
4332 *
4333 * Inputs:
4334 * c_rarg0 - int crc
4335 * c_rarg1 - byte* buf
4336 * c_rarg2 - long length
4337 * c_rarg3 - table_start - optional (present only when doing a library_call,
4338 * not used by x86 algorithm)
4339 *
4340 * Output:
4341 * rax - int crc result
4342 */
4343 address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
4363 const Register z = r8;
4364 #endif
4365 assert_different_registers(crc, buf, len, a, j, k, l, y, z);
4366
4367 BLOCK_COMMENT("Entry:");
4368 __ enter(); // required for proper stackwalking of RuntimeStub frame
4369 #ifdef _WIN64
4370 __ push(y);
4371 __ push(z);
4372 #endif
4373 __ crc32c_ipl_alg2_alt2(crc, buf, len,
4374 a, j, k,
4375 l, y, z,
4376 c_farg0, c_farg1, c_farg2,
4377 is_pclmulqdq_supported);
4378 __ movl(rax, crc);
4379 #ifdef _WIN64
4380 __ pop(z);
4381 __ pop(y);
4382 #endif
4383 __ leave(); // required for proper stackwalking of RuntimeStub frame
4384 __ ret(0);
4385
4386 return start;
4387 }
4388
4389 /**
4390 * Arguments:
4391 *
4392 * Input:
4393 * c_rarg0 - x address
4394 * c_rarg1 - x length
4395 * c_rarg2 - y address
4396 * c_rarg3 - y length
4397 * not Win64
4398 * c_rarg4 - z address
4399 * c_rarg5 - z length
4400 * Win64
4401 * rsp+40 - z address
4402 * rsp+48 - z length
4477 __ xchgq(obja, scale); //now obja and scale contains the correct contents
4478
4479 const Register tmp1 = r10;
4480 const Register tmp2 = r11;
4481 #endif
4482 #ifndef _WIN64 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4483 const Register obja = c_rarg0; //U:rdi
4484 const Register objb = c_rarg1; //U:rsi
4485 const Register length = c_rarg2; //U:rdx
4486 const Register scale = c_rarg3; //U:rcx
4487 const Register tmp1 = r8;
4488 const Register tmp2 = r9;
4489 #endif
4490 const Register result = rax; //return value
4491 const XMMRegister vec0 = xmm0;
4492 const XMMRegister vec1 = xmm1;
4493 const XMMRegister vec2 = xmm2;
4494
4495 __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
4496
4497 __ leave();
4498 __ ret(0);
4499
4500 return start;
4501 }
4502
4503 /**
4504 * Arguments:
4505 *
4506 // Input:
4507 // c_rarg0 - x address
4508 // c_rarg1 - x length
4509 // c_rarg2 - z address
4510 // c_rarg3 - z length
4511 *
4512 */
4513 address generate_squareToLen() {
4514
4515 __ align(CodeEntryAlignment);
4516 StubCodeMark mark(this, "StubRoutines", "squareToLen");
|
385 }
386 }
387 #endif
388 __ movptr(r15, r15_save);
389 __ movptr(r14, r14_save);
390 __ movptr(r13, r13_save);
391 __ movptr(r12, r12_save);
392 __ movptr(rbx, rbx_save);
393
394 #ifdef _WIN64
395 __ movptr(rdi, rdi_save);
396 __ movptr(rsi, rsi_save);
397 #else
398 __ ldmxcsr(mxcsr_save);
399 #endif
400
401 // restore rsp
402 __ addptr(rsp, -rsp_after_call_off * wordSize);
403
404 // return
405 __ vzeroupper();
406 __ pop(rbp);
407 __ ret(0);
408
409 // handle return types different from T_INT
410 __ BIND(is_long);
411 __ movq(Address(c_rarg0, 0), rax);
412 __ jmp(exit);
413
414 __ BIND(is_float);
415 __ movflt(Address(c_rarg0, 0), xmm0);
416 __ jmp(exit);
417
418 __ BIND(is_double);
419 __ movdbl(Address(c_rarg0, 0), xmm0);
420 __ jmp(exit);
421
422 return start;
423 }
424
425 // Return point for a Java call if there's an exception thrown in
1538 __ BIND(L_copy_2_bytes);
1539 __ testl(byte_count, 2);
1540 __ jccb(Assembler::zero, L_copy_byte);
1541 __ movw(rax, Address(end_from, 8));
1542 __ movw(Address(end_to, 8), rax);
1543
1544 __ addptr(end_from, 2);
1545 __ addptr(end_to, 2);
1546
1547 // Check for and copy trailing byte
1548 __ BIND(L_copy_byte);
1549 __ testl(byte_count, 1);
1550 __ jccb(Assembler::zero, L_exit);
1551 __ movb(rax, Address(end_from, 8));
1552 __ movb(Address(end_to, 8), rax);
1553
1554 __ BIND(L_exit);
1555 restore_arg_regs();
1556 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1557 __ xorptr(rax, rax); // return 0
1558 __ vzeroupper();
1559 __ leave(); // required for proper stackwalking of RuntimeStub frame
1560 __ ret(0);
1561
1562 // Copy in multi-bytes chunks
1563 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1564 __ jmp(L_copy_4_bytes);
1565
1566 return start;
1567 }
1568
1569 // Arguments:
1570 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1571 // ignored
1572 // name - stub name string
1573 //
1574 // Inputs:
1575 // c_rarg0 - source array address
1576 // c_rarg1 - destination array address
1577 // c_rarg2 - element count, treated as ssize_t, can be zero
1578 //
1628 __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1629
1630 // Check for and copy trailing dword
1631 __ BIND(L_copy_4_bytes);
1632 __ testl(byte_count, 4);
1633 __ jcc(Assembler::zero, L_copy_bytes);
1634 __ movl(rax, Address(from, qword_count, Address::times_8));
1635 __ movl(Address(to, qword_count, Address::times_8), rax);
1636 __ jmp(L_copy_bytes);
1637
1638 // Copy trailing qwords
1639 __ BIND(L_copy_8_bytes);
1640 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1641 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1642 __ decrement(qword_count);
1643 __ jcc(Assembler::notZero, L_copy_8_bytes);
1644
1645 restore_arg_regs();
1646 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1647 __ xorptr(rax, rax); // return 0
1648 __ vzeroupper();
1649 __ leave(); // required for proper stackwalking of RuntimeStub frame
1650 __ ret(0);
1651
1652 // Copy in multi-bytes chunks
1653 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1654
1655 restore_arg_regs();
1656 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1657 __ xorptr(rax, rax); // return 0
1658 __ vzeroupper();
1659 __ leave(); // required for proper stackwalking of RuntimeStub frame
1660 __ ret(0);
1661
1662 return start;
1663 }
1664
1665 // Arguments:
1666 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1667 // ignored
1668 // name - stub name string
1669 //
1670 // Inputs:
1671 // c_rarg0 - source array address
1672 // c_rarg1 - destination array address
1673 // c_rarg2 - element count, treated as ssize_t, can be zero
1674 //
1675 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1676 // let the hardware handle it. The two or four words within dwords
1677 // or qwords that span cache line boundaries will still be loaded
1678 // and stored atomically.
1733 __ BIND(L_copy_4_bytes);
1734 __ testl(word_count, 2);
1735 __ jccb(Assembler::zero, L_copy_2_bytes);
1736 __ movl(rax, Address(end_from, 8));
1737 __ movl(Address(end_to, 8), rax);
1738
1739 __ addptr(end_from, 4);
1740 __ addptr(end_to, 4);
1741
1742 // Check for and copy trailing word
1743 __ BIND(L_copy_2_bytes);
1744 __ testl(word_count, 1);
1745 __ jccb(Assembler::zero, L_exit);
1746 __ movw(rax, Address(end_from, 8));
1747 __ movw(Address(end_to, 8), rax);
1748
1749 __ BIND(L_exit);
1750 restore_arg_regs();
1751 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1752 __ xorptr(rax, rax); // return 0
1753 __ vzeroupper();
1754 __ leave(); // required for proper stackwalking of RuntimeStub frame
1755 __ ret(0);
1756
1757 // Copy in multi-bytes chunks
1758 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1759 __ jmp(L_copy_4_bytes);
1760
1761 return start;
1762 }
1763
// Generate a stub that fills 'count' elements of type 't' starting at the
// address in c_rarg0 with the value in c_rarg1. rax and xmm0 are scratch.
// vzeroupper clears dirty YMM upper state left by an AVX fill, avoiding
// AVX-SSE transition penalties on return.
1764 address generate_fill(BasicType t, bool aligned, const char *name) {
1765 __ align(CodeEntryAlignment);
1766 StubCodeMark mark(this, "StubRoutines", name);
1767 address start = __ pc();
1768
1769 BLOCK_COMMENT("Entry:");
1770
1771 const Register to = c_rarg0; // destination array address
1772 const Register value = c_rarg1; // fill value
1773 const Register count = c_rarg2; // elements count
1774
1775 __ enter(); // required for proper stackwalking of RuntimeStub frame
1776
1777 __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1778
1779 __ vzeroupper();
1780 __ leave(); // required for proper stackwalking of RuntimeStub frame
1781 __ ret(0);
1782 return start;
1783 }
1784
1785 // Arguments:
1786 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1787 // ignored
1788 // name - stub name string
1789 //
1790 // Inputs:
1791 // c_rarg0 - source array address
1792 // c_rarg1 - destination array address
1793 // c_rarg2 - element count, treated as ssize_t, can be zero
1794 //
1795 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1796 // let the hardware handle it. The two or four words within dwords
1797 // or qwords that span cache line boundaries will still be loaded
1798 // and stored atomically.
1799 //
1836 __ movw(Address(to, word_count, Address::times_2, -2), rax);
1837
1838 // Check for and copy trailing dword
1839 __ BIND(L_copy_4_bytes);
1840 __ testl(word_count, 2);
1841 __ jcc(Assembler::zero, L_copy_bytes);
1842 __ movl(rax, Address(from, qword_count, Address::times_8));
1843 __ movl(Address(to, qword_count, Address::times_8), rax);
1844 __ jmp(L_copy_bytes);
1845
1846 // Copy trailing qwords
1847 __ BIND(L_copy_8_bytes);
1848 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1849 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1850 __ decrement(qword_count);
1851 __ jcc(Assembler::notZero, L_copy_8_bytes);
1852
1853 restore_arg_regs();
1854 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1855 __ xorptr(rax, rax); // return 0
1856 __ vzeroupper();
1857 __ leave(); // required for proper stackwalking of RuntimeStub frame
1858 __ ret(0);
1859
1860 // Copy in multi-bytes chunks
1861 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1862
1863 restore_arg_regs();
1864 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1865 __ xorptr(rax, rax); // return 0
1866 __ vzeroupper();
1867 __ leave(); // required for proper stackwalking of RuntimeStub frame
1868 __ ret(0);
1869
1870 return start;
1871 }
1872
1873 // Arguments:
1874 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1875 // ignored
1876 // is_oop - true => oop array, so generate store check code
1877 // name - stub name string
1878 //
1879 // Inputs:
1880 // c_rarg0 - source array address
1881 // c_rarg1 - destination array address
1882 // c_rarg2 - element count, treated as ssize_t, can be zero
1883 //
1884 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1885 // the hardware handle it. The two dwords within qwords that span
1886 // cache line boundaries will still be loaded and stored atomically.
1936 // Copy trailing qwords
1937 __ BIND(L_copy_8_bytes);
1938 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1939 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1940 __ increment(qword_count);
1941 __ jcc(Assembler::notZero, L_copy_8_bytes);
1942
1943 // Check for and copy trailing dword
1944 __ BIND(L_copy_4_bytes);
1945 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1946 __ jccb(Assembler::zero, L_exit);
1947 __ movl(rax, Address(end_from, 8));
1948 __ movl(Address(end_to, 8), rax);
1949
1950 __ BIND(L_exit);
1951 if (is_oop) {
1952 gen_write_ref_array_post_barrier(saved_to, dword_count, rax);
1953 }
1954 restore_arg_regs();
1955 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1956 __ vzeroupper();
1957 __ xorptr(rax, rax); // return 0
1958 __ leave(); // required for proper stackwalking of RuntimeStub frame
1959 __ ret(0);
1960
1961 // Copy in multi-bytes chunks
1962 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1963 __ jmp(L_copy_4_bytes);
1964
1965 return start;
1966 }
1967
1968 // Arguments:
1969 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1970 // ignored
1971 // is_oop - true => oop array, so generate store check code
1972 // name - stub name string
1973 //
1974 // Inputs:
1975 // c_rarg0 - source array address
1976 // c_rarg1 - destination array address
2022 // Check for and copy trailing dword
2023 __ testl(dword_count, 1);
2024 __ jcc(Assembler::zero, L_copy_bytes);
2025 __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2026 __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2027 __ jmp(L_copy_bytes);
2028
2029 // Copy trailing qwords
2030 __ BIND(L_copy_8_bytes);
2031 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2032 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2033 __ decrement(qword_count);
2034 __ jcc(Assembler::notZero, L_copy_8_bytes);
2035
2036 if (is_oop) {
2037 __ jmp(L_exit);
2038 }
2039 restore_arg_regs();
2040 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2041 __ xorptr(rax, rax); // return 0
2042 __ vzeroupper();
2043 __ leave(); // required for proper stackwalking of RuntimeStub frame
2044 __ ret(0);
2045
2046 // Copy in multi-bytes chunks
2047 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2048
2049 __ BIND(L_exit);
2050 if (is_oop) {
2051 gen_write_ref_array_post_barrier(to, dword_count, rax);
2052 }
2053 restore_arg_regs();
2054 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2055 __ xorptr(rax, rax); // return 0
2056 __ vzeroupper();
2057 __ leave(); // required for proper stackwalking of RuntimeStub frame
2058 __ ret(0);
2059
2060 return start;
2061 }
2062
2063 // Arguments:
2064 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2065 // ignored
2066 // is_oop - true => oop array, so generate store check code
2067 // name - stub name string
2068 //
2069 // Inputs:
2070 // c_rarg0 - source array address
2071 // c_rarg1 - destination array address
2072 // c_rarg2 - element count, treated as ssize_t, can be zero
2073 //
2074 // Side Effects:
2075 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2076 // no-overlap entry point used by generate_conjoint_long_oop_copy().
2114
2115 // Copy from low to high addresses. Use 'to' as scratch.
2116 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2117 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2118 __ negptr(qword_count);
2119 __ jmp(L_copy_bytes);
2120
2121 // Copy trailing qwords
2122 __ BIND(L_copy_8_bytes);
2123 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2124 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2125 __ increment(qword_count);
2126 __ jcc(Assembler::notZero, L_copy_8_bytes);
2127
2128 if (is_oop) {
2129 __ jmp(L_exit);
2130 } else {
2131 restore_arg_regs();
2132 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2133 __ xorptr(rax, rax); // return 0
2134 __ vzeroupper();
2135 __ leave(); // required for proper stackwalking of RuntimeStub frame
2136 __ ret(0);
2137 }
2138
2139 // Copy in multi-bytes chunks
2140 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2141
2142 if (is_oop) {
2143 __ BIND(L_exit);
2144 gen_write_ref_array_post_barrier(saved_to, saved_count, rax);
2145 }
2146 restore_arg_regs();
2147 if (is_oop) {
2148 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2149 } else {
2150 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2151 }
2152 __ vzeroupper();
2153 __ xorptr(rax, rax); // return 0
2154 __ leave(); // required for proper stackwalking of RuntimeStub frame
2155 __ ret(0);
2156
2157 return start;
2158 }
2159
2160 // Arguments:
2161 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2162 // ignored
2163 // is_oop - true => oop array, so generate store check code
2164 // name - stub name string
2165 //
2166 // Inputs:
2167 // c_rarg0 - source array address
2168 // c_rarg1 - destination array address
2169 // c_rarg2 - element count, treated as ssize_t, can be zero
2170 //
2171 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2172 address nooverlap_target, address *entry,
2199 __ movptr(saved_count, qword_count);
2200 // No registers are destroyed by this call
2201 gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2202 }
2203
2204 __ jmp(L_copy_bytes);
2205
2206 // Copy trailing qwords
2207 __ BIND(L_copy_8_bytes);
2208 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2209 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2210 __ decrement(qword_count);
2211 __ jcc(Assembler::notZero, L_copy_8_bytes);
2212
2213 if (is_oop) {
2214 __ jmp(L_exit);
2215 } else {
2216 restore_arg_regs();
2217 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2218 __ xorptr(rax, rax); // return 0
2219 __ vzeroupper();
2220 __ leave(); // required for proper stackwalking of RuntimeStub frame
2221 __ ret(0);
2222 }
2223
2224 // Copy in multi-bytes chunks
2225 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2226
2227 if (is_oop) {
2228 __ BIND(L_exit);
2229 gen_write_ref_array_post_barrier(to, saved_count, rax);
2230 }
2231 restore_arg_regs();
2232 if (is_oop) {
2233 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2234 } else {
2235 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2236 }
2237 __ vzeroupper();
2238 __ xorptr(rax, rax); // return 0
2239 __ leave(); // required for proper stackwalking of RuntimeStub frame
2240 __ ret(0);
2241
2242 return start;
2243 }
2244
2245
2246 // Helper for generating a dynamic type check.
2247 // Smashes no registers.
2248 void generate_type_check(Register sub_klass,
2249 Register super_check_offset,
2250 Register super_klass,
2251 Label& L_success) {
2252 assert_different_registers(sub_klass, super_check_offset, super_klass);
2253
2254 BLOCK_COMMENT("type_check:");
2255
2256 Label L_miss;
2257
3772
3773 const XMMRegister msgtmp1 = xmm4;
3774 const XMMRegister msgtmp2 = xmm5;
3775 const XMMRegister msgtmp3 = xmm6;
3776 const XMMRegister msgtmp4 = xmm7;
3777
3778 const XMMRegister shuf_mask = xmm8;
3779
3780 __ enter();
3781
3782 __ subptr(rsp, 4 * wordSize);
3783
3784 if (VM_Version::supports_sha()) {
3785 __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3786 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3787 } else if (VM_Version::supports_avx2()) {
3788 __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3789 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3790 }
3791 __ addptr(rsp, 4 * wordSize);
3792 __ vzeroupper();
3793 __ leave();
3794 __ ret(0);
3795 return start;
3796 }
3797
3798 address generate_sha512_implCompress(bool multi_block, const char *name) {
3799 assert(VM_Version::supports_avx2(), "");
3800 assert(VM_Version::supports_bmi2(), "");
3801 __ align(CodeEntryAlignment);
3802 StubCodeMark mark(this, "StubRoutines", name);
3803 address start = __ pc();
3804
3805 Register buf = c_rarg0;
3806 Register state = c_rarg1;
3807 Register ofs = c_rarg2;
3808 Register limit = c_rarg3;
3809
3810 const XMMRegister msg = xmm0;
3811 const XMMRegister state0 = xmm1;
3812 const XMMRegister state1 = xmm2;
3813 const XMMRegister msgtmp0 = xmm3;
3814 const XMMRegister msgtmp1 = xmm4;
3815 const XMMRegister msgtmp2 = xmm5;
3816 const XMMRegister msgtmp3 = xmm6;
3817 const XMMRegister msgtmp4 = xmm7;
3818
3819 const XMMRegister shuf_mask = xmm8;
3820
3821 __ enter();
3822
3823 __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3824 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3825
3826 __ vzeroupper();
3827 __ leave();
3828 __ ret(0);
3829 return start;
3830 }
3831
3832 // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
3833 // to hide instruction latency
3834 //
3835 // Arguments:
3836 //
3837 // Inputs:
3838 // c_rarg0 - source byte array address
3839 // c_rarg1 - destination byte array address
3840 // c_rarg2 - K (key) in little endian int array
3841 // c_rarg3 - counter vector byte array address
3842 // Linux
3843 // c_rarg4 - input length
3844 // c_rarg5 - saved encryptedCounter start
3845 // rbp + 6 * wordSize - saved used length
3846 // Windows
4280 __ movdqu(xmm_temp4, xmm_temp3);
4281 __ movdqu(xmm_temp5, xmm_temp3);
4282 __ psrld(xmm_temp2, 1); // packed left shifting >> 1
4283 __ psrld(xmm_temp4, 2); // packed left shifting >> 2
4284 __ psrld(xmm_temp5, 7); // packed left shifting >> 7
4285 __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
4286 __ pxor(xmm_temp2, xmm_temp5);
4287 __ pxor(xmm_temp2, xmm_temp8);
4288 __ pxor(xmm_temp3, xmm_temp2);
4289 __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
4290
4291 __ decrement(blocks);
4292 __ jcc(Assembler::zero, L_exit);
4293 __ movdqu(xmm_temp0, xmm_temp6);
4294 __ addptr(data, 16);
4295 __ jmp(L_ghash_loop);
4296
4297 __ BIND(L_exit);
4298 __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result
4299 __ movdqu(Address(state, 0), xmm_temp6); // store the result
4300 __ leave();
4301 __ ret(0);
4302 return start;
4303 }
4304
4305 /**
4306 * Arguments:
4307 *
4308 * Inputs:
4309 * c_rarg0 - int crc
4310 * c_rarg1 - byte* buf
4311 * c_rarg2 - int length
4312 *
4313    * Output:
4314 * rax - int crc result
4315 */
4316 address generate_updateBytesCRC32() {
4317 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4318
4319 __ align(CodeEntryAlignment);
4320 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4321
4322 address start = __ pc();
4323 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4324 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4325 // rscratch1: r10
4326 const Register crc = c_rarg0; // crc
4327 const Register buf = c_rarg1; // source java byte array address
4328 const Register len = c_rarg2; // length
4329 const Register table = c_rarg3; // crc_table address (reuse register)
4330 const Register tmp = r11;
4331 assert_different_registers(crc, buf, len, table, tmp, rax);
4332
4333 BLOCK_COMMENT("Entry:");
4334 __ enter(); // required for proper stackwalking of RuntimeStub frame
4335
4336 __ kernel_crc32(crc, buf, len, table, tmp);
4337
4338 __ movl(rax, crc);
4339 __ vzeroupper();
4340 __ leave(); // required for proper stackwalking of RuntimeStub frame
4341 __ ret(0);
4342
4343 return start;
4344 }
4345
4346 /**
4347 * Arguments:
4348 *
4349 * Inputs:
4350 * c_rarg0 - int crc
4351 * c_rarg1 - byte* buf
4352 * c_rarg2 - long length
4353 * c_rarg3 - table_start - optional (present only when doing a library_call,
4354 * not used by x86 algorithm)
4355 *
4356    * Output:
4357 * rax - int crc result
4358 */
4359 address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
4379 const Register z = r8;
4380 #endif
4381 assert_different_registers(crc, buf, len, a, j, k, l, y, z);
4382
4383 BLOCK_COMMENT("Entry:");
4384 __ enter(); // required for proper stackwalking of RuntimeStub frame
4385 #ifdef _WIN64
4386 __ push(y);
4387 __ push(z);
4388 #endif
4389 __ crc32c_ipl_alg2_alt2(crc, buf, len,
4390 a, j, k,
4391 l, y, z,
4392 c_farg0, c_farg1, c_farg2,
4393 is_pclmulqdq_supported);
4394 __ movl(rax, crc);
4395 #ifdef _WIN64
4396 __ pop(z);
4397 __ pop(y);
4398 #endif
4399 __ vzeroupper();
4400 __ leave(); // required for proper stackwalking of RuntimeStub frame
4401 __ ret(0);
4402
4403 return start;
4404 }
4405
4406 /**
4407 * Arguments:
4408 *
4409 * Input:
4410 * c_rarg0 - x address
4411 * c_rarg1 - x length
4412 * c_rarg2 - y address
4413   //   c_rarg3   - y length
4414 * not Win64
4415 * c_rarg4 - z address
4416 * c_rarg5 - z length
4417 * Win64
4418 * rsp+40 - z address
4419 * rsp+48 - z length
4494 __ xchgq(obja, scale); //now obja and scale contains the correct contents
4495
4496 const Register tmp1 = r10;
4497 const Register tmp2 = r11;
4498 #endif
4499 #ifndef _WIN64 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4500 const Register obja = c_rarg0; //U:rdi
4501 const Register objb = c_rarg1; //U:rsi
4502 const Register length = c_rarg2; //U:rdx
4503 const Register scale = c_rarg3; //U:rcx
4504 const Register tmp1 = r8;
4505 const Register tmp2 = r9;
4506 #endif
4507 const Register result = rax; //return value
4508 const XMMRegister vec0 = xmm0;
4509 const XMMRegister vec1 = xmm1;
4510 const XMMRegister vec2 = xmm2;
4511
4512 __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
4513
4514 __ vzeroupper();
4515 __ leave();
4516 __ ret(0);
4517
4518 return start;
4519 }
4520
4521 /**
4522 * Arguments:
4523 *
4524 // Input:
4525 // c_rarg0 - x address
4526 // c_rarg1 - x length
4527 // c_rarg2 - z address
4528   //  c_rarg3   - z length
4529 *
4530 */
4531 address generate_squareToLen() {
4532
4533 __ align(CodeEntryAlignment);
4534 StubCodeMark mark(this, "StubRoutines", "squareToLen");
|