
src/cpu/x86/vm/stubGenerator_x86_64.cpp





 385       }
 386     }
 387 #endif
 388     __ movptr(r15, r15_save);
 389     __ movptr(r14, r14_save);
 390     __ movptr(r13, r13_save);
 391     __ movptr(r12, r12_save);
 392     __ movptr(rbx, rbx_save);
 393 
 394 #ifdef _WIN64
 395     __ movptr(rdi, rdi_save);
 396     __ movptr(rsi, rsi_save);
 397 #else
 398     __ ldmxcsr(mxcsr_save);
 399 #endif
 400 
 401     // restore rsp
 402     __ addptr(rsp, -rsp_after_call_off * wordSize);
 403 
 404     // return

 405     __ pop(rbp);
 406     __ ret(0);
 407 
 408     // handle return types different from T_INT
 409     __ BIND(is_long);
 410     __ movq(Address(c_rarg0, 0), rax);
 411     __ jmp(exit);
 412 
 413     __ BIND(is_float);
 414     __ movflt(Address(c_rarg0, 0), xmm0);
 415     __ jmp(exit);
 416 
 417     __ BIND(is_double);
 418     __ movdbl(Address(c_rarg0, 0), xmm0);
 419     __ jmp(exit);
 420 
 421     return start;
 422   }
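
The is_long/is_float/is_double tails above store the Java call's result through the caller-supplied result slot, using the width that matches the call's BasicType; integral results arrive in rax and floating-point results in xmm0. A rough, standalone C++ model of that dispatch (the enum, function name and the float handling are illustrative, not HotSpot's actual T_* constants or code):

#include <cstdint>
#include <cstring>

// Illustrative model only: pick the store width from the result type.
enum class ResultKind { Int, Long, Float, Double };

static void store_result(void* result_slot, ResultKind kind,
                         uint64_t int_result, double fp_result) {
  switch (kind) {
    case ResultKind::Int: {                       // default path in the stub
      uint32_t v = static_cast<uint32_t>(int_result);
      std::memcpy(result_slot, &v, sizeof(v));
      break;
    }
    case ResultKind::Long:                        // is_long: store all of rax
      std::memcpy(result_slot, &int_result, sizeof(int_result));
      break;
    case ResultKind::Float: {                     // is_float: 4-byte store from xmm0
      float f = static_cast<float>(fp_result);
      std::memcpy(result_slot, &f, sizeof(f));
      break;
    }
    case ResultKind::Double:                      // is_double: 8-byte store from xmm0
      std::memcpy(result_slot, &fp_result, sizeof(fp_result));
      break;
  }
}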
 423 
 424   // Return point for a Java call if there's an exception thrown in


1537   __ BIND(L_copy_2_bytes);
1538     __ testl(byte_count, 2);
1539     __ jccb(Assembler::zero, L_copy_byte);
1540     __ movw(rax, Address(end_from, 8));
1541     __ movw(Address(end_to, 8), rax);
1542 
1543     __ addptr(end_from, 2);
1544     __ addptr(end_to, 2);
1545 
1546     // Check for and copy trailing byte
1547   __ BIND(L_copy_byte);
1548     __ testl(byte_count, 1);
1549     __ jccb(Assembler::zero, L_exit);
1550     __ movb(rax, Address(end_from, 8));
1551     __ movb(Address(end_to, 8), rax);
1552 
1553   __ BIND(L_exit);
1554     restore_arg_regs();
1555     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1556     __ xorptr(rax, rax); // return 0

1557     __ leave(); // required for proper stackwalking of RuntimeStub frame
1558     __ ret(0);
1559 
1560     // Copy in multi-byte chunks
1561     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1562     __ jmp(L_copy_4_bytes);
1563 
1564     return start;
1565   }
1566 
1567   // Arguments:
1568   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1569   //             ignored
1570   //   name    - stub name string
1571   //
1572   // Inputs:
1573   //   c_rarg0   - source array address
1574   //   c_rarg1   - destination array address
1575   //   c_rarg2   - element count, treated as ssize_t, can be zero
1576   //


1626     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1627 
1628     // Check for and copy trailing dword
1629   __ BIND(L_copy_4_bytes);
1630     __ testl(byte_count, 4);
1631     __ jcc(Assembler::zero, L_copy_bytes);
1632     __ movl(rax, Address(from, qword_count, Address::times_8));
1633     __ movl(Address(to, qword_count, Address::times_8), rax);
1634     __ jmp(L_copy_bytes);
1635 
1636     // Copy trailing qwords
1637   __ BIND(L_copy_8_bytes);
1638     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1639     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1640     __ decrement(qword_count);
1641     __ jcc(Assembler::notZero, L_copy_8_bytes);
1642 
1643     restore_arg_regs();
1644     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1645     __ xorptr(rax, rax); // return 0

1646     __ leave(); // required for proper stackwalking of RuntimeStub frame
1647     __ ret(0);
1648 
1649     // Copy in multi-byte chunks
1650     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1651 
1652     restore_arg_regs();
1653     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1654     __ xorptr(rax, rax); // return 0

1655     __ leave(); // required for proper stackwalking of RuntimeStub frame
1656     __ ret(0);
1657 
1658     return start;
1659   }
1660 
1661   // Arguments:
1662   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1663   //             ignored
1664   //   name    - stub name string
1665   //
1666   // Inputs:
1667   //   c_rarg0   - source array address
1668   //   c_rarg1   - destination array address
1669   //   c_rarg2   - element count, treated as ssize_t, can be zero
1670   //
1671   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1672   // let the hardware handle it.  The two or four words within dwords
1673   // or qwords that span cache line boundaries will still be loaded
1674   // and stored atomically.


1729   __ BIND(L_copy_4_bytes);
1730     __ testl(word_count, 2);
1731     __ jccb(Assembler::zero, L_copy_2_bytes);
1732     __ movl(rax, Address(end_from, 8));
1733     __ movl(Address(end_to, 8), rax);
1734 
1735     __ addptr(end_from, 4);
1736     __ addptr(end_to, 4);
1737 
1738     // Check for and copy trailing word
1739   __ BIND(L_copy_2_bytes);
1740     __ testl(word_count, 1);
1741     __ jccb(Assembler::zero, L_exit);
1742     __ movw(rax, Address(end_from, 8));
1743     __ movw(Address(end_to, 8), rax);
1744 
1745   __ BIND(L_exit);
1746     restore_arg_regs();
1747     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1748     __ xorptr(rax, rax); // return 0

1749     __ leave(); // required for proper stackwalking of RuntimeStub frame
1750     __ ret(0);
1751 
1752     // Copy in multi-byte chunks
1753     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1754     __ jmp(L_copy_4_bytes);
1755 
1756     return start;
1757   }
1758 
1759   address generate_fill(BasicType t, bool aligned, const char *name) {
1760     __ align(CodeEntryAlignment);
1761     StubCodeMark mark(this, "StubRoutines", name);
1762     address start = __ pc();
1763 
1764     BLOCK_COMMENT("Entry:");
1765 
1766     const Register to       = c_rarg0;  // destination array address
1767     const Register value    = c_rarg1;  // value
1768     const Register count    = c_rarg2;  // elements count
1769 
1770     __ enter(); // required for proper stackwalking of RuntimeStub frame
1771 
1772     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1773 

1774     __ leave(); // required for proper stackwalking of RuntimeStub frame
1775     __ ret(0);
1776     return start;
1777   }
1778 
1779   // Arguments:
1780   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1781   //             ignored
1782   //   name    - stub name string
1783   //
1784   // Inputs:
1785   //   c_rarg0   - source array address
1786   //   c_rarg1   - destination array address
1787   //   c_rarg2   - element count, treated as ssize_t, can be zero
1788   //
1789   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1790   // let the hardware handle it.  The two or four words within dwords
1791   // or qwords that span cache line boundaries will still be loaded
1792   // and stored atomically.
1793   //


1830     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1831 
1832     // Check for and copy trailing dword
1833   __ BIND(L_copy_4_bytes);
1834     __ testl(word_count, 2);
1835     __ jcc(Assembler::zero, L_copy_bytes);
1836     __ movl(rax, Address(from, qword_count, Address::times_8));
1837     __ movl(Address(to, qword_count, Address::times_8), rax);
1838     __ jmp(L_copy_bytes);
1839 
1840     // Copy trailing qwords
1841   __ BIND(L_copy_8_bytes);
1842     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1843     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1844     __ decrement(qword_count);
1845     __ jcc(Assembler::notZero, L_copy_8_bytes);
1846 
1847     restore_arg_regs();
1848     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1849     __ xorptr(rax, rax); // return 0

1850     __ leave(); // required for proper stackwalking of RuntimeStub frame
1851     __ ret(0);
1852 
1853     // Copy in multi-byte chunks
1854     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1855 
1856     restore_arg_regs();
1857     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1858     __ xorptr(rax, rax); // return 0

1859     __ leave(); // required for proper stackwalking of RuntimeStub frame
1860     __ ret(0);
1861 
1862     return start;
1863   }
1864 
1865   // Arguments:
1866   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1867   //             ignored
1868   //   is_oop  - true => oop array, so generate store check code
1869   //   name    - stub name string
1870   //
1871   // Inputs:
1872   //   c_rarg0   - source array address
1873   //   c_rarg1   - destination array address
1874   //   c_rarg2   - element count, treated as ssize_t, can be zero
1875   //
1876   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1877   // the hardware handle it.  The two dwords within qwords that span
1878   // cache line boundaries will still be loaded and stored atomically.


1928     // Copy trailing qwords
1929   __ BIND(L_copy_8_bytes);
1930     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1931     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1932     __ increment(qword_count);
1933     __ jcc(Assembler::notZero, L_copy_8_bytes);
1934 
1935     // Check for and copy trailing dword
1936   __ BIND(L_copy_4_bytes);
1937     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1938     __ jccb(Assembler::zero, L_exit);
1939     __ movl(rax, Address(end_from, 8));
1940     __ movl(Address(end_to, 8), rax);
1941 
1942   __ BIND(L_exit);
1943     if (is_oop) {
1944       gen_write_ref_array_post_barrier(saved_to, dword_count, rax);
1945     }
1946     restore_arg_regs();
1947     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free

1948     __ xorptr(rax, rax); // return 0
1949     __ leave(); // required for proper stackwalking of RuntimeStub frame
1950     __ ret(0);
1951 
1952     // Copy in multi-byte chunks
1953     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1954     __ jmp(L_copy_4_bytes);
1955 
1956     return start;
1957   }
1958 
1959   // Arguments:
1960   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1961   //             ignored
1962   //   is_oop  - true => oop array, so generate store check code
1963   //   name    - stub name string
1964   //
1965   // Inputs:
1966   //   c_rarg0   - source array address
1967   //   c_rarg1   - destination array address


2013     // Check for and copy trailing dword
2014     __ testl(dword_count, 1);
2015     __ jcc(Assembler::zero, L_copy_bytes);
2016     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2017     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2018     __ jmp(L_copy_bytes);
2019 
2020     // Copy trailing qwords
2021   __ BIND(L_copy_8_bytes);
2022     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2023     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2024     __ decrement(qword_count);
2025     __ jcc(Assembler::notZero, L_copy_8_bytes);
2026 
2027     if (is_oop) {
2028       __ jmp(L_exit);
2029     }
2030     restore_arg_regs();
2031     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2032     __ xorptr(rax, rax); // return 0

2033     __ leave(); // required for proper stackwalking of RuntimeStub frame
2034     __ ret(0);
2035 
2036     // Copy in multi-byte chunks
2037     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2038 
2039   __ BIND(L_exit);
2040     if (is_oop) {
2041       gen_write_ref_array_post_barrier(to, dword_count, rax);
2042     }
2043     restore_arg_regs();
2044     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2045     __ xorptr(rax, rax); // return 0

2046     __ leave(); // required for proper stackwalking of RuntimeStub frame
2047     __ ret(0);
2048 
2049     return start;
2050   }
2051 
2052   // Arguments:
2053   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2054   //             ignored
2055   //   is_oop  - true => oop array, so generate store check code
2056   //   name    - stub name string
2057   //
2058   // Inputs:
2059   //   c_rarg0   - source array address
2060   //   c_rarg1   - destination array address
2061   //   c_rarg2   - element count, treated as ssize_t, can be zero
2062   //
2063   // Side Effects:
2064   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2065   //   no-overlap entry point used by generate_conjoint_long_oop_copy().


2103 
2104     // Copy from low to high addresses.  Use 'to' as scratch.
2105     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2106     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2107     __ negptr(qword_count);
2108     __ jmp(L_copy_bytes);
2109 
2110     // Copy trailing qwords
2111   __ BIND(L_copy_8_bytes);
2112     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2113     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2114     __ increment(qword_count);
2115     __ jcc(Assembler::notZero, L_copy_8_bytes);
2116 
2117     if (is_oop) {
2118       __ jmp(L_exit);
2119     } else {
2120       restore_arg_regs();
2121       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2122       __ xorptr(rax, rax); // return 0

2123       __ leave(); // required for proper stackwalking of RuntimeStub frame
2124       __ ret(0);
2125     }
2126 
2127     // Copy in multi-byte chunks
2128     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2129 
2130     if (is_oop) {
2131     __ BIND(L_exit);
2132       gen_write_ref_array_post_barrier(saved_to, saved_count, rax);
2133     }
2134     restore_arg_regs();
2135     if (is_oop) {
2136       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2137     } else {
2138       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2139     }

2140     __ xorptr(rax, rax); // return 0
2141     __ leave(); // required for proper stackwalking of RuntimeStub frame
2142     __ ret(0);
2143 
2144     return start;
2145   }
2146 
2147   // Arguments:
2148   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2149   //             ignored
2150   //   is_oop  - true => oop array, so generate store check code
2151   //   name    - stub name string
2152   //
2153   // Inputs:
2154   //   c_rarg0   - source array address
2155   //   c_rarg1   - destination array address
2156   //   c_rarg2   - element count, treated as ssize_t, can be zero
2157   //
2158   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2159                                           address nooverlap_target, address *entry,


2186       __ movptr(saved_count, qword_count);
2187       // No registers are destroyed by this call
2188       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2189     }
2190 
2191     __ jmp(L_copy_bytes);
2192 
2193     // Copy trailing qwords
2194   __ BIND(L_copy_8_bytes);
2195     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2196     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2197     __ decrement(qword_count);
2198     __ jcc(Assembler::notZero, L_copy_8_bytes);
2199 
2200     if (is_oop) {
2201       __ jmp(L_exit);
2202     } else {
2203       restore_arg_regs();
2204       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2205       __ xorptr(rax, rax); // return 0

2206       __ leave(); // required for proper stackwalking of RuntimeStub frame
2207       __ ret(0);
2208     }
2209 
2210     // Copy in multi-byte chunks
2211     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2212 
2213     if (is_oop) {
2214     __ BIND(L_exit);
2215       gen_write_ref_array_post_barrier(to, saved_count, rax);
2216     }
2217     restore_arg_regs();
2218     if (is_oop) {
2219       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2220     } else {
2221       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2222     }

2223     __ xorptr(rax, rax); // return 0
2224     __ leave(); // required for proper stackwalking of RuntimeStub frame
2225     __ ret(0);
2226 
2227     return start;
2228   }
2229 
2230 
2231   // Helper for generating a dynamic type check.
2232   // Smashes no registers.
2233   void generate_type_check(Register sub_klass,
2234                            Register super_check_offset,
2235                            Register super_klass,
2236                            Label& L_success) {
2237     assert_different_registers(sub_klass, super_check_offset, super_klass);
2238 
2239     BLOCK_COMMENT("type_check:");
2240 
2241     Label L_miss;
2242 


3757 
3758     const XMMRegister msgtmp1 = xmm4;
3759     const XMMRegister msgtmp2 = xmm5;
3760     const XMMRegister msgtmp3 = xmm6;
3761     const XMMRegister msgtmp4 = xmm7;
3762 
3763     const XMMRegister shuf_mask = xmm8;
3764 
3765     __ enter();
3766 
3767     __ subptr(rsp, 4 * wordSize);
3768 
3769     if (VM_Version::supports_sha()) {
3770       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3771         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3772     } else if (VM_Version::supports_avx2()) {
3773       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3774         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3775     }
3776     __ addptr(rsp, 4 * wordSize);
3777 
3778     __ leave();
3779     __ ret(0);
3780     return start;
3781   }
3782 
3783   address generate_sha512_implCompress(bool multi_block, const char *name) {
3784     assert(VM_Version::supports_avx2(), "");
3785     assert(VM_Version::supports_bmi2(), "");
3786     __ align(CodeEntryAlignment);
3787     StubCodeMark mark(this, "StubRoutines", name);
3788     address start = __ pc();
3789 
3790     Register buf = c_rarg0;
3791     Register state = c_rarg1;
3792     Register ofs = c_rarg2;
3793     Register limit = c_rarg3;
3794 
3795     const XMMRegister msg = xmm0;
3796     const XMMRegister state0 = xmm1;
3797     const XMMRegister state1 = xmm2;
3798     const XMMRegister msgtmp0 = xmm3;
3799     const XMMRegister msgtmp1 = xmm4;
3800     const XMMRegister msgtmp2 = xmm5;
3801     const XMMRegister msgtmp3 = xmm6;
3802     const XMMRegister msgtmp4 = xmm7;
3803 
3804     const XMMRegister shuf_mask = xmm8;
3805 
3806     __ enter();
3807 
3808     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3809     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3810 

3811     __ leave();
3812     __ ret(0);
3813     return start;
3814   }
3815 
3816   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
3817   // to hide instruction latency
3818   //
3819   // Arguments:
3820   //
3821   // Inputs:
3822   //   c_rarg0   - source byte array address
3823   //   c_rarg1   - destination byte array address
3824   //   c_rarg2   - K (key) in little endian int array
3825   //   c_rarg3   - counter vector byte array address
3826   //   Linux
3827   //     c_rarg4   -          input length
3828   //     c_rarg5   -          saved encryptedCounter start
3829   //     rbp + 6 * wordSize - saved used length
3830   //   Windows


4264     __ movdqu(xmm_temp4, xmm_temp3);
4265     __ movdqu(xmm_temp5, xmm_temp3);
4266     __ psrld(xmm_temp2, 1);     // packed right shifting >> 1
4267     __ psrld(xmm_temp4, 2);     // packed right shifting >> 2
4268     __ psrld(xmm_temp5, 7);     // packed right shifting >> 7
4269     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4270     __ pxor(xmm_temp2, xmm_temp5);
4271     __ pxor(xmm_temp2, xmm_temp8);
4272     __ pxor(xmm_temp3, xmm_temp2);
4273     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
4274 
4275     __ decrement(blocks);
4276     __ jcc(Assembler::zero, L_exit);
4277     __ movdqu(xmm_temp0, xmm_temp6);
4278     __ addptr(data, 16);
4279     __ jmp(L_ghash_loop);
4280 
4281     __ BIND(L_exit);
4282     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4283     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4284 
4285     __ leave();
4286     __ ret(0);
4287     return start;
4288   }
4289 
4290   /**
4291    *  Arguments:
4292    *
4293    * Inputs:
4294    *   c_rarg0   - int crc
4295    *   c_rarg1   - byte* buf
4296    *   c_rarg2   - int length
4297    *
4298    * Output:
4299    *       rax   - int crc result
4300    */
4301   address generate_updateBytesCRC32() {
4302     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4303 
4304     __ align(CodeEntryAlignment);
4305     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4306 
4307     address start = __ pc();
4308     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4309     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4310     // rscratch1: r10
4311     const Register crc   = c_rarg0;  // crc
4312     const Register buf   = c_rarg1;  // source java byte array address
4313     const Register len   = c_rarg2;  // length
4314     const Register table = c_rarg3;  // crc_table address (reuse register)
4315     const Register tmp   = r11;
4316     assert_different_registers(crc, buf, len, table, tmp, rax);
4317 
4318     BLOCK_COMMENT("Entry:");
4319     __ enter(); // required for proper stackwalking of RuntimeStub frame
4320 
4321     __ kernel_crc32(crc, buf, len, table, tmp);
4322 
4323     __ movl(rax, crc);

4324     __ leave(); // required for proper stackwalking of RuntimeStub frame
4325     __ ret(0);
4326 
4327     return start;
4328   }
4329 
4330   /**
4331   *  Arguments:
4332   *
4333   * Inputs:
4334   *   c_rarg0   - int crc
4335   *   c_rarg1   - byte* buf
4336   *   c_rarg2   - long length
4337   *   c_rarg3   - table_start - optional (present only when doing a library_call,
4338   *              not used by x86 algorithm)
4339   *
4340   * Output:
4341   *       rax   - int crc result
4342   */
4343   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {


4363       const Register z = r8;
4364 #endif
4365       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
4366 
4367       BLOCK_COMMENT("Entry:");
4368       __ enter(); // required for proper stackwalking of RuntimeStub frame
4369 #ifdef _WIN64
4370       __ push(y);
4371       __ push(z);
4372 #endif
4373       __ crc32c_ipl_alg2_alt2(crc, buf, len,
4374                               a, j, k,
4375                               l, y, z,
4376                               c_farg0, c_farg1, c_farg2,
4377                               is_pclmulqdq_supported);
4378       __ movl(rax, crc);
4379 #ifdef _WIN64
4380       __ pop(z);
4381       __ pop(y);
4382 #endif

4383       __ leave(); // required for proper stackwalking of RuntimeStub frame
4384       __ ret(0);
4385 
4386       return start;
4387   }
4388 
4389   /**
4390    *  Arguments:
4391    *
4392    *  Input:
4393    *    c_rarg0   - x address
4394    *    c_rarg1   - x length
4395    *    c_rarg2   - y address
4396    *    c_rarg3   - y length
4397    * not Win64
4398    *    c_rarg4   - z address
4399    *    c_rarg5   - z length
4400    * Win64
4401    *    rsp+40    - z address
4402    *    rsp+48    - z length


4477     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
4478 
4479     const Register tmp1 = r10;
4480     const Register tmp2 = r11;
4481 #endif
4482 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4483     const Register obja = c_rarg0;   //U:rdi
4484     const Register objb = c_rarg1;   //U:rsi
4485     const Register length = c_rarg2; //U:rdx
4486     const Register scale = c_rarg3;  //U:rcx
4487     const Register tmp1 = r8;
4488     const Register tmp2 = r9;
4489 #endif
4490     const Register result = rax; //return value
4491     const XMMRegister vec0 = xmm0;
4492     const XMMRegister vec1 = xmm1;
4493     const XMMRegister vec2 = xmm2;
4494 
4495     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
4496 

4497     __ leave();
4498     __ ret(0);
4499 
4500     return start;
4501   }
4502 
4503   /**
4504    *  Arguments:
4505    *
4506    *  Input:
4507    *    c_rarg0   - x address
4508    *    c_rarg1   - x length
4509    *    c_rarg2   - z address
4510    *    c_rarg3   - z length
4511    *
4512    */
4513   address generate_squareToLen() {
4514 
4515     __ align(CodeEntryAlignment);
4516     StubCodeMark mark(this, "StubRoutines", "squareToLen");




 385       }
 386     }
 387 #endif
 388     __ movptr(r15, r15_save);
 389     __ movptr(r14, r14_save);
 390     __ movptr(r13, r13_save);
 391     __ movptr(r12, r12_save);
 392     __ movptr(rbx, rbx_save);
 393 
 394 #ifdef _WIN64
 395     __ movptr(rdi, rdi_save);
 396     __ movptr(rsi, rsi_save);
 397 #else
 398     __ ldmxcsr(mxcsr_save);
 399 #endif
 400 
 401     // restore rsp
 402     __ addptr(rsp, -rsp_after_call_off * wordSize);
 403 
 404     // return
 405     __ vzeroupper();
 406     __ pop(rbp);
 407     __ ret(0);
 408 
 409     // handle return types different from T_INT
 410     __ BIND(is_long);
 411     __ movq(Address(c_rarg0, 0), rax);
 412     __ jmp(exit);
 413 
 414     __ BIND(is_float);
 415     __ movflt(Address(c_rarg0, 0), xmm0);
 416     __ jmp(exit);
 417 
 418     __ BIND(is_double);
 419     __ movdbl(Address(c_rarg0, 0), xmm0);
 420     __ jmp(exit);
 421 
 422     return start;
 423   }
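
The __ vzeroupper() emitted just before pop(rbp)/ret above clears the upper halves of the YMM registers before the stub returns. Leaving 256-bit AVX state dirty and then executing legacy SSE instructions (which much of the surrounding runtime and compiled code uses) incurs an AVX-to-SSE transition penalty on Intel CPUs, so stubs that may have touched wide vector registers clean up on exit. A minimal, standalone C++ sketch of the same idea using compiler intrinsics rather than HotSpot's MacroAssembler (the function and its names are illustrative, not from this file):

#include <immintrin.h>
#include <cstddef>

// After a 256-bit AVX loop, clear the upper YMM halves with vzeroupper
// before falling back to scalar/SSE code, to avoid the transition stall.
void scale_floats(float* dst, const float* src, float factor, size_t n) {
  const __m256 vf = _mm256_set1_ps(factor);
  size_t i = 0;
  for (; i + 8 <= n; i += 8) {                  // 256-bit AVX main loop
    _mm256_storeu_ps(dst + i, _mm256_mul_ps(_mm256_loadu_ps(src + i), vf));
  }
  _mm256_zeroupper();                           // same effect as __ vzeroupper()
  for (; i < n; ++i) {                          // scalar (SSE) tail
    dst[i] = src[i] * factor;
  }
}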
 424 
 425   // Return point for a Java call if there's an exception thrown in


1538   __ BIND(L_copy_2_bytes);
1539     __ testl(byte_count, 2);
1540     __ jccb(Assembler::zero, L_copy_byte);
1541     __ movw(rax, Address(end_from, 8));
1542     __ movw(Address(end_to, 8), rax);
1543 
1544     __ addptr(end_from, 2);
1545     __ addptr(end_to, 2);
1546 
1547     // Check for and copy trailing byte
1548   __ BIND(L_copy_byte);
1549     __ testl(byte_count, 1);
1550     __ jccb(Assembler::zero, L_exit);
1551     __ movb(rax, Address(end_from, 8));
1552     __ movb(Address(end_to, 8), rax);
1553 
1554   __ BIND(L_exit);
1555     restore_arg_regs();
1556     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1557     __ xorptr(rax, rax); // return 0
1558     __ vzeroupper();
1559     __ leave(); // required for proper stackwalking of RuntimeStub frame
1560     __ ret(0);
1561 
1562     // Copy in multi-byte chunks
1563     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1564     __ jmp(L_copy_4_bytes);
1565 
1566     return start;
1567   }
1568 
1569   // Arguments:
1570   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1571   //             ignored
1572   //   name    - stub name string
1573   //
1574   // Inputs:
1575   //   c_rarg0   - source array address
1576   //   c_rarg1   - destination array address
1577   //   c_rarg2   - element count, treated as ssize_t, can be zero
1578   //


1628     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1629 
1630     // Check for and copy trailing dword
1631   __ BIND(L_copy_4_bytes);
1632     __ testl(byte_count, 4);
1633     __ jcc(Assembler::zero, L_copy_bytes);
1634     __ movl(rax, Address(from, qword_count, Address::times_8));
1635     __ movl(Address(to, qword_count, Address::times_8), rax);
1636     __ jmp(L_copy_bytes);
1637 
1638     // Copy trailing qwords
1639   __ BIND(L_copy_8_bytes);
1640     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1641     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1642     __ decrement(qword_count);
1643     __ jcc(Assembler::notZero, L_copy_8_bytes);
1644 
1645     restore_arg_regs();
1646     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1647     __ xorptr(rax, rax); // return 0
1648     __ vzeroupper();
1649     __ leave(); // required for proper stackwalking of RuntimeStub frame
1650     __ ret(0);
1651 
1652     // Copy in multi-byte chunks
1653     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1654 
1655     restore_arg_regs();
1656     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1657     __ xorptr(rax, rax); // return 0
1658     __ vzeroupper();
1659     __ leave(); // required for proper stackwalking of RuntimeStub frame
1660     __ ret(0);
1661 
1662     return start;
1663   }
1664 
1665   // Arguments:
1666   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1667   //             ignored
1668   //   name    - stub name string
1669   //
1670   // Inputs:
1671   //   c_rarg0   - source array address
1672   //   c_rarg1   - destination array address
1673   //   c_rarg2   - element count, treated as ssize_t, can be zero
1674   //
1675   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1676   // let the hardware handle it.  The two or four words within dwords
1677   // or qwords that span cache line boundaries will still be loaded
1678   // and stored atomically.
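
The generated copy loops move 8-byte chunks and then use the low bits of the byte count to pick up the trailing 4-, 2- and 1-byte pieces, relying on the hardware's handling of misaligned accesses as described above. A simplified, standalone C++ sketch of that tail dispatch (names and addressing are illustrative; the real stub indexes relative to the end of the arrays with negative counts):

#include <cstddef>
#include <cstring>

// Whatever is left after the 8-byte loop is copied as optional 4-, 2- and
// 1-byte pieces selected by the low bits of the byte count.
static void copy_tail(const unsigned char* from, unsigned char* to, size_t byte_count) {
  size_t off = byte_count & ~static_cast<size_t>(7);   // bytes already copied as qwords
  if (byte_count & 4) { std::memcpy(to + off, from + off, 4); off += 4; }
  if (byte_count & 2) { std::memcpy(to + off, from + off, 2); off += 2; }
  if (byte_count & 1) { to[off] = from[off]; }
}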


1733   __ BIND(L_copy_4_bytes);
1734     __ testl(word_count, 2);
1735     __ jccb(Assembler::zero, L_copy_2_bytes);
1736     __ movl(rax, Address(end_from, 8));
1737     __ movl(Address(end_to, 8), rax);
1738 
1739     __ addptr(end_from, 4);
1740     __ addptr(end_to, 4);
1741 
1742     // Check for and copy trailing word
1743   __ BIND(L_copy_2_bytes);
1744     __ testl(word_count, 1);
1745     __ jccb(Assembler::zero, L_exit);
1746     __ movw(rax, Address(end_from, 8));
1747     __ movw(Address(end_to, 8), rax);
1748 
1749   __ BIND(L_exit);
1750     restore_arg_regs();
1751     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1752     __ xorptr(rax, rax); // return 0
1753     __ vzeroupper();
1754     __ leave(); // required for proper stackwalking of RuntimeStub frame
1755     __ ret(0);
1756 
1757     // Copy in multi-byte chunks
1758     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1759     __ jmp(L_copy_4_bytes);
1760 
1761     return start;
1762   }
1763 
1764   address generate_fill(BasicType t, bool aligned, const char *name) {
1765     __ align(CodeEntryAlignment);
1766     StubCodeMark mark(this, "StubRoutines", name);
1767     address start = __ pc();
1768 
1769     BLOCK_COMMENT("Entry:");
1770 
1771     const Register to       = c_rarg0;  // destination array address
1772     const Register value    = c_rarg1;  // value
1773     const Register count    = c_rarg2;  // elements count
1774 
1775     __ enter(); // required for proper stackwalking of RuntimeStub frame
1776 
1777     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1778 
1779     __ vzeroupper();
1780     __ leave(); // required for proper stackwalking of RuntimeStub frame
1781     __ ret(0);
1782     return start;
1783   }
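
generate_fill() emits a fill loop whose store width is largely independent of the element type: the value is first replicated so the same 32-bit (and ultimately wider vector) pattern can be stored whether the Java element is a byte, short or int. A rough, standalone sketch of that widening step (assumed behaviour of the shared fill helper, not code lifted from this file):

#include <cstdint>

// Replicate a byte or short across 32 bits so one store pattern serves
// all element widths; an int value is already full width.
static uint32_t widen_fill_value(uint32_t value, int log2_element_size) {
  if (log2_element_size == 0) {            // byte: 0x000000ab -> 0xabababab
    value &= 0xffu;
    value |= value << 8;
    value |= value << 16;
  } else if (log2_element_size == 1) {     // short: 0x0000abcd -> 0xabcdabcd
    value &= 0xffffu;
    value |= value << 16;
  }
  return value;
}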
1784 
1785   // Arguments:
1786   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1787   //             ignored
1788   //   name    - stub name string
1789   //
1790   // Inputs:
1791   //   c_rarg0   - source array address
1792   //   c_rarg1   - destination array address
1793   //   c_rarg2   - element count, treated as ssize_t, can be zero
1794   //
1795   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1796   // let the hardware handle it.  The two or four words within dwords
1797   // or qwords that span cache line boundaries will still be loaded
1798   // and stored atomically.
1799   //


1836     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1837 
1838     // Check for and copy trailing dword
1839   __ BIND(L_copy_4_bytes);
1840     __ testl(word_count, 2);
1841     __ jcc(Assembler::zero, L_copy_bytes);
1842     __ movl(rax, Address(from, qword_count, Address::times_8));
1843     __ movl(Address(to, qword_count, Address::times_8), rax);
1844     __ jmp(L_copy_bytes);
1845 
1846     // Copy trailing qwords
1847   __ BIND(L_copy_8_bytes);
1848     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1849     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1850     __ decrement(qword_count);
1851     __ jcc(Assembler::notZero, L_copy_8_bytes);
1852 
1853     restore_arg_regs();
1854     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1855     __ xorptr(rax, rax); // return 0
1856     __ vzeroupper();
1857     __ leave(); // required for proper stackwalking of RuntimeStub frame
1858     __ ret(0);
1859 
1860     // Copy in multi-byte chunks
1861     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1862 
1863     restore_arg_regs();
1864     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1865     __ xorptr(rax, rax); // return 0
1866     __ vzeroupper();
1867     __ leave(); // required for proper stackwalking of RuntimeStub frame
1868     __ ret(0);
1869 
1870     return start;
1871   }
1872 
1873   // Arguments:
1874   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1875   //             ignored
1876   //   is_oop  - true => oop array, so generate store check code
1877   //   name    - stub name string
1878   //
1879   // Inputs:
1880   //   c_rarg0   - source array address
1881   //   c_rarg1   - destination array address
1882   //   c_rarg2   - element count, treated as ssize_t, can be zero
1883   //
1884   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1885   // the hardware handle it.  The two dwords within qwords that span
1886   // cache line boundaries will still be loaded and stored atomically.
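
For oop arrays the stub finishes with gen_write_ref_array_post_barrier(), which dirties the card table entries covering the destination range so the garbage collector will rescan it for cross-generation references. A conceptual, standalone sketch of such a post barrier (the 512-byte card size and the dirty value are typical defaults, not read from this file):

#include <cstddef>
#include <cstdint>

// Conceptual card-table post barrier for a copied oop range:
// mark every card spanned by [dest, dest + count) as dirty.
static void write_ref_array_post_barrier(void** dest, size_t count,
                                         uint8_t* card_table, uintptr_t heap_base) {
  const int card_shift = 9;                       // 2^9 = 512-byte cards (typical)
  uintptr_t first = reinterpret_cast<uintptr_t>(dest);
  uintptr_t last  = reinterpret_cast<uintptr_t>(dest + count) - 1;
  uintptr_t first_card = (first - heap_base) >> card_shift;
  uintptr_t last_card  = (last  - heap_base) >> card_shift;
  for (uintptr_t c = first_card; c <= last_card; ++c) {
    card_table[c] = 0;                            // 0 == dirty card in this sketch
  }
}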


1936     // Copy trailing qwords
1937   __ BIND(L_copy_8_bytes);
1938     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1939     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1940     __ increment(qword_count);
1941     __ jcc(Assembler::notZero, L_copy_8_bytes);
1942 
1943     // Check for and copy trailing dword
1944   __ BIND(L_copy_4_bytes);
1945     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1946     __ jccb(Assembler::zero, L_exit);
1947     __ movl(rax, Address(end_from, 8));
1948     __ movl(Address(end_to, 8), rax);
1949 
1950   __ BIND(L_exit);
1951     if (is_oop) {
1952       gen_write_ref_array_post_barrier(saved_to, dword_count, rax);
1953     }
1954     restore_arg_regs();
1955     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1956     __ vzeroupper();
1957     __ xorptr(rax, rax); // return 0
1958     __ leave(); // required for proper stackwalking of RuntimeStub frame
1959     __ ret(0);
1960 
1961     // Copy in multi-byte chunks
1962     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1963     __ jmp(L_copy_4_bytes);
1964 
1965     return start;
1966   }
1967 
1968   // Arguments:
1969   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1970   //             ignored
1971   //   is_oop  - true => oop array, so generate store check code
1972   //   name    - stub name string
1973   //
1974   // Inputs:
1975   //   c_rarg0   - source array address
1976   //   c_rarg1   - destination array address


2022     // Check for and copy trailing dword
2023     __ testl(dword_count, 1);
2024     __ jcc(Assembler::zero, L_copy_bytes);
2025     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2026     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2027     __ jmp(L_copy_bytes);
2028 
2029     // Copy trailing qwords
2030   __ BIND(L_copy_8_bytes);
2031     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2032     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2033     __ decrement(qword_count);
2034     __ jcc(Assembler::notZero, L_copy_8_bytes);
2035 
2036     if (is_oop) {
2037       __ jmp(L_exit);
2038     }
2039     restore_arg_regs();
2040     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2041     __ xorptr(rax, rax); // return 0
2042     __ vzeroupper();
2043     __ leave(); // required for proper stackwalking of RuntimeStub frame
2044     __ ret(0);
2045 
2046     // Copy in multi-byte chunks
2047     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2048 
2049   __ BIND(L_exit);
2050     if (is_oop) {
2051       gen_write_ref_array_post_barrier(to, dword_count, rax);
2052     }
2053     restore_arg_regs();
2054     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2055     __ xorptr(rax, rax); // return 0
2056     __ vzeroupper();
2057     __ leave(); // required for proper stackwalking of RuntimeStub frame
2058     __ ret(0);
2059 
2060     return start;
2061   }
2062 
2063   // Arguments:
2064   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2065   //             ignored
2066   //   is_oop  - true => oop array, so generate store check code
2067   //   name    - stub name string
2068   //
2069   // Inputs:
2070   //   c_rarg0   - source array address
2071   //   c_rarg1   - destination array address
2072   //   c_rarg2   - element count, treated as ssize_t, can be zero
2073   //
2074   // Side Effects:
2075   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2076   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
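
The no-overlap ("disjoint") entry point recorded here is what the conjoint stubs branch to once they have checked that a forward, low-to-high copy cannot overwrite source elements before they are read. A small sketch of that overlap test (illustrative only; in the generator the check is performed on scaled element counts by a helper such as array_overlap_test):

#include <cstdint>
#include <cstddef>

// Forward (low-to-high) copying is only safe when the destination does not
// overlap the not-yet-read part of the source; otherwise copy backward.
static bool forward_copy_is_safe(const void* from, const void* to, size_t byte_count) {
  uintptr_t f = reinterpret_cast<uintptr_t>(from);
  uintptr_t t = reinterpret_cast<uintptr_t>(to);
  return t <= f || t >= f + byte_count;   // no destructive overlap
}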


2114 
2115     // Copy from low to high addresses.  Use 'to' as scratch.
2116     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2117     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2118     __ negptr(qword_count);
2119     __ jmp(L_copy_bytes);
2120 
2121     // Copy trailing qwords
2122   __ BIND(L_copy_8_bytes);
2123     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2124     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2125     __ increment(qword_count);
2126     __ jcc(Assembler::notZero, L_copy_8_bytes);
2127 
2128     if (is_oop) {
2129       __ jmp(L_exit);
2130     } else {
2131       restore_arg_regs();
2132       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2133       __ xorptr(rax, rax); // return 0
2134       __ vzeroupper();
2135       __ leave(); // required for proper stackwalking of RuntimeStub frame
2136       __ ret(0);
2137     }
2138 
2139     // Copy in multi-byte chunks
2140     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2141 
2142     if (is_oop) {
2143     __ BIND(L_exit);
2144       gen_write_ref_array_post_barrier(saved_to, saved_count, rax);
2145     }
2146     restore_arg_regs();
2147     if (is_oop) {
2148       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2149     } else {
2150       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2151     }
2152     __ vzeroupper();
2153     __ xorptr(rax, rax); // return 0
2154     __ leave(); // required for proper stackwalking of RuntimeStub frame
2155     __ ret(0);
2156 
2157     return start;
2158   }
2159 
2160   // Arguments:
2161   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2162   //             ignored
2163   //   is_oop  - true => oop array, so generate store check code
2164   //   name    - stub name string
2165   //
2166   // Inputs:
2167   //   c_rarg0   - source array address
2168   //   c_rarg1   - destination array address
2169   //   c_rarg2   - element count, treated as ssize_t, can be zero
2170   //
2171   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2172                                           address nooverlap_target, address *entry,


2199       __ movptr(saved_count, qword_count);
2200       // No registers are destroyed by this call
2201       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2202     }
2203 
2204     __ jmp(L_copy_bytes);
2205 
2206     // Copy trailing qwords
2207   __ BIND(L_copy_8_bytes);
2208     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2209     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2210     __ decrement(qword_count);
2211     __ jcc(Assembler::notZero, L_copy_8_bytes);
2212 
2213     if (is_oop) {
2214       __ jmp(L_exit);
2215     } else {
2216       restore_arg_regs();
2217       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2218       __ xorptr(rax, rax); // return 0
2219       __ vzeroupper();
2220       __ leave(); // required for proper stackwalking of RuntimeStub frame
2221       __ ret(0);
2222     }
2223 
2224     // Copy in multi-byte chunks
2225     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2226 
2227     if (is_oop) {
2228     __ BIND(L_exit);
2229       gen_write_ref_array_post_barrier(to, saved_count, rax);
2230     }
2231     restore_arg_regs();
2232     if (is_oop) {
2233       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2234     } else {
2235       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2236     }
2237     __ vzeroupper();
2238     __ xorptr(rax, rax); // return 0
2239     __ leave(); // required for proper stackwalking of RuntimeStub frame
2240     __ ret(0);
2241 
2242     return start;
2243   }
2244 
2245 
2246   // Helper for generating a dynamic type check.
2247   // Smashes no registers.
2248   void generate_type_check(Register sub_klass,
2249                            Register super_check_offset,
2250                            Register super_klass,
2251                            Label& L_success) {
2252     assert_different_registers(sub_klass, super_check_offset, super_klass);
2253 
2254     BLOCK_COMMENT("type_check:");
2255 
2256     Label L_miss;
2257 


3772 
3773     const XMMRegister msgtmp1 = xmm4;
3774     const XMMRegister msgtmp2 = xmm5;
3775     const XMMRegister msgtmp3 = xmm6;
3776     const XMMRegister msgtmp4 = xmm7;
3777 
3778     const XMMRegister shuf_mask = xmm8;
3779 
3780     __ enter();
3781 
3782     __ subptr(rsp, 4 * wordSize);
3783 
3784     if (VM_Version::supports_sha()) {
3785       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3786         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3787     } else if (VM_Version::supports_avx2()) {
3788       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3789         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3790     }
3791     __ addptr(rsp, 4 * wordSize);
3792     __ vzeroupper();
3793     __ leave();
3794     __ ret(0);
3795     return start;
3796   }
3797 
3798   address generate_sha512_implCompress(bool multi_block, const char *name) {
3799     assert(VM_Version::supports_avx2(), "");
3800     assert(VM_Version::supports_bmi2(), "");
3801     __ align(CodeEntryAlignment);
3802     StubCodeMark mark(this, "StubRoutines", name);
3803     address start = __ pc();
3804 
3805     Register buf = c_rarg0;
3806     Register state = c_rarg1;
3807     Register ofs = c_rarg2;
3808     Register limit = c_rarg3;
3809 
3810     const XMMRegister msg = xmm0;
3811     const XMMRegister state0 = xmm1;
3812     const XMMRegister state1 = xmm2;
3813     const XMMRegister msgtmp0 = xmm3;
3814     const XMMRegister msgtmp1 = xmm4;
3815     const XMMRegister msgtmp2 = xmm5;
3816     const XMMRegister msgtmp3 = xmm6;
3817     const XMMRegister msgtmp4 = xmm7;
3818 
3819     const XMMRegister shuf_mask = xmm8;
3820 
3821     __ enter();
3822 
3823     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3824     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3825 
3826     __ vzeroupper();
3827     __ leave();
3828     __ ret(0);
3829     return start;
3830   }
3831 
3832   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
3833   // to hide instruction latency
3834   //
3835   // Arguments:
3836   //
3837   // Inputs:
3838   //   c_rarg0   - source byte array address
3839   //   c_rarg1   - destination byte array address
3840   //   c_rarg2   - K (key) in little endian int array
3841   //   c_rarg3   - counter vector byte array address
3842   //   Linux
3843   //     c_rarg4   -          input length
3844   //     c_rarg5   -          saved encryptedCounter start
3845   //     rbp + 6 * wordSize - saved used length
3846   //   Windows


4280     __ movdqu(xmm_temp4, xmm_temp3);
4281     __ movdqu(xmm_temp5, xmm_temp3);
4282     __ psrld(xmm_temp2, 1);     // packed right shifting >> 1
4283     __ psrld(xmm_temp4, 2);     // packed right shifting >> 2
4284     __ psrld(xmm_temp5, 7);     // packed right shifting >> 7
4285     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4286     __ pxor(xmm_temp2, xmm_temp5);
4287     __ pxor(xmm_temp2, xmm_temp8);
4288     __ pxor(xmm_temp3, xmm_temp2);
4289     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
4290 
4291     __ decrement(blocks);
4292     __ jcc(Assembler::zero, L_exit);
4293     __ movdqu(xmm_temp0, xmm_temp6);
4294     __ addptr(data, 16);
4295     __ jmp(L_ghash_loop);
4296 
4297     __ BIND(L_exit);
4298     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4299     __ movdqu(Address(state, 0), xmm_temp6);   // store the result

4300     __ leave();
4301     __ ret(0);
4302     return start;
4303   }
4304 
4305   /**
4306    *  Arguments:
4307    *
4308    * Inputs:
4309    *   c_rarg0   - int crc
4310    *   c_rarg1   - byte* buf
4311    *   c_rarg2   - int length
4312    *
4313    * Output:
4314    *       rax   - int crc result
4315    */
4316   address generate_updateBytesCRC32() {
4317     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4318 
4319     __ align(CodeEntryAlignment);
4320     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4321 
4322     address start = __ pc();
4323     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4324     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4325     // rscratch1: r10
4326     const Register crc   = c_rarg0;  // crc
4327     const Register buf   = c_rarg1;  // source java byte array address
4328     const Register len   = c_rarg2;  // length
4329     const Register table = c_rarg3;  // crc_table address (reuse register)
4330     const Register tmp   = r11;
4331     assert_different_registers(crc, buf, len, table, tmp, rax);
4332 
4333     BLOCK_COMMENT("Entry:");
4334     __ enter(); // required for proper stackwalking of RuntimeStub frame
4335 
4336     __ kernel_crc32(crc, buf, len, table, tmp);
4337 
4338     __ movl(rax, crc);
4339     __ vzeroupper();
4340     __ leave(); // required for proper stackwalking of RuntimeStub frame
4341     __ ret(0);
4342 
4343     return start;
4344   }
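
generate_updateBytesCRC32() drops into kernel_crc32, a folded, table- and CLMUL-assisted implementation of the CRC-32 defined by java.util.zip.CRC32 (reflected polynomial 0xEDB88320). For reference, a plain bit-at-a-time version of that checksum is shown below; the pre/post inversion here is the conventional formulation, and where exactly the stub applies it may differ:

#include <cstdint>
#include <cstddef>

// Bitwise CRC-32 (reflected polynomial 0xEDB88320), the checksum the
// accelerated stub computes with table lookups and carry-less multiplies.
static uint32_t crc32_reference(uint32_t crc, const uint8_t* buf, size_t len) {
  crc = ~crc;
  for (size_t i = 0; i < len; ++i) {
    crc ^= buf[i];
    for (int bit = 0; bit < 8; ++bit) {
      crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
    }
  }
  return ~crc;
}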
4345 
4346   /**
4347   *  Arguments:
4348   *
4349   * Inputs:
4350   *   c_rarg0   - int crc
4351   *   c_rarg1   - byte* buf
4352   *   c_rarg2   - long length
4353   *   c_rarg3   - table_start - optional (present only when doing a library_call,
4354   *              not used by x86 algorithm)
4355   *
4356   * Output:
4357   *       rax   - int crc result
4358   */
4359   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {


4379       const Register z = r8;
4380 #endif
4381       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
4382 
4383       BLOCK_COMMENT("Entry:");
4384       __ enter(); // required for proper stackwalking of RuntimeStub frame
4385 #ifdef _WIN64
4386       __ push(y);
4387       __ push(z);
4388 #endif
4389       __ crc32c_ipl_alg2_alt2(crc, buf, len,
4390                               a, j, k,
4391                               l, y, z,
4392                               c_farg0, c_farg1, c_farg2,
4393                               is_pclmulqdq_supported);
4394       __ movl(rax, crc);
4395 #ifdef _WIN64
4396       __ pop(z);
4397       __ pop(y);
4398 #endif
4399       __ vzeroupper();
4400       __ leave(); // required for proper stackwalking of RuntimeStub frame
4401       __ ret(0);
4402 
4403       return start;
4404   }
4405 
4406   /**
4407    *  Arguments:
4408    *
4409    *  Input:
4410    *    c_rarg0   - x address
4411    *    c_rarg1   - x length
4412    *    c_rarg2   - y address
4413    *    c_rarg3   - y length
4414    * not Win64
4415    *    c_rarg4   - z address
4416    *    c_rarg5   - z length
4417    * Win64
4418    *    rsp+40    - z address
4419    *    rsp+48    - z length


4494     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
4495 
4496     const Register tmp1 = r10;
4497     const Register tmp2 = r11;
4498 #endif
4499 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4500     const Register obja = c_rarg0;   //U:rdi
4501     const Register objb = c_rarg1;   //U:rsi
4502     const Register length = c_rarg2; //U:rdx
4503     const Register scale = c_rarg3;  //U:rcx
4504     const Register tmp1 = r8;
4505     const Register tmp2 = r9;
4506 #endif
4507     const Register result = rax; //return value
4508     const XMMRegister vec0 = xmm0;
4509     const XMMRegister vec1 = xmm1;
4510     const XMMRegister vec2 = xmm2;
4511 
4512     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
4513 
4514     __ vzeroupper();
4515     __ leave();
4516     __ ret(0);
4517 
4518     return start;
4519   }
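
vectorized_mismatch() compares the two ranges many bytes at a time with SSE/AVX loads and reports where they first differ. A deliberately simplified scalar equivalent is sketched below; note that the real intrinsic's return convention (when no mismatch is found it encodes how many tail elements were left unexamined) is more involved than the plain "index or -1" used here:

#include <cstdint>
#include <cstddef>
#include <cstring>

// Simplified scalar model: return the index of the first element (of size
// 2^log2_scale bytes) that differs, or -1 if all 'length' elements match.
static long mismatch_reference(const uint8_t* a, const uint8_t* b,
                               size_t length, int log2_scale) {
  const size_t elem_size = static_cast<size_t>(1) << log2_scale;
  for (size_t i = 0; i < length; ++i) {
    if (std::memcmp(a + i * elem_size, b + i * elem_size, elem_size) != 0) {
      return static_cast<long>(i);
    }
  }
  return -1;
}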
4520 
4521   /**
4522    *  Arguments:
4523    *
4524    *  Input:
4525    *    c_rarg0   - x address
4526    *    c_rarg1   - x length
4527    *    c_rarg2   - z address
4528    *    c_rarg3   - z length
4529    *
4530    */
4531   address generate_squareToLen() {
4532 
4533     __ align(CodeEntryAlignment);
4534     StubCodeMark mark(this, "StubRoutines", "squareToLen");

