1889 // ignored
1890 // is_oop - true => oop array, so generate store check code
1891 // name - stub name string
1892 //
1893 // Inputs:
1894 // c_rarg0 - source array address
1895 // c_rarg1 - destination array address
1896 // c_rarg2 - element count, treated as ssize_t, can be zero
1897 //
1898 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1899 // the hardware handle it. The two dwords within qwords that span
1900 // cache line boundaries will still be loaded and stored atomically.
1901 //
1902 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1903 address *entry, const char *name,
1904 bool dest_uninitialized = false) {
1905 __ align(CodeEntryAlignment);
1906 StubCodeMark mark(this, "StubRoutines", name);
1907 address start = __ pc();
1908
1909 Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
1910 const Register from = rdi; // source array address
1911 const Register to = rsi; // destination array address
1912 const Register count = rdx; // elements count
1913 const Register dword_count = rcx;
1914 const Register qword_count = count;
1915
1916 __ enter(); // required for proper stackwalking of RuntimeStub frame
1917 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1918
1919 if (entry != NULL) {
1920 *entry = __ pc();
1921 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1922 BLOCK_COMMENT("Entry:");
1923 }
1924
1925 array_overlap_test(nooverlap_target, Address::times_4);
1926 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1927 // r9 and r10 may be used to save non-volatile registers
1928
1929 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
3831 const XMMRegister xmm_result2 = xmm7;
3832 const XMMRegister xmm_result3 = xmm8;
3833 const XMMRegister xmm_result4 = xmm9;
3834 const XMMRegister xmm_result5 = xmm10;
3835
3836 const XMMRegister xmm_from0 = xmm11;
3837 const XMMRegister xmm_from1 = xmm12;
3838 const XMMRegister xmm_from2 = xmm13;
3839 const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64.
3840 const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text
3841 const XMMRegister xmm_from5 = xmm4;
3842
3843 //for key_128, key_192, key_256
3844 const int rounds[3] = {10, 12, 14};
3845 Label L_exit_preLoop, L_preLoop_start;
3846 Label L_multiBlock_loopTop[3];
3847 Label L_singleBlockLoopTop[3];
3848 Label L__incCounter[3][6]; //for 6 blocks
3849 Label L__incCounter_single[3]; //for single block, key128, key192, key256
3850 Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
3851 Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
3852
3853 Label L_exit;
3854
3855 __ enter(); // required for proper stackwalking of RuntimeStub frame
3856
3857 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3858 // context for the registers used, where all instructions below are using 128-bit mode
3859 // On EVEX without VL and BW, these instructions will all be AVX.
3860 if (VM_Version::supports_avx512vlbw()) {
3861 __ movl(rax, 0xffff);
3862 __ kmovql(k1, rax);
3863 }
3864
3865 #ifdef _WIN64
3866 // allocate spill slots for r13, r14
3867 enum {
3868 saved_r13_offset,
3869 saved_r14_offset
3870 };
3871 __ subptr(rsp, 2 * wordSize);
|
1889 // ignored
1890 // is_oop - true => oop array, so generate store check code
1891 // name - stub name string
1892 //
1893 // Inputs:
1894 // c_rarg0 - source array address
1895 // c_rarg1 - destination array address
1896 // c_rarg2 - element count, treated as ssize_t, can be zero
1897 //
1898 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1899 // the hardware handle it. The two dwords within qwords that span
1900 // cache line boundaries will still be loaded and stored atomically.
1901 //
1902 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1903 address *entry, const char *name,
1904 bool dest_uninitialized = false) {
1905 __ align(CodeEntryAlignment);
1906 StubCodeMark mark(this, "StubRoutines", name);
1907 address start = __ pc();
1908
1909 Label L_copy_bytes, L_copy_8_bytes, L_exit;
1910 const Register from = rdi; // source array address
1911 const Register to = rsi; // destination array address
1912 const Register count = rdx; // elements count
1913 const Register dword_count = rcx;
1914 const Register qword_count = count;
1915
1916 __ enter(); // required for proper stackwalking of RuntimeStub frame
1917 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1918
1919 if (entry != NULL) {
1920 *entry = __ pc();
1921 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1922 BLOCK_COMMENT("Entry:");
1923 }
1924
1925 array_overlap_test(nooverlap_target, Address::times_4);
1926 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1927 // r9 and r10 may be used to save non-volatile registers
1928
1929 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
3831 const XMMRegister xmm_result2 = xmm7;
3832 const XMMRegister xmm_result3 = xmm8;
3833 const XMMRegister xmm_result4 = xmm9;
3834 const XMMRegister xmm_result5 = xmm10;
3835
3836 const XMMRegister xmm_from0 = xmm11;
3837 const XMMRegister xmm_from1 = xmm12;
3838 const XMMRegister xmm_from2 = xmm13;
3839 const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64.
3840 const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text
3841 const XMMRegister xmm_from5 = xmm4;
3842
3843 //for key_128, key_192, key_256
3844 const int rounds[3] = {10, 12, 14};
3845 Label L_exit_preLoop, L_preLoop_start;
3846 Label L_multiBlock_loopTop[3];
3847 Label L_singleBlockLoopTop[3];
3848 Label L__incCounter[3][6]; //for 6 blocks
3849 Label L__incCounter_single[3]; //for single block, key128, key192, key256
3850 Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
3851 Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
3852
3853 Label L_exit;
3854
3855 __ enter(); // required for proper stackwalking of RuntimeStub frame
3856
3857 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3858 // context for the registers used, where all instructions below are using 128-bit mode
3859 // On EVEX without VL and BW, these instructions will all be AVX.
3860 if (VM_Version::supports_avx512vlbw()) {
3861 __ movl(rax, 0xffff);
3862 __ kmovql(k1, rax);
3863 }
3864
3865 #ifdef _WIN64
3866 // allocate spill slots for r13, r14
3867 enum {
3868 saved_r13_offset,
3869 saved_r14_offset
3870 };
3871 __ subptr(rsp, 2 * wordSize);
|