@@ -5677,40 +5677,281 @@
     __ enter(); // required for proper stackwalking of RuntimeStub frame

     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
                        // len => rcx, k => r8
                        // r9 and r10 may be used to save non-volatile registers
 #ifdef _WIN64
     // last argument is on stack on Win64
     __ movl(k, Address(rsp, 6 * wordSize));
 #endif
     __ movptr(r11, rdx); // move offset in rdx to offset(r11)
     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);

     restore_arg_regs();

     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);

     return start;
   }

+  address generate_bigIntegerRightShift() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
+
+    address start = __ pc();
+    Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
+    // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
+    const Register newArr = rdi;
+    const Register oldArr = rsi;
+    const Register newIdx = rdx;
+    const Register shiftCount = rcx;  // shiftCount is deliberately placed in rcx, since variable shift instructions use cl implicitly.
+    const Register totalNumIter = r8;
+
+    // On Windows, r9 and r10 are used by setup_arg_regs() to save rdi and rsi, so we cannot allocate them as temps.
+    // Everywhere else we prefer r9 and r10, since they do not have to be saved before use.
+    const Register tmp1 = r11;                    // Caller save.
+    const Register tmp2 = rax;                    // Caller save.
+    const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
+    const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
+    const Register tmp5 = r14;                    // Callee save.
+    const Register tmp6 = r15;                    // Callee save (currently unused).
+
+    const XMMRegister x0 = xmm0;
+    const XMMRegister x1 = xmm1;
+    const XMMRegister x2 = xmm2;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WINDOWS
+    setup_arg_regs(4);
+    // On Windows the last argument is on the stack, so move it into the appropriate register.
+    __ movl(totalNumIter, Address(rsp, 6 * wordSize));
+    // Save callee save registers.
+    __ push(tmp3);
+    __ push(tmp4);
+#endif
+    __ push(tmp5);
+
+    // Rename temps used throughout the code.
+    const Register idx = tmp1;
+    const Register nIdx = tmp2;
+
+    __ xorl(idx, idx);
+
+    // Start the right shift from the end of the array.
+    // For example, if #iterations = 4 and newIdx = 1,
+    // then dest[4] = src[4] >> shiftCount | src[3] << (32 - shiftCount);
+    // if #iterations = 4 and newIdx = 0,
+    // then dest[3] = src[4] >> shiftCount | src[3] << (32 - shiftCount).
+    __ movl(idx, totalNumIter);
+    __ movl(nIdx, idx);
+    __ addl(nIdx, newIdx);
+
+    // If VBMI2 is available, take the 512-bit loop when the iteration count is large enough to pay off.
+    // Otherwise fall through to ShiftTwo, which processes two iterations at a time.
+    if (VM_Version::supports_vbmi2()) {
+      __ cmpl(totalNumIter, (AVX3Threshold/64));
+      __ jcc(Assembler::less, ShiftTwo);
+
+      if (AVX3Threshold < 16 * 64) {
+        __ cmpl(totalNumIter, 16);
+        __ jcc(Assembler::less, ShiftTwo);
+      }
+      __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
+      __ subl(idx, 16);
+      __ subl(nIdx, 16);
+      __ BIND(Shift512Loop);
+      __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
+      __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
+      __ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
+      __ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
+      __ subl(nIdx, 16);
+      __ subl(idx, 16);
+      __ jcc(Assembler::greaterEqual, Shift512Loop);
+      __ addl(idx, 16);
+      __ addl(nIdx, 16);
+    }
+    __ BIND(ShiftTwo);
+    __ cmpl(idx, 2);
+    __ jcc(Assembler::less, ShiftOne);
+    __ subl(idx, 2);
+    __ subl(nIdx, 2);
+    __ BIND(ShiftTwoLoop);
+    __ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
+    __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
+    __ movl(tmp3, Address(oldArr, idx, Address::times_4));
+    __ shrdl(tmp5, tmp4);
+    __ shrdl(tmp4, tmp3);
+    __ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
+    __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
+    __ subl(nIdx, 2);
+    __ subl(idx, 2);
+    __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
+    __ addl(idx, 2);
+    __ addl(nIdx, 2);
+
+    // Do the last iteration.
+    __ BIND(ShiftOne);
+    __ cmpl(idx, 1);
+    __ jcc(Assembler::less, Exit);
+    __ subl(idx, 1);
+    __ subl(nIdx, 1);
+    __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
+    __ movl(tmp3, Address(oldArr, idx, Address::times_4));
+    __ shrdl(tmp4, tmp3);
+    __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
+    __ BIND(Exit);
+    // Restore callee save registers.
+    __ pop(tmp5);
+#ifdef _WINDOWS
+    __ pop(tmp4);
+    __ pop(tmp3);
+    restore_arg_regs();
+#endif
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
+
+  /**
+   *  Arguments:
+   *
+   *  Input:
+   *    c_rarg0   - newArr address
+   *    c_rarg1   - oldArr address
+   *    c_rarg2   - newIdx
+   *    c_rarg3   - shiftCount
+   *  not Win64
+   *    c_rarg4   - numIter
+   *  Win64
+   *    rsp + 40  - numIter
+   */
+  address generate_bigIntegerLeftShift() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
+    address start = __ pc();
+    Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
+    // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
+    const Register newArr = rdi;
+    const Register oldArr = rsi;
+    const Register newIdx = rdx;
+    const Register shiftCount = rcx;  // shiftCount is deliberately placed in rcx, since variable shift instructions use cl implicitly.
+    const Register totalNumIter = r8;
+    // On Windows, r9 and r10 are used by setup_arg_regs() to save rdi and rsi, so we cannot allocate them as temps.
+    // Everywhere else we prefer r9 and r10, since they do not have to be saved before use.
+    const Register tmp1 = r11;                    // Caller save.
+    const Register tmp2 = rax;                    // Caller save.
+    const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
+    const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
+    const Register tmp5 = r14;                    // Callee save.
+
+    const XMMRegister x0 = xmm0;
+    const XMMRegister x1 = xmm1;
+    const XMMRegister x2 = xmm2;
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WINDOWS
+    setup_arg_regs(4);
+    // On Windows the last argument is on the stack, so move it into the appropriate register.
+    __ movl(totalNumIter, Address(rsp, 6 * wordSize));
+    // Save callee save registers.
+    __ push(tmp3);
+    __ push(tmp4);
+#endif
+    __ push(tmp5);
+
+    // Rename temps used throughout the code.
+    const Register idx = tmp1;
+    const Register numIterTmp = tmp2;
+
+    // Start idx from zero.
+    __ xorl(idx, idx);
+    // Compute an interior pointer into the new array, so that the same index can be used for both the old and new arrays.
+    __ lea(newArr, Address(newArr, newIdx, Address::times_4));
+    __ movl(numIterTmp, totalNumIter);
+
+    // If VBMI2 is available, take the 512-bit loop when the iteration count is large enough to pay off.
+    // Otherwise fall through to ShiftTwo, which shifts two numbers at a time.
+    if (VM_Version::supports_vbmi2()) {
+      __ cmpl(totalNumIter, (AVX3Threshold/64));
+      __ jcc(Assembler::less, ShiftTwo);
+
+      if (AVX3Threshold < 16 * 64) {
+        __ cmpl(totalNumIter, 16);
+        __ jcc(Assembler::less, ShiftTwo);
+      }
+      __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
+      __ subl(numIterTmp, 16);
+      __ BIND(Shift512Loop);
+      __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
+      __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
+      __ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
+      __ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
+      __ addl(idx, 16);
+      __ subl(numIterTmp, 16);
+      __ jcc(Assembler::greaterEqual, Shift512Loop);
+      __ addl(numIterTmp, 16);
+    }
+    __ BIND(ShiftTwo);
+    __ cmpl(totalNumIter, 1);
+    __ jcc(Assembler::less, Exit);
+    __ movl(tmp3, Address(oldArr, idx, Address::times_4));
+    __ subl(numIterTmp, 2);
+    __ jcc(Assembler::less, ShiftOne);
+
+    __ BIND(ShiftTwoLoop);
+    __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
+    __ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
+    __ shldl(tmp3, tmp4);
+    __ shldl(tmp4, tmp5);
+    __ movl(Address(newArr, idx, Address::times_4), tmp3);
+    __ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
+    __ movl(tmp3, tmp5);
+    __ addl(idx, 2);
+    __ subl(numIterTmp, 2);
+    __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
+
+    // Do the last iteration.
+    __ BIND(ShiftOne);
+    __ addl(numIterTmp, 2);
+    __ cmpl(numIterTmp, 1);
+    __ jcc(Assembler::less, Exit);
+    __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
+    __ shldl(tmp3, tmp4);
+    __ movl(Address(newArr, idx, Address::times_4), tmp3);
+
+    __ BIND(Exit);
+    // Restore callee save registers.
+    __ pop(tmp5);
+#ifdef _WINDOWS
+    __ pop(tmp4);
+    __ pop(tmp3);
+    restore_arg_regs();
+#endif
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
+
   address generate_libmExp() {
     StubCodeMark mark(this, "StubRoutines", "libmExp");

     address start = __ pc();

     const XMMRegister x0 = xmm0;
     const XMMRegister x1 = xmm1;
     const XMMRegister x2 = xmm2;
     const XMMRegister x3 = xmm3;

     const XMMRegister x4 = xmm4;
     const XMMRegister x5 = xmm5;
     const XMMRegister x6 = xmm6;
     const XMMRegister x7 = xmm7;

     const Register tmp = r11;

     BLOCK_COMMENT("Entry:");
     __ enter(); // required for proper stackwalking of RuntimeStub frame

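Both workers implement a 32-bit funnel shift across adjacent words of the magnitude array: the scalar tail combines two neighboring ints per iteration with SHRD/SHLD, and the VBMI2 path does the same for sixteen ints per iteration with VPSHRDVD/VPSHLDVD. As a reading aid, here is a minimal scalar sketch of the values the two stubs compute for 0 < shiftCount < 32 (a hypothetical reference, not part of the patch; the function names and the uint32_t view of the Java int[] arrays are assumptions):

  #include <cstdint>

  // Hypothetical scalar reference for generate_bigIntegerRightShift():
  // each output word is the funnel shift of oldArr[i]:oldArr[i + 1].
  static void bigIntegerRightShiftRef(uint32_t* newArr, const uint32_t* oldArr,
                                      int newIdx, int shiftCount, int numIter) {
    for (int i = numIter - 1; i >= 0; i--) {
      // Low bits come from oldArr[i + 1]; high bits spill in from oldArr[i].
      newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount) |
                           (oldArr[i] << (32 - shiftCount));
    }
  }

  // Hypothetical scalar reference for generate_bigIntegerLeftShift():
  // low bits of each result come from the less significant word oldArr[i + 1].
  static void bigIntegerLeftShiftRef(uint32_t* newArr, const uint32_t* oldArr,
                                     int newIdx, int shiftCount, int numIter) {
    for (int i = 0; i < numIter; i++) {
      newArr[newIdx + i] = (oldArr[i] << shiftCount) |
                           (oldArr[i + 1] >> (32 - shiftCount));
    }
  }

Note that both loops, like the stubs, read oldArr[i + 1] in their final iteration, so the source array must hold at least numIter + 1 words; that invariant has to be guaranteed by the Java-side callers of the intrinsic.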
@@ -6296,40 +6296,44 @@
     generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                                                    &StubRoutines::_safefetch32_fault_pc,
                                                    &StubRoutines::_safefetch32_continuation_pc);
     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                        &StubRoutines::_safefetchN_fault_pc,
                                                        &StubRoutines::_safefetchN_continuation_pc);

     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
     if (bs_nm != NULL) {
       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
     }
 #ifdef COMPILER2
     if (UseMultiplyToLenIntrinsic) {
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
     }
     if (UseSquareToLenIntrinsic) {
       StubRoutines::_squareToLen = generate_squareToLen();
     }
     if (UseMulAddIntrinsic) {
       StubRoutines::_mulAdd = generate_mulAdd();
     }
+    if (VM_Version::supports_vbmi2()) {
+      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
+      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
+    }
 #ifndef _WINDOWS
     if (UseMontgomeryMultiplyIntrinsic) {
       StubRoutines::_montgomeryMultiply
         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
     }
     if (UseMontgomerySquareIntrinsic) {
       StubRoutines::_montgomerySquare
         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
     }
 #endif // WINDOWS
 #endif // COMPILER2

     if (UseVectorizedMismatchIntrinsic) {
       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
     }
   }

  public:
   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
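Registration is gated solely on VM_Version::supports_vbmi2(); the per-call size dispatch happens inside the stubs themselves. A minimal sketch of that dispatch, assuming AVX3Threshold is the existing byte-granularity VM flag (the helper name is hypothetical):

  #include <algorithm>

  // Hypothetical predicate mirroring the checks at the top of both stubs:
  // Shift512Loop consumes sixteen 32-bit words per iteration, so it is
  // entered only for inputs past the AVX3Threshold heuristic that are at
  // least one full vector long; everything else takes the SHRD/SHLD path.
  static bool takesVectorShiftPath(bool cpu_has_vbmi2, int totalNumIter,
                                   int avx3Threshold) {
    if (!cpu_has_vbmi2) {
      return false;  // the stub is emitted without the 512-bit loop entirely
    }
    return totalNumIter >= std::max(avx3Threshold / 64, 16);
  }

Writing the gate this way also shows why the second cmpl is only emitted when AVX3Threshold < 16 * 64: above that, AVX3Threshold / 64 alone already guarantees a full 16-int vector per iteration.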