
## src/cpu/aarch64/vm/stubGenerator_aarch64.cpp


Old version (excerpts):

```
    //
    // so, given that the product we're reducing is
    //    a == lo + hi * z^128
    // substituting,
    //    === lo - hi * p(z)  (mod (z^128 + p(z)))
    //
    // we reduce by multiplying hi by p(z) and subtracting the result
    // from (i.e. XORing it with) lo.  Because p has no nonzero high
    // bits we can do this with two 64-bit multiplications, lo*p and
    // hi*p.

    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
    __ ext(t1, __ T16B, t0, z, 8);
    __ eor(hi, __ T16B, hi, t1);
    __ ext(t1, __ T16B, z, t0, 8);
    __ eor(lo, __ T16B, lo, t1);
    __ pmull(t0, __ T1Q, hi, p, __ T1D);
    __ eor(result, __ T16B, lo, t0);
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - current state address
   *   c_rarg1   - H key address
   *   c_rarg2   - data address
   *   c_rarg3   - number of blocks
   *
   * Output:
   *   Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
```

```
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };

  // Initialization
  void generate_initial() {
    // Generate initial stubs and initializes the entry points

    // entry points that exist in all platforms Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
```

```
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#ifndef BUILTIN_SIM
```

New version (excerpts):

```
    //
    // so, given that the product we're reducing is
    //    a == lo + hi * z^128
    // substituting,
    //    === lo - hi * p(z)  (mod (z^128 + p(z)))
    //
    // we reduce by multiplying hi by p(z) and subtracting the result
    // from (i.e. XORing it with) lo.  Because p has no nonzero high
    // bits we can do this with two 64-bit multiplications, lo*p and
    // hi*p.

    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
    __ ext(t1, __ T16B, t0, z, 8);
    __ eor(hi, __ T16B, hi, t1);
    __ ext(t1, __ T16B, z, t0, 8);
    __ eor(lo, __ T16B, lo, t1);
    __ pmull(t0, __ T1Q, hi, p, __ T1D);
    __ eor(result, __ T16B, lo, t0);
  }

  address generate_has_negatives(address &has_negatives_long) {
    StubCodeMark mark(this, "StubRoutines", "has_negatives");
    const int large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    int dcache_line = VM_Version::dcache_line_size();

    Register ary1 = r1, len = r2, result = r0;

    __ align(CodeEntryAlignment);
    address entry = __ pc();

    __ enter();

    Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
        LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

    __ cmp(len, 15);
    __ br(Assembler::GT, LEN_OVER_15);
    // The only case when execution falls into this code is when pointer is near
    // the end of memory page and we have to avoid reading next page
    __ add(ary1, ary1, len);
    __ subs(len, len, 8);
    __ br(Assembler::GT, LEN_OVER_8);
    __ ldr(rscratch2, Address(ary1, -8));
    __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dep., then sub can be executed while loading
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    has_negatives_long = __ pc(); // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
    __ add(ary1, ary1, rscratch1);
    __ sub(len, len, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform 16-byte load as early return in pre-loop to handle situation
    // when initially aligned large array has negative values at starting bytes,
    // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
    // slower. Cases with negative bytes further ahead won't be affected that
    // much. In fact, it'll be faster due to early loads, less instructions and
    // less branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue load instructions first, since it can save few CPU/MEM cycles, also
    // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
    // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
    // instructions per cycle and have less branches, but this approach disables
    // early return, thus, all 64 bytes are loaded and checked every time.
    __ ldp(tmp2, tmp3, Address(ary1));
    __ ldp(tmp4, tmp5, Address(ary1, 16));
    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
    __ ldp(tmp6, tmp1, Address(ary1, 48));
    __ add(ary1, ary1, large_loop_size);
    __ sub(len, len, large_loop_size);
    __ orr(tmp2, tmp2, tmp3);
    __ orr(tmp4, tmp4, tmp5);
    __ orr(rscratch1, rscratch1, rscratch2);
    __ orr(tmp6, tmp6, tmp1);
    __ orr(tmp2, tmp2, tmp4);
    __ orr(rscratch1, rscratch1, tmp6);
    __ orr(tmp2, tmp2, rscratch1);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, large_loop_size);
    __ br(Assembler::GE, LARGE_LOOP);

    __ bind(CHECK_16); // small 16-byte load pre-loop
    __ cmp(len, 16);
    __ br(Assembler::LT, POST_LOOP16);

    __ bind(LOOP16); // small 16-byte load loop
    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp2, tmp2, tmp3);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, 16);
    __ br(Assembler::GE, LOOP16); // 16-byte load loop end

    __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
    __ cmp(len, 8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ sub(len, len, 8);
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);

    __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    __ sub(tmp4, tmp2, len, __ LSL, 3);
    __ lslv(tmp1, tmp1, tmp4);
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    // Fallthrough

    __ bind(RET_FALSE);
    __ pop(spilled_regs, sp);
    __ leave();
    __ mov(result, zr);
    __ ret(lr);

    __ bind(RET_TRUE);
    __ pop(spilled_regs, sp);
    __ bind(RET_TRUE_NO_POP);
    __ leave();
    __ mov(result, 1);
    __ ret(lr);

    __ bind(DONE);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return entry;
  }
  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - current state address
   *   c_rarg1   - H key address
   *   c_rarg2   - data address
   *   c_rarg3   - number of blocks
   *
   * Output:
   *   Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
```
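The excerpt above breaks off in the middle of the comment about GCM's bit order. As a point of reference only, "reversing the bits in each byte" of a 64-bit word can be sketched in plain C++ as below; this helper is not part of the change, and the stub itself gets the same effect with AArch64 vector instructions rather than scalar bit tricks.

```
// Illustrative sketch only: reverse the bits within each byte of a
// 64-bit word, turning GCM's big-endian bit order into the ordinary
// little-endian bit order.
#include <cstdint>

static uint64_t reverse_bits_in_each_byte(uint64_t x) {
  x = ((x & 0x5555555555555555ULL) << 1) | ((x >> 1) & 0x5555555555555555ULL); // swap adjacent bits
  x = ((x & 0x3333333333333333ULL) << 2) | ((x >> 2) & 0x3333333333333333ULL); // swap bit pairs
  x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL); // swap nibbles
  return x;
}
```

Under this mapping the byte 0x80 becomes 0x01, matching the comment's example: the GCM encoding of the polynomial 1 turns back into the ordinary integer 1.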
```
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };


  // Initialization
  void generate_initial() {
    // Generate initial stubs and initializes the entry points

    // entry points that exist in all platforms Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
```

```
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#ifndef BUILTIN_SIM
```
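For reference, the has_negatives stub registered above answers the same question as the scalar sketch below: does any byte in the range have its sign bit (0x80) set? The helper name and the standalone setting are assumptions for illustration; the real stub applies the same UPPER_BIT_MASK test to 8, 16 or 64 bytes per iteration and uses shifted loads for the tail.

```
// Minimal scalar sketch (not part of the change) of the predicate the
// has_negatives stub computes for a byte array.
#include <cstdint>
#include <cstring>
#include <cstddef>

static bool has_negatives_ref(const int8_t* ary, size_t len) {
  const uint64_t UPPER_BIT_MASK = 0x8080808080808080ULL;
  size_t i = 0;
  // Eight bytes at a time: a signed byte is negative exactly when its
  // top bit (0x80) is set, so ANDing the chunk with the mask finds any hit.
  for (; i + 8 <= len; i += 8) {
    uint64_t chunk;
    std::memcpy(&chunk, ary + i, sizeof chunk);
    if (chunk & UPPER_BIT_MASK) {
      return true;
    }
  }
  // Plain byte-by-byte tail; the stub instead uses shifted 8-byte loads
  // so it never reads past the end of the array (or into the next page).
  for (; i < len; i++) {
    if (ary[i] < 0) {
      return true;
    }
  }
  return false;
}
```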