    //
    // so, given that the product we're reducing is
    //    a == lo + hi * z^128
    // substituting,
    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
    //
    // we reduce by multiplying hi by p(z) and subtracting the result
    // from (i.e. XORing it with) lo.  Because p has no nonzero high
    // bits we can do this with two 64-bit multiplications, lo*p and
    // hi*p.

    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
    __ ext(t1, __ T16B, t0, z, 8);
    __ eor(hi, __ T16B, hi, t1);
    __ ext(t1, __ T16B, z, t0, 8);
    __ eor(lo, __ T16B, lo, t1);
    __ pmull(t0, __ T1Q, hi, p, __ T1D);
    __ eor(result, __ T16B, lo, t0);
  }
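
  // For reference only, a scalar sketch of the reduction performed above (it
  // is not the code this stub emits).  clmul64() stands in for a hypothetical
  // 64x64->128-bit carry-less multiply, P for p(z) = z^7+z^2+z+1 held in a
  // 64-bit word, and hi = h1:h0 is split into 64-bit halves:
  //
  //   u128 t  = clmul64(h1, P);       // h1 * p, at most 71 bits wide
  //   u64  u0 = (u64)t;               // the part of h1*p*z^64 already < z^128
  //   u64  u1 = (u64)(t >> 64);       // the part still >= z^128
  //   lo     ^= (u128)u0 << 64;
  //   t       = clmul64(h0 ^ u1, P);  // fold u1 together with h0 in one multiply
  //   result  = lo ^ t;               // == a mod (z^128 + p(z))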

  address generate_has_negatives(address &has_negatives_long) {
    StubCodeMark mark(this, "StubRoutines", "has_negatives");
    const int large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
    int dcache_line = VM_Version::dcache_line_size();

    Register ary1 = r1, len = r2, result = r0;

    __ align(CodeEntryAlignment);
    address entry = __ pc();

    __ enter();

    Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
          LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

    __ cmp(len, 15);
    __ br(Assembler::GT, LEN_OVER_15);
    // The only case when execution falls into this code is when the pointer is
    // near the end of a memory page and we have to avoid reading the next page.
    __ add(ary1, ary1, len);
    __ subs(len, len, 8);
    __ br(Assembler::GT, LEN_OVER_8);
    __ ldr(rscratch2, Address(ary1, -8));
    __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
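    // The load above read the 8 bytes that end at the last byte of the array,
    // so its low-order bytes may lie before the array.  The shift below drops
    // those bytes and keeps only the valid tail for the sign-bit test.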
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8);  // no data dependency, so the sub can execute while the load is in flight
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    has_negatives_long = __ pc();  // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15);  // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2);  // number of bytes until the next aligned address
    __ add(ary1, ary1, rscratch1);
    __ sub(len, len, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load in the pre-loop as an early return, to handle the
    // case where an initially aligned large array has negative values in its
    // first bytes: otherwise LARGE_LOOP would do 4 reads instead of 1 in the
    // worst case, which is slower.  Cases with negative bytes further ahead
    // won't be affected much; in fact they will be faster due to the early
    // loads and the fewer instructions and branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue the load instructions first, since that can save a few CPU/memory
    // cycles.  Also, instead of 4 "orr(...); andr(...); br(NE, ...)" triples
    // (one per ldp), generate 7 orr(...) plus a single test-and-branch, which
    // saves 3 instructions and has fewer branches.  The downside is that this
    // disables early return, so all 64 bytes are loaded and checked every time.
    __ ldp(tmp2, tmp3, Address(ary1));
    __ ldp(tmp4, tmp5, Address(ary1, 16));
    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
    __ ldp(tmp6, tmp1, Address(ary1, 48));
    __ add(ary1, ary1, large_loop_size);
    __ sub(len, len, large_loop_size);
    __ orr(tmp2, tmp2, tmp3);
    __ orr(tmp4, tmp4, tmp5);
    __ orr(rscratch1, rscratch1, rscratch2);
    __ orr(tmp6, tmp6, tmp1);
    __ orr(tmp2, tmp2, tmp4);
    __ orr(rscratch1, rscratch1, tmp6);
    __ orr(tmp2, tmp2, rscratch1);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, large_loop_size);
    __ br(Assembler::GE, LARGE_LOOP);

    __ bind(CHECK_16);  // small 16-byte load pre-loop
    __ cmp(len, 16);
    __ br(Assembler::LT, POST_LOOP16);

    __ bind(LOOP16);  // small 16-byte load loop
    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp2, tmp2, tmp3);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, 16);
    __ br(Assembler::GE, LOOP16);  // 16-byte load loop end

    __ bind(POST_LOOP16);  // 16-byte aligned, so we can read unconditionally
    __ cmp(len, 8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ sub(len, len, 8);
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);

    __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_FALSE);  // can't shift left by 64 when len == 0
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    __ sub(tmp4, tmp2, len, __ LSL, 3);
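    // tmp1 holds 8 bytes, of which only the low "len" bytes are inside the
    // array.  Shifting left by (64 - len*8) bits drops the bytes read past the
    // end while keeping byte boundaries intact, so the mask test below only
    // sees the valid bytes (e.g. len == 3 shifts left by 40 bits).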
    __ lslv(tmp1, tmp1, tmp4);
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    // Fallthrough

    __ bind(RET_FALSE);
    __ pop(spilled_regs, sp);
    __ leave();
    __ mov(result, zr);
    __ ret(lr);

    __ bind(RET_TRUE);
    __ pop(spilled_regs, sp);
    __ bind(RET_TRUE_NO_POP);
    __ leave();
    __ mov(result, 1);
    __ ret(lr);

    __ bind(DONE);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return entry;
  }
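
  // For reference, a scalar sketch of what this stub computes (illustration
  // only): the result is 1 iff some byte of the array has its sign bit set.
  // The wide loops above test 8 or 64 bytes per iteration by OR-ing 64-bit
  // words together and masking with UPPER_BIT_MASK.
  //
  //   bool has_negatives(const jbyte* ary, size_t len) {
  //     for (size_t i = 0; i < len; i++) {
  //       if (ary[i] < 0) return true;
  //     }
  //     return false;
  //   }
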
  /**
   * Arguments:
   *
   * Input:
   * c_rarg0   - current state address
   * c_rarg1   - H key address
   * c_rarg2   - data address
   * c_rarg3   - number of blocks
   *
   * Output:
   * Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse

  // ...
    //     assert(Ra == Pa_base[j], "must be");
    //     MACC(Ra, Ra, t0, t1, t2);
    //   }
    //   iters = (2*len-i)/2;
    //   assert(iters == len-j, "must be");
    //   for (; iters--; j++) {
    //     assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);
    //     Rm = *++Pm;
    //     Rn = *--Pn;
    //   }
    //   Pm_base[i-len] = t0;
    //   t0 = t1; t1 = t2; t2 = 0;
    // }

    // while (t0)
    //   t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };
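
  // MACC, used in the commented-out reference code above, is not shown in
  // this excerpt.  As an illustration only (an assumption about its intent,
  // not its actual definition), its effect can be sketched as accumulating
  // the 128-bit product a*b into the three-word accumulator t2:t1:t0:
  //
  //   unsigned __int128 p = (unsigned __int128)a * b + t0;
  //   t0  = (unsigned long)p;
  //   p   = (p >> 64) + t1;
  //   t1  = (unsigned long)p;
  //   t2 += (unsigned long)(p >> 64);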

  // Initialization
  void generate_initial() {
    // Generate the initial stubs and initialize the entry points.

    // Entry points that exist on all platforms.  Note: this is code that
    // could be shared among different platforms - however the benefit seems
    // to be smaller than the disadvantage of having a much more complicated
    // generator structure.  See also the comment in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_StackOverflowError));

    // ...

    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#ifndef BUILTIN_SIM