
src/cpu/aarch64/vm/stubGenerator_aarch64.cpp

3653     //
3654     // so, given that the product we're reducing is
3655     //    a == lo + hi * z^128
3656     // substituting,
3657     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3658     //
3659     // we reduce by multiplying hi by p(z) and subtracting the result
3660     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3661     // bits we can do this with two 64-bit multiplications, lo*p and
3662     // hi*p.
3663 
3664     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3665     __ ext(t1, __ T16B, t0, z, 8);
3666     __ eor(hi, __ T16B, hi, t1);
3667     __ ext(t1, __ T16B, z, t0, 8);
3668     __ eor(lo, __ T16B, lo, t1);
3669     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3670     __ eor(result, __ T16B, lo, t0);
3671   }
3672 
3673   /**
3674    *  Arguments:
3675    *
3676    *  Input:
3677    *  c_rarg0   - current state address
3678    *  c_rarg1   - H key address
3679    *  c_rarg2   - data address
3680    *  c_rarg3   - number of blocks
3681    *
3682    *  Output:
3683    *  Updated state at c_rarg0
3684    */
3685   address generate_ghash_processBlocks() {
3686     // Bafflingly, GCM uses little-endian for the byte order, but
3687     // big-endian for the bit order.  For example, the polynomial 1 is
3688     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3689     //
3690     // So, we must either reverse the bytes in each word and do
3691     // everything big-endian or reverse the bits in each byte and do
3692     // it little-endian.  On AArch64 it's more idiomatic to reverse


4669     //       assert(Ra == Pa_base[j], "must be");
4670     //       MACC(Ra, Ra, t0, t1, t2);
4671     //     }
4672     //     iters =  (2*len-i)/2;
4673     //     assert(iters == len-j, "must be");
4674     //     for (; iters--; j++) {
4675     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4676     //       MACC(Rm, Rn, t0, t1, t2);
4677     //       Rm = *++Pm;
4678     //       Rn = *--Pn;
4679     //     }
4680     //     Pm_base[i-len] = t0;
4681     //     t0 = t1; t1 = t2; t2 = 0;
4682     //   }
4683 
4684     //   while (t0)
4685     //     t0 = sub(Pm_base, Pn_base, t0, len);
4686     // }
4687   };
4688 
4689   // Initialization
4690   void generate_initial() {
4691     // Generate initial stubs and initializes the entry points
4692 
4693     // Entry points that exist on all platforms. Note: this is code
4694     // that could be shared among different platforms; however, the
4695     // benefit seems to be smaller than the disadvantage of having a
4696     // much more complicated generator structure. See also the comment
4697     // in stubRoutines.hpp.
4698 
4699     StubRoutines::_forward_exception_entry = generate_forward_exception();
4700 
4701     StubRoutines::_call_stub_entry =
4702       generate_call_stub(StubRoutines::_call_stub_return_address);
4703 
4704     // is referenced by megamorphic call
4705     StubRoutines::_catch_exception_entry = generate_catch_exception();
4706 
4707     // Build this early so it's available for the interpreter.
4708     StubRoutines::_throw_StackOverflowError_entry =


4726     StubRoutines::_throw_AbstractMethodError_entry =
4727       generate_throw_exception("AbstractMethodError throw_exception",
4728                                CAST_FROM_FN_PTR(address,
4729                                                 SharedRuntime::
4730                                                 throw_AbstractMethodError));
4731 
4732     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4733       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4734                                CAST_FROM_FN_PTR(address,
4735                                                 SharedRuntime::
4736                                                 throw_IncompatibleClassChangeError));
4737 
4738     StubRoutines::_throw_NullPointerException_at_call_entry =
4739       generate_throw_exception("NullPointerException at call throw_exception",
4740                                CAST_FROM_FN_PTR(address,
4741                                                 SharedRuntime::
4742                                                 throw_NullPointerException_at_call));
4743 
4744     // arraycopy stubs used by compilers
4745     generate_arraycopy_stubs();
4746 
4747     if (UseMultiplyToLenIntrinsic) {
4748       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4749     }
4750 
4751     if (UseMontgomeryMultiplyIntrinsic) {
4752       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4753       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4754       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4755     }
4756 
4757     if (UseMontgomerySquareIntrinsic) {
4758       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4759       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4760       // We use generate_multiply() rather than generate_square()
4761       // because it's faster for the sizes of modulus we care about.
4762       StubRoutines::_montgomerySquare = g.generate_multiply();
4763     }
4764 
4765 #ifndef BUILTIN_SIM




3653     //
3654     // so, given that the product we're reducing is
3655     //    a == lo + hi * z^128
3656     // substituting,
3657     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3658     //
3659     // we reduce by multiplying hi by p(z) and subtracting the result
3660     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3661     // bits we can do this with two 64-bit multiplications, lo*p and
3662     // hi*p.
3663 
3664     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3665     __ ext(t1, __ T16B, t0, z, 8);
3666     __ eor(hi, __ T16B, hi, t1);
3667     __ ext(t1, __ T16B, z, t0, 8);
3668     __ eor(lo, __ T16B, lo, t1);
3669     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3670     __ eor(result, __ T16B, lo, t0);
3671   }
3672 
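As an aid to reading the NEON sequence above, here is a scalar C++ model of the same reduction (an illustration with made-up names, not code from this file). It assumes the GHASH field polynomial is z^128 + p(z) with p(z) = z^7 + z^2 + z + 1, and it works in the plain polynomial bit order, whereas the stub operates on bit-reversed data as discussed in generate_ghash_processBlocks below.

#include <cstdint>

// A 128-bit polynomial over GF(2); bit i of hi:lo is the coefficient of z^i.
struct Poly128 { uint64_t lo, hi; };

static Poly128 pxor(Poly128 a, Poly128 b) { return { a.lo ^ b.lo, a.hi ^ b.hi }; }

// Shift left by s bits (0 < s < 64), discarding anything shifted past bit 127.
static Poly128 shl(Poly128 a, unsigned s) {
  return { a.lo << s, (a.hi << s) | (a.lo >> (64 - s)) };
}

// Reduce the 256-bit carry-less product lo + hi*z^128 modulo z^128 + p(z),
// using the identity hi*z^128 == hi*p(z) (mod z^128 + p(z)).
static Poly128 ghash_reduce_model(Poly128 lo, Poly128 hi) {
  // First fold: hi*p(z) = hi + hi*z + hi*z^2 + hi*z^7 has degree <= 134;
  // XOR its low 128 bits into lo.
  Poly128 r = pxor(lo, pxor(hi, pxor(shl(hi, 1), pxor(shl(hi, 2), shl(hi, 7)))));
  // The (at most 7) bits of hi*p(z) that spilled past z^127.
  uint64_t spill = (hi.hi >> 57) ^ (hi.hi >> 62) ^ (hi.hi >> 63);
  // Second fold: spill*z^128 == spill*p(z) has degree <= 13, so nothing spills again.
  r.lo ^= spill ^ (spill << 1) ^ (spill << 2) ^ (spill << 7);
  return r;
}

The two folds illustrate the algebra the comment describes: because p(z) has no high bits, whatever spills past z^127 in the first fold is small enough that a second fold finishes the job. The stub reaches the same result with pmull/pmull2 and ext rather than explicit shifts.
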
3673   address generate_has_negatives(address &has_negatives_long) {
3674     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3675     const int large_loop_size = 64;
3676     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3677     int dcache_line = VM_Version::dcache_line_size();
3678 
3679     Register ary1 = r1, len = r2, result = r0;
3680 
3681     __ align(CodeEntryAlignment);
3682     address entry = __ pc();
3683 
3684     __ enter();
3685 
3686   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3687         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3688 
3689   __ cmp(len, 15);
3690   __ br(Assembler::GT, LEN_OVER_15);
3691   // The only case in which execution falls into this code is when the pointer
3692   // is near the end of a memory page and we must avoid reading the next page
3693   __ add(ary1, ary1, len);
3694   __ subs(len, len, 8);
3695   __ br(Assembler::GT, LEN_OVER_8);
3696   __ ldr(rscratch2, Address(ary1, -8));
3697   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3698   __ lsrv(rscratch2, rscratch2, rscratch1);
3699   __ tst(rscratch2, UPPER_BIT_MASK);
3700   __ cset(result, Assembler::NE);
3701   __ leave();
3702   __ ret(lr);
3703   __ bind(LEN_OVER_8);
3704   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3705   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3706   __ tst(rscratch2, UPPER_BIT_MASK);
3707   __ br(Assembler::NE, RET_TRUE_NO_POP);
3708   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3709   __ lsrv(rscratch1, rscratch1, rscratch2);
3710   __ tst(rscratch1, UPPER_BIT_MASK);
3711   __ cset(result, Assembler::NE);
3712   __ leave();
3713   __ ret(lr);
3714 
3715   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3716   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3717 
3718   has_negatives_long = __ pc(); // 2nd entry point
3719 
3720   __ enter();
3721 
3722   __ bind(LEN_OVER_15);
3723     __ push(spilled_regs, sp);
3724     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3725     __ cbz(rscratch2, ALIGNED);
3726     __ ldp(tmp6, tmp1, Address(ary1));
3727     __ mov(tmp5, 16);
3728     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
3729     __ add(ary1, ary1, rscratch1);
3730     __ sub(len, len, rscratch1);
3731     __ orr(tmp6, tmp6, tmp1);
3732     __ tst(tmp6, UPPER_BIT_MASK);
3733     __ br(Assembler::NE, RET_TRUE);
3734 
3735   __ bind(ALIGNED);
3736     __ cmp(len, large_loop_size);
3737     __ br(Assembler::LT, CHECK_16);
3738     // Perform a 16-byte load in the pre-loop as an early return to handle the
3739     // case where an initially aligned large array has negative values in its
3740     // first bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
3741     // worst case, which is slower. Arrays with negative bytes further ahead are
3742     // barely affected; in fact they get faster thanks to the early loads and the
3743     // fewer instructions and branches in LARGE_LOOP.
3744     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3745     __ sub(len, len, 16);
3746     __ orr(tmp6, tmp6, tmp1);
3747     __ tst(tmp6, UPPER_BIT_MASK);
3748     __ br(Assembler::NE, RET_TRUE);
3749     __ cmp(len, large_loop_size);
3750     __ br(Assembler::LT, CHECK_16);
3751 
3752     if (SoftwarePrefetchHintDistance >= 0
3753         && SoftwarePrefetchHintDistance >= dcache_line) {
3754       // initial prefetch
3755       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3756     }
3757   __ bind(LARGE_LOOP);
3758     if (SoftwarePrefetchHintDistance >= 0) {
3759       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3760     }
3761     // Issue the load instructions first, since that can save a few CPU/memory
3762     // cycles. Also, instead of one "orr; andr; cbnz" triple per ldp (4 in all),
3763     // generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
3764     // instructions per iteration and has fewer branches. The trade-off is that
3765     // early return is disabled, so all 64 bytes are loaded and checked every time.
3766     __ ldp(tmp2, tmp3, Address(ary1));
3767     __ ldp(tmp4, tmp5, Address(ary1, 16));
3768     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3769     __ ldp(tmp6, tmp1, Address(ary1, 48));
3770     __ add(ary1, ary1, large_loop_size);
3771     __ sub(len, len, large_loop_size);
3772     __ orr(tmp2, tmp2, tmp3);
3773     __ orr(tmp4, tmp4, tmp5);
3774     __ orr(rscratch1, rscratch1, rscratch2);
3775     __ orr(tmp6, tmp6, tmp1);
3776     __ orr(tmp2, tmp2, tmp4);
3777     __ orr(rscratch1, rscratch1, tmp6);
3778     __ orr(tmp2, tmp2, rscratch1);
3779     __ tst(tmp2, UPPER_BIT_MASK);
3780     __ br(Assembler::NE, RET_TRUE);
3781     __ cmp(len, large_loop_size);
3782     __ br(Assembler::GE, LARGE_LOOP);
3783 
3784   __ bind(CHECK_16); // small 16-byte load pre-loop 
3785     __ cmp(len, 16);
3786     __ br(Assembler::LT, POST_LOOP16);
3787 
3788   __ bind(LOOP16); // small 16-byte load loop
3789     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3790     __ sub(len, len, 16);
3791     __ orr(tmp2, tmp2, tmp3);
3792     __ tst(tmp2, UPPER_BIT_MASK);
3793     __ br(Assembler::NE, RET_TRUE);
3794     __ cmp(len, 16);
3795     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3796 
3797   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3798     __ cmp(len, 8);
3799     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3800     __ ldr(tmp3, Address(__ post(ary1, 8)));
3801     __ sub(len, len, 8);
3802     __ tst(tmp3, UPPER_BIT_MASK);
3803     __ br(Assembler::NE, RET_TRUE);
3804 
3805   __ bind(POST_LOOP16_LOAD_TAIL);
3806     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3807     __ ldr(tmp1, Address(ary1));
3808     __ mov(tmp2, 64);
3809     __ sub(tmp4, tmp2, len, __ LSL, 3);
3810     __ lslv(tmp1, tmp1, tmp4);
3811     __ tst(tmp1, UPPER_BIT_MASK);
3812     __ br(Assembler::NE, RET_TRUE);
3813     // Fallthrough
3814 
3815   __ bind(RET_FALSE);
3816     __ pop(spilled_regs, sp);
3817     __ leave();
3818     __ mov(result, zr);
3819     __ ret(lr);
3820 
3821   __ bind(RET_TRUE);
3822     __ pop(spilled_regs, sp);
3823   __ bind(RET_TRUE_NO_POP);
3824     __ leave();
3825     __ mov(result, 1);
3826     __ ret(lr);
3827 
3828   __ bind(DONE);
3829     __ pop(spilled_regs, sp);
3830     __ leave();
3831     __ ret(lr);
3832     return entry;
3833   }
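For reference, the following is a plain C++ model of what generate_has_negatives computes: whether any byte of the array has its top bit set, i.e. is negative when viewed as a signed byte (names are illustrative, not from this file). It is only a sketch of the semantics; the stub adds the alignment pre-loop, the prefetched 64-byte LARGE_LOOP, and the masked tail loads seen above.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model: returns true iff ary[0..len) contains a byte with bit 7 set.
static bool has_negatives_model(const uint8_t* ary, size_t len) {
  const uint64_t UPPER_BIT_MASK = 0x8080808080808080ULL;
  size_t i = 0;
  // Word-at-a-time main loop: one test covers eight bytes, as in the stub's
  // 8- and 16-byte loads.
  for (; i + 8 <= len; i += 8) {
    uint64_t w;
    memcpy(&w, ary + i, sizeof w);       // unaligned-safe 8-byte load
    if (w & UPPER_BIT_MASK) return true;
  }
  // Byte tail; the stub instead shifts the unwanted bytes out of one final
  // (possibly overlapping) 8-byte load.
  for (; i < len; i++) {
    if (ary[i] & 0x80) return true;
  }
  return false;
}
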
3834   /**
3835    *  Arguments:
3836    *
3837    *  Input:
3838    *  c_rarg0   - current state address
3839    *  c_rarg1   - H key address
3840    *  c_rarg2   - data address
3841    *  c_rarg3   - number of blocks
3842    *
3843    *  Output:
3844    *  Updated state at c_rarg0
3845    */
3846   address generate_ghash_processBlocks() {
3847     // Bafflingly, GCM uses little-endian for the byte order, but
3848     // big-endian for the bit order.  For example, the polynomial 1 is
3849     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3850     //
3851     // So, we must either reverse the bytes in each word and do
3852     // everything big-endian or reverse the bits in each byte and do
3853     // it little-endian.  On AArch64 it's more idiomatic to reverse


4830     //       assert(Ra == Pa_base[j], "must be");
4831     //       MACC(Ra, Ra, t0, t1, t2);
4832     //     }
4833     //     iters =  (2*len-i)/2;
4834     //     assert(iters == len-j, "must be");
4835     //     for (; iters--; j++) {
4836     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4837     //       MACC(Rm, Rn, t0, t1, t2);
4838     //       Rm = *++Pm;
4839     //       Rn = *--Pn;
4840     //     }
4841     //     Pm_base[i-len] = t0;
4842     //     t0 = t1; t1 = t2; t2 = 0;
4843     //   }
4844 
4845     //   while (t0)
4846     //     t0 = sub(Pm_base, Pn_base, t0, len);
4847     // }
4848   };
4849 
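The commented pseudocode above is the multi-word form used by the stub. As a much smaller illustration of the reduction it relies on, the sketch below (illustrative names, not code from this file) performs single-word Montgomery reduction with R = 2^64 using the GCC/Clang __int128 extension; it assumes an odd modulus n < 2^63 so the 128-bit intermediate cannot overflow.

#include <cstdint>

// -n^{-1} mod 2^64 for odd n, by Newton's iteration: the number of correct
// low bits doubles each step (3 -> 6 -> 12 -> 24 -> 48 -> 96 >= 64).
static uint64_t mont_ninv(uint64_t n) {
  uint64_t x = n;                          // n*n == 1 (mod 8) for odd n
  for (int i = 0; i < 5; i++) x *= 2 - n * x;
  return ~x + 1;                           // x == n^{-1} (mod 2^64); return -x
}

// REDC: for T < n * 2^64, returns T * 2^{-64} mod n.
static uint64_t mont_redc(unsigned __int128 T, uint64_t n, uint64_t ninv) {
  uint64_t m = (uint64_t)T * ninv;         // m = T * (-n^{-1}) mod 2^64
  unsigned __int128 t = (T + (unsigned __int128)m * n) >> 64;  // exact division
  return (uint64_t)(t >= n ? t - n : t);   // at most one subtraction needed
}

// Montgomery product: a * b * 2^{-64} mod n, for a, b < n.
static uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t n, uint64_t ninv) {
  return mont_redc((unsigned __int128)a * b, n, ninv);
}

The multi-precision code above applies the same identity one 64-bit word at a time, carrying the running sum in the t0/t1/t2 accumulators of the pseudocode and finishing with the subtraction in the final while (t0) loop.
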
4850 
4851   // Initialization
4852   void generate_initial() {
4853     // Generate the initial stubs and initialize the entry points
4854 
4855     // Entry points that exist on all platforms. Note: this is code
4856     // that could be shared among different platforms; however, the
4857     // benefit seems to be smaller than the disadvantage of having a
4858     // much more complicated generator structure. See also the comment
4859     // in stubRoutines.hpp.
4860 
4861     StubRoutines::_forward_exception_entry = generate_forward_exception();
4862 
4863     StubRoutines::_call_stub_entry =
4864       generate_call_stub(StubRoutines::_call_stub_return_address);
4865 
4866     // is referenced by megamorphic call
4867     StubRoutines::_catch_exception_entry = generate_catch_exception();
4868 
4869     // Build this early so it's available for the interpreter.
4870     StubRoutines::_throw_StackOverflowError_entry =


4888     StubRoutines::_throw_AbstractMethodError_entry =
4889       generate_throw_exception("AbstractMethodError throw_exception",
4890                                CAST_FROM_FN_PTR(address,
4891                                                 SharedRuntime::
4892                                                 throw_AbstractMethodError));
4893 
4894     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4895       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4896                                CAST_FROM_FN_PTR(address,
4897                                                 SharedRuntime::
4898                                                 throw_IncompatibleClassChangeError));
4899 
4900     StubRoutines::_throw_NullPointerException_at_call_entry =
4901       generate_throw_exception("NullPointerException at call throw_exception",
4902                                CAST_FROM_FN_PTR(address,
4903                                                 SharedRuntime::
4904                                                 throw_NullPointerException_at_call));
4905 
4906     // arraycopy stubs used by compilers
4907     generate_arraycopy_stubs();
4908 
4909     // has_negatives stubs; the second entry point handles large arrays.
4910     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
4911 
4912     if (UseMultiplyToLenIntrinsic) {
4913       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4914     }
4915 
4916     if (UseMontgomeryMultiplyIntrinsic) {
4917       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4918       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4919       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4920     }
4921 
4922     if (UseMontgomerySquareIntrinsic) {
4923       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4924       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4925       // We use generate_multiply() rather than generate_square()
4926       // because it's faster for the sizes of modulus we care about.
4927       StubRoutines::_montgomerySquare = g.generate_multiply();
4928     }
4929 
4930 #ifndef BUILTIN_SIM

