3871 3872 __ bind(RET_FALSE); 3873 __ pop(spilled_regs, sp); 3874 __ leave(); 3875 __ mov(result, zr); 3876 __ ret(lr); 3877 3878 __ bind(RET_TRUE); 3879 __ pop(spilled_regs, sp); 3880 __ bind(RET_TRUE_NO_POP); 3881 __ leave(); 3882 __ mov(result, 1); 3883 __ ret(lr); 3884 3885 __ bind(DONE); 3886 __ pop(spilled_regs, sp); 3887 __ leave(); 3888 __ ret(lr); 3889 return entry; 3890 } 3891 /** 3892 * Arguments: 3893 * 3894 * Input: 3895 * c_rarg0 - current state address 3896 * c_rarg1 - H key address 3897 * c_rarg2 - data address 3898 * c_rarg3 - number of blocks 3899 * 3900 * Output: 3901 * Updated state at c_rarg0 3902 */ 3903 address generate_ghash_processBlocks() { 3904 // Bafflingly, GCM uses little-endian for the byte order, but 3905 // big-endian for the bit order. For example, the polynomial 1 is 3906 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 3907 // 3908 // So, we must either reverse the bytes in each word and do 3909 // everything big-endian or reverse the bits in each byte and do 3910 // it little-endian. On AArch64 it's more idiomatic to reverse 4952 SharedRuntime:: 4953 throw_AbstractMethodError)); 4954 4955 StubRoutines::_throw_IncompatibleClassChangeError_entry = 4956 generate_throw_exception("IncompatibleClassChangeError throw_exception", 4957 CAST_FROM_FN_PTR(address, 4958 SharedRuntime:: 4959 throw_IncompatibleClassChangeError)); 4960 4961 StubRoutines::_throw_NullPointerException_at_call_entry = 4962 generate_throw_exception("NullPointerException at call throw_exception", 4963 CAST_FROM_FN_PTR(address, 4964 SharedRuntime:: 4965 throw_NullPointerException_at_call)); 4966 4967 // arraycopy stubs used by compilers 4968 generate_arraycopy_stubs(); 4969 4970 // has negatives stub for large arrays. 4971 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 4972 4973 if (UseMultiplyToLenIntrinsic) { 4974 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 4975 } 4976 4977 if (UseSquareToLenIntrinsic) { 4978 StubRoutines::_squareToLen = generate_squareToLen(); 4979 } 4980 4981 if (UseMulAddIntrinsic) { 4982 StubRoutines::_mulAdd = generate_mulAdd(); 4983 } 4984 4985 if (UseMontgomeryMultiplyIntrinsic) { 4986 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 4987 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 4988 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 4989 } 4990 4991 if (UseMontgomerySquareIntrinsic) { | 3871 3872 __ bind(RET_FALSE); 3873 __ pop(spilled_regs, sp); 3874 __ leave(); 3875 __ mov(result, zr); 3876 __ ret(lr); 3877 3878 __ bind(RET_TRUE); 3879 __ pop(spilled_regs, sp); 3880 __ bind(RET_TRUE_NO_POP); 3881 __ leave(); 3882 __ mov(result, 1); 3883 __ ret(lr); 3884 3885 __ bind(DONE); 3886 __ pop(spilled_regs, sp); 3887 __ leave(); 3888 __ ret(lr); 3889 return entry; 3890 } 3891 3892 address generate_large_array_equals_byte() { 3893 return generate_large_array_equals(1); 3894 } 3895 3896 address generate_large_array_equals_char() { 3897 return generate_large_array_equals(2); 3898 } 3899 3900 // a1 = r1 - array1 address 3901 // a2 = r2 - array2 address 3902 // result = r0 - return value. Already contains "false" 3903 // cnt1 = r4 - amount of elements left to check, reduced by elem_per_word 3904 address generate_large_array_equals(int elem_size) { 3905 StubCodeMark mark(this, "StubRoutines", elem_size == 1 3906 ? "large_array_equals_byte" 3907 : "large_array_equals_char"); 3908 Register a1 = r1, a2 = r2, result = r0, cnt1 = r4, tmp1 = rscratch1, 3909 tmp2 = rscratch2, tmp3 = r6, tmp4 = r7; 3910 Label LARGE_LOOP, NOT_EQUAL; 3911 int elem_per_word = wordSize/elem_size; 3912 int branchThreshold = MAX(80, SoftwarePrefetchHintDistance)/elem_size - elem_per_word; 3913 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 3914 3915 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4); 3916 3917 __ align(CodeEntryAlignment); 3918 address entry = __ pc(); 3919 __ enter(); 3920 3921 if (!UseSIMDForArrayEquals) { 3922 // pre-loop 3923 __ push(spilled_regs, sp); 3924 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3925 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3926 } 3927 __ bind(LARGE_LOOP); // unrolled to 64 bytes loop with possible prefetching 3928 if (SoftwarePrefetchHintDistance >= 0) { 3929 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3930 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3931 } 3932 if (UseSIMDForArrayEquals) { 3933 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 3934 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 3935 __ eor(v0, __ T2D, v0, v4); 3936 __ eor(v1, __ T2D, v1, v5); 3937 __ eor(v2, __ T2D, v2, v6); 3938 __ eor(v3, __ T2D, v3, v7); 3939 3940 __ orr(v0, __ T2D, v0, v1); 3941 __ orr(v1, __ T2D, v2, v3); 3942 __ orr(v0, __ T2D, v0, v1); 3943 3944 __ umov(tmp1, v0, __ D, 0); 3945 __ cbnz(tmp1, NOT_EQUAL); 3946 __ umov(tmp1, v0, __ D, 1); 3947 __ cbnz(tmp1, NOT_EQUAL); 3948 __ sub(cnt1, cnt1, 64/elem_size); 3949 __ cmp(cnt1, branchThreshold); 3950 __ br(__ GT, LARGE_LOOP); 3951 } else { 3952 __ eor(tmp1, tmp1, tmp2); 3953 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3954 __ cbnz(tmp1, NOT_EQUAL); 3955 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3956 __ eor(tmp3, tmp3, tmp4); 3957 __ ldr(tmp4, Address(__ post(a2, wordSize))); 3958 __ cbnz(tmp3, NOT_EQUAL); 3959 __ ldr(tmp3, Address(__ post(a1, wordSize))); 3960 3961 __ eor(tmp1, tmp1, tmp2); 3962 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3963 __ cbnz(tmp1, NOT_EQUAL); 3964 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3965 __ eor(tmp3, tmp3, tmp4); 3966 __ ldr(tmp4, Address(__ post(a2, wordSize))); 3967 __ cbnz(tmp3, NOT_EQUAL); 3968 __ ldr(tmp3, Address(__ post(a1, wordSize))); 3969 3970 __ eor(tmp1, tmp1, tmp2); 3971 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3972 __ cbnz(tmp1, NOT_EQUAL); 3973 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3974 __ eor(tmp3, tmp3, tmp4); 3975 __ ldr(tmp4, Address(__ post(a2, wordSize))); 3976 __ cbnz(tmp3, NOT_EQUAL); 3977 __ ldr(tmp3, Address(__ post(a1, wordSize))); 3978 3979 // loads below are for next loop iteration 3980 __ eor(tmp1, tmp1, tmp2); 3981 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3982 __ cbnz(tmp1, NOT_EQUAL); 3983 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3984 __ eor(tmp3, tmp3, tmp4); 3985 __ ldr(tmp4, Address(__ post(a2, wordSize))); 3986 __ cbnz(tmp3, NOT_EQUAL); 3987 __ ldr(tmp3, Address(__ post(a1, wordSize))); 3988 3989 __ sub(cnt1, cnt1, 8 * elem_per_word); 3990 // run this loop until we have memory to prefetch(but at least 64+16 bytes). 3991 __ cmp(cnt1, branchThreshold); 3992 __ br(Assembler::GT, LARGE_LOOP); 3993 // both a1 and a2 are shifted more than needed by wordSize and tmp1-tmp4 3994 // contains still-not-checked value. Check it in this post-loop, also update 3995 // cnt1 accordingly 3996 __ eor(tmp1, tmp1, tmp2); 3997 __ cbnz(tmp1, NOT_EQUAL); 3998 __ eor(tmp3, tmp3, tmp4); 3999 __ cbnz(tmp3, NOT_EQUAL); 4000 __ sub(cnt1, cnt1, 2 * elem_per_word); 4001 } 4002 4003 __ mov(result, true); 4004 __ bind(NOT_EQUAL); 4005 if (!UseSIMDForArrayEquals) { 4006 __ pop(spilled_regs, sp); 4007 } 4008 __ leave(); 4009 __ ret(lr); 4010 return entry; 4011 } 4012 4013 /** 4014 * Arguments: 4015 * 4016 * Input: 4017 * c_rarg0 - current state address 4018 * c_rarg1 - H key address 4019 * c_rarg2 - data address 4020 * c_rarg3 - number of blocks 4021 * 4022 * Output: 4023 * Updated state at c_rarg0 4024 */ 4025 address generate_ghash_processBlocks() { 4026 // Bafflingly, GCM uses little-endian for the byte order, but 4027 // big-endian for the bit order. For example, the polynomial 1 is 4028 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4029 // 4030 // So, we must either reverse the bytes in each word and do 4031 // everything big-endian or reverse the bits in each byte and do 4032 // it little-endian. On AArch64 it's more idiomatic to reverse 5074 SharedRuntime:: 5075 throw_AbstractMethodError)); 5076 5077 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5078 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5079 CAST_FROM_FN_PTR(address, 5080 SharedRuntime:: 5081 throw_IncompatibleClassChangeError)); 5082 5083 StubRoutines::_throw_NullPointerException_at_call_entry = 5084 generate_throw_exception("NullPointerException at call throw_exception", 5085 CAST_FROM_FN_PTR(address, 5086 SharedRuntime:: 5087 throw_NullPointerException_at_call)); 5088 5089 // arraycopy stubs used by compilers 5090 generate_arraycopy_stubs(); 5091 5092 // has negatives stub for large arrays. 5093 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5094 5095 // array equals stub for large arrays. 5096 StubRoutines::aarch64::_large_array_equals_byte = generate_large_array_equals_byte(); 5097 StubRoutines::aarch64::_large_array_equals_char = generate_large_array_equals_char(); 5098 5099 if (UseMultiplyToLenIntrinsic) { 5100 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5101 } 5102 5103 if (UseSquareToLenIntrinsic) { 5104 StubRoutines::_squareToLen = generate_squareToLen(); 5105 } 5106 5107 if (UseMulAddIntrinsic) { 5108 StubRoutines::_mulAdd = generate_mulAdd(); 5109 } 5110 5111 if (UseMontgomeryMultiplyIntrinsic) { 5112 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5113 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5114 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5115 } 5116 5117 if (UseMontgomerySquareIntrinsic) { |