3971 __ eor(tmp1, tmp1, tmp2); 3972 __ cbnz(tmp1, NOT_EQUAL); 3973 __ br(__ GT, SMALL_LOOP); 3974 __ bind(POST_LOOP); 3975 __ ldr(tmp1, Address(a1, cnt1)); 3976 __ ldr(tmp2, Address(a2, cnt1)); 3977 __ eor(tmp1, tmp1, tmp2); 3978 __ cbnz(tmp1, NOT_EQUAL); 3979 __ bind(EQUAL); 3980 __ mov(result, true); 3981 __ bind(NOT_EQUAL); 3982 if (!UseSIMDForArrayEquals) { 3983 __ pop(spilled_regs, sp); 3984 } 3985 __ bind(NOT_EQUAL_NO_POP); 3986 __ leave(); 3987 __ ret(lr); 3988 return entry; 3989 } 3990 3991 3992 /** 3993 * Arguments: 3994 * 3995 * Input: 3996 * c_rarg0 - current state address 3997 * c_rarg1 - H key address 3998 * c_rarg2 - data address 3999 * c_rarg3 - number of blocks 4000 * 4001 * Output: 4002 * Updated state at c_rarg0 4003 */ 4004 address generate_ghash_processBlocks() { 4005 // Bafflingly, GCM uses little-endian for the byte order, but 4006 // big-endian for the bit order. For example, the polynomial 1 is 4007 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4008 // 4009 // So, we must either reverse the bytes in each word and do 4010 // everything big-endian or reverse the bits in each byte and do 5058 CAST_FROM_FN_PTR(address, 5059 SharedRuntime:: 5060 throw_IncompatibleClassChangeError)); 5061 5062 StubRoutines::_throw_NullPointerException_at_call_entry = 5063 generate_throw_exception("NullPointerException at call throw_exception", 5064 CAST_FROM_FN_PTR(address, 5065 SharedRuntime:: 5066 throw_NullPointerException_at_call)); 5067 5068 // arraycopy stubs used by compilers 5069 generate_arraycopy_stubs(); 5070 5071 // has negatives stub for large arrays. 5072 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5073 5074 // array equals stub for large arrays. 5075 if (!UseSimpleArrayEquals) { 5076 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5077 } 5078 5079 if (UseMultiplyToLenIntrinsic) { 5080 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5081 } 5082 5083 if (UseSquareToLenIntrinsic) { 5084 StubRoutines::_squareToLen = generate_squareToLen(); 5085 } 5086 5087 if (UseMulAddIntrinsic) { 5088 StubRoutines::_mulAdd = generate_mulAdd(); 5089 } 5090 5091 if (UseMontgomeryMultiplyIntrinsic) { 5092 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5093 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5094 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5095 } 5096 5097 if (UseMontgomerySquareIntrinsic) { | 3971 __ eor(tmp1, tmp1, tmp2); 3972 __ cbnz(tmp1, NOT_EQUAL); 3973 __ br(__ GT, SMALL_LOOP); 3974 __ bind(POST_LOOP); 3975 __ ldr(tmp1, Address(a1, cnt1)); 3976 __ ldr(tmp2, Address(a2, cnt1)); 3977 __ eor(tmp1, tmp1, tmp2); 3978 __ cbnz(tmp1, NOT_EQUAL); 3979 __ bind(EQUAL); 3980 __ mov(result, true); 3981 __ bind(NOT_EQUAL); 3982 if (!UseSIMDForArrayEquals) { 3983 __ pop(spilled_regs, sp); 3984 } 3985 __ bind(NOT_EQUAL_NO_POP); 3986 __ leave(); 3987 __ ret(lr); 3988 return entry; 3989 } 3990 3991 // ALGORITHM DESCRIPTION - LOG() 3992 // --------------------- 3993 // 3994 // x=2^k * mx, mx in [1,2) 3995 // 3996 // Get B~1/mx based on the output of rcpss instruction (B0) 3997 // B = int((B0*2^7+0.5))/2^7 3998 // 3999 // Reduced argument: r=B*mx-1.0 (computed accurately in high and low parts) 4000 // 4001 // Result: k*log(2) - log(B) + p(r) if |x-1| >= small value (2^-6) and 4002 // p(r) is a degree 7 polynomial 4003 // -log(B) read from data table (high, low parts) 4004 // Result is formed from high and low parts 4005 // 4006 // Special cases: 4007 // 1. log(NaN) = quiet NaN 4008 // 2. log(+INF) = +INF 4009 // 3. log(0) = -INF 4010 // 4. log(1) = +0 4011 // 5. log(x) = NaN if x < -0, including -INF 4012 // 4013 address generate_dlog() { 4014 StubCodeMark mark(this, "StubRoutines", "dlog"); 4015 __ align(CodeEntryAlignment); 4016 address entry = __ pc(); 4017 Label DONE, CHECK_CORNER_CASES, SPECIAL_CASE, MAIN, 4018 CHECKED_CORNER_CASES, RETURN_MINF_OR_NAN; 4019 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4020 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4021 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4022 const long INF_OR_NAN_PREFIX = 0x7FF0; 4023 const long MINF_OR_MNAN_PREFIX = 0xFFF0; 4024 const long ONE_PREFIX = 0x3FF0; 4025 __ movz(tmp2, ONE_PREFIX, 48); 4026 __ movz(tmp4, 0x0010, 48); 4027 __ fmovd(rscratch1, v0); 4028 __ lea(rscratch2, ExternalAddress((address)StubRoutines::aarch64::_log_tbl)); 4029 __ movz(tmp5, 0x7F); 4030 __ add(tmp1, rscratch1, tmp4); 4031 __ cmp(tmp2, rscratch1); 4032 __ lsr(tmp3, rscratch1, 29); 4033 __ ccmp(tmp1, tmp4, 0b1101 /* LE */, __ NE); 4034 __ bfm(tmp3, tmp5, 41, 8); 4035 __ fmovs(vtmp5, tmp3); 4036 __ ld1(tmpC1, tmpC2, tmpC3, tmpC4, __ T2D, __ post(rscratch2, 64)); 4037 __ br(__ LE, CHECK_CORNER_CASES); 4038 __ BIND(CHECKED_CORNER_CASES); 4039 // all corner cases are handled 4040 __ frecpe(vtmp5, vtmp5, S); 4041 __ lsr(tmp2, rscratch1, 48); 4042 __ movz(tmp4, 0x77f0, 48); 4043 __ fmovd(vtmp4, 1.0d); 4044 __ movz(tmp1, INF_OR_NAN_PREFIX, 48); 4045 __ bfm(tmp4, rscratch1, 0, 51); 4046 __ fmovd(vtmp1, tmp4); 4047 __ subw(tmp2, tmp2, 16); 4048 __ cmp(tmp2, 0x8000); 4049 __ br(__ GE, SPECIAL_CASE); 4050 __ bind(MAIN); 4051 __ fmovs(tmp3, vtmp5); 4052 __ mov(tmp5, 0x3FE0); 4053 __ mov(rscratch1, 0xffffe00000000000); 4054 __ andr(tmp2, tmp2, tmp1, __ LSR, 48); 4055 __ sub(tmp2, tmp2, tmp5); 4056 __ scvtfwd(vtmp5, tmp2); 4057 __ addw(tmp3, tmp3, 0x8000); 4058 __ andr(tmp4, tmp4, rscratch1); 4059 __ andr(rscratch1, rscratch1, tmp3, __ LSL, 29); 4060 __ ubfm(tmp3, tmp3, 16, 23); 4061 __ ldrq(vtmp2, Address(rscratch2, tmp3, Address::lsl(4))); 4062 __ fmovd(vtmp3, tmp4); 4063 __ fmovd(vtmp0, rscratch1); 4064 __ fsubd(vtmp1, vtmp1, vtmp3); 4065 __ fnmsub(vtmp3, vtmp3, vtmp0, vtmp4); 4066 __ fmlavsd(vtmp2, tmpC4, vtmp5, 0); 4067 __ fmaddd(vtmp1, vtmp1, vtmp0, vtmp3); 4068 __ ins(vtmp5, __ D, vtmp2, 0, 1); 4069 __ faddd(vtmp0, vtmp2, vtmp1); 4070 __ fmlavsd(tmpC3, tmpC2, vtmp1, 0); 4071 __ fsubd(vtmp2, vtmp2, vtmp0); 4072 __ fmuld(vtmp3, vtmp1, vtmp1); 4073 __ faddd(tmpC4, vtmp1, vtmp2); 4074 __ fmlavsd(tmpC3, tmpC1, vtmp3, 0); 4075 __ faddd(tmpC4, tmpC4, vtmp5); 4076 __ fmuld(vtmp4, vtmp3, vtmp1); 4077 __ faddd(vtmp0, vtmp0, tmpC4); 4078 __ fmlavsd(tmpC3, vtmp4, tmpC3, 1); 4079 __ fmaddd(vtmp0, tmpC3, vtmp3, vtmp0); 4080 __ ret(lr); 4081 __ BIND(SPECIAL_CASE); 4082 __ movz(tmp2, 0x47F0, 48); 4083 __ fmovd(vtmp1, tmp2); 4084 __ fmuld(vtmp0, vtmp1, vtmp0); 4085 __ fmovd(vtmp1, vtmp0); 4086 __ umov(tmp2, vtmp1, __ S, 3); 4087 __ orr(vtmp0, __ T16B, vtmp0, vtmp4); 4088 __ ushr(vtmp5, __ T2D, vtmp0, 27); 4089 __ ushr(vtmp5, __ T4S, vtmp5, 2); 4090 __ frecpe(vtmp5, vtmp5, S); 4091 __ shl(vtmp1, __ T2D, vtmp1, 12); 4092 __ ushr(vtmp1, __ T2D, vtmp1, 12); 4093 __ b(MAIN); 4094 __ bind(RETURN_MINF_OR_NAN); 4095 __ movz(tmp1, MINF_OR_MNAN_PREFIX, 48); 4096 __ orr(rscratch1, rscratch1, tmp1); 4097 __ fmovd(v0, rscratch1); 4098 __ ret(lr); 4099 __ BIND(CHECK_CORNER_CASES); 4100 __ movz(tmp1, INF_OR_NAN_PREFIX, 48); 4101 __ cmp(rscratch1, zr); 4102 __ br(__ LE, RETURN_MINF_OR_NAN); 4103 __ cmp(rscratch1, tmp1); 4104 __ br(__ GE, DONE); // special cases 1 and 2 4105 __ cmp(rscratch1, tmp2); 4106 __ br(__ NE, CHECKED_CORNER_CASES); 4107 // special case 4 4108 __ fmovd(v0, 0.0d); 4109 __ BIND(DONE); 4110 __ ret(lr); 4111 return entry; 4112 } 4113 4114 /** 4115 * Arguments: 4116 * 4117 * Input: 4118 * c_rarg0 - current state address 4119 * c_rarg1 - H key address 4120 * c_rarg2 - data address 4121 * c_rarg3 - number of blocks 4122 * 4123 * Output: 4124 * Updated state at c_rarg0 4125 */ 4126 address generate_ghash_processBlocks() { 4127 // Bafflingly, GCM uses little-endian for the byte order, but 4128 // big-endian for the bit order. For example, the polynomial 1 is 4129 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4130 // 4131 // So, we must either reverse the bytes in each word and do 4132 // everything big-endian or reverse the bits in each byte and do 5180 CAST_FROM_FN_PTR(address, 5181 SharedRuntime:: 5182 throw_IncompatibleClassChangeError)); 5183 5184 StubRoutines::_throw_NullPointerException_at_call_entry = 5185 generate_throw_exception("NullPointerException at call throw_exception", 5186 CAST_FROM_FN_PTR(address, 5187 SharedRuntime:: 5188 throw_NullPointerException_at_call)); 5189 5190 // arraycopy stubs used by compilers 5191 generate_arraycopy_stubs(); 5192 5193 // has negatives stub for large arrays. 5194 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5195 5196 // array equals stub for large arrays. 5197 if (!UseSimpleArrayEquals) { 5198 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5199 } 5200 5201 StubRoutines::_dlog = generate_dlog(); 5202 5203 if (UseMultiplyToLenIntrinsic) { 5204 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5205 } 5206 5207 if (UseSquareToLenIntrinsic) { 5208 StubRoutines::_squareToLen = generate_squareToLen(); 5209 } 5210 5211 if (UseMulAddIntrinsic) { 5212 StubRoutines::_mulAdd = generate_mulAdd(); 5213 } 5214 5215 if (UseMontgomeryMultiplyIntrinsic) { 5216 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5217 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5218 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5219 } 5220 5221 if (UseMontgomerySquareIntrinsic) { |