3971 __ eor(tmp1, tmp1, tmp2); 3972 __ cbnz(tmp1, NOT_EQUAL); 3973 __ br(__ GT, SMALL_LOOP); 3974 __ bind(POST_LOOP); 3975 __ ldr(tmp1, Address(a1, cnt1)); 3976 __ ldr(tmp2, Address(a2, cnt1)); 3977 __ eor(tmp1, tmp1, tmp2); 3978 __ cbnz(tmp1, NOT_EQUAL); 3979 __ bind(EQUAL); 3980 __ mov(result, true); 3981 __ bind(NOT_EQUAL); 3982 if (!UseSIMDForArrayEquals) { 3983 __ pop(spilled_regs, sp); 3984 } 3985 __ bind(NOT_EQUAL_NO_POP); 3986 __ leave(); 3987 __ ret(lr); 3988 return entry; 3989 } 3990 3991 3992 /** 3993 * Arguments: 3994 * 3995 * Input: 3996 * c_rarg0 - current state address 3997 * c_rarg1 - H key address 3998 * c_rarg2 - data address 3999 * c_rarg3 - number of blocks 4000 * 4001 * Output: 4002 * Updated state at c_rarg0 4003 */ 4004 address generate_ghash_processBlocks() { 4005 // Bafflingly, GCM uses little-endian for the byte order, but 4006 // big-endian for the bit order. For example, the polynomial 1 is 4007 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4008 // 4009 // So, we must either reverse the bytes in each word and do 4010 // everything big-endian or reverse the bits in each byte and do 5058 CAST_FROM_FN_PTR(address, 5059 SharedRuntime:: 5060 throw_IncompatibleClassChangeError)); 5061 5062 StubRoutines::_throw_NullPointerException_at_call_entry = 5063 generate_throw_exception("NullPointerException at call throw_exception", 5064 CAST_FROM_FN_PTR(address, 5065 SharedRuntime:: 5066 throw_NullPointerException_at_call)); 5067 5068 // arraycopy stubs used by compilers 5069 generate_arraycopy_stubs(); 5070 5071 // has negatives stub for large arrays. 5072 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5073 5074 // array equals stub for large arrays. 5075 if (!UseSimpleArrayEquals) { 5076 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5077 } 5078 5079 if (UseMultiplyToLenIntrinsic) { 5080 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5081 } 5082 5083 if (UseSquareToLenIntrinsic) { 5084 StubRoutines::_squareToLen = generate_squareToLen(); 5085 } 5086 5087 if (UseMulAddIntrinsic) { 5088 StubRoutines::_mulAdd = generate_mulAdd(); 5089 } 5090 5091 if (UseMontgomeryMultiplyIntrinsic) { 5092 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5093 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5094 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5095 } 5096 5097 if (UseMontgomerySquareIntrinsic) { | 3971 __ eor(tmp1, tmp1, tmp2); 3972 __ cbnz(tmp1, NOT_EQUAL); 3973 __ br(__ GT, SMALL_LOOP); 3974 __ bind(POST_LOOP); 3975 __ ldr(tmp1, Address(a1, cnt1)); 3976 __ ldr(tmp2, Address(a2, cnt1)); 3977 __ eor(tmp1, tmp1, tmp2); 3978 __ cbnz(tmp1, NOT_EQUAL); 3979 __ bind(EQUAL); 3980 __ mov(result, true); 3981 __ bind(NOT_EQUAL); 3982 if (!UseSIMDForArrayEquals) { 3983 __ pop(spilled_regs, sp); 3984 } 3985 __ bind(NOT_EQUAL_NO_POP); 3986 __ leave(); 3987 __ ret(lr); 3988 return entry; 3989 } 3990 3991 void inflate_and_store_2_fp_registers(bool generatePrfm, 3992 FloatRegister src1, FloatRegister src2) { 3993 Register dst = r1; 3994 __ zip1(v1, __ T16B, src1, v0); 3995 __ zip2(v2, __ T16B, src1, v0); 3996 if (generatePrfm) { 3997 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 3998 } 3999 __ zip1(v3, __ T16B, src2, v0); 4000 __ zip2(v4, __ T16B, src2, v0); 4001 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4002 } 4003 4004 // R0 = src 4005 // R1 = dst 4006 // R2 = len 4007 // R3 = len >> 3 4008 // V0 = 0 4009 // v1 = loaded 8 bytes 4010 address generate_large_byte_array_inflate() { 4011 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4012 __ align(CodeEntryAlignment); 4013 address entry = __ pc(); 4014 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4015 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4016 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4017 4018 // do one more 8-byte read to have address 16-byte aligned in most cases 4019 // also use single store instruction 4020 __ ldrd(v2, __ post(src, 8)); 4021 __ sub(octetCounter, octetCounter, 2); 4022 __ zip1(v1, __ T16B, v1, v0); 4023 __ zip1(v2, __ T16B, v2, v0); 4024 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4025 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4026 __ cmp(octetCounter, large_loop_threshold); 4027 __ br(__ LE, LOOP_START); 4028 __ b(LOOP_PRFM_START); 4029 __ bind(LOOP_PRFM); 4030 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4031 __ bind(LOOP_PRFM_START); 4032 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4033 __ sub(octetCounter, octetCounter, 8); 4034 __ cmp(octetCounter, large_loop_threshold); 4035 inflate_and_store_2_fp_registers(true, v3, v4); 4036 inflate_and_store_2_fp_registers(true, v5, v6); 4037 __ br(__ GT, LOOP_PRFM); 4038 __ cmp(octetCounter, 8); 4039 __ br(__ LT, DONE); 4040 __ bind(LOOP); 4041 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4042 __ bind(LOOP_START); 4043 __ sub(octetCounter, octetCounter, 8); 4044 __ cmp(octetCounter, 8); 4045 inflate_and_store_2_fp_registers(false, v3, v4); 4046 inflate_and_store_2_fp_registers(false, v5, v6); 4047 __ br(__ GE, LOOP); 4048 __ bind(DONE); 4049 __ ret(lr); 4050 return entry; 4051 } 4052 4053 /** 4054 * Arguments: 4055 * 4056 * Input: 4057 * c_rarg0 - current state address 4058 * c_rarg1 - H key address 4059 * c_rarg2 - data address 4060 * c_rarg3 - number of blocks 4061 * 4062 * Output: 4063 * Updated state at c_rarg0 4064 */ 4065 address generate_ghash_processBlocks() { 4066 // Bafflingly, GCM uses little-endian for the byte order, but 4067 // big-endian for the bit order. For example, the polynomial 1 is 4068 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4069 // 4070 // So, we must either reverse the bytes in each word and do 4071 // everything big-endian or reverse the bits in each byte and do 5119 CAST_FROM_FN_PTR(address, 5120 SharedRuntime:: 5121 throw_IncompatibleClassChangeError)); 5122 5123 StubRoutines::_throw_NullPointerException_at_call_entry = 5124 generate_throw_exception("NullPointerException at call throw_exception", 5125 CAST_FROM_FN_PTR(address, 5126 SharedRuntime:: 5127 throw_NullPointerException_at_call)); 5128 5129 // arraycopy stubs used by compilers 5130 generate_arraycopy_stubs(); 5131 5132 // has negatives stub for large arrays. 5133 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5134 5135 // array equals stub for large arrays. 5136 if (!UseSimpleArrayEquals) { 5137 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5138 } 5139 5140 // byte_array_inflate stub for large arrays. 5141 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5142 5143 if (UseMultiplyToLenIntrinsic) { 5144 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5145 } 5146 5147 if (UseSquareToLenIntrinsic) { 5148 StubRoutines::_squareToLen = generate_squareToLen(); 5149 } 5150 5151 if (UseMulAddIntrinsic) { 5152 StubRoutines::_mulAdd = generate_mulAdd(); 5153 } 5154 5155 if (UseMontgomeryMultiplyIntrinsic) { 5156 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5157 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5158 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5159 } 5160 5161 if (UseMontgomerySquareIntrinsic) { |