hs Sdiff src/hotspot/cpu/aarch64

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

3871 
3872   __ bind(RET_FALSE);
3873     __ pop(spilled_regs, sp);
3874     __ leave();
3875     __ mov(result, zr);
3876     __ ret(lr);
3877 
3878   __ bind(RET_TRUE);
3879     __ pop(spilled_regs, sp);
3880   __ bind(RET_TRUE_NO_POP);
3881     __ leave();
3882     __ mov(result, 1);
3883     __ ret(lr);
3884 
3885   __ bind(DONE);
3886     __ pop(spilled_regs, sp);
3887     __ leave();
3888     __ ret(lr);
3889     return entry;
3890   }


























































































































3891   /**
3892    *  Arguments:
3893    *
3894    *  Input:
3895    *  c_rarg0   - current state address
3896    *  c_rarg1   - H key address
3897    *  c_rarg2   - data address
3898    *  c_rarg3   - number of blocks
3899    *
3900    *  Output:
3901    *  Updated state at c_rarg0
3902    */
3903   address generate_ghash_processBlocks() {
3904     // Bafflingly, GCM uses little-endian for the byte order, but
3905     // big-endian for the bit order.  For example, the polynomial 1 is
3906     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3907     //
3908     // So, we must either reverse the bytes in each word and do
3909     // everything big-endian or reverse the bits in each byte and do
3910     // it little-endian.  On AArch64 it's more idiomatic to reverse

4952                                                 SharedRuntime::
4953                                                 throw_AbstractMethodError));
4954 
4955     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4956       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4957                                CAST_FROM_FN_PTR(address,
4958                                                 SharedRuntime::
4959                                                 throw_IncompatibleClassChangeError));
4960 
4961     StubRoutines::_throw_NullPointerException_at_call_entry =
4962       generate_throw_exception("NullPointerException at call throw_exception",
4963                                CAST_FROM_FN_PTR(address,
4964                                                 SharedRuntime::
4965                                                 throw_NullPointerException_at_call));
4966 
4967     // arraycopy stubs used by compilers
4968     generate_arraycopy_stubs();
4969 
4970     // has negatives stub for large arrays.
4971     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);




4972 
4973     if (UseMultiplyToLenIntrinsic) {
4974       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4975     }
4976 
4977     if (UseSquareToLenIntrinsic) {
4978       StubRoutines::_squareToLen = generate_squareToLen();
4979     }
4980 
4981     if (UseMulAddIntrinsic) {
4982       StubRoutines::_mulAdd = generate_mulAdd();
4983     }
4984 
4985     if (UseMontgomeryMultiplyIntrinsic) {
4986       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4987       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4988       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4989     }
4990 
4991     if (UseMontgomerySquareIntrinsic) {

3871 
3872   __ bind(RET_FALSE);
3873     __ pop(spilled_regs, sp);
3874     __ leave();
3875     __ mov(result, zr);
3876     __ ret(lr);
3877 
3878   __ bind(RET_TRUE);
3879     __ pop(spilled_regs, sp);
3880   __ bind(RET_TRUE_NO_POP);
3881     __ leave();
3882     __ mov(result, 1);
3883     __ ret(lr);
3884 
3885   __ bind(DONE);
3886     __ pop(spilled_regs, sp);
3887     __ leave();
3888     __ ret(lr);
3889     return entry;
3890   }
3891 
3892   address generate_large_array_equals_byte() {
         // Byte-array flavour of the large-array equality stub: delegates to
         // generate_large_array_equals with an element size of 1 and returns
         // whatever address that generator returns.
3893     return generate_large_array_equals(1);
3894   }
3895 
3896   address generate_large_array_equals_char() {
         // Char-array flavour of the large-array equality stub: delegates to
         // generate_large_array_equals with an element size of 2 and returns
         // whatever address that generator returns.
3897     return generate_large_array_equals(2);
3898   }
3899 
3900   // Emits a stub that compares two arrays 64 bytes per loop iteration,
       // optionally prefetching ahead of the data being compared.
       //
       // Register contract of the generated stub (see the locals below):
3901   // a1 = r1 - array1 address (post-incremented by the stub)
       // a2 = r2 - array2 address (post-incremented by the stub)
3902   // result = r0 - return value. Already contains "false"; the stub writes
       //               "true" only when it falls out of the loop without a mismatch
3903   // cnt1 = r4 - amount of elements left to check, reduced by elem_per_word;
       //             decremented by the stub as data is consumed
       // rscratch1 is used as a temporary in both variants; rscratch2, r6 and r7
       // only in the non-SIMD variant, which saves and restores r6/r7 on the stack.
       // The SIMD variant uses v0-v7 as temporaries.
       //
       // elem_size is the element size in bytes; 1 selects the stub name
       // "large_array_equals_byte", any other value "large_array_equals_char".
       // Returns the address of the first instruction of the generated stub.
       //
       // NOTE(review): the stub reports "true" once cnt1 <= branchThreshold, i.e.
       // possibly with elements still unchecked; presumably the caller compares
       // the remaining tail - confirm against the call site.
3904   address generate_large_array_equals(int elem_size) {
3905     StubCodeMark mark(this, "StubRoutines", elem_size == 1
3906         ? "large_array_equals_byte"
3907         : "large_array_equals_char");
3908     Register a1 = r1, a2 = r2, result = r0, cnt1 = r4, tmp1 = rscratch1,
3909         tmp2 = rscratch2, tmp3 = r6, tmp4 = r7;
3910     Label LARGE_LOOP, NOT_EQUAL;
         // number of elements that fit into one machine word
3911     int elem_per_word = wordSize/elem_size;
         // loop exit bound in elements: the loop repeats only while cnt1 is
         // strictly greater than this value
3912     int branchThreshold = MAX(80, SoftwarePrefetchHintDistance)/elem_size - elem_per_word;
         // tmp3/tmp4 are touched only by the non-SIMD variant, which spills them
3913     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
3914 
3915     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4);
3916 
3917     __ align(CodeEntryAlignment);
         // entry is the value handed back to the caller of this generator
3918     address entry = __ pc();
         // paired with the leave() emitted just before the final ret
3919     __ enter();
3920 
3921     if (!UseSIMDForArrayEquals) {
3922       // pre-loop: save tmp3/tmp4 and load the first two words of each array so
           // that the loop body can compare them while loading the next ones
3923       __ push(spilled_regs, sp);
3924       __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3925       __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3926     }
3927     __ bind(LARGE_LOOP); // loop unrolled to 64 bytes per iteration, with optional prefetching
         // a negative SoftwarePrefetchHintDistance suppresses the prefetch instructions
3928     if (SoftwarePrefetchHintDistance >= 0) {
3929       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3930       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3931     }
3932     if (UseSIMDForArrayEquals) {
           // load four vector registers from each array, advancing both pointers
3933       __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3934       __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
           // XOR corresponding registers: any differing bit yields a non-zero lane
3935       __ eor(v0, __ T2D, v0, v4);
3936       __ eor(v1, __ T2D, v1, v5);
3937       __ eor(v2, __ T2D, v2, v6);
3938       __ eor(v3, __ T2D, v3, v7);
3939 
           // OR the four XOR results together into v0
3940       __ orr(v0, __ T2D, v0, v1);
3941       __ orr(v1, __ T2D, v2, v3);
3942       __ orr(v0, __ T2D, v0, v1);
3943 
           // test both 64-bit lanes of v0; a non-zero lane means a mismatch
3944       __ umov(tmp1, v0, __ D, 0);
3945       __ cbnz(tmp1, NOT_EQUAL);
3946       __ umov(tmp1, v0, __ D, 1);
3947       __ cbnz(tmp1, NOT_EQUAL);
           // 64 bytes consumed; account for them in elements
3948       __ sub(cnt1, cnt1, 64/elem_size);
3949       __ cmp(cnt1, branchThreshold);
3950       __ br(__ GT, LARGE_LOOP);
3951     } else {
           // Each eor/cbnz pair checks words loaded earlier, while the
           // interleaved ldr instructions fetch the words checked next.
3952       __ eor(tmp1, tmp1, tmp2);
3953       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3954       __ cbnz(tmp1, NOT_EQUAL);
3955       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3956       __ eor(tmp3, tmp3, tmp4);
3957       __ ldr(tmp4, Address(__ post(a2, wordSize)));
3958       __ cbnz(tmp3, NOT_EQUAL);
3959       __ ldr(tmp3, Address(__ post(a1, wordSize)));
3960 
3961       __ eor(tmp1, tmp1, tmp2);
3962       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3963       __ cbnz(tmp1, NOT_EQUAL);
3964       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3965       __ eor(tmp3, tmp3, tmp4);
3966       __ ldr(tmp4, Address(__ post(a2, wordSize)));
3967       __ cbnz(tmp3, NOT_EQUAL);
3968       __ ldr(tmp3, Address(__ post(a1, wordSize)));
3969 
3970       __ eor(tmp1, tmp1, tmp2);
3971       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3972       __ cbnz(tmp1, NOT_EQUAL);
3973       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3974       __ eor(tmp3, tmp3, tmp4);
3975       __ ldr(tmp4, Address(__ post(a2, wordSize)));
3976       __ cbnz(tmp3, NOT_EQUAL);
3977       __ ldr(tmp3, Address(__ post(a1, wordSize)));
3978 
3979       // the loads below fetch data for the next loop iteration
3980       __ eor(tmp1, tmp1, tmp2);
3981       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3982       __ cbnz(tmp1, NOT_EQUAL);
3983       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3984       __ eor(tmp3, tmp3, tmp4);
3985       __ ldr(tmp4, Address(__ post(a2, wordSize)));
3986       __ cbnz(tmp3, NOT_EQUAL);
3987       __ ldr(tmp3, Address(__ post(a1, wordSize)));
3988 
           // eight words per array were compared in this iteration
3989       __ sub(cnt1, cnt1, 8 * elem_per_word);
3990       // keep looping while there is still memory to prefetch (but at least 64+16 bytes).
3991       __ cmp(cnt1, branchThreshold);
3992       __ br(Assembler::GT, LARGE_LOOP);
3993       // Post-loop: a1 and a2 have advanced past the data compared so far and
3994       // tmp1-tmp4 still hold values that have not been checked. Check them
3995       // here and update cnt1 accordingly.
3996       __ eor(tmp1, tmp1, tmp2);
3997       __ cbnz(tmp1, NOT_EQUAL);
3998       __ eor(tmp3, tmp3, tmp4);
3999       __ cbnz(tmp3, NOT_EQUAL);
4000       __ sub(cnt1, cnt1, 2 * elem_per_word);
4001     }
4002 
         // Fall-through path: no mismatch was seen. Branches to NOT_EQUAL skip this
         // mov, leaving result at the value it had on entry.
4003     __ mov(result, true);
4004     __ bind(NOT_EQUAL);
         // restore tmp3/tmp4, mirroring the push in the non-SIMD pre-loop
4005     if (!UseSIMDForArrayEquals) {
4006       __ pop(spilled_regs, sp);
4007     }
4008     __ leave();
4009     __ ret(lr);
4010     return entry;
4011   }
4012 
4013   /**
4014    *  Arguments:
4015    *
4016    *  Input:
4017    *  c_rarg0   - current state address
4018    *  c_rarg1   - H key address
4019    *  c_rarg2   - data address
4020    *  c_rarg3   - number of blocks
4021    *
4022    *  Output:
4023    *  Updated state at c_rarg0
4024    */
4025   address generate_ghash_processBlocks() {
4026     // Bafflingly, GCM uses little-endian for the byte order, but
4027     // big-endian for the bit order.  For example, the polynomial 1 is
4028     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4029     //
4030     // So, we must either reverse the bytes in each word and do
4031     // everything big-endian or reverse the bits in each byte and do
4032     // it little-endian.  On AArch64 it's more idiomatic to reverse

5074                                                 SharedRuntime::
5075                                                 throw_AbstractMethodError));
5076 
5077     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5078       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5079                                CAST_FROM_FN_PTR(address,
5080                                                 SharedRuntime::
5081                                                 throw_IncompatibleClassChangeError));
5082 
5083     StubRoutines::_throw_NullPointerException_at_call_entry =
5084       generate_throw_exception("NullPointerException at call throw_exception",
5085                                CAST_FROM_FN_PTR(address,
5086                                                 SharedRuntime::
5087                                                 throw_NullPointerException_at_call));
5088 
5089     // arraycopy stubs used by compilers
5090     generate_arraycopy_stubs();
5091 
5092     // has negatives stub for large arrays.
5093     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5094 
5095     // array equals stub for large arrays.
5096     StubRoutines::aarch64::_large_array_equals_byte = generate_large_array_equals_byte();
5097     StubRoutines::aarch64::_large_array_equals_char = generate_large_array_equals_char();
5098 
5099     if (UseMultiplyToLenIntrinsic) {
5100       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5101     }
5102 
5103     if (UseSquareToLenIntrinsic) {
5104       StubRoutines::_squareToLen = generate_squareToLen();
5105     }
5106 
5107     if (UseMulAddIntrinsic) {
5108       StubRoutines::_mulAdd = generate_mulAdd();
5109     }
5110 
5111     if (UseMontgomeryMultiplyIntrinsic) {
5112       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5113       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5114       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5115     }
5116 
5117     if (UseMontgomerySquareIntrinsic) {

< prev index next >