src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

rev 54140 : 8255625: AArch64: Implement Base64.encodeBlock accelerator/intrinsic
Reviewed-by: aph


4713       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4714                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4715                      /*temps*/v6, v20, v18, v21);
4716       // Reduce v7:v5 by the field polynomial
4717       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4718 
4719       __ sub(blocks, blocks, 1);
4720       __ cbnz(blocks, L_ghash_loop);
4721     }
4722 
4723     // The bit-reversed result is at this point in v0
4724     __ rev64(v1, __ T16B, v0);
4725     __ rbit(v1, __ T16B, v1);
4726 
4727     __ st1(v1, __ T16B, state);
4728     __ ret(lr);
4729 
4730     return start;
4731   }
4732 
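For context, ghash_reduce above folds the 256-bit carry-less product held in v7:v5 back into the 128-bit GHASH accumulator. The underlying identity is standard GF(2^128) arithmetic rather than anything specific to this file:

  p(x) = x^{128} + x^{7} + x^{2} + x + 1, \qquad x^{128} \equiv x^{7} + x^{2} + x + 1 \pmod{p(x)}

so the high 128 bits of the product can be carry-less multiplied by the low-degree remainder and XORed (added in GF(2)) back into the low 128 bits.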
4733   // Continuation point for throwing of implicit exceptions that are
4734   // not handled in the current activation. Fabricates an exception
4735   // oop and initiates normal exception dispatching in this
4736   // frame. Since we need to preserve callee-saved values (currently
4737   // only for C2, but done for C1 as well) we need a callee-saved oop
4738   // map and therefore have to make these stubs into RuntimeStubs
4739   // rather than BufferBlobs.  If the compiler needs all registers to
4740   // be preserved between the fault point and the exception handler
4741   // then it must assume responsibility for that in
4742   // AbstractCompiler::continuation_for_implicit_null_exception or
4743   // continuation_for_implicit_division_by_zero_exception. All other
4744   // implicit exceptions (e.g., NullPointerException or
4745   // AbstractMethodError on entry) are either at call sites or
4746   // otherwise assume that stack unwinding will be initiated, so
4747   // caller saved registers were assumed volatile in the compiler.
4748 
4749 #undef __
4750 #define __ masm->
4751 
4752   address generate_throw_exception(const char* name,


5764       StubRoutines::_mulAdd = generate_mulAdd();
5765     }
5766 
5767     if (UseMontgomeryMultiplyIntrinsic) {
5768       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5769       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5770       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5771     }
5772 
5773     if (UseMontgomerySquareIntrinsic) {
5774       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5775       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5776       // We use generate_multiply() rather than generate_square()
5777       // because it's faster for the sizes of modulus we care about.
5778       StubRoutines::_montgomerySquare = g.generate_multiply();
5779     }
5780 
5781     // generate GHASH intrinsics code
5782     if (UseGHASHIntrinsics) {
5783       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5784     }
5785 
5786     if (UseAESIntrinsics) {
5787       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5788       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5789       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5790       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5791     }
5792 
5793     if (UseSHA1Intrinsics) {
5794       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5795       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5796     }
5797     if (UseSHA256Intrinsics) {
5798       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5799       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5800     }
5801 
5802     // generate Adler32 intrinsics code
5803     if (UseAdler32Intrinsics) {




4713       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4714                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4715                      /*temps*/v6, v20, v18, v21);
4716       // Reduce v7:v5 by the field polynomial
4717       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4718 
4719       __ sub(blocks, blocks, 1);
4720       __ cbnz(blocks, L_ghash_loop);
4721     }
4722 
4723     // The bit-reversed result is at this point in v0
4724     __ rev64(v1, __ T16B, v0);
4725     __ rbit(v1, __ T16B, v1);
4726 
4727     __ st1(v1, __ T16B, state);
4728     __ ret(lr);
4729 
4730     return start;
4731   }
4732 
4733   void generate_base64_encode_simdround(Register src, Register dst,
4734         FloatRegister codec, u8 size) {
4735 
4736     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
4737     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
4738     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
4739 
4740     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
4741 
4742     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
4743 
4744     __ ushr(ind0, arrangement, in0,  2);
4745 
4746     __ ushr(ind1, arrangement, in1,  2);
4747     __ shl(in0,   arrangement, in0,  6);
4748     __ orr(ind1,  arrangement, ind1, in0);
4749     __ ushr(ind1, arrangement, ind1, 2);
4750 
4751     __ ushr(ind2, arrangement, in2,  4);
4752     __ shl(in1,   arrangement, in1,  4);
4753     __ orr(ind2,  arrangement, in1,  ind2);
4754     __ ushr(ind2, arrangement, ind2, 2);
4755 
4756     __ shl(ind3,  arrangement, in2,  2);
4757     __ ushr(ind3, arrangement, ind3, 2);
4758 
4759     __ tbl(out0,  arrangement, codec,  4, ind0);
4760     __ tbl(out1,  arrangement, codec,  4, ind1);
4761     __ tbl(out2,  arrangement, codec,  4, ind2);
4762     __ tbl(out3,  arrangement, codec,  4, ind3);
4763 
4764     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
4765   }
4766 
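Each SIMD round above performs the standard Base64 split of every 3 input bytes into four 6-bit indices using vector shifts and ORs, then translates the indices to output characters with a four-register TBL lookup into the codec table. A minimal scalar sketch of the same index extraction (plain C++, illustrative helper name, not part of the stub):

  #include <cstdint>

  // Split one 3-byte group into four 6-bit Base64 indices; this is roughly
  // what the ushr/shl/orr sequence above computes in every byte lane.
  static inline void split_3_to_4(uint8_t b0, uint8_t b1, uint8_t b2,
                                  uint8_t idx[4]) {
    idx[0] = b0 >> 2;                         // top 6 bits of b0
    idx[1] = ((b0 & 0x03) << 4) | (b1 >> 4);  // low 2 of b0, top 4 of b1
    idx[2] = ((b1 & 0x0F) << 2) | (b2 >> 6);  // low 4 of b1, top 2 of b2
    idx[3] = b2 & 0x3F;                       // low 6 bits of b2
  }

For example, the input "Man" (0x4D 0x61 0x6E) yields the indices 19, 22, 5 and 46, which the toBase64 table below maps to "TWFu".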
4767    /**
4768    *  Arguments:
4769    *
4770    *  Input:
4771    *  c_rarg0   - src_start
4772    *  c_rarg1   - src_offset
4773   *  c_rarg2   - src_end (exclusive end offset into src)
4774    *  c_rarg3   - dest_start
4775    *  c_rarg4   - dest_offset
4776    *  c_rarg5   - isURL
4777    *
4778    */
4779   address generate_base64_encodeBlock() {
4780 
4781     static const char toBase64[64] = {
4782       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
4783       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
4784       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
4785       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
4786       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
4787     };
4788 
4789     static const char toBase64URL[64] = {
4790       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
4791       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
4792       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
4793       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
4794       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
4795     };
4796 
4797     __ align(CodeEntryAlignment);
4798     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
4799     address start = __ pc();
4800 
4801     Register src   = c_rarg0;  // source array
4802     Register soff  = c_rarg1;  // source start offset
4803     Register send  = c_rarg2;  // source end offset
4804     Register dst   = c_rarg3;  // dest array
4805     Register doff  = c_rarg4;  // position for writing to dest array
4806     Register isURL = c_rarg5;  // Base64 or URL character set
4807 
4808     // c_rarg6 and c_rarg7 are free to use as temps
4809     Register codec  = c_rarg6;
4810     Register length = c_rarg7;
4811 
4812     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
4813 
4814     __ add(src, src, soff);
4815     __ add(dst, dst, doff);
4816     __ sub(length, send, soff);
4817 
4818     // load the codec base address
4819     __ lea(codec, ExternalAddress((address) toBase64));
4820     __ cbz(isURL, ProcessData);
4821     __ lea(codec, ExternalAddress((address) toBase64URL));
4822 
4823     __ BIND(ProcessData);
4824 
4825     // input too short for the SIMD loop; fall back to the scalar tail
4826     __ cmp(length, (u1)24);
4827     __ br(Assembler::LT, Process3B);
4828 
4829     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
4830 
4831     __ BIND(Process48B);
4832     __ cmp(length, (u1)48);
4833     __ br(Assembler::LT, Process24B);
4834     generate_base64_encode_simdround(src, dst, v0, 16);
4835     __ sub(length, length, 48);
4836     __ b(Process48B);
4837 
4838     __ BIND(Process24B);
4839     __ cmp(length, (u1)24);
4840     __ br(Assembler::LT, SIMDExit);
4841     generate_base64_encode_simdround(src, dst, v0, 8);
4842     __ sub(length, length, 24);
4843 
4844     __ BIND(SIMDExit);
4845     __ cbz(length, Exit);
4846 
4847     __ BIND(Process3B);
4848     //  3 src bytes, 24 bits
4849     __ ldrb(r10, __ post(src, 1));
4850     __ ldrb(r11, __ post(src, 1));
4851     __ ldrb(r12, __ post(src, 1));
4852     __ orrw(r11, r11, r10, Assembler::LSL, 8);
4853     __ orrw(r12, r12, r11, Assembler::LSL, 8);
4854     // codec index
4855     __ ubfmw(r15, r12, 18, 23);
4856     __ ubfmw(r14, r12, 12, 17);
4857     __ ubfmw(r13, r12, 6,  11);
4858     __ andw(r12,  r12, 63);
4859     // look up the output characters in the codec table
4860     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
4861     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
4862     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
4863     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
4864     __ strb(r15, __ post(dst, 1));
4865     __ strb(r14, __ post(dst, 1));
4866     __ strb(r13, __ post(dst, 1));
4867     __ strb(r12, __ post(dst, 1));
4868     __ sub(length, length, 3);
4869     __ cbnz(length, Process3B);
4870 
4871     __ BIND(Exit);
4872     __ ret(lr);
4873 
4874     return start;
4875   }
4876 
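Note that the stub handles only whole 3-byte groups: both SIMD round sizes (48 and 24 bytes) are multiples of 3, and the Process3B tail subtracts 3 from length until it reaches zero, so the caller is expected to pass a source range whose length is a multiple of 3 and to handle any final partial group and '=' padding itself. A rough scalar model of that contract (illustrative code, not the generated stub):

  #include <cstddef>
  #include <cstdint>

  // Scalar reference for the stub's contract: encode (send - soff) source
  // bytes, assumed to be a multiple of 3, writing 4 output bytes per group.
  static void encode_block_ref(const uint8_t* src, size_t soff, size_t send,
                               uint8_t* dst, size_t doff, bool isURL) {
    static const char std_tbl[65] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    static const char url_tbl[65] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
    const char* codec = isURL ? url_tbl : std_tbl;

    const uint8_t* s = src + soff;
    uint8_t*       d = dst + doff;
    for (size_t length = send - soff; length >= 3; length -= 3) {
      uint32_t bits = (uint32_t)s[0] << 16 | (uint32_t)s[1] << 8 | s[2];
      d[0] = codec[(bits >> 18) & 0x3F];
      d[1] = codec[(bits >> 12) & 0x3F];
      d[2] = codec[(bits >>  6) & 0x3F];
      d[3] = codec[ bits        & 0x3F];
      s += 3;
      d += 4;
    }
  }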
4877   // Continuation point for throwing of implicit exceptions that are
4878   // not handled in the current activation. Fabricates an exception
4879   // oop and initiates normal exception dispatching in this
4880   // frame. Since we need to preserve callee-saved values (currently
4881   // only for C2, but done for C1 as well) we need a callee-saved oop
4882   // map and therefore have to make these stubs into RuntimeStubs
4883   // rather than BufferBlobs.  If the compiler needs all registers to
4884   // be preserved between the fault point and the exception handler
4885   // then it must assume responsibility for that in
4886   // AbstractCompiler::continuation_for_implicit_null_exception or
4887   // continuation_for_implicit_division_by_zero_exception. All other
4888   // implicit exceptions (e.g., NullPointerException or
4889   // AbstractMethodError on entry) are either at call sites or
4890   // otherwise assume that stack unwinding will be initiated, so
4891   // caller saved registers were assumed volatile in the compiler.
4892 
4893 #undef __
4894 #define __ masm->
4895 
4896   address generate_throw_exception(const char* name,


5908       StubRoutines::_mulAdd = generate_mulAdd();
5909     }
5910 
5911     if (UseMontgomeryMultiplyIntrinsic) {
5912       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5913       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5914       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5915     }
5916 
5917     if (UseMontgomerySquareIntrinsic) {
5918       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5919       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5920       // We use generate_multiply() rather than generate_square()
5921       // because it's faster for the sizes of modulus we care about.
5922       StubRoutines::_montgomerySquare = g.generate_multiply();
5923     }
5924 
5925     // generate GHASH intrinsics code
5926     if (UseGHASHIntrinsics) {
5927       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5928     }
5929 
5930     if (UseBASE64Intrinsics) {
5931       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
5932     }
5933 
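Once registered, the generated stub is reachable through the usual StubRoutines accessor; intrinsic expansion is expected to check for a non-null address and fall back to the pure-Java encoder otherwise. A sketch of that common pattern, assuming the standard base64_encodeBlock() accessor (not the actual compiler code):

  // Hypothetical helper illustrating the usual guard: _base64_encodeBlock
  // stays NULL when UseBASE64Intrinsics is off or the platform provides no
  // generator, so intrinsic expansion must test the address before use.
  static bool base64_encode_stub_available() {
    return StubRoutines::base64_encodeBlock() != NULL;
  }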
5934     if (UseAESIntrinsics) {
5935       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5936       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5937       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5938       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5939     }
5940 
5941     if (UseSHA1Intrinsics) {
5942       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5943       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5944     }
5945     if (UseSHA256Intrinsics) {
5946       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5947       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5948     }
5949 
5950     // generate Adler32 intrinsics code
5951     if (UseAdler32Intrinsics) {

