    ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                   /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                   /*temps*/v6, v20, v18, v21);
    // Reduce v7:v5 by the field polynomial
    ghash_reduce(v0, v5, v7, v26, vzr, v20);

    __ sub(blocks, blocks, 1);
    __ cbnz(blocks, L_ghash_loop);
    }

    // At this point the bit-reversed result is in v0; undo the
    // reflection before storing the new state.
    __ rev64(v1, __ T16B, v0);
    __ rbit(v1, __ T16B, v1);

    __ st1(v1, __ T16B, state);
    __ ret(lr);

    return start;
  }
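
  // For illustration, a minimal scalar sketch of what one trip through
  // L_ghash_loop computes: state = (state ^ block) * H in GF(2^128),
  // reduced mod x^128 + x^7 + x^2 + x + 1. The helper below is
  // hypothetical (not part of the stub) and uses the right-shift
  // algorithm of NIST SP 800-38D; the stub instead bit-reflects its
  // operands (rev64/rbit) so the same product can be formed with
  // pmull/pmull2 carry-less multiplies plus a Karatsuba fold, followed
  // by ghash_reduce for the modular reduction.
  //
  //   #include <stdint.h>
  //   // x[0]/h[0] are the most-significant words, GCM bit order
  //   static void ghash_block_reference(uint64_t state[2],
  //                                     const uint64_t h[2],
  //                                     const uint64_t block[2]) {
  //     uint64_t x[2] = { state[0] ^ block[0], state[1] ^ block[1] };
  //     uint64_t z[2] = { 0, 0 };
  //     uint64_t v[2] = { h[0], h[1] };
  //     for (int i = 0; i < 128; i++) {
  //       if ((x[i / 64] >> (63 - (i % 64))) & 1) { // bit i of x, MSB first
  //         z[0] ^= v[0]; z[1] ^= v[1];
  //       }
  //       uint64_t lsb = v[1] & 1;                  // v = v >> 1, folding the
  //       v[1] = (v[1] >> 1) | (v[0] << 63);        // dropped bit back in via
  //       v[0] >>= 1;                               // R = 0xE1 << 120
  //       if (lsb) v[0] ^= 0xE100000000000000ULL;
  //     }
  //     state[0] = z[0]; state[1] = z[1];
  //   }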

  void generate_base64_encode_simdround(Register src, Register dst,
        FloatRegister codec, u8 size) {

    FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
    FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
    FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    // De-interleaving load: lane i of in0/in1/in2 receives source bytes
    // 3*i, 3*i + 1 and 3*i + 2
    __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

    // Reshape each 3-byte group into four 6-bit codec indices, one per
    // lane (see the scalar equivalent after this method)
    __ ushr(ind0, arrangement, in0, 2);          // ind0 = in0 >> 2

    __ ushr(ind1, arrangement, in1, 2);
    __ shl(in0,   arrangement, in0, 6);
    __ orr(ind1,  arrangement, ind1, in0);
    __ ushr(ind1, arrangement, ind1, 2);         // ind1 = ((in0 & 3) << 4) | (in1 >> 4)

    __ ushr(ind2, arrangement, in2, 4);
    __ shl(in1,   arrangement, in1, 4);
    __ orr(ind2,  arrangement, in1, ind2);
    __ ushr(ind2, arrangement, ind2, 2);         // ind2 = ((in1 & 0xf) << 2) | (in2 >> 6)

    __ shl(ind3,  arrangement, in2, 2);
    __ ushr(ind3, arrangement, ind3, 2);         // ind3 = in2 & 0x3f

    // Translate each index through the 64-byte codec table held in four
    // vector registers
    __ tbl(out0, arrangement, codec, 4, ind0);
    __ tbl(out1, arrangement, codec, 4, ind1);
    __ tbl(out2, arrangement, codec, 4, ind2);
    __ tbl(out3, arrangement, codec, 4, ind3);

    // Interleaving store: 4 encoded bytes per 3-byte input group
    __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
  }
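
  // Scalar equivalent of the lane-wise reshaping above, for illustration
  // only (a hypothetical helper, not part of the stub): each group of
  // three input bytes a, b, c yields four 6-bit table indices. The
  // shl/orr/ushr sequence produces the same values as the usual masked
  // forms without needing an AND immediate:
  //
  //   #include <stdint.h>
  //   static inline void base64_indices(uint8_t a, uint8_t b, uint8_t c,
  //                                     uint8_t idx[4]) {
  //     idx[0] = a >> 2;
  //     idx[1] = (uint8_t)((b >> 2) | (a << 6)) >> 2; // ((a & 3) << 4) | (b >> 4)
  //     idx[2] = (uint8_t)((b << 4) | (c >> 4)) >> 2; // ((b & 0xf) << 2) | (c >> 6)
  //     idx[3] = (uint8_t)(c << 2) >> 2;              // c & 0x3f
  //   }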

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0  - src_start
   *   c_rarg1  - src_offset
   *   c_rarg2  - src_end (exclusive; the length encoded is src_end - src_offset)
   *   c_rarg3  - dest_start
   *   c_rarg4  - dest_offset
   *   c_rarg5  - isURL
   *
   */
  address generate_base64_encodeBlock() {

    static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "encodeBlock");
    address start = __ pc();

    Register src    = c_rarg0;  // source array
    Register soff   = c_rarg1;  // source start offset
    Register send   = c_rarg2;  // source end offset (exclusive)
    Register dst    = c_rarg3;  // dest array
    Register doff   = c_rarg4;  // position for writing to dest array
    Register isURL  = c_rarg5;  // Base64 or URL character set

    // c_rarg6 and c_rarg7 are free to use as temps
    Register codec  = c_rarg6;
    Register length = c_rarg7;

    Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;

    __ add(src, src, soff);
    __ add(dst, dst, doff);
    __ sub(length, send, soff);

    // load the codec base address
    __ lea(codec, ExternalAddress((address) toBase64));
    __ cbz(isURL, ProcessData);
    __ lea(codec, ExternalAddress((address) toBase64URL));

    __ BIND(ProcessData);

    // Fewer than 24 bytes is too short for a SIMD round; fall back to
    // the scalar 3-bytes-per-iteration loop
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, Process3B);

    // Preload the 64-byte codec table into v0..v3 for tbl lookups
    __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));

    __ BIND(Process48B);
    __ cmp(length, (u1)48);
    __ br(Assembler::LT, Process24B);
    generate_base64_encode_simdround(src, dst, v0, 16);
    __ sub(length, length, 48);
    __ b(Process48B);

    __ BIND(Process24B);
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_encode_simdround(src, dst, v0, 8);
    __ sub(length, length, 24);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);

    __ BIND(Process3B);
    // Gather 3 src bytes (24 bits) into r12
    __ ldrb(r10, __ post(src, 1));
    __ ldrb(r11, __ post(src, 1));
    __ ldrb(r12, __ post(src, 1));
    __ orrw(r11, r11, r10, Assembler::LSL, 8);
    __ orrw(r12, r12, r11, Assembler::LSL, 8);
    // Extract the four 6-bit codec indices
    __ ubfmw(r15, r12, 18, 23);
    __ ubfmw(r14, r12, 12, 17);
    __ ubfmw(r13, r12, 6, 11);
    __ andw(r12, r12, 63);
    // get the code based on the codec
    __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
    __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
    __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
    __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
    __ strb(r15, __ post(dst, 1));
    __ strb(r14, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ sub(length, length, 3);
    __ cbnz(length, Process3B);

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
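
  // For reference, the stub's contract as a hypothetical scalar helper
  // (illustration only, assuming <stdint.h> types and the tables above):
  // encode src[src_offset, src_end) into dst starting at dest_offset,
  // emitting 4 characters per 3 input bytes. The Java caller is expected
  // to pass a range whose length is a multiple of 3; a trailing 1- or
  // 2-byte group and its '=' padding are handled by the caller, not here.
  //
  //   static void encode_block_reference(const uint8_t* src, int soff,
  //                                      int send, uint8_t* dst, int doff,
  //                                      bool is_url) {
  //     const char* codec = is_url ? toBase64URL : toBase64;
  //     for (int i = soff; i + 2 < send; i += 3) {
  //       uint32_t bits = (uint32_t)(src[i] << 16 | src[i + 1] << 8 | src[i + 2]);
  //       dst[doff++] = codec[(bits >> 18) & 0x3f];
  //       dst[doff++] = codec[(bits >> 12) & 0x3f];
  //       dst[doff++] = codec[(bits >>  6) & 0x3f];
  //       dst[doff++] = codec[ bits        & 0x3f];
  //     }
  //   }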

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs. If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller-saved registers were assumed volatile in the compiler.

#undef __
#define __ masm->

  address generate_throw_exception(const char* name,

  // ...

      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // generate Base64 intrinsics code
    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {