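    // Tail of the GHASH block loop: combine the partial results into the two
    // 64-bit state words that are stored back below.  O0 = O2 | O3, and
    // O1 = ((O5 << 1) | (G1 >>> 63)) ^ G3, i.e. the upper word of {O5:G1}
    // shifted left by one bit and xor-ed with G3 (the one-bit shift is,
    // presumably, the usual adjustment for GHASH's bit-reflected operands).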
    __ or3(O2, O3, O0);

    __ sllx(O5, 1, O1);
    __ srlx(G1, 63, O2);
    __ or3(O1, O2, O1);
    __ xor3(O1, G3, O1);

    __ deccc(len);
    __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
    __ delayed()->add(data, 16, data);

    __ stx(O0, I0, 0);
    __ stx(O1, I0, 8);

    __ ret();
    __ delayed()->restore();

    return start;
  }

#define CHUNK_LEN   128          /* 128 x 8B = 1KB */
#define CHUNK_K1    0x1307a0206  /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
#define CHUNK_K2    0x1a0f717c4  /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
#define CHUNK_K3    0x0170076fa  /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
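
// A sketch of why these constants work (the formulas above are from the source;
// the narration here is an interpretation): CRC32C is linear over GF(2), so the
// CRC of four concatenated 1KB chunks can be computed as four independent running
// CRCs that are folded afterwards.  Each chunk's partial CRC is carry-less
// multiplied (SPARC xmulx) by roughly x^(8*N) mod P(x), where N is the number of
// bytes that follow it; the exact exponents, including the -32 absorbed by the
// final crc32c reduction, are the ones spelled out above.  CHUNK_K1/K2/K3 are
// those factors for chunks trailed by three, two and one further chunks,
// bit-reversed and shifted left by one to match CRC32C's reflected bit order,
// and they are applied in the "combine the results of 4 chunks" step below.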

  /**
   * Arguments:
   *
   * Inputs:
   *   O0 - int   crc
   *   O1 - byte* buf
   *   O2 - int   len
   *   O3 - int*  table
   *
   * Output:
   *   O0 - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "need CRC32C instruction");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
    address start = __ pc();

    const Register crc   = O0;  // crc
    const Register buf   = O1;  // source java byte array address
    const Register len   = O2;  // number of bytes
    const Register table = O3;  // byteTable

    Label L_crc32c_head, L_crc32c_aligned;
    Label L_crc32c_parallel, L_crc32c_parallel_loop;
    Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
    Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;

    __ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);

    // clear upper 32 bits of crc
    __ clruwu(crc);

    __ and3(buf, 7, G4);
    __ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);

    __ mov(8, G1);
    __ sub(G1, G4, G4);

    // ------ process the misaligned head (7 bytes or less) ------
    __ BIND(L_crc32c_head);

    // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
    __ ldub(buf, 0, G1);
    __ update_byte_crc32(crc, G1, table);

    __ inc(buf);
    __ dec(len);
    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
    __ dec(G4);
    __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);

    // ------ process the 8-byte-aligned body ------
    __ BIND(L_crc32c_aligned);
    __ nop();
    __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);

    // reverse the byte order of lower 32 bits to big endian, and move to FP side
    __ movitof_revbytes(crc, F0, G1, G3);

    __ set(CHUNK_LEN*8*4, G4);
    __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);

    // ------ process four 1KB chunks in parallel ------
    __ BIND(L_crc32c_parallel);

    __ fzero(FloatRegisterImpl::D, F2);
    __ fzero(FloatRegisterImpl::D, F4);
    __ fzero(FloatRegisterImpl::D, F6);

    __ mov(CHUNK_LEN - 1, G4);
    __ BIND(L_crc32c_parallel_loop);
    // schedule ldf's ahead of crc32c's to hide the load-use latency
    __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
    __ crc32c(F0, F8,  F0);
    __ crc32c(F2, F10, F2);
    __ crc32c(F4, F12, F4);
    __ crc32c(F6, F14, F6);
    __ inc(buf, 8);
    __ dec(G4);
    __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);

    __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
    __ crc32c(F0, F8,  F0);
    __ crc32c(F2, F10, F2);
    __ crc32c(F4, F12, F4);

    __ inc(buf, CHUNK_LEN*24);
    __ ldfl(FloatRegisterImpl::D, buf, G0, F14);  // load in little endian
    __ inc(buf, 8);

    __ prefetch(buf, 0,            Assembler::severalReads);
    __ prefetch(buf, CHUNK_LEN*8,  Assembler::severalReads);
    __ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
    __ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);

    // move to INT side, and reverse the byte order of lower 32 bits to little endian
    __ movftoi_revbytes(F0, O4, G1, G4);
    __ movftoi_revbytes(F2, O5, G1, G4);
    __ movftoi_revbytes(F4, G5, G1, G4);

    // combine the results of 4 chunks
    __ set64(CHUNK_K1, G3, G1);
    __ xmulx(O4, G3, O4);
    __ set64(CHUNK_K2, G3, G1);
    __ xmulx(O5, G3, O5);
    __ set64(CHUNK_K3, G3, G1);
    __ xmulx(G5, G3, G5);

    __ movdtox(F14, G4);
    __ xor3(O4, O5, O5);
    __ xor3(G5, O5, O5);
    __ xor3(G4, O5, O5);

    // reverse the byte order to big endian, via stack, and move to FP side
    __ add(SP, -8, G1);
    __ srlx(G1, 3, G1);
    __ sllx(G1, 3, G1);
    __ stx(O5, G1, G0);
    __ ldfl(FloatRegisterImpl::D, G1, G0, F2);  // load in little endian

    __ crc32c(F6, F2, F0);

    __ set(CHUNK_LEN*8*4, G4);
    __ sub(len, G4, len);
    __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
    __ nop();
    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);

    __ BIND(L_crc32c_serial);

    __ mov(32, G4);
    __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);

    // ------ process 32B chunks ------
    __ BIND(L_crc32c_x32_loop);
    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
    __ inc(buf, 8);
    __ crc32c(F0, F2, F0);
    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
    __ inc(buf, 8);
    __ crc32c(F0, F2, F0);
    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
    __ inc(buf, 8);
    __ crc32c(F0, F2, F0);
    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
    __ inc(buf, 8);
    __ crc32c(F0, F2, F0);
    __ dec(len, 32);
    __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);

    __ BIND(L_crc32c_x8);
    __ nop();
    __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);

    // ------ process 8B chunks ------
    __ BIND(L_crc32c_x8_loop);
    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
    __ inc(buf, 8);
    __ crc32c(F0, F2, F0);
    __ dec(len, 8);
    __ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);

    __ BIND(L_crc32c_done);

    // move to INT side, and reverse the byte order of lower 32 bits to little endian
    __ movftoi_revbytes(F0, crc, G1, G3);

    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
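
    // Both the misaligned-head loop above and the misaligned-tail loop below use
    // the table-driven byte-at-a-time update noted in their comments.  As a plain
    // C sketch (helper name is illustrative, not from this file):
    //
    //   static inline uint32_t crc32c_update_byte(uint32_t crc, uint8_t b,
    //                                             const uint32_t byteTable[256]) {
    //     return (crc >> 8) ^ byteTable[(crc ^ b) & 0xFF];
    //   }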

    // ------ process the misaligned tail (7 bytes or less) ------
    __ BIND(L_crc32c_tail);

    // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
    __ ldub(buf, 0, G1);
    __ update_byte_crc32(crc, G1, table);

    __ inc(buf);
    __ dec(len);
    __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);

    __ BIND(L_crc32c_return);
    __ nop();
    __ retl();
    __ delayed()->nop();

    return start;
  }

  void generate_initial() {
    // Generates all stubs and initializes the entry points

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry   = generate_catch_exception();

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific
    StubRoutines::Sparc::_test_stop_entry  = generate_test_stop();

    StubRoutines::Sparc::_stop_subroutine_entry                  = generate_stop_subroutine();
    StubRoutines::Sparc::_flush_callers_register_windows_entry   = generate_flush_callers_register_windows();

#if !defined(COMPILER2) && !defined(_LP64)
    // ... (elided) ...

      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
    }
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // generate SHA1/SHA256/SHA512 intrinsics code
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }

    // generate CRC32C intrinsic code
    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }
  }


 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    // replace the standard masm with a special one:
    _masm = new MacroAssembler(code);

    _stub_count = !all ? 0x100 : 0x200;
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }

    // make sure this stub is available for all local calls
    if (_atomic_add_stub.is_unbound()) {
      // generate a second time, if necessary
      (void) generate_atomic_add();
    }