src/cpu/sparc/vm/stubGenerator_sparc.cpp

4893       __ or3(O2, O3, O0);
4894 
4895       __ sllx(O5, 1, O1);
4896       __ srlx(G1, 63, O2);
4897       __ or3(O1, O2, O1);
4898       __ xor3(O1, G3, O1);
4899 
4900       __ deccc(len);
4901       __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
4902       __ delayed()->add(data, 16, data);
4903 
4904       __ stx(O0, I0, 0);
4905       __ stx(O1, I0, 8);
4906 
4907       __ ret();
4908       __ delayed()->restore();
4909 
4910       return start;
4911   }
4912 
4913   void generate_initial() {
4914     // Generates all stubs and initializes the entry points
4915 
4916     //------------------------------------------------------------------------------------------------------------------------
4917     // entry points that exist in all platforms
4918     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
4919     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
4920     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
4921 
4922     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
4923     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
4924 
4925     //------------------------------------------------------------------------------------------------------------------------
4926     // entry points that are platform specific
4927     StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
4928 
4929     StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
4930     StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
4931 
4932 #if !defined(COMPILER2) && !defined(_LP64)


4984       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
4985     }
4986     // generate GHASH intrinsics code
4987     if (UseGHASHIntrinsics) {
4988       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4989     }
4990 
4991     // generate SHA1/SHA256/SHA512 intrinsics code
4992     if (UseSHA1Intrinsics) {
4993       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4994       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4995     }
4996     if (UseSHA256Intrinsics) {
4997       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4998       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4999     }
5000     if (UseSHA512Intrinsics) {
5001       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
5002       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
5003     }
5004   }
5005 
5006 
5007  public:
5008   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5009     // replace the standard masm with a special one:
5010     _masm = new MacroAssembler(code);
5011 
5012     _stub_count = !all ? 0x100 : 0x200;
5013     if (all) {
5014       generate_all();
5015     } else {
5016       generate_initial();
5017     }
5018 
5019     // make sure this stub is available for all local calls
5020     if (_atomic_add_stub.is_unbound()) {
5021       // generate a second time, if necessary
5022       (void) generate_atomic_add();
5023     }




4893       __ or3(O2, O3, O0);
4894 
4895       __ sllx(O5, 1, O1);
4896       __ srlx(G1, 63, O2);
4897       __ or3(O1, O2, O1);
4898       __ xor3(O1, G3, O1);
4899 
4900       __ deccc(len);
4901       __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
4902       __ delayed()->add(data, 16, data);
4903 
4904       __ stx(O0, I0, 0);
4905       __ stx(O1, I0, 8);
4906 
4907       __ ret();
4908       __ delayed()->restore();
4909 
4910       return start;
4911   }
4912 
4913 #define CHUNK_LEN       128             /* 128 x 8B = 1KB */
4914 #define CHUNK_K1        0x1307a0206     /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
4915 #define CHUNK_K2        0x1a0f717c4     /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
4916 #define CHUNK_K3        0x0170076fa     /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
4917 
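The three CHUNK_K constants are the folding multipliers for the 4 x 1KB parallel loop further down: each is x raised to the number of message bits that still follow the corresponding chunk, minus 32, reduced modulo the CRC32C polynomial, then bit-reflected and shifted left by one so it can be fed straight to the 64-bit carry-less multiply (xmulx) in the combine step. The following is a minimal sketch of how such constants can be derived offline; gf2_mulmod, gf2_xpow_mod and reverse_bits32 are illustrative helpers that do not appear in this file, and the concrete values above are taken from the comment rather than re-derived here.

    #include <cstdint>

    // Multiply two GF(2) polynomials of degree < 32 and reduce mod P(x).
    // poly holds P(x) without its implicit x^32 term, e.g. 0x1EDC6F41 for CRC32C.
    static uint32_t gf2_mulmod(uint32_t a, uint32_t b, uint32_t poly) {
      uint64_t full = (1ull << 32) | poly;           // P(x) including the x^32 term
      uint64_t acc  = 0;
      for (int i = 0; i < 32; i++)                   // carry-less multiply
        if ((b >> i) & 1) acc ^= (uint64_t)a << i;
      for (int i = 62; i >= 32; i--)                 // reduce the product mod P(x)
        if ((acc >> i) & 1) acc ^= full << (i - 32);
      return (uint32_t)acc;
    }

    // pow(x, n) mod P(x) by square-and-multiply.
    static uint32_t gf2_xpow_mod(uint64_t n, uint32_t poly) {
      uint32_t r = 1, b = 2;                         // r = x^0, b = x^1
      for (; n != 0; n >>= 1) {
        if (n & 1) r = gf2_mulmod(r, b, poly);
        b = gf2_mulmod(b, b, poly);
      }
      return r;
    }

    static uint32_t reverse_bits32(uint32_t v) {
      uint32_t r = 0;
      for (int i = 0; i < 32; i++) { r = (r << 1) | (v & 1); v >>= 1; }
      return r;
    }

    // Following the comment above, CHUNK_K1 would correspond to:
    //   (uint64_t)reverse_bits32(gf2_xpow_mod(128*8*8*3 - 32, 0x1EDC6F41)) << 1
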
4918   /**
4919    *  Arguments:
4920    *
4921    * Inputs:
4922    *   O0   - int   crc
4923    *   O1   - byte* buf
4924    *   O2   - int   len
4925    *   O3   - int*  table
4926    *
4927    * Output:
4928    *   O0   - int crc result
4929    */
4930   address generate_updateBytesCRC32C() {
4931     assert(UseCRC32CIntrinsics, "need CRC32C instruction");
4932 
4933     __ align(CodeEntryAlignment);
4934     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4935     address start = __ pc();
4936 
4937     const Register crc   = O0;  // crc
4938     const Register buf   = O1;  // source java byte array address
4939     const Register len   = O2;  // number of bytes
4940     const Register table = O3;  // byteTable
4941 
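Read as a C prototype, the register assignments above amount to roughly the following. This is illustrative only: the stub is entered through the JVM's intrinsic plumbing rather than as an ordinary C call, and no such declaration exists in this file.

    // O0 = crc in/out, O1 = buf, O2 = len, O3 = table; result returned in O0
    extern "C" int updateBytesCRC32C(int crc, const unsigned char* buf, int len, const int* table);
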
4942     Label L_crc32c_head, L_crc32c_aligned;
4943     Label L_crc32c_parallel, L_crc32c_parallel_loop;
4944     Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
4945     Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
4946 
4947     __ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);
4948 
4949     // clear upper 32 bits of crc
4950     __ clruwu(crc);
4951 
4952     __ and3(buf, 7, G4);
4953     __ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);
4954 
4955     __ mov(8, G1);
4956     __ sub(G1, G4, G4);
4957 
4958     // ------ process the misaligned head (7 bytes or less) ------
4959     __ BIND(L_crc32c_head);
4960 
4961     // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
4962     __ ldub(buf, 0, G1);
4963     __ update_byte_crc32(crc, G1, table);
4964 
4965     __ inc(buf);
4966     __ dec(len);
4967     __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
4968     __ dec(G4);
4969     __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);
4970 
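The head loop (and the matching tail loop at L_crc32c_tail further down) is the classic byte-at-a-time, table-driven update named in the comment. As a plain C++ sketch, assuming a 256-entry table generated for the reflected CRC32C polynomial (the names here are illustrative, not taken from this file):

    #include <cstdint>
    #include <cstddef>

    static uint32_t crc32c_bytes(uint32_t crc, const uint8_t* buf, size_t len,
                                 const uint32_t table[256]) {
      // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF], applied once per byte
      for (size_t i = 0; i < len; i++)
        crc = (crc >> 8) ^ table[(crc ^ buf[i]) & 0xFF];
      return crc;
    }
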
4971     // ------ process the 8-byte-aligned body ------
4972     __ BIND(L_crc32c_aligned);
4973     __ nop();
4974     __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);
4975 
4976     // reverse the byte order of lower 32 bits to big endian, and move to FP side
4977     __ movitof_revbytes(crc, F0, G1, G3);
4978 
4979     __ set(CHUNK_LEN*8*4, G4);
4980     __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);
4981 
4982     // ------ process four 1KB chunks in parallel ------
4983     __ BIND(L_crc32c_parallel);
4984 
4985     __ fzero(FloatRegisterImpl::D, F2);
4986     __ fzero(FloatRegisterImpl::D, F4);
4987     __ fzero(FloatRegisterImpl::D, F6);
4988 
4989     __ mov(CHUNK_LEN - 1, G4);
4990     __ BIND(L_crc32c_parallel_loop);
4991     // schedule ldf's ahead of crc32c's to hide the load-use latency
4992     __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
4993     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
4994     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
4995     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
4996     __ crc32c(F0, F8,  F0);
4997     __ crc32c(F2, F10, F2);
4998     __ crc32c(F4, F12, F4);
4999     __ crc32c(F6, F14, F6);
5000     __ inc(buf, 8);
5001     __ dec(G4);
5002     __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
5003 
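The loop above maintains four independent CRC streams in F0, F2, F4 and F6, each consuming one 8-byte word per iteration from its own 1KB sub-buffer, so the four crc32c instructions in the loop body never depend on one another and the ldf loads can run ahead of them. Stripped of register allocation and scheduling, the structure is roughly the sketch below, where step stands in for one hardware CRC32C step over a 64-bit word; the F0 stream is seeded with the running CRC, the other three start at zero, and the very last word of the fourth stream is in fact loaded raw just below rather than run through step.

    #include <cstdint>

    // Illustrative four-stream structure of the 4KB chunked loop above.
    // `step` is a stand-in for one hardware crc32c step over a 64-bit word.
    static void crc32c_four_streams(uint64_t step(uint64_t crc, uint64_t word),
                                    const uint64_t* p,   // 4KB = 4 * 128 words
                                    uint64_t c[4]) {
      for (int i = 0; i < 128; i++) {       // CHUNK_LEN iterations
        c[0] = step(c[0], p[i]);            // bytes [0KB, 1KB)
        c[1] = step(c[1], p[i + 128]);      // bytes [1KB, 2KB)
        c[2] = step(c[2], p[i + 256]);      // bytes [2KB, 3KB)
        c[3] = step(c[3], p[i + 384]);      // bytes [3KB, 4KB)
      }
    }
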
5004     __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
5005     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
5006     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
5007     __ crc32c(F0, F8,  F0);
5008     __ crc32c(F2, F10, F2);
5009     __ crc32c(F4, F12, F4);
5010 
5011     __ inc(buf, CHUNK_LEN*24);
5012     __ ldfl(FloatRegisterImpl::D, buf, G0, F14);        // load in little endian
5013     __ inc(buf, 8);
5014 
5015     __ prefetch(buf, 0,            Assembler::severalReads);
5016     __ prefetch(buf, CHUNK_LEN*8,  Assembler::severalReads);
5017     __ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
5018     __ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
5019 
5020     // move to INT side, and reverse the byte order of lower 32 bits to little endian
5021     __ movftoi_revbytes(F0, O4, G1, G4);
5022     __ movftoi_revbytes(F2, O5, G1, G4);
5023     __ movftoi_revbytes(F4, G5, G1, G4);
5024 
5025     // combine the results of 4 chunks
5026     __ set64(CHUNK_K1, G3, G1);
5027     __ xmulx(O4, G3, O4);
5028     __ set64(CHUNK_K2, G3, G1);
5029     __ xmulx(O5, G3, O5);
5030     __ set64(CHUNK_K3, G3, G1);
5031     __ xmulx(G5, G3, G5);
5032 
5033     __ movdtox(F14, G4);
5034     __ xor3(O4, O5, O5);
5035     __ xor3(G5, O5, O5);
5036     __ xor3(G4, O5, O5);
5037 
5038     // reverse the byte order to big endian, via stack, and move to FP side
5039     __ add(SP, -8, G1);
5040     __ srlx(G1, 3, G1);
5041     __ sllx(G1, 3, G1);
5042     __ stx(O5, G1, G0);
5043     __ ldfl(FloatRegisterImpl::D, G1, G0, F2);  // load in little endian
5044 
5045     __ crc32c(F6, F2, F0);
5046 
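Informally, the fold just performed is the standard CRC chunk-combination identity. With L = 1024 bytes per chunk and all arithmetic in GF(2)[x] modulo the CRC32C polynomial:

    crc(A || B || C || D)  =  crcA * x^(3*8*L)  ^  crcB * x^(2*8*L)  ^  crcC * x^(8*L)  ^  crcD

The three xmulx instructions perform the polynomial multiplications, with the -32 offset, bit reflection and <<1 already baked into CHUNK_K1..K3; the xor3 chain accumulates the products together with the raw final 8 bytes of the fourth chunk (moved from F14 via G4); and the single crc32c(F6, F2, F0) above then consumes that last word of the fourth chunk while reducing the accumulated products back to a 32-bit CRC. This reading is inferred from the code, not stated in it.
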
5047     __ set(CHUNK_LEN*8*4, G4);
5048     __ sub(len, G4, len);
5049     __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
5050     __ nop();
5051     __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
5052 
5053     __ BIND(L_crc32c_serial);
5054 
5055     __ mov(32, G4);
5056     __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
5057 
5058     // ------ process 32B chunks ------
5059     __ BIND(L_crc32c_x32_loop);
5060     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
5061     __ inc(buf, 8);
5062     __ crc32c(F0, F2, F0);
5063     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
5064     __ inc(buf, 8);
5065     __ crc32c(F0, F2, F0);
5066     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
5067     __ inc(buf, 8);
5068     __ crc32c(F0, F2, F0);
5069     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
5070     __ inc(buf, 8);
5071     __ crc32c(F0, F2, F0);
5072     __ dec(len, 32);
5073     __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);
5074 
5075     __ BIND(L_crc32c_x8);
5076     __ nop();
5077     __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);
5078 
5079     // ------ process 8B chunks ------
5080     __ BIND(L_crc32c_x8_loop);
5081     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
5082     __ inc(buf, 8);
5083     __ crc32c(F0, F2, F0);
5084     __ dec(len, 8);
5085     __ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);
5086 
5087     __ BIND(L_crc32c_done);
5088 
5089     // move to INT side, and reverse the byte order of lower 32 bits to little endian
5090     __ movftoi_revbytes(F0, crc, G1, G3);
5091 
5092     __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
5093 
5094     // ------ process the misaligned tail (7 bytes or less) ------
5095     __ BIND(L_crc32c_tail);
5096 
5097     // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
5098     __ ldub(buf, 0, G1);
5099     __ update_byte_crc32(crc, G1, table);
5100 
5101     __ inc(buf);
5102     __ dec(len);
5103     __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);
5104 
5105     __ BIND(L_crc32c_return);
5106     __ nop();
5107     __ retl();
5108     __ delayed()->nop();
5109 
5110     return start;
5111   }
5112 
5113   void generate_initial() {
5114     // Generates all stubs and initializes the entry points
5115 
5116     //------------------------------------------------------------------------------------------------------------------------
5117     // entry points that exist in all platforms
5118     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
5119     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
5120     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
5121 
5122     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
5123     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
5124 
5125     //------------------------------------------------------------------------------------------------------------------------
5126     // entry points that are platform specific
5127     StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
5128 
5129     StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
5130     StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
5131 
5132 #if !defined(COMPILER2) && !defined(_LP64)


5184       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5185     }
5186     // generate GHASH intrinsics code
5187     if (UseGHASHIntrinsics) {
5188       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5189     }
5190 
5191     // generate SHA1/SHA256/SHA512 intrinsics code
5192     if (UseSHA1Intrinsics) {
5193       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5194       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5195     }
5196     if (UseSHA256Intrinsics) {
5197       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5198       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5199     }
5200     if (UseSHA512Intrinsics) {
5201       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
5202       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
5203     }
5204 
5205     // generate CRC32C intrinsic code
5206     if (UseCRC32CIntrinsics) {
5207       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5208     }
5209   }
5210 
5211 
5212  public:
5213   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5214     // replace the standard masm with a special one:
5215     _masm = new MacroAssembler(code);
5216 
5217     _stub_count = !all ? 0x100 : 0x200;
5218     if (all) {
5219       generate_all();
5220     } else {
5221       generate_initial();
5222     }
5223 
5224     // make sure this stub is available for all local calls
5225     if (_atomic_add_stub.is_unbound()) {
5226       // generate a second time, if necessary
5227       (void) generate_atomic_add();
5228     }

