    // ------ process the misaligned tail (7 bytes or less) ------
    __ BIND(L_crc32c_tail);

    // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
    __ ldub(buf, 0, G1);
    __ update_byte_crc32(crc, G1, table);

    __ inc(buf);
    __ dec(len);
    __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);

    __ BIND(L_crc32c_return);
    __ nop();
    __ retl();
    __ delayed()->nop();

    return start;
  }
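
  // Reference model for the Adler-32 stub below -- an illustrative scalar
  // sketch only (this helper is hypothetical and not called by the VM; the
  // generated stub computes the same function with a 16-way unrolled,
  // software-pipelined loop and replaces '%' with a divide/multiply/subtract
  // sequence, since SPARC has no integer remainder instruction).
  static jint adler32_reference(jint adler, const jubyte* buff, juint len) {
    juint s1 = adler & 0xFFFF;          // low 16 bits: running byte sum
    juint s2 = (adler >> 16) & 0xFFFF;  // high 16 bits: running sum of s1
    while (len-- > 0) {
      s1 = (s1 + *buff++) % 65521;      // BASE = 65521 (0xFFF1), largest prime < 2^16
      s2 = (s2 + s1)      % 65521;
    }
    return (jint)((s2 << 16) | s1);
  }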

#define ADLER32_NUM_TEMPS 16

  /**
   *  Arguments:
   *
   *  Inputs:
   *    O0 - int   adler
   *    O1 - byte* buff
   *    O2 - int   len
   *
   *  Output:
   *    O0 - int   adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_cleanup_loop, L_cleanup_loop_check;
    Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
    Label L_nmax_check_done;

    // Aliases
    Register s1   = O0;
    Register s2   = O3;
    Register buff = O1;
    Register len  = O2;
    Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    unsigned long NMAX = 0x15B0;

    // Zero-out the upper bits of len
    __ clruwu(len);

    // Create the mask 0xFFFF
    __ set64(0x00FFFF, O4, O5); // O5 is the temp register

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ srlx(O0, 16, O5); // adler >> 16
    __ and3(O0, O4, s1); // s1 = (adler & 0xFFFF)
    __ and3(O5, O4, s2); // s2 = ((adler >> 16) & 0xFFFF)

    // The pipelined loop needs at least 16 elements for one iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    // Setup the constant for cutoff checking
    __ mov(15, O4);

    // Check if we are above the cutoff, if not go to the cleanup loop immediately
    __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);

    // Free up some registers for our use
    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
      __ movxtod(temp[i], as_FloatRegister(2*i));
    }

    // Loop maintenance is done at the end of the loop, so skip ahead to there
    __ ba_short(L_main_loop_check);

    __ BIND(L_main_loop);

    // Prologue for inner loop
    __ ldub(buff, 0, L0);
    __ dec(O5);

    for (int i = 1; i < 8; i++) {
      __ ldub(buff, i, temp[i]);
    }

    __ inc(buff, 8);

    // The inner loop processes 16 elements at a time and might never execute
    // if the outer loop only has 16 elements to process
    __ ba_short(L_inner_loop_check);

    __ BIND(L_inner_loop);

    for (int i = 0; i < 8; i++) {
      __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
      __ add(s1, temp[i], s1);
      __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
      __ add(s2, s1, s2);
    }

    // Original temp 0-7 used and new loads to temp 0-7 issued
    // temp 8-15 ready to be consumed
    __ add(s1, I0, s1);
    __ dec(O5);
    __ add(s2, s1, s2);
    __ add(s1, I1, s1);
    __ inc(buff, 16);
    __ add(s2, s1, s2);

    for (int i = 0; i < 6; i++) {
      __ add(s1, temp[10+i], s1);
      __ add(s2, s1, s2);
    }

    __ BIND(L_inner_loop_check);
    __ nop();
    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);

    // Epilogue
    for (int i = 0; i < 4; i++) {
      __ ldub(buff, (2*i), temp[8+(2*i)]);
      __ add(s1, temp[i], s1);
      __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
      __ add(s2, s1, s2);
    }

    __ add(s1, temp[4], s1);
    __ inc(buff, 8);

    for (int i = 0; i < 11; i++) {
      __ add(s2, s1, s2);
      __ add(s1, temp[5+i], s1);
    }

    __ add(s2, s1, s2);

    // Take the mod for s1 and s2
    __ set64(0xFFF1, L0, L1);
    __ udivx(s1, L0, L1);
    __ udivx(s2, L0, L2);
    __ mulx(L0, L1, L1);
    __ mulx(L0, L2, L2);
    __ sub(s1, L1, s1);
    __ sub(s2, L2, s2);

    // Make sure there is something left to process
    __ BIND(L_main_loop_check);
    __ set64(NMAX, L0, L1);
    // k = len < NMAX ? len : NMAX
    __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
    __ andn(len, 0x0F, L0); // only loop a multiple of 16 times
    __ BIND(L_nmax_check_done);
    __ mov(L0, O5);
    __ sub(len, L0, len); // len -= k

    __ srlx(O5, 4, O5); // multiples of 16
    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);

    // Restore the registers we saved, take the mod one last time, combine and return
    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
      __ movdtox(as_FloatRegister(2*i), temp[i]);
    }

    // There might be nothing left to process
    __ ba_short(L_cleanup_loop_check);

    __ BIND(L_cleanup_loop);
    __ ldub(buff, 0, O4); // load a single byte from the buffer
    __ inc(buff);         // buff++
    __ add(s1, O4, s1);   // s1 += *buff++;
    __ dec(len);          // len--
    __ add(s1, s2, s2);   // s2 += s1;
    __ BIND(L_cleanup_loop_check);
    __ nop();
    __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);

    // Take the mod one last time
    __ set64(0xFFF1, O1, O2);
    __ udivx(s1, O1, O2);
    __ udivx(s2, O1, O5);
    __ mulx(O1, O2, O2);
    __ mulx(O1, O5, O5);
    __ sub(s1, O2, s1);
    __ sub(s2, O5, s2);

    // Combine lower bits and higher bits
    __ sllx(s2, 16, s2); // s2 = s2 << 16
    __ or3(s1, s2, s1);  // adler = s2 | s1
    // Final return value is in O0
    __ retl();
    __ delayed()->nop();

    return start;
  }
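
  // Note on the two reduction sequences above: SPARCv9 provides udivx/mulx
  // but no integer remainder instruction, so x % BASE is computed with a
  // divide, a multiply and a subtract. A minimal sketch of the identity in
  // use (hypothetical helper for illustration only, not called anywhere):
  static julong mod_base(julong x) {
    julong q = x / 0xFFF1;   // udivx    : q = x / 65521
    return x - q * 0xFFF1;   // mulx, sub: x - q*65521 == x % 65521
  }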

  void generate_initial() {
    // Generates all stubs and initializes the entry points

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry   = generate_catch_exception();

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific
    StubRoutines::Sparc::_test_stop_entry  = generate_test_stop();

    StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine();
    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();

#if !defined(COMPILER2) && !defined(_LP64)
    // ...
    }

    // generate SHA1/SHA256/SHA512 intrinsics code
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }

    // generate CRC32C intrinsic code
    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
  }


 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    // replace the standard masm with a special one:
    _masm = new MacroAssembler(code);

    _stub_count = !all ? 0x100 : 0x200;
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }

    // make sure this stub is available for all local calls
    if (_atomic_add_stub.is_unbound()) {
      // generate a second time, if necessary
      (void) generate_atomic_add();
    }
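
    // (Note: stubs are produced in two batches -- this constructor runs once
    //  with all == false early in VM startup, taking the generate_initial()
    //  path, and again with all == true once the rest of the runtime is up,
    //  taking generate_all(). The Adler32/CRC32C/SHA generators above are
    //  wired up in the initial batch.)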