    // ------ process the misaligned tail (7 bytes or less) ------
    __ BIND(L_crc32c_tail);

    // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
    __ ldub(buf, 0, G1);
    __ update_byte_crc32(crc, G1, table);

    __ inc(buf);
    __ dec(len);
    __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);

    __ BIND(L_crc32c_return);
    __ nop();
    __ retl();
    __ delayed()->nop();

    return start;
  }
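
  // Reference model for the Adler-32 stub below -- an illustrative scalar
  // sketch only (this helper is hypothetical and not called by the VM; the
  // generated stub computes the same function with a 16-way unrolled,
  // software-pipelined loop and replaces '%' with a divide/multiply/subtract
  // sequence, since SPARC has no integer remainder instruction).
  static jint adler32_reference(jint adler, const jubyte* buff, juint len) {
    juint s1 = adler & 0xFFFF;          // low 16 bits: running byte sum
    juint s2 = (adler >> 16) & 0xFFFF;  // high 16 bits: running sum of s1
    while (len-- > 0) {
      s1 = (s1 + *buff++) % 65521;      // BASE = 65521 (0xFFF1), largest prime < 2^16
      s2 = (s2 + s1)      % 65521;
    }
    return (jint)((s2 << 16) | s1);
  }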

#define ADLER32_NUM_TEMPS 16

  /**
   *  Arguments:
   *
   *  Inputs:
   *    O0 - int   adler
   *    O1 - byte* buff
   *    O2 - int   len
   *
   *  Output:
   *    O0 - int   adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_cleanup_loop, L_cleanup_loop_check;
    Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
    Label L_nmax_check_done;

    // Aliases
    Register s1   = O0;
    Register s2   = O3;
    Register buff = O1;
    Register len  = O2;
    Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    unsigned long NMAX = 0x15B0;

    // Zero-out the upper bits of len
    __ clruwu(len);

    // Create the mask 0xFFFF
    __ set64(0x00FFFF, O4, O5); // O5 is the temp register

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ srlx(O0, 16, O5); // adler >> 16
    __ and3(O0, O4, s1); // s1 = (adler & 0xFFFF)
    __ and3(O5, O4, s2); // s2 = ((adler >> 16) & 0xFFFF)

    // The pipelined loop needs at least 16 elements for one iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    // Setup the constant for cutoff checking
    __ mov(15, O4);

    // Check if we are above the cutoff, if not go to the cleanup loop immediately
    __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);

    // Free up some registers for our use
    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
      __ movxtod(temp[i], as_FloatRegister(2*i));
    }

    // Loop maintenance is done at the end of the loop, so skip ahead to there
    __ ba_short(L_main_loop_check);

    __ BIND(L_main_loop);

    // Prologue for inner loop
    __ ldub(buff, 0, L0);
    __ dec(O5);

    for (int i = 1; i < 8; i++) {
      __ ldub(buff, i, temp[i]);
    }

    __ inc(buff, 8);

    // The inner loop processes 16 elements at a time and might never execute
    // if the outer loop only has 16 elements to process
    __ ba_short(L_inner_loop_check);

    __ BIND(L_inner_loop);

    for (int i = 0; i < 8; i++) {
      __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
      __ add(s1, temp[i], s1);
      __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
      __ add(s2, s1, s2);
    }

    // Original temp 0-7 used and new loads to temp 0-7 issued
    // temp 8-15 ready to be consumed
    __ add(s1, I0, s1);
    __ dec(O5);
    __ add(s2, s1, s2);
    __ add(s1, I1, s1);
    __ inc(buff, 16);
    __ add(s2, s1, s2);

    for (int i = 0; i < 6; i++) {
      __ add(s1, temp[10+i], s1);
      __ add(s2, s1, s2);
    }

    __ BIND(L_inner_loop_check);
    __ nop();
    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);

    // Epilogue
    for (int i = 0; i < 4; i++) {
      __ ldub(buff, (2*i), temp[8+(2*i)]);
      __ add(s1, temp[i], s1);
      __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
      __ add(s2, s1, s2);
    }

    __ add(s1, temp[4], s1);
    __ inc(buff, 8);

    for (int i = 0; i < 11; i++) {
      __ add(s2, s1, s2);
      __ add(s1, temp[5+i], s1);
    }

    __ add(s2, s1, s2);

    // Take the mod for s1 and s2
    __ set64(0xFFF1, L0, L1);
    __ udivx(s1, L0, L1);
    __ udivx(s2, L0, L2);
    __ mulx(L0, L1, L1);
    __ mulx(L0, L2, L2);
    __ sub(s1, L1, s1);
    __ sub(s2, L2, s2);

    // Make sure there is something left to process
    __ BIND(L_main_loop_check);
    __ set64(NMAX, L0, L1);
    // k = len < NMAX ? len : NMAX
    __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
    __ andn(len, 0x0F, L0); // only loop a multiple of 16 times
    __ BIND(L_nmax_check_done);
    __ mov(L0, O5);
    __ sub(len, L0, len); // len -= k

    __ srlx(O5, 4, O5); // multiples of 16
    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);

    // Restore the registers we saved, take the mod one last time, combine and return
    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
      __ movdtox(as_FloatRegister(2*i), temp[i]);
    }

    // There might be nothing left to process
    __ ba_short(L_cleanup_loop_check);

    __ BIND(L_cleanup_loop);
    __ ldub(buff, 0, O4); // load a single byte from the buffer
    __ inc(buff);         // buff++
    __ add(s1, O4, s1);   // s1 += *buff++;
    __ dec(len);          // len--
    __ add(s1, s2, s2);   // s2 += s1;
    __ BIND(L_cleanup_loop_check);
    __ nop();
    __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);

    // Take the mod one last time
    __ set64(0xFFF1, O1, O2);
    __ udivx(s1, O1, O2);
    __ udivx(s2, O1, O5);
    __ mulx(O1, O2, O2);
    __ mulx(O1, O5, O5);
    __ sub(s1, O2, s1);
    __ sub(s2, O5, s2);

    // Combine lower bits and higher bits
    __ sllx(s2, 16, s2); // s2 = s2 << 16
    __ or3(s1, s2, s1);  // adler = s2 | s1
    // Final return value is in O0
    __ retl();
    __ delayed()->nop();

    return start;
  }
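
  // Note on the two reduction sequences above: SPARCv9 provides udivx/mulx
  // but no integer remainder instruction, so x % BASE is computed with a
  // divide, a multiply and a subtract. A minimal sketch of the identity in
  // use (hypothetical helper for illustration only, not called anywhere):
  static julong mod_base(julong x) {
    julong q = x / 0xFFF1;   // udivx    : q = x / 65521
    return x - q * 0xFFF1;   // mulx, sub: x - q*65521 == x % 65521
  }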

  void generate_initial() {
    // Generates all stubs and initializes the entry points

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry   = generate_catch_exception();

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific
    StubRoutines::Sparc::_test_stop_entry  = generate_test_stop();

    StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine();
    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();

#if !defined(COMPILER2) && !defined(_LP64)
    // ...
    }

    // generate SHA1/SHA256/SHA512 intrinsics code
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }

    // generate CRC32C intrinsic code
    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
  }


 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    // replace the standard masm with a special one:
    _masm = new MacroAssembler(code);

    _stub_count = !all ? 0x100 : 0x200;
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }

    // make sure this stub is available for all local calls
    if (_atomic_add_stub.is_unbound()) {
      // generate a second time, if necessary
      (void) generate_atomic_add();
    }
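
    // (Note: stubs are produced in two batches -- this constructor runs once
    //  with all == false early in VM startup, taking the generate_initial()
    //  path, and again with all == true once the rest of the runtime is up,
    //  taking generate_all(). The Adler32/CRC32C/SHA generators above are
    //  wired up in the initial batch.)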