src/cpu/sparc/vm/stubGenerator_sparc.cpp



        

*** 4908,4917 **** --- 4908,5117 ----
      __ delayed()->restore();
      return start;
    }
+ #define CHUNK_LEN   128          /* 128 x 8B = 1KB */
+ #define CHUNK_K1    0x1307a0206  /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
+ #define CHUNK_K2    0x1a0f717c4  /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
+ #define CHUNK_K3    0x0170076fa  /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
+ 
+   /**
+    *  Arguments:
+    *
+    *  Inputs:
+    *    O0 - int   crc
+    *    O1 - byte* buf
+    *    O2 - int   len
+    *    O3 - int*  table
+    *
+    *  Output:
+    *    O0 - int crc result
+    */
+   address generate_updateBytesCRC32C() {
+     assert(UseCRC32CIntrinsics, "need CRC32C instruction");
+ 
+     __ align(CodeEntryAlignment);
+     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
+     address start = __ pc();
+ 
+     const Register crc   = O0;  // crc
+     const Register buf   = O1;  // source java byte array address
+     const Register len   = O2;  // number of bytes
+     const Register table = O3;  // byteTable
+ 
+     Label L_crc32c_head, L_crc32c_aligned;
+     Label L_crc32c_parallel, L_crc32c_parallel_loop;
+     Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
+     Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
+ 
+     __ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);
+ 
+     // clear upper 32 bits of crc
+     __ clruwu(crc);
+ 
+     __ and3(buf, 7, G4);
+     __ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);
+ 
+     __ mov(8, G1);
+     __ sub(G1, G4, G4);
+ 
+     // ------ process the misaligned head (7 bytes or less) ------
+     __ BIND(L_crc32c_head);
+ 
+     // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
+     __ ldub(buf, 0, G1);
+     __ update_byte_crc32(crc, G1, table);
+ 
+     __ inc(buf);
+     __ dec(len);
+     __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
+     __ dec(G4);
+     __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);
+ 
+     // ------ process the 8-byte-aligned body ------
+     __ BIND(L_crc32c_aligned);
+     __ nop();
+     __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);
+ 
+     // reverse the byte order of lower 32 bits to big endian, and move to FP side
+     __ movitof_revbytes(crc, F0, G1, G3);
+ 
+     __ set(CHUNK_LEN*8*4, G4);
+     __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);
+ 
+     // ------ process four 1KB chunks in parallel ------
+     __ BIND(L_crc32c_parallel);
+ 
+     __ fzero(FloatRegisterImpl::D, F2);
+     __ fzero(FloatRegisterImpl::D, F4);
+     __ fzero(FloatRegisterImpl::D, F6);
+ 
+     __ mov(CHUNK_LEN - 1, G4);
+     __ BIND(L_crc32c_parallel_loop);
+     // schedule ldf's ahead of crc32c's to hide the load-use latency
+     __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
+     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
+     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
+     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
+     __ crc32c(F0, F8,  F0);
+     __ crc32c(F2, F10, F2);
+     __ crc32c(F4, F12, F4);
+     __ crc32c(F6, F14, F6);
+     __ inc(buf, 8);
+     __ dec(G4);
+     __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
+ 
+     __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
+     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
+     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
+     __ crc32c(F0, F8,  F0);
+     __ crc32c(F2, F10, F2);
+     __ crc32c(F4, F12, F4);
+ 
+     __ inc(buf, CHUNK_LEN*24);
+     __ ldfl(FloatRegisterImpl::D, buf, G0, F14);  // load in little endian
+     __ inc(buf, 8);
+ 
+     __ prefetch(buf, 0,            Assembler::severalReads);
+     __ prefetch(buf, CHUNK_LEN*8,  Assembler::severalReads);
+     __ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
+     __ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
+ 
+     // move to INT side, and reverse the byte order of lower 32 bits to little endian
+     __ movftoi_revbytes(F0, O4, G1, G4);
+     __ movftoi_revbytes(F2, O5, G1, G4);
+     __ movftoi_revbytes(F4, G5, G1, G4);
+ 
+     // combine the results of 4 chunks
+     __ set64(CHUNK_K1, G3, G1);
+     __ xmulx(O4, G3, O4);
+     __ set64(CHUNK_K2, G3, G1);
+     __ xmulx(O5, G3, O5);
+     __ set64(CHUNK_K3, G3, G1);
+     __ xmulx(G5, G3, G5);
+ 
+     __ movdtox(F14, G4);
+     __ xor3(O4, O5, O5);
+     __ xor3(G5, O5, O5);
+     __ xor3(G4, O5, O5);
+ 
+     // reverse the byte order to big endian, via stack, and move to FP side
+     __ add(SP, -8, G1);
+     __ srlx(G1, 3, G1);
+     __ sllx(G1, 3, G1);
+     __ stx(O5, G1, G0);
+     __ ldfl(FloatRegisterImpl::D, G1, G0, F2);  // load in little endian
+ 
+     __ crc32c(F6, F2, F0);
+ 
+     __ set(CHUNK_LEN*8*4, G4);
+     __ sub(len, G4, len);
+     __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
+     __ nop();
+     __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
+ 
+     __ BIND(L_crc32c_serial);
+ 
+     __ mov(32, G4);
+     __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
+ 
+     // ------ process 32B chunks ------
+     __ BIND(L_crc32c_x32_loop);
+     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+     __ inc(buf, 8);
+     __ crc32c(F0, F2, F0);
+     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+     __ inc(buf, 8);
+     __ crc32c(F0, F2, F0);
+     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+     __ inc(buf, 8);
+     __ crc32c(F0, F2, F0);
+     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+     __ inc(buf, 8);
+     __ crc32c(F0, F2, F0);
+     __ dec(len, 32);
+     __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);
+ 
+     __ BIND(L_crc32c_x8);
+     __ nop();
+     __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);
+ 
+     // ------ process 8B chunks ------
+     __ BIND(L_crc32c_x8_loop);
+     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+     __ inc(buf, 8);
+     __ crc32c(F0, F2, F0);
+     __ dec(len, 8);
+     __ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);
+ 
+     __ BIND(L_crc32c_done);
+ 
+     // move to INT side, and reverse the byte order of lower 32 bits to little endian
+     __ movftoi_revbytes(F0, crc, G1, G3);
+ 
+     __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
+ 
+     // ------ process the misaligned tail (7 bytes or less) ------
+     __ BIND(L_crc32c_tail);
+ 
+     // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
+     __ ldub(buf, 0, G1);
+     __ update_byte_crc32(crc, G1, table);
+ 
+     __ inc(buf);
+     __ dec(len);
+     __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);
+ 
+     __ BIND(L_crc32c_return);
+     __ nop();
+     __ retl();
+     __ delayed()->nop();
+ 
+     return start;
+   }
+ 
    void generate_initial() {
      // Generates all stubs and initializes the entry points
  
      //------------------------------------------------------------------------------------------------------------------------
      // entry points that exist in all platforms
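
For the misaligned head and tail, the stub falls back to the byte-at-a-time table lookup noted in its comments, crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF]. The following is a rough software reference for that update (a sketch, not HotSpot code: the function and table names are hypothetical, and the initial/final bit inversions are assumed to be handled by the caller, as java.util.zip.CRC32C does around its native state):

    #include <stdint.h>
    #include <stddef.h>

    // Bit-reflected CRC-32C (Castagnoli) polynomial. The byteTable handed to the
    // stub is assumed to contain the same 256-entry table this builds.
    static const uint32_t CRC32C_POLY = 0x82F63B78u;

    static void build_byte_table(uint32_t table[256]) {
      for (uint32_t i = 0; i < 256; i++) {
        uint32_t c = i;
        for (int k = 0; k < 8; k++) {
          c = (c & 1) ? (c >> 1) ^ CRC32C_POLY : (c >> 1);
        }
        table[i] = c;
      }
    }

    // Byte-at-a-time update matching the head/tail loops:
    //   crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF]
    // The crc argument is the running (already inverted) state.
    static uint32_t update_bytes_crc32c(uint32_t crc, const uint8_t* buf, size_t len,
                                        const uint32_t table[256]) {
      for (size_t i = 0; i < len; i++) {
        crc = (crc >> 8) ^ table[(crc ^ buf[i]) & 0xFF];
      }
      return crc;
    }

The aligned fast path instead runs four independent CRCs over consecutive 1KB chunks and merges them with carry-less multiplies (xmulx) by CHUNK_K1..CHUNK_K3; per the #define comments these constants encode x^(3*8192-32), x^(2*8192-32) and x^(8192-32) mod P(x) in bit-reflected form, i.e. the factors needed to shift each partial CRC forward past the chunks that follow it (one chunk is CHUNK_LEN*8 = 1024 bytes = 8192 bits).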
*** 4999,5008 **** --- 5199,5213 ----
      }
  
      if (UseSHA512Intrinsics) {
        StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
        StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
      }
+ 
+     // generate CRC32C intrinsic code
+     if (UseCRC32CIntrinsics) {
+       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
+     }
    }
  
   public:
    StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
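
This second hunk publishes the stub from generate_initial(): when UseCRC32CIntrinsics is set, the generated entry point is stored in StubRoutines::_updateBytesCRC32C for the intrinsic to call. Because the stub is a leaf routine (retl, no register window save), its documented register contract maps directly onto an ordinary four-argument SPARC call. A minimal sketch of that contract as a hypothetical function-pointer view (an assumption for illustration, not a HotSpot interface):

    #include <stdint.h>

    // Mirrors the block comment above: O0 - int crc, O1 - byte* buf,
    // O2 - int len, O3 - int* table, result returned in O0. On SPARC these are
    // the first four outgoing argument registers and the integer return register,
    // so the stub can be modeled as a plain function taking four arguments.
    typedef int32_t (*updateBytesCRC32C_stub_t)(int32_t crc, const uint8_t* buf,
                                                int32_t len, const int32_t* table);

    // Sketch only: the address would be the generated stub entry, e.g. the value
    // assigned to StubRoutines::_updateBytesCRC32C in the hunk above.
    static int32_t call_crc32c_stub(updateBytesCRC32C_stub_t stub, int32_t crc,
                                    const uint8_t* buf, int32_t len,
                                    const int32_t* table) {
      return stub(crc, buf, len, table);
    }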