src/cpu/sparc/vm/stubGenerator_sparc.cpp
Index
Unified diffs
Context diffs
Sdiffs
Wdiffs
Patch
New
Old
Previous File
Next File
hotspot Cdiff src/cpu/sparc/vm/stubGenerator_sparc.cpp
src/cpu/sparc/vm/stubGenerator_sparc.cpp
Print this page
*** 4908,4917 ****
--- 4908,5117 ----
__ delayed()->restore();
return start;
}
+ #define CHUNK_LEN 128 /* 8-byte words per chunk: 128 x 8B = 1KB */
+ #define CHUNK_K1 0x1307a0206 /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1; multiplier for the 1st of four chunks */
+ #define CHUNK_K2 0x1a0f717c4 /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1; multiplier for the 2nd of four chunks */
+ #define CHUNK_K3 0x0170076fa /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1; multiplier for the 3rd of four chunks */
+
+ /**
+ * Generates the updateBytesCRC32C stub and returns its entry address.
+ *
+ * Inputs:
+ * O0 - int crc (running CRC value)
+ * O1 - byte* buf (address of the first byte to process)
+ * O2 - int len (number of bytes; len <= 0 returns without touching crc)
+ * O3 - int* table (byte table used for the unaligned head and tail bytes)
+ *
+ * Output:
+ * O0 - int crc result
+ */
+ address generate_updateBytesCRC32C() {
+ assert(UseCRC32CIntrinsics, "need CRC32C instruction");
+
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
+ address start = __ pc(); // entry point returned to the caller
+
+ const Register crc = O0; // running crc, also the result register
+ const Register buf = O1; // address of the next byte to process
+ const Register len = O2; // number of bytes still to process
+ const Register table = O3; // byteTable for the byte-at-a-time paths
+
+ Label L_crc32c_head, L_crc32c_aligned;
+ Label L_crc32c_parallel, L_crc32c_parallel_loop;
+ Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
+ Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
+
+ __ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return); // nothing to do for len <= 0
+
+ // clear upper 32 bits of crc
+ __ clruwu(crc);
+
+ __ and3(buf, 7, G4); // G4 = buf & 7, the offset within an 8-byte word
+ __ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned); // already aligned: skip the head
+
+ __ mov(8, G1);
+ __ sub(G1, G4, G4); // G4 = 8 - (buf & 7): head bytes needed to reach 8-byte alignment
+
+ // ------ process the misaligned head (7 bytes or less) ------
+ __ BIND(L_crc32c_head);
+
+ // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
+ __ ldub(buf, 0, G1); // G1 = next input byte
+ __ update_byte_crc32(crc, G1, table);
+
+ __ inc(buf);
+ __ dec(len);
+ __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return); // input exhausted inside the head
+ __ dec(G4);
+ __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head); // loop while head bytes remain
+
+ // ------ process the 8-byte-aligned body ------
+ __ BIND(L_crc32c_aligned);
+ __ nop(); // NOTE(review): presumably keeps the next compare-and-branch from directly following the previous one - confirm
+ __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail); // fewer than 8 bytes: byte-wise tail only
+
+ // reverse the byte order of lower 32 bits to big endian, and move to FP side
+ __ movitof_revbytes(crc, F0, G1, G3); // F0 holds the running crc on the FP side from here on
+
+ __ set(CHUNK_LEN*8*4, G4); // G4 = size in bytes of four chunks
+ __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial); // too short for the parallel path
+
+ // ------ process four 1KB chunks in parallel ------
+ __ BIND(L_crc32c_parallel);
+
+ // F2, F4, F6 are the accumulators for chunks 2-4 and start from zero; F0 carries the running crc for chunk 1
+ __ fzero(FloatRegisterImpl::D, F2);
+ __ fzero(FloatRegisterImpl::D, F4);
+ __ fzero(FloatRegisterImpl::D, F6);
+
+ __ mov(CHUNK_LEN - 1, G4); // loop count; the last 8 bytes of each chunk are handled after the loop
+ __ BIND(L_crc32c_parallel_loop);
+ // schedule ldf's ahead of crc32c's to hide the load-use latency
+ __ ldf(FloatRegisterImpl::D, buf, 0, F8); // next 8 bytes of chunk 1
+ __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8, F10); // next 8 bytes of chunk 2
+ __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12); // next 8 bytes of chunk 3
+ __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14); // next 8 bytes of chunk 4
+ __ crc32c(F0, F8, F0);
+ __ crc32c(F2, F10, F2);
+ __ crc32c(F4, F12, F4);
+ __ crc32c(F6, F14, F6);
+ __ inc(buf, 8);
+ __ dec(G4);
+ __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
+
+ // last 8 bytes of chunks 1-3 are folded in here
+ __ ldf(FloatRegisterImpl::D, buf, 0, F8);
+ __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8, F10);
+ __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
+ __ crc32c(F0, F8, F0);
+ __ crc32c(F2, F10, F2);
+ __ crc32c(F4, F12, F4);
+
+ // last 8 bytes of chunk 4 are only loaded (into F14); they are xor'ed with the combined results below
+ __ inc(buf, CHUNK_LEN*24);
+ __ ldfl(FloatRegisterImpl::D, buf, G0, F14); // load in little endian
+ __ inc(buf, 8); // buf has now advanced by CHUNK_LEN*8*4 bytes in total
+
+ // prefetch at the start of each of the next four chunk-sized regions
+ __ prefetch(buf, 0, Assembler::severalReads);
+ __ prefetch(buf, CHUNK_LEN*8, Assembler::severalReads);
+ __ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
+ __ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
+
+ // move to INT side, and reverse the byte order of lower 32 bits to little endian
+ __ movftoi_revbytes(F0, O4, G1, G4); // O4 = chunk 1 result
+ __ movftoi_revbytes(F2, O5, G1, G4); // O5 = chunk 2 result
+ __ movftoi_revbytes(F4, G5, G1, G4); // G5 = chunk 3 result
+
+ // combine the results of 4 chunks
+ __ set64(CHUNK_K1, G3, G1);
+ __ xmulx(O4, G3, O4); // O4 = xmulx(chunk 1 result, CHUNK_K1)
+ __ set64(CHUNK_K2, G3, G1);
+ __ xmulx(O5, G3, O5); // O5 = xmulx(chunk 2 result, CHUNK_K2)
+ __ set64(CHUNK_K3, G3, G1);
+ __ xmulx(G5, G3, G5); // G5 = xmulx(chunk 3 result, CHUNK_K3)
+
+ __ movdtox(F14, G4); // G4 = the last 8 bytes of chunk 4 loaded above
+ __ xor3(O4, O5, O5);
+ __ xor3(G5, O5, O5);
+ __ xor3(G4, O5, O5); // O5 = O4 ^ O5 ^ G5 ^ G4
+
+ // reverse the byte order to big endian, via stack, and move to FP side
+ __ add(SP, -8, G1); // G1 = SP - 8 ...
+ __ srlx(G1, 3, G1);
+ __ sllx(G1, 3, G1); // ... with the low 3 bits cleared (8-byte aligned)
+ __ stx(O5, G1, G0);
+ __ ldfl(FloatRegisterImpl::D, G1, G0, F2); // load in little endian
+
+ __ crc32c(F6, F2, F0); // fold the combined value into chunk 4's accumulator; F0 is the running crc again
+
+ __ set(CHUNK_LEN*8*4, G4);
+ __ sub(len, G4, len); // four chunks consumed
+ __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel); // another four chunks available
+ __ nop(); // see NOTE(review) at L_crc32c_aligned
+ __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
+
+ __ BIND(L_crc32c_serial);
+
+ __ mov(32, G4); // G4 = bytes consumed per iteration of the 32B loop
+ __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
+
+ // ------ process 32B chunks ------
+ __ BIND(L_crc32c_x32_loop);
+ __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+ __ inc(buf, 8);
+ __ crc32c(F0, F2, F0);
+ __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+ __ inc(buf, 8);
+ __ crc32c(F0, F2, F0);
+ __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+ __ inc(buf, 8);
+ __ crc32c(F0, F2, F0);
+ __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+ __ inc(buf, 8);
+ __ crc32c(F0, F2, F0);
+ __ dec(len, 32);
+ __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop); // loop while at least 32 bytes remain
+
+ __ BIND(L_crc32c_x8);
+ __ nop(); // see NOTE(review) at L_crc32c_aligned
+ __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done); // fewer than 8 bytes left
+
+ // ------ process 8B chunks ------
+ __ BIND(L_crc32c_x8_loop);
+ __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+ __ inc(buf, 8);
+ __ crc32c(F0, F2, F0);
+ __ dec(len, 8);
+ __ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop); // loop while at least 8 bytes remain
+
+ __ BIND(L_crc32c_done);
+
+ // move to INT side, and reverse the byte order of lower 32 bits to little endian
+ __ movftoi_revbytes(F0, crc, G1, G3);
+
+ __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return); // no leftover bytes
+
+ // ------ process the misaligned tail (7 bytes or less) ------
+ __ BIND(L_crc32c_tail);
+
+ // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
+ __ ldub(buf, 0, G1); // G1 = next input byte
+ __ update_byte_crc32(crc, G1, table);
+
+ __ inc(buf);
+ __ dec(len);
+ __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail); // loop until len reaches 0
+
+ __ BIND(L_crc32c_return);
+ __ nop(); // see NOTE(review) at L_crc32c_aligned
+ __ retl(); // result is left in O0 (crc)
+ __ delayed()->nop();
+
+ return start;
+ }
+
void generate_initial() {
// Generates all stubs and initializes the entry points
//------------------------------------------------------------------------------------------------------------------------
// entry points that exist in all platforms
*** 4999,5008 ****
--- 5199,5213 ----
}
if (UseSHA512Intrinsics) {
StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
}
+
+ // generate CRC32C intrinsic code
+ if (UseCRC32CIntrinsics) {
+ StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
+ }
}
public:
StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
src/cpu/sparc/vm/stubGenerator_sparc.cpp
Index
Unified diffs
Context diffs
Sdiffs
Wdiffs
Patch
New
Old
Previous File
Next File