--- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-08-26 13:22:19.832731300 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-08-26 13:22:19.573705400 -0700 @@ -8467,8 +8467,504 @@ notl(crc); // ~c } -#undef BIND -#undef BLOCK_COMMENT +namespace CRC32C { +#include "crc32c.h" + +#define Nehalem(x) x +#define Westmere(x) x + +#undef IN +#define IN(x) x +#define INOUT(x) x +#undef OUT +#define OUT(x) x +#define Scratch(x) x + +#undef D + +#ifdef _LP64 +// S. Gueron / Information Processing Letters 112 (2012) 184 +// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table. +// Input: A 32 bit value B = [byte3, byte2, byte1, byte0]. +// Output: the 64-bit carry-less product of B * CONST + void IPL_Alg4(INOUT(Register B), uint32_t n, + Scratch(Register C), Scratch(Register D), Scratch(Register Z), + MacroAssembler * This) { + This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr())); + if (n > 0) { + This->addq(Z, n * 256 * 8); + } + // Q1 = TABLEExt[n][B & 0xFF]; + This->movl(C, B); + This->andl(C, 0x000000FF); + This->shll(C, 3); + This->addq(C, Z); + This->movq(C, Address(C, 0)); + + // Q2 = TABLEExt[n][B >> 8 & 0xFF]; + This->movl(D, B); + This->shrl(D, 8); + This->andl(D, 0x000000FF); + This->shll(D, 3); + This->addq(D, Z); + This->movq(D, Address(D, 0)); + + This->shlq(D, 8); + This->xorq(C, D); + + // Q3 = TABLEExt[n][B >> 16 & 0xFF]; + This->movl(D, B); + This->shrl(D, 16); + This->andl(D, 0x000000FF); + This->shll(D, 3); + This->addq(D, Z); + This->movq(D, Address(D, 0)); + + This->shlq(D, 16); + This->xorq(C, D); + + // Q4 = TABLEExt[n][B >> 24 & 0xFF]; + This->shrl(B, 24); + This->andl(B, 0x000000FF); + This->shll(B, 3); + This->addq(B, Z); + This->movq(B, Address(B, 0)); + + This->shlq(B, 24); + This->xorq(B, C); + // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; + } + + void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)), + INOUT(Register crc), + uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported, + 
Westmere(Scratch(XMMRegister DXMM)), + Scratch(Register A), + Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)), + MacroAssembler * This) { + if (IsPclmulqdqSupported) { + This->movdl(crcXMM, crc); // modified blindly + + This->movl(A, CONSTOrPreCompConstIndex); + This->movdl(DXMM, A); + This->pclmulqdq(crcXMM, DXMM, 0); + + This->movdq(crc, crcXMM); + } else { + IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, This); + } + } + + // Recombination Alternative 2: No bit-reflections + // T1 = (CRC_A * U1) << 1 + // T2 = (CRC_B * U2) << 1 + // C1 = T1 >> 32 + // C2 = T2 >> 32 + // T1 = T1 & 0xFFFFFFFF + // T2 = T2 & 0xFFFFFFFF + // T1 = CRC32(0, T1) + // T2 = CRC32(0, T2) + // C1 = C1 ^ T1 + // C2 = C2 ^ T2 + // CRC = C1 ^ C2 ^ CRC_C + void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + Scratch(Register E), Scratch(Register F), + Nehalem(Scratch(Register G)), + MacroAssembler * This) { + PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This); + PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This); + This->shlq(crcA, 1); + This->movl(E, crcA); + This->shrq(crcA, 32); + This->xorl(F, F); + This->crc32(F, E, 4); + This->xorl(crcA, F); // we don't care about upper 32 bit contents here + This->shlq(crcB, 1); + This->movl(E, crcB); + This->shrq(crcB, 32); + This->xorl(F, F); + This->crc32(F, E, 4); + This->xorl(crcB, F); + This->xorl(crcA, crcB); + This->xorl(crcA, crcC); + } + + // Set N to predefined value + // Subtract from a length of a buffer + // execute in a loop: + // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0 + // for i = 1 to N do + // CRC_A = CRC32(CRC_A, A[i]) + // CRC_B = CRC32(CRC_B, B[i]) + // CRC_C = CRC32(CRC_C, C[i]) + // end for + 
// Recombine + void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, + INOUT(Register len), INOUT(Register buf), INOUT(Register crc), + Scratch(Register E), Scratch(Register F), Scratch(Register end), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + Scratch(Register G), Scratch(Register H), + Nehalem(Scratch(Register I)), + MacroAssembler * This) { + Label L_processPartitions; + Label L_processPartition; + Label L_exit; + + This->bind(L_processPartitions); + This->cmpl(len, 3 * size); + This->jcc(Assembler::less, L_exit); + This->xorl(E, E); + This->xorl(F, F); + This->movq(end, buf); + This->addq(end, size); + + This->bind(L_processPartition); + This->crc32(crc, Address(buf, 0), 8); + This->crc32(E, Address(buf, size), 8); + This->crc32(F, Address(buf, size * 2), 8); + This->addq(buf, 8); + This->cmpq(buf, end); + This->jcc(Assembler::less, L_processPartition); + RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F, + AXMM, BXMM, CXMM, + G, H, + I, + This); + This->addq(buf, 2 * size); + This->subl(len, 3 * size); + This->jmp(L_processPartitions); + + This->bind(L_exit); + } +#else +void IPL_Alg4(INOUT(Register B), uint32_t n, + Scratch(Register C), Scratch(Register D), Scratch(Register Z), + Scratch(XMMRegister CXMM), Scratch(XMMRegister DXMM), + MacroAssembler * This) { + This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr())); + if (n > 0) { + This->addl(Z, n * 256 * 8); + } + // Q1 = TABLEExt[n][B & 0xFF]; + This->movl(C, B); + This->andl(C, 0x000000FF); + This->shll(C, 3); + This->addl(C, Z); + This->movq(CXMM, Address(C, 0)); + + // Q2 = TABLEExt[n][B >> 8 & 0xFF]; + This->movl(D, B); + This->shrl(D, 8); + This->andl(D, 0x000000FF); + This->shll(D, 3); + This->addl(D, Z); + This->movq(DXMM, Address(D, 0)); + + This->psllq(DXMM, 8); + This->pxor(CXMM, DXMM); + + // Q3 
= TABLEExt[n][B >> 16 & 0xFF]; + This->movl(D, B); + This->shrl(D, 16); + This->andl(D, 0x000000FF); + This->shll(D, 3); + This->addl(D, Z); + This->movq(DXMM, Address(D, 0)); + + This->psllq(DXMM, 16); + This->pxor(CXMM, DXMM); + + // Q4 = TABLEExt[n][B >> 24 & 0xFF]; + This->shrl(B, 24); + This->andl(B, 0x000000FF); + This->shll(B, 3); + This->addl(B, Z); + This->movq(DXMM, Address(B, 0)); + + This->psllq(DXMM, 24); + This->pxor(CXMM, DXMM); // Result in CXMM + // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; +} + +void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)), + INOUT(Register crc), + uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported, + Westmere(Scratch(XMMRegister DXMM)), + Scratch(Register A), + Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)), + MacroAssembler * This) { + if (IsPclmulqdqSupported) { + This->movdl(crcXMM, crc); + + This->movl(A, CONSTOrPreCompConstIndex); + This->movdl(DXMM, A); + This->pclmulqdq(crcXMM, DXMM, 0); + // Keep result in XMM since GPR is 32 bit in length + } else { + IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, crcXMM, DXMM, This); + } +} + +void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + Scratch(Register E), Scratch(Register F), + Nehalem(Scratch(Register G)), + MacroAssembler * This) { + PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This); + PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This); + + This->psllq(AXMM, 1); + This->movdl(E, AXMM); + This->psrlq(AXMM, 32); + This->movdl(crcA, AXMM); + + This->xorl(F, F); + This->crc32(F, E, 4); + This->xorl(crcA, F); + + This->psllq(BXMM, 1); + This->movdl(E, BXMM); + This->psrlq(BXMM, 32); + This->movdl(crcB, BXMM); + + 
This->xorl(F, F); + This->crc32(F, E, 4); + This->xorl(crcB, F); + This->xorl(crcA, crcB); + This->xorl(crcA, crcC); +} + +void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, + INOUT(Register len), INOUT(Register buf), INOUT(Register crc), + Scratch(Register E), Scratch(Register F), Scratch(Register end), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + Scratch(Register G), Scratch(Register H), + Nehalem(Scratch(Register I)), + MacroAssembler * This) { + Label L_processPartitions; + Label L_processPartition; + Label L_exit; + + This->bind(L_processPartitions); + This->cmpl(len, 3 * size); + This->jcc(Assembler::less, L_exit); + This->xorl(E, E); + This->xorl(F, F); + This->movl(end, buf); + This->addl(end, size); + + This->bind(L_processPartition); + This->crc32(crc, Address(buf, 0), 4); + This->crc32(E, Address(buf, size), 4); + This->crc32(F, Address(buf, size*2), 4); + This->crc32(crc, Address(buf, 0+4), 4); + This->crc32(E, Address(buf, size+4), 4); + This->crc32(F, Address(buf, size*2+4), 4); + This->addl(buf, 8); + This->cmpl(buf, end); + This->jcc(Assembler::less, L_processPartition); + + This->push(end); + This->push(len); + This->push(buf); + G = end; + H = len; + I = buf; + + RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F, + AXMM, BXMM, CXMM, + G, H, + I, + This); + + This->pop(buf); + This->pop(len); + This->pop(end); + + This->addl(buf, 2 * size); + This->subl(len, 3 * size); + This->jmp(L_processPartitions); + + This->bind(L_exit); +} +#endif //LP64 +} +#undef D + +#ifdef _LP64 +// Algorithm 2: Pipelined usage of the CRC32 instruction. +// Input: A buffer I of L bytes. +// Output: the CRC32C value of the buffer. +// Notations: +// Write L = 24N + r, with N = floor (L/24). +// r = L mod 24 (0 <= r < 24). 
+// Consider I as the concatenation of A|B|C|R, where A, B, C, each, +// N quadwords, and R consists of r bytes. +// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1 +// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1 +// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1 +// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1 +void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len, + Scratch(Register A), Scratch(Register B), Scratch(Register C), + Scratch(Register D), Scratch(Register E), Scratch(Register F), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + bool IsPclmulqdqSupported) { + uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS]; + Label L_wordByWord; + Label L_byteByByteProlog; + Label L_byteByByte; + Label L_exit; + + if (IsPclmulqdqSupported ) { + CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; + CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1); + + CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); + CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); + + CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); + CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); + assert((CRC32C::NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\""); + } else { + CONSTOrPreCompConstIndex[0] = 1; + CONSTOrPreCompConstIndex[1] = 0; + + CONSTOrPreCompConstIndex[2] = 3; + CONSTOrPreCompConstIndex[3] = 2; + + CONSTOrPreCompConstIndex[4] = 5; + CONSTOrPreCompConstIndex[5] = 4; + } + CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], 
CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + movl(A, len); + andl(A, 0x00000007); + negl(A); + addl(A, len); + addq(A, buf); + + BIND(L_wordByWord); + cmpq(buf, A); + jcc(Assembler::greaterEqual, L_byteByByteProlog); + crc32(crc, Address(buf, 0), 4); + addq(buf, 4); + jmp(L_wordByWord); + + BIND(L_byteByByteProlog); + andl(len, 0x00000007); + movl(B, 1); + + BIND(L_byteByByte); + cmpl(B, len); + jccb(Assembler::greater, L_exit); + crc32(crc, Address(buf, 0), 1); + incq(buf); + incl(B); + jmp(L_byteByByte); + + BIND(L_exit); +} +#else +void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len, + Scratch(Register A), Scratch(Register B), Scratch(Register C), + Scratch(Register D), Scratch(Register E), Scratch(Register F), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + bool IsPclmulqdqSupported) { + uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS]; + Label L_wordByWord; + Label L_byteByByteProlog; + Label L_byteByByte; + Label L_exit; + + if (IsPclmulqdqSupported) { + CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; + CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1); + + CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); + CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); + + CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); + CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); + } else { + CONSTOrPreCompConstIndex[0] = 1; + CONSTOrPreCompConstIndex[1] = 0; + + CONSTOrPreCompConstIndex[2] = 3; + 
CONSTOrPreCompConstIndex[3] = 2; + + CONSTOrPreCompConstIndex[4] = 5; + CONSTOrPreCompConstIndex[5] = 4; + } + CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + movl(A, len); + andl(A, 0x00000007); + negl(A); + addl(A, len); + addl(A, buf); + + BIND(L_wordByWord); + cmpl(buf, A); + jcc(Assembler::greaterEqual, L_byteByByteProlog); + crc32(crc, Address(buf,0), 4); + addl(buf, 4); + jmp(L_wordByWord); + + BIND(L_byteByByteProlog); + andl(len, 0x00000007); + movl(B, 1); + + BIND(L_byteByByte); + cmpl(B, len); + jccb(Assembler::greater, L_exit); + movb(A, Address(buf, 0)); + crc32(crc, A, 1); + incl(buf); + incl(B); + jmp(L_byteByByte); + + BIND(L_exit); +} +#endif // LP64 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {