< prev index next >

src/cpu/x86/vm/macroAssembler_x86.cpp

Print this page

        

*** 8465,8476 **** BIND(L_exit); notl(crc); // ~c } ! #undef BIND ! #undef BLOCK_COMMENT Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { switch (cond) { // Note some conditions are synonyms for others --- 8465,8972 ---- BIND(L_exit); notl(crc); // ~c } ! namespace CRC32C { ! #include "crc32c.h" ! ! #define Nehalem(x) x ! #define Westmere(x) x ! ! #undef IN ! #define IN(x) x ! #define INOUT(x) x ! #undef OUT ! #define OUT(x) x ! #define Scratch(x) x ! ! #undef D ! ! #ifdef _LP64 ! // S. Gueron / Information Processing Letters 112 (2012) 184 ! // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table. ! // Input: A 32 bit value B = [byte3, byte2, byte1, byte0]. ! // Output: the 64-bit carry-less product of B * CONST ! void IPL_Alg4(INOUT(Register B), uint32_t n, ! Scratch(Register C), Scratch(Register D), Scratch(Register Z), ! MacroAssembler * This) { ! This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr())); ! if (n > 0) { ! This->addq(Z, n * 256 * 8); ! } ! // Q1 = TABLEExt[n][B & 0xFF]; ! This->movl(C, B); ! This->andl(C, 0x000000FF); ! This->shll(C, 3); ! This->addq(C, Z); ! This->movq(C, Address(C, 0)); ! ! // Q2 = TABLEExt[n][B >> 8 & 0xFF]; ! This->movl(D, B); ! This->shrl(D, 8); ! This->andl(D, 0x000000FF); ! This->shll(D, 3); ! This->addq(D, Z); ! This->movq(D, Address(D, 0)); ! ! This->shlq(D, 8); ! This->xorq(C, D); ! ! // Q3 = TABLEExt[n][B >> 16 & 0xFF]; ! This->movl(D, B); ! This->shrl(D, 16); ! This->andl(D, 0x000000FF); ! This->shll(D, 3); ! This->addq(D, Z); ! This->movq(D, Address(D, 0)); ! ! This->shlq(D, 16); ! This->xorq(C, D); ! ! // Q4 = TABLEExt[n][B >> 24 & 0xFF]; ! This->shrl(B, 24); ! This->andl(B, 0x000000FF); ! This->shll(B, 3); ! This->addq(B, Z); ! This->movq(B, Address(B, 0)); ! ! This->shlq(B, 24); ! This->xorq(B, C); ! // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; ! } ! ! void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)), ! INOUT(Register crc), ! uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported, ! Westmere(Scratch(XMMRegister DXMM)), ! Scratch(Register A), ! Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)), ! MacroAssembler * This) { ! if (IsPclmulqdqSupported) { ! This->movdl(crcXMM, crc); // modified blindly ! ! This->movl(A, CONSTOrPreCompConstIndex); ! This->movdl(DXMM, A); ! This->pclmulqdq(crcXMM, DXMM, 0); ! ! This->movdq(crc, crcXMM); ! } else { ! IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, This); ! } ! } ! ! // Recombination Alternative 2: No bit-reflections ! // T1 = (CRC_A * U1) << 1 ! // T2 = (CRC_B * U2) << 1 ! // C1 = T1 >> 32 ! // C2 = T2 >> 32 ! // T1 = T1 & 0xFFFFFFFF ! // T2 = T2 & 0xFFFFFFFF ! // T1 = CRC32(0, T1) ! // T2 = CRC32(0, T2) ! // C1 = C1 ^ T1 ! // C2 = C2 ^ T2 ! // CRC = C1 ^ C2 ^ CRC_C ! void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC), ! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), ! Scratch(Register E), Scratch(Register F), ! Nehalem(Scratch(Register G)), ! MacroAssembler * This) { ! PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This); ! PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This); ! This->shlq(crcA, 1); ! This->movl(E, crcA); ! This->shrq(crcA, 32); ! This->xorl(F, F); ! This->crc32(F, E, 4); ! This->xorl(crcA, F); // we don't care about upper 32 bit contents here ! This->shlq(crcB, 1); ! This->movl(E, crcB); ! This->shrq(crcB, 32); ! This->xorl(F, F); ! This->crc32(F, E, 4); ! This->xorl(crcB, F); ! This->xorl(crcA, crcB); ! This->xorl(crcA, crcC); ! } ! ! // Set N to predefined value ! // Subtract from a lenght of a buffer ! // execute in a loop: ! // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0 ! // for i = 1 to N do ! // CRC_A = CRC32(CRC_A, A[i]) ! // CRC_B = CRC32(CRC_B, B[i]) ! // CRC_C = CRC32(CRC_C, C[i]) ! // end for ! // Recombine ! void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, ! INOUT(Register len), INOUT(Register buf), INOUT(Register crc), ! Scratch(Register E), Scratch(Register F), Scratch(Register end), ! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), ! Scratch(Register G), Scratch(Register H), ! Nehalem(Scratch(Register I)), ! MacroAssembler * This) { ! Label L_processPartitions; ! Label L_processPartition; ! Label L_exit; ! ! This->bind(L_processPartitions); ! This->cmpl(len, 3 * size); ! This->jcc(Assembler::less, L_exit); ! This->xorl(E, E); ! This->xorl(F, F); ! This->movq(end, buf); ! This->addq(end, size); ! ! This->bind(L_processPartition); ! This->crc32(crc, Address(buf, 0), 8); ! This->crc32(E, Address(buf, size), 8); ! This->crc32(F, Address(buf, size * 2), 8); ! This->addq(buf, 8); ! This->cmpq(buf, end); ! This->jcc(Assembler::less, L_processPartition); ! RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F, ! AXMM, BXMM, CXMM, ! G, H, ! I, ! This); ! This->addq(buf, 2 * size); ! This->subl(len, 3 * size); ! This->jmp(L_processPartitions); ! ! This->bind(L_exit); ! } ! #else ! void IPL_Alg4(INOUT(Register B), uint32_t n, ! Scratch(Register C), Scratch(Register D), Scratch(Register Z), ! Scratch(XMMRegister CXMM), Scratch(XMMRegister DXMM), ! MacroAssembler * This) { ! This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr())); ! if (n > 0) { ! This->addl(Z, n * 256 * 8); ! } ! // Q1 = TABLEExt[n][B & 0xFF]; ! This->movl(C, B); ! This->andl(C, 0x000000FF); ! This->shll(C, 3); ! This->addl(C, Z); ! This->movq(CXMM, Address(C, 0)); ! ! // Q2 = TABLEExt[n][B >> 8 & 0xFF]; ! This->movl(D, B); ! This->shrl(D, 8); ! This->andl(D, 0x000000FF); ! This->shll(D, 3); ! This->addl(D, Z); ! This->movq(DXMM, Address(D, 0)); ! ! This->psllq(DXMM, 8); ! This->pxor(CXMM, DXMM); ! ! // Q3 = TABLEExt[n][B >> 16 & 0xFF]; ! This->movl(D, B); ! This->shrl(D, 16); ! This->andl(D, 0x000000FF); ! This->shll(D, 3); ! This->addl(D, Z); ! This->movq(DXMM, Address(D, 0)); ! ! This->psllq(DXMM, 16); ! This->pxor(CXMM, DXMM); ! ! // Q4 = TABLEExt[n][B >> 24 & 0xFF]; ! This->shrl(B, 24); ! This->andl(B, 0x000000FF); ! This->shll(B, 3); ! This->addl(B, Z); ! This->movq(DXMM, Address(B, 0)); ! ! This->psllq(DXMM, 24); ! This->pxor(CXMM, DXMM); // Result in CXMM ! // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; ! } ! ! void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)), ! INOUT(Register crc), ! uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported, ! Westmere(Scratch(XMMRegister DXMM)), ! Scratch(Register A), ! Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)), ! MacroAssembler * This) { ! if (IsPclmulqdqSupported) { ! This->movdl(crcXMM, crc); ! ! This->movl(A, CONSTOrPreCompConstIndex); ! This->movdl(DXMM, A); ! This->pclmulqdq(crcXMM, DXMM, 0); ! // Keep result in XMM since GPR is 32 bit in length ! } else { ! IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, crcXMM, DXMM, This); ! } ! } ! ! void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC), ! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), ! Scratch(Register E), Scratch(Register F), ! Nehalem(Scratch(Register G)), ! MacroAssembler * This) { ! PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This); ! PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This); ! ! This->psllq(AXMM, 1); ! This->movdl(E, AXMM); ! This->psrlq(AXMM, 32); ! This->movdl(crcA, AXMM); ! ! This->xorl(F, F); ! This->crc32(F, E, 4); ! This->xorl(crcA, F); ! ! This->psllq(BXMM, 1); ! This->movdl(E, BXMM); ! This->psrlq(BXMM, 32); ! This->movdl(crcB, BXMM); ! ! This->xorl(F, F); ! This->crc32(F, E, 4); ! This->xorl(crcB, F); ! This->xorl(crcA, crcB); ! This->xorl(crcA, crcC); ! } ! ! void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, ! INOUT(Register len), INOUT(Register buf), INOUT(Register crc), ! Scratch(Register E), Scratch(Register F), Scratch(Register end), ! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), ! Scratch(Register G), Scratch(Register H), ! Nehalem(Scratch(Register I)), ! MacroAssembler * This) { ! Label L_processPartitions; ! Label L_processPartition; ! Label L_exit; ! ! This->bind(L_processPartitions); ! This->cmpl(len, 3 * size); ! This->jcc(Assembler::less, L_exit); ! This->xorl(E, E); ! This->xorl(F, F); ! This->movl(end, buf); ! This->addl(end, size); ! ! This->bind(L_processPartition); ! This->crc32(crc, Address(buf, 0), 4); ! This->crc32(E, Address(buf, size), 4); ! This->crc32(F, Address(buf, size*2), 4); ! This->crc32(crc, Address(buf, 0+4), 4); ! This->crc32(E, Address(buf, size+4), 4); ! This->crc32(F, Address(buf, size*2+4), 4); ! This->addl(buf, 8); ! This->cmpl(buf, end); ! This->jcc(Assembler::less, L_processPartition); ! ! This->push(end); ! This->push(len); ! This->push(buf); ! G = end; ! H = len; ! I = buf; ! ! RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F, ! AXMM, BXMM, CXMM, ! G, H, ! I, ! This); ! ! This->pop(buf); ! This->pop(len); ! This->pop(end); ! ! This->addl(buf, 2 * size); ! This->subl(len, 3 * size); ! This->jmp(L_processPartitions); ! ! This->bind(L_exit); ! } ! #endif //LP64 ! } ! #undef D ! ! #ifdef _LP64 ! // Algorithm 2: Pipelined usage of the CRC32 instruction. ! // Input: A buffer I of L bytes. ! // Output: the CRC32C value of the buffer. ! // Notations: ! // Write L = 24N + r, with N = floor (L/24). ! // r = L mod 24 (0 <= r < 24). ! // Consider I as the concatenation of A|B|C|R, where A, B, C, each, ! // N quadwords, and R consists of r bytes. ! // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1 ! // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1 ! // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1 ! // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1 ! void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len, ! Scratch(Register A), Scratch(Register B), Scratch(Register C), ! Scratch(Register D), Scratch(Register E), Scratch(Register F), ! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), ! bool IsPclmulqdqSupported) { ! uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS]; ! Label L_wordByWord; ! Label L_byteByByteProlog; ! Label L_byteByByte; ! Label L_exit; ! ! if (IsPclmulqdqSupported ) { ! CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; ! CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1); ! ! CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); ! CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); ! ! CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); ! CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); ! assert((CRC32C::NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\""); ! } else { ! CONSTOrPreCompConstIndex[0] = 1; ! CONSTOrPreCompConstIndex[1] = 0; ! ! CONSTOrPreCompConstIndex[2] = 3; ! CONSTOrPreCompConstIndex[3] = 2; ! ! CONSTOrPreCompConstIndex[4] = 5; ! CONSTOrPreCompConstIndex[5] = 4; ! } ! CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported, ! len, buf, crc, ! A, B, C, ! AXMM, BXMM, CXMM, ! D, E, ! F, ! this); ! CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported, ! len, buf, crc, ! A, B, C, ! AXMM, BXMM, CXMM, ! D, E, ! F, ! this); ! CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported, ! len, buf, crc, ! A, B, C, ! AXMM, BXMM, CXMM, ! D, E, ! F, ! this); ! movl(A, len); ! andl(A, 0x00000007); ! negl(A); ! addl(A, len); ! addq(A, buf); ! ! BIND(L_wordByWord); ! cmpq(buf, A); ! jcc(Assembler::greaterEqual, L_byteByByteProlog); ! crc32(crc, Address(buf, 0), 4); ! addq(buf, 4); ! jmp(L_wordByWord); ! ! BIND(L_byteByByteProlog); ! andl(len, 0x00000007); ! movl(B, 1); ! ! BIND(L_byteByByte); ! cmpl(B, len); ! jccb(Assembler::greater, L_exit); ! crc32(crc, Address(buf, 0), 1); ! incq(buf); ! incl(B); ! jmp(L_byteByByte); ! ! BIND(L_exit); ! } ! #else ! void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len, ! Scratch(Register A), Scratch(Register B), Scratch(Register C), ! Scratch(Register D), Scratch(Register E), Scratch(Register F), ! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), ! bool IsPclmulqdqSupported) { ! uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS]; ! Label L_wordByWord; ! Label L_byteByByteProlog; ! Label L_byteByByte; ! Label L_exit; ! ! if (IsPclmulqdqSupported) { ! CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; ! CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1); ! ! CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); ! CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); ! ! CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); ! CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); ! } else { ! CONSTOrPreCompConstIndex[0] = 1; ! CONSTOrPreCompConstIndex[1] = 0; ! ! CONSTOrPreCompConstIndex[2] = 3; ! CONSTOrPreCompConstIndex[3] = 2; ! ! CONSTOrPreCompConstIndex[4] = 5; ! CONSTOrPreCompConstIndex[5] = 4; ! } ! CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported, ! len, buf, crc, ! A, B, C, ! AXMM, BXMM, CXMM, ! D, E, ! F, ! this); ! CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported, ! len, buf, crc, ! A, B, C, ! AXMM, BXMM, CXMM, ! D, E, ! F, ! this); ! CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported, ! len, buf, crc, ! A, B, C, ! AXMM, BXMM, CXMM, ! D, E, ! F, ! this); ! movl(A, len); ! andl(A, 0x00000007); ! negl(A); ! addl(A, len); ! addl(A, buf); ! ! BIND(L_wordByWord); ! cmpl(buf, A); ! jcc(Assembler::greaterEqual, L_byteByByteProlog); ! crc32(crc, Address(buf,0), 4); ! addl(buf, 4); ! jmp(L_wordByWord); ! ! BIND(L_byteByByteProlog); ! andl(len, 0x00000007); ! movl(B, 1); ! ! BIND(L_byteByByte); ! cmpl(B, len); ! jccb(Assembler::greater, L_exit); ! movb(A, Address(buf, 0)); ! crc32(crc, A, 1); ! incl(buf); ! incl(B); ! jmp(L_byteByByte); ! ! BIND(L_exit); ! } ! #endif // LP64 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { switch (cond) { // Note some conditions are synonyms for others
< prev index next >