< prev index next >
src/cpu/x86/vm/macroAssembler_x86.cpp
Print this page
*** 8465,8476 ****
BIND(L_exit);
notl(crc); // ~c
}
! #undef BIND
! #undef BLOCK_COMMENT
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
switch (cond) {
// Note some conditions are synonyms for others
--- 8465,8972 ----
BIND(L_exit);
notl(crc); // ~c
}
! namespace CRC32C {
! #include "crc32c.h"
!
! #define Nehalem(x) x // annotation only (expands to its argument); wraps general registers that the 64-bit PCLMULQDQ helper merely forwards to the table-lookup routine IPL_Alg4
! #define Westmere(x) x // annotation only (expands to its argument); wraps the XMM register parameters of the helpers below
!
! #undef IN // drop any earlier definition of IN before redefining it
! #define IN(x) x // annotation only; by its name, marks a parameter as an input
! #define INOUT(x) x // annotation only; by its name, marks a parameter that is read and then overwritten with a result
! #undef OUT // drop any earlier definition of OUT before redefining it
! #define OUT(x) x // annotation only; by its name, marks a parameter as an output
! #define Scratch(x) x // annotation only; by its name, marks a register the emitted code may clobber
!
! #undef D // D is used as a parameter name below, so a macro named D must not be in effect
!
! #ifdef _LP64
! // Emits code for S. Gueron / Information Processing Letters 112 (2012) 184,
! // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
! // Input: register B holds a 32 bit value [byte3, byte2, byte1, byte0]; n selects the sub-table that starts n * 256 * 8 bytes past crc32c_table_addr().
! // Output: register B holds Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24, i.e. the 64-bit carry-less product of B * CONST; C, D and Z are overwritten.
! void IPL_Alg4(INOUT(Register B), uint32_t n,
! Scratch(Register C), Scratch(Register D), Scratch(Register Z),
! MacroAssembler * This) {
! This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr())); // Z = base address of the lookup table
! if (n > 0) { // decided while generating code, not at run time
! This->addq(Z, n * 256 * 8); // advance Z to sub-table n (256 entries of 8 bytes)
! }
! // Q1 = TABLEExt[n][B & 0xFF];
! This->movl(C, B);
! This->andl(C, 0x000000FF); // isolate byte0
! This->shll(C, 3); // scale the index by the 8-byte entry size
! This->addq(C, Z);
! This->movq(C, Address(C, 0)); // C = Q1
!
! // Q2 = TABLEExt[n][B >> 8 & 0xFF];
! This->movl(D, B);
! This->shrl(D, 8);
! This->andl(D, 0x000000FF); // isolate byte1
! This->shll(D, 3);
! This->addq(D, Z);
! This->movq(D, Address(D, 0)); // D = Q2
!
! This->shlq(D, 8);
! This->xorq(C, D); // C = Q1 ^ Q2 << 8
!
! // Q3 = TABLEExt[n][B >> 16 & 0xFF];
! This->movl(D, B);
! This->shrl(D, 16);
! This->andl(D, 0x000000FF); // isolate byte2
! This->shll(D, 3);
! This->addq(D, Z);
! This->movq(D, Address(D, 0)); // D = Q3
!
! This->shlq(D, 16);
! This->xorq(C, D); // C = Q1 ^ Q2 << 8 ^ Q3 << 16
!
! // Q4 = TABLEExt[n][B >> 24 & 0xFF];
! This->shrl(B, 24); // B is consumed here; it is no longer needed as the source value
! This->andl(B, 0x000000FF);
! This->shll(B, 3);
! This->addq(B, Z);
! This->movq(B, Address(B, 0)); // B = Q4
!
! This->shlq(B, 24);
! This->xorq(B, C); // fold the accumulated terms into B, the result register
! // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
! }
!
! void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)), // Emits code that replaces crc with the 64-bit carry-less product of crc and a constant (64-bit build)
! INOUT(Register crc),
! uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported, // the multiplier itself on the pclmulqdq path, otherwise the sub-table index n handed to IPL_Alg4
! Westmere(Scratch(XMMRegister DXMM)),
! Scratch(Register A),
! Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)), // B and C are touched only on the IPL_Alg4 path
! MacroAssembler * This) {
! if (IsPclmulqdqSupported) { // chosen while generating code; only one of the two sequences is emitted
! This->movdl(crcXMM, crc); // modified blindly: crcXMM is overwritten with crc
!
! This->movl(A, CONSTOrPreCompConstIndex); // stage the constant in a general register ...
! This->movdl(DXMM, A); // ... and move it into DXMM
! This->pclmulqdq(crcXMM, DXMM, 0); // immediate 0: per the PCLMULQDQ instruction definition, multiplies the low quadwords of both operands
!
! This->movdq(crc, crcXMM); // copy the product back into the general-purpose register
! } else {
! IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, This); // table-lookup fallback; the result is left in crc, A/B/C serve as its scratch registers
! }
! }
!
! // Recombination Alternative 2: No bit-reflections (64-bit build; result is left in crcA, crcB/E/F are overwritten)
! // T1 = (CRC_A * U1) << 1
! // T2 = (CRC_B * U2) << 1
! // C1 = T1 >> 32
! // C2 = T2 >> 32
! // T1 = T1 & 0xFFFFFFFF
! // T2 = T2 & 0xFFFFFFFF
! // T1 = CRC32(0, T1)
! // T2 = CRC32(0, T2)
! // C1 = C1 ^ T1
! // C2 = C2 ^ T2
! // CRC = C1 ^ C2 ^ CRC_C
! void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC),
! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
! Scratch(Register E), Scratch(Register F),
! Nehalem(Scratch(Register G)),
! MacroAssembler * This) {
! PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This); // crcA = CRC_A * U1
! PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This); // crcB = CRC_B * U2
! This->shlq(crcA, 1); // T1 = (CRC_A * U1) << 1
! This->movl(E, crcA); // E = T1 & 0xFFFFFFFF (32-bit move keeps the low half)
! This->shrq(crcA, 32); // C1 = T1 >> 32
! This->xorl(F, F); // start the CRC from 0
! This->crc32(F, E, 4); // F = CRC32(0, T1) over 4 bytes
! This->xorl(crcA, F); // we don't care about upper 32 bit contents here
! This->shlq(crcB, 1); // T2 = (CRC_B * U2) << 1
! This->movl(E, crcB); // E = T2 & 0xFFFFFFFF
! This->shrq(crcB, 32); // C2 = T2 >> 32
! This->xorl(F, F);
! This->crc32(F, E, 4); // F = CRC32(0, T2) over 4 bytes
! This->xorl(crcB, F); // C2 = C2 ^ T2
! This->xorl(crcA, crcB); // C1 ^ C2
! This->xorl(crcA, crcC); // CRC = C1 ^ C2 ^ CRC_C, left in crcA
! }
!
! // Emits a loop that runs while len >= 3 * size: three adjacent partitions of size bytes each are
! // checksummed in one interleaved loop and recombined into crc; each pass advances buf by 3 * size and reduces len by 3 * size.
! // One pass, in the pseudo-code of the underlying algorithm (N = number of quadwords per partition):
! // CRC_A = crc (the incoming value), CRC_B = 0, CRC_C = 0
! // for i = 1 to N do
! // CRC_A = CRC32(CRC_A, A[i])
! // CRC_B = CRC32(CRC_B, B[i])
! // CRC_C = CRC32(CRC_C, C[i])
! // end for
! // Recombine
! void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported,
! INOUT(Register len), INOUT(Register buf), INOUT(Register crc),
! Scratch(Register E), Scratch(Register F), Scratch(Register end),
! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
! Scratch(Register G), Scratch(Register H),
! Nehalem(Scratch(Register I)),
! MacroAssembler * This) {
! Label L_processPartitions;
! Label L_processPartition;
! Label L_exit;
!
! This->bind(L_processPartitions); // outer loop: one pass per group of three partitions
! This->cmpl(len, 3 * size);
! This->jcc(Assembler::less, L_exit); // fewer than 3 * size bytes left: this chunk size is done
! This->xorl(E, E); // CRC_B = 0
! This->xorl(F, F); // CRC_C = 0
! This->movq(end, buf);
! This->addq(end, size); // end = one past the first partition
!
! This->bind(L_processPartition); // inner loop: 8 bytes from each partition per iteration
! This->crc32(crc, Address(buf, 0), 8); // partition A accumulates into crc
! This->crc32(E, Address(buf, size), 8); // partition B accumulates into E
! This->crc32(F, Address(buf, size * 2), 8); // partition C accumulates into F
! This->addq(buf, 8);
! This->cmpq(buf, end);
! This->jcc(Assembler::less, L_processPartition);
! RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F, // combine the three partial CRCs; result lands in crc
! AXMM, BXMM, CXMM,
! G, H,
! I,
! This);
! This->addq(buf, 2 * size); // buf already moved past partition A; skip B and C
! This->subl(len, 3 * size);
! This->jmp(L_processPartitions);
!
! This->bind(L_exit);
! }
! #else
! void IPL_Alg4(INOUT(Register B), uint32_t n, // 32-bit build of the table-lookup carry-less multiply: B is the 32 bit input, n selects the sub-table
! Scratch(Register C), Scratch(Register D), Scratch(Register Z), // general registers used for address arithmetic; all overwritten
! Scratch(XMMRegister CXMM), Scratch(XMMRegister DXMM), // the 64-bit result is accumulated in CXMM; DXMM is a temporary
! MacroAssembler * This) {
! This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr())); // Z = base address of the lookup table
! if (n > 0) { // decided while generating code, not at run time
! This->addl(Z, n * 256 * 8); // advance Z to sub-table n (256 entries of 8 bytes)
! }
! // Q1 = TABLEExt[n][B & 0xFF];
! This->movl(C, B);
! This->andl(C, 0x000000FF); // isolate byte0
! This->shll(C, 3); // scale the index by the 8-byte entry size
! This->addl(C, Z);
! This->movq(CXMM, Address(C, 0)); // CXMM = Q1
!
! // Q2 = TABLEExt[n][B >> 8 & 0xFF];
! This->movl(D, B);
! This->shrl(D, 8);
! This->andl(D, 0x000000FF); // isolate byte1
! This->shll(D, 3);
! This->addl(D, Z);
! This->movq(DXMM, Address(D, 0)); // DXMM = Q2
!
! This->psllq(DXMM, 8);
! This->pxor(CXMM, DXMM); // CXMM = Q1 ^ Q2 << 8
!
! // Q3 = TABLEExt[n][B >> 16 & 0xFF];
! This->movl(D, B);
! This->shrl(D, 16);
! This->andl(D, 0x000000FF); // isolate byte2
! This->shll(D, 3);
! This->addl(D, Z);
! This->movq(DXMM, Address(D, 0)); // DXMM = Q3
!
! This->psllq(DXMM, 16);
! This->pxor(CXMM, DXMM); // CXMM = Q1 ^ Q2 << 8 ^ Q3 << 16
!
! // Q4 = TABLEExt[n][B >> 24 & 0xFF];
! This->shrl(B, 24); // B is consumed here and ends up holding a table address, not the result
! This->andl(B, 0x000000FF);
! This->shll(B, 3);
! This->addl(B, Z);
! This->movq(DXMM, Address(B, 0)); // DXMM = Q4
!
! This->psllq(DXMM, 24);
! This->pxor(CXMM, DXMM); // Result in CXMM
! // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
! }
!
! void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)), // 32-bit build: emits the carry-less product of crc and a constant; the 64-bit result is left in crcXMM
! INOUT(Register crc),
! uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported, // the multiplier itself on the pclmulqdq path, otherwise the sub-table index n handed to IPL_Alg4
! Westmere(Scratch(XMMRegister DXMM)),
! Scratch(Register A),
! Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)), // B and C are touched only on the IPL_Alg4 path
! MacroAssembler * This) {
! if (IsPclmulqdqSupported) { // chosen while generating code; only one of the two sequences is emitted
! This->movdl(crcXMM, crc); // crcXMM is overwritten with crc
!
! This->movl(A, CONSTOrPreCompConstIndex); // stage the constant in a general register ...
! This->movdl(DXMM, A); // ... and move it into DXMM
! This->pclmulqdq(crcXMM, DXMM, 0); // immediate 0: per the PCLMULQDQ instruction definition, multiplies the low quadwords of both operands
! // Keep result in XMM since GPR is 32 bit in length
! } else {
! IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, crcXMM, DXMM, This); // table-lookup fallback; crcXMM is passed as the accumulator so the result also ends up there
! }
! }
!
! void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC), // 32-bit build of the recombination step; the combined CRC is left in crcA
! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), // AXMM/BXMM receive the two 64-bit products, CXMM is a temporary
! Scratch(Register E), Scratch(Register F),
! Nehalem(Scratch(Register G)),
! MacroAssembler * This) {
! PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This); // AXMM = CRC_A * U1
! PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This); // BXMM = CRC_B * U2
!
! This->psllq(AXMM, 1); // T1 = (CRC_A * U1) << 1
! This->movdl(E, AXMM); // E = low 32 bits of T1
! This->psrlq(AXMM, 32);
! This->movdl(crcA, AXMM); // crcA = C1 = T1 >> 32
!
! This->xorl(F, F); // start the CRC from 0
! This->crc32(F, E, 4); // F = CRC32(0, low half of T1) over 4 bytes
! This->xorl(crcA, F); // C1 = C1 ^ T1
!
! This->psllq(BXMM, 1); // T2 = (CRC_B * U2) << 1
! This->movdl(E, BXMM); // E = low 32 bits of T2
! This->psrlq(BXMM, 32);
! This->movdl(crcB, BXMM); // crcB = C2 = T2 >> 32
!
! This->xorl(F, F);
! This->crc32(F, E, 4); // F = CRC32(0, low half of T2) over 4 bytes
! This->xorl(crcB, F); // C2 = C2 ^ T2
! This->xorl(crcA, crcB); // C1 ^ C2
! This->xorl(crcA, crcC); // CRC = C1 ^ C2 ^ CRC_C, left in crcA
! }
!
! void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, // 32-bit build: while len >= 3 * size, CRC three adjacent size-byte partitions and recombine them into crc
! INOUT(Register len), INOUT(Register buf), INOUT(Register crc),
! Scratch(Register E), Scratch(Register F), Scratch(Register end),
! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
! Scratch(Register G), Scratch(Register H), // the incoming G, H and I values are ignored: they are reassigned below
! Nehalem(Scratch(Register I)),
! MacroAssembler * This) {
! Label L_processPartitions;
! Label L_processPartition;
! Label L_exit;
!
! This->bind(L_processPartitions); // outer loop: one pass per group of three partitions
! This->cmpl(len, 3 * size);
! This->jcc(Assembler::less, L_exit); // fewer than 3 * size bytes left: this chunk size is done
! This->xorl(E, E); // CRC of the second partition starts at 0
! This->xorl(F, F); // CRC of the third partition starts at 0
! This->movl(end, buf);
! This->addl(end, size); // end = one past the first partition
!
! This->bind(L_processPartition); // inner loop: 8 bytes from each partition per iteration, as two 4-byte steps
! This->crc32(crc, Address(buf, 0), 4);
! This->crc32(E, Address(buf, size), 4);
! This->crc32(F, Address(buf, size*2), 4);
! This->crc32(crc, Address(buf, 0+4), 4); // second 4-byte half of the same 8 bytes
! This->crc32(E, Address(buf, size+4), 4);
! This->crc32(F, Address(buf, size*2+4), 4);
! This->addl(buf, 8);
! This->cmpl(buf, end);
! This->jcc(Assembler::less, L_processPartition);
!
! This->push(end); // save end, len and buf so they can double as scratch registers for RecAlt2
! This->push(len);
! This->push(buf);
! G = end; // rebinds the local parameter only; the emitted code now uses end where RecAlt2 expects E
! H = len;
! I = buf;
!
! RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F, // combine the three partial CRCs; result lands in crc
! AXMM, BXMM, CXMM,
! G, H,
! I,
! This);
!
! This->pop(buf); // restore in reverse order of the pushes
! This->pop(len);
! This->pop(end);
!
! This->addl(buf, 2 * size); // buf already moved past the first partition; skip the other two
! This->subl(len, 3 * size);
! This->jmp(L_processPartitions);
!
! This->bind(L_exit);
! }
! #endif //LP64
! }
! #undef D
!
! #ifdef _LP64
! // Algorithm 2: Pipelined usage of the CRC32 instruction.
! // Input: A buffer I of L bytes (registers buf and len) and the running CRC value in register crc.
! // Output: the CRC32C value of the buffer, accumulated into crc; buf, len and all scratch registers are overwritten.
! // Notations:
! // Write L = 24N + r, with N = floor (L/24).
! // r = L mod 24 (0 <= r < 24).
! // Consider I as the concatenation of A|B|C|R, where A, B, C are, each,
! // N quadwords (8N bytes), and R consists of r bytes.
! // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
! // B[j] = I [8N + 8j+7:8N + 8j], j= 0, 1, ..., N-1
! // C[j] = I [16N + 8j+7:16N + 8j], j= 0, 1, ..., N-1
! // if r > 0 R[j] = I [24N +j], j= 0, 1, ...,r-1
! void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len,
! Scratch(Register A), Scratch(Register B), Scratch(Register C),
! Scratch(Register D), Scratch(Register E), Scratch(Register F),
! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
! bool IsPclmulqdqSupported) {
! uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS]; // per chunk size, a pair of entries: either the multiplier constants or sub-table indices
! Label L_wordByWord;
! Label L_byteByByteProlog;
! Label L_byteByByte;
! Label L_exit;
!
! if (IsPclmulqdqSupported ) { // pclmulqdq path: read the six 32-bit constants from the start of the table now, while generating code
! CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; // note the pairwise swap: table word 0 goes to slot 1 ...
! CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1); // ... and table word 1 goes to slot 0
!
! CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
! CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
!
! CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
! CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
! assert((CRC32C::NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\""); // slots 0..5 are filled above, so the array must have exactly six entries
! } else { // table-lookup path: the slots hold sub-table indices for IPL_Alg4, with the same pairwise swap
! CONSTOrPreCompConstIndex[0] = 1;
! CONSTOrPreCompConstIndex[1] = 0;
!
! CONSTOrPreCompConstIndex[2] = 3;
! CONSTOrPreCompConstIndex[3] = 2;
!
! CONSTOrPreCompConstIndex[4] = 5;
! CONSTOrPreCompConstIndex[5] = 4;
! }
! CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported, // three passes with fixed partition sizes HIGH, MIDDLE and LOW; each consumes as many 3 * size groups as len allows
! len, buf, crc,
! A, B, C,
! AXMM, BXMM, CXMM,
! D, E,
! F,
! this);
! CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported,
! len, buf, crc,
! A, B, C,
! AXMM, BXMM, CXMM,
! D, E,
! F,
! this);
! CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported,
! len, buf, crc,
! A, B, C,
! AXMM, BXMM, CXMM,
! D, E,
! F,
! this);
! movl(A, len); // tail handling: A = buf + (len - (len & 7)), the end of the part that is a multiple of 8 bytes
! andl(A, 0x00000007);
! negl(A);
! addl(A, len);
! addq(A, buf);
!
! BIND(L_wordByWord); // consume 4 bytes per iteration until buf reaches A
! cmpq(buf, A);
! jcc(Assembler::greaterEqual, L_byteByByteProlog);
! crc32(crc, Address(buf, 0), 4);
! addq(buf, 4);
! jmp(L_wordByWord);
!
! BIND(L_byteByByteProlog);
! andl(len, 0x00000007); // len = number of trailing bytes (0..7)
! movl(B, 1); // B counts from 1 up to len
!
! BIND(L_byteByByte); // consume the remaining bytes one at a time
! cmpl(B, len);
! jccb(Assembler::greater, L_exit);
! crc32(crc, Address(buf, 0), 1);
! incq(buf);
! incl(B);
! jmp(L_byteByByte);
!
! BIND(L_exit);
! }
! #else
! void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len, // 32-bit build: emits code that accumulates the CRC of len bytes at buf into crc; buf, len and all scratch registers are overwritten
! Scratch(Register A), Scratch(Register B), Scratch(Register C),
! Scratch(Register D), Scratch(Register E), Scratch(Register F),
! Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
! bool IsPclmulqdqSupported) {
! uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS]; // per chunk size, a pair of entries: either the multiplier constants or sub-table indices
! Label L_wordByWord;
! Label L_byteByByteProlog;
! Label L_byteByByte;
! Label L_exit;
!
! if (IsPclmulqdqSupported) { // pclmulqdq path: read the six 32-bit constants from the start of the table now, while generating code
! CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; // note the pairwise swap: table word 0 goes to slot 1 ...
! CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1); // ... and table word 1 goes to slot 0
!
! CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
! CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
!
! CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
! CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); // NOTE(review): the _LP64 variant asserts NUM_PRECOMPUTED_CONSTANTS - 1 == 5 at this point; no such check exists here
! } else { // table-lookup path: the slots hold sub-table indices for IPL_Alg4, with the same pairwise swap
! CONSTOrPreCompConstIndex[0] = 1;
! CONSTOrPreCompConstIndex[1] = 0;
!
! CONSTOrPreCompConstIndex[2] = 3;
! CONSTOrPreCompConstIndex[3] = 2;
!
! CONSTOrPreCompConstIndex[4] = 5;
! CONSTOrPreCompConstIndex[5] = 4;
! }
! CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported, // three passes with fixed partition sizes HIGH, MIDDLE and LOW; each consumes as many 3 * size groups as len allows
! len, buf, crc,
! A, B, C,
! AXMM, BXMM, CXMM,
! D, E,
! F,
! this);
! CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported,
! len, buf, crc,
! A, B, C,
! AXMM, BXMM, CXMM,
! D, E,
! F,
! this);
! CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported,
! len, buf, crc,
! A, B, C,
! AXMM, BXMM, CXMM,
! D, E,
! F,
! this);
! movl(A, len); // tail handling: A = buf + (len - (len & 7)), the end of the part that is a multiple of 8 bytes
! andl(A, 0x00000007);
! negl(A);
! addl(A, len);
! addl(A, buf);
!
! BIND(L_wordByWord); // consume 4 bytes per iteration until buf reaches A
! cmpl(buf, A);
! jcc(Assembler::greaterEqual, L_byteByByteProlog);
! crc32(crc, Address(buf,0), 4);
! addl(buf, 4);
! jmp(L_wordByWord);
!
! BIND(L_byteByByteProlog);
! andl(len, 0x00000007); // len = number of trailing bytes (0..7)
! movl(B, 1); // B counts from 1 up to len
!
! BIND(L_byteByByte); // consume the remaining bytes one at a time
! cmpl(B, len);
! jccb(Assembler::greater, L_exit);
! movb(A, Address(buf, 0)); // load the byte into A first (A is free again after the word loop); NOTE(review): presumably callers pass a register with a byte form for A - confirm
! crc32(crc, A, 1); // register-operand form, unlike the memory-operand form used in the _LP64 variant
! incl(buf);
! incl(B);
! jmp(L_byteByByte);
!
! BIND(L_exit);
! }
! #endif // LP64
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
switch (cond) {
// Note some conditions are synonyms for others
< prev index next >