< prev index next >
src/cpu/x86/vm/macroAssembler_x86.cpp
Print this page
*** 43,52 ****
--- 43,53 ----
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
+ #include "crc32c.h"
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
*** 8516,8525 ****
--- 8517,8991 ----
BIND(L_exit);
notl(crc); // ~c
}
+ #ifdef _LP64
+ // S. Gueron / Information Processing Letters 112 (2012) 184
+ // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
+ // Input: A 32 bit value B = [byte3, byte2, byte1, byte0], held in 'in'; n selects which 256-entry table is used.
+ // Output: the 64-bit carry-less product of B * CONST, left in 'in'; tmp1, tmp2 and tmp3 are overwritten.
+ void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
+ Register tmp1, Register tmp2, Register tmp3) {
+ lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); // tmp3 = address of table 0
+ if (n > 0) {
+ addq(tmp3, n * 256 * 8); // advance to table n: each table is 256 entries of 8 bytes
+ }
+ // Q1 = TABLEExt[n][B & 0xFF];
+ movl(tmp1, in);
+ andl(tmp1, 0x000000FF);
+ shll(tmp1, 3); // byte value * 8 = offset of the 8-byte table entry
+ addq(tmp1, tmp3);
+ movq(tmp1, Address(tmp1, 0)); // tmp1 = Q1
+
+ // Q2 = TABLEExt[n][B >> 8 & 0xFF];
+ movl(tmp2, in);
+ shrl(tmp2, 8);
+ andl(tmp2, 0x000000FF);
+ shll(tmp2, 3);
+ addq(tmp2, tmp3);
+ movq(tmp2, Address(tmp2, 0)); // tmp2 = Q2
+
+ shlq(tmp2, 8);
+ xorq(tmp1, tmp2); // tmp1 = Q1 ^ Q2 << 8
+
+ // Q3 = TABLEExt[n][B >> 16 & 0xFF];
+ movl(tmp2, in);
+ shrl(tmp2, 16);
+ andl(tmp2, 0x000000FF);
+ shll(tmp2, 3);
+ addq(tmp2, tmp3);
+ movq(tmp2, Address(tmp2, 0)); // tmp2 = Q3
+
+ shlq(tmp2, 16);
+ xorq(tmp1, tmp2); // tmp1 = Q1 ^ Q2 << 8 ^ Q3 << 16
+
+ // Q4 = TABLEExt[n][B >> 24 & 0xFF];
+ shrl(in, 24); // last use of B, so 'in' itself is consumed here
+ andl(in, 0x000000FF);
+ shll(in, 3);
+ addq(in, tmp3);
+ movq(in, Address(in, 0)); // in = Q4
+
+ shlq(in, 24);
+ xorq(in, tmp1); // fold the partial result in tmp1 into the output register
+ // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
+ }
+
+ void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, // scratch XMM, overwritten on the PCLMULQDQ path
+ Register in_out, // in: 32-bit multiplicand; out: 64-bit carry-less product
+ uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, // the constant itself (PCLMULQDQ path) or a lookup-table index (fallback path)
+ XMMRegister w_xtmp2, // scratch XMM, overwritten on the PCLMULQDQ path
+ Register tmp1, // scratch GPR, overwritten on both paths
+ Register n_tmp2, Register n_tmp3) { // scratch GPRs, used only by the table-based fallback
+ if (is_pclmulqdq_supported) {
+ movdl(w_xtmp1, in_out); // modified blindly (w_xtmp1 is not saved or restored)
+
+ movl(tmp1, const_or_pre_comp_const_index);
+ movdl(w_xtmp2, tmp1); // w_xtmp2 = the 32-bit constant
+ pclmulqdq(w_xtmp1, w_xtmp2, 0); // imm8 0: multiply the low quadwords of both operands
+
+ movdq(in_out, w_xtmp1); // copy the 64-bit product back into the GPR
+ } else {
+ crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3); // table lookup instead of PCLMULQDQ; result also ends up in in_out
+ }
+ }
+
+ // Recombination Alternative 2: No bit-reflections
+ // T1 = (CRC_A * U1) << 1
+ // T2 = (CRC_B * U2) << 1
+ // C1 = T1 >> 32
+ // C2 = T2 >> 32
+ // T1 = T1 & 0xFFFFFFFF
+ // T2 = T2 & 0xFFFFFFFF
+ // T1 = CRC32(0, T1)
+ // T2 = CRC32(0, T2)
+ // C1 = C1 ^ T1
+ // C2 = C2 ^ T2
+ // CRC = C1 ^ C2 ^ CRC_C
+ void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, // in_out: CRC_A in, combined CRC out; in1: CRC_B (overwritten); in2: CRC_C (only read)
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, // scratch XMM registers for the multiplications
+ Register tmp1, Register tmp2, // scratch GPRs, overwritten
+ Register n_tmp3) { // scratch GPR, only forwarded to crc32c_pclmulqdq
+ crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); // in_out = CRC_A * U1
+ crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); // in1 = CRC_B * U2
+ shlq(in_out, 1); // T1
+ movl(tmp1, in_out); // tmp1 = T1 & 0xFFFFFFFF
+ shrq(in_out, 32); // C1 = T1 >> 32
+ xorl(tmp2, tmp2); // start the CRC32 below from 0
+ crc32(tmp2, tmp1, 4); // tmp2 = CRC32(0, T1)
+ xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
+ shlq(in1, 1); // T2
+ movl(tmp1, in1); // tmp1 = T2 & 0xFFFFFFFF
+ shrq(in1, 32); // C2 = T2 >> 32
+ xorl(tmp2, tmp2);
+ crc32(tmp2, tmp1, 4); // tmp2 = CRC32(0, T2)
+ xorl(in1, tmp2); // C2 = C2 ^ T2
+ xorl(in_out, in1); // C1 ^ C2
+ xorl(in_out, in2); // ... ^ CRC_C
+ }
+
+ // Set N to predefined value
+ // Subtract from a length of a buffer
+ // execute in a loop:
+ // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
+ // for i = 1 to N do
+ // CRC_A = CRC32(CRC_A, A[i])
+ // CRC_B = CRC32(CRC_B, B[i])
+ // CRC_C = CRC32(CRC_C, C[i])
+ // end for
+ // Recombine
+ void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, // size: bytes per partition (A, B and C each)
+ Register in_out1, Register in_out2, Register in_out3, // in_out1: remaining length; in_out2: buffer position; in_out3: running CRC (CRC_A)
+ Register tmp1, Register tmp2, Register tmp3, // tmp1: CRC_B; tmp2: CRC_C; tmp3: end address of partition A
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, // scratch XMM registers for the recombination
+ Register tmp4, Register tmp5, // scratch GPRs for the recombination
+ Register n_tmp6) { // scratch GPR for the recombination
+ Label L_processPartitions;
+ Label L_processPartition;
+ Label L_exit;
+
+ bind(L_processPartitions);
+ cmpl(in_out1, 3 * size);
+ jcc(Assembler::less, L_exit); // fewer than three partitions of this size remain
+ xorl(tmp1, tmp1); // CRC_B = 0
+ xorl(tmp2, tmp2); // CRC_C = 0
+ movq(tmp3, in_out2);
+ addq(tmp3, size); // tmp3 = in_out2 + size, the end of partition A
+
+ bind(L_processPartition);
+ crc32(in_out3, Address(in_out2, 0), 8); // next 8 bytes of A
+ crc32(tmp1, Address(in_out2, size), 8); // next 8 bytes of B
+ crc32(tmp2, Address(in_out2, size * 2), 8); // next 8 bytes of C
+ addq(in_out2, 8);
+ cmpq(in_out2, tmp3);
+ jcc(Assembler::less, L_processPartition); // loop until partition A is exhausted
+ crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, // merge CRC_A, CRC_B and CRC_C into in_out3
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ n_tmp6);
+ addq(in_out2, 2 * size); // in_out2 already moved past A; skip B and C as well
+ subl(in_out1, 3 * size); // three partitions consumed
+ jmp(L_processPartitions);
+
+ bind(L_exit);
+ }
+ #else
+ void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n, // table-based carry-less multiply; in_out holds the 32-bit value B and is overwritten; n selects the table
+ Register tmp1, Register tmp2, Register tmp3, // scratch GPRs, overwritten
+ XMMRegister xtmp1, XMMRegister xtmp2) { // xtmp1 receives the 64-bit product; xtmp2 is scratch
+ lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); // tmp3 = address of table 0
+ if (n > 0) {
+ addl(tmp3, n * 256 * 8); // advance to table n: each table is 256 entries of 8 bytes
+ }
+ // Q1 = TABLEExt[n][B & 0xFF];
+ movl(tmp1, in_out);
+ andl(tmp1, 0x000000FF);
+ shll(tmp1, 3); // byte value * 8 = offset of the 8-byte table entry
+ addl(tmp1, tmp3);
+ movq(xtmp1, Address(tmp1, 0)); // xtmp1 = Q1
+
+ // Q2 = TABLEExt[n][B >> 8 & 0xFF];
+ movl(tmp2, in_out);
+ shrl(tmp2, 8);
+ andl(tmp2, 0x000000FF);
+ shll(tmp2, 3);
+ addl(tmp2, tmp3);
+ movq(xtmp2, Address(tmp2, 0)); // xtmp2 = Q2
+
+ psllq(xtmp2, 8);
+ pxor(xtmp1, xtmp2); // xtmp1 = Q1 ^ Q2 << 8
+
+ // Q3 = TABLEExt[n][B >> 16 & 0xFF];
+ movl(tmp2, in_out);
+ shrl(tmp2, 16);
+ andl(tmp2, 0x000000FF);
+ shll(tmp2, 3);
+ addl(tmp2, tmp3);
+ movq(xtmp2, Address(tmp2, 0)); // xtmp2 = Q3
+
+ psllq(xtmp2, 16);
+ pxor(xtmp1, xtmp2); // xtmp1 = Q1 ^ Q2 << 8 ^ Q3 << 16
+
+ // Q4 = TABLEExt[n][B >> 24 & 0xFF];
+ shrl(in_out, 24); // last use of B, so in_out itself is consumed as the table address
+ andl(in_out, 0x000000FF);
+ shll(in_out, 3);
+ addl(in_out, tmp3);
+ movq(xtmp2, Address(in_out, 0)); // xtmp2 = Q4
+
+ psllq(xtmp2, 24);
+ pxor(xtmp1, xtmp2); // Result in xtmp1
+ // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
+ }
+
+ void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, // receives the 64-bit carry-less product on both paths
+ Register in_out, // 32-bit multiplicand; overwritten on the table-based fallback path
+ uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, // the constant itself (PCLMULQDQ path) or a lookup-table index (fallback path)
+ XMMRegister w_xtmp2, // scratch XMM, overwritten
+ Register tmp1, // scratch GPR, overwritten
+ Register n_tmp2, Register n_tmp3) { // scratch GPRs, used only by the table-based fallback
+ if (is_pclmulqdq_supported) {
+ movdl(w_xtmp1, in_out); // w_xtmp1 = the 32-bit multiplicand
+
+ movl(tmp1, const_or_pre_comp_const_index);
+ movdl(w_xtmp2, tmp1); // w_xtmp2 = the 32-bit constant
+ pclmulqdq(w_xtmp1, w_xtmp2, 0); // imm8 0: multiply the low quadwords of both operands
+ // Keep result in XMM since GPR is 32 bit in length
+ } else {
+ crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2); // table lookup instead of PCLMULQDQ; result also ends up in w_xtmp1
+ }
+ }
+
+ void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, // in_out: first CRC in, combined CRC out; in1: second CRC (overwritten); in2: third CRC (only read)
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, // w_xtmp1/w_xtmp2 hold the two 64-bit products; w_xtmp3 is scratch
+ Register tmp1, Register tmp2, // scratch GPRs, overwritten
+ Register n_tmp3) { // scratch GPR, only forwarded to crc32c_pclmulqdq
+ crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); // w_xtmp1 = in_out * U1
+ crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); // w_xtmp2 = in1 * U2
+
+ psllq(w_xtmp1, 1); // first product << 1
+ movdl(tmp1, w_xtmp1); // tmp1 = its low 32 bits
+ psrlq(w_xtmp1, 32);
+ movdl(in_out, w_xtmp1); // in_out = its high 32 bits
+
+ xorl(tmp2, tmp2); // start the CRC32 below from 0
+ crc32(tmp2, tmp1, 4); // tmp2 = CRC32(0, low 32 bits)
+ xorl(in_out, tmp2); // in_out = high 32 bits ^ CRC32(0, low 32 bits)
+
+ psllq(w_xtmp2, 1); // second product << 1
+ movdl(tmp1, w_xtmp2); // tmp1 = its low 32 bits
+ psrlq(w_xtmp2, 32);
+ movdl(in1, w_xtmp2); // in1 = its high 32 bits
+
+ xorl(tmp2, tmp2);
+ crc32(tmp2, tmp1, 4); // tmp2 = CRC32(0, low 32 bits)
+ xorl(in1, tmp2); // in1 = high 32 bits ^ CRC32(0, low 32 bits)
+ xorl(in_out, in1); // combine the first two partial results
+ xorl(in_out, in2); // and fold in the third CRC
+ }
+
+ // 32-bit build: while at least 3 * size bytes remain, run the CRC32 instruction
+ // over three adjacent partitions of 'size' bytes each in an interleaved fashion
+ // and recombine the three partial CRCs into in_out3.
+ // in_out1 - remaining length in bytes; reduced by 3 * size per round
+ // in_out2 - current buffer position; advanced by 3 * size per round
+ // in_out3 - running CRC; receives the recombined CRC after each round
+ // tmp1, tmp2 - CRCs of the second and third partition
+ // tmp3 - end address of the first partition
+ // w_xtmp1, w_xtmp2, w_xtmp3 - scratch XMM registers for the recombination
+ // The tmp4, tmp5 and n_tmp6 arguments are not used as passed in: tmp3, in_out1
+ // and in_out2 are saved on the stack and reused as scratch for the recombination.
+ void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
+ Register in_out1, Register in_out2, Register in_out3,
+ Register tmp1, Register tmp2, Register tmp3,
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ Register tmp4, Register tmp5,
+ Register n_tmp6) {
+ Label L_processPartitions;
+ Label L_processPartition;
+ Label L_exit;
+
+ bind(L_processPartitions);
+ // in_out1 is a byte count, so the signed comparison is kept here.
+ cmpl(in_out1, 3 * size);
+ jcc(Assembler::less, L_exit);
+ xorl(tmp1, tmp1);
+ xorl(tmp2, tmp2);
+ movl(tmp3, in_out2);
+ addl(tmp3, size);
+
+ bind(L_processPartition);
+ // Two 4-byte steps per partition and iteration, 8 bytes in total.
+ crc32(in_out3, Address(in_out2, 0), 4);
+ crc32(tmp1, Address(in_out2, size), 4);
+ crc32(tmp2, Address(in_out2, size*2), 4);
+ crc32(in_out3, Address(in_out2, 0+4), 4);
+ crc32(tmp1, Address(in_out2, size+4), 4);
+ crc32(tmp2, Address(in_out2, size*2+4), 4);
+ addl(in_out2, 8);
+ cmpl(in_out2, tmp3);
+ // in_out2 and tmp3 are addresses: compare them unsigned. A signed test ends
+ // the loop after one iteration when the partition crosses 0x80000000.
+ jcc(Assembler::below, L_processPartition);
+
+ // All general purpose registers are in use: free three of them for the
+ // recombination and restore them afterwards.
+ push(tmp3);
+ push(in_out1);
+ push(in_out2);
+ tmp4 = tmp3;
+ tmp5 = in_out1;
+ n_tmp6 = in_out2;
+
+ crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ n_tmp6);
+
+ pop(in_out2);
+ pop(in_out1);
+ pop(tmp3);
+
+ // in_out2 already moved past the first partition; skip the other two.
+ addl(in_out2, 2 * size);
+ subl(in_out1, 3 * size);
+ jmp(L_processPartitions);
+
+ bind(L_exit);
+ }
+ #endif //LP64
+
+ #ifdef _LP64
+ // Algorithm 2: Pipelined usage of the CRC32 instruction.
+ // Input: A buffer I of L bytes.
+ // Output: the CRC32C value of the buffer.
+ // Notations:
+ // Write L = 24N + r, with N = floor (L/24).
+ // r = L mod 24 (0 <= r < 24).
+ // Consider I as the concatenation of A|B|C|R, where A, B, C each consist of
+ // N quadwords, and R consists of r bytes.
+ // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
+ // B[j] = I [8N + 8j+7:8N + 8j], j= 0, 1, ..., N-1
+ // C[j] = I [16N + 8j+7:16N + 8j], j= 0, 1, ..., N-1
+ // if r > 0 R[j] = I [24N +j], j= 0, 1, ...,r-1
+ void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, // in_out: running CRC; in1: buffer position; in2: remaining length in bytes
+ Register tmp1, Register tmp2, Register tmp3, // scratch GPRs, overwritten
+ Register tmp4, Register tmp5, Register tmp6, // scratch GPRs handed to the recombination
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, // scratch XMM registers handed to the recombination
+ bool is_pclmulqdq_supported) { // selects PCLMULQDQ or the table-based multiplication
+ uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; // one (U1, U2) pair per chunk size
+ Label L_wordByWord;
+ Label L_byteByByteProlog;
+ Label L_byteByByte;
+ Label L_exit;
+
+ if (is_pclmulqdq_supported ) {
+ const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; // PCLMULQDQ path: the constants themselves are the first six 32-bit words of the table,
+ const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1); // stored pairwise swapped relative to this array
+
+ const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
+ const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
+
+ const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
+ const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
+ assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
+ } else {
+ const_or_pre_comp_const_index[0] = 1; // table path: these are lookup-table indices for crc32c_ipl_alg4, not the constants
+ const_or_pre_comp_const_index[1] = 0;
+
+ const_or_pre_comp_const_index[2] = 3;
+ const_or_pre_comp_const_index[3] = 2;
+
+ const_or_pre_comp_const_index[4] = 5;
+ const_or_pre_comp_const_index[5] = 4;
+ }
+ crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, // first pass: partitions of CRC32C_HIGH bytes
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, // second pass: partitions of CRC32C_MIDDLE bytes
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, // third pass: partitions of CRC32C_LOW bytes
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ movl(tmp1, in2);
+ andl(tmp1, 0x00000007);
+ negl(tmp1);
+ addl(tmp1, in2); // tmp1 = in2 - (in2 & 7): remaining length rounded down to a multiple of 8
+ addq(tmp1, in1); // tmp1 = address where the 4-byte steps stop
+
+ BIND(L_wordByWord);
+ cmpq(in1, tmp1);
+ jcc(Assembler::greaterEqual, L_byteByByteProlog);
+ crc32(in_out, Address(in1, 0), 4); // consume 4 bytes per iteration
+ addq(in1, 4);
+ jmp(L_wordByWord);
+
+ BIND(L_byteByByteProlog);
+ andl(in2, 0x00000007); // in2 = number of trailing bytes (0..7)
+ movl(tmp2, 1); // tmp2 = 1-based byte counter
+
+ BIND(L_byteByByte);
+ cmpl(tmp2, in2);
+ jccb(Assembler::greater, L_exit); // all trailing bytes consumed
+ crc32(in_out, Address(in1, 0), 1); // consume a single byte
+ incq(in1);
+ incl(tmp2);
+ jmp(L_byteByByte);
+
+ BIND(L_exit);
+ }
+ #else
+ // 32-bit build of the pipelined CRC32C computation.
+ // in_out - running CRC, updated in place
+ // in1 - current buffer position, advanced past the processed bytes
+ // in2 - remaining length in bytes
+ // tmp1..tmp6, w_xtmp1..w_xtmp3 - scratch registers
+ // is_pclmulqdq_supported - selects PCLMULQDQ or the table-based multiplication
+ // The buffer is consumed in three passes with decreasing partition sizes
+ // (CRC32C_HIGH, CRC32C_MIDDLE, CRC32C_LOW), then in 4-byte steps, then byte by byte.
+ void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
+ Register tmp1, Register tmp2, Register tmp3,
+ Register tmp4, Register tmp5, Register tmp6,
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ bool is_pclmulqdq_supported) {
+ uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
+ Label L_wordByWord;
+ Label L_byteByByteProlog;
+ Label L_byteByByte;
+ Label L_exit;
+
+ if (is_pclmulqdq_supported) {
+ // PCLMULQDQ path: the constants themselves are the first six 32-bit words
+ // of the table, stored pairwise swapped relative to this array.
+ const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
+ const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
+
+ const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
+ const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
+
+ const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
+ const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
+ // Same guard as in the 64-bit version of this function.
+ assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
+ } else {
+ // Table path: these are lookup-table indices for crc32c_ipl_alg4, not the constants.
+ const_or_pre_comp_const_index[0] = 1;
+ const_or_pre_comp_const_index[1] = 0;
+
+ const_or_pre_comp_const_index[2] = 3;
+ const_or_pre_comp_const_index[3] = 2;
+
+ const_or_pre_comp_const_index[4] = 5;
+ const_or_pre_comp_const_index[5] = 4;
+ }
+ crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ // tmp1 = in1 + (in2 - (in2 & 7)): the address where the 4-byte steps stop.
+ movl(tmp1, in2);
+ andl(tmp1, 0x00000007);
+ negl(tmp1);
+ addl(tmp1, in2);
+ addl(tmp1, in1);
+
+ BIND(L_wordByWord);
+ cmpl(in1, tmp1);
+ // in1 and tmp1 are addresses: compare them unsigned. A signed test skips the
+ // whole loop when the remaining data crosses 0x80000000.
+ jcc(Assembler::aboveEqual, L_byteByByteProlog);
+ crc32(in_out, Address(in1,0), 4);
+ addl(in1, 4);
+ jmp(L_wordByWord);
+
+ BIND(L_byteByByteProlog);
+ // in2 = number of trailing bytes (0..7); tmp2 = 1-based byte counter.
+ andl(in2, 0x00000007);
+ movl(tmp2, 1);
+
+ BIND(L_byteByByte);
+ cmpl(tmp2, in2);
+ jccb(Assembler::greater, L_exit);
+ // The byte is loaded into a register first and then fed to CRC32.
+ movb(tmp1, Address(in1, 0));
+ crc32(in_out, tmp1, 1);
+ incl(in1);
+ incl(tmp2);
+ jmp(L_byteByByte);
+
+ BIND(L_exit);
+ }
+ #endif // LP64
#undef BIND
#undef BLOCK_COMMENT
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
< prev index next >