--- old/src/cpu/x86/vm/assembler_x86.cpp	2015-09-15 16:47:02.941623100 -0700
+++ new/src/cpu/x86/vm/assembler_x86.cpp	2015-09-15 16:47:02.682623100 -0700
@@ -1605,6 +1605,85 @@
   emit_int8((unsigned char)0xA2);
 }
 
+// Opcode / Instruction                    Op/En  64-Bit Mode  Compat/Leg Mode  Description                 Implemented
+// F2 0F 38 F0 /r        CRC32 r32, r/m8   RM     Valid        Valid            Accumulate CRC32 on r/m8.   v
+// F2 REX 0F 38 F0 /r    CRC32 r32, r/m8*  RM     Valid        N.E.             Accumulate CRC32 on r/m8.   -
+// F2 REX.W 0F 38 F0 /r  CRC32 r64, r/m8   RM     Valid        N.E.             Accumulate CRC32 on r/m8.   -
+//
+// F2 0F 38 F1 /r        CRC32 r32, r/m16  RM     Valid        Valid            Accumulate CRC32 on r/m16.  v
+//
+// F2 0F 38 F1 /r        CRC32 r32, r/m32  RM     Valid        Valid            Accumulate CRC32 on r/m32.  v
+//
+// F2 REX.W 0F 38 F1 /r  CRC32 r64, r/m64  RM     Valid        N.E.             Accumulate CRC32 on r/m64.  v
+void Assembler::crc32(Register crc, Register v, int8_t sizeInBytes) {
+  assert(VM_Version::supports_sse4_2(), "");
+  int8_t w = 0x01;
+  Prefix p = Prefix_EMPTY;
+
+  emit_int8((int8_t)0xF2);
+  switch (sizeInBytes) {
+  case 1:
+    w = 0;
+    break;
+  case 2:
+  case 4:
+    break;
+  LP64_ONLY(case 8:)
+    // This instruction is not valid in 32-bit mode
+    // Note:
+    // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+    //
+    // Page B-72, Vol. 2C gives the encoding
+    //   qwreg2 to qwreg  1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : 11 qwreg1 qwreg2
+    //   mem64 to qwreg   1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : mod qwreg r/m
+    // i.e. an F0 opcode byte, while page 3-208, Vol. 2A says
+    //   F2 REX.W 0F 38 F1 /r  CRC32 r64, r/m64  RM  Valid  N.E.  Accumulate CRC32 on r/m64.
+    // i.e. F1. The 0 in the last bit is reserved for a different flavor of this instruction:
+    //   F2 REX.W 0F 38 F0 /r  CRC32 r64, r/m8   RM  Valid  N.E.  Accumulate CRC32 on r/m8.
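+    //
+    // For reference only (not part of this patch): C/C++ compilers expose these
+    // same encodings through the SSE4.2 intrinsics in <nmmintrin.h>, e.g.
+    //   uint32_t c8  = _mm_crc32_u8(crc, b);   // F2 0F 38 F0 /r
+    //   uint32_t c32 = _mm_crc32_u32(crc, w);  // F2 0F 38 F1 /r
+    //   uint64_t c64 = _mm_crc32_u64(crc, q);  // F2 REX.W 0F 38 F1 /r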
+    p = REX_W;
+    break;
+  default:
+    assert(0, "Unsupported value for a sizeInBytes argument");
+    break;
+  }
+  LP64_ONLY(prefix(crc, v, p);)
+  emit_int8((int8_t)0x0F);
+  emit_int8(0x38);
+  emit_int8((int8_t)(0xF0 | w));
+  emit_int8(0xC0 | ((crc->encoding() & 0x7) << 3) | (v->encoding() & 7));
+}
+
+void Assembler::crc32(Register crc, Address adr, int8_t sizeInBytes) {
+  assert(VM_Version::supports_sse4_2(), "");
+  InstructionMark im(this);
+  int8_t w = 0x01;
+  Prefix p = Prefix_EMPTY;
+
+  emit_int8((int8_t)0xF2);
+  switch (sizeInBytes) {
+  case 1:
+    w = 0;
+    break;
+  case 2:
+  case 4:
+    break;
+  LP64_ONLY(case 8:)
+    // This instruction is not valid in 32-bit mode
+    p = REX_W;
+    break;
+  default:
+    assert(0, "Unsupported value for a sizeInBytes argument");
+    break;
+  }
+  LP64_ONLY(prefix(crc, adr, p);)
+  emit_int8((int8_t)0x0F);
+  emit_int8(0x38);
+  emit_int8((int8_t)(0xF0 | w));
+  emit_operand(crc, adr);
+}
+
 void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3);
@@ -6011,6 +6090,14 @@
   emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
 }
 
+// 0F A4 /r ib
+void Assembler::shldl(Register dst, Register src, int8_t imm8) {
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xA4);
+  emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
+  emit_int8(imm8);
+}
+
 void Assembler::shrdl(Register dst, Register src) {
   emit_int8(0x0F);
   emit_int8((unsigned char)0xAD);
@@ -6196,6 +6283,40 @@
   }
 }
 
+void Assembler::prefix(Register dst, Register src, Prefix p) {
+  if (src->encoding() >= 8) {
+    p = (Prefix)(p | REX_B);
+  }
+  if (dst->encoding() >= 8) {
+    p = (Prefix)(p | REX_R);
+  }
+  if (p != Prefix_EMPTY) {
+    // do not generate an empty prefix
+    prefix(p);
+  }
+}
+
+void Assembler::prefix(Register dst, Address adr, Prefix p) {
+  if (adr.base_needs_rex()) {
+    if (adr.index_needs_rex()) {
+      assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X");
+    } else {
+      prefix(REX_B);
+    }
+  } else {
+    if (adr.index_needs_rex()) {
+      assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X");
+    }
+  }
+  if (dst->encoding() >= 8) {
+    p = (Prefix)(p | REX_R);
+  }
+  if (p != Prefix_EMPTY) {
+    // do not generate an empty prefix
+    prefix(p);
+  }
+}
+
 void Assembler::prefix(Address adr) {
   if (adr.base_needs_rex()) {
     if (adr.index_needs_rex()) {
--- old/src/cpu/x86/vm/assembler_x86.hpp	2015-09-15 16:47:04.736623100 -0700
+++ new/src/cpu/x86/vm/assembler_x86.hpp	2015-09-15 16:47:04.478623100 -0700
@@ -504,7 +504,8 @@
     VEX_3bytes = 0xC4,
     VEX_2bytes = 0xC5,
-    EVEX_4bytes = 0x62
+    EVEX_4bytes = 0x62,
+    Prefix_EMPTY = 0x0
   };
 
   enum VexPrefix {
@@ -608,6 +609,8 @@
   int prefixq_and_encode(int dst_enc, int src_enc);
 
   void prefix(Register reg);
+  void prefix(Register dst, Register src, Prefix p);
+  void prefix(Register dst, Address adr, Prefix p);
   void prefix(Address adr);
   void prefixq(Address adr);
 
@@ -1165,6 +1168,10 @@
   // Identify processor type and features
   void cpuid();
 
+  // CRC32C
+  void crc32(Register crc, Register v, int8_t sizeInBytes);
+  void crc32(Register crc, Address adr, int8_t sizeInBytes);
+
   // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
   void cvtsd2ss(XMMRegister dst, XMMRegister src);
   void cvtsd2ss(XMMRegister dst, Address src);
@@ -1764,6 +1771,7 @@
   void setb(Condition cc, Register dst);
 
   void shldl(Register dst, Register src);
+  void shldl(Register dst, Register src, int8_t imm8);
   void shll(Register dst, int imm8);
   void shll(Register dst);
--- old/src/cpu/x86/vm/assembler_x86.inline.hpp	2015-09-15 16:47:06.303623100 -0700
+++ new/src/cpu/x86/vm/assembler_x86.inline.hpp	2015-09-15 16:47:06.042623100 -0700
@@ -37,6 +37,8 @@
 inline int Assembler::prefixq_and_encode(int dst_enc, int src_enc) { return dst_enc << 3 | src_enc; }
 
 inline void Assembler::prefix(Register reg) {}
+inline void Assembler::prefix(Register dst, Register src, Prefix p) {}
+inline void Assembler::prefix(Register dst, Address adr, Prefix p) {}
 inline void Assembler::prefix(Address adr) {}
 inline void Assembler::prefixq(Address adr) {}
 
--- old/src/cpu/x86/vm/interpreterGenerator_x86.hpp	2015-09-15 16:47:07.847623100 -0700
+++ new/src/cpu/x86/vm/interpreterGenerator_x86.hpp	2015-09-15 16:47:07.588123100 -0700
@@ -42,6 +42,7 @@
   address generate_Reference_get_entry();
   address generate_CRC32_update_entry();
   address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+  address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind);
 #ifndef _LP64
   address generate_Float_intBitsToFloat_entry();
   address generate_Float_floatToRawIntBits_entry();
--- old/src/cpu/x86/vm/macroAssembler_x86.cpp	2015-09-15 16:47:09.425623100 -0700
+++ new/src/cpu/x86/vm/macroAssembler_x86.cpp	2015-09-15 16:47:09.169123100 -0700
@@ -45,6 +45,7 @@
 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
 #include "gc/g1/heapRegion.hpp"
 #endif // INCLUDE_ALL_GCS
+#include "crc32c.h"
 
 #ifdef PRODUCT
 #define BLOCK_COMMENT(str) /* nothing */
@@ -8518,6 +8519,471 @@
   notl(crc); // ~c
 }
 
+#ifdef _LP64
+// S. Gueron / Information Processing Letters 112 (2012) 184
+// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
+// Input: A 32-bit value B = [byte3, byte2, byte1, byte0].
+// Output: the 64-bit carry-less product of B * CONST
+void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
+                                     Register tmp1, Register tmp2, Register tmp3) {
+  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
+  if (n > 0) {
+    addq(tmp3, n * 256 * 8);
+  }
+  // Q1 = TABLEExt[n][B & 0xFF];
+  movl(tmp1, in);
+  andl(tmp1, 0x000000FF);
+  shll(tmp1, 3);
+  addq(tmp1, tmp3);
+  movq(tmp1, Address(tmp1, 0));
+
+  // Q2 = TABLEExt[n][B >> 8 & 0xFF];
+  movl(tmp2, in);
+  shrl(tmp2, 8);
+  andl(tmp2, 0x000000FF);
+  shll(tmp2, 3);
+  addq(tmp2, tmp3);
+  movq(tmp2, Address(tmp2, 0));
+
+  shlq(tmp2, 8);
+  xorq(tmp1, tmp2);
+
+  // Q3 = TABLEExt[n][B >> 16 & 0xFF];
+  movl(tmp2, in);
+  shrl(tmp2, 16);
+  andl(tmp2, 0x000000FF);
+  shll(tmp2, 3);
+  addq(tmp2, tmp3);
+  movq(tmp2, Address(tmp2, 0));
+
+  shlq(tmp2, 16);
+  xorq(tmp1, tmp2);
+
+  // Q4 = TABLEExt[n][B >> 24 & 0xFF];
+  shrl(in, 24);
+  andl(in, 0x000000FF);
+  shll(in, 3);
+  addq(in, tmp3);
+  movq(in, Address(in, 0));
+
+  shlq(in, 24);
+  xorq(in, tmp1);
+  // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
+}
+
+void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
+                                      Register in_out,
+                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
+                                      XMMRegister w_xtmp2,
+                                      Register tmp1,
+                                      Register n_tmp2, Register n_tmp3) {
+  if (is_pclmulqdq_supported) {
+    movdl(w_xtmp1, in_out); // modified blindly
+
+    movl(tmp1, const_or_pre_comp_const_index);
+    movdl(w_xtmp2, tmp1);
+    pclmulqdq(w_xtmp1, w_xtmp2, 0);
+
+    movdq(in_out, w_xtmp1);
+  } else {
+    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
+  }
+}
+
+// Recombination Alternative 2: No bit-reflections
+// T1 = (CRC_A * U1) << 1
+// T2 = (CRC_B * U2) << 1
+// C1 = T1 >> 32
+// C2 = T2 >> 32
+// T1 = T1 & 0xFFFFFFFF
+// T2 = T2 & 0xFFFFFFFF
+// T1 = CRC32(0, T1)
+// T2 = CRC32(0, T2)
+// C1 = C1 ^ T1
+// C2 = C2 ^ T2
+// CRC = C1 ^ C2 ^ CRC_C
+void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
+                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                     Register tmp1, Register tmp2,
+                                     Register n_tmp3) {
+  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
+  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
+  shlq(in_out, 1);
+  movl(tmp1, in_out);
+  shrq(in_out, 32);
+  xorl(tmp2, tmp2);
+  crc32(tmp2, tmp1, 4);
+  xorl(in_out, tmp2); // we don't care about the upper 32-bit contents here
+  shlq(in1, 1);
+  movl(tmp1, in1);
+  shrq(in1, 32);
+  xorl(tmp2, tmp2);
+  crc32(tmp2, tmp1, 4);
+  xorl(in1, tmp2);
+  xorl(in_out, in1);
+  xorl(in_out, in2);
+}
+
+// Set N to a predefined value
+// Subtract it from the length of the buffer
+// Execute in a loop:
+// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
+// for i = 1 to N do
+//   CRC_A = CRC32(CRC_A, A[i])
+//   CRC_B = CRC32(CRC_B, B[i])
+//   CRC_C = CRC32(CRC_C, C[i])
+// end for
+// Recombine
+void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
+                                       Register in_out1, Register in_out2, Register in_out3,
+                                       Register tmp1, Register tmp2, Register tmp3,
+                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                       Register tmp4, Register tmp5,
+                                       Register n_tmp6) {
+  Label L_processPartitions;
+  Label L_processPartition;
+  Label L_exit;
+
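+  // For reference only (not part of this patch): the loop below is the assembler
+  // counterpart of roughly the following C++ (in_out1 = len, in_out2 = buf,
+  // in_out3 = crc), assuming the SSE4.2 intrinsic _mm_crc32_u64 from
+  // <nmmintrin.h>; recombine() is a stand-in for crc32c_rec_alt2 above:
+  //
+  //   while (len >= 3 * size) {                                // L_processPartitions
+  //     uint32_t crcA = crc, crcB = 0, crcC = 0;
+  //     for (const uint8_t* p = buf; p < buf + size; p += 8) { // L_processPartition
+  //       crcA = (uint32_t)_mm_crc32_u64(crcA, *(const uint64_t*)p);
+  //       crcB = (uint32_t)_mm_crc32_u64(crcB, *(const uint64_t*)(p + size));
+  //       crcC = (uint32_t)_mm_crc32_u64(crcC, *(const uint64_t*)(p + 2 * size));
+  //     }
+  //     crc = recombine(crcA, crcB, crcC);                     // crc32c_rec_alt2
+  //     buf += 3 * size; len -= 3 * size;
+  //   }
+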
+  bind(L_processPartitions);
+  cmpl(in_out1, 3 * size);
+  jcc(Assembler::less, L_exit);
+  xorl(tmp1, tmp1);
+  xorl(tmp2, tmp2);
+  movq(tmp3, in_out2);
+  addq(tmp3, size);
+
+  bind(L_processPartition);
+  crc32(in_out3, Address(in_out2, 0), 8);
+  crc32(tmp1, Address(in_out2, size), 8);
+  crc32(tmp2, Address(in_out2, size * 2), 8);
+  addq(in_out2, 8);
+  cmpq(in_out2, tmp3);
+  jcc(Assembler::less, L_processPartition);
+  crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
+                  w_xtmp1, w_xtmp2, w_xtmp3,
+                  tmp4, tmp5,
+                  n_tmp6);
+  addq(in_out2, 2 * size);
+  subl(in_out1, 3 * size);
+  jmp(L_processPartitions);
+
+  bind(L_exit);
+}
+#else
+void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
+                                     Register tmp1, Register tmp2, Register tmp3,
+                                     XMMRegister xtmp1, XMMRegister xtmp2) {
+  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
+  if (n > 0) {
+    addl(tmp3, n * 256 * 8);
+  }
+  // Q1 = TABLEExt[n][B & 0xFF];
+  movl(tmp1, in_out);
+  andl(tmp1, 0x000000FF);
+  shll(tmp1, 3);
+  addl(tmp1, tmp3);
+  movq(xtmp1, Address(tmp1, 0));
+
+  // Q2 = TABLEExt[n][B >> 8 & 0xFF];
+  movl(tmp2, in_out);
+  shrl(tmp2, 8);
+  andl(tmp2, 0x000000FF);
+  shll(tmp2, 3);
+  addl(tmp2, tmp3);
+  movq(xtmp2, Address(tmp2, 0));
+
+  psllq(xtmp2, 8);
+  pxor(xtmp1, xtmp2);
+
+  // Q3 = TABLEExt[n][B >> 16 & 0xFF];
+  movl(tmp2, in_out);
+  shrl(tmp2, 16);
+  andl(tmp2, 0x000000FF);
+  shll(tmp2, 3);
+  addl(tmp2, tmp3);
+  movq(xtmp2, Address(tmp2, 0));
+
+  psllq(xtmp2, 16);
+  pxor(xtmp1, xtmp2);
+
+  // Q4 = TABLEExt[n][B >> 24 & 0xFF];
+  shrl(in_out, 24);
+  andl(in_out, 0x000000FF);
+  shll(in_out, 3);
+  addl(in_out, tmp3);
+  movq(xtmp2, Address(in_out, 0));
+
+  psllq(xtmp2, 24);
+  pxor(xtmp1, xtmp2); // result is in xtmp1
+  // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
+}
+
+void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
+                                      Register in_out,
+                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
+                                      XMMRegister w_xtmp2,
+                                      Register tmp1,
+                                      Register n_tmp2, Register n_tmp3) {
+  if (is_pclmulqdq_supported) {
+    movdl(w_xtmp1, in_out);
+
+    movl(tmp1, const_or_pre_comp_const_index);
+    movdl(w_xtmp2, tmp1);
+    pclmulqdq(w_xtmp1, w_xtmp2, 0);
+    // keep the result in XMM, since a GPR is only 32 bits wide here
+  } else {
+    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
+  }
+}
+
+void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
+                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                     Register tmp1, Register tmp2,
+                                     Register n_tmp3) {
+  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
+  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
+
+  psllq(w_xtmp1, 1);
+  movdl(tmp1, w_xtmp1);
+  psrlq(w_xtmp1, 32);
+  movdl(in_out, w_xtmp1);
+
+  xorl(tmp2, tmp2);
+  crc32(tmp2, tmp1, 4);
+  xorl(in_out, tmp2);
+
+  psllq(w_xtmp2, 1);
+  movdl(tmp1, w_xtmp2);
+  psrlq(w_xtmp2, 32);
+  movdl(in1, w_xtmp2);
+
+  xorl(tmp2, tmp2);
+  crc32(tmp2, tmp1, 4);
+  xorl(in1, tmp2);
+  xorl(in_out, in1);
+  xorl(in_out, in2);
+}
+
+void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
+                                       Register in_out1, Register in_out2, Register in_out3,
+                                       Register tmp1, Register tmp2, Register tmp3,
+                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                       Register tmp4, Register tmp5,
+                                       Register n_tmp6) {
+  Label L_processPartitions;
+  Label L_processPartition;
+  Label L_exit;
+
+  bind(L_processPartitions);
+  cmpl(in_out1, 3 * size);
+  jcc(Assembler::less, L_exit);
+  xorl(tmp1, tmp1);
+  xorl(tmp2, tmp2);
+  movl(tmp3, in_out2);
+  addl(tmp3, size);
+
+  bind(L_processPartition);
+  crc32(in_out3, Address(in_out2, 0), 4);
+  crc32(tmp1, Address(in_out2, size), 4);
+  crc32(tmp2, Address(in_out2, size * 2), 4);
+  crc32(in_out3, Address(in_out2, 0 + 4), 4);
+  crc32(tmp1, Address(in_out2, size + 4), 4);
+  crc32(tmp2, Address(in_out2, size * 2 + 4), 4);
+  addl(in_out2, 8);
+  cmpl(in_out2, tmp3);
+  jcc(Assembler::less, L_processPartition);
+
+  push(tmp3);
+  push(in_out1);
+  push(in_out2);
+  tmp4 = tmp3;
+  tmp5 = in_out1;
+  n_tmp6 = in_out2;
+
+  crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
+                  w_xtmp1, w_xtmp2, w_xtmp3,
+                  tmp4, tmp5,
+                  n_tmp6);
+
+  pop(in_out2);
+  pop(in_out1);
+  pop(tmp3);
+
+  addl(in_out2, 2 * size);
+  subl(in_out1, 3 * size);
+  jmp(L_processPartitions);
+
+  bind(L_exit);
+}
+#endif // LP64
+
+#ifdef _LP64
+// Algorithm 2: Pipelined usage of the CRC32 instruction.
+// Input: A buffer I of L bytes.
+// Output: the CRC32C value of the buffer.
+// Notations:
+// Write L = 24N + r, with N = floor(L/24).
+// r = L mod 24 (0 <= r < 24).
+// Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
+// N quadwords, and R consists of r bytes.
+// A[j] = I[8j+7 : 8j],             j = 0, 1, ..., N-1
+// B[j] = I[8N + 8j+7 : 8N + 8j],   j = 0, 1, ..., N-1
+// C[j] = I[16N + 8j+7 : 16N + 8j], j = 0, 1, ..., N-1
+// if r > 0, R[j] = I[24N + j],     j = 0, 1, ..., r-1
+void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
+                                          Register tmp1, Register tmp2, Register tmp3,
+                                          Register tmp4, Register tmp5, Register tmp6,
+                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                          bool is_pclmulqdq_supported) {
+  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
+  Label L_wordByWord;
+  Label L_byteByByteProlog;
+  Label L_byteByByte;
+  Label L_exit;
+
+  if (is_pclmulqdq_supported) {
+    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
+    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
+
+    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
+    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
+
+    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
+    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
+    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
+  } else {
+    const_or_pre_comp_const_index[0] = 1;
+    const_or_pre_comp_const_index[1] = 0;
+
+    const_or_pre_comp_const_index[2] = 3;
+    const_or_pre_comp_const_index[3] = 2;
+
+    const_or_pre_comp_const_index[4] = 5;
+    const_or_pre_comp_const_index[5] = 4;
+  }
+  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  movl(tmp1, in2);
+  andl(tmp1, 0x00000007);
+  negl(tmp1);
+  addl(tmp1, in2);
+  addq(tmp1, in1);
+
+  BIND(L_wordByWord);
+  cmpq(in1, tmp1);
+  jcc(Assembler::greaterEqual, L_byteByByteProlog);
+  crc32(in_out, Address(in1, 0), 4);
+  addq(in1, 4);
+  jmp(L_wordByWord);
+
+  BIND(L_byteByByteProlog);
+  andl(in2, 0x00000007);
+  movl(tmp2, 1);
+
+  BIND(L_byteByByte);
+  cmpl(tmp2, in2);
+  jccb(Assembler::greater, L_exit);
+  crc32(in_out, Address(in1, 0), 1);
+  incq(in1);
+  incl(tmp2);
+  jmp(L_byteByByte);
+
+  BIND(L_exit);
+}
+#else
+void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
+                                          Register tmp1, Register tmp2, Register tmp3,
+                                          Register tmp4, Register tmp5, Register tmp6,
+                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                          bool is_pclmulqdq_supported) {
+  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
+  Label L_wordByWord;
+  Label L_byteByByteProlog;
+  Label L_byteByByte;
+  Label L_exit;
+
+  if (is_pclmulqdq_supported) {
+    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
+    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
+
+    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
+    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
+
+    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
+    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
+  } else {
+    const_or_pre_comp_const_index[0] = 1;
+    const_or_pre_comp_const_index[1] = 0;
+
+    const_or_pre_comp_const_index[2] = 3;
+    const_or_pre_comp_const_index[3] = 2;
+
+    const_or_pre_comp_const_index[4] = 5;
+    const_or_pre_comp_const_index[5] = 4;
+  }
+  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  movl(tmp1, in2);
+  andl(tmp1, 0x00000007);
+  negl(tmp1);
+  addl(tmp1, in2);
+  addl(tmp1, in1);
+
+  BIND(L_wordByWord);
+  cmpl(in1, tmp1);
+  jcc(Assembler::greaterEqual, L_byteByByteProlog);
+  crc32(in_out, Address(in1, 0), 4);
+  addl(in1, 4);
+  jmp(L_wordByWord);
+
+  BIND(L_byteByByteProlog);
+  andl(in2, 0x00000007);
+  movl(tmp2, 1);
+
+  BIND(L_byteByByte);
+  cmpl(tmp2, in2);
+  jccb(Assembler::greater, L_exit);
+  movb(tmp1, Address(in1, 0));
+  crc32(in_out, tmp1, 1);
+  incl(in1);
+  incl(tmp2);
+  jmp(L_byteByByte);
+
+  BIND(L_exit);
+}
+#endif // LP64
 
 #undef BIND
 #undef BLOCK_COMMENT
--- old/src/cpu/x86/vm/macroAssembler_x86.hpp	2015-09-15 16:47:11.272623100 -0700
+++ new/src/cpu/x86/vm/macroAssembler_x86.hpp	2015-09-15 16:47:11.018623100 -0700
@@ -1275,9 +1275,42 @@
       Register raxReg);
 #endif
 
-  // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
+  // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
   void update_byte_crc32(Register crc, Register val, Register table);
   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
+  // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
+  // Note on the naming convention:
+  // Prefix w = register only used on a Westmere+ architecture
+  // Prefix n = register only used on a Nehalem architecture
+#ifdef _LP64
+  void crc32c_ipl_alg4(Register in_out, uint32_t n,
+                       Register tmp1, Register tmp2, Register tmp3);
+#else
+  void crc32c_ipl_alg4(Register in_out, uint32_t n,
+                       Register tmp1, Register tmp2, Register tmp3,
+                       XMMRegister xtmp1, XMMRegister xtmp2);
+#endif
+  void crc32c_pclmulqdq(XMMRegister w_xtmp1,
+                        Register in_out,
+                        uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
+                        XMMRegister w_xtmp2,
+                        Register tmp1,
+                        Register n_tmp2, Register n_tmp3);
+  void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
+                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                       Register tmp1, Register tmp2,
+                       Register n_tmp3);
+  void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
+                         Register in_out1, Register in_out2, Register in_out3,
+                         Register tmp1, Register tmp2, Register tmp3,
+                         XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                         Register tmp4, Register tmp5,
+                         Register n_tmp6);
+  void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
+                            Register tmp1, Register tmp2, Register tmp3,
+                            Register tmp4, Register tmp5, Register tmp6,
+                            XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                            bool is_pclmulqdq_supported);
   // Fold 128-bit data chunk
   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
--- old/src/cpu/x86/vm/stubGenerator_x86_32.cpp	2015-09-15 16:47:12.859623100 -0700
+++ new/src/cpu/x86/vm/stubGenerator_x86_32.cpp	2015-09-15 16:47:12.603623100 -0700
@@ -2943,6 +2943,63 @@
     return start;
   }
 
+  /**
+   * Arguments:
+   *
+   * Inputs:
+   *   rsp(4)   - int crc
+   *   rsp(8)   - byte* buf
+   *   rsp(12)  - int length
+   *   rsp(16)  - table_start - optional (present only when doing a library_call,
+   *              not used by the x86 algorithm)
+   *
+   * Output:
+   *   rax   - int crc result
+   */
+  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
+    assert(UseCRC32CIntrinsics, "need SSE4_2");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
+    address start = __ pc();
+    const Register crc = rax;  // crc
+    const Register buf = rcx;  // source java byte array address
+    const Register len = rdx;  // length
+    const Register d = rbx;
+    const Register g = rsi;
+    const Register h = rdi;
+    const Register empty = 0;  // never used; kept only so that crc32c_ipl_alg2_alt2
+                               // has the same signature on 32- and 64-bit
+    assert_different_registers(crc, buf, len, d, g, h);
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    Address crc_arg(rsp, 4 + 4 + 0); // ESP + 4 on entry;
+    // we need the additional 4 because __ enter()
+    // has just pushed ebp onto the stack
+    Address buf_arg(rsp, 4 + 4 + 4);
+    Address len_arg(rsp, 4 + 4 + 8);
+    // Load up:
+    __ movl(crc, crc_arg);
+    __ movl(buf, buf_arg);
+    __ movl(len, len_arg);
+    __ push(d);
+    __ push(g);
+    __ push(h);
+    __ crc32c_ipl_alg2_alt2(crc, buf, len,
+                            d, g, h,
+                            empty, empty, empty,
+                            xmm0, xmm1, xmm2,
+                            is_pclmulqdq_supported);
+    __ pop(h);
+    __ pop(g);
+    __ pop(d);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
   // Safefetch stubs.
   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
@@ -3156,6 +3213,13 @@
     StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
     StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
   }
+
+  if (UseCRC32CIntrinsics) {
+    bool supports_clmul = VM_Version::supports_clmul();
+    StubRoutines::x86::generate_CRC32C_table(supports_clmul);
+    StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
+    StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
+  }
 }
--- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp	2015-09-15 16:47:14.516623100 -0700
+++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp	2015-09-15 16:47:14.258623100 -0700
@@ -3895,6 +3895,64 @@
     return start;
   }
 
+  /**
+   * Arguments:
+   *
+   * Inputs:
+   *   c_rarg0   - int crc
+   *   c_rarg1   - byte* buf
+   *   c_rarg2   - long length
+   *   c_rarg3   - table_start - optional (present only when doing a library_call,
+   *              not used by the x86 algorithm)
+   *
+   * Output:
+   *   rax   - int crc result
+   */
+  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
+    assert(UseCRC32CIntrinsics, "need SSE4_2");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
+    address start = __ pc();
+    // reg.arg          int#0        int#1        int#2        int#3        int#4        int#5        float regs
+    // Windows          RCX          RDX          R8           R9           none         none         XMM0..XMM3
+    // Linux/Solaris    RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
+    const Register crc = c_rarg0;  // crc
+    const Register buf = c_rarg1;  // source java byte array address
+    const Register len = c_rarg2;  // length
+    const Register a = rax;
+    const Register j = r9;
+    const Register k = r10;
+    const Register l = r11;
+#ifdef _WIN64
+    const Register y = rdi;
+    const Register z = rsi;
+#else
+    const Register y = rcx;
+    const Register z = r8;
+#endif
+    assert_different_registers(crc, buf, len, a, j, k, l, y, z);
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+#ifdef _WIN64
+    __ push(y);
+    __ push(z);
+#endif
+    __ crc32c_ipl_alg2_alt2(crc, buf, len,
+                            a, j, k,
+                            l, y, z,
+                            c_farg0, c_farg1, c_farg2,
+                            is_pclmulqdq_supported);
+    __ movl(rax, crc);
+#ifdef _WIN64
+    __ pop(z);
+    __ pop(y);
+#endif
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
 
   /**
    * Arguments:
@@ -4239,6 +4297,13 @@
     StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
     StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
   }
+
+  if (UseCRC32CIntrinsics) {
+    bool supports_clmul = VM_Version::supports_clmul();
+    StubRoutines::x86::generate_CRC32C_table(supports_clmul);
+    StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
+    StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
+  }
 }

 void generate_all() {
--- old/src/cpu/x86/vm/stubRoutines_x86.cpp	2015-09-15 16:47:16.187623100 -0700
+++ new/src/cpu/x86/vm/stubRoutines_x86.cpp	2015-09-15 16:47:15.932123100 -0700
@@ -27,6 +27,7 @@
 #include "runtime/frame.inline.hpp"
"runtime/frame.inline.hpp" #include "runtime/stubRoutines.hpp" #include "runtime/thread.inline.hpp" +#include "crc32c.h" // Implementation of the platform-specific part of StubRoutines - for // a description of how to extend it, see the stubRoutines.hpp file. @@ -130,3 +131,107 @@ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, 0x2d02ef8dUL }; + +#define D 32 +#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41) + +#define TILL_CYCLE 31 +uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0] + +// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8 +// Listing 1: Multiplication of normalized polynomials +// "a" and "b" occupy D least significant bits. +uint32_t crc32c_multiply(uint32_t a, uint32_t b) { + uint32_t product = 0; + uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P + b_pow_x_table[0] = b; + for (int k = 0; k < D; ++k) { + // If "a" has non-zero coefficient at x**k,/ add ((b * x**k) mod P) to the result. + if ((a & (uint64_t)(1 << (D - 1 - k))) != 0) product ^= b_pow_x_table[k]; + + // Compute b_pow_x_table[k+1] = (b ** x**(k+1)) mod P. + if (b_pow_x_table[k] & 1) { + // If degree of (b_pow_x_table[k] * x) is D, then + // degree of (b_pow_x_table[k] * x - P) is less than D. + b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P; + } + else { + b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1; + } + } + return product; +} +#undef D +#undef P + +// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9 +void crc32c_init_pow_2k(void) { + // _crc32c_pow_2k_table(0) = + // x^(2^k) mod P(x) = x mod P(x) = x + // Since we are operating on a reflected values + // x = 10b, reflect(x) = 0x40000000 + _crc32c_pow_2k_table[0] = 0x40000000; + + for (int k = 1; k < TILL_CYCLE; k++) { + // _crc32c_pow_2k_table(k+1) = _crc32c_pow_2k_table(k-1)^2 mod P(x) + uint32_t tmp = _crc32c_pow_2k_table[k - 1]; + _crc32c_pow_2k_table[k] = crc32c_multiply(tmp, tmp); + } +} + +// x^N mod P(x) +uint32_t crc32c_f_pow_n(uint32_t n) { + // result = 1 (polynomial) + uint32_t one, result = 0x80000000, i = 0; + + while (one = (n & 1), (n == 1 || n - one > 0)) { + if (one) { + result = crc32c_multiply(result, _crc32c_pow_2k_table[i]); + } + n >>= 1; + i++; + } + + return result; +} + +juint *StubRoutines::x86::_crc32c_table; + +void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) { + + static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; + + crc32c_init_pow_2k(); + + pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8); // 8N * 8 = 64N + pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2); // 128N + + pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8); + pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2); + + pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8); + pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] = + crc32c_f_pow_n(CRC32C_LOW * 8 * 2); + + if (is_pclmulqdq_table_supported) { + _crc32c_table = pow_n; + } else { + static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256]; + + for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) { + static juint X_CONST = pow_n[j]; + for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations + // S. Gueron / Information Processing Letters 112 (2012) 184 + // Algorithm 3: Generating a carry-less multiplication lookup table. + // Input: A 32-bit constant, X_CONST. 
+        // Output: A table of 256 entries, each of which is a 64-bit quadword,
+        // that can be used for computing "byte" * X_CONST, for a given byte.
+        pclmulqdq_table[j * 256 + i] =
+          ((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^
+          ((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^
+          ((i & 64) * X_CONST) ^ ((i & 128) * X_CONST);
+      }
+    }
+    _crc32c_table = (juint*)pclmulqdq_table;
+  }
+}
--- old/src/cpu/x86/vm/stubRoutines_x86.hpp	2015-09-15 16:47:17.708623100 -0700
+++ new/src/cpu/x86/vm/stubRoutines_x86.hpp	2015-09-15 16:47:17.450623100 -0700
@@ -36,6 +36,8 @@
   // masks and table for CRC32
   static uint64_t _crc_by128_masks[];
   static juint    _crc_table[];
+  // table for CRC32C
+  static juint*   _crc32c_table;
   // swap mask for ghash
   static address _ghash_long_swap_mask_addr;
   static address _ghash_byte_swap_mask_addr;
@@ -46,5 +48,6 @@
   static address crc_by128_masks_addr()       { return (address)_crc_by128_masks; }
   static address ghash_long_swap_mask_addr()  { return _ghash_long_swap_mask_addr; }
   static address ghash_byte_swap_mask_addr()  { return _ghash_byte_swap_mask_addr; }
+  static void generate_CRC32C_table(bool is_pclmulqdq_supported);
 
 #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
--- old/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	2015-09-15 16:47:19.219123100 -0700
+++ new/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	2015-09-15 16:47:18.965123100 -0700
@@ -790,18 +790,25 @@
   const Register buf = rdx;  // source java byte array address
   const Register len = rdi;  // length
 
+  // value              x86_32
+  // interp. arg ptr    ESP + 4
+  // int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len)
+  //                                         3           2      1        0
+  // int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
+  //                                              4       2,3        1        0
+
   // Arguments are reversed on java expression stack
-  __ movl(len,   Address(rsp,   wordSize)); // Length
+  __ movl(len, Address(rsp, 4 + 0)); // Length
   // Calculate address of start element
   if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) {
-    __ movptr(buf, Address(rsp, 3*wordSize)); // long buf
-    __ addptr(buf, Address(rsp, 2*wordSize)); // + offset
-    __ movl(crc,   Address(rsp, 5*wordSize)); // Initial CRC
+    __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long buf
+    __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
+    __ movl(crc, Address(rsp, 4 + 4 * wordSize));   // Initial CRC
   } else {
-    __ movptr(buf, Address(rsp, 3*wordSize)); // byte[] array
+    __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array
     __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
-    __ addptr(buf, Address(rsp, 2*wordSize)); // + offset
-    __ movl(crc,   Address(rsp, 4*wordSize)); // Initial CRC
+    __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
+    __ movl(crc, Address(rsp, 4 + 3 * wordSize));   // Initial CRC
   }
   __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32()), crc, buf, len);
@@ -819,6 +826,53 @@
     return entry;
   }
+  return generate_native_entry(false);
+}
+
+/**
+* Method entry for static native methods:
+*   int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
+*   int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+*/
+address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
+  if (UseCRC32CIntrinsics) {
+    address entry = __ pc();
+    // Load parameters
+    const Register crc = rax;  // crc
+    const Register buf = rcx;  // source java byte array address
+    const Register len = rdx;  // length
+    const Register end = len;
+
+    // value              x86_32
+    // interp. arg ptr    ESP + 4
+    // int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
+    //                                          3           2      1        0
+    // int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+    //                                                     4         2,3          1        0
+
+    // Arguments are reversed on java expression stack
+    __ movl(end, Address(rsp, 4 + 0)); // end
+    __ subl(len, Address(rsp, 4 + 1 * wordSize)); // end - offset == length
+    // Calculate address of start element
+    if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) {
+      __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long address
+      __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
+      __ movl(crc, Address(rsp, 4 + 4 * wordSize));   // Initial CRC
+    } else {
+      __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array
+      __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+      __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
+      __ movl(crc, Address(rsp, 4 + 3 * wordSize));   // Initial CRC
+    }
+    __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len);
+    // result in rax
+    // _areturn
+    __ pop(rdi);      // get return address
+    __ mov(rsp, rsi); // set sp to sender sp
+    __ jmp(rdi);
+
+    return entry;
+  }
 
   return generate_native_entry(false);
 }
--- old/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	2015-09-15 16:47:20.825623100 -0700
+++ new/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	2015-09-15 16:47:20.568123100 -0700
@@ -804,6 +804,57 @@
   return generate_native_entry(false);
 }
 
+/**
+* Method entry for static native methods:
+*   int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
+*   int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+*/
+address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
+  if (UseCRC32CIntrinsics) {
+    address entry = __ pc();
+    // Load parameters
+    const Register crc = c_rarg0;  // crc
+    const Register buf = c_rarg1;  // source java byte array address
+    const Register len = c_rarg2;
+    const Register off = c_rarg3;  // offset
+    const Register end = len;
+
+    // Arguments are reversed on java expression stack
+    // Calculate address of start element
+    if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) {
+      __ movptr(buf, Address(rsp, 3 * wordSize));   // long address
+      __ movl2ptr(off, Address(rsp, 2 * wordSize)); // offset
+      __ addq(buf, off);                            // + offset
+      __ movl(crc, Address(rsp, 5 * wordSize));     // Initial CRC
+      // Note on 5 * wordSize vs. 4 * wordSize:
+      //   int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+      //                                                       4         2,3          1        0
+      // end starts at SP + 8; the long address occupies two operand-stack slots, see
+      // The Java(R) Virtual Machine Specification, Java SE 7 Edition, 4.10.2.3,
+      // Values of Types long and double:
+      // "When calculating operand stack length, values of type long and double have length two."
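+      //
+      // For reference only (not part of this patch), assuming the return address
+      // sits at SP + 0 and each operand-stack slot is one 8-byte stack word,
+      // the resulting layout is:
+      //   SP + 8    end      (int,  1 slot)
+      //   SP + 16   off      (int,  1 slot)
+      //   SP + 24   address  (long, 2 slots: SP + 24 and SP + 32)
+      //   SP + 40   crc      (int,  1 slot)  == 5 * wordSize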
+    } else {
+      __ movptr(buf, Address(rsp, 3 * wordSize));   // byte[] array
+      __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+      __ movl2ptr(off, Address(rsp, 2 * wordSize)); // offset
+      __ addq(buf, off);                            // + offset
+      __ movl(crc, Address(rsp, 4 * wordSize));     // Initial CRC
+    }
+    __ movl(end, Address(rsp, wordSize)); // end
+    __ subl(end, off);                    // end - off
+    __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len);
+    // result in rax
+    // _areturn
+    __ pop(rdi);      // get return address
+    __ mov(rsp, r13); // set sp to sender sp
+    __ jmp(rdi);
+
+    return entry;
+  }
+
+  return generate_native_entry(false);
+}
+
 // Interpreter stub for calling a native method. (asm interpreter)
 // This sets up a somewhat different looking stack for calling the
 // native method than the typical interpreter frame setup.
--- old/src/cpu/x86/vm/vm_version_x86.cpp	2015-09-15 16:47:22.420123100 -0700
+++ new/src/cpu/x86/vm/vm_version_x86.cpp	2015-09-15 16:47:22.163623100 -0700
@@ -665,6 +665,18 @@
     FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
   }
 
+  if (supports_sse4_2()) {
+    if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
+      UseCRC32CIntrinsics = true;
+    }
+  } else if (UseCRC32CIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
+      warning("CRC32C intrinsics are not available on this CPU");
+    }
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
   // The AES intrinsic stubs require AES instruction support (of course)
   // but also require sse3 mode for instructions it use.
   if (UseAES && (UseSSE > 2)) {
@@ -708,12 +720,6 @@
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
 
-  if (UseCRC32CIntrinsics) {
-    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
-      warning("CRC32C intrinsics are not available on this CPU");
-    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
-  }
-
   if (UseAdler32Intrinsics) {
     warning("Adler32Intrinsics not available on this CPU.");
     FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
--- old/src/cpu/zero/vm/interpreterGenerator_zero.hpp	2015-09-15 16:47:23.960123100 -0700
+++ new/src/cpu/zero/vm/interpreterGenerator_zero.hpp	2015-09-15 16:47:23.705623100 -0700
@@ -42,4 +42,5 @@
   // Not supported
   address generate_CRC32_update_entry() { return NULL; }
   address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
+  address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
 #endif // CPU_ZERO_VM_INTERPRETERGENERATOR_ZERO_HPP
--- old/src/share/vm/interpreter/abstractInterpreter.hpp	2015-09-15 16:47:25.454623100 -0700
+++ new/src/share/vm/interpreter/abstractInterpreter.hpp	2015-09-15 16:47:25.198623100 -0700
@@ -90,6 +90,8 @@
     java_util_zip_CRC32_update,                  // implementation of java.util.zip.CRC32.update()
     java_util_zip_CRC32_updateBytes,             // implementation of java.util.zip.CRC32.updateBytes()
     java_util_zip_CRC32_updateByteBuffer,        // implementation of java.util.zip.CRC32.updateByteBuffer()
+    java_util_zip_CRC32C_updateBytes,            // implementation of java.util.zip.CRC32C.updateBytes(crc, b[], off, end)
+    java_util_zip_CRC32C_updateDirectByteBuffer, // implementation of java.util.zip.CRC32C.updateDirectByteBuffer(crc, address, off, end)
     java_lang_Float_intBitsToFloat,              // implementation of java.lang.Float.intBitsToFloat()
     java_lang_Float_floatToRawIntBits,           // implementation of java.lang.Float.floatToRawIntBits()
     java_lang_Double_longBitsToDouble,           // implementation of java.lang.Double.longBitsToDouble()
--- old/src/share/vm/interpreter/interpreter.cpp	2015-09-15 16:47:27.041623100 -0700
+++ new/src/share/vm/interpreter/interpreter.cpp	2015-09-15 16:47:26.786623100 -0700
@@ -234,6 +234,13 @@
       case vmIntrinsics::_updateByteBufferCRC32 : return java_util_zip_CRC32_updateByteBuffer;
     }
   }
+  if (UseCRC32CIntrinsics) {
+    // Use optimized stub code for CRC32C methods.
+    switch (m->intrinsic_id()) {
+      case vmIntrinsics::_updateBytesCRC32C            : return java_util_zip_CRC32C_updateBytes;
+      case vmIntrinsics::_updateDirectByteBufferCRC32C : return java_util_zip_CRC32C_updateDirectByteBuffer;
+    }
+  }
 
   switch(m->intrinsic_id()) {
   case vmIntrinsics::_intBitsToFloat:      return java_lang_Float_intBitsToFloat;
@@ -349,6 +356,8 @@
     case java_util_zip_CRC32_update           : tty->print("java_util_zip_CRC32_update"); break;
     case java_util_zip_CRC32_updateBytes      : tty->print("java_util_zip_CRC32_updateBytes"); break;
     case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break;
+    case java_util_zip_CRC32C_updateBytes            : tty->print("java_util_zip_CRC32C_updateBytes"); break;
+    case java_util_zip_CRC32C_updateDirectByteBuffer : tty->print("java_util_zip_CRC32C_updateDirectByteBuffer"); break;
     default:
       if (kind >= method_handle_invoke_FIRST &&
           kind <= method_handle_invoke_LAST) {
@@ -567,6 +576,10 @@
                                                                : // fall thru
   case Interpreter::java_util_zip_CRC32_updateByteBuffer      : entry_point = generate_CRC32_updateBytes_entry(kind); break;
+  case Interpreter::java_util_zip_CRC32C_updateBytes
+                                                               : // fall thru
+  case Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer
+                                                               : entry_point = generate_CRC32C_updateBytes_entry(kind); break;
 #if defined(TARGET_ARCH_x86) && !defined(_LP64)
   // On x86_32 platforms, a special entry is generated for the following four methods.
   // On other platforms the normal entry is used to enter these methods.
@@ -582,9 +595,9 @@
   case Interpreter::java_lang_Float_intBitsToFloat:
   case Interpreter::java_lang_Float_floatToRawIntBits:
   case Interpreter::java_lang_Double_longBitsToDouble:
-  case Interpreter::java_lang_Double_doubleToRawLongBits:
-    entry_point = generate_native_entry(false);
-    break;
+  case Interpreter::java_lang_Double_doubleToRawLongBits:
+    entry_point = generate_native_entry(false);
+    break;
 #endif // defined(TARGET_ARCH_x86) && !defined(_LP64)
 #endif // CC_INTERP
   default:
--- old/src/share/vm/interpreter/templateInterpreter.cpp	2015-09-15 16:47:28.583623100 -0700
+++ new/src/share/vm/interpreter/templateInterpreter.cpp	2015-09-15 16:47:28.325623100 -0700
@@ -418,6 +418,11 @@
     method_entry(java_util_zip_CRC32_updateByteBuffer)
   }
 
+  if (UseCRC32CIntrinsics) {
+    method_entry(java_util_zip_CRC32C_updateBytes)
+    method_entry(java_util_zip_CRC32C_updateDirectByteBuffer)
+  }
+
   method_entry(java_lang_Float_intBitsToFloat);
   method_entry(java_lang_Float_floatToRawIntBits);
   method_entry(java_lang_Double_longBitsToDouble);
--- old/src/share/vm/runtime/stubRoutines.cpp	2015-09-15 16:47:30.112623100 -0700
+++ new/src/share/vm/runtime/stubRoutines.cpp	2015-09-15 16:47:29.855123100 -0700
@@ -136,8 +136,9 @@
 address StubRoutines::_sha512_implCompressMB = NULL;
 
 address StubRoutines::_updateBytesCRC32 = NULL;
-address StubRoutines::_crc_table_adr = NULL;
+address StubRoutines::_crc_table_adr =     NULL;
 
+address StubRoutines::_crc32c_table_addr = NULL;
 address StubRoutines::_updateBytesCRC32C = NULL;
 address StubRoutines::_updateBytesAdler32 = NULL;
 
--- old/src/share/vm/runtime/stubRoutines.hpp	2015-09-15 16:47:31.592623100 -0700
+++ new/src/share/vm/runtime/stubRoutines.hpp	2015-09-15 16:47:31.335623100 -0700
@@ -197,6 +197,7 @@
 
   static address _updateBytesCRC32;
   static address _crc_table_adr;
+  static address _crc32c_table_addr;
   static address _updateBytesCRC32C;
   static address _updateBytesAdler32;
 
@@ -364,6 +365,7 @@
 
   static address updateBytesCRC32()    { return _updateBytesCRC32; }
   static address crc_table_addr()      { return _crc_table_adr; }
+  static address crc32c_table_addr()   { return _crc32c_table_addr; }
   static address updateBytesCRC32C()   { return _updateBytesCRC32C; }
   static address updateBytesAdler32()  { return _updateBytesAdler32; }
 
--- old/src/share/vm/runtime/vmStructs.cpp	2015-09-15 16:47:33.140123100 -0700
+++ new/src/share/vm/runtime/vmStructs.cpp	2015-09-15 16:47:32.877623100 -0700
@@ -832,6 +832,7 @@
      static_field(StubRoutines,  _ghash_processBlocks,  address)  \
      static_field(StubRoutines,  _updateBytesCRC32,     address)  \
      static_field(StubRoutines,  _crc_table_adr,        address)  \
+     static_field(StubRoutines,  _crc32c_table_addr,    address)  \
      static_field(StubRoutines,  _updateBytesCRC32C,    address)  \
      static_field(StubRoutines,  _multiplyToLen,        address)  \
      static_field(StubRoutines,  _squareToLen,          address)  \
--- /dev/null	2015-09-15 16:47:34.000000000 -0700
+++ new/src/cpu/x86/vm/crc32c.h	2015-09-15 16:47:34.481123100 -0700
@@ -0,0 +1,66 @@
+/*
+* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+*
+* This code is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License version 2 only, as
+* published by the Free Software Foundation.
+*
+* This code is distributed in the hope that it will be useful, but WITHOUT
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+* version 2 for more details (a copy is included in the LICENSE file that
+* accompanied this code).
+*
+* You should have received a copy of the GNU General Public License version
+* 2 along with this work; if not, write to the Free Software Foundation,
+* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+*
+* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+* or visit www.oracle.com if you need additional information or have any
+* questions.
+*
+*/
+
+enum {
+  // S. Gueron / Information Processing Letters 112 (2012) 184
+  // shows that anything above 6K and below 32K is a good choice;
+  // 32K does not deliver any further performance gains.
+  // 6K = 8 * 256 * 3 (we compute 3 blocks together)
+  //
+  // We select the smallest value here, so that it applies to the largest
+  // number of buffer sizes.
+  CRC32C_HIGH = 8 * 256,
+
+  // empirical
+  // based on ubench study using methodology described in
+  // V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction, April 2011, p. 8
+  //
+  // arbitrary value between 27 and 256
+  CRC32C_MIDDLE = 8 * 86,
+
+  // V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction, April 2011, p. 9
+  // shows that 240 and 1024 are choices as good as 216 == 8 * 27.
+  //
+  // We select the smallest value that still gives a significant performance
+  // improvement over the sequential version.
+  CRC32C_LOW = 8 * 27,
+
+  CRC32C_NUM_ChunkSizeInBytes = 3,
+
+  // We need to compute the powers x^(64N) and x^(128N) for each "chunk" size
+  CRC32C_NUM_PRECOMPUTED_CONSTANTS = ( 2 * CRC32C_NUM_ChunkSizeInBytes )
+};
+// Notes:
+// 1. Why do we need a "chunk" approach?
+//    The overhead of computing the powers (x^(64N), x^(128N)) for an arbitrary
+//    buffer of size N is significant; an implementation that did so would
+//    approach plain library-call performance.
+// 2. Why only 3 "chunks"?
+//    Performance experiments showed that HIGH+LOW alone did not deliver a
+//    stable speedup curve.
+//
+// Disclaimer:
+// If you ever decide to increase/decrease the number of "chunks", be sure to modify
+// a) the constants table generation (hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp)
+// b) the constant fetches from that table (macroAssembler_x86.cpp)
+// c) the unrolled for loop (macroAssembler_x86.cpp)
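
As a sanity check on the lookup-table construction in stubRoutines_x86.cpp (Gueron, Algorithm 3), the table formula can be compared against a bitwise carry-less multiply. This is a minimal standalone sketch, not part of the patch; X_CONST below is an arbitrary stand-in for pow_n[j]:

  #include <cstdint>
  #include <cstdio>

  // Bitwise carry-less multiplication of a byte by a 32-bit constant (64-bit result):
  // XOR together k shifted left by each set bit position of b.
  static uint64_t clmul_byte(uint8_t b, uint32_t k) {
    uint64_t r = 0;
    for (int bit = 0; bit < 8; ++bit) {
      if (b & (1u << bit)) r ^= (uint64_t)k << bit;
    }
    return r;
  }

  int main() {
    const uint32_t X_CONST = 0x12345678; // arbitrary stand-in for pow_n[j]
    uint64_t table[256];
    for (int64_t i = 0; i < 256; i++) { // same formula as generate_CRC32C_table
      table[i] =
        ((i & 1)  * (uint64_t)X_CONST) ^ ((i & 2)   * (uint64_t)X_CONST) ^
        ((i & 4)  * (uint64_t)X_CONST) ^ ((i & 8)   * (uint64_t)X_CONST) ^
        ((i & 16) * (uint64_t)X_CONST) ^ ((i & 32)  * (uint64_t)X_CONST) ^
        ((i & 64) * (uint64_t)X_CONST) ^ ((i & 128) * (uint64_t)X_CONST);
    }
    for (int i = 0; i < 256; i++) {
      if (table[i] != clmul_byte((uint8_t)i, X_CONST)) {
        printf("mismatch at byte %d\n", i);
        return 1;
      }
    }
    printf("all 256 entries match the bitwise carry-less product\n");
    return 0;
  }

The check works because each term (i & 2^b) * X_CONST is exactly X_CONST << b when bit b of i is set, so XOR-ing the terms yields the carry-less product of the byte i and X_CONST, which is what the generated table is meant to hold.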