--- old/src/cpu/x86/vm/assembler_x86.cpp 2015-08-26 13:22:13.057053800 -0700 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2015-08-26 13:22:12.799028000 -0700 @@ -1605,6 +1605,85 @@ emit_int8((unsigned char)0xA2); } +// Opcode / Instruction Op / En 64 - Bit Mode Compat / Leg Mode Description Implemented +// F2 0F 38 F0 / r CRC32 r32, r / m8 RM Valid Valid Accumulate CRC32 on r / m8. v +// F2 REX 0F 38 F0 / r CRC32 r32, r / m8* RM Valid N.E. Accumulate CRC32 on r / m8. - +// F2 REX.W 0F 38 F0 / r CRC32 r64, r / m8 RM Valid N.E. Accumulate CRC32 on r / m8. - +// +// F2 0F 38 F1 / r CRC32 r32, r / m16 RM Valid Valid Accumulate CRC32 on r / m16. v +// +// F2 0F 38 F1 / r CRC32 r32, r / m32 RM Valid Valid Accumulate CRC32 on r / m32. v +// +// F2 REX.W 0F 38 F1 / r CRC32 r64, r / m64 RM Valid N.E. Accumulate CRC32 on r / m64. v +void Assembler::crc32(Register crc, Register v, int8_t sizeInBytes) { + assert(VM_Version::supports_sse4_2(), ""); + int8_t w = 0x01; + Prefix p = Prefix_EMPTY; + + emit_int8((int8_t)0xF2); + switch (sizeInBytes) { + case 1: + w = 0; + break; + case 2: + case 4: + break; + LP64_ONLY(case 8:) + // This instruction is not valid in 32 bits + // Note: + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf + // + // Page B - 72 Vol. 2C says + // qwreg2 to qwreg 1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : 11 qwreg1 qwreg2 + // mem64 to qwreg 1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : mod qwreg r / m + // F0!!! + // while 3 - 208 Vol. 2A + // F2 REX.W 0F 38 F1 / r CRC32 r64, r / m64 RM Valid N.E.Accumulate CRC32 on r / m64. + // + // the 0 on a last bit is reserved for a different flavor of this instruction : + // F2 REX.W 0F 38 F0 / r CRC32 r64, r / m8 RM Valid N.E.Accumulate CRC32 on r / m8. 
+ p = REX_W; + break; + default: + assert(0, "Unsupported value for a sizeInBytes argument"); + break; + } + LP64_ONLY(prefix(crc, v, p);) + emit_int8((int8_t)0x0F); + emit_int8(0x38); + emit_int8((int8_t)(0xF0 | w)); + emit_int8(0xC0 | ((crc->encoding() & 0x7) << 3) | (v->encoding() & 7)); +} + +void Assembler::crc32(Register crc, Address adr, int8_t sizeInBytes) { + assert(VM_Version::supports_sse4_2(), ""); + InstructionMark im(this); + int8_t w = 0x01; + Prefix p = Prefix_EMPTY; + + emit_int8((int8_t)0xF2); + switch (sizeInBytes) { + case 1: + w = 0; + break; + case 2: + case 4: + break; + LP64_ONLY(case 8:) + // This instruction is not valid in 32 bits + p = REX_W; + break; + default: + assert(0, "Unsupported value for a sizeInBytes argument"); + break; + } + LP64_ONLY(prefix(crc, adr, p);) + emit_int8((int8_t)0x0F); + emit_int8(0x38); + emit_int8((int8_t)(0xF0 | w)); + emit_operand(crc, adr); +} + void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3); @@ -6004,6 +6083,14 @@ emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding())); } +// 0F A4 / r ib +void Assembler::shldl(Register dst, Register src, int8_t imm8) { + emit_int8(0x0F); + emit_int8((unsigned char)0xA4); + emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding())); + emit_int8(imm8); +} + void Assembler::shrdl(Register dst, Register src) { emit_int8(0x0F); emit_int8((unsigned char)0xAD); @@ -6189,6 +6276,40 @@ } } +void Assembler::prefix(Register dst, Register src, Prefix p) { + if (src->encoding() >= 8) { + p = (Prefix)(p | REX_B); + } + if (dst->encoding() >= 8) { + p = (Prefix)( p | REX_R); + } + if (p != Prefix_EMPTY) { + // do not generate an empty prefix + prefix(p); + } +} + +void Assembler::prefix(Register dst, Address adr, Prefix p) { + if (adr.base_needs_rex()) { + if (adr.index_needs_rex()) { + assert(false, "prefix(Register dst, Address adr, 
Prefix p) does not support handling of an X"); + } else { + prefix(REX_B); + } + } else { + if (adr.index_needs_rex()) { + assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X"); + } + } + if (dst->encoding() >= 8) { + p = (Prefix)(p | REX_R); + } + if (p != Prefix_EMPTY) { + // do not generate an empty prefix + prefix(p); + } +} + void Assembler::prefix(Address adr) { if (adr.base_needs_rex()) { if (adr.index_needs_rex()) { --- old/src/cpu/x86/vm/assembler_x86.hpp 2015-08-26 13:22:14.946242700 -0700 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2015-08-26 13:22:14.690217100 -0700 @@ -504,7 +504,8 @@ VEX_3bytes = 0xC4, VEX_2bytes = 0xC5, - EVEX_4bytes = 0x62 + EVEX_4bytes = 0x62, + Prefix_EMPTY = 0x0 }; enum VexPrefix { @@ -608,6 +609,8 @@ int prefixq_and_encode(int dst_enc, int src_enc); void prefix(Register reg); + void prefix(Register dst, Register src, Prefix p); + void prefix(Register dst, Address adr, Prefix p); void prefix(Address adr); void prefixq(Address adr); @@ -1165,6 +1168,10 @@ // Identify processor type and features void cpuid(); + // CRC32C + void crc32(Register crc, Register v, int8_t sizeInBytes); + void crc32(Register crc, Address adr, int8_t sizeInBytes); + // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value void cvtsd2ss(XMMRegister dst, XMMRegister src); void cvtsd2ss(XMMRegister dst, Address src); @@ -1764,6 +1771,7 @@ void setb(Condition cc, Register dst); void shldl(Register dst, Register src); + void shldl(Register dst, Register src, int8_t imm8); void shll(Register dst, int imm8); void shll(Register dst); --- old/src/cpu/x86/vm/assembler_x86.inline.hpp 2015-08-26 13:22:16.618409900 -0700 +++ new/src/cpu/x86/vm/assembler_x86.inline.hpp 2015-08-26 13:22:16.361384200 -0700 @@ -37,6 +37,8 @@ inline int Assembler::prefixq_and_encode(int dst_enc, int src_enc) { return dst_enc << 3 | src_enc; } inline void Assembler::prefix(Register reg) {} +inline void 
Assembler::prefix(Register dst, Register src, Prefix p) {} +inline void Assembler::prefix(Register dst, Address adr, Prefix p) {} inline void Assembler::prefix(Address adr) {} inline void Assembler::prefixq(Address adr) {} --- old/src/cpu/x86/vm/interpreterGenerator_x86.hpp 2015-08-26 13:22:18.228570900 -0700 +++ new/src/cpu/x86/vm/interpreterGenerator_x86.hpp 2015-08-26 13:22:17.969545000 -0700 @@ -42,6 +42,7 @@ address generate_Reference_get_entry(); address generate_CRC32_update_entry(); address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind); + address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind); void lock_method(void); void generate_stack_overflow_check(void); --- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-08-26 13:22:19.832731300 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-08-26 13:22:19.573705400 -0700 @@ -8467,8 +8467,504 @@ notl(crc); // ~c } -#undef BIND -#undef BLOCK_COMMENT +namespace CRC32C { +#include "crc32c.h" + +#define Nehalem(x) x +#define Westmere(x) x + +#undef IN +#define IN(x) x +#define INOUT(x) x +#undef OUT +#define OUT(x) x +#define Scratch(x) x + +#undef D + +#ifdef _LP64 +// S. Gueron / Information Processing Letters 112 (2012) 184 +// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table. +// Input: A 32 bit value B = [byte3, byte2, byte1, byte0]. 
+// Output: the 64-bit carry-less product of B * CONST + void IPL_Alg4(INOUT(Register B), uint32_t n, + Scratch(Register C), Scratch(Register D), Scratch(Register Z), + MacroAssembler * This) { + This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr())); + if (n > 0) { + This->addq(Z, n * 256 * 8); + } + // Q1 = TABLEExt[n][B & 0xFF]; + This->movl(C, B); + This->andl(C, 0x000000FF); + This->shll(C, 3); + This->addq(C, Z); + This->movq(C, Address(C, 0)); + + // Q2 = TABLEExt[n][B >> 8 & 0xFF]; + This->movl(D, B); + This->shrl(D, 8); + This->andl(D, 0x000000FF); + This->shll(D, 3); + This->addq(D, Z); + This->movq(D, Address(D, 0)); + + This->shlq(D, 8); + This->xorq(C, D); + + // Q3 = TABLEExt[n][B >> 16 & 0xFF]; + This->movl(D, B); + This->shrl(D, 16); + This->andl(D, 0x000000FF); + This->shll(D, 3); + This->addq(D, Z); + This->movq(D, Address(D, 0)); + + This->shlq(D, 16); + This->xorq(C, D); + + // Q4 = TABLEExt[n][B >> 24 & 0xFF]; + This->shrl(B, 24); + This->andl(B, 0x000000FF); + This->shll(B, 3); + This->addq(B, Z); + This->movq(B, Address(B, 0)); + + This->shlq(B, 24); + This->xorq(B, C); + // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; + } + + void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)), + INOUT(Register crc), + uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported, + Westmere(Scratch(XMMRegister DXMM)), + Scratch(Register A), + Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)), + MacroAssembler * This) { + if (IsPclmulqdqSupported) { + This->movdl(crcXMM, crc); // modified blindly + + This->movl(A, CONSTOrPreCompConstIndex); + This->movdl(DXMM, A); + This->pclmulqdq(crcXMM, DXMM, 0); + + This->movdq(crc, crcXMM); + } else { + IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, This); + } + } + + // Recombination Alternative 2: No bit-reflections + // T1 = (CRC_A * U1) << 1 + // T2 = (CRC_B * U2) << 1 + // C1 = T1 >> 32 + // C2 = T2 >> 32 + // T1 = T1 & 0xFFFFFFFF + // T2 = T2 & 0xFFFFFFFF + // T1 = CRC32(0, T1) + // T2 = 
CRC32(0, T2) + // C1 = C1 ^ T1 + // C2 = C2 ^ T2 + // CRC = C1 ^ C2 ^ CRC_C + void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + Scratch(Register E), Scratch(Register F), + Nehalem(Scratch(Register G)), + MacroAssembler * This) { + PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This); + PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This); + This->shlq(crcA, 1); + This->movl(E, crcA); + This->shrq(crcA, 32); + This->xorl(F, F); + This->crc32(F, E, 4); + This->xorl(crcA, F); // we don't care about upper 32 bit contents here + This->shlq(crcB, 1); + This->movl(E, crcB); + This->shrq(crcB, 32); + This->xorl(F, F); + This->crc32(F, E, 4); + This->xorl(crcB, F); + This->xorl(crcA, crcB); + This->xorl(crcA, crcC); + } + + // Set N to predefined value + // Subtract from a lenght of a buffer + // execute in a loop: + // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0 + // for i = 1 to N do + // CRC_A = CRC32(CRC_A, A[i]) + // CRC_B = CRC32(CRC_B, B[i]) + // CRC_C = CRC32(CRC_C, C[i]) + // end for + // Recombine + void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, + INOUT(Register len), INOUT(Register buf), INOUT(Register crc), + Scratch(Register E), Scratch(Register F), Scratch(Register end), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + Scratch(Register G), Scratch(Register H), + Nehalem(Scratch(Register I)), + MacroAssembler * This) { + Label L_processPartitions; + Label L_processPartition; + Label L_exit; + + This->bind(L_processPartitions); + This->cmpl(len, 3 * size); + This->jcc(Assembler::less, L_exit); + 
This->xorl(E, E); + This->xorl(F, F); + This->movq(end, buf); + This->addq(end, size); + + This->bind(L_processPartition); + This->crc32(crc, Address(buf, 0), 8); + This->crc32(E, Address(buf, size), 8); + This->crc32(F, Address(buf, size * 2), 8); + This->addq(buf, 8); + This->cmpq(buf, end); + This->jcc(Assembler::less, L_processPartition); + RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F, + AXMM, BXMM, CXMM, + G, H, + I, + This); + This->addq(buf, 2 * size); + This->subl(len, 3 * size); + This->jmp(L_processPartitions); + + This->bind(L_exit); + } +#else +void IPL_Alg4(INOUT(Register B), uint32_t n, + Scratch(Register C), Scratch(Register D), Scratch(Register Z), + Scratch(XMMRegister CXMM), Scratch(XMMRegister DXMM), + MacroAssembler * This) { + This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr())); + if (n > 0) { + This->addl(Z, n * 256 * 8); + } + // Q1 = TABLEExt[n][B & 0xFF]; + This->movl(C, B); + This->andl(C, 0x000000FF); + This->shll(C, 3); + This->addl(C, Z); + This->movq(CXMM, Address(C, 0)); + + // Q2 = TABLEExt[n][B >> 8 & 0xFF]; + This->movl(D, B); + This->shrl(D, 8); + This->andl(D, 0x000000FF); + This->shll(D, 3); + This->addl(D, Z); + This->movq(DXMM, Address(D, 0)); + + This->psllq(DXMM, 8); + This->pxor(CXMM, DXMM); + + // Q3 = TABLEExt[n][B >> 16 & 0xFF]; + This->movl(D, B); + This->shrl(D, 16); + This->andl(D, 0x000000FF); + This->shll(D, 3); + This->addl(D, Z); + This->movq(DXMM, Address(D, 0)); + + This->psllq(DXMM, 16); + This->pxor(CXMM, DXMM); + + // Q4 = TABLEExt[n][B >> 24 & 0xFF]; + This->shrl(B, 24); + This->andl(B, 0x000000FF); + This->shll(B, 3); + This->addl(B, Z); + This->movq(DXMM, Address(B, 0)); + + This->psllq(DXMM, 24); + This->pxor(CXMM, DXMM); // Result in CXMM + // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; +} + +void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)), + INOUT(Register crc), + uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported, + 
Westmere(Scratch(XMMRegister DXMM)), + Scratch(Register A), + Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)), + MacroAssembler * This) { + if (IsPclmulqdqSupported) { + This->movdl(crcXMM, crc); + + This->movl(A, CONSTOrPreCompConstIndex); + This->movdl(DXMM, A); + This->pclmulqdq(crcXMM, DXMM, 0); + // Keep result in XMM since GPR is 32 bit in length + } else { + IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, crcXMM, DXMM, This); + } +} + +void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + Scratch(Register E), Scratch(Register F), + Nehalem(Scratch(Register G)), + MacroAssembler * This) { + PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This); + PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This); + + This->psllq(AXMM, 1); + This->movdl(E, AXMM); + This->psrlq(AXMM, 32); + This->movdl(crcA, AXMM); + + This->xorl(F, F); + This->crc32(F, E, 4); + This->xorl(crcA, F); + + This->psllq(BXMM, 1); + This->movdl(E, BXMM); + This->psrlq(BXMM, 32); + This->movdl(crcB, BXMM); + + This->xorl(F, F); + This->crc32(F, E, 4); + This->xorl(crcB, F); + This->xorl(crcA, crcB); + This->xorl(crcA, crcC); +} + +void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, + INOUT(Register len), INOUT(Register buf), INOUT(Register crc), + Scratch(Register E), Scratch(Register F), Scratch(Register end), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + Scratch(Register G), Scratch(Register H), + Nehalem(Scratch(Register I)), + MacroAssembler * This) { + Label L_processPartitions; + Label L_processPartition; + Label 
L_exit; + + This->bind(L_processPartitions); + This->cmpl(len, 3 * size); + This->jcc(Assembler::less, L_exit); + This->xorl(E, E); + This->xorl(F, F); + This->movl(end, buf); + This->addl(end, size); + + This->bind(L_processPartition); + This->crc32(crc, Address(buf, 0), 4); + This->crc32(E, Address(buf, size), 4); + This->crc32(F, Address(buf, size*2), 4); + This->crc32(crc, Address(buf, 0+4), 4); + This->crc32(E, Address(buf, size+4), 4); + This->crc32(F, Address(buf, size*2+4), 4); + This->addl(buf, 8); + This->cmpl(buf, end); + This->jcc(Assembler::less, L_processPartition); + + This->push(end); + This->push(len); + This->push(buf); + G = end; + H = len; + I = buf; + + RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F, + AXMM, BXMM, CXMM, + G, H, + I, + This); + + This->pop(buf); + This->pop(len); + This->pop(end); + + This->addl(buf, 2 * size); + This->subl(len, 3 * size); + This->jmp(L_processPartitions); + + This->bind(L_exit); +} +#endif //LP64 +} +#undef D + +#ifdef _LP64 +// Algorithm 2: Pipelined usage of the CRC32 instruction. +// Input: A buffer I of L bytes. +// Output: the CRC32C value of the buffer. +// Notations: +// Write L = 24N + r, with N = floor (L/24). +// r = L mod 24 (0 <= r < 24). +// Consider I as the concatenation of A|B|C|R, where A, B, C, each, +// N quadwords, and R consists of r bytes. 
+// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1 +// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1 +// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1 +// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1 +void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len, + Scratch(Register A), Scratch(Register B), Scratch(Register C), + Scratch(Register D), Scratch(Register E), Scratch(Register F), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + bool IsPclmulqdqSupported) { + uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS]; + Label L_wordByWord; + Label L_byteByByteProlog; + Label L_byteByByte; + Label L_exit; + + if (IsPclmulqdqSupported ) { + CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; + CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1); + + CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); + CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); + + CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); + CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); + assert((CRC32C::NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\""); + } else { + CONSTOrPreCompConstIndex[0] = 1; + CONSTOrPreCompConstIndex[1] = 0; + + CONSTOrPreCompConstIndex[2] = 3; + CONSTOrPreCompConstIndex[3] = 2; + + CONSTOrPreCompConstIndex[4] = 5; + CONSTOrPreCompConstIndex[5] = 4; + } + CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + 
CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + movl(A, len); + andl(A, 0x00000007); + negl(A); + addl(A, len); + addq(A, buf); + + BIND(L_wordByWord); + cmpq(buf, A); + jcc(Assembler::greaterEqual, L_byteByByteProlog); + crc32(crc, Address(buf, 0), 4); + addq(buf, 4); + jmp(L_wordByWord); + + BIND(L_byteByByteProlog); + andl(len, 0x00000007); + movl(B, 1); + + BIND(L_byteByByte); + cmpl(B, len); + jccb(Assembler::greater, L_exit); + crc32(crc, Address(buf, 0), 1); + incq(buf); + incl(B); + jmp(L_byteByByte); + + BIND(L_exit); +} +#else +void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len, + Scratch(Register A), Scratch(Register B), Scratch(Register C), + Scratch(Register D), Scratch(Register E), Scratch(Register F), + Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)), + bool IsPclmulqdqSupported) { + uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS]; + Label L_wordByWord; + Label L_byteByByteProlog; + Label L_byteByByte; + Label L_exit; + + if (IsPclmulqdqSupported) { + CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; + CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1); + + CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2); + CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3); + + CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4); + CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5); + } else { + CONSTOrPreCompConstIndex[0] = 1; + CONSTOrPreCompConstIndex[1] = 0; + + CONSTOrPreCompConstIndex[2] = 3; + CONSTOrPreCompConstIndex[3] = 2; + + CONSTOrPreCompConstIndex[4] = 5; + CONSTOrPreCompConstIndex[5] = 4; + } + 
CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported, + len, buf, crc, + A, B, C, + AXMM, BXMM, CXMM, + D, E, + F, + this); + movl(A, len); + andl(A, 0x00000007); + negl(A); + addl(A, len); + addl(A, buf); + + BIND(L_wordByWord); + cmpl(buf, A); + jcc(Assembler::greaterEqual, L_byteByByteProlog); + crc32(crc, Address(buf,0), 4); + addl(buf, 4); + jmp(L_wordByWord); + + BIND(L_byteByByteProlog); + andl(len, 0x00000007); + movl(B, 1); + + BIND(L_byteByByte); + cmpl(B, len); + jccb(Assembler::greater, L_exit); + movb(A, Address(buf, 0)); + crc32(crc, A, 1); + incl(buf); + incl(B); + jmp(L_byteByByte); + + BIND(L_exit); +} +#endif // LP64 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { --- old/src/cpu/x86/vm/macroAssembler_x86.hpp 2015-08-26 13:22:21.751923200 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.hpp 2015-08-26 13:22:21.500898100 -0700 @@ -1258,9 +1258,15 @@ Register raxReg); #endif - // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic. + // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic. 
void update_byte_crc32(Register crc, Register val, Register table); void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp); + // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic + void crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len, + Register A, Register B, Register C, + Register D, Register E, Register F, + XMMRegister AXMM, XMMRegister BXMM, XMMRegister CXMM, + bool IsPclmulqdqSupported); // Fold 128-bit data chunk void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset); void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf); --- old/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2015-08-26 13:22:23.385086500 -0700 +++ new/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2015-08-26 13:22:23.134061400 -0700 @@ -2941,6 +2941,63 @@ return start; } + /** + * Arguments: + * + * Inputs: + * rsp(4) - int crc + * rsp(8) - byte* buf + * rsp(12) - int length + * rsp(16) - table_start - optional (present only when doing a library_call, + * not used by x86 algorithm) + * + * Output: + * rax - int crc result + */ + address generate_updateBytesCRC32C(bool IsPclmulqdqSupported) { + assert(UseCRC32CIntrinsics, "need SSE4_2"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); + address start = __ pc(); + const Register crc = rax; // crc + const Register buf = rcx; // source java byte array address + const Register len = rdx; // length + const Register D = rbx; + const Register G = rsi; + const Register H = rdi; + const Register empty = 0; // will never be used, in order not + // to change a signature for crc32c_IPL_Alg2Alt2Fast + // between 64/32 I'm just keeping it here + assert_different_registers(crc, buf, len, D, G, H); + + BLOCK_COMMENT("Entry:"); + __ enter(); // required for proper stackwalking of RuntimeStub frame + Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 + + // we need to add additional 4
because __ enter + // have just pushed ebp on a stack + Address buf_arg(rsp, 4 + 4 + 4); + Address len_arg(rsp, 4 + 4 + 8); + // Load up: + __ movl(crc, crc_arg); + __ movl(buf, buf_arg); + __ movl(len, len_arg); + __ push(D); + __ push(G); + __ push(H); + __ crc32c_IPL_Alg2Alt2Fast(crc, buf, len, + D, G, H, + empty, empty, empty, + xmm0, xmm1, xmm2, + IsPclmulqdqSupported); + __ pop(H); + __ pop(G); + __ pop(D); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + // Safefetch stubs. void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) { @@ -3154,6 +3211,13 @@ StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table; StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); } + + if (UseCRC32CIntrinsics) { + bool supports_clmul; + StubRoutines::x86::GenerateCRC32CTable(supports_clmul = VM_Version::supports_clmul()); + StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table; + StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul); + } } --- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2015-08-26 13:22:25.091257100 -0700 +++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2015-08-26 13:22:24.815229500 -0700 @@ -3895,6 +3895,64 @@ return start; } + /** + * Arguments: + * + * Inputs: + * c_rarg0 - int crc + * c_rarg1 - byte* buf + * c_rarg2 - long length + * c_rarg3 - table_start - optional (present only when doing a library_call, + * not used by x86 algorithm) + * + * Output: + * rax - int crc result + */ + address generate_updateBytesCRC32C(bool IsPclmulqdqSupported) { + assert(UseCRC32CIntrinsics, "need SSE4_2"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); + address start = __ pc(); + //reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs + //Windows RCX RDX R8 R9 none none XMM0..XMM3 + //Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7 + const
Register crc = c_rarg0; // crc + const Register buf = c_rarg1; // source java byte array address + const Register len = c_rarg2; // length + const Register A = rax; + const Register J = r9; + const Register K = r10; + const Register L = r11; +#ifdef _WIN64 + const Register Y = rdi; + const Register Z = rsi; +#else + const Register Y = rcx; + const Register Z = r8; +#endif + assert_different_registers(crc, buf, len, A, J, K, L, Y, Z); + + BLOCK_COMMENT("Entry:"); + __ enter(); // required for proper stackwalking of RuntimeStub frame +#ifdef _WIN64 + __ push(Y); + __ push(Z); +#endif + __ crc32c_IPL_Alg2Alt2Fast(crc, buf, len, + A, J, K, + L, Y, Z, + c_farg0, c_farg1, c_farg2, + IsPclmulqdqSupported); + __ movl(rax, crc); +#ifdef _WIN64 + __ pop(Z); + __ pop(Y); +#endif + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } /** * Arguments: @@ -4239,6 +4297,13 @@ StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table; StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); } + + if (UseCRC32CIntrinsics) { + bool supports_clmul; + StubRoutines::x86::GenerateCRC32CTable(supports_clmul = VM_Version::supports_clmul()); + StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table; + StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul); + } } void generate_all() { --- old/src/cpu/x86/vm/stubRoutines_x86.cpp 2015-08-26 13:22:26.820430000 -0700 +++ new/src/cpu/x86/vm/stubRoutines_x86.cpp 2015-08-26 13:22:26.564404400 -0700 @@ -130,3 +130,114 @@ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, 0x2d02ef8dUL }; + +namespace CRC32C { + #include "crc32c.h" + + #undef CONST + static juint x; + #define CONST x + + #define D 32 + #define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41) + + #define TILL_CYCLE 31 + uint32_t Pow2k[TILL_CYCLE]; // because Pow2k[TILL_CYCLE == 31] == Pow2k[0] + + // A. Kadatch and B. 
Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8 + // Listing 1: Multiplication of normalized polynomials + // "a" and "b" occupy D least significant bits. + uint32_t Multiply(uint32_t a, uint32_t b) { + uint32_t product = 0; + uint32_t bPowX[D + 1]; // bPowX[k] = (b * x**k) mod P + bPowX[0] = b; + for (int k = 0; k < D; ++k) { + // If "a" has non-zero coefficient at x**k,/ add ((b * x**k) mod P) to the result. + if ((a & (uint64_t)(1 << (D - 1 - k))) != 0) product ^= bPowX[k]; + + // Compute bPowX[k+1] = (b ** x**(k+1)) mod P. + if (bPowX[k] & 1) { + // If degree of (bPowX[k] * x) is D, then + // degree of (bPowX[k] * x - P) is less than D. + bPowX[k + 1] = (bPowX[k] >> 1) ^ P; + } + else { + bPowX[k + 1] = bPowX[k] >> 1; + } + } + return product; + } + + // A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9 + void InitPow2k(void) { + // Pow2k(0) = + // x^(2^k) mod P(x) = x mod P(x) = x + // Since we are operating on a reflected values + // x = 10b, reflect(x) = 0x40000000 + Pow2k[0] = 0x40000000; + + for (int k = 1; k < TILL_CYCLE; k++) { + // Pow2k(k+1) = Pow2k(k-1)^2 mod P(x) + uint32_t tmp = Pow2k[k - 1]; + Pow2k[k] = Multiply(tmp, tmp); + } + } + + // x^N mod P(x) + uint32_t FPowN(uint32_t n) { + // result = 1 (polynomial) + uint32_t one, result = 0x80000000, i = 0; + + while (one = (n & 1), (n == 1 || n - one > 0)) { + if (one) { + result = Multiply(result, Pow2k[i]); + } + n >>= 1; + i++; + } + + return result; + } +} + +juint *StubRoutines::x86::_crc32c_table; + +void StubRoutines::x86::GenerateCRC32CTable(bool IsPclmulqdqSupported) { + using namespace CRC32C; + + static juint PowN[NUM_PRECOMPUTED_CONSTANTS]; + + InitPow2k(); + + PowN[0] = FPowN(HIGH * 8); // 8N * 8 = 64N + PowN[1] = FPowN(HIGH * 8 * 2); // 128N + + PowN[2] = FPowN(MIDDLE * 8); + PowN[3] = FPowN(MIDDLE * 8 * 2); + + PowN[4] = FPowN(LOW * 8); + PowN[NUM_PRECOMPUTED_CONSTANTS - 1] = + FPowN(LOW * 8 * 2); + + if 
(IsPclmulqdqSupported) { + _crc32c_table = PowN; + } else { + static julong PCLMULQDQ[NUM_PRECOMPUTED_CONSTANTS * 256]; + + for (int j = 0; j < NUM_PRECOMPUTED_CONSTANTS; j++) { + CONST = PowN[j]; + for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations + // S. Gueron / Information Processing Letters 112 (2012) 184 + // Algorithm 3: Generating a carry-less multiplication lookup table. + // Input: A 32-bit constant, CONST. + // Output: A table of 256 entries, each one is a 64-bit quadword, + // that can be used for computing "byte" * CONST, for a given byte. + PCLMULQDQ[j * 256 + i] = + ((i & 1) * CONST) ^ ((i & 2) * CONST) ^ ((i & 4) * CONST) ^ + ((i & 8) * CONST) ^ ((i & 16) * CONST) ^ ((i & 32) * CONST) ^ + ((i & 64) * CONST) ^ ((i & 128) * CONST); + } + } + _crc32c_table = (juint*)PCLMULQDQ; + } +} --- old/src/cpu/x86/vm/stubRoutines_x86.hpp 2015-08-26 13:22:28.525600500 -0700 +++ new/src/cpu/x86/vm/stubRoutines_x86.hpp 2015-08-26 13:22:28.239571900 -0700 @@ -36,6 +36,8 @@ // masks and table for CRC32 static uint64_t _crc_by128_masks[]; static juint _crc_table[]; + // table for CRC32C + static juint* _crc32c_table; // swap mask for ghash static address _ghash_long_swap_mask_addr; static address _ghash_byte_swap_mask_addr; @@ -46,5 +48,6 @@ static address crc_by128_masks_addr() { return (address)_crc_by128_masks; } static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; } static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; } + static void GenerateCRC32CTable(bool IsPclmulqdqSupported); #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP --- old/src/cpu/x86/vm/templateInterpreter_x86_32.cpp 2015-08-26 13:22:30.226770600 -0700 +++ new/src/cpu/x86/vm/templateInterpreter_x86_32.cpp 2015-08-26 13:22:29.961744100 -0700 @@ -809,18 +809,25 @@ const Register buf = rdx; // source java byte array address const Register len = rdi; // length + // value x86_32 + // interp. 
arg ptr ESP + 4 + // int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len) + // 3 2 1 0 + // int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len) + // 4 2,3 1 0 + // Arguments are reversed on java expression stack - __ movl(len, Address(rsp, wordSize)); // Length + __ movl(len, Address(rsp, 4 + 0)); // Length // Calculate address of start element if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) { - __ movptr(buf, Address(rsp, 3*wordSize)); // long buf - __ addptr(buf, Address(rsp, 2*wordSize)); // + offset - __ movl(crc, Address(rsp, 5*wordSize)); // Initial CRC + __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long buf + __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset + __ movl(crc, Address(rsp, 4 + 4 * wordSize)); // Initial CRC } else { - __ movptr(buf, Address(rsp, 3*wordSize)); // byte[] array + __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size - __ addptr(buf, Address(rsp, 2*wordSize)); // + offset - __ movl(crc, Address(rsp, 4*wordSize)); // Initial CRC + __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset + __ movl(crc, Address(rsp, 4 + 3 * wordSize)); // Initial CRC } __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32()), crc, buf, len); @@ -838,6 +845,53 @@ return entry; } + return generate_native_entry(false); +} + +/** +* Method entry for static native methods: +* int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end) +* int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end) +*/ +address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { + if (UseCRC32CIntrinsics) { + address entry = __ pc(); + // Load parameters + const Register crc = rax; // crc + const Register buf = rcx; // source java byte array address + const Register len = rdx; // length + const Register end 
= len; + + // value x86_32 + // interp. arg ptr ESP + 4 + // int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end) + // 3 2 1 0 + // int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end) + // 4 2,3 1 0 + + // Arguments are reversed on java expression stack + __ movl(end, Address(rsp, 4 + 0)); // end + __ subl(len, Address(rsp, 4 + 1 * wordSize)); // end - offset == length + // Calculate address of start element + if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) { + __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long address + __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset + __ movl(crc, Address(rsp, 4 + 4 * wordSize)); // Initial CRC + } else { + __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array + __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size + __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset + __ movl(crc, Address(rsp, 4 + 3 * wordSize)); // Initial CRC + } + __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len); + // result in rax + // _areturn + __ pop(rdi); // get return address + __ mov(rsp, rsi); // set sp to sender sp + __ jmp(rdi); + + return entry; + } return generate_native_entry(false); } --- old/src/cpu/x86/vm/templateInterpreter_x86_64.cpp 2015-08-26 13:22:32.024950400 -0700 +++ new/src/cpu/x86/vm/templateInterpreter_x86_64.cpp 2015-08-26 13:22:31.770925000 -0700 @@ -804,6 +804,57 @@ return generate_native_entry(false); } +/** +* Method entry for static native methods: +* int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end) +* int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end) +*/ +address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { + if (UseCRC32CIntrinsics) { + address entry = __ pc(); + // Load parameters + const Register crc = c_rarg0; // crc + const Register buf = 
c_rarg1; // source java byte array address + const Register len = c_rarg2; + const Register off = c_rarg3; // offset + const Register end = len; + + // Arguments are reversed on java expression stack + // Calculate address of start element + if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) { + __ movptr(buf, Address(rsp, 3 * wordSize)); // long buf + __ movl2ptr(off, Address(rsp, 2 * wordSize)); // offset + __ addq(buf, off); // + offset + __ movl(crc, Address(rsp, 5 * wordSize)); // Initial CRC + // Note on 5 * wordSize vs. 4 * wordSize: + // * int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end) + // 4 2,3 1 0 + // end starts at SP + 8 + // The Java Virtual Machine Specification, Java SE 7 Edition + // 4.10.2.3. Values of Types long and double + // "When calculating operand stack length, values of type long and double have length two." + } else { + __ movptr(buf, Address(rsp, 3 * wordSize)); // byte[] array + __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size + __ movl2ptr(off, Address(rsp, 2 * wordSize)); // offset + __ addq(buf, off); // + offset + __ movl(crc, Address(rsp, 4 * wordSize)); // Initial CRC + } + __ movl(end, Address(rsp, wordSize)); // end + __ subl(end, off); // end - off + __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len); + // result in rax + // _areturn + __ pop(rdi); // get return address + __ mov(rsp, r13); // set sp to sender sp + __ jmp(rdi); + + return entry; + } + + return generate_native_entry(false); +} + // Interpreter stub for calling a native method. (asm interpreter) // This sets up a somewhat different looking stack for calling the // native method than the typical interpreter frame setup. 
--- old/src/cpu/x86/vm/vm_version_x86.cpp 2015-08-26 13:22:33.742122100 -0700 +++ new/src/cpu/x86/vm/vm_version_x86.cpp 2015-08-26 13:22:33.492097100 -0700 @@ -665,6 +665,18 @@ FLAG_SET_DEFAULT(UseCRC32Intrinsics, false); } + if (supports_sse4_2()) { + if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) { + UseCRC32CIntrinsics = true; + } + } + else if (UseCRC32CIntrinsics) { + if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) { + warning("CRC32C intrinsics are not available on this CPU"); + } + FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false); + } + // The AES intrinsic stubs require AES instruction support (of course) // but also require sse3 mode for instructions it use. if (UseAES && (UseSSE > 2)) { @@ -699,12 +711,6 @@ FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); } - if (UseCRC32CIntrinsics) { - if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) - warning("CRC32C intrinsics are not available on this CPU"); - FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false); - } - // Adjust RTM (Restricted Transactional Memory) flags if (!supports_rtm() && UseRTMLocking) { // Can't continue because UseRTMLocking affects UseBiasedLocking flag --- old/src/cpu/zero/vm/interpreterGenerator_zero.hpp 2015-08-26 13:22:35.320279900 -0700 +++ new/src/cpu/zero/vm/interpreterGenerator_zero.hpp 2015-08-26 13:22:35.070254900 -0700 @@ -42,4 +42,5 @@ // Not supported address generate_CRC32_update_entry() { return NULL; } address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; } + address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; } #endif // CPU_ZERO_VM_INTERPRETERGENERATOR_ZERO_HPP --- old/src/share/vm/classfile/vmSymbols.hpp 2015-08-26 13:22:36.933441200 -0700 +++ new/src/share/vm/classfile/vmSymbols.hpp 2015-08-26 13:22:36.676415500 -0700 @@ -865,9 +865,9 @@ \ /* support for java.util.zip.CRC32C */ \ do_class(java_util_zip_CRC32C, "java/util/zip/CRC32C") \ - do_intrinsic(_updateBytesCRC32C, java_util_zip_CRC32C, updateBytes_name, 
updateBytes_signature, F_S) \ - do_intrinsic(_updateDirectByteBufferCRC32C, java_util_zip_CRC32C, updateDirectByteBuffer_name, updateByteBuffer_signature, F_S) \ - do_name( updateDirectByteBuffer_name, "updateDirectByteBuffer") \ + do_intrinsic(_updateBytesCRC32C, java_util_zip_CRC32C, updateBytes_name, updateBytes_signature, F_S) \ + do_intrinsic(_updateDirectByteBufferCRC32C, java_util_zip_CRC32C, updateDirectByteBuffer_name, updateByteBuffer_signature, F_S) \ + do_name(updateDirectByteBuffer_name, "updateDirectByteBuffer") \ \ /* support for sun.misc.Unsafe */ \ do_class(sun_misc_Unsafe, "sun/misc/Unsafe") \ --- old/src/share/vm/interpreter/abstractInterpreter.hpp 2015-08-26 13:22:38.538601700 -0700 +++ new/src/share/vm/interpreter/abstractInterpreter.hpp 2015-08-26 13:22:38.288576700 -0700 @@ -90,6 +90,8 @@ java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update() java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes() java_util_zip_CRC32_updateByteBuffer, // implementation of java.util.zip.CRC32.updateByteBuffer() + java_util_zip_CRC32C_updateBytes, // implementation of java.util.zip.CRC32C.updateBytes(crc, b[], off, end) + java_util_zip_CRC32C_updateDirectByteBuffer, // implementation of java.util.zip.CRC32C.updateDirectByteBuffer(crc, address, off, end) number_of_method_entries, invalid = -1 }; --- old/src/share/vm/interpreter/interpreter.cpp 2015-08-26 13:22:40.114759300 -0700 +++ new/src/share/vm/interpreter/interpreter.cpp 2015-08-26 13:22:39.864734300 -0700 @@ -232,6 +232,13 @@ case vmIntrinsics::_updateByteBufferCRC32 : return java_util_zip_CRC32_updateByteBuffer; } } + if (UseCRC32CIntrinsics) { + // Use optimized stub code for CRC32C methods. + switch (m->intrinsic_id()) { + case vmIntrinsics::_updateBytesCRC32C : return java_util_zip_CRC32C_updateBytes; + case vmIntrinsics::_updateDirectByteBufferCRC32C : return java_util_zip_CRC32C_updateDirectByteBuffer; + } + } #endif // Native method? 
@@ -339,6 +346,8 @@ case java_util_zip_CRC32_update : tty->print("java_util_zip_CRC32_update"); break; case java_util_zip_CRC32_updateBytes : tty->print("java_util_zip_CRC32_updateBytes"); break; case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break; + case java_util_zip_CRC32C_updateBytes : tty->print("java_util_zip_CRC32C_updateBytes"); break; + case java_util_zip_CRC32C_updateDirectByteBuffer: tty->print("java_util_zip_CRC32C_updateDirectByteBuffer"); break; default: if (kind >= method_handle_invoke_FIRST && kind <= method_handle_invoke_LAST) { @@ -557,6 +566,11 @@ case Interpreter::java_util_zip_CRC32_updateBytes : // fall thru case Interpreter::java_util_zip_CRC32_updateByteBuffer : entry_point = generate_CRC32_updateBytes_entry(kind); break; + case Interpreter::java_util_zip_CRC32C_updateBytes + : // fall thru + case Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer + : entry_point = generate_CRC32C_updateBytes_entry(kind); break; + #endif // CC_INTERP default: fatal(err_msg("unexpected method kind: %d", kind)); --- old/src/share/vm/interpreter/templateInterpreter.cpp 2015-08-26 13:22:41.694917300 -0700 +++ new/src/share/vm/interpreter/templateInterpreter.cpp 2015-08-26 13:22:41.433891200 -0700 @@ -390,6 +390,11 @@ method_entry(java_util_zip_CRC32_updateByteBuffer) } + if (UseCRC32CIntrinsics) { + method_entry(java_util_zip_CRC32C_updateBytes) + method_entry(java_util_zip_CRC32C_updateDirectByteBuffer) + } + initialize_method_handle_entries(); // all native method kinds (must be one contiguous block) --- old/src/share/vm/runtime/stubRoutines.cpp 2015-08-26 13:22:43.281075900 -0700 +++ new/src/share/vm/runtime/stubRoutines.cpp 2015-08-26 13:22:43.029050700 -0700 @@ -135,8 +135,9 @@ address StubRoutines::_sha512_implCompressMB = NULL; address StubRoutines::_updateBytesCRC32 = NULL; -address StubRoutines::_crc_table_adr = NULL; +address StubRoutines::_crc_table_adr = NULL; +address StubRoutines::_crc32c_table_addr = NULL; address 
StubRoutines::_updateBytesCRC32C = NULL; address StubRoutines::_multiplyToLen = NULL; --- old/src/share/vm/runtime/stubRoutines.hpp 2015-08-26 13:22:44.890236800 -0700 +++ new/src/share/vm/runtime/stubRoutines.hpp 2015-08-26 13:22:44.638211600 -0700 @@ -197,6 +197,7 @@ static address _updateBytesCRC32; static address _crc_table_adr; + static address _crc32c_table_addr; static address _updateBytesCRC32C; static address _multiplyToLen; @@ -361,6 +362,7 @@ static address updateBytesCRC32() { return _updateBytesCRC32; } static address crc_table_addr() { return _crc_table_adr; } + static address crc32c_table_addr() { return _crc32c_table_addr; } static address updateBytesCRC32C() { return _updateBytesCRC32C; } static address multiplyToLen() {return _multiplyToLen; } --- old/src/share/vm/runtime/vmStructs.cpp 2015-08-26 13:22:46.480395800 -0700 +++ new/src/share/vm/runtime/vmStructs.cpp 2015-08-26 13:22:46.220369800 -0700 @@ -830,6 +830,7 @@ static_field(StubRoutines, _ghash_processBlocks, address) \ static_field(StubRoutines, _updateBytesCRC32, address) \ static_field(StubRoutines, _crc_table_adr, address) \ + static_field(StubRoutines, _crc32c_table_addr, address) \ static_field(StubRoutines, _updateBytesCRC32C, address) \ static_field(StubRoutines, _multiplyToLen, address) \ static_field(StubRoutines, _squareToLen, address) \ --- /dev/null 2015-08-26 13:22:48.000000000 -0700 +++ new/src/cpu/x86/vm/crc32c.h 2015-08-26 13:22:47.906538400 -0700 @@ -0,0 +1,66 @@ +/* +* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. 
+* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code). +* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +* +*/ + +enum { + // S. Gueron / Information Processing Letters 112 (2012) 184 + // shows that anything above 6K and below 32K is a good choice + // 32K does not deliver any further performance gains + // 6K=8*256 (*3 as we compute 3 blocks together) + // + // Thus selecting the smallest value so it could apply to the largest number + // of buffer sizes. + HIGH = 8 * 256, + + // empirical + // based on ubench study using methodology described in + // V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction April 2011 8 + // + // arbitrary value between 27 and 256 + MIDDLE = 8 * 86, + + // V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction April 2011 9 + // shows that 240 and 1024 are equally good choices as the 216==8*27 + // + // Selecting the smallest value which resulted in a significant performance improvement over + // sequential version + LOW = 8 * 27, + + NUM_ChunkSizeInBytes = 3 +}; +// Notes: +// 1. Why do we need to choose a "chunk" approach? +// Overhead of computing the required powers for an arbitrary buffer of size N is significant +// (implementation approaches a library perf.) +// 2. Why only 3 "chunks"? 
+// Performance experiment results showed that a HIGH+LOW combination was not delivering a stable speedup +// curve. +// +// Disclaimer: +// If you ever decide to increase/decrease the number of "chunks", be sure to modify +// a) constants table generation (src/cpu/x86/vm/stubRoutines_x86.cpp) +// b) constant fetch from that table (macroAssembler_x86.cpp) +// c) unrolled for loop (macroAssembler_x86.cpp) + +// We need to compute powers of 64N and 128N for each "chunk" size +enum { NUM_PRECOMPUTED_CONSTANTS = 2 * NUM_ChunkSizeInBytes };