--- old/src/cpu/x86/vm/assembler_x86.cpp 2016-04-15 10:37:34.385656000 -0700 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2016-04-15 10:37:34.224123700 -0700 @@ -2369,6 +2369,39 @@ emit_operand(dst, src); } +void Assembler::movdqa(Address dst, XMMRegister src) { + assert(VM_Version::supports_sse2(), ""); + InstructionMark im(this); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8(0x7F); + emit_operand(src, dst); +} + +void Assembler::vmovdqa(XMMRegister dst, Address src) { + assert(UseAVX > 0, ""); + InstructionMark im(this); + InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8(0x6F); + emit_operand(dst, src); +} + +void Assembler::vmovdqa(Address dst, XMMRegister src) { + assert(UseAVX > 0, ""); + InstructionMark im(this); + InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + // swap src<->dst for encoding + assert(src != xnoreg, "sanity"); + vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8(0x7F); + emit_operand(src, dst); +} + + void Assembler::movdqu(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionMark im(this); @@ -3170,6 +3203,16 @@ emit_int8(imm8); } +void Assembler::vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) { + assert(VM_Version::supports_avx2(), ""); + InstructionAttr attributes(AVX_256bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x46); + emit_int8(0xC0 | encode); + emit_int8(imm8); +} + + void Assembler::pause() { emit_int8((unsigned char)0xF3); emit_int8((unsigned char)0x90); @@ -3638,6 +3681,14 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::vpshufb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(VM_Version::supports_ssse3(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x00); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::pshufb(XMMRegister dst, Address src) { assert(VM_Version::supports_ssse3(), ""); InstructionMark im(this); @@ -3651,7 +3702,17 @@ void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) { assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_128bit; + int vector_len = VM_Version::supports_evex() ? 
AVX_512bit : AVX_128bit;
+ InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+ int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+ emit_int8(0x70);
+ emit_int8((unsigned char)(0xC0 | encode));
+ emit_int8(mode & 0xFF);
+}
+
+void Assembler::vpshufd(XMMRegister dst, XMMRegister src, int mode, int vector_len) {
+ assert(isByte(mode), "invalid value");
+ NOT_LP64(assert(VM_Version::supports_sse2(), ""));
 InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
 int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
 emit_int8(0x70);
 emit_int8((unsigned char)(0xC0 | encode));
@@ -3699,13 +3760,23 @@
 // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
 NOT_LP64(assert(VM_Version::supports_sse2(), ""));
 InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
- // XMM3 is for /3 encoding: 66 0F 73 /3 ib
 int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
 emit_int8(0x73);
 emit_int8((unsigned char)(0xC0 | encode));
 emit_int8(shift);
 }
+void Assembler::vpsrldq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+ // Shift right 128 bit value in dst XMMRegister by shift number of bytes.
+ NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+ InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+ // XMM3 is for /3 encoding: 66 0F 73 /3 ib
+ int encode = simd_prefix_and_encode(xmm3, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+ emit_int8(0x73);
+ emit_int8((unsigned char)(0xC0 | encode));
+ emit_int8(shift);
+}
+
 void Assembler::pslldq(XMMRegister dst, int shift) {
 // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
 NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -3717,6 +3788,17 @@
 emit_int8(shift);
 }
+void Assembler::vpslldq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+ // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
+ NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + // XMM7 is for /7 encoding: 66 0F 73 /7 ib + int encode = simd_prefix_and_encode(xmm7, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8(0x73); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(shift); +} + void Assembler::ptest(XMMRegister dst, Address src) { assert(VM_Version::supports_sse4_1(), ""); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); @@ -3981,6 +4063,15 @@ emit_int8(imm8); } +void Assembler::vpalignr(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { + assert(VM_Version::supports_ssse3(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0x0F); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8); +} + void Assembler::pblendw(XMMRegister dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); @@ -8191,6 +8282,15 @@ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A, &attributes); emit_int8((unsigned char)0xF0); emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8); +} + +void Assembler::rorxd(Register dst, Register src, int imm8) { + assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported"); + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0xF0); + emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); } --- old/src/cpu/x86/vm/assembler_x86.hpp 2016-04-15 10:37:36.030985000 -0700 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2016-04-15 10:37:35.886456100 -0700 @@ -1364,6 +1364,10 @@ void movdqa(XMMRegister dst, XMMRegister src); void movdqa(XMMRegister dst, Address src); + void movdqa(Address dst, XMMRegister src); + void vmovdqa(XMMRegister dst, Address src); + void vmovdqa(Address dst, XMMRegister src); + // Move Unaligned Double Quadword void movdqu(Address dst, XMMRegister src); void movdqu(XMMRegister dst, Address src); @@ -1519,6 +1523,7 @@ // Pemutation of 64bit words void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len); void vpermq(XMMRegister dst, XMMRegister src, int imm8); + void vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8); void pause(); @@ -1603,10 +1608,12 @@ // Shuffle Bytes void pshufb(XMMRegister dst, XMMRegister src); void pshufb(XMMRegister dst, Address src); + void vpshufb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // Shuffle Packed Doublewords void pshufd(XMMRegister dst, XMMRegister src, int mode); void pshufd(XMMRegister dst, Address src, int mode); + void vpshufd(XMMRegister dst, XMMRegister src, int mode, int vector_len); // Shuffle Packed Low Words void pshuflw(XMMRegister dst, XMMRegister src, int mode); @@ -1614,8 +1621,10 @@ // Shift Right by bytes Logical DoubleQuadword Immediate void psrldq(XMMRegister dst, int shift); + void 
vpsrldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
 // Shift Left by bytes Logical DoubleQuadword Immediate
 void pslldq(XMMRegister dst, int shift);
+ void vpslldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
 // Logical Compare 128bit
 void ptest(XMMRegister dst, XMMRegister src);
@@ -1658,6 +1667,7 @@
 #ifdef _LP64
 void rorq(Register dst, int imm8);
 void rorxq(Register dst, Register src, int imm8);
+ void rorxd(Register dst, Register src, int imm8);
 #endif
 void sahf();
@@ -1681,6 +1691,8 @@
 void setb(Condition cc, Register dst);
 void palignr(XMMRegister dst, XMMRegister src, int imm8);
+ void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
+
 void pblendw(XMMRegister dst, XMMRegister src, int imm8);
 void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8);
--- old/src/cpu/x86/vm/macroAssembler_x86.hpp 2016-04-15 10:37:37.449768700 -0700
+++ new/src/cpu/x86/vm/macroAssembler_x86.hpp 2016-04-15 10:37:37.298738500 -0700
@@ -902,6 +902,45 @@
 void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
 void ldmxcsr(AddressLiteral src);
+#ifdef _LP64
+ private:
+ void sha256_AVX2_one_round_compute(
+ Register reg_old_h,
+ Register reg_a,
+ Register reg_b,
+ Register reg_c,
+ Register reg_d,
+ Register reg_e,
+ Register reg_f,
+ Register reg_g,
+ Register reg_h,
+ int iter);
+ void sha256_AVX2_four_rounds_compute_first(int start);
+ void sha256_AVX2_four_rounds_compute_last(int start);
+ void sha256_AVX2_one_round_and_sched(
+ XMMRegister xmm_0, /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
+ XMMRegister xmm_1, /* ymm5 */ /* full cycle is 16 iterations */
+ XMMRegister xmm_2, /* ymm6 */
+ XMMRegister xmm_3, /* ymm7 */
+ Register reg_a, /* == eax on 0 iteration, then rotate 8 registers right on each subsequent iteration */
+ Register reg_b, /* ebx */ /* full cycle is 8 iterations */
+ Register reg_c, /* edi */
+ Register reg_d, /* esi */
+ Register reg_e, /* r8d */
+ Register reg_f, /* r9d */
+ Register reg_g, /* r10d */
+ Register reg_h, /* r11d */
+ int iter);
+
+ void addm(int disp, Register r1, Register r2);
+
+ public:
+ void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+ XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+ Register buf, Register state, Register ofs, Register limit, Register rsp,
+ bool multi_block, XMMRegister shuf_mask);
+#endif
+
 void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
 XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
 Register buf, Register state, Register ofs, Register limit, Register rsp,
--- old/src/cpu/x86/vm/macroAssembler_x86_sha.cpp 2016-04-15 10:37:38.690516800 -0700
+++ new/src/cpu/x86/vm/macroAssembler_x86_sha.cpp 2016-04-15 10:37:38.525483800 -0700
@@ -29,6 +29,546 @@
 #include "runtime/stubRoutines.hpp"
 #include "macroAssembler_x86.hpp"
+#ifdef _LP64
+/*
+ The algorithm below is based on the Intel publication:
+ "Fast SHA-256 Implementations on Intel® Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
+ The assembly code was originally provided by Sean Gulley and in many places preserves
+ the original assembly NAMES and comments to simplify matching Java assembly with its original.
+ The Java version was substantially redesigned to replace 1200 assembly instructions with a
+ much shorter run-time generator of the same code in memory.
+*/
+
+void MacroAssembler::sha256_AVX2_one_round_compute(
+ Register reg_old_h,
+ Register reg_a,
+ Register reg_b,
+ Register reg_c,
+ Register reg_d,
+ Register reg_e,
+ Register reg_f,
+ Register reg_g,
+ Register reg_h,
+ int iter) {
+ const Register& reg_y0 = r13;
+ const Register& reg_y1 = r14;
+ const Register& reg_y2 = r15;
+ const Register& reg_y3 = rcx;
+ const Register& reg_T1 = r12;
+ //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ if (iter%4 > 0) {
+ addl(reg_old_h, reg_y2); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
+ }
+ movl(reg_y2, reg_f); // reg_y2 = reg_f ; CH
+ rorxd(reg_y0, reg_e, 25); // reg_y0 = reg_e >> 25 ; S1A
+ rorxd(reg_y1, reg_e, 11); // reg_y1 = reg_e >> 11 ; S1B
+ xorl(reg_y2, reg_g); // reg_y2 = reg_f^reg_g ; CH
+
+ xorl(reg_y0, reg_y1); // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ; S1
+ rorxd(reg_y1, reg_e, 6); // reg_y1 = (reg_e >> 6) ; S1
+ andl(reg_y2, reg_e); // reg_y2 = (reg_f^reg_g)&reg_e ; CH
+
+ if (iter%4 > 0) {
+ addl(reg_old_h, reg_y3); // reg_h = t1 + S0 + MAJ ; --
+ }
+
+ xorl(reg_y0, reg_y1); // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
+ rorxd(reg_T1, reg_a, 13); // reg_T1 = reg_a >> 13 ; S0B
+ xorl(reg_y2, reg_g); // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH
+ rorxd(reg_y1, reg_a, 22); // reg_y1 = reg_a >> 22 ; S0A
+ movl(reg_y3, reg_a); // reg_y3 = reg_a ; MAJA
+
+ xorl(reg_y1, reg_T1); // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ; S0
+ rorxd(reg_T1, reg_a, 2); // reg_T1 = (reg_a >> 2) ; S0
+ addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
+ orl(reg_y3, reg_c); // reg_y3 = reg_a|reg_c ; MAJA
+
+ xorl(reg_y1, reg_T1); // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
+ movl(reg_T1, reg_a); // reg_T1 = reg_a ; MAJB
+ andl(reg_y3, reg_b); // reg_y3 = (reg_a|reg_c)&reg_b ; MAJA
+ andl(reg_T1, reg_c); // reg_T1 = reg_a&reg_c ; MAJB
+ addl(reg_y2, reg_y0); // reg_y2 = S1 + CH ; --
+
+
+ addl(reg_d, reg_h); // reg_d = k + w + reg_h + reg_d ; --
+ orl(reg_y3, reg_T1); // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
+ addl(reg_h, reg_y1); // reg_h = k + w + reg_h + S0 ; --
+
+ addl(reg_d, reg_y2); // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --
+
+
+ if (iter%4 == 3) {
+ addl(reg_h, reg_y2); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
+ addl(reg_h, reg_y3); // reg_h = t1 + S0 + MAJ ; --
+ }
+}
+
+void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
+ sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8, r9, r10, r11, start + 0);
+ sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8, r9, r10, start + 1);
+ sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8, r9, start + 2);
+ sha256_AVX2_one_round_compute(r9, r9, r10, r11, rax, rbx, rdi, rsi, r8, start + 3);
+}
+
+void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
+ sha256_AVX2_one_round_compute(r8, r8, r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
+ sha256_AVX2_one_round_compute(rsi, rsi, r8, r9, r10, r11, rax, rbx, rdi, start + 1);
+ sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8, r9, r10, r11, rax, rbx, start + 2);
+ sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8, r9, r10, r11, rax, start + 3);
+}
+
+void MacroAssembler::sha256_AVX2_one_round_and_sched(
+ XMMRegister xmm_0, /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
+ XMMRegister xmm_1, /* ymm5 */ /* full cycle is 16 iterations */
+ XMMRegister xmm_2, /* ymm6 */
+ XMMRegister xmm_3, /* ymm7 */
+ Register reg_a, /* == rax on 0 iteration, then rotate 8 registers right on each subsequent iteration */
+ Register reg_b, /* rbx */ /* full cycle is 8 iterations */
+ Register reg_c, /* rdi */
+ Register reg_d, /* rsi */
+ Register reg_e, /* r8 */
+ Register reg_f, /* r9d */
+ Register reg_g, /* r10d */
+ Register reg_h, /* r11d */
+ int iter)
+{
+ movl(rcx, reg_a); // rcx = reg_a ; MAJA
+ rorxd(r13, reg_e, 25); // r13 = reg_e >> 25 ; S1A
+ rorxd(r14, reg_e, 11); // r14 = reg_e >> 11 ; S1B
+ addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
+ orl(rcx, reg_c); // rcx = reg_a|reg_c ; MAJA
+
+ movl(r15, reg_f); // r15 = reg_f ; CH
+ rorxd(r12, reg_a, 13); // r12 = reg_a >> 13 ; S0B
+ xorl(r13, r14); // r13 = (reg_e>>25) ^ (reg_e>>11) ; S1
+ xorl(r15, reg_g); // r15 = reg_f^reg_g ; CH
+
+ rorxd(r14, reg_e, 6); // r14 = (reg_e >> 6) ; S1
+ andl(r15, reg_e); // r15 = (reg_f^reg_g)&reg_e ; CH
+
+ xorl(r13, r14); // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
+ rorxd(r14, reg_a, 22); // r14 = reg_a >> 22 ; S0A
+ addl(reg_d, reg_h); // reg_d = k + w + reg_h + reg_d ; --
+
+ andl(rcx, reg_b); // rcx = (reg_a|reg_c)&reg_b ; MAJA
+ xorl(r14, r12); // r14 = (reg_a>>22) ^ (reg_a>>13) ; S0
+
+ rorxd(r12, reg_a, 2); // r12 = (reg_a >> 2) ; S0
+ xorl(r15, reg_g); // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH
+
+ xorl(r14, r12); // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
+ movl(r12, reg_a); // r12 = reg_a ; MAJB
+ andl(r12, reg_c); // r12 = reg_a&reg_c ; MAJB
+ addl(r15, r13); // r15 = S1 + CH ; --
+
+ orl(rcx, r12); // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
+ addl(reg_h, r14); // reg_h = k + w + reg_h + S0 ; --
+ addl(reg_d, r15); // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --
+
+ addl(reg_h, r15); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
+ addl(reg_h, rcx); // reg_h = t1 + S0 + MAJ ; --
+
+ if (iter%4 == 0) {
+ vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit); // ymm0 = W[-7]
+ vpaddd(xmm0, xmm0, xmm_0, AVX_256bit); // ymm0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1
+ vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit); // ymm1 = W[-15]
+ vpsrld(xmm2, xmm1, 7, AVX_256bit);
+ vpslld(xmm3, xmm1, 32-7, AVX_256bit);
+ vpor(xmm3, xmm3, xmm2, AVX_256bit); // ymm3 = W[-15] ror 7
+ vpsrld(xmm2, xmm1, 18, AVX_256bit);
+ } else if (iter%4 == 1) {
+ vpsrld(xmm8, xmm1, 3, AVX_256bit); // ymm8 = W[-15] >> 3
+ vpslld(xmm1, xmm1, 32-18, AVX_256bit);
+ vpxor(xmm3, xmm3, xmm1, AVX_256bit);
+ vpxor(xmm3, xmm3, xmm2, AVX_256bit); // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
+ vpxor(xmm1, xmm3, xmm8, AVX_256bit); // ymm1 = s0
+ vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit); // 11111010b ; ymm2 = W[-2] {BBAA}
+ vpaddd(xmm0, xmm0, xmm1, AVX_256bit); // ymm0 = W[-16] + W[-7] + s0
+ vpsrld(xmm8, xmm2, 10, AVX_256bit); // ymm8 = W[-2] >> 10 {BBAA}
+ } else if (iter%4 == 2) {
+ vpsrlq(xmm3, xmm2, 19, AVX_256bit); // ymm3 = W[-2] ror 19 {xBxA}
+ vpsrlq(xmm2, xmm2, 17, AVX_256bit); // ymm2 = W[-2] ror 17 {xBxA}
+ vpxor(xmm2, xmm2, xmm3, AVX_256bit);
+ vpxor(xmm8, xmm8, xmm2, AVX_256bit); // ymm8 = s1 {xBxA}
+ vpshufb(xmm8, xmm8, xmm10, AVX_256bit); // ymm8 = s1 {00BA}
+ vpaddd(xmm0, xmm0, xmm8, AVX_256bit); // ymm0 = {..., ..., W[1], W[0]}
+ vpshufd(xmm2, xmm0, 0x50, AVX_256bit); // 01010000b ; ymm2 = W[-2] {DDCC}
+ } else if (iter%4 == 3) {
+ vpsrld(xmm11, xmm2, 10, AVX_256bit); // ymm11 = W[-2] >> 10 {DDCC}
+ vpsrlq(xmm3, xmm2, 19, AVX_256bit); // ymm3 = W[-2] ror 19 {xDxC}
+ vpsrlq(xmm2, xmm2, 17, AVX_256bit); // ymm2 = W[-2] ror 17 {xDxC}
+ vpxor(xmm2, xmm2, xmm3, AVX_256bit);
+ vpxor(xmm11, xmm11, xmm2, AVX_256bit); // ymm11 = s1 {xDxC}
+ vpshufb(xmm11, xmm11, xmm12, AVX_256bit); // ymm11 = s1 {DC00}
+ vpaddd(xmm_0, xmm11, xmm0, AVX_256bit); // xmm_0 = {W[3], W[2], W[1], W[0]}
+ }
+}
+
+void MacroAssembler::addm(int disp, Register r1, Register r2) {
+ addl(r2, Address(r1, disp));
+ movl(Address(r1, disp), r2);
+}
+
+void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+ XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+ Register buf, Register state, Register ofs, Register limit, Register rsp,
+ bool multi_block, XMMRegister shuf_mask) {
+
+ Label loop0, loop1, loop2, loop3,
+ last_block_enter, do_last_block, only_one_block, done_hash,
+ compute_size, compute_size_end,
+ compute_size1, compute_size_end1;
+
+ address K256_W = StubRoutines::x86::k256_W_addr();
+ address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
+ address pshuffle_byte_flip_mask_addr = 0;
+
+const XMMRegister& SHUF_00BA = xmm10; // ymm10: shuffle xBxA -> 00BA
+const XMMRegister& SHUF_DC00 = xmm12; // ymm12: shuffle xDxC -> DC00
+const XMMRegister& BYTE_FLIP_MASK = xmm13; // ymm13
+
+const XMMRegister& X_BYTE_FLIP_MASK = xmm13; // XMM version of BYTE_FLIP_MASK
+
+const Register& NUM_BLKS = r8; // 3rd arg
+const Register& CTX = rdx; // 2nd arg
+const Register& INP = rcx; // 1st arg
+
+const Register& c = rdi;
+const Register& d = rsi;
+const Register& e = r8; // clobbers NUM_BLKS
+const Register& y3 = rcx; // clobbers INP
+
+const Register& TBL = rbp;
+const Register& SRND = CTX; // SRND is same register as CTX
+
+const Register& a = rax;
+const Register& b = rbx;
+const Register& f = r9;
+const Register& g = r10;
+const Register& h = r11;
+
+const Register& T1 = r12;
+const Register& y0 = r13;
+const Register& y1 = r14;
+const Register& y2 = r15;
+
+
+enum {
+ _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
+#ifndef _WIN64
+ _XMM_SAVE_SIZE = 0,
+#else
+ _XMM_SAVE_SIZE = 8*16,
+#endif
+ _INP_END_SIZE = 8,
+ _INP_SIZE = 8,
+ _CTX_SIZE = 8,
+ _RSP_SIZE = 8,
+
+ _XFER = 0,
+ _XMM_SAVE = _XFER + _XFER_SIZE,
+ _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE,
+ _INP = _INP_END + _INP_END_SIZE,
+ _CTX = _INP + _INP_SIZE,
+ _RSP = _CTX + _CTX_SIZE,
+ STACK_SIZE = _RSP + _RSP_SIZE,
+};
+
+#ifndef _WIN64
+ push(rcx); // linux: this is limit, needed at the end
+ push(rdx); // linux: this is ofs
+#else
+ push(r8); // win64: this is ofs
+ push(r9); // win64: this is limit, we need them again at the very end
+#endif
+
+
+ push(rbx);
+#ifdef _WIN64
+ push(rsi);
+ push(rdi);
+#endif
+ push(rbp);
+ push(r12);
+ push(r13);
+ push(r14);
+ push(r15);
+
+ movq(rax, rsp);
+ subq(rsp, STACK_SIZE);
+ andq(rsp, -32);
+ movq(Address(rsp, _RSP), rax);
+
+#ifndef _WIN64
+ // copy linux params to win64 params, so that the rest of the code is the same for both
+ movq(r9, rcx);
+ movq(r8, rdx);
+ movq(rdx, rsi);
+ movq(rcx, rdi);
+#endif
+
+ // setting original assembly ABI
+ /** message to encrypt in INP */
+ lea(INP, Address(rcx, 0)); // rcx == message (buf) ;; linux: INP = buf = rdi
+ /** digest in CTX */
+ movq(CTX, rdx); // rdx = digest (state) ;; linux: CTX = state = rsi
+
+ /** NUM_BLKS is the length of the message, need to set it from ofs and limit */
+ if (multi_block) {
+
+ // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
+ // on entry r8 = ofs
+ // on exit r8 = NUM_BLKS
+
+ xorq(rax, rax);
+
+ bind(compute_size);
+ cmpptr(r8, r9); // assume the original ofs <= limit ;; linux: cmp rcx, rdx
+ jccb(Assembler::aboveEqual, compute_size_end);
+
addq(r8, 64); //;; linux: ofs = rdx + addq(rax, 64); + jmpb(compute_size); + + bind(compute_size_end); + movq(NUM_BLKS, rax); // NUM_BLK (r8) ;; linux: NUM_BLK = rdx + + cmpq(NUM_BLKS, 0); + jcc(Assembler::equal, done_hash); + + } else { + xorq(NUM_BLKS, NUM_BLKS); + addq(NUM_BLKS, 64); + }//if (!multi_block) + + lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block + movq(Address(rsp, _INP_END), NUM_BLKS); // + + cmpptr(INP, NUM_BLKS); //cmp INP, NUM_BLKS + jcc(Assembler::equal, only_one_block); //je only_one_block + + // load initial digest + movl(a, Address(CTX, 4*0)); + movl(b, Address(CTX, 4*1)); + movl(c, Address(CTX, 4*2)); + movl(d, Address(CTX, 4*3)); + movl(e, Address(CTX, 4*4)); + movl(f, Address(CTX, 4*5)); + movl(g, Address(CTX, 4*6)); + movl(h, Address(CTX, 4*7)); + + pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask; + vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip] + vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip] + vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip] + + movq(Address(rsp, _CTX), CTX); // store + +bind(loop0); + lea(TBL, ExternalAddress(K256_W)); + + // assume buffers not aligned + + // Load first 16 dwords from two blocks + vmovdqu(xmm0, Address(INP, 0*32)); + vmovdqu(xmm1, Address(INP, 1*32)); + vmovdqu(xmm2, Address(INP, 2*32)); + vmovdqu(xmm3, Address(INP, 3*32)); + + // byte swap data + vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit); + vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit); + vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit); + vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit); + + // transpose data into high/low halves + vperm2i128(xmm4, xmm0, xmm2, 0x20); + vperm2i128(xmm5, xmm0, xmm2, 0x31); + vperm2i128(xmm6, xmm1, xmm3, 0x20); + vperm2i128(xmm7, xmm1, xmm3, 0x31); + +bind(last_block_enter); + addq(INP, 64); + movq(Address(rsp, _INP), INP); + + //;; schedule 48 input dwords, by doing 3 rounds of 12 each + xorq(SRND, SRND); + +align(16); +bind(loop1); + vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit); + vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9); + sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8, r9, r10, r11, 0); + sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8, r9, r10, 1); + sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8, r9, 2); + sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9, r10, r11, rax, rbx, rdi, rsi, r8, 3); + + vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit); + vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9); + sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8, r9, r10, r11, rax, rbx, rdi, rsi, 8+0); + sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8, r9, r10, r11, rax, rbx, rdi, 8+1); + sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8, r9, r10, r11, rax, rbx, 8+2); + sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8, r9, r10, r11, rax, 8+3); + + vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit); + vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9); + sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8, r9, r10, r11, 16+0); + sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, 
rdi, rsi, r8, r9, r10, 16+1); + sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8, r9, 16+2); + sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9, r10, r11, rax, rbx, rdi, rsi, r8, 16+3); + + vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit); + vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9); + + sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8, r9, r10, r11, rax, rbx, rdi, rsi, 24+0); + sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8, r9, r10, r11, rax, rbx, rdi, 24+1); + sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8, r9, r10, r11, rax, rbx, 24+2); + sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8, r9, r10, r11, rax, 24+3); + + addq(SRND, 4*32); + cmpq(SRND, 3 * 4*32); + jcc(Assembler::below, loop1); + +bind(loop2); + // Do last 16 rounds with no scheduling + vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit); + vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9); + sha256_AVX2_four_rounds_compute_first(0); + + vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit); + vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9); + sha256_AVX2_four_rounds_compute_last(0 + 8); + + addq(SRND, 2*32); + + vmovdqu(xmm4, xmm6); + vmovdqu(xmm5, xmm7); + + cmpq(SRND, 4 * 4*32); + jcc(Assembler::below, loop2); + + movq(CTX, Address(rsp, _CTX)); + movq(INP, Address(rsp, _INP)); + + addm(4*0, CTX, a); + addm(4*1, CTX, b); + addm(4*2, CTX, c); + addm(4*3, CTX, d); + addm(4*4, CTX, e); + addm(4*5, CTX, f); + addm(4*6, CTX, g); + addm(4*7, CTX, h); + + cmpq(INP, Address(rsp, _INP_END)); + jcc(Assembler::above, done_hash); + + //Do second block using previously scheduled results + xorq(SRND, SRND); +align(16); +bind(loop3); + sha256_AVX2_four_rounds_compute_first(4); + sha256_AVX2_four_rounds_compute_last(4+8); + + addq(SRND, 2*32); + cmpq(SRND, 4 * 4*32); + jcc(Assembler::below, loop3); + + movq(CTX, Address(rsp, _CTX)); + movq(INP, Address(rsp, _INP)); + addq(INP, 64); + + addm(4*0, CTX, a); + addm(4*1, CTX, b); + addm(4*2, CTX, c); + addm(4*3, CTX, d); + addm(4*4, CTX, e); + addm(4*5, CTX, f); + addm(4*6, CTX, g); + addm(4*7, CTX, h); + + cmpq(INP, Address(rsp, _INP_END)); + jcc(Assembler::below, loop0); + jccb(Assembler::above, done_hash); + +bind(do_last_block); + lea(TBL, ExternalAddress(K256_W)); + + movdqu(xmm4, Address(INP, 0*16)); + movdqu(xmm5, Address(INP, 1*16)); + movdqu(xmm6, Address(INP, 2*16)); + movdqu(xmm7, Address(INP, 3*16)); + + vpshufb(xmm4, xmm4, xmm13, AVX_128bit); + vpshufb(xmm5, xmm5, xmm13, AVX_128bit); + vpshufb(xmm6, xmm6, xmm13, AVX_128bit); + vpshufb(xmm7, xmm7, xmm13, AVX_128bit); + + jmp(last_block_enter); + +bind(only_one_block); + + // load initial digest ;; table should be preloaded with following values + movl(a, Address(CTX, 4*0)); // 0x6a09e667 + movl(b, Address(CTX, 4*1)); // 0xbb67ae85 + movl(c, Address(CTX, 4*2)); // 0x3c6ef372 + movl(d, Address(CTX, 4*3)); // 0xa54ff53a + movl(e, Address(CTX, 4*4)); // 0x510e527f + movl(f, Address(CTX, 4*5)); // 0x9b05688c + movl(g, Address(CTX, 4*6)); // 0x1f83d9ab + movl(h, Address(CTX, 4*7)); // 0x5be0cd19 + + + pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask; + vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip] + vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip] + vmovdqu(SHUF_DC00, 
ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip] + + movq(Address(rsp, _CTX), CTX); + jmpb(do_last_block); + +bind(done_hash); + + movq(rsp, Address(rsp, _RSP)); + + pop(r15); + pop(r14); + pop(r13); + pop(r12); + pop(rbp); +#ifdef _WIN64 + pop(rdi); + pop(rsi); +#endif + pop(rbx); + +#ifdef _WIN64 + pop(r9); + pop(r8); +#else + pop(rdx); + pop(rcx); +#endif + + if (multi_block) { +#ifdef _WIN64 +const Register& limit_end = r9; +const Register& ofs_end = r8; +#else +const Register& limit_end = rcx; +const Register& ofs_end = rdx; +#endif + movq(rax, ofs_end); + +bind(compute_size1); + cmpptr(rax, limit_end); // assume the original ofs <= limit + jccb(Assembler::aboveEqual, compute_size_end1); + addq(rax, 64); + jmpb(compute_size1); + +bind(compute_size_end1); + } +} +#endif //#ifdef _LP64 + // ofs and limit are used for multi-block byte array. // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0, --- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2016-04-15 10:37:39.895757800 -0700 +++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2016-04-15 10:37:39.742227100 -0700 @@ -3772,12 +3772,29 @@ address start = __ pc(); __ emit_data64(0x0405060700010203, relocInfo::none); __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); + + if (VM_Version::supports_avx2()) { + __ emit_data64(0x0405060700010203, relocInfo::none); // second copy + __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); + // _SHUF_00BA + __ emit_data64(0x0b0a090803020100, relocInfo::none); + __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); + __ emit_data64(0x0b0a090803020100, relocInfo::none); + __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); + // _SHUF_DC00 + __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); + __ emit_data64(0x0b0a090803020100, relocInfo::none); + __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); + __ emit_data64(0x0b0a090803020100, relocInfo::none); + } + return start; } // ofs and limit are use for multi-block byte array. 
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) address generate_sha256_implCompress(bool multi_block, const char *name) { + assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), ""); __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -3806,16 +3823,37 @@ __ movdqu(Address(rsp, 0), xmm6); __ movdqu(Address(rsp, 2 * wordSize), xmm7); __ movdqu(Address(rsp, 4 * wordSize), xmm8); + + if (!VM_Version::supports_sha() && VM_Version::supports_avx2()) { + __ subptr(rsp, 10 * wordSize); + __ movdqu(Address(rsp, 0), xmm9); + __ movdqu(Address(rsp, 2 * wordSize), xmm10); + __ movdqu(Address(rsp, 4 * wordSize), xmm11); + __ movdqu(Address(rsp, 6 * wordSize), xmm12); + __ movdqu(Address(rsp, 8 * wordSize), xmm13); + } #endif __ subptr(rsp, 4 * wordSize); - __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, - buf, state, ofs, limit, rsp, multi_block, shuf_mask); - + if (VM_Version::supports_sha()) { + __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, + buf, state, ofs, limit, rsp, multi_block, shuf_mask); + } else if (VM_Version::supports_avx2()) { + __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, + buf, state, ofs, limit, rsp, multi_block, shuf_mask); + } __ addptr(rsp, 4 * wordSize); #ifdef _WIN64 // restore xmm regs belonging to calling function + if (!VM_Version::supports_sha() && VM_Version::supports_avx2()) { + __ movdqu(xmm9, Address(rsp, 0)); + __ movdqu(xmm10, Address(rsp, 2 * wordSize)); + __ movdqu(xmm11, Address(rsp, 4 * wordSize)); + __ movdqu(xmm12, Address(rsp, 6 * wordSize)); + __ movdqu(xmm13, Address(rsp, 8 * wordSize)); + __ addptr(rsp, 10 * wordSize); + } __ movdqu(xmm6, Address(rsp, 0)); __ movdqu(xmm7, Address(rsp, 2 * wordSize)); __ movdqu(xmm8, Address(rsp, 4 * wordSize)); @@ -5218,6 +5256,7 @@ } if (UseSHA256Intrinsics) { StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256; + StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W; StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask(); StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); --- old/src/cpu/x86/vm/stubRoutines_x86.cpp 2016-04-15 10:37:41.491076800 -0700 +++ new/src/cpu/x86/vm/stubRoutines_x86.cpp 2016-04-15 10:37:41.329044400 -0700 @@ -46,6 +46,9 @@ address StubRoutines::x86::_upper_word_mask_addr = NULL; address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL; address StubRoutines::x86::_k256_adr = NULL; +#ifdef _LP64 +address StubRoutines::x86::_k256_W_adr = NULL; +#endif address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL; //tables common for sin and cos @@ -289,3 +292,57 @@ 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL }; + +#ifdef _LP64 +// used in MacroAssembler::sha256_AVX2 +ALIGNED_(64) juint StubRoutines::x86::_k256_W[] = +{ + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, + + 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, + 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, + + 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, + 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, + + 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 
0xc19bf174UL, + 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, + + 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, + 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, + + 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, + 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, + + 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, + 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, + + 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, + 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, + + 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, + 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, + + 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, + + 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, + 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, + + 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, + 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, + + 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, + 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, + + 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL, + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL +}; +#endif --- old/src/cpu/x86/vm/stubRoutines_x86.hpp 2016-04-15 10:37:42.905859700 -0700 +++ new/src/cpu/x86/vm/stubRoutines_x86.hpp 2016-04-15 10:37:42.730324600 -0700 @@ -54,6 +54,10 @@ //k256 table for sha256 static juint _k256[]; static address _k256_adr; +#ifdef _LP64 + static juint _k256_W[]; + static address _k256_W_adr; +#endif // byte flip mask for sha256 static address _pshuffle_byte_flip_mask_addr; @@ -109,6 +113,9 @@ static address upper_word_mask_addr() { return _upper_word_mask_addr; } static address shuffle_byte_flip_mask_addr() { return _shuffle_byte_flip_mask_addr; } static address k256_addr() { return _k256_adr; } +#ifdef _LP64 + static address k256_W_addr() { return _k256_W_adr; } +#endif static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; } static void generate_CRC32C_table(bool is_pclmulqdq_supported); static address _ONEHALF_addr() { return _ONEHALF_adr; } --- old/src/cpu/x86/vm/vm_version_x86.cpp 2016-04-15 10:37:44.273133100 -0700 +++ new/src/cpu/x86/vm/vm_version_x86.cpp 2016-04-15 10:37:44.108100100 -0700 @@ -731,7 +731,7 @@ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); } - if (supports_sha()) { + if (supports_sha() LP64_ONLY(|| supports_avx2())) { if (FLAG_IS_DEFAULT(UseSHA)) { UseSHA = true; } @@ -740,7 +740,7 @@ FLAG_SET_DEFAULT(UseSHA, false); } - if (UseSHA) { + if (supports_sha() && UseSHA) { if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) { FLAG_SET_DEFAULT(UseSHA1Intrinsics, true); }
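
Reviewer note, not part of the patch: the new sha256_AVX2 stub is selected in generate_sha256_implCompress() when supports_sha() is false and supports_avx2() is true, and it is reached through the existing com.sun.security.provider.DigestBase intrinsic path (implCompress / implCompressMultiBlock). A rough way to exercise it from plain Java on such a CPU, assuming a JDK built with this change (the class name and sizes below are illustrative only):

import java.security.MessageDigest;
import java.util.Arrays;

// Run with: java -XX:+UseSHA -XX:+UseSHA256Intrinsics Sha256AVX2Smoke
public class Sha256AVX2Smoke {
    public static void main(String[] args) throws Exception {
        byte[] data = new byte[1 << 20];          // many 64-byte blocks, so the multi-block stub is exercised
        for (int i = 0; i < data.length; i++) {
            data[i] = (byte) i;
        }
        MessageDigest md = MessageDigest.getInstance("SHA-256");
        byte[] digest = null;
        for (int i = 0; i < 1000; i++) {          // repeat so the compiled, intrinsified path is reached
            digest = md.digest(data);
        }
        System.out.println(Arrays.toString(digest));
    }
}

The printed digest should be identical to a run with -XX:-UseSHA256Intrinsics, which falls back to the pure-Java implementation.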