--- old/src/cpu/x86/vm/assembler_x86.cpp 2016-02-26 14:22:29.275261700 -0800 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2016-02-26 14:22:28.908225000 -0800 @@ -777,6 +777,7 @@ case 0x6E: // movd case 0x7E: // movd case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush + case 0xFE: // paddd debug_only(has_disp32 = true); break; @@ -926,6 +927,7 @@ ip++; // skip P2, move to opcode // To find the end of instruction (which == end_pc_operand). switch (0xFF & *ip) { + case 0x22: // pinsrd r, r/a, #8 case 0x61: // pcmpestri r, r/a, #8 case 0x70: // pshufd r, r/a, #8 case 0x73: // psrldq r, #8 @@ -3953,6 +3955,83 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::palignr(XMMRegister dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_ssse3(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0x0F); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8); +} + +void Assembler::pblendw(XMMRegister dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0x0E); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8); +} + +void Assembler::sha1rnds4(XMMRegister dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sha(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0xCC); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8((unsigned char)imm8); +} + +void Assembler::sha1nexte(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sha(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xC8); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::sha1msg1(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sha(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xC9); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::sha1msg2(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sha(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xCA); + emit_int8((unsigned char)(0xC0 | encode)); +} + +// xmm0 is implicit additional source to this instruction. +void Assembler::sha256rnds2(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sha(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xCB); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::sha256msg1(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sha(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xCC); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::sha256msg2(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sha(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xCD); + emit_int8((unsigned char)(0xC0 | encode)); +} + + void Assembler::shll(Register dst, int imm8) { assert(isShiftCount(imm8), "illegal shift count"); int encode = prefix_and_encode(dst->encoding()); @@ -4931,6 +5010,15 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::paddd(XMMRegister dst, Address src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionMark im(this); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xFE); + emit_operand(dst, src); +} + void Assembler::paddq(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); --- old/src/cpu/x86/vm/assembler_x86.hpp 2016-02-26 14:22:31.788513000 -0800 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2016-02-26 14:22:31.446478800 -0800 @@ -1672,6 +1672,18 @@ void setb(Condition cc, Register dst); + void palignr(XMMRegister dst, XMMRegister src, int imm8); + void pblendw(XMMRegister dst, XMMRegister src, int imm8); + + void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8); + void sha1nexte(XMMRegister dst, XMMRegister src); + void sha1msg1(XMMRegister dst, XMMRegister src); + void sha1msg2(XMMRegister dst, XMMRegister src); + // xmm0 is implicit additional source to the following instruction. + void sha256rnds2(XMMRegister dst, XMMRegister src); + void sha256msg1(XMMRegister dst, XMMRegister src); + void sha256msg2(XMMRegister dst, XMMRegister src); + void shldl(Register dst, Register src); void shldl(Register dst, Register src, int8_t imm8); @@ -1868,6 +1880,7 @@ void paddb(XMMRegister dst, XMMRegister src); void paddw(XMMRegister dst, XMMRegister src); void paddd(XMMRegister dst, XMMRegister src); + void paddd(XMMRegister dst, Address src); void paddq(XMMRegister dst, XMMRegister src); void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); --- old/src/cpu/x86/vm/macroAssembler_x86.hpp 2016-02-26 14:22:34.021736300 -0800 +++ new/src/cpu/x86/vm/macroAssembler_x86.hpp 2016-02-26 14:22:33.678702000 -0800 @@ -903,6 +903,16 @@ void ldmxcsr(Address src) { Assembler::ldmxcsr(src); } void ldmxcsr(AddressLiteral src); + void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0, + XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask, + Register buf, Register state, Register ofs, Register limit, Register rsp, + bool multi_block); + + void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, + XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, + Register buf, Register state, Register ofs, Register limit, Register rsp, + bool multi_block LP64_ONLY(COMMA XMMRegister shuf_mask)); + void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx, Register rdx, Register tmp); --- old/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2016-02-26 14:22:36.185952700 -0800 +++ new/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2016-02-26 14:22:35.850919200 -0800 @@ -3068,6 +3068,136 @@ return start; } + address generate_upper_word_mask() { + __ align(64); + StubCodeMark mark(this, "StubRoutines", "upper_word_mask"); + address start = __ pc(); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0xFFFFFFFF, relocInfo::none, 0); + return start; + } + + address generate_shuffle_byte_flip_mask() { + __ align(64); + StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask"); + address start = __ pc(); + __ emit_data(0x0c0d0e0f, relocInfo::none, 0); + __ emit_data(0x08090a0b, relocInfo::none, 0); + __ emit_data(0x04050607, relocInfo::none, 0); + __ emit_data(0x00010203, relocInfo::none, 0); + return start; + } + + // ofs and limit are use for multi-block byte array. + // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) + address generate_sha1_implCompress(bool multi_block, const char *name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + Register buf = rax; + Register state = rdx; + Register ofs = rcx; + Register limit = rdi; + + const Address buf_param(rbp, 8 + 0); + const Address state_param(rbp, 8 + 4); + const Address ofs_param(rbp, 8 + 8); + const Address limit_param(rbp, 8 + 12); + + const XMMRegister abcd = xmm0; + const XMMRegister e0 = xmm1; + const XMMRegister e1 = xmm2; + const XMMRegister msg0 = xmm3; + + const XMMRegister msg1 = xmm4; + const XMMRegister msg2 = xmm5; + const XMMRegister msg3 = xmm6; + const XMMRegister shuf_mask = xmm7; + + __ enter(); + __ subptr(rsp, 8 * wordSize); + if (multi_block) { + __ push(limit); + } + __ movptr(buf, buf_param); + __ movptr(state, state_param); + if (multi_block) { + __ movptr(ofs, ofs_param); + __ movptr(limit, limit_param); + } + + __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask, + buf, state, ofs, limit, rsp, multi_block); + + if (multi_block) { + __ pop(limit); + } + __ addptr(rsp, 8 * wordSize); + __ leave(); + __ ret(0); + return start; + } + + address generate_pshuffle_byte_flip_mask() { + __ align(64); + StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask"); + address start = __ pc(); + __ emit_data(0x00010203, relocInfo::none, 0); + __ emit_data(0x04050607, relocInfo::none, 0); + __ emit_data(0x08090a0b, relocInfo::none, 0); + __ emit_data(0x0c0d0e0f, relocInfo::none, 0); + return start; + } + + // ofs and limit are use for multi-block byte array. + // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) + address generate_sha256_implCompress(bool multi_block, const char *name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + Register buf = rbx; + Register state = rsi; + Register ofs = rdx; + Register limit = rcx; + + const Address buf_param(rbp, 8 + 0); + const Address state_param(rbp, 8 + 4); + const Address ofs_param(rbp, 8 + 8); + const Address limit_param(rbp, 8 + 12); + + const XMMRegister msg = xmm0; + const XMMRegister state0 = xmm1; + const XMMRegister state1 = xmm2; + const XMMRegister msgtmp0 = xmm3; + + const XMMRegister msgtmp1 = xmm4; + const XMMRegister msgtmp2 = xmm5; + const XMMRegister msgtmp3 = xmm6; + const XMMRegister msgtmp4 = xmm7; + + __ enter(); + __ subptr(rsp, 8 * wordSize); + handleSOERegisters(true /*saving*/); + __ movptr(buf, buf_param); + __ movptr(state, state_param); + if (multi_block) { + __ movptr(ofs, ofs_param); + __ movptr(limit, limit_param); + } + + __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, + buf, state, ofs, limit, rsp, multi_block); + + handleSOERegisters(false); + __ addptr(rsp, 8 * wordSize); + __ leave(); + __ ret(0); + return start; + } // byte swap x86 long address generate_ghash_long_swap_mask() { @@ -3772,6 +3902,19 @@ StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); } + if (UseSHA1Intrinsics) { + StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask(); + StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask(); + StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); + StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); + } + if (UseSHA256Intrinsics) { + StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256; + StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask(); + StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); + StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); + } + // Generate GHASH intrinsics code if (UseGHASHIntrinsics) { StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); --- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2016-02-26 14:22:38.466180700 -0800 +++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2016-02-26 14:22:38.122146300 -0800 @@ -3695,6 +3695,133 @@ return start; } + address generate_upper_word_mask() { + __ align(64); + StubCodeMark mark(this, "StubRoutines", "upper_word_mask"); + address start = __ pc(); + __ emit_data64(0x0000000000000000, relocInfo::none); + __ emit_data64(0xFFFFFFFF00000000, relocInfo::none); + return start; + } + + address generate_shuffle_byte_flip_mask() { + __ align(64); + StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask"); + address start = __ pc(); + __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); + __ emit_data64(0x0001020304050607, relocInfo::none); + return start; + } + + // ofs and limit are use for multi-block byte array. + // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) + address generate_sha1_implCompress(bool multi_block, const char *name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + Register buf = c_rarg0; + Register state = c_rarg1; + Register ofs = c_rarg2; + Register limit = c_rarg3; + + const XMMRegister abcd = xmm0; + const XMMRegister e0 = xmm1; + const XMMRegister e1 = xmm2; + const XMMRegister msg0 = xmm3; + + const XMMRegister msg1 = xmm4; + const XMMRegister msg2 = xmm5; + const XMMRegister msg3 = xmm6; + const XMMRegister shuf_mask = xmm7; + + __ enter(); + +#ifdef _WIN64 + // save the xmm registers which must be preserved 6-7 + __ subptr(rsp, 4 * wordSize); + __ movdqu(Address(rsp, 0), xmm6); + __ movdqu(Address(rsp, 2 * wordSize), xmm7); +#endif + + __ subptr(rsp, 4 * wordSize); + + __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask, + buf, state, ofs, limit, rsp, multi_block); + + __ addptr(rsp, 4 * wordSize); +#ifdef _WIN64 + // restore xmm regs belonging to calling function + __ movdqu(xmm6, Address(rsp, 0)); + __ movdqu(xmm7, Address(rsp, 2 * wordSize)); + __ addptr(rsp, 4 * wordSize); +#endif + + __ leave(); + __ ret(0); + return start; + } + + address generate_pshuffle_byte_flip_mask() { + __ align(64); + StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask"); + address start = __ pc(); + __ emit_data64(0x0405060700010203, relocInfo::none); + __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); + return start; + } + +// ofs and limit are use for multi-block byte array. +// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) + address generate_sha256_implCompress(bool multi_block, const char *name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + Register buf = c_rarg0; + Register state = c_rarg1; + Register ofs = c_rarg2; + Register limit = c_rarg3; + + const XMMRegister msg = xmm0; + const XMMRegister state0 = xmm1; + const XMMRegister state1 = xmm2; + const XMMRegister msgtmp0 = xmm3; + + const XMMRegister msgtmp1 = xmm4; + const XMMRegister msgtmp2 = xmm5; + const XMMRegister msgtmp3 = xmm6; + const XMMRegister msgtmp4 = xmm7; + + const XMMRegister shuf_mask = xmm8; + + __ enter(); +#ifdef _WIN64 + // save the xmm registers which must be preserved 6-7 + __ subptr(rsp, 6 * wordSize); + __ movdqu(Address(rsp, 0), xmm6); + __ movdqu(Address(rsp, 2 * wordSize), xmm7); + __ movdqu(Address(rsp, 4 * wordSize), xmm8); +#endif + + __ subptr(rsp, 4 * wordSize); + + __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, + buf, state, ofs, limit, rsp, multi_block, shuf_mask); + + __ addptr(rsp, 4 * wordSize); +#ifdef _WIN64 + // restore xmm regs belonging to calling function + __ movdqu(xmm6, Address(rsp, 0)); + __ movdqu(xmm7, Address(rsp, 2 * wordSize)); + __ movdqu(xmm8, Address(rsp, 4 * wordSize)); + __ addptr(rsp, 6 * wordSize); +#endif + __ leave(); + __ ret(0); + return start; + } + // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time // to hide instruction latency // @@ -4974,6 +5101,19 @@ StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); } + if (UseSHA1Intrinsics) { + StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask(); + StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask(); + StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); + StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); + } + if (UseSHA256Intrinsics) { + StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256; + StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask(); + StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); + StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); + } + // Generate GHASH intrinsics code if (UseGHASHIntrinsics) { StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); --- old/src/cpu/x86/vm/stubRoutines_x86.cpp 2016-02-26 14:22:40.794413500 -0800 +++ new/src/cpu/x86/vm/stubRoutines_x86.cpp 2016-02-26 14:22:40.453379400 -0800 @@ -29,6 +29,12 @@ #include "runtime/thread.inline.hpp" #include "crc32c.h" +#ifdef _MSC_VER +#define ALIGNED_(x) __declspec(align(x)) +#else +#define ALIGNED_(x) __attribute__ ((aligned(x))) +#endif + // Implementation of the platform-specific part of StubRoutines - for // a description of how to extend it, see the stubRoutines.hpp file. @@ -37,6 +43,10 @@ address StubRoutines::x86::_counter_shuffle_mask_addr = NULL; address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL; address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL; +address StubRoutines::x86::_upper_word_mask_addr = NULL; +address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL; +address StubRoutines::x86::_k256_adr = NULL; +address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL; uint64_t StubRoutines::x86::_crc_by128_masks[] = { @@ -236,3 +246,23 @@ _crc32c_table = (juint*)pclmulqdq_table; } } + +ALIGNED_(64) juint StubRoutines::x86::_k256[] = +{ + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, + 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, + 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, + 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, + 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, + 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, + 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, + 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, + 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, + 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, + 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, + 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, + 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, + 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL +}; --- old/src/cpu/x86/vm/stubRoutines_x86.hpp 2016-02-26 14:23:10.553389100 -0800 +++ new/src/cpu/x86/vm/stubRoutines_x86.hpp 2016-02-26 14:23:10.220355800 -0800 @@ -46,6 +46,17 @@ static address _ghash_long_swap_mask_addr; static address _ghash_byte_swap_mask_addr; + // upper word mask for sha1 + static address _upper_word_mask_addr; + // byte flip mask for sha1 + static address _shuffle_byte_flip_mask_addr; + + //k256 table for sha256 + static juint _k256[]; + static address _k256_adr; + // byte flip mask for sha256 + static address _pshuffle_byte_flip_mask_addr; + public: static address verify_mxcsr_entry() { return _verify_mxcsr_entry; } static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; } @@ -53,5 +64,9 @@ static address crc_by128_masks_addr() { return (address)_crc_by128_masks; } static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; } static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; } + static address upper_word_mask_addr() { return _upper_word_mask_addr; } + static address shuffle_byte_flip_mask_addr() { return _shuffle_byte_flip_mask_addr; } + static address k256_addr() { return _k256_adr; } + static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; } static void generate_CRC32C_table(bool is_pclmulqdq_supported); #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP --- old/src/cpu/x86/vm/vmStructs_x86.hpp 2016-02-26 14:23:12.618595600 -0800 +++ new/src/cpu/x86/vm/vmStructs_x86.hpp 2016-02-26 14:23:12.279561700 -0800 @@ -68,10 +68,11 @@ declare_constant(VM_Version::CPU_AVX512DQ) \ declare_constant(VM_Version::CPU_AVX512PF) \ declare_constant(VM_Version::CPU_AVX512ER) \ - declare_constant(VM_Version::CPU_AVX512CD) \ - declare_constant(VM_Version::CPU_AVX512BW) + declare_constant(VM_Version::CPU_AVX512CD) #define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \ - declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) + declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \ + declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \ + declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) #endif // CPU_X86_VM_VMSTRUCTS_X86_HPP --- old/src/cpu/x86/vm/vm_version_x86.cpp 2016-02-26 14:23:14.761809900 -0800 +++ new/src/cpu/x86/vm/vm_version_x86.cpp 2016-02-26 14:23:14.410774800 -0800 @@ -577,7 +577,7 @@ } char buf[256]; - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", cores_per_cpu(), threads_per_core(), cpu_family(), _model, _stepping, (supports_cmov() ? ", cmov" : ""), @@ -608,7 +608,8 @@ (supports_bmi1() ? ", bmi1" : ""), (supports_bmi2() ? ", bmi2" : ""), (supports_adx() ? ", adx" : ""), - (supports_evex() ? ", evex" : "")); + (supports_evex() ? ", evex" : ""), + (supports_sha() ? ", sha" : "")); _features_string = os::strdup(buf); // UseSSE is set to the smaller of what hardware supports and what @@ -730,17 +731,31 @@ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); } - if (UseSHA) { - warning("SHA instructions are not available on this CPU"); + if (supports_sha()) { + if (FLAG_IS_DEFAULT(UseSHA)) { + UseSHA = true; + } + } + else if (UseSHA) { + if (!FLAG_IS_DEFAULT(UseSHA)) + warning("SHA instructions are not available on this CPU"); FLAG_SET_DEFAULT(UseSHA, false); } - if (UseSHA1Intrinsics) { + if (UseSHA && supports_sha()) { + if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) { + FLAG_SET_DEFAULT(UseSHA1Intrinsics, true); + } + } else if (UseSHA1Intrinsics) { warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU."); FLAG_SET_DEFAULT(UseSHA1Intrinsics, false); } - if (UseSHA256Intrinsics) { + if (UseSHA && supports_sha()) { + if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) { + FLAG_SET_DEFAULT(UseSHA256Intrinsics, true); + } + } else if (UseSHA256Intrinsics) { warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU."); FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); } --- old/src/cpu/x86/vm/vm_version_x86.hpp 2016-02-26 14:23:17.090042700 -0800 +++ new/src/cpu/x86/vm/vm_version_x86.hpp 2016-02-26 14:23:16.731006800 -0800 @@ -221,7 +221,7 @@ avx512pf : 1, avx512er : 1, avx512cd : 1, - : 1, + sha : 1, avx512bw : 1, avx512vl : 1; } bits; @@ -282,11 +282,13 @@ CPU_AVX512DQ = (1 << 27), CPU_AVX512PF = (1 << 28), CPU_AVX512ER = (1 << 29), - CPU_AVX512CD = (1 << 30), - CPU_AVX512BW = (1 << 31) + CPU_AVX512CD = (1 << 30) + // Keeping sign bit 31 unassigned. }; -#define CPU_AVX512VL UCONST64(0x100000000) // EVEX instructions with smaller vector length : enums are limited to 32bit +#define CPU_AVX512BW ((uint64_t)UCONST64(0x100000000)) // enums are limited to 31 bit +#define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length +#define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions enum Extended_Family { // AMD @@ -516,6 +518,8 @@ result |= CPU_ADX; if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0) result |= CPU_BMI2; + if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0) + result |= CPU_SHA; if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0) result |= CPU_LZCNT; // for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw @@ -721,6 +725,7 @@ static bool supports_avx512nobw() { return (supports_evex() && !supports_avx512bw()); } static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); } static bool supports_avxonly() { return ((supports_avx2() || supports_avx()) && !supports_evex()); } + static bool supports_sha() { return (_features & CPU_SHA) != 0; } // Intel features static bool is_intel_family_core() { return is_intel() && extended_cpu_family() == CPU_FAMILY_INTEL_CORE; } --- old/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java 2016-02-26 14:23:19.244258100 -0800 +++ new/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java 2016-02-26 14:23:18.894223100 -0800 @@ -202,7 +202,8 @@ AVX512ER, AVX512CD, AVX512BW, - AVX512VL + AVX512VL, + SHA } private final EnumSet features; --- old/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java 2016-02-26 14:23:21.372470900 -0800 +++ new/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java 2016-02-26 14:23:21.021435800 -0800 @@ -122,6 +122,9 @@ if ((config.vmVersionFeatures & config.amd64AVX512VL) != 0) { features.add(AMD64.CPUFeature.AVX512VL); } + if ((config.vmVersionFeatures & config.amd64SHA) != 0) { + features.add(AMD64.CPUFeature.SHA); + } return features; } --- old/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotVMConfig.java 2016-02-26 14:23:23.580691700 -0800 +++ new/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotVMConfig.java 2016-02-26 14:23:23.224656100 -0800 @@ -944,6 +944,7 @@ @HotSpotVMConstant(name = "VM_Version::CPU_AVX512CD", archs = {"amd64"}) @Stable public long amd64AVX512CD; @HotSpotVMConstant(name = "VM_Version::CPU_AVX512BW", archs = {"amd64"}) @Stable public long amd64AVX512BW; @HotSpotVMConstant(name = "VM_Version::CPU_AVX512VL", archs = {"amd64"}) @Stable public long amd64AVX512VL; + @HotSpotVMConstant(name = "VM_Version::CPU_SHA", archs = {"amd64"}) @Stable public long amd64SHA; // SPARC specific values @HotSpotVMConstant(name = "VM_Version::vis3_instructions_m", archs = {"sparc"}) @Stable public int sparcVis3Instructions; --- old/src/share/vm/jvmci/vmStructs_jvmci.cpp 2016-02-26 14:23:25.840917700 -0800 +++ new/src/share/vm/jvmci/vmStructs_jvmci.cpp 2016-02-26 14:23:25.477881400 -0800 @@ -626,11 +626,12 @@ declare_constant(VM_Version::CPU_AVX512DQ) \ declare_constant(VM_Version::CPU_AVX512PF) \ declare_constant(VM_Version::CPU_AVX512ER) \ - declare_constant(VM_Version::CPU_AVX512CD) \ - declare_constant(VM_Version::CPU_AVX512BW) + declare_constant(VM_Version::CPU_AVX512CD) #define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \ - declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) + declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \ + declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \ + declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) #endif // TARGET_ARCH_x86 --- old/src/share/vm/runtime/globals.hpp 2016-02-26 14:23:28.094143000 -0800 +++ new/src/share/vm/runtime/globals.hpp 2016-02-26 14:23:27.733106900 -0800 @@ -725,7 +725,7 @@ \ product(bool, UseSHA, false, \ "Control whether SHA instructions can be used " \ - "on SPARC and on ARM") \ + "on SPARC, on ARM and on x86") \ \ product(bool, UseGHASHIntrinsics, false, \ "Use intrinsics for GHASH versions of crypto") \ --- /dev/null 2016-02-26 14:23:30.000000000 -0800 +++ new/src/cpu/x86/vm/macroAssembler_intel_x86.cpp 2016-02-26 14:23:30.083341900 -0800 @@ -0,0 +1,487 @@ +/* +* Copyright (c) 2016, Intel Corporation. +* +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. +* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code). +* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +* +*/ + +#include "precompiled.hpp" +#include "asm/assembler.hpp" +#include "asm/assembler.inline.hpp" +#include "macroAssembler_x86.hpp" + +// ofs and limit are used for multi-block byte array. +// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) +void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0, + XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask, + Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) { + + Label start, done_hash, loop0; + + address upper_word_mask = StubRoutines::x86::upper_word_mask_addr(); + address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr(); + + bind(start); + movdqu(abcd, Address(state, 0)); + pinsrd(e0, Address(state, 16), 3); + movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000 + pand(e0, shuf_mask); + pshufd(abcd, abcd, 0x1B); + movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f + + bind(loop0); + // Save hash values for addition after rounds + movdqu(Address(rsp, 0), e0); + movdqu(Address(rsp, 16), abcd); + + + // Rounds 0 - 3 + movdqu(msg0, Address(buf, 0)); + pshufb(msg0, shuf_mask); + paddd(e0, msg0); + movdqa(e1, abcd); + sha1rnds4(abcd, e0, 0); + + // Rounds 4 - 7 + movdqu(msg1, Address(buf, 16)); + pshufb(msg1, shuf_mask); + sha1nexte(e1, msg1); + movdqa(e0, abcd); + sha1rnds4(abcd, e1, 0); + sha1msg1(msg0, msg1); + + // Rounds 8 - 11 + movdqu(msg2, Address(buf, 32)); + pshufb(msg2, shuf_mask); + sha1nexte(e0, msg2); + movdqa(e1, abcd); + sha1rnds4(abcd, e0, 0); + sha1msg1(msg1, msg2); + pxor(msg0, msg2); + + // Rounds 12 - 15 + movdqu(msg3, Address(buf, 48)); + pshufb(msg3, shuf_mask); + sha1nexte(e1, msg3); + movdqa(e0, abcd); + sha1msg2(msg0, msg3); + sha1rnds4(abcd, e1, 0); + sha1msg1(msg2, msg3); + pxor(msg1, msg3); + + // Rounds 16 - 19 + sha1nexte(e0, msg0); + movdqa(e1, abcd); + sha1msg2(msg1, msg0); + sha1rnds4(abcd, e0, 0); + sha1msg1(msg3, msg0); + pxor(msg2, msg0); + + // Rounds 20 - 23 + sha1nexte(e1, msg1); + movdqa(e0, abcd); + sha1msg2(msg2, msg1); + sha1rnds4(abcd, e1, 1); + sha1msg1(msg0, msg1); + pxor(msg3, msg1); + + // Rounds 24 - 27 + sha1nexte(e0, msg2); + movdqa(e1, abcd); + sha1msg2(msg3, msg2); + sha1rnds4(abcd, e0, 1); + sha1msg1(msg1, msg2); + pxor(msg0, msg2); + + // Rounds 28 - 31 + sha1nexte(e1, msg3); + movdqa(e0, abcd); + sha1msg2(msg0, msg3); + sha1rnds4(abcd, e1, 1); + sha1msg1(msg2, msg3); + pxor(msg1, msg3); + + // Rounds 32 - 35 + sha1nexte(e0, msg0); + movdqa(e1, abcd); + sha1msg2(msg1, msg0); + sha1rnds4(abcd, e0, 1); + sha1msg1(msg3, msg0); + pxor(msg2, msg0); + + // Rounds 36 - 39 + sha1nexte(e1, msg1); + movdqa(e0, abcd); + sha1msg2(msg2, msg1); + sha1rnds4(abcd, e1, 1); + sha1msg1(msg0, msg1); + pxor(msg3, msg1); + + // Rounds 40 - 43 + sha1nexte(e0, msg2); + movdqa(e1, abcd); + sha1msg2(msg3, msg2); + sha1rnds4(abcd, e0, 2); + sha1msg1(msg1, msg2); + pxor(msg0, msg2); + + // Rounds 44 - 47 + sha1nexte(e1, msg3); + movdqa(e0, abcd); + sha1msg2(msg0, msg3); + sha1rnds4(abcd, e1, 2); + sha1msg1(msg2, msg3); + pxor(msg1, msg3); + + // Rounds 48 - 51 + sha1nexte(e0, msg0); + movdqa(e1, abcd); + sha1msg2(msg1, msg0); + sha1rnds4(abcd, e0, 2); + sha1msg1(msg3, msg0); + pxor(msg2, msg0); + + // Rounds 52 - 55 + sha1nexte(e1, msg1); + movdqa(e0, abcd); + sha1msg2(msg2, msg1); + sha1rnds4(abcd, e1, 2); + sha1msg1(msg0, msg1); + pxor(msg3, msg1); + + // Rounds 56 - 59 + sha1nexte(e0, msg2); + movdqa(e1, abcd); + sha1msg2(msg3, msg2); + sha1rnds4(abcd, e0, 2); + sha1msg1(msg1, msg2); + pxor(msg0, msg2); + + // Rounds 60 - 63 + sha1nexte(e1, msg3); + movdqa(e0, abcd); + sha1msg2(msg0, msg3); + sha1rnds4(abcd, e1, 3); + sha1msg1(msg2, msg3); + pxor(msg1, msg3); + + // Rounds 64 - 67 + sha1nexte(e0, msg0); + movdqa(e1, abcd); + sha1msg2(msg1, msg0); + sha1rnds4(abcd, e0, 3); + sha1msg1(msg3, msg0); + pxor(msg2, msg0); + + // Rounds 68 - 71 + sha1nexte(e1, msg1); + movdqa(e0, abcd); + sha1msg2(msg2, msg1); + sha1rnds4(abcd, e1, 3); + pxor(msg3, msg1); + + // Rounds 72 - 75 + sha1nexte(e0, msg2); + movdqa(e1, abcd); + sha1msg2(msg3, msg2); + sha1rnds4(abcd, e0, 3); + + // Rounds 76 - 79 + sha1nexte(e1, msg3); + movdqa(e0, abcd); + sha1rnds4(abcd, e1, 3); + + // add current hash values with previously saved + movdqu(msg0, Address(rsp, 0)); + sha1nexte(e0, msg0); + movdqu(msg0, Address(rsp, 16)); + paddd(abcd, msg0); + + if (multi_block) { + // increment data pointer and loop if more to process + addptr(buf, 64); + addptr(ofs, 64); + cmpptr(ofs, limit); + jcc(Assembler::belowEqual, loop0); + movptr(rax, ofs); //return ofs + } + // write hash values back in the correct order + pshufd(abcd, abcd, 0x1b); + movdqu(Address(state, 0), abcd); + pextrd(Address(state, 16), e0, 3); + + bind(done_hash); + +} + +// xmm0 (msg) is used as an implicit argument to sh256rnds2 +// and state0 and state1 can never use xmm0 register. +// ofs and limit are used for multi-block byte array. +// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) +void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, + XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, + Register buf, Register state, Register ofs, Register limit, Register rsp, + bool multi_block LP64_ONLY(COMMA XMMRegister shuf_mask)) { + Label start, done_hash, loop0; + + address K256 = StubRoutines::x86::k256_addr(); + address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr(); + + bind(start); + movdqu(state0, Address(state, 0)); + movdqu(state1, Address(state, 16)); + + pshufd(state0, state0, 0xB1); + pshufd(state1, state1, 0x1B); + movdqa(msgtmp4, state0); + palignr(state0, state1, 8); + pblendw(state1, msgtmp4, 0xF0); + +#ifdef _LP64 + movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask)); +#endif + lea(rax, ExternalAddress(K256)); + + bind(loop0); + movdqu(Address(rsp, 0), state0); + movdqu(Address(rsp, 16), state1); + + // Rounds 0-3 + movdqu(msg, Address(buf, 0)); +#ifdef _LP64 + pshufb(msg, shuf_mask); +#else + pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); +#endif + movdqa(msgtmp0, msg); + paddd(msg, Address(rax, 0)); + sha256rnds2(state1, state0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + + // Rounds 4-7 + movdqu(msg, Address(buf, 16)); +#ifdef _LP64 + pshufb(msg, shuf_mask); +#else + pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); +#endif + movdqa(msgtmp1, msg); + paddd(msg, Address(rax, 16)); + sha256rnds2(state1, state0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp0, msgtmp1); + + // Rounds 8-11 + movdqu(msg, Address(buf, 32)); +#ifdef _LP64 + pshufb(msg, shuf_mask); +#else + pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); +#endif + movdqa(msgtmp2, msg); + paddd(msg, Address(rax, 32)); + sha256rnds2(state1, state0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp1, msgtmp2); + + // Rounds 12-15 + movdqu(msg, Address(buf, 48)); +#ifdef _LP64 + pshufb(msg, shuf_mask); +#else + pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); +#endif + movdqa(msgtmp3, msg); + paddd(msg, Address(rax, 48)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp3); + palignr(msgtmp4, msgtmp2, 4); + paddd(msgtmp0, msgtmp4); + sha256msg2(msgtmp0, msgtmp3); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp2, msgtmp3); + + // Rounds 16-19 + movdqa(msg, msgtmp0); + paddd(msg, Address(rax, 64)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp0); + palignr(msgtmp4, msgtmp3, 4); + paddd(msgtmp1, msgtmp4); + sha256msg2(msgtmp1, msgtmp0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp3, msgtmp0); + + // Rounds 20-23 + movdqa(msg, msgtmp1); + paddd(msg, Address(rax, 80)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp1); + palignr(msgtmp4, msgtmp0, 4); + paddd(msgtmp2, msgtmp4); + sha256msg2(msgtmp2, msgtmp1); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp0, msgtmp1); + + // Rounds 24-27 + movdqa(msg, msgtmp2); + paddd(msg, Address(rax, 96)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp2); + palignr(msgtmp4, msgtmp1, 4); + paddd(msgtmp3, msgtmp4); + sha256msg2(msgtmp3, msgtmp2); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp1, msgtmp2); + + // Rounds 28-31 + movdqa(msg, msgtmp3); + paddd(msg, Address(rax, 112)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp3); + palignr(msgtmp4, msgtmp2, 4); + paddd(msgtmp0, msgtmp4); + sha256msg2(msgtmp0, msgtmp3); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp2, msgtmp3); + + // Rounds 32-35 + movdqa(msg, msgtmp0); + paddd(msg, Address(rax, 128)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp0); + palignr(msgtmp4, msgtmp3, 4); + paddd(msgtmp1, msgtmp4); + sha256msg2(msgtmp1, msgtmp0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp3, msgtmp0); + + // Rounds 36-39 + movdqa(msg, msgtmp1); + paddd(msg, Address(rax, 144)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp1); + palignr(msgtmp4, msgtmp0, 4); + paddd(msgtmp2, msgtmp4); + sha256msg2(msgtmp2, msgtmp1); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp0, msgtmp1); + + // Rounds 40-43 + movdqa(msg, msgtmp2); + paddd(msg, Address(rax, 160)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp2); + palignr(msgtmp4, msgtmp1, 4); + paddd(msgtmp3, msgtmp4); + sha256msg2(msgtmp3, msgtmp2); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp1, msgtmp2); + + // Rounds 44-47 + movdqa(msg, msgtmp3); + paddd(msg, Address(rax, 176)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp3); + palignr(msgtmp4, msgtmp2, 4); + paddd(msgtmp0, msgtmp4); + sha256msg2(msgtmp0, msgtmp3); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp2, msgtmp3); + + // Rounds 48-51 + movdqa(msg, msgtmp0); + paddd(msg, Address(rax, 192)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp0); + palignr(msgtmp4, msgtmp3, 4); + paddd(msgtmp1, msgtmp4); + sha256msg2(msgtmp1, msgtmp0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp3, msgtmp0); + + // Rounds 52-55 + movdqa(msg, msgtmp1); + paddd(msg, Address(rax, 208)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp1); + palignr(msgtmp4, msgtmp0, 4); + paddd(msgtmp2, msgtmp4); + sha256msg2(msgtmp2, msgtmp1); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + + // Rounds 56-59 + movdqa(msg, msgtmp2); + paddd(msg, Address(rax, 224)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp2); + palignr(msgtmp4, msgtmp1, 4); + paddd(msgtmp3, msgtmp4); + sha256msg2(msgtmp3, msgtmp2); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + + // Rounds 60-63 + movdqa(msg, msgtmp3); + paddd(msg, Address(rax, 240)); + sha256rnds2(state1, state0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + movdqu(msg, Address(rsp, 0)); + paddd(state0, msg); + movdqu(msg, Address(rsp, 16)); + paddd(state1, msg); + + if (multi_block) { + // increment data pointer and loop if more to process + addptr(buf, 64); + addptr(ofs, 64); + cmpptr(ofs, limit); + jcc(Assembler::belowEqual, loop0); + movptr(rax, ofs); //return ofs + } + + pshufd(state0, state0, 0x1B); + pshufd(state1, state1, 0xB1); + movdqa(msgtmp4, state0); + pblendw(state0, state1, 0xF0); + palignr(state1, msgtmp4, 8); + + movdqu(Address(state, 0), state0); + movdqu(Address(state, 16), state1); + + bind(done_hash); + +}