--- old/src/hotspot/cpu/x86/assembler_x86.cpp  2018-11-19 12:13:01.539821000 -0800
+++ new/src/hotspot/cpu/x86/assembler_x86.cpp  2018-11-19 12:13:00.850752100 -0800
@@ -4178,6 +4178,17 @@
   emit_int8(shift);
 }
 
+void Assembler::vpsrldq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
+         vector_len == AVX_256bit ? VM_Version::supports_avx2() :
+         vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : 0, "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(xmm3->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x73);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(shift & 0xFF);
+}
+
 void Assembler::pslldq(XMMRegister dst, int shift) {
   // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -4189,6 +4200,17 @@
   emit_int8(shift);
 }
 
+void Assembler::vpslldq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
+         vector_len == AVX_256bit ? VM_Version::supports_avx2() :
+         vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : 0, "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(xmm7->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x73);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(shift & 0xFF);
+}
+
 void Assembler::ptest(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
@@ -4200,7 +4222,7 @@
 }
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
-  assert(VM_Version::supports_sse4_1(), "");
+  assert(VM_Version::supports_sse4_1() || VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x17);
--- old/src/hotspot/cpu/x86/assembler_x86.hpp  2018-11-19 12:13:05.614228400 -0800
+++ new/src/hotspot/cpu/x86/assembler_x86.hpp  2018-11-19 12:13:04.859152900 -0800
@@ -2055,6 +2055,7 @@
   void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
   void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
   void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+  void vpslldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
 
   // Logical shift right packed integers
   void psrlw(XMMRegister dst, int shift);
@@ -2069,6 +2070,7 @@
   void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
   void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
   void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+  void vpsrldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
   void evpsrlvw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpsllvw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
 
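The two new Assembler entry points above are the AVX (VEX-encoded) forms of the byte-granular 128-bit lane shifts; both use opcode 0x73, with xmm3 and xmm7 supplying the /3 and /7 register-extension fields that distinguish psrldq from pslldq. As a purely illustrative aid (none of this code is part of the patch), the effect of one such shift on a single 16-byte lane can be modelled in Java, with byte index 0 as the least significant byte:

    import java.util.Arrays;

    public class LaneByteShift {
        // Model of vpslldq on one 128-bit lane: shift left by 'shift' bytes,
        // moving bytes toward higher significance and zero-filling the low end.
        static byte[] pslldq(byte[] lane, int shift) {
            byte[] out = new byte[16];
            for (int i = shift; i < 16; i++) {
                out[i] = lane[i - shift];
            }
            return out;
        }

        // Model of vpsrldq on one 128-bit lane: shift right by 'shift' bytes.
        static byte[] psrldq(byte[] lane, int shift) {
            byte[] out = new byte[16];
            for (int i = 0; i + shift < 16; i++) {
                out[i] = lane[i + shift];
            }
            return out;
        }

        public static void main(String[] args) {
            byte[] lane = new byte[16];
            for (int i = 0; i < 16; i++) {
                lane[i] = (byte) i;
            }
            System.out.println(Arrays.toString(pslldq(lane, 8)));
            System.out.println(Arrays.toString(psrldq(lane, 8)));
        }
    }

gfmul and avx_ghash later in this patch use exactly these 8-byte lane shifts to split a 128-bit middle term across the two halves of a 256-bit carry-less product.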
--- old/src/hotspot/cpu/x86/macroAssembler_x86.hpp  2018-11-19 12:13:09.163583300 -0800
+++ new/src/hotspot/cpu/x86/macroAssembler_x86.hpp  2018-11-19 12:13:08.495516500 -0800
@@ -943,12 +943,17 @@
                    int iter);
 
   void addm(int disp, Register r1, Register r2);
+  void gfmul(XMMRegister tmp0, XMMRegister t);
+  void schoolbookAAD(int i, Register subkeyH, XMMRegister data, XMMRegister tmp0,
+                     XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3);
+  void generateHtbl(Register htbl);
 
  public:
   void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                    XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                    Register buf, Register state, Register ofs, Register limit, Register rsp,
                    bool multi_block, XMMRegister shuf_mask);
+  void avx_ghash(Register state, Register htbl, Register data, Register blocks);
 #endif
 
 #ifdef _LP64
@@ -1498,6 +1503,15 @@
     // 0x11 - multiply upper 64 bits [64:127]
     Assembler::vpclmulqdq(dst, nds, src, 0x11);
   }
+  void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+    // 0x10 - multiply nds[0:63] and src[64:127]
+    Assembler::vpclmulqdq(dst, nds, src, 0x10);
+  }
+  void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+    // 0x01 - multiply nds[64:127] and src[0:63]
+    Assembler::vpclmulqdq(dst, nds, src, 0x01);
+  }
   void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
     // 0x00 - multiply lower 64 bits [0:63]
     Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
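The new vpclmullqhqdq and vpclmulhqlqdq wrappers complete the set of four pclmulqdq quadrant selectors: bit 0 of the immediate picks the low or high quadword of the first (nds) operand and bit 4 picks the quadword of the second operand, so 0x00, 0x11, 0x10 and 0x01 give all four 64 x 64 carry-less products needed for a 128 x 128 multiply. As a scalar model of what one quadrant computes, here is an illustrative carry-less 64 x 64 to 128-bit multiply in Java (clmul64 is a made-up helper name, not an API used by the patch):

    public class Clmul {
        // Carry-less (GF(2)[x]) product of two 64-bit values; returns {hi, lo}.
        // This is the operation a single pclmulqdq quadrant performs in hardware.
        static long[] clmul64(long a, long b) {
            long lo = 0, hi = 0;
            for (int i = 0; i < 64; i++) {
                if (((b >>> i) & 1) != 0) {
                    lo ^= a << i;
                    if (i != 0) {
                        hi ^= a >>> (64 - i);   // bits of a shifted past bit 63
                    }
                }
            }
            return new long[] { hi, lo };
        }

        public static void main(String[] args) {
            long[] p = clmul64(0x87L, 0x83L);
            System.out.printf("hi=%016x lo=%016x%n", p[0], p[1]);
        }
    }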
--- old/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  2018-11-19 12:13:12.768943800 -0800
+++ new/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  2018-11-19 12:13:12.089875900 -0800
@@ -4388,6 +4388,59 @@
     return start;
   }
 
+  // Polynomial x^128+x^127+x^126+x^121+1
+  address ghash_polynomial_addr() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
+    address start = __ pc();
+    __ emit_data64(0x0000000000000001, relocInfo::none);
+    __ emit_data64(0xc200000000000000, relocInfo::none);
+    return start;
+  }
+
+  address ghash_shufflemask_addr() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
+    address start = __ pc();
+    __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
+    __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
+    return start;
+  }
+
+  // Ghash single and multi block operations using AVX instructions
+  address generate_avx_ghash_processBlocks() {
+    __ align(CodeEntryAlignment);
+
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+    address start = __ pc();
+
+    // arguments
+    const Register state  = c_rarg0;
+    const Register htbl   = c_rarg1;
+    const Register data   = c_rarg2;
+    const Register blocks = c_rarg3;
+    __ enter();
+    // Save state before entering routine
+    __ push(r12);
+    __ push(r13);
+    __ push(r14);
+    __ push(r15);
+    __ push(rbx);
+    __ push(rbp);
+
+    __ avx_ghash(state, htbl, data, blocks);
+
+    __ pop(rbp);
+    __ pop(rbx);
+    __ pop(r15);
+    __ pop(r14);
+    __ pop(r13);
+    __ pop(r12);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
+
   // byte swap x86 long
   address generate_ghash_long_swap_mask() {
     __ align(CodeEntryAlignment);
@@ -5886,10 +5939,16 @@
 
     // Generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
-      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
-      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
-      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
-    }
+      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+      if (VM_Version::supports_avx()) {
+        StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
+        StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
+        StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
+      } else {
+        StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+      }
+    }
 
     if (UseBASE64Intrinsics) {
       StubRoutines::x86::_and_mask = base64_and_mask_addr();
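A note on the constant emitted by ghash_polynomial_addr: GCM's field polynomial is x^128 + x^7 + x^2 + x + 1, and the x^128+x^127+x^126+x^121+1 in the comment is its bit-reflected form, which is the shape the CLMUL-based reduction works with. The two quadwords set bit 0 plus bits 121, 126 and 127; the x^128 term is implicit. A tiny, purely illustrative Java check of that bit layout (not part of the patch):

    public class GhashPolyLayout {
        public static void main(String[] args) {
            // Quadwords as emitted by ghash_polynomial_addr(), low then high
            long lo = 0x0000000000000001L;
            long hi = 0xc200000000000000L;
            // x^0 sits in bit 0 of the low word; x^121, x^126, x^127 in the high word
            long expectedHi = (1L << (121 - 64)) | (1L << (126 - 64)) | (1L << (127 - 64));
            System.out.println(lo == 1L && hi == expectedHi);   // prints true
        }
    }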
--- old/src/hotspot/cpu/x86/stubRoutines_x86.cpp  2018-11-19 12:13:16.583325200 -0800
+++ new/src/hotspot/cpu/x86/stubRoutines_x86.cpp  2018-11-19 12:13:15.906257500 -0800
@@ -38,6 +38,8 @@
 address StubRoutines::x86::_counter_shuffle_mask_addr = NULL;
 address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
 address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
+address StubRoutines::x86::_ghash_poly_addr = NULL;
+address StubRoutines::x86::_ghash_shuffmask_addr = NULL;
 address StubRoutines::x86::_upper_word_mask_addr = NULL;
 address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL;
 address StubRoutines::x86::_k256_adr = NULL;
--- old/src/hotspot/cpu/x86/stubRoutines_x86.hpp  2018-11-19 12:13:21.121779000 -0800
+++ new/src/hotspot/cpu/x86/stubRoutines_x86.hpp  2018-11-19 12:13:20.169683800 -0800
@@ -128,6 +128,8 @@
   // swap mask for ghash
   static address _ghash_long_swap_mask_addr;
   static address _ghash_byte_swap_mask_addr;
+  static address _ghash_poly_addr;
+  static address _ghash_shuffmask_addr;
 
   // upper word mask for sha1
   static address _upper_word_mask_addr;
@@ -205,6 +207,8 @@
   static address crc_by128_masks_addr()    { return (address)_crc_by128_masks; }
   static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
   static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
+  static address ghash_shufflemask_addr() { return _ghash_shuffmask_addr; }
+  static address ghash_polynomial_addr() { return _ghash_poly_addr; }
   static address upper_word_mask_addr() { return _upper_word_mask_addr; }
   static address shuffle_byte_flip_mask_addr() { return _shuffle_byte_flip_mask_addr; }
   static address k256_addr()      { return _k256_adr; }
--- old/src/java.base/share/classes/com/sun/crypto/provider/GHASH.java  2018-11-19 12:13:25.846251400 -0800
+++ new/src/java.base/share/classes/com/sun/crypto/provider/GHASH.java  2018-11-19 12:13:24.942161000 -0800
@@ -128,6 +128,8 @@
 
     // hash subkey H; should not change after the object has been constructed
     private final long[] subkeyH;
+    // subkeyHtbl holds 2*9 powers of subkeyH computed using carry-less multiplication
+    private long[] subkeyHtbl;
 
     // buffer for storing hash
     private final long[] state;
@@ -149,9 +151,16 @@
             throw new ProviderException("Internal error");
         }
         state = new long[2];
+        subkeyHtbl = new long[2*9];
         this.subkeyH = new long[2];
         this.subkeyH[0] = getLong(subkeyH, 0);
         this.subkeyH[1] = getLong(subkeyH, 8);
+        subkeyHtbl[0] = this.subkeyH[0];
+        subkeyHtbl[1] = this.subkeyH[1];
+        for (int i = 1; i < 9; i++) {
+            subkeyHtbl[2*i] = 0;
+            subkeyHtbl[2*i+1] = 0;
+        }
     }
 
     /**
@@ -194,11 +203,11 @@
         if (inLen == 0) {
             return;
         }
-        ghashRangeCheck(in, inOfs, inLen, state, subkeyH);
-        processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyH);
+        ghashRangeCheck(in, inOfs, inLen, state, subkeyHtbl);
+        processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyHtbl);
     }
 
-    private static void ghashRangeCheck(byte[] in, int inOfs, int inLen, long[] st, long[] subH) {
+    private static void ghashRangeCheck(byte[] in, int inOfs, int inLen, long[] st, long[] subkeyHtbl) {
         if (inLen < 0) {
             throw new RuntimeException("invalid input length: " + inLen);
         }
@@ -219,9 +228,9 @@
             throw new RuntimeException("internal state has invalid length: " +
                                        st.length);
         }
-        if (subH.length != 2) {
+        if (subkeyHtbl.length != 18) {
             throw new RuntimeException("internal subkeyH has invalid length: " +
-                                       subH.length);
+                                       subkeyHtbl.length);
         }
     }
     /*
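For context on why GHASH.java now passes the intrinsic a table with room for nine 128-bit entries: the stub precomputes H through H^8 so that eight blocks can be folded with a single reduction, relying on the identity state' = (state ^ b0) * H^8 ^ b1 * H^7 ^ ... ^ b7 * H, which equals eight sequential (state ^ b) * H steps. The sketch below demonstrates that identity with a textbook GF(2^128) multiply in the NIST SP 800-38D bit order; it is illustrative only and deliberately ignores the byte-reflected, premultiplied representation the AVX stub actually keeps in subkeyHtbl:

    public class GhashAggregation {
        // Textbook GF(2^128) multiply in GHASH bit order (NIST SP 800-38D):
        // a 128-bit element is a {hi, lo} pair of longs, and bit 0 of the element
        // is the most significant bit of hi. Stand-in for the intrinsified multiply.
        static long[] multiply(long[] x, long[] y) {
            long z0 = 0, z1 = 0;        // accumulator Z
            long v0 = y[0], v1 = y[1];  // V starts as Y and is repeatedly halved
            for (int i = 0; i < 128; i++) {
                long xi = (i < 64) ? (x[0] >>> (63 - i)) & 1 : (x[1] >>> (127 - i)) & 1;
                if (xi != 0) {
                    z0 ^= v0;
                    z1 ^= v1;
                }
                boolean carry = (v1 & 1) != 0;
                v1 = (v1 >>> 1) | (v0 << 63);
                v0 >>>= 1;
                if (carry) {
                    v0 ^= 0xe100000000000000L;  // reduce by x^128 + x^7 + x^2 + x + 1
                }
            }
            return new long[] { z0, z1 };
        }

        static long[] xor(long[] a, long[] b) {
            return new long[] { a[0] ^ b[0], a[1] ^ b[1] };
        }

        public static void main(String[] args) {
            java.util.Random r = new java.util.Random(42);
            long[] h = { r.nextLong(), r.nextLong() };
            long[] state = { r.nextLong(), r.nextLong() };
            long[][] blocks = new long[8][];
            for (int i = 0; i < 8; i++) {
                blocks[i] = new long[] { r.nextLong(), r.nextLong() };
            }

            // Sequential GHASH fold: state = (state ^ block) * H, once per block
            long[] seq = state;
            for (long[] b : blocks) {
                seq = multiply(xor(seq, b), h);
            }

            // Aggregated fold, as in the 8-block loop of the stub:
            // (state ^ b0) * H^8  ^  b1 * H^7  ^ ... ^  b7 * H
            long[][] hPow = new long[9][];
            hPow[1] = h;
            for (int i = 2; i <= 8; i++) {
                hPow[i] = multiply(hPow[i - 1], h);
            }
            long[] agg = multiply(xor(state, blocks[0]), hPow[8]);
            for (int i = 1; i < 8; i++) {
                agg = xor(agg, multiply(blocks[i], hPow[8 - i]));
            }

            System.out.println(java.util.Arrays.equals(seq, agg));  // prints true
        }
    }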
--- /dev/null  2018-11-19 12:13:31.000000000 -0800
+++ new/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp  2018-11-19 12:13:29.678634600 -0800
@@ -0,0 +1,301 @@
+/*
+* Copyright (c) 2018, Intel Corporation.
+*
+* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+*
+* This code is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License version 2 only, as
+* published by the Free Software Foundation.
+*
+* This code is distributed in the hope that it will be useful, but WITHOUT
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+* version 2 for more details (a copy is included in the LICENSE file that
+* accompanied this code).
+*
+* You should have received a copy of the GNU General Public License version
+* 2 along with this work; if not, write to the Free Software Foundation,
+* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+*
+* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+* or visit www.oracle.com if you need additional information or have any
+* questions.
+*
+*/
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "macroAssembler_x86.hpp"
+
+// Multiply 128 x 128 bits, using 4 pclmulqdq operations
+void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
+    XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
+    movdqu(xmm15, Address(htbl, i * 16));
+    vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
+    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
+    vpclmulldq(tmp3, data, xmm15); // 0x00
+    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
+    vpclmulhdq(tmp3, data, xmm15); // 0x11
+    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
+    vpclmullqhqdq(tmp3, data, xmm15); // 0x10
+    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
+}
+
+// Multiply two 128 bit numbers resulting in a 256 bit value
+// Result of the multiplication followed by reduction stored in state
+void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
+    const XMMRegister tmp1 = xmm4;
+    const XMMRegister tmp2 = xmm5;
+    const XMMRegister tmp3 = xmm6;
+    const XMMRegister tmp4 = xmm7;
+
+    vpclmulldq(tmp1, state, tmp0);    // 0x00  (a0 * b0)
+    vpclmulhdq(tmp4, state, tmp0);    // 0x11  (a1 * b1)
+    vpclmullqhqdq(tmp2, state, tmp0); // 0x10  (a1 * b0)
+    vpclmulhqlqdq(tmp3, state, tmp0); // 0x01  (a0 * b1)
+
+    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)
+
+    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
+    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
+    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
+    vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carry-less multiplication
+    // Follows the Shift-XOR reduction technique described in
+    // Gueron-Kounavis, May 2010
+    // First phase of reduction
+    //
+    vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift, shifting << 31
+    vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift, shifting << 30
+    vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift, shifting << 25
+    // xor the shifted versions
+    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
+    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
+    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
+    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
+    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete
+    //
+    // Second phase of the reduction
+    //
+    vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);   // packed right shift, shifting >> 1
+    vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit);  // packed right shift, shifting >> 2
+    vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit);  // packed right shift, shifting >> 7
+    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); // xor the shifted versions
+    vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
+    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
+    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
+    vpxor(state, tmp4, tmp1, Assembler::AVX_128bit); // the result is in state
+    ret(0);
+}
+
+// This method takes the subkey after expansion and generates 8 powers of subkey H
+// (16 bytes each) using the GFMUL operation.
+// The powers are used for carry-less multiplication in the multiblock ghash operation.
+void MacroAssembler::generateHtbl(Register htbl) {
+    const XMMRegister t = xmm0;
+    const XMMRegister tmp0 = xmm1;
+    Label GFMUL;
+    // load the original subkey hash
+    movdqu(t, Address(htbl, 0));
+    // shuffle using long swap mask
+    movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+    vpshufb(t, t, xmm10, Assembler::AVX_128bit);
+    // Save the shuffled subkey as the first htbl entry
+    movdqu(Address(htbl, 0 * 16), t);
+    movdqu(tmp0, t);
+    // Compute H' = GFMUL(H, 2)
+    vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
+    movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
+    vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
+    movl(rax, 0xff00);
+    movdl(xmm4, rax);
+    vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
+    movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
+    vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
+    vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
+    vpslld(xmm4, t, 1, Assembler::AVX_128bit);
+    vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
+    vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds p(x) << 1 or H * 2
+
+    // Adding p(x) << 1 to xmm5 which holds the reduction polynomial
+    vpxor(t, t, xmm5, Assembler::AVX_128bit);
+    // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H)
+    movdqu(tmp0, t);
+    // store GFMUL(H, 2)
+    movdqu(Address(htbl, 1 * 16), t); // H * 2
+    call(GFMUL, relocInfo::none);
+    movdqu(Address(htbl, 2 * 16), t); // H ^ 2 * 2
+    call(GFMUL, relocInfo::none);
+    movdqu(Address(htbl, 3 * 16), t); // H ^ 3 * 2
+    call(GFMUL, relocInfo::none);
+    movdqu(Address(htbl, 4 * 16), t); // H ^ 4 * 2
+    call(GFMUL, relocInfo::none);
+    movdqu(Address(htbl, 5 * 16), t); // H ^ 5 * 2
+    call(GFMUL, relocInfo::none);
+    movdqu(Address(htbl, 6 * 16), t); // H ^ 6 * 2
+    call(GFMUL, relocInfo::none);
+    movdqu(Address(htbl, 7 * 16), t); // H ^ 7 * 2
+    call(GFMUL, relocInfo::none);
+    movdqu(Address(htbl, 8 * 16), t); // H ^ 8 * 2
+
+    ret(0);
+    bind(GFMUL);
+    gfmul(tmp0, t);
+}
+
+// Multiblock and single block GHASH computation using Shift XOR reduction technique
+void MacroAssembler::avx_ghash(Register input_state, Register htbl,
+    Register input_data, Register blocks) {
+
+    // temporary variables to hold input data and input state
+    const XMMRegister data = xmm1;
+    const XMMRegister state = xmm0;
+    // temporary variables to hold intermediate results
+    const XMMRegister tmp0 = xmm3;
+    const XMMRegister tmp1 = xmm4;
+    const XMMRegister tmp2 = xmm5;
+    const XMMRegister tmp3 = xmm6;
+    const XMMRegister tmp4 = xmm7;
+    // temporary variables to hold byte and long swap masks
+    const XMMRegister bswap_mask = xmm2;
+    const XMMRegister lswap_mask = xmm14;
+
+    Label GENERATE_HTBL, BEGIN_PROCESS, GHASH_LOOP, BLOCK8_REDUCTION,
+          ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;
+
+    testptr(blocks, blocks);
+    jcc(Assembler::zero, EXIT_GHASH);
+
+    // Check if Hashtable has been already generated
+    movdqu(tmp2, Address(htbl, 2 * 16));
+    ptest(tmp2, tmp2);
+    jcc(Assembler::notZero, BEGIN_PROCESS);
+    call(GENERATE_HTBL, relocInfo::none);
+
+    // Shuffle the input state
+    bind(BEGIN_PROCESS);
+    movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+    movdqu(state, Address(input_state, 0));
+    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
+
+    // Do 8 multiplies followed by a reduction, processing 8 blocks of data at a time
+    // Each block = 16 bytes.
+    bind(PROCESS_8_BLOCKS);
+    cmpl(blocks, 8);
+    jcc(Assembler::below, ONE_BLK_INIT);
+    subl(blocks, 8);
+    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+    movdqu(data, Address(input_data, 16 * 7));
+    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
+    // Load from htbl + 1 * 16, since the calculated powers of H start at that location
+    movdqu(xmm15, Address(htbl, 1 * 16));
+    // Perform carry-less multiplication of (H * 2, data block #7)
+    vpclmulhqlqdq(tmp2, data, xmm15);  // a0 * b1
+    vpclmulldq(tmp0, data, xmm15);     // a0 * b0
+    vpclmulhdq(tmp1, data, xmm15);     // a1 * b1
+    vpclmullqhqdq(tmp3, data, xmm15);  // a1 * b0
+    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)
+
+    movdqu(data, Address(input_data, 16 * 6));
+    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
+    // Perform carry-less multiplication of (H ^ 2 * 2, data block #6)
+    schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);
+
+    movdqu(data, Address(input_data, 16 * 5));
+    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
+    // Perform carry-less multiplication of (H ^ 3 * 2, data block #5)
+    schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
+    movdqu(data, Address(input_data, 16 * 4));
+    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
+    // Perform carry-less multiplication of (H ^ 4 * 2, data block #4)
+    schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
+    movdqu(data, Address(input_data, 16 * 3));
+    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
+    // Perform carry-less multiplication of (H ^ 5 * 2, data block #3)
+    schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
+    movdqu(data, Address(input_data, 16 * 2));
+    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
+    // Perform carry-less multiplication of (H ^ 6 * 2, data block #2)
+    schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
+    movdqu(data, Address(input_data, 16 * 1));
+    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
+    // Perform carry-less multiplication of (H ^ 7 * 2, data block #1)
+    schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
+    movdqu(data, Address(input_data, 16 * 0));
+    // xor data block #0 with input state before performing carry-less multiplication
+    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
+    vpxor(data, data, state, Assembler::AVX_128bit);
+    // Perform carry-less multiplication of (H ^ 8 * 2, data block #0)
+    schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
+    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
+    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
+    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // tmp0, tmp1 contain the aggregated results of
+    vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit); // the multiplication operation
+
+    // we have the 2 128-bit partially accumulated multiplication results in tmp0:tmp1
+    // with the higher 128 bits in tmp1 and the lower 128 bits in tmp0
+    // Follows the Shift-XOR reduction technique described in
+    // Gueron-Kounavis, May 2010
+    bind(BLOCK8_REDUCTION);
+    // First phase of the reduction
+    vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift, shifting << 31
+    vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift, shifting << 30
+    vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift, shifting << 25
+    // xor the shifted versions
+    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
+    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
+
+    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
+    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
+
+    vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of the reduction is complete
+    // second phase of the reduction
+    vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);   // packed right shift, shifting >> 1
+    vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit);  // packed right shift, shifting >> 2
+    vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);   // packed right shift, shifting >> 7
+    // xor the shifted versions
+    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
+    vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
+    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
+    vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
+    // Final result is in state
+    vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);
+
+    lea(input_data, Address(input_data, 16 * 8));
+    jmp(PROCESS_8_BLOCKS);
+
+    // Since this is a one block operation we use only H * 2, i.e. the first power of H
+    bind(ONE_BLK_INIT);
+    movdqu(tmp0, Address(htbl, 1 * 16));
+    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+    // Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction
+    bind(PROCESS_1_BLOCK);
+    cmpl(blocks, 0);
+    jcc(Assembler::equal, SAVE_STATE);
+    subl(blocks, 1);
+    movdqu(data, Address(input_data, 0));
+    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
+    vpxor(state, state, data, Assembler::AVX_128bit);
+    // gfmul(H * 2, state)
+    call(GHASH_LOOP, relocInfo::none);
+    addptr(input_data, 16);
+    jmp(PROCESS_1_BLOCK);
+
+    bind(SAVE_STATE);
+    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
+    movdqu(Address(input_state, 0), state);
+    jmp(EXIT_GHASH);
+
+    bind(GHASH_LOOP);
+    gfmul(tmp0, state);
+    bind(GENERATE_HTBL);
+    generateHtbl(htbl);
+
+    bind(EXIT_GHASH);
+    // zero out xmm registers used for Htbl storage
+    vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
+    vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
+    vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
+    vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
+}
\ No newline at end of file
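Both gfmul and the schoolbookAAD helper assemble a 256-bit carry-less product from the four pclmulqdq quadrants (low*low, high*high and the two cross terms), and the vpslldq/vpsrldq-by-8 pair splits the cross sum across the two 128-bit halves before the Gueron-Kounavis Shift-XOR reduction folds the upper half back in. A self-contained, illustrative Java sketch of just that schoolbook composition step follows; clmul64 and clmul128 are stand-in names, not APIs from this patch:

    public class SchoolbookClmul {
        // One pclmulqdq quadrant: carry-less 64 x 64 -> 128-bit product as {hi, lo}.
        static long[] clmul64(long a, long b) {
            long lo = 0, hi = 0;
            for (int i = 0; i < 64; i++) {
                if (((b >>> i) & 1) != 0) {
                    lo ^= a << i;
                    if (i != 0) {
                        hi ^= a >>> (64 - i);
                    }
                }
            }
            return new long[] { hi, lo };
        }

        // 128 x 128 -> 256-bit carry-less product of {aHi, aLo} and {bHi, bLo},
        // composed the same way gfmul does it; result is {r3, r2, r1, r0} from
        // most to least significant 64-bit word.
        static long[] clmul128(long aHi, long aLo, long bHi, long bLo) {
            long[] ll = clmul64(aLo, bLo);   // immediate 0x00
            long[] hh = clmul64(aHi, bHi);   // immediate 0x11
            long[] lh = clmul64(aLo, bHi);   // immediate 0x10
            long[] hl = clmul64(aHi, bLo);   // immediate 0x01
            long midHi = lh[0] ^ hl[0];      // sum of cross products (XOR, no carries in GF(2))
            long midLo = lh[1] ^ hl[1];
            // The middle term straddles the two halves: the vpslldq/vpsrldq-by-8 step.
            long r0 = ll[1];
            long r1 = ll[0] ^ midLo;
            long r2 = hh[1] ^ midHi;
            long r3 = hh[0];
            return new long[] { r3, r2, r1, r0 };
        }

        public static void main(String[] args) {
            long[] p = clmul128(0x0123456789abcdefL, 0xfedcba9876543210L,
                                0x0f1e2d3c4b5a6978L, 0x8796a5b4c3d2e1f0L);
            System.out.printf("%016x %016x %016x %016x%n", p[0], p[1], p[2], p[3]);
        }
    }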