# HG changeset patch # User enevill # Date 1437485788 0 # Tue Jul 21 13:36:28 2015 +0000 # Node ID 629a5f148b83ec050e21d6577b4e61563bb01b2e # Parent 0d3c20ac648e7debf8aacb0656efa1b3f9d2c2bc 8131062: aarch64: add support for GHASH acceleration Summary: Add support for GHASH using pmull Reviewed-by: duke Contributed-by: alexander.alexeev@caviumnetworks.com diff --git a/src/cpu/aarch64/vm/assembler_aarch64.hpp b/src/cpu/aarch64/vm/assembler_aarch64.hpp --- a/src/cpu/aarch64/vm/assembler_aarch64.hpp +++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp @@ -1896,7 +1896,7 @@ public: enum SIMD_Arrangement { - T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D + T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D, T1Q }; enum SIMD_RegVariant { @@ -2225,14 +2225,16 @@ f(0b001111, 15, 10), rf(Vn, 5), rf(Xd, 0); } - // We do not handle the 1Q arrangement. void pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) { starti; - assert(Ta == T8H && (Tb == T8B || Tb == T16B), "Invalid Size specifier"); - f(0, 31), f(Tb & 1, 30), f(0b001110001, 29, 21), rf(Vm, 16), f(0b111000, 15, 10); - rf(Vn, 5), rf(Vd, 0); + assert((Ta == T1Q && (Tb == T1D || Tb == T2D)) || + (Ta == T8H && (Tb == T8B || Tb == T16B)), "Invalid Size specifier"); + int size = (Ta == T1Q) ? 0b11 : 0b00; + f(0, 31), f(Tb & 1, 30), f(0b001110, 29, 24), f(size, 23, 22); + f(1, 21), rf(Vm, 16), f(0b111000, 15, 10), rf(Vn, 5), rf(Vd, 0); } void pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) { + assert(Tb == T2D || Tb == T16B, "pmull2 assumes T2D or T16B as the second size specifier"); pmull(Vd, Ta, Vn, Vm, Tb); } @@ -2245,15 +2247,6 @@ f(0b100001010010, 21, 10), rf(Vn, 5), rf(Vd, 0); } - void rev32(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) - { - starti; - assert(T <= T8H, "must be one of T8B, T16B, T4H, T8H"); - f(0, 31), f((int)T & 1, 30), f(0b101110, 29, 24); - f(T <= T16B ? 
0b00 : 0b01, 23, 22), f(0b100000000010, 21, 10); - rf(Vn, 5), rf(Vd, 0); - } - void dup(FloatRegister Vd, SIMD_Arrangement T, Register Xs) { starti; @@ -2290,6 +2283,57 @@ #undef INSN + // Table vector lookup +#define INSN(NAME, op) \ + void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, unsigned registers, FloatRegister Vm) { \ + starti; \ + assert(T == T8B || T == T16B, "invalid arrangement"); \ + assert(0 < registers && registers <= 4, "invalid number of registers"); \ + f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21), rf(Vm, 16), f(0, 15); \ + f(registers - 1, 14, 13), f(op, 12),f(0b00, 11, 10), rf(Vn, 5), rf(Vd, 0); \ + } + + INSN(tbl, 0); + INSN(tbx, 1); + +#undef INSN + +#define INSN(NAME, U, opcode) \ + void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) { \ + starti; \ + assert((ASSERTION), MSG); \ + f(0, 31), f((int)T & 1, 30), f(U, 29), f(0b01110, 28, 24); \ + f((int)(T >> 1), 23, 22), f(0b10000, 21, 17), f(opcode, 16, 12); \ + f(0b10, 11, 10), rf(Vn, 5), rf(Vd, 0); \ + } + +#define MSG "invalid arrangement" + +#define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H || T == T2S || T == T4S) + INSN(rev64, 0, 0b00000); +#undef ASSERTION + +#define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H) + INSN(rev32, 1, 0b00000); +#undef ASSERTION + +#define ASSERTION (T == T8B || T == T16B) + INSN(rev16, 0, 0b00001); +#undef ASSERTION + +#undef MSG + +#undef INSN + +void ext(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm, int index) + { + starti; + assert(T == T8B || T == T16B, "invalid arrangement"); + assert((T == T8B && index <= 0b0111) || (T == T16B && index <= 0b1111), "Invalid index value"); + f(0, 31), f((int)T & 1, 30), f(0b101110000, 29, 21); + rf(Vm, 16), f(0, 15), f(index, 14, 11); + f(0, 10), rf(Vn, 5), rf(Vd, 0); + } /* Simulator extensions to the ISA diff --git a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp --- 
a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp @@ -2437,6 +2437,137 @@ return start; } + /** + * Arguments: + * + * Input: + * c_rarg0 - current state address + * c_rarg1 - H key address + * c_rarg2 - data address + * c_rarg3 - number of blocks + * + * Output: + * Updated state at c_rarg0 + */ + address generate_ghash_processBlocks() { + __ align(CodeEntryAlignment); + Label L_ghash_loop, L_exit; + + StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); + address start = __ pc(); + + Register state = c_rarg0; + Register subkeyH = c_rarg1; + Register data = c_rarg2; + Register blocks = c_rarg3; + + FloatRegister vzr = v30; + __ eor(vzr, __ T16B, vzr, vzr); // zero register + + __ mov(v26, __ T16B, 1); + __ mov(v27, __ T16B, 63); + __ mov(v28, __ T16B, 62); + __ mov(v29, __ T16B, 57); + + __ ldrq(v6, Address(state)); + __ ldrq(v16, Address(subkeyH)); + + __ ext(v0, __ T16B, v6, v6, 0x08); + __ ext(v1, __ T16B, v16, v16, 0x08); + __ eor(v16, __ T16B, v16, v1); + + __ bind(L_ghash_loop); + + __ ldrq(v2, Address(__ post(data, 0x10))); + __ rev64(v2, __ T16B, v2); // swap data + + __ ext(v6, __ T16B, v0, v0, 0x08); + __ eor(v6, __ T16B, v6, v2); + __ ext(v2, __ T16B, v6, v6, 0x08); + + __ pmull2(v7, __ T1Q, v2, v1, __ T2D); // A1*B1 + __ eor(v6, __ T16B, v6, v2); + __ pmull(v5, __ T1Q, v2, v1, __ T1D); // A0*B0 + __ pmull(v20, __ T1Q, v6, v16, __ T1D); // (A1 + A0)(B1 + B0) + + __ ext(v21, __ T16B, v5, v7, 0x08); + __ eor(v18, __ T16B, v7, v5); // A1*B1 xor A0*B0 + __ eor(v20, __ T16B, v20, v21); + __ eor(v20, __ T16B, v20, v18); + + // Register pair v7:v5 holds the result of the carry-less multiplication + __ ins(v7, __ D, v20, 0, 1); + __ ins(v5, __ D, v20, 1, 0); + + // Result of the multiplication is shifted by one bit position + // [X3:X2:X1:X0] = [X3:X2:X1:X0] << 1 + __ ushr(v18, __ T2D, v5, -63 & 63); + __ ins(v25, __ D, v18, 1, 0); + __ ins(v25, __ D, vzr, 0, 0); + __ ushl(v5, __ T2D, v5, v26); + __ orr(v5, __ 
T16B, v5, v25); + + __ ushr(v19, __ T2D, v7, -63 & 63); + __ ins(v19, __ D, v19, 1, 0); + __ ins(v19, __ D, v18, 0, 1); + __ ushl(v7, __ T2D, v7, v26); + __ orr(v6, __ T16B, v7, v19); + + __ ins(v24, __ D, v5, 0, 1); + + // A = X0 << 63 + __ ushl(v21, __ T2D, v5, v27); + + // B = X0 << 62 + __ ushl(v22, __ T2D, v5, v28); + + // C = X0 << 57 + __ ushl(v23, __ T2D, v5, v29); + + // D = X1^A^B^C + __ eor(v21, __ T16B, v21, v22); + __ eor(v21, __ T16B, v21, v23); + __ eor(v21, __ T16B, v21, v24); + __ ins(v5, __ D, v21, 1, 0); + + // [E1:E0] = [D:X0] >> 1 + __ ushr(v20, __ T2D, v5, -1 & 63); + __ ushl(v18, __ T2D, v5, v27); + __ ext(v25, __ T16B, v18, vzr, 0x08); + __ orr(v19, __ T16B, v20, v25); + + __ eor(v7, __ T16B, v5, v19); + + // [F1:F0] = [D:X0] >> 2 + __ ushr(v20, __ T2D, v5, -2 & 63); + __ ushl(v18, __ T2D, v5, v28); + __ ins(v25, __ D, v18, 0, 1); + __ orr(v19, __ T16B, v20, v25); + + __ eor(v7, __ T16B, v7, v19); + + // [G1:G0] = [D:X0] >> 7 + __ ushr(v20, __ T2D, v5, -7 & 63); + __ ushl(v18, __ T2D, v5, v29); + __ ins(v25, __ D, v18, 0, 1); + __ orr(v19, __ T16B, v20, v25); + + // [H1:H0] = [D^E1^F1^G1:X0^E0^F0^G0] + __ eor(v7, __ T16B, v7, v19); + + // Result = [H1:H0]^[X3:X2] + __ eor(v0, __ T16B, v7, v6); + + __ subs(blocks, blocks, 1); + __ cbnz(blocks, L_ghash_loop); + + __ ext(v1, __ T16B, v0, v0, 0x08); + __ st1(v1, __ T16B, state); + __ ret(lr); + + return start; + } + // Continuation point for throwing of implicit exceptions that are // not handled in the current activation. 
Fabricates an exception // oop and initiates normal exception dispatching in this @@ -2604,6 +2735,11 @@ } #ifndef BUILTIN_SIM + // generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); + } + if (UseAESIntrinsics) { StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); diff --git a/src/cpu/aarch64/vm/vm_version_aarch64.cpp b/src/cpu/aarch64/vm/vm_version_aarch64.cpp --- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp +++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp @@ -45,6 +45,10 @@ #define HWCAP_AES (1<<3) #endif +#ifndef HWCAP_PMULL +#define HWCAP_PMULL (1<<4) +#endif + #ifndef HWCAP_SHA1 #define HWCAP_SHA1 (1<<5) #endif @@ -190,11 +194,6 @@ } } - if (UseGHASHIntrinsics) { - warning("GHASH intrinsics are not available on this CPU"); - FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); - } - if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) { UseCRC32Intrinsics = true; } @@ -244,6 +243,15 @@ FLAG_SET_DEFAULT(UseSHA, false); } + if (auxv & HWCAP_PMULL) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { + FLAG_SET_DEFAULT(UseGHASHIntrinsics, true); + } + } else if (UseGHASHIntrinsics) { + warning("GHASH intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); + } + // This machine allows unaligned memory accesses if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) { FLAG_SET_DEFAULT(UseUnalignedAccesses, true);