# HG changeset patch # User ascarpino # Date 1434588505 25200 # Wed Jun 17 17:48:25 2015 -0700 # Node ID d0f48f9ec09ec99f9ff0639be276a9734d01b03d # Parent 4170228e11e6313e948e6ddcae9af3eed06b1fbe 8073108: Use x86 and SPARC CPU instructions for GHASH acceleration Reviewed-by: kvn, jrose diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp --- a/src/cpu/ppc/vm/vm_version_ppc.cpp +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp @@ -194,6 +194,11 @@ FLAG_SET_DEFAULT(UseAESIntrinsics, false); } + if (UseGHASHIntrinsics) { + warning("GHASH intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); + } + if (has_vshasig()) { if (FLAG_IS_DEFAULT(UseSHA)) { UseSHA = true; diff --git a/src/cpu/sparc/vm/assembler_sparc.hpp b/src/cpu/sparc/vm/assembler_sparc.hpp --- a/src/cpu/sparc/vm/assembler_sparc.hpp +++ b/src/cpu/sparc/vm/assembler_sparc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -129,6 +129,7 @@ flog3_op3 = 0x36, edge_op3 = 0x36, fsrc_op3 = 0x36, + xmulx_op3 = 0x36, impdep2_op3 = 0x37, stpartialf_op3 = 0x37, jmpl_op3 = 0x38, @@ -220,6 +221,8 @@ mdtox_opf = 0x110, mstouw_opf = 0x111, mstosw_opf = 0x113, + xmulx_opf = 0x115, + xmulxhi_opf = 0x116, mxtod_opf = 0x118, mwtos_opf = 0x119, @@ -1212,6 +1215,9 @@ void movwtos( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); } void movxtod( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); } + void xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); } + void xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); } + // Crypto SHA instructions void sha1() { sha1_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); } diff --git a/src/cpu/sparc/vm/stubGenerator_sparc.cpp b/src/cpu/sparc/vm/stubGenerator_sparc.cpp --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp @@ -4788,7 +4788,131 @@ return start; } - void generate_initial() { + /* Single and multi-block ghash operations */ + address generate_ghash_processBlocks() { + __ align(CodeEntryAlignment); + Label L_ghash_loop, L_aligned, L_main; + StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); + address start = __ pc(); + + Register state = I0; + Register subkeyH = I1; + Register data = I2; + Register len = I3; + + __ save_frame(0); + + __ ldx(state, 0, O0); + __ ldx(state, 8, O1); + + // Loop label for multiblock operations + __ BIND(L_ghash_loop); + + // Check if 'data' is unaligned + __ andcc(data, 7, G1); + __ br(Assembler::zero, 
false, Assembler::pt, L_aligned); + __ delayed()->nop(); + + Register left_shift = L1; + Register right_shift = L2; + Register data_ptr = L3; + + // Get left and right shift values in bits + __ sll(G1, LogBitsPerByte, left_shift); + __ mov(64, right_shift); + __ sub(right_shift, left_shift, right_shift); + + // Align to read 'data' + __ sub(data, G1, data_ptr); + + // Load first 8 bytes of 'data' + __ ldx(data_ptr, 0, O4); + __ sllx(O4, left_shift, O4); + __ ldx(data_ptr, 8, O5); + __ srlx(O5, right_shift, G4); + __ bset(G4, O4); + + // Load second 8 bytes of 'data' + __ sllx(O5, left_shift, O5); + __ ldx(data_ptr, 16, G4); + __ srlx(G4, right_shift, G4); + __ ba(L_main); + __ delayed()->bset(G4, O5); + + // If 'data' is aligned, load normally + __ BIND(L_aligned); + __ ldx(data, 0, O4); + __ ldx(data, 8, O5); + + __ BIND(L_main); + __ ldx(subkeyH, 0, O2); + __ ldx(subkeyH, 8, O3); + + __ xor3(O0, O4, O0); + __ xor3(O1, O5, O1); + + __ xmulxhi(O0, O3, G3); + __ xmulx(O0, O2, O5); + __ xmulxhi(O1, O2, G4); + __ xmulxhi(O1, O3, G5); + __ xmulx(O0, O3, G1); + __ xmulx(O1, O3, G2); + __ xmulx(O1, O2, O3); + __ xmulxhi(O0, O2, O4); + + __ mov(0xE1, O0); + __ sllx(O0, 56, O0); + + __ xor3(O5, G3, O5); + __ xor3(O5, G4, O5); + __ xor3(G5, G1, G1); + __ xor3(G1, O3, G1); + __ srlx(G2, 63, O1); + __ srlx(G1, 63, G3); + __ sllx(G2, 63, O3); + __ sllx(G2, 58, O2); + __ xor3(O3, O2, O2); + + __ sllx(G1, 1, G1); + __ or3(G1, O1, G1); + + __ xor3(G1, O2, G1); + + __ sllx(G2, 1, G2); + + __ xmulxhi(G1, O0, O1); + __ xmulx(G1, O0, O2); + __ xmulxhi(G2, O0, O3); + __ xmulx(G2, O0, G1); + + __ xor3(O4, O1, O4); + __ xor3(O5, O2, O5); + __ xor3(O5, O3, O5); + + __ sllx(O4, 1, O2); + __ srlx(O5, 63, O3); + + __ or3(O2, O3, O0); + + __ sllx(O5, 1, O1); + __ srlx(G1, 63, O2); + __ or3(O1, O2, O1); + __ xor3(O1, G3, O1); + + __ deccc(len); + __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop); + __ delayed()->add(data, 16, data); + + __ stx(O0, I0, 0); + __ stx(O1, I0, 8); + + 
+ __ ret(); + __ delayed()->restore(); + + return start; + } + + void generate_initial() { // Generates all stubs and initializes the entry points //------------------------------------------------------------------------------------------------------------------------ @@ -4861,6 +4985,11 @@ StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); } + // generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); + } + // generate SHA1/SHA256/SHA512 intrinsics code if (UseSHA1Intrinsics) { StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); diff --git a/src/cpu/sparc/vm/vm_version_sparc.cpp b/src/cpu/sparc/vm/vm_version_sparc.cpp --- a/src/cpu/sparc/vm/vm_version_sparc.cpp +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp @@ -319,6 +319,17 @@ } } + // GHASH/GCM intrinsics: enabled by default only when VIS3 is present and permitted (UseVIS > 2) + if (has_vis3() && (UseVIS > 2)) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { + UseGHASHIntrinsics = true; + } + } else if (UseGHASHIntrinsics) { + if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) + warning("GHASH intrinsics require VIS3 instruction support. Intrinsics will be disabled"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); + } + // SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times if (has_sha1() || has_sha256() || has_sha512()) { if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp --- a/src/cpu/x86/vm/assembler_x86.cpp +++ b/src/cpu/x86/vm/assembler_x86.cpp @@ -2575,6 +2575,15 @@ emit_int8(shift); } +void Assembler::pslldq(XMMRegister dst, int shift) { + // Shift 128 bit value in xmm register by number of bytes. 
+ NOT_LP64(assert(VM_Version::supports_sse2(), "")); + int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66); + emit_int8(0x73); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(shift); +} + void Assembler::ptest(XMMRegister dst, Address src) { assert(VM_Version::supports_sse4_1(), ""); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp --- a/src/cpu/x86/vm/assembler_x86.hpp +++ b/src/cpu/x86/vm/assembler_x86.hpp @@ -1527,6 +1527,8 @@ // Shift Right by bytes Logical DoubleQuadword Immediate void psrldq(XMMRegister dst, int shift); + // Shift Left by bytes Logical DoubleQuadword Immediate + void pslldq(XMMRegister dst, int shift); // Logical Compare 128bit void ptest(XMMRegister dst, XMMRegister src); diff --git a/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/src/cpu/x86/vm/stubGenerator_x86_32.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp @@ -2719,6 +2719,171 @@ return start; } + // byte swap x86 long + address generate_ghash_long_swap_mask() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask"); + address start = __ pc(); + __ emit_data(0x0b0a0908, relocInfo::none, 0); + __ emit_data(0x0f0e0d0c, relocInfo::none, 0); + __ emit_data(0x03020100, relocInfo::none, 0); + __ emit_data(0x07060504, relocInfo::none, 0); + + return start; + } + + // byte swap x86 byte array + address generate_ghash_byte_swap_mask() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask"); + address start = __ pc(); + __ emit_data(0x0c0d0e0f, relocInfo::none, 0); + __ emit_data(0x08090a0b, relocInfo::none, 0); + __ emit_data(0x04050607, relocInfo::none, 0); + __ emit_data(0x00010203, relocInfo::none, 0); + return start; + } + + /* Single and multi-block ghash operations */ + address generate_ghash_processBlocks() { + assert(UseGHASHIntrinsics, 
"need GHASH intrinsics and CLMUL support"); + __ align(CodeEntryAlignment); + Label L_ghash_loop, L_exit; + StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); + address start = __ pc(); + + const Register state = rdi; + const Register subkeyH = rsi; + const Register data = rdx; + const Register blocks = rcx; + + const Address state_param(rbp, 8+0); + const Address subkeyH_param(rbp, 8+4); + const Address data_param(rbp, 8+8); + const Address blocks_param(rbp, 8+12); + + const XMMRegister xmm_temp0 = xmm0; + const XMMRegister xmm_temp1 = xmm1; + const XMMRegister xmm_temp2 = xmm2; + const XMMRegister xmm_temp3 = xmm3; + const XMMRegister xmm_temp4 = xmm4; + const XMMRegister xmm_temp5 = xmm5; + const XMMRegister xmm_temp6 = xmm6; + const XMMRegister xmm_temp7 = xmm7; + + __ enter(); + __ push(rsi); // rsi and rdi are callee-saved in the 32-bit C calling convention but serve as subkeyH/state here + __ push(rdi); // parameters are addressed off rbp, so these pushes do not disturb the argument offsets + + __ movptr(state, state_param); + __ movptr(subkeyH, subkeyH_param); + __ movptr(data, data_param); + __ movptr(blocks, blocks_param); + + __ movdqu(xmm_temp0, Address(state, 0)); + __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); + + __ movdqu(xmm_temp1, Address(subkeyH, 0)); + __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); + + __ BIND(L_ghash_loop); + __ movdqu(xmm_temp2, Address(data, 0)); + __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); + + __ pxor(xmm_temp0, xmm_temp2); + + // + // Multiply with the hash key + // + __ movdqu(xmm_temp3, xmm_temp0); + __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0 + __ movdqu(xmm_temp4, xmm_temp0); + __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1 + + __ movdqu(xmm_temp5, xmm_temp0); + __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0 + __ movdqu(xmm_temp6, xmm_temp0); + __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1 + + __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0 + + __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5 + __ psrldq(xmm_temp4, 
8); // shift xmm4 by 64 bits to the right + __ pslldq(xmm_temp5, 8); // shift xmm5 by 64 bits to the left + __ pxor(xmm_temp3, xmm_temp5); + __ pxor(xmm_temp6, xmm_temp4); // Register pair xmm6:xmm3 holds the result + // of the carry-less multiplication of + // xmm0 by xmm1. + + // We shift the result of the multiplication by one bit position + // to the left to cope for the fact that the bits are reversed. + __ movdqu(xmm_temp7, xmm_temp3); + __ movdqu(xmm_temp4, xmm_temp6); + __ pslld(xmm_temp3, 1); + __ pslld(xmm_temp6, 1); + __ psrld(xmm_temp7, 31); + __ psrld(xmm_temp4, 31); + __ movdqu(xmm_temp5, xmm_temp7); + __ pslldq(xmm_temp4, 4); + __ pslldq(xmm_temp7, 4); + __ psrldq(xmm_temp5, 12); + __ por(xmm_temp3, xmm_temp7); + __ por(xmm_temp6, xmm_temp4); + __ por(xmm_temp6, xmm_temp5); + + // + // First phase of the reduction + // + // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts + // independently. + __ movdqu(xmm_temp7, xmm_temp3); + __ movdqu(xmm_temp4, xmm_temp3); + __ movdqu(xmm_temp5, xmm_temp3); + __ pslld(xmm_temp7, 31); // packed left shift, << 31 + __ pslld(xmm_temp4, 30); // packed left shift, << 30 + __ pslld(xmm_temp5, 25); // packed left shift, << 25 + __ pxor(xmm_temp7, xmm_temp4); // xor the shifted versions + __ pxor(xmm_temp7, xmm_temp5); + __ movdqu(xmm_temp4, xmm_temp7); + __ pslldq(xmm_temp7, 12); + __ psrldq(xmm_temp4, 4); + __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete + + // + // Second phase of the reduction + // + // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these + // shift operations. 
+ __ movdqu(xmm_temp2, xmm_temp3); + __ movdqu(xmm_temp7, xmm_temp3); + __ movdqu(xmm_temp5, xmm_temp3); + __ psrld(xmm_temp2, 1); // packed right shift, >> 1 + __ psrld(xmm_temp7, 2); // packed right shift, >> 2 + __ psrld(xmm_temp5, 7); // packed right shift, >> 7 + __ pxor(xmm_temp2, xmm_temp7); // xor the shifted versions + __ pxor(xmm_temp2, xmm_temp5); + __ pxor(xmm_temp2, xmm_temp4); + __ pxor(xmm_temp3, xmm_temp2); + __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6 + + __ decrement(blocks); + __ jcc(Assembler::zero, L_exit); + __ movdqu(xmm_temp0, xmm_temp6); + __ addptr(data, 16); + __ jmp(L_ghash_loop); + + __ BIND(L_exit); + // Byte swap 16-byte result + __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); + __ movdqu(Address(state, 0), xmm_temp6); // store the result + + __ pop(rdi); // restore the callee-saved registers in reverse push order + __ pop(rsi); + __ leave(); + __ ret(0); + return start; + } + /** * Arguments: * @@ -3018,6 +3183,13 @@ StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); } + // Generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); + StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask(); + StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); + } + // Safefetch stubs. 
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp @@ -3639,6 +3639,175 @@ return start; } + + // byte swap x86 long + address generate_ghash_long_swap_mask() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask"); + address start = __ pc(); + __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none ); + __ emit_data64(0x0706050403020100, relocInfo::none ); + return start; + } + + // byte swap x86 byte array + address generate_ghash_byte_swap_mask() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask"); + address start = __ pc(); + __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none ); + __ emit_data64(0x0001020304050607, relocInfo::none ); + return start; + } + + /* Single and multi-block ghash operations */ + address generate_ghash_processBlocks() { + __ align(CodeEntryAlignment); + Label L_ghash_loop, L_exit; + StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); + address start = __ pc(); + + const Register state = c_rarg0; + const Register subkeyH = c_rarg1; + const Register data = c_rarg2; + const Register blocks = c_rarg3; + +#ifdef _WIN64 + const int XMM_REG_LAST = 10; +#endif + + const XMMRegister xmm_temp0 = xmm0; + const XMMRegister xmm_temp1 = xmm1; + const XMMRegister xmm_temp2 = xmm2; + const XMMRegister xmm_temp3 = xmm3; + const XMMRegister xmm_temp4 = xmm4; + const XMMRegister xmm_temp5 = xmm5; + const XMMRegister xmm_temp6 = xmm6; + const XMMRegister xmm_temp7 = xmm7; + const XMMRegister xmm_temp8 = xmm8; + const XMMRegister xmm_temp9 = xmm9; + const XMMRegister xmm_temp10 = xmm10; + + __ enter(); + +#ifdef _WIN64 + // save the xmm registers which must be preserved 6-10 + __ subptr(rsp, 
-rsp_after_call_off * wordSize); + for (int i = 6; i <= XMM_REG_LAST; i++) { + __ movdqu(xmm_save(i), as_XMMRegister(i)); + } +#endif + + __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); + + __ movdqu(xmm_temp0, Address(state, 0)); + __ pshufb(xmm_temp0, xmm_temp10); + + // subkeyH is loop-invariant and xmm1/xmm10 are never written inside the loop, so load and byte-swap it once + __ movdqu(xmm_temp1, Address(subkeyH, 0)); + __ pshufb(xmm_temp1, xmm_temp10); + + __ BIND(L_ghash_loop); + __ movdqu(xmm_temp2, Address(data, 0)); + __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); + + __ pxor(xmm_temp0, xmm_temp2); + + // + // Multiply with the hash key + // + __ movdqu(xmm_temp3, xmm_temp0); + __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0 + __ movdqu(xmm_temp4, xmm_temp0); + __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1 + + __ movdqu(xmm_temp5, xmm_temp0); + __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0 + __ movdqu(xmm_temp6, xmm_temp0); + __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1 + + __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0 + + __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5 + __ psrldq(xmm_temp4, 8); // shift xmm4 by 64 bits to the right + __ pslldq(xmm_temp5, 8); // shift xmm5 by 64 bits to the left + __ pxor(xmm_temp3, xmm_temp5); + __ pxor(xmm_temp6, xmm_temp4); // Register pair xmm6:xmm3 holds the result + // of the carry-less multiplication of + // xmm0 by xmm1. + + // We shift the result of the multiplication by one bit position + // to the left to cope for the fact that the bits are reversed. 
+ __ movdqu(xmm_temp7, xmm_temp3); + __ movdqu(xmm_temp8, xmm_temp6); + __ pslld(xmm_temp3, 1); + __ pslld(xmm_temp6, 1); + __ psrld(xmm_temp7, 31); + __ psrld(xmm_temp8, 31); + __ movdqu(xmm_temp9, xmm_temp7); + __ pslldq(xmm_temp8, 4); + __ pslldq(xmm_temp7, 4); + __ psrldq(xmm_temp9, 12); + __ por(xmm_temp3, xmm_temp7); + __ por(xmm_temp6, xmm_temp8); + __ por(xmm_temp6, xmm_temp9); + + // + // First phase of the reduction + // + // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts + // independently. + __ movdqu(xmm_temp7, xmm_temp3); + __ movdqu(xmm_temp8, xmm_temp3); + __ movdqu(xmm_temp9, xmm_temp3); + __ pslld(xmm_temp7, 31); // packed left shift, << 31 + __ pslld(xmm_temp8, 30); // packed left shift, << 30 + __ pslld(xmm_temp9, 25); // packed left shift, << 25 + __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions + __ pxor(xmm_temp7, xmm_temp9); + __ movdqu(xmm_temp8, xmm_temp7); + __ pslldq(xmm_temp7, 12); + __ psrldq(xmm_temp8, 4); + __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete + + // + // Second phase of the reduction + // + // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these + // shift operations. 
+ __ movdqu(xmm_temp2, xmm_temp3); + __ movdqu(xmm_temp4, xmm_temp3); + __ movdqu(xmm_temp5, xmm_temp3); + __ psrld(xmm_temp2, 1); // packed right shift, >> 1 + __ psrld(xmm_temp4, 2); // packed right shift, >> 2 + __ psrld(xmm_temp5, 7); // packed right shift, >> 7 + __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions + __ pxor(xmm_temp2, xmm_temp5); + __ pxor(xmm_temp2, xmm_temp8); + __ pxor(xmm_temp3, xmm_temp2); + __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6 + + __ decrement(blocks); + __ jcc(Assembler::zero, L_exit); + __ movdqu(xmm_temp0, xmm_temp6); + __ addptr(data, 16); + __ jmp(L_ghash_loop); + + __ BIND(L_exit); + __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result + __ movdqu(Address(state, 0), xmm_temp6); // store the result + +#ifdef _WIN64 + // restore xmm regs belonging to calling function + for (int i = 6; i <= XMM_REG_LAST; i++) { + __ movdqu(as_XMMRegister(i), xmm_save(i)); + } +#endif + __ leave(); + __ ret(0); + return start; + } + /** * Arguments: * @@ -4077,6 +4246,13 @@ StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); } + // Generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); + StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask(); + StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); + } + // Safefetch stubs. generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, diff --git a/src/cpu/x86/vm/stubRoutines_x86.cpp b/src/cpu/x86/vm/stubRoutines_x86.cpp --- a/src/cpu/x86/vm/stubRoutines_x86.cpp +++ b/src/cpu/x86/vm/stubRoutines_x86.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -33,6 +33,8 @@ address StubRoutines::x86::_verify_mxcsr_entry = NULL; address StubRoutines::x86::_key_shuffle_mask_addr = NULL; +address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL; +address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL; uint64_t StubRoutines::x86::_crc_by128_masks[] = { diff --git a/src/cpu/x86/vm/stubRoutines_x86.hpp b/src/cpu/x86/vm/stubRoutines_x86.hpp --- a/src/cpu/x86/vm/stubRoutines_x86.hpp +++ b/src/cpu/x86/vm/stubRoutines_x86.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -36,10 +36,15 @@ // masks and table for CRC32 static uint64_t _crc_by128_masks[]; static juint _crc_table[]; + // swap mask for ghash + static address _ghash_long_swap_mask_addr; + static address _ghash_byte_swap_mask_addr; public: static address verify_mxcsr_entry() { return _verify_mxcsr_entry; } static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; } static address crc_by128_masks_addr() { return (address)_crc_by128_masks; } + static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; } + static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; } #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp --- a/src/cpu/x86/vm/vm_version_x86.cpp +++ b/src/cpu/x86/vm/vm_version_x86.cpp @@ -594,6 +594,17 @@ FLAG_SET_DEFAULT(UseAESIntrinsics, false); } + // GHASH/GCM intrinsics + if (UseCLMUL && (UseSSE > 2)) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { + UseGHASHIntrinsics = true; + } + } else if (UseGHASHIntrinsics) { + 
if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) + warning("GHASH intrinsic requires CLMUL and SSE2 instructions on this CPU"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); + } + if (UseSHA) { warning("SHA instructions are not available on this CPU"); FLAG_SET_DEFAULT(UseSHA, false); diff --git a/src/share/vm/classfile/vmSymbols.hpp b/src/share/vm/classfile/vmSymbols.hpp --- a/src/share/vm/classfile/vmSymbols.hpp +++ b/src/share/vm/classfile/vmSymbols.hpp @@ -863,6 +863,12 @@ do_name( implCompressMB_name, "implCompressMultiBlock0") \ do_signature(implCompressMB_signature, "([BII)I") \ \ + /* support for com.sun.crypto.provider.GHASH */ \ + do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \ + do_intrinsic(_ghash_processBlocks, com_sun_crypto_provider_ghash, processBlocks_name, ghash_processBlocks_signature, F_S) \ + do_name(processBlocks_name, "processBlocks") \ + do_signature(ghash_processBlocks_signature, "([BII[J[J)V") \ + \ /* support for java.util.zip */ \ do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \ do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \ diff --git a/src/share/vm/opto/escape.cpp b/src/share/vm/opto/escape.cpp --- a/src/share/vm/opto/escape.cpp +++ b/src/share/vm/opto/escape.cpp @@ -952,6 +952,7 @@ strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 || strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 || strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 || strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 || strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 || strcmp(call->as_CallLeaf()->_name, "sha256_implCompress") == 0 || diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp --- a/src/share/vm/opto/library_call.cpp +++ b/src/share/vm/opto/library_call.cpp @@ -311,6 +311,7 @@ Node* 
inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting); Node* get_key_start_from_aescrypt_object(Node* aescrypt_object); Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object); + bool inline_ghash_processBlocks(); bool inline_sha_implCompress(vmIntrinsics::ID id); bool inline_digestBase_implCompressMB(int predicate); bool inline_sha_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass_SHA, @@ -570,6 +571,10 @@ predicates = 3; break; + case vmIntrinsics::_ghash_processBlocks: + if (!UseGHASHIntrinsics) return NULL; + break; + case vmIntrinsics::_updateCRC32: case vmIntrinsics::_updateBytesCRC32: case vmIntrinsics::_updateByteBufferCRC32: @@ -957,6 +962,9 @@ case vmIntrinsics::_montgomerySquare: return inline_montgomerySquare(); + case vmIntrinsics::_ghash_processBlocks: + return inline_ghash_processBlocks(); + case vmIntrinsics::_encodeISOArray: return inline_encodeISOArray(); @@ -6599,6 +6607,35 @@ return _gvn.transform(region); } +//------------------------------inline_ghash_processBlocks +bool LibraryCallKit::inline_ghash_processBlocks() { + address stubAddr; + const char *stubName; + assert(UseGHASHIntrinsics, "need GHASH intrinsics support"); + + stubAddr = StubRoutines::ghash_processBlocks(); + stubName = "ghash_processBlocks"; + + Node* data = argument(0); + Node* offset = argument(1); + Node* len = argument(2); + Node* state = argument(3); + Node* subkeyH = argument(4); + + Node* state_start = array_element_address(state, intcon(0), T_LONG); + assert(state_start, "state is NULL"); + Node* subkeyH_start = array_element_address(subkeyH, intcon(0), T_LONG); + assert(subkeyH_start, "subkeyH is NULL"); + Node* data_start = array_element_address(data, offset, T_BYTE); + assert(data_start, "data is NULL"); + + Node* ghash = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::ghash_processBlocks_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + state_start, subkeyH_start, data_start, len); + return true; +} + 
//------------------------------inline_sha_implCompress----------------------- // // Calculate SHA (i.e., SHA-1) for single-block byte[] array. diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp --- a/src/share/vm/opto/runtime.cpp +++ b/src/share/vm/opto/runtime.cpp @@ -1085,6 +1085,25 @@ return TypeFunc::make(domain, range); } +// GHASH block processing +const TypeFunc* OptoRuntime::ghash_processBlocks_Type() { + int argcnt = 4; + + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // state + fields[argp++] = TypePtr::NOTNULL; // subkeyH + fields[argp++] = TypePtr::NOTNULL; // data + fields[argp++] = TypeInt::INT; // blocks + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = NULL; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} //------------- Interpreter state access for on stack replacement const TypeFunc* OptoRuntime::osr_end_Type() { diff --git a/src/share/vm/opto/runtime.hpp b/src/share/vm/opto/runtime.hpp --- a/src/share/vm/opto/runtime.hpp +++ b/src/share/vm/opto/runtime.hpp @@ -311,6 +311,8 @@ static const TypeFunc* montgomeryMultiply_Type(); static const TypeFunc* montgomerySquare_Type(); + static const TypeFunc* ghash_processBlocks_Type(); + static const TypeFunc* updateBytesCRC32_Type(); // leaf on stack replacement interpreter accessor types diff --git a/src/share/vm/runtime/globals.hpp b/src/share/vm/runtime/globals.hpp --- a/src/share/vm/runtime/globals.hpp +++ b/src/share/vm/runtime/globals.hpp @@ -602,6 +602,9 @@ product(bool, UseSHA, false, \ "Control whether SHA instructions can be used on SPARC") \ \ + product(bool, UseGHASHIntrinsics, false, \ + "Use intrinsics for GHASH versions of crypto") \ + \ product(uintx, 
LargePageSizeInBytes, 0, \ "Large page size (0 to let VM choose the page size)") \ \ diff --git a/src/share/vm/runtime/stubRoutines.cpp b/src/share/vm/runtime/stubRoutines.cpp --- a/src/share/vm/runtime/stubRoutines.cpp +++ b/src/share/vm/runtime/stubRoutines.cpp @@ -124,6 +124,7 @@ address StubRoutines::_aescrypt_decryptBlock = NULL; address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL; address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL; +address StubRoutines::_ghash_processBlocks = NULL; address StubRoutines::_sha1_implCompress = NULL; address StubRoutines::_sha1_implCompressMB = NULL; diff --git a/src/share/vm/runtime/stubRoutines.hpp b/src/share/vm/runtime/stubRoutines.hpp --- a/src/share/vm/runtime/stubRoutines.hpp +++ b/src/share/vm/runtime/stubRoutines.hpp @@ -197,6 +197,7 @@ static address _aescrypt_decryptBlock; static address _cipherBlockChaining_encryptAESCrypt; static address _cipherBlockChaining_decryptAESCrypt; + static address _ghash_processBlocks; static address _sha1_implCompress; static address _sha1_implCompressMB; @@ -359,6 +360,7 @@ static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; } static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; } static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; } + static address ghash_processBlocks() { return _ghash_processBlocks; } static address sha1_implCompress() { return _sha1_implCompress; } static address sha1_implCompressMB() { return _sha1_implCompressMB; } diff --git a/src/share/vm/runtime/vmStructs.cpp b/src/share/vm/runtime/vmStructs.cpp --- a/src/share/vm/runtime/vmStructs.cpp +++ b/src/share/vm/runtime/vmStructs.cpp @@ -810,6 +810,7 @@ static_field(StubRoutines, _aescrypt_decryptBlock, address) \ static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \ static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \ + 
static_field(StubRoutines, _ghash_processBlocks, address) \ static_field(StubRoutines, _updateBytesCRC32, address) \ static_field(StubRoutines, _crc_table_adr, address) \ static_field(StubRoutines, _multiplyToLen, address) \ diff --git a/test/compiler/7184394/TestAESBase.java b/test/compiler/7184394/TestAESBase.java --- a/test/compiler/7184394/TestAESBase.java +++ b/test/compiler/7184394/TestAESBase.java @@ -29,6 +29,7 @@ import javax.crypto.Cipher; import javax.crypto.KeyGenerator; import javax.crypto.SecretKey; +import javax.crypto.spec.GCMParameterSpec; import javax.crypto.spec.IvParameterSpec; import javax.crypto.spec.SecretKeySpec; import java.security.AlgorithmParameters; @@ -64,6 +65,10 @@ Cipher dCipher; AlgorithmParameters algParams; SecretKey key; + GCMParameterSpec gcm_spec; + byte[] aad; + int tlen = 12; + byte[] iv; static int numThreads = 0; int threadId; @@ -102,6 +107,12 @@ int ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0); IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]); cipher.init(Cipher.ENCRYPT_MODE, key, initVector); + } else if (mode.equals("GCM")) { + iv = new byte[64]; + random.nextBytes(iv); + aad = new byte[5]; + random.nextBytes(aad); + gcm_init(); } else { algParams = cipher.getParameters(); cipher.init(Cipher.ENCRYPT_MODE, key, algParams); @@ -188,4 +199,12 @@ } abstract void childShowCipher(); + + void gcm_init() throws Exception { + tlen = 12; + gcm_spec = new GCMParameterSpec(tlen * 8, iv); + cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); + cipher.init(Cipher.ENCRYPT_MODE, key, gcm_spec); + cipher.update(aad); + } } diff --git a/test/compiler/7184394/TestAESEncode.java b/test/compiler/7184394/TestAESEncode.java --- a/test/compiler/7184394/TestAESEncode.java +++ b/test/compiler/7184394/TestAESEncode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -32,7 +32,11 @@ @Override public void run() { try { - if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams); + if (mode.equals("GCM")) { + gcm_init(); + } else if (!noReinit) { + cipher.init(Cipher.ENCRYPT_MODE, key, algParams); + } encode = new byte[encodeLength]; if (testingMisalignment) { int tempSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset); diff --git a/test/compiler/7184394/TestAESMain.java b/test/compiler/7184394/TestAESMain.java --- a/test/compiler/7184394/TestAESMain.java +++ b/test/compiler/7184394/TestAESMain.java @@ -41,6 +41,13 @@ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 TestAESMain * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true 
-Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain * * @author Tom Deneau */