1 /* 2 * Copyright (c) 2018, Intel Corporation. 3 * 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

// Schoolbook multiply-accumulate step for aggregated GHASH:
// multiplies 128 x 128 bits using 4 pclmulqdq operations and XORs the
// partial products into the running accumulators.
//   tmp0 accumulates the low  product (selector 0x00, a0*b0)
//   tmp1 accumulates the high product (selector 0x11, a1*b1)
//   tmp2 accumulates the middle products (0x01 and 0x10, a0*b1 + a1*b0)
//   tmp3 is a scratch register
// The power of H used is loaded from htbl[i * 16]; xmm15 is clobbered.
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
                                   XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
  // Load H^i * 2 (precomputed by generateHtbl) from the hash table
  movdqu(xmm15, Address(htbl, i * 16));
  vpclmulhqlqdq(tmp3, data, xmm15); // 0x01: a0 * b1
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
  vpclmulldq(tmp3, data, xmm15);    // 0x00: a0 * b0
  vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
  vpclmulhdq(tmp3, data, xmm15);    // 0x11: a1 * b1
  vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
  vpclmullqhqdq(tmp3, data, xmm15); // 0x10: a1 * b0
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
}

// Galois-field multiply of two 128-bit operands (carry-less multiplication
// producing a 256-bit value) followed by reduction modulo the GHASH
// polynomial. The reduced result is stored back in state.
// NOTE: this emits ret(0) at the end — it is used as a local subroutine,
// reached via call(...) from generateHtbl and avx_ghash below.
// Clobbers xmm4-xmm7 (tmp1-tmp4) and xmm8-xmm11.
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
  const XMMRegister tmp1 = xmm4;
  const XMMRegister tmp2 = xmm5;
  const XMMRegister tmp3 = xmm6;
  const XMMRegister tmp4 = xmm7;

  // Four partial carry-less products of the two 64-bit halves
  vpclmulldq(tmp1, state, tmp0);    // 0x00 (a0 * b0)
  vpclmulhdq(tmp4, state, tmp0);    // 0x11 (a1 * b1)
  vpclmullqhqdq(tmp2, state, tmp0); // 0x10 (a1 * b0)
  vpclmulhqlqdq(tmp3, state, tmp0); // 0x01 (a0 * b1)

  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

  // Fold the combined middle product into the low (tmp1) and high (tmp4) halves
  vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
  vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
  vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication

  // Reduce the 256-bit product modulo the GHASH polynomial using the
  // Shift-XOR reduction described in Gueron-Kounavis May 2010.
  //
  // First phase of the reduction
  vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift << 31
  vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift << 30
  vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift << 25
  // xor the shifted versions
  vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
  vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
  vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
  vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete
  //
  // Second phase of the reduction
  //
  vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);   // packed right shift >> 1
  vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit);  // packed right shift >> 2
  vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit);  // packed right shift >> 7
  vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);// xor the shifted versions
  vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
  vpxor(state, tmp4, tmp1, Assembler::AVX_128bit);// the result is in state
  ret(0); // return to the call site (local-subroutine convention)
}

// This method takes in the subkey after expansion and generates 16 * 8 powers
// of subkey H using the GFMUL operation. The powers are used for carry-less
// multiplication in scalar multiblock ghash operations.
// Generates the hash table of powers of the GHASH subkey H.
// htbl[0]  = byte-swapped original subkey H
// htbl[i]  = H^i * 2 for i = 1..8, computed via repeated GFMUL calls.
// Emits ret(0) before the GFMUL subroutine code, so it is itself reached via
// call(...) from avx_ghash. Clobbers xmm0-xmm1, xmm3-xmm5, xmm10, rax, and
// the registers gfmul uses (xmm6-xmm9, xmm11).
void MacroAssembler::generateHtbl(Register htbl) {
  const XMMRegister t = xmm0;
  const XMMRegister tmp0 = xmm1;
  Label GFMUL;
  // load the original subkey hash
  movdqu(t, Address(htbl, 0));
  // shuffle using long swap mask
  movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
  vpshufb(t, t, xmm10, Assembler::AVX_128bit);
  // Save the shuffled mask as the first htbl entry
  movdqu(Address(htbl, 0 * 16), t);
  movdqu(tmp0, t);
  // Compute H' = GFMUL(H, 2): multiply H by x in GF(2^128), conditionally
  // XORing in the reduction polynomial when the top bit shifts out.
  vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
  movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
  vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
  // Build an all-ones/all-zeros mask from the carried-out top bit
  movl(rax, 0xff00);
  movdl(xmm4, rax);
  vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
  // Select the reduction polynomial only if the top bit of H was set
  movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
  vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
  // Shift H left by one bit across the 128-bit value (per-dword shift plus
  // carry of the per-dword top bits into the next dword)
  vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
  vpslld(xmm4, t, 1, Assembler::AVX_128bit);
  vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
  vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds p(x) <<1 or H * 2

  // Adding p(x)<<1 to xmm5 which holds the reduction polynomial
  vpxor(t, t, xmm5, Assembler::AVX_128bit);
  // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H)
  movdqu(tmp0, t);
  // store GFMUL(H,2)
  movdqu(Address(htbl, 1 * 16), t);        // H * 2
  // Each call multiplies t (= H^i * 2) by tmp0 (= H * 2) via the local
  // GFMUL subroutine bound below; store each successive power.
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 2 * 16), t);        // H ^ 2 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 3 * 16), t);        // H ^ 3 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 4 * 16), t);        // H ^ 4 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 5 * 16), t);        // H ^ 5 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 6 * 16), t);        // H ^ 6 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 7 * 16), t);        // H ^ 7 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 8 * 16), t);        // H ^ 8 * 2
  // Return to the caller of generateHtbl; the GFMUL subroutine body follows.
  ret(0);
  bind(GFMUL);
  gfmul(tmp0, t);
}

// Multiblock and single block GHASH computation using the Shift-XOR
// reduction technique. Processes `blocks` 16-byte blocks from input_data
// into the 128-bit GHASH state at input_state, 8 blocks at a time (with
// aggregated multiplication over H^1..H^8) and then one block at a time
// for any remainder. Lazily generates the power-of-H table in htbl on
// first use (detected by probing htbl[2*16] for a non-zero entry).
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
                               Register input_data, Register blocks) {

  // temporary variables to hold input data and input state
  const XMMRegister data = xmm1;
  const XMMRegister state = xmm0;
  // temporary variables to hold intermediate results
  const XMMRegister tmp0 = xmm3;
  const XMMRegister tmp1 = xmm4;
  const XMMRegister tmp2 = xmm5;
  const XMMRegister tmp3 = xmm6;
  const XMMRegister tmp4 = xmm7;
  // temporary variables to hold byte and long swap masks
  const XMMRegister bswap_mask = xmm2;
  const XMMRegister lswap_mask = xmm14;

  Label GENERATE_HTBL, BEGIN_PROCESS, GHASH_LOOP, BLOCK8_REDUCTION,
        ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;

  // Nothing to do for zero blocks
  testptr(blocks, blocks);
  jcc(Assembler::zero, EXIT_GHASH);

  // Check if the hash table has already been generated: htbl[2*16] is only
  // non-zero after generateHtbl has filled in the powers of H.
  movdqu(tmp2, Address(htbl, 2 * 16));
  ptest(tmp2, tmp2);
  jcc(Assembler::notZero, BEGIN_PROCESS);
  call(GENERATE_HTBL, relocInfo::none);

  // Shuffle the input state into bit-reflected GHASH order
  bind(BEGIN_PROCESS);
  movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
  movdqu(state, Address(input_state, 0));
  vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);

  // Do 8 multiplies followed by one reduction, processing 8 blocks of data
  // at a time. Each block = 16 bytes.
  bind(PROCESS_8_BLOCKS);
  cmpl(blocks, 8);
  jcc(Assembler::below, ONE_BLK_INIT);
  subl(blocks, 8);
  movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
  // Blocks are consumed highest-offset first so each is paired with the
  // matching power of H (block #7 with H*2, ..., block #0 with H^8*2).
  movdqu(data, Address(input_data, 16 * 7));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Loading 1*16 as the calculated powers of H start at that location.
  movdqu(xmm15, Address(htbl, 1 * 16));
  // Perform carry-less multiplication of (H*2, data block #7), initializing
  // the accumulators (tmp0 = low, tmp1 = high, tmp2 = middle products).
  vpclmulhqlqdq(tmp2, data, xmm15); // a0 * b1
  vpclmulldq(tmp0, data, xmm15);    // a0 * b0
  vpclmulhdq(tmp1, data, xmm15);    // a1 * b1
  vpclmullqhqdq(tmp3, data, xmm15); // a1 * b0
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

  movdqu(data, Address(input_data, 16 * 6));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^2 * 2, data block #6)
  schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 5));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^3 * 2, data block #5)
  schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
  movdqu(data, Address(input_data, 16 * 4));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^4 * 2, data block #4)
  schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
  movdqu(data, Address(input_data, 16 * 3));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^5 * 2, data block #3)
  schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
  movdqu(data, Address(input_data, 16 * 2));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^6 * 2, data block #2)
  schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
  movdqu(data, Address(input_data, 16 * 1));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^7 * 2, data block #1)
  schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
  movdqu(data, Address(input_data, 16 * 0));
  // xor data block #0 with input state before performing carry-less multiplication
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  vpxor(data, data, state, Assembler::AVX_128bit);
  // Perform carryless multiplication of (H^8 * 2, data block #0)
  schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
  // Fold the aggregated middle products into the low/high halves
  vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
  vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
  vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // tmp0, tmp1 contain aggregated results of
  vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit); // the multiplication operation

  // We have the 2 128-bit partially accumulated multiplication results in
  // tmp0:tmp1, with the higher 128 bits in tmp1 and the lower 128 bits in
  // tmp0. Follows the Shift-XOR reduction described in Gueron-Kounavis
  // May 2010 (same scheme as in gfmul above).
  bind(BLOCK8_REDUCTION);
  // First phase of the reduction
  vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift << 31
  vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift << 30
  vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift << 25
  // xor the shifted versions
  vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
  vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);

  vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
  vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);

  vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete
  // second phase of the reduction
  vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);   // packed right shift >> 1
  vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit);  // packed right shift >> 2
  vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);   // packed right shift >> 7
  // xor the shifted versions
  vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
  vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
  // Final result is in state
  vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);

  // Advance past the 8 consumed blocks and loop
  lea(input_data, Address(input_data, 16 * 8));
  jmp(PROCESS_8_BLOCKS);

  // Since this is a one-block operation we will only use H * 2, i.e. the
  // first power of H.
  bind(ONE_BLK_INIT);
  movdqu(tmp0, Address(htbl, 1 * 16));
  movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

  // Do one (128 bit x 128 bit) carry-less multiplication at a time followed
  // by a reduction, for each remaining block.
  bind(PROCESS_1_BLOCK);
  cmpl(blocks, 0);
  jcc(Assembler::equal, SAVE_STATE);
  subl(blocks, 1);
  movdqu(data, Address(input_data, 0));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  vpxor(state, state, data, Assembler::AVX_128bit);
  // gfmul(H*2, state) via the local subroutine bound at GHASH_LOOP below
  call(GHASH_LOOP, relocInfo::none);
  addptr(input_data, 16);
  jmp(PROCESS_1_BLOCK);

  // Byte-swap the state back and store it to memory
  bind(SAVE_STATE);
  vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
  movdqu(Address(input_state, 0), state);
  jmp(EXIT_GHASH);

  // Local subroutines: gfmul and generateHtbl both end in ret(0), so these
  // bodies are only reached via the call(...) sites above.
  bind(GHASH_LOOP);
  gfmul(tmp0, state);
  bind(GENERATE_HTBL);
  generateHtbl(htbl);

  bind(EXIT_GHASH);
  // Zero out xmm registers that held key-derived data (state, data blocks,
  // and the power of H loaded from the hash table) so it does not leak.
  vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
  vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
  vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
  vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}