/*
 * Copyright (c) 2018, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

// Multiply 128 x 128 bits, using 4 pclmulqdq operations, accumulating the
// partial products into tmp0 (low), tmp1 (high) and tmp2 (middle)
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
    XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
  movdqu(xmm15, Address(htbl, i * 16));
  vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
  vpclmulldq(tmp3, data, xmm15); // 0x00
  vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
  vpclmulhdq(tmp3, data, xmm15); // 0x11
  vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
  vpclmullqhqdq(tmp3, data, xmm15); // 0x10
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
}

// Multiply two 128 bit numbers, resulting in a 256 bit value.
// The result of the multiplication, followed by reduction, is stored in state.
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
  const XMMRegister tmp1 = xmm4;
  const XMMRegister tmp2 = xmm5;
  const XMMRegister tmp3 = xmm6;
  const XMMRegister tmp4 = xmm7;

  vpclmulldq(tmp1, state, tmp0);    // 0x00 (a0 * b0)
  vpclmulhdq(tmp4, state, tmp0);    // 0x11 (a1 * b1)
  vpclmullqhqdq(tmp2, state, tmp0); // 0x10 (a1 * b0)
  vpclmulhqlqdq(tmp3, state, tmp0); // 0x01 (a0 * b1)

  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

  vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
  vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
  vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of the carry-less multiplication

  // Follows the Shift-XOR reduction described in Gueron-Kounavis, May 2010
  // First phase of the reduction
  vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift by 31
  vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift by 30
  vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift by 25
  // xor the shifted versions
  vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
  vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
  vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
  vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete

  // Second phase of the reduction
  vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);  // packed right shift by 1
  vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit); // packed right shift by 2
  vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit); // packed right shift by 7
  vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); // xor the shifted versions
  vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
  vpxor(state, tmp4, tmp1, Assembler::AVX_128bit); // the result is in state
  ret(0);
}
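// For intuition, a hedged scalar sketch (illustrative only, not compiled into
// the VM) of the 64 x 64 -> 128 bit carry-less multiply that each vpclmulqdq
// above performs; the helper name clmul64 is ours:
//
//   static void clmul64(uint64_t a, uint64_t b, uint64_t* lo, uint64_t* hi) {
//     uint64_t l = 0, h = 0;
//     for (int i = 0; i < 64; i++) {
//       if ((b >> i) & 1) {                 // for every set bit of b ...
//         l ^= a << i;                      // ... XOR in a shifted copy of a
//         if (i != 0) h ^= a >> (64 - i);   // XOR replaces add, so no carries
//       }
//     }
//     *lo = l;
//     *hi = h;
//   }
//
// gfmul() above combines four such 64-bit products schoolbook-style into a
// 256-bit product held in tmp4:tmp1, then folds it back to 128 bits with the
// shift-XOR sequence.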
// This method takes the subkey after expansion as input and generates the
// first power of the subkey H, stored at htbl[1 * 16].
// This power of H is used in the reduction step of single-block GHASH.
void MacroAssembler::generateHtbl_one_block(Register htbl) {
  const XMMRegister t = xmm13;
  const XMMRegister tmp0 = xmm1;

  // load the original subkey hash
  movdqu(t, Address(htbl, 0));
  // shuffle using the long swap mask
  movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
  vpshufb(t, t, xmm10, Assembler::AVX_128bit);
  // save the shuffled subkey as the first htbl entry
  movdqu(Address(htbl, 0 * 16), t);
  movdqu(tmp0, t);

  // Compute H' = GFMUL(H, 2)
  vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
  movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
  vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
  movl(rax, 0xff00);
  movdl(xmm4, rax);
  vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
  movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
  vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
  vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
  vpslld(xmm4, t, 1, Assembler::AVX_128bit);
  vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
  vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds H << 1, i.e. H * 2

  // XOR the (conditionally masked) reduction polynomial in xmm5 into H << 1
  vpxor(t, t, xmm5, Assembler::AVX_128bit);
  movdqu(Address(htbl, 1 * 16), t); // H * 2

  ret(0);
}
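// What the block above computes, as a hedged scalar sketch (illustrative
// only; gf128_double is our name, and the constant below is the reduction
// term for the conventional bit order, while the stub works on byte-shuffled
// data and loads its own constant from ghash_polynomial_addr()):
//
//   static void gf128_double(uint64_t h[2]) {  // h[1] = high, h[0] = low
//     uint64_t carry = h[1] >> 63;             // bit shifted out at the top
//     h[1] = (h[1] << 1) | (h[0] >> 63);       // 128-bit left shift by one
//     h[0] = h[0] << 1;
//     if (carry) h[0] ^= 0x87;                 // x^128 = x^7 + x^2 + x + 1 (mod g)
//   }
//
// The vpsrld/vpslld/vpslldq triple above builds the same 128-bit left shift
// out of 32-bit lane shifts, and the 0xff00 shuffle builds a mask that
// selects the polynomial only when the top bit of H is set, with no branch.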
// This method takes the subkey after expansion as input and generates the
// remaining powers of the subkey H (H^2 through H^8).
// These powers of H are used in the reduction step of eight-block GHASH.
void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
  const XMMRegister t = xmm13;
  const XMMRegister tmp0 = xmm1;
  Label GFMUL;

  movdqu(t, Address(htbl, 1 * 16));
  movdqu(tmp0, t);

  // tmp0 and t hold H. Now we compute the powers of H by using GFMUL(H, H)
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 2 * 16), t); // H^2 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 3 * 16), t); // H^3 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 4 * 16), t); // H^4 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 5 * 16), t); // H^5 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 6 * 16), t); // H^6 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 7 * 16), t); // H^7 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 8 * 16), t); // H^8 * 2
  ret(0);

  bind(GFMUL);
  gfmul(tmp0, t);
}
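// Why the powers are needed: GHASH is the chained product
//   Y_i = (Y_{i-1} ^ X_i) * H
// Unrolling eight steps, and using the fact that XOR distributes over the
// carry-less multiply, gives
//   Y_8 = (Y_0 ^ X_1) * H^8 ^ X_2 * H^7 ^ ... ^ X_8 * H
// so eight blocks can be multiplied independently (block #0 paired with H^8,
// block #7 with H) and XOR-summed before a single shift-XOR reduction, which
// is exactly what the eight-block loop below does via schoolbookAAD().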
// Multi-block and single-block GHASH computation using the Shift-XOR reduction technique
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
                               Register input_data, Register blocks) {

  // temporary variables to hold input data and input state
  const XMMRegister data = xmm1;
  const XMMRegister state = xmm0;
  // temporary variables to hold intermediate results
  const XMMRegister tmp0 = xmm3;
  const XMMRegister tmp1 = xmm4;
  const XMMRegister tmp2 = xmm5;
  const XMMRegister tmp3 = xmm6;
  // temporary variables to hold byte and long swap masks
  const XMMRegister bswap_mask = xmm2;
  const XMMRegister lswap_mask = xmm14;

  Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
        ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;

  testptr(blocks, blocks);
  jcc(Assembler::zero, EXIT_GHASH);

  // Check if the hashtable entry (1 * 16) has already been generated.
  // For anything less than 8 blocks, we generate only the first power of H.
  movdqu(tmp2, Address(htbl, 1 * 16));
  ptest(tmp2, tmp2);
  jcc(Assembler::notZero, BEGIN_PROCESS);
  call(GENERATE_HTBL_1_BLK, relocInfo::none);

  // Shuffle the input state
  bind(BEGIN_PROCESS);
  movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
  movdqu(state, Address(input_state, 0));
  vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);

  cmpl(blocks, 8);
  jcc(Assembler::below, ONE_BLK_INIT);
  // If we have 8 blocks or more of data, generate the remaining powers of H
  movdqu(tmp2, Address(htbl, 2 * 16));
  ptest(tmp2, tmp2);
  jcc(Assembler::notZero, PROCESS_8_BLOCKS);
  call(GENERATE_HTBL_8_BLKS, relocInfo::none);

  // Do 8 multiplies followed by a reduction, processing 8 blocks of data at a time.
  // Each block = 16 bytes.
  bind(PROCESS_8_BLOCKS);
  subl(blocks, 8);
  movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
  movdqu(data, Address(input_data, 16 * 7));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Load H * 2 from htbl[1 * 16]; the calculated powers of H start at that offset.
  movdqu(xmm15, Address(htbl, 1 * 16));
  // Perform carry-less multiplication of (H * 2, data block #7)
  vpclmulhqlqdq(tmp2, data, xmm15); // a0 * b1
  vpclmulldq(tmp0, data, xmm15);    // a0 * b0
  vpclmulhdq(tmp1, data, xmm15);    // a1 * b1
  vpclmullqhqdq(tmp3, data, xmm15); // a1 * b0
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

  movdqu(data, Address(input_data, 16 * 6));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^2 * 2, data block #6)
  schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 5));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^3 * 2, data block #5)
  schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 4));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^4 * 2, data block #4)
  schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 3));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^5 * 2, data block #3)
  schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 2));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^6 * 2, data block #2)
  schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 1));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^7 * 2, data block #1)
  schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 0));
  // XOR data block #0 with the input state before performing carry-less multiplication
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  vpxor(data, data, state, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^8 * 2, data block #0)
  schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);

  vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
  vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
  vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // tmp0, tmp1 contain the aggregated results
  vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit); // of the multiplication operation

  // We now have the two 128-bit halves of the accumulated multiplication result
  // in tmp0:tmp1, with the higher 128 bits in tmp1 and the lower 128 bits in tmp0.
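  // A note on the reduction constants below (hedged, following the
  // Gueron-Kounavis white paper): the GHASH reduction polynomial is
  //   g(x) = x^128 + x^7 + x^2 + x + 1,
  // and folding the upper 128 bits of a 256-bit product back below x^128
  // means XORing in copies of it shifted by the x, x^2 and x^7 terms. With
  // the operand held as four 32-bit lanes, those terms show up as the right
  // shifts by 1, 2 and 7 in the second phase, and as their 32-bit
  // complements (left shifts by 31, 30 and 25) in the first phase.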
  // Follows the Shift-XOR reduction described in Gueron-Kounavis, May 2010
  bind(BLOCK8_REDUCTION);
  // First phase of the reduction
  vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift by 31
  vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift by 30
  vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift by 25
  // xor the shifted versions
  vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
  vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);

  vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
  vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);

  vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete

  // Second phase of the reduction
  vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);  // packed right shift by 1
  vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed right shift by 2
  vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);  // packed right shift by 7
  // xor the shifted versions
  vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
  vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
  // The final result is in state
  vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);

  lea(input_data, Address(input_data, 16 * 8));
  cmpl(blocks, 8);
  jcc(Assembler::below, ONE_BLK_INIT);
  jmp(PROCESS_8_BLOCKS);

  // Since this is a one-block operation, we use only H * 2, i.e. the first power of H
  bind(ONE_BLK_INIT);
  movdqu(tmp0, Address(htbl, 1 * 16));
  movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

  // Do one (128 bit x 128 bit) carry-less multiplication at a time, followed by a reduction.
  bind(PROCESS_1_BLOCK);
  cmpl(blocks, 0);
  jcc(Assembler::equal, SAVE_STATE);
  subl(blocks, 1);
  movdqu(data, Address(input_data, 0));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  vpxor(state, state, data, Assembler::AVX_128bit);
  // gfmul(H * 2, state)
  call(GFMUL, relocInfo::none);
  addptr(input_data, 16);
  jmp(PROCESS_1_BLOCK);

  bind(SAVE_STATE);
  vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
  movdqu(Address(input_state, 0), state);
  jmp(EXIT_GHASH);

  bind(GFMUL);
  gfmul(tmp0, state);

  bind(GENERATE_HTBL_1_BLK);
  generateHtbl_one_block(htbl);

  bind(GENERATE_HTBL_8_BLKS);
  generateHtbl_eight_blocks(htbl);

  bind(EXIT_GHASH);
  // zero out the xmm registers used for Htbl entries and sensitive state
  vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
  vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
  vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
  vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
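// End-to-end reference (hedged): a scalar model of one gfmul() step, i.e. a
// schoolbook carry-less multiply followed by reduction modulo
// g(x) = x^128 + x^7 + x^2 + x + 1, written in the conventional bit order.
// It reuses the clmul64() sketch above; the stub's byte shuffles and the
// "H * 2" trick exist precisely so the SIMD code can avoid the per-bit
// reflection that GHASH's specification would otherwise require.
//
//   static void gf128_mul(const uint64_t a[2], const uint64_t b[2], uint64_t r[2]) {
//     uint64_t p[4] = {0, 0, 0, 0};  // 256-bit carry-less product, p[0] lowest
//     uint64_t lo, hi;
//     clmul64(a[0], b[0], &lo, &hi); p[0] ^= lo; p[1] ^= hi; // imm 0x00: a0 * b0
//     clmul64(a[0], b[1], &lo, &hi); p[1] ^= lo; p[2] ^= hi; // imm 0x01: a0 * b1
//     clmul64(a[1], b[0], &lo, &hi); p[1] ^= lo; p[2] ^= hi; // imm 0x10: a1 * b0
//     clmul64(a[1], b[1], &lo, &hi); p[2] ^= lo; p[3] ^= hi; // imm 0x11: a1 * b1
//     // Fold the upper 128 bits down, highest word first, using
//     // x^128 = x^7 + x^2 + x + 1 (mod g)
//     for (int i = 1; i >= 0; i--) {
//       uint64_t h = p[i + 2];
//       p[i]     ^= h ^ (h << 1) ^ (h << 2) ^ (h << 7);
//       p[i + 1] ^= (h >> 63) ^ (h >> 62) ^ (h >> 57);
//     }
//     r[0] = p[0];
//     r[1] = p[1];
//   }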