/*
 * Copyright (c) 2018, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

// Multiply 128 x 128 bits, using 4 pclmulqdq operations, accumulating the
// partial products into tmp0 (low), tmp1 (high) and tmp2 (middle)
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
    XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
  movdqu(xmm15, Address(htbl, i * 16));
  vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
  vpclmulldq(tmp3, data, xmm15); // 0x00
  vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
  vpclmulhdq(tmp3, data, xmm15); // 0x11
  vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
  vpclmullqhqdq(tmp3, data, xmm15); // 0x10
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
}

// Multiply two 128 bit numbers, resulting in a 256 bit value.
// The result of the multiplication, followed by reduction, is stored in state.
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
  const XMMRegister tmp1 = xmm4;
  const XMMRegister tmp2 = xmm5;
  const XMMRegister tmp3 = xmm6;
  const XMMRegister tmp4 = xmm7;

  vpclmulldq(tmp1, state, tmp0);    // 0x00 (a0 * b0)
  vpclmulhdq(tmp4, state, tmp0);    // 0x11 (a1 * b1)
  vpclmullqhqdq(tmp2, state, tmp0); // 0x10 (a1 * b0)
  vpclmulhqlqdq(tmp3, state, tmp0); // 0x01 (a0 * b1)

  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

  vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
  vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
  vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of the carry-less multiplication

  // Follows the Shift-XOR reduction described in Gueron-Kounavis, May 2010
  // First phase of the reduction
  vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift by 31
  vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift by 30
  vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift by 25
  // xor the shifted versions
  vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
  vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
  vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
  vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete

  // Second phase of the reduction
  vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);  // packed right shift by 1
  vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit); // packed right shift by 2
  vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit); // packed right shift by 7
  vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); // xor the shifted versions
  vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
  vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
  vpxor(state, tmp4, tmp1, Assembler::AVX_128bit); // the result is in state
  ret(0);
}
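// For intuition, a hedged scalar sketch (illustrative only, not compiled into
// the VM) of the 64 x 64 -> 128 bit carry-less multiply that each vpclmulqdq
// above performs; the helper name clmul64 is ours:
//
//   static void clmul64(uint64_t a, uint64_t b, uint64_t* lo, uint64_t* hi) {
//     uint64_t l = 0, h = 0;
//     for (int i = 0; i < 64; i++) {
//       if ((b >> i) & 1) {                 // for every set bit of b ...
//         l ^= a << i;                      // ... XOR in a shifted copy of a
//         if (i != 0) h ^= a >> (64 - i);   // XOR replaces add, so no carries
//       }
//     }
//     *lo = l;
//     *hi = h;
//   }
//
// gfmul() above combines four such 64-bit products schoolbook-style into a
// 256-bit product held in tmp4:tmp1, then folds it back to 128 bits with the
// shift-XOR sequence.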
// This method takes the subkey after expansion as input and generates the
// first power of the subkey H, stored at htbl[1 * 16].
// This power of H is used in the reduction step of single-block GHASH.
void MacroAssembler::generateHtbl_one_block(Register htbl) {
  const XMMRegister t = xmm13;
  const XMMRegister tmp0 = xmm1;

  // load the original subkey hash
  movdqu(t, Address(htbl, 0));
  // shuffle using the long swap mask
  movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
  vpshufb(t, t, xmm10, Assembler::AVX_128bit);
  // save the shuffled subkey as the first htbl entry
  movdqu(Address(htbl, 0 * 16), t);
  movdqu(tmp0, t);

  // Compute H' = GFMUL(H, 2)
  vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
  movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
  vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
  movl(rax, 0xff00);
  movdl(xmm4, rax);
  vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
  movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
  vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
  vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
  vpslld(xmm4, t, 1, Assembler::AVX_128bit);
  vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
  vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds H << 1, i.e. H * 2

  // XOR the (conditionally masked) reduction polynomial in xmm5 into H << 1
  vpxor(t, t, xmm5, Assembler::AVX_128bit);
  movdqu(Address(htbl, 1 * 16), t); // H * 2

  ret(0);
}
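// What the block above computes, as a hedged scalar sketch (illustrative
// only; gf128_double is our name, and the constant below is the reduction
// term for the conventional bit order, while the stub works on byte-shuffled
// data and loads its own constant from ghash_polynomial_addr()):
//
//   static void gf128_double(uint64_t h[2]) {  // h[1] = high, h[0] = low
//     uint64_t carry = h[1] >> 63;             // bit shifted out at the top
//     h[1] = (h[1] << 1) | (h[0] >> 63);       // 128-bit left shift by one
//     h[0] = h[0] << 1;
//     if (carry) h[0] ^= 0x87;                 // x^128 = x^7 + x^2 + x + 1 (mod g)
//   }
//
// The vpsrld/vpslld/vpslldq triple above builds the same 128-bit left shift
// out of 32-bit lane shifts, and the 0xff00 shuffle builds a mask that
// selects the polynomial only when the top bit of H is set, with no branch.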
// This method takes the subkey after expansion as input and generates the
// remaining powers of the subkey H (H^2 through H^8).
// These powers of H are used in the reduction step of eight-block GHASH.
void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
  const XMMRegister t = xmm13;
  const XMMRegister tmp0 = xmm1;
  Label GFMUL;

  movdqu(t, Address(htbl, 1 * 16));
  movdqu(tmp0, t);

  // tmp0 and t hold H. Now we compute the powers of H by using GFMUL(H, H)
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 2 * 16), t); // H^2 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 3 * 16), t); // H^3 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 4 * 16), t); // H^4 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 5 * 16), t); // H^5 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 6 * 16), t); // H^6 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 7 * 16), t); // H^7 * 2
  call(GFMUL, relocInfo::none);
  movdqu(Address(htbl, 8 * 16), t); // H^8 * 2
  ret(0);

  bind(GFMUL);
  gfmul(tmp0, t);
}
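// Why the powers are needed: GHASH is the chained product
//   Y_i = (Y_{i-1} ^ X_i) * H
// Unrolling eight steps, and using the fact that XOR distributes over the
// carry-less multiply, gives
//   Y_8 = (Y_0 ^ X_1) * H^8 ^ X_2 * H^7 ^ ... ^ X_8 * H
// so eight blocks can be multiplied independently (block #0 paired with H^8,
// block #7 with H) and XOR-summed before a single shift-XOR reduction, which
// is exactly what the eight-block loop below does via schoolbookAAD().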
// Multi-block and single-block GHASH computation using the Shift-XOR reduction technique
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
                               Register input_data, Register blocks) {

  // temporary variables to hold input data and input state
  const XMMRegister data = xmm1;
  const XMMRegister state = xmm0;
  // temporary variables to hold intermediate results
  const XMMRegister tmp0 = xmm3;
  const XMMRegister tmp1 = xmm4;
  const XMMRegister tmp2 = xmm5;
  const XMMRegister tmp3 = xmm6;
  // temporary variables to hold byte and long swap masks
  const XMMRegister bswap_mask = xmm2;
  const XMMRegister lswap_mask = xmm14;

  Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
        ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;

  testptr(blocks, blocks);
  jcc(Assembler::zero, EXIT_GHASH);

  // Check if the hashtable entry (1 * 16) has already been generated.
  // For anything less than 8 blocks, we generate only the first power of H.
  movdqu(tmp2, Address(htbl, 1 * 16));
  ptest(tmp2, tmp2);
  jcc(Assembler::notZero, BEGIN_PROCESS);
  call(GENERATE_HTBL_1_BLK, relocInfo::none);

  // Shuffle the input state
  bind(BEGIN_PROCESS);
  movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
  movdqu(state, Address(input_state, 0));
  vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);

  cmpl(blocks, 8);
  jcc(Assembler::below, ONE_BLK_INIT);
  // If we have 8 blocks or more of data, generate the remaining powers of H
  movdqu(tmp2, Address(htbl, 2 * 16));
  ptest(tmp2, tmp2);
  jcc(Assembler::notZero, PROCESS_8_BLOCKS);
  call(GENERATE_HTBL_8_BLKS, relocInfo::none);

  // Do 8 multiplies followed by a reduction, processing 8 blocks of data at a time.
  // Each block = 16 bytes.
  bind(PROCESS_8_BLOCKS);
  subl(blocks, 8);
  movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
  movdqu(data, Address(input_data, 16 * 7));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Load H * 2 from htbl[1 * 16]; the calculated powers of H start at that offset.
  movdqu(xmm15, Address(htbl, 1 * 16));
  // Perform carry-less multiplication of (H * 2, data block #7)
  vpclmulhqlqdq(tmp2, data, xmm15); // a0 * b1
  vpclmulldq(tmp0, data, xmm15);    // a0 * b0
  vpclmulhdq(tmp1, data, xmm15);    // a1 * b1
  vpclmullqhqdq(tmp3, data, xmm15); // a1 * b0
  vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

  movdqu(data, Address(input_data, 16 * 6));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^2 * 2, data block #6)
  schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 5));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^3 * 2, data block #5)
  schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 4));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^4 * 2, data block #4)
  schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 3));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^5 * 2, data block #3)
  schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 2));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^6 * 2, data block #2)
  schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 1));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^7 * 2, data block #1)
  schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);

  movdqu(data, Address(input_data, 16 * 0));
  // XOR data block #0 with the input state before performing carry-less multiplication
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  vpxor(data, data, state, Assembler::AVX_128bit);
  // Perform carry-less multiplication of (H^8 * 2, data block #0)
  schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);

  vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
  vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
  vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // tmp0, tmp1 contain the aggregated results
  vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit); // of the multiplication operation

  // We now have the two 128-bit halves of the accumulated multiplication result
  // in tmp0:tmp1, with the higher 128 bits in tmp1 and the lower 128 bits in tmp0.
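  // A note on the reduction constants below (hedged, following the
  // Gueron-Kounavis white paper): the GHASH reduction polynomial is
  //   g(x) = x^128 + x^7 + x^2 + x + 1,
  // and folding the upper 128 bits of a 256-bit product back below x^128
  // means XORing in copies of it shifted by the x, x^2 and x^7 terms. With
  // the operand held as four 32-bit lanes, those terms show up as the right
  // shifts by 1, 2 and 7 in the second phase, and as their 32-bit
  // complements (left shifts by 31, 30 and 25) in the first phase.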
  // Follows the Shift-XOR reduction described in Gueron-Kounavis, May 2010
  bind(BLOCK8_REDUCTION);
  // First phase of the reduction
  vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift by 31
  vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift by 30
  vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift by 25
  // xor the shifted versions
  vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
  vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);

  vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
  vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);

  vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete

  // Second phase of the reduction
  vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);  // packed right shift by 1
  vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed right shift by 2
  vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);  // packed right shift by 7
  // xor the shifted versions
  vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
  vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
  vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
  // The final result is in state
  vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);

  lea(input_data, Address(input_data, 16 * 8));
  cmpl(blocks, 8);
  jcc(Assembler::below, ONE_BLK_INIT);
  jmp(PROCESS_8_BLOCKS);

  // Since this is a one-block operation, we use only H * 2, i.e. the first power of H
  bind(ONE_BLK_INIT);
  movdqu(tmp0, Address(htbl, 1 * 16));
  movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

  // Do one (128 bit x 128 bit) carry-less multiplication at a time, followed by a reduction.
  bind(PROCESS_1_BLOCK);
  cmpl(blocks, 0);
  jcc(Assembler::equal, SAVE_STATE);
  subl(blocks, 1);
  movdqu(data, Address(input_data, 0));
  vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
  vpxor(state, state, data, Assembler::AVX_128bit);
  // gfmul(H * 2, state)
  call(GFMUL, relocInfo::none);
  addptr(input_data, 16);
  jmp(PROCESS_1_BLOCK);

  bind(SAVE_STATE);
  vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
  movdqu(Address(input_state, 0), state);
  jmp(EXIT_GHASH);

  bind(GFMUL);
  gfmul(tmp0, state);

  bind(GENERATE_HTBL_1_BLK);
  generateHtbl_one_block(htbl);

  bind(GENERATE_HTBL_8_BLKS);
  generateHtbl_eight_blocks(htbl);

  bind(EXIT_GHASH);
  // zero out the xmm registers used for Htbl entries and sensitive state
  vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
  vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
  vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
  vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
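// End-to-end reference (hedged): a scalar model of one gfmul() step, i.e. a
// schoolbook carry-less multiply followed by reduction modulo
// g(x) = x^128 + x^7 + x^2 + x + 1, written in the conventional bit order.
// It reuses the clmul64() sketch above; the stub's byte shuffles and the
// "H * 2" trick exist precisely so the SIMD code can avoid the per-bit
// reflection that GHASH's specification would otherwise require.
//
//   static void gf128_mul(const uint64_t a[2], const uint64_t b[2], uint64_t r[2]) {
//     uint64_t p[4] = {0, 0, 0, 0};  // 256-bit carry-less product, p[0] lowest
//     uint64_t lo, hi;
//     clmul64(a[0], b[0], &lo, &hi); p[0] ^= lo; p[1] ^= hi; // imm 0x00: a0 * b0
//     clmul64(a[0], b[1], &lo, &hi); p[1] ^= lo; p[2] ^= hi; // imm 0x01: a0 * b1
//     clmul64(a[1], b[0], &lo, &hi); p[1] ^= lo; p[2] ^= hi; // imm 0x10: a1 * b0
//     clmul64(a[1], b[1], &lo, &hi); p[2] ^= lo; p[3] ^= hi; // imm 0x11: a1 * b1
//     // Fold the upper 128 bits down, highest word first, using
//     // x^128 = x^7 + x^2 + x + 1 (mod g)
//     for (int i = 1; i >= 0; i--) {
//       uint64_t h = p[i + 2];
//       p[i]     ^= h ^ (h << 1) ^ (h << 2) ^ (h << 7);
//       p[i + 1] ^= (h >> 63) ^ (h >> 62) ^ (h >> 57);
//     }
//     r[0] = p[0];
//     r[1] = p[1];
//   }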