src/cpu/x86/vm/macroAssembler_x86_sha.cpp

  12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13 * version 2 for more details (a copy is included in the LICENSE file that
  14 * accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License version
  17 * 2 along with this work; if not, write to the Free Software Foundation,
  18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19 *
  20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21 * or visit www.oracle.com if you need additional information or have any
  22 * questions.
  23 *
  24 */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "runtime/stubRoutines.hpp"
  30 #include "macroAssembler_x86.hpp"
  31 
  32 #ifdef _LP64
  33 /*
  34   The algorithm below is based on Intel publication:
  35   "Fast SHA-256 Implementations on IntelĀ® Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
  36   The assembly code was originally provided by Sean Gulley and in many places preserves
  37   the original assembly NAMES and comments to simplify matching Java assembly with its original.
  38   The Java version was substantially redesigned to replace 1200 assembly instruction with
  39   much shorter run-time generator of the same code in memory.
  40 */
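// For reference only (a sketch, not code that is compiled or emitted here): the
// interleaved assembly below implements the standard FIPS 180-4 SHA-256 round.
// In C, with 32-bit words and an assumed rotr() rotate-right helper, one round is:
//
//   uint32_t S1  = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25);   // "S1" in the comments below
//   uint32_t ch  = (e & f) ^ (~e & g);                       // "CH"  (coded as ((f ^ g) & e) ^ g)
//   uint32_t t1  = h + S1 + ch + K[i] + W[i];
//   uint32_t S0  = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22);   // "S0"
//   uint32_t maj = (a & b) ^ (a & c) ^ (b & c);              // "MAJ" (coded as ((a | c) & b) | (a & c))
//   h = g; g = f; f = e; e = d + t1;
//   d = c; c = b; b = a; a = t1 + S0 + maj;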
  41 
  42 void MacroAssembler::sha256_AVX2_one_round_compute(
  43     Register  reg_old_h,
  44     Register  reg_a,
  45     Register  reg_b,
  46     Register  reg_c,
  47     Register  reg_d,
  48     Register  reg_e,
  49     Register  reg_f,
  50     Register  reg_g,
  51     Register  reg_h,
  52     int iter) {
  53   const Register& reg_y0     = r13;
  54   const Register& reg_y1     = r14;
  55   const Register& reg_y2     = r15;
  56   const Register& reg_y3     = rcx;
  57   const Register& reg_T1     = r12;
  58   //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
  59   if (iter%4 > 0) {
  60     addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  61   }
  62   movl(reg_y2, reg_f);         // reg_y2 = reg_f                                ; CH
  63   rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25   ; S1A
  64   rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11    ; S1B
  65   xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g                              ; CH
  66 
  67   xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11)  ; S1
  68   rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6)    ; S1
  69   andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e                          ; CH
  70 
  71   if (iter%4 > 0) {
  72     addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ                     ; --
  73   }
  74 
  75   xorl(reg_y0, reg_y1);       // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  76   rorxd(reg_T1, reg_a, 13);   // reg_T1 = reg_a >> 13    ; S0B
  77   xorl(reg_y2, reg_g);        // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
  78   rorxd(reg_y1, reg_a, 22);   // reg_y1 = reg_a >> 22    ; S0A
  79   movl(reg_y3, reg_a);        // reg_y3 = reg_a                                ; MAJA
  80 
  81   xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13)  ; S0
  82   rorxd(reg_T1, reg_a, 2);    // reg_T1 = (reg_a >> 2)    ; S0
  83   addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
  84   orl(reg_y3, reg_c);         // reg_y3 = reg_a|reg_c                              ; MAJA
  85 
  86   xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  87   movl(reg_T1, reg_a);        // reg_T1 = reg_a                                ; MAJB
  88   andl(reg_y3, reg_b);        // reg_y3 = (reg_a|reg_c)&reg_b                          ; MAJA
  89   andl(reg_T1, reg_c);        // reg_T1 = reg_a&reg_c                              ; MAJB
  90   addl(reg_y2, reg_y0);       // reg_y2 = S1 + CH                          ; --
  91 
  92 
  93   addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
  94   orl(reg_y3, reg_T1);        // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)            ; MAJ
  95   addl(reg_h, reg_y1);        // reg_h = k + w + reg_h + S0                    ; --
  96 
  97   addl(reg_d, reg_y2);        // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --
  98 
  99 
 100   if (iter%4 == 3) {
 101     addl(reg_h, reg_y2);      // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
 102     addl(reg_h, reg_y3);      // reg_h = t1 + S0 + MAJ                     ; --
 103   }
 104 }
 105 
 106 void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
 107     sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi,  r8,  r9, r10, r11, start + 0);
 108     sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi,  r8,  r9, r10, start + 1);
 109     sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi,  r8,  r9, start + 2);
 110     sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi,  r8, start + 3);
 111 }
 112 
 113 void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
 114     sha256_AVX2_one_round_compute(r8,  r8,   r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
 115     sha256_AVX2_one_round_compute(rsi, rsi,  r8,  r9, r10, r11, rax, rbx, rdi, start + 1);
 116     sha256_AVX2_one_round_compute(rdi, rdi, rsi,  r8,  r9, r10, r11, rax, rbx, start + 2);
 117     sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi,  r8,  r9, r10, r11, rax, start + 3);
 118 }
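// Note on register use in the two helpers above: each call to sha256_AVX2_one_round_compute
// shifts the role of the eight working registers right by one (the register that held 'a'
// holds 'b' in the next round, and so on), so the state is never physically moved between
// registers. Four rounds of *_compute_first end exactly where *_compute_last begins
// (a..h = r8, r9, r10, r11, rax, rbx, rdi, rsi), and after eight rounds the assignment is
// back to a..h = rax, rbx, rdi, rsi, r8, r9, r10, r11.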
 119 
 120 void MacroAssembler::sha256_AVX2_one_round_and_sched(
 121         XMMRegister  xmm_0,     /* == ymm4 for iterations 0..3, then the four registers rotate left at iterations 4, 8, 12 */
 122         XMMRegister  xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
 123         XMMRegister  xmm_2,     /* ymm6 */
 124         XMMRegister  xmm_3,     /* ymm7 */
 125         Register  reg_a,        /* == rax on iteration 0, then the eight registers rotate right on each subsequent iteration */
 126         Register  reg_b,        /* rbx */    /* full cycle is 8 iterations */
 127         Register  reg_c,        /* rdi */
 128         Register  reg_d,        /* rsi */
 129         Register  reg_e,        /* r8 */
 130         Register  reg_f,        /* r9d */
 131         Register  reg_g,        /* r10d */
 132         Register  reg_h,        /* r11d */
 133         int iter)
 134 {
 135   movl(rcx, reg_a);           // rcx = reg_a               ; MAJA
 136   rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25    ; S1A
 137   rorxd(r14, reg_e, 11);      //  r14 = reg_e >> 11    ; S1B
 138   addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
 139   orl(rcx, reg_c);            // rcx = reg_a|reg_c          ; MAJA
 140 
 141   movl(r15, reg_f);           // r15 = reg_f               ; CH
 142   rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13      ; S0B
 143   xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11)  ; S1
 144   xorl(r15, reg_g);           // r15 = reg_f^reg_g         ; CH
 145 
 146   rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6)    ; S1
 147   andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e ; CH
 148 
 149   xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
 150   rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22    ; S0A
 151   addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
 152 
 153   andl(rcx, reg_b);          // rcx = (reg_a|reg_c)&reg_b                          ; MAJA
 154   xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13)  ; S0
 155 
 156   rorxd(r12, reg_a, 2);      // r12 = (reg_a >> 2)    ; S0
 157   xorl(r15, reg_g);          // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
 158 
 159   xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
 160   movl(r12, reg_a);          // r12 = reg_a                                ; MAJB
 161   andl(r12, reg_c);          // r12 = reg_a&reg_c                              ; MAJB
 162   addl(r15, r13);            // r15 = S1 + CH                          ; --
 163 
 164   orl(rcx, r12);             // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)            ; MAJ
 165   addl(reg_h, r14);          // reg_h = k + w + reg_h + S0                    ; --
 166   addl(reg_d, r15);          // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --
 167 
 168   addl(reg_h, r15);          // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
 169   addl(reg_h, rcx);          // reg_h = t1 + S0 + MAJ                     ; --
 170 
 171   if (iter%4 == 0) {
 172     vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
 173     vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6)     ; S1
 174     vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
 175     vpsrld(xmm2, xmm1, 7, AVX_256bit);
 176     vpslld(xmm3, xmm1, 32-7, AVX_256bit);
 177     vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
 178     vpsrld(xmm2, xmm1,18, AVX_256bit);
 179   } else if (iter%4 == 1 ) {
 180     vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
 181     vpslld(xmm1, xmm1, 32-18, AVX_256bit);
 182     vpxor(xmm3, xmm3, xmm1, AVX_256bit);
 183     vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
 184     vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
 185     vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
 186     vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
 187     vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
 188   } else if (iter%4 == 2) {
 189     vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
 190     vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
 191     vpxor(xmm2, xmm2, xmm3, AVX_256bit);
 192     vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
 193     vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
 194     vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
 195     vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
 196   } else if (iter%4 == 3) {
 197     vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
 198     vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
 199     vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
 200     vpxor(xmm2, xmm2, xmm3, AVX_256bit);
 201     vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
 202     vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
 203     vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
 204   }
 205 }
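// For reference only (a sketch of the message schedule that the four iter%4 cases above
// compute, one quarter per round; rotr() is an assumed 32-bit rotate-right helper and W[]
// the 64-entry per-block schedule):
//
//   s0   = rotr(W[i-15], 7)  ^ rotr(W[i-15], 18) ^ (W[i-15] >> 3);
//   s1   = rotr(W[i-2], 17)  ^ rotr(W[i-2], 19)  ^ (W[i-2]  >> 10);
//   W[i] = W[i-16] + s0 + W[i-7] + s1;
//
// Four new W values are produced per group of four rounds, for the two blocks processed in
// parallel (low and high 128-bit lanes of the ymm registers).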
 206 
 207 void MacroAssembler::addm(int disp, Register r1, Register r2) {
 208   addl(r2, Address(r1, disp));
 209   movl(Address(r1, disp), r2);
 210 }
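// addm() folds a working-register value back into the in-memory digest word:
// roughly "*(int*)(r1 + disp) += r2" in C, leaving r2 holding the updated value.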
 211 
 212 void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
 213   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
 214   Register buf, Register state, Register ofs, Register limit, Register rsp,
 215   bool multi_block, XMMRegister shuf_mask) {
 216 
 217   Label loop0, loop1, loop2, loop3,
 218         last_block_enter, do_last_block, only_one_block, done_hash,
 219         compute_size, compute_size_end,
 220         compute_size1, compute_size_end1;
 221 
 222   address K256_W = StubRoutines::x86::k256_W_addr();
 223   address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
 224   address pshuffle_byte_flip_mask_addr = 0;
 225 
 226 const XMMRegister& SHUF_00BA        = xmm10;    // ymm10: shuffle xBxA -> 00BA
 227 const XMMRegister& SHUF_DC00        = xmm12;    // ymm12: shuffle xDxC -> DC00
 228 const XMMRegister& BYTE_FLIP_MASK    = xmm13;   // ymm13
 229 
 230 const XMMRegister& X_BYTE_FLIP_MASK  = xmm13;   //XMM version of BYTE_FLIP_MASK
 231 
 232 const Register& NUM_BLKS = r8;   // 3rd arg
 233 const Register& CTX      = rdx;  // 2nd arg
 234 const Register& INP      = rcx;  // 1st arg
 235 
 236 const Register& c       = rdi;
 237 const Register& d       = rsi;
 238 const Register& e       = r8;    // clobbers NUM_BLKS
 239 const Register& y3       = rcx;  // clobbers INP
 240 
 241 const Register& TBL      = rbp;
 242 const Register& SRND    = CTX;   // SRND is same register as CTX
 243 
 244 const Register& a        = rax;
 245 const Register& b        = rbx;
 246 const Register& f        = r9;
 247 const Register& g        = r10;
 248 const Register& h        = r11;
 249 
 250 const Register& T1       = r12;
 251 const Register& y0       = r13;
 252 const Register& y1       = r14;
 253 const Register& y2       = r15;
 254 
 255 
 256 enum {
 257   _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
 258 #ifndef _WIN64
 259   _XMM_SAVE_SIZE = 0,
 260 #else
 261   _XMM_SAVE_SIZE = 8*16,
 262 #endif
 263   _INP_END_SIZE = 8,
 264   _INP_SIZE = 8,
 265   _CTX_SIZE = 8,
 266   _RSP_SIZE = 8,
 267 
 268   _XFER = 0,
 269   _XMM_SAVE  = _XFER     + _XFER_SIZE,
 270   _INP_END  = _XMM_SAVE + _XMM_SAVE_SIZE,
 271   _INP     = _INP_END  + _INP_END_SIZE,
 272   _CTX     = _INP      + _INP_SIZE,
 273   _RSP     = _CTX      + _CTX_SIZE,
 274   STACK_SIZE = _RSP      + _RSP_SIZE,
 275 };
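  // A sketch of the frame built below (offsets grow upward from the aligned rsp):
  // _XFER caches the pre-added K[t] + W[t] values for the two blocks processed per pass
  // (64 rounds * 4 bytes, twice), which loop3 replays for the second block without
  // re-scheduling; _RSP holds the caller's rsp so it can be restored after the 32-byte
  // alignment applied below.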
 276 
 277 #ifndef _WIN64
  278   push(rcx);    // linux: this is limit, we need it again at the end
 279   push(rdx);    // linux: this is ofs
 280 #else
 281   push(r8);     // win64: this is ofs
  282   push(r9);     // win64: this is limit, we need them again at the very end
 283 #endif
 284 
 285 
 286   push(rbx);
 287 #ifdef _WIN64
 288   push(rsi);
 289   push(rdi);
 290 #endif
 291   push(rbp);
 292   push(r12);
 293   push(r13);
 294   push(r14);
 295   push(r15);
 296 
 297   movq(rax, rsp);
 298   subq(rsp, STACK_SIZE);
 299   andq(rsp, -32);
 300   movq(Address(rsp, _RSP), rax);
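  // rax still holds the caller's rsp here; it is stashed at _RSP so the frame can be
  // unwound at done_hash regardless of the 32-byte alignment just applied to rsp.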
 301 
 302 #ifndef _WIN64
  303   // copy the linux params into the win64 param registers so that the rest of the code is the same for both
 304   movq(r9,  rcx);
 305   movq(r8,  rdx);
 306   movq(rdx, rsi);
 307   movq(rcx, rdi);
 308 #endif
 309 
 310   // setting original assembly ABI
  311   /** message to hash in INP */
 312   lea(INP, Address(rcx, 0));    // rcx == message (buf)     ;; linux: INP = buf = rdi
 313   /** digest in CTX             */
 314   movq(CTX, rdx);               // rdx = digest  (state)    ;; linux: CTX = state = rsi
 315 
  316   /** NUM_BLKS is the length of the message in bytes; it must be derived from ofs and limit  */
 317   if (multi_block) {
 318 
 319     // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
 320     // on entry r8 = ofs
 321     // on exit  r8 = NUM_BLKS
 322 
 323     xorq(rax, rax);
 324 
 325     bind(compute_size);
 326     cmpptr(r8, r9); // assume the original ofs <= limit ;; linux:  cmp rcx, rdx
 327     jccb(Assembler::aboveEqual, compute_size_end);
 328     addq(r8, 64);                                          //;; linux: ofs = rdx
 329     addq(rax, 64);
 330     jmpb(compute_size);
 331 
 332     bind(compute_size_end);
 333     movq(NUM_BLKS, rax);  // NUM_BLK (r8)                  ;; linux: NUM_BLK = rdx
 334 
 335     cmpq(NUM_BLKS, 0);
 336     jcc(Assembler::equal, done_hash);
 337 
  338   } else {
  339     xorq(NUM_BLKS, NUM_BLKS);
  340     addq(NUM_BLKS, 64);
  341   } //if (!multi_block)
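  // In rough C terms (a sketch; r8/r9 hold ofs/limit after the parameter shuffle above, and
  // ofs <= limit is assumed, as noted at compute_size; despite the name, NUM_BLKS is a byte count):
  //
  //   NUM_BLKS = 0;
  //   if (multi_block) {
  //     while (ofs < limit) { ofs += 64; NUM_BLKS += 64; }   // bytes in whole 64-byte blocks
  //   } else {
  //     NUM_BLKS = 64;                                       // exactly one block
  //   }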
 342 
 343   lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
 344   movq(Address(rsp, _INP_END), NUM_BLKS);  //
 345 
 346   cmpptr(INP, NUM_BLKS);                   //cmp INP, NUM_BLKS
 347   jcc(Assembler::equal, only_one_block);   //je only_one_block
 348 
 349   // load initial digest
 350   movl(a, Address(CTX, 4*0));
 351   movl(b, Address(CTX, 4*1));
 352   movl(c, Address(CTX, 4*2));
 353   movl(d, Address(CTX, 4*3));
 354   movl(e, Address(CTX, 4*4));
 355   movl(f, Address(CTX, 4*5));
 356   movl(g, Address(CTX, 4*6));
 357   movl(h, Address(CTX, 4*7));
 358 
 359   pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
 360   vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
 361   vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
 362   vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
 363 
 364   movq(Address(rsp, _CTX), CTX);           // store
 365 
 366 bind(loop0);
 367   lea(TBL, ExternalAddress(K256_W));
 368 
 369   // assume buffers not aligned
 370 
 371   // Load first 16 dwords from two blocks
 372   vmovdqu(xmm0, Address(INP, 0*32));
 373   vmovdqu(xmm1, Address(INP, 1*32));
 374   vmovdqu(xmm2, Address(INP, 2*32));
 375   vmovdqu(xmm3, Address(INP, 3*32));
 376 
 377   // byte swap data
 378   vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
 379   vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
 380   vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
 381   vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);
 382 
 383   // transpose data into high/low halves
 384   vperm2i128(xmm4, xmm0, xmm2, 0x20);
 385   vperm2i128(xmm5, xmm0, xmm2, 0x31);
 386   vperm2i128(xmm6, xmm1, xmm3, 0x20);
 387   vperm2i128(xmm7, xmm1, xmm3, 0x31);
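  // vperm2i128 with imm8 0x20 concatenates the low 128-bit lanes of its two sources and
  // 0x31 the high lanes, so ymm4..ymm7 each hold four message words of the first block in
  // their low lane and the corresponding four words of the second block in their high lane.
  // This is the layout the two-blocks-at-once scheduling in loop1 relies on.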
 388 
 389 bind(last_block_enter);
 390   addq(INP, 64);
 391   movq(Address(rsp, _INP), INP);
 392 
  393   //;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
 394   xorq(SRND, SRND);
 395 
 396 align(16);
 397 bind(loop1);
 398   vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
 399   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
 400   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 0);
 401   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 1);
 402   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  2);
 403   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  3);
 404 
 405   vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
 406   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
 407   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  8+0);
 408   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  8+1);
 409   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  8+2);
 410   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  8+3);
 411 
 412   vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
 413   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
 414   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 16+0);
 415   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 16+1);
 416   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  16+2);
 417   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  16+3);
 418 
 419   vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
 420   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);
 421 
 422   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  24+0);
 423   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  24+1);
 424   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  24+2);
 425   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  24+3);
 426 
 427   addq(SRND, 4*32);
 428   cmpq(SRND, 3 * 4*32);
 429   jcc(Assembler::below, loop1);
 430 
 431 bind(loop2);
 432   // Do last 16 rounds with no scheduling
 433   vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
 434   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
 435   sha256_AVX2_four_rounds_compute_first(0);
 436 
 437   vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
 438   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
 439   sha256_AVX2_four_rounds_compute_last(0 + 8);
 440 
 441   addq(SRND, 2*32);
 442 
 443   vmovdqu(xmm4, xmm6);
 444   vmovdqu(xmm5, xmm7);
 445 
 446   cmpq(SRND, 4 * 4*32);
 447   jcc(Assembler::below, loop2);
 448 
 449   movq(CTX, Address(rsp, _CTX));
 450   movq(INP, Address(rsp, _INP));
 451 
 452   addm(4*0, CTX, a);
 453   addm(4*1, CTX, b);
 454   addm(4*2, CTX, c);
 455   addm(4*3, CTX, d);
 456   addm(4*4, CTX, e);
 457   addm(4*5, CTX, f);
 458   addm(4*6, CTX, g);
 459   addm(4*7, CTX, h);
 460 
 461   cmpq(INP, Address(rsp, _INP_END));
 462   jcc(Assembler::above, done_hash);
 463 
 464   //Do second block using previously scheduled results
 465   xorq(SRND, SRND);
 466 align(16);
 467 bind(loop3);
 468   sha256_AVX2_four_rounds_compute_first(4);
 469   sha256_AVX2_four_rounds_compute_last(4+8);
 470 
 471   addq(SRND, 2*32);
 472   cmpq(SRND, 4 * 4*32);
 473   jcc(Assembler::below, loop3);
 474 
 475   movq(CTX, Address(rsp, _CTX));
 476   movq(INP, Address(rsp, _INP));
 477   addq(INP, 64);
 478 
 479   addm(4*0, CTX, a);
 480   addm(4*1, CTX, b);
 481   addm(4*2, CTX, c);
 482   addm(4*3, CTX, d);
 483   addm(4*4, CTX, e);
 484   addm(4*5, CTX, f);
 485   addm(4*6, CTX, g);
 486   addm(4*7, CTX, h);
 487 
 488   cmpq(INP, Address(rsp, _INP_END));
 489   jcc(Assembler::below, loop0);
 490   jccb(Assembler::above, done_hash);
 491 
 492 bind(do_last_block);
 493   lea(TBL, ExternalAddress(K256_W));
 494 
 495   movdqu(xmm4, Address(INP, 0*16));
 496   movdqu(xmm5, Address(INP, 1*16));
 497   movdqu(xmm6, Address(INP, 2*16));
 498   movdqu(xmm7, Address(INP, 3*16));
 499 
 500   vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
 501   vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
 502   vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
 503   vpshufb(xmm7, xmm7, xmm13, AVX_128bit);
 504 
 505   jmp(last_block_enter);
 506 
 507 bind(only_one_block);
 508 
  509   // load initial digest ;; for a fresh digest the state holds the standard initial values shown below
 510   movl(a, Address(CTX, 4*0));   // 0x6a09e667
 511   movl(b, Address(CTX, 4*1));   // 0xbb67ae85
 512   movl(c, Address(CTX, 4*2));   // 0x3c6ef372
 513   movl(d, Address(CTX, 4*3));   // 0xa54ff53a
 514   movl(e, Address(CTX, 4*4));   // 0x510e527f
 515   movl(f, Address(CTX, 4*5));   // 0x9b05688c
 516   movl(g, Address(CTX, 4*6));   // 0x1f83d9ab
 517   movl(h, Address(CTX, 4*7));   // 0x5be0cd19
 518 
 519 
 520   pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
 521   vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
 522   vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
 523   vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
 524 
 525   movq(Address(rsp, _CTX), CTX);
 526   jmpb(do_last_block);
 527 
 528 bind(done_hash);
 529 
 530   movq(rsp, Address(rsp, _RSP));
 531 
 532   pop(r15);
 533   pop(r14);
 534   pop(r13);
 535   pop(r12);
 536   pop(rbp);
 537 #ifdef _WIN64
 538   pop(rdi);
 539   pop(rsi);
 540 #endif
 541   pop(rbx);
 542 
 543 #ifdef _WIN64
 544   pop(r9);
 545   pop(r8);
 546 #else
 547   pop(rdx);
 548   pop(rcx);
 549 #endif
 550 
 551   if (multi_block) {
 552 #ifdef _WIN64
 553 const Register& limit_end = r9;
 554 const Register& ofs_end   = r8;
 555 #else
 556 const Register& limit_end = rcx;
 557 const Register& ofs_end   = rdx;
 558 #endif
 559     movq(rax, ofs_end);
 560 
 561 bind(compute_size1);
 562     cmpptr(rax, limit_end); // assume the original ofs <= limit
 563     jccb(Assembler::aboveEqual, compute_size_end1);
 564     addq(rax, 64);
 565     jmpb(compute_size1);
 566 
 567 bind(compute_size_end1);
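    // rax now holds the updated ofs (the original ofs advanced in 64-byte steps until it
    // reaches limit); the multi-block stub is expected to return this value as the new ofs
    // for DigestBase.implCompressMultiBlock.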
 568   }
 569 }
 570 #endif //#ifdef _LP64
 571 
 572 // ofs and limit are used for multi-block byte array.
 573 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
 574 void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
 575   XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
 576   Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {
 577 
 578   Label start, done_hash, loop0;
 579 
 580   address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
 581   address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();
 582 
 583   bind(start);
 584   movdqu(abcd, Address(state, 0));
 585   pinsrd(e0, Address(state, 16), 3);
 586   movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
 587   pand(e0, shuf_mask);
 588   pshufd(abcd, abcd, 0x1B);
 589   movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f
 590 
 591   bind(loop0);

