/*
 * Copyright (c) 2016, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

#ifdef _LP64
/*
  The algorithm below is based on the Intel publication:
  "Fast SHA-256 Implementations on Intel® Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
  The assembly code was originally provided by Sean Gulley and in many places preserves
  the original assembly NAMES and comments to simplify matching the generated code with the original.
  The Java version was substantially redesigned to replace 1200 assembly instructions with
  a much shorter run-time generator of the same code in memory.
*/

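// Reference: one SHA-256 round (FIPS 180-4) computes
//   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
//   T2 = Sigma0(a) + Maj(a,b,c)
//   h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
// with Sigma0(a) = ROTR2(a) ^ ROTR13(a) ^ ROTR22(a) and
//      Sigma1(e) = ROTR6(e) ^ ROTR11(e) ^ ROTR25(e).
// The generators below use the equivalent forms
//   Ch(e,f,g)  = ((f ^ g) & e) ^ g
//   Maj(a,b,c) = ((a | c) & b) | (a & c)
// and defer the last two additions of a round into the start of the next one.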
void MacroAssembler::sha256_AVX2_one_round_compute(
    Register  reg_old_h,
    Register  reg_a,
    Register  reg_b,
    Register  reg_c,
    Register  reg_d,
    Register  reg_e,
    Register  reg_f,
    Register  reg_g,
    Register  reg_h,
    int iter) {
  const Register& reg_y0     = r13;
  const Register& reg_y1     = r14;
  const Register& reg_y2     = r15;
  const Register& reg_y3     = rcx;
  const Register& reg_T1     = r12;
  //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
  if (iter%4 > 0) {
    addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  }
  movl(reg_y2, reg_f);         // reg_y2 = reg_f                                ; CH
  rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25   ; S1A
  rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11    ; S1B
  xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g                              ; CH

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11)  ; S1
  rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6)    ; S1
  andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e                          ; CH

  if (iter%4 > 0) {
    addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ                     ; --
  }

  xorl(reg_y0, reg_y1);       // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(reg_T1, reg_a, 13);   // reg_T1 = reg_a >> 13    ; S0B
  xorl(reg_y2, reg_g);        // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
  rorxd(reg_y1, reg_a, 22);   // reg_y1 = reg_a >> 22    ; S0A
  movl(reg_y3, reg_a);        // reg_y3 = reg_a                                ; MAJA

  xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13)  ; S0
  rorxd(reg_T1, reg_a, 2);    // reg_T1 = (reg_a >> 2)    ; S0
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
  orl(reg_y3, reg_c);         // reg_y3 = reg_a|reg_c                              ; MAJA

  xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(reg_T1, reg_a);        // reg_T1 = reg_a                                ; MAJB
  andl(reg_y3, reg_b);        // reg_y3 = (reg_a|reg_c)&reg_b                          ; MAJA
  andl(reg_T1, reg_c);        // reg_T1 = reg_a&reg_c                              ; MAJB
  addl(reg_y2, reg_y0);       // reg_y2 = S1 + CH                          ; --


  addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
  orl(reg_y3, reg_T1);        // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)             ; MAJ
  addl(reg_h, reg_y1);        // reg_h = k + w + reg_h + S0                    ; --

  addl(reg_d, reg_y2);        // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --


  if (iter%4 == 3) {
    addl(reg_h, reg_y2);      // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
    addl(reg_h, reg_y3);      // reg_h = t1 + S0 + MAJ                     ; --
  }
}

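// The eight working variables rotate right through the fixed register set
// {rax, rbx, rdi, rsi, r8, r9, r10, r11}: after each round the register that
// held h receives the new a, so four unrolled rounds shift the assignment by
// four and eight rounds bring it back to the start. _first covers the rounds
// where a begins in rax, _last those where a begins in r8.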
void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
    sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi,  r8,  r9, r10, r11, start + 0);
    sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi,  r8,  r9, r10, start + 1);
    sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi,  r8,  r9, start + 2);
    sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi,  r8, start + 3);
}

void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
    sha256_AVX2_one_round_compute(r8,  r8,   r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
    sha256_AVX2_one_round_compute(rsi, rsi,  r8,  r9, r10, r11, rax, rbx, rdi, start + 1);
    sha256_AVX2_one_round_compute(rdi, rdi, rsi,  r8,  r9, r10, r11, rax, rbx, start + 2);
    sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi,  r8,  r9, r10, r11, rax, start + 3);
}

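// One scalar round as above, interleaved with a quarter of the vector message
// schedule: over four consecutive rounds the code computes
//   W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
// with s0(x) = ROTR7(x) ^ ROTR18(x) ^ SHR3(x) and
//      s1(x) = ROTR17(x) ^ ROTR19(x) ^ SHR10(x),
// producing four new W dwords per 128-bit lane in xmm_0.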
void MacroAssembler::sha256_AVX2_one_round_and_sched(
        XMMRegister  xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
        XMMRegister  xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
        XMMRegister  xmm_2,     /* ymm6 */
        XMMRegister  xmm_3,     /* ymm7 */
        Register  reg_a,        /* == rax on 0 iteration, then rotate 8 registers right on each next iteration */
        Register  reg_b,        /* rbx */    /* full cycle is 8 iterations */
        Register  reg_c,        /* rdi */
        Register  reg_d,        /* rsi */
        Register  reg_e,        /* r8 */
        Register  reg_f,        /* r9d */
        Register  reg_g,        /* r10d */
        Register  reg_h,        /* r11d */
        int iter)
{
  movl(rcx, reg_a);           // rcx = reg_a               ; MAJA
  rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25    ; S1A
  rorxd(r14, reg_e, 11);      // r14 = reg_e >> 11    ; S1B
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
  orl(rcx, reg_c);            // rcx = reg_a|reg_c          ; MAJA

  movl(r15, reg_f);           // r15 = reg_f               ; CH
  rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13      ; S0B
  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11)  ; S1
  xorl(r15, reg_g);           // r15 = reg_f^reg_g         ; CH

  rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6)    ; S1
  andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e ; CH

  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22    ; S0A
  addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --

  andl(rcx, reg_b);          // rcx = (reg_a|reg_c)&reg_b                          ; MAJA
  xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13)  ; S0

  rorxd(r12, reg_a, 2);      // r12 = (reg_a >> 2)    ; S0
  xorl(r15, reg_g);          // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH

  xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(r12, reg_a);          // r12 = reg_a                                ; MAJB
  andl(r12, reg_c);          // r12 = reg_a&reg_c                              ; MAJB
  addl(r15, r13);            // r15 = S1 + CH                          ; --

  orl(rcx, r12);             // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)             ; MAJ
  addl(reg_h, r14);          // reg_h = k + w + reg_h + S0                    ; --
  addl(reg_d, r15);          // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --

  addl(reg_h, r15);          // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  addl(reg_h, rcx);          // reg_h = t1 + S0 + MAJ                     ; --

  if (iter%4 == 0) {
    vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
    vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6)     ; S1
    vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
    vpsrld(xmm2, xmm1, 7, AVX_256bit);
    vpslld(xmm3, xmm1, 32-7, AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
    vpsrld(xmm2, xmm1,18, AVX_256bit);
  } else if (iter%4 == 1 ) {
    vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
    vpslld(xmm1, xmm1, 32-18, AVX_256bit);
    vpxor(xmm3, xmm3, xmm1, AVX_256bit);
    vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
    vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
    vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
    vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
    vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
  } else if (iter%4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
    vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
    vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
    vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
  } else if (iter%4 == 3) {
    vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
    vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
    vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
  }
}

void MacroAssembler::addm(int disp, Register r1, Register r2) {
  addl(r2, Address(r1, disp));
  movl(Address(r1, disp), r2);
}

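// Generator for the AVX2 SHA-256 compression loop (no SHA extensions required).
// Two consecutive 64-byte blocks are processed per pass: their message schedules
// are computed together in the two 128-bit lanes of ymm4..ymm7, and the scheduled
// w[t]+K[t] sums are parked in the _XFER area on the stack. The first block's
// rounds run interleaved with the scheduling (loop1, then loop2); the second block
// then replays the stored sums with scalar rounds only (loop3).
// buf/state/ofs/limit follow the DigestBase.implCompressMultiBlock contract; for
// multi_block the byte count is derived from ofs and limit and the updated offset
// is left in rax on exit.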
void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {

  Label loop0, loop1, loop2, loop3,
        last_block_enter, do_last_block, only_one_block, done_hash,
        compute_size, compute_size_end,
        compute_size1, compute_size_end1;

  address K256_W = StubRoutines::x86::k256_W_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
  address pshuffle_byte_flip_mask_addr = 0;

const XMMRegister& SHUF_00BA        = xmm10;    // ymm10: shuffle xBxA -> 00BA
const XMMRegister& SHUF_DC00        = xmm12;    // ymm12: shuffle xDxC -> DC00
const XMMRegister& BYTE_FLIP_MASK    = xmm13;   // ymm13

const XMMRegister& X_BYTE_FLIP_MASK  = xmm13;   // XMM version of BYTE_FLIP_MASK

const Register& NUM_BLKS = r8;   // 3rd arg
const Register& CTX      = rdx;  // 2nd arg
const Register& INP      = rcx;  // 1st arg

const Register& c        = rdi;
const Register& d        = rsi;
const Register& e        = r8;   // clobbers NUM_BLKS
const Register& y3       = rcx;  // clobbers INP

const Register& TBL      = rbp;
const Register& SRND     = CTX;  // SRND is the same register as CTX

const Register& a        = rax;
const Register& b        = rbx;
const Register& f        = r9;
const Register& g        = r10;
const Register& h        = r11;

const Register& T1       = r12;
const Register& y0       = r13;
const Register& y1       = r14;
const Register& y2       = r15;


enum {
  _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
#ifndef _WIN64
  _XMM_SAVE_SIZE = 0,
#else
  _XMM_SAVE_SIZE = 8*16,
#endif
  _INP_END_SIZE = 8,
  _INP_SIZE = 8,
  _CTX_SIZE = 8,
  _RSP_SIZE = 8,

  _XFER      = 0,
  _XMM_SAVE  = _XFER     + _XFER_SIZE,
  _INP_END   = _XMM_SAVE + _XMM_SAVE_SIZE,
  _INP       = _INP_END  + _INP_END_SIZE,
  _CTX       = _INP      + _INP_SIZE,
  _RSP       = _CTX      + _CTX_SIZE,
  STACK_SIZE = _RSP      + _RSP_SIZE,
};
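
  // Each 32-byte _XFER slot holds w[t]+K[t] for four rounds: the low 16 bytes
  // belong to the first block (consumed by loop1/loop2), the high 16 bytes to
  // the second block (consumed by loop3 through the start+4 / start+12 offsets).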

#ifndef _WIN64
  push(rcx);    // linux: this is limit, needed at the end
  push(rdx);    // linux: this is ofs
#else
  push(r8);     // win64: this is ofs
  push(r9);     // win64: this is limit, we need them again at the very end
#endif


  push(rbx);
#ifdef _WIN64
  push(rsi);
  push(rdi);
#endif
  push(rbp);
  push(r12);
  push(r13);
  push(r14);
  push(r15);

  movq(rax, rsp);
  subq(rsp, STACK_SIZE);
  andq(rsp, -32);
  movq(Address(rsp, _RSP), rax);

#ifndef _WIN64
  // copy the Linux parameters into the Win64 argument registers, so the rest of the code is the same for both
  movq(r9,  rcx);
  movq(r8,  rdx);
  movq(rdx, rsi);
  movq(rcx, rdi);
#endif

  // setting original assembly ABI
  /** message to encrypt in INP */
  lea(INP, Address(rcx, 0));    // rcx == message (buf)     ;; linux: INP = buf = rdi
  /** digest in CTX             */
  movq(CTX, rdx);               // rdx = digest  (state)    ;; linux: CTX = state = rsi

  /** NUM_BLKS is the length of the message in bytes; it needs to be set from ofs and limit */
  if (multi_block) {

    // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
    // on entry r8 = ofs
    // on exit  r8 = NUM_BLKS

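    // NUM_BLKS = 64 * (number of whole 64-byte blocks between ofs and limit):
    // rax accumulates 64 for every step ofs can advance before reaching limit.
    // A count of zero means there is nothing left to hash.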
    xorq(rax, rax);

    bind(compute_size);
    cmpptr(r8, r9); // assume the original ofs <= limit ;; linux:  cmp rcx, rdx
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(r8, 64);                                          //;; linux: ofs = rdx
    addq(rax, 64);
    jmpb(compute_size);

    bind(compute_size_end);
    movq(NUM_BLKS, rax);  // NUM_BLKS (r8)                 ;; linux: NUM_BLKS = rdx

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);

  } else {
    xorq(NUM_BLKS, NUM_BLKS);
    addq(NUM_BLKS, 64);
  } // !multi_block

  lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
  movq(Address(rsp, _INP_END), NUM_BLKS);

  cmpptr(INP, NUM_BLKS);                   //cmp INP, NUM_BLKS
  jcc(Assembler::equal, only_one_block);   //je only_one_block

  // load initial digest
  movl(a, Address(CTX, 4*0));
  movl(b, Address(CTX, 4*1));
  movl(c, Address(CTX, 4*2));
  movl(d, Address(CTX, 4*3));
  movl(e, Address(CTX, 4*4));
  movl(f, Address(CTX, 4*5));
  movl(g, Address(CTX, 4*6));
  movl(h, Address(CTX, 4*7));

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));  //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));      //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));      //[_SHUF_DC00 wrt rip]

  movq(Address(rsp, _CTX), CTX);           // store

bind(loop0);
  lea(TBL, ExternalAddress(K256_W));

  // assume buffers not aligned

  // Load first 16 dwords from two blocks
  vmovdqu(xmm0, Address(INP, 0*32));
  vmovdqu(xmm1, Address(INP, 1*32));
  vmovdqu(xmm2, Address(INP, 2*32));
  vmovdqu(xmm3, Address(INP, 3*32));

  // byte swap data
  vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);

  // transpose data into high/low halves
  vperm2i128(xmm4, xmm0, xmm2, 0x20);
  vperm2i128(xmm5, xmm0, xmm2, 0x31);
  vperm2i128(xmm6, xmm1, xmm3, 0x20);
  vperm2i128(xmm7, xmm1, xmm3, 0x31);
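  // After the transpose each of ymm4..ymm7 holds four dwords of the first block
  // in its low 128 bits and the matching four dwords of the second block in its
  // high 128 bits, so one pass over the schedule serves both blocks.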

bind(last_block_enter);
  addq(INP, 64);
  movq(Address(rsp, _INP), INP);

  //;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
  xorq(SRND, SRND);

align(16);
bind(loop1);
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 0);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 1);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  2);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  3);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  8+0);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  8+1);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  8+2);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  8+3);

  vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 16+0);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 16+1);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  16+2);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  16+3);

  vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);

  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  24+0);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  24+1);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  24+2);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  24+3);

  addq(SRND, 4*32);
  cmpq(SRND, 3 * 4*32);
  jcc(Assembler::below, loop1);

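  // Rounds 48..63: the message is fully scheduled, so only the k+w transfer and
  // the scalar rounds remain. Two passes of 8 rounds each, rotating the remaining
  // scheduled vectors (ymm6, ymm7) into ymm4/ymm5 for the second pass.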
bind(loop2);
  // Do last 16 rounds with no scheduling
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_four_rounds_compute_first(0);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_four_rounds_compute_last(0 + 8);

  addq(SRND, 2*32);

  vmovdqu(xmm4, xmm6);
  vmovdqu(xmm5, xmm7);

  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop2);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::above, done_hash);

  // Do second block using previously scheduled results
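  // The second block's w[t]+K[t] sums were stored in the high halves of the _XFER
  // slots during the first pass, so this loop runs the scalar rounds only, reading
  // the high halves via the start offsets 4 and 4+8.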
  xorq(SRND, SRND);
align(16);
bind(loop3);
  sha256_AVX2_four_rounds_compute_first(4);
  sha256_AVX2_four_rounds_compute_last(4+8);

  addq(SRND, 2*32);
  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop3);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));
  addq(INP, 64);

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::below, loop0);
  jccb(Assembler::above, done_hash);

bind(do_last_block);
  lea(TBL, ExternalAddress(K256_W));

  movdqu(xmm4, Address(INP, 0*16));
  movdqu(xmm5, Address(INP, 1*16));
  movdqu(xmm6, Address(INP, 2*16));
  movdqu(xmm7, Address(INP, 3*16));

  vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
  vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
  vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
  vpshufb(xmm7, xmm7, xmm13, AVX_128bit);

  jmp(last_block_enter);

bind(only_one_block);

  // load initial digest ;; the state should be preloaded with the following values
  movl(a, Address(CTX, 4*0));   // 0x6a09e667
  movl(b, Address(CTX, 4*1));   // 0xbb67ae85
  movl(c, Address(CTX, 4*2));   // 0x3c6ef372
  movl(d, Address(CTX, 4*3));   // 0xa54ff53a
  movl(e, Address(CTX, 4*4));   // 0x510e527f
  movl(f, Address(CTX, 4*5));   // 0x9b05688c
  movl(g, Address(CTX, 4*6));   // 0x1f83d9ab
  movl(h, Address(CTX, 4*7));   // 0x5be0cd19


  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]

  movq(Address(rsp, _CTX), CTX);
  jmpb(do_last_block);

bind(done_hash);

  movq(rsp, Address(rsp, _RSP));

  pop(r15);
  pop(r14);
  pop(r13);
  pop(r12);
  pop(rbp);
#ifdef _WIN64
  pop(rdi);
  pop(rsi);
#endif
  pop(rbx);

#ifdef _WIN64
  pop(r9);
  pop(r8);
#else
  pop(rdx);
  pop(rcx);
#endif

  if (multi_block) {
#ifdef _WIN64
const Register& limit_end = r9;
const Register& ofs_end   = r8;
#else
const Register& limit_end = rcx;
const Register& ofs_end   = rdx;
#endif
    movq(rax, ofs_end);

bind(compute_size1);
    cmpptr(rax, limit_end); // assume the original ofs <= limit
    jccb(Assembler::aboveEqual, compute_size_end1);
    addq(rax, 64);
    jmpb(compute_size1);

bind(compute_size_end1);
  }
}
#endif //#ifdef _LP64

// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
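// SHA extensions path for SHA-1: sha1rnds4 performs four rounds on the packed
// (A,B,C,D) state, with the immediate selecting the round function and constant
// for each of the four 20-round groups; sha1nexte derives the next E value and
// adds it to the next four message dwords; sha1msg1/sha1msg2 together with the
// pxor compute the schedule W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16])
// four dwords at a time.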
void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
  Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {

  Label start, done_hash, loop0;

  address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
  address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();

  bind(start);
  movdqu(abcd, Address(state, 0));
  pinsrd(e0, Address(state, 16), 3);
  movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
  pand(e0, shuf_mask);
  pshufd(abcd, abcd, 0x1B);
  movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f

  bind(loop0);
  // Save hash values for addition after rounds
  movdqu(Address(rsp, 0), e0);
  movdqu(Address(rsp, 16), abcd);


  // Rounds 0 - 3
  movdqu(msg0, Address(buf, 0));
  pshufb(msg0, shuf_mask);
  paddd(e0, msg0);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);

  // Rounds 4 - 7
  movdqu(msg1, Address(buf, 16));
  pshufb(msg1, shuf_mask);
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg0, msg1);

  // Rounds 8 - 11
  movdqu(msg2, Address(buf, 32));
  pshufb(msg2, shuf_mask);
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 12 - 15
  movdqu(msg3, Address(buf, 48));
  pshufb(msg3, shuf_mask);
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 16 - 19
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 20 - 23
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 24 - 27
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 28 - 31
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 32 - 35
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 36 - 39
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 40 - 43
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 44 - 47
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 48 - 51
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 52 - 55
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 56 - 59
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 60 - 63
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 3);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 64 - 67
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 3);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 68 - 71
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 3);
  pxor(msg3, msg1);

  // Rounds 72 - 75
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 3);

  // Rounds 76 - 79
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 3);

  // add the current hash values to the previously saved ones
  movdqu(msg0, Address(rsp, 0));
  sha1nexte(e0, msg0);
  movdqu(msg0, Address(rsp, 16));
  paddd(abcd, msg0);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }
  // write hash values back in the correct order
  pshufd(abcd, abcd, 0x1b);
  movdqu(Address(state, 0), abcd);
  pextrd(Address(state, 16), e0, 3);

  bind(done_hash);

}

// xmm0 (msg) is used as an implicit argument to sha256rnds2,
// so state0 and state1 can never use the xmm0 register.
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
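// sha256rnds2 performs two rounds: it updates half of the state from the other
// half plus the two pre-added (W[t]+K[t]) dwords it reads from the implicit xmm0
// operand. Each four-round block therefore runs sha256rnds2 twice, with
// pshufd(msg, msg, 0x0E) moving the upper two dwords of msg+K into the low
// quadword in between; sha256msg1 and sha256msg2, together with the palignr/paddd
// pair, build W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2]) four dwords at a time.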
#ifdef _LP64
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {
#else
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block) {
#endif
  Label start, done_hash, loop0;

  address K256 = StubRoutines::x86::k256_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();

  bind(start);
  movdqu(state0, Address(state, 0));
  movdqu(state1, Address(state, 16));

  pshufd(state0, state0, 0xB1);
  pshufd(state1, state1, 0x1B);
  movdqa(msgtmp4, state0);
  palignr(state0, state1, 8);
  pblendw(state1, msgtmp4, 0xF0);

#ifdef _LP64
  movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  lea(rax, ExternalAddress(K256));

  bind(loop0);
  movdqu(Address(rsp, 0), state0);
  movdqu(Address(rsp, 16), state1);

  // Rounds 0-3
  movdqu(msg, Address(buf, 0));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp0, msg);
  paddd(msg, Address(rax, 0));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 4-7
  movdqu(msg, Address(buf, 16));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp1, msg);
  paddd(msg, Address(rax, 16));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 8-11
  movdqu(msg, Address(buf, 32));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp2, msg);
  paddd(msg, Address(rax, 32));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 12-15
  movdqu(msg, Address(buf, 48));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp3, msg);
  paddd(msg, Address(rax, 48));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 16-19
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 64));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 20-23
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 80));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 24-27
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 96));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 28-31
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 112));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 32-35
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 128));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 36-39
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 144));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 40-43
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 160));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 44-47
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 176));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 48-51
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 192));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 52-55
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 208));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 56-59
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 224));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 60-63
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 240));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  movdqu(msg, Address(rsp, 0));
  paddd(state0, msg);
  movdqu(msg, Address(rsp, 16));
  paddd(state1, msg);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }

  pshufd(state0, state0, 0x1B);
  pshufd(state1, state1, 0xB1);
  movdqa(msgtmp4, state0);
  pblendw(state0, state1, 0xF0);
  palignr(state1, msgtmp4, 8);

  movdqu(Address(state, 0), state0);
  movdqu(Address(state, 16), state1);

  bind(done_hash);

}