/*
 * Copyright (c) 2016, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

#ifdef _LP64
/*
  The algorithm below is based on the Intel publication
  "Fast SHA-256 Implementations on Intel® Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
  The assembly code was originally provided by Sean Gulley and in many places preserves
  the original assembly NAMES and comments to simplify matching the Java-generated assembly with its original.
  The Java version was substantially redesigned to replace 1200 assembly instructions with
  a much shorter run-time generator of the same code in memory.
*/
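/*
  For reference (FIPS 180-4), each of the 64 SHA-256 rounds implemented by the helpers below
  computes, on 32-bit words:

    T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
    T2 = Sigma0(a) + Maj(a,b,c)
    h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2

  where
    Sigma1(e)  = ror(e,6)  ^ ror(e,11) ^ ror(e,25)
    Sigma0(a)  = ror(a,2)  ^ ror(a,13) ^ ror(a,22)
    Ch(e,f,g)  = (e & f) ^ (~e & g)            ==  ((f ^ g) & e) ^ g        (the form used below)
    Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)   ==  ((a | c) & b) | (a & c)  (the form used below)

  Instead of shifting the eight working variables through registers, the generator keeps them
  fixed and rotates the register *roles* from round to round (see the four_rounds callers).
  K[t] + W[t] is read from the stack slot written by the scheduling code (rsp + rdx + 4*iter).
*/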
void MacroAssembler::sha256_AVX2_one_round_compute(
    Register reg_old_h,
    Register reg_a,
    Register reg_b,
    Register reg_c,
    Register reg_d,
    Register reg_e,
    Register reg_f,
    Register reg_g,
    Register reg_h,
    int iter) {
  const Register& reg_y0 = r13;
  const Register& reg_y1 = r14;
  const Register& reg_y2 = r15;
  const Register& reg_y3 = rcx;
  const Register& reg_T1 = r12;
  //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
  if (iter%4 > 0) {
    addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  }
  movl(reg_y2, reg_f);         // reg_y2 = reg_f ; CH
  rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25 ; S1A
  rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11 ; S1B
  xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g ; CH

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ; S1
  rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6) ; S1
  andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e ; CH

  if (iter%4 > 0) {
    addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ ; --
  }

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(reg_T1, reg_a, 13);    // reg_T1 = reg_a >> 13 ; S0B
  xorl(reg_y2, reg_g);         // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH
  rorxd(reg_y1, reg_a, 22);    // reg_y1 = reg_a >> 22 ; S0A
  movl(reg_y3, reg_a);         // reg_y3 = reg_a ; MAJA

  xorl(reg_y1, reg_T1);        // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ; S0
  rorxd(reg_T1, reg_a, 2);     // reg_T1 = (reg_a >> 2) ; S0
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
  orl(reg_y3, reg_c);          // reg_y3 = reg_a|reg_c ; MAJA

  xorl(reg_y1, reg_T1);        // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(reg_T1, reg_a);         // reg_T1 = reg_a ; MAJB
  andl(reg_y3, reg_b);         // reg_y3 = (reg_a|reg_c)&reg_b ; MAJA
  andl(reg_T1, reg_c);         // reg_T1 = reg_a&reg_c ; MAJB
  addl(reg_y2, reg_y0);        // reg_y2 = S1 + CH ; --

  addl(reg_d, reg_h);          // reg_d = k + w + reg_h + reg_d ; --
  orl(reg_y3, reg_T1);         // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
  addl(reg_h, reg_y1);         // reg_h = k + w + reg_h + S0 ; --

  addl(reg_d, reg_y2);         // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --

  if (iter%4 == 3) {
    addl(reg_h, reg_y2);       // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
    addl(reg_h, reg_y3);       // reg_h = t1 + S0 + MAJ ; --
  }
}

void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
  sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, start + 0);
  sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, start + 1);
  sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  start + 2);
  sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi, r8,  start + 3);
}

void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
  sha256_AVX2_one_round_compute(r8,  r8,  r9,  r10, r11, rax, rbx, rdi, rsi, start + 0);
  sha256_AVX2_one_round_compute(rsi, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, start + 1);
  sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, start + 2);
  sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, start + 3);
}
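/*
  sha256_AVX2_one_round_and_sched fuses one scalar round (as above) with one quarter of the
  AVX2 message schedule. For reference (FIPS 180-4), the schedule is

    W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
    sigma0(x) = ror(x,7)  ^ ror(x,18) ^ (x >> 3)
    sigma1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)

  Four new W words are produced per group of four rounds, so the vector work is split across the
  iter%4 == 0..3 cases below. Roughly, xmm_0..xmm_3 hold the last 16 schedule words (four per
  register, with the two message blocks kept in the two 128-bit lanes of each ymm register), and
  the register roles rotate every four rounds.
*/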
void MacroAssembler::sha256_AVX2_one_round_and_sched(
        XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
        XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
        XMMRegister xmm_2,     /* ymm6 */
        XMMRegister xmm_3,     /* ymm7 */
        Register    reg_a,     /* == rax on iteration 0, then rotate the 8 registers right on each subsequent iteration */
        Register    reg_b,     /* rbx */   /* full cycle is 8 iterations */
        Register    reg_c,     /* rdi */
        Register    reg_d,     /* rsi */
        Register    reg_e,     /* r8 */
        Register    reg_f,     /* r9d */
        Register    reg_g,     /* r10d */
        Register    reg_h,     /* r11d */
        int iter)
{
  movl(rcx, reg_a);           // rcx = reg_a ; MAJA
  rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25 ; S1A
  rorxd(r14, reg_e, 11);      // r14 = reg_e >> 11 ; S1B
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
  orl(rcx, reg_c);            // rcx = reg_a|reg_c ; MAJA

  movl(r15, reg_f);           // r15 = reg_f ; CH
  rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13 ; S0B
  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ; S1
  xorl(r15, reg_g);           // r15 = reg_f^reg_g ; CH

  rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6) ; S1
  andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e ; CH

  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22 ; S0A
  addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d ; --

  andl(rcx, reg_b);           // rcx = (reg_a|reg_c)&reg_b ; MAJA
  xorl(r14, r12);             // r14 = (reg_a>>22) ^ (reg_a>>13) ; S0

  rorxd(r12, reg_a, 2);       // r12 = (reg_a >> 2) ; S0
  xorl(r15, reg_g);           // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH

  xorl(r14, r12);             // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(r12, reg_a);           // r12 = reg_a ; MAJB
  andl(r12, reg_c);           // r12 = reg_a&reg_c ; MAJB
  addl(r15, r13);             // r15 = S1 + CH ; --

  orl(rcx, r12);              // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
  addl(reg_h, r14);           // reg_h = k + w + reg_h + S0 ; --
  addl(reg_d, r15);           // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --

  addl(reg_h, r15);           // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  addl(reg_h, rcx);           // reg_h = t1 + S0 + MAJ ; --

  if (iter%4 == 0) {
    vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
    vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1
    vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
    vpsrld(xmm2, xmm1, 7, AVX_256bit);
    vpslld(xmm3, xmm1, 32-7, AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
    vpsrld(xmm2, xmm1, 18, AVX_256bit);
  } else if (iter%4 == 1) {
    vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
    vpslld(xmm1, xmm1, 32-18, AVX_256bit);
    vpxor(xmm3, xmm3, xmm1, AVX_256bit);
    vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
    vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
    vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
    vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
    vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
  } else if (iter%4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
    vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
    vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
    vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
  } else if (iter%4 == 3) {
    vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
    vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
    vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
  }
}

void MacroAssembler::addm(int disp, Register r1, Register r2) {
  addl(r2, Address(r1, disp));
  movl(Address(r1, disp), r2);
}
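/*
  sha256_AVX2 generates the main compression loop. It follows the structure of the original
  Intel assembly: normalize the Linux and Win64 argument registers to one convention, build an
  aligned stack frame (see the _XFER/_INP/_CTX/_RSP layout in the enum below), then hash two
  64-byte blocks per pass. The message words of both blocks are byte-swapped, transposed into
  the two 128-bit lanes of ymm4..ymm7, and K[t]+W[t] for both blocks is staged in the _XFER
  area, so the second block (loop3) reuses the schedule computed while the first block's rounds
  ran. A trailing odd block, or a single-block call, goes through do_last_block/only_one_block.
*/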
void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                  Register buf, Register state, Register ofs, Register limit, Register rsp,
                  bool multi_block, XMMRegister shuf_mask) {

  Label loop0, loop1, loop2, loop3,
        last_block_enter, do_last_block, only_one_block, done_hash,
        compute_size, compute_size_end,
        compute_size1, compute_size_end1;

  address K256_W = StubRoutines::x86::k256_W_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
  address pshuffle_byte_flip_mask_addr = 0;

  const XMMRegister& SHUF_00BA        = xmm10;    // ymm10: shuffle xBxA -> 00BA
  const XMMRegister& SHUF_DC00        = xmm12;    // ymm12: shuffle xDxC -> DC00
  const XMMRegister& BYTE_FLIP_MASK   = xmm13;    // ymm13

  const XMMRegister& X_BYTE_FLIP_MASK = xmm13;    // XMM version of BYTE_FLIP_MASK

  const Register& NUM_BLKS = r8;    // 3rd arg
  const Register& CTX      = rdx;   // 2nd arg
  const Register& INP      = rcx;   // 1st arg

  const Register& c        = rdi;
  const Register& d        = rsi;
  const Register& e        = r8;    // clobbers NUM_BLKS
  const Register& y3       = rcx;   // clobbers INP

  const Register& TBL      = rbp;
  const Register& SRND     = CTX;   // SRND is the same register as CTX

  const Register& a        = rax;
  const Register& b        = rbx;
  const Register& f        = r9;
  const Register& g        = r10;
  const Register& h        = r11;

  const Register& T1       = r12;
  const Register& y0       = r13;
  const Register& y1       = r14;
  const Register& y2       = r15;

  enum {
    _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
#ifndef _WIN64
    _XMM_SAVE_SIZE = 0,
#else
    _XMM_SAVE_SIZE = 8*16,
#endif
    _INP_END_SIZE = 8,
    _INP_SIZE = 8,
    _CTX_SIZE = 8,
    _RSP_SIZE = 8,

    _XFER     = 0,
    _XMM_SAVE = _XFER     + _XFER_SIZE,
    _INP_END  = _XMM_SAVE + _XMM_SAVE_SIZE,
    _INP      = _INP_END  + _INP_END_SIZE,
    _CTX      = _INP      + _INP_SIZE,
    _RSP      = _CTX      + _CTX_SIZE,
    STACK_SIZE = _RSP     + _RSP_SIZE,
  };

#ifndef _WIN64
  push(rcx);    // linux: this is limit, needed at the end
  push(rdx);    // linux: this is ofs
#else
  push(r8);     // win64: this is ofs
  push(r9);     // win64: this is limit, we need them again at the very end
#endif

  push(rbx);
#ifdef _WIN64
  push(rsi);
  push(rdi);
#endif
  push(rbp);
  push(r12);
  push(r13);
  push(r14);
  push(r15);

  movq(rax, rsp);
  subq(rsp, STACK_SIZE);
  andq(rsp, -32);
  movq(Address(rsp, _RSP), rax);

#ifndef _WIN64
  // copy linux params to win64 params, so the rest of the code is the same for both
  movq(r9,  rcx);
  movq(r8,  rdx);
  movq(rdx, rsi);
  movq(rcx, rdi);
#endif

  // setting original assembly ABI
  /** message to encrypt in INP */
  lea(INP, Address(rcx, 0));    // rcx == message (buf)   ;; linux: INP = buf = rdi
  /** digest in CTX */
  movq(CTX, rdx);               // rdx = digest (state)   ;; linux: CTX = state = rsi

  /** NUM_BLKS is the length of the message, need to set it from ofs and limit */
  if (multi_block) {

    // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
    // on entry r8 = ofs
    // on exit  r8 = NUM_BLKS

    xorq(rax, rax);

    bind(compute_size);
    cmpptr(r8, r9);  // assume the original ofs <= limit ;; linux: cmp rcx, rdx
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(r8, 64);    //;; linux: ofs = rdx
    addq(rax, 64);
    jmpb(compute_size);

    bind(compute_size_end);
    movq(NUM_BLKS, rax);  // NUM_BLKS (r8) ;; linux: NUM_BLKS = rdx

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);

  } else {
    xorq(NUM_BLKS, NUM_BLKS);
    addq(NUM_BLKS, 64);
  } //if (!multi_block)

  lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
  movq(Address(rsp, _INP_END), NUM_BLKS);

  cmpptr(INP, NUM_BLKS);                   //cmp INP, NUM_BLKS
  jcc(Assembler::equal, only_one_block);   //je only_one_block

  // load initial digest
  movl(a, Address(CTX, 4*0));
  movl(b, Address(CTX, 4*1));
  movl(c, Address(CTX, 4*2));
  movl(d, Address(CTX, 4*3));
  movl(e, Address(CTX, 4*4));
  movl(f, Address(CTX, 4*5));
  movl(g, Address(CTX, 4*6));
  movl(h, Address(CTX, 4*7));

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));  //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA,      ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00,
ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip] 363 364 movq(Address(rsp, _CTX), CTX); // store 365 366 bind(loop0); 367 lea(TBL, ExternalAddress(K256_W)); 368 369 // assume buffers not aligned 370 371 // Load first 16 dwords from two blocks 372 vmovdqu(xmm0, Address(INP, 0*32)); 373 vmovdqu(xmm1, Address(INP, 1*32)); 374 vmovdqu(xmm2, Address(INP, 2*32)); 375 vmovdqu(xmm3, Address(INP, 3*32)); 376 377 // byte swap data 378 vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit); 379 vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit); 380 vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit); 381 vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit); 382 383 // transpose data into high/low halves 384 vperm2i128(xmm4, xmm0, xmm2, 0x20); 385 vperm2i128(xmm5, xmm0, xmm2, 0x31); 386 vperm2i128(xmm6, xmm1, xmm3, 0x20); 387 vperm2i128(xmm7, xmm1, xmm3, 0x31); 388 389 bind(last_block_enter); 390 addq(INP, 64); 391 movq(Address(rsp, _INP), INP); 392 393 //;; schedule 48 input dwords, by doing 3 rounds of 12 each 394 xorq(SRND, SRND); 395 396 align(16); 397 bind(loop1); 398 vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit); 399 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9); 400 sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8, r9, r10, r11, 0); 401 sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8, r9, r10, 1); 402 sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8, r9, 2); 403 sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9, r10, r11, rax, rbx, rdi, rsi, r8, 3); 404 405 vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit); 406 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9); 407 sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8, r9, r10, r11, rax, rbx, rdi, rsi, 8+0); 408 sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8, r9, r10, r11, rax, rbx, rdi, 8+1); 409 sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8, r9, r10, r11, rax, rbx, 8+2); 410 sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8, r9, r10, r11, rax, 8+3); 411 412 vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit); 413 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9); 414 sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8, r9, r10, r11, 16+0); 415 sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8, r9, r10, 16+1); 416 sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8, r9, 16+2); 417 sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9, r10, r11, rax, rbx, rdi, rsi, r8, 16+3); 418 419 vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit); 420 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9); 421 422 sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8, r9, r10, r11, rax, rbx, rdi, rsi, 24+0); 423 sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8, r9, r10, r11, rax, rbx, rdi, 24+1); 424 sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8, r9, r10, r11, rax, rbx, 24+2); 425 sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8, r9, r10, r11, rax, 24+3); 426 427 addq(SRND, 4*32); 428 cmpq(SRND, 3 * 4*32); 429 jcc(Assembler::below, loop1); 430 431 bind(loop2); 432 // Do last 16 rounds with no scheduling 433 vpaddd(xmm9, xmm4, Address(TBL, 
SRND, Address::times_1, 0*32), AVX_256bit); 434 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9); 435 sha256_AVX2_four_rounds_compute_first(0); 436 437 vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit); 438 vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9); 439 sha256_AVX2_four_rounds_compute_last(0 + 8); 440 441 addq(SRND, 2*32); 442 443 vmovdqu(xmm4, xmm6); 444 vmovdqu(xmm5, xmm7); 445 446 cmpq(SRND, 4 * 4*32); 447 jcc(Assembler::below, loop2); 448 449 movq(CTX, Address(rsp, _CTX)); 450 movq(INP, Address(rsp, _INP)); 451 452 addm(4*0, CTX, a); 453 addm(4*1, CTX, b); 454 addm(4*2, CTX, c); 455 addm(4*3, CTX, d); 456 addm(4*4, CTX, e); 457 addm(4*5, CTX, f); 458 addm(4*6, CTX, g); 459 addm(4*7, CTX, h); 460 461 cmpq(INP, Address(rsp, _INP_END)); 462 jcc(Assembler::above, done_hash); 463 464 //Do second block using previously scheduled results 465 xorq(SRND, SRND); 466 align(16); 467 bind(loop3); 468 sha256_AVX2_four_rounds_compute_first(4); 469 sha256_AVX2_four_rounds_compute_last(4+8); 470 471 addq(SRND, 2*32); 472 cmpq(SRND, 4 * 4*32); 473 jcc(Assembler::below, loop3); 474 475 movq(CTX, Address(rsp, _CTX)); 476 movq(INP, Address(rsp, _INP)); 477 addq(INP, 64); 478 479 addm(4*0, CTX, a); 480 addm(4*1, CTX, b); 481 addm(4*2, CTX, c); 482 addm(4*3, CTX, d); 483 addm(4*4, CTX, e); 484 addm(4*5, CTX, f); 485 addm(4*6, CTX, g); 486 addm(4*7, CTX, h); 487 488 cmpq(INP, Address(rsp, _INP_END)); 489 jcc(Assembler::below, loop0); 490 jccb(Assembler::above, done_hash); 491 492 bind(do_last_block); 493 lea(TBL, ExternalAddress(K256_W)); 494 495 movdqu(xmm4, Address(INP, 0*16)); 496 movdqu(xmm5, Address(INP, 1*16)); 497 movdqu(xmm6, Address(INP, 2*16)); 498 movdqu(xmm7, Address(INP, 3*16)); 499 500 vpshufb(xmm4, xmm4, xmm13, AVX_128bit); 501 vpshufb(xmm5, xmm5, xmm13, AVX_128bit); 502 vpshufb(xmm6, xmm6, xmm13, AVX_128bit); 503 vpshufb(xmm7, xmm7, xmm13, AVX_128bit); 504 505 jmp(last_block_enter); 506 507 bind(only_one_block); 508 509 // load initial digest ;; table should be preloaded with following values 510 movl(a, Address(CTX, 4*0)); // 0x6a09e667 511 movl(b, Address(CTX, 4*1)); // 0xbb67ae85 512 movl(c, Address(CTX, 4*2)); // 0x3c6ef372 513 movl(d, Address(CTX, 4*3)); // 0xa54ff53a 514 movl(e, Address(CTX, 4*4)); // 0x510e527f 515 movl(f, Address(CTX, 4*5)); // 0x9b05688c 516 movl(g, Address(CTX, 4*6)); // 0x1f83d9ab 517 movl(h, Address(CTX, 4*7)); // 0x5be0cd19 518 519 520 pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask; 521 vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip] 522 vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip] 523 vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip] 524 525 movq(Address(rsp, _CTX), CTX); 526 jmpb(do_last_block); 527 528 bind(done_hash); 529 530 movq(rsp, Address(rsp, _RSP)); 531 532 pop(r15); 533 pop(r14); 534 pop(r13); 535 pop(r12); 536 pop(rbp); 537 #ifdef _WIN64 538 pop(rdi); 539 pop(rsi); 540 #endif 541 pop(rbx); 542 543 #ifdef _WIN64 544 pop(r9); 545 pop(r8); 546 #else 547 pop(rdx); 548 pop(rcx); 549 #endif 550 551 if (multi_block) { 552 #ifdef _WIN64 553 const Register& limit_end = r9; 554 const Register& ofs_end = r8; 555 #else 556 const Register& limit_end = rcx; 557 const Register& ofs_end = rdx; 558 #endif 559 movq(rax, ofs_end); 560 561 bind(compute_size1); 562 cmpptr(rax, limit_end); // assume the original ofs <= limit 563 
jccb(Assembler::aboveEqual, compute_size_end1); 564 addq(rax, 64); 565 jmpb(compute_size1); 566 567 bind(compute_size_end1); 568 } 569 } 570 #endif //#ifdef _LP64 571 572 // ofs and limit are used for multi-block byte array. 573 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 574 void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0, 575 XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask, 576 Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) { 577 578 Label start, done_hash, loop0; 579 580 address upper_word_mask = StubRoutines::x86::upper_word_mask_addr(); 581 address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr(); 582 583 bind(start); 584 movdqu(abcd, Address(state, 0)); 585 pinsrd(e0, Address(state, 16), 3); 586 movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000 587 pand(e0, shuf_mask); 588 pshufd(abcd, abcd, 0x1B); 589 movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f 590 591 bind(loop0); 592 // Save hash values for addition after rounds 593 movdqu(Address(rsp, 0), e0); 594 movdqu(Address(rsp, 16), abcd); 595 596 597 // Rounds 0 - 3 598 movdqu(msg0, Address(buf, 0)); 599 pshufb(msg0, shuf_mask); 600 paddd(e0, msg0); 601 movdqa(e1, abcd); 602 sha1rnds4(abcd, e0, 0); 603 604 // Rounds 4 - 7 605 movdqu(msg1, Address(buf, 16)); 606 pshufb(msg1, shuf_mask); 607 sha1nexte(e1, msg1); 608 movdqa(e0, abcd); 609 sha1rnds4(abcd, e1, 0); 610 sha1msg1(msg0, msg1); 611 612 // Rounds 8 - 11 613 movdqu(msg2, Address(buf, 32)); 614 pshufb(msg2, shuf_mask); 615 sha1nexte(e0, msg2); 616 movdqa(e1, abcd); 617 sha1rnds4(abcd, e0, 0); 618 sha1msg1(msg1, msg2); 619 pxor(msg0, msg2); 620 621 // Rounds 12 - 15 622 movdqu(msg3, Address(buf, 48)); 623 pshufb(msg3, shuf_mask); 624 sha1nexte(e1, msg3); 625 movdqa(e0, abcd); 626 sha1msg2(msg0, msg3); 627 sha1rnds4(abcd, e1, 0); 628 sha1msg1(msg2, msg3); 629 pxor(msg1, msg3); 630 631 // Rounds 16 - 19 632 sha1nexte(e0, msg0); 633 movdqa(e1, abcd); 634 sha1msg2(msg1, msg0); 635 sha1rnds4(abcd, e0, 0); 636 sha1msg1(msg3, msg0); 637 pxor(msg2, msg0); 638 639 // Rounds 20 - 23 640 sha1nexte(e1, msg1); 641 movdqa(e0, abcd); 642 sha1msg2(msg2, msg1); 643 sha1rnds4(abcd, e1, 1); 644 sha1msg1(msg0, msg1); 645 pxor(msg3, msg1); 646 647 // Rounds 24 - 27 648 sha1nexte(e0, msg2); 649 movdqa(e1, abcd); 650 sha1msg2(msg3, msg2); 651 sha1rnds4(abcd, e0, 1); 652 sha1msg1(msg1, msg2); 653 pxor(msg0, msg2); 654 655 // Rounds 28 - 31 656 sha1nexte(e1, msg3); 657 movdqa(e0, abcd); 658 sha1msg2(msg0, msg3); 659 sha1rnds4(abcd, e1, 1); 660 sha1msg1(msg2, msg3); 661 pxor(msg1, msg3); 662 663 // Rounds 32 - 35 664 sha1nexte(e0, msg0); 665 movdqa(e1, abcd); 666 sha1msg2(msg1, msg0); 667 sha1rnds4(abcd, e0, 1); 668 sha1msg1(msg3, msg0); 669 pxor(msg2, msg0); 670 671 // Rounds 36 - 39 672 sha1nexte(e1, msg1); 673 movdqa(e0, abcd); 674 sha1msg2(msg2, msg1); 675 sha1rnds4(abcd, e1, 1); 676 sha1msg1(msg0, msg1); 677 pxor(msg3, msg1); 678 679 // Rounds 40 - 43 680 sha1nexte(e0, msg2); 681 movdqa(e1, abcd); 682 sha1msg2(msg3, msg2); 683 sha1rnds4(abcd, e0, 2); 684 sha1msg1(msg1, msg2); 685 pxor(msg0, msg2); 686 687 // Rounds 44 - 47 688 sha1nexte(e1, msg3); 689 movdqa(e0, abcd); 690 sha1msg2(msg0, msg3); 691 sha1rnds4(abcd, e1, 2); 692 sha1msg1(msg2, msg3); 693 pxor(msg1, msg3); 694 695 // Rounds 48 - 51 696 sha1nexte(e0, 
msg0); 697 movdqa(e1, abcd); 698 sha1msg2(msg1, msg0); 699 sha1rnds4(abcd, e0, 2); 700 sha1msg1(msg3, msg0); 701 pxor(msg2, msg0); 702 703 // Rounds 52 - 55 704 sha1nexte(e1, msg1); 705 movdqa(e0, abcd); 706 sha1msg2(msg2, msg1); 707 sha1rnds4(abcd, e1, 2); 708 sha1msg1(msg0, msg1); 709 pxor(msg3, msg1); 710 711 // Rounds 56 - 59 712 sha1nexte(e0, msg2); 713 movdqa(e1, abcd); 714 sha1msg2(msg3, msg2); 715 sha1rnds4(abcd, e0, 2); 716 sha1msg1(msg1, msg2); 717 pxor(msg0, msg2); 718 719 // Rounds 60 - 63 720 sha1nexte(e1, msg3); 721 movdqa(e0, abcd); 722 sha1msg2(msg0, msg3); 723 sha1rnds4(abcd, e1, 3); 724 sha1msg1(msg2, msg3); 725 pxor(msg1, msg3); 726 727 // Rounds 64 - 67 728 sha1nexte(e0, msg0); 729 movdqa(e1, abcd); 730 sha1msg2(msg1, msg0); 731 sha1rnds4(abcd, e0, 3); 732 sha1msg1(msg3, msg0); 733 pxor(msg2, msg0); 734 735 // Rounds 68 - 71 736 sha1nexte(e1, msg1); 737 movdqa(e0, abcd); 738 sha1msg2(msg2, msg1); 739 sha1rnds4(abcd, e1, 3); 740 pxor(msg3, msg1); 741 742 // Rounds 72 - 75 743 sha1nexte(e0, msg2); 744 movdqa(e1, abcd); 745 sha1msg2(msg3, msg2); 746 sha1rnds4(abcd, e0, 3); 747 748 // Rounds 76 - 79 749 sha1nexte(e1, msg3); 750 movdqa(e0, abcd); 751 sha1rnds4(abcd, e1, 3); 752 753 // add current hash values with previously saved 754 movdqu(msg0, Address(rsp, 0)); 755 sha1nexte(e0, msg0); 756 movdqu(msg0, Address(rsp, 16)); 757 paddd(abcd, msg0); 758 759 if (multi_block) { 760 // increment data pointer and loop if more to process 761 addptr(buf, 64); 762 addptr(ofs, 64); 763 cmpptr(ofs, limit); 764 jcc(Assembler::belowEqual, loop0); 765 movptr(rax, ofs); //return ofs 766 } 767 // write hash values back in the correct order 768 pshufd(abcd, abcd, 0x1b); 769 movdqu(Address(state, 0), abcd); 770 pextrd(Address(state, 16), e0, 3); 771 772 bind(done_hash); 773 774 } 775 776 // xmm0 (msg) is used as an implicit argument to sh256rnds2 777 // and state0 and state1 can never use xmm0 register. 778 // ofs and limit are used for multi-block byte array. 
779 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 780 #ifdef _LP64 781 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, 782 XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, 783 Register buf, Register state, Register ofs, Register limit, Register rsp, 784 bool multi_block, XMMRegister shuf_mask) { 785 #else 786 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, 787 XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, 788 Register buf, Register state, Register ofs, Register limit, Register rsp, 789 bool multi_block) { 790 #endif 791 Label start, done_hash, loop0; 792 793 address K256 = StubRoutines::x86::k256_addr(); 794 address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr(); 795 796 bind(start); 797 movdqu(state0, Address(state, 0)); 798 movdqu(state1, Address(state, 16)); 799 800 pshufd(state0, state0, 0xB1); 801 pshufd(state1, state1, 0x1B); 802 movdqa(msgtmp4, state0); 803 palignr(state0, state1, 8); 804 pblendw(state1, msgtmp4, 0xF0); 805 806 #ifdef _LP64 807 movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask)); 808 #endif 809 lea(rax, ExternalAddress(K256)); 810 811 bind(loop0); 812 movdqu(Address(rsp, 0), state0); 813 movdqu(Address(rsp, 16), state1); 814 815 // Rounds 0-3 816 movdqu(msg, Address(buf, 0)); 817 #ifdef _LP64 818 pshufb(msg, shuf_mask); 819 #else 820 pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); 821 #endif 822 movdqa(msgtmp0, msg); 823 paddd(msg, Address(rax, 0)); 824 sha256rnds2(state1, state0); 825 pshufd(msg, msg, 0x0E); 826 sha256rnds2(state0, state1); 827 828 // Rounds 4-7 829 movdqu(msg, Address(buf, 16)); 830 #ifdef _LP64 831 pshufb(msg, shuf_mask); 832 #else 833 pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); 834 #endif 835 movdqa(msgtmp1, msg); 836 paddd(msg, Address(rax, 16)); 837 sha256rnds2(state1, state0); 838 pshufd(msg, msg, 0x0E); 839 sha256rnds2(state0, state1); 840 sha256msg1(msgtmp0, msgtmp1); 841 842 // Rounds 8-11 843 movdqu(msg, Address(buf, 32)); 844 #ifdef _LP64 845 pshufb(msg, shuf_mask); 846 #else 847 pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); 848 #endif 849 movdqa(msgtmp2, msg); 850 paddd(msg, Address(rax, 32)); 851 sha256rnds2(state1, state0); 852 pshufd(msg, msg, 0x0E); 853 sha256rnds2(state0, state1); 854 sha256msg1(msgtmp1, msgtmp2); 855 856 // Rounds 12-15 857 movdqu(msg, Address(buf, 48)); 858 #ifdef _LP64 859 pshufb(msg, shuf_mask); 860 #else 861 pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); 862 #endif 863 movdqa(msgtmp3, msg); 864 paddd(msg, Address(rax, 48)); 865 sha256rnds2(state1, state0); 866 movdqa(msgtmp4, msgtmp3); 867 palignr(msgtmp4, msgtmp2, 4); 868 paddd(msgtmp0, msgtmp4); 869 sha256msg2(msgtmp0, msgtmp3); 870 pshufd(msg, msg, 0x0E); 871 sha256rnds2(state0, state1); 872 sha256msg1(msgtmp2, msgtmp3); 873 874 // Rounds 16-19 875 movdqa(msg, msgtmp0); 876 paddd(msg, Address(rax, 64)); 877 sha256rnds2(state1, state0); 878 movdqa(msgtmp4, msgtmp0); 879 palignr(msgtmp4, msgtmp3, 4); 880 paddd(msgtmp1, msgtmp4); 881 sha256msg2(msgtmp1, msgtmp0); 882 pshufd(msg, msg, 0x0E); 883 sha256rnds2(state0, state1); 884 sha256msg1(msgtmp3, msgtmp0); 885 886 // Rounds 20-23 887 movdqa(msg, msgtmp1); 888 paddd(msg, Address(rax, 80)); 889 sha256rnds2(state1, state0); 890 movdqa(msgtmp4, msgtmp1); 891 palignr(msgtmp4, msgtmp0, 4); 892 
paddd(msgtmp2, msgtmp4); 893 sha256msg2(msgtmp2, msgtmp1); 894 pshufd(msg, msg, 0x0E); 895 sha256rnds2(state0, state1); 896 sha256msg1(msgtmp0, msgtmp1); 897 898 // Rounds 24-27 899 movdqa(msg, msgtmp2); 900 paddd(msg, Address(rax, 96)); 901 sha256rnds2(state1, state0); 902 movdqa(msgtmp4, msgtmp2); 903 palignr(msgtmp4, msgtmp1, 4); 904 paddd(msgtmp3, msgtmp4); 905 sha256msg2(msgtmp3, msgtmp2); 906 pshufd(msg, msg, 0x0E); 907 sha256rnds2(state0, state1); 908 sha256msg1(msgtmp1, msgtmp2); 909 910 // Rounds 28-31 911 movdqa(msg, msgtmp3); 912 paddd(msg, Address(rax, 112)); 913 sha256rnds2(state1, state0); 914 movdqa(msgtmp4, msgtmp3); 915 palignr(msgtmp4, msgtmp2, 4); 916 paddd(msgtmp0, msgtmp4); 917 sha256msg2(msgtmp0, msgtmp3); 918 pshufd(msg, msg, 0x0E); 919 sha256rnds2(state0, state1); 920 sha256msg1(msgtmp2, msgtmp3); 921 922 // Rounds 32-35 923 movdqa(msg, msgtmp0); 924 paddd(msg, Address(rax, 128)); 925 sha256rnds2(state1, state0); 926 movdqa(msgtmp4, msgtmp0); 927 palignr(msgtmp4, msgtmp3, 4); 928 paddd(msgtmp1, msgtmp4); 929 sha256msg2(msgtmp1, msgtmp0); 930 pshufd(msg, msg, 0x0E); 931 sha256rnds2(state0, state1); 932 sha256msg1(msgtmp3, msgtmp0); 933 934 // Rounds 36-39 935 movdqa(msg, msgtmp1); 936 paddd(msg, Address(rax, 144)); 937 sha256rnds2(state1, state0); 938 movdqa(msgtmp4, msgtmp1); 939 palignr(msgtmp4, msgtmp0, 4); 940 paddd(msgtmp2, msgtmp4); 941 sha256msg2(msgtmp2, msgtmp1); 942 pshufd(msg, msg, 0x0E); 943 sha256rnds2(state0, state1); 944 sha256msg1(msgtmp0, msgtmp1); 945 946 // Rounds 40-43 947 movdqa(msg, msgtmp2); 948 paddd(msg, Address(rax, 160)); 949 sha256rnds2(state1, state0); 950 movdqa(msgtmp4, msgtmp2); 951 palignr(msgtmp4, msgtmp1, 4); 952 paddd(msgtmp3, msgtmp4); 953 sha256msg2(msgtmp3, msgtmp2); 954 pshufd(msg, msg, 0x0E); 955 sha256rnds2(state0, state1); 956 sha256msg1(msgtmp1, msgtmp2); 957 958 // Rounds 44-47 959 movdqa(msg, msgtmp3); 960 paddd(msg, Address(rax, 176)); 961 sha256rnds2(state1, state0); 962 movdqa(msgtmp4, msgtmp3); 963 palignr(msgtmp4, msgtmp2, 4); 964 paddd(msgtmp0, msgtmp4); 965 sha256msg2(msgtmp0, msgtmp3); 966 pshufd(msg, msg, 0x0E); 967 sha256rnds2(state0, state1); 968 sha256msg1(msgtmp2, msgtmp3); 969 970 // Rounds 48-51 971 movdqa(msg, msgtmp0); 972 paddd(msg, Address(rax, 192)); 973 sha256rnds2(state1, state0); 974 movdqa(msgtmp4, msgtmp0); 975 palignr(msgtmp4, msgtmp3, 4); 976 paddd(msgtmp1, msgtmp4); 977 sha256msg2(msgtmp1, msgtmp0); 978 pshufd(msg, msg, 0x0E); 979 sha256rnds2(state0, state1); 980 sha256msg1(msgtmp3, msgtmp0); 981 982 // Rounds 52-55 983 movdqa(msg, msgtmp1); 984 paddd(msg, Address(rax, 208)); 985 sha256rnds2(state1, state0); 986 movdqa(msgtmp4, msgtmp1); 987 palignr(msgtmp4, msgtmp0, 4); 988 paddd(msgtmp2, msgtmp4); 989 sha256msg2(msgtmp2, msgtmp1); 990 pshufd(msg, msg, 0x0E); 991 sha256rnds2(state0, state1); 992 993 // Rounds 56-59 994 movdqa(msg, msgtmp2); 995 paddd(msg, Address(rax, 224)); 996 sha256rnds2(state1, state0); 997 movdqa(msgtmp4, msgtmp2); 998 palignr(msgtmp4, msgtmp1, 4); 999 paddd(msgtmp3, msgtmp4); 1000 sha256msg2(msgtmp3, msgtmp2); 1001 pshufd(msg, msg, 0x0E); 1002 sha256rnds2(state0, state1); 1003 1004 // Rounds 60-63 1005 movdqa(msg, msgtmp3); 1006 paddd(msg, Address(rax, 240)); 1007 sha256rnds2(state1, state0); 1008 pshufd(msg, msg, 0x0E); 1009 sha256rnds2(state0, state1); 1010 movdqu(msg, Address(rsp, 0)); 1011 paddd(state0, msg); 1012 movdqu(msg, Address(rsp, 16)); 1013 paddd(state1, msg); 1014 1015 if (multi_block) { 1016 // increment data pointer and loop if more to process 1017 
addptr(buf, 64); 1018 addptr(ofs, 64); 1019 cmpptr(ofs, limit); 1020 jcc(Assembler::belowEqual, loop0); 1021 movptr(rax, ofs); //return ofs 1022 } 1023 1024 pshufd(state0, state0, 0x1B); 1025 pshufd(state1, state1, 0xB1); 1026 movdqa(msgtmp4, state0); 1027 pblendw(state0, state1, 0xF0); 1028 palignr(state1, msgtmp4, 8); 1029 1030 movdqu(Address(state, 0), state0); 1031 movdqu(Address(state, 16), state1); 1032 1033 bind(done_hash); 1034 1035 }
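
/*
  Readability aid (not part of the build): a minimal scalar sketch of the per-block
  transformation that the SHA-256 generators above emit code for. The function and parameter
  names are illustrative only; k256 stands for the round-constant table that the generated code
  reads via StubRoutines::x86::k256_addr() / k256_W_addr().

    static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

    static void sha256_block_ref(uint32_t st[8], const uint8_t blk[64], const uint32_t k256[64]) {
      uint32_t w[64];
      for (int t = 0; t < 16; t++) {          // big-endian loads, as done by the BYTE_FLIP shuffles
        w[t] = ((uint32_t)blk[4*t] << 24) | ((uint32_t)blk[4*t+1] << 16) |
               ((uint32_t)blk[4*t+2] << 8) | (uint32_t)blk[4*t+3];
      }
      for (int t = 16; t < 64; t++) {         // message schedule (sigma0/sigma1)
        uint32_t s0 = rotr32(w[t-15], 7) ^ rotr32(w[t-15], 18) ^ (w[t-15] >> 3);
        uint32_t s1 = rotr32(w[t-2], 17) ^ rotr32(w[t-2], 19) ^ (w[t-2] >> 10);
        w[t] = w[t-16] + s0 + w[t-7] + s1;
      }
      uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
      uint32_t e = st[4], f = st[5], g = st[6], h = st[7];
      for (int t = 0; t < 64; t++) {          // 64 rounds (Sigma0/Sigma1, Ch, Maj)
        uint32_t t1 = h + (rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25)) +
                      ((e & f) ^ (~e & g)) + k256[t] + w[t];
        uint32_t t2 = (rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22)) +
                      ((a & b) ^ (a & c) ^ (b & c));
        h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2;
      }
      st[0] += a; st[1] += b; st[2] += c; st[3] += d;
      st[4] += e; st[5] += f; st[6] += g; st[7] += h;
    }
*/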