1 /* 2 * Copyright (c) 2016, Intel Corporation. 3 * 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/assembler.hpp" 28 #include "asm/assembler.inline.hpp" 29 #include "runtime/stubRoutines.hpp" 30 #include "macroAssembler_x86.hpp" 31 32 // ofs and limit are used for multi-block byte array. 
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
//
// Emits SHA-1 compression using the x86 SHA extensions (sha1rnds4, sha1nexte,
// sha1msg1, sha1msg2).  The 80 rounds are generated four at a time; the
// immediate operand of sha1rnds4 (0..3) selects the round-group constant and
// boolean function for rounds 0-19, 20-39, 40-59 and 60-79 respectively.
//
//   abcd        - xmm register holding state words a,b,c,d
//   e0, e1      - xmm copies of state word e, alternated between round groups
//   msg0..msg3  - xmm message-schedule registers (16 bytes = 4 rounds each)
//   shuf_mask   - xmm scratch, loaded with masks from the stub constant area
//   buf         - pointer to the input block(s)
//   state       - pointer to the 5-dword SHA-1 state
//   ofs, limit  - byte-array cursor and bound; used only when multi_block
//   rsp         - stack pointer; 32 bytes at (rsp,0)/(rsp,16) save e0/abcd
//   multi_block - when true, loop over consecutive 64-byte blocks while
//                 ofs <= limit, leaving the final ofs in rax as the
//                 intrinsic's int return value
void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
  Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {

  Label start, done_hash, loop0;

  address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
  address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();

  bind(start);
  // Load a,b,c,d; place e in the high dword of e0 and clear its other lanes.
  movdqu(abcd, Address(state, 0));
  pinsrd(e0, Address(state, 16), 3);
  movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
  pand(e0, shuf_mask);
  pshufd(abcd, abcd, 0x1B); // reverse dword order for the SHA instructions
  movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f

  bind(loop0);
  // Save hash values for addition after rounds
  movdqu(Address(rsp, 0), e0);
  movdqu(Address(rsp, 16), abcd);


  // Rounds 0 - 3
  movdqu(msg0, Address(buf, 0));
  pshufb(msg0, shuf_mask);  // byte-swap big-endian input dwords
  paddd(e0, msg0);          // first group adds w directly; later groups use sha1nexte
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);

  // Rounds 4 - 7
  movdqu(msg1, Address(buf, 16));
  pshufb(msg1, shuf_mask);
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg0, msg1);

  // Rounds 8 - 11
  movdqu(msg2, Address(buf, 32));
  pshufb(msg2, shuf_mask);
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 12 - 15
  movdqu(msg3, Address(buf, 48));
  pshufb(msg3, shuf_mask);
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 16 - 19
  // From here on msg0..msg3 are recycled: sha1msg1 + pxor + sha1msg2
  // produce the next four schedule words in place each group.
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 20 - 23
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 24 - 27
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 28 - 31
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 32 - 35
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 36 - 39
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 40 - 43
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 44 - 47
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 48 - 51
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 52 - 55
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 56 - 59
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 60 - 63
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 3);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 64 - 67
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 3);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 68 - 71
  // No further sha1msg1 needed: the remaining schedule words are complete.
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 3);
  pxor(msg3, msg1);

  // Rounds 72 - 75
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 3);

  // Rounds 76 - 79
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 3);

  // add current hash values with previously saved
  movdqu(msg0, Address(rsp, 0));
  sha1nexte(e0, msg0);
  movdqu(msg0, Address(rsp, 16));
  paddd(abcd, msg0);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }
  // write hash values back in the correct order
  pshufd(abcd, abcd, 0x1b);
  movdqu(Address(state, 0), abcd);
  pextrd(Address(state, 16), e0, 3);

  bind(done_hash);

}

// xmm0 (msg) is used as an implicit argument to sha256rnds2
// and state0 and state1 can never use xmm0 register.
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
//
// Emits SHA-256 compression using the x86 SHA extensions (sha256rnds2,
// sha256msg1, sha256msg2).  sha256rnds2 performs two rounds at a time and
// reads the round constant + schedule word sum implicitly from xmm0 (msg),
// so each four-round group loads/derives w, adds K256 (table base cached in
// rax), runs two sha256rnds2, shuffles the upper qword down, and runs two
// more.
//
//   msg              - must be xmm0 (implicit sha256rnds2 operand)
//   state0, state1   - working state halves, rearranged below into the
//                      dword order sha256rnds2 expects
//   msgtmp0..msgtmp4 - message-schedule scratch registers
//   buf, state       - input pointer and 8-dword SHA-256 state pointer
//   ofs, limit       - multi-block bounds; only used when multi_block
//   rsp              - stack pointer; 32 bytes used to save state across rounds
//   multi_block      - when true, loop over 64-byte blocks while ofs <= limit
//                      and leave the updated ofs in rax (the return value)
//   shuf_mask (LP64) - xmm register caching the byte-flip mask; on 32-bit the
//                      mask is applied from memory instead
#ifdef _LP64
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {
#else
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block) {
#endif
  Label start, done_hash, loop0;

  address K256 = StubRoutines::x86::k256_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();

  bind(start);
  movdqu(state0, Address(state, 0));
  movdqu(state1, Address(state, 16));

  // Rearrange the eight state dwords into the two-register layout required
  // by sha256rnds2 (swap within state0, reverse state1, then interleave).
  pshufd(state0, state0, 0xB1);
  pshufd(state1, state1, 0x1B);
  movdqa(msgtmp4, state0);
  palignr(state0, state1, 8);
  pblendw(state1, msgtmp4, 0xF0);

#ifdef _LP64
  movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  lea(rax, ExternalAddress(K256)); // rax = K256 table base for the round adds

  bind(loop0);
  // Save hash values for addition after rounds
  movdqu(Address(rsp, 0), state0);
  movdqu(Address(rsp, 16), state1);

  // Rounds 0-3
  movdqu(msg, Address(buf, 0));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp0, msg);
  paddd(msg, Address(rax, 0));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);  // move upper two words down for the next two rounds
  sha256rnds2(state0, state1);

  // Rounds 4-7
  movdqu(msg, Address(buf, 16));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp1, msg);
  paddd(msg, Address(rax, 16));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 8-11
  movdqu(msg, Address(buf, 32));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp2, msg);
  paddd(msg, Address(rax, 32));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 12-15
  movdqu(msg, Address(buf, 48));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp3, msg);
  paddd(msg, Address(rax, 48));
  sha256rnds2(state1, state0);
  // Schedule expansion: msgtmp4 = w[-7..-4] via palignr, folded into
  // sha256msg1/sha256msg2 to produce the next four schedule words in place.
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 16-19
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 64));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 20-23
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 80));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 24-27
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 96));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 28-31
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 112));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 32-35
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 128));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 36-39
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 144));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 40-43
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 160));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 44-47
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 176));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 48-51
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 192));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 52-55
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 208));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 56-59
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 224));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 60-63
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 240));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  // add current hash values with previously saved
  movdqu(msg, Address(rsp, 0));
  paddd(state0, msg);
  movdqu(msg, Address(rsp, 16));
  paddd(state1, msg);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }

  // Undo the initial rearrangement and write the state back in memory order.
  pshufd(state0, state0, 0x1B);
  pshufd(state1, state1, 0xB1);
  movdqa(msgtmp4, state0);
  pblendw(state0, state1, 0xF0);
  palignr(state1, msgtmp4, 8);

  movdqu(Address(state, 0), state0);
  movdqu(Address(state, 16), state1);

  bind(done_hash);

}

#ifdef _LP64
/*
  The algorithm below is based on Intel publication:
  "Fast SHA-256 Implementations on Intel(R) Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
  The assembly code was originally provided by Sean Gulley and in many places preserves
  the original assembly NAMES and comments to simplify matching Java assembly with its original.
  The Java version was substantially redesigned to replace 1200 assembly instruction with
  much shorter run-time generator of the same code in memory.
*/

// Emits one scalar SHA-256 round (the "compute" half, no message scheduling).
// The caller rotates the eight state registers between iterations; reg_old_h
// carries the previous iteration's h so the two deferred additions
// (S1+CH and MAJ) from the prior round can be folded in here, overlapping
// dependency chains.  reg_y0..reg_y3/reg_T1 are fixed scratch registers;
// the pre-computed k+w values are read from the stack XFER area indexed by rdx.
void MacroAssembler::sha256_AVX2_one_round_compute(
    Register  reg_old_h,
    Register  reg_a,
    Register  reg_b,
    Register  reg_c,
    Register  reg_d,
    Register  reg_e,
    Register  reg_f,
    Register  reg_g,
    Register  reg_h,
    int iter) {
  const Register& reg_y0 = r13;
  const Register& reg_y1 = r14;
  const Register& reg_y2 = r15;
  const Register& reg_y3 = rcx;
  const Register& reg_T1 = r12;
  //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
  if (iter%4 > 0) {
    addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  }
  movl(reg_y2, reg_f);         // reg_y2 = reg_f                       ; CH
  rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25                 ; S1A
  rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11                 ; S1B
  xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g                 ; CH

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11)   ; S1
  rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6)                ; S1
  andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e         ; CH

  if (iter%4 > 0) {
    addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ                ; --
  }

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(reg_T1, reg_a, 13);    // reg_T1 = reg_a >> 13                 ; S0B
  xorl(reg_y2, reg_g);         // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH
  rorxd(reg_y1, reg_a, 22);    // reg_y1 = reg_a >> 22                 ; S0A
  movl(reg_y3, reg_a);         // reg_y3 = reg_a                       ; MAJA

  xorl(reg_y1, reg_T1);        // reg_y1 = (reg_a>>22) ^ (reg_a>>13)   ; S0
  rorxd(reg_T1, reg_a, 2);     // reg_T1 = (reg_a >> 2)                ; S0
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
  orl(reg_y3, reg_c);          // reg_y3 = reg_a|reg_c                 ; MAJA

  xorl(reg_y1, reg_T1);        // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(reg_T1, reg_a);         // reg_T1 = reg_a                       ; MAJB
  andl(reg_y3, reg_b);         // reg_y3 = (reg_a|reg_c)&reg_b         ; MAJA
  andl(reg_T1, reg_c);         // reg_T1 = reg_a&reg_c                 ; MAJB
  addl(reg_y2, reg_y0);        // reg_y2 = S1 + CH                     ; --


  addl(reg_d, reg_h);          // reg_d = k + w + reg_h + reg_d        ; --
  orl(reg_y3, reg_T1);         // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
  addl(reg_h, reg_y1);         // reg_h = k + w + reg_h + S0           ; --

  addl(reg_d, reg_y2);         // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --


  if (iter%4 == 3) {
    // Last round of the group: no following round to fold these into,
    // so complete h here instead of deferring.
    addl(reg_h, reg_y2);       // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
    addl(reg_h, reg_y3);       // reg_h = t1 + S0 + MAJ                ; --
  }
}

// Four rounds starting with a,b,c,d,e,f,g,h in the canonical rotation
// (used for the first half of each 8-round chunk in the no-schedule tail).
void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
  sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, start + 0);
  sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, start + 1);
  sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  start + 2);
  sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi, r8,  start + 3);
}

// Four rounds with the register rotation advanced by four positions
// (second half of each 8-round chunk).
void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
  sha256_AVX2_one_round_compute(r8,  r8,  r9,  r10, r11, rax, rbx, rdi, rsi, start + 0);
  sha256_AVX2_one_round_compute(rsi, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, start + 1);
  sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, start + 2);
  sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, start + 3);
}

// One scalar round interleaved with one quarter of the AVX2 message-schedule
// computation for the next 4 dwords (two blocks at once, one dword lane per
// block half).  The iter%4 value selects which quarter of the schedule work
// is emitted alongside the round.
void MacroAssembler::sha256_AVX2_one_round_and_sched(
        XMMRegister  xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
        XMMRegister  xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
        XMMRegister  xmm_2,     /* ymm6 */
        XMMRegister  xmm_3,     /* ymm7 */
        Register  reg_a,        /* == rax on 0 iteration, then rotate 8 register right on each next iteration */
        Register  reg_b,        /* rbx */    /* full cycle is 8 iterations */
        Register  reg_c,        /* rdi */
        Register  reg_d,        /* rsi */
        Register  reg_e,        /* r8 */
        Register  reg_f,        /* r9d */
        Register  reg_g,        /* r10d */
        Register  reg_h,        /* r11d */
        int iter)
{
  movl(rcx, reg_a);           // rcx = reg_a                          ; MAJA
  rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25                    ; S1A
  rorxd(r14, reg_e, 11);      // r14 = reg_e >> 11                    ; S1B
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
  orl(rcx, reg_c);            // rcx = reg_a|reg_c                    ; MAJA

  movl(r15, reg_f);           // r15 = reg_f                          ; CH
  rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13                    ; S0B
  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11)      ; S1
  xorl(r15, reg_g);           // r15 = reg_f^reg_g                    ; CH

  rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6)                   ; S1
  andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e            ; CH

  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22                    ; S0A
  addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d        ; --

  andl(rcx, reg_b);           // rcx = (reg_a|reg_c)&reg_b            ; MAJA
  xorl(r14, r12);             // r14 = (reg_a>>22) ^ (reg_a>>13)      ; S0

  rorxd(r12, reg_a, 2);       // r12 = (reg_a >> 2)                   ; S0
  xorl(r15, reg_g);           // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH

  xorl(r14, r12);             // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(r12, reg_a);           // r12 = reg_a                          ; MAJB
  andl(r12, reg_c);           // r12 = reg_a&reg_c                    ; MAJB
  addl(r15, r13);             // r15 = S1 + CH                        ; --

  orl(rcx, r12);              // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
  addl(reg_h, r14);           // reg_h = k + w + reg_h + S0           ; --
  addl(reg_d, r15);           // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --

  addl(reg_h, r15);           // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  addl(reg_h, rcx);           // reg_h = t1 + S0 + MAJ                ; --

  if (iter%4 == 0) {
    vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
    vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1
    vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
    vpsrld(xmm2, xmm1, 7, AVX_256bit);
    vpslld(xmm3, xmm1, 32-7, AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
    vpsrld(xmm2, xmm1,18, AVX_256bit);
  } else if (iter%4 == 1 ) {
    vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
    vpslld(xmm1, xmm1, 32-18, AVX_256bit);
    vpxor(xmm3, xmm3, xmm1, AVX_256bit);
    vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
    vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
    vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
    vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
    vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
  } else if (iter%4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
    vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
    vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
    vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
  } else if (iter%4 == 3) {
    vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
    vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
    vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
  }
}

// addm: memory[r1+disp] += r2; r2 receives the sum.  Used to fold the
// working variables back into the digest after each block.
void MacroAssembler::addm(int disp, Register r1, Register r2) {
  addl(r2, Address(r1, disp));
  movl(Address(r1, disp), r2);
}

// Two-blocks-at-a-time SHA-256 using AVX2 (rorx/vector schedule), following
// the Intel fast-SHA-256 publication referenced above.  The message schedule
// for two 64-byte blocks is computed in ymm registers while rounds for the
// first block run; the second block's rounds then reuse the k+w values saved
// in the stack XFER area.  Linux arguments are shuffled into the win64
// registers up front so a single code path serves both ABIs.  When
// multi_block, the processed byte count is recomputed at the end into rax
// (the intrinsic's int return value).
// Note: msg/state0/...//shuf_mask parameters are unused here; this generator
// works on a fixed register assignment (see the named aliases below) to
// mirror the original assembly.
void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {

  Label loop0, loop1, loop2, loop3,
        last_block_enter, do_last_block, only_one_block, done_hash,
        compute_size, compute_size_end,
        compute_size1, compute_size_end1;

  address K256_W = StubRoutines::x86::k256_W_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
  address pshuffle_byte_flip_mask_addr = 0;

  const XMMRegister& SHUF_00BA        = xmm10;   // ymm10: shuffle xBxA -> 00BA
  const XMMRegister& SHUF_DC00        = xmm12;   // ymm12: shuffle xDxC -> DC00
  const XMMRegister& BYTE_FLIP_MASK   = xmm13;   // ymm13

  const XMMRegister& X_BYTE_FLIP_MASK = xmm13;   //XMM version of BYTE_FLIP_MASK

  const Register& NUM_BLKS = r8;   // 3rd arg
  const Register& CTX      = rdx;  // 2nd arg
  const Register& INP      = rcx;  // 1st arg

  const Register& c        = rdi;
  const Register& d        = rsi;
  const Register& e        = r8;   // clobbers NUM_BLKS
  const Register& y3       = rcx;  // clobbers INP

  const Register& TBL      = rbp;
  const Register& SRND     = CTX;  // SRND is same register as CTX

  const Register& a        = rax;
  const Register& b        = rbx;
  const Register& f        = r9;
  const Register& g        = r10;
  const Register& h        = r11;

  const Register& T1       = r12;
  const Register& y0       = r13;
  const Register& y1       = r14;
  const Register& y2       = r15;


  // Aligned stack frame layout (built below with sub/and on rsp).
  enum {
    _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
#ifndef _WIN64
    _XMM_SAVE_SIZE = 0,
#else
    _XMM_SAVE_SIZE = 8*16,
#endif
    _INP_END_SIZE = 8,
    _INP_SIZE = 8,
    _CTX_SIZE = 8,
    _RSP_SIZE = 8,

    _XFER = 0,
    _XMM_SAVE  = _XFER     + _XFER_SIZE,
    _INP_END   = _XMM_SAVE + _XMM_SAVE_SIZE,
    _INP       = _INP_END  + _INP_END_SIZE,
    _CTX       = _INP      + _INP_SIZE,
    _RSP       = _CTX      + _CTX_SIZE,
    STACK_SIZE = _RSP      + _RSP_SIZE
  };

#ifndef _WIN64
  push(rcx);    // linux: this is limit, need at the end
  push(rdx);    // linux: this is ofs
#else
  push(r8);     // win64: this is ofs
  push(r9);     // win64: this is limit, we need them again at the very end
#endif


  push(rbx);
#ifdef _WIN64
  push(rsi);
  push(rdi);
#endif
  push(rbp);
  push(r12);
  push(r13);
  push(r14);
  push(r15);

  // Build a 32-byte-aligned frame; the original rsp is saved at _RSP.
  movq(rax, rsp);
  subq(rsp, STACK_SIZE);
  andq(rsp, -32);
  movq(Address(rsp, _RSP), rax);

#ifndef _WIN64
  // copy linux params to win64 params, therefore the rest of code will be the same for both
  movq(r9,  rcx);
  movq(r8,  rdx);
  movq(rdx, rsi);
  movq(rcx, rdi);
#endif

  // setting original assembly ABI
  /** message to encrypt in INP */
  lea(INP, Address(rcx, 0)); // rcx == message (buf)     ;; linux: INP = buf = rdi
  /** digest in CTX             */
  movq(CTX, rdx);            // rdx = digest  (state)    ;; linux: CTX = state = rsi

  /** NUM_BLK is the length of message, need to set it from ofs and limit  */
  if (multi_block) {

    // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
    // on entry r8 = ofs
    // on exit  r8 = NUM_BLKS

    xorq(rax, rax);

    // NUM_BLKS (in bytes) = 64 * number of 64-byte steps from ofs up to limit.
    bind(compute_size);
    cmpptr(r8, r9); // assume the original ofs <= limit ;; linux:  cmp rcx, rdx
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(r8, 64);   //;; linux: ofs = rdx
    addq(rax, 64);
    jmpb(compute_size);

    bind(compute_size_end);
    movq(NUM_BLKS, rax); // NUM_BLK (r8)  ;; linux: NUM_BLK = rdx

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);

  } else {
    // Single-block mode: process exactly 64 bytes.
    xorq(NUM_BLKS, NUM_BLKS);
    addq(NUM_BLKS, 64);
  } //if (!multi_block)

  lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
  movq(Address(rsp, _INP_END), NUM_BLKS);

  cmpptr(INP, NUM_BLKS);                  //cmp INP, NUM_BLKS
  jcc(Assembler::equal, only_one_block);  //je only_one_block

  // load initial digest
  movl(a, Address(CTX, 4*0));
  movl(b, Address(CTX, 4*1));
  movl(c, Address(CTX, 4*2));
  movl(d, Address(CTX, 4*3));
  movl(e, Address(CTX, 4*4));
  movl(f, Address(CTX, 4*5));
  movl(g, Address(CTX, 4*6));
  movl(h, Address(CTX, 4*7));

  // The stub constant area holds the byte-flip mask followed by the two
  // s1-lane shuffle masks at +32 and +64.
  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));    //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));    //[_SHUF_DC00 wrt rip]

  movq(Address(rsp, _CTX), CTX); // store

  bind(loop0);
  lea(TBL, ExternalAddress(K256_W));

  // assume buffers not aligned

  // Load first 16 dwords from two blocks
  vmovdqu(xmm0, Address(INP, 0*32));
  vmovdqu(xmm1, Address(INP, 1*32));
  vmovdqu(xmm2, Address(INP, 2*32));
  vmovdqu(xmm3, Address(INP, 3*32));

  // byte swap data
  vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);

  // transpose data into high/low halves
  vperm2i128(xmm4, xmm0, xmm2, 0x20);
  vperm2i128(xmm5, xmm0, xmm2, 0x31);
  vperm2i128(xmm6, xmm1, xmm3, 0x20);
  vperm2i128(xmm7, xmm1, xmm3, 0x31);

  bind(last_block_enter);
  addq(INP, 64);
  movq(Address(rsp, _INP), INP);

  //;; schedule 48 input dwords, by doing 3 rounds of 12 each
  xorq(SRND, SRND);

  align(16);
  bind(loop1);
  // Each group: k+w for 8 rounds (both block halves) is computed, saved to
  // the XFER area, then 4 rounds run interleaved with schedule updates.
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 0);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 1);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  2);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  3);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi, 8+0);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, 8+1);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, 8+2);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, 8+3);

  vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 16+0);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 16+1);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  16+2);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  16+3);

  vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);

  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi, 24+0);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, 24+1);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, 24+2);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, 24+3);

  addq(SRND, 4*32);
  cmpq(SRND, 3 * 4*32);
  jcc(Assembler::below, loop1);

  bind(loop2);
  // Do last 16 rounds with no scheduling
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_four_rounds_compute_first(0);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_four_rounds_compute_last(0 + 8);

  addq(SRND, 2*32);

  vmovdqu(xmm4, xmm6);
  vmovdqu(xmm5, xmm7);

  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop2);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::above, done_hash);

  //Do second block using previously scheduled results
  xorq(SRND, SRND);
  align(16);
  bind(loop3);
  sha256_AVX2_four_rounds_compute_first(4);
  sha256_AVX2_four_rounds_compute_last(4+8);

  addq(SRND, 2*32);
  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop3);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));
  addq(INP, 64);

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::below, loop0);         // at least two more blocks remain
  jccb(Assembler::above, done_hash);    // past the end - finished

  // Exactly one 64-byte block left: load it into the low xmm halves only.
  bind(do_last_block);
  lea(TBL, ExternalAddress(K256_W));

  movdqu(xmm4, Address(INP, 0*16));
  movdqu(xmm5, Address(INP, 1*16));
  movdqu(xmm6, Address(INP, 2*16));
  movdqu(xmm7, Address(INP, 3*16));

  vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
  vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
  vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
  vpshufb(xmm7, xmm7, xmm13, AVX_128bit);

  jmp(last_block_enter);

  bind(only_one_block);

  // load initial digest ;; table should be preloaded with following values
  movl(a, Address(CTX, 4*0)); // 0x6a09e667
  movl(b, Address(CTX, 4*1)); // 0xbb67ae85
  movl(c, Address(CTX, 4*2)); // 0x3c6ef372
  movl(d, Address(CTX, 4*3)); // 0xa54ff53a
  movl(e, Address(CTX, 4*4)); // 0x510e527f
  movl(f, Address(CTX, 4*5)); // 0x9b05688c
  movl(g, Address(CTX, 4*6)); // 0x1f83d9ab
  movl(h, Address(CTX, 4*7)); // 0x5be0cd19


  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]

  movq(Address(rsp, _CTX), CTX);
  jmpb(do_last_block);

  bind(done_hash);

  // Restore the pre-frame rsp, then the callee-saved and argument registers.
  movq(rsp, Address(rsp, _RSP));

  pop(r15);
  pop(r14);
  pop(r13);
  pop(r12);
  pop(rbp);
#ifdef _WIN64
  pop(rdi);
  pop(rsi);
#endif
  pop(rbx);

#ifdef _WIN64
  pop(r9);
  pop(r8);
#else
  pop(rdx);
  pop(rcx);
#endif

  if (multi_block) {
    // Recompute the final ofs (= first multiple-of-64 step past limit) into
    // rax - the value returned by the intrinsic.
#ifdef _WIN64
    const Register& limit_end = r9;
    const Register& ofs_end   = r8;
#else
    const Register& limit_end = rcx;
    const Register& ofs_end   = rdx;
#endif
    movq(rax, ofs_end);

    bind(compute_size1);
    cmpptr(rax, limit_end); // assume the original ofs <= limit
    jccb(Assembler::aboveEqual, compute_size_end1);
    addq(rax, 64);
    jmpb(compute_size1);

    bind(compute_size_end1);
  }
}
#endif //#ifdef _LP64