    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }

  pshufd(state0, state0, 0x1B);
  pshufd(state1, state1, 0xB1);
  movdqa(msgtmp4, state0);
  pblendw(state0, state1, 0xF0);
  palignr(state1, msgtmp4, 8);

  movdqu(Address(state, 0), state0);
  movdqu(Address(state, 16), state1);

  bind(done_hash);

}

#ifdef _LP64
/*
  The algorithm below is based on the Intel publication:
  "Fast SHA-256 Implementations on Intel® Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
  The assembly code was originally provided by Sean Gulley and in many places preserves
  the original assembly NAMES and comments to simplify matching the Java assembly with its original.
  The Java version was substantially redesigned to replace 1200 assembly instructions with
  a much shorter run-time generator of the same code in memory.
*/
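
/*
  For orientation only -- a scalar sketch of the SHA-256 round that the generators below
  emit (per FIPS 180-4); ror/CH/MAJ/S0/S1 are standard SHA-256 notation used here for
  illustration and are not helpers defined in this file:

    S1  = ror(e, 6) ^ ror(e, 11) ^ ror(e, 25);    // "S1" in the comments below
    S0  = ror(a, 2) ^ ror(a, 13) ^ ror(a, 22);    // "S0"
    CH  = (e & f) ^ (~e & g);                     // computed below as ((f ^ g) & e) ^ g
    MAJ = (a & b) ^ (a & c) ^ (b & c);            // computed below as ((a | c) & b) | (a & c)
    t1  = h + S1 + CH + K[i] + W[i];
    d  += t1;                                     // d becomes the new e
    h   = t1 + S0 + MAJ;                          // h becomes the new a; register roles rotate by one

  To shorten the dependency chain, sha256_AVX2_one_round_compute() defers the final
  "+ S1 + CH" and "+ MAJ" additions into reg_h of the *next* round (the iter%4 > 0 guards)
  and flushes them explicitly on the last round of each group of four (iter%4 == 3).
*/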

void MacroAssembler::sha256_AVX2_one_round_compute(
    Register reg_old_h,
    Register reg_a,
    Register reg_b,
    Register reg_c,
    Register reg_d,
    Register reg_e,
    Register reg_f,
    Register reg_g,
    Register reg_h,
    int iter) {
  const Register& reg_y0 = r13;
  const Register& reg_y1 = r14;
  const Register& reg_y2 = r15;
  const Register& reg_y3 = rcx;
  const Register& reg_T1 = r12;
  //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
  if (iter%4 > 0) {
    addl(reg_old_h, reg_y2);  // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  }
  movl(reg_y2, reg_f);        // reg_y2 = reg_f                                 ; CH
  rorxd(reg_y0, reg_e, 25);   // reg_y0 = reg_e >> 25                           ; S1A
  rorxd(reg_y1, reg_e, 11);   // reg_y1 = reg_e >> 11                           ; S1B
  xorl(reg_y2, reg_g);        // reg_y2 = reg_f^reg_g                           ; CH

  xorl(reg_y0, reg_y1);       // reg_y0 = (reg_e>>25) ^ (reg_e>>11)             ; S1
  rorxd(reg_y1, reg_e, 6);    // reg_y1 = (reg_e >> 6)                          ; S1
  andl(reg_y2, reg_e);        // reg_y2 = (reg_f^reg_g)&reg_e                   ; CH

  if (iter%4 > 0) {
    addl(reg_old_h, reg_y3);  // reg_h = t1 + S0 + MAJ                          ; --
  }

  xorl(reg_y0, reg_y1);       // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(reg_T1, reg_a, 13);   // reg_T1 = reg_a >> 13                           ; S0B
  xorl(reg_y2, reg_g);        // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g      ; CH
  rorxd(reg_y1, reg_a, 22);   // reg_y1 = reg_a >> 22                           ; S0A
  movl(reg_y3, reg_a);        // reg_y3 = reg_a                                 ; MAJA

  xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13)             ; S0
  rorxd(reg_T1, reg_a, 2);    // reg_T1 = (reg_a >> 2)                          ; S0
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
  orl(reg_y3, reg_c);         // reg_y3 = reg_a|reg_c                           ; MAJA

  xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(reg_T1, reg_a);        // reg_T1 = reg_a                                 ; MAJB
  andl(reg_y3, reg_b);        // reg_y3 = (reg_a|reg_c)&reg_b                   ; MAJA
  andl(reg_T1, reg_c);        // reg_T1 = reg_a&reg_c                           ; MAJB
  addl(reg_y2, reg_y0);       // reg_y2 = S1 + CH                               ; --

  addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                  ; --
  orl(reg_y3, reg_T1);        // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
  addl(reg_h, reg_y1);        // reg_h = k + w + reg_h + S0                     ; --

  addl(reg_d, reg_y2);        // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --

  if (iter%4 == 3) {
    addl(reg_h, reg_y2);      // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
    addl(reg_h, reg_y3);      // reg_h = t1 + S0 + MAJ                          ; --
  }
}

void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
  sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, start + 0);
  sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, start + 1);
  sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  start + 2);
  sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi, r8,  start + 3);
}

void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
  sha256_AVX2_one_round_compute(r8,  r8,  r9,  r10, r11, rax, rbx, rdi, rsi, start + 0);
  sha256_AVX2_one_round_compute(rsi, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, start + 1);
  sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, start + 2);
  sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, start + 3);
}
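
/*
  Reference only: the message schedule interleaved into the rounds below follows the
  standard FIPS 180-4 recurrence (s0/s1 are the small sigma functions, shown as plain C
  for illustration, not helpers defined in this file):

    s0   = ror(W[t-15], 7) ^ ror(W[t-15], 18) ^ (W[t-15] >> 3);
    s1   = ror(W[t-2], 17) ^ ror(W[t-2],  19) ^ (W[t-2] >> 10);
    W[t] = W[t-16] + s0 + W[t-7] + s1;

  sha256_AVX2_one_round_and_sched() spreads this computation for the next four W values
  (two blocks at once, one block per 128-bit lane) across a group of four rounds, picking
  the step with iter%4; the ymm4..ymm7 arguments rotate so that xmm_0 always holds the
  oldest four words, W[-16..-13].
*/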

void MacroAssembler::sha256_AVX2_one_round_and_sched(
        XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
        XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
        XMMRegister xmm_2,     /* ymm6 */
        XMMRegister xmm_3,     /* ymm7 */
        Register reg_a,        /* == rax on 0 iteration, then rotate 8 registers right on each next iteration */
        Register reg_b,        /* rbx */   /* full cycle is 8 iterations */
        Register reg_c,        /* rdi */
        Register reg_d,        /* rsi */
        Register reg_e,        /* r8 */
        Register reg_f,        /* r9d */
        Register reg_g,        /* r10d */
        Register reg_h,        /* r11d */
        int iter)
{
  movl(rcx, reg_a);           // rcx = reg_a                                    ; MAJA
  rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25                              ; S1A
  rorxd(r14, reg_e, 11);      // r14 = reg_e >> 11                              ; S1B
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
  orl(rcx, reg_c);            // rcx = reg_a|reg_c                              ; MAJA

  movl(r15, reg_f);           // r15 = reg_f                                    ; CH
  rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13                              ; S0B
  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11)                ; S1
  xorl(r15, reg_g);           // r15 = reg_f^reg_g                              ; CH

  rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6)                             ; S1
  andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e                      ; CH

  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6)   ; S1
  rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22                              ; S0A
  addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                  ; --

  andl(rcx, reg_b);           // rcx = (reg_a|reg_c)&reg_b                      ; MAJA
  xorl(r14, r12);             // r14 = (reg_a>>22) ^ (reg_a>>13)                ; S0

  rorxd(r12, reg_a, 2);       // r12 = (reg_a >> 2)                             ; S0
  xorl(r15, reg_g);           // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g         ; CH

  xorl(r14, r12);             // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2)   ; S0
  movl(r12, reg_a);           // r12 = reg_a                                    ; MAJB
  andl(r12, reg_c);           // r12 = reg_a&reg_c                              ; MAJB
  addl(r15, r13);             // r15 = S1 + CH                                  ; --

  orl(rcx, r12);              // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
  addl(reg_h, r14);           // reg_h = k + w + reg_h + S0                     ; --
  addl(reg_d, r15);           // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --

  addl(reg_h, r15);           // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0 ; --
  addl(reg_h, rcx);           // reg_h = t1 + S0 + MAJ                          ; --

  if (iter%4 == 0) {
    vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
    vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1
    vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
    vpsrld(xmm2, xmm1, 7, AVX_256bit);
    vpslld(xmm3, xmm1, 32-7, AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
    vpsrld(xmm2, xmm1, 18, AVX_256bit);
  } else if (iter%4 == 1) {
    vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
    vpslld(xmm1, xmm1, 32-18, AVX_256bit);
    vpxor(xmm3, xmm3, xmm1, AVX_256bit);
    vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
    vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
    vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
    vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
    vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
  } else if (iter%4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
    vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
    vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
    vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
  } else if (iter%4 == 3) {
    vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
    vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
    vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
  }
}

void MacroAssembler::addm(int disp, Register r1, Register r2) {
  addl(r2, Address(r1, disp));
  movl(Address(r1, disp), r2);
}
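
/*
  Overview of sha256_AVX2() below, as laid out in the code: input is processed two 64-byte
  blocks at a time.
    - loop0 loads and byte-swaps two blocks and transposes them into ymm4..ymm7;
    - loop1 runs rounds 0..47 of the first block while scheduling W; the K+W vectors for
      both blocks are spilled to the _XFER area on the stack;
    - loop2 runs the last 16 rounds of the first block with no further scheduling;
    - loop3 runs all 64 rounds of the second block from the previously spilled K+W values;
    - a trailing single block (or the only block) goes through only_one_block/do_last_block.
  The digest is accumulated back into the state array with addm().
*/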

void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                                 XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                                 Register buf, Register state, Register ofs, Register limit, Register rsp,
                                 bool multi_block, XMMRegister shuf_mask) {

  Label loop0, loop1, loop2, loop3,
        last_block_enter, do_last_block, only_one_block, done_hash,
        compute_size, compute_size_end,
        compute_size1, compute_size_end1;

  address K256_W = StubRoutines::x86::k256_W_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
  address pshuffle_byte_flip_mask_addr = 0;

  const XMMRegister& SHUF_00BA        = xmm10;   // ymm10: shuffle xBxA -> 00BA
  const XMMRegister& SHUF_DC00        = xmm12;   // ymm12: shuffle xDxC -> DC00
  const XMMRegister& BYTE_FLIP_MASK   = xmm13;   // ymm13

  const XMMRegister& X_BYTE_FLIP_MASK = xmm13;   // XMM version of BYTE_FLIP_MASK

  const Register& NUM_BLKS = r8;    // 3rd arg
  const Register& CTX      = rdx;   // 2nd arg
  const Register& INP      = rcx;   // 1st arg

  const Register& c  = rdi;
  const Register& d  = rsi;
  const Register& e  = r8;    // clobbers NUM_BLKS
  const Register& y3 = rcx;   // clobbers INP

  const Register& TBL  = rbp;
  const Register& SRND = CTX; // SRND is the same register as CTX

  const Register& a = rax;
  const Register& b = rbx;
  const Register& f = r9;
  const Register& g = r10;
  const Register& h = r11;

  const Register& T1 = r12;
  const Register& y0 = r13;
  const Register& y1 = r14;
  const Register& y2 = r15;


  enum {
    _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
#ifndef _WIN64
    _XMM_SAVE_SIZE = 0,
#else
    _XMM_SAVE_SIZE = 8*16,
#endif
    _INP_END_SIZE = 8,
    _INP_SIZE = 8,
    _CTX_SIZE = 8,
    _RSP_SIZE = 8,

    _XFER      = 0,
    _XMM_SAVE  = _XFER     + _XFER_SIZE,
    _INP_END   = _XMM_SAVE + _XMM_SAVE_SIZE,
    _INP       = _INP_END  + _INP_END_SIZE,
    _CTX       = _INP      + _INP_SIZE,
    _RSP       = _CTX      + _CTX_SIZE,
    STACK_SIZE = _RSP      + _RSP_SIZE,
  };

#ifndef _WIN64
  push(rcx);    // linux: this is limit, we need it at the end
  push(rdx);    // linux: this is ofs
#else
  push(r8);     // win64: this is ofs
  push(r9);     // win64: this is limit, we need them again at the very end
#endif


  push(rbx);
#ifdef _WIN64
  push(rsi);
  push(rdi);
#endif
  push(rbp);
  push(r12);
  push(r13);
  push(r14);
  push(r15);

  movq(rax, rsp);
  subq(rsp, STACK_SIZE);
  andq(rsp, -32);
  movq(Address(rsp, _RSP), rax);

#ifndef _WIN64
  // copy linux params to win64 params, so that the rest of the code is the same for both
  movq(r9,  rcx);
  movq(r8,  rdx);
  movq(rdx, rsi);
  movq(rcx, rdi);
#endif

  // setting original assembly ABI
  /** message to hash is in INP */
  lea(INP, Address(rcx, 0)); // rcx == message (buf)     ;; linux: INP = buf = rdi
  /** digest in CTX             */
  movq(CTX, rdx);            // rdx = digest  (state)    ;; linux: CTX = state = rsi

  /** NUM_BLKS is the length of the message, need to set it from ofs and limit */
  if (multi_block) {

    // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
    // on entry r8 = ofs
    // on exit  r8 = NUM_BLKS

    xorq(rax, rax);

    bind(compute_size);
    cmpptr(r8, r9); // assume the original ofs <= limit ;; linux:  cmp rcx, rdx
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(r8, 64);  //;; linux: ofs = rdx
    addq(rax, 64);
    jmpb(compute_size);

    bind(compute_size_end);
    movq(NUM_BLKS, rax);  // NUM_BLKS (r8)  ;; linux: NUM_BLKS = rdx

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);

  } else {
    xorq(NUM_BLKS, NUM_BLKS);
    addq(NUM_BLKS, 64);
  } //if (!multi_block)

  lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
  movq(Address(rsp, _INP_END), NUM_BLKS);

  cmpptr(INP, NUM_BLKS);                   //cmp INP, NUM_BLKS
  jcc(Assembler::equal, only_one_block);   //je only_one_block

  // load initial digest
  movl(a, Address(CTX, 4*0));
  movl(b, Address(CTX, 4*1));
  movl(c, Address(CTX, 4*2));
  movl(d, Address(CTX, 4*3));
  movl(e, Address(CTX, 4*4));
  movl(f, Address(CTX, 4*5));
  movl(g, Address(CTX, 4*6));
  movl(h, Address(CTX, 4*7));

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));  //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA,      ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00,      ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip]

  movq(Address(rsp, _CTX), CTX);           // store

  bind(loop0);
  lea(TBL, ExternalAddress(K256_W));

  // assume buffers not aligned
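  // SHA-256 treats message words as big-endian (FIPS 180-4), so the little-endian loads
  // below are run through vpshufb with BYTE_FLIP_MASK, which reverses the bytes within
  // each 32-bit word -- per dword this is the equivalent of a bswap.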

  // Load first 16 dwords from two blocks
  vmovdqu(xmm0, Address(INP, 0*32));
  vmovdqu(xmm1, Address(INP, 1*32));
  vmovdqu(xmm2, Address(INP, 2*32));
  vmovdqu(xmm3, Address(INP, 3*32));

  // byte swap data
  vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);

  // transpose data into high/low halves
  vperm2i128(xmm4, xmm0, xmm2, 0x20);
  vperm2i128(xmm5, xmm0, xmm2, 0x31);
  vperm2i128(xmm6, xmm1, xmm3, 0x20);
  vperm2i128(xmm7, xmm1, xmm3, 0x31);

  bind(last_block_enter);
  addq(INP, 64);
  movq(Address(rsp, _INP), INP);
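
  // SRND (rdx) is used below both as the byte offset into the K256_W table and into the
  // _XFER spill area: each vpaddd produces K[t..t+3] + W[t..t+3] for the two interleaved
  // blocks (32 bytes), the vector is stored at rsp + _XFER + SRND, and the scalar rounds
  // read their K+W word back through Address(rsp, rdx, Address::times_1, 4*iter).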
  //;; schedule 48 input dwords, by doing 3 rounds of 12 each
  xorq(SRND, SRND);

  align(16);
  bind(loop1);
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 0);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 1);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  2);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  3);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  8+0);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  8+1);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  8+2);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  8+3);

  vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 16+0);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 16+1);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  16+2);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  16+3);

  vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);

  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi, 24+0);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, 24+1);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, 24+2);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, 24+3);

  addq(SRND, 4*32);
  cmpq(SRND, 3 * 4*32);
  jcc(Assembler::below, loop1);

  bind(loop2);
  // Do last 16 rounds with no scheduling
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_four_rounds_compute_first(0);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_four_rounds_compute_last(0 + 8);

  addq(SRND, 2*32);

  vmovdqu(xmm4, xmm6);
  vmovdqu(xmm5, xmm7);

  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop2);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::above, done_hash);

  //Do second block using previously scheduled results
  xorq(SRND, SRND);
  align(16);
  bind(loop3);
  sha256_AVX2_four_rounds_compute_first(4);
  sha256_AVX2_four_rounds_compute_last(4+8);

  addq(SRND, 2*32);
  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop3);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));
  addq(INP, 64);

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::below, loop0);
  jccb(Assembler::above, done_hash);

  bind(do_last_block);
  lea(TBL, ExternalAddress(K256_W));

  movdqu(xmm4, Address(INP, 0*16));
  movdqu(xmm5, Address(INP, 1*16));
  movdqu(xmm6, Address(INP, 2*16));
  movdqu(xmm7, Address(INP, 3*16));

  vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
  vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
  vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
  vpshufb(xmm7, xmm7, xmm13, AVX_128bit);

  jmp(last_block_enter);

  bind(only_one_block);

  // load initial digest ;; table should be preloaded with following values
  movl(a, Address(CTX, 4*0));   // 0x6a09e667
  movl(b, Address(CTX, 4*1));   // 0xbb67ae85
  movl(c, Address(CTX, 4*2));   // 0x3c6ef372
  movl(d, Address(CTX, 4*3));   // 0xa54ff53a
  movl(e, Address(CTX, 4*4));   // 0x510e527f
  movl(f, Address(CTX, 4*5));   // 0x9b05688c
  movl(g, Address(CTX, 4*6));   // 0x1f83d9ab
  movl(h, Address(CTX, 4*7));   // 0x5be0cd19

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));  //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA,      ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00,      ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip]

  movq(Address(rsp, _CTX), CTX);
  jmpb(do_last_block);

  bind(done_hash);

  movq(rsp, Address(rsp, _RSP));

  pop(r15);
  pop(r14);
  pop(r13);
  pop(r12);
  pop(rbp);
#ifdef _WIN64
  pop(rdi);
  pop(rsi);
#endif
  pop(rbx);

#ifdef _WIN64
  pop(r9);
  pop(r8);
#else
  pop(rdx);
  pop(rcx);
#endif

  if (multi_block) {
#ifdef _WIN64
    const Register& limit_end = r9;
    const Register& ofs_end   = r8;
#else
    const Register& limit_end = rcx;
    const Register& ofs_end   = rdx;
#endif
    movq(rax, ofs_end);

    bind(compute_size1);
    cmpptr(rax, limit_end); // assume the original ofs <= limit
    jccb(Assembler::aboveEqual, compute_size_end1);
    addq(rax, 64);
    jmpb(compute_size1);

    bind(compute_size_end1);
  }
}
#endif //#ifdef _LP64