
src/cpu/x86/vm/macroAssembler_x86_sha.cpp

 476     // increment data pointer and loop if more to process
 477     addptr(buf, 64);
 478     addptr(ofs, 64);
 479     cmpptr(ofs, limit);
 480     jcc(Assembler::belowEqual, loop0);
 481     movptr(rax, ofs); //return ofs
 482   }
 483 
 484   pshufd(state0, state0, 0x1B);
 485   pshufd(state1, state1, 0xB1);
 486   movdqa(msgtmp4, state0);
 487   pblendw(state0, state1, 0xF0);
 488   palignr(state1, msgtmp4, 8);
 489 
 490   movdqu(Address(state, 0), state0);
 491   movdqu(Address(state, 16), state1);
 492 
 493   bind(done_hash);
 494 
 495 }
 496 
 497 #ifdef _LP64
 498 /*
 499   The algorithm below is based on the Intel publication:
 500   "Fast SHA-256 Implementations on Intel® Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
 501   The assembly code was originally provided by Sean Gulley and in many places preserves
 502   the original assembly names and comments to simplify matching this code with the original.
 503   The Java version was substantially redesigned to replace the 1200 assembly instructions with
 504   a much shorter run-time generator that emits the same code in memory.
 505 */
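
For reference, the helpers below emit an interleaved, register-renamed form of the standard scalar SHA-256 round from FIPS 180-4; the S0/S1/CH/MAJ tags in their comments correspond to the quantities in the sketch that follows. This is only an illustration (the helper name and types are not part of HotSpot), using the same CH/MAJ formulations as the generated code:

  #include <cstdint>

  static inline uint32_t rotr(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

  // One SHA-256 round: k is the round constant, w the scheduled message word.
  static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w) {
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
    uint32_t S1  = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25);  // y0 in the code below
    uint32_t ch  = ((f ^ g) & e) ^ g;                       // y2; equal to (e&f)^(~e&g)
    uint32_t S0  = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22);  // y1
    uint32_t maj = ((a | c) & b) | (a & c);                 // y3; equal to (a&b)^(a&c)^(b&c)
    uint32_t t1  = h + S1 + ch + k + w;
    // The generated code avoids these moves by rotating the register roles each round.
    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
  }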
 506 
 507 void MacroAssembler::sha256_AVX2_one_round_compute(
 508     Register  reg_old_h,
 509     Register  reg_a,
 510     Register  reg_b,
 511     Register  reg_c,
 512     Register  reg_d,
 513     Register  reg_e,
 514     Register  reg_f,
 515     Register  reg_g,
 516     Register  reg_h,
 517     int iter) {
 518   const Register& reg_y0     = r13;
 519   const Register& reg_y1     = r14;
 520   const Register& reg_y2     = r15;
 521   const Register& reg_y3     = rcx;
 522   const Register& reg_T1     = r12;
 523   //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
 524   if (iter%4 > 0) {
 525     addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
 526   }
 527   movl(reg_y2, reg_f);         // reg_y2 = reg_f                                ; CH
 528   rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25   ; S1A
 529   rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11    ; S1B
 530   xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g                              ; CH
 531 
 532   xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11)  ; S1
 533   rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6)    ; S1
 534   andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e                          ; CH
 535 
 536   if (iter%4 > 0) {
 537     addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ                     ; --
 538   }
 539 
 540   xorl(reg_y0, reg_y1);       // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
 541   rorxd(reg_T1, reg_a, 13);   // reg_T1 = reg_a >> 13    ; S0B
 542   xorl(reg_y2, reg_g);        // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
 543   rorxd(reg_y1, reg_a, 22);   // reg_y1 = reg_a >> 22    ; S0A
 544   movl(reg_y3, reg_a);        // reg_y3 = reg_a                                ; MAJA
 545 
 546   xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13)  ; S0
 547   rorxd(reg_T1, reg_a, 2);    // reg_T1 = (reg_a >> 2)    ; S0
 548   addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
 549   orl(reg_y3, reg_c);         // reg_y3 = reg_a|reg_c                              ; MAJA
 550 
 551   xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
 552   movl(reg_T1, reg_a);        // reg_T1 = reg_a                                ; MAJB
 553   andl(reg_y3, reg_b);        // reg_y3 = (reg_a|reg_c)&reg_b                          ; MAJA
 554   andl(reg_T1, reg_c);        // reg_T1 = reg_a&reg_c                              ; MAJB
 555   addl(reg_y2, reg_y0);       // reg_y2 = S1 + CH                          ; --
 556 
 557 
 558   addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
 559   orl(reg_y3, reg_T1);        // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)            ; MAJ
 560   addl(reg_h, reg_y1);        // reg_h = k + w + reg_h + S0                    ; --
 561 
 562   addl(reg_d, reg_y2);        // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --
 563 
 564 
 565   if (iter%4 == 3) {
 566     addl(reg_h, reg_y2);      // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
 567     addl(reg_h, reg_y3);      // reg_h = t1 + S0 + MAJ                     ; --
 568   }
 569 }
 570 
 571 void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
 572     sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi,  r8,  r9, r10, r11, start + 0);
 573     sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi,  r8,  r9, r10, start + 1);
 574     sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi,  r8,  r9, start + 2);
 575     sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi,  r8, start + 3);
 576 }
 577 
 578 void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
 579     sha256_AVX2_one_round_compute(r8,  r8,   r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
 580     sha256_AVX2_one_round_compute(rsi, rsi,  r8,  r9, r10, r11, rax, rbx, rdi, start + 1);
 581     sha256_AVX2_one_round_compute(rdi, rdi, rsi,  r8,  r9, r10, r11, rax, rbx, start + 2);
 582     sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi,  r8,  r9, r10, r11, rax, start + 3);
 583 }
 584 
 585 void MacroAssembler::sha256_AVX2_one_round_and_sched(
 586         XMMRegister  xmm_0,     /* == ymm4 on iterations 0-3, then the four registers rotate left on iterations 4, 8, 12 */
 587         XMMRegister  xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
 588         XMMRegister  xmm_2,     /* ymm6 */
 589         XMMRegister  xmm_3,     /* ymm7 */
 590         Register  reg_a,        /* == rax on iteration 0, then the eight registers rotate right by one on each subsequent iteration */
 591         Register  reg_b,        /* rbx */    /* full cycle is 8 iterations */
 592         Register  reg_c,        /* rdi */
 593         Register  reg_d,        /* rsi */
 594         Register  reg_e,        /* r8 */
 595         Register  reg_f,        /* r9d */
 596         Register  reg_g,        /* r10d */
 597         Register  reg_h,        /* r11d */
 598         int iter)
 599 {
 600   movl(rcx, reg_a);           // rcx = reg_a               ; MAJA
 601   rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25    ; S1A
 602   rorxd(r14, reg_e, 11);      //  r14 = reg_e >> 11    ; S1B
 603   addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
 604   orl(rcx, reg_c);            // rcx = reg_a|reg_c          ; MAJA
 605 
 606   movl(r15, reg_f);           // r15 = reg_f               ; CH
 607   rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13      ; S0B
 608   xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11)  ; S1
 609   xorl(r15, reg_g);           // r15 = reg_f^reg_g         ; CH
 610 
 611   rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6)    ; S1
 612   andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e ; CH
 613 
 614   xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
 615   rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22    ; S0A
 616   addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
 617 
 618   andl(rcx, reg_b);          // rcx = (reg_a|reg_c)&reg_b                          ; MAJA
 619   xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13)  ; S0
 620 
 621   rorxd(r12, reg_a, 2);      // r12 = (reg_a >> 2)    ; S0
 622   xorl(r15, reg_g);          // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
 623 
 624   xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
 625   movl(r12, reg_a);          // r12 = reg_a                                ; MAJB
 626   andl(r12, reg_c);          // r12 = reg_a&reg_c                              ; MAJB
 627   addl(r15, r13);            // r15 = S1 + CH                          ; --
 628 
 629   orl(rcx, r12);             // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)            ; MAJ
 630   addl(reg_h, r14);          // reg_h = k + w + reg_h + S0                    ; --
 631   addl(reg_d, r15);          // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --
 632 
 633   addl(reg_h, r15);          // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
 634   addl(reg_h, rcx);          // reg_h = t1 + S0 + MAJ                     ; --
 635 
 636   if (iter%4 == 0) {
 637     vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
 638     vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6)     ; S1
 639     vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
 640     vpsrld(xmm2, xmm1, 7, AVX_256bit);
 641     vpslld(xmm3, xmm1, 32-7, AVX_256bit);
 642     vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
 643     vpsrld(xmm2, xmm1,18, AVX_256bit);
 644   } else if (iter%4 == 1 ) {
 645     vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
 646     vpslld(xmm1, xmm1, 32-18, AVX_256bit);
 647     vpxor(xmm3, xmm3, xmm1, AVX_256bit);
 648     vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
 649     vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
 650     vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
 651     vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
 652     vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
 653   } else if (iter%4 == 2) {
 654     vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
 655     vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
 656     vpxor(xmm2, xmm2, xmm3, AVX_256bit);
 657     vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
 658     vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
 659     vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
 660     vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
 661   } else if (iter%4 == 3) {
 662     vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
 663     vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
 664     vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
 665     vpxor(xmm2, xmm2, xmm3, AVX_256bit);
 666     vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
 667     vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
 668     vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
 669   }
 670 }
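
The four iter%4 arms above interleave the SHA-256 message schedule with the rounds, extending the schedule eight dwords at a time (the shift counts 7/18/3 and 17/19/10 used above are the sigma0/sigma1 constants). A scalar sketch of the recurrence, with illustrative names only:

  #include <cstdint>

  static inline uint32_t rotr(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

  // W[t] for t >= 16 (FIPS 180-4): sigma0 = ror7 ^ ror18 ^ shr3, sigma1 = ror17 ^ ror19 ^ shr10.
  static uint32_t sha256_schedule(const uint32_t w[], int t) {
    uint32_t s0 = rotr(w[t - 15], 7)  ^ rotr(w[t - 15], 18) ^ (w[t - 15] >> 3);
    uint32_t s1 = rotr(w[t - 2], 17)  ^ rotr(w[t - 2], 19)  ^ (w[t - 2] >> 10);
    return w[t - 16] + s0 + w[t - 7] + s1;
  }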
 671 
 672 void MacroAssembler::addm(int disp, Register r1, Register r2) {
 673   addl(r2, Address(r1, disp));
 674   movl(Address(r1, disp), r2);
 675 }
 676 
 677 void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
 678   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
 679   Register buf, Register state, Register ofs, Register limit, Register rsp,
 680   bool multi_block, XMMRegister shuf_mask) {
 681 
 682   Label loop0, loop1, loop2, loop3,
 683         last_block_enter, do_last_block, only_one_block, done_hash,
 684         compute_size, compute_size_end,
 685         compute_size1, compute_size_end1;
 686 
 687   address K256_W = StubRoutines::x86::k256_W_addr();
 688   address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
 689   address pshuffle_byte_flip_mask_addr = 0;
 690 
 691 const XMMRegister& SHUF_00BA        = xmm10;    // ymm10: shuffle xBxA -> 00BA
 692 const XMMRegister& SHUF_DC00        = xmm12;    // ymm12: shuffle xDxC -> DC00
 693 const XMMRegister& BYTE_FLIP_MASK    = xmm13;   // ymm13
 694 
 695 const XMMRegister& X_BYTE_FLIP_MASK  = xmm13;   //XMM version of BYTE_FLIP_MASK
 696 
 697 const Register& NUM_BLKS = r8;   // 3rd arg
 698 const Register& CTX      = rdx;  // 2nd arg
 699 const Register& INP      = rcx;  // 1st arg
 700 
 701 const Register& c       = rdi;
 702 const Register& d       = rsi;
 703 const Register& e       = r8;    // clobbers NUM_BLKS
 704 const Register& y3       = rcx;  // clobbers INP
 705 
 706 const Register& TBL      = rbp;
 707 const Register& SRND    = CTX;   // SRND is same register as CTX
 708 
 709 const Register& a        = rax;
 710 const Register& b        = rbx;
 711 const Register& f        = r9;
 712 const Register& g        = r10;
 713 const Register& h        = r11;
 714 
 715 const Register& T1       = r12;
 716 const Register& y0       = r13;
 717 const Register& y1       = r14;
 718 const Register& y2       = r15;
 719 
 720 
 721 enum {
 722   _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
 723 #ifndef _WIN64
 724   _XMM_SAVE_SIZE = 0,
 725 #else
 726   _XMM_SAVE_SIZE = 8*16,
 727 #endif
 728   _INP_END_SIZE = 8,
 729   _INP_SIZE = 8,
 730   _CTX_SIZE = 8,
 731   _RSP_SIZE = 8,
 732 
 733   _XFER = 0,
 734   _XMM_SAVE  = _XFER     + _XFER_SIZE,
 735   _INP_END  = _XMM_SAVE + _XMM_SAVE_SIZE,
 736   _INP     = _INP_END  + _INP_END_SIZE,
 737   _CTX     = _INP      + _INP_SIZE,
 738   _RSP     = _CTX      + _CTX_SIZE,
 739   STACK_SIZE = _RSP      + _RSP_SIZE,
 740 };
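
For reference, the sizes above give the following offsets relative to the 32-byte-aligned rsp set up below:

  linux/x86_64 (_XMM_SAVE_SIZE = 0):   _XFER = 0, _XMM_SAVE = 512, _INP_END = 512, _INP = 520, _CTX = 528, _RSP = 536, STACK_SIZE = 544
  win64        (_XMM_SAVE_SIZE = 128): _XFER = 0, _XMM_SAVE = 512, _INP_END = 640, _INP = 648, _CTX = 656, _RSP = 664, STACK_SIZE = 672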
 741 
 742 #ifndef _WIN64
 743   push(rcx);    // linux: this is limit, needed again at the end
 744   push(rdx);    // linux: this is ofs
 745 #else
 746   push(r8);     // win64: this is ofs
 747   push(r9);     // win64: this is limit, we need them again at the very end
 748 #endif
 749 
 750 
 751   push(rbx);
 752 #ifdef _WIN64
 753   push(rsi);
 754   push(rdi);
 755 #endif
 756   push(rbp);
 757   push(r12);
 758   push(r13);
 759   push(r14);
 760   push(r15);
 761 
 762   movq(rax, rsp);
 763   subq(rsp, STACK_SIZE);
 764   andq(rsp, -32);
 765   movq(Address(rsp, _RSP), rax);
 766 
 767 #ifndef _WIN64
 768   // copy linux params to win64 params, so the rest of the code is the same for both
 769   movq(r9,  rcx);
 770   movq(r8,  rdx);
 771   movq(rdx, rsi);
 772   movq(rcx, rdi);
 773 #endif
 774 
 775   // setting original assembly ABI
 776   /** message to hash in INP */
 777   lea(INP, Address(rcx, 0));    // rcx == message (buf)     ;; linux: INP = buf = rdi
 778   /** digest in CTX             */
 779   movq(CTX, rdx);               // rdx = digest  (state)    ;; linux: CTX = state = rsi
 780 
 781   /** NUM_BLKS is the number of message bytes to process; it is computed from ofs and limit */
 782   if (multi_block) {
 783 
 784     // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
 785     // on entry r8 = ofs
 786     // on exit  r8 = NUM_BLKS
 787 
 788     xorq(rax, rax);
 789 
 790     bind(compute_size);
 791     cmpptr(r8, r9); // assume the original ofs <= limit ;; linux:  cmp rcx, rdx
 792     jccb(Assembler::aboveEqual, compute_size_end);
 793     addq(r8, 64);                                          //;; linux: ofs = rdx
 794     addq(rax, 64);
 795     jmpb(compute_size);
 796 
 797     bind(compute_size_end);
 798     movq(NUM_BLKS, rax);  // NUM_BLK (r8)                  ;; linux: NUM_BLK = rdx
 799 
 800     cmpq(NUM_BLKS, 0);
 801     jcc(Assembler::equal, done_hash);
 802 
 803   } else {
 804     xorq(NUM_BLKS, NUM_BLKS);
 805     addq(NUM_BLKS, 64);
 806   }//if (!multi_block)
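
In scalar terms, the multi_block branch above sizes the work by stepping ofs toward limit in 64-byte increments, so NUM_BLKS ends up as the number of bytes still to hash (zero means nothing to do); the single-block branch simply sets it to 64. A minimal sketch, assuming the original ofs <= limit (helper name is illustrative):

  #include <cstdint>

  // Equivalent of the compute_size loop above.
  static uint64_t num_blks_bytes(uint64_t ofs, uint64_t limit) {
    uint64_t n = 0;
    for (uint64_t o = ofs; o < limit; o += 64) {
      n += 64;                 // one 64-byte block per step
    }
    return n;                  // 0 -> jump to done_hash
  }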
 807 
 808   lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
 809   movq(Address(rsp, _INP_END), NUM_BLKS);  //
 810 
 811   cmpptr(INP, NUM_BLKS);                   //cmp INP, NUM_BLKS
 812   jcc(Assembler::equal, only_one_block);   //je only_one_block
 813 
 814   // load initial digest
 815   movl(a, Address(CTX, 4*0));
 816   movl(b, Address(CTX, 4*1));
 817   movl(c, Address(CTX, 4*2));
 818   movl(d, Address(CTX, 4*3));
 819   movl(e, Address(CTX, 4*4));
 820   movl(f, Address(CTX, 4*5));
 821   movl(g, Address(CTX, 4*6));
 822   movl(h, Address(CTX, 4*7));
 823 
 824   pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
 825   vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
 826   vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
 827   vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
 828 
 829   movq(Address(rsp, _CTX), CTX);           // store
 830 
 831 bind(loop0);
 832   lea(TBL, ExternalAddress(K256_W));
 833 
 834   // assume buffers not aligned
 835 
 836   // Load first 16 dwords from two blocks
 837   vmovdqu(xmm0, Address(INP, 0*32));
 838   vmovdqu(xmm1, Address(INP, 1*32));
 839   vmovdqu(xmm2, Address(INP, 2*32));
 840   vmovdqu(xmm3, Address(INP, 3*32));
 841 
 842   // byte swap data
 843   vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
 844   vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
 845   vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
 846   vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);
 847 
 848   // transpose data into high/low halves
 849   vperm2i128(xmm4, xmm0, xmm2, 0x20);
 850   vperm2i128(xmm5, xmm0, xmm2, 0x31);
 851   vperm2i128(xmm6, xmm1, xmm3, 0x20);
 852   vperm2i128(xmm7, xmm1, xmm3, 0x31);
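
  // (With these selectors, 0x20 picks the low 128-bit halves and 0x31 the high halves, so each ymm
  //  now holds the matching dword group from both 64-byte blocks: ymm4 = words 0-3, ymm5 = words 4-7,
  //  ymm6 = words 8-11, ymm7 = words 12-15; first block in the low lane, second block in the high
  //  lane. This is what lets the round-and-schedule loop below work on two blocks at once.)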
 853 
 854 bind(last_block_enter);
 855   addq(INP, 64);
 856   movq(Address(rsp, _INP), INP);
 857 
 858   //;; schedule 48 input dwords, by doing 3 rounds of 12 each
 859   xorq(SRND, SRND);
 860 
 861 align(16);
 862 bind(loop1);
 863   vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
 864   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
 865   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 0);
 866   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 1);
 867   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  2);
 868   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  3);
 869 
 870   vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
 871   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
 872   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  8+0);
 873   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  8+1);
 874   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  8+2);
 875   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  8+3);
 876 
 877   vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
 878   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
 879   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 16+0);
 880   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 16+1);
 881   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  16+2);
 882   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  16+3);
 883 
 884   vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
 885   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);
 886 
 887   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  24+0);
 888   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  24+1);
 889   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  24+2);
 890   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  24+3);
 891 
 892   addq(SRND, 4*32);
 893   cmpq(SRND, 3 * 4*32);
 894   jcc(Assembler::below, loop1);
 895 
 896 bind(loop2);
 897   // Do last 16 rounds with no scheduling
 898   vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
 899   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
 900   sha256_AVX2_four_rounds_compute_first(0);
 901 
 902   vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
 903   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
 904   sha256_AVX2_four_rounds_compute_last(0 + 8);
 905 
 906   addq(SRND, 2*32);
 907 
 908   vmovdqu(xmm4, xmm6);
 909   vmovdqu(xmm5, xmm7);
 910 
 911   cmpq(SRND, 4 * 4*32);
 912   jcc(Assembler::below, loop2);
 913 
 914   movq(CTX, Address(rsp, _CTX));
 915   movq(INP, Address(rsp, _INP));
 916 
 917   addm(4*0, CTX, a);
 918   addm(4*1, CTX, b);
 919   addm(4*2, CTX, c);
 920   addm(4*3, CTX, d);
 921   addm(4*4, CTX, e);
 922   addm(4*5, CTX, f);
 923   addm(4*6, CTX, g);
 924   addm(4*7, CTX, h);
 925 
 926   cmpq(INP, Address(rsp, _INP_END));
 927   jcc(Assembler::above, done_hash);
 928 
 929   //Do second block using previously scheduled results
 930   xorq(SRND, SRND);
 931 align(16);
 932 bind(loop3);
 933   sha256_AVX2_four_rounds_compute_first(4);
 934   sha256_AVX2_four_rounds_compute_last(4+8);
 935 
 936   addq(SRND, 2*32);
 937   cmpq(SRND, 4 * 4*32);
 938   jcc(Assembler::below, loop3);
 939 
 940   movq(CTX, Address(rsp, _CTX));
 941   movq(INP, Address(rsp, _INP));
 942   addq(INP, 64);
 943 
 944   addm(4*0, CTX, a);
 945   addm(4*1, CTX, b);
 946   addm(4*2, CTX, c);
 947   addm(4*3, CTX, d);
 948   addm(4*4, CTX, e);
 949   addm(4*5, CTX, f);
 950   addm(4*6, CTX, g);
 951   addm(4*7, CTX, h);
 952 
 953   cmpq(INP, Address(rsp, _INP_END));
 954   jcc(Assembler::below, loop0);
 955   jccb(Assembler::above, done_hash);
 956 
 957 bind(do_last_block);
 958   lea(TBL, ExternalAddress(K256_W));
 959 
 960   movdqu(xmm4, Address(INP, 0*16));
 961   movdqu(xmm5, Address(INP, 1*16));
 962   movdqu(xmm6, Address(INP, 2*16));
 963   movdqu(xmm7, Address(INP, 3*16));
 964 
 965   vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
 966   vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
 967   vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
 968   vpshufb(xmm7, xmm7, xmm13, AVX_128bit);
 969 
 970   jmp(last_block_enter);
 971 
 972 bind(only_one_block);
 973 
 974   // load initial digest ;; state should be preloaded with the following values
 975   movl(a, Address(CTX, 4*0));   // 0x6a09e667
 976   movl(b, Address(CTX, 4*1));   // 0xbb67ae85
 977   movl(c, Address(CTX, 4*2));   // 0x3c6ef372
 978   movl(d, Address(CTX, 4*3));   // 0xa54ff53a
 979   movl(e, Address(CTX, 4*4));   // 0x510e527f
 980   movl(f, Address(CTX, 4*5));   // 0x9b05688c
 981   movl(g, Address(CTX, 4*6));   // 0x1f83d9ab
 982   movl(h, Address(CTX, 4*7));   // 0x5be0cd19
 983 
 984 
 985   pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
 986   vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
 987   vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
 988   vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
 989 
 990   movq(Address(rsp, _CTX), CTX);
 991   jmpb(do_last_block);
 992 
 993 bind(done_hash);
 994 
 995   movq(rsp, Address(rsp, _RSP));
 996 
 997   pop(r15);
 998   pop(r14);
 999   pop(r13);
1000   pop(r12);
1001   pop(rbp);
1002 #ifdef _WIN64
1003   pop(rdi);
1004   pop(rsi);
1005 #endif
1006   pop(rbx);
1007 
1008 #ifdef _WIN64
1009   pop(r9);
1010   pop(r8);
1011 #else
1012   pop(rdx);
1013   pop(rcx);
1014 #endif
1015 
1016   if (multi_block) {
1017 #ifdef _WIN64
1018 const Register& limit_end = r9;
1019 const Register& ofs_end   = r8;
1020 #else
1021 const Register& limit_end = rcx;
1022 const Register& ofs_end   = rdx;
1023 #endif
1024     movq(rax, ofs_end);
1025 
1026 bind(compute_size1);
1027     cmpptr(rax, limit_end); // assume the original ofs <= limit
1028     jccb(Assembler::aboveEqual, compute_size_end1);
1029     addq(rax, 64);
1030     jmpb(compute_size1);
1031 
1032 bind(compute_size_end1);
1033   }
1034 }
1035 #endif //#ifdef _LP64