// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.

// Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
// (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).

#include "asm/macroAssembler.inline.hpp"
#include "runtime/stubRoutines.hpp"

/**********************************************************************
 * SHA 256
 *********************************************************************/

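// Produces the three non-trivial byte rotations of src (by 12, 8 and 4 bytes)
// in dst1..dst3. Together with src itself, each of the four packed 32-bit
// words becomes available in a separate register, e.g. to unpack the loaded
// hash state or a (k + w) quadruple for the rounds below.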
void MacroAssembler::sha256_deque(const VectorRegister src,
                                  const VectorRegister dst1,
                                  const VectorRegister dst2,
                                  const VectorRegister dst3) {
  vsldoi (dst1, src, src, 12);
  vsldoi (dst2, src, src, 8);
  vsldoi (dst3, src, src, 4);
}

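// One SHA-256 round (FIPS 180-4):
//   T1 = h + Sigma1(e) + Ch(e,f,g) + (K[t] + W[t])   (kpw already holds K + W)
//   T2 = Sigma0(a) + Maj(a,b,c)
//   d := d + T1;  h := T1 + T2
// Instead of moving values between registers, the a..h roles rotate through
// hs[] as h_cnt advances, so no vector moves are needed per round.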
void MacroAssembler::sha256_round(const VectorRegister* hs,
                                  const int total_hs,
                                  int& h_cnt,
                                  const VectorRegister kpw) {
  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  VectorRegister ch  = VR0;
  VectorRegister maj = VR1;
  VectorRegister bsa = VR2;
  VectorRegister bse = VR3;
  VectorRegister vt0 = VR4;
  VectorRegister vt1 = VR5;
  VectorRegister vt2 = VR6;
  VectorRegister vt3 = VR7;

  vsel       (ch,  g,   f, e);
  vxor       (maj, a,   b);
  vshasigmaw (bse, e,   1, 0xf);
  vadduwm    (vt2, ch,  kpw);
  vadduwm    (vt1, h,   bse);
  vsel       (maj, b,   c, maj);
  vadduwm    (vt3, vt1, vt2);
  vshasigmaw (bsa, a,   1, 0);
  vadduwm    (vt0, bsa, maj);

  vadduwm    (d,   d,   vt3);
  vadduwm    (h,   vt3, vt0);

  // advance vector pointer to the next iteration
  h_cnt++;
}

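// Loads the eight 32-bit hash words from hptr into two vectors: the first
// four into 'a' and the last four into 'e'. If hptr is not 16-byte aligned,
// three loads plus lvsr/vperm are used to realign the data.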
void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
                                       const VectorRegister e,
                                       const Register hptr) {
  // temporaries
  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vRb = VR6;
  // labels
  Label sha256_aligned, sha256_load_end;

  andi_  (tmp,  hptr, 0xf);
  addi   (tmp,  hptr, 16);
  beq    (CCR0, sha256_aligned);

  // handle unaligned accesses
  lvx    (a,    hptr);
  lvsr   (vRb,  hptr);

  lvx    (e,    tmp);
  addi   (tmp,  tmp,  16);
  vec_perm(a,   e,    vRb);

  lvx    (vt0,  tmp);
  vec_perm(e,   vt0,  vRb);
  b      (sha256_load_end);

  // aligned accesses
  bind(sha256_aligned);
  lvx    (a,    hptr);
  addi   (tmp,  hptr, 16);
  lvx    (e,    tmp);

  bind(sha256_load_end);
}

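// Loads the 16 message words of the current block from buf_in into ws[]
// (handling unaligned buffers with lvsl/vperm), byte-swaps each 32-bit word
// on little-endian, loads the first 16 round constants from k, and leaves
// K[i] + W[i] in kpws[]. buf_in is advanced by the 64-byte block size.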
void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
                                              const VectorRegister* ws,
                                              const int total_ws,
                                              const Register k,
                                              const VectorRegister* kpws,
                                              const int total_kpws) {
  Label w_aligned, after_w_load;

  Register tmp       = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vt1 = VR1;
  VectorRegister vRb = VR6;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx    (ws[0], buf_in);
  addi   (buf_in, buf_in, 16);
  lvsl   (vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];

    lvx  (w_cur, buf_in);
    addi (buf_in, buf_in, 16);
    vec_perm(w_prev, w_cur, vRb);
  }

  lvx    (vt0, buf_in);
  vec_perm(ws[total_ws-1], vt0, vRb);

  b      (after_w_load);

  bind(w_aligned);

  // deal with aligned addresses
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (w, buf_in);
    addi (buf_in, buf_in, 16);
  }

  bind(after_w_load);

#if defined(VM_LITTLE_ENDIAN)
  // Byte swapping within int values
  li       (tmp, 8);
  lvsl     (vt0, tmp);
  vspltisb (vt1, 0xb);
  vxor     (vt1, vt0, vt1);
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];
    vec_perm(w, w, vt1);
  }
#endif

  // Loading k, which is always aligned to 16 bytes
  lvx    (kpws[0], k);
  addi   (tmp, k, 16);
  for (int n = 1; n < total_kpws-1; n++) {
    VectorRegister kpw = kpws[n];

    lvx  (kpw, tmp);
    addi (tmp, tmp, 16);
  }
  lvx  (kpws[total_kpws-1], tmp);

  // Add w to K
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    VectorRegister w   = ws[n];

    vadduwm  (kpw, kpw, w);
  }
}

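// Computes the next four message-schedule words at once (FIPS 180-4):
//   W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]
// w0..w3 always hold the previous 16 words and are shifted by one register
// at the end. The four new words are also added to the round constants
// loaded from k[j], leaving K + W for the next four rounds in kpw0..kpw3.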
void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const VectorRegister kpw2,
                                    const VectorRegister kpw3,
                                    const Register j,
                                    const Register k) {
  // Temporaries
  const VectorRegister  vt0  = VR0;
  const VectorRegister  vt1  = VR1;
  const VectorSRegister vsrt1 = vt1->to_vsr();
  const VectorRegister  vt2  = VR2;
  const VectorRegister  vt3  = VR3;
  const VectorSRegister vst3 = vt3->to_vsr();
  const VectorRegister  vt4  = VR4;

  // load to k[j]
  lvx        (vt0, j,   k);

  // advance j
  addi       (j,   j,   16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi     (vt1, w1,  w0, 12);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi     (vt2, w3,  w2, 12);

#else
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi     (vt1, w0,  w1, 4);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi     (vt2, w2,  w3, 4);
#endif

  // d = w[j-2], w[j-1], w[j-4], w[j-3]
  vsldoi     (vt3, w3,  w3, 8);

  // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
  vshasigmaw (vt1, vt1, 0,  0);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
  vshasigmaw (vt3, vt3, 0,  0xf);

  // c = s0(w[j-15]) + w[j-7],
  //     s0(w[j-14]) + w[j-6],
  //     s0(w[j-13]) + w[j-5],
  //     s0(w[j-12]) + w[j-4]
  vadduwm    (vt2, vt1, vt2);

  // c = s0(w[j-15]) + w[j-7] + w[j-16],
  //     s0(w[j-14]) + w[j-6] + w[j-15],
  //     s0(w[j-13]) + w[j-5] + w[j-14],
  //     s0(w[j-12]) + w[j-4] + w[j-13]
  vadduwm    (vt2, vt2, w0);

  // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
  vadduwm    (vt4, vt2, vt3);

  // At this point, e[0] and e[1] are the correct values to be stored at w[j]
  // and w[j+1].
  // e[2] and e[3] are not considered.
  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
  vshasigmaw (vt1, vt4, 0,  0xf);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
#if defined(VM_LITTLE_ENDIAN)
  xxmrgld    (vst3, vsrt1, vst3);
#else
  xxmrghd    (vst3, vst3, vsrt1);
#endif

  // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
  vadduwm    (vt2, vt2, vt3);

  // Updating w0 to w3 to hold the new previous 16 values from w.
  vmr        (w0,  w1);
  vmr        (w1,  w2);
  vmr        (w2,  w3);
  vmr        (w3,  vt2);

  // store k + w to kpw0..kpw3 (4 values at once)
#if defined(VM_LITTLE_ENDIAN)
  vadduwm    (kpw0, vt2, vt0);

  vsldoi     (kpw1, kpw0, kpw0, 12);
  vsldoi     (kpw2, kpw0, kpw0, 8);
  vsldoi     (kpw3, kpw0, kpw0, 4);
#else
  vadduwm    (kpw3, vt2, vt0);

  vsldoi     (kpw2, kpw3, kpw3, 12);
  vsldoi     (kpw1, kpw3, kpw3, 8);
  vsldoi     (kpw0, kpw3, kpw3, 4);
#endif
}

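// Adds the final working variables a..h to the previous hash value, which is
// reloaded from hptr (with a separate unaligned path), and stores the eight
// updated words back; the stores work for any alignment.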
void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
                                             const VectorRegister b_,
                                             const VectorRegister c,
                                             const VectorRegister d,
                                             const VectorRegister e,
                                             const VectorRegister f,
                                             const VectorRegister g,
                                             const VectorRegister h,
                                             const Register hptr) {
  // temporaries
  VectorRegister vt0  = VR0;
  VectorRegister vt1  = VR1;
  VectorRegister vt2  = VR2;
  VectorRegister vt3  = VR3;
  VectorRegister vt4  = VR4;
  VectorRegister vt5  = VR5;
  VectorRegister vaux = VR6;
  VectorRegister vRb  = VR6;
  Register tmp        = R8;
  Register of16       = R8;
  Register of32       = R9;
  Label state_load_aligned, after_state_load_aligned;

  // Load hptr
  andi_   (tmp, hptr, 0xf);
  li      (of16, 16);
  beq     (CCR0, state_load_aligned);

  // handle unaligned accesses
  li      (of32, 32);
  lvx     (vt0, hptr);
  lvsr    (vRb, hptr);

  lvx     (vt5, hptr, of16);
  vec_perm(vt0, vt5,  vRb);        // vt0 = hptr[0]..hptr[3]

  lvx     (vt1, hptr, of32);
  vec_perm(vt5, vt1,  vRb);        // vt5 = hptr[4]..hptr[7]
  b       (after_state_load_aligned);

  // aligned accesses
  bind(state_load_aligned);
  lvx     (vt0, hptr);
  lvx     (vt5, of16, hptr);

  bind(after_state_load_aligned);

#if defined(VM_LITTLE_ENDIAN)
  vmrglw  (vt1, b_, a);            // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, d, c);             // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, f, e);             // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, h, g);             // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (a,   vt0, vt1);         // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (e,   vt5, vt3);         // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  xxswapd (vt0->to_vsr(), a->to_vsr());
  stxvd2x (vt0->to_vsr(), hptr);
  xxswapd (vt5->to_vsr(), e->to_vsr());
  stxvd2x (vt5->to_vsr(), of16, hptr);
#else
  vmrglw  (vt1, a, b_);            // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, c, d);             // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, e, f);             // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, g, h);             // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (d,   vt0, vt1);         // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (h,   vt5, vt3);         // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  stxvd2x (d->to_vsr(), hptr);
  stxvd2x (h->to_vsr(), of16, hptr);
#endif
}


//   R3_ARG1   - byte[]  input buffer, already padded, in big-endian byte order
//   R4_ARG2   - int[]   SHA.state (initially, the fractional parts of the
//                       square roots of the first eight primes)
//   R5_ARG3   - int     offset
//   R6_ARG4   - int     limit
//
//   Internal Register usage:
//   R7        - k
//   R8        - tmp | j | of16
//   R9        - of32
//   VR0-VR8   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
//   VR9-VR16  - a-h
//   VR17-VR20 - w0-w3
//   VR21-VR23 - vRb | vaux0-vaux2
//   VR24-VR27 - kpw0-kpw3
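//
//   Overall flow of the generated stub: save the non-volatile VRs used below
//   into the red zone, load the hash state and the round-constant table, then
//   per 64-byte block: load W and K+W for the first 16 rounds, run those 16
//   rounds, run the remaining 48 rounds in core_loop (sha256_calc_4w produces
//   K+W for 4 rounds per call), and add the result into the state. With
//   multi_block the loop repeats until ofs reaches limit and the new offset
//   is returned in R3_ARG1.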
void MacroAssembler::sha256(bool multi_block) {
  static const ssize_t base_size = sizeof(uint32_t);
  static const ssize_t buf_size = 64;
  static uint32_t waux[buf_size / base_size] __attribute((aligned (16)));
  static const uint32_t round_consts[64] __attribute((aligned (16))) = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  };
  static const uint8_t w_size = sizeof(round_consts)/sizeof(uint32_t);

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, bsw_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - (nv_size)) * 16);
    stvx(nv[c], tmp, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR9;
  VectorRegister b = VR10;
  VectorRegister c = VR11;
  VectorRegister d = VR12;
  VectorRegister e = VR13;
  VectorRegister f = VR14;
  VectorRegister g = VR15;
  VectorRegister h = VR16;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
#if defined(VM_LITTLE_ENDIAN)
  sha256_load_h_vec(a, e, state);
#else
  sha256_load_h_vec(d, h, state);
#endif

  // keep k loaded also during MultiBlock loops
  Register k = R7;
  load_const_optimized(k, const_cast<uint32_t *>(round_consts), R0);

  // Avoiding redundant loads
  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);
#if defined(VM_LITTLE_ENDIAN)
  sha256_deque(a, b, c, d);
  sha256_deque(e, f, g, h);
#else
  sha256_deque(d, c, b, a);
  sha256_deque(h, g, f, e);
#endif

  // Load 16 elements from w outside the loop.
  // Order of the int values is endianness-specific.
  VectorRegister w0 = VR17;
  VectorRegister w1 = VR18;
  VectorRegister w2 = VR19;
  VectorRegister w3 = VR20;
  static const VectorRegister ws[] = {w0, w1, w2, w3};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  VectorRegister kpw0 = VR24;
  VectorRegister kpw1 = VR25;
  VectorRegister kpw2 = VR26;
  VectorRegister kpw3 = VR27;
  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);

  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);

  // Cycle through the first 16 elements
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_ws; n++) {
    VectorRegister vaux0 = VR21;
    VectorRegister vaux1 = VR22;
    VectorRegister vaux2 = VR23;

    sha256_deque(kpws[n], vaux0, vaux1, vaux2);

#if defined(VM_LITTLE_ENDIAN)
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux2);
#else
    sha256_round(hs, total_hs, h_cnt, vaux2);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
#endif
  }

  Register tmp = R8;
  // loop from the 16th to the 64th round, 8 rounds per iteration
  li   (tmp, (w_size - 16) / total_hs);
  mtctr(tmp);

  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g., when j is used in a function)
  Register j = R8;
  li   (j, 16*4);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/4; n++) {
    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
    sha256_round(hs, total_hs, h_cnt, kpw0);
    sha256_round(hs, total_hs, h_cnt, kpw1);
    sha256_round(hs, total_hs, h_cnt, kpw2);
    sha256_round(hs, total_hs, h_cnt, kpw3);
  }

  bdnz   (core_loop);

  // Update hash state
  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);

  if (multi_block) {
    // process next 512 bit block (buf_in already updated)
    addi(ofs, ofs, buf_size);
    cmpd(CCR0, ofs, limit);
    blt(CCR0, sha_loop);

    // return ofs
    mr(R3_ARG1, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - (nv_size)) * 16);
    lvx(nv[c], tmp, R1);
  }
}


/**********************************************************************
 * SHA 512
 *********************************************************************/

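// Loads the 16 64-bit message words of the current block from buf_in into the
// eight vectors in ws[], using lvsl/vperm to realign the data when buf_in is
// not 16-byte aligned. buf_in is advanced by the 128-byte block size.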
void MacroAssembler::sha512_load_w_vec(const Register buf_in,
                                       const VectorRegister* ws,
                                       const int total_ws) {
  Register tmp       = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;
  Label is_aligned, after_alignment;

  andi_  (tmp, buf_in, 0xF);
  beq    (CCR0, is_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx    (ws[0], buf_in);
  addi   (buf_in, buf_in, 16);
  lvsl   (vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];

    lvx  (w_cur, buf_in);
    addi (buf_in, buf_in, 16);
    vec_perm(w_prev, w_cur, vRb);
  }

  lvx    (aux, buf_in);
  vec_perm(ws[total_ws-1], aux, vRb);

  b      (after_alignment);

  bind(is_aligned);

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (w, buf_in);
    addi (buf_in, buf_in, 16);
  }

  bind(after_alignment);
}

// Update hash state
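// Each hs[] register holds two 64-bit hash words: the previous digest is
// reloaded from 'state' (with a separate unaligned path), the working
// variables are merged pairwise and added to it, and the eight updated words
// are stored back.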
void MacroAssembler::sha512_update_sha_state(const Register state,
                                             const VectorRegister* hs,
                                             const int total_hs) {

#if defined(VM_LITTLE_ENDIAN)
  int start_idx = 0;
#else
  int start_idx = 1;
#endif

  // load initial hash from the memory pointed by state
  VectorRegister ini_a = VR10;
  VectorRegister ini_c = VR12;
  VectorRegister ini_e = VR14;
  VectorRegister ini_g = VR16;
  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);

  Label state_save_aligned, after_state_save_aligned;

  Register addr      = R7;
  Register tmp       = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_save_aligned);
  // deal with unaligned addresses

  {
    VectorRegister a = hs[0];
    VectorRegister b_ = hs[1];
    VectorRegister c = hs[2];
    VectorRegister d = hs[3];
    VectorRegister e = hs[4];
    VectorRegister f = hs[5];
    VectorRegister g = hs[6];
    VectorRegister h = hs[7];
    lvsr   (vRb, state);
    lvx    (ini_a, state);
    addi   (addr, state, 16);

    lvx    (ini_c, addr);
    addi   (addr, addr, 16);
    vec_perm(ini_a, ini_c, vRb);

    lvx    (ini_e, addr);
    addi   (addr, addr, 16);
    vec_perm(ini_c, ini_e, vRb);

    lvx    (ini_g, addr);
    addi   (addr, addr, 16);
    vec_perm(ini_e, ini_g, vRb);

    lvx    (aux, addr);
    vec_perm(ini_g, aux, vRb);

#if defined(VM_LITTLE_ENDIAN)
    xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
#else
    xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
    xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
    xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
    xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
#endif

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      mfvrd  (tmp, h_cur);
#if defined(VM_LITTLE_ENDIAN)
      std    (tmp, 8*n + 8, state);
#else
      std    (tmp, 8*n - 8, state);
#endif
      vsldoi (aux, h_cur, h_cur, 8);
      mfvrd  (tmp, aux);
      std    (tmp, 8*n + 0, state);
    }

    b      (after_state_save_aligned);
  }

  bind(state_save_aligned);
  {
    mr(addr, state);
    for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
      VectorRegister h_cur = hs[n];
      VectorRegister h_next = hs[n+1];
#else
      VectorRegister h_cur = hs[n+1];
      VectorRegister h_next = hs[n];
#endif
      VectorRegister ini_cur = inis[n/2];

      lvx(ini_cur, addr);
      addi(addr, addr, 16);
      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    mr(addr, state);
    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      stvx(h_cur, addr);
      addi(addr, addr, 16);
    }
  }

  bind(after_state_save_aligned);
}

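// One SHA-512 round (FIPS 180-4), same structure as sha256_round but on
// 64-bit words:
//   T1 = h + Sigma1(e) + Ch(e,f,g) + (K[t] + W[t])   (kpw already holds K + W)
//   T2 = Sigma0(a) + Maj(a,b,c)
//   d := d + T1;  h := T1 + T2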
// Use h_cnt to cycle through hs elements but also increment it at the end
void MacroAssembler::sha512_round(const VectorRegister* hs,
                                  const int total_hs, int& h_cnt,
                                  const VectorRegister kpw) {

  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  const VectorRegister Ch   = VR20;
  const VectorRegister Maj  = VR21;
  const VectorRegister bsa  = VR22;
  const VectorRegister bse  = VR23;
  const VectorRegister tmp1 = VR24;
  const VectorRegister tmp2 = VR25;

  vsel      (Ch,   g,    f,   e);
  vxor      (Maj,  a,    b);
  vshasigmad(bse,  e,    1,   0xf);
  vaddudm   (tmp2, Ch,   kpw);
  vaddudm   (tmp1, h,    bse);
  vsel      (Maj,  b,    c,   Maj);
  vaddudm   (tmp1, tmp1, tmp2);
  vshasigmad(bsa,  a,    1,   0);
  vaddudm   (tmp2, bsa,  Maj);
  vaddudm   (d,    d,    tmp1);
  vaddudm   (h,    tmp1, tmp2);

  // advance vector pointer to the next iteration
  h_cnt++;
}

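// Computes the next two message-schedule words (FIPS 180-4):
//   W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]
// w0..w7 always hold the previous 16 words and are shifted by one register
// at the end. vRb holds the vperm control that pairs doublewords straddling
// two registers. The two new words plus the round constants loaded from k[j]
// are left in kpw0 and kpw1.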
void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister w4,
                                    const VectorRegister w5,
                                    const VectorRegister w6,
                                    const VectorRegister w7,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const Register j,
                                    const VectorRegister vRb,
                                    const Register k) {
  // Temporaries
  const VectorRegister VR_a = VR20;
  const VectorRegister VR_b = VR21;
  const VectorRegister VR_c = VR22;
  const VectorRegister VR_d = VR23;

  // load to k[j]
  lvx        (VR_a, j,    k);
  // advance j
  addi       (j,    j,    16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // VR_b = w[j-15], w[j-14]
  vperm      (VR_b, w1,   w0,  vRb);
  // VR_c = w[j-7], w[j-6]
  vperm      (VR_c, w5,   w4,  vRb);
#else
  // VR_b = w[j-15], w[j-14]
  vperm      (VR_b, w0,   w1,  vRb);
  // VR_c = w[j-7], w[j-6]
  vperm      (VR_c, w4,   w5,  vRb);
#endif

  // VR_b = s0(w[j-15]) , s0(w[j-14])
  vshasigmad (VR_b, VR_b,    0,   0);
  // VR_d = s1(w[j-2]) , s1(w[j-1])
  vshasigmad (VR_d, w7,      0,   0xf);
  // VR_b = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
  vaddudm    (VR_b, VR_b, VR_c);
  // VR_d = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
  vaddudm    (VR_d, VR_d, w0);
  // VR_c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //        s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  vaddudm    (VR_c, VR_d, VR_b);
  // Updating w0 to w7 to hold the new previous 16 values from w.
  vmr        (w0,   w1);
  vmr        (w1,   w2);
  vmr        (w2,   w3);
  vmr        (w3,   w4);
  vmr        (w4,   w5);
  vmr        (w5,   w6);
  vmr        (w6,   w7);
  vmr        (w7,   VR_c);

#if defined(VM_LITTLE_ENDIAN)
  // store k + w to kpw0 (2 values at once)
  vaddudm    (kpw0, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi     (kpw1, kpw0, kpw0, 8);
#else
  // store k + w to kpw0 (2 values at once)
  vaddudm    (kpw1, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi     (kpw0, kpw1, kpw1, 8);
#endif
}

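// Loads the eight 64-bit hash words from 'state' into hs[] (two words per
// vector), handling an unaligned state pointer with lvsl/vperm. Only every
// other hs[] entry is written here; sha512() spreads the pairs afterwards.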
void MacroAssembler::sha512_load_h_vec(const Register state,
                                       const VectorRegister* hs,
                                       const int total_hs) {
#if defined(VM_LITTLE_ENDIAN)
  VectorRegister a   = hs[0];
  VectorRegister g   = hs[6];
  int start_idx = 0;
#else
  VectorRegister a   = hs[1];
  VectorRegister g   = hs[7];
  int start_idx = 1;
#endif

  Register addr      = R7;
  VectorRegister vRb = VR8;
  Register tmp       = R8;
  Label state_aligned, after_state_aligned;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_aligned);

  // deal with unaligned addresses
  VectorRegister aux = VR9;

  lvx    (a,    state);
  addi   (addr, state, 16);
  lvsl   (vRb,  addr);

  for (int n = start_idx + 2; n < total_hs; n += 2) {
    VectorRegister h_cur   = hs[n];
    VectorRegister h_prev2 = hs[n - 2];

    lvx    (h_cur,   addr);
    addi   (addr,    addr,  16);
    vec_perm(h_prev2, h_cur, vRb);
  }
  lvx    (aux, addr);
  vec_perm(g, aux, vRb);

  b      (after_state_aligned);

  bind(state_aligned);

  // deal with aligned addresses
  mr(addr, state);
  for (int n = start_idx; n < total_hs; n += 2) {
    VectorRegister h_cur = hs[n];

    lvx    (h_cur, addr);
    addi   (addr, addr, 16);
  }

  bind(after_state_aligned);
}

//   R3_ARG1   - byte[]  input buffer, already padded, in big-endian byte order
//   R4_ARG2   - long[]  SHA.state (initially, the fractional parts of the
//                       square roots of the first eight primes)
//   R5_ARG3   - int     offset
//   R6_ARG4   - int     limit
//
//   Internal Register usage:
//   R7 R8 R9  - volatile temporaries
//   VR0-VR7   - a-h
//   VR8       - vRb
//   VR9       - aux (highly volatile, use with care)
//   VR10-VR17 - w0-w7 | ini_a-ini_h
//   VR18      - vsp16 | kplusw0
//   VR19      - vsp32 | kplusw1
//   VR20-VR25 - sha512_calc_2w and sha512_round temporaries
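//
//   Overall flow of the generated stub: save the non-volatile VRs used below
//   into the red zone, load the hash state, then per 128-byte block: load the
//   16 message words (byte-swapping them on little-endian), run the first 16
//   rounds while adding the round constants on the fly, run the remaining 64
//   rounds in core_loop (sha512_calc_2w produces K+W for 2 rounds per call),
//   and add the result into the state. With multi_block the loop repeats
//   until ofs reaches limit and the new offset is returned in R3_ARG1.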
void MacroAssembler::sha512(bool multi_block) {
  static const ssize_t base_size = sizeof(uint64_t);
  static const ssize_t buf_size = 128;
  static uint64_t waux[buf_size / base_size] __attribute((aligned (16)));
  static const uint64_t round_consts[80] __attribute((aligned (16))) = {
    0x428a2f98d728ae22, 0x7137449123ef65cd,
    0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
    0x3956c25bf348b538, 0x59f111f1b605d019,
    0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
    0xd807aa98a3030242, 0x12835b0145706fbe,
    0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
    0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
    0x9bdc06a725c71235, 0xc19bf174cf692694,
    0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
    0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
    0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
    0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
    0x983e5152ee66dfab, 0xa831c66d2db43210,
    0xb00327c898fb213f, 0xbf597fc7beef0ee4,
    0xc6e00bf33da88fc2, 0xd5a79147930aa725,
    0x06ca6351e003826f, 0x142929670a0e6e70,
    0x27b70a8546d22ffc, 0x2e1b21385c26c926,
    0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
    0x650a73548baf63de, 0x766a0abb3c77b2a8,
    0x81c2c92e47edaee6, 0x92722c851482353b,
    0xa2bfe8a14cf10364, 0xa81a664bbc423001,
    0xc24b8b70d0f89791, 0xc76c51a30654be30,
    0xd192e819d6ef5218, 0xd69906245565a910,
    0xf40e35855771202a, 0x106aa07032bbd1b8,
    0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
    0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
    0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
    0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
    0x748f82ee5defb2fc, 0x78a5636f43172f60,
    0x84c87814a1f0ab72, 0x8cc702081a6439ec,
    0x90befffa23631e28, 0xa4506cebde82bde9,
    0xbef9a3f7b2c67915, 0xc67178f2e372532b,
    0xca273eceea26619c, 0xd186b8c721c0c207,
    0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
    0x06f067aa72176fba, 0x0a637dc5a2c898a6,
    0x113f9804bef90dae, 0x1b710b35131c471b,
    0x28db77f523047d84, 0x32caab7b40c72493,
    0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
    0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
    0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
  };
  static const uint8_t w_size = sizeof(round_consts)/sizeof(uint64_t);

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, bsw_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - (nv_size)) * 16);
    stvx(nv[c], idx, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR0;
  VectorRegister b = VR1;
  VectorRegister c = VR2;
  VectorRegister d = VR3;
  VectorRegister e = VR4;
  VectorRegister f = VR5;
  VectorRegister g = VR6;
  VectorRegister h = VR7;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
  sha512_load_h_vec(state, hs, total_hs);

  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);

  for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
    VectorRegister h_cur = hs[n];
    VectorRegister h_next = hs[n + 1];
#else
    VectorRegister h_cur = hs[n + 1];
    VectorRegister h_next = hs[n];
#endif
    vsldoi (h_next, h_cur, h_cur, 8);
  }

  Register k = R9;
  load_const_optimized(k, const_cast<uint64_t *>(round_consts), R0);

  // Load 16 elements from w outside the loop.
  // Order of the long values is endianness-specific.
  VectorRegister w0 = VR10;
  VectorRegister w1 = VR11;
  VectorRegister w2 = VR12;
  VectorRegister w3 = VR13;
  VectorRegister w4 = VR14;
  VectorRegister w5 = VR15;
  VectorRegister w6 = VR16;
  VectorRegister w7 = VR17;
  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  // Load 16 w into vectors and setup vsl for vperm
  sha512_load_w_vec(buf_in, ws, total_ws);

#if defined(VM_LITTLE_ENDIAN)
  VectorRegister vsp16 = VR18;
  VectorRegister vsp32 = VR19;
  VectorRegister shiftarg = VR9;

  vspltisw(vsp16,    8);
  vspltisw(shiftarg, 1);
  vsl     (vsp16,    vsp16, shiftarg);
  vsl     (vsp32,    vsp16, shiftarg);

  VectorRegister vsp8 = VR9;
  vspltish(vsp8,     8);

  // Convert input from Big Endian to Little Endian
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlh  (w, w, vsp8);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlw  (w, w, vsp16);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrld  (w, w, vsp32);
  }
#endif

  Register Rb        = R10;
  VectorRegister vRb = VR8;
  li      (Rb, 8);
  lvsl    (vRb, Rb);

  VectorRegister kplusw0 = VR18;
  VectorRegister kplusw1 = VR19;

  Register addr      = R7;
  mr      (addr, k);

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx    (kplusw0, addr);
    addi   (addr, addr, 16);
#if defined(VM_LITTLE_ENDIAN)
    vaddudm(kplusw0, kplusw0, w);
    vsldoi (kplusw1, kplusw0, kplusw0, 8);
#else
    vaddudm(kplusw1, kplusw0, w);
    vsldoi (kplusw0, kplusw1, kplusw1, 8);
#endif

    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  Register tmp       = R8;
  li    (tmp, (w_size-16)/total_hs);
  mtctr (tmp);
  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g., when j is used in a function)
  Register j = tmp;
  li     (j, 8*16);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/2; n++) {
    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  bdnz   (core_loop);

  sha512_update_sha_state(state, hs, total_hs);

  if (multi_block) {
    // process next 1024 bit block (buf_in already updated)
    addi(ofs, ofs, buf_size);
    cmpd(CCR0, ofs, limit);
    blt(CCR0, sha_loop);

    // return ofs
    mr(R3_ARG1, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - (nv_size)) * 16);
    lvx(nv[c], idx, R1);
  }
}