// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.

// Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
// (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).

#include "asm/macroAssembler.inline.hpp"
#include "runtime/stubRoutines.hpp"

/**********************************************************************
 * SHA 256
 *********************************************************************/

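// Produce three byte rotations of src (by 12, 8 and 4 bytes) into dst1..dst3.
// This is used both to spread the packed state words across the a-h registers
// and to make each of the four packed k+w sums available to one round.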
void MacroAssembler::sha256_deque(const VectorRegister src,
                                  const VectorRegister dst1,
                                  const VectorRegister dst2,
                                  const VectorRegister dst3) {
  vsldoi (dst1, src, src, 12);
  vsldoi (dst2, src, src, 8);
  vsldoi (dst3, src, src, 4);
}

void MacroAssembler::sha256_round(const VectorRegister* hs,
                                  const int total_hs,
                                  int& h_cnt,
                                  const VectorRegister kpw) {
  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  VectorRegister ch  = VR0;
  VectorRegister maj = VR1;
  VectorRegister bsa = VR2;
  VectorRegister bse = VR3;
  VectorRegister vt0 = VR4;
  VectorRegister vt1 = VR5;
  VectorRegister vt2 = VR6;
  VectorRegister vt3 = VR7;

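  // One SHA-256 compression round (FIPS 180-4 notation), where K[t] + W[t]
  // arrives pre-added in kpw:
  //   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
  //   T2 = Sigma0(a) + Maj(a,b,c)
  //   d  = d + T1
  //   h  = T1 + T2
  // with Ch(e,f,g) = (e & f) ^ (~e & g), Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c),
  // Sigma0(x) = rotr(x,2) ^ rotr(x,13) ^ rotr(x,22) and
  // Sigma1(x) = rotr(x,6) ^ rotr(x,11) ^ rotr(x,25).
  // Ch is computed with vsel (pick f where e is 1, g where e is 0) and Maj
  // with vxor + vsel (where a^b is 1 pick c, otherwise a == b, so pick b).
  // The other working variables are not moved; h_cnt rotates their roles.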
  vsel       (ch,  g,   f, e);
  vxor       (maj, a,   b);
  vshasigmaw (bse, e,   1, 0xf);
  vadduwm    (vt2, ch,  kpw);
  vadduwm    (vt1, h,   bse);
  vsel       (maj, b,   c, maj);
  vadduwm    (vt3, vt1, vt2);
  vshasigmaw (bsa, a,   1, 0);
  vadduwm    (vt0, bsa, maj);

  vadduwm    (d,   d,   vt3);
  vadduwm    (h,   vt3, vt0);

  // advance vector pointer to the next iteration
  h_cnt++;
}

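// Load the eight 32-bit state words pointed to by hptr into two vectors:
// the first four words into a and the last four into e. An unaligned hptr is
// handled with an extra load plus vperm.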
void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
                                       const VectorRegister e,
                                       const Register hptr) {
  // temporaries
  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vRb = VR6;
  // labels
  Label sha256_aligned;

  andi_  (tmp,  hptr, 0xf);
  lvx    (a,    hptr);
  addi   (tmp,  hptr, 16);
  lvx    (e,    tmp);
  beq    (CCR0, sha256_aligned);

  // handle unaligned accesses
  load_perm(vRb, hptr);
  addi   (tmp, hptr, 32);
  vec_perm(a,   e,    vRb);

  lvx    (vt0,  tmp);
  vec_perm(e,   vt0,  vRb);

  // aligned accesses
  bind(sha256_aligned);
}

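// Load the first 16 message words from buf_in into ws[0..3] (handling an
// unaligned buffer with lvx + vperm), byte-swap them on little-endian, and
// pre-add the first 16 round constants so each kpws[n] already holds k + w.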
void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
                                              const VectorRegister* ws,
                                              const int total_ws,
                                              const Register k,
                                              const VectorRegister* kpws,
                                              const int total_kpws) {
  Label w_aligned, after_w_load;

  Register tmp       = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vt1 = VR1;
  VectorRegister vRb = VR6;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // branch if buf_in is 16-byte aligned

  // deal with unaligned addresses
  lvx    (ws[0], buf_in);
  load_perm(vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];

    addi (tmp, buf_in, n * 16);
    lvx  (w_cur, tmp);
    vec_perm(w_prev, w_cur, vRb);
  }
  addi   (tmp, buf_in, total_ws * 16);
  lvx    (vt0, tmp);
  vec_perm(ws[total_ws-1], vt0, vRb);
  b      (after_w_load);

  bind(w_aligned);

  // deal with aligned addresses
  lvx(ws[0], buf_in);
  for (int n = 1; n < total_ws; n++) {
    VectorRegister w = ws[n];
    addi (tmp, buf_in, n * 16);
    lvx  (w, tmp);
  }

  bind(after_w_load);

#if defined(VM_LITTLE_ENDIAN)
  // Byte swapping within int values
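  // Build a permute mask that reverses the bytes of each 32-bit word:
  // lvsl at offset 8 yields {8, 9, ..., 23}; xor-ing every byte with 0xb
  // turns that into {3, 2, 1, 0, 7, 6, 5, 4, ...}, which vperm then applies
  // as a per-word byte reversal.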
  li       (tmp, 8);
  lvsl     (vt0, tmp);
  vspltisb (vt1, 0xb);
  vxor     (vt1, vt0, vt1);
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];
    vec_perm(w, w, vt1);
  }
#endif

  // Load k, which is always 16-byte aligned
  lvx    (kpws[0], k);
  for (int n = 1; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    addi (tmp, k, 16 * n);
    lvx  (kpw, tmp);
  }

  // Add w to K
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    VectorRegister w   = ws[n];

    vadduwm  (kpw, kpw, w);
  }
}

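// Compute the next four message-schedule words and their k + w sums.
// SHA-256 message schedule (FIPS 180-4):
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// with
//   s0(x) = rotr(x,7) ^ rotr(x,18) ^ (x >> 3)
//   s1(x) = rotr(x,17) ^ rotr(x,19) ^ (x >> 10)
// The four words are produced in two steps because w[j+2] and w[j+3] depend
// on w[j] and w[j+1].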
void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const VectorRegister kpw2,
                                    const VectorRegister kpw3,
                                    const Register j,
                                    const Register k) {
  // Temporaries
  const VectorRegister  vt0  = VR0;
  const VectorRegister  vt1  = VR1;
  const VectorSRegister vsrt1 = vt1->to_vsr();
  const VectorRegister  vt2  = VR2;
  const VectorRegister  vt3  = VR3;
  const VectorSRegister vst3 = vt3->to_vsr();
  const VectorRegister  vt4  = VR4;

  // load the next 4 round constants from k (j is a byte offset into k)
  lvx        (vt0, j,   k);

  // advance j
  addi       (j,   j,   16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi     (vt1, w1,  w0, 12);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi     (vt2, w3,  w2, 12);

#else
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi     (vt1, w0,  w1, 4);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi     (vt2, w2,  w3, 4);
#endif

  // d = w[j-2], w[j-1], w[j-4], w[j-3]
  vsldoi     (vt3, w3,  w3, 8);

  // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
  vshasigmaw (vt1, vt1, 0,  0);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
  vshasigmaw (vt3, vt3, 0,  0xf);

  // c = s0(w[j-15]) + w[j-7],
  //     s0(w[j-14]) + w[j-6],
  //     s0(w[j-13]) + w[j-5],
  //     s0(w[j-12]) + w[j-4]
  vadduwm    (vt2, vt1, vt2);

  // c = s0(w[j-15]) + w[j-7] + w[j-16],
  //     s0(w[j-14]) + w[j-6] + w[j-15],
  //     s0(w[j-13]) + w[j-5] + w[j-14],
  //     s0(w[j-12]) + w[j-4] + w[j-13]
  vadduwm    (vt2, vt2, w0);

  // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
  vadduwm    (vt4, vt2, vt3);

  // At this point, e[0] and e[1] are the correct values to be stored at w[j]
  // and w[j+1].
  // e[2] and e[3] are not considered.
  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
  vshasigmaw (vt1, vt4, 0,  0xf);

  // vt3 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
#if defined(VM_LITTLE_ENDIAN)
  xxmrgld    (vst3, vsrt1, vst3);
#else
  xxmrghd    (vst3, vst3, vsrt1);
#endif

  // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
  vadduwm    (vt2, vt2, vt3);

  // Updating w0 to w3 to hold the new previous 16 values from w.
  vmr        (w0,  w1);
  vmr        (w1,  w2);
  vmr        (w2,  w3);
  vmr        (w3,  vt2);

  // store k + w to kpw0..kpw3 (4 values at once)
#if defined(VM_LITTLE_ENDIAN)
  vadduwm    (kpw0, vt2, vt0);

  vsldoi     (kpw1, kpw0, kpw0, 12);
  vsldoi     (kpw2, kpw0, kpw0, 8);
  vsldoi     (kpw3, kpw0, kpw0, 4);
#else
  vadduwm    (kpw3, vt2, vt0);

  vsldoi     (kpw2, kpw3, kpw3, 12);
  vsldoi     (kpw1, kpw3, kpw3, 8);
  vsldoi     (kpw0, kpw3, kpw3, 4);
#endif
}

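// Add the working variables a..h back into the state pointed to by hptr and
// store the result. Each working variable arrives in its own vector register;
// they are repacked into two vectors of four words before the addition and
// the store.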
void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
                                             const VectorRegister b_,
                                             const VectorRegister c,
                                             const VectorRegister d,
                                             const VectorRegister e,
                                             const VectorRegister f,
                                             const VectorRegister g,
                                             const VectorRegister h,
                                             const Register hptr) {
  // temporaries
  VectorRegister vt0  = VR0;
  VectorRegister vt1  = VR1;
  VectorRegister vt2  = VR2;
  VectorRegister vt3  = VR3;
  VectorRegister vt4  = VR4;
  VectorRegister vt5  = VR5;
  VectorRegister vaux = VR6;
  VectorRegister vRb  = VR6;
  Register tmp        = R8;
  Register of16       = R8;
  Register of32       = R9;
  Label state_load_aligned;

  // Load hptr
  andi_   (tmp, hptr, 0xf);
  li      (of16, 16);
  lvx     (vt0, hptr);
  lvx     (vt5, of16, hptr);
  beq     (CCR0, state_load_aligned);

  // handle unaligned accesses
  li      (of32, 32);
  load_perm(vRb, hptr);

  vec_perm(vt0, vt5,  vRb);        // vt0 = hptr[0]..hptr[3]

  lvx     (vt1, hptr, of32);
  vec_perm(vt5, vt1,  vRb);        // vt5 = hptr[4]..hptr[7]

  // aligned accesses
  bind(state_load_aligned);

#if defined(VM_LITTLE_ENDIAN)
  vmrglw  (vt1, b_, a);            // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, d, c);             // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, f, e);             // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, h, g);             // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (a,   vt0, vt1);         // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (e,   vt5, vt3);         // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  xxswapd (vt0->to_vsr(), a->to_vsr());
  stxvd2x (vt0->to_vsr(), hptr);
  xxswapd (vt5->to_vsr(), e->to_vsr());
  stxvd2x (vt5->to_vsr(), of16, hptr);
#else
  vmrglw  (vt1, a, b_);            // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, c, d);             // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, e, f);             // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, g, h);             // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (d,   vt0, vt1);         // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (h,   vt5, vt3);         // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  stxvd2x (d->to_vsr(), hptr);
  stxvd2x (h->to_vsr(), of16, hptr);
#endif
}

static const uint32_t sha256_round_table[64] __attribute((aligned(16))) = {
  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
static const uint32_t *sha256_round_consts = sha256_round_table;

//   R3_ARG1   - byte[]  input buffer, with padding, in big-endian byte order
//   R4_ARG2   - int[]   SHA state (initially the SHA-256 initial hash values,
//                       i.e. fractional parts of the square roots of the first eight primes)
//   R5_ARG3   - int     offset
//   R6_ARG4   - int     limit
//
//   Internal Register usage:
//   R7        - k
//   R8        - tmp | j | of16
//   R9        - of32
//   VR0-VR8   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
//   VR9-VR16  - a-h
//   VR17-VR20 - w0-w3
//   VR21-VR23 - vRb | vaux0-vaux2
//   VR24-VR27 - kpw0-kpw3
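// Overall flow: save the non-volatile VRs used below, load the hash state,
// then for each 64-byte block load w[0..15] with k pre-added, run the first
// 16 rounds, run the remaining 48 rounds in core_loop (computing the message
// schedule on the fly, 8 rounds per iteration) and add the result back into
// the state.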
void MacroAssembler::sha256(bool multi_block) {
  static const ssize_t buf_size = 64;
  static const uint8_t w_size = sizeof(sha256_round_table)/sizeof(uint32_t);
#ifdef AIX
  // malloc provides 16 byte alignment
  if (((uintptr_t)sha256_round_consts & 0xF) != 0) {
    uint32_t *new_round_consts = (uint32_t*)malloc(sizeof(sha256_round_table));
    guarantee(new_round_consts, "oom");
    memcpy(new_round_consts, sha256_round_consts, sizeof(sha256_round_table));
    sha256_round_consts = (const uint32_t*)new_round_consts;
  }
#endif

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - (nv_size)) * 16);
    stvx(nv[c], tmp, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR9;
  VectorRegister b = VR10;
  VectorRegister c = VR11;
  VectorRegister d = VR12;
  VectorRegister e = VR13;
  VectorRegister f = VR14;
  VectorRegister g = VR15;
  VectorRegister h = VR16;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed to by state
#if defined(VM_LITTLE_ENDIAN)
  sha256_load_h_vec(a, e, state);
#else
  sha256_load_h_vec(d, h, state);
#endif

  // keep k loaded across multi-block iterations
  Register k = R7;
  assert(((uintptr_t)sha256_round_consts & 0xF) == 0, "k alignment");
  load_const_optimized(k, (address)sha256_round_consts, R0);

  // Avoiding redundant loads
  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);
#if defined(VM_LITTLE_ENDIAN)
  sha256_deque(a, b, c, d);
  sha256_deque(e, f, g, h);
#else
  sha256_deque(d, c, b, a);
  sha256_deque(h, g, f, e);
#endif

  // Load the first 16 elements of w outside the core loop.
  // The order of the int values is endianness specific.
  VectorRegister w0 = VR17;
  VectorRegister w1 = VR18;
  VectorRegister w2 = VR19;
  VectorRegister w3 = VR20;
  static const VectorRegister ws[] = {w0, w1, w2, w3};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  VectorRegister kpw0 = VR24;
  VectorRegister kpw1 = VR25;
  VectorRegister kpw2 = VR26;
  VectorRegister kpw3 = VR27;
  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);

  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);

  // Cycle through the first 16 elements
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_ws; n++) {
    VectorRegister vaux0 = VR21;
    VectorRegister vaux1 = VR22;
    VectorRegister vaux2 = VR23;

    sha256_deque(kpws[n], vaux0, vaux1, vaux2);

#if defined(VM_LITTLE_ENDIAN)
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux2);
#else
    sha256_round(hs, total_hs, h_cnt, vaux2);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
#endif
  }

  Register tmp = R8;
  // loop over rounds 16 to 63, eight rounds per iteration
  li   (tmp, (w_size - 16) / total_hs);
  mtctr(tmp);

  // j is a byte offset into k and stays aligned for loading words.
  // Whenever j is read, it is advanced (e.g. when j is used inside a called function).
  Register j = R8;
  li   (j, 16*4);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/4; n++) {
    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
    sha256_round(hs, total_hs, h_cnt, kpw0);
    sha256_round(hs, total_hs, h_cnt, kpw1);
    sha256_round(hs, total_hs, h_cnt, kpw2);
    sha256_round(hs, total_hs, h_cnt, kpw3);
  }

  bdnz   (core_loop);

  // Update hash state
  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);

  if (multi_block) {
    addi(buf_in, buf_in, buf_size);
    addi(ofs, ofs, buf_size);
    cmplw(CCR0, ofs, limit);
    ble(CCR0, sha_loop);

    // return ofs
    mr(R3_RET, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - (nv_size)) * 16);
    lvx(nv[c], tmp, R1);
  }
}


/**********************************************************************
 * SHA 512
 *********************************************************************/

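// Load the first 16 message words (128 bytes) from buf_in into ws[0..7],
// using lvx + vperm to handle an unaligned buffer.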
void MacroAssembler::sha512_load_w_vec(const Register buf_in,
                                       const VectorRegister* ws,
                                       const int total_ws) {
  Register tmp       = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;
  Label is_aligned, after_alignment;

  andi_  (tmp, buf_in, 0xF);
  beq    (CCR0, is_aligned); // branch if buf_in is 16-byte aligned

  // deal with unaligned addresses
  lvx    (ws[0], buf_in);
  load_perm(vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];
    addi (tmp, buf_in, n * 16);
    lvx  (w_cur, tmp);
    vec_perm(w_prev, w_cur, vRb);
  }
  addi   (tmp, buf_in, total_ws * 16);
  lvx    (aux, tmp);
  vec_perm(ws[total_ws-1], aux, vRb);
  b      (after_alignment);

  bind(is_aligned);
  lvx  (ws[0], buf_in);
  for (int n = 1; n < total_ws; n++) {
    VectorRegister w = ws[n];
    addi (tmp, buf_in, n * 16);
    lvx  (w, tmp);
  }

  bind(after_alignment);
}

// Update hash state
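// Adds the working variables back into the state pointed to by state and
// stores the result, with separate paths for aligned and unaligned state
// pointers.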
void MacroAssembler::sha512_update_sha_state(const Register state,
                                             const VectorRegister* hs,
                                             const int total_hs) {

#if defined(VM_LITTLE_ENDIAN)
  int start_idx = 0;
#else
  int start_idx = 1;
#endif

  // load initial hash from the memory pointed to by state
  VectorRegister ini_a = VR10;
  VectorRegister ini_c = VR12;
  VectorRegister ini_e = VR14;
  VectorRegister ini_g = VR16;
  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);

  Label state_save_aligned, after_state_save_aligned;

  Register addr      = R7;
  Register tmp       = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_save_aligned);
  // deal with unaligned addresses

  {
    VectorRegister a = hs[0];
    VectorRegister b_ = hs[1];
    VectorRegister c = hs[2];
    VectorRegister d = hs[3];
    VectorRegister e = hs[4];
    VectorRegister f = hs[5];
    VectorRegister g = hs[6];
    VectorRegister h = hs[7];
    load_perm(vRb, state);
    lvx    (ini_a, state);
    addi   (addr, state, 16);

    lvx    (ini_c, addr);
    addi   (addr, state, 32);
    vec_perm(ini_a, ini_c, vRb);

    lvx    (ini_e, addr);
    addi   (addr, state, 48);
    vec_perm(ini_c, ini_e, vRb);

    lvx    (ini_g, addr);
    addi   (addr, state, 64);
    vec_perm(ini_e, ini_g, vRb);

    lvx    (aux, addr);
    vec_perm(ini_g, aux, vRb);

#if defined(VM_LITTLE_ENDIAN)
    xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
#else
    xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
    xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
    xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
    xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
#endif

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      mfvrd  (tmp, h_cur);
#if defined(VM_LITTLE_ENDIAN)
      std    (tmp, 8*n + 8, state);
#else
      std    (tmp, 8*n - 8, state);
#endif
      vsldoi (aux, h_cur, h_cur, 8);
      mfvrd  (tmp, aux);
      std    (tmp, 8*n + 0, state);
    }

    b      (after_state_save_aligned);
  }

  bind(state_save_aligned);
  {
    for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
      VectorRegister h_cur = hs[n];
      VectorRegister h_next = hs[n+1];
#else
      VectorRegister h_cur = hs[n+1];
      VectorRegister h_next = hs[n];
#endif
      VectorRegister ini_cur = inis[n/2];

      if (n/2 == 0) {
        lvx(ini_cur, state);
      } else {
        addi(addr, state, (n/2) * 16);
        lvx(ini_cur, addr);
      }
      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      if (n/2 == 0) {
        stvx(h_cur, state);
      } else {
        addi(addr, state, (n/2) * 16);
        stvx(h_cur, addr);
      }
    }
  }

  bind(after_state_save_aligned);
}

// Use h_cnt to cycle through hs elements but also increment it at the end
void MacroAssembler::sha512_round(const VectorRegister* hs,
                                  const int total_hs, int& h_cnt,
                                  const VectorRegister kpw) {

  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  const VectorRegister Ch   = VR20;
  const VectorRegister Maj  = VR21;
  const VectorRegister bsa  = VR22;
  const VectorRegister bse  = VR23;
  const VectorRegister tmp1 = VR24;
  const VectorRegister tmp2 = VR25;

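  // One SHA-512 compression round; identical in structure to sha256_round but
  // on 64-bit lanes (vaddudm/vshasigmad), with K[t] + W[t] pre-added in kpw:
  //   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
  //   T2 = Sigma0(a) + Maj(a,b,c)
  //   d  = d + T1
  //   h  = T1 + T2
  // where Sigma0(x) = rotr(x,28) ^ rotr(x,34) ^ rotr(x,39) and
  //       Sigma1(x) = rotr(x,14) ^ rotr(x,18) ^ rotr(x,41).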
  vsel      (Ch,   g,    f,   e);
  vxor      (Maj,  a,    b);
  vshasigmad(bse,  e,    1,   0xf);
  vaddudm   (tmp2, Ch,   kpw);
  vaddudm   (tmp1, h,    bse);
  vsel      (Maj,  b,    c,   Maj);
  vaddudm   (tmp1, tmp1, tmp2);
  vshasigmad(bsa,  a,    1,   0);
  vaddudm   (tmp2, bsa,  Maj);
  vaddudm   (d,    d,    tmp1);
  vaddudm   (h,    tmp1, tmp2);

  // advance vector pointer to the next iteration
  h_cnt++;
}

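// Compute the next two message-schedule words and their k + w sums.
// SHA-512 message schedule (FIPS 180-4):
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// with 64-bit operations
//   s0(x) = rotr(x,1) ^ rotr(x,8) ^ (x >> 7)
//   s1(x) = rotr(x,19) ^ rotr(x,61) ^ (x >> 6)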
void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister w4,
                                    const VectorRegister w5,
                                    const VectorRegister w6,
                                    const VectorRegister w7,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const Register j,
                                    const VectorRegister vRb,
                                    const Register k) {
  // Temporaries
  const VectorRegister VR_a = VR20;
  const VectorRegister VR_b = VR21;
  const VectorRegister VR_c = VR22;
  const VectorRegister VR_d = VR23;

  // load the next 2 round constants from k (j is a byte offset into k)
  lvx        (VR_a, j,    k);
  // advance j
  addi       (j,    j,    16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // VR_b = w[j-15], w[j-14]
  vperm      (VR_b, w1,   w0,  vRb);
  // VR_c = w[j-7], w[j-6]
  vperm      (VR_c, w5,   w4,  vRb);
#else
  // VR_b = w[j-15], w[j-14]
  vperm      (VR_b, w0,   w1,  vRb);
  // VR_c = w[j-7], w[j-6]
  vperm      (VR_c, w4,   w5,  vRb);
#endif

  // VR_b = s0(w[j-15]) , s0(w[j-14])
  vshasigmad (VR_b, VR_b,    0,   0);
  // VR_d = s1(w[j-2]) , s1(w[j-1])
  vshasigmad (VR_d, w7,      0,   0xf);
  // VR_b = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
  vaddudm    (VR_b, VR_b, VR_c);
  // VR_d = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
  vaddudm    (VR_d, VR_d, w0);
  // VR_c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //        s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  vaddudm    (VR_c, VR_d, VR_b);
  // Updating w0 to w7 to hold the new previous 16 values from w.
  vmr        (w0,   w1);
  vmr        (w1,   w2);
  vmr        (w2,   w3);
  vmr        (w3,   w4);
  vmr        (w4,   w5);
  vmr        (w5,   w6);
  vmr        (w6,   w7);
  vmr        (w7,   VR_c);

#if defined(VM_LITTLE_ENDIAN)
  // store k + w to kpw0 (2 values at once)
  vaddudm    (kpw0, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi     (kpw1, kpw0, kpw0, 8);
#else
  // store k + w to kpw1 (2 values at once)
  vaddudm    (kpw1, VR_c, VR_a);
  // kpw0 holds (k + w)[0]
  vsldoi     (kpw0, kpw1, kpw1, 8);
#endif
}

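// Load the eight 64-bit state words pointed to by state into every other
// entry of hs (two words per vector); the remaining entries are filled at the
// top of sha_loop in sha512() by rotating these vectors.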
void MacroAssembler::sha512_load_h_vec(const Register state,
                                       const VectorRegister* hs,
                                       const int total_hs) {
#if defined(VM_LITTLE_ENDIAN)
  VectorRegister a   = hs[0];
  VectorRegister g   = hs[6];
  int start_idx = 0;
#else
  VectorRegister a   = hs[1];
  VectorRegister g   = hs[7];
  int start_idx = 1;
#endif

  Register addr      = R7;
  VectorRegister vRb = VR8;
  Register tmp       = R8;
  Label state_aligned, after_state_aligned;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_aligned);

  // deal with unaligned addresses
  VectorRegister aux = VR9;

  lvx(hs[start_idx], state);
  load_perm(vRb, state);

  for (int n = start_idx + 2; n < total_hs; n += 2) {
    VectorRegister h_cur   = hs[n];
    VectorRegister h_prev2 = hs[n - 2];
    addi(addr, state, (n/2) * 16);
    lvx(h_cur, addr);
    vec_perm(h_prev2, h_cur, vRb);
  }
  addi(addr, state, (total_hs/2) * 16);
  lvx    (aux, addr);
  vec_perm(hs[total_hs - 2 + start_idx], aux, vRb);
  b      (after_state_aligned);

  bind(state_aligned);

  // deal with aligned addresses
  lvx(hs[start_idx], state);

  for (int n = start_idx + 2; n < total_hs; n += 2) {
    VectorRegister h_cur = hs[n];
    addi(addr, state, (n/2) * 16);
    lvx(h_cur, addr);
  }

  bind(after_state_aligned);
}

static const uint64_t sha512_round_table[80] __attribute((aligned(16))) = {
  0x428a2f98d728ae22, 0x7137449123ef65cd,
  0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
  0x3956c25bf348b538, 0x59f111f1b605d019,
  0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
  0xd807aa98a3030242, 0x12835b0145706fbe,
  0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
  0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
  0x9bdc06a725c71235, 0xc19bf174cf692694,
  0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
  0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
  0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
  0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
  0x983e5152ee66dfab, 0xa831c66d2db43210,
  0xb00327c898fb213f, 0xbf597fc7beef0ee4,
  0xc6e00bf33da88fc2, 0xd5a79147930aa725,
  0x06ca6351e003826f, 0x142929670a0e6e70,
  0x27b70a8546d22ffc, 0x2e1b21385c26c926,
  0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
  0x650a73548baf63de, 0x766a0abb3c77b2a8,
  0x81c2c92e47edaee6, 0x92722c851482353b,
  0xa2bfe8a14cf10364, 0xa81a664bbc423001,
  0xc24b8b70d0f89791, 0xc76c51a30654be30,
  0xd192e819d6ef5218, 0xd69906245565a910,
  0xf40e35855771202a, 0x106aa07032bbd1b8,
  0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
  0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
  0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
  0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
  0x748f82ee5defb2fc, 0x78a5636f43172f60,
  0x84c87814a1f0ab72, 0x8cc702081a6439ec,
  0x90befffa23631e28, 0xa4506cebde82bde9,
  0xbef9a3f7b2c67915, 0xc67178f2e372532b,
  0xca273eceea26619c, 0xd186b8c721c0c207,
  0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
  0x06f067aa72176fba, 0x0a637dc5a2c898a6,
  0x113f9804bef90dae, 0x1b710b35131c471b,
  0x28db77f523047d84, 0x32caab7b40c72493,
  0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
  0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
  0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
};
static const uint64_t *sha512_round_consts = sha512_round_table;

//   R3_ARG1   - byte[]  input buffer, with padding, in big-endian byte order
//   R4_ARG2   - long[]  SHA state (initially the SHA-512 initial hash values,
//                       i.e. fractional parts of the square roots of the first eight primes)
//   R5_ARG3   - int     offset
//   R6_ARG4   - int     limit
//
//   Internal Register usage:
//   R7 R8 R9  - volatile temporaries
//   VR0-VR7   - a-h
//   VR8       - vRb
//   VR9       - aux (highly volatile, use with care)
//   VR10-VR17 - w0-w7 | ini_a-ini_h
//   VR18      - vsp16 | kplusw0
//   VR19      - vsp32 | kplusw1
//   VR20-VR25 - sha512_calc_2w and sha512_round temporaries
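// Overall flow mirrors sha256(): save the non-volatile VRs used below, load
// the hash state, then for each 128-byte block load w[0..15], byte-swap on
// little-endian, run the first 16 rounds with k pre-added, run the remaining
// 64 rounds in core_loop (8 per iteration) and add the result back into the
// state.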
void MacroAssembler::sha512(bool multi_block) {
  static const ssize_t buf_size = 128;
  static const uint8_t w_size = sizeof(sha512_round_table)/sizeof(uint64_t);
#ifdef AIX
  // malloc provides 16 byte alignment
  if (((uintptr_t)sha512_round_consts & 0xF) != 0) {
    uint64_t *new_round_consts = (uint64_t*)malloc(sizeof(sha512_round_table));
    guarantee(new_round_consts, "oom");
    memcpy(new_round_consts, sha512_round_consts, sizeof(sha512_round_table));
    sha512_round_consts = (const uint64_t*)new_round_consts;
  }
#endif

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - (nv_size)) * 16);
    stvx(nv[c], idx, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR0;
  VectorRegister b = VR1;
  VectorRegister c = VR2;
  VectorRegister d = VR3;
  VectorRegister e = VR4;
  VectorRegister f = VR5;
  VectorRegister g = VR6;
  VectorRegister h = VR7;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed to by state
  sha512_load_h_vec(state, hs, total_hs);

  Register k = R9;
  assert(((uintptr_t)sha512_round_consts & 0xF) == 0, "k alignment");
  load_const_optimized(k, (address)sha512_round_consts, R0);

  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);

  for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
    VectorRegister h_cur = hs[n];
    VectorRegister h_next = hs[n + 1];
#else
    VectorRegister h_cur = hs[n + 1];
    VectorRegister h_next = hs[n];
#endif
    vsldoi (h_next, h_cur, h_cur, 8);
  }

  // Load the first 16 elements of w outside the core loop.
  // The order of the long values is endianness specific.
  VectorRegister w0 = VR10;
  VectorRegister w1 = VR11;
  VectorRegister w2 = VR12;
  VectorRegister w3 = VR13;
  VectorRegister w4 = VR14;
  VectorRegister w5 = VR15;
  VectorRegister w6 = VR16;
  VectorRegister w7 = VR17;
  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  // Load the 16 words of w into vectors, handling an unaligned buf_in
  sha512_load_w_vec(buf_in, ws, total_ws);

#if defined(VM_LITTLE_ENDIAN)
  VectorRegister vsp16 = VR18;
  VectorRegister vsp32 = VR19;
  VectorRegister shiftarg = VR9;

  vspltisw(vsp16,    8);
  vspltisw(shiftarg, 1);
  vsl     (vsp16,    vsp16, shiftarg);
  vsl     (vsp32,    vsp16, shiftarg);

  VectorRegister vsp8 = VR9;
  vspltish(vsp8,     8);

  // Convert input from Big Endian to Little Endian
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlh  (w, w, vsp8);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlw  (w, w, vsp16);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrld  (w, w, vsp32);
  }
#endif

  Register Rb        = R10;
  VectorRegister vRb = VR8;
  li      (Rb, 8);
  load_perm(vRb, Rb);

  VectorRegister kplusw0 = VR18;
  VectorRegister kplusw1 = VR19;

  Register addr      = R7;

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    if (n == 0) {
      lvx  (kplusw0, k);
    } else {
      addi (addr, k, n * 16);
      lvx  (kplusw0, addr);
    }
#if defined(VM_LITTLE_ENDIAN)
    vaddudm(kplusw0, kplusw0, w);
    vsldoi (kplusw1, kplusw0, kplusw0, 8);
#else
    vaddudm(kplusw1, kplusw0, w);
    vsldoi (kplusw0, kplusw1, kplusw1, 8);
#endif

    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  Register tmp       = R8;
  li    (tmp, (w_size-16)/total_hs);
  mtctr (tmp);
  // j is a byte offset into k and stays aligned for loading words.
  // Whenever j is read, it is advanced (e.g. when j is used inside a called function).
  Register j = tmp;
  li     (j, 8*16);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/2; n++) {
    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  bdnz   (core_loop);

  sha512_update_sha_state(state, hs, total_hs);

  if (multi_block) {
    addi(buf_in, buf_in, buf_size);
    addi(ofs, ofs, buf_size);
    cmplw(CCR0, ofs, limit);
    ble(CCR0, sha_loop);

    // return ofs
    mr(R3_RET, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - (nv_size)) * 16);
    lvx(nv[c], idx, R1);
  }
}