1 // Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
   2 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   3 //
   4 // This code is free software; you can redistribute it and/or modify it
   5 // under the terms of the GNU General Public License version 2 only, as
   6 // published by the Free Software Foundation.
   7 //
   8 // This code is distributed in the hope that it will be useful, but WITHOUT
   9 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  11 // version 2 for more details (a copy is included in the LICENSE file that
  12 // accompanied this code).
  13 //
  14 // You should have received a copy of the GNU General Public License version
  15 // 2 along with this work; if not, write to the Free Software Foundation,
  16 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  17 //
  18 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  19 // or visit www.oracle.com if you need additional information or have any
  20 // questions.
  21 
  22 #include "asm/assembler.hpp"
  23 #include "asm/assembler.inline.hpp"
  24 #include "runtime/stubRoutines.hpp"
  25 #include "macroAssembler_ppc.hpp"
  26 
  27 /**********************************************************************
  28  * SHA 256
  29  *********************************************************************/
  30 
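// Rotate the four 32-bit lanes of src by 12, 8 and 4 bytes into dst1, dst2
// and dst3 respectively, so that each lane of a packed k+w (or hash state)
// vector can be handed to the per-round code one at a time.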
  31 void MacroAssembler::sha256_deque(const VectorRegister src,
  32                                   const VectorRegister dst1,
  33                                   const VectorRegister dst2,
  34                                   const VectorRegister dst3) {
  35   vsldoi (dst1, src, src, 12);
  36   vsldoi (dst2, src, src, 8);
  37   vsldoi (dst3, src, src, 4);
  38 }
  39 
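// One SHA-256 round. Scalar reference (FIPS 180-4):
//   T1 = h + Sigma1(e) + Ch(e,f,g) + (K[t] + W[t])
//   T2 = Sigma0(a) + Maj(a,b,c)
//   d  = d + T1;   h = T1 + T2
// kpw already holds K[t] + W[t]. Instead of moving the eight working
// variables, the a..h -> hs[] mapping is rotated through h_cnt.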
  40 void MacroAssembler::sha256_round(const VectorRegister* hs,
  41                                   const int total_hs,
  42                                   int& h_cnt,
  43                                   const VectorRegister kpw) {
  44   // convenience registers: cycle from 0-7 downwards
  45   const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  46   const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  47   const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  48   const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  49   const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  50   const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  51   const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  52   const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  53   // temporaries
  54   VectorRegister ch  = VR0;
  55   VectorRegister maj = VR1;
  56   VectorRegister bsa = VR2;
  57   VectorRegister bse = VR3;
  58   VectorRegister vt0 = VR4;
  59   VectorRegister vt1 = VR5;
  60   VectorRegister vt2 = VR6;
  61   VectorRegister vt3 = VR7;
  62 
  63   vsel       (ch,  g,   f, e);
  64   vxor       (maj, a,   b);
  65   vshasigmaw (bse, e,   1, 0xf);
  66   vadduwm    (vt2, ch,  kpw);
  67   vadduwm    (vt1, h,   bse);
  68   vsel       (maj, b,   c, maj);
  69   vadduwm    (vt3, vt1, vt2);
  70   vshasigmaw (bsa, a,   1, 0);
  71   vadduwm    (vt0, bsa, maj);
  72 
  73   vadduwm    (d,   d,   vt3);
  74   vadduwm    (h,   vt3, vt0);
  75 
  // advance the a..h register mapping for the next round
  77   h_cnt++;
  78 }
  79 
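// Load the eight 32-bit state words from hptr into two vectors: hptr[0..3]
// into 'a' and hptr[4..7] into 'e'. Unaligned state pointers are handled
// with lvsr/vperm.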
  80 void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
  81                                        const VectorRegister e,
  82                                        const Register hptr) {
  83   // temporaries
  84   Register tmp = R8;
  85   VectorRegister vt0 = VR0;
  86   VectorRegister vRb = VR6;
  87   // labels
  Label sha256_aligned, sha256_load_end;
  89 
  90   andi_  (tmp,  hptr, 0xf);
  91   addi   (tmp,  hptr, 16);
  92   beq    (CCR0, sha256_aligned);
  93 
  94   // handle unaligned accesses
  95   lvx    (a,    hptr);
  96   lvsr   (vRb,  hptr);
  97 
  98   lvx    (e,    tmp);
  99   addi   (tmp,  tmp,  16);
 100 #if defined(VM_LITTLE_ENDIAN)
 101   vperm  (a,    e,    a, vRb);
 102 #else
 103   vperm  (a,    a,    e, vRb);
 104 #endif
 105 
 106   lvx    (vt0,  tmp);
 107 #if defined(VM_LITTLE_ENDIAN)
 108   vperm  (e,    vt0,  e, vRb);
 109 #else
 110   vperm  (e,    e,  vt0, vRb);
 111 #endif
 112   b      (sha256_load_end);
 113 
 114   // aligned accesses
 115   bind(sha256_aligned);
 116   lvx    (a,    hptr);
 117   addi   (tmp,  hptr, 16);
 118   lvx    (e,    tmp);
 119 
 120   bind(sha256_load_end);
 121 }
 122 
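// Load the 64-byte message block from buf_in into ws[] (handling unaligned
// buffers), byte-swap the 32-bit words on little-endian, then load the first
// 16 round constants from k and leave k[t] + w[t] in kpws[].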
 123 void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
 124                                               const VectorRegister* ws,
 125                                               const int total_ws,
 126                                               const Register k,
 127                                               const VectorRegister* kpws,
 128                                               const int total_kpws) {
 129   Label w_aligned, after_w_load;
 130 
 131   Register tmp       = R8;
 132   VectorRegister vt0 = VR0;
 133   VectorRegister vt1 = VR1;
 134   VectorRegister vRb = VR6;
 135 
 136   andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // buf_in is 16-byte aligned (low nibble 0x0, not 0x8)
 138 
 139   // deal with unaligned addresses
 140   lvx    (ws[0], buf_in);
 141   addi   (buf_in, buf_in, 16);
 142   lvsl   (vRb, buf_in);
 143 
 144   for (int n = 1; n < total_ws; n++) {
 145     VectorRegister w_cur = ws[n];
 146     VectorRegister w_prev = ws[n-1];
 147 
 148     lvx  (w_cur, buf_in);
 149     addi (buf_in, buf_in, 16);
 150 #if defined(VM_LITTLE_ENDIAN)
 151     vperm(w_prev, w_cur, w_prev, vRb);
 152 #else
 153     vperm(w_prev, w_prev, w_cur, vRb);
 154 #endif
 155   }
 156 
 157   lvx    (vt0, buf_in);
 158 #if defined(VM_LITTLE_ENDIAN)
 159   vperm  (ws[total_ws-1], vt0, ws[total_ws-1], vRb);
 160 #else
 161   vperm  (ws[total_ws-1], ws[total_ws-1], vt0, vRb);
 162 #endif
 163 
 164   b      (after_w_load);
 165 
 166   bind(w_aligned);
 167 
 168   // deal with aligned addresses
 169   for (int n = 0; n < total_ws; n++) {
 170     VectorRegister w = ws[n];
 171 
 172     lvx  (w, buf_in);
 173     addi (buf_in, buf_in, 16);
 174   }
 175 
 176   bind(after_w_load);
 177 
 178 #if defined(VM_LITTLE_ENDIAN)
 179   // Byte swapping within int values
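  // Build a vperm mask that reverses the bytes of each 32-bit word:
  // lvsl with offset 8 yields {8,9,...,23}; XORing each byte with 0x0b gives
  // {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}.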
 180   li       (tmp, 8);
 181   lvsl     (vt0, tmp);
 182   vspltisb (vt1, 0xb);
 183   vxor     (vt1, vt0, vt1);
 184   for (int n = 0; n < total_ws; n++) {
 185     VectorRegister w = ws[n];
 186     vperm  (w,   w,   w,   vt1);
 187   }
 188 #endif
 189 
  // Load k, which is always 16-byte aligned
 191   lvx    (kpws[0], k);
 192   addi   (tmp, k, 16);
 193   for (int n = 1; n < total_kpws-1; n++) {
 194     VectorRegister kpw = kpws[n];
 195 
 196     lvx  (kpw, tmp);
 197     addi (tmp, tmp, 16);
 198   }
 199   lvx  (kpws[total_kpws-1], tmp);
 200 
 201   // Add w to K
 202   assert(total_ws == total_kpws, "Redesign the loop below");
 203   for (int n = 0; n < total_kpws; n++) {
 204     VectorRegister kpw = kpws[n];
 205     VectorRegister w   = ws[n];
 206 
 207     vadduwm  (kpw, kpw, w);
 208   }
 209 }
 210 
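// Compute the next four message schedule words. Scalar reference
// (FIPS 180-4):
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// On exit w0..w3 hold the 16 most recent w values and kpw0..kpw3 hold the
// four new k[j] + w[j] values.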
 211 void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
 212                                     const VectorRegister w1,
 213                                     const VectorRegister w2,
 214                                     const VectorRegister w3,
 215                                     const VectorRegister kpw0,
 216                                     const VectorRegister kpw1,
 217                                     const VectorRegister kpw2,
 218                                     const VectorRegister kpw3,
 219                                     const Register j,
 220                                     const Register k) {
 221   // Temporaries
 222   const VectorRegister  vt0  = VR0;
 223   const VectorRegister  vt1  = VR1;
 224   const VectorSRegister vsrt1 = vt1->to_vsr();
 225   const VectorRegister  vt2  = VR2;
 226   const VectorRegister  vt3  = VR3;
 227   const VectorSRegister vst3 = vt3->to_vsr();
 228   const VectorRegister  vt4  = VR4;
 229 
  // load k[j]
 231   lvx        (vt0, j,   k);
 232 
 233   // advance j
 234   addi       (j,   j,   16); // 16 bytes were read
 235 
 236 #if defined(VM_LITTLE_ENDIAN)
 237   // b = w[j-15], w[j-14], w[j-13], w[j-12]
 238   vsldoi     (vt1, w1,  w0, 12);
 239 
 240   // c = w[j-7], w[j-6], w[j-5], w[j-4]
 241   vsldoi     (vt2, w3,  w2, 12);
 242 
 243 #else
 244   // b = w[j-15], w[j-14], w[j-13], w[j-12]
 245   vsldoi     (vt1, w0,  w1, 4);
 246 
 247   // c = w[j-7], w[j-6], w[j-5], w[j-4]
 248   vsldoi     (vt2, w2,  w3, 4);
 249 #endif
 250 
 251   // d = w[j-2], w[j-1], w[j-4], w[j-3]
 252   vsldoi     (vt3, w3,  w3, 8);
 253 
 254   // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
 255   vshasigmaw (vt1, vt1, 0,  0);
 256 
 257   // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
 258   vshasigmaw (vt3, vt3, 0,  0xf);
 259 
 260   // c = s0(w[j-15]) + w[j-7],
 261   //     s0(w[j-14]) + w[j-6],
 262   //     s0(w[j-13]) + w[j-5],
 263   //     s0(w[j-12]) + w[j-4]
 264   vadduwm    (vt2, vt1, vt2);
 265 
 266   // c = s0(w[j-15]) + w[j-7] + w[j-16],
 267   //     s0(w[j-14]) + w[j-6] + w[j-15],
 268   //     s0(w[j-13]) + w[j-5] + w[j-14],
 269   //     s0(w[j-12]) + w[j-4] + w[j-13]
 270   vadduwm    (vt2, vt2, w0);
 271 
 272   // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
 273   //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
 274   //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
 275   //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
 276   vadduwm    (vt4, vt2, vt3);
 277 
 278   // At this point, e[0] and e[1] are the correct values to be stored at w[j]
 279   // and w[j+1].
 280   // e[2] and e[3] are not considered.
  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
 282   vshasigmaw (vt1, vt4, 0,  0xf);
 283 
  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
 285 #if defined(VM_LITTLE_ENDIAN)
 286   xxmrgld    (vst3, vsrt1, vst3);
 287 #else
 288   xxmrghd    (vst3, vst3, vsrt1);
 289 #endif
 290 
 291   // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
 292   //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
 293   //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
 295   vadduwm    (vt2, vt2, vt3);
 296 
  // Update w0..w3 to hold the most recent 16 values of w.
 298   vmr        (w0,  w1);
 299   vmr        (w1,  w2);
 300   vmr        (w2,  w3);
 301   vmr        (w3,  vt2);
 302 
  // store k + w to kpw0..kpw3 (4 values at once)
 304 #if defined(VM_LITTLE_ENDIAN)
 305   vadduwm    (kpw0, vt2, vt0);
 306 
 307   vsldoi     (kpw1, kpw0, kpw0, 12);
 308   vsldoi     (kpw2, kpw0, kpw0, 8);
 309   vsldoi     (kpw3, kpw0, kpw0, 4);
 310 #else
 311   vadduwm    (kpw3, vt2, vt0);
 312 
 313   vsldoi     (kpw2, kpw3, kpw3, 12);
 314   vsldoi     (kpw1, kpw3, kpw3, 8);
 315   vsldoi     (kpw0, kpw3, kpw3, 4);
 316 #endif
 317 }
 318 
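// Add the working variables a..h into the eight 32-bit state words at hptr
// (H[i] += x[i]) and store the result back, for both aligned and unaligned
// state pointers.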
 319 void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
 320                                              const VectorRegister b_,
 321                                              const VectorRegister c,
 322                                              const VectorRegister d,
 323                                              const VectorRegister e,
 324                                              const VectorRegister f,
 325                                              const VectorRegister g,
 326                                              const VectorRegister h,
 327                                              const Register hptr) {
 328   // temporaries
 329   VectorRegister vt0  = VR0;
 330   VectorRegister vt1  = VR1;
 331   VectorRegister vt2  = VR2;
 332   VectorRegister vt3  = VR3;
 333   VectorRegister vt4  = VR4;
 334   VectorRegister vt5  = VR5;
 335   VectorRegister vaux = VR6;
 336   VectorRegister vRb  = VR6;
 337   Register tmp        = R8;
 338   Register of16       = R8;
 339   Register of32       = R9;
 340   Label state_load_aligned, after_state_load_aligned;
 341 
  // Load the hash state from hptr
 343   andi_   (tmp, hptr, 0xf);
 344   li      (of16, 16);
 345   beq     (CCR0, state_load_aligned);
 346 
 347   // handle unaligned accesses
 348   li      (of32, 32);
 349   lvx     (vt0, hptr);
 350   lvsr    (vRb, hptr);
 351 
 352   lvx     (vt5, hptr, of16);
 353 #if defined(VM_LITTLE_ENDIAN)
 354   vperm   (vt0, vt5, vt0, vRb);    // vt0 = hptr[0]..hptr[3]
 355 #else
 356   vperm   (vt0, vt0, vt5, vRb);    // vt0 = hptr[0]..hptr[3]
 357 #endif
 358 
 359   lvx     (vt1, hptr, of32);
 360 #if defined(VM_LITTLE_ENDIAN)
 361   vperm   (vt5, vt1, vt5, vRb);    // vt5 = hptr[4]..hptr[7]
 362 #else
 363   vperm   (vt5, vt5, vt1, vRb);    // vt5 = hptr[4]..hptr[7]
 364 #endif
 365   b       (after_state_load_aligned);
 366 
 367   // aligned accesses
 368   bind(state_load_aligned);
 369   lvx     (vt0, hptr);
 370   lvx     (vt5, of16, hptr);
 371 
 372   bind(after_state_load_aligned);
 373 
 374 #if defined(VM_LITTLE_ENDIAN)
 375   vmrglw  (vt1, b_, a);            // vt1 = {a, b, ?, ?}
 376   vmrglw  (vt2, d, c);             // vt2 = {c, d, ?, ?}
 377   vmrglw  (vt3, f, e);             // vt3 = {e, f, ?, ?}
 378   vmrglw  (vt4, h, g);             // vt4 = {g, h, ?, ?}
 379   xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
 380   xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
 381   vadduwm (a,   vt0, vt1);         // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
 382   vadduwm (e,   vt5, vt3);         // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
 383 
 384   // Save hptr back, works for any alignment
 385   xxswapd (vt0->to_vsr(), a->to_vsr());
 386   stxvd2x (vt0->to_vsr(), hptr);
 387   xxswapd (vt5->to_vsr(), e->to_vsr());
 388   stxvd2x (vt5->to_vsr(), of16, hptr);
 389 #else
 390   vmrglw  (vt1, a, b_);            // vt1 = {a, b, ?, ?}
 391   vmrglw  (vt2, c, d);             // vt2 = {c, d, ?, ?}
 392   vmrglw  (vt3, e, f);             // vt3 = {e, f, ?, ?}
 393   vmrglw  (vt4, g, h);             // vt4 = {g, h, ?, ?}
 394   xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
 395   xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
 396   vadduwm (d,   vt0, vt1);         // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
 397   vadduwm (h,   vt5, vt3);         // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
 398 
 399   // Save hptr back, works for any alignment
 400   stxvd2x (d->to_vsr(), hptr);
 401   stxvd2x (h->to_vsr(), of16, hptr);
 402 #endif
 403 }
 404 
 405 
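// SHA-256 compression: processes one 64-byte block, or loops over blocks
// until 'limit' is reached when multi_block is true.
//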
//   R3_ARG1   - byte[]  Input buffer (already padded), big-endian byte order
//   R4_ARG2   - int[]   SHA.state (initially the SHA-256 initial hash values,
//                       derived from the square roots of the first eight primes)
 408 //   R5_ARG3   - int     offset
 409 //   R6_ARG4   - int     limit
 410 //
 411 //   Internal Register usage:
 412 //   R7        - k
 413 //   R8        - tmp | j | of16
 414 //   R9        - of32
 415 //   VR0-VR8   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
 416 //   VR9-VR16  - a-h
 417 //   VR17-VR20 - w0-w3
 418 //   VR21-VR23 - vRb | vaux0-vaux2
 419 //   VR24-VR27 - kpw0-kpw3
 420 void MacroAssembler::sha256(bool multi_block) {
 421   static const ssize_t base_size = sizeof(uint32_t);
 422   static const ssize_t buf_size = 64;
 423   static uint32_t waux[buf_size / base_size] __attribute((aligned (16)));
 424   static const uint32_t round_consts[64] __attribute((aligned (16))) = {
 425     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 426     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 427     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 428     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 429     0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 430     0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 431     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 432     0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 433     0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 434     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 435     0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 436     0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 437     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 438     0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 439     0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 440     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 441   };
 442   static const uint8_t w_size = sizeof(round_consts)/sizeof(uint32_t);
 443 
 444   Register buf_in = R3_ARG1;
 445   Register state  = R4_ARG2;
 446   Register ofs    = R5_ARG3;
 447   Register limit  = R6_ARG4;
 448 
 449   Label sha_loop, bsw_loop, core_loop;
 450 
 451   // Save non-volatile vector registers in the red zone
 452   static const VectorRegister nv[] = {
 453     VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
 454   };
 455   static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
 456 
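  // Each register goes to a negative offset from R1 (SP), i.e. into the
  // protected zone below the stack pointer.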
 457   for (int c = 0; c < nv_size; c++) {
 458     Register tmp = R8;
 459     li  (tmp, (c - (nv_size)) * 16);
 460     stvx(nv[c], tmp, R1);
 461   }
 462 
 463   // Load hash state to registers
 464   VectorRegister a = VR9;
 465   VectorRegister b = VR10;
 466   VectorRegister c = VR11;
 467   VectorRegister d = VR12;
 468   VectorRegister e = VR13;
 469   VectorRegister f = VR14;
 470   VectorRegister g = VR15;
 471   VectorRegister h = VR16;
 472   static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
 473   static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
 474   // counter for cycling through hs vector to avoid register moves between iterations
 475   int h_cnt = 0;
 476 
 477   // Load a-h registers from the memory pointed by state
 478 #if defined(VM_LITTLE_ENDIAN)
 479   sha256_load_h_vec(a, e, state);
 480 #else
 481   sha256_load_h_vec(d, h, state);
 482 #endif
 483 
  // Keep k loaded across multi-block loop iterations
 485   Register k = R7;
 486   load_const_optimized(k, const_cast<uint32_t *>(round_consts), R0);
 487 
 488   // Avoiding redundant loads
 489   if (multi_block) {
 490     align(OptoLoopAlignment);
 491   }
 492   bind(sha_loop);
 493 #if defined(VM_LITTLE_ENDIAN)
 494   sha256_deque(a, b, c, d);
 495   sha256_deque(e, f, g, h);
 496 #else
 497   sha256_deque(d, c, b, a);
 498   sha256_deque(h, g, f, e);
 499 #endif
 500 
  // Load the first 16 elements of w before entering the core loop.
  // The order of the int values is endianness-specific.
 503   VectorRegister w0 = VR17;
 504   VectorRegister w1 = VR18;
 505   VectorRegister w2 = VR19;
 506   VectorRegister w3 = VR20;
 507   static const VectorRegister ws[] = {w0, w1, w2, w3};
 508   static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
 509 
 510   VectorRegister kpw0 = VR24;
 511   VectorRegister kpw1 = VR25;
 512   VectorRegister kpw2 = VR26;
 513   VectorRegister kpw3 = VR27;
 514   static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
 515   static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);
 516 
 517   sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);
 518 
 519   // Cycle through the first 16 elements
 520   assert(total_ws == total_kpws, "Redesign the loop below");
 521   for (int n = 0; n < total_ws; n++) {
 522     VectorRegister vaux0 = VR21;
 523     VectorRegister vaux1 = VR22;
 524     VectorRegister vaux2 = VR23;
 525 
 526     sha256_deque(kpws[n], vaux0, vaux1, vaux2);
 527 
 528 #if defined(VM_LITTLE_ENDIAN)
 529     sha256_round(hs, total_hs, h_cnt, kpws[n]);
 530     sha256_round(hs, total_hs, h_cnt, vaux0);
 531     sha256_round(hs, total_hs, h_cnt, vaux1);
 532     sha256_round(hs, total_hs, h_cnt, vaux2);
 533 #else
 534     sha256_round(hs, total_hs, h_cnt, vaux2);
 535     sha256_round(hs, total_hs, h_cnt, vaux1);
 536     sha256_round(hs, total_hs, h_cnt, vaux0);
 537     sha256_round(hs, total_hs, h_cnt, kpws[n]);
 538 #endif
 539   }
 540 
 541   Register tmp = R8;
  // loop over rounds 16 to 63, 8 rounds per iteration
 543   li   (tmp, (w_size - 16) / total_hs);
 544   mtctr(tmp);
 545 
  // j is the byte offset into k, starting at k[16].
  // Whenever j is read it is advanced (e.g. inside sha256_calc_4w).
 548   Register j = R8;
 549   li   (j, 16*4);
 550 
 551   align(OptoLoopAlignment);
 552   bind(core_loop);
 553 
  // Because the a..h register mapping rotates (h_cnt), each iteration must cover a multiple of total_hs rounds
 555   for (int n = 0; n < total_hs/4; n++) {
 556     sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
 557     sha256_round(hs, total_hs, h_cnt, kpw0);
 558     sha256_round(hs, total_hs, h_cnt, kpw1);
 559     sha256_round(hs, total_hs, h_cnt, kpw2);
 560     sha256_round(hs, total_hs, h_cnt, kpw3);
 561   }
 562 
 563   bdnz   (core_loop);
 564 
 565   // Update hash state
 566   sha256_update_sha_state(a, b, c, d, e, f, g, h, state);
 567 
 568   if (multi_block) {
    // process next 512-bit block (buf_in already updated)
 570     addi(ofs, ofs, buf_size);
 571     cmpd(CCR0, ofs, limit);
 572     blt(CCR0, sha_loop);
 573 
 574     // return ofs
 575     mr(R3_ARG1, ofs);
 576   }
 577 
 578   // Restore non-volatile registers
 579   for (int c = 0; c < nv_size; c++) {
 580     Register tmp = R8;
 581     li  (tmp, (c - (nv_size)) * 16);
 582     lvx(nv[c], tmp, R1);
 583   }
 584 }
 585 
 586 
 587 /**********************************************************************
 588  * SHA 512
 589  *********************************************************************/
 590 
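// Load the 128-byte message block from buf_in into ws[], handling unaligned
// buffers with lvsl/vperm.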
 591 void MacroAssembler::sha512_load_w_vec(const Register buf_in,
 592                                        const VectorRegister* ws,
 593                                        const int total_ws) {
 594   Register tmp       = R8;
 595   VectorRegister vRb = VR8;
 596   VectorRegister aux = VR9;
 597   Label is_aligned, after_alignment;
 598 
 599   andi_  (tmp, buf_in, 0xF);
  beq    (CCR0, is_aligned); // buf_in is 16-byte aligned (low nibble 0x0, not 0x8)
 601 
 602   // deal with unaligned addresses
 603   lvx    (ws[0], buf_in);
 604   addi   (buf_in, buf_in, 16);
 605   lvsl   (vRb, buf_in);
 606 
 607   for (int n = 1; n < total_ws; n++) {
 608     VectorRegister w_cur = ws[n];
 609     VectorRegister w_prev = ws[n-1];
 610 
 611     lvx  (w_cur, buf_in);
 612     addi (buf_in, buf_in, 16);
 613 #if defined(VM_LITTLE_ENDIAN)
 614     vperm(w_prev, w_cur, w_prev, vRb);
 615 #else
 616     vperm(w_prev, w_prev, w_cur, vRb);
 617 #endif
 618   }
 619 
 620   lvx    (aux, buf_in);
 621 #if defined(VM_LITTLE_ENDIAN)
 622   vperm  (ws[total_ws-1], aux, ws[total_ws-1], vRb);
 623 #else
 624   vperm  (ws[total_ws-1], ws[total_ws-1], aux, vRb);
 625 #endif
 626 
 627   b      (after_alignment);
 628 
 629   bind(is_aligned);
 630 
 631   for (int n = 0; n < total_ws; n++) {
 632     VectorRegister w = ws[n];
 633 
 634     lvx  (w, buf_in);
 635     addi (buf_in, buf_in, 16);
 636   }
 637 
 638   bind(after_alignment);
 639 }
 640 
 641 // Update hash state
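// Adds the working variables back into the eight 64-bit state words at
// 'state' (H[i] += x[i]) and stores them, for both aligned and unaligned
// state pointers.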
 642 void MacroAssembler::sha512_update_sha_state(const Register state,
 643                                              const VectorRegister* hs,
 644                                              const int total_hs) {
 645 
 646 #if defined(VM_LITTLE_ENDIAN)
 647   int start_idx = 0;
 648 #else
 649   int start_idx = 1;
 650 #endif
 651 
 652   // load initial hash from the memory pointed by state
 653   VectorRegister ini_a = VR10;
 654   VectorRegister ini_c = VR12;
 655   VectorRegister ini_e = VR14;
 656   VectorRegister ini_g = VR16;
 657   static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
 658   static const int total_inis = sizeof(inis)/sizeof(VectorRegister);
 659 
 660   Label state_save_aligned, after_state_save_aligned;
 661 
 662   Register addr      = R7;
 663   Register tmp       = R8;
 664   VectorRegister vRb = VR8;
 665   VectorRegister aux = VR9;
 666 
 667   andi_(tmp, state, 0xf);
 668   beq(CCR0, state_save_aligned);
 669   // deal with unaligned addresses
 670 
 671   {
 672     VectorRegister a = hs[0];
 673     VectorRegister b_ = hs[1];
 674     VectorRegister c = hs[2];
 675     VectorRegister d = hs[3];
 676     VectorRegister e = hs[4];
 677     VectorRegister f = hs[5];
 678     VectorRegister g = hs[6];
 679     VectorRegister h = hs[7];
 680     lvsr   (vRb, state);
 681     lvx    (ini_a, state);
 682     addi   (addr, state, 16);
 683 
 684     lvx    (ini_c, addr);
 685     addi   (addr, addr, 16);
 686 #if defined(VM_LITTLE_ENDIAN)
 687     vperm  (ini_a, ini_c, ini_a, vRb);
 688 #else
 689     vperm  (ini_a, ini_a, ini_c, vRb);
 690 #endif
 691 
 692     lvx    (ini_e, addr);
 693     addi   (addr, addr, 16);
 694 #if defined(VM_LITTLE_ENDIAN)
 695     vperm  (ini_c, ini_e, ini_c, vRb);
 696 #else
 697     vperm  (ini_c, ini_c, ini_e, vRb);
 698 #endif
 699 
 700     lvx    (ini_g, addr);
 701     addi   (addr, addr, 16);
 702 #if defined(VM_LITTLE_ENDIAN)
 703     vperm  (ini_e, ini_g, ini_e, vRb);
 704 #else
 705     vperm  (ini_e, ini_e, ini_g, vRb);
 706 #endif
 707 
 708     lvx    (aux, addr);
 709 #if defined(VM_LITTLE_ENDIAN)
 710     vperm  (ini_g, aux, ini_g, vRb);
 711 #else
 712     vperm  (ini_g, ini_g, aux, vRb);
 713 #endif
 714 
 715 #if defined(VM_LITTLE_ENDIAN)
 716     xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
 717     xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
 718     xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
 719     xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
 720 #else
 721     xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
 722     xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
 723     xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
 724     xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
 725 #endif
 726 
 727     for (int n = start_idx; n < total_hs; n += 2) {
 728       VectorRegister h_cur = hs[n];
 729       VectorRegister ini_cur = inis[n/2];
 730 
 731       vaddudm(h_cur, ini_cur, h_cur);
 732     }
 733 
 734     for (int n = start_idx; n < total_hs; n += 2) {
 735       VectorRegister h_cur = hs[n];
 736 
 737       mfvrd  (tmp, h_cur);
 738 #if defined(VM_LITTLE_ENDIAN)
 739       std    (tmp, 8*n + 8, state);
 740 #else
 741       std    (tmp, 8*n - 8, state);
 742 #endif
 743       vsldoi (aux, h_cur, h_cur, 8);
 744       mfvrd  (tmp, aux);
 745       std    (tmp, 8*n + 0, state);
 746     }
 747 
 748     b      (after_state_save_aligned);
 749   }
 750 
 751   bind(state_save_aligned);
 752   {
 753     mr(addr, state);
 754     for (int n = 0; n < total_hs; n += 2) {
 755 #if defined(VM_LITTLE_ENDIAN)
 756       VectorRegister h_cur = hs[n];
 757       VectorRegister h_next = hs[n+1];
 758 #else
 759       VectorRegister h_cur = hs[n+1];
 760       VectorRegister h_next = hs[n];
 761 #endif
 762       VectorRegister ini_cur = inis[n/2];
 763 
 764       lvx(ini_cur, addr);
 765       addi(addr, addr, 16);
 766       xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
 767     }
 768 
 769     for (int n = start_idx; n < total_hs; n += 2) {
 770       VectorRegister h_cur = hs[n];
 771       VectorRegister ini_cur = inis[n/2];
 772 
 773       vaddudm(h_cur, ini_cur, h_cur);
 774     }
 775 
 776     mr(addr, state);
 777     for (int n = start_idx; n < total_hs; n += 2) {
 778       VectorRegister h_cur = hs[n];
 779 
 780       stvx(h_cur, addr);
 781       addi(addr, addr, 16);
 782     }
 783   }
 784 
 785   bind(after_state_save_aligned);
 786 }
 787 
 788 // Use h_cnt to cycle through hs elements but also increment it at the end
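// One SHA-512 round. Scalar reference (FIPS 180-4):
//   T1 = h + Sigma1(e) + Ch(e,f,g) + (K[t] + W[t])
//   T2 = Sigma0(a) + Maj(a,b,c)
//   d  = d + T1;   h = T1 + T2
// kpw already holds K[t] + W[t].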
 789 void MacroAssembler::sha512_round(const VectorRegister* hs,
 790                                   const int total_hs, int& h_cnt,
 791                                   const VectorRegister kpw) {
 792 
 793   // convenience registers: cycle from 0-7 downwards
 794   const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
 795   const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
 796   const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
 797   const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
 798   const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
 799   const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
 800   const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
 801   const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
 802   // temporaries
 803   const VectorRegister Ch   = VR20;
 804   const VectorRegister Maj  = VR21;
 805   const VectorRegister bsa  = VR22;
 806   const VectorRegister bse  = VR23;
 807   const VectorRegister tmp1 = VR24;
 808   const VectorRegister tmp2 = VR25;
 809 
 810   vsel      (Ch,   g,    f,   e);
 811   vxor      (Maj,  a,    b);
 812   vshasigmad(bse,  e,    1,   0xf);
 813   vaddudm   (tmp2, Ch,   kpw);
 814   vaddudm   (tmp1, h,    bse);
 815   vsel      (Maj,  b,    c,   Maj);
 816   vaddudm   (tmp1, tmp1, tmp2);
 817   vshasigmad(bsa,  a,    1,   0);
 818   vaddudm   (tmp2, bsa,  Maj);
 819   vaddudm   (d,    d,    tmp1);
 820   vaddudm   (h,    tmp1, tmp2);
 821 
  // advance the a..h register mapping for the next round
 823   h_cnt++;
 824 }
 825 
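// Compute the next two message schedule words. Scalar reference
// (FIPS 180-4):
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// On exit w0..w7 hold the 16 most recent w values and kpw0/kpw1 hold the
// two new k[j] + w[j] values.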
 826 void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
 827                                     const VectorRegister w1,
 828                                     const VectorRegister w2,
 829                                     const VectorRegister w3,
 830                                     const VectorRegister w4,
 831                                     const VectorRegister w5,
 832                                     const VectorRegister w6,
 833                                     const VectorRegister w7,
 834                                     const VectorRegister kpw0,
 835                                     const VectorRegister kpw1,
 836                                     const Register j,
 837                                     const VectorRegister vRb,
 838                                     const Register k) {
 839   // Temporaries
 840   const VectorRegister VR_a = VR20;
 841   const VectorRegister VR_b = VR21;
 842   const VectorRegister VR_c = VR22;
 843   const VectorRegister VR_d = VR23;
 844 
  // load k[j]
 846   lvx        (VR_a, j,    k);
 847   // advance j
 848   addi       (j,    j,    16); // 16 bytes were read
 849 
#if defined(VM_LITTLE_ENDIAN)
  // VR_b = w[j-15], w[j-14]
  vperm      (VR_b, w1,   w0,  vRb);
  // VR_c = w[j-7], w[j-6]
  vperm      (VR_c, w5,   w4,  vRb);
#else
  // VR_b = w[j-15], w[j-14]
  vperm      (VR_b, w0,   w1,  vRb);
  // VR_c = w[j-7], w[j-6]
  vperm      (VR_c, w4,   w5,  vRb);
#endif
 861 
  // VR_b = s0(w[j-15]) , s0(w[j-14])
  vshasigmad (VR_b, VR_b,    0,   0);
  // VR_d = s1(w[j-2]) , s1(w[j-1])
  vshasigmad (VR_d, w7,      0,   0xf);
  // VR_b = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
  vaddudm    (VR_b, VR_b, VR_c);
  // VR_d = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
  vaddudm    (VR_d, VR_d, w0);
  // VR_c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //        s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
 872   vaddudm    (VR_c, VR_d, VR_b);
  // Update w0..w7 to hold the most recent 16 values of w.
 874   vmr        (w0,   w1);
 875   vmr        (w1,   w2);
 876   vmr        (w2,   w3);
 877   vmr        (w3,   w4);
 878   vmr        (w4,   w5);
 879   vmr        (w5,   w6);
 880   vmr        (w6,   w7);
 881   vmr        (w7,   VR_c);
 882 
 883 #if defined(VM_LITTLE_ENDIAN)
 884   // store k + w to kpw0 (2 values at once)
 885   vaddudm    (kpw0, VR_c, VR_a);
 886   // kpw1 holds (k + w)[1]
 887   vsldoi     (kpw1, kpw0, kpw0, 8);
 888 #else
  // store k + w to kpw1 (2 values at once)
  vaddudm    (kpw1, VR_c, VR_a);
  // kpw0 holds (k + w)[0]
  vsldoi     (kpw0, kpw1, kpw1, 8);
 893 #endif
 894 }
 895 
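// Load the eight 64-bit state words from 'state' into the hs vectors, two
// words per vector, handling unaligned state pointers with lvsl/vperm.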
 896 void MacroAssembler::sha512_load_h_vec(const Register state,
 897                                        const VectorRegister* hs,
 898                                        const int total_hs) {
 899 #if defined(VM_LITTLE_ENDIAN)
 900   VectorRegister a   = hs[0];
 901   VectorRegister g   = hs[6];
 902   int start_idx = 0;
 903 #else
 904   VectorRegister a   = hs[1];
 905   VectorRegister g   = hs[7];
 906   int start_idx = 1;
 907 #endif
 908 
 909   Register addr      = R7;
 910   VectorRegister vRb = VR8;
 911   Register tmp       = R8;
 912   Label state_aligned, after_state_aligned;
 913 
 914   andi_(tmp, state, 0xf);
 915   beq(CCR0, state_aligned);
 916 
 917   // deal with unaligned addresses
 918   VectorRegister aux = VR9;
 919 
 920   lvx    (a,    state);
 921   addi   (addr, state, 16);
 922   lvsl   (vRb,  addr);
 923 
 924   for (int n = start_idx + 2; n < total_hs; n += 2) {
 925     VectorRegister h_cur   = hs[n];
 926     VectorRegister h_prev2 = hs[n - 2];
 927 
 928     lvx    (h_cur,   addr);
 929     addi   (addr,    addr,  16);
 930 #if defined(VM_LITTLE_ENDIAN)
 931     vperm  (h_prev2, h_cur, h_prev2, vRb);
 932 #else
 933     vperm  (h_prev2, h_prev2, h_cur, vRb);
 934 #endif
 935   }
 936   lvx    (aux, addr);
 937 #if defined(VM_LITTLE_ENDIAN)
 938   vperm  (g,   aux, g, vRb);
 939 #else
 940   vperm  (g,   g, aux, vRb);
 941 #endif
 942 
 943   b      (after_state_aligned);
 944 
 945   bind(state_aligned);
 946 
 947   // deal with aligned addresses
 948   mr(addr, state);
 949   for (int n = start_idx; n < total_hs; n += 2) {
 950     VectorRegister h_cur = hs[n];
 951 
 952     lvx    (h_cur, addr);
 953     addi   (addr, addr, 16);
 954   }
 955 
 956   bind(after_state_aligned);
 957 }
 958 
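// SHA-512 compression: processes one 128-byte block, or loops over blocks
// until 'limit' is reached when multi_block is true.
//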
//   R3_ARG1   - byte[]  Input buffer (already padded), big-endian byte order
//   R4_ARG2   - long[]  SHA.state (initially the SHA-512 initial hash values,
//                       derived from the square roots of the first eight primes)
 961 //   R5_ARG3   - int     offset
 962 //   R6_ARG4   - int     limit
 963 //
 964 //   Internal Register usage:
 965 //   R7 R8 R9  - volatile temporaries
 966 //   VR0-VR7   - a-h
 967 //   VR8       - vRb
 968 //   VR9       - aux (highly volatile, use with care)
 969 //   VR10-VR17 - w0-w7 | ini_a-ini_h
 970 //   VR18      - vsp16 | kplusw0
 971 //   VR19      - vsp32 | kplusw1
 972 //   VR20-VR25 - sha512_calc_2w and sha512_round temporaries
 973 void MacroAssembler::sha512(bool multi_block) {
 974   static const ssize_t base_size = sizeof(uint64_t);
 975   static const ssize_t buf_size = 128;
 976   static uint64_t waux[buf_size / base_size] __attribute((aligned (16)));
 977   static const uint64_t round_consts[80] __attribute((aligned (16))) = {
 978     0x428a2f98d728ae22, 0x7137449123ef65cd,
 979     0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
 980     0x3956c25bf348b538, 0x59f111f1b605d019,
 981     0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
 982     0xd807aa98a3030242, 0x12835b0145706fbe,
 983     0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
 984     0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
 985     0x9bdc06a725c71235, 0xc19bf174cf692694,
 986     0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
 987     0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
 988     0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
 989     0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
 990     0x983e5152ee66dfab, 0xa831c66d2db43210,
 991     0xb00327c898fb213f, 0xbf597fc7beef0ee4,
 992     0xc6e00bf33da88fc2, 0xd5a79147930aa725,
 993     0x06ca6351e003826f, 0x142929670a0e6e70,
 994     0x27b70a8546d22ffc, 0x2e1b21385c26c926,
 995     0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
 996     0x650a73548baf63de, 0x766a0abb3c77b2a8,
 997     0x81c2c92e47edaee6, 0x92722c851482353b,
 998     0xa2bfe8a14cf10364, 0xa81a664bbc423001,
 999     0xc24b8b70d0f89791, 0xc76c51a30654be30,
1000     0xd192e819d6ef5218, 0xd69906245565a910,
1001     0xf40e35855771202a, 0x106aa07032bbd1b8,
1002     0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
1003     0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
1004     0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
1005     0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
1006     0x748f82ee5defb2fc, 0x78a5636f43172f60,
1007     0x84c87814a1f0ab72, 0x8cc702081a6439ec,
1008     0x90befffa23631e28, 0xa4506cebde82bde9,
1009     0xbef9a3f7b2c67915, 0xc67178f2e372532b,
1010     0xca273eceea26619c, 0xd186b8c721c0c207,
1011     0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
1012     0x06f067aa72176fba, 0x0a637dc5a2c898a6,
1013     0x113f9804bef90dae, 0x1b710b35131c471b,
1014     0x28db77f523047d84, 0x32caab7b40c72493,
1015     0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
1016     0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
1017     0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
1018   };
1019   static const uint8_t w_size = sizeof(round_consts)/sizeof(uint64_t);
1020 
1021   Register buf_in = R3_ARG1;
1022   Register state  = R4_ARG2;
1023   Register ofs    = R5_ARG3;
1024   Register limit  = R6_ARG4;
1025 
1026   Label sha_loop, bsw_loop, core_loop;
1027 
1028   // Save non-volatile vector registers in the red zone
1029   static const VectorRegister nv[] = {
1030     VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
1031   };
1032   static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
1033 
1034   for (int c = 0; c < nv_size; c++) {
1035     Register idx = R7;
1036     li  (idx, (c - (nv_size)) * 16);
1037     stvx(nv[c], idx, R1);
1038   }
1039 
1040   // Load hash state to registers
1041   VectorRegister a = VR0;
1042   VectorRegister b = VR1;
1043   VectorRegister c = VR2;
1044   VectorRegister d = VR3;
1045   VectorRegister e = VR4;
1046   VectorRegister f = VR5;
1047   VectorRegister g = VR6;
1048   VectorRegister h = VR7;
1049   static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
1050   static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
1051   // counter for cycling through hs vector to avoid register moves between iterations
1052   int h_cnt = 0;
1053 
1054   // Load a-h registers from the memory pointed by state
1055   sha512_load_h_vec(state, hs, total_hs);
1056 
1057   if (multi_block) {
1058     align(OptoLoopAlignment);
1059   }
1060   bind(sha_loop);
1061 
1062   for (int n = 0; n < total_hs; n += 2) {
1063 #if defined(VM_LITTLE_ENDIAN)
1064     VectorRegister h_cur = hs[n];
1065     VectorRegister h_next = hs[n + 1];
1066 #else
1067     VectorRegister h_cur = hs[n + 1];
1068     VectorRegister h_next = hs[n];
1069 #endif
1070     vsldoi (h_next, h_cur, h_cur, 8);
1071   }
1072 
1073   Register k = R9;
1074   load_const_optimized(k, const_cast<uint64_t *>(round_consts), R0);
1075 
  // Load the first 16 elements of w before entering the core loop.
  // The order of the long values is endianness-specific.
1078   VectorRegister w0 = VR10;
1079   VectorRegister w1 = VR11;
1080   VectorRegister w2 = VR12;
1081   VectorRegister w3 = VR13;
1082   VectorRegister w4 = VR14;
1083   VectorRegister w5 = VR15;
1084   VectorRegister w6 = VR16;
1085   VectorRegister w7 = VR17;
1086   static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
1087   static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
1088 
  // Load the 16 message doublewords into w0..w7 (handles unaligned buf_in)
1090   sha512_load_w_vec(buf_in, ws, total_ws);
1091 
1092 #if defined(VM_LITTLE_ENDIAN)
1093   VectorRegister vsp16 = VR18;
1094   VectorRegister vsp32 = VR19;
1095   VectorRegister shiftarg = VR9;
1096 
1097   vspltisw(vsp16,    8);
1098   vspltisw(shiftarg, 1);
1099   vsl     (vsp16,    vsp16, shiftarg);
1100   vsl     (vsp32,    vsp16, shiftarg);
1101 
1102   VectorRegister vsp8 = VR9;
1103   vspltish(vsp8,     8);
1104 
1105   // Convert input from Big Endian to Little Endian
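  // Each 64-bit doubleword is byte-reversed by rotating halfwords by 8,
  // words by 16 and doublewords by 32 bits.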
1106   for (int c = 0; c < total_ws; c++) {
1107     VectorRegister w = ws[c];
1108     vrlh  (w, w, vsp8);
1109   }
1110   for (int c = 0; c < total_ws; c++) {
1111     VectorRegister w = ws[c];
1112     vrlw  (w, w, vsp16);
1113   }
1114   for (int c = 0; c < total_ws; c++) {
1115     VectorRegister w = ws[c];
1116     vrld  (w, w, vsp32);
1117   }
1118 #endif
1119 
1120   Register Rb        = R10;
1121   VectorRegister vRb = VR8;
1122   li      (Rb, 8);
1123   lvsl    (vRb, Rb);
1124 
1125   VectorRegister kplusw0 = VR18;
1126   VectorRegister kplusw1 = VR19;
1127 
1128   Register addr      = R7;
1129   mr      (addr, k);
1130 
1131   for (int n = 0; n < total_ws; n++) {
1132     VectorRegister w = ws[n];
1133 
1134     lvx    (kplusw0, addr);
1135     addi   (addr, addr, 16);
1136 #if defined(VM_LITTLE_ENDIAN)
1137     vaddudm(kplusw0, kplusw0, w);
1138     vsldoi (kplusw1, kplusw0, kplusw0, 8);
1139 #else
1140     vaddudm(kplusw1, kplusw0, w);
1141     vsldoi (kplusw0, kplusw1, kplusw1, 8);
1142 #endif
1143 
1144     sha512_round(hs, total_hs, h_cnt, kplusw0);
1145     sha512_round(hs, total_hs, h_cnt, kplusw1);
1146   }
1147 
1148   Register tmp       = R8;
1149   li    (tmp, (w_size-16)/total_hs);
1150   mtctr (tmp);
  // j is the byte offset into k, starting at k[16].
  // Whenever j is read it is advanced (e.g. inside sha512_calc_2w).
1153   Register j = tmp;
1154   li     (j, 8*16);
1155 
1156   align(OptoLoopAlignment);
1157   bind(core_loop);
1158 
  // Because the a..h register mapping rotates (h_cnt), each iteration must cover a multiple of total_hs rounds
1160   for (int n = 0; n < total_hs/2; n++) {
1161     sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
1162     sha512_round(hs, total_hs, h_cnt, kplusw0);
1163     sha512_round(hs, total_hs, h_cnt, kplusw1);
1164   }
1165 
1166   bdnz   (core_loop);
1167 
1168   sha512_update_sha_state(state, hs, total_hs);
1169 
1170   if (multi_block) {
1171     // process next 1024 bit block (buf_in already updated)
1172     addi(ofs, ofs, buf_size);
1173     cmpd(CCR0, ofs, limit);
1174     blt(CCR0, sha_loop);
1175 
1176     // return ofs
1177     mr(R3_ARG1, ofs);
1178   }
1179 
1180   // Restore non-volatile registers
1181   for (int c = 0; c < nv_size; c++) {
1182     Register idx = R7;
1183     li  (idx, (c - (nv_size)) * 16);
1184     lvx(nv[c], idx, R1);
1185   }
1186 }