// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
21 22 #include "asm/assembler.hpp" 23 #include "asm/assembler.inline.hpp" 24 #include "runtime/stubRoutines.hpp" 25 #include "macroAssembler_ppc.hpp" 26 27 /********************************************************************** 28 * SHA 256 29 *********************************************************************/ 30 31 void MacroAssembler::sha256_deque(const VectorRegister src, 32 const VectorRegister dst1, 33 const VectorRegister dst2, 34 const VectorRegister dst3) { 35 vsldoi (dst1, src, src, 12); 36 vsldoi (dst2, src, src, 8); 37 vsldoi (dst3, src, src, 4); 38 } 39 40 void MacroAssembler::sha256_round(const VectorRegister* hs, 41 const int total_hs, 42 int& h_cnt, 43 const VectorRegister kpw) { 44 // convenience registers: cycle from 0-7 downwards 45 const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs]; 46 const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs]; 47 const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs]; 48 const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs]; 49 const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs]; 50 const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs]; 51 const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs]; 52 const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs]; 53 // temporaries 54 VectorRegister ch = VR0; 55 VectorRegister maj = VR1; 56 VectorRegister bsa = VR2; 57 VectorRegister bse = VR3; 58 VectorRegister vt0 = VR4; 59 VectorRegister vt1 = VR5; 60 VectorRegister vt2 = VR6; 61 VectorRegister vt3 = VR7; 62 63 vsel (ch, g, f, e); 64 vxor (maj, a, b); 65 vshasigmaw (bse, e, 1, 0xf); 66 vadduwm (vt2, ch, kpw); 67 vadduwm (vt1, h, bse); 68 vsel (maj, b, c, maj); 69 vadduwm (vt3, vt1, vt2); 70 vshasigmaw (bsa, a, 1, 0); 71 vadduwm (vt0, bsa, maj); 72 73 vadduwm (d, d, vt3); 74 vadduwm (h, vt3, vt0); 75 76 // advance vector pointer to the next 
iteration 77 h_cnt++; 78 } 79 80 void MacroAssembler::sha256_load_h_vec(const VectorRegister a, 81 const VectorRegister e, 82 const Register hptr) { 83 // temporaries 84 Register tmp = R8; 85 VectorRegister vt0 = VR0; 86 VectorRegister vRb = VR6; 87 // labels 88 Label sha256_aligned, sha256_load_end;; 89 90 andi_ (tmp, hptr, 0xf); 91 addi (tmp, hptr, 16); 92 beq (CCR0, sha256_aligned); 93 94 // handle unaligned accesses 95 lvx (a, hptr); 96 lvsr (vRb, hptr); 97 98 lvx (e, tmp); 99 addi (tmp, tmp, 16); 100 #if defined(VM_LITTLE_ENDIAN) 101 vperm (a, e, a, vRb); 102 #else 103 vperm (a, a, e, vRb); 104 #endif 105 106 lvx (vt0, tmp); 107 #if defined(VM_LITTLE_ENDIAN) 108 vperm (e, vt0, e, vRb); 109 #else 110 vperm (e, e, vt0, vRb); 111 #endif 112 b (sha256_load_end); 113 114 // aligned accesses 115 bind(sha256_aligned); 116 lvx (a, hptr); 117 addi (tmp, hptr, 16); 118 lvx (e, tmp); 119 120 bind(sha256_load_end); 121 } 122 123 void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in, 124 const VectorRegister* ws, 125 const int total_ws, 126 const Register k, 127 const VectorRegister* kpws, 128 const int total_kpws) { 129 Label w_aligned, after_w_load; 130 131 Register tmp = R8; 132 VectorRegister vt0 = VR0; 133 VectorRegister vt1 = VR1; 134 VectorRegister vRb = VR6; 135 136 andi_ (tmp, buf_in, 0xF); 137 beq (CCR0, w_aligned); // address ends with 0x0, not 0x8 138 139 // deal with unaligned addresses 140 lvx (ws[0], buf_in); 141 addi (buf_in, buf_in, 16); 142 lvsl (vRb, buf_in); 143 144 for (int n = 1; n < total_ws; n++) { 145 VectorRegister w_cur = ws[n]; 146 VectorRegister w_prev = ws[n-1]; 147 148 lvx (w_cur, buf_in); 149 addi (buf_in, buf_in, 16); 150 #if defined(VM_LITTLE_ENDIAN) 151 vperm(w_prev, w_cur, w_prev, vRb); 152 #else 153 vperm(w_prev, w_prev, w_cur, vRb); 154 #endif 155 } 156 157 lvx (vt0, buf_in); 158 #if defined(VM_LITTLE_ENDIAN) 159 vperm (ws[total_ws-1], vt0, ws[total_ws-1], vRb); 160 #else 161 vperm (ws[total_ws-1], ws[total_ws-1], vt0, 
vRb); 162 #endif 163 164 b (after_w_load); 165 166 bind(w_aligned); 167 168 // deal with aligned addresses 169 for (int n = 0; n < total_ws; n++) { 170 VectorRegister w = ws[n]; 171 172 lvx (w, buf_in); 173 addi (buf_in, buf_in, 16); 174 } 175 176 bind(after_w_load); 177 178 #if defined(VM_LITTLE_ENDIAN) 179 // Byte swapping within int values 180 li (tmp, 8); 181 lvsl (vt0, tmp); 182 vspltisb (vt1, 0xb); 183 vxor (vt1, vt0, vt1); 184 for (int n = 0; n < total_ws; n++) { 185 VectorRegister w = ws[n]; 186 vperm (w, w, w, vt1); 187 } 188 #endif 189 190 // Loading k, which is always aligned to 16-bytes 191 lvx (kpws[0], k); 192 addi (tmp, k, 16); 193 for (int n = 1; n < total_kpws-1; n++) { 194 VectorRegister kpw = kpws[n]; 195 196 lvx (kpw, tmp); 197 addi (tmp, tmp, 16); 198 } 199 lvx (kpws[total_kpws-1], tmp); 200 201 // Add w to K 202 assert(total_ws == total_kpws, "Redesign the loop below"); 203 for (int n = 0; n < total_kpws; n++) { 204 VectorRegister kpw = kpws[n]; 205 VectorRegister w = ws[n]; 206 207 vadduwm (kpw, kpw, w); 208 } 209 } 210 211 void MacroAssembler::sha256_calc_4w(const VectorRegister w0, 212 const VectorRegister w1, 213 const VectorRegister w2, 214 const VectorRegister w3, 215 const VectorRegister kpw0, 216 const VectorRegister kpw1, 217 const VectorRegister kpw2, 218 const VectorRegister kpw3, 219 const Register j, 220 const Register k) { 221 // Temporaries 222 const VectorRegister vt0 = VR0; 223 const VectorRegister vt1 = VR1; 224 const VectorSRegister vsrt1 = vt1->to_vsr(); 225 const VectorRegister vt2 = VR2; 226 const VectorRegister vt3 = VR3; 227 const VectorSRegister vst3 = vt3->to_vsr(); 228 const VectorRegister vt4 = VR4; 229 230 // load to k[j] 231 lvx (vt0, j, k); 232 233 // advance j 234 addi (j, j, 16); // 16 bytes were read 235 236 #if defined(VM_LITTLE_ENDIAN) 237 // b = w[j-15], w[j-14], w[j-13], w[j-12] 238 vsldoi (vt1, w1, w0, 12); 239 240 // c = w[j-7], w[j-6], w[j-5], w[j-4] 241 vsldoi (vt2, w3, w2, 12); 242 243 #else 244 // b 
= w[j-15], w[j-14], w[j-13], w[j-12] 245 vsldoi (vt1, w0, w1, 4); 246 247 // c = w[j-7], w[j-6], w[j-5], w[j-4] 248 vsldoi (vt2, w2, w3, 4); 249 #endif 250 251 // d = w[j-2], w[j-1], w[j-4], w[j-3] 252 vsldoi (vt3, w3, w3, 8); 253 254 // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12]) 255 vshasigmaw (vt1, vt1, 0, 0); 256 257 // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3]) 258 vshasigmaw (vt3, vt3, 0, 0xf); 259 260 // c = s0(w[j-15]) + w[j-7], 261 // s0(w[j-14]) + w[j-6], 262 // s0(w[j-13]) + w[j-5], 263 // s0(w[j-12]) + w[j-4] 264 vadduwm (vt2, vt1, vt2); 265 266 // c = s0(w[j-15]) + w[j-7] + w[j-16], 267 // s0(w[j-14]) + w[j-6] + w[j-15], 268 // s0(w[j-13]) + w[j-5] + w[j-14], 269 // s0(w[j-12]) + w[j-4] + w[j-13] 270 vadduwm (vt2, vt2, w0); 271 272 // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] 273 // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] 274 // s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED 275 // s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3]) // UNDEFINED 276 vadduwm (vt4, vt2, vt3); 277 278 // At this point, e[0] and e[1] are the correct values to be stored at w[j] 279 // and w[j+1]. 280 // e[2] and e[3] are not considered. 281 // b = s1(w[j]) , s1(s(w[j+1]) , UNDEFINED , UNDEFINED 282 vshasigmaw (vt1, vt4, 0, 0xf); 283 284 // v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1]) 285 #if defined(VM_LITTLE_ENDIAN) 286 xxmrgld (vst3, vsrt1, vst3); 287 #else 288 xxmrghd (vst3, vst3, vsrt1); 289 #endif 290 291 // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] 292 // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] 293 // s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]), // w[j+2] 294 // s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1]) // w[j+4] 295 vadduwm (vt2, vt2, vt3); 296 297 // Updating w0 to w3 to hold the new previous 16 values from w. 
298 vmr (w0, w1); 299 vmr (w1, w2); 300 vmr (w2, w3); 301 vmr (w3, vt2); 302 303 // store k + w to v9 (4 values at once) 304 #if defined(VM_LITTLE_ENDIAN) 305 vadduwm (kpw0, vt2, vt0); 306 307 vsldoi (kpw1, kpw0, kpw0, 12); 308 vsldoi (kpw2, kpw0, kpw0, 8); 309 vsldoi (kpw3, kpw0, kpw0, 4); 310 #else 311 vadduwm (kpw3, vt2, vt0); 312 313 vsldoi (kpw2, kpw3, kpw3, 12); 314 vsldoi (kpw1, kpw3, kpw3, 8); 315 vsldoi (kpw0, kpw3, kpw3, 4); 316 #endif 317 } 318 319 void MacroAssembler::sha256_update_sha_state(const VectorRegister a, 320 const VectorRegister b_, 321 const VectorRegister c, 322 const VectorRegister d, 323 const VectorRegister e, 324 const VectorRegister f, 325 const VectorRegister g, 326 const VectorRegister h, 327 const Register hptr) { 328 // temporaries 329 VectorRegister vt0 = VR0; 330 VectorRegister vt1 = VR1; 331 VectorRegister vt2 = VR2; 332 VectorRegister vt3 = VR3; 333 VectorRegister vt4 = VR4; 334 VectorRegister vt5 = VR5; 335 VectorRegister vaux = VR6; 336 VectorRegister vRb = VR6; 337 Register tmp = R8; 338 Register of16 = R8; 339 Register of32 = R9; 340 Label state_load_aligned, after_state_load_aligned; 341 342 // Load hptr 343 andi_ (tmp, hptr, 0xf); 344 li (of16, 16); 345 beq (CCR0, state_load_aligned); 346 347 // handle unaligned accesses 348 li (of32, 32); 349 lvx (vt0, hptr); 350 lvsr (vRb, hptr); 351 352 lvx (vt5, hptr, of16); 353 #if defined(VM_LITTLE_ENDIAN) 354 vperm (vt0, vt5, vt0, vRb); // vt0 = hptr[0]..hptr[3] 355 #else 356 vperm (vt0, vt0, vt5, vRb); // vt0 = hptr[0]..hptr[3] 357 #endif 358 359 lvx (vt1, hptr, of32); 360 #if defined(VM_LITTLE_ENDIAN) 361 vperm (vt5, vt1, vt5, vRb); // vt5 = hptr[4]..hptr[7] 362 #else 363 vperm (vt5, vt5, vt1, vRb); // vt5 = hptr[4]..hptr[7] 364 #endif 365 b (after_state_load_aligned); 366 367 // aligned accesses 368 bind(state_load_aligned); 369 lvx (vt0, hptr); 370 lvx (vt5, of16, hptr); 371 372 bind(after_state_load_aligned); 373 374 #if defined(VM_LITTLE_ENDIAN) 375 vmrglw (vt1, b_, a); // 
vt1 = {a, b, ?, ?} 376 vmrglw (vt2, d, c); // vt2 = {c, d, ?, ?} 377 vmrglw (vt3, f, e); // vt3 = {e, f, ?, ?} 378 vmrglw (vt4, h, g); // vt4 = {g, h, ?, ?} 379 xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d} 380 xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h} 381 vadduwm (a, vt0, vt1); // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]} 382 vadduwm (e, vt5, vt3); // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]} 383 384 // Save hptr back, works for any alignment 385 xxswapd (vt0->to_vsr(), a->to_vsr()); 386 stxvd2x (vt0->to_vsr(), hptr); 387 xxswapd (vt5->to_vsr(), e->to_vsr()); 388 stxvd2x (vt5->to_vsr(), of16, hptr); 389 #else 390 vmrglw (vt1, a, b_); // vt1 = {a, b, ?, ?} 391 vmrglw (vt2, c, d); // vt2 = {c, d, ?, ?} 392 vmrglw (vt3, e, f); // vt3 = {e, f, ?, ?} 393 vmrglw (vt4, g, h); // vt4 = {g, h, ?, ?} 394 xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d} 395 xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h} 396 vadduwm (d, vt0, vt1); // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]} 397 vadduwm (h, vt5, vt3); // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]} 398 399 // Save hptr back, works for any alignment 400 stxvd2x (d->to_vsr(), hptr); 401 stxvd2x (h->to_vsr(), of16, hptr); 402 #endif 403 } 404 405 406 // R3_ARG1 - byte[] Input string with padding but in Big Endian 407 // R4_ARG2 - int[] SHA.state (at first, the root of primes) 408 // R5_ARG3 - int offset 409 // R6_ARG4 - int limit 410 // 411 // Internal Register usage: 412 // R7 - k 413 // R8 - tmp | j | of16 414 // R9 - of32 415 // VR0-VR8 - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb 416 // VR9-VR16 - a-h 417 // VR17-VR20 - w0-w3 418 // VR21-VR23 - vRb | vaux0-vaux2 419 // VR24-VR27 - kpw0-kpw3 420 void MacroAssembler::sha256(bool multi_block) { 421 static const ssize_t base_size = sizeof(uint32_t); 422 static const ssize_t buf_size = 64; 423 static uint32_t waux[buf_size / 
base_size] __attribute((aligned (16))); 424 static const uint32_t round_consts[64] __attribute((aligned (16))) = { 425 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 426 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 427 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 428 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 429 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 430 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 431 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 432 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 433 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 434 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 435 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 436 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 437 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 438 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 439 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 440 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 441 }; 442 static const uint8_t w_size = sizeof(round_consts)/sizeof(uint32_t); 443 444 Register buf_in = R3_ARG1; 445 Register state = R4_ARG2; 446 Register ofs = R5_ARG3; 447 Register limit = R6_ARG4; 448 449 Label sha_loop, bsw_loop, core_loop; 450 451 // Save non-volatile vector registers in the red zone 452 static const VectorRegister nv[] = { 453 VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/ 454 }; 455 static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister); 456 457 for (int c = 0; c < nv_size; c++) { 458 Register tmp = R8; 459 li (tmp, (c - (nv_size)) * 16); 460 stvx(nv[c], tmp, R1); 461 } 462 463 // Load hash state to registers 464 VectorRegister a = VR9; 465 VectorRegister b = VR10; 466 VectorRegister c = VR11; 467 VectorRegister d = VR12; 468 VectorRegister e = VR13; 469 VectorRegister f = VR14; 470 VectorRegister g = VR15; 471 VectorRegister h = VR16; 472 static const VectorRegister hs[] = {a, b, c, d, e, f, g, h}; 473 static const int total_hs = sizeof(hs)/sizeof(VectorRegister); 474 
// counter for cycling through hs vector to avoid register moves between iterations 475 int h_cnt = 0; 476 477 // Load a-h registers from the memory pointed by state 478 #if defined(VM_LITTLE_ENDIAN) 479 sha256_load_h_vec(a, e, state); 480 #else 481 sha256_load_h_vec(d, h, state); 482 #endif 483 484 // keep k loaded also during MultiBlock loops 485 Register k = R7; 486 load_const_optimized(k, const_cast<uint32_t *>(round_consts), R0); 487 488 // Avoiding redundant loads 489 if (multi_block) { 490 align(OptoLoopAlignment); 491 } 492 bind(sha_loop); 493 #if defined(VM_LITTLE_ENDIAN) 494 sha256_deque(a, b, c, d); 495 sha256_deque(e, f, g, h); 496 #else 497 sha256_deque(d, c, b, a); 498 sha256_deque(h, g, f, e); 499 #endif 500 501 // Load 16 elements from w out of the loop. 502 // Order of the int values is Endianess specific. 503 VectorRegister w0 = VR17; 504 VectorRegister w1 = VR18; 505 VectorRegister w2 = VR19; 506 VectorRegister w3 = VR20; 507 static const VectorRegister ws[] = {w0, w1, w2, w3}; 508 static const int total_ws = sizeof(ws)/sizeof(VectorRegister); 509 510 VectorRegister kpw0 = VR24; 511 VectorRegister kpw1 = VR25; 512 VectorRegister kpw2 = VR26; 513 VectorRegister kpw3 = VR27; 514 static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3}; 515 static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister); 516 517 sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws); 518 519 // Cycle through the first 16 elements 520 assert(total_ws == total_kpws, "Redesign the loop below"); 521 for (int n = 0; n < total_ws; n++) { 522 VectorRegister vaux0 = VR21; 523 VectorRegister vaux1 = VR22; 524 VectorRegister vaux2 = VR23; 525 526 sha256_deque(kpws[n], vaux0, vaux1, vaux2); 527 528 #if defined(VM_LITTLE_ENDIAN) 529 sha256_round(hs, total_hs, h_cnt, kpws[n]); 530 sha256_round(hs, total_hs, h_cnt, vaux0); 531 sha256_round(hs, total_hs, h_cnt, vaux1); 532 sha256_round(hs, total_hs, h_cnt, vaux2); 533 #else 534 sha256_round(hs, total_hs, h_cnt, 
vaux2); 535 sha256_round(hs, total_hs, h_cnt, vaux1); 536 sha256_round(hs, total_hs, h_cnt, vaux0); 537 sha256_round(hs, total_hs, h_cnt, kpws[n]); 538 #endif 539 } 540 541 Register tmp = R8; 542 // loop the 16th to the 64th iteration by 8 steps 543 li (tmp, (w_size - 16) / total_hs); 544 mtctr(tmp); 545 546 // j will be aligned to 4 for loading words. 547 // Whenever read, advance the pointer (e.g: when j is used in a function) 548 Register j = R8; 549 li (j, 16*4); 550 551 align(OptoLoopAlignment); 552 bind(core_loop); 553 554 // due to VectorRegister rotate, always iterate in multiples of total_hs 555 for (int n = 0; n < total_hs/4; n++) { 556 sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k); 557 sha256_round(hs, total_hs, h_cnt, kpw0); 558 sha256_round(hs, total_hs, h_cnt, kpw1); 559 sha256_round(hs, total_hs, h_cnt, kpw2); 560 sha256_round(hs, total_hs, h_cnt, kpw3); 561 } 562 563 bdnz (core_loop); 564 565 // Update hash state 566 sha256_update_sha_state(a, b, c, d, e, f, g, h, state); 567 568 if (multi_block) { 569 // process next 1024 bit block (buf_in already updated) 570 addi(ofs, ofs, buf_size); 571 cmpd(CCR0, ofs, limit); 572 blt(CCR0, sha_loop); 573 574 // return ofs 575 mr(R3_ARG1, ofs); 576 } 577 578 // Restore non-volatile registers 579 for (int c = 0; c < nv_size; c++) { 580 Register tmp = R8; 581 li (tmp, (c - (nv_size)) * 16); 582 lvx(nv[c], tmp, R1); 583 } 584 } 585 586 587 /********************************************************************** 588 * SHA 512 589 *********************************************************************/ 590 591 void MacroAssembler::sha512_load_w_vec(const Register buf_in, 592 const VectorRegister* ws, 593 const int total_ws) { 594 Register tmp = R8; 595 VectorRegister vRb = VR8; 596 VectorRegister aux = VR9; 597 Label is_aligned, after_alignment; 598 599 andi_ (tmp, buf_in, 0xF); 600 beq (CCR0, is_aligned); // address ends with 0x0, not 0x8 601 602 // deal with unaligned addresses 603 lvx (ws[0], buf_in); 
604 addi (buf_in, buf_in, 16); 605 lvsl (vRb, buf_in); 606 607 for (int n = 1; n < total_ws; n++) { 608 VectorRegister w_cur = ws[n]; 609 VectorRegister w_prev = ws[n-1]; 610 611 lvx (w_cur, buf_in); 612 addi (buf_in, buf_in, 16); 613 #if defined(VM_LITTLE_ENDIAN) 614 vperm(w_prev, w_cur, w_prev, vRb); 615 #else 616 vperm(w_prev, w_prev, w_cur, vRb); 617 #endif 618 } 619 620 lvx (aux, buf_in); 621 #if defined(VM_LITTLE_ENDIAN) 622 vperm (ws[total_ws-1], aux, ws[total_ws-1], vRb); 623 #else 624 vperm (ws[total_ws-1], ws[total_ws-1], aux, vRb); 625 #endif 626 627 b (after_alignment); 628 629 bind(is_aligned); 630 631 for (int n = 0; n < total_ws; n++) { 632 VectorRegister w = ws[n]; 633 634 lvx (w, buf_in); 635 addi (buf_in, buf_in, 16); 636 } 637 638 bind(after_alignment); 639 } 640 641 // Update hash state 642 void MacroAssembler::sha512_update_sha_state(const Register state, 643 const VectorRegister* hs, 644 const int total_hs) { 645 646 #if defined(VM_LITTLE_ENDIAN) 647 int start_idx = 0; 648 #else 649 int start_idx = 1; 650 #endif 651 652 // load initial hash from the memory pointed by state 653 VectorRegister ini_a = VR10; 654 VectorRegister ini_c = VR12; 655 VectorRegister ini_e = VR14; 656 VectorRegister ini_g = VR16; 657 static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g}; 658 static const int total_inis = sizeof(inis)/sizeof(VectorRegister); 659 660 Label state_save_aligned, after_state_save_aligned; 661 662 Register addr = R7; 663 Register tmp = R8; 664 VectorRegister vRb = VR8; 665 VectorRegister aux = VR9; 666 667 andi_(tmp, state, 0xf); 668 beq(CCR0, state_save_aligned); 669 // deal with unaligned addresses 670 671 { 672 VectorRegister a = hs[0]; 673 VectorRegister b_ = hs[1]; 674 VectorRegister c = hs[2]; 675 VectorRegister d = hs[3]; 676 VectorRegister e = hs[4]; 677 VectorRegister f = hs[5]; 678 VectorRegister g = hs[6]; 679 VectorRegister h = hs[7]; 680 lvsr (vRb, state); 681 lvx (ini_a, state); 682 addi (addr, state, 16); 683 684 lvx 
(ini_c, addr); 685 addi (addr, addr, 16); 686 #if defined(VM_LITTLE_ENDIAN) 687 vperm (ini_a, ini_c, ini_a, vRb); 688 #else 689 vperm (ini_a, ini_a, ini_c, vRb); 690 #endif 691 692 lvx (ini_e, addr); 693 addi (addr, addr, 16); 694 #if defined(VM_LITTLE_ENDIAN) 695 vperm (ini_c, ini_e, ini_c, vRb); 696 #else 697 vperm (ini_c, ini_c, ini_e, vRb); 698 #endif 699 700 lvx (ini_g, addr); 701 addi (addr, addr, 16); 702 #if defined(VM_LITTLE_ENDIAN) 703 vperm (ini_e, ini_g, ini_e, vRb); 704 #else 705 vperm (ini_e, ini_e, ini_g, vRb); 706 #endif 707 708 lvx (aux, addr); 709 #if defined(VM_LITTLE_ENDIAN) 710 vperm (ini_g, aux, ini_g, vRb); 711 #else 712 vperm (ini_g, ini_g, aux, vRb); 713 #endif 714 715 #if defined(VM_LITTLE_ENDIAN) 716 xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr()); 717 xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr()); 718 xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr()); 719 xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr()); 720 #else 721 xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr()); 722 xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr()); 723 xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr()); 724 xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr()); 725 #endif 726 727 for (int n = start_idx; n < total_hs; n += 2) { 728 VectorRegister h_cur = hs[n]; 729 VectorRegister ini_cur = inis[n/2]; 730 731 vaddudm(h_cur, ini_cur, h_cur); 732 } 733 734 for (int n = start_idx; n < total_hs; n += 2) { 735 VectorRegister h_cur = hs[n]; 736 737 mfvrd (tmp, h_cur); 738 #if defined(VM_LITTLE_ENDIAN) 739 std (tmp, 8*n + 8, state); 740 #else 741 std (tmp, 8*n - 8, state); 742 #endif 743 vsldoi (aux, h_cur, h_cur, 8); 744 mfvrd (tmp, aux); 745 std (tmp, 8*n + 0, state); 746 } 747 748 b (after_state_save_aligned); 749 } 750 751 bind(state_save_aligned); 752 { 753 mr(addr, state); 754 for (int n = 0; n < total_hs; n += 2) { 755 #if defined(VM_LITTLE_ENDIAN) 756 VectorRegister h_cur = hs[n]; 757 VectorRegister h_next = hs[n+1]; 758 #else 759 VectorRegister h_cur = hs[n+1]; 760 VectorRegister 
h_next = hs[n]; 761 #endif 762 VectorRegister ini_cur = inis[n/2]; 763 764 lvx(ini_cur, addr); 765 addi(addr, addr, 16); 766 xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr()); 767 } 768 769 for (int n = start_idx; n < total_hs; n += 2) { 770 VectorRegister h_cur = hs[n]; 771 VectorRegister ini_cur = inis[n/2]; 772 773 vaddudm(h_cur, ini_cur, h_cur); 774 } 775 776 mr(addr, state); 777 for (int n = start_idx; n < total_hs; n += 2) { 778 VectorRegister h_cur = hs[n]; 779 780 stvx(h_cur, addr); 781 addi(addr, addr, 16); 782 } 783 } 784 785 bind(after_state_save_aligned); 786 } 787 788 // Use h_cnt to cycle through hs elements but also increment it at the end 789 void MacroAssembler::sha512_round(const VectorRegister* hs, 790 const int total_hs, int& h_cnt, 791 const VectorRegister kpw) { 792 793 // convenience registers: cycle from 0-7 downwards 794 const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs]; 795 const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs]; 796 const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs]; 797 const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs]; 798 const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs]; 799 const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs]; 800 const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs]; 801 const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs]; 802 // temporaries 803 const VectorRegister Ch = VR20; 804 const VectorRegister Maj = VR21; 805 const VectorRegister bsa = VR22; 806 const VectorRegister bse = VR23; 807 const VectorRegister tmp1 = VR24; 808 const VectorRegister tmp2 = VR25; 809 810 vsel (Ch, g, f, e); 811 vxor (Maj, a, b); 812 vshasigmad(bse, e, 1, 0xf); 813 vaddudm (tmp2, Ch, kpw); 814 vaddudm (tmp1, h, bse); 815 vsel (Maj, b, c, Maj); 816 vaddudm (tmp1, tmp1, tmp2); 817 vshasigmad(bsa, a, 1, 0); 818 vaddudm 
(tmp2, bsa, Maj); 819 vaddudm (d, d, tmp1); 820 vaddudm (h, tmp1, tmp2); 821 822 // advance vector pointer to the next iteration 823 h_cnt++; 824 } 825 826 void MacroAssembler::sha512_calc_2w(const VectorRegister w0, 827 const VectorRegister w1, 828 const VectorRegister w2, 829 const VectorRegister w3, 830 const VectorRegister w4, 831 const VectorRegister w5, 832 const VectorRegister w6, 833 const VectorRegister w7, 834 const VectorRegister kpw0, 835 const VectorRegister kpw1, 836 const Register j, 837 const VectorRegister vRb, 838 const Register k) { 839 // Temporaries 840 const VectorRegister VR_a = VR20; 841 const VectorRegister VR_b = VR21; 842 const VectorRegister VR_c = VR22; 843 const VectorRegister VR_d = VR23; 844 845 // load to k[j] 846 lvx (VR_a, j, k); 847 // advance j 848 addi (j, j, 16); // 16 bytes were read 849 850 #if defined(VM_LITTLE_ENDIAN) 851 // v6 = w[j-15], w[j-14] 852 vperm (VR_b, w1, w0, vRb); 853 // v12 = w[j-7], w[j-6] 854 vperm (VR_c, w5, w4, vRb); 855 #else 856 // v6 = w[j-15], w[j-14] 857 vperm (VR_b, w0, w1, vRb); 858 // v12 = w[j-7], w[j-6] 859 vperm (VR_c, w4, w5, vRb); 860 #endif 861 862 // v6 = s0(w[j-15]) , s0(w[j-14]) 863 vshasigmad (VR_b, VR_b, 0, 0); 864 // v5 = s1(w[j-2]) , s1(w[j-1]) 865 vshasigmad (VR_d, w7, 0, 0xf); 866 // v6 = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6] 867 vaddudm (VR_b, VR_b, VR_c); 868 // v8 = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15] 869 vaddudm (VR_d, VR_d, w0); 870 // v9 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] 871 // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] 872 vaddudm (VR_c, VR_d, VR_b); 873 // Updating w0 to w7 to hold the new previous 16 values from w. 
874 vmr (w0, w1); 875 vmr (w1, w2); 876 vmr (w2, w3); 877 vmr (w3, w4); 878 vmr (w4, w5); 879 vmr (w5, w6); 880 vmr (w6, w7); 881 vmr (w7, VR_c); 882 883 #if defined(VM_LITTLE_ENDIAN) 884 // store k + w to kpw0 (2 values at once) 885 vaddudm (kpw0, VR_c, VR_a); 886 // kpw1 holds (k + w)[1] 887 vsldoi (kpw1, kpw0, kpw0, 8); 888 #else 889 // store k + w to kpw0 (2 values at once) 890 vaddudm (kpw1, VR_c, VR_a); 891 // kpw1 holds (k + w)[1] 892 vsldoi (kpw0, kpw1, kpw1, 8); 893 #endif 894 } 895 896 void MacroAssembler::sha512_load_h_vec(const Register state, 897 const VectorRegister* hs, 898 const int total_hs) { 899 #if defined(VM_LITTLE_ENDIAN) 900 VectorRegister a = hs[0]; 901 VectorRegister g = hs[6]; 902 int start_idx = 0; 903 #else 904 VectorRegister a = hs[1]; 905 VectorRegister g = hs[7]; 906 int start_idx = 1; 907 #endif 908 909 Register addr = R7; 910 VectorRegister vRb = VR8; 911 Register tmp = R8; 912 Label state_aligned, after_state_aligned; 913 914 andi_(tmp, state, 0xf); 915 beq(CCR0, state_aligned); 916 917 // deal with unaligned addresses 918 VectorRegister aux = VR9; 919 920 lvx (a, state); 921 addi (addr, state, 16); 922 lvsl (vRb, addr); 923 924 for (int n = start_idx + 2; n < total_hs; n += 2) { 925 VectorRegister h_cur = hs[n]; 926 VectorRegister h_prev2 = hs[n - 2]; 927 928 lvx (h_cur, addr); 929 addi (addr, addr, 16); 930 #if defined(VM_LITTLE_ENDIAN) 931 vperm (h_prev2, h_cur, h_prev2, vRb); 932 #else 933 vperm (h_prev2, h_prev2, h_cur, vRb); 934 #endif 935 } 936 lvx (aux, addr); 937 #if defined(VM_LITTLE_ENDIAN) 938 vperm (g, aux, g, vRb); 939 #else 940 vperm (g, g, aux, vRb); 941 #endif 942 943 b (after_state_aligned); 944 945 bind(state_aligned); 946 947 // deal with aligned addresses 948 mr(addr, state); 949 for (int n = start_idx; n < total_hs; n += 2) { 950 VectorRegister h_cur = hs[n]; 951 952 lvx (h_cur, addr); 953 addi (addr, addr, 16); 954 } 955 956 bind(after_state_aligned); 957 } 958 959 // R3_ARG1 - byte[] Input string with 
padding but in Big Endian 960 // R4_ARG2 - int[] SHA.state (at first, the root of primes) 961 // R5_ARG3 - int offset 962 // R6_ARG4 - int limit 963 // 964 // Internal Register usage: 965 // R7 R8 R9 - volatile temporaries 966 // VR0-VR7 - a-h 967 // VR8 - vRb 968 // VR9 - aux (highly volatile, use with care) 969 // VR10-VR17 - w0-w7 | ini_a-ini_h 970 // VR18 - vsp16 | kplusw0 971 // VR19 - vsp32 | kplusw1 972 // VR20-VR25 - sha512_calc_2w and sha512_round temporaries 973 void MacroAssembler::sha512(bool multi_block) { 974 static const ssize_t base_size = sizeof(uint64_t); 975 static const ssize_t buf_size = 128; 976 static uint64_t waux[buf_size / base_size] __attribute((aligned (16))); 977 static const uint64_t round_consts[80] __attribute((aligned (16))) = { 978 0x428a2f98d728ae22, 0x7137449123ef65cd, 979 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, 980 0x3956c25bf348b538, 0x59f111f1b605d019, 981 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 982 0xd807aa98a3030242, 0x12835b0145706fbe, 983 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, 984 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 985 0x9bdc06a725c71235, 0xc19bf174cf692694, 986 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 987 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 988 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 989 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, 990 0x983e5152ee66dfab, 0xa831c66d2db43210, 991 0xb00327c898fb213f, 0xbf597fc7beef0ee4, 992 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 993 0x06ca6351e003826f, 0x142929670a0e6e70, 994 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 995 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, 996 0x650a73548baf63de, 0x766a0abb3c77b2a8, 997 0x81c2c92e47edaee6, 0x92722c851482353b, 998 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 999 0xc24b8b70d0f89791, 0xc76c51a30654be30, 1000 0xd192e819d6ef5218, 0xd69906245565a910, 1001 0xf40e35855771202a, 0x106aa07032bbd1b8, 1002 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 1003 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, 1004 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 1005 0x5b9cca4f7763e373, 
0x682e6ff3d6b2b8a3, 1006 0x748f82ee5defb2fc, 0x78a5636f43172f60, 1007 0x84c87814a1f0ab72, 0x8cc702081a6439ec, 1008 0x90befffa23631e28, 0xa4506cebde82bde9, 1009 0xbef9a3f7b2c67915, 0xc67178f2e372532b, 1010 0xca273eceea26619c, 0xd186b8c721c0c207, 1011 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 1012 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 1013 0x113f9804bef90dae, 0x1b710b35131c471b, 1014 0x28db77f523047d84, 0x32caab7b40c72493, 1015 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, 1016 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 1017 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, 1018 }; 1019 static const uint8_t w_size = sizeof(round_consts)/sizeof(uint64_t); 1020 1021 Register buf_in = R3_ARG1; 1022 Register state = R4_ARG2; 1023 Register ofs = R5_ARG3; 1024 Register limit = R6_ARG4; 1025 1026 Label sha_loop, bsw_loop, core_loop; 1027 1028 // Save non-volatile vector registers in the red zone 1029 static const VectorRegister nv[] = { 1030 VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/ 1031 }; 1032 static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister); 1033 1034 for (int c = 0; c < nv_size; c++) { 1035 Register idx = R7; 1036 li (idx, (c - (nv_size)) * 16); 1037 stvx(nv[c], idx, R1); 1038 } 1039 1040 // Load hash state to registers 1041 VectorRegister a = VR0; 1042 VectorRegister b = VR1; 1043 VectorRegister c = VR2; 1044 VectorRegister d = VR3; 1045 VectorRegister e = VR4; 1046 VectorRegister f = VR5; 1047 VectorRegister g = VR6; 1048 VectorRegister h = VR7; 1049 static const VectorRegister hs[] = {a, b, c, d, e, f, g, h}; 1050 static const int total_hs = sizeof(hs)/sizeof(VectorRegister); 1051 // counter for cycling through hs vector to avoid register moves between iterations 1052 int h_cnt = 0; 1053 1054 // Load a-h registers from the memory pointed by state 1055 sha512_load_h_vec(state, hs, total_hs); 1056 1057 if (multi_block) { 1058 align(OptoLoopAlignment); 1059 } 1060 bind(sha_loop); 1061 1062 for (int n = 0; n < total_hs; n += 2) { 1063 #if 
defined(VM_LITTLE_ENDIAN) 1064 VectorRegister h_cur = hs[n]; 1065 VectorRegister h_next = hs[n + 1]; 1066 #else 1067 VectorRegister h_cur = hs[n + 1]; 1068 VectorRegister h_next = hs[n]; 1069 #endif 1070 vsldoi (h_next, h_cur, h_cur, 8); 1071 } 1072 1073 Register k = R9; 1074 load_const_optimized(k, const_cast<uint64_t *>(round_consts), R0); 1075 1076 // Load 16 elements from w out of the loop. 1077 // Order of the long values is Endianess specific. 1078 VectorRegister w0 = VR10; 1079 VectorRegister w1 = VR11; 1080 VectorRegister w2 = VR12; 1081 VectorRegister w3 = VR13; 1082 VectorRegister w4 = VR14; 1083 VectorRegister w5 = VR15; 1084 VectorRegister w6 = VR16; 1085 VectorRegister w7 = VR17; 1086 static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7}; 1087 static const int total_ws = sizeof(ws)/sizeof(VectorRegister); 1088 1089 // Load 16 w into vectors and setup vsl for vperm 1090 sha512_load_w_vec(buf_in, ws, total_ws); 1091 1092 #if defined(VM_LITTLE_ENDIAN) 1093 VectorRegister vsp16 = VR18; 1094 VectorRegister vsp32 = VR19; 1095 VectorRegister shiftarg = VR9; 1096 1097 vspltisw(vsp16, 8); 1098 vspltisw(shiftarg, 1); 1099 vsl (vsp16, vsp16, shiftarg); 1100 vsl (vsp32, vsp16, shiftarg); 1101 1102 VectorRegister vsp8 = VR9; 1103 vspltish(vsp8, 8); 1104 1105 // Convert input from Big Endian to Little Endian 1106 for (int c = 0; c < total_ws; c++) { 1107 VectorRegister w = ws[c]; 1108 vrlh (w, w, vsp8); 1109 } 1110 for (int c = 0; c < total_ws; c++) { 1111 VectorRegister w = ws[c]; 1112 vrlw (w, w, vsp16); 1113 } 1114 for (int c = 0; c < total_ws; c++) { 1115 VectorRegister w = ws[c]; 1116 vrld (w, w, vsp32); 1117 } 1118 #endif 1119 1120 Register Rb = R10; 1121 VectorRegister vRb = VR8; 1122 li (Rb, 8); 1123 lvsl (vRb, Rb); 1124 1125 VectorRegister kplusw0 = VR18; 1126 VectorRegister kplusw1 = VR19; 1127 1128 Register addr = R7; 1129 mr (addr, k); 1130 1131 for (int n = 0; n < total_ws; n++) { 1132 VectorRegister w = ws[n]; 1133 1134 lvx (kplusw0, 
addr); 1135 addi (addr, addr, 16); 1136 #if defined(VM_LITTLE_ENDIAN) 1137 vaddudm(kplusw0, kplusw0, w); 1138 vsldoi (kplusw1, kplusw0, kplusw0, 8); 1139 #else 1140 vaddudm(kplusw1, kplusw0, w); 1141 vsldoi (kplusw0, kplusw1, kplusw1, 8); 1142 #endif 1143 1144 sha512_round(hs, total_hs, h_cnt, kplusw0); 1145 sha512_round(hs, total_hs, h_cnt, kplusw1); 1146 } 1147 1148 Register tmp = R8; 1149 li (tmp, (w_size-16)/total_hs); 1150 mtctr (tmp); 1151 // j will be aligned to 4 for loading words. 1152 // Whenever read, advance the pointer (e.g: when j is used in a function) 1153 Register j = tmp; 1154 li (j, 8*16); 1155 1156 align(OptoLoopAlignment); 1157 bind(core_loop); 1158 1159 // due to VectorRegister rotate, always iterate in multiples of total_hs 1160 for (int n = 0; n < total_hs/2; n++) { 1161 sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k); 1162 sha512_round(hs, total_hs, h_cnt, kplusw0); 1163 sha512_round(hs, total_hs, h_cnt, kplusw1); 1164 } 1165 1166 bdnz (core_loop); 1167 1168 sha512_update_sha_state(state, hs, total_hs); 1169 1170 if (multi_block) { 1171 // process next 1024 bit block (buf_in already updated) 1172 addi(ofs, ofs, buf_size); 1173 cmpd(CCR0, ofs, limit); 1174 blt(CCR0, sha_loop); 1175 1176 // return ofs 1177 mr(R3_ARG1, ofs); 1178 } 1179 1180 // Restore non-volatile registers 1181 for (int c = 0; c < nv_size; c++) { 1182 Register idx = R7; 1183 li (idx, (c - (nv_size)) * 16); 1184 lvx(nv[c], idx, R1); 1185 } 1186 }