// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.

// Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
// (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).
24 25 #include "asm/macroAssembler.inline.hpp" 26 #include "runtime/stubRoutines.hpp" 27 28 /********************************************************************** 29 * SHA 256 30 *********************************************************************/ 31 32 void MacroAssembler::sha256_deque(const VectorRegister src, 33 const VectorRegister dst1, 34 const VectorRegister dst2, 35 const VectorRegister dst3) { 36 vsldoi (dst1, src, src, 12); 37 vsldoi (dst2, src, src, 8); 38 vsldoi (dst3, src, src, 4); 39 } 40 41 void MacroAssembler::sha256_round(const VectorRegister* hs, 42 const int total_hs, 43 int& h_cnt, 44 const VectorRegister kpw) { 45 // convenience registers: cycle from 0-7 downwards 46 const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs]; 47 const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs]; 48 const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs]; 49 const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs]; 50 const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs]; 51 const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs]; 52 const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs]; 53 const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs]; 54 // temporaries 55 VectorRegister ch = VR0; 56 VectorRegister maj = VR1; 57 VectorRegister bsa = VR2; 58 VectorRegister bse = VR3; 59 VectorRegister vt0 = VR4; 60 VectorRegister vt1 = VR5; 61 VectorRegister vt2 = VR6; 62 VectorRegister vt3 = VR7; 63 64 vsel (ch, g, f, e); 65 vxor (maj, a, b); 66 vshasigmaw (bse, e, 1, 0xf); 67 vadduwm (vt2, ch, kpw); 68 vadduwm (vt1, h, bse); 69 vsel (maj, b, c, maj); 70 vadduwm (vt3, vt1, vt2); 71 vshasigmaw (bsa, a, 1, 0); 72 vadduwm (vt0, bsa, maj); 73 74 vadduwm (d, d, vt3); 75 vadduwm (h, vt3, vt0); 76 77 // advance vector pointer to the next iteration 78 h_cnt++; 79 } 80 81 void 
MacroAssembler::sha256_load_h_vec(const VectorRegister a, 82 const VectorRegister e, 83 const Register hptr) { 84 // temporaries 85 Register tmp = R8; 86 VectorRegister vt0 = VR0; 87 VectorRegister vRb = VR6; 88 // labels 89 Label sha256_aligned, sha256_load_end;; 90 91 andi_ (tmp, hptr, 0xf); 92 addi (tmp, hptr, 16); 93 beq (CCR0, sha256_aligned); 94 95 // handle unaligned accesses 96 lvx (a, hptr); 97 lvsr (vRb, hptr); 98 99 lvx (e, tmp); 100 addi (tmp, tmp, 16); 101 vec_perm(a, e, vRb); 102 103 lvx (vt0, tmp); 104 vec_perm(e, vt0, vRb); 105 b (sha256_load_end); 106 107 // aligned accesses 108 bind(sha256_aligned); 109 lvx (a, hptr); 110 addi (tmp, hptr, 16); 111 lvx (e, tmp); 112 113 bind(sha256_load_end); 114 } 115 116 void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in, 117 const VectorRegister* ws, 118 const int total_ws, 119 const Register k, 120 const VectorRegister* kpws, 121 const int total_kpws) { 122 Label w_aligned, after_w_load; 123 124 Register tmp = R8; 125 VectorRegister vt0 = VR0; 126 VectorRegister vt1 = VR1; 127 VectorRegister vRb = VR6; 128 129 andi_ (tmp, buf_in, 0xF); 130 beq (CCR0, w_aligned); // address ends with 0x0, not 0x8 131 132 // deal with unaligned addresses 133 lvx (ws[0], buf_in); 134 addi (buf_in, buf_in, 16); 135 lvsl (vRb, buf_in); 136 137 for (int n = 1; n < total_ws; n++) { 138 VectorRegister w_cur = ws[n]; 139 VectorRegister w_prev = ws[n-1]; 140 141 lvx (w_cur, buf_in); 142 addi (buf_in, buf_in, 16); 143 vec_perm(w_prev, w_cur, vRb); 144 } 145 146 lvx (vt0, buf_in); 147 vec_perm(ws[total_ws-1], vt0, vRb); 148 149 b (after_w_load); 150 151 bind(w_aligned); 152 153 // deal with aligned addresses 154 for (int n = 0; n < total_ws; n++) { 155 VectorRegister w = ws[n]; 156 157 lvx (w, buf_in); 158 addi (buf_in, buf_in, 16); 159 } 160 161 bind(after_w_load); 162 163 #if defined(VM_LITTLE_ENDIAN) 164 // Byte swapping within int values 165 li (tmp, 8); 166 lvsl (vt0, tmp); 167 vspltisb (vt1, 0xb); 168 vxor (vt1, vt0, 
vt1); 169 for (int n = 0; n < total_ws; n++) { 170 VectorRegister w = ws[n]; 171 vec_perm(w, w, vt1); 172 } 173 #endif 174 175 // Loading k, which is always aligned to 16-bytes 176 lvx (kpws[0], k); 177 addi (tmp, k, 16); 178 for (int n = 1; n < total_kpws-1; n++) { 179 VectorRegister kpw = kpws[n]; 180 181 lvx (kpw, tmp); 182 addi (tmp, tmp, 16); 183 } 184 lvx (kpws[total_kpws-1], tmp); 185 186 // Add w to K 187 assert(total_ws == total_kpws, "Redesign the loop below"); 188 for (int n = 0; n < total_kpws; n++) { 189 VectorRegister kpw = kpws[n]; 190 VectorRegister w = ws[n]; 191 192 vadduwm (kpw, kpw, w); 193 } 194 } 195 196 void MacroAssembler::sha256_calc_4w(const VectorRegister w0, 197 const VectorRegister w1, 198 const VectorRegister w2, 199 const VectorRegister w3, 200 const VectorRegister kpw0, 201 const VectorRegister kpw1, 202 const VectorRegister kpw2, 203 const VectorRegister kpw3, 204 const Register j, 205 const Register k) { 206 // Temporaries 207 const VectorRegister vt0 = VR0; 208 const VectorRegister vt1 = VR1; 209 const VectorSRegister vsrt1 = vt1->to_vsr(); 210 const VectorRegister vt2 = VR2; 211 const VectorRegister vt3 = VR3; 212 const VectorSRegister vst3 = vt3->to_vsr(); 213 const VectorRegister vt4 = VR4; 214 215 // load to k[j] 216 lvx (vt0, j, k); 217 218 // advance j 219 addi (j, j, 16); // 16 bytes were read 220 221 #if defined(VM_LITTLE_ENDIAN) 222 // b = w[j-15], w[j-14], w[j-13], w[j-12] 223 vsldoi (vt1, w1, w0, 12); 224 225 // c = w[j-7], w[j-6], w[j-5], w[j-4] 226 vsldoi (vt2, w3, w2, 12); 227 228 #else 229 // b = w[j-15], w[j-14], w[j-13], w[j-12] 230 vsldoi (vt1, w0, w1, 4); 231 232 // c = w[j-7], w[j-6], w[j-5], w[j-4] 233 vsldoi (vt2, w2, w3, 4); 234 #endif 235 236 // d = w[j-2], w[j-1], w[j-4], w[j-3] 237 vsldoi (vt3, w3, w3, 8); 238 239 // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12]) 240 vshasigmaw (vt1, vt1, 0, 0); 241 242 // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3]) 243 vshasigmaw (vt3, vt3, 0, 0xf); 
244 245 // c = s0(w[j-15]) + w[j-7], 246 // s0(w[j-14]) + w[j-6], 247 // s0(w[j-13]) + w[j-5], 248 // s0(w[j-12]) + w[j-4] 249 vadduwm (vt2, vt1, vt2); 250 251 // c = s0(w[j-15]) + w[j-7] + w[j-16], 252 // s0(w[j-14]) + w[j-6] + w[j-15], 253 // s0(w[j-13]) + w[j-5] + w[j-14], 254 // s0(w[j-12]) + w[j-4] + w[j-13] 255 vadduwm (vt2, vt2, w0); 256 257 // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] 258 // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] 259 // s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED 260 // s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3]) // UNDEFINED 261 vadduwm (vt4, vt2, vt3); 262 263 // At this point, e[0] and e[1] are the correct values to be stored at w[j] 264 // and w[j+1]. 265 // e[2] and e[3] are not considered. 266 // b = s1(w[j]) , s1(s(w[j+1]) , UNDEFINED , UNDEFINED 267 vshasigmaw (vt1, vt4, 0, 0xf); 268 269 // v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1]) 270 #if defined(VM_LITTLE_ENDIAN) 271 xxmrgld (vst3, vsrt1, vst3); 272 #else 273 xxmrghd (vst3, vst3, vsrt1); 274 #endif 275 276 // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] 277 // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] 278 // s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]), // w[j+2] 279 // s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1]) // w[j+4] 280 vadduwm (vt2, vt2, vt3); 281 282 // Updating w0 to w3 to hold the new previous 16 values from w. 
283 vmr (w0, w1); 284 vmr (w1, w2); 285 vmr (w2, w3); 286 vmr (w3, vt2); 287 288 // store k + w to v9 (4 values at once) 289 #if defined(VM_LITTLE_ENDIAN) 290 vadduwm (kpw0, vt2, vt0); 291 292 vsldoi (kpw1, kpw0, kpw0, 12); 293 vsldoi (kpw2, kpw0, kpw0, 8); 294 vsldoi (kpw3, kpw0, kpw0, 4); 295 #else 296 vadduwm (kpw3, vt2, vt0); 297 298 vsldoi (kpw2, kpw3, kpw3, 12); 299 vsldoi (kpw1, kpw3, kpw3, 8); 300 vsldoi (kpw0, kpw3, kpw3, 4); 301 #endif 302 } 303 304 void MacroAssembler::sha256_update_sha_state(const VectorRegister a, 305 const VectorRegister b_, 306 const VectorRegister c, 307 const VectorRegister d, 308 const VectorRegister e, 309 const VectorRegister f, 310 const VectorRegister g, 311 const VectorRegister h, 312 const Register hptr) { 313 // temporaries 314 VectorRegister vt0 = VR0; 315 VectorRegister vt1 = VR1; 316 VectorRegister vt2 = VR2; 317 VectorRegister vt3 = VR3; 318 VectorRegister vt4 = VR4; 319 VectorRegister vt5 = VR5; 320 VectorRegister vaux = VR6; 321 VectorRegister vRb = VR6; 322 Register tmp = R8; 323 Register of16 = R8; 324 Register of32 = R9; 325 Label state_load_aligned, after_state_load_aligned; 326 327 // Load hptr 328 andi_ (tmp, hptr, 0xf); 329 li (of16, 16); 330 beq (CCR0, state_load_aligned); 331 332 // handle unaligned accesses 333 li (of32, 32); 334 lvx (vt0, hptr); 335 lvsr (vRb, hptr); 336 337 lvx (vt5, hptr, of16); 338 vec_perm(vt0, vt5, vRb); // vt0 = hptr[0]..hptr[3] 339 340 lvx (vt1, hptr, of32); 341 vec_perm(vt5, vt1, vRb); // vt5 = hptr[4]..hptr[7] 342 b (after_state_load_aligned); 343 344 // aligned accesses 345 bind(state_load_aligned); 346 lvx (vt0, hptr); 347 lvx (vt5, of16, hptr); 348 349 bind(after_state_load_aligned); 350 351 #if defined(VM_LITTLE_ENDIAN) 352 vmrglw (vt1, b_, a); // vt1 = {a, b, ?, ?} 353 vmrglw (vt2, d, c); // vt2 = {c, d, ?, ?} 354 vmrglw (vt3, f, e); // vt3 = {e, f, ?, ?} 355 vmrglw (vt4, h, g); // vt4 = {g, h, ?, ?} 356 xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, 
d} 357 xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h} 358 vadduwm (a, vt0, vt1); // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]} 359 vadduwm (e, vt5, vt3); // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]} 360 361 // Save hptr back, works for any alignment 362 xxswapd (vt0->to_vsr(), a->to_vsr()); 363 stxvd2x (vt0->to_vsr(), hptr); 364 xxswapd (vt5->to_vsr(), e->to_vsr()); 365 stxvd2x (vt5->to_vsr(), of16, hptr); 366 #else 367 vmrglw (vt1, a, b_); // vt1 = {a, b, ?, ?} 368 vmrglw (vt2, c, d); // vt2 = {c, d, ?, ?} 369 vmrglw (vt3, e, f); // vt3 = {e, f, ?, ?} 370 vmrglw (vt4, g, h); // vt4 = {g, h, ?, ?} 371 xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d} 372 xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h} 373 vadduwm (d, vt0, vt1); // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]} 374 vadduwm (h, vt5, vt3); // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]} 375 376 // Save hptr back, works for any alignment 377 stxvd2x (d->to_vsr(), hptr); 378 stxvd2x (h->to_vsr(), of16, hptr); 379 #endif 380 } 381 382 383 // R3_ARG1 - byte[] Input string with padding but in Big Endian 384 // R4_ARG2 - int[] SHA.state (at first, the root of primes) 385 // R5_ARG3 - int offset 386 // R6_ARG4 - int limit 387 // 388 // Internal Register usage: 389 // R7 - k 390 // R8 - tmp | j | of16 391 // R9 - of32 392 // VR0-VR8 - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb 393 // VR9-VR16 - a-h 394 // VR17-VR20 - w0-w3 395 // VR21-VR23 - vRb | vaux0-vaux2 396 // VR24-VR27 - kpw0-kpw3 397 void MacroAssembler::sha256(bool multi_block) { 398 static const ssize_t base_size = sizeof(uint32_t); 399 static const ssize_t buf_size = 64; 400 static uint32_t waux[buf_size / base_size] __attribute((aligned (16))); 401 static const uint32_t round_consts[64] __attribute((aligned (16))) = { 402 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 403 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 404 0xd807aa98, 
0x12835b01, 0x243185be, 0x550c7dc3, 405 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 406 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 407 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 408 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 409 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 410 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 411 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 412 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 413 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 414 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 415 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 416 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 417 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 418 }; 419 static const uint8_t w_size = sizeof(round_consts)/sizeof(uint32_t); 420 421 Register buf_in = R3_ARG1; 422 Register state = R4_ARG2; 423 Register ofs = R5_ARG3; 424 Register limit = R6_ARG4; 425 426 Label sha_loop, bsw_loop, core_loop; 427 428 // Save non-volatile vector registers in the red zone 429 static const VectorRegister nv[] = { 430 VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/ 431 }; 432 static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister); 433 434 for (int c = 0; c < nv_size; c++) { 435 Register tmp = R8; 436 li (tmp, (c - (nv_size)) * 16); 437 stvx(nv[c], tmp, R1); 438 } 439 440 // Load hash state to registers 441 VectorRegister a = VR9; 442 VectorRegister b = VR10; 443 VectorRegister c = VR11; 444 VectorRegister d = VR12; 445 VectorRegister e = VR13; 446 VectorRegister f = VR14; 447 VectorRegister g = VR15; 448 VectorRegister h = VR16; 449 static const VectorRegister hs[] = {a, b, c, d, e, f, g, h}; 450 static const int total_hs = sizeof(hs)/sizeof(VectorRegister); 451 // counter for cycling through hs vector to avoid register moves between iterations 452 int h_cnt = 0; 453 454 // Load a-h registers from the memory pointed by state 455 #if defined(VM_LITTLE_ENDIAN) 456 sha256_load_h_vec(a, e, state); 
457 #else 458 sha256_load_h_vec(d, h, state); 459 #endif 460 461 // keep k loaded also during MultiBlock loops 462 Register k = R7; 463 load_const_optimized(k, const_cast<uint32_t *>(round_consts), R0); 464 465 // Avoiding redundant loads 466 if (multi_block) { 467 align(OptoLoopAlignment); 468 } 469 bind(sha_loop); 470 #if defined(VM_LITTLE_ENDIAN) 471 sha256_deque(a, b, c, d); 472 sha256_deque(e, f, g, h); 473 #else 474 sha256_deque(d, c, b, a); 475 sha256_deque(h, g, f, e); 476 #endif 477 478 // Load 16 elements from w out of the loop. 479 // Order of the int values is Endianess specific. 480 VectorRegister w0 = VR17; 481 VectorRegister w1 = VR18; 482 VectorRegister w2 = VR19; 483 VectorRegister w3 = VR20; 484 static const VectorRegister ws[] = {w0, w1, w2, w3}; 485 static const int total_ws = sizeof(ws)/sizeof(VectorRegister); 486 487 VectorRegister kpw0 = VR24; 488 VectorRegister kpw1 = VR25; 489 VectorRegister kpw2 = VR26; 490 VectorRegister kpw3 = VR27; 491 static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3}; 492 static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister); 493 494 sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws); 495 496 // Cycle through the first 16 elements 497 assert(total_ws == total_kpws, "Redesign the loop below"); 498 for (int n = 0; n < total_ws; n++) { 499 VectorRegister vaux0 = VR21; 500 VectorRegister vaux1 = VR22; 501 VectorRegister vaux2 = VR23; 502 503 sha256_deque(kpws[n], vaux0, vaux1, vaux2); 504 505 #if defined(VM_LITTLE_ENDIAN) 506 sha256_round(hs, total_hs, h_cnt, kpws[n]); 507 sha256_round(hs, total_hs, h_cnt, vaux0); 508 sha256_round(hs, total_hs, h_cnt, vaux1); 509 sha256_round(hs, total_hs, h_cnt, vaux2); 510 #else 511 sha256_round(hs, total_hs, h_cnt, vaux2); 512 sha256_round(hs, total_hs, h_cnt, vaux1); 513 sha256_round(hs, total_hs, h_cnt, vaux0); 514 sha256_round(hs, total_hs, h_cnt, kpws[n]); 515 #endif 516 } 517 518 Register tmp = R8; 519 // loop the 16th to the 64th iteration 
by 8 steps 520 li (tmp, (w_size - 16) / total_hs); 521 mtctr(tmp); 522 523 // j will be aligned to 4 for loading words. 524 // Whenever read, advance the pointer (e.g: when j is used in a function) 525 Register j = R8; 526 li (j, 16*4); 527 528 align(OptoLoopAlignment); 529 bind(core_loop); 530 531 // due to VectorRegister rotate, always iterate in multiples of total_hs 532 for (int n = 0; n < total_hs/4; n++) { 533 sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k); 534 sha256_round(hs, total_hs, h_cnt, kpw0); 535 sha256_round(hs, total_hs, h_cnt, kpw1); 536 sha256_round(hs, total_hs, h_cnt, kpw2); 537 sha256_round(hs, total_hs, h_cnt, kpw3); 538 } 539 540 bdnz (core_loop); 541 542 // Update hash state 543 sha256_update_sha_state(a, b, c, d, e, f, g, h, state); 544 545 if (multi_block) { 546 // process next 1024 bit block (buf_in already updated) 547 addi(ofs, ofs, buf_size); 548 cmpd(CCR0, ofs, limit); 549 blt(CCR0, sha_loop); 550 551 // return ofs 552 mr(R3_ARG1, ofs); 553 } 554 555 // Restore non-volatile registers 556 for (int c = 0; c < nv_size; c++) { 557 Register tmp = R8; 558 li (tmp, (c - (nv_size)) * 16); 559 lvx(nv[c], tmp, R1); 560 } 561 } 562 563 564 /********************************************************************** 565 * SHA 512 566 *********************************************************************/ 567 568 void MacroAssembler::sha512_load_w_vec(const Register buf_in, 569 const VectorRegister* ws, 570 const int total_ws) { 571 Register tmp = R8; 572 VectorRegister vRb = VR8; 573 VectorRegister aux = VR9; 574 Label is_aligned, after_alignment; 575 576 andi_ (tmp, buf_in, 0xF); 577 beq (CCR0, is_aligned); // address ends with 0x0, not 0x8 578 579 // deal with unaligned addresses 580 lvx (ws[0], buf_in); 581 addi (buf_in, buf_in, 16); 582 lvsl (vRb, buf_in); 583 584 for (int n = 1; n < total_ws; n++) { 585 VectorRegister w_cur = ws[n]; 586 VectorRegister w_prev = ws[n-1]; 587 588 lvx (w_cur, buf_in); 589 addi (buf_in, buf_in, 16); 590 
vec_perm(w_prev, w_cur, vRb); 591 } 592 593 lvx (aux, buf_in); 594 vec_perm(ws[total_ws-1], aux, vRb); 595 596 b (after_alignment); 597 598 bind(is_aligned); 599 600 for (int n = 0; n < total_ws; n++) { 601 VectorRegister w = ws[n]; 602 603 lvx (w, buf_in); 604 addi (buf_in, buf_in, 16); 605 } 606 607 bind(after_alignment); 608 } 609 610 // Update hash state 611 void MacroAssembler::sha512_update_sha_state(const Register state, 612 const VectorRegister* hs, 613 const int total_hs) { 614 615 #if defined(VM_LITTLE_ENDIAN) 616 int start_idx = 0; 617 #else 618 int start_idx = 1; 619 #endif 620 621 // load initial hash from the memory pointed by state 622 VectorRegister ini_a = VR10; 623 VectorRegister ini_c = VR12; 624 VectorRegister ini_e = VR14; 625 VectorRegister ini_g = VR16; 626 static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g}; 627 static const int total_inis = sizeof(inis)/sizeof(VectorRegister); 628 629 Label state_save_aligned, after_state_save_aligned; 630 631 Register addr = R7; 632 Register tmp = R8; 633 VectorRegister vRb = VR8; 634 VectorRegister aux = VR9; 635 636 andi_(tmp, state, 0xf); 637 beq(CCR0, state_save_aligned); 638 // deal with unaligned addresses 639 640 { 641 VectorRegister a = hs[0]; 642 VectorRegister b_ = hs[1]; 643 VectorRegister c = hs[2]; 644 VectorRegister d = hs[3]; 645 VectorRegister e = hs[4]; 646 VectorRegister f = hs[5]; 647 VectorRegister g = hs[6]; 648 VectorRegister h = hs[7]; 649 lvsr (vRb, state); 650 lvx (ini_a, state); 651 addi (addr, state, 16); 652 653 lvx (ini_c, addr); 654 addi (addr, addr, 16); 655 vec_perm(ini_a, ini_c, vRb); 656 657 lvx (ini_e, addr); 658 addi (addr, addr, 16); 659 vec_perm(ini_c, ini_e, vRb); 660 661 lvx (ini_g, addr); 662 addi (addr, addr, 16); 663 vec_perm(ini_e, ini_g, vRb); 664 665 lvx (aux, addr); 666 vec_perm(ini_g, aux, vRb); 667 668 #if defined(VM_LITTLE_ENDIAN) 669 xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr()); 670 xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr()); 671 
xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr()); 672 xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr()); 673 #else 674 xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr()); 675 xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr()); 676 xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr()); 677 xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr()); 678 #endif 679 680 for (int n = start_idx; n < total_hs; n += 2) { 681 VectorRegister h_cur = hs[n]; 682 VectorRegister ini_cur = inis[n/2]; 683 684 vaddudm(h_cur, ini_cur, h_cur); 685 } 686 687 for (int n = start_idx; n < total_hs; n += 2) { 688 VectorRegister h_cur = hs[n]; 689 690 mfvrd (tmp, h_cur); 691 #if defined(VM_LITTLE_ENDIAN) 692 std (tmp, 8*n + 8, state); 693 #else 694 std (tmp, 8*n - 8, state); 695 #endif 696 vsldoi (aux, h_cur, h_cur, 8); 697 mfvrd (tmp, aux); 698 std (tmp, 8*n + 0, state); 699 } 700 701 b (after_state_save_aligned); 702 } 703 704 bind(state_save_aligned); 705 { 706 mr(addr, state); 707 for (int n = 0; n < total_hs; n += 2) { 708 #if defined(VM_LITTLE_ENDIAN) 709 VectorRegister h_cur = hs[n]; 710 VectorRegister h_next = hs[n+1]; 711 #else 712 VectorRegister h_cur = hs[n+1]; 713 VectorRegister h_next = hs[n]; 714 #endif 715 VectorRegister ini_cur = inis[n/2]; 716 717 lvx(ini_cur, addr); 718 addi(addr, addr, 16); 719 xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr()); 720 } 721 722 for (int n = start_idx; n < total_hs; n += 2) { 723 VectorRegister h_cur = hs[n]; 724 VectorRegister ini_cur = inis[n/2]; 725 726 vaddudm(h_cur, ini_cur, h_cur); 727 } 728 729 mr(addr, state); 730 for (int n = start_idx; n < total_hs; n += 2) { 731 VectorRegister h_cur = hs[n]; 732 733 stvx(h_cur, addr); 734 addi(addr, addr, 16); 735 } 736 } 737 738 bind(after_state_save_aligned); 739 } 740 741 // Use h_cnt to cycle through hs elements but also increment it at the end 742 void MacroAssembler::sha512_round(const VectorRegister* hs, 743 const int total_hs, int& h_cnt, 744 const VectorRegister kpw) { 745 746 // convenience registers: cycle 
from 0-7 downwards 747 const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs]; 748 const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs]; 749 const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs]; 750 const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs]; 751 const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs]; 752 const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs]; 753 const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs]; 754 const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs]; 755 // temporaries 756 const VectorRegister Ch = VR20; 757 const VectorRegister Maj = VR21; 758 const VectorRegister bsa = VR22; 759 const VectorRegister bse = VR23; 760 const VectorRegister tmp1 = VR24; 761 const VectorRegister tmp2 = VR25; 762 763 vsel (Ch, g, f, e); 764 vxor (Maj, a, b); 765 vshasigmad(bse, e, 1, 0xf); 766 vaddudm (tmp2, Ch, kpw); 767 vaddudm (tmp1, h, bse); 768 vsel (Maj, b, c, Maj); 769 vaddudm (tmp1, tmp1, tmp2); 770 vshasigmad(bsa, a, 1, 0); 771 vaddudm (tmp2, bsa, Maj); 772 vaddudm (d, d, tmp1); 773 vaddudm (h, tmp1, tmp2); 774 775 // advance vector pointer to the next iteration 776 h_cnt++; 777 } 778 779 void MacroAssembler::sha512_calc_2w(const VectorRegister w0, 780 const VectorRegister w1, 781 const VectorRegister w2, 782 const VectorRegister w3, 783 const VectorRegister w4, 784 const VectorRegister w5, 785 const VectorRegister w6, 786 const VectorRegister w7, 787 const VectorRegister kpw0, 788 const VectorRegister kpw1, 789 const Register j, 790 const VectorRegister vRb, 791 const Register k) { 792 // Temporaries 793 const VectorRegister VR_a = VR20; 794 const VectorRegister VR_b = VR21; 795 const VectorRegister VR_c = VR22; 796 const VectorRegister VR_d = VR23; 797 798 // load to k[j] 799 lvx (VR_a, j, k); 800 // advance j 801 addi (j, j, 16); // 16 bytes were read 802 803 #if 
defined(VM_LITTLE_ENDIAN) 804 // v6 = w[j-15], w[j-14] 805 vperm (VR_b, w1, w0, vRb); 806 // v12 = w[j-7], w[j-6] 807 vperm (VR_c, w5, w4, vRb); 808 #else 809 // v6 = w[j-15], w[j-14] 810 vperm (VR_b, w0, w1, vRb); 811 // v12 = w[j-7], w[j-6] 812 vperm (VR_c, w4, w5, vRb); 813 #endif 814 815 // v6 = s0(w[j-15]) , s0(w[j-14]) 816 vshasigmad (VR_b, VR_b, 0, 0); 817 // v5 = s1(w[j-2]) , s1(w[j-1]) 818 vshasigmad (VR_d, w7, 0, 0xf); 819 // v6 = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6] 820 vaddudm (VR_b, VR_b, VR_c); 821 // v8 = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15] 822 vaddudm (VR_d, VR_d, w0); 823 // v9 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j] 824 // s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1] 825 vaddudm (VR_c, VR_d, VR_b); 826 // Updating w0 to w7 to hold the new previous 16 values from w. 827 vmr (w0, w1); 828 vmr (w1, w2); 829 vmr (w2, w3); 830 vmr (w3, w4); 831 vmr (w4, w5); 832 vmr (w5, w6); 833 vmr (w6, w7); 834 vmr (w7, VR_c); 835 836 #if defined(VM_LITTLE_ENDIAN) 837 // store k + w to kpw0 (2 values at once) 838 vaddudm (kpw0, VR_c, VR_a); 839 // kpw1 holds (k + w)[1] 840 vsldoi (kpw1, kpw0, kpw0, 8); 841 #else 842 // store k + w to kpw0 (2 values at once) 843 vaddudm (kpw1, VR_c, VR_a); 844 // kpw1 holds (k + w)[1] 845 vsldoi (kpw0, kpw1, kpw1, 8); 846 #endif 847 } 848 849 void MacroAssembler::sha512_load_h_vec(const Register state, 850 const VectorRegister* hs, 851 const int total_hs) { 852 #if defined(VM_LITTLE_ENDIAN) 853 VectorRegister a = hs[0]; 854 VectorRegister g = hs[6]; 855 int start_idx = 0; 856 #else 857 VectorRegister a = hs[1]; 858 VectorRegister g = hs[7]; 859 int start_idx = 1; 860 #endif 861 862 Register addr = R7; 863 VectorRegister vRb = VR8; 864 Register tmp = R8; 865 Label state_aligned, after_state_aligned; 866 867 andi_(tmp, state, 0xf); 868 beq(CCR0, state_aligned); 869 870 // deal with unaligned addresses 871 VectorRegister aux = VR9; 872 873 lvx (a, state); 874 addi (addr, state, 16); 875 lvsl (vRb, 
addr); 876 877 for (int n = start_idx + 2; n < total_hs; n += 2) { 878 VectorRegister h_cur = hs[n]; 879 VectorRegister h_prev2 = hs[n - 2]; 880 881 lvx (h_cur, addr); 882 addi (addr, addr, 16); 883 vec_perm(h_prev2, h_cur, vRb); 884 } 885 lvx (aux, addr); 886 vec_perm(g, aux, vRb); 887 888 b (after_state_aligned); 889 890 bind(state_aligned); 891 892 // deal with aligned addresses 893 mr(addr, state); 894 for (int n = start_idx; n < total_hs; n += 2) { 895 VectorRegister h_cur = hs[n]; 896 897 lvx (h_cur, addr); 898 addi (addr, addr, 16); 899 } 900 901 bind(after_state_aligned); 902 } 903 904 // R3_ARG1 - byte[] Input string with padding but in Big Endian 905 // R4_ARG2 - int[] SHA.state (at first, the root of primes) 906 // R5_ARG3 - int offset 907 // R6_ARG4 - int limit 908 // 909 // Internal Register usage: 910 // R7 R8 R9 - volatile temporaries 911 // VR0-VR7 - a-h 912 // VR8 - vRb 913 // VR9 - aux (highly volatile, use with care) 914 // VR10-VR17 - w0-w7 | ini_a-ini_h 915 // VR18 - vsp16 | kplusw0 916 // VR19 - vsp32 | kplusw1 917 // VR20-VR25 - sha512_calc_2w and sha512_round temporaries 918 void MacroAssembler::sha512(bool multi_block) { 919 static const ssize_t base_size = sizeof(uint64_t); 920 static const ssize_t buf_size = 128; 921 static uint64_t waux[buf_size / base_size] __attribute((aligned (16))); 922 static const uint64_t round_consts[80] __attribute((aligned (16))) = { 923 0x428a2f98d728ae22, 0x7137449123ef65cd, 924 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, 925 0x3956c25bf348b538, 0x59f111f1b605d019, 926 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 927 0xd807aa98a3030242, 0x12835b0145706fbe, 928 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, 929 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 930 0x9bdc06a725c71235, 0xc19bf174cf692694, 931 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 932 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 933 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 934 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, 935 0x983e5152ee66dfab, 0xa831c66d2db43210, 936 
0xb00327c898fb213f, 0xbf597fc7beef0ee4, 937 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 938 0x06ca6351e003826f, 0x142929670a0e6e70, 939 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 940 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, 941 0x650a73548baf63de, 0x766a0abb3c77b2a8, 942 0x81c2c92e47edaee6, 0x92722c851482353b, 943 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 944 0xc24b8b70d0f89791, 0xc76c51a30654be30, 945 0xd192e819d6ef5218, 0xd69906245565a910, 946 0xf40e35855771202a, 0x106aa07032bbd1b8, 947 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 948 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, 949 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 950 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 951 0x748f82ee5defb2fc, 0x78a5636f43172f60, 952 0x84c87814a1f0ab72, 0x8cc702081a6439ec, 953 0x90befffa23631e28, 0xa4506cebde82bde9, 954 0xbef9a3f7b2c67915, 0xc67178f2e372532b, 955 0xca273eceea26619c, 0xd186b8c721c0c207, 956 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 957 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 958 0x113f9804bef90dae, 0x1b710b35131c471b, 959 0x28db77f523047d84, 0x32caab7b40c72493, 960 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, 961 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 962 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, 963 }; 964 static const uint8_t w_size = sizeof(round_consts)/sizeof(uint64_t); 965 966 Register buf_in = R3_ARG1; 967 Register state = R4_ARG2; 968 Register ofs = R5_ARG3; 969 Register limit = R6_ARG4; 970 971 Label sha_loop, bsw_loop, core_loop; 972 973 // Save non-volatile vector registers in the red zone 974 static const VectorRegister nv[] = { 975 VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/ 976 }; 977 static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister); 978 979 for (int c = 0; c < nv_size; c++) { 980 Register idx = R7; 981 li (idx, (c - (nv_size)) * 16); 982 stvx(nv[c], idx, R1); 983 } 984 985 // Load hash state to registers 986 VectorRegister a = VR0; 987 VectorRegister b = VR1; 988 VectorRegister c = VR2; 989 VectorRegister d = VR3; 990 VectorRegister 
e = VR4; 991 VectorRegister f = VR5; 992 VectorRegister g = VR6; 993 VectorRegister h = VR7; 994 static const VectorRegister hs[] = {a, b, c, d, e, f, g, h}; 995 static const int total_hs = sizeof(hs)/sizeof(VectorRegister); 996 // counter for cycling through hs vector to avoid register moves between iterations 997 int h_cnt = 0; 998 999 // Load a-h registers from the memory pointed by state 1000 sha512_load_h_vec(state, hs, total_hs); 1001 1002 if (multi_block) { 1003 align(OptoLoopAlignment); 1004 } 1005 bind(sha_loop); 1006 1007 for (int n = 0; n < total_hs; n += 2) { 1008 #if defined(VM_LITTLE_ENDIAN) 1009 VectorRegister h_cur = hs[n]; 1010 VectorRegister h_next = hs[n + 1]; 1011 #else 1012 VectorRegister h_cur = hs[n + 1]; 1013 VectorRegister h_next = hs[n]; 1014 #endif 1015 vsldoi (h_next, h_cur, h_cur, 8); 1016 } 1017 1018 Register k = R9; 1019 load_const_optimized(k, const_cast<uint64_t *>(round_consts), R0); 1020 1021 // Load 16 elements from w out of the loop. 1022 // Order of the long values is Endianess specific. 
1023 VectorRegister w0 = VR10; 1024 VectorRegister w1 = VR11; 1025 VectorRegister w2 = VR12; 1026 VectorRegister w3 = VR13; 1027 VectorRegister w4 = VR14; 1028 VectorRegister w5 = VR15; 1029 VectorRegister w6 = VR16; 1030 VectorRegister w7 = VR17; 1031 static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7}; 1032 static const int total_ws = sizeof(ws)/sizeof(VectorRegister); 1033 1034 // Load 16 w into vectors and setup vsl for vperm 1035 sha512_load_w_vec(buf_in, ws, total_ws); 1036 1037 #if defined(VM_LITTLE_ENDIAN) 1038 VectorRegister vsp16 = VR18; 1039 VectorRegister vsp32 = VR19; 1040 VectorRegister shiftarg = VR9; 1041 1042 vspltisw(vsp16, 8); 1043 vspltisw(shiftarg, 1); 1044 vsl (vsp16, vsp16, shiftarg); 1045 vsl (vsp32, vsp16, shiftarg); 1046 1047 VectorRegister vsp8 = VR9; 1048 vspltish(vsp8, 8); 1049 1050 // Convert input from Big Endian to Little Endian 1051 for (int c = 0; c < total_ws; c++) { 1052 VectorRegister w = ws[c]; 1053 vrlh (w, w, vsp8); 1054 } 1055 for (int c = 0; c < total_ws; c++) { 1056 VectorRegister w = ws[c]; 1057 vrlw (w, w, vsp16); 1058 } 1059 for (int c = 0; c < total_ws; c++) { 1060 VectorRegister w = ws[c]; 1061 vrld (w, w, vsp32); 1062 } 1063 #endif 1064 1065 Register Rb = R10; 1066 VectorRegister vRb = VR8; 1067 li (Rb, 8); 1068 lvsl (vRb, Rb); 1069 1070 VectorRegister kplusw0 = VR18; 1071 VectorRegister kplusw1 = VR19; 1072 1073 Register addr = R7; 1074 mr (addr, k); 1075 1076 for (int n = 0; n < total_ws; n++) { 1077 VectorRegister w = ws[n]; 1078 1079 lvx (kplusw0, addr); 1080 addi (addr, addr, 16); 1081 #if defined(VM_LITTLE_ENDIAN) 1082 vaddudm(kplusw0, kplusw0, w); 1083 vsldoi (kplusw1, kplusw0, kplusw0, 8); 1084 #else 1085 vaddudm(kplusw1, kplusw0, w); 1086 vsldoi (kplusw0, kplusw1, kplusw1, 8); 1087 #endif 1088 1089 sha512_round(hs, total_hs, h_cnt, kplusw0); 1090 sha512_round(hs, total_hs, h_cnt, kplusw1); 1091 } 1092 1093 Register tmp = R8; 1094 li (tmp, (w_size-16)/total_hs); 1095 mtctr (tmp); 1096 // j 
will be aligned to 4 for loading words. 1097 // Whenever read, advance the pointer (e.g: when j is used in a function) 1098 Register j = tmp; 1099 li (j, 8*16); 1100 1101 align(OptoLoopAlignment); 1102 bind(core_loop); 1103 1104 // due to VectorRegister rotate, always iterate in multiples of total_hs 1105 for (int n = 0; n < total_hs/2; n++) { 1106 sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k); 1107 sha512_round(hs, total_hs, h_cnt, kplusw0); 1108 sha512_round(hs, total_hs, h_cnt, kplusw1); 1109 } 1110 1111 bdnz (core_loop); 1112 1113 sha512_update_sha_state(state, hs, total_hs); 1114 1115 if (multi_block) { 1116 // process next 1024 bit block (buf_in already updated) 1117 addi(ofs, ofs, buf_size); 1118 cmpd(CCR0, ofs, limit); 1119 blt(CCR0, sha_loop); 1120 1121 // return ofs 1122 mr(R3_ARG1, ofs); 1123 } 1124 1125 // Restore non-volatile registers 1126 for (int c = 0; c < nv_size; c++) { 1127 Register idx = R7; 1128 li (idx, (c - (nv_size)) * 16); 1129 lvx(nv[c], idx, R1); 1130 } 1131 }