1 /*
   2 * Copyright (c) 2016, Intel Corporation.
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This code is free software; you can redistribute it and/or modify it
   7 * under the terms of the GNU General Public License version 2 only, as
   8 * published by the Free Software Foundation.
   9 *
  10 * This code is distributed in the hope that it will be useful, but WITHOUT
  11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13 * version 2 for more details (a copy is included in the LICENSE file that
  14 * accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License version
  17 * 2 along with this work; if not, write to the Free Software Foundation,
  18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19 *
  20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21 * or visit www.oracle.com if you need additional information or have any
  22 * questions.
  23 *
  24 */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "macroAssembler_x86.hpp"
  30 
  31 // ofs and limit are used for multi-block byte array.
  32 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  33 void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
  34   XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
  35   Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {
  36 
  37   Label start, done_hash, loop0;
  38 
  39   address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
  40   address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();
  41 
  42   bind(start);
  43   movdqu(abcd, Address(state, 0));
  44   pinsrd(e0, Address(state, 16), 3);
  45   movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
  46   pand(e0, shuf_mask);
  47   pshufd(abcd, abcd, 0x1B);
  48   movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f
  49 
  50   bind(loop0);
  51   // Save hash values for addition after rounds
  52   movdqu(Address(rsp, 0), e0);
  53   movdqu(Address(rsp, 16), abcd);
  54 
  55 
  56   // Rounds 0 - 3
  57   movdqu(msg0, Address(buf, 0));
  58   pshufb(msg0, shuf_mask);
  59   paddd(e0, msg0);
  60   movdqa(e1, abcd);
  61   sha1rnds4(abcd, e0, 0);
  62 
  63   // Rounds 4 - 7
  64   movdqu(msg1, Address(buf, 16));
  65   pshufb(msg1, shuf_mask);
  66   sha1nexte(e1, msg1);
  67   movdqa(e0, abcd);
  68   sha1rnds4(abcd, e1, 0);
  69   sha1msg1(msg0, msg1);
  70 
  71   // Rounds 8 - 11
  72   movdqu(msg2, Address(buf, 32));
  73   pshufb(msg2, shuf_mask);
  74   sha1nexte(e0, msg2);
  75   movdqa(e1, abcd);
  76   sha1rnds4(abcd, e0, 0);
  77   sha1msg1(msg1, msg2);
  78   pxor(msg0, msg2);
  79 
  80   // Rounds 12 - 15
  81   movdqu(msg3, Address(buf, 48));
  82   pshufb(msg3, shuf_mask);
  83   sha1nexte(e1, msg3);
  84   movdqa(e0, abcd);
  85   sha1msg2(msg0, msg3);
  86   sha1rnds4(abcd, e1, 0);
  87   sha1msg1(msg2, msg3);
  88   pxor(msg1, msg3);
  89 
  90   // Rounds 16 - 19
  91   sha1nexte(e0, msg0);
  92   movdqa(e1, abcd);
  93   sha1msg2(msg1, msg0);
  94   sha1rnds4(abcd, e0, 0);
  95   sha1msg1(msg3, msg0);
  96   pxor(msg2, msg0);
  97 
  98   // Rounds 20 - 23
  99   sha1nexte(e1, msg1);
 100   movdqa(e0, abcd);
 101   sha1msg2(msg2, msg1);
 102   sha1rnds4(abcd, e1, 1);
 103   sha1msg1(msg0, msg1);
 104   pxor(msg3, msg1);
 105 
 106   // Rounds 24 - 27
 107   sha1nexte(e0, msg2);
 108   movdqa(e1, abcd);
 109   sha1msg2(msg3, msg2);
 110   sha1rnds4(abcd, e0, 1);
 111   sha1msg1(msg1, msg2);
 112   pxor(msg0, msg2);
 113 
 114   // Rounds 28 - 31
 115   sha1nexte(e1, msg3);
 116   movdqa(e0, abcd);
 117   sha1msg2(msg0, msg3);
 118   sha1rnds4(abcd, e1, 1);
 119   sha1msg1(msg2, msg3);
 120   pxor(msg1, msg3);
 121 
 122   // Rounds 32 - 35
 123   sha1nexte(e0, msg0);
 124   movdqa(e1, abcd);
 125   sha1msg2(msg1, msg0);
 126   sha1rnds4(abcd, e0, 1);
 127   sha1msg1(msg3, msg0);
 128   pxor(msg2, msg0);
 129 
 130   // Rounds 36 - 39
 131   sha1nexte(e1, msg1);
 132   movdqa(e0, abcd);
 133   sha1msg2(msg2, msg1);
 134   sha1rnds4(abcd, e1, 1);
 135   sha1msg1(msg0, msg1);
 136   pxor(msg3, msg1);
 137 
 138   // Rounds 40 - 43
 139   sha1nexte(e0, msg2);
 140   movdqa(e1, abcd);
 141   sha1msg2(msg3, msg2);
 142   sha1rnds4(abcd, e0, 2);
 143   sha1msg1(msg1, msg2);
 144   pxor(msg0, msg2);
 145 
 146   // Rounds 44 - 47
 147   sha1nexte(e1, msg3);
 148   movdqa(e0, abcd);
 149   sha1msg2(msg0, msg3);
 150   sha1rnds4(abcd, e1, 2);
 151   sha1msg1(msg2, msg3);
 152   pxor(msg1, msg3);
 153 
 154   // Rounds 48 - 51
 155   sha1nexte(e0, msg0);
 156   movdqa(e1, abcd);
 157   sha1msg2(msg1, msg0);
 158   sha1rnds4(abcd, e0, 2);
 159   sha1msg1(msg3, msg0);
 160   pxor(msg2, msg0);
 161 
 162   // Rounds 52 - 55
 163   sha1nexte(e1, msg1);
 164   movdqa(e0, abcd);
 165   sha1msg2(msg2, msg1);
 166   sha1rnds4(abcd, e1, 2);
 167   sha1msg1(msg0, msg1);
 168   pxor(msg3, msg1);
 169 
 170   // Rounds 56 - 59
 171   sha1nexte(e0, msg2);
 172   movdqa(e1, abcd);
 173   sha1msg2(msg3, msg2);
 174   sha1rnds4(abcd, e0, 2);
 175   sha1msg1(msg1, msg2);
 176   pxor(msg0, msg2);
 177 
 178   // Rounds 60 - 63
 179   sha1nexte(e1, msg3);
 180   movdqa(e0, abcd);
 181   sha1msg2(msg0, msg3);
 182   sha1rnds4(abcd, e1, 3);
 183   sha1msg1(msg2, msg3);
 184   pxor(msg1, msg3);
 185 
 186   // Rounds 64 - 67
 187   sha1nexte(e0, msg0);
 188   movdqa(e1, abcd);
 189   sha1msg2(msg1, msg0);
 190   sha1rnds4(abcd, e0, 3);
 191   sha1msg1(msg3, msg0);
 192   pxor(msg2, msg0);
 193 
 194   // Rounds 68 - 71
 195   sha1nexte(e1, msg1);
 196   movdqa(e0, abcd);
 197   sha1msg2(msg2, msg1);
 198   sha1rnds4(abcd, e1, 3);
 199   pxor(msg3, msg1);
 200 
 201   // Rounds 72 - 75
 202   sha1nexte(e0, msg2);
 203   movdqa(e1, abcd);
 204   sha1msg2(msg3, msg2);
 205   sha1rnds4(abcd, e0, 3);
 206 
 207   // Rounds 76 - 79
 208   sha1nexte(e1, msg3);
 209   movdqa(e0, abcd);
 210   sha1rnds4(abcd, e1, 3);
 211 
 212   // add current hash values with previously saved
 213   movdqu(msg0, Address(rsp, 0));
 214   sha1nexte(e0, msg0);
 215   movdqu(msg0, Address(rsp, 16));
 216   paddd(abcd, msg0);
 217 
 218   if (multi_block) {
 219     // increment data pointer and loop if more to process
 220     addptr(buf, 64);
 221     addptr(ofs, 64);
 222     cmpptr(ofs, limit);
 223     jcc(Assembler::belowEqual, loop0);
 224     movptr(rax, ofs); //return ofs
 225   }
 226   // write hash values back in the correct order
 227   pshufd(abcd, abcd, 0x1b);
 228   movdqu(Address(state, 0), abcd);
 229   pextrd(Address(state, 16), e0, 3);
 230 
 231   bind(done_hash);
 232 
 233 }
 234 
 235 // xmm0 (msg) is used as an implicit argument to sh256rnds2
 236 // and state0 and state1 can never use xmm0 register.
 237 // ofs and limit are used for multi-block byte array.
 238 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
 239 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
 240   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
 241   Register buf, Register state, Register ofs, Register limit, Register rsp,
 242   bool multi_block LP64_ONLY(COMMA XMMRegister shuf_mask)) {
 243   Label start, done_hash, loop0;
 244 
 245   address K256 = StubRoutines::x86::k256_addr();
 246   address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
 247 
 248   bind(start);
 249   movdqu(state0, Address(state, 0));
 250   movdqu(state1, Address(state, 16));
 251 
 252   pshufd(state0, state0, 0xB1);
 253   pshufd(state1, state1, 0x1B);
 254   movdqa(msgtmp4, state0);
 255   palignr(state0, state1, 8);
 256   pblendw(state1, msgtmp4, 0xF0);
 257 
 258 #ifdef _LP64
 259   movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
 260 #endif
 261   lea(rax, ExternalAddress(K256));
 262 
 263   bind(loop0);
 264   movdqu(Address(rsp, 0), state0);
 265   movdqu(Address(rsp, 16), state1);
 266 
 267   // Rounds 0-3
 268   movdqu(msg, Address(buf, 0));
 269 #ifdef _LP64
 270   pshufb(msg, shuf_mask);
 271 #else
 272   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
 273 #endif
 274   movdqa(msgtmp0, msg);
 275   paddd(msg, Address(rax, 0));
 276   sha256rnds2(state1, state0);
 277   pshufd(msg, msg, 0x0E);
 278   sha256rnds2(state0, state1);
 279 
 280   // Rounds 4-7
 281   movdqu(msg, Address(buf, 16));
 282 #ifdef _LP64
 283   pshufb(msg, shuf_mask);
 284 #else
 285   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
 286 #endif
 287   movdqa(msgtmp1, msg);
 288   paddd(msg, Address(rax, 16));
 289   sha256rnds2(state1, state0);
 290   pshufd(msg, msg, 0x0E);
 291   sha256rnds2(state0, state1);
 292   sha256msg1(msgtmp0, msgtmp1);
 293 
 294   // Rounds 8-11
 295   movdqu(msg, Address(buf, 32));
 296 #ifdef _LP64
 297   pshufb(msg, shuf_mask);
 298 #else
 299   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
 300 #endif
 301   movdqa(msgtmp2, msg);
 302   paddd(msg, Address(rax, 32));
 303   sha256rnds2(state1, state0);
 304   pshufd(msg, msg, 0x0E);
 305   sha256rnds2(state0, state1);
 306   sha256msg1(msgtmp1, msgtmp2);
 307 
 308   // Rounds 12-15
 309   movdqu(msg, Address(buf, 48));
 310 #ifdef _LP64
 311   pshufb(msg, shuf_mask);
 312 #else
 313   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
 314 #endif
 315   movdqa(msgtmp3, msg);
 316   paddd(msg, Address(rax, 48));
 317   sha256rnds2(state1, state0);
 318   movdqa(msgtmp4, msgtmp3);
 319   palignr(msgtmp4, msgtmp2, 4);
 320   paddd(msgtmp0, msgtmp4);
 321   sha256msg2(msgtmp0, msgtmp3);
 322   pshufd(msg, msg, 0x0E);
 323   sha256rnds2(state0, state1);
 324   sha256msg1(msgtmp2, msgtmp3);
 325 
 326   // Rounds 16-19
 327   movdqa(msg, msgtmp0);
 328   paddd(msg, Address(rax, 64));
 329   sha256rnds2(state1, state0);
 330   movdqa(msgtmp4, msgtmp0);
 331   palignr(msgtmp4, msgtmp3, 4);
 332   paddd(msgtmp1, msgtmp4);
 333   sha256msg2(msgtmp1, msgtmp0);
 334   pshufd(msg, msg, 0x0E);
 335   sha256rnds2(state0, state1);
 336   sha256msg1(msgtmp3, msgtmp0);
 337 
 338   // Rounds 20-23
 339   movdqa(msg, msgtmp1);
 340   paddd(msg, Address(rax, 80));
 341   sha256rnds2(state1, state0);
 342   movdqa(msgtmp4, msgtmp1);
 343   palignr(msgtmp4, msgtmp0, 4);
 344   paddd(msgtmp2, msgtmp4);
 345   sha256msg2(msgtmp2, msgtmp1);
 346   pshufd(msg, msg, 0x0E);
 347   sha256rnds2(state0, state1);
 348   sha256msg1(msgtmp0, msgtmp1);
 349 
 350   // Rounds 24-27
 351   movdqa(msg, msgtmp2);
 352   paddd(msg, Address(rax, 96));
 353   sha256rnds2(state1, state0);
 354   movdqa(msgtmp4, msgtmp2);
 355   palignr(msgtmp4, msgtmp1, 4);
 356   paddd(msgtmp3, msgtmp4);
 357   sha256msg2(msgtmp3, msgtmp2);
 358   pshufd(msg, msg, 0x0E);
 359   sha256rnds2(state0, state1);
 360   sha256msg1(msgtmp1, msgtmp2);
 361 
 362   // Rounds 28-31
 363   movdqa(msg, msgtmp3);
 364   paddd(msg, Address(rax, 112));
 365   sha256rnds2(state1, state0);
 366   movdqa(msgtmp4, msgtmp3);
 367   palignr(msgtmp4, msgtmp2, 4);
 368   paddd(msgtmp0, msgtmp4);
 369   sha256msg2(msgtmp0, msgtmp3);
 370   pshufd(msg, msg, 0x0E);
 371   sha256rnds2(state0, state1);
 372   sha256msg1(msgtmp2, msgtmp3);
 373 
 374   // Rounds 32-35
 375   movdqa(msg, msgtmp0);
 376   paddd(msg, Address(rax, 128));
 377   sha256rnds2(state1, state0);
 378   movdqa(msgtmp4, msgtmp0);
 379   palignr(msgtmp4, msgtmp3, 4);
 380   paddd(msgtmp1, msgtmp4);
 381   sha256msg2(msgtmp1, msgtmp0);
 382   pshufd(msg, msg, 0x0E);
 383   sha256rnds2(state0, state1);
 384   sha256msg1(msgtmp3, msgtmp0);
 385 
 386   // Rounds 36-39
 387   movdqa(msg, msgtmp1);
 388   paddd(msg, Address(rax, 144));
 389   sha256rnds2(state1, state0);
 390   movdqa(msgtmp4, msgtmp1);
 391   palignr(msgtmp4, msgtmp0, 4);
 392   paddd(msgtmp2, msgtmp4);
 393   sha256msg2(msgtmp2, msgtmp1);
 394   pshufd(msg, msg, 0x0E);
 395   sha256rnds2(state0, state1);
 396   sha256msg1(msgtmp0, msgtmp1);
 397 
 398   // Rounds 40-43
 399   movdqa(msg, msgtmp2);
 400   paddd(msg, Address(rax, 160));
 401   sha256rnds2(state1, state0);
 402   movdqa(msgtmp4, msgtmp2);
 403   palignr(msgtmp4, msgtmp1, 4);
 404   paddd(msgtmp3, msgtmp4);
 405   sha256msg2(msgtmp3, msgtmp2);
 406   pshufd(msg, msg, 0x0E);
 407   sha256rnds2(state0, state1);
 408   sha256msg1(msgtmp1, msgtmp2);
 409 
 410   // Rounds 44-47
 411   movdqa(msg, msgtmp3);
 412   paddd(msg, Address(rax, 176));
 413   sha256rnds2(state1, state0);
 414   movdqa(msgtmp4, msgtmp3);
 415   palignr(msgtmp4, msgtmp2, 4);
 416   paddd(msgtmp0, msgtmp4);
 417   sha256msg2(msgtmp0, msgtmp3);
 418   pshufd(msg, msg, 0x0E);
 419   sha256rnds2(state0, state1);
 420   sha256msg1(msgtmp2, msgtmp3);
 421 
 422   // Rounds 48-51
 423   movdqa(msg, msgtmp0);
 424   paddd(msg, Address(rax, 192));
 425   sha256rnds2(state1, state0);
 426   movdqa(msgtmp4, msgtmp0);
 427   palignr(msgtmp4, msgtmp3, 4);
 428   paddd(msgtmp1, msgtmp4);
 429   sha256msg2(msgtmp1, msgtmp0);
 430   pshufd(msg, msg, 0x0E);
 431   sha256rnds2(state0, state1);
 432   sha256msg1(msgtmp3, msgtmp0);
 433 
 434   // Rounds 52-55
 435   movdqa(msg, msgtmp1);
 436   paddd(msg, Address(rax, 208));
 437   sha256rnds2(state1, state0);
 438   movdqa(msgtmp4, msgtmp1);
 439   palignr(msgtmp4, msgtmp0, 4);
 440   paddd(msgtmp2, msgtmp4);
 441   sha256msg2(msgtmp2, msgtmp1);
 442   pshufd(msg, msg, 0x0E);
 443   sha256rnds2(state0, state1);
 444 
 445   // Rounds 56-59
 446   movdqa(msg, msgtmp2);
 447   paddd(msg, Address(rax, 224));
 448   sha256rnds2(state1, state0);
 449   movdqa(msgtmp4, msgtmp2);
 450   palignr(msgtmp4, msgtmp1, 4);
 451   paddd(msgtmp3, msgtmp4);
 452   sha256msg2(msgtmp3, msgtmp2);
 453   pshufd(msg, msg, 0x0E);
 454   sha256rnds2(state0, state1);
 455 
 456   // Rounds 60-63
 457   movdqa(msg, msgtmp3);
 458   paddd(msg, Address(rax, 240));
 459   sha256rnds2(state1, state0);
 460   pshufd(msg, msg, 0x0E);
 461   sha256rnds2(state0, state1);
 462   movdqu(msg, Address(rsp, 0));
 463   paddd(state0, msg);
 464   movdqu(msg, Address(rsp, 16));
 465   paddd(state1, msg);
 466 
 467   if (multi_block) {
 468     // increment data pointer and loop if more to process
 469     addptr(buf, 64);
 470     addptr(ofs, 64);
 471     cmpptr(ofs, limit);
 472     jcc(Assembler::belowEqual, loop0);
 473     movptr(rax, ofs); //return ofs
 474   }
 475 
 476   pshufd(state0, state0, 0x1B);
 477   pshufd(state1, state1, 0xB1);
 478   movdqa(msgtmp4, state0);
 479   pblendw(state0, state1, 0xF0);
 480   palignr(state1, msgtmp4, 8);
 481 
 482   movdqu(Address(state, 0), state0);
 483   movdqu(Address(state, 16), state1);
 484 
 485   bind(done_hash);
 486 
 487 }