1 /*
   2 * Copyright (c) 2016, Intel Corporation.
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This code is free software; you can redistribute it and/or modify it
   7 * under the terms of the GNU General Public License version 2 only, as
   8 * published by the Free Software Foundation.
   9 *
  10 * This code is distributed in the hope that it will be useful, but WITHOUT
  11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13 * version 2 for more details (a copy is included in the LICENSE file that
  14 * accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License version
  17 * 2 along with this work; if not, write to the Free Software Foundation,
  18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19 *
  20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21 * or visit www.oracle.com if you need additional information or have any
  22 * questions.
  23 *
  24 */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "runtime/stubRoutines.hpp"
  30 #include "macroAssembler_x86.hpp"
  31 
  32 // ofs and limit are used for multi-block byte array.
  33 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  34 void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
  35   XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
  36   Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {
  37 
  38   Label start, done_hash, loop0;
  39 
  40   address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
  41   address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();
  42 
  43   bind(start);
  44   movdqu(abcd, Address(state, 0));
  45   pinsrd(e0, Address(state, 16), 3);
  46   movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
  47   pand(e0, shuf_mask);
  48   pshufd(abcd, abcd, 0x1B);
  49   movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f
  50 
  51   bind(loop0);
  52   // Save hash values for addition after rounds
  53   movdqu(Address(rsp, 0), e0);
  54   movdqu(Address(rsp, 16), abcd);
  55 
  56 
  57   // Rounds 0 - 3
  58   movdqu(msg0, Address(buf, 0));
  59   pshufb(msg0, shuf_mask);
  60   paddd(e0, msg0);
  61   movdqa(e1, abcd);
  62   sha1rnds4(abcd, e0, 0);
  63 
  64   // Rounds 4 - 7
  65   movdqu(msg1, Address(buf, 16));
  66   pshufb(msg1, shuf_mask);
  67   sha1nexte(e1, msg1);
  68   movdqa(e0, abcd);
  69   sha1rnds4(abcd, e1, 0);
  70   sha1msg1(msg0, msg1);
  71 
  72   // Rounds 8 - 11
  73   movdqu(msg2, Address(buf, 32));
  74   pshufb(msg2, shuf_mask);
  75   sha1nexte(e0, msg2);
  76   movdqa(e1, abcd);
  77   sha1rnds4(abcd, e0, 0);
  78   sha1msg1(msg1, msg2);
  79   pxor(msg0, msg2);
  80 
  81   // Rounds 12 - 15
  82   movdqu(msg3, Address(buf, 48));
  83   pshufb(msg3, shuf_mask);
  84   sha1nexte(e1, msg3);
  85   movdqa(e0, abcd);
  86   sha1msg2(msg0, msg3);
  87   sha1rnds4(abcd, e1, 0);
  88   sha1msg1(msg2, msg3);
  89   pxor(msg1, msg3);
  90 
  91   // Rounds 16 - 19
  92   sha1nexte(e0, msg0);
  93   movdqa(e1, abcd);
  94   sha1msg2(msg1, msg0);
  95   sha1rnds4(abcd, e0, 0);
  96   sha1msg1(msg3, msg0);
  97   pxor(msg2, msg0);
  98 
  99   // Rounds 20 - 23
 100   sha1nexte(e1, msg1);
 101   movdqa(e0, abcd);
 102   sha1msg2(msg2, msg1);
 103   sha1rnds4(abcd, e1, 1);
 104   sha1msg1(msg0, msg1);
 105   pxor(msg3, msg1);
 106 
 107   // Rounds 24 - 27
 108   sha1nexte(e0, msg2);
 109   movdqa(e1, abcd);
 110   sha1msg2(msg3, msg2);
 111   sha1rnds4(abcd, e0, 1);
 112   sha1msg1(msg1, msg2);
 113   pxor(msg0, msg2);
 114 
 115   // Rounds 28 - 31
 116   sha1nexte(e1, msg3);
 117   movdqa(e0, abcd);
 118   sha1msg2(msg0, msg3);
 119   sha1rnds4(abcd, e1, 1);
 120   sha1msg1(msg2, msg3);
 121   pxor(msg1, msg3);
 122 
 123   // Rounds 32 - 35
 124   sha1nexte(e0, msg0);
 125   movdqa(e1, abcd);
 126   sha1msg2(msg1, msg0);
 127   sha1rnds4(abcd, e0, 1);
 128   sha1msg1(msg3, msg0);
 129   pxor(msg2, msg0);
 130 
 131   // Rounds 36 - 39
 132   sha1nexte(e1, msg1);
 133   movdqa(e0, abcd);
 134   sha1msg2(msg2, msg1);
 135   sha1rnds4(abcd, e1, 1);
 136   sha1msg1(msg0, msg1);
 137   pxor(msg3, msg1);
 138 
 139   // Rounds 40 - 43
 140   sha1nexte(e0, msg2);
 141   movdqa(e1, abcd);
 142   sha1msg2(msg3, msg2);
 143   sha1rnds4(abcd, e0, 2);
 144   sha1msg1(msg1, msg2);
 145   pxor(msg0, msg2);
 146 
 147   // Rounds 44 - 47
 148   sha1nexte(e1, msg3);
 149   movdqa(e0, abcd);
 150   sha1msg2(msg0, msg3);
 151   sha1rnds4(abcd, e1, 2);
 152   sha1msg1(msg2, msg3);
 153   pxor(msg1, msg3);
 154 
 155   // Rounds 48 - 51
 156   sha1nexte(e0, msg0);
 157   movdqa(e1, abcd);
 158   sha1msg2(msg1, msg0);
 159   sha1rnds4(abcd, e0, 2);
 160   sha1msg1(msg3, msg0);
 161   pxor(msg2, msg0);
 162 
 163   // Rounds 52 - 55
 164   sha1nexte(e1, msg1);
 165   movdqa(e0, abcd);
 166   sha1msg2(msg2, msg1);
 167   sha1rnds4(abcd, e1, 2);
 168   sha1msg1(msg0, msg1);
 169   pxor(msg3, msg1);
 170 
 171   // Rounds 56 - 59
 172   sha1nexte(e0, msg2);
 173   movdqa(e1, abcd);
 174   sha1msg2(msg3, msg2);
 175   sha1rnds4(abcd, e0, 2);
 176   sha1msg1(msg1, msg2);
 177   pxor(msg0, msg2);
 178 
 179   // Rounds 60 - 63
 180   sha1nexte(e1, msg3);
 181   movdqa(e0, abcd);
 182   sha1msg2(msg0, msg3);
 183   sha1rnds4(abcd, e1, 3);
 184   sha1msg1(msg2, msg3);
 185   pxor(msg1, msg3);
 186 
 187   // Rounds 64 - 67
 188   sha1nexte(e0, msg0);
 189   movdqa(e1, abcd);
 190   sha1msg2(msg1, msg0);
 191   sha1rnds4(abcd, e0, 3);
 192   sha1msg1(msg3, msg0);
 193   pxor(msg2, msg0);
 194 
 195   // Rounds 68 - 71
 196   sha1nexte(e1, msg1);
 197   movdqa(e0, abcd);
 198   sha1msg2(msg2, msg1);
 199   sha1rnds4(abcd, e1, 3);
 200   pxor(msg3, msg1);
 201 
 202   // Rounds 72 - 75
 203   sha1nexte(e0, msg2);
 204   movdqa(e1, abcd);
 205   sha1msg2(msg3, msg2);
 206   sha1rnds4(abcd, e0, 3);
 207 
 208   // Rounds 76 - 79
 209   sha1nexte(e1, msg3);
 210   movdqa(e0, abcd);
 211   sha1rnds4(abcd, e1, 3);
 212 
 213   // add current hash values with previously saved
 214   movdqu(msg0, Address(rsp, 0));
 215   sha1nexte(e0, msg0);
 216   movdqu(msg0, Address(rsp, 16));
 217   paddd(abcd, msg0);
 218 
 219   if (multi_block) {
 220     // increment data pointer and loop if more to process
 221     addptr(buf, 64);
 222     addptr(ofs, 64);
 223     cmpptr(ofs, limit);
 224     jcc(Assembler::belowEqual, loop0);
 225     movptr(rax, ofs); //return ofs
 226   }
 227   // write hash values back in the correct order
 228   pshufd(abcd, abcd, 0x1b);
 229   movdqu(Address(state, 0), abcd);
 230   pextrd(Address(state, 16), e0, 3);
 231 
 232   bind(done_hash);
 233 
 234 }
 235 
// xmm0 (msg) is used as an implicit argument to sha256rnds2,
// so state0 and state1 must never be assigned the xmm0 register.
 238 // ofs and limit are used for multi-block byte array.
 239 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
 240 #ifdef _LP64
 241 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
 242   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
 243   Register buf, Register state, Register ofs, Register limit, Register rsp,
 244   bool multi_block, XMMRegister shuf_mask) {
 245 #else
 246 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
 247   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
 248   Register buf, Register state, Register ofs, Register limit, Register rsp,
 249   bool multi_block) {
 250 #endif
 251   Label start, done_hash, loop0;
 252 
 253   address K256 = StubRoutines::x86::k256_addr();
 254   address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
 255 
 256   bind(start);
 257   movdqu(state0, Address(state, 0));
 258   movdqu(state1, Address(state, 16));
 259 
 260   pshufd(state0, state0, 0xB1);
 261   pshufd(state1, state1, 0x1B);
 262   movdqa(msgtmp4, state0);
 263   palignr(state0, state1, 8);
 264   pblendw(state1, msgtmp4, 0xF0);
 265 
 266 #ifdef _LP64
 267   movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
 268 #endif
 269   lea(rax, ExternalAddress(K256));
 270 
 271   bind(loop0);
 272   movdqu(Address(rsp, 0), state0);
 273   movdqu(Address(rsp, 16), state1);
 274 
 275   // Rounds 0-3
 276   movdqu(msg, Address(buf, 0));
 277 #ifdef _LP64
 278   pshufb(msg, shuf_mask);
 279 #else
 280   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
 281 #endif
 282   movdqa(msgtmp0, msg);
 283   paddd(msg, Address(rax, 0));
 284   sha256rnds2(state1, state0);
 285   pshufd(msg, msg, 0x0E);
 286   sha256rnds2(state0, state1);
 287 
 288   // Rounds 4-7
 289   movdqu(msg, Address(buf, 16));
 290 #ifdef _LP64
 291   pshufb(msg, shuf_mask);
 292 #else
 293   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
 294 #endif
 295   movdqa(msgtmp1, msg);
 296   paddd(msg, Address(rax, 16));
 297   sha256rnds2(state1, state0);
 298   pshufd(msg, msg, 0x0E);
 299   sha256rnds2(state0, state1);
 300   sha256msg1(msgtmp0, msgtmp1);
 301 
 302   // Rounds 8-11
 303   movdqu(msg, Address(buf, 32));
 304 #ifdef _LP64
 305   pshufb(msg, shuf_mask);
 306 #else
 307   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
 308 #endif
 309   movdqa(msgtmp2, msg);
 310   paddd(msg, Address(rax, 32));
 311   sha256rnds2(state1, state0);
 312   pshufd(msg, msg, 0x0E);
 313   sha256rnds2(state0, state1);
 314   sha256msg1(msgtmp1, msgtmp2);
 315 
 316   // Rounds 12-15
 317   movdqu(msg, Address(buf, 48));
 318 #ifdef _LP64
 319   pshufb(msg, shuf_mask);
 320 #else
 321   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
 322 #endif
 323   movdqa(msgtmp3, msg);
 324   paddd(msg, Address(rax, 48));
 325   sha256rnds2(state1, state0);
 326   movdqa(msgtmp4, msgtmp3);
 327   palignr(msgtmp4, msgtmp2, 4);
 328   paddd(msgtmp0, msgtmp4);
 329   sha256msg2(msgtmp0, msgtmp3);
 330   pshufd(msg, msg, 0x0E);
 331   sha256rnds2(state0, state1);
 332   sha256msg1(msgtmp2, msgtmp3);
 333 
 334   // Rounds 16-19
 335   movdqa(msg, msgtmp0);
 336   paddd(msg, Address(rax, 64));
 337   sha256rnds2(state1, state0);
 338   movdqa(msgtmp4, msgtmp0);
 339   palignr(msgtmp4, msgtmp3, 4);
 340   paddd(msgtmp1, msgtmp4);
 341   sha256msg2(msgtmp1, msgtmp0);
 342   pshufd(msg, msg, 0x0E);
 343   sha256rnds2(state0, state1);
 344   sha256msg1(msgtmp3, msgtmp0);
 345 
 346   // Rounds 20-23
 347   movdqa(msg, msgtmp1);
 348   paddd(msg, Address(rax, 80));
 349   sha256rnds2(state1, state0);
 350   movdqa(msgtmp4, msgtmp1);
 351   palignr(msgtmp4, msgtmp0, 4);
 352   paddd(msgtmp2, msgtmp4);
 353   sha256msg2(msgtmp2, msgtmp1);
 354   pshufd(msg, msg, 0x0E);
 355   sha256rnds2(state0, state1);
 356   sha256msg1(msgtmp0, msgtmp1);
 357 
 358   // Rounds 24-27
 359   movdqa(msg, msgtmp2);
 360   paddd(msg, Address(rax, 96));
 361   sha256rnds2(state1, state0);
 362   movdqa(msgtmp4, msgtmp2);
 363   palignr(msgtmp4, msgtmp1, 4);
 364   paddd(msgtmp3, msgtmp4);
 365   sha256msg2(msgtmp3, msgtmp2);
 366   pshufd(msg, msg, 0x0E);
 367   sha256rnds2(state0, state1);
 368   sha256msg1(msgtmp1, msgtmp2);
 369 
 370   // Rounds 28-31
 371   movdqa(msg, msgtmp3);
 372   paddd(msg, Address(rax, 112));
 373   sha256rnds2(state1, state0);
 374   movdqa(msgtmp4, msgtmp3);
 375   palignr(msgtmp4, msgtmp2, 4);
 376   paddd(msgtmp0, msgtmp4);
 377   sha256msg2(msgtmp0, msgtmp3);
 378   pshufd(msg, msg, 0x0E);
 379   sha256rnds2(state0, state1);
 380   sha256msg1(msgtmp2, msgtmp3);
 381 
 382   // Rounds 32-35
 383   movdqa(msg, msgtmp0);
 384   paddd(msg, Address(rax, 128));
 385   sha256rnds2(state1, state0);
 386   movdqa(msgtmp4, msgtmp0);
 387   palignr(msgtmp4, msgtmp3, 4);
 388   paddd(msgtmp1, msgtmp4);
 389   sha256msg2(msgtmp1, msgtmp0);
 390   pshufd(msg, msg, 0x0E);
 391   sha256rnds2(state0, state1);
 392   sha256msg1(msgtmp3, msgtmp0);
 393 
 394   // Rounds 36-39
 395   movdqa(msg, msgtmp1);
 396   paddd(msg, Address(rax, 144));
 397   sha256rnds2(state1, state0);
 398   movdqa(msgtmp4, msgtmp1);
 399   palignr(msgtmp4, msgtmp0, 4);
 400   paddd(msgtmp2, msgtmp4);
 401   sha256msg2(msgtmp2, msgtmp1);
 402   pshufd(msg, msg, 0x0E);
 403   sha256rnds2(state0, state1);
 404   sha256msg1(msgtmp0, msgtmp1);
 405 
 406   // Rounds 40-43
 407   movdqa(msg, msgtmp2);
 408   paddd(msg, Address(rax, 160));
 409   sha256rnds2(state1, state0);
 410   movdqa(msgtmp4, msgtmp2);
 411   palignr(msgtmp4, msgtmp1, 4);
 412   paddd(msgtmp3, msgtmp4);
 413   sha256msg2(msgtmp3, msgtmp2);
 414   pshufd(msg, msg, 0x0E);
 415   sha256rnds2(state0, state1);
 416   sha256msg1(msgtmp1, msgtmp2);
 417 
 418   // Rounds 44-47
 419   movdqa(msg, msgtmp3);
 420   paddd(msg, Address(rax, 176));
 421   sha256rnds2(state1, state0);
 422   movdqa(msgtmp4, msgtmp3);
 423   palignr(msgtmp4, msgtmp2, 4);
 424   paddd(msgtmp0, msgtmp4);
 425   sha256msg2(msgtmp0, msgtmp3);
 426   pshufd(msg, msg, 0x0E);
 427   sha256rnds2(state0, state1);
 428   sha256msg1(msgtmp2, msgtmp3);
 429 
 430   // Rounds 48-51
 431   movdqa(msg, msgtmp0);
 432   paddd(msg, Address(rax, 192));
 433   sha256rnds2(state1, state0);
 434   movdqa(msgtmp4, msgtmp0);
 435   palignr(msgtmp4, msgtmp3, 4);
 436   paddd(msgtmp1, msgtmp4);
 437   sha256msg2(msgtmp1, msgtmp0);
 438   pshufd(msg, msg, 0x0E);
 439   sha256rnds2(state0, state1);
 440   sha256msg1(msgtmp3, msgtmp0);
 441 
 442   // Rounds 52-55
 443   movdqa(msg, msgtmp1);
 444   paddd(msg, Address(rax, 208));
 445   sha256rnds2(state1, state0);
 446   movdqa(msgtmp4, msgtmp1);
 447   palignr(msgtmp4, msgtmp0, 4);
 448   paddd(msgtmp2, msgtmp4);
 449   sha256msg2(msgtmp2, msgtmp1);
 450   pshufd(msg, msg, 0x0E);
 451   sha256rnds2(state0, state1);
 452 
 453   // Rounds 56-59
 454   movdqa(msg, msgtmp2);
 455   paddd(msg, Address(rax, 224));
 456   sha256rnds2(state1, state0);
 457   movdqa(msgtmp4, msgtmp2);
 458   palignr(msgtmp4, msgtmp1, 4);
 459   paddd(msgtmp3, msgtmp4);
 460   sha256msg2(msgtmp3, msgtmp2);
 461   pshufd(msg, msg, 0x0E);
 462   sha256rnds2(state0, state1);
 463 
 464   // Rounds 60-63
 465   movdqa(msg, msgtmp3);
 466   paddd(msg, Address(rax, 240));
 467   sha256rnds2(state1, state0);
 468   pshufd(msg, msg, 0x0E);
 469   sha256rnds2(state0, state1);
 470   movdqu(msg, Address(rsp, 0));
 471   paddd(state0, msg);
 472   movdqu(msg, Address(rsp, 16));
 473   paddd(state1, msg);
 474 
 475   if (multi_block) {
 476     // increment data pointer and loop if more to process
 477     addptr(buf, 64);
 478     addptr(ofs, 64);
 479     cmpptr(ofs, limit);
 480     jcc(Assembler::belowEqual, loop0);
 481     movptr(rax, ofs); //return ofs
 482   }
 483 
 484   pshufd(state0, state0, 0x1B);
 485   pshufd(state1, state1, 0xB1);
 486   movdqa(msgtmp4, state0);
 487   pblendw(state0, state1, 0xF0);
 488   palignr(state1, msgtmp4, 8);
 489 
 490   movdqu(Address(state, 0), state0);
 491   movdqu(Address(state, 16), state1);
 492 
 493   bind(done_hash);
 494 
 495 }