--- /dev/null 2016-02-26 14:23:30.000000000 -0800 +++ new/src/cpu/x86/vm/macroAssembler_intel_x86.cpp 2016-02-26 14:23:30.083341900 -0800 @@ -0,0 +1,487 @@ +/* +* Copyright (c) 2016, Intel Corporation. +* +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. +* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code). +* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +* +*/ + +#include "precompiled.hpp" +#include "asm/assembler.hpp" +#include "asm/assembler.inline.hpp" +#include "macroAssembler_x86.hpp" + +// ofs and limit are used for multi-block byte array. +// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) +void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0, + XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask, + Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) { + + Label start, done_hash, loop0; + + address upper_word_mask = StubRoutines::x86::upper_word_mask_addr(); + address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr(); + + bind(start); + movdqu(abcd, Address(state, 0)); + pinsrd(e0, Address(state, 16), 3); + movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000 + pand(e0, shuf_mask); + pshufd(abcd, abcd, 0x1B); + movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f + + bind(loop0); + // Save hash values for addition after rounds + movdqu(Address(rsp, 0), e0); + movdqu(Address(rsp, 16), abcd); + + + // Rounds 0 - 3 + movdqu(msg0, Address(buf, 0)); + pshufb(msg0, shuf_mask); + paddd(e0, msg0); + movdqa(e1, abcd); + sha1rnds4(abcd, e0, 0); + + // Rounds 4 - 7 + movdqu(msg1, Address(buf, 16)); + pshufb(msg1, shuf_mask); + sha1nexte(e1, msg1); + movdqa(e0, abcd); + sha1rnds4(abcd, e1, 0); + sha1msg1(msg0, msg1); + + // Rounds 8 - 11 + movdqu(msg2, Address(buf, 32)); + pshufb(msg2, shuf_mask); + sha1nexte(e0, msg2); + movdqa(e1, abcd); + sha1rnds4(abcd, e0, 0); + sha1msg1(msg1, msg2); + pxor(msg0, msg2); + + // Rounds 12 - 15 + movdqu(msg3, Address(buf, 48)); + pshufb(msg3, shuf_mask); + sha1nexte(e1, msg3); + movdqa(e0, abcd); + sha1msg2(msg0, msg3); + sha1rnds4(abcd, e1, 0); + sha1msg1(msg2, msg3); + pxor(msg1, msg3); + + // Rounds 16 - 19 + sha1nexte(e0, msg0); + movdqa(e1, abcd); + sha1msg2(msg1, msg0); + sha1rnds4(abcd, e0, 0); + sha1msg1(msg3, msg0); + pxor(msg2, msg0); + + // Rounds 20 - 23 + sha1nexte(e1, msg1); + movdqa(e0, abcd); + sha1msg2(msg2, msg1); + sha1rnds4(abcd, e1, 1); + sha1msg1(msg0, msg1); + pxor(msg3, msg1); + + // Rounds 24 - 27 + sha1nexte(e0, msg2); + movdqa(e1, abcd); + sha1msg2(msg3, msg2); + sha1rnds4(abcd, e0, 1); + sha1msg1(msg1, msg2); + pxor(msg0, msg2); + + // Rounds 28 - 31 + sha1nexte(e1, msg3); + movdqa(e0, abcd); + sha1msg2(msg0, msg3); + sha1rnds4(abcd, e1, 1); + sha1msg1(msg2, msg3); + pxor(msg1, msg3); + + // Rounds 32 - 35 + sha1nexte(e0, msg0); + movdqa(e1, abcd); + sha1msg2(msg1, msg0); + sha1rnds4(abcd, e0, 1); + sha1msg1(msg3, msg0); + pxor(msg2, msg0); + + // Rounds 36 - 39 + sha1nexte(e1, msg1); + movdqa(e0, abcd); + sha1msg2(msg2, msg1); + sha1rnds4(abcd, e1, 1); + sha1msg1(msg0, msg1); + pxor(msg3, msg1); + + // Rounds 40 - 43 + sha1nexte(e0, msg2); + movdqa(e1, abcd); + sha1msg2(msg3, msg2); + sha1rnds4(abcd, e0, 2); + sha1msg1(msg1, msg2); + pxor(msg0, msg2); + + // Rounds 44 - 47 + sha1nexte(e1, msg3); + movdqa(e0, abcd); + sha1msg2(msg0, msg3); + sha1rnds4(abcd, e1, 2); + sha1msg1(msg2, msg3); + pxor(msg1, msg3); + + // Rounds 48 - 51 + sha1nexte(e0, msg0); + movdqa(e1, abcd); + sha1msg2(msg1, msg0); + sha1rnds4(abcd, e0, 2); + sha1msg1(msg3, msg0); + pxor(msg2, msg0); + + // Rounds 52 - 55 + sha1nexte(e1, msg1); + movdqa(e0, abcd); + sha1msg2(msg2, msg1); + sha1rnds4(abcd, e1, 2); + sha1msg1(msg0, msg1); + pxor(msg3, msg1); + + // Rounds 56 - 59 + sha1nexte(e0, msg2); + movdqa(e1, abcd); + sha1msg2(msg3, msg2); + sha1rnds4(abcd, e0, 2); + sha1msg1(msg1, msg2); + pxor(msg0, msg2); + + // Rounds 60 - 63 + sha1nexte(e1, msg3); + movdqa(e0, abcd); + sha1msg2(msg0, msg3); + sha1rnds4(abcd, e1, 3); + sha1msg1(msg2, msg3); + pxor(msg1, msg3); + + // Rounds 64 - 67 + sha1nexte(e0, msg0); + movdqa(e1, abcd); + sha1msg2(msg1, msg0); + sha1rnds4(abcd, e0, 3); + sha1msg1(msg3, msg0); + pxor(msg2, msg0); + + // Rounds 68 - 71 + sha1nexte(e1, msg1); + movdqa(e0, abcd); + sha1msg2(msg2, msg1); + sha1rnds4(abcd, e1, 3); + pxor(msg3, msg1); + + // Rounds 72 - 75 + sha1nexte(e0, msg2); + movdqa(e1, abcd); + sha1msg2(msg3, msg2); + sha1rnds4(abcd, e0, 3); + + // Rounds 76 - 79 + sha1nexte(e1, msg3); + movdqa(e0, abcd); + sha1rnds4(abcd, e1, 3); + + // add current hash values with previously saved + movdqu(msg0, Address(rsp, 0)); + sha1nexte(e0, msg0); + movdqu(msg0, Address(rsp, 16)); + paddd(abcd, msg0); + + if (multi_block) { + // increment data pointer and loop if more to process + addptr(buf, 64); + addptr(ofs, 64); + cmpptr(ofs, limit); + jcc(Assembler::belowEqual, loop0); + movptr(rax, ofs); //return ofs + } + // write hash values back in the correct order + pshufd(abcd, abcd, 0x1b); + movdqu(Address(state, 0), abcd); + pextrd(Address(state, 16), e0, 3); + + bind(done_hash); + +} + +// xmm0 (msg) is used as an implicit argument to sh256rnds2 +// and state0 and state1 can never use xmm0 register. +// ofs and limit are used for multi-block byte array. +// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) +void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, + XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, + Register buf, Register state, Register ofs, Register limit, Register rsp, + bool multi_block LP64_ONLY(COMMA XMMRegister shuf_mask)) { + Label start, done_hash, loop0; + + address K256 = StubRoutines::x86::k256_addr(); + address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr(); + + bind(start); + movdqu(state0, Address(state, 0)); + movdqu(state1, Address(state, 16)); + + pshufd(state0, state0, 0xB1); + pshufd(state1, state1, 0x1B); + movdqa(msgtmp4, state0); + palignr(state0, state1, 8); + pblendw(state1, msgtmp4, 0xF0); + +#ifdef _LP64 + movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask)); +#endif + lea(rax, ExternalAddress(K256)); + + bind(loop0); + movdqu(Address(rsp, 0), state0); + movdqu(Address(rsp, 16), state1); + + // Rounds 0-3 + movdqu(msg, Address(buf, 0)); +#ifdef _LP64 + pshufb(msg, shuf_mask); +#else + pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); +#endif + movdqa(msgtmp0, msg); + paddd(msg, Address(rax, 0)); + sha256rnds2(state1, state0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + + // Rounds 4-7 + movdqu(msg, Address(buf, 16)); +#ifdef _LP64 + pshufb(msg, shuf_mask); +#else + pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); +#endif + movdqa(msgtmp1, msg); + paddd(msg, Address(rax, 16)); + sha256rnds2(state1, state0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp0, msgtmp1); + + // Rounds 8-11 + movdqu(msg, Address(buf, 32)); +#ifdef _LP64 + pshufb(msg, shuf_mask); +#else + pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); +#endif + movdqa(msgtmp2, msg); + paddd(msg, Address(rax, 32)); + sha256rnds2(state1, state0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp1, msgtmp2); + + // Rounds 12-15 + movdqu(msg, Address(buf, 48)); +#ifdef _LP64 + pshufb(msg, shuf_mask); +#else + pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask)); +#endif + movdqa(msgtmp3, msg); + paddd(msg, Address(rax, 48)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp3); + palignr(msgtmp4, msgtmp2, 4); + paddd(msgtmp0, msgtmp4); + sha256msg2(msgtmp0, msgtmp3); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp2, msgtmp3); + + // Rounds 16-19 + movdqa(msg, msgtmp0); + paddd(msg, Address(rax, 64)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp0); + palignr(msgtmp4, msgtmp3, 4); + paddd(msgtmp1, msgtmp4); + sha256msg2(msgtmp1, msgtmp0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp3, msgtmp0); + + // Rounds 20-23 + movdqa(msg, msgtmp1); + paddd(msg, Address(rax, 80)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp1); + palignr(msgtmp4, msgtmp0, 4); + paddd(msgtmp2, msgtmp4); + sha256msg2(msgtmp2, msgtmp1); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp0, msgtmp1); + + // Rounds 24-27 + movdqa(msg, msgtmp2); + paddd(msg, Address(rax, 96)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp2); + palignr(msgtmp4, msgtmp1, 4); + paddd(msgtmp3, msgtmp4); + sha256msg2(msgtmp3, msgtmp2); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp1, msgtmp2); + + // Rounds 28-31 + movdqa(msg, msgtmp3); + paddd(msg, Address(rax, 112)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp3); + palignr(msgtmp4, msgtmp2, 4); + paddd(msgtmp0, msgtmp4); + sha256msg2(msgtmp0, msgtmp3); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp2, msgtmp3); + + // Rounds 32-35 + movdqa(msg, msgtmp0); + paddd(msg, Address(rax, 128)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp0); + palignr(msgtmp4, msgtmp3, 4); + paddd(msgtmp1, msgtmp4); + sha256msg2(msgtmp1, msgtmp0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp3, msgtmp0); + + // Rounds 36-39 + movdqa(msg, msgtmp1); + paddd(msg, Address(rax, 144)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp1); + palignr(msgtmp4, msgtmp0, 4); + paddd(msgtmp2, msgtmp4); + sha256msg2(msgtmp2, msgtmp1); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp0, msgtmp1); + + // Rounds 40-43 + movdqa(msg, msgtmp2); + paddd(msg, Address(rax, 160)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp2); + palignr(msgtmp4, msgtmp1, 4); + paddd(msgtmp3, msgtmp4); + sha256msg2(msgtmp3, msgtmp2); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp1, msgtmp2); + + // Rounds 44-47 + movdqa(msg, msgtmp3); + paddd(msg, Address(rax, 176)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp3); + palignr(msgtmp4, msgtmp2, 4); + paddd(msgtmp0, msgtmp4); + sha256msg2(msgtmp0, msgtmp3); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp2, msgtmp3); + + // Rounds 48-51 + movdqa(msg, msgtmp0); + paddd(msg, Address(rax, 192)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp0); + palignr(msgtmp4, msgtmp3, 4); + paddd(msgtmp1, msgtmp4); + sha256msg2(msgtmp1, msgtmp0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + sha256msg1(msgtmp3, msgtmp0); + + // Rounds 52-55 + movdqa(msg, msgtmp1); + paddd(msg, Address(rax, 208)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp1); + palignr(msgtmp4, msgtmp0, 4); + paddd(msgtmp2, msgtmp4); + sha256msg2(msgtmp2, msgtmp1); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + + // Rounds 56-59 + movdqa(msg, msgtmp2); + paddd(msg, Address(rax, 224)); + sha256rnds2(state1, state0); + movdqa(msgtmp4, msgtmp2); + palignr(msgtmp4, msgtmp1, 4); + paddd(msgtmp3, msgtmp4); + sha256msg2(msgtmp3, msgtmp2); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + + // Rounds 60-63 + movdqa(msg, msgtmp3); + paddd(msg, Address(rax, 240)); + sha256rnds2(state1, state0); + pshufd(msg, msg, 0x0E); + sha256rnds2(state0, state1); + movdqu(msg, Address(rsp, 0)); + paddd(state0, msg); + movdqu(msg, Address(rsp, 16)); + paddd(state1, msg); + + if (multi_block) { + // increment data pointer and loop if more to process + addptr(buf, 64); + addptr(ofs, 64); + cmpptr(ofs, limit); + jcc(Assembler::belowEqual, loop0); + movptr(rax, ofs); //return ofs + } + + pshufd(state0, state0, 0x1B); + pshufd(state1, state1, 0xB1); + movdqa(msgtmp4, state0); + pblendw(state0, state1, 0xF0); + palignr(state1, msgtmp4, 8); + + movdqu(Address(state, 0), state0); + movdqu(Address(state, 16), state1); + + bind(done_hash); + +}