
src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

rev 60737 : 8252204: AArch64: Implement SHA3 accelerator/intrinsic
Reviewed-by: duke
Contributed-by: dongbo4@huawei.com

*** 3289,3298 **** --- 3289,3517 ----
      __ ret(lr);

      return start;
    }

+   // Arguments:
+   //
+   // Inputs:
+   //   c_rarg0   - byte[]  source+offset
+   //   c_rarg1   - byte[]  SHA.state
+   //   c_rarg2   - int     digest_length
+   //   c_rarg3   - int     offset
+   //   c_rarg4   - int     limit
+   //
+   address generate_sha3_implCompress(bool multi_block, const char *name) {
+     static const uint64_t round_consts[24] = {
+       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
+       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
+       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
+       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
+       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
+       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
+       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
+       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
+     };
+ 
+     __ align(CodeEntryAlignment);
+     StubCodeMark mark(this, "StubRoutines", name);
+     address start = __ pc();
+ 
+     Register buf           = c_rarg0;
+     Register state         = c_rarg1;
+     Register digest_length = c_rarg2;
+     Register ofs           = c_rarg3;
+     Register limit         = c_rarg4;
+ 
+     Label sha3_loop, rounds24_loop;
+     Label sha3_512, sha3_384_or_224, sha3_256;
+ 
+     __ stpd(v8, v9, __ pre(sp, -64));
+     __ stpd(v10, v11, Address(sp, 16));
+     __ stpd(v12, v13, Address(sp, 32));
+     __ stpd(v14, v15, Address(sp, 48));
+ 
+     // load state
+     __ add(rscratch1, state, 32);
+     __ ld1(v0, v1, v2, v3, __ T1D, state);
+     __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
+     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
+     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
+     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
+     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
+     __ ld1(v24, __ T1D, rscratch1);
+ 
+     __ BIND(sha3_loop);
+ 
+     // 24 keccak rounds
+     __ movw(rscratch2, 24);
+ 
+     // load round_constants base
+     __ lea(rscratch1, ExternalAddress((address) round_consts));
+ 
+     // load input
+     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
+     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
+     __ eor(v0, __ T8B, v0, v25);
+     __ eor(v1, __ T8B, v1, v26);
+     __ eor(v2, __ T8B, v2, v27);
+     __ eor(v3, __ T8B, v3, v28);
+     __ eor(v4, __ T8B, v4, v29);
+     __ eor(v5, __ T8B, v5, v30);
+     __ eor(v6, __ T8B, v6, v31);
+ 
+     // digest_length == 64, SHA3-512
+     __ tbnz(digest_length, 6, sha3_512);
+ 
+     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
+     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
+     __ eor(v7, __ T8B, v7, v25);
+     __ eor(v8, __ T8B, v8, v26);
+     __ eor(v9, __ T8B, v9, v27);
+     __ eor(v10, __ T8B, v10, v28);
+     __ eor(v11, __ T8B, v11, v29);
+     __ eor(v12, __ T8B, v12, v30);
+ 
+     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
+     __ tbnz(digest_length, 4, sha3_384_or_224);
+ 
+     // SHA3-256
+     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
+     __ eor(v13, __ T8B, v13, v25);
+     __ eor(v14, __ T8B, v14, v26);
+     __ eor(v15, __ T8B, v15, v27);
+     __ eor(v16, __ T8B, v16, v28);
+     __ b(rounds24_loop);
+ 
+     __ BIND(sha3_384_or_224);
+     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384
+ 
+     // SHA3-224
+     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
+     __ ld1(v29, __ T8B, __ post(buf, 8));
+     __ eor(v13, __ T8B, v13, v25);
+     __ eor(v14, __ T8B, v14, v26);
+     __ eor(v15, __ T8B, v15, v27);
+     __ eor(v16, __ T8B, v16, v28);
+     __ eor(v17, __ T8B, v17, v29);
+     __ b(rounds24_loop);
+ 
+     __ BIND(sha3_512);
+     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
+     __ eor(v7, __ T8B, v7, v25);
+     __ eor(v8, __ T8B, v8, v26);
+ 
+     __ BIND(rounds24_loop);
+     __ subw(rscratch2, rscratch2, 1);
+ 
+     __ eor3(v29, __ T16B, v4, v9, v14);
+     __ eor3(v26, __ T16B, v1, v6, v11);
+     __ eor3(v28, __ T16B, v3, v8, v13);
+     __ eor3(v25, __ T16B, v0, v5, v10);
+     __ eor3(v27, __ T16B, v2, v7, v12);
+     __ eor3(v29, __ T16B, v29, v19, v24);
+     __ eor3(v26, __ T16B, v26, v16, v21);
+     __ eor3(v28, __ T16B, v28, v18, v23);
+     __ eor3(v25, __ T16B, v25, v15, v20);
+     __ eor3(v27, __ T16B, v27, v17, v22);
+ 
+     __ rax1(v30, __ T2D, v29, v26);
+     __ rax1(v26, __ T2D, v26, v28);
+     __ rax1(v28, __ T2D, v28, v25);
+     __ rax1(v25, __ T2D, v25, v27);
+     __ rax1(v27, __ T2D, v27, v29);
+ 
+     __ eor(v0, __ T16B, v0, v30);
+     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
+     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
+     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
+     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
+     __ xar(v22, __ T2D, v14, v28, (64 - 39));
+     __ xar(v14, __ T2D, v20, v30, (64 - 18));
+     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
+     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
+     __ xar(v12, __ T2D, v13, v27, (64 - 25));
+     __ xar(v13, __ T2D, v19, v28, (64 - 8));
+     __ xar(v19, __ T2D, v23, v27, (64 - 56));
+     __ xar(v23, __ T2D, v15, v30, (64 - 41));
+     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
+     __ xar(v28, __ T2D, v24, v28, (64 - 14));
+     __ xar(v24, __ T2D, v21, v25, (64 - 2));
+     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
+     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
+     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
+     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
+     __ xar(v27, __ T2D, v18, v27, (64 - 21));
+     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
+     __ xar(v25, __ T2D, v11, v25, (64 - 10));
+     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
+     __ xar(v30, __ T2D, v10, v30, (64 - 3));
+ 
+     __ bcax(v20, __ T16B, v31, v22, v8);
+     __ bcax(v21, __ T16B, v8,  v23, v22);
+     __ bcax(v22, __ T16B, v22, v24, v23);
+     __ bcax(v23, __ T16B, v23, v31, v24);
+     __ bcax(v24, __ T16B, v24, v8,  v31);
+ 
+     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
+ 
+     __ bcax(v17, __ T16B, v25, v19, v3);
+     __ bcax(v18, __ T16B, v3,  v15, v19);
+     __ bcax(v19, __ T16B, v19, v16, v15);
+     __ bcax(v15, __ T16B, v15, v25, v16);
+     __ bcax(v16, __ T16B, v16, v3,  v25);
+ 
+     __ bcax(v10, __ T16B, v29, v12, v26);
+     __ bcax(v11, __ T16B, v26, v13, v12);
+     __ bcax(v12, __ T16B, v12, v14, v13);
+     __ bcax(v13, __ T16B, v13, v29, v14);
+     __ bcax(v14, __ T16B, v14, v26, v29);
+ 
+     __ bcax(v7, __ T16B, v30, v9,  v4);
+     __ bcax(v8, __ T16B, v4,  v5,  v9);
+     __ bcax(v9, __ T16B, v9,  v6,  v5);
+     __ bcax(v5, __ T16B, v5,  v30, v6);
+     __ bcax(v6, __ T16B, v6,  v4,  v30);
+ 
+     __ bcax(v3, __ T16B, v27, v0,  v28);
+     __ bcax(v4, __ T16B, v28, v1,  v0);
+     __ bcax(v0, __ T16B, v0,  v2,  v1);
+     __ bcax(v1, __ T16B, v1,  v27, v2);
+     __ bcax(v2, __ T16B, v2,  v28, v27);
+ 
+     __ eor(v0, __ T16B, v0, v31);
+ 
+     __ cbnzw(rscratch2, rounds24_loop);
+ 
+     if (multi_block) {
+       // block_size = 200 - 2 * digest_length, ofs += block_size
+       __ add(ofs, ofs, 200);
+       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
+ 
+       __ cmp(ofs, limit);
+       __ br(Assembler::LE, sha3_loop);
+       __ mov(c_rarg0, ofs); // return ofs
+     }
+ 
+     __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
+     __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
+     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
+     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
+     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
+     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
+     __ st1(v24, __ T1D, state);
+ 
+     __ ldpd(v14, v15, Address(sp, 48));
+     __ ldpd(v12, v13, Address(sp, 32));
+     __ ldpd(v10, v11, Address(sp, 16));
+     __ ldpd(v8, v9, __ post(sp, 64));
+ 
+     __ ret(lr);
+ 
+     return start;
+   }
+ 
    // Safefetch stubs.
    void generate_safefetch(const char* name, int size, address* entry,
                            address* fault_pc, address* continuation_pc) {
      // safefetch signatures:
      //   int SafeFetch32(int* adr, int errValue);
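For reference when reading the round body above: eor3, rax1, xar and bcax are the Armv8.2-SHA3 instructions, and the 24-iteration loop is Keccak-f[1600]. eor3 computes a three-way XOR (the theta column parities), rax1 an XOR with a left-rotate-by-one (the theta d values), xar an XOR followed by a rotate (rho+pi; the instruction rotates right, hence the "64 - n" immediates above), and bcax computes a ^ (b & ~c) (chi). A minimal scalar sketch of one round, illustrative only and not part of the patch (flat lane indexing i = x + 5*y assumed):

    #include <stdint.h>

    static inline uint64_t rotl64(uint64_t v, unsigned n) {
      return n == 0 ? v : (v << n) | (v >> (64 - n));
    }

    // One Keccak-f[1600] round over the 5x5 lane state st[x + 5*y].
    void keccak_round(uint64_t st[25], uint64_t rc) {
      uint64_t c[5], d[5], b[25];
      for (int x = 0; x < 5; x++)                  // theta: column parities (eor3)
        c[x] = st[x] ^ st[x + 5] ^ st[x + 10] ^ st[x + 15] ^ st[x + 20];
      for (int x = 0; x < 5; x++)                  // theta: d values (rax1)
        d[x] = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
      static const int rot[25] = {                 // rho offsets, in st[] order
         0,  1, 62, 28, 27, 36, 44,  6, 55, 20,  3, 10, 43, 25, 39,
        41, 45, 15, 21,  8, 18,  2, 61, 56, 14
      };
      for (int y = 0; y < 5; y++)                  // rho + pi (xar)
        for (int x = 0; x < 5; x++)
          b[y + 5 * ((2 * x + 3 * y) % 5)] =
              rotl64(st[x + 5 * y] ^ d[x], rot[x + 5 * y]);
      for (int y = 0; y < 5; y++)                  // chi (bcax)
        for (int x = 0; x < 5; x++)
          st[x + 5 * y] = b[x + 5 * y] ^
              (~b[(x + 1) % 5 + 5 * y] & b[(x + 2) % 5 + 5 * y]);
      st[0] ^= rc;                                 // iota (ld1r + eor)
    }

The absorb phase before the rounds XORs one rate-sized block of input into the leading lanes. The rate is 200 - 2 * digest_length bytes (144/136/104/72 for SHA3-224/256/384/512), which is why testing bits 6, 4 and 2 of digest_length suffices to tell the four variants apart, and why the multi_block path advances ofs by 200 - 2 * digest_length per block.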
*** 6020,6029 **** --- 6239,6252 ----
      }
      if (UseSHA512Intrinsics) {
        StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
        StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
      }
+     if (UseSHA3Intrinsics) {
+       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false, "sha3_implCompress");
+       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,  "sha3_implCompressMB");
+     }

      // generate Adler32 intrinsics code
      if (UseAdler32Intrinsics) {
        StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
      }
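The two entry points registered here follow the existing SHA-2 wiring: the shared runtime exposes them through StubRoutines accessors, which the compilers consult when expanding the DigestBase intrinsics. A sketch of the matching declarations, mirroring the _sha512_implCompress pattern (exact placement in stubRoutines.hpp assumed; that file is not shown on this page):

    // In the StubRoutines class (sketch; names follow the registration above):
    static address _sha3_implCompress;
    static address _sha3_implCompressMB;

    static address sha3_implCompress()   { return _sha3_implCompress; }
    static address sha3_implCompressMB() { return _sha3_implCompressMB; }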