--- old/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp 2020-08-17 12:17:04.546676138 -0500
+++ new/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp 2020-08-17 12:17:04.436682758 -0500
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
+ * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020, SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -3544,6 +3544,434 @@
     return start;
   }
 
+
+// The following Base64 decode intrinsic is based on an algorithm outlined here:
+// http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
+// in the section titled "Vector lookup (pshufb with bitmask)"
+//
+// This implementation differs in the following ways:
+// * Instead of Intel SSE instructions, Power AltiVec VMX and VSX instructions
+//   are used. It turns out that some of the vector operations
+//   needed in the algorithm require fewer AltiVec instructions.
+// * The algorithm in the above-mentioned paper doesn't handle the
+//   Base64-URL variant in RFC 4648. Adjustments to both the code and to two
+//   lookup tables are needed for this.
+// * The "Pack" section of the code is a complete rewrite for Power because we
+//   can utilize better instructions for this step.
+//
+
+// Offsets per group of Base64 characters
+// Uppercase
+#define UC  (signed char)((-'A' + 0) & 0xff)
+// Lowercase
+#define LC  (signed char)((-'a' + 26) & 0xff)
+// Digits
+#define DIG (signed char)((-'0' + 52) & 0xff)
+// Plus sign (URL = 0)
+#define PLS (signed char)((-'+' + 62) & 0xff)
+// Hyphen (URL = 1)
+#define HYP (signed char)((-'-' + 62) & 0xff)
+// Slash (URL = 0)
+#define SLS (signed char)((-'/' + 63) & 0xff)
+// Underscore (URL = 1)
+#define US  (signed char)((-'_' + 63) & 0xff)
+
+// In little-endian mode, the lxv instruction loads the element at EA into element 15
+// of the vector register, EA+1 goes into element 14, and so on.
+//
+// To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the order of
+// the elements in a vector initialization.
+
+#define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
+
+  //
+  // Base64 decodeBlock intrinsic
+  address generate_base64_decodeBlock() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "base64_decodeBlock");
+    address start = __ function_entry();
+
+    static const __vector signed char offsetLUT_val = {
+      ARRAY_TO_LXV_ORDER(
+      0, 0, PLS, DIG, UC, UC, LC, LC,
+      0, 0,   0,   0,  0,  0,  0,  0 ) };
+
+    static const __vector signed char offsetLUT_URL_val = {
+      ARRAY_TO_LXV_ORDER(
+      0, 0, HYP, DIG, UC, UC, LC, LC,
+      0, 0,   0,   0,  0,  0,  0,  0 ) };
+
+    static const __vector unsigned char maskLUT_val = {
+      ARRAY_TO_LXV_ORDER(
+      /* 0        */ (unsigned char)0b10101000,
+      /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
+                     (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
+                     (unsigned char)0b11111000,
+      /* 10       */ (unsigned char)0b11110000,
+      /* 11       */ (unsigned char)0b01010100,
+      /* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
+      /* 15       */ (unsigned char)0b01010100 ) };
+
+    static const __vector unsigned char maskLUT_URL_val = {
+      ARRAY_TO_LXV_ORDER(
+      /* 0        */ (unsigned char)0b10101000,
+      /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
+                     (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
+                     (unsigned char)0b11111000,
+      /* 10       */ (unsigned char)0b11110000,
+      /* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
+      /* 13       */ (unsigned char)0b01010100,
+      /* 14       */ (unsigned char)0b01010000,
+      /* 15       */ (unsigned char)0b01110000 ) };
+
+    static const __vector unsigned char bitposLUT_val = {
+      ARRAY_TO_LXV_ORDER(
+      0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) };
+
+    static const __vector unsigned char pack_lshift_val = { 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0 };
+
+    static const __vector unsigned char pack_rshift_val = { 0, 4, 2, 0, 0, 4, 2, 0, 0, 4, 2, 0, 0, 4, 2, 0 };
+
+    // The last 4 index values are "don't care" because
+    // we only use the first 12 bytes of the vector,
+    // which are decoded from 16 bytes of Base64 characters.
+    static const __vector unsigned char pack_permute_val = {
+      14, 13, 12,
+      10,  9,  8,
+       6,  5,  4,
+       2,  1,  0,
+       0,  0,  0, 0 };
+
+    static const __vector unsigned char p10_pack_permute_val = {
+      10, 11, 12, 13, 14, 15,
+       2,  3,  4,  5,  6,  7,
+       0,  0,  0,  0 };
+
+    const unsigned loop_unrolls = 8; // needs to be a power of two so that the rounding can be done using a mask
+    const unsigned vec_size = 16; // size of vector registers in bytes
+    const unsigned block_size = vec_size * loop_unrolls; // number of bytes to process in each pass through the loop
+    const unsigned block_size_clear = exact_log2(block_size); // used to clear the lower log2(block_size) bits of the size
+
+    // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
+    Register s     = R3_ARG1; // source starting address of Base64 characters
+    Register sp    = R4_ARG2; // actual start of processing is at s + sp
+    Register sl    = R5_ARG3; // source length = # of Base64 characters to be processed
+    Register d     = R6_ARG4; // destination address
+    Register isURL = R7_ARG5; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
+
+    // Local variables
+    Register const_ptr = R8; // used for loading constants
+    Register tmp_reg   = R9; // used for speeding up load_constant()
+
+    // Re-use R8 and R9 to avoid using non-volatile registers (which would require save/restore)
+    Register out           = R8;  // moving out (destination) pointer
+    Register in            = R9;  // moving in (source) pointer
+    Register end           = R10; // pointer to the last byte of the source
+    Register non_match_cnt = R11; // flag for detecting non-Base64 characters
+
+
+    // Volatile VSRs are 0..13, 32..51 (VR0..VR19)
+    // VR Constants
+    VectorRegister vec_0s                = VR0;
+    VectorRegister vec_4s                = VR1;
+    VectorRegister vec_8s                = VR2;
+    VectorRegister vec_special_case_char = VR3;
+    VectorRegister pack_rshift           = VR4;
+    VectorRegister pack_lshift           = VR5;
+    // P10+
+    VectorRegister vec_0x3fs             = VR4; // safe to reuse pack_rshift's register
+
+    // VSR Constants
+    VectorSRegister offsetLUT               = VSR0;
+    VectorSRegister maskLUT                 = VSR1;
+    VectorSRegister bitposLUT               = VSR2;
+    VectorSRegister vec_0xfs                = VSR3;
+    VectorSRegister vec_special_case_offset = VSR4;
+    VectorSRegister pack_permute            = VSR5;
+
+    // Variables for lookup
+    // VR
+    VectorRegister input                = VR6;
+    VectorRegister higher_nibble        = VR7;
+    VectorRegister eq_special_case_char = VR8;
+    VectorRegister offsets              = VR9;
+    VectorRegister non_match            = VR10;
+
+    // VSR
+    VectorSRegister bit          = VSR6;
+    VectorSRegister lower_nibble = VSR7;
+    VectorSRegister M            = VSR8;
+
+    // Variables for pack
+    // VR
+    VectorRegister l        = VR7; // reuse higher_nibble's register
+    VectorRegister r        = VR8; // reuse eq_special_case_char's register
+    VectorRegister gathered = VR9; // reuse offsets's register
+
+    Label not_URL, calculate_size, unrolled_loop_start, skip_xxsel[loop_unrolls], unrolled_loop_exit, zero_processed_exit;
+
+    // Load constant vec registers that need to be loaded from memory
+    __ load_const(const_ptr, (address)&bitposLUT_val, tmp_reg);
+    __ lxv(bitposLUT, 0, const_ptr);
+    if (PowerArchitecturePPC64 >= 10) {
+      __ load_const(const_ptr, (address)&p10_pack_permute_val, tmp_reg);
+    } else {
+      __ load_const(const_ptr, (address)&pack_rshift_val, tmp_reg);
+      __ lxv(pack_rshift->to_vsr(), 0, const_ptr);
+      __ load_const(const_ptr, (address)&pack_lshift_val, tmp_reg);
+      __ lxv(pack_lshift->to_vsr(), 0, const_ptr);
+      __ load_const(const_ptr, (address)&pack_permute_val, tmp_reg);
+    }
+    __ lxv(pack_permute, 0, const_ptr);
+
+    // Splat the constants that can use xxspltib
+    __ xxspltib(vec_0s->to_vsr(), 0);
+    __ xxspltib(vec_4s->to_vsr(), 4);
+    __ xxspltib(vec_8s->to_vsr(), 8);
+    __ xxspltib(vec_0xfs, 0xf);
+    if (PowerArchitecturePPC64 >= 10) {
+      __ xxspltib(vec_0x3fs->to_vsr(), 0x3f);
+    }
+
+    // The rest of the constants use different values depending on the
+    // setting of isURL
+    __ cmpdi(CCR0, isURL, 0);
+    __ beq(CCR0, not_URL);
+
+    // isURL != 0 (true)
+    __ load_const(const_ptr, (address)&offsetLUT_URL_val, tmp_reg);
+    __ lxv(offsetLUT, 0, const_ptr);
+    __ load_const(const_ptr, (address)&maskLUT_URL_val, tmp_reg);
+    __ lxv(maskLUT, 0, const_ptr);
+    __ xxspltib(vec_special_case_char->to_vsr(), '_');
+    __ xxspltib(vec_special_case_offset, (unsigned char)US);
+    __ b(calculate_size);
+
+    // isURL == 0 (false)
+    __ bind(not_URL);
+    __ load_const(const_ptr, (address)&offsetLUT_val, tmp_reg);
+    __ lxv(offsetLUT, 0, const_ptr);
+    __ load_const(const_ptr, (address)&maskLUT_val, tmp_reg);
+    __ lxv(maskLUT, 0, const_ptr);
+    __ xxspltib(vec_special_case_char->to_vsr(), '/');
+    __ xxspltib(vec_special_case_offset, (unsigned char)SLS);
+
+    __ bind(calculate_size);
+
+    // Don't handle the last 4 characters of the source, because this
+    // VSX-based algorithm doesn't handle padding characters. Also the
+    // vector code will always write 16 bytes of decoded data on each pass,
+    // but only the first 12 of those 16 bytes are valid data (16 base64
+    // characters become 12 bytes of binary data), so for this reason we
+    // need to subtract an additional 8 bytes from the source length, in
+    // order not to write past the end of the destination buffer. The
+    // result of this subtraction implies that the non-intrinsic routine
+    // will be used to process the last 12 characters.
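+    // For example, with the default loop_unrolls of 8 (block_size = 128), a
+    // source length of 400 becomes 388 here, which the clrrdi below rounds
+    // down to 384; the remaining 16 characters are left to the non-intrinsic
+    // code.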
+    __ subi(sl, sl, 12);
+
+    // Round sl down to the nearest multiple of block_size
+    __ clrrdi(sl, sl, block_size_clear);
+
+    // out starts at the beginning of the destination
+    __ addi(out, d, 0);
+
+    // in starts at s + sp
+    __ add(in, s, sp);
+
+    // Address of the last byte of the source is (in + sl - 1)
+    __ add(end, in, sl);
+    __ subi(end, end, 1);
+
+    __ bind(unrolled_loop_start);
+
+    __ cmpd(CCR0, end, in);
+    __ blt_predict_not_taken(CCR0, unrolled_loop_exit);
+    for (unsigned unroll_cnt = 0; unroll_cnt < loop_unrolls; unroll_cnt++) {
+      // We can use a static displacement in the load since it's always a
+      // multiple of 16, which is a requirement of lxv/stxv. This saves
+      // an addi instruction.
+      __ lxv(input->to_vsr(), unroll_cnt * 16, in);
+      //
+      // Lookup
+      //
+      // Isolate the upper 4 bits of each character by shifting it right 4 bits
+      __ vsrb(higher_nibble, input, vec_4s);
+      // Isolate the lower 4 bits by masking
+      __ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
+
+      // Get the offset (the value to subtract from the byte) by using
+      // a lookup table indexed by the upper 4 bits of the character
+      __ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
+
+      // Find out which elements are the special case character (isURL ? '_' : '/')
+      __ vcmpequb_(eq_special_case_char, input, vec_special_case_char);
+      //
+      // There's a (63/64)^16 = 77.7% chance that there are no special
+      // case chars in these 16 bytes of input. When we detect this case
+      // (CCR6-EQ, all comparisons are false), we can skip the xxsel
+      // step.
+      __ beq_predict_taken(CCR6, skip_xxsel[unroll_cnt]);
+
+      // For each character in the input which is a special case
+      // character, replace its offset with one that is special for that
+      // character.
+      __ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
+
+      // Note that skip_xxsel is indexed because this code is contained
+      // in a C++ loop (the emitted code in this unrolled loop doesn't
+      // loop). The indexing allows the creation of unique labels for
+      // each iteration of the unrolled loop.
+      __ bind(skip_xxsel[unroll_cnt]);
+
+      // Use the lower_nibble to select a mask "M" from the lookup table.
+      __ xxperm(M, maskLUT, lower_nibble);
+
+      // "bit" is used to isolate which of the bits in M is relevant.
+      __ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
+
+      // Each element of non_match corresponds to one of the 16 input
+      // characters. Those elements that become 0x00 after the xxland
+      // instruction are invalid Base64 characters.
+      __ xxland(non_match->to_vsr(), M, bit);
+
+      // Compare each element to zero
+      //
+      // vcmpequb_ sets the EQ bit of CCR6 if no elements compare equal.
+      // Any element comparing equal to zero means there is an error in
+      // that element. Note that the comparison result register
+      // non_match is not referenced again. Only CCR6-EQ matters.
+      __ vcmpequb_(non_match, non_match, vec_0s);
+      __ bne_predict_not_taken(CCR6, zero_processed_exit);
+
+      // The Base64 characters had no errors, so add the offsets
+      __ vaddubm(input, input, offsets);
+
+      // Pack
+      //
+      // Legend for the tables below: b0, b1, .. b11 are the bytes of
+      // decoded binary data. The specifier after the colon depicts
+      // which bits are there. The bit numbering is big endian style
+      // (bit 0 is the most significant). The || is a concatenate
+      // operator (same terminology as used in the Power ISA 3.x
+      // document). Strings of 0's are a field of zeros with the shown
+      // length.
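+      //
+      // As a scalar reference for the tables below (illustrative only, this
+      // is not emitted code): packing four 6-bit values c0..c3 into three
+      // bytes is
+      //   b0 = (c0 << 2) | (c1 >> 4);
+      //   b1 = ((c1 & 0xf) << 4) | (c2 >> 2);
+      //   b2 = ((c2 & 0x3) << 6) | c3;
+      // The vector code below does this for all 16 input characters at once.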
+
+      if (PowerArchitecturePPC64 >= 10) {
+        // Note that only e15..e8 are shown here because the extract
+        // bit pattern is the same in e7..e0.
+        //
+        // +===============+=============+======================+======================+=============+=============+======================+======================+=============+
+        // | Vector        | e15         | e14                  | e13                  | e12         | e11         | e10                  | e9                   | e8          |
+        // | Element       |             |                      |                      |             |             |                      |                      |             |
+        // +===============+=============+======================+======================+=============+=============+======================+======================+=============+
+        // | after vaddubm | 00||b0:0..5 | 00||b0:6..7||b1:0..3 | 00||b1:4..7||b2:0..1 | 00||b2:2..7 | 00||b3:0..5 | 00||b3:6..7||b4:0..3 | 00||b4:4..7||b5:0..1 | 00||b5:2..7 |
+        // +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+        // | after xxbrd   | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
+        // +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+        // | vec_0x3fs     | 00111111    | 00111111             | 00111111             | 00111111    | 00111111    | 00111111             | 00111111             | 00111111    |
+        // +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+        // | after vpextd  | b5:0..7     | b4:0..7              | b3:0..7              | b2:0..7     | b1:0..7     | b0:0..7              | 00000000             | 00000000    |
+        // +===============+=============+======================+======================+=============+=============+======================+======================+=============+
+
+        __ xxbrd(input->to_vsr(), input->to_vsr());
+        __ vpextd(gathered, input, vec_0x3fs);
+
+        // Final jostling of bytes into their correct positions.
+        // +==================+=====+=====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+====+====+====+====+
+        // | Vector           | e15 | e14 | e13 | e12 | e11 | e10 | e9 | e8 | e7  | e6  | e5  | e4  | e3 | e2 | e1 | e0 |
+        // | Elements         |     |     |     |     |     |     |    |    |     |     |     |     |    |    |    |    |
+        // +==================+=====+=====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+====+====+====+====+
+        // | after vpextd     | b5  | b4  | b3  | b2  | b1  | b0  | 0  | 0  | b11 | b10 | b9  | b8  | b7 | b6 | 0  | 0  |
+        // +------------------+-----+-----+-----+-----+-----+-----+----+----+-----+-----+-----+-----+----+----+----+----+
+        // | p10_pack_permute | 10  | 11  | 12  | 13  | 14  | 15  | 2  | 3  | 4   | 5   | 6   | 7   | 0  | 0  | 0  | 0  |
+        // +------------------+-----+-----+-----+-----+-----+-----+----+----+-----+-----+-----+-----+----+----+----+----+
+        // | after xxperm     | b0  | b1  | b2  | b3  | b4  | b5  | b6 | b7 | b8  | b9  | b10 | b11 | 0  | 0  | 0  | 0  |
+        // +==================+=====+=====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+====+====+====+====+
+      } else {
+        // Note that only e15..e12 are shown here because the shifting
+        // and OR'ing pattern replicates for e11..e8, e7..e4, and
+        // e3..e0.
+        //
+        // +======================+=============+======================+======================+=================+
+        // | Vector               | e15         | e14                  | e13                  | e12             |
+        // | Element              |             |                      |                      |                 |
+        // +======================+=============+======================+======================+=================+
+        // | after vaddubm        | 00||b0:0..5 | 00||b0:6..7||b1:0..3 | 00||b1:4..7||b2:0..1 | 00||b2:2..7     |
+        // +----------------------+-------------+----------------------+----------------------+-----------------+
+        // | pack_lshift          | << 2        | << 4                 | << 6                 |                 |
+        // +----------------------+-------------+----------------------+----------------------+-----------------+
+        // | l after vslb         | b0:0..5||00 | b1:0..3||0000        | b2:0..1||000000      | 00||b2:2..7     |
+        // +----------------------+-------------+----------------------+----------------------+-----------------+
+        // | l after vslo         | 00000000    | b0:0..5||00          | b1:0..3||0000        | b2:0..1||000000 |
+        // +----------------------+-------------+----------------------+----------------------+-----------------+
+        // | pack_rshift          |             | >> 4                 | >> 2                 |                 |
+        // +----------------------+-------------+----------------------+----------------------+-----------------+
+        // | r after vsrb         | 00||b0:0..5 | 000000||b0:6..7      | 0000||b1:4..7        | 00||b2:2..7     |
+        // +----------------------+-------------+----------------------+----------------------+-----------------+
+        // | gathered after xxlor | 00||b0:0..5 | b0:0..7              | b1:0..7              | b2:0..7         |
+        // +======================+=============+======================+======================+=================+
+        //
+        //
+        __ vslb(l, input, pack_lshift);
+        // vslo of vec_8s shifts the vector by one octet toward lower
+        // element numbers, discarding element 0. This means it actually
+        // shifts to the right (not left) according to the order of the
+        // table above.
+        __ vslo(l, l, vec_8s);
+        __ vsrb(r, input, pack_rshift);
+        __ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
+
+        // Final jostling of bytes into their correct positions.
+        // +==============+=====+=====+=====+=====+=====+=====+====+====+====+====+=====+=====+======+======+======+======+
+        // | Vector       | e15 | e14 | e13 | e12 | e11 | e10 | e9 | e8 | e7 | e6 | e5  | e4  | e3   | e2   | e1   | e0   |
+        // | Elements     |     |     |     |     |     |     |    |    |    |    |     |     |      |      |      |      |
+        // +==============+=====+=====+=====+=====+=====+=====+====+====+====+====+=====+=====+======+======+======+======+
+        // | after xxlor  | xx  | b0  | b1  | b2  | xx  | b3  | b4 | b5 | xx | b6 | b7  | b8  | xx   | b9   | b10  | b11  |
+        // +--------------+-----+-----+-----+-----+-----+-----+----+----+----+----+-----+-----+------+------+------+------+
+        // | pack_permute | 14  | 13  | 12  | 10  | 9   | 8   | 6  | 5  | 4  | 2  | 1   | 0   | 0    | 0    | 0    | 0    |
+        // +--------------+-----+-----+-----+-----+-----+-----+----+----+----+----+-----+-----+------+------+------+------+
+        // | after xxperm | b0  | b1  | b2  | b3  | b4  | b5  | b6 | b7 | b8 | b9 | b10 | b11 | b11* | b11* | b11* | b11* |
+        // +==============+=====+=====+=====+=====+=====+=====+====+====+====+====+=====+=====+======+======+======+======+
+        // xx bytes are not used to form the final data
+        // b0..b11 are the decoded and reassembled 8-bit bytes of data
+        // b11 with asterisk is a "don't care", because these bytes will be
+        // overwritten on the next iteration.
+      }
+      __ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
+
+      // We cannot use a static displacement on the store, since the offset is
+      // a multiple of 12, not of 16 as stxv requires. Note that this stxv
+      // instruction actually writes 16 bytes, even though only the first 12
+      // are valid data.
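+      // Each pass through this unrolled body therefore advances out by 12
+      // while consuming 16 input characters; with loop_unrolls = 8, one trip
+      // around the outer loop consumes 128 Base64 characters and produces
+      // 96 bytes, i.e. the 4:3 ratio of Base64.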
+      __ stxv(gathered->to_vsr(), 0, out);
+      __ addi(out, out, 12);
+    }
+    __ addi(in, in, 16 * loop_unrolls);
+    __ b(unrolled_loop_start);
+
+    __ bind(unrolled_loop_exit);
+
+    // Return the number of output bytes produced, which is (out - d)
+    __ sub(R3_RET, out, d);
+    __ blr();
+
+    // Return 0 characters processed. This happens when an illegal Base64
+    // character was discovered.
+    __ bind(zero_processed_exit);
+    __ li(R3_RET, 0);
+    __ blr();
+    return start;
+  }
+
+#undef UC
+#undef LC
+#undef DIG
+#undef PLS
+#undef HYP
+#undef SLS
+#undef US
+
   // Initialization
   void generate_initial() {
     // Generates all stubs and initializes the entry points
@@ -3642,6 +4070,13 @@
       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
     }
+
+#ifdef VM_LITTLE_ENDIAN
+    // Currently supported on PPC64LE only
+    if (UseBASE64Intrinsics) {
+      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
+    }
+#endif
   }
 
  public: