/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 * ... (NOTE(review): the excerpt elides the intervening lines here and
 *      resumes inside the CRC32 stub's interface comment) ...
 *
 * Arguments:
 * R3_ARG1   - int   crc
 * R4_ARG2   - byte* buf
 * R5_ARG3   - int   length (of buffer)
 *
 * scratch:
 *   R2, R6-R12
 *
 * Output:
 * R3_RET    - int   crc result
 */
// Compute CRC32 function.
// The whole computation is delegated to MacroAssembler::crc32(); R2 and
// R6-R12 are handed over as scratch registers (all volatile under the
// ELF V2 ABI, so no save/restore is needed).
address generate_CRC32_updateBytes(bool is_crc32c) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", is_crc32c ? "CRC32C_updateBytes" : "CRC32_updateBytes");
  address start = __ function_entry();  // Remember stub start address (is rtn value).
  __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
  __ blr();
  return start;
}

// Initialization
void generate_initial() {
  // Generates all stubs and initializes the entry points

  // Entry points that exist in all platforms.
  // Note: This is code that could be shared among different platforms - however the
  //       benefit seems to be smaller than the disadvantage of having a
  //       much more complicated generator structure. See also comment in
  //       stubRoutines.hpp.

  StubRoutines::_forward_exception_entry = generate_forward_exception();
  StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
  StubRoutines::_catch_exception_entry   = generate_catch_exception();

  // Build this early so it's available for the interpreter.
  StubRoutines::_throw_StackOverflowError_entry =
    generate_throw_exception("StackOverflowError throw_exception",
                             CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
  StubRoutines::_throw_delayed_StackOverflowError_entry =
    generate_throw_exception("delayed StackOverflowError throw_exception",
  // NOTE(review): the excerpt jumps from line 3566 to 3625 here; the rest of
  // this call and the stubs generated in between are not visible.

  // data cache line writeback
  if (VM_Version::supports_data_cache_line_flush()) {
    StubRoutines::_data_cache_writeback      = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
  }

  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  }

  if (UseSHA256Intrinsics) {
    StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
    StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
  }
  if (UseSHA512Intrinsics) {
    StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
    StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
  }
}

public:
StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  // replace the
standard masm with a special one:
  _masm = new MacroAssembler(code);
  if (all) {
    generate_all();
  } else {
    generate_initial();
  }
}
};

#define UCM_TABLE_MAX_ENTRIES 8
// VM entry point: lazily creates the UnsafeCopyMemory fault table, then
// emits the stubs into 'code' (all of them, or only the early-boot subset).
void StubGenerator_generate(CodeBuffer* code, bool all) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, all);

// ===========================================================================
// NOTE(review): the excerpt truncates here (the '|' column separator of the
// original capture); a second, newer revision of the same file follows
// (copyright years 2020 vs. 2019 above).
// ===========================================================================

/*
 * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 * ... (NOTE(review): the excerpt elides the intervening lines here and
 *      resumes inside the CRC32 stub's interface comment) ...
 *
 * Arguments:
 * R3_ARG1   - int   crc
 * R4_ARG2   - byte* buf
 * R5_ARG3   - int   length (of buffer)
 *
 * scratch:
 *   R2, R6-R12
 *
 * Output:
 * R3_RET    - int   crc result
 */
// Compute CRC32 function.
// Identical to the 2019 revision above: the computation is delegated to
// MacroAssembler::crc32() with R2, R6-R12 as scratch registers.
address generate_CRC32_updateBytes(bool is_crc32c) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", is_crc32c ? "CRC32C_updateBytes" : "CRC32_updateBytes");
  address start = __ function_entry(); // Remember stub start address (is rtn value).
  __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
  __ blr();
  return start;
}


// The following Base64 decode intrinsic is based on an algorithm outlined
// in here:
// http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
// in the section titled "Vector lookup (pshufb with bitmask)"
//
// This implementation differs in the following ways:
//  * Instead of Intel SSE instructions, Power AltiVec VMX and VSX instructions
//    are used instead. It turns out that some of the vector operations
//    needed in the algorithm require fewer AltiVec instructions.
//  * The algorithm in the above mentioned paper doesn't handle the
//    Base64-URL variant in RFC 4648. Adjustments to both the code and to two
//    lookup tables are needed for this.
//  * The "Pack" section of the code is a complete rewrite for Power because we
//    can utilize better instructions for this step.
//

// Offsets per group of Base64 characters
// (each is the value ADDED to the ASCII code to obtain the 6-bit value;
//  stored as a byte, hence the & 0xff).
// Uppercase
#define UC  (signed char)((-'A' + 0) & 0xff)
// Lowercase
#define LC  (signed char)((-'a' + 26) & 0xff)
// Digits
#define DIG (signed char)((-'0' + 52) & 0xff)
// Plus sign (URL = 0)
#define PLS (signed char)((-'+' + 62) & 0xff)
// Hyphen (URL = 1)
#define HYP (signed char)((-'-' + 62) & 0xff)
// Slash (URL = 0)
#define SLS (signed char)((-'/' + 63) & 0xff)
// Underscore (URL = 1)
#define US  (signed char)((-'_' + 63) & 0xff)

// In little-endian mode, the lxv instruction loads the element at EA into
// element 15 of the vector register, EA+1 goes into element 14, and so on.
//
// To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the order of
// the elements in a vector initialization.
#define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0

//
// Base64 decodeBlock intrinsic
//
// Inputs (see register assignments below): source base, source offset,
// source length, destination base, and an isURL flag selecting the RFC 4648
// base64url alphabet. Returns in R3_RET the number of bytes written to the
// destination, or 0 if an illegal Base64 character was encountered.
address generate_base64_decodeBlock() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "base64_decodeBlock");
  address start = __ function_entry();

  // Offset table indexed (via xxperm) by the upper nibble of each input
  // character -- standard Base64 alphabet.
  static const __vector signed char offsetLUT_val = {
    ARRAY_TO_LXV_ORDER(
    0,  0, PLS, DIG,  UC,  UC,  LC,  LC,
    0,  0,   0,   0,   0,   0,   0,   0 ) };

  // Same, for the base64url alphabet ('-' replaces '+').
  static const __vector signed char offsetLUT_URL_val = {
    ARRAY_TO_LXV_ORDER(
    0,  0, HYP, DIG,  UC,  UC,  LC,  LC,
    0,  0,   0,   0,   0,   0,   0,   0 ) };

  // Validation bit masks indexed by the LOWER nibble; combined with
  // bitposLUT (indexed by the upper nibble) to detect invalid characters.
  static const __vector unsigned char maskLUT_val = {
    ARRAY_TO_LXV_ORDER(
    /* 0        */ (unsigned char)0b10101000,
    /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
                   (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
                   (unsigned char)0b11111000,
    /* 10       */ (unsigned char)0b11110000,
    /* 11       */ (unsigned char)0b01010100,
    /* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
    /* 15       */ (unsigned char)0b01010100 ) };

  // Validation masks for the base64url alphabet.
  static const __vector unsigned char maskLUT_URL_val = {
    ARRAY_TO_LXV_ORDER(
    /* 0        */ (unsigned char)0b10101000,
    /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
                   (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
                   (unsigned char)0b11111000,
    /* 10       */ (unsigned char)0b11110000,
    /* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
    /* 13       */ (unsigned char)0b01010100,
    /* 14       */ (unsigned char)0b01010000,
    /* 15       */ (unsigned char)0b01110000 ) };

  // One set bit per upper-nibble value (0..7); elements 8..15 are zero.
  static const __vector unsigned char bitposLUT_val = {
    ARRAY_TO_LXV_ORDER(
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) };

  // Per-element shift counts for the pre-Power10 "pack" step (see the
  // tables inside the unrolled loop below).
  static const __vector unsigned char pack_lshift_val = { 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0 };

  static const __vector unsigned char pack_rshift_val = { 0, 4, 2, 0, 0, 4, 2, 0, 0, 4, 2, 0, 0, 4, 2, 0 };

  // The last 4 index values are "don't care" because
  // we only use the first 12 bytes of the vector,
  // which are decoded from 16 bytes of Base64 characters.
  static const __vector unsigned char pack_permute_val = {
    14, 13, 12,
    10,  9,  8,
     6,  5,  4,
     2,  1,  0,
     0,  0,  0, 0 };

  static const __vector unsigned char p10_pack_permute_val = {
    10, 11, 12, 13, 14, 15,
     2,  3,  4,  5,  6,  7,
     0,  0,  0,  0 };

  const unsigned loop_unrolls = 8; // needs to be a power of two so that the rounding can be done using a mask
  const unsigned vec_size = 16; // size of vector registers in bytes
  const unsigned block_size = vec_size * loop_unrolls; // number of bytes to process in each pass through the loop
  const unsigned block_size_clear = exact_log2(block_size); // the lower log2(block_size) bits of the size

  // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
  Register s     = R3_ARG1; // source starting address of Base64 characters
  Register sp    = R4_ARG2; // actual start of processing is at s + sp
  Register sl    = R5_ARG3; // source length = # of Base64 characters to be processed
  Register d     = R6_ARG4; // destination address
  Register isURL = R7_ARG5; // boolean, if non-zero indicates use of RFC 4648 base64url encoding

  // Local variables
  Register const_ptr = R8; // used for loading constants
  Register tmp_reg   = R9; // used for speeding up load_constant()

  // Re-use R8 and R9 to avoid using non-volatile registers (requires save/restore)
  Register out           = R8;  // moving out (destination) pointer
  Register in            = R9;  // moving in (source) pointer
  Register end           = R10; // pointer to the last byte of the source
  Register non_match_cnt = R11; // flag for detecting non-BASE64 characters
                                // NOTE(review): not referenced in the code
                                // below -- verify whether it is still needed.


  // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
  // VR Constants
  VectorRegister  vec_0s                = VR0;
  VectorRegister  vec_4s                = VR1;
  VectorRegister  vec_8s                = VR2;
  VectorRegister  vec_special_case_char = VR3;
  VectorRegister  pack_rshift           = VR4;
  VectorRegister  pack_lshift           = VR5;
  // P10+
  VectorRegister  vec_0x3fs             = VR4; // safe to reuse pack_rshift's register

  // VSR Constants
  VectorSRegister offsetLUT               = VSR0;
  VectorSRegister maskLUT                 = VSR1;
  VectorSRegister bitposLUT               = VSR2;
  VectorSRegister vec_0xfs                = VSR3;
  VectorSRegister vec_special_case_offset = VSR4;
  VectorSRegister pack_permute            = VSR5;

  // Variables for lookup
  // VR
  VectorRegister  input                = VR6;
  VectorRegister  higher_nibble        = VR7;
  VectorRegister  eq_special_case_char = VR8;
  VectorRegister  offsets              = VR9;
  VectorRegister  non_match            = VR10;

  // VSR
  VectorSRegister bit          = VSR6;
  VectorSRegister lower_nibble = VSR7;
  VectorSRegister M            = VSR8;

  // Variables for pack
  // VR (these deliberately alias the lookup-phase registers, which are
  // dead by the time the pack phase starts)
  VectorRegister  l        = VR7; // reuse higher_nibble's register
  VectorRegister  r        = VR8; // reuse eq_special_case_char's register
  VectorRegister  gathered = VR9; // reuse offsets's register

  Label not_URL, calculate_size, unrolled_loop_start, skip_xxsel[loop_unrolls], unrolled_loop_exit, zero_processed_exit;

  // Load constant vec registers that need to be loaded from memory
  __ load_const(const_ptr, (address)&bitposLUT_val, tmp_reg);
  __ lxv(bitposLUT, 0, const_ptr);
  if (PowerArchitecturePPC64 >= 10) {
    __ load_const(const_ptr, (address)&p10_pack_permute_val, tmp_reg);
  } else {
    __ load_const(const_ptr, (address)&pack_rshift_val, tmp_reg);
    __ lxv(pack_rshift->to_vsr(), 0, const_ptr);
    __ load_const(const_ptr, (address)&pack_lshift_val, tmp_reg);
    __ lxv(pack_lshift->to_vsr(), 0, const_ptr);
    __ load_const(const_ptr, (address)&pack_permute_val, tmp_reg);
  }
  // const_ptr now addresses whichever permute constant fits this CPU.
  __ lxv(pack_permute, 0, const_ptr);

  // Splat the constants that can use xxspltib
  __ xxspltib(vec_0s->to_vsr(), 0);
  __ xxspltib(vec_4s->to_vsr(), 4);
  __ xxspltib(vec_8s->to_vsr(), 8);
  __ xxspltib(vec_0xfs, 0xf);
  if (PowerArchitecturePPC64 >= 10) {
    __ xxspltib(vec_0x3fs->to_vsr(), 0x3f);
  }

  // The rest of the constants use different values depending on the
  // setting of isURL
  __ cmpdi(CCR0, isURL, 0);
  __ beq(CCR0, not_URL);

  // isURL != 0 (true)
  __ load_const(const_ptr, (address)&offsetLUT_URL_val, tmp_reg);
  __ lxv(offsetLUT, 0, const_ptr);
  __ load_const(const_ptr, (address)&maskLUT_URL_val, tmp_reg);
  __ lxv(maskLUT, 0, const_ptr);
  __ xxspltib(vec_special_case_char->to_vsr(), '_');
  __ xxspltib(vec_special_case_offset, (unsigned char)US);
  __ b(calculate_size);

  // isURL = 0 (false)
  __ bind(not_URL);
  __ load_const(const_ptr, (address)&offsetLUT_val, tmp_reg);
  __ lxv(offsetLUT, 0, const_ptr);
  __ load_const(const_ptr, (address)&maskLUT_val, tmp_reg);
  __ lxv(maskLUT, 0, const_ptr);
  __ xxspltib(vec_special_case_char->to_vsr(), '/');
  __ xxspltib(vec_special_case_offset, (unsigned char)SLS);

  __ bind(calculate_size);

  // Don't handle the last 4 characters of the source, because this
  // VSX-based algorithm doesn't handle padding characters. Also the
  // vector code will always write 16 bytes of decoded data on each pass,
  // but only the first 12 of those 16 bytes are valid data (16 base64
  // characters become 12 bytes of binary data), so for this reason we
  // need to subtract an additional 8 bytes from the source length, in
  // order not to write past the end of the destination buffer. The
  // result of this subtraction implies that the non-intrinsic routine
  // will be used to process the last 12 characters.
  __ subi(sl, sl, 12);

  // Round sl down to the nearest multiple of block_size
  __ clrrdi(sl, sl, block_size_clear);

  // out starts at the beginning of the destination
  __ addi(out, d, 0);

  // in starts at s + sp
  __ add(in, s, sp);

  // Address of the last byte of the source is (in + sl - 1)
  __ add(end, in, sl);
  __ subi(end, end, 1);

  __ bind(unrolled_loop_start);

  __ cmpd(CCR0, end, in);
  __ blt_predict_not_taken(CCR0, unrolled_loop_exit);
  for (unsigned unroll_cnt = 0; unroll_cnt < loop_unrolls; unroll_cnt++) {
    // We can use a static displacement in the load since it's always a
    // multiple of 16, which is a requirement of lxv/stxv. This saves
    // an addi instruction.
    __ lxv(input->to_vsr(), unroll_cnt * 16, in);
    //
    // Lookup
    //
    // Isolate the upper 4 bits of each character by shifting it right 4 bits
    __ vsrb(higher_nibble, input, vec_4s);
    // Isolate the lower 4 bits by masking
    __ xxland(lower_nibble, input->to_vsr(), vec_0xfs);

    // Get the offset (the value to subtract from the byte) by using
    // a lookup table indexed by the upper 4 bits of the character
    __ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());

    // Find out which elements are the special case character (isURL ? '_' : '/')
    __ vcmpequb_(eq_special_case_char, input, vec_special_case_char);
    //
    // There's a (63/64)^16 = 77.7% chance that there are no special
    // case chars in this 16 bytes of input. When we detect this case
    // (CCR6-EQ, all comparisons are false), we can skip the xxsel
    // step.
    __ beq_predict_taken(CCR6, skip_xxsel[unroll_cnt]);

    // For each character in the input which is a special case
    // character, replace its offset with one that is special for that
    // character.
    __ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());

    // Note that skip_xxsel is indexed because this code is contained
    // in a C++ loop (the emitted code in this unroll loop doesn't
    // loop). The indexing allows the creation of unique labels for
    // each iteration of the unrolled loop.
    __ bind(skip_xxsel[unroll_cnt]);

    // Use the lower_nibble to select a mask "M" from the lookup table.
    __ xxperm(M, maskLUT, lower_nibble);

    // "bit" is used to isolate which of the bits in M is relevant.
    __ xxperm(bit, bitposLUT, higher_nibble->to_vsr());

    // Each element of non_match corresponds to one each of the 16 input
    // characters. Those elements that become 0x00 after the xxland
    // instruction are invalid Base64 characters.
    __ xxland(non_match->to_vsr(), M, bit);

    // Compare each element to zero
    //
    // vcmpequb_ sets the EQ bit of CCR6 if no elements compare equal.
    // Any element comparing equal to zero means there is an error in
    // that element. Note that the comparison result register
    // non_match is not referenced again. Only CCR6-EQ matters.
    __ vcmpequb_(non_match, non_match, vec_0s);
    __ bne_predict_not_taken(CCR6, zero_processed_exit);

    // The Base64 characters had no errors, so add the offsets
    __ vaddubm(input, input, offsets);

    // Pack
    //
    // Legend for the tables below: b0, b1, .. b15 are the bytes of
    // decoded binary data. The specifier after the colon depicts
    // which bits are there. The bit numbering is big endian style
    // (bit 0 is the most significant). The || is a concatenate
    // operator (same terminology as used in the Power ISA 3.x
    // document). Strings of 0's are a field of zeros with the shown
    // length.

    if (PowerArchitecturePPC64 >= 10) {
      // Note that only e15..e8 are shown here because the extract
      // bit pattern is the same in e7..e0.
      //
      // +===============+=============+======================+======================+=============+=============+======================+======================+=============+
      // | Vector        | e15         | e14                  | e13                  | e12         | e11         | e10                  | e9                   | e8          |
      // | Element       |             |                      |                      |             |             |                      |                      |             |
      // +===============+=============+======================+======================+=============+=============+======================+======================+=============+
      // | after vaddubm | 00||b0:0..5 | 00||b0:6..7||b1:0..3 | 00||b1:4..7||b2:0..1 | 00||b2:2..7 | 00||b3:0..5 | 00||b3:6..7||b4:0..3 | 00||b4:4..7||b5:0..1 | 00||b5:2..7 |
      // +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
      // | after xxbrd   | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
      // +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
      // | vec_0x3fs     | 00111111    | 00111111             | 00111111             | 00111111    | 00111111    | 00111111             | 00111111             | 00111111    |
      // +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
      // | after vpextd  | b5:0..7     | b4:0..7              | b3:0..7              | b2:0..7     | b1:0..7     | b0:0..7              | 00000000             | 00000000    |
      // +===============+=============+======================+======================+=============+=============+======================+======================+=============+

      __ xxbrd(input->to_vsr(), input->to_vsr());
      __ vpextd(gathered, input, vec_0x3fs);

      // Final jostling of bytes into their correct positions.
      // +==================+=====+=====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+====+====+====+====+
      // | Vector           | e15 | e14 | e13 | e12 | e11 | e10 | e9 | e8 | e7  | e6  | e5  | e4  | e3 | e2 | e1 | e0 |
      // | Elements         |     |     |     |     |     |     |    |    |     |     |     |     |    |    |    |    |
      // +==================+=====+=====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+====+====+====+====+
      // | after vpextd     | b5  | b4  | b3  | b2  | b1  | b0  | 0  | 0  | b11 | b10 | b9  | b8  | b7 | b6 | 0  | 0  |
      // +------------------+-----+-----+-----+-----+-----+-----+----+----+-----+-----+-----+-----+----+----+----+----+
      // | p10_pack_permute | 10  | 11  | 12  | 13  | 14  | 15  | 2  | 3  | 4   | 5   | 6   | 7   | 0  | 0  | 0  | 0  |
      // +------------------+-----+-----+-----+-----+-----+-----+----+----+-----+-----+-----+-----+----+----+----+----+
      // | after xxperm     | b0  | b1  | b2  | b3  | b4  | b5  | b6 | b7 | b8  | b9  | b10 | b11 | 0  | 0  | 0  | 0  |
      // +==================+=====+=====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+====+====+====+====+
    } else {
      // Note that only e15..e12 are shown here because the shifting
      // and OR'ing pattern replicates for e11..e8, e7..e4, and
      // e3..e0.
      //
      // +======================+=============+======================+======================+=================+
      // | Vector               | e15         | e14                  | e13                  | e12             |
      // | Element              |             |                      |                      |                 |
      // +======================+=============+======================+======================+=================+
      // | after vaddubm        | 00||b0:0..5 | 00||b0:6..7||b1:0..3 | 00||b1:4..7||b2:0..1 | 00||b2:2..7     |
      // +----------------------+-------------+----------------------+----------------------+-----------------+
      // | pack_lshift          |  << 2       |  << 4                |  << 6                |                 |
      // +----------------------+-------------+----------------------+----------------------+-----------------+
      // | l after vslb         | b0:0..5||00 | b1:0..3||0000        | b2:0..1||000000      | 00||b2:2..7     |
      // +----------------------+-------------+----------------------+----------------------+-----------------+
      // | l after vslo         | 00000000    | b0:0..5||00          | b1:0..3||0000        | b2:0..1||000000 |
      // +----------------------+-------------+----------------------+----------------------+-----------------+
      // | pack_rshift          |             |  >> 4                |  >> 2                |                 |
      // +----------------------+-------------+----------------------+----------------------+-----------------+
      // | r after vsrb         | 00||b0:0..5 | 000000||b0:6..7      | 0000||b1:4..7        | 00||b2:2..7     |
      // +----------------------+-------------+----------------------+----------------------+-----------------+
      // | gathered after xxlor | 00||b0:0..5 | b0:0..7              | b1:0..7              | b2:0..7         |
      // +======================+=============+======================+======================+=================+
      //
      //
      __ vslb(l, input, pack_lshift);
      // vslo of vec_8s shifts the vector by one octet toward lower
      // element numbers, discarding element 0. This means it actually
      // shifts to the right (not left) according to the order of the
      // table above.
      __ vslo(l, l, vec_8s);
      __ vsrb(r, input, pack_rshift);
      __ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());

      // Final jostling of bytes into their correct positions.
      // +==============+=====+=====+=====+=====+=====+=====+====+====+====+====+=====+=====+======+======+======+======+
      // | Vector       | e15 | e14 | e13 | e12 | e11 | e10 | e9 | e8 | e7 | e6 | e5  | e4  | e3   | e2   | e1   | e0   |
      // | Elements     |     |     |     |     |     |     |    |    |    |    |     |     |      |      |      |      |
      // +==============+=====+=====+=====+=====+=====+=====+====+====+====+====+=====+=====+======+======+======+======+
      // | after xxlor  | xx  | b0  | b1  | b2  | xx  | b3  | b4 | b5 | xx | b6 | b7  | b8  | xx   | b9   | b10  | b11  |
      // +--------------+-----+-----+-----+-----+-----+-----+----+----+----+----+-----+-----+------+------+------+------+
      // | pack_permute | 14  | 13  | 12  | 10  | 9   | 8   | 6  | 5  | 4  | 2  | 1   | 0   | 0    | 0    | 0    | 0    |
      // +--------------+-----+-----+-----+-----+-----+-----+----+----+----+----+-----+-----+------+------+------+------+
      // | after xxperm | b0  | b1  | b2  | b3  | b4  | b5  | b6 | b7 | b8 | b9 | b10 | b11 | b11* | b11* | b11* | b11* |
      // +==============+=====+=====+=====+=====+=====+=====+====+====+====+====+=====+=====+======+======+======+======+
      // xx bytes are not used to form the final data
      // b0..b15 are the decoded and reassembled 8-bit bytes of data
      // b11 with asterisk is a "don't care", because these bytes will be
      // overwritten on the next iteration.
    }
    __ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);

    // We cannot use a static displacement on the store, since it's a
    // multiple of 12, not 16. Note that this stxv instruction actually
    // writes 16 bytes, even though only the first 12 are valid data.
    __ stxv(gathered->to_vsr(), 0, out);
    __ addi(out, out, 12);
  }
  __ addi(in, in, 16 * loop_unrolls);
  __ b(unrolled_loop_start);

  __ bind(unrolled_loop_exit);

  // Return the number of out bytes produced, which is (out - d)
  __ sub(R3_RET, out, d);
  __ blr();

  // Return 0 characters processed. This can be due to an illegal Base64 character
  // that was discovered.
  __ bind(zero_processed_exit);
  __ li(R3_RET, 0);
  __ blr();
  return start;
}

#undef UC
#undef LC
#undef DIG
#undef PLS
#undef HYP
#undef SLS
#undef US

// Initialization
void generate_initial() {
  // Generates all stubs and initializes the entry points

  // Entry points that exist in all platforms.
  // Note: This is code that could be shared among different platforms - however the
  //       benefit seems to be smaller than the disadvantage of having a
  //       much more complicated generator structure. See also comment in
  //       stubRoutines.hpp.

  StubRoutines::_forward_exception_entry = generate_forward_exception();
  StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
  StubRoutines::_catch_exception_entry   = generate_catch_exception();

  // Build this early so it's available for the interpreter.
  StubRoutines::_throw_StackOverflowError_entry =
    generate_throw_exception("StackOverflowError throw_exception",
                             CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
  StubRoutines::_throw_delayed_StackOverflowError_entry =
    generate_throw_exception("delayed StackOverflowError throw_exception",
  // NOTE(review): the excerpt jumps from line 3994 to 4053 here; the rest of
  // this call and the stubs generated in between are not visible.

  // data cache line writeback
  if (VM_Version::supports_data_cache_line_flush()) {
    StubRoutines::_data_cache_writeback      = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
  }

  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  }

  if (UseSHA256Intrinsics) {
    StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
    StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
  }
  if (UseSHA512Intrinsics) {
    StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
    StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
  }

#ifdef VM_LITTLE_ENDIAN
  // Currently supported on PPC64LE only
  if (UseBASE64Intrinsics) {
    StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
  }
#endif
}

public:
// Installs a MacroAssembler on the given CodeBuffer and emits either the
// full stub set or only the early-boot subset, depending on 'all'.
StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  // replace the standard masm with a special one:
  _masm = new MacroAssembler(code);
  if (all) {
    generate_all();
  } else {
    generate_initial();
  }
}
};

#define UCM_TABLE_MAX_ENTRIES 8
// VM entry point: lazily creates the UnsafeCopyMemory fault table, then
// emits the stubs into 'code'.
void StubGenerator_generate(CodeBuffer* code, bool all) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, all);
  // NOTE(review): the excerpt ends here; the function's closing brace is not
  // visible.