< prev index next >

src/hotspot/cpu/ppc/stubGenerator_ppc.cpp

Print this page


   1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *


3527    *   R3_ARG1    - int   crc
3528    *   R4_ARG2    - byte* buf
3529    *   R5_ARG3    - int   length (of buffer)
3530    *
3531    * scratch:
3532    *   R2, R6-R12
3533    *
3534    * Output:
3535    *   R3_RET     - int   crc result
3536    */
3537   // Compute CRC32 function.
3538   address generate_CRC32_updateBytes(bool is_crc32c) {
3539     __ align(CodeEntryAlignment);
3540     StubCodeMark mark(this, "StubRoutines", is_crc32c ? "CRC32C_updateBytes" : "CRC32_updateBytes");
3541     address start = __ function_entry();  // Remember stub start address (is rtn value).
3542     __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
3543     __ blr();
3544     return start;
3545   }
3546 












































































































































































































































































































































































































































3547   // Initialization
3548   void generate_initial() {
3549     // Generates all stubs and initializes the entry points
3550 
3551     // Entry points that exist in all platforms.
3552     // Note: This is code that could be shared among different platforms - however the
3553     // benefit seems to be smaller than the disadvantage of having a
3554     // much more complicated generator structure. See also comment in
3555     // stubRoutines.hpp.
3556 
3557     StubRoutines::_forward_exception_entry          = generate_forward_exception();
3558     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
3559     StubRoutines::_catch_exception_entry            = generate_catch_exception();
3560 
3561     // Build this early so it's available for the interpreter.
3562     StubRoutines::_throw_StackOverflowError_entry   =
3563       generate_throw_exception("StackOverflowError throw_exception",
3564                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
3565     StubRoutines::_throw_delayed_StackOverflowError_entry =
3566       generate_throw_exception("delayed StackOverflowError throw_exception",


3625 
3626     // data cache line writeback
3627     if (VM_Version::supports_data_cache_line_flush()) {
3628       StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
3629       StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
3630     }
3631 
3632     if (UseAESIntrinsics) {
3633       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3634       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3635     }
3636 
3637     if (UseSHA256Intrinsics) {
3638       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
3639       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
3640     }
3641     if (UseSHA512Intrinsics) {
3642       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
3643       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
3644     }







3645   }
3646 
3647  public:
3648   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3649     // replace the standard masm with a special one:
3650     _masm = new MacroAssembler(code);
3651     if (all) {
3652       generate_all();
3653     } else {
3654       generate_initial();
3655     }
3656   }
3657 };
3658 
3659 #define UCM_TABLE_MAX_ENTRIES 8
3660 void StubGenerator_generate(CodeBuffer* code, bool all) {
3661   if (UnsafeCopyMemory::_table == NULL) {
3662     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
3663   }
3664   StubGenerator g(code, all);
   1 /*
   2  * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2020, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *


3527    *   R3_ARG1    - int   crc
3528    *   R4_ARG2    - byte* buf
3529    *   R5_ARG3    - int   length (of buffer)
3530    *
3531    * scratch:
3532    *   R2, R6-R12
3533    *
3534    * Output:
3535    *   R3_RET     - int   crc result
3536    */
3537   // Compute CRC32 function.
3538   address generate_CRC32_updateBytes(bool is_crc32c) {
3539     __ align(CodeEntryAlignment);
3540     StubCodeMark mark(this, "StubRoutines", is_crc32c ? "CRC32C_updateBytes" : "CRC32_updateBytes");
3541     address start = __ function_entry();  // Remember stub start address (is rtn value).
3542     __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
3543     __ blr();
3544     return start;
3545   }
3546 
3547 
3548 // The following Base64 decode intrinsic is based on an algorithm outlined
3549 // in here:
3550 // http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
3551 // in the section titled "Vector lookup (pshufb with bitmask)"
3552 //
3553 // This implementation differs in the following ways:
3554 //  * Instead of Intel SSE instructions, Power AltiVec VMX and VSX instructions
3555 //    are used instead.  It turns out that some of the vector operations
3556 //    needed in the algorithm require fewer AltiVec instructions.
3557 //  * The algorithm in the above mentioned paper doesn't handle the
3558 //    Base64-URL variant in RFC 4648.  Adjustments to both the code and to two
3559 //    lookup tables are needed for this.
3560 //  * The "Pack" section of the code is a complete rewrite for Power because we
3561 //    can utilize better instructions for this step.
3562 //
3563 
3564 // Offsets per group of Base64 characters
3565 // Uppercase
3566 #define UC  (signed char)((-'A' + 0) & 0xff)
3567 // Lowercase
3568 #define LC  (signed char)((-'a' + 26) & 0xff)
3569 // Digits
3570 #define DIG (signed char)((-'0' + 52) & 0xff)
3571 // Plus sign (URL = 0)
3572 #define PLS (signed char)((-'+' + 62) & 0xff)
3573 // Hyphen (URL = 1)
3574 #define HYP (signed char)((-'-' + 62) & 0xff)
3575 // Slash (URL = 0)
3576 #define SLS (signed char)((-'/' + 63) & 0xff)
3577 // Underscore (URL = 1)
3578 #define US  (signed char)((-'_' + 63) & 0xff)
3579 
3580 // In little-endian mode, the lxv instruction loads the element at EA into element 15
3581 // of the vector register, EA+1 goes into element 14, and so on.
3582 //
3583 // To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the order of
3584 // the elements in a vector initialization.
3585 
3586 #define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
3587 
3588   //
3589   // Base64 decodeBlock intrinsic
3590   address generate_base64_decodeBlock() {
3591     __ align(CodeEntryAlignment);
3592     StubCodeMark mark(this, "StubRoutines", "base64_decodeBlock");
3593     address start   = __ function_entry();
3594 
3595     static const __vector signed char offsetLUT_val = {
3596       ARRAY_TO_LXV_ORDER(
3597       0,   0, PLS, DIG,  UC,  UC,  LC,  LC,
3598       0,   0,   0,   0,   0,   0,   0,   0 ) };
3599 
3600     static const __vector signed char offsetLUT_URL_val = {
3601       ARRAY_TO_LXV_ORDER(
3602       0,   0, HYP, DIG,  UC,  UC,  LC,  LC,
3603       0,   0,   0,   0,   0,   0,   0,   0 ) };
3604 
3605     static const __vector unsigned char maskLUT_val = {
3606       ARRAY_TO_LXV_ORDER(
3607       /* 0        */ (unsigned char)0b10101000,
3608       /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3609                      (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3610                      (unsigned char)0b11111000,
3611       /* 10       */ (unsigned char)0b11110000,
3612       /* 11       */ (unsigned char)0b01010100,
3613       /* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
3614       /* 15       */ (unsigned char)0b01010100 ) };
3615 
3616     static const __vector unsigned char maskLUT_URL_val = {
3617       ARRAY_TO_LXV_ORDER(
3618       /* 0        */ (unsigned char)0b10101000,
3619       /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3620                      (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3621                      (unsigned char)0b11111000,
3622       /* 10       */ (unsigned char)0b11110000,
3623       /* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
3624       /* 13       */ (unsigned char)0b01010100,
3625       /* 14       */ (unsigned char)0b01010000,
3626       /* 15       */ (unsigned char)0b01110000 ) };
3627 
3628     static const __vector unsigned char bitposLUT_val = {
3629       ARRAY_TO_LXV_ORDER(
3630       0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
3631       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) };
3632 
3633     static const __vector unsigned char pack_lshift_val = { 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0 };
3634 
3635     static const __vector unsigned char pack_rshift_val = { 0, 4, 2, 0, 0, 4, 2, 0, 0, 4, 2, 0, 0, 4, 2, 0 };
3636 
3637     // The last 4 index values are "don't care" because
3638     // we only use the first 12 bytes of the vector,
3639     // which are decoded from 16 bytes of Base64 characters.
3640     static const __vector unsigned char pack_permute_val = {
3641       14, 13, 12,
3642       10,  9,  8,
3643        6,  5,  4,
3644        2,  1,  0,
3645        0,  0,  0, 0 };
3646 
3647     static const __vector unsigned char p10_pack_permute_val = {
3648       10, 11, 12, 13, 14, 15,
3649        2,  3,  4,  5,  6,  7,
3650        0,  0,  0,  0 };
3651 
3652     const unsigned loop_unrolls = 8; // needs to be a power of two so that the rounding can be done using a mask
3653     const unsigned vec_size = 16; // size of vector registers in bytes
3654     const unsigned block_size = vec_size * loop_unrolls;  // number of bytes to process in each pass through the loop
3655     const unsigned block_size_clear = exact_log2(block_size); // the lower log2(block_size) bits of the size
3656 
3657     // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
3658     Register s      = R3_ARG1; // source starting address of Base64 characters
3659     Register sp     = R4_ARG2; // actual start of processing is at s + sp
3660     Register sl     = R5_ARG3; // source length = # of Base64 characters to be processed
3661     Register d      = R6_ARG4; // destination address
3662     Register isURL  = R7_ARG5; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
3663 
3664     // Local variables
3665     Register const_ptr     = R8; // used for loading constants
3666     Register tmp_reg       = R9; // used for speeding up load_constant()
3667 
3668     // Re-use R8 and R9 to avoid using non-volatile registers (requires save/restore)
3669     Register out           = R8; // moving out (destination) pointer
3670     Register in            = R9; // moving in (source) pointer
3671     Register end           = R10; // pointer to the last byte of the source
3672     Register non_match_cnt = R11; // flag for detecting non-BASE64 characters
3673 
3674 
3675     // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
3676     // VR Constants
3677     VectorRegister  vec_0s                  = VR0;
3678     VectorRegister  vec_4s                  = VR1;
3679     VectorRegister  vec_8s                  = VR2;
3680     VectorRegister  vec_special_case_char   = VR3;
3681     VectorRegister  pack_rshift             = VR4;
3682     VectorRegister  pack_lshift             = VR5;
3683     // P10+
3684     VectorRegister  vec_0x3fs               = VR4; // safe to reuse pack_rshift's register
3685 
3686     // VSR Constants
3687     VectorSRegister offsetLUT               = VSR0;
3688     VectorSRegister maskLUT                 = VSR1;
3689     VectorSRegister bitposLUT               = VSR2;
3690     VectorSRegister vec_0xfs                = VSR3;
3691     VectorSRegister vec_special_case_offset = VSR4;
3692     VectorSRegister pack_permute            = VSR5;
3693 
3694     // Variables for lookup
3695     // VR
3696     VectorRegister  input                   = VR6;
3697     VectorRegister  higher_nibble           = VR7;
3698     VectorRegister  eq_special_case_char    = VR8;
3699     VectorRegister  offsets                 = VR9;
3700     VectorRegister  non_match               = VR10;
3701 
3702     // VSR
3703     VectorSRegister bit                     = VSR6;
3704     VectorSRegister lower_nibble            = VSR7;
3705     VectorSRegister M                       = VSR8;
3706 
3707     // Variables for pack
3708     // VR
3709     VectorRegister  l                       = VR7;  // reuse higher_nibble's register
3710     VectorRegister  r                       = VR8;  // reuse eq_special_case_char's register
3711     VectorRegister  gathered                = VR9;  // reuse offsets's register
3712 
3713     Label not_URL, calculate_size, unrolled_loop_start, skip_xxsel[loop_unrolls], unrolled_loop_exit, zero_processed_exit;
3714 
3715     // Load constant vec registers that need to be loaded from memory
3716     __ load_const(const_ptr, (address)&bitposLUT_val, tmp_reg);
3717     __ lxv(bitposLUT, 0, const_ptr);
3718     if (PowerArchitecturePPC64 >= 10) {
3719         __ load_const(const_ptr, (address)&p10_pack_permute_val, tmp_reg);
3720     } else {
3721         __ load_const(const_ptr, (address)&pack_rshift_val, tmp_reg);
3722         __ lxv(pack_rshift->to_vsr(), 0, const_ptr);
3723         __ load_const(const_ptr, (address)&pack_lshift_val, tmp_reg);
3724         __ lxv(pack_lshift->to_vsr(), 0, const_ptr);
3725         __ load_const(const_ptr, (address)&pack_permute_val, tmp_reg);
3726     }
3727     __ lxv(pack_permute, 0, const_ptr);
3728 
3729     // Splat the constants that can use xxspltib
3730     __ xxspltib(vec_0s->to_vsr(), 0);
3731     __ xxspltib(vec_4s->to_vsr(), 4);
3732     __ xxspltib(vec_8s->to_vsr(), 8);
3733     __ xxspltib(vec_0xfs, 0xf);
3734     if (PowerArchitecturePPC64 >= 10) {
3735         __ xxspltib(vec_0x3fs->to_vsr(), 0x3f);
3736     }
3737 
3738     // The rest of the constants use different values depending on the
3739     // setting of isURL
3740     __ cmpdi(CCR0, isURL, 0);
3741     __ beq(CCR0, not_URL);
3742 
3743     // isURL != 0 (true)
3744     __ load_const(const_ptr, (address)&offsetLUT_URL_val, tmp_reg);
3745     __ lxv(offsetLUT, 0, const_ptr);
3746     __ load_const(const_ptr, (address)&maskLUT_URL_val, tmp_reg);
3747     __ lxv(maskLUT, 0, const_ptr);
3748     __ xxspltib(vec_special_case_char->to_vsr(), '_');
3749     __ xxspltib(vec_special_case_offset, (unsigned char)US);
3750     __ b(calculate_size);
3751 
3752     // isURL = 0 (false)
3753     __ bind(not_URL);
3754     __ load_const(const_ptr, (address)&offsetLUT_val, tmp_reg);
3755     __ lxv(offsetLUT, 0, const_ptr);
3756     __ load_const(const_ptr, (address)&maskLUT_val, tmp_reg);
3757     __ lxv(maskLUT, 0, const_ptr);
3758     __ xxspltib(vec_special_case_char->to_vsr(), '/');
3759     __ xxspltib(vec_special_case_offset, (unsigned char)SLS);
3760 
3761     __ bind(calculate_size);
3762 
3763     // Don't handle the last 4 characters of the source, because this
3764     // VSX-based algorithm doesn't handle padding characters.  Also the
3765     // vector code will always write 16 bytes of decoded data on each pass,
3766     // but only the first 12 of those 16 bytes are valid data (16 base64
3767     // characters become 12 bytes of binary data), so for this reason we
3768     // need to subtract an additional 8 bytes from the source length, in
3769     // order not to write past the end of the destination buffer.  The
3770     // result of this subtraction implies that the non-intrinsic routine
3771     // will be used to process the last 12 characters.
3772     __ subi(sl, sl, 12);
3773 
3774     // Round sl down to the nearest multiple of block_size
3775     __ clrrdi(sl, sl, block_size_clear);
3776 
3777     // out starts at the beginning of the destination
3778     __ addi(out, d, 0);
3779 
3780     // in starts at s + sp
3781     __ add(in, s, sp);
3782 
3783     // Address of the last byte of the source is (in + sl - 1)
3784     __ add(end, in, sl);
3785     __ subi(end, end, 1);
3786 
3787     __ bind(unrolled_loop_start);
3788 
3789     __ cmpd(CCR0, end, in);
3790     __ blt_predict_not_taken(CCR0, unrolled_loop_exit);
3791     for (unsigned unroll_cnt=0; unroll_cnt < loop_unrolls; unroll_cnt++) {
3792         // We can use a static displacement in the load since it's always a
3793         // multiple of 16, which is a requirement of lxv/stxv.  This saves
3794         // an addi instruction.
3795         __ lxv(input->to_vsr(), unroll_cnt * 16, in);
3796         //
3797         // Lookup
3798         //
3799         // Isolate the upper 4 bits of each character by shifting it right 4 bits
3800         __ vsrb(higher_nibble, input, vec_4s);
3801         // Isolate the lower 4 bits by masking
3802         __ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
3803 
3804         // Get the offset (the value to subtract from the byte) by using
3805         // a lookup table indexed by the upper 4 bits of the character
3806         __ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
3807 
3808         // Find out which elements are the special case character (isURL ? '_' : '/')
3809         __ vcmpequb_(eq_special_case_char, input, vec_special_case_char);
3810         //
3811         // There's a (63/64)^16 = 77.7% chance that there are no special
3812         // case chars in this 16 bytes of input.  When we detect this case
3813         // (CCR6-EQ, all comparisons are false), we can skip the xxsel
3814         // step.
3815         __ beq_predict_taken(CCR6, skip_xxsel[unroll_cnt]);
3816 
3817         // For each character in the input which is a special case
3818         // character, replace its offset with one that is special for that
3819         // character.
3820         __ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
3821 
3822         // Note that skip_xxsel is indexed because this code is contained
3823         // in a C++ loop (the emitted code in this unroll loop doesn't
3824         // loop).  The indexing allows the creation of unique labels for
3825         // each iteration of the unrolled loop.
3826         __ bind(skip_xxsel[unroll_cnt]);
3827 
3828         // Use the lower_nibble to select a mask "M" from the lookup table.
3829         __ xxperm(M, maskLUT, lower_nibble);
3830 
3831         // "bit" is used to isolate which of the bits in M is relevant.
3832         __ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
3833 
3834         // Each element of non_match corresponds to one of the 16 input
3835         // characters.  Those elements that become 0x00 after the xxland
3836         // instruction are invalid Base64 characters.
3837         __ xxland(non_match->to_vsr(), M, bit);
3838 
3839         // Compare each element to zero
3840         //
3841         // vcmpequb_ sets the EQ bit of CCR6 if no elements compare equal.
3842         // Any element comparing equal to zero means there is an error in
3843         // that element.  Note that the comparison result register
3844         // non_match is not referenced again.  Only CCR6-EQ matters.
3845         __ vcmpequb_(non_match, non_match, vec_0s);
3846         __ bne_predict_not_taken(CCR6, zero_processed_exit);
3847 
3848         // The Base64 characters had no errors, so add the offsets
3849         __ vaddubm(input, input, offsets);
3850 
3851         // Pack
3852         //
3853         // Legend for the tables below: b0, b1, .. b15 are the bytes of
3854         // decoded binary data.  The specifier after the colon depicts
3855         // which bits are there.  The bit numbering is big endian style
3856         // (bit 0 is the most significant).  The || is a concatenate
3857         // operator (same terminology as used in the Power ISA 3.x
3858         // document).  Strings of 0's are a field of zeros with the shown
3859         // length.
3860 
3861         if (PowerArchitecturePPC64 >= 10) {
3862             // Note that only e15..e8 are shown here because the extract
3863             // bit pattern is the same in e7..e0.
3864             //
3865             // +===============+=============+======================+======================+=============+=============+======================+======================+=============+
3866             // |    Vector     |     e15     |         e14          |         e13          |     e12     |     e11     |         e10          |          e9          |     e8      |
3867             // |    Element    |             |                      |                      |             |             |                      |                      |             |
3868             // +===============+=============+======================+======================+=============+=============+======================+======================+=============+
3869             // | after vaddubm | 00||b0:0..5 | 00||b0:6..7||b1:0..3 | 00||b1:4..7||b2:0..1 | 00||b2:2..7 | 00||b3:0..5 | 00||b3:6..7||b4:0..3 | 00||b4:4..7||b5:0..1 | 00||b5:2..7 |
3870             // +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
3871             // |  after xxbrd  | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
3872             // +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
3873             // |   vec_0x3fs   |  00111111   |       00111111       |       00111111       |  00111111   |  00111111   |       00111111       |       00111111       |  00111111   |
3874             // +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
3875             // | after vpextd  |   b5:0..7   |       b4:0..7        |       b3:0..7        |   b2:0..7   |   b1:0..7   |       b0:0..7        |       00000000       |  00000000   |
3876             // +===============+=============+======================+======================+=============+=============+======================+======================+=============+
3877 
3878             __ xxbrd(input->to_vsr(), input->to_vsr());
3879             __ vpextd(gathered, input, vec_0x3fs);
3880 
3881             // Final jostling of bytes into their correct positions.
3882             // +==================+=====+=====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+====+====+====+====+
3883             // |      Vector      | e15 | e14 | e13 | e12 | e11 | e10 | e9 | e8 | e7  | e6  | e5  | e4  | e3 | e2 | e1 | e0 |
3884             // |     Elements     |     |     |     |     |     |     |    |    |     |     |     |     |    |    |    |    |
3885             // +==================+=====+=====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+====+====+====+====+
3886             // |   after vpextd   | b5  | b4  | b3  | b2  | b1  | b0  | 0  | 0  | b11 | b10 | b9  | b8  | b7 | b6 | 0  | 0  |
3887             // +------------------+-----+-----+-----+-----+-----+-----+----+----+-----+-----+-----+-----+----+----+----+----+
3888             // | p10_pack_permute | 10  | 11  | 12  | 13  | 14  | 15  | 2  | 3  |  4  |  5  |  6  |  7  | 0  | 0  | 0  | 0  |
3889             // +------------------+-----+-----+-----+-----+-----+-----+----+----+-----+-----+-----+-----+----+----+----+----+
3890             // |   after xxperm   | b0  | b1  | b2  | b3  | b4  | b5  | b6 | b7 | b8  | b9  | b10 | b11 | 0  | 0  | 0  | 0  |
3891             // +==================+=====+=====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+====+====+====+====+
3892         } else {
3893             // Note that only e15..e12 are shown here because the shifting
3894             // and OR'ing pattern replicates for e11..e8, e7..e4, and
3895             // e3..e0.
3896             //
3897             // +======================+=============+======================+======================+=================+
3898             // |        Vector        |     e15     |         e14          |         e13          |       e12       |
3899             // |       Element        |             |                      |                      |                 |
3900             // +======================+=============+======================+======================+=================+
3901             // |    after vaddubm     | 00||b0:0..5 | 00||b0:6..7||b1:0..3 | 00||b1:4..7||b2:0..1 |   00||b2:2..7   |
3902             // +----------------------+-------------+----------------------+----------------------+-----------------+
3903             // |     pack_lshift      |    << 2     |         << 4         |         << 6         |                 |
3904             // +----------------------+-------------+----------------------+----------------------+-----------------+
3905             // |     l after vslb     | b0:0..5||00 |    b1:0..3||0000     |   b2:0..1||000000    |   00||b2:2..7   |
3906             // +----------------------+-------------+----------------------+----------------------+-----------------+
3907             // |     l after vslo     |  00000000   |     b0:0..5||00      |    b1:0..3||0000     | b2:0..1||000000 |
3908             // +----------------------+-------------+----------------------+----------------------+-----------------+
3909             // |     pack_rshift      |             |         >> 4         |         >> 2         |                 |
3910             // +----------------------+-------------+----------------------+----------------------+-----------------+
3911             // |     r after vsrb     | 00||b0:0..5 |   000000||b0:6..7    |    0000||b1:4..7     |   00||b2:2..7   |
3912             // +----------------------+-------------+----------------------+----------------------+-----------------+
3913             // | gathered after xxlor | 00||b0:0..5 |       b0:0..7        |       b1:0..7        |     b2:0..7     |
3914             // +======================+=============+======================+======================+=================+
3915             //
3916             //
3917             __ vslb(l, input, pack_lshift);
3918             // vslo of vec_8s shifts the vector by one octet toward lower
3919             // element numbers, discarding element 0.  This means it actually
3920             // shifts to the right (not left) according to the order of the
3921             // table above.
3922             __ vslo(l, l, vec_8s);
3923             __ vsrb(r, input, pack_rshift);
3924             __ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
3925 
3926             // Final jostling of bytes into their correct positions.
3927             // +==============+=====+=====+=====+=====+=====+=====+====+====+====+====+=====+=====+======+======+======+======+
3928             // |    Vector    | e15 | e14 | e13 | e12 | e11 | e10 | e9 | e8 | e7 | e6 | e5  | e4  |  e3  |  e2  |  e1  |  e0  |
3929             // |   Elements   |     |     |     |     |     |     |    |    |    |    |     |     |      |      |      |      |
3930             // +==============+=====+=====+=====+=====+=====+=====+====+====+====+====+=====+=====+======+======+======+======+
3931             // | after xxlor  | xx  | b0  | b1  | b2  | xx  | b3  | b4 | b5 | xx | b6 | b7  | b8  |  xx  |  b9  | b10  | b11  |
3932             // +--------------+-----+-----+-----+-----+-----+-----+----+----+----+----+-----+-----+------+------+------+------+
3933             // | pack_permute | 14  | 13  | 12  | 10  |  9  |  8  | 6  | 5  | 4  | 2  |  1  |  0  |  0   |  0   |  0   |  0   |
3934             // +--------------+-----+-----+-----+-----+-----+-----+----+----+----+----+-----+-----+------+------+------+------+
3935             // | after xxperm | b0  | b1  | b2  | b3  | b4  | b5  | b6 | b7 | b8 | b9 | b10 | b11 | b11* | b11* | b11* | b11* |
3936             // +==============+=====+=====+=====+=====+=====+=====+====+====+====+====+=====+=====+======+======+======+======+
3937             // xx bytes are not used to form the final data
3938             // b0..b15 are the decoded and reassembled 8-bit bytes of data
3939             // b11 with asterisk is a "don't care", because these bytes will be
3940             // overwritten on the next iteration.
3941         }
3942         __ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
3943 
3944         // We cannot use a static displacement on the store, since it's a
3945         // multiple of 12, not 16.  Note that this stxv instruction actually
3946         // writes 16 bytes, even though only the first 12 are valid data.
3947         __ stxv(gathered->to_vsr(), 0, out);
3948         __ addi(out, out, 12);
3949     }
3950     __ addi(in, in, 16 * loop_unrolls);
3951     __ b(unrolled_loop_start);
3952 
3953     __ bind(unrolled_loop_exit);
3954 
3955     // Return the number of out bytes produced, which is (out - d)
3956     __ sub(R3_RET, out, d);
3957     __ blr();
3958 
3959     // Return 0 characters processed.  This can be due to an illegal Base64 character
3960     // that was discovered.
3961     __ bind(zero_processed_exit);
3962     __ li(R3_RET, 0);
3963     __ blr();
3964     return start;
3965   }
3966 
3967 #undef UC
3968 #undef LC
3969 #undef DIG
3970 #undef PLS
3971 #undef HYP
3972 #undef SLS
3973 #undef US
3974 
3975   // Initialization
3976   void generate_initial() {
3977     // Generates all stubs and initializes the entry points
3978 
3979     // Entry points that exist in all platforms.
3980     // Note: This is code that could be shared among different platforms - however the
3981     // benefit seems to be smaller than the disadvantage of having a
3982     // much more complicated generator structure. See also comment in
3983     // stubRoutines.hpp.
3984 
3985     StubRoutines::_forward_exception_entry          = generate_forward_exception();
3986     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
3987     StubRoutines::_catch_exception_entry            = generate_catch_exception();
3988 
3989     // Build this early so it's available for the interpreter.
3990     StubRoutines::_throw_StackOverflowError_entry   =
3991       generate_throw_exception("StackOverflowError throw_exception",
3992                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
3993     StubRoutines::_throw_delayed_StackOverflowError_entry =
3994       generate_throw_exception("delayed StackOverflowError throw_exception",


4053 
4054     // data cache line writeback
4055     if (VM_Version::supports_data_cache_line_flush()) {
4056       StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
4057       StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
4058     }
4059 
4060     if (UseAESIntrinsics) {
4061       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4062       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4063     }
4064 
4065     if (UseSHA256Intrinsics) {
4066       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4067       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4068     }
4069     if (UseSHA512Intrinsics) {
4070       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
4071       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
4072     }
4073 
4074 #ifdef VM_LITTLE_ENDIAN
4075     // Currently supported on PPC64LE only
4076     if (UseBASE64Intrinsics) {
4077       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
4078     }
4079 #endif
4080   }
4081 
4082  public:
4083   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4084     // replace the standard masm with a special one:
4085     _masm = new MacroAssembler(code);
4086     if (all) {
4087       generate_all();
4088     } else {
4089       generate_initial();
4090     }
4091   }
4092 };
4093 
4094 #define UCM_TABLE_MAX_ENTRIES 8
4095 void StubGenerator_generate(CodeBuffer* code, bool all) {
4096   if (UnsafeCopyMemory::_table == NULL) {
4097     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
4098   }
4099   StubGenerator g(code, all);
< prev index next >