1 /*
2 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2016 SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
5867 * @param [in,out]crc Register containing the crc.
5868 * @param [in]val Register containing the byte to fold into the CRC.
5869 * @param [in]table Register containing the table of crc constants.
5870 *
5871 * uint32_t crc;
5872 * val = crc_table[(val ^ crc) & 0xFF];
5873 * crc = val ^ (crc >> 8);
5874 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  z_xr(val, crc);                        // val = val ^ crc. Low byte of val is now the table index.
  fold_byte_crc32(crc, val, table, val); // Table lookup + shift fold per the header comment; val doubles as scratch and is clobbered.
}
5879
5880
5881 /**
5882 * @param crc register containing existing CRC (32-bit)
5883 * @param buf register pointing to input byte buffer (byte*)
5884 * @param len register containing number of bytes
5885 * @param table register pointing to CRC table
5886 */
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
                                           Register data, bool invertCRC) {
  // Folds len bytes starting at buf into crc, one byte per iteration.
  // data:      scratch register holding the current input byte (clobbered).
  // invertCRC: if true, crc is 1s-complemented before and after the loop
  //            (standard CRC-32 pre-/post-conditioning).
  // buf is advanced past the consumed bytes; len is counted down to 0.
  assert_different_registers(crc, buf, len, table, data);

  Label L_mainLoop, L_done;
  const int mainLoop_stepping = 1;       // One byte per loop iteration.

  // Process all bytes in a single-byte loop.
  z_ltr(len, len);                       // Sets cc; skip everything for len <= 0.
  z_brnh(L_done);

  if (invertCRC) {
    not_(crc, noreg, false);             // ~c
  }

  bind(L_mainLoop);
  z_llgc(data, Address(buf, (intptr_t)0)); // Current byte of input buffer (zero extended). Avoids garbage in upper half of register.
  add2reg(buf, mainLoop_stepping);         // Advance buffer position.
  update_byte_crc32(crc, data, table);
  z_brct(len, L_mainLoop);                 // Decrement len; iterate while != 0.

  if (invertCRC) {
    not_(crc, noreg, false);             // ~c
  }

  bind(L_done);
}
5914
5915 /**
5916 * Emits code to update CRC-32 with a 4-byte value according to constants in table.
5917 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c.
5918 *
5919 */
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0, Register t1, Register t2, Register t3) {
  // This is what we implement (the DOBIG4 part):
  //
  // #define DOBIG4 c ^= *++buf4; \
  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4

  // Byte offsets of table columns 4..7 within the combined crc table.
  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);

  // NOTE(review): the code that loads the word at buf+bufDisp, advances buf by
  // bufInc, splits the word into per-byte indices in t0..t3, and performs the
  // ix0 lookup into t3 is not visible in this chunk — confirm against the full
  // source. Below, t0..t2 are assumed to already hold scaled byte indices.
  z_ly(t2, Address(table, t2, (intptr_t)ix1));
  z_ly(t1, Address(table, t1, (intptr_t)ix2));
  z_ly(t0, Address(table, t0, (intptr_t)ix3));

  // Calculate new crc from table values.
  z_xr(t2, t3);
  z_xr(t0, t1);
  z_xr(t0, t2); // Now crc contains the final checksum value.
  lgr_if_needed(crc, t0); // Move result into crc unless t0 already is crc.
}
5959
5960 /**
5961 * @param crc register containing existing CRC (32-bit)
5962 * @param buf register pointing to input byte buffer (byte*)
5963 * @param len register containing number of bytes
5964 * @param table register pointing to CRC table
5965 *
 * uses Z_R10..Z_R13 as work registers. They must be saved/restored by the caller!
5967 */
5968 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
5969 Register t0, Register t1, Register t2, Register t3) {
5970 assert_different_registers(crc, buf, len, table);
5971
5972 Label L_mainLoop, L_tail;
5973 Register data = t0;
5974 Register ctr = Z_R0;
5975 const int mainLoop_stepping = 8;
5976 const int tailLoop_stepping = 1;
5977 const int log_stepping = exact_log2(mainLoop_stepping);
5978
5979 // Don't test for len <= 0 here. This pathological case should not occur anyway.
5980 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
5981 // The situation itself is detected and handled correctly by the conditional branches
5982 // following aghi(len, -stepping) and aghi(len, +stepping).
5983
5984 not_(crc, noreg, false); // 1s complement of crc
5985
5986 #if 0
5987 {
5988 // Pre-mainLoop alignment did not show any positive effect on performance.
5989 // We leave the code in for reference. Maybe the vector instructions in z13 depend on alignment.
5990
5991 z_cghi(len, mainLoop_stepping); // Alignment is useless for short data streams.
5992 z_brnh(L_tail);
5993
5994 // Align buf to word (4-byte) boundary.
5995 z_lcr(ctr, buf);
5996 rotate_then_insert(ctr, ctr, 62, 63, 0, true); // TODO: should set cc
5997 z_sgfr(len, ctr); // Remaining len after alignment.
5998
5999 update_byteLoop_crc32(crc, buf, ctr, table, data, false);
6000 }
6001 #endif
6002
6003 // Check for short (<mainLoop_stepping bytes) buffer.
6004 z_srag(ctr, len, log_stepping);
6005 z_brnh(L_tail);
6006
6007 z_lrvr(crc, crc); // Revert byte order because we are dealing with big-endian data.
6008 rotate_then_insert(len, len, 64-log_stepping, 63, 0, true); // #bytes for tailLoop
6009
6010 BIND(L_mainLoop);
6011 update_1word_crc32(crc, buf, table, 0, 0, crc, t1, t2, t3);
6012 update_1word_crc32(crc, buf, table, 4, mainLoop_stepping, crc, t1, t2, t3);
6013 z_brct(ctr, L_mainLoop); // Iterate.
6014
6015 z_lrvr(crc, crc); // Revert byte order back to original.
6016
6017 // Process last few (<8) bytes of buffer.
6018 BIND(L_tail);
6019 update_byteLoop_crc32(crc, buf, len, table, data, false);
6020
6021 not_(crc, noreg, false); // 1s complement of crc
6022 }
6023
6024 /**
6025 * @param crc register containing existing CRC (32-bit)
6026 * @param buf register pointing to input byte buffer (byte*)
6027 * @param len register containing number of bytes
6028 * @param table register pointing to CRC table
6029 *
 * uses Z_R10..Z_R13 as work registers. They must be saved/restored by the caller!
6031 */
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
                                        Register t0, Register t1, Register t2, Register t3) {
  // Processes the buffer one 4-byte word per main-loop iteration, then hands
  // the remaining < 4 bytes to the single-byte loop.
  // t0..t3 are scratch (t0 doubles as the tail loop's data register).
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register data = t0;                    // Current byte for the tail loop.
  Register ctr = Z_R0;                   // Main loop iteration count.
  const int mainLoop_stepping = 4;       // Bytes consumed per main-loop iteration.
  const int log_stepping = exact_log2(mainLoop_stepping);

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following aghi(len, -stepping) and aghi(len, +stepping).

  not_(crc, noreg, false);               // 1s complement of crc (CRC-32 pre-conditioning).

  // Check for short (<4 bytes) buffer.
  z_srag(ctr, len, log_stepping);        // ctr = number of 4-byte main-loop iterations.
  z_brnh(L_tail);

  z_lrvr(crc, crc);                      // Revert byte order because we are dealing with big-endian data.
  rotate_then_insert(len, len, 64-log_stepping, 63, 0, true); // len &= 3: #bytes for tailLoop.

  BIND(L_mainLoop);
    update_1word_crc32(crc, buf, table, 0, mainLoop_stepping, crc, t1, t2, t3);
    z_brct(ctr, L_mainLoop);             // Iterate.
  z_lrvr(crc, crc);                      // Revert byte order back to original.

  // Process last few (<4) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false); // Conditioning already done above.

  not_(crc, noreg, false);               // 1s complement of crc (post-conditioning).
}
6067
6068 /**
6069 * @param crc register containing existing CRC (32-bit)
6070 * @param buf register pointing to input byte buffer (byte*)
6071 * @param len register containing number of bytes
6072 * @param table register pointing to CRC table
6073 */
6074 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
6075 Register t0, Register t1, Register t2, Register t3) {
6076 assert_different_registers(crc, buf, len, table);
6077 Register data = t0;
6078
6079 update_byteLoop_crc32(crc, buf, len, table, data, true);
6080 }
6081
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
  // Folds exactly one byte (at *buf) into crc, with 1s-complement pre-/post-conditioning.
  // Note: len is not read here; it only participates in the register-conflict assert.
  assert_different_registers(crc, buf, len, table, tmp);

  not_(crc, noreg, false);                // ~c (pre-conditioning).

  z_llgc(tmp, Address(buf, (intptr_t)0)); // Current byte of input buffer (zero extended). Avoids garbage in upper half of register.
  update_byte_crc32(crc, tmp, table);

  not_(crc, noreg, false);                // ~c (post-conditioning).
}
6092
6093 //
6094 // Code for BigInteger::multiplyToLen() intrinsic.
6095 //
6096
6097 // dest_lo += src1 + src2
6098 // dest_hi += carry1 + carry2
6099 // Z_R7 is destroyed !
void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  // 128-bit accumulate: (dest_hi:dest_lo) += src1, then += src2,
  // propagating the carry of each low-half add into dest_hi.
  clear_reg(Z_R7);          // Z_R7 = 0; zero operand so alcgr adds only the carry. Z_R7 is destroyed (see header comment).
  z_algr(dest_lo, src1);    // dest_lo += src1 (unsigned add, sets carry).
  z_alcgr(dest_hi, Z_R7);   // dest_hi += 0 + carry.
  z_algr(dest_lo, src2);    // dest_lo += src2 (unsigned add, sets carry).
  z_alcgr(dest_hi, Z_R7);   // dest_hi += 0 + carry.
}
6108
6109 // Multiply 64 bit by 64 bit first loop.
6110 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
|
1 /*
2 * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2016, 2017, SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
5867 * @param [in,out]crc Register containing the crc.
5868 * @param [in]val Register containing the byte to fold into the CRC.
5869 * @param [in]table Register containing the table of crc constants.
5870 *
5871 * uint32_t crc;
5872 * val = crc_table[(val ^ crc) & 0xFF];
5873 * crc = val ^ (crc >> 8);
5874 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  z_xr(val, crc);                        // val = val ^ crc. Low byte of val is now the table index.
  fold_byte_crc32(crc, val, table, val); // Table lookup + shift fold per the header comment; val doubles as scratch and is clobbered.
}
5879
5880
5881 /**
5882 * @param crc register containing existing CRC (32-bit)
5883 * @param buf register pointing to input byte buffer (byte*)
5884 * @param len register containing number of bytes
5885 * @param table register pointing to CRC table
5886 */
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, Register data) {
  // Folds len bytes starting at buf into crc, one byte per iteration.
  // data is a scratch register holding the current input byte (clobbered).
  // In this revision the 1s-complement pre-/post-conditioning is the caller's
  // responsibility (see the invertCRC handling in the kernel_crc32_* callers).
  // buf is advanced past the consumed bytes; len is counted down to 0.
  assert_different_registers(crc, buf, len, table, data);

  Label L_mainLoop, L_done;
  const int mainLoop_stepping = 1;       // One byte per loop iteration.

  // Process all bytes in a single-byte loop.
  z_ltr(len, len);                       // Sets cc; skip everything for len <= 0.
  z_brnh(L_done);

  bind(L_mainLoop);
  z_llgc(data, Address(buf, (intptr_t)0)); // Current byte of input buffer (zero extended). Avoids garbage in upper half of register.
  add2reg(buf, mainLoop_stepping);         // Advance buffer position.
  update_byte_crc32(crc, data, table);
  z_brct(len, L_mainLoop);                 // Decrement len; iterate while != 0.

  bind(L_done);
}
5905
5906 /**
5907 * Emits code to update CRC-32 with a 4-byte value according to constants in table.
5908 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c.
5909 *
5910 */
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0, Register t1, Register t2, Register t3) {
  // This is what we implement (the DOBIG4 part):
  //
  // #define DOBIG4 c ^= *++buf4; \
  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4

  // Byte offsets of table columns 4..7 within the combined crc table.
  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);

  // NOTE(review): the code that loads the word at buf+bufDisp, advances buf by
  // bufInc, splits the word into per-byte indices in t0..t3, and performs the
  // ix0 lookup into t3 is not visible in this chunk — confirm against the full
  // source. Below, t0..t2 are assumed to already hold scaled byte indices.
  z_ly(t2, Address(table, t2, (intptr_t)ix1));
  z_ly(t1, Address(table, t1, (intptr_t)ix2));
  z_ly(t0, Address(table, t0, (intptr_t)ix3));

  // Calculate new crc from table values.
  z_xr(t2, t3);
  z_xr(t0, t1);
  z_xr(t0, t2); // Now crc contains the final checksum value.
  lgr_if_needed(crc, t0); // Move result into crc unless t0 already is crc.
}
5950
5951 /**
5952 * @param crc register containing existing CRC (32-bit)
5953 * @param buf register pointing to input byte buffer (byte*)
5954 * @param len register containing number of bytes
5955 * @param table register pointing to CRC table
5956 *
 * uses Z_R10..Z_R13 as work registers. They must be saved/restored by the caller!
5958 */
5959 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
5960 Register t0, Register t1, Register t2, Register t3,
5961 bool invertCRC) {
5962 assert_different_registers(crc, buf, len, table);
5963
5964 Label L_mainLoop, L_tail;
5965 Register data = t0;
5966 Register ctr = Z_R0;
5967 const int mainLoop_stepping = 8;
5968 const int tailLoop_stepping = 1;
5969 const int log_stepping = exact_log2(mainLoop_stepping);
5970
5971 // Don't test for len <= 0 here. This pathological case should not occur anyway.
5972 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
5973 // The situation itself is detected and handled correctly by the conditional branches
5974 // following aghi(len, -stepping) and aghi(len, +stepping).
5975
5976 if (invertCRC) {
5977 not_(crc, noreg, false); // 1s complement of crc
5978 }
5979
5980 #if 0
5981 {
5982 // Pre-mainLoop alignment did not show any positive effect on performance.
5983 // We leave the code in for reference. Maybe the vector instructions in z13 depend on alignment.
5984
5985 z_cghi(len, mainLoop_stepping); // Alignment is useless for short data streams.
5986 z_brnh(L_tail);
5987
5988 // Align buf to word (4-byte) boundary.
5989 z_lcr(ctr, buf);
5990 rotate_then_insert(ctr, ctr, 62, 63, 0, true); // TODO: should set cc
5991 z_sgfr(len, ctr); // Remaining len after alignment.
5992
5993 update_byteLoop_crc32(crc, buf, ctr, table, data);
5994 }
5995 #endif
5996
5997 // Check for short (<mainLoop_stepping bytes) buffer.
5998 z_srag(ctr, len, log_stepping);
5999 z_brnh(L_tail);
6000
6001 z_lrvr(crc, crc); // Revert byte order because we are dealing with big-endian data.
6002 rotate_then_insert(len, len, 64-log_stepping, 63, 0, true); // #bytes for tailLoop
6003
6004 BIND(L_mainLoop);
6005 update_1word_crc32(crc, buf, table, 0, 0, crc, t1, t2, t3);
6006 update_1word_crc32(crc, buf, table, 4, mainLoop_stepping, crc, t1, t2, t3);
6007 z_brct(ctr, L_mainLoop); // Iterate.
6008
6009 z_lrvr(crc, crc); // Revert byte order back to original.
6010
6011 // Process last few (<8) bytes of buffer.
6012 BIND(L_tail);
6013 update_byteLoop_crc32(crc, buf, len, table, data);
6014
6015 if (invertCRC) {
6016 not_(crc, noreg, false); // 1s complement of crc
6017 }
6018 }
6019
6020 /**
6021 * @param crc register containing existing CRC (32-bit)
6022 * @param buf register pointing to input byte buffer (byte*)
6023 * @param len register containing number of bytes
6024 * @param table register pointing to CRC table
6025 *
 * uses Z_R10..Z_R13 as work registers. They must be saved/restored by the caller!
6027 */
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
                                        Register t0, Register t1, Register t2, Register t3,
                                        bool invertCRC) {
  // Processes the buffer one 4-byte word per main-loop iteration, then hands
  // the remaining < 4 bytes to the single-byte loop.
  // t0..t3 are scratch (t0 doubles as the tail loop's data register).
  // invertCRC: if true, crc is 1s-complemented before and after processing.
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register data = t0;                    // Current byte for the tail loop.
  Register ctr = Z_R0;                   // Main loop iteration count.
  const int mainLoop_stepping = 4;       // Bytes consumed per main-loop iteration.
  const int log_stepping = exact_log2(mainLoop_stepping);

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following aghi(len, -stepping) and aghi(len, +stepping).

  if (invertCRC) {
    not_(crc, noreg, false);             // 1s complement of crc (CRC-32 pre-conditioning).
  }

  // Check for short (<4 bytes) buffer.
  z_srag(ctr, len, log_stepping);        // ctr = number of 4-byte main-loop iterations.
  z_brnh(L_tail);

  z_lrvr(crc, crc);                      // Revert byte order because we are dealing with big-endian data.
  rotate_then_insert(len, len, 64-log_stepping, 63, 0, true); // len &= 3: #bytes for tailLoop.

  BIND(L_mainLoop);
    update_1word_crc32(crc, buf, table, 0, mainLoop_stepping, crc, t1, t2, t3);
    z_brct(ctr, L_mainLoop);             // Iterate.
  z_lrvr(crc, crc);                      // Revert byte order back to original.

  // Process last few (<4) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data);

  if (invertCRC) {
    not_(crc, noreg, false);             // 1s complement of crc (post-conditioning).
  }
}
6068
6069 /**
6070 * @param crc register containing existing CRC (32-bit)
6071 * @param buf register pointing to input byte buffer (byte*)
6072 * @param len register containing number of bytes
6073 * @param table register pointing to CRC table
6074 */
6075 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
6076 Register t0, Register t1, Register t2, Register t3,
6077 bool invertCRC) {
6078 assert_different_registers(crc, buf, len, table);
6079 Register data = t0;
6080
6081 if (invertCRC) {
6082 not_(crc, noreg, false); // 1s complement of crc
6083 }
6084
6085 update_byteLoop_crc32(crc, buf, len, table, data);
6086
6087 if (invertCRC) {
6088 not_(crc, noreg, false); // 1s complement of crc
6089 }
6090 }
6091
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp,
                                             bool invertCRC) {
  // Folds exactly one byte (at *buf) into crc.
  // invertCRC: if true, crc is 1s-complemented before and after the update.
  // Note: len is not read here; it only participates in the register-conflict assert.
  assert_different_registers(crc, buf, len, table, tmp);

  if (invertCRC) {
    not_(crc, noreg, false);             // 1s complement of crc (pre-conditioning).
  }

  z_llgc(tmp, Address(buf, (intptr_t)0)); // Current byte of input buffer (zero extended). Avoids garbage in upper half of register.
  update_byte_crc32(crc, tmp, table);

  if (invertCRC) {
    not_(crc, noreg, false);             // 1s complement of crc (post-conditioning).
  }
}
6107
void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table,
                                                bool invertCRC) {
  // Folds the byte already held in register val into crc (no memory access).
  // val is clobbered by update_byte_crc32.
  // invertCRC: if true, crc is 1s-complemented before and after the update.
  assert_different_registers(crc, val, table);

  if (invertCRC) {
    not_(crc, noreg, false);             // 1s complement of crc (pre-conditioning).
  }

  update_byte_crc32(crc, val, table);

  if (invertCRC) {
    not_(crc, noreg, false);             // 1s complement of crc (post-conditioning).
  }
}
6122
6123 //
6124 // Code for BigInteger::multiplyToLen() intrinsic.
6125 //
6126
6127 // dest_lo += src1 + src2
6128 // dest_hi += carry1 + carry2
6129 // Z_R7 is destroyed !
void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  // 128-bit accumulate: (dest_hi:dest_lo) += src1, then += src2,
  // propagating the carry of each low-half add into dest_hi.
  clear_reg(Z_R7);          // Z_R7 = 0; zero operand so alcgr adds only the carry. Z_R7 is destroyed (see header comment).
  z_algr(dest_lo, src1);    // dest_lo += src1 (unsigned add, sets carry).
  z_alcgr(dest_hi, Z_R7);   // dest_hi += 0 + carry.
  z_algr(dest_lo, src2);    // dest_lo += src2 (unsigned add, sets carry).
  z_alcgr(dest_hi, Z_R7);   // dest_hi += 0 + carry.
}
6138
6139 // Multiply 64 bit by 64 bit first loop.
6140 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
|