< prev index next >

src/cpu/s390/vm/macroAssembler_s390.cpp

Print this page


   1 /*
   2  * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2016 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *


5867  * @param [in,out]crc Register containing the crc.
5868  * @param [in]val     Register containing the byte to fold into the CRC.
5869  * @param [in]table   Register containing the table of crc constants.
5870  *
5871  * uint32_t crc;
5872  * val = crc_table[(val ^ crc) & 0xFF];
5873  * crc = val ^ (crc >> 8);
5874  */
5875 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
5876   z_xr(val, crc);                         // val ^= crc. The contents of val are overwritten.
5877   fold_byte_crc32(crc, val, table, val);  // Table lookup and fold into crc. val is also passed as 4th argument (presumably a scratch register - confirm against fold_byte_crc32).
5878 }
5879 
5880 
5881 /**
      * Emits a byte-wise loop which folds len bytes, starting at buf, into crc.
      * buf is advanced and len is counted down by the loop, i.e. both registers are modified.
      *
5882  * @param crc   register containing existing CRC (32-bit)
5883  * @param buf   register pointing to input byte buffer (byte*)
5884  * @param len   register containing number of bytes
5885  * @param table register pointing to CRC table
      * @param data  work register, receives the current input byte (zero extended)
      * @param invertCRC if true, code to 1s-complement crc is emitted before and after the loop
5886  */
5887 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
5888                                            Register data, bool invertCRC) {
5889   assert_different_registers(crc, buf, len, table, data);
5890 
5891   Label L_mainLoop, L_done;
5892   const int mainLoop_stepping = 1;
5893 
5894   // Process all bytes in a single-byte loop.
5895   z_ltr(len, len);                        // Test len.
5896   z_brnh(L_done);                         // len is not positive: skip the loop and both inversions, crc remains unchanged.
5897 
5898   if (invertCRC) {
5899     not_(crc, noreg, false); // ~c
5900   }
5901 
5902   bind(L_mainLoop);
5903     z_llgc(data, Address(buf, (intptr_t)0));// Current byte of input buffer (zero extended). Avoids garbage in upper half of register.
5904     add2reg(buf, mainLoop_stepping);        // Advance buffer position.
5905     update_byte_crc32(crc, data, table);    // Fold the byte into crc. Clobbers data.
5906     z_brct(len, L_mainLoop);                // Iterate. len serves as loop counter.
5907 
5908   if (invertCRC) {
5909     not_(crc, noreg, false); // ~c
5910   }
5911 
5912   bind(L_done);
5913 }
5914 
5915 /**
5916  * Emits code to update CRC-32 with a 4-byte value according to constants in table.
5917  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c.
5918  *
      * The new value is accumulated in t0 and finally transferred to crc via lgr_if_needed.
      * NOTE(review): bufDisp presumably is the displacement of the word relative to buf,
      *               bufInc the amount buf is advanced by - confirm.
5919  */
5920 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
5921                                         Register t0,  Register t1,  Register t2,    Register t3) {
5922   // This is what we implement (the DOBIG4 part):
5923   //
5924   // #define DOBIG4 c ^= *++buf4; \
5925   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
5926   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
5927   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
       // ix0..ix3: presumably the byte offsets of crc_table[4]..crc_table[7] within table (4 bytes per entry) - confirm.
5928   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
5929   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
5930   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
5931   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);


5949   z_ly(t2, Address(table, t2, (intptr_t)ix1));
5950   z_ly(t1, Address(table, t1, (intptr_t)ix2));
5951   z_ly(t0, Address(table, t0, (intptr_t)ix3));
5952 
5953   // Calculate new crc from table values.
5954   z_xr(t2, t3);
5955   z_xr(t0, t1);
5956   z_xr(t0, t2);  // Now t0 contains the combination of all four table values.
5957   lgr_if_needed(crc, t0);  // Transfer the result to crc.
5958 }
5959 
5960 /**
      * Emits code to compute the CRC-32 of a byte buffer, processing 8 bytes (two words)
      * per main loop iteration and the remaining bytes in a byte loop.
      * crc is 1s-complemented before and after the calculation.
      *
5961  * @param crc   register containing existing CRC (32-bit)
5962  * @param buf   register pointing to input byte buffer (byte*)
5963  * @param len   register containing number of bytes
5964  * @param table register pointing to CRC table
      * @param t0    work register, serves as data register for the byte loop
      * @param t1,t2,t3 work registers, handed to update_1word_crc32
5965  *
5966  * uses Z_R10..Z_R13 as work register. Must be saved/restored by caller!
      * NOTE(review): the work registers used here are those passed in as t0..t3, plus Z_R0
      *               as loop counter - confirm that callers pass Z_R10..Z_R13.
5967  */
5968 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
5969                                         Register t0,  Register t1,  Register t2,  Register t3) {

5970   assert_different_registers(crc, buf, len, table);
5971 
5972   Label L_mainLoop, L_tail;
5973   Register  data = t0;
5974   Register  ctr  = Z_R0;
5975   const int mainLoop_stepping = 8;
5976   const int tailLoop_stepping = 1;      // Not referenced in this function.
5977   const int log_stepping      = exact_log2(mainLoop_stepping);
5978 
5979   // Don't test for len <= 0 here. This pathological case should not occur anyway.
5980   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
5981   // The situation itself is detected and handled correctly by the conditional branch
5982   // following z_srag(ctr, len, log_stepping) and by the len test in update_byteLoop_crc32.
5983 

5984   not_(crc, noreg, false);             // 1s complement of crc

5985 
5986 #if 0
5987   {
5988     // Pre-mainLoop alignment did not show any positive effect on performance.
5989     // We leave the code in for reference. Maybe the vector instructions in z13 depend on alignment.
5990 
5991     z_cghi(len, mainLoop_stepping);    // Alignment is useless for short data streams.
5992     z_brnh(L_tail);
5993 
5994     // Align buf to word (4-byte) boundary.
5995     z_lcr(ctr, buf);
5996     rotate_then_insert(ctr, ctr, 62, 63, 0, true); // TODO: should set cc
5997     z_sgfr(len, ctr);                  // Remaining len after alignment.
5998 
5999     update_byteLoop_crc32(crc, buf, ctr, table, data, false);
6000   }
6001 #endif
6002 
6003   // Check for short (<mainLoop_stepping bytes) buffer.
6004   z_srag(ctr, len, log_stepping);      // ctr = len >> log_stepping = #iterations of mainLoop.
6005   z_brnh(L_tail);                      // No complete 8-byte chunk: byte loop only.
6006 
6007   z_lrvr(crc, crc);             // Revert byte order because we are dealing with big-endian data.
6008   rotate_then_insert(len, len, 64-log_stepping, 63, 0, true); // #bytes for tailLoop
6009 
6010   BIND(L_mainLoop);
6011     update_1word_crc32(crc, buf, table, 0, 0, crc, t1, t2, t3);                 // bufDisp = 0, bufInc = 0. crc is passed as t0.
6012     update_1word_crc32(crc, buf, table, 4, mainLoop_stepping, crc, t1, t2, t3); // bufDisp = 4, bufInc = 8. crc is passed as t0.
6013     z_brct(ctr, L_mainLoop);    // Iterate.
6014 
6015   z_lrvr(crc, crc);        // Revert byte order back to original.
6016 
6017   // Process last few (<8) bytes of buffer.
6018   BIND(L_tail);
6019   update_byteLoop_crc32(crc, buf, len, table, data, false); // No inversion in the byte loop, it is done by this function.
6020 

6021   not_(crc, noreg, false); // 1s complement of crc

6022 }
6023 
6024 /**
      * Emits code to compute the CRC-32 of a byte buffer, processing 4 bytes (one word)
      * per main loop iteration and the remaining bytes in a byte loop.
      * crc is 1s-complemented before and after the calculation.
      *
6025  * @param crc   register containing existing CRC (32-bit)
6026  * @param buf   register pointing to input byte buffer (byte*)
6027  * @param len   register containing number of bytes
6028  * @param table register pointing to CRC table
      * @param t0    work register, serves as data register for the byte loop
      * @param t1,t2,t3 work registers, handed to update_1word_crc32
6029  *
6030  * uses Z_R10..Z_R13 as work register. Must be saved/restored by caller!
      * NOTE(review): the work registers used here are those passed in as t0..t3, plus Z_R0
      *               as loop counter - confirm that callers pass Z_R10..Z_R13.
6031  */
6032 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
6033                                         Register t0,  Register t1,  Register t2,  Register t3) {

6034   assert_different_registers(crc, buf, len, table);
6035 
6036   Label L_mainLoop, L_tail;
6037   Register  data = t0;
6038   Register  ctr  = Z_R0;
6039   const int mainLoop_stepping = 4;
6040   const int log_stepping      = exact_log2(mainLoop_stepping);
6041 
6042   // Don't test for len <= 0 here. This pathological case should not occur anyway.
6043   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
6044   // The situation itself is detected and handled correctly by the conditional branch
6045   // following z_srag(ctr, len, log_stepping) and by the len test in update_byteLoop_crc32.
6046 

6047   not_(crc, noreg, false); // 1s complement of crc

6048 
6049   // Check for short (<4 bytes) buffer.
6050   z_srag(ctr, len, log_stepping);      // ctr = len >> log_stepping = #iterations of mainLoop.
6051   z_brnh(L_tail);                      // No complete 4-byte chunk: byte loop only.
6052 
6053   z_lrvr(crc, crc);          // Revert byte order because we are dealing with big-endian data.
6054   rotate_then_insert(len, len, 64-log_stepping, 63, 0, true); // #bytes for tailLoop
6055 
6056   BIND(L_mainLoop);
6057     update_1word_crc32(crc, buf, table, 0, mainLoop_stepping, crc, t1, t2, t3); // bufDisp = 0, bufInc = 4. crc is passed as t0.
6058     z_brct(ctr, L_mainLoop); // Iterate.
6059   z_lrvr(crc, crc);          // Revert byte order back to original.
6060 
6061   // Process last few (<4) bytes of buffer.
6062   BIND(L_tail);
6063   update_byteLoop_crc32(crc, buf, len, table, data, false); // No inversion in the byte loop, it is done by this function.
6064 

6065   not_(crc, noreg, false); // 1s complement of crc

6066 }
6067 
6068 /**
      * Emits code to compute the CRC-32 of a byte buffer, one byte at a time.
      * All work, including the 1s complement of crc before and after the loop,
      * is delegated to update_byteLoop_crc32.
      *
6069  * @param crc   register containing existing CRC (32-bit)
6070  * @param buf   register pointing to input byte buffer (byte*)
6071  * @param len   register containing number of bytes
6072  * @param table register pointing to CRC table
      * @param t0    work register, serves as data register for the byte loop
      * @param t1,t2,t3 not used by this function
6073  */
6074 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
6075                                         Register t0,  Register t1,  Register t2,  Register t3) {

6076   assert_different_registers(crc, buf, len, table);
6077   Register data = t0;
6078 
6079   update_byteLoop_crc32(crc, buf, len, table, data, true); // invertCRC = true: crc is inverted inside the helper.








6080 }
6081 
6082 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
       // Emits code to fold exactly one byte, loaded from the address in buf, into crc.
       // crc is 1s-complemented before and after the update.
       // len is only checked for register conflicts, its contents are not used. tmp receives the loaded byte.
6083   assert_different_registers(crc, buf, len, table, tmp);
6084 
6085   not_(crc, noreg, false); // ~c


6086 
6087   z_llgc(tmp, Address(buf, (intptr_t)0));  // Current byte of input buffer (zero extended). Avoids garbage in upper half of register.
6088   update_byte_crc32(crc, tmp, table);      // Clobbers tmp.
6089 
6090   not_(crc, noreg, false); // ~c

















6091 }
6092 
6093 //
6094 // Code for BigInteger::multiplyToLen() intrinsic.
6095 //
6096 
6097 // dest_lo += src1 + src2
6098 // dest_hi += carry1 + carry2
6099 // Z_R7 is destroyed ! It is cleared before any operand is read, so none of the operands may be Z_R7.
6100 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo,
6101                                      Register src1, Register src2) {
6102   clear_reg(Z_R7);          // Zero addend for the add-with-carry instructions below.
6103   z_algr(dest_lo, src1);    // dest_lo += src1
6104   z_alcgr(dest_hi, Z_R7);   // dest_hi += carry1
6105   z_algr(dest_lo, src2);    // dest_lo += src2
6106   z_alcgr(dest_hi, Z_R7);   // dest_hi += carry2
6107 }
6108 
6109 // Multiply 64 bit by 64 bit first loop.
6110 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,


   1 /*
   2  * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2016, 2017, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *


5867  * @param [in,out]crc Register containing the crc.
5868  * @param [in]val     Register containing the byte to fold into the CRC.
5869  * @param [in]table   Register containing the table of crc constants.
5870  *
5871  * uint32_t crc;
5872  * val = crc_table[(val ^ crc) & 0xFF];
5873  * crc = val ^ (crc >> 8);
5874  */
5875 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
5876   z_xr(val, crc);                         // val ^= crc. The contents of val are overwritten.
5877   fold_byte_crc32(crc, val, table, val);  // Table lookup and fold into crc. val is also passed as 4th argument (presumably a scratch register - confirm against fold_byte_crc32).
5878 }
5879 
5880 
5881 /**
      * Emits a byte-wise loop which folds len bytes, starting at buf, into crc.
      * buf is advanced and len is counted down by the loop, i.e. both registers are modified.
      * crc is not inverted here.
      *
5882  * @param crc   register containing existing CRC (32-bit)
5883  * @param buf   register pointing to input byte buffer (byte*)
5884  * @param len   register containing number of bytes
5885  * @param table register pointing to CRC table
      * @param data  work register, receives the current input byte (zero extended)
5886  */
5887 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, Register data) {

5888   assert_different_registers(crc, buf, len, table, data);
5889 
5890   Label L_mainLoop, L_done;
5891   const int mainLoop_stepping = 1;
5892 
5893   // Process all bytes in a single-byte loop.
5894   z_ltr(len, len);                        // Test len.
5895   z_brnh(L_done);                         // len is not positive: skip the loop, crc remains unchanged.
5896 




5897   bind(L_mainLoop);
5898     z_llgc(data, Address(buf, (intptr_t)0));// Current byte of input buffer (zero extended). Avoids garbage in upper half of register.
5899     add2reg(buf, mainLoop_stepping);        // Advance buffer position.
5900     update_byte_crc32(crc, data, table);    // Fold the byte into crc. Clobbers data.
5901     z_brct(len, L_mainLoop);                // Iterate. len serves as loop counter.
5902 




5903   bind(L_done);
5904 }
5905 
5906 /**
5907  * Emits code to update CRC-32 with a 4-byte value according to constants in table.
5908  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c.
5909  *
      * The new value is accumulated in t0 and finally transferred to crc via lgr_if_needed.
      * NOTE(review): bufDisp presumably is the displacement of the word relative to buf,
      *               bufInc the amount buf is advanced by - confirm.
5910  */
5911 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
5912                                         Register t0,  Register t1,  Register t2,    Register t3) {
5913   // This is what we implement (the DOBIG4 part):
5914   //
5915   // #define DOBIG4 c ^= *++buf4; \
5916   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
5917   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
5918   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
       // ix0..ix3: presumably the byte offsets of crc_table[4]..crc_table[7] within table (4 bytes per entry) - confirm.
5919   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
5920   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
5921   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
5922   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);


5940   z_ly(t2, Address(table, t2, (intptr_t)ix1));
5941   z_ly(t1, Address(table, t1, (intptr_t)ix2));
5942   z_ly(t0, Address(table, t0, (intptr_t)ix3));
5943 
5944   // Calculate new crc from table values.
5945   z_xr(t2, t3);
5946   z_xr(t0, t1);
5947   z_xr(t0, t2);  // Now t0 contains the combination of all four table values.
5948   lgr_if_needed(crc, t0);  // Transfer the result to crc.
5949 }
5950 
5951 /**
      * Emits code to compute the CRC-32 of a byte buffer, processing 8 bytes (two words)
      * per main loop iteration and the remaining bytes in a byte loop.
      *
5952  * @param crc   register containing existing CRC (32-bit)
5953  * @param buf   register pointing to input byte buffer (byte*)
5954  * @param len   register containing number of bytes
5955  * @param table register pointing to CRC table
      * @param t0    work register, serves as data register for the byte loop
      * @param t1,t2,t3 work registers, handed to update_1word_crc32
      * @param invertCRC if true, code to 1s-complement crc is emitted before and after the calculation
5956  *
5957  * uses Z_R10..Z_R13 as work register. Must be saved/restored by caller!
      * NOTE(review): the work registers used here are those passed in as t0..t3, plus Z_R0
      *               as loop counter - confirm that callers pass Z_R10..Z_R13.
5958  */
5959 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
5960                                         Register t0,  Register t1,  Register t2,  Register t3,
5961                                         bool invertCRC) {
5962   assert_different_registers(crc, buf, len, table);
5963 
5964   Label L_mainLoop, L_tail;
5965   Register  data = t0;
5966   Register  ctr  = Z_R0;
5967   const int mainLoop_stepping = 8;
5968   const int tailLoop_stepping = 1;      // Not referenced in this function.
5969   const int log_stepping      = exact_log2(mainLoop_stepping);
5970 
5971   // Don't test for len <= 0 here. This pathological case should not occur anyway.
5972   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
5973   // The situation itself is detected and handled correctly by the conditional branch
5974   // following z_srag(ctr, len, log_stepping) and by the len test in update_byteLoop_crc32.
5975 
5976   if (invertCRC) {
5977     not_(crc, noreg, false);           // 1s complement of crc
5978   }
5979 
5980 #if 0
5981   {
5982     // Pre-mainLoop alignment did not show any positive effect on performance.
5983     // We leave the code in for reference. Maybe the vector instructions in z13 depend on alignment.
5984 
5985     z_cghi(len, mainLoop_stepping);    // Alignment is useless for short data streams.
5986     z_brnh(L_tail);
5987 
5988     // Align buf to word (4-byte) boundary.
5989     z_lcr(ctr, buf);
5990     rotate_then_insert(ctr, ctr, 62, 63, 0, true); // TODO: should set cc
5991     z_sgfr(len, ctr);                  // Remaining len after alignment.
5992 
5993     update_byteLoop_crc32(crc, buf, ctr, table, data);
5994   }
5995 #endif
5996 
5997   // Check for short (<mainLoop_stepping bytes) buffer.
5998   z_srag(ctr, len, log_stepping);      // ctr = len >> log_stepping = #iterations of mainLoop.
5999   z_brnh(L_tail);                      // No complete 8-byte chunk: byte loop only.
6000 
6001   z_lrvr(crc, crc);             // Revert byte order because we are dealing with big-endian data.
6002   rotate_then_insert(len, len, 64-log_stepping, 63, 0, true); // #bytes for tailLoop
6003 
6004   BIND(L_mainLoop);
6005     update_1word_crc32(crc, buf, table, 0, 0, crc, t1, t2, t3);                 // bufDisp = 0, bufInc = 0. crc is passed as t0.
6006     update_1word_crc32(crc, buf, table, 4, mainLoop_stepping, crc, t1, t2, t3); // bufDisp = 4, bufInc = 8. crc is passed as t0.
6007     z_brct(ctr, L_mainLoop);    // Iterate.
6008 
6009   z_lrvr(crc, crc);        // Revert byte order back to original.
6010 
6011   // Process last few (<8) bytes of buffer.
6012   BIND(L_tail);
6013   update_byteLoop_crc32(crc, buf, len, table, data);
6014 
6015   if (invertCRC) {
6016     not_(crc, noreg, false);           // 1s complement of crc
6017   }
6018 }
6019 
6020 /**
      * Emits code to compute the CRC-32 of a byte buffer, processing 4 bytes (one word)
      * per main loop iteration and the remaining bytes in a byte loop.
      *
6021  * @param crc   register containing existing CRC (32-bit)
6022  * @param buf   register pointing to input byte buffer (byte*)
6023  * @param len   register containing number of bytes
6024  * @param table register pointing to CRC table
      * @param t0    work register, serves as data register for the byte loop
      * @param t1,t2,t3 work registers, handed to update_1word_crc32
      * @param invertCRC if true, code to 1s-complement crc is emitted before and after the calculation
6025  *
6026  * uses Z_R10..Z_R13 as work register. Must be saved/restored by caller!
      * NOTE(review): the work registers used here are those passed in as t0..t3, plus Z_R0
      *               as loop counter - confirm that callers pass Z_R10..Z_R13.
6027  */
6028 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
6029                                         Register t0,  Register t1,  Register t2,  Register t3,
6030                                         bool invertCRC) {
6031   assert_different_registers(crc, buf, len, table);
6032 
6033   Label L_mainLoop, L_tail;
6034   Register  data = t0;
6035   Register  ctr  = Z_R0;
6036   const int mainLoop_stepping = 4;
6037   const int log_stepping      = exact_log2(mainLoop_stepping);
6038 
6039   // Don't test for len <= 0 here. This pathological case should not occur anyway.
6040   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
6041   // The situation itself is detected and handled correctly by the conditional branch
6042   // following z_srag(ctr, len, log_stepping) and by the len test in update_byteLoop_crc32.
6043 
6044   if (invertCRC) {
6045     not_(crc, noreg, false);           // 1s complement of crc
6046   }
6047 
6048   // Check for short (<4 bytes) buffer.
6049   z_srag(ctr, len, log_stepping);      // ctr = len >> log_stepping = #iterations of mainLoop.
6050   z_brnh(L_tail);                      // No complete 4-byte chunk: byte loop only.
6051 
6052   z_lrvr(crc, crc);          // Revert byte order because we are dealing with big-endian data.
6053   rotate_then_insert(len, len, 64-log_stepping, 63, 0, true); // #bytes for tailLoop
6054 
6055   BIND(L_mainLoop);
6056     update_1word_crc32(crc, buf, table, 0, mainLoop_stepping, crc, t1, t2, t3); // bufDisp = 0, bufInc = 4. crc is passed as t0.
6057     z_brct(ctr, L_mainLoop); // Iterate.
6058   z_lrvr(crc, crc);          // Revert byte order back to original.
6059 
6060   // Process last few (<4) bytes of buffer.
6061   BIND(L_tail);
6062   update_byteLoop_crc32(crc, buf, len, table, data);
6063 
6064   if (invertCRC) {
6065     not_(crc, noreg, false);           // 1s complement of crc
6066   }
6067 }
6068 
6069 /**
      * Emits code to compute the CRC-32 of a byte buffer, one byte at a time.
      *
6070  * @param crc   register containing existing CRC (32-bit)
6071  * @param buf   register pointing to input byte buffer (byte*)
6072  * @param len   register containing number of bytes
6073  * @param table register pointing to CRC table
      * @param t0    work register, serves as data register for the byte loop
      * @param t1,t2,t3 not used by this function
      * @param invertCRC if true, code to 1s-complement crc is emitted before and after the byte loop
6074  */
6075 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
6076                                         Register t0,  Register t1,  Register t2,  Register t3,
6077                                         bool invertCRC) {
6078   assert_different_registers(crc, buf, len, table);
6079   Register data = t0;
6080 
6081   if (invertCRC) {
6082     not_(crc, noreg, false);           // 1s complement of crc
6083   }
6084 
6085   update_byteLoop_crc32(crc, buf, len, table, data); // Modifies buf and len, clobbers data.
6086 
6087   if (invertCRC) {
6088     not_(crc, noreg, false);           // 1s complement of crc
6089   }
6090 }
6091 
6092 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp,
6093                                              bool invertCRC) {
       // Emits code to fold exactly one byte, loaded from the address in buf, into crc.
       // If invertCRC is true, crc is 1s-complemented before and after the update.
       // len is only checked for register conflicts, its contents are not used. tmp receives the loaded byte.
6094   assert_different_registers(crc, buf, len, table, tmp);
6095 
6096   if (invertCRC) {
6097     not_(crc, noreg, false);           // 1s complement of crc
6098   }
6099 
6100   z_llgc(tmp, Address(buf, (intptr_t)0));  // Current byte of input buffer (zero extended). Avoids garbage in upper half of register.
6101   update_byte_crc32(crc, tmp, table);      // Clobbers tmp.
6102 
6103   if (invertCRC) {
6104     not_(crc, noreg, false);           // 1s complement of crc
6105   }
6106 }
6107 
6108 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table,
6109                                                 bool invertCRC) {
       // Emits code to fold one byte, which is already contained in register val, into crc.
       // If invertCRC is true, crc is 1s-complemented before and after the update.
       // val is not zero extended here (NOTE(review): presumably the caller's job - confirm).
6110   assert_different_registers(crc, val, table);
6111 
6112   if (invertCRC) {
6113     not_(crc, noreg, false);           // 1s complement of crc
6114   }
6115 
6116   update_byte_crc32(crc, val, table);  // Clobbers val.
6117 
6118   if (invertCRC) {
6119     not_(crc, noreg, false);           // 1s complement of crc
6120   }
6121 }
6122 
6123 //
6124 // Code for BigInteger::multiplyToLen() intrinsic.
6125 //
6126 
6127 // dest_lo += src1 + src2
6128 // dest_hi += carry1 + carry2
6129 // Z_R7 is destroyed ! It is cleared before any operand is read, so none of the operands may be Z_R7.
6130 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo,
6131                                      Register src1, Register src2) {
6132   clear_reg(Z_R7);          // Zero addend for the add-with-carry instructions below.
6133   z_algr(dest_lo, src1);    // dest_lo += src1
6134   z_alcgr(dest_hi, Z_R7);   // dest_hi += carry1
6135   z_algr(dest_lo, src2);    // dest_lo += src2
6136   z_alcgr(dest_hi, Z_R7);   // dest_hi += carry2
6137 }
6138 
6139 // Multiply 64 bit by 64 bit first loop.
6140 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,


< prev index next >