
src/cpu/ppc/vm/macroAssembler_ppc.cpp

rev 12672 : [mq]: crc32_ppc.patch
   1 /*
   2  * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *


4075  * @param [in]val       Register containing the byte to fold into the CRC.
4076  * @param [in]table     Register containing the table of crc constants.
4077  *
4078  * uint32_t crc;
4079  * val = crc_table[(val ^ crc) & 0xFF];
4080  * crc = val ^ (crc >> 8);
4081  */
4082 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4083   BLOCK_COMMENT("update_byte_crc32:");
4084   xorr(val, val, crc);
4085   fold_byte_crc32(crc, val, table, val);
4086 }
4087 
4088 /**
4089  * @param crc   register containing existing CRC (32-bit)
4090  * @param buf   register pointing to input byte buffer (byte*)
4091  * @param len   register containing number of bytes
4092  * @param table register pointing to CRC table
4093  */
4094 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4095                                            Register data, bool loopAlignment, bool invertCRC) {
4096   assert_different_registers(crc, buf, len, table, data);
4097 
4098   Label L_mainLoop, L_done;
4099   const int mainLoop_stepping  = 1;
4100   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4101 
4102   // Process all bytes in a single-byte loop.
4103   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4104   beq(CCR0, L_done);
4105 
4106   if (invertCRC) {
4107     nand(crc, crc, crc);                         // ~c
4108   }
4109 
4110   mtctr(len);
4111   align(mainLoop_alignment);
4112   BIND(L_mainLoop);
4113     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4114     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4115     update_byte_crc32(crc, data, table);
4116     bdnz(L_mainLoop);                            // Iterate.
4117 
4118   if (invertCRC) {
4119     nand(crc, crc, crc);                         // ~c
4120   }
4121 
4122   bind(L_done);
4123 }
4124 
4125 /**
4126  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4127  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4128  */
4129 // A note on the lookup table address(es):
4130 // The lookup table consists of two sets of four columns each.
4131 // The columns {0..3} are used for little-endian machines.
4132 // The columns {4..7} are used for big-endian machines.
4133 // To save the effort of adding the column offset to the table address each time
4134 // a table element is looked up, it is possible to pass the pre-calculated
4135 // column addresses.
4136 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4137 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4138                                         Register t0,  Register t1,  Register t2,  Register t3,
4139                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4140   assert_different_registers(crc, t3);
4141 


4158   lwzx(t1, tc1, t1);
4159   lwzx(t2, tc2, t2);
4160   lwzx(t3, tc3, t3);
4161 
4162   // Calculate new crc from table values.
4163   xorr(t0,  t0, t1);
4164   xorr(t2,  t2, t3);
4165   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4166 }
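
The two-set, four-column table enables a slicing-by-4 update: all four bytes of a word are folded into the CRC with a single XOR followed by four independent table lookups. A hedged little-endian C++ sketch of that step (the helper name and the column numbering, which follows zlib's crc32.c, are assumptions and not taken from this file):

  #include <cstdint>

  // tab[0] is the ordinary byte-at-a-time column; tab[1..3] are the extra columns.
  static inline uint32_t crc32_update_word_le(uint32_t crc, uint32_t word,
                                              const uint32_t tab[4][256]) {
    uint32_t c = crc ^ word;                  // fold four message bytes at once
    return tab[3][ c        & 0xff] ^
           tab[2][(c >>  8) & 0xff] ^
           tab[1][(c >> 16) & 0xff] ^
           tab[0][(c >> 24) & 0xff];
  }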
4167 
4168 /**
4169  * @param crc   register containing existing CRC (32-bit)
4170  * @param buf   register pointing to input byte buffer (byte*)
4171  * @param len   register containing number of bytes
4172  * @param table register pointing to CRC table
4173  *
4174  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4175  */
4176 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4177                                         Register t0,  Register t1,  Register t2,  Register t3,
4178                                         Register tc0, Register tc1, Register tc2, Register tc3) {

4179   assert_different_registers(crc, buf, len, table);
4180 
4181   Label L_mainLoop, L_tail;
4182   Register  tmp  = t0;
4183   Register  data = t0;
4184   Register  tmp2 = t1;
4185   const int mainLoop_stepping  = 8;
4186   const int tailLoop_stepping  = 1;
4187   const int log_stepping       = exact_log2(mainLoop_stepping);
4188   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4189   const int complexThreshold   = 2*mainLoop_stepping;
4190 
4191   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4192   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
4193   // The situation itself is detected and handled correctly by the conditional branches
4194   // following  aghi(len, -stepping) and aghi(len, +stepping).
4195   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4196 
4197   BLOCK_COMMENT("kernel_crc32_2word {");
4198 
4199   nand(crc, crc, crc);                           // ~c


4200 
4201   // Check for short (<mainLoop_stepping) buffer.
4202   cmpdi(CCR0, len, complexThreshold);
4203   blt(CCR0, L_tail);
4204 
4205   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4206   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4207   {
4208     // Align buf addr to mainLoop_stepping boundary.
4209     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4210     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the rightmost log_stepping bits: # bytes to the next stepping boundary.
4211 
4212     if (complexThreshold > mainLoop_stepping) {
4213       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4214     } else {
4215       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4216       cmpdi(CCR0, tmp, mainLoop_stepping);
4217       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4218       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4219     }
4220     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4221   }
4222 
4223   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4224   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4225   mtctr(tmp2);
4226 
4227 #ifdef VM_LITTLE_ENDIAN
4228   Register crc_rv = crc;
4229 #else
4230   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4231                                                  // Occupies tmp, but frees up crc.
4232   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4233   tmp = crc;
4234 #endif
4235 
4236   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4237 
4238   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4239   BIND(L_mainLoop);
4240     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4241     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4242     bdnz(L_mainLoop);
4243 
4244 #ifndef VM_LITTLE_ENDIAN
4245   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4246   tmp = crc_rv;                                  // Tmp uses its original register again.
4247 #endif
4248 
4249   // Restore original table address for tailLoop.
4250   if (reconstructTableOffset != 0) {
4251     addi(table, table, -reconstructTableOffset);
4252   }
4253 
4254   // Process last few (<complexThreshold) bytes of buffer.
4255   BIND(L_tail);
4256   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4257 
4258   nand(crc, crc, crc);                           // ~c


4259   BLOCK_COMMENT("} kernel_crc32_2word");
4260 }
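
The neg/rldicl pair in the alignment block of kernel_crc32_2word above (and in the analogous block of kernel_crc32_1word below) computes the number of single-byte pre-loop iterations, i.e. (-buf) mod mainLoop_stepping. A hedged C++ equivalent (helper name illustrative):

  #include <cstddef>
  #include <cstdint>

  // Bytes to process before buf reaches a stepping-aligned address (stepping is a power of two).
  static inline size_t bytes_to_alignment(const void* buf, size_t stepping) {
    return (0 - (uintptr_t)buf) & (stepping - 1);
  }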
4261 
4262 /**
4263  * @param crc   register containing existing CRC (32-bit)
4264  * @param buf   register pointing to input byte buffer (byte*)
4265  * @param len   register containing number of bytes
4266  * @param table register pointing to CRC table
4267  *
4268  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4269  */
4270 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4271                                         Register t0,  Register t1,  Register t2,  Register t3,
4272                                         Register tc0, Register tc1, Register tc2, Register tc3) {

4273   assert_different_registers(crc, buf, len, table);
4274 
4275   Label L_mainLoop, L_tail;
4276   Register  tmp          = t0;
4277   Register  data         = t0;
4278   Register  tmp2         = t1;
4279   const int mainLoop_stepping  = 4;
4280   const int tailLoop_stepping  = 1;
4281   const int log_stepping       = exact_log2(mainLoop_stepping);
4282   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4283   const int complexThreshold   = 2*mainLoop_stepping;
4284 
4285   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4286   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
4287   // The situation itself is detected and handled correctly by the conditional branches
4288   // following  aghi(len, -stepping) and aghi(len, +stepping).
4289   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4290 
4291   BLOCK_COMMENT("kernel_crc32_1word {");
4292 
4293   nand(crc, crc, crc);                           // ~c


4294 
4295   // Check for short (<mainLoop_stepping) buffer.
4296   cmpdi(CCR0, len, complexThreshold);
4297   blt(CCR0, L_tail);
4298 
4299   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4300   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4301   {
4302     // Align buf addr to mainLoop_stepping boundary.
4303     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4304     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the rightmost log_stepping bits: # bytes to the next stepping boundary.
4305 
4306     if (complexThreshold > mainLoop_stepping) {
4307       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4308     } else {
4309       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4310       cmpdi(CCR0, tmp, mainLoop_stepping);
4311       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4312       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4313     }
4314     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4315   }
4316 
4317   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4318   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4319   mtctr(tmp2);
4320 
4321 #ifdef VM_LITTLE_ENDIAN
4322   Register crc_rv = crc;
4323 #else
4324   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4325                                                  // Occupies tmp, but frees up crc.
4326   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4327   tmp = crc;
4328 #endif
4329 
4330   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4331 
4332   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4333   BIND(L_mainLoop);
4334     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4335     bdnz(L_mainLoop);
4336 
4337 #ifndef VM_LITTLE_ENDIAN
4338   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4339   tmp = crc_rv;                                  // Tmp uses its original register again.
4340 #endif
4341 
4342   // Restore original table address for tailLoop.
4343   if (reconstructTableOffset != 0) {
4344     addi(table, table, -reconstructTableOffset);
4345   }
4346 
4347   // Process last few (<complexThreshold) bytes of buffer.
4348   BIND(L_tail);
4349   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4350 
4351   nand(crc, crc, crc);                           // ~c


4352   BLOCK_COMMENT("} kernel_crc32_1word");
4353 }
4354 
4355 /**
4356  * @param crc   register containing existing CRC (32-bit)
4357  * @param buf   register pointing to input byte buffer (byte*)
4358  * @param len   register containing number of bytes
4359  * @param table register pointing to CRC table
4360  *
4361  * Uses R7_ARG5, R8_ARG6 as work registers.
4362  */
4363 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4364                                         Register t0,  Register t1,  Register t2,  Register t3) {

4365   assert_different_registers(crc, buf, len, table);
4366 
4367   Register  data = t0;                   // Holds the current byte to be folded into crc.
4368 
4369   BLOCK_COMMENT("kernel_crc32_1byte {");
4370 




4371   // Process all bytes in a single-byte loop.
4372   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
4373 



4374   BLOCK_COMMENT("} kernel_crc32_1byte");
4375 }
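
With invertCRC set, the byte loop above complements the CRC on entry and exit. The overall effect corresponds to the following scalar model, a sketch assuming a standard zlib-style 256-entry table (not the emitted code itself):

  #include <cstddef>
  #include <cstdint>

  // Scalar model of kernel_crc32_1byte: complement, byte-at-a-time table updates, complement.
  static uint32_t crc32_bytes(uint32_t crc, const uint8_t* buf, size_t len,
                              const uint32_t table[256]) {
    crc = ~crc;                                        // ~c on entry
    for (size_t i = 0; i < len; ++i) {
      crc = table[(crc ^ buf[i]) & 0xff] ^ (crc >> 8); // update_byte_crc32
    }
    return ~crc;                                       // ~c on exit
  }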
4376 
4377 /**
4378  * @param crc             register containing existing CRC (32-bit)
4379  * @param buf             register pointing to input byte buffer (byte*)
4380  * @param len             register containing number of bytes
4381  * @param table           register pointing to CRC table
4382  * @param constants       register pointing to CRC table for 128-bit aligned memory
4383  * @param barretConstants register pointing to table for barrett reduction
4384  * @param t0              volatile register
4385  * @param t1              volatile register
4386  * @param t2              volatile register
4387  * @param t3              volatile register
4388  */
4389 void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
4390                                                 Register constants,  Register barretConstants,
4391                                                 Register t0,  Register t1, Register t2, Register t3, Register t4) {

4392   assert_different_registers(crc, buf, len, table);
4393 
4394   Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
4395 
4396   Register  prealign     = t0;
4397   Register  postalign    = t0;
4398 
4399   BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
4400 
4401   // 1. Use kernel_crc32_1word for short buffers (< 384 bytes).
4402   clrldi(len, len, 32);
4403   cmpdi(CCR0, len, 384);
4404   bge(CCR0, L_start);
4405 
4406     Register tc0 = t4;
4407     Register tc1 = constants;
4408     Register tc2 = barretConstants;
4409     kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
4410     b(L_end);
4411 
4412   BIND(L_start);
4413 
4414     // 2. ~c
4415     nand(crc, crc, crc);


4416 
4417     // 3. calculate from 0 to first 128bit-aligned address
4418     clrldi_(prealign, buf, 57);
4419     beq(CCR0, L_alignedHead);
4420 
4421     subfic(prealign, prealign, 128);
4422 
4423     subf(len, prealign, len);
4424     update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);
4425 
4426     // 4. calculate from first 128bit-aligned address to last 128bit-aligned address
4427     BIND(L_alignedHead);
4428 
4429     clrldi(postalign, len, 57);
4430     subf(len, postalign, len);
4431 
4432     // len must be more than 256bit
4433     kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
4434 
4435     // 5. calculate remaining
4436     cmpdi(CCR0, postalign, 0);
4437     beq(CCR0, L_tail);
4438 
4439     update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);
4440 
4441     BIND(L_tail);
4442 
4443     // 6. ~c
4444     nand(crc, crc, crc);


4445 
4446   BIND(L_end);
4447 
4448   BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
4449 }
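
The prealign/postalign arithmetic above splits the buffer into a byte-wise head up to the next 128-byte (hence also 128-bit aligned) boundary, a bulk middle handed to kernel_crc32_1word_aligned, and a byte-wise tail. A hedged sketch of that split, mirroring the clrldi/subfic/subf sequence (names are illustrative; assumes len >= 384 as checked at the top of the routine):

  #include <cstddef>
  #include <cstdint>

  // Split [buf, buf+len) into head / aligned middle / tail around 128-byte boundaries.
  static void split_for_vector_kernel(const void* buf, size_t len,
                                      size_t* head, size_t* middle, size_t* tail) {
    uintptr_t p = (uintptr_t)buf;
    *head   = (p & 127) ? (128 - (size_t)(p & 127)) : 0; // bytes up to next 128-byte boundary
    *middle = (len - *head) & ~(size_t)127;              // bulk, processed by the vector kernel
    *tail   = (len - *head) & 127;                       // leftover, handled byte by byte
  }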
4450 
4451 /**
4452  * @param crc             register containing existing CRC (32-bit)
4453  * @param buf             register pointing to input byte buffer (byte*)
4454  * @param len             register containing number of bytes
4455  * @param constants       register pointing to CRC table for 128-bit aligned memory
4456  * @param barretConstants register pointing to table for barrett reduction
4457  * @param t0              volatile register
4458  * @param t1              volatile register
4459  * @param t2              volatile register
4460  */
4461 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4462     Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
4463   Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
4464   Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;


4916   offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
4917   offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
4918   offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
4919   offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
4920   offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
4921   offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
4922   offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
4923   offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
4924   offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
4925   offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
4926   offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
4927   offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
4928   offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
4929   offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
4930   offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
4931   offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
4932   offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
4933   offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
4934 }
4935 
4936 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
4937   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4938 
4939   BLOCK_COMMENT("kernel_crc32_singleByte:");
4940   nand(crc, crc, crc);       // ~c


4941 
4942   lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
4943   update_byte_crc32(crc, tmp, table);
4944 
4945   nand(crc, crc, crc);       // ~c

















4946 }
4947 
4948 // dest_lo += src1 + src2
4949 // dest_hi += carry1 + carry2
4950 void MacroAssembler::add2_with_carry(Register dest_hi,
4951                                      Register dest_lo,
4952                                      Register src1, Register src2) {
4953   li(R0, 0);
4954   addc(dest_lo, dest_lo, src1);
4955   adde(dest_hi, dest_hi, R0);
4956   addc(dest_lo, dest_lo, src2);
4957   adde(dest_hi, dest_hi, R0);
4958 }
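
In C++ terms, this accumulates two 64-bit addends into a 128-bit value held as a (dest_hi, dest_lo) pair. A sketch of the equivalent carry propagation (reference signature only, not the emitted code):

  #include <cstdint>

  // dest_lo += src1 + src2, with both carries propagated into dest_hi.
  static inline void add2_with_carry(uint64_t& dest_hi, uint64_t& dest_lo,
                                     uint64_t src1, uint64_t src2) {
    dest_lo += src1; dest_hi += (dest_lo < src1); // carry out of first add
    dest_lo += src2; dest_hi += (dest_lo < src2); // carry out of second add
  }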
4959 
4960 // Multiply 64 bit by 64 bit first loop.
4961 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4962                                            Register x_xstart,
4963                                            Register y, Register y_idx,
4964                                            Register z,
4965                                            Register carry,


   1 /*
   2  * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *


4075  * @param [in]val       Register containing the byte to fold into the CRC.
4076  * @param [in]table     Register containing the table of crc constants.
4077  *
4078  * uint32_t crc;
4079  * val = crc_table[(val ^ crc) & 0xFF];
4080  * crc = val ^ (crc >> 8);
4081  */
4082 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4083   BLOCK_COMMENT("update_byte_crc32:");
4084   xorr(val, val, crc);
4085   fold_byte_crc32(crc, val, table, val);
4086 }
4087 
4088 /**
4089  * @param crc   register containing existing CRC (32-bit)
4090  * @param buf   register pointing to input byte buffer (byte*)
4091  * @param len   register containing number of bytes
4092  * @param table register pointing to CRC table
4093  */
4094 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4095                                            Register data, bool loopAlignment) {
4096   assert_different_registers(crc, buf, len, table, data);
4097 
4098   Label L_mainLoop, L_done;
4099   const int mainLoop_stepping  = 1;
4100   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4101 
4102   // Process all bytes in a single-byte loop.
4103   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4104   beq(CCR0, L_done);
4105 




4106   mtctr(len);
4107   align(mainLoop_alignment);
4108   BIND(L_mainLoop);
4109     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4110     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4111     update_byte_crc32(crc, data, table);
4112     bdnz(L_mainLoop);                            // Iterate.
4113 




4114   bind(L_done);
4115 }
4116 
4117 /**
4118  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4119  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4120  */
4121 // A note on the lookup table address(es):
4122 // The lookup table consists of two sets of four columns each.
4123 // The columns {0..3} are used for little-endian machines.
4124 // The columns {4..7} are used for big-endian machines.
4125 // To save the effort of adding the column offset to the table address each time
4126 // a table element is looked up, it is possible to pass the pre-calculated
4127 // column addresses.
4128 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4129 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4130                                         Register t0,  Register t1,  Register t2,  Register t3,
4131                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4132   assert_different_registers(crc, t3);
4133 


4150   lwzx(t1, tc1, t1);
4151   lwzx(t2, tc2, t2);
4152   lwzx(t3, tc3, t3);
4153 
4154   // Calculate new crc from table values.
4155   xorr(t0,  t0, t1);
4156   xorr(t2,  t2, t3);
4157   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4158 }
4159 
4160 /**
4161  * @param crc   register containing existing CRC (32-bit)
4162  * @param buf   register pointing to input byte buffer (byte*)
4163  * @param len   register containing number of bytes
4164  * @param table register pointing to CRC table
4165  *
4166  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4167  */
4168 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4169                                         Register t0,  Register t1,  Register t2,  Register t3,
4170                                         Register tc0, Register tc1, Register tc2, Register tc3,
4171                                         bool invertCRC) {
4172   assert_different_registers(crc, buf, len, table);
4173 
4174   Label L_mainLoop, L_tail;
4175   Register  tmp  = t0;
4176   Register  data = t0;
4177   Register  tmp2 = t1;
4178   const int mainLoop_stepping  = 8;
4179   const int tailLoop_stepping  = 1;
4180   const int log_stepping       = exact_log2(mainLoop_stepping);
4181   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4182   const int complexThreshold   = 2*mainLoop_stepping;
4183 
4184   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4185   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4186   // for all well-behaved cases. The situation itself is detected and handled correctly
4187   // within update_byteLoop_crc32.
4188   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4189 
4190   BLOCK_COMMENT("kernel_crc32_2word {");
4191 
4192   if (invertCRC) {
4193     nand(crc, crc, crc);                      // 1s complement of crc
4194   }
4195 
4196   // Check for short (<mainLoop_stepping) buffer.
4197   cmpdi(CCR0, len, complexThreshold);
4198   blt(CCR0, L_tail);
4199 
4200   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4201   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4202   {
4203     // Align buf addr to mainLoop_stepping boundary.
4204     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4205     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the rightmost log_stepping bits: # bytes to the next stepping boundary.
4206 
4207     if (complexThreshold > mainLoop_stepping) {
4208       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4209     } else {
4210       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4211       cmpdi(CCR0, tmp, mainLoop_stepping);
4212       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4213       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4214     }
4215     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4216   }
4217 
4218   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4219   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4220   mtctr(tmp2);
4221 
4222 #ifdef VM_LITTLE_ENDIAN
4223   Register crc_rv = crc;
4224 #else
4225   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4226                                                  // Occupies tmp, but frees up crc.
4227   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4228   tmp = crc;
4229 #endif
4230 
4231   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4232 
4233   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4234   BIND(L_mainLoop);
4235     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4236     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4237     bdnz(L_mainLoop);
4238 
4239 #ifndef VM_LITTLE_ENDIAN
4240   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4241   tmp = crc_rv;                                  // Tmp uses its original register again.
4242 #endif
4243 
4244   // Restore original table address for tailLoop.
4245   if (reconstructTableOffset != 0) {
4246     addi(table, table, -reconstructTableOffset);
4247   }
4248 
4249   // Process last few (<complexThreshold) bytes of buffer.
4250   BIND(L_tail);
4251   update_byteLoop_crc32(crc, buf, len, table, data, false);
4252 
4253   if (invertCRC) {
4254     nand(crc, crc, crc);                      // 1s complement of crc
4255   }
4256   BLOCK_COMMENT("} kernel_crc32_2word");
4257 }
4258 
4259 /**
4260  * @param crc   register containing existing CRC (32-bit)
4261  * @param buf   register pointing to input byte buffer (byte*)
4262  * @param len   register containing number of bytes
4263  * @param table register pointing to CRC table
4264  *
4265  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4266  */
4267 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4268                                         Register t0,  Register t1,  Register t2,  Register t3,
4269                                         Register tc0, Register tc1, Register tc2, Register tc3,
4270                                         bool invertCRC) {
4271   assert_different_registers(crc, buf, len, table);
4272 
4273   Label L_mainLoop, L_tail;
4274   Register  tmp          = t0;
4275   Register  data         = t0;
4276   Register  tmp2         = t1;
4277   const int mainLoop_stepping  = 4;
4278   const int tailLoop_stepping  = 1;
4279   const int log_stepping       = exact_log2(mainLoop_stepping);
4280   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4281   const int complexThreshold   = 2*mainLoop_stepping;
4282 
4283   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4284   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4285   // for all well-behaved cases. The situation itself is detected and handled correctly
4286   // within update_byteLoop_crc32.
4287   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4288 
4289   BLOCK_COMMENT("kernel_crc32_1word {");
4290 
4291   if (invertCRC) {
4292     nand(crc, crc, crc);                      // 1s complement of crc
4293   }
4294 
4295   // Check for short (<mainLoop_stepping) buffer.
4296   cmpdi(CCR0, len, complexThreshold);
4297   blt(CCR0, L_tail);
4298 
4299   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4300   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4301   {
4302     // Align buf addr to mainLoop_stepping boundary.
4303     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4304     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the rightmost log_stepping bits: # bytes to the next stepping boundary.
4305 
4306     if (complexThreshold > mainLoop_stepping) {
4307       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4308     } else {
4309       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4310       cmpdi(CCR0, tmp, mainLoop_stepping);
4311       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4312       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4313     }
4314     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4315   }
4316 
4317   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4318   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4319   mtctr(tmp2);
4320 
4321 #ifdef VM_LITTLE_ENDIAN
4322   Register crc_rv = crc;
4323 #else
4324   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4325                                                  // Occupies tmp, but frees up crc.
4326   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4327   tmp = crc;
4328 #endif
4329 
4330   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4331 
4332   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4333   BIND(L_mainLoop);
4334     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4335     bdnz(L_mainLoop);
4336 
4337 #ifndef VM_LITTLE_ENDIAN
4338   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4339   tmp = crc_rv;                                  // Tmp uses its original register again.
4340 #endif
4341 
4342   // Restore original table address for tailLoop.
4343   if (reconstructTableOffset != 0) {
4344     addi(table, table, -reconstructTableOffset);
4345   }
4346 
4347   // Process last few (<complexThreshold) bytes of buffer.
4348   BIND(L_tail);
4349   update_byteLoop_crc32(crc, buf, len, table, data, false);
4350 
4351   if (invertCRC) {
4352     nand(crc, crc, crc);                      // 1s complement of crc
4353   }
4354   BLOCK_COMMENT("} kernel_crc32_1word");
4355 }
4356 
4357 /**
4358  * @param crc   register containing existing CRC (32-bit)
4359  * @param buf   register pointing to input byte buffer (byte*)
4360  * @param len   register containing number of bytes
4361  * @param table register pointing to CRC table
4362  *
4363  * Uses R7_ARG5, R8_ARG6 as work registers.
4364  */
4365 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4366                                         Register t0,  Register t1,  Register t2,  Register t3,
4367                                         bool invertCRC) {
4368   assert_different_registers(crc, buf, len, table);
4369 
4370   Register  data = t0;                   // Holds the current byte to be folded into crc.
4371 
4372   BLOCK_COMMENT("kernel_crc32_1byte {");
4373 
4374   if (invertCRC) {
4375     nand(crc, crc, crc);                      // 1s complement of crc
4376   }
4377 
4378   // Process all bytes in a single-byte loop.
4379   update_byteLoop_crc32(crc, buf, len, table, data, true);
4380 
4381   if (invertCRC) {
4382     nand(crc, crc, crc);                      // 1s complement of crc
4383   }
4384   BLOCK_COMMENT("} kernel_crc32_1byte");
4385 }
4386 
4387 /**
4388  * @param crc             register containing existing CRC (32-bit)
4389  * @param buf             register pointing to input byte buffer (byte*)
4390  * @param len             register containing number of bytes
4391  * @param table           register pointing to CRC table
4392  * @param constants       register pointing to CRC table for 128-bit aligned memory
4393  * @param barretConstants register pointing to table for barrett reduction
4394  * @param t0              volatile register
4395  * @param t1              volatile register
4396  * @param t2              volatile register
4397  * @param t3              volatile register
4398  */
4399 void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
4400                                                 Register constants,  Register barretConstants,
4401                                                 Register t0,  Register t1, Register t2, Register t3, Register t4,
4402                                                 bool invertCRC) {
4403   assert_different_registers(crc, buf, len, table);
4404 
4405   Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
4406 
4407   Register  prealign     = t0;
4408   Register  postalign    = t0;
4409 
4410   BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
4411 
4412   // 1. Use kernel_crc32_1word for short buffers (< 384 bytes).
4413   clrldi(len, len, 32);
4414   cmpdi(CCR0, len, 384);
4415   bge(CCR0, L_start);
4416 
4417     Register tc0 = t4;
4418     Register tc1 = constants;
4419     Register tc2 = barretConstants;
4420     kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
4421     b(L_end);
4422 
4423   BIND(L_start);
4424 
4425     // 2. ~c
4426     if (invertCRC) {
4427       nand(crc, crc, crc);                      // 1s complement of crc
4428     }
4429 
4430     // 3. calculate from 0 to first 128bit-aligned address
4431     clrldi_(prealign, buf, 57);
4432     beq(CCR0, L_alignedHead);
4433 
4434     subfic(prealign, prealign, 128);
4435 
4436     subf(len, prealign, len);
4437     update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4438 
4439     // 4. calculate from first 128bit-aligned address to last 128bit-aligned address
4440     BIND(L_alignedHead);
4441 
4442     clrldi(postalign, len, 57);
4443     subf(len, postalign, len);
4444 
4445     // len must be more than 256bit
4446     kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
4447 
4448     // 5. calculate remaining
4449     cmpdi(CCR0, postalign, 0);
4450     beq(CCR0, L_tail);
4451 
4452     update_byteLoop_crc32(crc, buf, postalign, table, t2, false);
4453 
4454     BIND(L_tail);
4455 
4456     // 6. ~c
4457     if (invertCRC) {
4458       nand(crc, crc, crc);                      // 1s complement of crc
4459     }
4460 
4461   BIND(L_end);
4462 
4463   BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
4464 }
4465 
4466 /**
4467  * @param crc             register containing existing CRC (32-bit)
4468  * @param buf             register pointing to input byte buffer (byte*)
4469  * @param len             register containing number of bytes
4470  * @param constants       register pointing to CRC table for 128-bit aligned memory
4471  * @param barretConstants register pointing to table for barrett reduction
4472  * @param t0              volatile register
4473  * @param t1              volatile register
4474  * @param t2              volatile register
4475  */
4476 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4477     Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
4478   Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
4479   Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;


4931   offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
4932   offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
4933   offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
4934   offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
4935   offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
4936   offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
4937   offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
4938   offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
4939   offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
4940   offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
4941   offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
4942   offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
4943   offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
4944   offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
4945   offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
4946   offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
4947   offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
4948   offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
4949 }
4950 
4951 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4952   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4953 
4954   BLOCK_COMMENT("kernel_crc32_singleByte:");
4955   if (invertCRC) {
4956     nand(crc, crc, crc);                // 1s complement of crc
4957   }
4958 
4959   lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
4960   update_byte_crc32(crc, tmp, table);
4961 
4962   if (invertCRC) {
4963     nand(crc, crc, crc);                // 1s complement of crc
4964   }
4965 }
4966 
4967 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
4968   assert_different_registers(crc, val, table);
4969 
4970   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4971   if (invertCRC) {
4972     nand(crc, crc, crc);                // 1s complement of crc
4973   }
4974 
4975   update_byte_crc32(crc, val, table);
4976 
4977   if (invertCRC) {
4978     nand(crc, crc, crc);                // 1s complement of crc
4979   }
4980 }
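
A hedged scalar model of this single-byte update, including the optional complement controlled by invertCRC (the function name is illustrative; a zlib-style table is assumed):

  #include <cstdint>

  // One table-driven update of the running CRC with the byte in the low 8 bits of val.
  static inline uint32_t crc32_update_byte(uint32_t crc, uint32_t val,
                                           const uint32_t table[256], bool invertCRC) {
    if (invertCRC) crc = ~crc;
    crc = table[(crc ^ val) & 0xff] ^ (crc >> 8);
    if (invertCRC) crc = ~crc;
    return crc;
  }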
4981 
4982 // dest_lo += src1 + src2
4983 // dest_hi += carry1 + carry2
4984 void MacroAssembler::add2_with_carry(Register dest_hi,
4985                                      Register dest_lo,
4986                                      Register src1, Register src2) {
4987   li(R0, 0);
4988   addc(dest_lo, dest_lo, src1);
4989   adde(dest_hi, dest_hi, R0);
4990   addc(dest_lo, dest_lo, src2);
4991   adde(dest_hi, dest_hi, R0);
4992 }
4993 
4994 // Multiply 64 bit by 64 bit first loop.
4995 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4996                                            Register x_xstart,
4997                                            Register y, Register y_idx,
4998                                            Register z,
4999                                            Register carry,

