1 /*
2 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
4075 * @param [in] val   Register containing the byte to fold into the CRC.
4076 * @param [in] table Register containing the table of crc constants.
4077 *
4078 * uint32_t crc;
4079 * val = crc_table[(val ^ crc) & 0xFF];
4080 * crc = val ^ (crc >> 8);
4081 */
4082 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4083 BLOCK_COMMENT("update_byte_crc32:");
4084 xorr(val, val, crc);
4085 fold_byte_crc32(crc, val, table, val);
4086 }
4087
4088 /**
4089 * @param crc register containing existing CRC (32-bit)
4090 * @param buf register pointing to input byte buffer (byte*)
4091 * @param len register containing number of bytes
4092 * @param table register pointing to CRC table
4093 */
4094 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4095 Register data, bool loopAlignment, bool invertCRC) {
4096 assert_different_registers(crc, buf, len, table, data);
4097
4098 Label L_mainLoop, L_done;
4099 const int mainLoop_stepping = 1;
4100 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4101
4102 // Process all bytes in a single-byte loop.
4103 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
4104 beq(CCR0, L_done);
4105
4106 if (invertCRC) {
4107 nand(crc, crc, crc); // ~c
4108 }
4109
4110 mtctr(len);
4111 align(mainLoop_alignment);
4112 BIND(L_mainLoop);
4113 lbz(data, 0, buf); // Byte from buffer, zero-extended.
4114 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
4115 update_byte_crc32(crc, data, table);
4116 bdnz(L_mainLoop); // Iterate.
4117
4118 if (invertCRC) {
4119 nand(crc, crc, crc); // ~c
4120 }
4121
4122 bind(L_done);
4123 }
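
/*
 * For reference, a C-level sketch (not HotSpot code) of what the emitted
 * byte loop computes, assuming a standard zlib-style 256-entry crc_table:
 *
 *   if (invertCRC) crc = ~crc;
 *   while (len-- > 0)
 *     crc = crc_table[(crc ^ *buf++) & 0xFF] ^ (crc >> 8);
 *   if (invertCRC) crc = ~crc;
 */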
4124
4125 /**
4126 * Emits code to update CRC-32 with a 4-byte value according to constants in table
4127 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4128 */
4129 // A note on the lookup table address(es):
4130 // The lookup table consists of two sets of four columns each.
4131 // The columns {0..3} are used for little-endian machines.
4132 // The columns {4..7} are used for big-endian machines.
4133 // To save the effort of adding the column offset to the table address each time
4134 // a table element is looked up, it is possible to pass the pre-calculated
4135 // column addresses.
4136 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4137 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4138 Register t0, Register t1, Register t2, Register t3,
4139 Register tc0, Register tc1, Register tc2, Register tc3) {
4140 assert_different_registers(crc, t3);
4141
4158 lwzx(t1, tc1, t1);
4159 lwzx(t2, tc2, t2);
4160 lwzx(t3, tc3, t3);
4161
4162 // Calculate new crc from table values.
4163 xorr(t0, t0, t1);
4164 xorr(t2, t2, t3);
4165 xorr(crc, t0, t2); // Now crc contains the final checksum value.
4166 }
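
/*
 * For comparison, the corresponding little-endian slicing-by-4 step from
 * zlib's crc32.c (a sketch; the mapping of tab[0..3] onto the four table
 * columns used here is an assumption):
 *
 *   crc ^= *(uint32_t*)buf;
 *   crc  = tab[3][ crc        & 0xFF] ^ tab[2][(crc >>  8) & 0xFF]
 *        ^ tab[1][(crc >> 16) & 0xFF] ^ tab[0][(crc >> 24) & 0xFF];
 */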
4167
4168 /**
4169 * @param crc register containing existing CRC (32-bit)
4170 * @param buf register pointing to input byte buffer (byte*)
4171 * @param len register containing number of bytes
4172 * @param table register pointing to CRC table
4173 *
4174 * Uses R9..R12 as work registers. Must be saved/restored by caller!
4175 */
4176 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4177 Register t0, Register t1, Register t2, Register t3,
4178 Register tc0, Register tc1, Register tc2, Register tc3) {
4179 assert_different_registers(crc, buf, len, table);
4180
4181 Label L_mainLoop, L_tail;
4182 Register tmp = t0;
4183 Register data = t0;
4184 Register tmp2 = t1;
4185 const int mainLoop_stepping = 8;
4186 const int tailLoop_stepping = 1;
4187 const int log_stepping = exact_log2(mainLoop_stepping);
4188 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4189 const int complexThreshold = 2*mainLoop_stepping;
4190
4191 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4192 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4193 // for all well-behaved cases. The situation itself is detected and handled correctly
4194 // within update_byteLoop_crc32.
4195 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4196
4197 BLOCK_COMMENT("kernel_crc32_2word {");
4198
4199 nand(crc, crc, crc); // ~c
4200
4201 // Check for short (<mainLoop_stepping) buffer.
4202 cmpdi(CCR0, len, complexThreshold);
4203 blt(CCR0, L_tail);
4204
4205 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4206 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4207 {
4208 // Align buf addr to mainLoop_stepping boundary.
4209 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
4210 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate by 0 bits, AND with a mask keeping only bits 61..63 (tmp2 % mainLoop_stepping).
4211
4212 if (complexThreshold > mainLoop_stepping) {
4213 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4214 } else {
4215 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4216 cmpdi(CCR0, tmp, mainLoop_stepping);
4217 blt(CCR0, L_tail); // If less than one mainLoop_stepping remains, do only tail processing.
4218 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4219 }
4220 update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4221 }
4222
4223 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4224 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4225 mtctr(tmp2);
4226
4227 #ifdef VM_LITTLE_ENDIAN
4228 Register crc_rv = crc;
4229 #else
4230 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4231 // Occupies tmp, but frees up crc.
4232 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
4233 tmp = crc;
4234 #endif
4235
4236 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4237
4238 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4239 BIND(L_mainLoop);
4240 update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4241 update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4242 bdnz(L_mainLoop);
4243
4244 #ifndef VM_LITTLE_ENDIAN
4245 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
4246 tmp = crc_rv; // tmp uses its original register again.
4247 #endif
4248
4249 // Restore original table address for tailLoop.
4250 if (reconstructTableOffset != 0) {
4251 addi(table, table, -reconstructTableOffset);
4252 }
4253
4254 // Process last few (<complexThreshold) bytes of buffer.
4255 BIND(L_tail);
4256 update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4257
4258 nand(crc, crc, crc); // ~c
4259 BLOCK_COMMENT("} kernel_crc32_2word");
4260 }
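
/*
 * Shape of the code generated above, as a C-style sketch (register reuse
 * and the big-endian byte reversal omitted):
 *
 *   crc = ~crc;
 *   prealign = (-buf) & 7;            // bytes up to an 8-byte boundary
 *   len -= prealign;                  // prealign bytes go through the byte loop
 *   iterations = len >> 3;            // 8 bytes (2 words) per main iteration
 *   len &= 7;                         // tail bytes for the byte loop
 *   ... main loop: two table-driven word updates per iteration ...
 *   ... tail: byte loop over the remaining len bytes ...
 *   crc = ~crc;
 */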
4261
4262 /**
4263 * @param crc register containing existing CRC (32-bit)
4264 * @param buf register pointing to input byte buffer (byte*)
4265 * @param len register containing number of bytes
4266 * @param table register pointing to CRC table
4267 *
4268 * Uses R9..R12 as work registers. Must be saved/restored by caller!
4269 */
4270 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4271 Register t0, Register t1, Register t2, Register t3,
4272 Register tc0, Register tc1, Register tc2, Register tc3) {
4273 assert_different_registers(crc, buf, len, table);
4274
4275 Label L_mainLoop, L_tail;
4276 Register tmp = t0;
4277 Register data = t0;
4278 Register tmp2 = t1;
4279 const int mainLoop_stepping = 4;
4280 const int tailLoop_stepping = 1;
4281 const int log_stepping = exact_log2(mainLoop_stepping);
4282 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4283 const int complexThreshold = 2*mainLoop_stepping;
4284
4285 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4286 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4287 // for all well-behaved cases. The situation itself is detected and handled correctly
4288 // within update_byteLoop_crc32.
4289 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4290
4291 BLOCK_COMMENT("kernel_crc32_1word {");
4292
4293 nand(crc, crc, crc); // ~c
4294
4295 // Check for short (<mainLoop_stepping) buffer.
4296 cmpdi(CCR0, len, complexThreshold);
4297 blt(CCR0, L_tail);
4298
4299 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4300 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4301 {
4302 // Align buf addr to mainLoop_stepping boundary.
4303 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
4304 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate by 0 bits, AND with a mask keeping only bits 62..63 (tmp2 % mainLoop_stepping).
4305
4306 if (complexThreshold > mainLoop_stepping) {
4307 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4308 } else {
4309 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4310 cmpdi(CCR0, tmp, mainLoop_stepping);
4311 blt(CCR0, L_tail); // If less than one mainLoop_stepping remains, do only tail processing.
4312 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4313 }
4314 update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4315 }
4316
4317 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4318 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4319 mtctr(tmp2);
4320
4321 #ifdef VM_LITTLE_ENDIAN
4322 Register crc_rv = crc;
4323 #else
4324 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4325 // Occupies tmp, but frees up crc.
4326 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
4327 tmp = crc;
4328 #endif
4329
4330 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4331
4332 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4333 BIND(L_mainLoop);
4334 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4335 bdnz(L_mainLoop);
4336
4337 #ifndef VM_LITTLE_ENDIAN
4338 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
4339 tmp = crc_rv; // tmp uses its original register again.
4340 #endif
4341
4342 // Restore original table address for tailLoop.
4343 if (reconstructTableOffset != 0) {
4344 addi(table, table, -reconstructTableOffset);
4345 }
4346
4347 // Process last few (<complexThreshold) bytes of buffer.
4348 BIND(L_tail);
4349 update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4350
4351 nand(crc, crc, crc); // ~c
4352 BLOCK_COMMENT("} kernel_crc32_1word");
4353 }
4354
4355 /**
4356 * @param crc register containing existing CRC (32-bit)
4357 * @param buf register pointing to input byte buffer (byte*)
4358 * @param len register containing number of bytes
4359 * @param table register pointing to CRC table
4360 *
4361 * Uses R7_ARG5, R8_ARG6 as work registers.
4362 */
4363 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4364 Register t0, Register t1, Register t2, Register t3) {
4365 assert_different_registers(crc, buf, len, table);
4366
4367 Register data = t0; // Holds the current byte to be folded into crc.
4368
4369 BLOCK_COMMENT("kernel_crc32_1byte {");
4370
4371 // Process all bytes in a single-byte loop.
4372 update_byteLoop_crc32(crc, buf, len, table, data, true, true);
4373
4374 BLOCK_COMMENT("} kernel_crc32_1byte");
4375 }
4376
4377 /**
4378 * @param crc register containing existing CRC (32-bit)
4379 * @param buf register pointing to input byte buffer (byte*)
4380 * @param len register containing number of bytes
4381 * @param table register pointing to CRC table
4382 * @param constants register pointing to CRC table for 128-bit aligned memory
4383 * @param barretConstants register pointing to table for Barrett reduction
4384 * @param t0 volatile register
4385 * @param t1 volatile register
4386 * @param t2 volatile register
4387 * @param t3 volatile register
4388 */
4389 void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
4390 Register constants, Register barretConstants,
4391 Register t0, Register t1, Register t2, Register t3, Register t4) {
4392 assert_different_registers(crc, buf, len, table);
4393
4394 Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
4395
4396 Register prealign = t0;
4397 Register postalign = t0;
4398
4399 BLOCK_COMMENT("kernel_crc32_1word_vpmsumd {");
4400
4401 // 1. use kernel_crc32_1word for buffers shorter than 384 bits
4402 clrldi(len, len, 32);
4403 cmpdi(CCR0, len, 384);
4404 bge(CCR0, L_start);
4405
4406 Register tc0 = t4;
4407 Register tc1 = constants;
4408 Register tc2 = barretConstants;
4409 kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
4410 b(L_end);
4411
4412 BIND(L_start);
4413
4414 // 2. ~c
4415 nand(crc, crc, crc);
4416
4417 // 3. calculate from 0 to first 128bit-aligned address
4418 clrldi_(prealign, buf, 57);
4419 beq(CCR0, L_alignedHead);
4420
4421 subfic(prealign, prealign, 128);
4422
4423 subf(len, prealign, len);
4424 update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);
4425
4426 // 4. calculate from first 128bit-aligned address to last 128bit-aligned address
4427 BIND(L_alignedHead);
4428
4429 clrldi(postalign, len, 57);
4430 subf(len, postalign, len);
4431
4432 // len must be more than 256 bits
4433 kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
4434
4435 // 5. calculate remaining
4436 cmpdi(CCR0, postalign, 0);
4437 beq(CCR0, L_tail);
4438
4439 update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);
4440
4441 BIND(L_tail);
4442
4443 // 6. ~c
4444 nand(crc, crc, crc);
4445
4446 BIND(L_end);
4447
4448 BLOCK_COMMENT("} kernel_crc32_1word_vpmsumd");
4449 }
4450
4451 /**
4452 * @param crc register containing existing CRC (32-bit)
4453 * @param buf register pointing to input byte buffer (byte*)
4454 * @param len register containing number of bytes
4455 * @param constants register pointing to CRC table for 128-bit aligned memory
4456 * @param barretConstants register pointing to table for Barrett reduction
4457 * @param t0 volatile register
4458 * @param t1 volatile register
4459 * @param t2 volatile register
4460 */
4461 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4462 Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
4463 Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
4464 Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
4916 offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
4917 offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
4918 offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
4919 offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
4920 offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
4921 offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
4922 offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
4923 offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
4924 offsetInt -= 8; ld(R22, offsetInt, R1_SP);
4925 offsetInt -= 8; ld(R23, offsetInt, R1_SP);
4926 offsetInt -= 8; ld(R24, offsetInt, R1_SP);
4927 offsetInt -= 8; ld(R25, offsetInt, R1_SP);
4928 offsetInt -= 8; ld(R26, offsetInt, R1_SP);
4929 offsetInt -= 8; ld(R27, offsetInt, R1_SP);
4930 offsetInt -= 8; ld(R28, offsetInt, R1_SP);
4931 offsetInt -= 8; ld(R29, offsetInt, R1_SP);
4932 offsetInt -= 8; ld(R30, offsetInt, R1_SP);
4933 offsetInt -= 8; ld(R31, offsetInt, R1_SP);
4934 }
4935
4936 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
4937 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
4938
4939 BLOCK_COMMENT("kernel_crc32_singleByte:");
4940 nand(crc, crc, crc); // ~c
4941
4942 lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
4943 update_byte_crc32(crc, tmp, table);
4944
4945 nand(crc, crc, crc); // ~c
4946 }
4947
4948 // dest_lo += src1 + src2
4949 // dest_hi += carry from (dest_lo + src1) + carry from (dest_lo + src2)
4950 void MacroAssembler::add2_with_carry(Register dest_hi,
4951 Register dest_lo,
4952 Register src1, Register src2) {
4953 li(R0, 0);
4954 addc(dest_lo, dest_lo, src1);
4955 adde(dest_hi, dest_hi, R0);
4956 addc(dest_lo, dest_lo, src2);
4957 adde(dest_hi, dest_hi, R0);
4958 }
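
/*
 * Equivalent C sketch using GCC's unsigned __int128 (an illustration only;
 * not how HotSpot expresses it):
 *
 *   unsigned __int128 d = ((unsigned __int128)dest_hi << 64) | dest_lo;
 *   d += src1;
 *   d += src2;
 *   dest_lo = (uint64_t)d;
 *   dest_hi = (uint64_t)(d >> 64);
 */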
4959
4960 // Multiply 64 bit by 64 bit first loop.
4961 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4962 Register x_xstart,
4963 Register y, Register y_idx,
4964 Register z,
4965 Register carry,
1 /*
2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
4075 * @param [in] val   Register containing the byte to fold into the CRC.
4076 * @param [in] table Register containing the table of crc constants.
4077 *
4078 * uint32_t crc;
4079 * val = crc_table[(val ^ crc) & 0xFF];
4080 * crc = val ^ (crc >> 8);
4081 */
4082 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4083 BLOCK_COMMENT("update_byte_crc32:");
4084 xorr(val, val, crc);
4085 fold_byte_crc32(crc, val, table, val);
4086 }
4087
4088 /**
4089 * @param crc register containing existing CRC (32-bit)
4090 * @param buf register pointing to input byte buffer (byte*)
4091 * @param len register containing number of bytes
4092 * @param table register pointing to CRC table
4093 */
4094 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4095 Register data, bool loopAlignment) {
4096 assert_different_registers(crc, buf, len, table, data);
4097
4098 Label L_mainLoop, L_done;
4099 const int mainLoop_stepping = 1;
4100 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4101
4102 // Process all bytes in a single-byte loop.
4103 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
4104 beq(CCR0, L_done);
4105
4106 mtctr(len);
4107 align(mainLoop_alignment);
4108 BIND(L_mainLoop);
4109 lbz(data, 0, buf); // Byte from buffer, zero-extended.
4110 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
4111 update_byte_crc32(crc, data, table);
4112 bdnz(L_mainLoop); // Iterate.
4113
4114 bind(L_done);
4115 }
4116
4117 /**
4118 * Emits code to update CRC-32 with a 4-byte value according to constants in table
4119 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4120 */
4121 // A note on the lookup table address(es):
4122 // The lookup table consists of two sets of four columns each.
4123 // The columns {0..3} are used for little-endian machines.
4124 // The columns {4..7} are used for big-endian machines.
4125 // To save the effort of adding the column offset to the table address each time
4126 // a table element is looked up, it is possible to pass the pre-calculated
4127 // column addresses.
4128 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4129 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4130 Register t0, Register t1, Register t2, Register t3,
4131 Register tc0, Register tc1, Register tc2, Register tc3) {
4132 assert_different_registers(crc, t3);
4133
4150 lwzx(t1, tc1, t1);
4151 lwzx(t2, tc2, t2);
4152 lwzx(t3, tc3, t3);
4153
4154 // Calculate new crc from table values.
4155 xorr(t0, t0, t1);
4156 xorr(t2, t2, t3);
4157 xorr(crc, t0, t2); // Now crc contains the final checksum value.
4158 }
4159
4160 /**
4161 * @param crc register containing existing CRC (32-bit)
4162 * @param buf register pointing to input byte buffer (byte*)
4163 * @param len register containing number of bytes
4164 * @param table register pointing to CRC table
4165 *
4166 * Uses R9..R12 as work registers. Must be saved/restored by caller!
4167 */
4168 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4169 Register t0, Register t1, Register t2, Register t3,
4170 Register tc0, Register tc1, Register tc2, Register tc3,
4171 bool invertCRC) {
4172 assert_different_registers(crc, buf, len, table);
4173
4174 Label L_mainLoop, L_tail;
4175 Register tmp = t0;
4176 Register data = t0;
4177 Register tmp2 = t1;
4178 const int mainLoop_stepping = 8;
4179 const int tailLoop_stepping = 1;
4180 const int log_stepping = exact_log2(mainLoop_stepping);
4181 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4182 const int complexThreshold = 2*mainLoop_stepping;
4183
4184 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4185 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4186 // for all well-behaved cases. The situation itself is detected and handled correctly
4187 // within update_byteLoop_crc32.
4188 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4189
4190 BLOCK_COMMENT("kernel_crc32_2word {");
4191
4192 if (invertCRC) {
4193 nand(crc, crc, crc); // 1s complement of crc
4194 }
4195
4196 // Check for short (<mainLoop_stepping) buffer.
4197 cmpdi(CCR0, len, complexThreshold);
4198 blt(CCR0, L_tail);
4199
4200 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4201 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4202 {
4203 // Align buf addr to mainLoop_stepping boundary.
4204 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
4205 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate by 0 bits, AND with a mask keeping only bits 61..63 (tmp2 % mainLoop_stepping).
4206
4207 if (complexThreshold > mainLoop_stepping) {
4208 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4209 } else {
4210 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4211 cmpdi(CCR0, tmp, mainLoop_stepping);
4212 blt(CCR0, L_tail); // If less than one mainLoop_stepping remains, do only tail processing.
4213 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4214 }
4215 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4216 }
4217
4218 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4219 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4220 mtctr(tmp2);
4221
4222 #ifdef VM_LITTLE_ENDIAN
4223 Register crc_rv = crc;
4224 #else
4225 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4226 // Occupies tmp, but frees up crc.
4227 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
4228 tmp = crc;
4229 #endif
4230
4231 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4232
4233 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4234 BIND(L_mainLoop);
4235 update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4236 update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4237 bdnz(L_mainLoop);
4238
4239 #ifndef VM_LITTLE_ENDIAN
4240 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
4241 tmp = crc_rv; // tmp uses its original register again.
4242 #endif
4243
4244 // Restore original table address for tailLoop.
4245 if (reconstructTableOffset != 0) {
4246 addi(table, table, -reconstructTableOffset);
4247 }
4248
4249 // Process last few (<complexThreshold) bytes of buffer.
4250 BIND(L_tail);
4251 update_byteLoop_crc32(crc, buf, len, table, data, false);
4252
4253 if (invertCRC) {
4254 nand(crc, crc, crc); // 1s complement of crc
4255 }
4256 BLOCK_COMMENT("} kernel_crc32_2word");
4257 }
4258
4259 /**
4260 * @param crc register containing existing CRC (32-bit)
4261 * @param buf register pointing to input byte buffer (byte*)
4262 * @param len register containing number of bytes
4263 * @param table register pointing to CRC table
4264 *
4265 * Uses R9..R12 as work registers. Must be saved/restored by caller!
4266 */
4267 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4268 Register t0, Register t1, Register t2, Register t3,
4269 Register tc0, Register tc1, Register tc2, Register tc3,
4270 bool invertCRC) {
4271 assert_different_registers(crc, buf, len, table);
4272
4273 Label L_mainLoop, L_tail;
4274 Register tmp = t0;
4275 Register data = t0;
4276 Register tmp2 = t1;
4277 const int mainLoop_stepping = 4;
4278 const int tailLoop_stepping = 1;
4279 const int log_stepping = exact_log2(mainLoop_stepping);
4280 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4281 const int complexThreshold = 2*mainLoop_stepping;
4282
4283 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4284 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4285 // for all well-behaved cases. The situation itself is detected and handled correctly
4286 // within update_byteLoop_crc32.
4287 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4288
4289 BLOCK_COMMENT("kernel_crc32_1word {");
4290
4291 if (invertCRC) {
4292 nand(crc, crc, crc); // 1s complement of crc
4293 }
4294
4295 // Check for short (<mainLoop_stepping) buffer.
4296 cmpdi(CCR0, len, complexThreshold);
4297 blt(CCR0, L_tail);
4298
4299 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4300 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4301 {
4302 // Align buf addr to mainLoop_stepping boundary.
4303 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
4304 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate by 0 bits, AND with a mask keeping only bits 62..63 (tmp2 % mainLoop_stepping).
4305
4306 if (complexThreshold > mainLoop_stepping) {
4307 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4308 } else {
4309 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4310 cmpdi(CCR0, tmp, mainLoop_stepping);
4311 blt(CCR0, L_tail); // If less than one mainLoop_stepping remains, do only tail processing.
4312 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4313 }
4314 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4315 }
4316
4317 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4318 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4319 mtctr(tmp2);
4320
4321 #ifdef VM_LITTLE_ENDIAN
4322 Register crc_rv = crc;
4323 #else
4324 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4325 // Occupies tmp, but frees up crc.
4326 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
4327 tmp = crc;
4328 #endif
4329
4330 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4331
4332 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4333 BIND(L_mainLoop);
4334 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4335 bdnz(L_mainLoop);
4336
4337 #ifndef VM_LITTLE_ENDIAN
4338 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
4339 tmp = crc_rv; // tmp uses its original register again.
4340 #endif
4341
4342 // Restore original table address for tailLoop.
4343 if (reconstructTableOffset != 0) {
4344 addi(table, table, -reconstructTableOffset);
4345 }
4346
4347 // Process last few (<complexThreshold) bytes of buffer.
4348 BIND(L_tail);
4349 update_byteLoop_crc32(crc, buf, len, table, data, false);
4350
4351 if (invertCRC) {
4352 nand(crc, crc, crc); // 1s complement of crc
4353 }
4354 BLOCK_COMMENT("} kernel_crc32_1word");
4355 }
4356
4357 /**
4358 * @param crc register containing existing CRC (32-bit)
4359 * @param buf register pointing to input byte buffer (byte*)
4360 * @param len register containing number of bytes
4361 * @param table register pointing to CRC table
4362 *
4363 * Uses R7_ARG5, R8_ARG6 as work registers.
4364 */
4365 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4366 Register t0, Register t1, Register t2, Register t3,
4367 bool invertCRC) {
4368 assert_different_registers(crc, buf, len, table);
4369
4370 Register data = t0; // Holds the current byte to be folded into crc.
4371
4372 BLOCK_COMMENT("kernel_crc32_1byte {");
4373
4374 if (invertCRC) {
4375 nand(crc, crc, crc); // 1s complement of crc
4376 }
4377
4378 // Process all bytes in a single-byte loop.
4379 update_byteLoop_crc32(crc, buf, len, table, data, true);
4380
4381 if (invertCRC) {
4382 nand(crc, crc, crc); // 1s complement of crc
4383 }
4384 BLOCK_COMMENT("} kernel_crc32_1byte");
4385 }
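
/*
 * Usage sketch (hypothetical stub code): a caller that has already emitted
 * the complement can compose this kernel without redundant inversions:
 *
 *   nand(crc, crc, crc);                 // ~c, emitted once by the caller
 *   kernel_crc32_1byte(crc, buf, len, table, t0, t1, t2, t3, false);
 *   nand(crc, crc, crc);                 // ~c, emitted once by the caller
 */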
4386
4387 /**
4388 * @param crc register containing existing CRC (32-bit)
4389 * @param buf register pointing to input byte buffer (byte*)
4390 * @param len register containing number of bytes
4391 * @param table register pointing to CRC table
4392 * @param constants register pointing to CRC table for 128-bit aligned memory
4393 * @param barretConstants register pointing to table for Barrett reduction
4394 * @param t0 volatile register
4395 * @param t1 volatile register
4396 * @param t2 volatile register
4397 * @param t3 volatile register
4398 */
4399 void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
4400 Register constants, Register barretConstants,
4401 Register t0, Register t1, Register t2, Register t3, Register t4,
4402 bool invertCRC) {
4403 assert_different_registers(crc, buf, len, table);
4404
4405 Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
4406
4407 Register prealign = t0;
4408 Register postalign = t0;
4409
4410 BLOCK_COMMENT("kernel_crc32_1word_vpmsumd {");
4411
4412 // 1. use kernel_crc32_1word for buffers shorter than 384 bits
4413 clrldi(len, len, 32);
4414 cmpdi(CCR0, len, 384);
4415 bge(CCR0, L_start);
4416
4417 Register tc0 = t4;
4418 Register tc1 = constants;
4419 Register tc2 = barretConstants;
4420 kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
4421 b(L_end);
4422
4423 BIND(L_start);
4424
4425 // 2. ~c
4426 if (invertCRC) {
4427 nand(crc, crc, crc); // 1s complement of crc
4428 }
4429
4430 // 3. calculate from 0 to first 128bit-aligned address
4431 clrldi_(prealign, buf, 57);
4432 beq(CCR0, L_alignedHead);
4433
4434 subfic(prealign, prealign, 128);
4435
4436 subf(len, prealign, len);
4437 update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4438
4439 // 4. calculate from first 128bit-aligned address to last 128bit-aligned address
4440 BIND(L_alignedHead);
4441
4442 clrldi(postalign, len, 57);
4443 subf(len, postalign, len);
4444
4445 // len must be more than 256 bits
4446 kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
4447
4448 // 5. calculate remaining
4449 cmpdi(CCR0, postalign, 0);
4450 beq(CCR0, L_tail);
4451
4452 update_byteLoop_crc32(crc, buf, postalign, table, t2, false);
4453
4454 BIND(L_tail);
4455
4456 // 6. ~c
4457 if (invertCRC) {
4458 nand(crc, crc, crc); // 1s complement of crc
4459 }
4460
4461 BIND(L_end);
4462
4463 BLOCK_COMMENT("} kernel_crc32_1word_vpmsumd");
4464 }
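
/*
 * Byte-level view of the split performed above (a sketch; prealign and
 * postalign are byte counts taken from the low 7 bits of address/length):
 *
 *   if (buf % 128 != 0) {
 *     prealign = 128 - (buf % 128);   // bytes up to the next 128-byte boundary
 *     len -= prealign;                // processed by the byte loop
 *   }
 *   postalign = len % 128;            // trailing bytes for the byte loop
 *   len -= postalign;                 // middle chunk for the aligned kernel
 */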
4465
4466 /**
4467 * @param crc register containing existing CRC (32-bit)
4468 * @param buf register pointing to input byte buffer (byte*)
4469 * @param len register containing number of bytes
4470 * @param constants register pointing to CRC table for 128-bit aligned memory
4471 * @param barretConstants register pointing to table for Barrett reduction
4472 * @param t0 volatile register
4473 * @param t1 volatile register
4474 * @param t2 volatile register
4475 */
4476 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4477 Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
4478 Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
4479 Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
4931 offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
4932 offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
4933 offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
4934 offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
4935 offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
4936 offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
4937 offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
4938 offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
4939 offsetInt -= 8; ld(R22, offsetInt, R1_SP);
4940 offsetInt -= 8; ld(R23, offsetInt, R1_SP);
4941 offsetInt -= 8; ld(R24, offsetInt, R1_SP);
4942 offsetInt -= 8; ld(R25, offsetInt, R1_SP);
4943 offsetInt -= 8; ld(R26, offsetInt, R1_SP);
4944 offsetInt -= 8; ld(R27, offsetInt, R1_SP);
4945 offsetInt -= 8; ld(R28, offsetInt, R1_SP);
4946 offsetInt -= 8; ld(R29, offsetInt, R1_SP);
4947 offsetInt -= 8; ld(R30, offsetInt, R1_SP);
4948 offsetInt -= 8; ld(R31, offsetInt, R1_SP);
4949 }
4950
4951 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4952 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
4953
4954 BLOCK_COMMENT("kernel_crc32_singleByte:");
4955 if (invertCRC) {
4956 nand(crc, crc, crc); // 1s complement of crc
4957 }
4958
4959 lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
4960 update_byte_crc32(crc, tmp, table);
4961
4962 if (invertCRC) {
4963 nand(crc, crc, crc); // 1s complement of crc
4964 }
4965 }
4966
4967 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
4968 assert_different_registers(crc, val, table);
4969
4970 BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4971 if (invertCRC) {
4972 nand(crc, crc, crc); // 1s complement of crc
4973 }
4974
4975 update_byte_crc32(crc, val, table);
4976
4977 if (invertCRC) {
4978 nand(crc, crc, crc); // 1s complement of crc
4979 }
4980 }
4981
4982 // dest_lo += src1 + src2
4983 // dest_hi += carry from (dest_lo + src1) + carry from (dest_lo + src2)
4984 void MacroAssembler::add2_with_carry(Register dest_hi,
4985 Register dest_lo,
4986 Register src1, Register src2) {
4987 li(R0, 0);
4988 addc(dest_lo, dest_lo, src1);
4989 adde(dest_hi, dest_hi, R0);
4990 addc(dest_lo, dest_lo, src2);
4991 adde(dest_hi, dest_hi, R0);
4992 }
4993
4994 // Multiply 64 bit by 64 bit first loop.
4995 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4996 Register x_xstart,
4997 Register y, Register y_idx,
4998 Register z,
4999 Register carry,