1 /* 2 * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have 23 * questions. 24 */ 25 package benchmark.jdk.incubator.vector; 26 27 import jdk.incubator.vector.ByteVector; 28 import jdk.incubator.vector.ShortVector; 29 import jdk.incubator.vector.IntVector; 30 import jdk.incubator.vector.LongVector; 31 import jdk.incubator.vector.VectorSpecies; 32 import org.openjdk.jmh.annotations.*; 33 34 import java.util.concurrent.TimeUnit; 35 36 import static org.junit.jupiter.api.Assertions.assertEquals; 37 38 /** 39 * Population count algorithms from "Faster Population Counts Using AVX2 Instructions", 2018 by Mula, Kurz, Lemire 40 */ 41 @BenchmarkMode(Mode.Throughput) 42 @Warmup(iterations = 3, time = 1) 43 @Measurement(iterations = 5, time = 1) 44 @OutputTimeUnit(TimeUnit.MILLISECONDS) 45 @State(Scope.Benchmark) 46 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) 47 public class PopulationCount extends AbstractVectorBenchmark { 48 @Param({"64", "1024", "65536"}) 49 int size; 50 51 private long[] data; 52 53 @Setup 54 public void init() { 55 data = fillLong(size, i -> RANDOM.nextLong()); 56 // data = fillLong(size, i -> 0L); 57 // data = fillLong(size, i -> -1L); 58 59 checkConsistency(); 60 } 61 62 @TearDown 63 public void tearDown() { 64 checkConsistency(); 65 } 66 67 void checkConsistency() { 68 long popCount = longBitCount(); 69 assertEquals(popCount, treeOfAdders()); 70 assertEquals(popCount, WilkesWheelerGill()); 71 assertEquals(popCount, Wegner()); 72 assertEquals(popCount, Lauradoux()); 73 assertEquals(popCount, HarleySeal()); 74 assertEquals(popCount, Mula128()); 75 assertEquals(popCount, Mula256()); 76 assertEquals(popCount, HarleySeal256()); 77 } 78 79 long tail(int upper) { 80 long acc = 0; 81 for (int i = upper; i < data.length; i++) { 82 acc += Long.bitCount(data[i]); 83 } 84 return acc; 85 } 86 87 @Benchmark 88 public long longBitCount() { 89 long acc = 0; 90 for (int i = 0; i < data.length; i++) { 91 acc += Long.bitCount(data[i]); 92 } 93 return acc; 94 } 95 96 /* ============================================================================================================== */ 97 98 // FIGURE 4. The Wegner function in C 99 100 long popcntWegner(long x) { 101 int v = 0; 102 while (x != 0) { 103 x &= x - 1; 104 v++; 105 } 106 return v; 107 } 108 109 @Benchmark 110 public long Wegner() { 111 long acc = 0; 112 for (int i = 0; i < data.length; i++) { 113 acc += popcntWegner(data[i]); 114 } 115 return acc; 116 } 117 118 /* ============================================================================================================== */ 119 120 // FIGURE 2. A naive tree-of-adders function in C 121 122 static long popcntTree(long x) { 123 long c1 = 0x5555555555555555L; 124 long c2 = 0x3333333333333333L; 125 long c4 = 0x0F0F0F0F0F0F0F0FL; 126 long c8 = 0x00FF00FF00FF00FFL; 127 long c16 = 0x0000FFFF0000FFFFL; 128 long c32 = 0x00000000FFFFFFFFL; 129 130 x = (x & c1) + ((x >>> 1) & c1); 131 x = (x & c2) + ((x >>> 2) & c2); 132 x = (x & c4) + ((x >>> 4) & c4); 133 x = (x & c8) + ((x >>> 8) & c8); 134 x = (x & c16) + ((x >>> 16) & c16); 135 x = (x & c32) + ((x >>> 32) & c32); 136 return x; 137 } 138 139 @Benchmark 140 public long treeOfAdders() { 141 long acc = 0; 142 for (int i = 0; i < data.length; i++) { 143 acc += popcntTree(data[i]); 144 } 145 return acc; 146 } 147 148 /* ============================================================================================================== */ 149 150 // FIGURE 3. The Wilkes-Wheeler-Gill function in C 151 152 static long popcntWWG(long x) { 153 long c1 = 0x5555555555555555L; 154 long c2 = 0x3333333333333333L; 155 long c4 = 0x0F0F0F0F0F0F0F0FL; 156 157 x -= (x >>> 1) & c1; 158 x = (( x >>> 2) & c2) + (x & c2) ; 159 x = ( x + (x >>> 4) ) & c4; 160 x *= 0x0101010101010101L; 161 x = x >>> 56; 162 return x; 163 } 164 165 @Benchmark 166 public long WilkesWheelerGill() { 167 long acc = 0; 168 for (int i = 0; i < data.length; i++) { 169 acc += popcntWWG(data[i]); 170 } 171 return acc; 172 } 173 174 /* ============================================================================================================== */ 175 176 // FIGURE 5. The Lauradoux population count in C for sets of 12 words. 177 178 static long parallelPopcnt(long count1, long count2, long count3) { 179 long m1 = 0x5555555555555555L; 180 long m2 = 0x3333333333333333L; 181 long m4 = 0x0F0F0F0F0F0F0F0FL; 182 183 long half1 = (count3 ) & m1; 184 long half2 = (count3 >>> 1) & m1; 185 186 count1 -= (count1 >>> 1) & m1; 187 count2 -= (count2 >>> 1) & m1; 188 count1 += half1; 189 count2 += half2; 190 count1 = (count1 & m2) + (( count1 >>> 2) & m2); 191 count1 += (count2 & m2) + (( count2 >>> 2) & m2); 192 return (count1 & m4) + (( count1 >>> 4) & m4); 193 } 194 195 static long reduce(long acc) { 196 long m8 = 0x00FF00FF00FF00FFL; 197 long m16 = 0x0000FFFF0000FFFFL; 198 long m32 = 0x00000000FFFFFFFFL; 199 200 acc = (acc & m8) + (( acc >>> 8) & m8); 201 acc = (acc + (acc >>> 16) ) & m16; 202 acc = (acc & m32) + (acc >>> 32); 203 return acc; 204 } 205 206 static long popcntLauradoux(long[] xs, int off) { 207 long acc = 0; 208 for (int j = off; j < off+12; j += 3) { 209 acc += parallelPopcnt(xs[j+0], xs[j+1], xs[j+2]); 210 } 211 return reduce(acc); 212 } 213 214 @Benchmark 215 public long Lauradoux() { 216 long acc = 0; 217 int upper = data.length - (data.length % 12); 218 for (int i = 0; i < upper; i += 12) { 219 acc += popcntLauradoux(data, i); 220 } 221 return acc + tail(upper); 222 } 223 224 /* ============================================================================================================== */ 225 226 // FIGURE 6. A C function implementing a bitwise parallel carry-save adder (CSA). Given three input words a, b, c, it 227 // generates two new words h, l in which each bit represents the high and low bits in the bitwise sum of the bits from a, 228 // b, and c. 229 230 static long csaLow(long a, long b, long c) { 231 long u = a ^ b; 232 long lo = u ^ c; 233 return lo; 234 } 235 236 static long csaHigh(long a, long b, long c) { 237 long u = a ^ b; 238 long hi = (a & b) | (u & c) ; 239 return hi; 240 } 241 242 // FIGURE 8. A C function implementing the Harley-Seal 243 // population count over an array of 64-bit words. The count 244 // function could be the Wilkes-Wheeler-Gill function. 245 @Benchmark 246 public long HarleySeal() { 247 long total = 0, ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0; 248 long twosA = 0, twosB = 0; 249 long foursA = 0, foursB = 0; 250 long eightsA = 0, eightsB = 0; 251 252 int step = 16; 253 int upper = data.length - (data.length % step); 254 for (int i = 0; i < upper; i += step) { 255 // CSA(&twosA, &ones, ones, d[i+0], d[i +1]); 256 twosA = csaHigh(ones, data[i+0], data[i+1]); 257 ones = csaLow(ones, data[i+0], data[i+1]); 258 259 // CSA(&twosB, &ones, ones, d[i+2], d[i+3]); 260 twosB = csaHigh(ones, data[i+2], data[i+3]); 261 ones = csaLow(ones, data[i+2], data[i+3]); 262 263 // CSA(&foursA, &twos, twos, twosA, twosB); 264 foursA = csaHigh(twos, twosA, twosB); 265 twos = csaLow(twos, twosA, twosB); 266 267 // ==================================== 268 269 // CSA(&twosA, &ones, ones, d[i+4], d[i+5]); 270 twosA = csaHigh(ones, data[i+4], data[i+5]); 271 ones = csaLow(ones, data[i+4], data[i+5]); 272 273 // CSA(&twosB, &ones, ones, d[i+6], d[i+7]); 274 twosB = csaHigh(ones, data[i+6], data[i+7]); 275 ones = csaLow(ones, data[i+6], data[i+7]); 276 277 // CSA(&foursB, &twos, twos, twosA, twosB); 278 foursB = csaHigh(twos, twosA, twosB); 279 twos = csaLow(twos, twosA, twosB); 280 281 // ==================================== 282 283 // CSA(&eightsA, &fours, fours, foursA, foursB); 284 eightsA = csaHigh(fours, foursA, foursB); 285 fours = csaLow(fours, foursA, foursB); 286 287 // ==================================== 288 289 // CSA(&twosA, &ones, ones, d[i+8], d[i+9]); 290 twosA = csaHigh(ones, data[i+8], data[i+9]); 291 ones = csaLow(ones, data[i+8], data[i+9]); 292 293 // CSA(&twosB, &ones, ones, d[i+10],d[i+11]); 294 twosB = csaHigh(ones, data[i+10], data[i+11]); 295 ones = csaLow(ones, data[i+10], data[i+11]); 296 297 // CSA(&foursA, &twos, twos, twosA, twosB); 298 foursA = csaHigh(twos, twosA, twosB); 299 twos = csaLow(twos, twosA, twosB); 300 301 // ==================================== 302 303 // CSA(&twosA, &ones, ones, d[i+12], d[i +13]); 304 twosA = csaHigh(ones, data[i+12], data[i+13]); 305 ones = csaLow(ones, data[i+12], data[i+13]); 306 307 // CSA(&twosB, &ones, ones, d[i+14], d[i +15]); 308 twosB = csaHigh(ones, data[i+14], data[i+15]); 309 ones = csaLow(ones, data[i+14], data[i+15]); 310 311 // ==================================== 312 313 // CSA(&foursB, &twos, twos, twosA, twosB); 314 foursB = csaHigh(twos, twosA, twosB); 315 twos = csaLow(twos, twosA, twosB); 316 317 // CSA(&eightsB, &fours, fours, foursA, foursB); 318 eightsB = csaHigh(fours, foursA, foursB); 319 fours = csaLow(fours, foursA, foursB); 320 321 // ==================================== 322 323 // CSA(&sixteens, &eights, eights, eightsA, eightsB); 324 sixteens = csaHigh(eights, eightsA, eightsB); 325 eights = csaLow(eights, eightsA, eightsB); 326 327 total += Long.bitCount(sixteens); 328 } 329 total = 16 * total 330 + 8 * Long.bitCount(eights) 331 + 4 * Long.bitCount(fours) 332 + 2 * Long.bitCount(twos) 333 + 1 * Long.bitCount(ones); 334 335 return total + tail(upper); 336 } 337 338 /* ============================================================================================================== */ 339 340 // FIGURE 9. A C function using SSE intrinsics implementing Mula’s algorithm to compute sixteen population counts, 341 // corresponding to sixteen input bytes. 342 343 static final ByteVector MULA128_LOOKUP = (ByteVector)(IntVector.scalars(I128, 0x02_01_01_00, // 0, 1, 1, 2, 344 0x03_02_02_01, // 1, 2, 2, 3, 345 0x03_02_02_01, // 1, 2, 2, 3, 346 0x04_03_03_02 // 2, 3, 3, 4 347 ).reinterpret(B128)); 348 349 ByteVector popcntB128(ByteVector v) { 350 var low_mask = ByteVector.broadcast(B128, (byte)0x0f); 351 352 var lo = v .and(low_mask); 353 var hi = v.shiftRight(4).and(low_mask); 354 355 var cnt1 = MULA128_LOOKUP.rearrange(lo.toShuffle()); 356 var cnt2 = MULA128_LOOKUP.rearrange(hi.toShuffle()); 357 358 return cnt1.add(cnt2); 359 } 360 361 @Benchmark 362 public long Mula128() { 363 var acc = LongVector.zero(L128); // IntVector 364 int step = 32; // % B128.length() == 0! 365 int upper = data.length - (data.length % step); 366 for (int i = 0; i < upper; i += step) { 367 var bacc = ByteVector.zero(B128); 368 for (int j = 0; j < step; j += L128.length()) { 369 var v1 = LongVector.fromArray(L128, data, i + j); 370 var v2 = (ByteVector)v1.reinterpret(B128); 371 var v3 = popcntB128(v2); 372 bacc = bacc.add(v3); 373 } 374 acc = acc.add(sumUnsignedBytes(bacc)); 375 } 376 var r = acc.addLanes() + tail(upper); 377 return r; 378 } 379 380 /* ============================================================================================================== */ 381 382 // FIGURE 10. A C function using AVX2 intrinsics implementing Mula’s algorithm to compute the four population counts 383 // of the four 64-bit words in a 256-bit vector. The 32 B output vector should be interpreted as four separate 384 // 64-bit counts that need to be summed to obtain the final population count. 385 386 static final ByteVector MULA256_LOOKUP = 387 (ByteVector)(join(I128, I256, (IntVector)(MULA128_LOOKUP.reinterpret(I128)), (IntVector)(MULA128_LOOKUP.reinterpret(I128))).reinterpret(B256)); 388 389 ByteVector popcntB256(ByteVector v) { 390 var low_mask = ByteVector.broadcast(B256, (byte)0x0F); 391 392 var lo = v .and(low_mask); 393 var hi = v.shiftRight(4).and(low_mask); 394 395 var cnt1 = MULA256_LOOKUP.rearrange(lo.toShuffle()); 396 var cnt2 = MULA256_LOOKUP.rearrange(hi.toShuffle()); 397 var cnt = cnt1.add(cnt2); 398 399 return cnt; 400 } 401 402 // Horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, 403 // and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst: 404 // _mm256_sad_epu8(total, _mm256_setzero_si256()) 405 LongVector sumUnsignedBytes(ByteVector vb) { 406 return sumUnsignedBytesShapes(vb); 407 // return sumUnsignedBytesShifts(vb); 408 } 409 410 LongVector sumUnsignedBytesShapes(ByteVector vb) { 411 VectorSpecies<Short> shortSpecies = VectorSpecies.of(short.class, vb.shape()); 412 VectorSpecies<Integer> intSpecies = VectorSpecies.of(int.class, vb.shape()); 413 VectorSpecies<Long> longSpecies = VectorSpecies.of(long.class, vb.shape()); 414 415 var low_short_mask = ShortVector.broadcast(shortSpecies, (short) 0xFF); 416 var low_int_mask = IntVector.broadcast(intSpecies, 0xFFFF); 417 var low_long_mask = LongVector.broadcast(longSpecies, 0xFFFFFFFFL); 418 419 var vs = (ShortVector)vb.reinterpret(shortSpecies); // 16-bit 420 var vs0 = vs.and(low_short_mask); 421 var vs1 = vs.shiftRight(8).and(low_short_mask); 422 var vs01 = vs0.add(vs1); 423 424 var vi = (IntVector)vs01.reinterpret(intSpecies); // 32-bit 425 var vi0 = vi.and(low_int_mask); 426 var vi1 = vi.shiftRight(16).and(low_int_mask); 427 var vi01 = vi0.add(vi1); 428 429 var vl = (LongVector)vi01.reinterpret(longSpecies); // 64-bit 430 var vl0 = vl.and(low_long_mask); 431 var vl1 = vl.shiftRight(32).and(low_long_mask); 432 var vl01 = vl0.add(vl1); 433 434 return vl01; 435 } 436 437 LongVector sumUnsignedBytesShifts(ByteVector vb) { 438 VectorSpecies<Long> to = VectorSpecies.of(long.class, vb.shape()); 439 440 var low_mask = LongVector.broadcast(to, 0xFF); 441 442 var vl = (LongVector)vb.reinterpret(to); 443 444 var v0 = vl .and(low_mask); // 8-bit 445 var v1 = vl.shiftRight( 8).and(low_mask); // 8-bit 446 var v2 = vl.shiftRight(16).and(low_mask); // 8-bit 447 var v3 = vl.shiftRight(24).and(low_mask); // 8-bit 448 var v4 = vl.shiftRight(32).and(low_mask); // 8-bit 449 var v5 = vl.shiftRight(40).and(low_mask); // 8-bit 450 var v6 = vl.shiftRight(48).and(low_mask); // 8-bit 451 var v7 = vl.shiftRight(56).and(low_mask); // 8-bit 452 453 var v01 = v0.add(v1); 454 var v23 = v2.add(v3); 455 var v45 = v4.add(v5); 456 var v67 = v6.add(v7); 457 458 var v03 = v01.add(v23); 459 var v47 = v45.add(v67); 460 461 var sum = v03.add(v47); // 64-bit 462 return sum; 463 } 464 465 @Benchmark 466 public long Mula256() { 467 var acc = LongVector.zero(L256); 468 int step = 32; // % B256.length() == 0! 469 int upper = data.length - (data.length % step); 470 for (int i = 0; i < upper; i += step) { 471 var bacc = ByteVector.zero(B256); 472 for (int j = 0; j < step; j += L256.length()) { 473 var v1 = LongVector.fromArray(L256, data, i + j); 474 var v2 = popcntB256((ByteVector)(v1.reinterpret(B256))); 475 bacc = bacc.add(v2); 476 } 477 acc = acc.add(sumUnsignedBytes(bacc)); 478 } 479 return acc.addLanes() + tail(upper); 480 } 481 482 483 /* ============================================================================================================== */ 484 485 // FIGURE 11. A C function using AVX2 intrinsics implementing a bitwise parallel carry-save adder (CSA). 486 487 LongVector csaLow(LongVector a, LongVector b, LongVector c) { 488 var u = a.xor(b); 489 var r = u.xor(c); 490 return r; 491 } 492 493 LongVector csaHigh(LongVector a, LongVector b, LongVector c) { 494 var u = a.xor(b); 495 var ab = a.and(b); 496 var uc = u.and(c); 497 var r = ab.or(uc); // (a & b) | ((a ^ b) & c) 498 return r; 499 } 500 501 LongVector popcntL256(LongVector v) { 502 var vb1 = (ByteVector)v.reinterpret(B256); 503 var vb2 = popcntB256(vb1); 504 return sumUnsignedBytes(vb2); 505 } 506 507 // FIGURE 12. A C function using AVX2 intrinsics implementing Harley-Seal’s algorithm. It assumes, for 508 // simplicity, that the input size in 256-bit vectors is divisible by 16. See Fig. 10 for the count function. 509 510 @Benchmark 511 public long HarleySeal256() { 512 LongVector ones, twos, fours, eights, sixteens, vtotal, twosA, twosB, foursA, foursB, eightsA, eightsB; 513 ones = twos = fours = eights = sixteens = twosA = twosB = foursA = foursB = eightsA = eights = vtotal = LongVector.broadcast(L256, 0); 514 515 var vlen = L256.length(); 516 int step = 16 * vlen; 517 int upper = data.length - (data.length % step); 518 for (int i = 0; i < upper; i += step) { 519 // CSA(&twosA, &ones, ones, d[i+0], d[i +1]); 520 var d0 = LongVector.fromArray(L256, data, i + 0 * vlen); 521 var d1 = LongVector.fromArray(L256, data, i + 1 * vlen); 522 523 twosA = csaHigh(ones, d0, d1); 524 ones = csaLow(ones, d0, d1); 525 526 // CSA(&twosB, &ones, ones, d[i+2], d[i+3]); 527 var d2 = LongVector.fromArray(L256, data, i + 2 * vlen); 528 var d3 = LongVector.fromArray(L256, data, i + 3 * vlen); 529 twosB = csaHigh(ones, d2, d3); 530 ones = csaLow(ones, d2, d3); 531 532 // CSA(&foursA, &twos, twos, twosA, twosB); 533 foursA = csaHigh(twos, twosA, twosB); 534 twos = csaLow(twos, twosA, twosB); 535 536 // ==================================== 537 538 // CSA(&twosA, &ones, ones, d[i+4], d[i+5]); 539 var d4 = LongVector.fromArray(L256, data, i + 4 * vlen); 540 var d5 = LongVector.fromArray(L256, data, i + 5 * vlen); 541 twosA = csaHigh(ones, d4, d5); 542 ones = csaLow(ones, d4, d5); 543 544 // CSA(&twosB, &ones, ones, d[i+6], d[i+7]); 545 var d6 = LongVector.fromArray(L256, data, i + 6 * vlen); 546 var d7 = LongVector.fromArray(L256, data, i + 7 * vlen); 547 twosB = csaHigh(ones, d6, d7); 548 ones = csaLow(ones, d6, d7); 549 550 // CSA(&foursB, &twos, twos, twosA, twosB); 551 foursB = csaHigh(twos, twosA, twosB); 552 twos = csaLow(twos, twosA, twosB); 553 554 // ==================================== 555 556 // CSA(&eightsA, &fours, fours, foursA, foursB); 557 eightsA = csaHigh(fours, foursA, foursB); 558 fours = csaLow(fours, foursA, foursB); 559 560 // ==================================== 561 562 // CSA(&twosA, &ones, ones, d[i+8], d[i+9]); 563 var d8 = LongVector.fromArray(L256, data, i + 8 * vlen); 564 var d9 = LongVector.fromArray(L256, data, i + 9 * vlen); 565 twosA = csaHigh(ones, d8, d9); 566 ones = csaLow(ones, d8, d9); 567 568 // CSA(&twosB, &ones, ones, d[i+10],d[i+11]); 569 var d10 = LongVector.fromArray(L256, data, i + 10 * vlen); 570 var d11 = LongVector.fromArray(L256, data, i + 11 * vlen); 571 twosB = csaHigh(ones, d10, d11); 572 ones = csaLow(ones, d10, d11); 573 574 // CSA(&foursA, &twos, twos, twosA, twosB); 575 foursA = csaHigh(twos, twosA, twosB); 576 twos = csaLow(twos, twosA, twosB); 577 578 // ==================================== 579 580 // CSA(&twosA, &ones, ones, d[i+12], d[i +13]); 581 var d12 = LongVector.fromArray(L256, data, i + 12 * vlen); 582 var d13 = LongVector.fromArray(L256, data, i + 13 * vlen); 583 twosA = csaHigh(ones, d12, d13); 584 ones = csaLow(ones, d12, d13); 585 586 // CSA(&twosB, &ones, ones, d[i+14], d[i +15]); 587 var d14 = LongVector.fromArray(L256, data, i + 14 * vlen); 588 var d15 = LongVector.fromArray(L256, data, i + 15 * vlen); 589 twosB = csaHigh(ones, d14, d15); 590 ones = csaLow(ones, d14, d15); 591 592 // ==================================== 593 594 // CSA(&foursB, &twos, twos, twosA, twosB); 595 foursB = csaHigh(twos, twosA, twosB); 596 twos = csaLow(twos, twosA, twosB); 597 598 // CSA(&eightsB, &fours, fours, foursA, foursB); 599 eightsB = csaHigh(fours, foursA, foursB); 600 fours = csaLow(fours, foursA, foursB); 601 602 // ==================================== 603 604 // CSA(&sixteens, &eights, eights, eightsA, eightsB); 605 sixteens = csaHigh(eights, eightsA, eightsB); 606 eights = csaLow(eights, eightsA, eightsB); 607 608 vtotal = vtotal.add(popcntL256(sixteens)); 609 } 610 611 vtotal = vtotal.mul(16); // << 4 612 vtotal = vtotal.add(popcntL256(eights).mul(8)); // << 3 613 vtotal = vtotal.add(popcntL256(fours).mul(4)); // << 2 614 vtotal = vtotal.add(popcntL256(twos).mul(2)); // << 1 615 vtotal = vtotal.add(popcntL256(ones)); // << 0 616 617 var total = vtotal.addLanes(); 618 619 return total + tail(upper); 620 } 621 622 /* ============================================================================================================== */ 623 624 // ByteVector csaLow512(ByteVector a, ByteVector b, ByteVector c) { 625 // return _mm512_ternarylogic_epi32(c, b, a, 0x96); // vpternlogd 626 // } 627 // 628 // ByteVector csaLow512(ByteVector a, ByteVector b, ByteVector c) { 629 // return _mm512_ternarylogic_epi32(c, b, a, 0xe8); // vpternlogd 630 // } 631 }