1 /*
   2  * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have
  23  * questions.
  24  */
  25 package benchmark.jdk.incubator.vector;
  26 
  27 import jdk.incubator.vector.ByteVector;
  28 import jdk.incubator.vector.ShortVector;
  29 import jdk.incubator.vector.IntVector;
  30 import jdk.incubator.vector.LongVector;
  31 import jdk.incubator.vector.VectorSpecies;
  32 import org.openjdk.jmh.annotations.*;
  33 
  34 import java.util.concurrent.TimeUnit;
  35 
  36 import static org.junit.jupiter.api.Assertions.assertEquals;
  37 
  38 /**
  39  * Population count algorithms from "Faster Population Counts Using AVX2 Instructions", 2018 by Mula, Kurz, Lemire
  40  */
  41 @BenchmarkMode(Mode.Throughput)
  42 @Warmup(iterations = 3, time = 1)
  43 @Measurement(iterations = 5, time = 1)
  44 @OutputTimeUnit(TimeUnit.MILLISECONDS)
  45 @State(Scope.Benchmark)
  46 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
  47 public class PopulationCount extends AbstractVectorBenchmark {
  48     @Param({"64", "1024", "65536"})
  49     int size;
  50 
  51     private long[] data;
  52 
  53     @Setup
  54     public void init() {
  55         data = fillLong(size, i -> RANDOM.nextLong());
  56 //        data = fillLong(size, i -> 0L);
  57 //        data = fillLong(size, i -> -1L);
  58 
  59         checkConsistency();
  60     }
  61 
  62     @TearDown
  63     public void tearDown() {
  64         checkConsistency();
  65     }
  66 
  67     void checkConsistency() {
  68         long popCount = longBitCount();
  69         assertEquals(popCount, treeOfAdders());
  70         assertEquals(popCount, WilkesWheelerGill());
  71         assertEquals(popCount, Wegner());
  72         assertEquals(popCount, Lauradoux());
  73         assertEquals(popCount, HarleySeal());
  74         assertEquals(popCount, Mula128());
  75         assertEquals(popCount, Mula256());
  76         assertEquals(popCount, HarleySeal256());
  77     }
  78 
  79     long tail(int upper) {
  80         long acc = 0;
  81         for (int i = upper; i < data.length; i++) {
  82             acc += Long.bitCount(data[i]);
  83         }
  84         return acc;
  85     }
  86 
  87     @Benchmark
  88     public long longBitCount() {
  89         long acc = 0;
  90         for (int i = 0; i < data.length; i++) {
  91             acc += Long.bitCount(data[i]);
  92         }
  93         return acc;
  94     }
  95 
  96     /* ============================================================================================================== */
  97 
  98     // FIGURE 4. The Wegner function in C
  99 
 100     long popcntWegner(long x) {
 101         int v = 0;
 102         while (x != 0) {
 103             x &= x - 1;
 104             v++;
 105         }
 106         return v;
 107     }
 108 
 109     @Benchmark
 110     public long Wegner() {
 111         long acc = 0;
 112         for (int i = 0; i < data.length; i++) {
 113             acc += popcntWegner(data[i]);
 114         }
 115         return acc;
 116     }
 117 
 118     /* ============================================================================================================== */
 119 
 120     // FIGURE 2. A naive tree-of-adders function in C
 121 
 122     static long popcntTree(long x) {
 123         long c1  = 0x5555555555555555L;
 124         long c2  = 0x3333333333333333L;
 125         long c4  = 0x0F0F0F0F0F0F0F0FL;
 126         long c8  = 0x00FF00FF00FF00FFL;
 127         long c16 = 0x0000FFFF0000FFFFL;
 128         long c32 = 0x00000000FFFFFFFFL;
 129 
 130         x = (x & c1)  + ((x >>> 1)  & c1);
 131         x = (x & c2)  + ((x >>> 2)  & c2);
 132         x = (x & c4)  + ((x >>> 4)  & c4);
 133         x = (x & c8)  + ((x >>> 8)  & c8);
 134         x = (x & c16) + ((x >>> 16) & c16);
 135         x = (x & c32) + ((x >>> 32) & c32);
 136         return x;
 137     }
 138 
 139     @Benchmark
 140     public long treeOfAdders() {
 141         long acc = 0;
 142         for (int i = 0; i < data.length; i++) {
 143             acc += popcntTree(data[i]);
 144         }
 145         return acc;
 146     }
 147 
 148     /* ============================================================================================================== */
 149 
 150     // FIGURE 3. The Wilkes-Wheeler-Gill function in C
 151 
 152     static long popcntWWG(long x) {
 153         long c1 = 0x5555555555555555L;
 154         long c2 = 0x3333333333333333L;
 155         long c4 = 0x0F0F0F0F0F0F0F0FL;
 156 
 157         x -= (x >>> 1) & c1;
 158         x = (( x >>> 2) & c2) + (x & c2) ;
 159         x = ( x + (x >>> 4) ) & c4;
 160         x *= 0x0101010101010101L;
 161         x = x >>> 56;
 162         return x;
 163     }
 164 
 165     @Benchmark
 166     public long WilkesWheelerGill() {
 167         long acc = 0;
 168         for (int i = 0; i < data.length; i++) {
 169             acc += popcntWWG(data[i]);
 170         }
 171         return acc;
 172     }
 173 
 174     /* ============================================================================================================== */
 175 
 176     // FIGURE 5. The Lauradoux population count in C for sets of 12 words.
 177 
 178     static long parallelPopcnt(long count1, long count2, long count3) {
 179         long m1  = 0x5555555555555555L;
 180         long m2  = 0x3333333333333333L;
 181         long m4  = 0x0F0F0F0F0F0F0F0FL;
 182 
 183         long half1 = (count3      ) & m1;
 184         long half2 = (count3 >>> 1) & m1;
 185 
 186         count1 -= (count1 >>> 1) & m1;
 187         count2 -= (count2 >>> 1) & m1;
 188         count1 += half1;
 189         count2 += half2;
 190         count1  = (count1 & m2) + (( count1 >>> 2) & m2);
 191         count1 += (count2 & m2) + (( count2 >>> 2) & m2);
 192         return (count1 & m4) + (( count1 >>> 4) & m4);
 193     }
 194 
 195     static long reduce(long acc) {
 196         long m8  = 0x00FF00FF00FF00FFL;
 197         long m16 = 0x0000FFFF0000FFFFL;
 198         long m32 = 0x00000000FFFFFFFFL;
 199 
 200         acc = (acc & m8) + (( acc >>> 8) & m8);
 201         acc = (acc + (acc >>> 16) ) & m16;
 202         acc = (acc & m32) + (acc >>> 32);
 203         return acc;
 204     }
 205 
 206     static long popcntLauradoux(long[] xs, int off) {
 207         long acc = 0;
 208         for (int j = off; j < off+12; j += 3) {
 209             acc += parallelPopcnt(xs[j+0], xs[j+1], xs[j+2]);
 210         }
 211         return reduce(acc);
 212     }
 213 
 214     @Benchmark
 215     public long Lauradoux() {
 216         long acc = 0;
 217         int upper = data.length - (data.length % 12);
 218         for (int i = 0; i < upper; i += 12) {
 219             acc += popcntLauradoux(data, i);
 220         }
 221         return acc + tail(upper);
 222     }
 223 
 224     /* ============================================================================================================== */
 225 
 226     // FIGURE 6. A C function implementing a bitwise parallel carry-save adder (CSA). Given three input words a, b, c, it
 227     // generates two new words h, l in which each bit represents the high and low bits in the bitwise sum of the bits from a,
 228     // b, and c.
 229 
 230     static long csaLow(long a, long b, long c) {
 231         long u = a ^ b;
 232         long lo = u ^ c;
 233         return lo;
 234     }
 235 
 236     static long csaHigh(long a, long b, long c) {
 237         long u = a ^ b;
 238         long hi = (a & b) | (u & c) ;
 239         return hi;
 240     }
 241 
 242     // FIGURE 8. A C function implementing the Harley-Seal
 243     // population count over an array of 64-bit words. The count
 244     // function could be the Wilkes-Wheeler-Gill function.
 245     @Benchmark
 246     public long HarleySeal() {
 247         long total = 0, ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0;
 248         long twosA   = 0, twosB   = 0;
 249         long foursA  = 0, foursB  = 0;
 250         long eightsA = 0, eightsB = 0;
 251 
 252         int step = 16;
 253         int upper = data.length - (data.length % step);
 254         for (int i = 0; i < upper; i += step) {
 255             // CSA(&twosA, &ones, ones, d[i+0], d[i +1]);
 256             twosA = csaHigh(ones, data[i+0], data[i+1]);
 257             ones  = csaLow(ones, data[i+0], data[i+1]);
 258 
 259             // CSA(&twosB, &ones, ones, d[i+2], d[i+3]);
 260             twosB = csaHigh(ones, data[i+2], data[i+3]);
 261             ones  = csaLow(ones, data[i+2], data[i+3]);
 262 
 263             // CSA(&foursA, &twos, twos, twosA, twosB);
 264             foursA = csaHigh(twos, twosA, twosB);
 265             twos   = csaLow(twos, twosA, twosB);
 266 
 267             // ====================================
 268 
 269             // CSA(&twosA, &ones, ones, d[i+4], d[i+5]);
 270             twosA = csaHigh(ones, data[i+4], data[i+5]);
 271             ones  = csaLow(ones, data[i+4], data[i+5]);
 272 
 273             // CSA(&twosB, &ones, ones, d[i+6], d[i+7]);
 274             twosB = csaHigh(ones, data[i+6], data[i+7]);
 275             ones  = csaLow(ones, data[i+6], data[i+7]);
 276 
 277             // CSA(&foursB, &twos, twos, twosA, twosB);
 278             foursB = csaHigh(twos, twosA, twosB);
 279             twos   = csaLow(twos, twosA, twosB);
 280 
 281             // ====================================
 282 
 283             // CSA(&eightsA, &fours, fours, foursA, foursB);
 284             eightsA = csaHigh(fours, foursA, foursB);
 285             fours   = csaLow(fours, foursA, foursB);
 286 
 287             // ====================================
 288 
 289             // CSA(&twosA, &ones, ones, d[i+8], d[i+9]);
 290             twosA = csaHigh(ones, data[i+8], data[i+9]);
 291             ones  = csaLow(ones, data[i+8], data[i+9]);
 292 
 293             // CSA(&twosB, &ones, ones, d[i+10],d[i+11]);
 294             twosB = csaHigh(ones, data[i+10], data[i+11]);
 295             ones  = csaLow(ones, data[i+10], data[i+11]);
 296 
 297             // CSA(&foursA, &twos, twos, twosA, twosB);
 298             foursA = csaHigh(twos, twosA, twosB);
 299             twos   = csaLow(twos, twosA, twosB);
 300 
 301             // ====================================
 302 
 303             // CSA(&twosA, &ones, ones, d[i+12], d[i +13]);
 304             twosA = csaHigh(ones, data[i+12], data[i+13]);
 305             ones  = csaLow(ones, data[i+12], data[i+13]);
 306 
 307             // CSA(&twosB, &ones, ones, d[i+14], d[i +15]);
 308             twosB = csaHigh(ones, data[i+14], data[i+15]);
 309             ones  = csaLow(ones, data[i+14], data[i+15]);
 310 
 311             // ====================================
 312 
 313             // CSA(&foursB, &twos, twos, twosA, twosB);
 314             foursB = csaHigh(twos, twosA, twosB);
 315             twos   = csaLow(twos, twosA, twosB);
 316 
 317             // CSA(&eightsB, &fours, fours, foursA, foursB);
 318             eightsB = csaHigh(fours, foursA, foursB);
 319             fours   = csaLow(fours, foursA, foursB);
 320 
 321             // ====================================
 322 
 323             // CSA(&sixteens, &eights, eights, eightsA, eightsB);
 324             sixteens = csaHigh(eights, eightsA, eightsB);
 325             eights   = csaLow(eights, eightsA, eightsB);
 326 
 327             total += Long.bitCount(sixteens);
 328         }
 329         total = 16 * total
 330                 + 8 * Long.bitCount(eights)
 331                 + 4 * Long.bitCount(fours)
 332                 + 2 * Long.bitCount(twos)
 333                 + 1 * Long.bitCount(ones);
 334 
 335         return total + tail(upper);
 336     }
 337 
 338     /* ============================================================================================================== */
 339 
 340     // FIGURE 9. A C function using SSE intrinsics implementing Mula’s algorithm to compute sixteen population counts,
 341     // corresponding to sixteen input bytes.
 342 
 343     static final ByteVector MULA128_LOOKUP = (ByteVector)(IntVector.scalars(I128, 0x02_01_01_00, // 0, 1, 1, 2,
 344                                                           0x03_02_02_01, // 1, 2, 2, 3,
 345                                                           0x03_02_02_01, // 1, 2, 2, 3,
 346                                                           0x04_03_03_02  // 2, 3, 3, 4
 347                                                           ).reinterpret(B128));
 348 
 349     ByteVector popcntB128(ByteVector v) {
 350         var low_mask = ByteVector.broadcast(B128, (byte)0x0f);
 351 
 352         var lo = v          .and(low_mask);
 353         var hi = v.shiftRight(4).and(low_mask);
 354 
 355         var cnt1 = MULA128_LOOKUP.rearrange(lo.toShuffle());
 356         var cnt2 = MULA128_LOOKUP.rearrange(hi.toShuffle());
 357 
 358         return cnt1.add(cnt2);
 359     }
 360 
 361     @Benchmark
 362     public long Mula128() {
 363         var acc = LongVector.zero(L128); // IntVector
 364         int step = 32; // % B128.length() == 0!
 365         int upper = data.length - (data.length % step);
 366         for (int i = 0; i < upper; i += step) {
 367             var bacc = ByteVector.zero(B128);
 368             for (int j = 0; j < step; j += L128.length()) {
 369                 var v1 = LongVector.fromArray(L128, data, i + j);
 370                 var v2 = (ByteVector)v1.reinterpret(B128);
 371                 var v3 = popcntB128(v2);
 372                 bacc = bacc.add(v3);
 373             }
 374             acc = acc.add(sumUnsignedBytes(bacc));
 375         }
 376         var r = acc.addLanes() + tail(upper);
 377         return r;
 378     }
 379 
 380     /* ============================================================================================================== */
 381 
 382     // FIGURE 10. A C function using AVX2 intrinsics implementing Mula’s algorithm to compute the four population counts
 383     // of the four 64-bit words in a 256-bit vector. The 32 B output vector should be interpreted as four separate
 384     // 64-bit counts that need to be summed to obtain the final population count.
 385 
 386     static final ByteVector MULA256_LOOKUP = 
 387             (ByteVector)(join(I128, I256, (IntVector)(MULA128_LOOKUP.reinterpret(I128)), (IntVector)(MULA128_LOOKUP.reinterpret(I128))).reinterpret(B256));
 388 
 389     ByteVector popcntB256(ByteVector v) {
 390         var low_mask = ByteVector.broadcast(B256, (byte)0x0F);
 391 
 392         var lo = v          .and(low_mask);
 393         var hi = v.shiftRight(4).and(low_mask);
 394 
 395         var cnt1 = MULA256_LOOKUP.rearrange(lo.toShuffle());
 396         var cnt2 = MULA256_LOOKUP.rearrange(hi.toShuffle());
 397         var cnt = cnt1.add(cnt2);
 398 
 399         return cnt;
 400     }
 401 
 402     // Horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers,
 403     // and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst:
 404     //   _mm256_sad_epu8(total, _mm256_setzero_si256())
 405     LongVector sumUnsignedBytes(ByteVector vb) {
 406         return sumUnsignedBytesShapes(vb);
 407 //        return sumUnsignedBytesShifts(vb);
 408     }
 409 
 410     LongVector sumUnsignedBytesShapes(ByteVector vb) {
 411         VectorSpecies<Short> shortSpecies = VectorSpecies.of(short.class, vb.shape());
 412         VectorSpecies<Integer> intSpecies = VectorSpecies.of(int.class, vb.shape());
 413         VectorSpecies<Long> longSpecies = VectorSpecies.of(long.class, vb.shape());
 414 
 415         var low_short_mask = ShortVector.broadcast(shortSpecies, (short) 0xFF);
 416         var low_int_mask = IntVector.broadcast(intSpecies, 0xFFFF);
 417         var low_long_mask = LongVector.broadcast(longSpecies, 0xFFFFFFFFL);
 418 
 419         var vs = (ShortVector)vb.reinterpret(shortSpecies); // 16-bit
 420         var vs0 = vs.and(low_short_mask);
 421         var vs1 = vs.shiftRight(8).and(low_short_mask);
 422         var vs01 = vs0.add(vs1);
 423 
 424         var vi = (IntVector)vs01.reinterpret(intSpecies); // 32-bit
 425         var vi0 = vi.and(low_int_mask);
 426         var vi1 = vi.shiftRight(16).and(low_int_mask);
 427         var vi01 = vi0.add(vi1);
 428 
 429         var vl = (LongVector)vi01.reinterpret(longSpecies); // 64-bit
 430         var vl0 = vl.and(low_long_mask);
 431         var vl1 = vl.shiftRight(32).and(low_long_mask);
 432         var vl01 = vl0.add(vl1);
 433 
 434         return vl01;
 435     }
 436 
 437     LongVector sumUnsignedBytesShifts(ByteVector vb) {
 438         VectorSpecies<Long> to = VectorSpecies.of(long.class, vb.shape());
 439 
 440         var low_mask = LongVector.broadcast(to, 0xFF);
 441 
 442         var vl = (LongVector)vb.reinterpret(to);
 443 
 444         var v0 = vl           .and(low_mask); // 8-bit
 445         var v1 = vl.shiftRight( 8).and(low_mask); // 8-bit
 446         var v2 = vl.shiftRight(16).and(low_mask); // 8-bit
 447         var v3 = vl.shiftRight(24).and(low_mask); // 8-bit
 448         var v4 = vl.shiftRight(32).and(low_mask); // 8-bit
 449         var v5 = vl.shiftRight(40).and(low_mask); // 8-bit
 450         var v6 = vl.shiftRight(48).and(low_mask); // 8-bit
 451         var v7 = vl.shiftRight(56).and(low_mask); // 8-bit
 452 
 453         var v01 = v0.add(v1);
 454         var v23 = v2.add(v3);
 455         var v45 = v4.add(v5);
 456         var v67 = v6.add(v7);
 457 
 458         var v03 = v01.add(v23);
 459         var v47 = v45.add(v67);
 460 
 461         var sum = v03.add(v47); // 64-bit
 462         return sum;
 463     }
 464 
 465     @Benchmark
 466     public long Mula256() {
 467         var acc = LongVector.zero(L256);
 468         int step = 32; // % B256.length() == 0!
 469         int upper = data.length - (data.length % step);
 470         for (int i = 0; i < upper; i += step) {
 471             var bacc = ByteVector.zero(B256);
 472             for (int j = 0; j < step; j += L256.length()) {
 473                 var v1 = LongVector.fromArray(L256, data, i + j);
 474                 var v2 = popcntB256((ByteVector)(v1.reinterpret(B256)));
 475                 bacc = bacc.add(v2);
 476             }
 477             acc = acc.add(sumUnsignedBytes(bacc));
 478         }
 479         return acc.addLanes() + tail(upper);
 480     }
 481 
 482 
 483     /* ============================================================================================================== */
 484 
 485     // FIGURE 11. A C function using AVX2 intrinsics implementing a bitwise parallel carry-save adder (CSA).
 486 
 487     LongVector csaLow(LongVector a, LongVector b, LongVector c) {
 488         var u = a.xor(b);
 489         var r = u.xor(c);
 490         return r;
 491     }
 492 
 493     LongVector csaHigh(LongVector a, LongVector b, LongVector c) {
 494         var u  = a.xor(b);
 495         var ab = a.and(b);
 496         var uc = u.and(c);
 497         var r  = ab.or(uc); // (a & b) | ((a ^ b) & c)
 498         return r;
 499     }
 500 
 501     LongVector popcntL256(LongVector v) {
 502         var vb1 = (ByteVector)v.reinterpret(B256);
 503         var vb2 = popcntB256(vb1);
 504         return sumUnsignedBytes(vb2);
 505     }
 506 
 507     // FIGURE 12. A C function using AVX2 intrinsics implementing Harley-Seal’s algorithm. It assumes, for
 508     // simplicity, that the input size in 256-bit vectors is divisible by 16. See Fig. 10 for the count function.
 509 
 510     @Benchmark
 511     public long HarleySeal256() {
 512         LongVector ones, twos, fours, eights, sixteens, vtotal, twosA, twosB, foursA, foursB, eightsA, eightsB;
 513         ones = twos = fours = eights = sixteens = twosA = twosB = foursA = foursB = eightsA = eights = vtotal = LongVector.broadcast(L256, 0);
 514 
 515         var vlen = L256.length();
 516         int step = 16 * vlen;
 517         int upper = data.length - (data.length % step);
 518         for (int i = 0; i < upper; i += step) {
 519             // CSA(&twosA, &ones, ones, d[i+0], d[i +1]);
 520             var d0 = LongVector.fromArray(L256, data, i + 0 * vlen);
 521             var d1 = LongVector.fromArray(L256, data, i + 1 * vlen);
 522 
 523             twosA = csaHigh(ones, d0, d1);
 524             ones  = csaLow(ones, d0, d1);
 525 
 526             // CSA(&twosB, &ones, ones, d[i+2], d[i+3]);
 527             var d2 = LongVector.fromArray(L256, data, i + 2 * vlen);
 528             var d3 = LongVector.fromArray(L256, data, i + 3 * vlen);
 529             twosB = csaHigh(ones, d2, d3);
 530             ones  = csaLow(ones, d2, d3);
 531 
 532             // CSA(&foursA, &twos, twos, twosA, twosB);
 533             foursA = csaHigh(twos, twosA, twosB);
 534             twos   = csaLow(twos, twosA, twosB);
 535 
 536             // ====================================
 537 
 538             // CSA(&twosA, &ones, ones, d[i+4], d[i+5]);
 539             var d4 = LongVector.fromArray(L256, data, i + 4 * vlen);
 540             var d5 = LongVector.fromArray(L256, data, i + 5 * vlen);
 541             twosA = csaHigh(ones, d4, d5);
 542             ones  = csaLow(ones, d4, d5);
 543 
 544             // CSA(&twosB, &ones, ones, d[i+6], d[i+7]);
 545             var d6 = LongVector.fromArray(L256, data, i + 6 * vlen);
 546             var d7 = LongVector.fromArray(L256, data, i + 7 * vlen);
 547             twosB = csaHigh(ones, d6, d7);
 548             ones  = csaLow(ones, d6, d7);
 549 
 550             // CSA(&foursB, &twos, twos, twosA, twosB);
 551             foursB = csaHigh(twos, twosA, twosB);
 552             twos   = csaLow(twos, twosA, twosB);
 553 
 554             // ====================================
 555 
 556             // CSA(&eightsA, &fours, fours, foursA, foursB);
 557             eightsA = csaHigh(fours, foursA, foursB);
 558             fours   = csaLow(fours, foursA, foursB);
 559 
 560             // ====================================
 561 
 562             // CSA(&twosA, &ones, ones, d[i+8], d[i+9]);
 563             var d8 = LongVector.fromArray(L256, data, i + 8 * vlen);
 564             var d9 = LongVector.fromArray(L256, data, i + 9 * vlen);
 565             twosA = csaHigh(ones, d8, d9);
 566             ones  = csaLow(ones, d8, d9);
 567 
 568             // CSA(&twosB, &ones, ones, d[i+10],d[i+11]);
 569             var d10 = LongVector.fromArray(L256, data, i + 10 * vlen);
 570             var d11 = LongVector.fromArray(L256, data, i + 11 * vlen);
 571             twosB = csaHigh(ones, d10, d11);
 572             ones  = csaLow(ones, d10, d11);
 573 
 574             // CSA(&foursA, &twos, twos, twosA, twosB);
 575             foursA = csaHigh(twos, twosA, twosB);
 576             twos   = csaLow(twos, twosA, twosB);
 577 
 578             // ====================================
 579 
 580             // CSA(&twosA, &ones, ones, d[i+12], d[i +13]);
 581             var d12 = LongVector.fromArray(L256, data, i + 12 * vlen);
 582             var d13 = LongVector.fromArray(L256, data, i + 13 * vlen);
 583             twosA = csaHigh(ones, d12, d13);
 584             ones  = csaLow(ones, d12, d13);
 585 
 586             // CSA(&twosB, &ones, ones, d[i+14], d[i +15]);
 587             var d14 = LongVector.fromArray(L256, data, i + 14 * vlen);
 588             var d15 = LongVector.fromArray(L256, data, i + 15 * vlen);
 589             twosB = csaHigh(ones, d14, d15);
 590             ones  = csaLow(ones, d14, d15);
 591 
 592             // ====================================
 593 
 594             // CSA(&foursB, &twos, twos, twosA, twosB);
 595             foursB = csaHigh(twos, twosA, twosB);
 596             twos   = csaLow(twos, twosA, twosB);
 597 
 598             // CSA(&eightsB, &fours, fours, foursA, foursB);
 599             eightsB = csaHigh(fours, foursA, foursB);
 600             fours   = csaLow(fours, foursA, foursB);
 601 
 602             // ====================================
 603 
 604             // CSA(&sixteens, &eights, eights, eightsA, eightsB);
 605             sixteens = csaHigh(eights, eightsA, eightsB);
 606             eights   = csaLow(eights, eightsA, eightsB);
 607 
 608             vtotal = vtotal.add(popcntL256(sixteens));
 609         }
 610 
 611         vtotal = vtotal.mul(16);                       // << 4
 612         vtotal = vtotal.add(popcntL256(eights).mul(8)); // << 3
 613         vtotal = vtotal.add(popcntL256(fours).mul(4));  // << 2
 614         vtotal = vtotal.add(popcntL256(twos).mul(2));   // << 1
 615         vtotal = vtotal.add(popcntL256(ones));          // << 0
 616 
 617         var total = vtotal.addLanes();
 618 
 619         return total + tail(upper);
 620     }
 621 
 622     /* ============================================================================================================== */
 623 
 624 //    ByteVector csaLow512(ByteVector a, ByteVector b, ByteVector c) {
 625 //        return _mm512_ternarylogic_epi32(c, b, a, 0x96); // vpternlogd
 626 //    }
 627 //
 628 //    ByteVector csaLow512(ByteVector a, ByteVector b, ByteVector c) {
 629 //        return _mm512_ternarylogic_epi32(c, b, a, 0xe8); // vpternlogd
 630 //    }
 631 }