63
64 @Benchmark
65 public int scalar() {
66 int sum = 0;
67 for (int i = 0; i < data.length; i++) {
68 sum += data[i] & 0xFF;
69 }
70 return sum;
71 }
72
73 // 1. 32-bit accumulators
74 @Benchmark
75 public int vectorInt() {
76 final var lobyte_mask = IntVector.broadcast(I256, 0x000000FF);
77
78 var acc = IntVector.zero(I256);
79 for (int i = 0; i < data.length; i += B256.length()) {
80 var vb = ByteVector.fromArray(B256, data, i);
81 var vi = (IntVector)vb.reinterpret(I256);
82 for (int j = 0; j < 4; j++) {
83 var tj = vi.shiftR(j * 8).and(lobyte_mask);
84 acc = acc.add(tj);
85 }
86 }
87 return (int)Integer.toUnsignedLong(acc.addAll());
88 }
89
90 // 2. 16-bit accumulators
91 @Benchmark
92 public int vectorShort() {
93 final var lobyte_mask = ShortVector.broadcast(S256, (short) 0x00FF);
94
95 // FIXME: overflow
96 var acc = ShortVector.zero(S256);
97 for (int i = 0; i < data.length; i += B256.length()) {
98 var vb = ByteVector.fromArray(B256, data, i);
99 var vs = (ShortVector)vb.reinterpret(S256);
100 for (int j = 0; j < 2; j++) {
101 var tj = vs.shiftR(j * 8).and(lobyte_mask);
102 acc = acc.add(tj);
103 }
104 }
105
106 int mid = S128.length();
107 var accLo = ((IntVector)(acc .reshape(S128).cast(I256))).and(0xFFFF); // low half as ints
108 var accHi = ((IntVector)(acc.shiftEL(mid).reshape(S128).cast(I256))).and(0xFFFF); // high half as ints
109 return accLo.addAll() + accHi.addAll();
110 }
111
112 /*
113 // 3. 8-bit halves (MISSING: _mm_adds_epu8)
114 @Benchmark
115 public int vectorByte() {
116 int window = 256;
117 var acc_hi = IntVector.zero(I256);
118 var acc8_lo = ByteVector.zero(B256);
119 for (int i = 0; i < data.length; i += window) {
120 var acc8_hi = ByteVector.zero(B256);
121 int limit = Math.min(window, data.length - i);
122 for (int j = 0; j < limit; j += B256.length()) {
123 var vb = ByteVector.fromArray(B256, data, i + j);
124
125 var t0 = acc8_lo.add(vb);
126 var t1 = addSaturated(acc8_lo, vb); // MISSING
127 var overflow = t0.notEqual(t1);
128
129 acc8_lo = t0;
|
63
64 @Benchmark
65 public int scalar() {
66 int sum = 0;
67 for (int i = 0; i < data.length; i++) {
68 sum += data[i] & 0xFF;
69 }
70 return sum;
71 }
72
73 // 1. 32-bit accumulators
74 @Benchmark
75 public int vectorInt() {
76 final var lobyte_mask = IntVector.broadcast(I256, 0x000000FF);
77
78 var acc = IntVector.zero(I256);
79 for (int i = 0; i < data.length; i += B256.length()) {
80 var vb = ByteVector.fromArray(B256, data, i);
81 var vi = (IntVector)vb.reinterpret(I256);
82 for (int j = 0; j < 4; j++) {
83 var tj = vi.shiftRight(j * 8).and(lobyte_mask);
84 acc = acc.add(tj);
85 }
86 }
87 return (int)Integer.toUnsignedLong(acc.addLanes());
88 }
89
90 // 2. 16-bit accumulators
91 @Benchmark
92 public int vectorShort() {
93 final var lobyte_mask = ShortVector.broadcast(S256, (short) 0x00FF);
94
95 // FIXME: overflow
96 var acc = ShortVector.zero(S256);
97 for (int i = 0; i < data.length; i += B256.length()) {
98 var vb = ByteVector.fromArray(B256, data, i);
99 var vs = (ShortVector)vb.reinterpret(S256);
100 for (int j = 0; j < 2; j++) {
101 var tj = vs.shiftRight(j * 8).and(lobyte_mask);
102 acc = acc.add(tj);
103 }
104 }
105
106 int mid = S128.length();
107 var accLo = ((IntVector)(acc .reshape(S128).cast(I256))).and(0xFFFF); // low half as ints
108 var accHi = ((IntVector)(acc.shiftLanesLeft(mid).reshape(S128).cast(I256))).and(0xFFFF); // high half as ints
109 return accLo.addLanes() + accHi.addLanes();
110 }
111
112 /*
113 // 3. 8-bit halves (MISSING: _mm_adds_epu8)
114 @Benchmark
115 public int vectorByte() {
116 int window = 256;
117 var acc_hi = IntVector.zero(I256);
118 var acc8_lo = ByteVector.zero(B256);
119 for (int i = 0; i < data.length; i += window) {
120 var acc8_hi = ByteVector.zero(B256);
121 int limit = Math.min(window, data.length - i);
122 for (int j = 0; j < limit; j += B256.length()) {
123 var vb = ByteVector.fromArray(B256, data, i + j);
124
125 var t0 = acc8_lo.add(vb);
126 var t1 = addSaturated(acc8_lo, vb); // MISSING
127 var overflow = t0.notEqual(t1);
128
129 acc8_lo = t0;
|