40 @Warmup(iterations = 3, time = 1)
41 @Measurement(iterations = 5, time = 1)
42 @OutputTimeUnit(TimeUnit.MILLISECONDS)
43 @State(Scope.Benchmark)
44 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
45 public class SumOfUnsignedBytes extends AbstractVectorBenchmark {
46
47 @Param({"64", "1024", "65536"})
48 int size;
49
50 private byte[] data;
51
52 @Setup
53 public void init() {
54 size = size + size % 32; // FIXME: process tails
55 data = fillByte(size, i -> (byte)(int)i);
56
57 int sum = scalar();
58 assertEquals(vectorInt(), sum);
59 assertEquals(vectorShort(), sum);
60 assertEquals(vectorByte(), sum);
61 assertEquals(vectorSAD(), sum);
62 }
63
64 @Benchmark
65 public int scalar() {
66 int sum = 0;
67 for (int i = 0; i < data.length; i++) {
68 sum += data[i] & 0xFF;
69 }
70 return sum;
71 }
72
73 // 1. 32-bit accumulators
74 @Benchmark
75 public int vectorInt() {
76 final var lobyte_mask = I256.broadcast(0x000000FF);
77
78 var acc = IntVector.zero(I256);
79 for (int i = 0; i < data.length; i += B256.length()) {
80 var vb = ByteVector.fromArray(B256, data, i);
81 var vi = (IntVector)vb.reinterpret(I256);
82 for (int j = 0; j < 4; j++) {
83 var tj = vi.shiftR(j * 8).and(lobyte_mask);
84 acc = acc.add(tj);
85 }
86 }
87 return (int)Integer.toUnsignedLong(acc.addAll());
88 }
89
90 // 2. 16-bit accumulators
91 @Benchmark
92 public int vectorShort() {
93 final var lobyte_mask = S256.broadcast((short) 0x00FF);
94
95 // FIXME: overflow
96 var acc = ShortVector.zero(S256);
97 for (int i = 0; i < data.length; i += B256.length()) {
98 var vb = ByteVector.fromArray(B256, data, i);
99 var vs = (ShortVector)vb.reinterpret(S256);
100 for (int j = 0; j < 2; j++) {
101 var tj = vs.shiftR(j * 8).and(lobyte_mask);
102 acc = acc.add(tj);
103 }
104 }
105
106 int mid = S128.length();
107 var accLo = ((IntVector)(acc .reshape(S128).cast(I256))).and(0xFFFF); // low half as ints
108 var accHi = ((IntVector)(acc.shiftEL(mid).reshape(S128).cast(I256))).and(0xFFFF); // high half as ints
109 return accLo.addAll() + accHi.addAll();
110 }
111
112 // 3. 8-bit halves (MISSING: _mm_adds_epu8)
113 @Benchmark
114 public int vectorByte() {
115 int window = 256;
116 var acc_hi = IntVector.zero(I256);
117 var acc8_lo = ByteVector.zero(B256);
118 for (int i = 0; i < data.length; i += window) {
119 var acc8_hi = ByteVector.zero(B256);
120 int limit = Math.min(window, data.length - i);
121 for (int j = 0; j < limit; j += B256.length()) {
122 var vb = ByteVector.fromArray(B256, data, i + j);
123
124 var t0 = acc8_lo.add(vb);
125 var t1 = addSaturated(acc8_lo, vb); // MISSING
126 var overflow = t0.notEqual(t1);
127
128 acc8_lo = t0;
129 acc8_hi = acc8_hi.add((byte) 1, overflow);
130 }
131 acc_hi = acc_hi.add(sum(acc8_hi));
132 }
133 return sum(acc8_lo)
134 .add(acc_hi.mul(256)) // overflow
135 .addAll();
136 }
137
138 // 4. Sum Of Absolute Differences (SAD) (MISSING: VPSADBW, _mm256_sad_epu8)
139 public int vectorSAD() {
140 var acc = IntVector.zero(I256);
141 for (int i = 0; i < data.length; i += B256.length()) {
142 var v = ByteVector.fromArray(B256, data, i);
143 var sad = sumOfAbsoluteDifferences(v, ByteVector.zero(B256)); // MISSING
144 acc = acc.add(sad);
145 }
146 return acc.addAll();
147 }
148
149 // Helpers
150
151 static ByteVector addSaturated(ByteVector va, ByteVector vb) {
152 return ByteVectorHelper.map(va, vb, (i, a, b) -> {
153 if ((a & 0xFF) + (b & 0xFF) < 0xFF) {
154 return (byte) (a + b);
155 } else {
156 return (byte)0xFF;
157 }
158 });
159 }
160
161 IntVector sumOfAbsoluteDifferences(ByteVector va, ByteVector vb) {
162 var vc = ByteVectorHelper.map(va, vb, (i, a, b) -> {
163 if ((a & 0xFF) > (b & 0xFF)) {
164 return (byte)(a - b);
165 } else {
166 return (byte)(b - a);
167 }
168 });
169 return sum(vc);
170 }
171 }
|
40 @Warmup(iterations = 3, time = 1)
41 @Measurement(iterations = 5, time = 1)
42 @OutputTimeUnit(TimeUnit.MILLISECONDS)
43 @State(Scope.Benchmark)
44 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
45 public class SumOfUnsignedBytes extends AbstractVectorBenchmark {
46
47 @Param({"64", "1024", "65536"})
48 int size;
49
50 private byte[] data;
51
52 @Setup
53 public void init() {
54 size = size + size % 32; // FIXME: process tails
55 data = fillByte(size, i -> (byte)(int)i);
56
57 int sum = scalar();
58 assertEquals(vectorInt(), sum);
59 assertEquals(vectorShort(), sum);
60 //assertEquals(vectorByte(), sum);
61 //assertEquals(vectorSAD(), sum);
62 }
63
64 @Benchmark
65 public int scalar() {
66 int sum = 0;
67 for (int i = 0; i < data.length; i++) {
68 sum += data[i] & 0xFF;
69 }
70 return sum;
71 }
72
73 // 1. 32-bit accumulators
74 @Benchmark
75 public int vectorInt() {
76 final var lobyte_mask = IntVector.broadcast(I256, 0x000000FF);
77
78 var acc = IntVector.zero(I256);
79 for (int i = 0; i < data.length; i += B256.length()) {
80 var vb = ByteVector.fromArray(B256, data, i);
81 var vi = (IntVector)vb.reinterpret(I256);
82 for (int j = 0; j < 4; j++) {
83 var tj = vi.shiftR(j * 8).and(lobyte_mask);
84 acc = acc.add(tj);
85 }
86 }
87 return (int)Integer.toUnsignedLong(acc.addAll());
88 }
89
90 // 2. 16-bit accumulators
91 @Benchmark
92 public int vectorShort() {
93 final var lobyte_mask = ShortVector.broadcast(S256, (short) 0x00FF);
94
95 // FIXME: overflow
96 var acc = ShortVector.zero(S256);
97 for (int i = 0; i < data.length; i += B256.length()) {
98 var vb = ByteVector.fromArray(B256, data, i);
99 var vs = (ShortVector)vb.reinterpret(S256);
100 for (int j = 0; j < 2; j++) {
101 var tj = vs.shiftR(j * 8).and(lobyte_mask);
102 acc = acc.add(tj);
103 }
104 }
105
106 int mid = S128.length();
107 var accLo = ((IntVector)(acc .reshape(S128).cast(I256))).and(0xFFFF); // low half as ints
108 var accHi = ((IntVector)(acc.shiftEL(mid).reshape(S128).cast(I256))).and(0xFFFF); // high half as ints
109 return accLo.addAll() + accHi.addAll();
110 }
111
112 /*
113 // 3. 8-bit halves (MISSING: _mm_adds_epu8)
114 @Benchmark
115 public int vectorByte() {
116 int window = 256;
117 var acc_hi = IntVector.zero(I256);
118 var acc8_lo = ByteVector.zero(B256);
119 for (int i = 0; i < data.length; i += window) {
120 var acc8_hi = ByteVector.zero(B256);
121 int limit = Math.min(window, data.length - i);
122 for (int j = 0; j < limit; j += B256.length()) {
123 var vb = ByteVector.fromArray(B256, data, i + j);
124
125 var t0 = acc8_lo.add(vb);
126 var t1 = addSaturated(acc8_lo, vb); // MISSING
127 var overflow = t0.notEqual(t1);
128
129 acc8_lo = t0;
130 acc8_hi = acc8_hi.add((byte) 1, overflow);
131 }
132 acc_hi = acc_hi.add(sum(acc8_hi));
133 }
134 return sum(acc8_lo)
135 .add(acc_hi.mul(256)) // overflow
136 .addAll();
137 }
138
139 // 4. Sum Of Absolute Differences (SAD) (MISSING: VPSADBW, _mm256_sad_epu8)
140 public int vectorSAD() {
141 var acc = IntVector.zero(I256);
142 for (int i = 0; i < data.length; i += B256.length()) {
143 var v = ByteVector.fromArray(B256, data, i);
144 var sad = sumOfAbsoluteDifferences(v, ByteVector.zero(B256)); // MISSING
145 acc = acc.add(sad);
146 }
147 return acc.addAll();
148 } */
149
150 // Helpers
151 /*
152 static ByteVector addSaturated(ByteVector va, ByteVector vb) {
153 return ByteVectorHelper.map(va, vb, (i, a, b) -> {
154 if ((a & 0xFF) + (b & 0xFF) < 0xFF) {
155 return (byte) (a + b);
156 } else {
157 return (byte)0xFF;
158 }
159 });
160 }
161
162 IntVector sumOfAbsoluteDifferences(ByteVector va, ByteVector vb) {
163 var vc = ByteVectorHelper.map(va, vb, (i, a, b) -> {
164 if ((a & 0xFF) > (b & 0xFF)) {
165 return (byte)(a - b);
166 } else {
167 return (byte)(b - a);
168 }
169 });
170 return sum(vc);
171 } */
172 }
|