20 * or visit www.oracle.com if you need additional information or have
21 * questions.
22 */
23
24 package benchmark.crypto;
25
26 import org.openjdk.jmh.annotations.*;
27 import jdk.incubator.vector.*;
28 import java.util.Arrays;
29
30 @State(Scope.Thread)
31 @BenchmarkMode(Mode.Throughput)
32 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
33 @Warmup(iterations = 3, time = 3)
34 @Measurement(iterations = 8, time = 2)
35 public class Poly1305Bench {
36
37 @Param({"16384", "65536"})
38 private int dataSize;
39
40 private Poly1305Vector poly1305_S128 = makePoly1305(Vector.Shape.S_128_BIT);
41 private Poly1305Vector poly1305_S256 = makePoly1305(Vector.Shape.S_256_BIT);
42 private Poly1305Vector poly1305_S512 = makePoly1305(Vector.Shape.S_512_BIT);
43
44 private byte[] in;
45 private byte[] out = new byte[16];
46 private byte[] key = new byte[32];
47
48 private static Poly1305Vector makePoly1305(Vector.Shape shape) {
49 Poly1305Vector poly = new Poly1305Vector(shape);
50 runKAT(poly);
51 return poly;
52 }
53
54 @Setup
55 public void setup() {
56 in = new byte[dataSize];
57 }
58
59 @Benchmark
60 public void auth128() {
61 poly1305_S128.computeTag(key, in, out);
62 }
63
64 @Benchmark
65 public void auth256() {
66 poly1305_S256.computeTag(key, in, out);
67 }
68
69 @Benchmark
70 public void auth512() {
71 poly1305_S512.computeTag(key, in, out);
72 }
73
74 private static class Poly1305Vector {
75
76 private static final int BITS_PER_LIMB = 26;
77 private static final int LIMB_MASK = (1 << BITS_PER_LIMB) - 1;
78 private static final int KEY_LENGTH = 32;
79 private static final int RS_LENGTH = KEY_LENGTH / 2;
80
81 private final Vector.Species<Long> longSpecies;
82 private final Vector.Species<Integer> intSpecies;
83 private final int vectorWidth;
84 private final int parBlockCount;
85
86 private final LongVector.Shuffle<Long> inShuffle0;
87 private final LongVector.Shuffle<Long> inShuffle1;
88 private final IntVector.Mask<Long> inMask;
89
/**
 * Derives the long and int species, the lane count and the per-step byte
 * count from {@code shape}, then precomputes the two shuffles and the mask.
 *
 * @param shape vector shape used for the {@code long} species
 */
public Poly1305Vector(Vector.Shape shape) {
    longSpecies = Vector.Species.of(long.class, shape);

    // An int lane is half as wide as a long lane, so a shape with half the
    // bits gives the int species the same number of lanes as the long one.
    intSpecies = Vector.Species.of(
            int.class, Vector.Shape.forBitSize(shape.bitSize() / 2));

    vectorWidth = longSpecies.length();
    // Two long vectors (2 * vectorWidth * 8 bytes) are loaded per step.
    parBlockCount = 16 * vectorWidth;

    // The helpers below read vectorWidth and longSpecies, so they must run
    // only after those fields have been assigned.
    inShuffle0 = makeInShuffle0();
    inShuffle1 = makeInShuffle1();
    inMask = makeInMask();
}
103
104 private LongVector.Shuffle<Long> makeInShuffle0() {
105 int[] indexArr = new int[vectorWidth];
106 for (int i = 0; i < indexArr.length; i++) {
107 indexArr[i] = (2 * i) % vectorWidth;
108 }
109 return LongVector.shuffleFromArray(longSpecies, indexArr, 0);
110 }
111 private LongVector.Shuffle<Long> makeInShuffle1() {
112 int[] indexArr = new int[vectorWidth];
113 for (int i = 0; i < indexArr.length; i++) {
114 indexArr[i] = ((2 * i) % vectorWidth) + 1;
115 }
116 return LongVector.shuffleFromArray(longSpecies, indexArr, 0);
117 }
118 private LongVector.Mask<Long> makeInMask() {
119 boolean[] maskArr = new boolean[vectorWidth];
120 for (int i = vectorWidth / 2; i < vectorWidth; i++) {
121 maskArr[i] = true;
122 }
123 return LongVector.maskFromArray(longSpecies, maskArr, 0);
124 }
125
126 private static int[] fromByteArray(byte[] buf) {
127 int[] result = new int[5];
128
129 result[0]
130 = (buf[0] & 0xFF)
131 + ((buf[1] & 0xFF) << 8)
132 + ((buf[2] & 0xFF) << 16)
133 + ((buf[3] & 0x03) << 24);
134 result[1]
135 = ((buf[3] & 0xFF) >> 2)
136 + ((buf[4] & 0xFF) << 6)
137 + ((buf[5] & 0xFF) << 14)
138 + ((buf[6] & 0x0F) << 22);
139 result[2]
140 = ((buf[6] & 0xFF) >> 4)
141 + ((buf[7] & 0xFF) << 4)
142 + ((buf[8] & 0xFF) << 12)
143 + ((buf[9] & 0x3F) << 20);
281 IntVector rUp3_int = IntVector.broadcast(intSpecies, r[rUpIndex][3]);
282 IntVector rUp4_int = IntVector.broadcast(intSpecies, r[rUpIndex][4]);
283
284 IntVector r5Up1_int = rUp1_int.mul(5);
285 IntVector r5Up2_int = rUp2_int.mul(5);
286 IntVector r5Up3_int = rUp3_int.mul(5);
287 IntVector r5Up4_int = rUp4_int.mul(5);
288
289 LongVector longMsg0 = LongVector.fromByteArray(longSpecies, msg, 0);
290 LongVector longMsg1 =
291 LongVector.fromByteArray(longSpecies, msg, vectorWidth * 8);
292
293 LongVector inAlign0 =
294 longMsg0.rearrange(longMsg1, inShuffle0, inMask);
295 LongVector inAlign1 =
296 longMsg0.rearrange(longMsg1, inShuffle1, inMask);
297
298 IntVector a0 = (IntVector)
299 inAlign0.and(LIMB_MASK).cast(intSpecies);
300 IntVector a1 = (IntVector)
301 inAlign0.shiftR(26).and(LIMB_MASK).cast(intSpecies);
302 IntVector a2 = (IntVector)
303 inAlign0.shiftR(52).and(0xFFF).cast(intSpecies);
304 a2 = a2.or(inAlign1.and(0x3FFF).shiftL(12).cast(intSpecies));
305 IntVector a3 = (IntVector)
306 inAlign1.shiftR(14).and(LIMB_MASK).cast(intSpecies);
307 IntVector a4 = (IntVector)
308 inAlign1.shiftR(40).and(0xFFFFFF).cast(intSpecies);
309 a4 = a4.or(1 << 24);
310
311 int numParBlocks = msg.length / parBlockCount - 1;
312 for (int i = 0; i < numParBlocks; i++) {
313
314 // multiply and reduce
315 LongVector c0 = (LongVector)
316 a0.cast(longSpecies).mul(rUp0_int.cast(longSpecies))
317 .add(a1.cast(longSpecies).mul(r5Up4_int.cast(longSpecies)))
318 .add(a2.cast(longSpecies).mul(r5Up3_int.cast(longSpecies)))
319 .add(a3.cast(longSpecies).mul(r5Up2_int.cast(longSpecies)))
320 .add(a4.cast(longSpecies).mul(r5Up1_int.cast(longSpecies)));
321
322 LongVector c1 = (LongVector)
323 a0.cast(longSpecies).mul(rUp1_int.cast(longSpecies))
324 .add(a1.cast(longSpecies).mul(rUp0_int.cast(longSpecies)))
325 .add(a2.cast(longSpecies).mul(r5Up4_int.cast(longSpecies)))
326 .add(a3.cast(longSpecies).mul(r5Up3_int.cast(longSpecies)))
327 .add(a4.cast(longSpecies).mul(r5Up2_int.cast(longSpecies)));
328
332 .add(a2.cast(longSpecies).mul(rUp0_int.cast(longSpecies)))
333 .add(a3.cast(longSpecies).mul(r5Up4_int.cast(longSpecies)))
334 .add(a4.cast(longSpecies).mul(r5Up3_int.cast(longSpecies)));
335
336 LongVector c3 = (LongVector)
337 a0.cast(longSpecies).mul(rUp3_int.cast(longSpecies))
338 .add(a1.cast(longSpecies).mul(rUp2_int.cast(longSpecies)))
339 .add(a2.cast(longSpecies).mul(rUp1_int.cast(longSpecies)))
340 .add(a3.cast(longSpecies).mul(rUp0_int.cast(longSpecies)))
341 .add(a4.cast(longSpecies).mul(r5Up4_int.cast(longSpecies)));
342
343 LongVector c4 = (LongVector)
344 a0.cast(longSpecies).mul(rUp4_int.cast(longSpecies))
345 .add(a1.cast(longSpecies).mul(rUp3_int.cast(longSpecies)))
346 .add(a2.cast(longSpecies).mul(rUp2_int.cast(longSpecies)))
347 .add(a3.cast(longSpecies).mul(rUp1_int.cast(longSpecies)))
348 .add(a4.cast(longSpecies).mul(rUp0_int.cast(longSpecies)));
349
350 // carry/reduce
351 // Note: this carry/reduce sequence might not be correct
352 c4 = c4.add(c3.shiftR(BITS_PER_LIMB));
353 c3 = c3.and(LIMB_MASK);
354 c0 = c0.add(c4.shiftR(BITS_PER_LIMB).mul(5));
355 c4 = c4.and(LIMB_MASK);
356 c1 = c1.add(c0.shiftR(BITS_PER_LIMB));
357 c0 = c0.and(LIMB_MASK);
358 c2 = c2.add(c1.shiftR(BITS_PER_LIMB));
359 c1 = c1.and(LIMB_MASK);
360 c3 = c3.add(c2.shiftR(BITS_PER_LIMB));
361 c2 = c2.and(LIMB_MASK);
362 c4 = c4.add(c3.shiftR(BITS_PER_LIMB));
363 c3 = c3.and(LIMB_MASK);
364
365 a0 = (IntVector) c0.cast(intSpecies);
366 a1 = (IntVector) c1.cast(intSpecies);
367 a2 = (IntVector) c2.cast(intSpecies);
368 a3 = (IntVector) c3.cast(intSpecies);
369 a4 = (IntVector) c4.cast(intSpecies);
370
371 // fromByteArray and add next part of message
372 int start = parBlockCount * (i + 1);
373
374 longMsg0 = LongVector.fromByteArray(longSpecies, msg, start);
375 longMsg1 = LongVector.fromByteArray(longSpecies, msg,
376 start + vectorWidth * 8);
377
378 inAlign0 = longMsg0.rearrange(longMsg1, inShuffle0, inMask);
379 inAlign1 = longMsg0.rearrange(longMsg1, inShuffle1, inMask);
380
381 IntVector in0 = (IntVector)
382 inAlign0.and(LIMB_MASK).cast(intSpecies);
383 IntVector in1 = (IntVector)
384 inAlign0.shiftR(26).and(LIMB_MASK).cast(intSpecies);
385 IntVector in2 = (IntVector)
386 inAlign0.shiftR(52).and(0xFFF).cast(intSpecies);
387 in2 = in2.or(inAlign1.and(0x3FFF).shiftL(12).cast(intSpecies));
388 IntVector in3 = (IntVector)
389 inAlign1.shiftR(14).and(LIMB_MASK).cast(intSpecies);
390 IntVector in4 = (IntVector)
391 inAlign1.shiftR(40).and(0xFFFFFF).cast(intSpecies);
392 in4 = in4.or(1 << 24);
393
394 a0 = a0.add(in0);
395 a1 = a1.add(in1);
396 a2 = a2.add(in2);
397 a3 = a3.add(in3);
398 a4 = a4.add(in4);
399 }
400
401 // multiply by powers of r
402 long[] rTemp = new long[vectorWidth];
403 LongVector rFin0 = rPowerVec(r, rTemp, rUpIndex, 0);
404 LongVector rFin1 = rPowerVec(r, rTemp, rUpIndex, 1);
405 LongVector rFin2 = rPowerVec(r, rTemp, rUpIndex, 2);
406 LongVector rFin3 = rPowerVec(r, rTemp, rUpIndex, 3);
407 LongVector rFin4 = rPowerVec(r, rTemp, rUpIndex, 4);
408
409 LongVector r5Fin_1 = rFin1.mul(5);
410 LongVector r5Fin_2 = rFin2.mul(5);
411 LongVector r5Fin_3 = rFin3.mul(5);
420 .add(a1.cast(longSpecies).mul(rFin0))
421 .add(a2.cast(longSpecies).mul(r5Fin_4))
422 .add(a3.cast(longSpecies).mul(r5Fin_3))
423 .add(a4.cast(longSpecies).mul(r5Fin_2));
424 LongVector c2 = (LongVector) a0.cast(longSpecies).mul(rFin2)
425 .add(a1.cast(longSpecies).mul(rFin1))
426 .add(a2.cast(longSpecies).mul(rFin0))
427 .add(a3.cast(longSpecies).mul(r5Fin_4))
428 .add(a4.cast(longSpecies).mul(r5Fin_3));
429 LongVector c3 = (LongVector) a0.cast(longSpecies).mul(rFin3)
430 .add(a1.cast(longSpecies).mul(rFin2))
431 .add(a2.cast(longSpecies).mul(rFin1))
432 .add(a3.cast(longSpecies).mul(rFin0))
433 .add(a4.cast(longSpecies).mul(r5Fin_4));
434 LongVector c4 = (LongVector) a0.cast(longSpecies).mul(rFin4)
435 .add(a1.cast(longSpecies).mul(rFin3))
436 .add(a2.cast(longSpecies).mul(rFin2))
437 .add(a3.cast(longSpecies).mul(rFin1))
438 .add(a4.cast(longSpecies).mul(rFin0));
439
440 c4 = c4.add(c3.shiftR(BITS_PER_LIMB));
441 c3 = c3.and(LIMB_MASK);
442 c0 = c0.add(c4.shiftR(BITS_PER_LIMB).mul(5));
443 c4 = c4.and(LIMB_MASK);
444 c1 = c1.add(c0.shiftR(BITS_PER_LIMB));
445 c0 = c0.and(LIMB_MASK);
446 c2 = c2.add(c1.shiftR(BITS_PER_LIMB));
447 c1 = c1.and(LIMB_MASK);
448 c3 = c3.add(c2.shiftR(BITS_PER_LIMB));
449 c2 = c2.and(LIMB_MASK);
450 c4 = c4.add(c3.shiftR(BITS_PER_LIMB));
451 c3 = c3.and(LIMB_MASK);
452
453 a0 = (IntVector) c0.cast(intSpecies);
454 a1 = (IntVector) c1.cast(intSpecies);
455 a2 = (IntVector) c2.cast(intSpecies);
456 a3 = (IntVector) c3.cast(intSpecies);
457 a4 = (IntVector) c4.cast(intSpecies);
458
459 // collect lanes and calculate tag
460 long a0Fin = a0.addAll();
461 long a1Fin = a1.addAll();
462 long a2Fin = a2.addAll();
463 long a3Fin = a3.addAll();
464 long a4Fin = a4.addAll();
465
466 // carry/reduce the result
467 a4Fin = a4Fin + (a3Fin >>> BITS_PER_LIMB);
468 a3Fin = a3Fin & LIMB_MASK;
469 a0Fin = a0Fin + ((a4Fin >>> BITS_PER_LIMB) * 5);
470 a4Fin = a4Fin & LIMB_MASK;
471 a1Fin = a1Fin + (a0Fin >>> BITS_PER_LIMB);
472 a0Fin = a0Fin & LIMB_MASK;
473 a2Fin = a2Fin + (a1Fin >>> BITS_PER_LIMB);
474 a1Fin = a1Fin & LIMB_MASK;
475 a3Fin = a3Fin + (a2Fin >>> BITS_PER_LIMB);
476 a2Fin = a2Fin & LIMB_MASK;
477 a4Fin = a4Fin + (a3Fin >>> BITS_PER_LIMB);
478 a3Fin = a3Fin & LIMB_MASK;
479
480 byte[] s_arr =
481 Arrays.copyOfRange(keyBytes, RS_LENGTH, 2 * RS_LENGTH);
482 int[] s = fromByteArray(s_arr);
483
484 // Add in the s-half of the key to the accumulator
|
20 * or visit www.oracle.com if you need additional information or have
21 * questions.
22 */
23
24 package benchmark.crypto;
25
26 import org.openjdk.jmh.annotations.*;
27 import jdk.incubator.vector.*;
28 import java.util.Arrays;
29
30 @State(Scope.Thread)
31 @BenchmarkMode(Mode.Throughput)
32 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
33 @Warmup(iterations = 3, time = 3)
34 @Measurement(iterations = 8, time = 2)
35 public class Poly1305Bench {
36
37 @Param({"16384", "65536"})
38 private int dataSize;
39
40 private Poly1305Vector poly1305_S128 = makePoly1305(VectorShape.S_128_BIT);
41 private Poly1305Vector poly1305_S256 = makePoly1305(VectorShape.S_256_BIT);
42 private Poly1305Vector poly1305_S512 = makePoly1305(VectorShape.S_512_BIT);
43
44 private byte[] in;
45 private byte[] out = new byte[16];
46 private byte[] key = new byte[32];
47
48 private static Poly1305Vector makePoly1305(VectorShape shape) {
49 Poly1305Vector poly = new Poly1305Vector(shape);
50 runKAT(poly);
51 return poly;
52 }
53
54 @Setup
55 public void setup() {
56 in = new byte[dataSize];
57 }
58
59 @Benchmark
60 public void auth128() {
61 poly1305_S128.computeTag(key, in, out);
62 }
63
64 @Benchmark
65 public void auth256() {
66 poly1305_S256.computeTag(key, in, out);
67 }
68
69 @Benchmark
70 public void auth512() {
71 poly1305_S512.computeTag(key, in, out);
72 }
73
74 private static class Poly1305Vector {
75
76 private static final int BITS_PER_LIMB = 26;
77 private static final int LIMB_MASK = (1 << BITS_PER_LIMB) - 1;
78 private static final int KEY_LENGTH = 32;
79 private static final int RS_LENGTH = KEY_LENGTH / 2;
80
81 private final VectorSpecies<Long> longSpecies;
82 private final VectorSpecies<Integer> intSpecies;
83 private final int vectorWidth;
84 private final int parBlockCount;
85
86 private final VectorShuffle<Long> inShuffle0;
87 private final VectorShuffle<Long> inShuffle1;
88 private final VectorMask<Long> inMask;
89
90 public Poly1305Vector(VectorShape shape) {
91
92 this.longSpecies = VectorSpecies.of(long.class, shape);
93 int intSize = shape.bitSize() / 2;
94 VectorShape intShape = VectorShape.forBitSize(intSize);
95 this.intSpecies = VectorSpecies.of(int.class, intShape);
96 this.vectorWidth = longSpecies.length();
97 this.parBlockCount = vectorWidth * 16;
98
99 this.inShuffle0 = makeInShuffle0();
100 this.inShuffle1 = makeInShuffle1();
101 this.inMask = makeInMask();
102 }
103
104 private VectorShuffle<Long> makeInShuffle0() {
105 int[] indexArr = new int[vectorWidth];
106 for (int i = 0; i < indexArr.length; i++) {
107 indexArr[i] = (2 * i) % vectorWidth;
108 }
109 return VectorShuffle.fromArray(longSpecies, indexArr, 0);
110 }
111 private VectorShuffle<Long> makeInShuffle1() {
112 int[] indexArr = new int[vectorWidth];
113 for (int i = 0; i < indexArr.length; i++) {
114 indexArr[i] = ((2 * i) % vectorWidth) + 1;
115 }
116 return VectorShuffle.fromArray(longSpecies, indexArr, 0);
117 }
118 private VectorMask<Long> makeInMask() {
119 boolean[] maskArr = new boolean[vectorWidth];
120 for (int i = vectorWidth / 2; i < vectorWidth; i++) {
121 maskArr[i] = true;
122 }
123 return VectorMask.fromArray(longSpecies, maskArr, 0);
124 }
125
126 private static int[] fromByteArray(byte[] buf) {
127 int[] result = new int[5];
128
129 result[0]
130 = (buf[0] & 0xFF)
131 + ((buf[1] & 0xFF) << 8)
132 + ((buf[2] & 0xFF) << 16)
133 + ((buf[3] & 0x03) << 24);
134 result[1]
135 = ((buf[3] & 0xFF) >> 2)
136 + ((buf[4] & 0xFF) << 6)
137 + ((buf[5] & 0xFF) << 14)
138 + ((buf[6] & 0x0F) << 22);
139 result[2]
140 = ((buf[6] & 0xFF) >> 4)
141 + ((buf[7] & 0xFF) << 4)
142 + ((buf[8] & 0xFF) << 12)
143 + ((buf[9] & 0x3F) << 20);
281 IntVector rUp3_int = IntVector.broadcast(intSpecies, r[rUpIndex][3]);
282 IntVector rUp4_int = IntVector.broadcast(intSpecies, r[rUpIndex][4]);
283
284 IntVector r5Up1_int = rUp1_int.mul(5);
285 IntVector r5Up2_int = rUp2_int.mul(5);
286 IntVector r5Up3_int = rUp3_int.mul(5);
287 IntVector r5Up4_int = rUp4_int.mul(5);
288
289 LongVector longMsg0 = LongVector.fromByteArray(longSpecies, msg, 0);
290 LongVector longMsg1 =
291 LongVector.fromByteArray(longSpecies, msg, vectorWidth * 8);
292
293 LongVector inAlign0 =
294 longMsg0.rearrange(longMsg1, inShuffle0, inMask);
295 LongVector inAlign1 =
296 longMsg0.rearrange(longMsg1, inShuffle1, inMask);
297
298 IntVector a0 = (IntVector)
299 inAlign0.and(LIMB_MASK).cast(intSpecies);
300 IntVector a1 = (IntVector)
301 inAlign0.shiftRight(26).and(LIMB_MASK).cast(intSpecies);
302 IntVector a2 = (IntVector)
303 inAlign0.shiftRight(52).and(0xFFF).cast(intSpecies);
304 a2 = a2.or(inAlign1.and(0x3FFF).shiftLeft(12).cast(intSpecies));
305 IntVector a3 = (IntVector)
306 inAlign1.shiftRight(14).and(LIMB_MASK).cast(intSpecies);
307 IntVector a4 = (IntVector)
308 inAlign1.shiftRight(40).and(0xFFFFFF).cast(intSpecies);
309 a4 = a4.or(1 << 24);
310
311 int numParBlocks = msg.length / parBlockCount - 1;
312 for (int i = 0; i < numParBlocks; i++) {
313
314 // multiply and reduce
315 LongVector c0 = (LongVector)
316 a0.cast(longSpecies).mul(rUp0_int.cast(longSpecies))
317 .add(a1.cast(longSpecies).mul(r5Up4_int.cast(longSpecies)))
318 .add(a2.cast(longSpecies).mul(r5Up3_int.cast(longSpecies)))
319 .add(a3.cast(longSpecies).mul(r5Up2_int.cast(longSpecies)))
320 .add(a4.cast(longSpecies).mul(r5Up1_int.cast(longSpecies)));
321
322 LongVector c1 = (LongVector)
323 a0.cast(longSpecies).mul(rUp1_int.cast(longSpecies))
324 .add(a1.cast(longSpecies).mul(rUp0_int.cast(longSpecies)))
325 .add(a2.cast(longSpecies).mul(r5Up4_int.cast(longSpecies)))
326 .add(a3.cast(longSpecies).mul(r5Up3_int.cast(longSpecies)))
327 .add(a4.cast(longSpecies).mul(r5Up2_int.cast(longSpecies)));
328
332 .add(a2.cast(longSpecies).mul(rUp0_int.cast(longSpecies)))
333 .add(a3.cast(longSpecies).mul(r5Up4_int.cast(longSpecies)))
334 .add(a4.cast(longSpecies).mul(r5Up3_int.cast(longSpecies)));
335
336 LongVector c3 = (LongVector)
337 a0.cast(longSpecies).mul(rUp3_int.cast(longSpecies))
338 .add(a1.cast(longSpecies).mul(rUp2_int.cast(longSpecies)))
339 .add(a2.cast(longSpecies).mul(rUp1_int.cast(longSpecies)))
340 .add(a3.cast(longSpecies).mul(rUp0_int.cast(longSpecies)))
341 .add(a4.cast(longSpecies).mul(r5Up4_int.cast(longSpecies)));
342
343 LongVector c4 = (LongVector)
344 a0.cast(longSpecies).mul(rUp4_int.cast(longSpecies))
345 .add(a1.cast(longSpecies).mul(rUp3_int.cast(longSpecies)))
346 .add(a2.cast(longSpecies).mul(rUp2_int.cast(longSpecies)))
347 .add(a3.cast(longSpecies).mul(rUp1_int.cast(longSpecies)))
348 .add(a4.cast(longSpecies).mul(rUp0_int.cast(longSpecies)));
349
350 // carry/reduce
351 // Note: this carry/reduce sequence might not be correct
352 c4 = c4.add(c3.shiftRight(BITS_PER_LIMB));
353 c3 = c3.and(LIMB_MASK);
354 c0 = c0.add(c4.shiftRight(BITS_PER_LIMB).mul(5));
355 c4 = c4.and(LIMB_MASK);
356 c1 = c1.add(c0.shiftRight(BITS_PER_LIMB));
357 c0 = c0.and(LIMB_MASK);
358 c2 = c2.add(c1.shiftRight(BITS_PER_LIMB));
359 c1 = c1.and(LIMB_MASK);
360 c3 = c3.add(c2.shiftRight(BITS_PER_LIMB));
361 c2 = c2.and(LIMB_MASK);
362 c4 = c4.add(c3.shiftRight(BITS_PER_LIMB));
363 c3 = c3.and(LIMB_MASK);
364
365 a0 = (IntVector) c0.cast(intSpecies);
366 a1 = (IntVector) c1.cast(intSpecies);
367 a2 = (IntVector) c2.cast(intSpecies);
368 a3 = (IntVector) c3.cast(intSpecies);
369 a4 = (IntVector) c4.cast(intSpecies);
370
371 // fromByteArray and add next part of message
372 int start = parBlockCount * (i + 1);
373
374 longMsg0 = LongVector.fromByteArray(longSpecies, msg, start);
375 longMsg1 = LongVector.fromByteArray(longSpecies, msg,
376 start + vectorWidth * 8);
377
378 inAlign0 = longMsg0.rearrange(longMsg1, inShuffle0, inMask);
379 inAlign1 = longMsg0.rearrange(longMsg1, inShuffle1, inMask);
380
381 IntVector in0 = (IntVector)
382 inAlign0.and(LIMB_MASK).cast(intSpecies);
383 IntVector in1 = (IntVector)
384 inAlign0.shiftRight(26).and(LIMB_MASK).cast(intSpecies);
385 IntVector in2 = (IntVector)
386 inAlign0.shiftRight(52).and(0xFFF).cast(intSpecies);
387 in2 = in2.or(inAlign1.and(0x3FFF).shiftLeft(12).cast(intSpecies));
388 IntVector in3 = (IntVector)
389 inAlign1.shiftRight(14).and(LIMB_MASK).cast(intSpecies);
390 IntVector in4 = (IntVector)
391 inAlign1.shiftRight(40).and(0xFFFFFF).cast(intSpecies);
392 in4 = in4.or(1 << 24);
393
394 a0 = a0.add(in0);
395 a1 = a1.add(in1);
396 a2 = a2.add(in2);
397 a3 = a3.add(in3);
398 a4 = a4.add(in4);
399 }
400
401 // multiply by powers of r
402 long[] rTemp = new long[vectorWidth];
403 LongVector rFin0 = rPowerVec(r, rTemp, rUpIndex, 0);
404 LongVector rFin1 = rPowerVec(r, rTemp, rUpIndex, 1);
405 LongVector rFin2 = rPowerVec(r, rTemp, rUpIndex, 2);
406 LongVector rFin3 = rPowerVec(r, rTemp, rUpIndex, 3);
407 LongVector rFin4 = rPowerVec(r, rTemp, rUpIndex, 4);
408
409 LongVector r5Fin_1 = rFin1.mul(5);
410 LongVector r5Fin_2 = rFin2.mul(5);
411 LongVector r5Fin_3 = rFin3.mul(5);
420 .add(a1.cast(longSpecies).mul(rFin0))
421 .add(a2.cast(longSpecies).mul(r5Fin_4))
422 .add(a3.cast(longSpecies).mul(r5Fin_3))
423 .add(a4.cast(longSpecies).mul(r5Fin_2));
424 LongVector c2 = (LongVector) a0.cast(longSpecies).mul(rFin2)
425 .add(a1.cast(longSpecies).mul(rFin1))
426 .add(a2.cast(longSpecies).mul(rFin0))
427 .add(a3.cast(longSpecies).mul(r5Fin_4))
428 .add(a4.cast(longSpecies).mul(r5Fin_3));
429 LongVector c3 = (LongVector) a0.cast(longSpecies).mul(rFin3)
430 .add(a1.cast(longSpecies).mul(rFin2))
431 .add(a2.cast(longSpecies).mul(rFin1))
432 .add(a3.cast(longSpecies).mul(rFin0))
433 .add(a4.cast(longSpecies).mul(r5Fin_4));
434 LongVector c4 = (LongVector) a0.cast(longSpecies).mul(rFin4)
435 .add(a1.cast(longSpecies).mul(rFin3))
436 .add(a2.cast(longSpecies).mul(rFin2))
437 .add(a3.cast(longSpecies).mul(rFin1))
438 .add(a4.cast(longSpecies).mul(rFin0));
439
440 c4 = c4.add(c3.shiftRight(BITS_PER_LIMB));
441 c3 = c3.and(LIMB_MASK);
442 c0 = c0.add(c4.shiftRight(BITS_PER_LIMB).mul(5));
443 c4 = c4.and(LIMB_MASK);
444 c1 = c1.add(c0.shiftRight(BITS_PER_LIMB));
445 c0 = c0.and(LIMB_MASK);
446 c2 = c2.add(c1.shiftRight(BITS_PER_LIMB));
447 c1 = c1.and(LIMB_MASK);
448 c3 = c3.add(c2.shiftRight(BITS_PER_LIMB));
449 c2 = c2.and(LIMB_MASK);
450 c4 = c4.add(c3.shiftRight(BITS_PER_LIMB));
451 c3 = c3.and(LIMB_MASK);
452
453 a0 = (IntVector) c0.cast(intSpecies);
454 a1 = (IntVector) c1.cast(intSpecies);
455 a2 = (IntVector) c2.cast(intSpecies);
456 a3 = (IntVector) c3.cast(intSpecies);
457 a4 = (IntVector) c4.cast(intSpecies);
458
459 // collect lanes and calculate tag
460 long a0Fin = a0.addLanes();
461 long a1Fin = a1.addLanes();
462 long a2Fin = a2.addLanes();
463 long a3Fin = a3.addLanes();
464 long a4Fin = a4.addLanes();
465
466 // carry/reduce the result
467 a4Fin = a4Fin + (a3Fin >>> BITS_PER_LIMB);
468 a3Fin = a3Fin & LIMB_MASK;
469 a0Fin = a0Fin + ((a4Fin >>> BITS_PER_LIMB) * 5);
470 a4Fin = a4Fin & LIMB_MASK;
471 a1Fin = a1Fin + (a0Fin >>> BITS_PER_LIMB);
472 a0Fin = a0Fin & LIMB_MASK;
473 a2Fin = a2Fin + (a1Fin >>> BITS_PER_LIMB);
474 a1Fin = a1Fin & LIMB_MASK;
475 a3Fin = a3Fin + (a2Fin >>> BITS_PER_LIMB);
476 a2Fin = a2Fin & LIMB_MASK;
477 a4Fin = a4Fin + (a3Fin >>> BITS_PER_LIMB);
478 a3Fin = a3Fin & LIMB_MASK;
479
480 byte[] s_arr =
481 Arrays.copyOfRange(keyBytes, RS_LENGTH, 2 * RS_LENGTH);
482 int[] s = fromByteArray(s_arr);
483
484 // Add in the s-half of the key to the accumulator
|