< prev index next >
src/cpu/x86/vm/sharedRuntime_x86_64.cpp
Print this page
rev 8502 : 8130150: Implement BigInteger.montgomeryMultiply intrinsic
Summary: Add montgomeryMultiply intrinsics
Reviewed-by: kvn
*** 21,30 ****
--- 21,31 ----
* questions.
*
*/
#include "precompiled.hpp"
+ #include "alloca.h"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
#include "code/vtableStubs.hpp"
*** 3509,3518 ****
--- 3510,3763 ----
// frame_size_words or bytes??
return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}
+ //------------------------------Montgomery multiplication------------------------
+ //
+
+ #ifndef _WINDOWS
+
+ #define ASM_SUBTRACT
+
+ #ifdef ASM_SUBTRACT
+ // Subtract 0:b from carry:a. Return carry. In detail: a[0..len) -= b[0..len) in place, one 64-bit word at a time with the borrow propagating upwards from index 0, and the value returned is carry minus the final borrow. The loop body executes before cnt is tested, so len must be at least 1.
+ static unsigned long
+ sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
+ long i = 0, cnt = len; // i indexes the current word; cnt counts the words still to process
+ unsigned long tmp; // scratch register: holds b[i] inside the loop, then the value to return
+ asm volatile("clc; " // start with the carry flag (used as the borrow) clear
+ "0: ; " // loop head
+ "mov (%[b], %[i], 8), %[tmp]; " // tmp = b[i]
+ "sbb %[tmp], (%[a], %[i], 8); " // a[i] = a[i] - tmp - borrow, performed directly in memory
+ "inc %[i]; dec %[cnt]; " // INC and DEC leave CF untouched, so the borrow survives into the next iteration
+ "jne 0b; " // repeat until cnt reaches zero
+ "mov %[carry], %[tmp]; sbb $0, %[tmp]; " // tmp = carry - final borrow
+ : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
+ : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
+ : "memory"); // a[] is modified through a plain address operand, so tell the compiler memory changed
+ return tmp;
+ }
+ #else // ASM_SUBTRACT
+ // Signed 128-bit integer (GCC TImode) used to carry the borrow between words.
+ typedef int __attribute__((mode(TI))) int128;
+
+ // Portable variant of sub(): subtract 0:b from carry:a in place and
+ // return the resulting carry word.
+ //
+ //   a     - minuend, len words, overwritten with the difference
+ //   b     - subtrahend, len words
+ //   carry - word above a[len-1]
+ //   len   - number of 64-bit words in a and b
+ static unsigned long
+ sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
+   // After each word this is either 0 (no borrow) or -1 (borrow pending).
+   int128 borrow = 0;
+   for (int k = 0; k < len; k++) {
+     borrow = borrow + a[k] - b[k];
+     a[k] = borrow;      // keep the low 64 bits as the result word
+     borrow >>= 64;      // arithmetic shift leaves just the borrow
+     assert(-1 <= borrow && borrow <= 0, "invariant");
+   }
+   return borrow + carry;
+ }
+ #endif // ! ASM_SUBTRACT
+
+ // Multiply (unsigned) Long A by Long B with a single x86 MUL (B is loaded into RAX and the 128-bit product comes back in RDX:RAX), accumulating the double-
+ // length result into the accumulator formed of T0 (low word), T1, and T2 (high word): ADD RAX into T0, ADC RDX into T1, ADC $0 into T2. hi and lo are never read; they exist only to declare RDX and RAX as outputs of the asm.
+ #define MACC(A, B, T0, T1, T2) \
+ do { \
+ unsigned long hi, lo; \
+ __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
+ : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
+ : "r"(A), "a"(B) : "cc"); \
+ } while(0)
+
+ // As MACC, but add twice the double-length result into the
+ // accumulator: one MUL produces RDX:RAX, then the ADD/ADC/ADC $0 sequence into T0, T1, T2 is issued two times with the same product.
+ #define MACC2(A, B, T0, T1, T2) \
+ do { \
+ unsigned long hi, lo; \
+ __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
+ "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
+ : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
+ : "r"(A), "a"(B) : "cc"); \
+ } while(0)
+
+ // Fast Montgomery multiplication of the len-word operands a and b with modulus n; the result is written to m. Index 0 is the least-significant 64-bit word (carries flow from column i into column i+1). The derivation of the algorithm is
+ // in A Cryptographic Library for the Motorola DSP56000,
+ // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. inv must satisfy inv * n[0] == -1 modulo 2^64 (asserted below). m is written while a, b and n are still being read, so it serves as working storage as well as output.
+
+ static void __attribute__((noinline))
+ montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
+ unsigned long m[], unsigned long inv, int len) {
+ unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+ int i;
+
+ assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+
+ for (i = 0; i < len; i++) { // Columns 0..len-1: each one fixes the multiplier word m[i]
+ int j;
+ for (j = 0; j < i; j++) {
+ MACC(a[j], b[i-j], t0, t1, t2); // partial product of a*b belonging to column i
+ MACC(m[j], n[i-j], t0, t1, t2); // partial product of m*n belonging to column i
+ }
+ MACC(a[i], b[0], t0, t1, t2); // last a*b term of the column (j == i)
+ m[i] = t0 * inv; // chosen so that m[i]*n[0] == -t0 modulo 2^64 ...
+ MACC(m[i], n[0], t0, t1, t2); // ... hence adding it cancels the low accumulator word
+
+ assert(t0 == 0, "broken Montgomery multiply");
+
+ t0 = t1; t1 = t2; t2 = 0; // shift the accumulator down one word for the next column
+ }
+
+ for (i = len; i < 2*len; i++) { // Columns len..2*len-1: the remaining high partial products
+ int j;
+ for (j = i-len+1; j < len; j++) {
+ MACC(a[j], b[i-j], t0, t1, t2);
+ MACC(m[j], n[i-j], t0, t1, t2);
+ }
+ m[i-len] = t0; // m[i-len] is not read any more (j starts at i-len+1), so it is reused for the result word
+ t0 = t1; t1 = t2; t2 = 0;
+ }
+
+ while (t0) // t0 is the word above m[len-1]; keep subtracting n until it is zero
+ t0 = sub(m, n, t0, len);
+ }
+
+ // Fast Montgomery squaring of the len-word operand a with modulus n; the result is written to m. This uses asymptotically 25% fewer
+ // multiplies so it should be up to 25% faster than Montgomery
+ // multiplication. However, its loop control is more complex and it
+ // may actually run slower on some machines. The saving comes from adding each product a[j]*a[i-j] with j != i-j twice (MACC2) instead of computing it twice. inv must satisfy inv * n[0] == -1 modulo 2^64 (asserted below).
+
+ static void __attribute__((noinline))
+ montgomery_square(unsigned long a[], unsigned long n[],
+ unsigned long m[], unsigned long inv, int len) {
+ unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+ int i;
+
+ assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+
+ for (i = 0; i < len; i++) { // Columns 0..len-1: each one fixes the multiplier word m[i]
+ int j;
+ int end = (i+1)/2; // number of index pairs (j, i-j) with j < i-j
+ for (j = 0; j < end; j++) {
+ MACC2(a[j], a[i-j], t0, t1, t2); // a[j]*a[i-j] and a[i-j]*a[j] are equal: add the product twice
+ MACC(m[j], n[i-j], t0, t1, t2);
+ }
+ if ((i & 1) == 0) { // even column: the diagonal term occurs once, and here j == i/2
+ MACC(a[j], a[j], t0, t1, t2);
+ }
+ for (; j < i; j++) { // remaining m*n terms of the column; these have no symmetry to exploit
+ MACC(m[j], n[i-j], t0, t1, t2);
+ }
+ m[i] = t0 * inv; // chosen so that m[i]*n[0] == -t0 modulo 2^64 ...
+ MACC(m[i], n[0], t0, t1, t2); // ... hence adding it cancels the low accumulator word
+
+ assert(t0 == 0, "broken Montgomery square");
+
+ t0 = t1; t1 = t2; t2 = 0; // shift the accumulator down one word for the next column
+ }
+
+ for (i = len; i < 2*len; i++) { // Columns len..2*len-1: the remaining high partial products
+ int start = i-len+1; // lowest j for which i-j is still a valid index
+ int end = start + (len - start)/2; // first j that is not the smaller index of a distinct pair
+ int j;
+ for (j = start; j < end; j++) {
+ MACC2(a[j], a[i-j], t0, t1, t2);
+ MACC(m[j], n[i-j], t0, t1, t2);
+ }
+ if ((i & 1) == 0) { // even column: the diagonal term occurs once, and here j == i/2
+ MACC(a[j], a[j], t0, t1, t2);
+ }
+ for (; j < len; j++) {
+ MACC(m[j], n[i-j], t0, t1, t2);
+ }
+ m[i-len] = t0; // m[i-len] is not read any more (j starts at i-len+1), so it is reused for the result word
+ t0 = t1; t1 = t2; t2 = 0;
+ }
+
+ while (t0) // t0 is the word above m[len-1]; keep subtracting n until it is zero
+ t0 = sub(m, n, t0, len);
+ }
+
+ // Exchange the upper and lower 32-bit halves of a 64-bit longword.
+ static unsigned long swap(unsigned long x) {
+   const unsigned long upper_to_lower = x >> 32;
+   const unsigned long lower_to_upper = x << 32;
+   return lower_to_upper | upper_to_lower;
+ }
+
+ // Copy len longwords from s to d in reverse order: s[0] ends up in
+ // d[len-1], s[1] in d[len-2], and so on. The two 32-bit halves of
+ // every longword are exchanged on the way. Nothing is copied when
+ // len is zero or negative.
+ static void reverse_words(unsigned long *s, unsigned long *d, int len) {
+   for (int k = 0; k < len; k++) {
+     d[len - 1 - k] = swap(s[k]);
+   }
+ }
+
+ // The threshold at which squaring is advantageous was determined
+ // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. It is compared against the array length in jints (not 64-bit words); shorter inputs are squared with the general multiply routine.
+ #define MONTGOMERY_SQUARING_THRESHOLD 64
+
+ // Montgomery multiplication on arrays of jints.
+ //
+ //   a_ints, b_ints - the two operands, len jints each
+ //   n_ints         - the modulus, len jints
+ //   len            - number of jints in each array; must be even so the
+ //                    arrays can be processed as len/2 64-bit words
+ //   inv            - passed through to ::montgomery_multiply, which
+ //                    requires inv * n[0] == -1 (mod 2^64) for the
+ //                    converted modulus n
+ //   m_ints         - receives the len-jint result
+ //
+ // The inputs are copied into stack scratch space by reverse_words(),
+ // which reverses the order of the 64-bit words and swaps the two jints
+ // inside each one; the result is converted back the same way.
+ void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
+                                         jint len, jlong inv,
+                                         jint *m_ints) {
+   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
+   int longwords = len/2;
+
+   // Make very sure we don't use so much space that the stack might
+   // overflow. 512 jints corresponds to a 16384-bit integer and
+   // will use here a total of 8k bytes of stack space.
+   //
+   // The limit is enforced on the word count, before the byte count is
+   // formed: the byte count is a size_t product narrowed to int, so for
+   // a large (or negative) len it could wrap to a small or negative
+   // value that slips past a check made on the byte count itself.
+   int divisor = sizeof (unsigned long) * 4;
+   guarantee(0 <= longwords && longwords <= 8192 / divisor, "must be");
+   int total_allocation = longwords * divisor;
+   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
+
+   // Local scratch arrays
+   unsigned long
+     *a = scratch + 0 * longwords,
+     *b = scratch + 1 * longwords,
+     *n = scratch + 2 * longwords,
+     *m = scratch + 3 * longwords;
+
+   reverse_words((unsigned long *)a_ints, a, longwords);
+   reverse_words((unsigned long *)b_ints, b, longwords);
+   reverse_words((unsigned long *)n_ints, n, longwords);
+
+   ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
+
+   reverse_words(m, (unsigned long *)m_ints, longwords);
+ }
+
+ // Montgomery squaring on arrays of jints.
+ //
+ //   a_ints - the operand to square, len jints
+ //   n_ints - the modulus, len jints
+ //   len    - number of jints in each array; must be even so the arrays
+ //            can be processed as len/2 64-bit words
+ //   inv    - passed through to the worker routine, which requires
+ //            inv * n[0] == -1 (mod 2^64) for the converted modulus n
+ //   m_ints - receives the len-jint result
+ //
+ // The inputs are copied into stack scratch space by reverse_words(),
+ // which reverses the order of the 64-bit words and swaps the two jints
+ // inside each one; the result is converted back the same way. Inputs
+ // shorter than MONTGOMERY_SQUARING_THRESHOLD jints are squared with the
+ // general multiply routine instead of the dedicated squaring routine.
+ void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
+                                       jint len, jlong inv,
+                                       jint *m_ints) {
+   assert(len % 2 == 0, "array length in montgomery_square must be even");
+   int longwords = len/2;
+
+   // Make very sure we don't use so much space that the stack might
+   // overflow. 512 jints corresponds to a 16384-bit integer and
+   // will use here a total of 6k bytes of stack space.
+   //
+   // The limit is enforced on the word count, before the byte count is
+   // formed: the byte count is a size_t product narrowed to int, so for
+   // a large (or negative) len it could wrap to a small or negative
+   // value that slips past a check made on the byte count itself.
+   int divisor = sizeof (unsigned long) * 3;
+   guarantee(0 <= longwords && longwords <= 8192 / divisor, "must be");
+   int total_allocation = longwords * divisor;
+   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
+
+   // Local scratch arrays
+   unsigned long
+     *a = scratch + 0 * longwords,
+     *n = scratch + 1 * longwords,
+     *m = scratch + 2 * longwords;
+
+   reverse_words((unsigned long *)a_ints, a, longwords);
+   reverse_words((unsigned long *)n_ints, n, longwords);
+
+   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
+     ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
+   } else {
+     ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
+   }
+
+   reverse_words(m, (unsigned long *)m_ints, longwords);
+ }
+
+ #endif // !_WINDOWS
+
#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
// creates exception blob at the end
< prev index next >