--- old/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	2021-01-11 15:16:51.303830021 -0500
+++ new/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	2021-01-11 15:16:51.223829748 -0500
@@ -25,6 +25,8 @@
 #include "precompiled.hpp"
 #ifndef _WINDOWS
 #include "alloca.h"
+#else //WINDOWS
+#include <intrin.h>
 #endif
 #include "asm/macroAssembler.hpp"
 #include "asm/macroAssembler.inline.hpp"
@@ -3971,14 +3973,11 @@
 
 #ifndef _WINDOWS
 
-#define ASM_SUBTRACT
-
-#ifdef ASM_SUBTRACT
 // Subtract 0:b from carry:a. Return carry.
-static unsigned long
-sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
-  long i = 0, cnt = len;
-  unsigned long tmp;
+static julong
+sub(julong a[], julong b[], julong carry, long len) {
+  long long i = 0, cnt = len;
+  julong tmp;
   asm volatile("clc; "
                "0: ; "
                "mov (%[b], %[i], 8), %[tmp]; "
@@ -3991,24 +3990,6 @@
                : "memory");
   return tmp;
 }
-#else // ASM_SUBTRACT
-typedef int __attribute__((mode(TI))) int128;
-
-// Subtract 0:b from carry:a. Return carry.
-static unsigned long
-sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
-  int128 tmp = 0;
-  int i;
-  for (i = 0; i < len; i++) {
-    tmp += a[i];
-    tmp -= b[i];
-    a[i] = tmp;
-    tmp >>= 64;
-    assert(-1 <= tmp && tmp <= 0, "invariant");
-  }
-  return tmp + carry;
-}
-#endif // ! ASM_SUBTRACT
 
 // Multiply (unsigned) Long A by Long B, accumulating the double-
 // length result into the accumulator formed of T0, T1, and T2.
@@ -4031,17 +4012,65 @@
     : "r"(A), "a"(B) : "cc");                    \
  } while(0)
 
+#else //_WINDOWS
+
+// Visual Studio 2010 does not have the _addcarry_u64 intrinsic
+// (TBD: does 2015?)
+#if defined(_WINDOWS) && _MSC_VER >= 1910
+static julong
+sub(julong a[], julong b[], julong carry, long len) {
+  long i;
+  julong tmp;
+  unsigned char c = 1;
+  for (i = 0; i < len; i++) {
+    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
+    a[i] = tmp;
+  }
+  c = _addcarry_u64(c, carry, ~0, &tmp);
+  return tmp;
+}
+
+// Multiply (unsigned) Long A by Long B, accumulating the double-
+// length result into the accumulator formed of T0, T1, and T2.
+#define MACC(A, B, T0, T1, T2)                      \
+do {                                                \
+  julong hi, lo;                                    \
+  lo = _umul128(A, B, &hi);                         \
+  unsigned char c = _addcarry_u64(0, lo, T0, &T0);  \
+  c = _addcarry_u64(c, hi, T1, &T1);                \
+  _addcarry_u64(c, T2, 0, &T2);                     \
+ } while(0)
+
+// As above, but add twice the double-length result into the
+// accumulator.
+#define MACC2(A, B, T0, T1, T2)                     \
+do {                                                \
+  julong hi, lo;                                    \
+  lo = _umul128(A, B, &hi);                         \
+  unsigned char c = _addcarry_u64(0, lo, T0, &T0);  \
+  c = _addcarry_u64(c, hi, T1, &T1);                \
+  _addcarry_u64(c, T2, 0, &T2);                     \
+  c = _addcarry_u64(0, lo, T0, &T0);                \
+  c = _addcarry_u64(c, hi, T1, &T1);                \
+  _addcarry_u64(c, T2, 0, &T2);                     \
+ } while(0)
+
+#endif // defined(_WINDOWS) && _MSC_VER >= 1910
+#endif // defined(_WINDOWS)
+
+#if !(defined(_WINDOWS) && _MSC_VER < 1910)
+
 // Fast Montgomery multiplication.  The derivation of the algorithm is
 // in  A Cryptographic Library for the Motorola DSP56000,
 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
 
-static void __attribute__((noinline))
-montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
-                    unsigned long m[], unsigned long inv, int len) {
-  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+static void NOINLINE
+montgomery_multiply(julong a[], julong b[], julong n[],
+                    julong m[], julong inv, int len) {
+  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
   int i;
 
-  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
 
   for (i = 0; i < len; i++) {
     int j;
@@ -4077,13 +4106,13 @@
 // multiplication.  However, its loop control is more complex and it
 // may actually run slower on some machines.
 
-static void __attribute__((noinline))
-montgomery_square(unsigned long a[], unsigned long n[],
-                  unsigned long m[], unsigned long inv, int len) {
-  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+static void NOINLINE
+montgomery_square(julong a[], julong n[],
+                  julong m[], julong inv, int len) {
+  julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
   int i;
 
-  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+  assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
 
   for (i = 0; i < len; i++) {
     int j;
@@ -4129,13 +4158,13 @@
 }
 
 // Swap words in a longword.
-static unsigned long swap(unsigned long x) {
+static julong swap(julong x) {
   return (x << 32) | (x >> 32);
 }
 
 // Copy len longwords from s to d, word-swapping as we go.  The
 // destination array is reversed.
-static void reverse_words(unsigned long *s, unsigned long *d, int len) {
+static void reverse_words(julong *s, julong *d, int len) {
   d += len;
   while(len-- > 0) {
     d--;
@@ -4157,24 +4186,24 @@
   // Make very sure we don't use so much space that the stack might
   // overflow. 512 jints corresponds to an 16384-bit integer and
   // will use here a total of 8k bytes of stack space.
-  int total_allocation = longwords * sizeof (unsigned long) * 4;
+  int total_allocation = longwords * sizeof (julong) * 4;
   guarantee(total_allocation <= 8192, "must be");
-  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
+  julong *scratch = (julong *)alloca(total_allocation);
 
   // Local scratch arrays
-  unsigned long
+  julong
     *a = scratch + 0 * longwords,
     *b = scratch + 1 * longwords,
     *n = scratch + 2 * longwords,
     *m = scratch + 3 * longwords;
 
-  reverse_words((unsigned long *)a_ints, a, longwords);
-  reverse_words((unsigned long *)b_ints, b, longwords);
-  reverse_words((unsigned long *)n_ints, n, longwords);
+  reverse_words((julong *)a_ints, a, longwords);
+  reverse_words((julong *)b_ints, b, longwords);
+  reverse_words((julong *)n_ints, n, longwords);
 
-  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
+  ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
 
-  reverse_words(m, (unsigned long *)m_ints, longwords);
+  reverse_words(m, (julong *)m_ints, longwords);
 }
 
 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
@@ -4186,18 +4215,18 @@
   // Make very sure we don't use so much space that the stack might
   // overflow. 512 jints corresponds to an 16384-bit integer and
   // will use here a total of 6k bytes of stack space.
-  int total_allocation = longwords * sizeof (unsigned long) * 3;
+  int total_allocation = longwords * sizeof (julong) * 3;
   guarantee(total_allocation <= 8192, "must be");
-  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
+  julong *scratch = (julong *)alloca(total_allocation);
 
   // Local scratch arrays
-  unsigned long
+  julong
     *a = scratch + 0 * longwords,
     *n = scratch + 1 * longwords,
     *m = scratch + 2 * longwords;
 
-  reverse_words((unsigned long *)a_ints, a, longwords);
-  reverse_words((unsigned long *)n_ints, n, longwords);
+  reverse_words((julong *)a_ints, a, longwords);
+  reverse_words((julong *)n_ints, n, longwords);
 
   //montgomery_square fails to pass BigIntegerTest on solaris amd64
   //on jdk7 and jdk8.
@@ -4206,15 +4235,15 @@
 #else
   if (0) {
 #endif
-    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
+    ::montgomery_square(a, n, m, (julong)inv, longwords);
   } else {
-    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
+    ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
   }
 
-  reverse_words(m, (unsigned long *)m_ints, longwords);
+  reverse_words(m, (julong *)m_ints, longwords);
 }
 
-#endif // WINDOWS
+#endif // !(defined(_WINDOWS) && MSVC < VS2017)
 
 #ifdef COMPILER2
 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
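Note on the Windows sub() above: since MSVC offers no inline asm on x64, the
patch computes a - b as a + ~b + 1, one 64-bit limb at a time, seeding the
_addcarry_u64 chain with carry = 1 (the "+ 1" of the two's complement) and
folding the incoming carry word in last. A minimal sketch of the same identity
in standard C++, where addc() is a hypothetical stand-in for the MSVC
_addcarry_u64 intrinsic:

  #include <cassert>
  #include <cstdint>

  // Hypothetical stand-in for _addcarry_u64: returns the carry out of
  // a + b + carry_in and stores the low 64 bits of the sum in *out.
  static unsigned char addc(unsigned char carry_in, uint64_t a, uint64_t b,
                            uint64_t *out) {
    uint64_t s = a + b;
    unsigned char c = (s < a);   // carry out of a + b
    *out = s + carry_in;
    c |= (*out < s);             // carry out of adding carry_in
    return c;
  }

  // Subtract 0:b from carry:a as in the patched sub(): a - b == a + ~b + 1,
  // so the carry chain is seeded with c = 1.
  static uint64_t sub_sketch(uint64_t a[], const uint64_t b[],
                             uint64_t carry, long len) {
    unsigned char c = 1;
    uint64_t tmp = 0;
    for (long i = 0; i < len; i++) {
      c = addc(c, a[i], ~b[i], &tmp);
      a[i] = tmp;
    }
    c = addc(c, carry, ~(uint64_t)0, &tmp);  // fold in the carry word
    (void)c;
    return tmp;
  }

  int main() {
    uint64_t a[2] = { 5, 0 }, b[2] = { 3, 0 };
    uint64_t r = sub_sketch(a, b, 0, 2);     // (0:{5,0}) - (0:{3,0})
    assert(a[0] == 2 && a[1] == 0 && r == 0);
    return 0;
  }

Expressing the subtraction as additions also lets sub() share the single
_addcarry_u64 intrinsic with the MACC/MACC2 macros that follow it.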
--- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp	2021-01-11 15:16:51.612831075 -0500
+++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp	2021-01-11 15:16:51.530830795 -0500
@@ -4270,8 +4270,9 @@
     if (UseMulAddIntrinsic) {
       StubRoutines::_mulAdd = generate_mulAdd();
     }
-
-#ifndef _WINDOWS
+
+// Visual Studio 2017 (and higher) has the compiler intrinsics required
+#if !(defined(_WINDOWS) && _MSC_VER < 1910)
     if (UseMontgomeryMultiplyIntrinsic) {
       StubRoutines::_montgomeryMultiply
         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
@@ -4280,7 +4281,7 @@
       StubRoutines::_montgomerySquare
         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
     }
-#endif // WINDOWS
+#endif // ! VC++ < 2017
 
 #endif // COMPILER2
   }
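For reviewers checking the Windows MACC macro that the stub registration above
now enables: MACC(A, B, T0, T1, T2) computes T2:T1:T0 += A * B, with _umul128
producing the full 128-bit product and the two _addcarry_u64 calls rippling
the carries into the top limb; MACC2 folds the same product in twice. A
reference model of that arithmetic, assuming GCC/Clang's unsigned __int128
(illustrative only, not the Windows code path):

  #include <cassert>
  #include <cstdint>

  // Reference for MACC(A, B, T0, T1, T2): T2:T1:T0 += A * B.
  // unsigned __int128 is used only to model the carry propagation.
  static void macc_ref(uint64_t A, uint64_t B,
                       uint64_t &T0, uint64_t &T1, uint64_t &T2) {
    unsigned __int128 p   = (unsigned __int128)A * B;
    unsigned __int128 lo  = (unsigned __int128)T0 + (uint64_t)p;
    T0 = (uint64_t)lo;
    unsigned __int128 mid = (unsigned __int128)T1 + (uint64_t)(p >> 64)
                          + (uint64_t)(lo >> 64);
    T1 = (uint64_t)mid;
    T2 += (uint64_t)(mid >> 64);   // final carry lands in the top limb
  }

  int main() {
    // Worst case: both carries ripple all the way into T2.
    uint64_t t0 = ~0ULL, t1 = ~0ULL, t2 = 0;
    macc_ref(~0ULL, ~0ULL, t0, t1, t2);
    assert(t0 == 0 && t1 == 0xFFFFFFFFFFFFFFFEULL && t2 == 1);
    return 0;
  }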
--- old/test/compiler/intrinsics/montgomerymultiply/MontgomeryMultiplyTest.java	2021-01-11 15:16:51.944832206 -0500
+++ new/test/compiler/intrinsics/montgomerymultiply/MontgomeryMultiplyTest.java	2021-01-11 15:16:51.850831886 -0500
@@ -37,7 +37,7 @@
  * @test
  * @bug 8130150
  * @library /testlibrary
- * @requires (os.simpleArch == "x64") & (os.family != "windows")
+ * @requires (os.simpleArch == "x64")
  * @summary Verify that the Montgomery multiply intrinsic works and correctly checks its arguments.
  * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UseMontgomerySquareIntrinsic
  *                   -XX:+UseMontgomeryMultiplyIntrinsic MontgomeryMultiplyTest
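On the strengthened asserts in sharedRuntime_x86_64.cpp: ULLONG_MAX is -1 as a
julong, so inv * n[0] == ULLONG_MAX states that inv is -n[0]^-1 mod 2^64, the
constant Montgomery reduction needs (HotSpot receives inv from the caller). As
an illustration of why that inverse always exists for an odd modulus limb, a
sketch using Hensel/Newton lifting (a hypothetical helper, not part of the
patch):

  #include <cassert>
  #include <cstdint>

  // For odd n0, find x with n0 * x == 1 (mod 2^64) by Hensel/Newton lifting:
  // x' = x * (2 - n0 * x) doubles the number of correct low bits per step.
  static uint64_t inverse_mod_2_64(uint64_t n0) {
    assert((n0 & 1) != 0);          // the modulus limb must be odd
    uint64_t x = n0;                // n0 * n0 == 1 (mod 8): 3 bits correct
    for (int i = 0; i < 5; i++) {   // 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits
      x *= 2 - n0 * x;
    }
    return x;
  }

  int main() {
    uint64_t n0  = 0x123456789abcdef1ULL;      // arbitrary odd limb
    uint64_t inv = ~inverse_mod_2_64(n0) + 1;  // negate: inv = -x mod 2^64
    assert(inv * n0 == ~0ULL);                 // the asserted precondition
    return 0;
  }

Five iterations suffice because the seed x = n0 is already correct to 3 bits
(odd squares are 1 mod 8) and each step doubles that count.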