# HG changeset patch # User aph # Date 1434472313 -3600 # Tue Jun 16 17:31:53 2015 +0100 # Node ID fb79e92c0e7dfc8e29d87bd4490d3e9ad42f55df # Parent 5a9d5d58e667b7a49b5d07ccfdb02c00252a9800 8046943: Leverage CPU Instructions for GHASH and RSA Summary: Add montgomeryMultiply intrinsics Reviewed-by: kvn diff --git a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp --- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp +++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp @@ -3511,6 +3511,254 @@ } +//------------------------------Montgomery multiplication------------------------ +// + +#define ASM_SUBTRACT + +#ifdef ASM_SUBTRACT +// Subtract 0:b from carry:a. Return carry. +static unsigned long +sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) { + long i = 0, cnt = len; + unsigned long tmp; + asm volatile("clc; " + "0: ; " + "mov (%[b], %[i], 8), %[tmp]; " + "sbb %[tmp], (%[a], %[i], 8); " + "inc %[i]; dec %[cnt]; " + "jne 0b; " + "mov %[carry], %[tmp]; sbb $0, %[tmp]; " + : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp), "+m"(a) + : [a]"r"(a), [b]"r"(b), "m"(b), [carry]"r"(carry) + : "memory"); + return tmp; +} +#else // ASM_SUBTRACT +typedef int __attribute__((mode(TI))) int128; + +// Subtract 0:b from carry:a. Return carry. +static unsigned long +sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) { + int128 tmp = 0; + int i; + for (i = 0; i < len; i++) { + tmp += a[i]; + tmp -= b[i]; + a[i] = tmp; + tmp >>= 64; + assert(-1 <= tmp && tmp <= 0, "invariant"); + } + return tmp + carry; +} +#endif // ! ASM_SUBTRACT + +// Multiply (unsigned) Long A by Long B, accumulating the double- +// length result into the accumulator formed of T0, T1, and T2. +#define MACC(A, B, T0, T1, T2) \ +do { \ + unsigned long hi, lo; \ + __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ + : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ + : "r"(A), "a"(B) : "cc"); \ + } while(0) + +// As above, but add twice the double-length result into the +// accumulator. +#define MACC2(A, B, T0, T1, T2) \ +do { \ + unsigned long hi, lo; \ + __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \ + "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ + : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ + : "r"(A), "a"(B) : "cc"); \ + } while(0) + +// Fast Montgomery multiplication. The derivation of the algorithm is +// in A Cryptographic Library for the Motorola DSP56000, +// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. + +static void __attribute__((noinline)) +montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[], + unsigned long m[], unsigned long inv, int len) { + unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator + int i; + + assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); + + for (i = 0; i < len; i++) { + int j; + for (j = 0; j < i; j++) { + MACC(a[j], b[i-j], t0, t1, t2); + MACC(m[j], n[i-j], t0, t1, t2); + } + MACC(a[i], b[0], t0, t1, t2); + m[i] = t0 * inv; + MACC(m[i], n[0], t0, t1, t2); + + assert(t0 == 0, "broken Montgomery multiply"); + + t0 = t1; t1 = t2; t2 = 0; + } + + for (i = len; i < 2*len; i++) { + int j; + for (j = i-len+1; j < len; j++) { + MACC(a[j], b[i-j], t0, t1, t2); + MACC(m[j], n[i-j], t0, t1, t2); + } + m[i-len] = t0; + t0 = t1; t1 = t2; t2 = 0; + } + + while (t0) + t0 = sub(m, n, t0, len); +} + +// Fast Montgomery squaring. This uses asymptotically 25% fewer +// multiplies so it should be up to 25% faster than Montgomery +// multiplication. However, its loop control is more complex and it +// may actually run slower on some machines. + +static void __attribute__((noinline)) +montgomery_square(unsigned long a[], unsigned long n[], + unsigned long m[], unsigned long inv, int len) { + unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator + int i; + + assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); + + for (i = 0; i < len; i++) { + int j; + int end = (i+1)/2; + for (j = 0; j < end; j++) { + MACC2(a[j], a[i-j], t0, t1, t2); + MACC(m[j], n[i-j], t0, t1, t2); + } + if ((i & 1) == 0) { + MACC(a[j], a[j], t0, t1, t2); + } + for (; j < i; j++) { + MACC(m[j], n[i-j], t0, t1, t2); + } + m[i] = t0 * inv; + MACC(m[i], n[0], t0, t1, t2); + + assert(t0 == 0, "broken Montgomery multiply"); + + t0 = t1; t1 = t2; t2 = 0; + } + + for (i = len; i < 2*len; i++) { + int start = i-len+1; + int end = start + (len - start)/2; + int j; + for (j = start; j < end; j++) { + MACC2(a[j], a[i-j], t0, t1, t2); + MACC(m[j], n[i-j], t0, t1, t2); + } + if ((i & 1) == 0) { + MACC(a[j], a[j], t0, t1, t2); + } + for (; j < len; j++) { + MACC(m[j], n[i-j], t0, t1, t2); + } + m[i-len] = t0; + t0 = t1; t1 = t2; t2 = 0; + } + + while (t0) + t0 = sub(m, n, t0, len); +} + +// Swap words in a longword. +static unsigned long swap(unsigned long x) { + return (x << 32) | (x >> 32); +} + +// Copy len longwords from s to do, word-swapping as we go. The +// destination array is reversed. +static void reverse_words(unsigned long *s, unsigned long *d, int len) { + d += len; + while(len-- > 0) { + d--; + *d = swap(*s); + s++; + } +} + +// The threshold at which squaring is advantageous was determined +// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. +#define MONTGOMERY_SQUARING_THRESHOLD 64 + +void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, + jint len, jlong inv, + unsigned long *scratch, + jint *m_ints) { + assert(len % 2 == 0, "array length in montgomery_multiply must be even"); + int longwords = len/2; + + if (__builtin_expect(scratch == NULL, true)) { + // Make very sure we don't use so much space that the stack might + // overflow. 512 jints corresponds to an 16384-bit integer and + // will use here a total of 8k bytes of stack space. + int total_allocation = longwords * sizeof (unsigned long) * 4; + guarantee(total_allocation <= 8192, "must be"); + scratch = (unsigned long *)__builtin_alloca(total_allocation); + } + + // Local scratach arrays + unsigned long + *a = scratch + 0 * longwords, + *b = scratch + 1 * longwords, + *n = scratch + 2 * longwords, + *m = scratch + 3 * longwords; + + reverse_words((unsigned long *)a_ints, a, longwords); + reverse_words((unsigned long *)b_ints, b, longwords); + reverse_words((unsigned long *)n_ints, n, longwords); + + ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords); + + reverse_words(m, (unsigned long *)m_ints, longwords); +} + + +void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, + jint len, jlong inv, + unsigned long *scratch, + jint *m_ints) { + assert(len % 2 == 0, "array length in montgomery_square must be even"); + int longwords = len/2; + + if (__builtin_expect(scratch == NULL, true)) { + // Make very sure we don't use so much space that the stack might + // overflow. 512 jints corresponds to an 16384-bit integer and + // will use here a total of 6k bytes of stack space. + int total_allocation = longwords * sizeof (unsigned long) * 3; + guarantee(total_allocation <= 8192, "must be"); + scratch = (unsigned long *)__builtin_alloca(total_allocation); + } + + // Local scratach arrays + unsigned long + *a = scratch + 0 * longwords, + *n = scratch + 1 * longwords, + *m = scratch + 2 * longwords; + + reverse_words((unsigned long *)a_ints, a, longwords); + reverse_words((unsigned long *)n_ints, n, longwords); + + if (len >= MONTGOMERY_SQUARING_THRESHOLD) { + ::montgomery_square(a, n, m, (unsigned long)inv, longwords); + } else { + ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords); + } + + reverse_words(m, (unsigned long *)m_ints, longwords); +} + + #ifdef COMPILER2 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame // diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp @@ -4137,6 +4137,14 @@ if (UseMulAddIntrinsic) { StubRoutines::_mulAdd = generate_mulAdd(); } + if (UseMontgomeryMultiplyIntrinsic) { + StubRoutines::_montgomeryMultiply + = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply); + } + if (UseMontgomerySquareIntrinsic) { + StubRoutines::_montgomerySquare + = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); + } #endif } diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp --- a/src/cpu/x86/vm/vm_version_x86.cpp +++ b/src/cpu/x86/vm/vm_version_x86.cpp @@ -796,6 +796,12 @@ if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) { UseMulAddIntrinsic = true; } + if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) { + UseMontgomeryMultiplyIntrinsic = true; + } + if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) { + UseMontgomerySquareIntrinsic = true; + } #else if (UseMultiplyToLenIntrinsic) { if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { @@ -803,6 +809,18 @@ } FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false); } + if (UseMontgomeryMultiplyIntrinsic) { + if (!FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) { + warning("montgomeryMultiply intrinsic is not available in 32-bit VM"); + } + FLAG_SET_DEFAULT(UseMontgomeryMultiplyIntrinsic, false); + } + if (UseMontgomerySquareIntrinsic) { + if (!FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) { + warning("montgomerySquare intrinsic is not available in 32-bit VM"); + } + FLAG_SET_DEFAULT(UseMontgomerySquareIntrinsic, false); + } if (UseSquareToLenIntrinsic) { if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) { warning("squareToLen intrinsic is not available in 32-bit VM"); diff --git a/src/share/vm/classfile/vmSymbols.hpp b/src/share/vm/classfile/vmSymbols.hpp --- a/src/share/vm/classfile/vmSymbols.hpp +++ b/src/share/vm/classfile/vmSymbols.hpp @@ -808,6 +808,14 @@ do_name( mulAdd_name, "implMulAdd") \ do_signature(mulAdd_signature, "([I[IIII)I") \ \ + do_intrinsic(_montgomeryMultiply, java_math_BigInteger, montgomeryMultiply_name, montgomeryMultiply_signature, F_R) \ + do_name( montgomeryMultiply_name, "montgomeryMultiply") \ + do_signature(montgomeryMultiply_signature, "([I[I[IIJ[I)[I") \ + \ + do_intrinsic(_montgomerySquare, java_math_BigInteger, montgomerySquare_name, montgomerySquare_signature, F_R) \ + do_name( montgomerySquare_name, "montgomerySquare") \ + do_signature(montgomerySquare_signature, "([I[IIJ[I)[I") \ + \ /* java/lang/ref/Reference */ \ do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \ \ diff --git a/src/share/vm/opto/c2_globals.hpp b/src/share/vm/opto/c2_globals.hpp --- a/src/share/vm/opto/c2_globals.hpp +++ b/src/share/vm/opto/c2_globals.hpp @@ -671,6 +671,12 @@ product(bool, UseMulAddIntrinsic, false, \ "Enables intrinsification of BigInteger.mulAdd()") \ \ + product(bool, UseMontgomeryMultiplyIntrinsic, false, \ + "Enables intrinsification of BigInteger.montgomeryMultiply()") \ + \ + product(bool, UseMontgomerySquareIntrinsic, false, \ + "Enables intrinsification of BigInteger.montgomerySquare()") \ + \ product(bool, UseTypeSpeculation, true, \ "Speculatively propagate types from profiles") \ \ diff --git a/src/share/vm/opto/escape.cpp b/src/share/vm/opto/escape.cpp --- a/src/share/vm/opto/escape.cpp +++ b/src/share/vm/opto/escape.cpp @@ -974,8 +974,10 @@ strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 || strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0 || strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 || - strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0) - ))) { + strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 || + strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 || + strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0) + ))) { call->dump(); fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name)); } diff --git a/src/share/vm/opto/idealKit.hpp b/src/share/vm/opto/idealKit.hpp --- a/src/share/vm/opto/idealKit.hpp +++ b/src/share/vm/opto/idealKit.hpp @@ -210,6 +210,9 @@ Node* URShiftX(Node* l, Node* r) { return transform(new URShiftXNode(l, r)); } Node* ConX(jint k) { return (Node*)gvn().MakeConX(k); } Node* CastPX(Node* ctl, Node* p) { return transform(new CastP2XNode(ctl, p)); } + Node* CastII(Node* ctl, const Type* type, bool carry_dependency = false) { + return transform(new CastIINode(ctl, type, carry_dependency)); + } // Memory operations diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp --- a/src/share/vm/opto/library_call.cpp +++ b/src/share/vm/opto/library_call.cpp @@ -293,6 +293,8 @@ bool inline_multiplyToLen(); bool inline_squareToLen(); bool inline_mulAdd(); + bool inline_montgomeryMultiply(); + bool inline_montgomerySquare(); bool inline_profileBoolean(); bool inline_isCompileConstant(); @@ -504,6 +506,13 @@ if (!UseMulAddIntrinsic) return NULL; break; + case vmIntrinsics::_montgomeryMultiply: + if (!UseMontgomeryMultiplyIntrinsic) return NULL; + break; + case vmIntrinsics::_montgomerySquare: + if (!UseMontgomerySquareIntrinsic) return NULL; + break; + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: if (!UseAESIntrinsics) return NULL; @@ -929,6 +938,11 @@ case vmIntrinsics::_mulAdd: return inline_mulAdd(); + case vmIntrinsics::_montgomeryMultiply: + return inline_montgomeryMultiply(); + case vmIntrinsics::_montgomerySquare: + return inline_montgomerySquare(); + case vmIntrinsics::_encodeISOArray: return inline_encodeISOArray(); @@ -5416,6 +5430,270 @@ return true; } +//-------------inline_montgomeryMultiply----------------------------------- +bool LibraryCallKit::inline_montgomeryMultiply() { + address stubAddr = StubRoutines::montgomeryMultiply(); + assert(UseMontgomeryMultiplyIntrinsic, "not implementated on this platform"); + const char* stubName = "montgomery_multiply"; + + assert(callee()->signature()->size() == 7, "montgomeryMultiply has 7 parameters"); + + Node* a = argument(1); + Node* b = argument(2); + Node* n = argument(3); + Node* len = argument(4); + Node* inv = argument(5); + Node* m = argument(7); + + const Type* a_type = a->Value(&_gvn); + const Type* b_type = b->Value(&_gvn); + const TypeAryPtr* top_a = a_type->isa_aryptr(); + const TypeAryPtr* top_b = a_type->isa_aryptr(); + const Type* n_type = a->Value(&_gvn); + const TypeAryPtr* top_n = n_type->isa_aryptr(); + if (top_a == NULL || top_b->klass() == NULL || + top_b == NULL || top_b->klass() == NULL || + top_n == NULL || top_n->klass() == NULL) { + // failed array check + return false; + } + + BasicType a_elem = a_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); + BasicType b_elem = b_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); + BasicType n_elem = n_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); + if (a_elem != T_INT || b_elem != T_INT || n_elem != T_INT) { + return false; + } + + // Set the original stack and the reexecute bit for the interpreter + // to reexecute the bytecode that invokes + // BigInteger.montgomeryMultiply() if deoptimization happens on the + // return from array allocation in the runtime. + { PreserveReexecuteState preexecs(this); + jvms()->set_should_reexecute(true); + + Node* a_start = array_element_address(a, intcon(0), a_elem); + Node* b_start = array_element_address(b, intcon(0), b_elem); + Node* n_start = array_element_address(n, intcon(0), n_elem); + + // Allocate the result array + ciKlass* klass = ciTypeArrayKlass::make(T_INT); + Node* klass_node = makecon(TypeKlassPtr::make(klass)); + + ciKlass* long_array_klass = ciTypeArrayKlass::make(T_LONG); + Node* long_array_klass_node = makecon(TypeKlassPtr::make(long_array_klass)); + + IdealKit ideal(this); + +#define __ ideal. + + // Cast the type of len to the type of an array index. We do it + // here because otherwise new_array() will generate a CastII node + // inside a conditional. If that happens this CastII will not + // dominate its use (in the call to montgomeryMultiply()) so the + // compilation will fail. + const TypeAryPtr* ary_type = n_type->isa_aryptr(); + const TypeInt* len_type = _gvn.find_int_type(len); + const TypeInt* narrow_len_type = ary_type->narrow_size_type(len_type); + len = __ CastII(len, narrow_len_type); + + Node* one = __ ConI(1); + Node* zero = __ ConI(0); + IdealVariable need_alloc(ideal), m_alloc(ideal), + scratch_start(ideal), scratch_alloc(ideal); __ declarations_done(); + + __ set(need_alloc, zero); + __ set(m_alloc, m); + __ if_then(m, BoolTest::eq, null(), PROB_STATIC_INFREQUENT); { + __ increment (need_alloc, one); + } __ else_(); { + // Update graphKit memory and control from IdealKit. + sync_kit(ideal); + Node* mlen_arg = load_array_length(m); + // Update IdealKit memory and control from graphKit. + __ sync_kit(this); + __ if_then(mlen_arg, BoolTest::lt, len, PROB_MIN); { + __ increment (need_alloc, one); + } __ end_if(); + } __ end_if(); + + __ if_then(__ value(need_alloc), BoolTest::ne, zero, PROB_STATIC_INFREQUENT); { + // Update graphKit memory and control from IdealKit. + sync_kit(ideal); + Node * narr = new_array(klass_node, len, 1); + // Update IdealKit memory and control from graphKit. + __ sync_kit(this); + __ set(m_alloc, narr); + } __ end_if(); + + // We need some scratch space. If len is reasonably small + // montgomeryMultiply() will allocate scratch on its own stack; if + // len is large we must do it here or we risk a stack overflow. + __ set(scratch_start, null()); + __ if_then(len, BoolTest::gt, intcon(512), PROB_MIN); { + // Update graphKit memory and control from IdealKit. + sync_kit(ideal); + Node * narr = new_array(long_array_klass_node, len, 1); + // Update IdealKit memory and control from graphKit. + __ sync_kit(this); + __ set(scratch_alloc, narr); + __ set(scratch_start, + array_element_address(narr, intcon(0), T_LONG)); + } __ end_if(); + + m = __ value(m_alloc); + // Can't use TypeAryPtr::INTS which uses Bottom offset. + _gvn.set_type(m, TypeOopPtr::make_from_klass(klass)); + + Node *scratch = __ value(scratch_start); + + // Final sync IdealKit and GraphKit. + final_sync(ideal); +#undef __ + + Node* m_start = array_element_address(m, intcon(0), T_INT); + + Node* call = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::montgomeryMultiply_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + a_start, b_start, n_start, len, inv, top(), + scratch, m_start); + } // original reexecute is set back here + + C->set_has_split_ifs(true); // Has chance for split-if optimization + set_result(m); + return true; +} + + +bool LibraryCallKit::inline_montgomerySquare() { + address stubAddr = StubRoutines::montgomerySquare(); + assert(UseMontgomerySquareIntrinsic, "not implementated on this platform"); + const char* stubName = "montgomery_square"; + + assert(callee()->signature()->size() == 6, "montgomerySquare has 6 parameters"); + + Node* a = argument(1); + Node* n = argument(2); + Node* len = argument(3); + Node* inv = argument(4); + Node* m = argument(6); + + const Type* a_type = a->Value(&_gvn); + const TypeAryPtr* top_a = a_type->isa_aryptr(); + const Type* n_type = a->Value(&_gvn); + const TypeAryPtr* top_n = n_type->isa_aryptr(); + if (top_a == NULL || top_a->klass() == NULL || + top_n == NULL || top_n->klass() == NULL) { + // failed array check + return false; + } + + BasicType a_elem = a_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); + BasicType n_elem = n_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); + if (a_elem != T_INT || n_elem != T_INT) { + return false; + } + + // Set the original stack and the reexecute bit for the interpreter + // to reexecute the bytecode that invokes + // BigInteger.montgomerySquare() if deoptimization happens on the + // return from array allocation in the runtime. + { PreserveReexecuteState preexecs(this); + jvms()->set_should_reexecute(true); + + Node* a_start = array_element_address(a, intcon(0), a_elem); + Node* n_start = array_element_address(n, intcon(0), n_elem); + + // Allocate the result array + ciKlass* klass = ciTypeArrayKlass::make(T_INT); + Node* klass_node = makecon(TypeKlassPtr::make(klass)); + + ciKlass* long_array_klass = ciTypeArrayKlass::make(T_LONG); + Node* long_array_klass_node = makecon(TypeKlassPtr::make(long_array_klass)); + + IdealKit ideal(this); + +#define __ ideal. + + // Cast the type of len to the type of an array index. We do it + // here because otherwise new_array() will generate a CastII node + // inside a conditional. If that happens this CastII will not + // dominate its use (in the call to montgomerySquare()) so the + // compilation will fail. + const TypeAryPtr* ary_type = n_type->isa_aryptr(); + const TypeInt* len_type = _gvn.find_int_type(len); + const TypeInt* narrow_len_type = ary_type->narrow_size_type(len_type); + len = __ CastII(len, narrow_len_type); + + Node* one = __ ConI(1); + Node* zero = __ ConI(0); + IdealVariable need_alloc(ideal), m_alloc(ideal), + scratch_start(ideal), scratch_alloc(ideal); __ declarations_done(); + + __ set(need_alloc, zero); + __ set(m_alloc, m); + __ if_then(m, BoolTest::eq, null(), PROB_STATIC_INFREQUENT); { + __ increment (need_alloc, one); + } __ else_(); { + // Update graphKit memory and control from IdealKit. + sync_kit(ideal); + Node* mlen_arg = load_array_length(m); + // Update IdealKit memory and control from graphKit. + __ sync_kit(this); + __ if_then(mlen_arg, BoolTest::lt, len, PROB_MIN); { + __ increment (need_alloc, one); + } __ end_if(); + } __ end_if(); + + __ if_then(__ value(need_alloc), BoolTest::ne, zero, PROB_STATIC_INFREQUENT); { + // Update graphKit memory and control from IdealKit. + sync_kit(ideal); + Node * narr = new_array(klass_node, len, 1); + // Update IdealKit memory and control from graphKit. + __ sync_kit(this); + __ set(m_alloc, narr); + } __ end_if(); + + // We need some scratch space. If len is reasonably small + // montgomerySquare() will allocate scratch on its own stack; if + // len is large we must do it here or we risk a stack overflow. + __ set(scratch_start, null()); + __ if_then(len, BoolTest::gt, intcon(512), PROB_MIN); { + // Update graphKit memory and control from IdealKit. + sync_kit(ideal); + Node * narr = new_array(long_array_klass_node, len, 1); + // Update IdealKit memory and control from graphKit. + __ sync_kit(this); + __ set(scratch_alloc, narr); + __ set(scratch_start, + array_element_address(narr, intcon(0), T_LONG)); + } __ end_if(); + + m = __ value(m_alloc); + // Can't use TypeAryPtr::INTS which uses Bottom offset. + _gvn.set_type(m, TypeOopPtr::make_from_klass(klass)); + + Node *scratch = __ value(scratch_start); + + // Final sync IdealKit and GraphKit. + final_sync(ideal); +#undef __ + + Node* m_start = array_element_address(m, intcon(0), T_INT); + + Node* call = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::montgomerySquare_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + a_start, n_start, len, inv, top(), + scratch, m_start); + } // original reexecute is set back here + + C->set_has_split_ifs(true); // Has chance for split-if optimization + set_result(m); + return true; +} + /** * Calculate CRC32 for byte. diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp --- a/src/share/vm/opto/runtime.cpp +++ b/src/share/vm/opto/runtime.cpp @@ -100,6 +100,8 @@ address OptoRuntime::_slow_arraycopy_Java = NULL; address OptoRuntime::_register_finalizer_Java = NULL; +address OptoRuntime::_montgomeryMultiply_Java = NULL; +address OptoRuntime::_montgomerySquare_Java = NULL; # ifdef ENABLE_ZAP_DEAD_LOCALS address OptoRuntime::_zap_dead_Java_locals_Java = NULL; @@ -987,6 +989,54 @@ return TypeFunc::make(domain, range); } +const TypeFunc* OptoRuntime::montgomeryMultiply_Type() { + // create input type (domain) + int num_args = 8; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // a + fields[argp++] = TypePtr::NOTNULL; // b + fields[argp++] = TypePtr::NOTNULL; // n + fields[argp++] = TypeInt::INT; // len + fields[argp++] = TypeLong::LONG; // inv + fields[argp++] = Type::HALF; + fields[argp++] = TypePtr::NOTNULL; // scratch + fields[argp++] = TypePtr::NOTNULL; // result + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = TypePtr::NOTNULL; + + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + +const TypeFunc* OptoRuntime::montgomerySquare_Type() { + // create input type (domain) + int num_args = 7; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // a + fields[argp++] = TypePtr::NOTNULL; // n + fields[argp++] = TypeInt::INT; // len + fields[argp++] = TypeLong::LONG; // inv + fields[argp++] = Type::HALF; + fields[argp++] = TypePtr::NOTNULL; // scratch + fields[argp++] = TypePtr::NOTNULL; // result + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = TypePtr::NOTNULL; + + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} //------------- Interpreter state access for on stack replacement diff --git a/src/share/vm/opto/runtime.hpp b/src/share/vm/opto/runtime.hpp --- a/src/share/vm/opto/runtime.hpp +++ b/src/share/vm/opto/runtime.hpp @@ -150,6 +150,9 @@ static address _slow_arraycopy_Java; static address _register_finalizer_Java; + static address _montgomeryMultiply_Java; + static address _montgomerySquare_Java; + # ifdef ENABLE_ZAP_DEAD_LOCALS static address _zap_dead_Java_locals_Java; static address _zap_dead_native_locals_Java; @@ -248,6 +251,8 @@ static address slow_arraycopy_Java() { return _slow_arraycopy_Java; } static address register_finalizer_Java() { return _register_finalizer_Java; } + static address montgomeryMultiply_Java() { return _montgomeryMultiply_Java; } + static address montgomerySquare_Java() { return _montgomerySquare_Java; } # ifdef ENABLE_ZAP_DEAD_LOCALS @@ -311,6 +316,8 @@ static const TypeFunc* digestBase_implCompressMB_Type(); static const TypeFunc* multiplyToLen_Type(); + static const TypeFunc* montgomeryMultiply_Type(); + static const TypeFunc* montgomerySquare_Type(); static const TypeFunc* squareToLen_Type(); diff --git a/src/share/vm/runtime/sharedRuntime.hpp b/src/share/vm/runtime/sharedRuntime.hpp --- a/src/share/vm/runtime/sharedRuntime.hpp +++ b/src/share/vm/runtime/sharedRuntime.hpp @@ -145,6 +145,12 @@ static double dsqrt(double f); #endif + // Montgomery multiplication + static void montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, + jint len, jlong inv, unsigned long *scratch, jint *m_ints); + static void montgomery_square(jint *a_ints, jint *n_ints, + jint len, jlong inv, unsigned long *scratch, jint *m_ints); + #ifdef __SOFTFP__ // C++ compiler generates soft float instructions as well as passing // float and double in registers. diff --git a/src/share/vm/runtime/stubRoutines.cpp b/src/share/vm/runtime/stubRoutines.cpp --- a/src/share/vm/runtime/stubRoutines.cpp +++ b/src/share/vm/runtime/stubRoutines.cpp @@ -139,6 +139,8 @@ address StubRoutines::_multiplyToLen = NULL; address StubRoutines::_squareToLen = NULL; address StubRoutines::_mulAdd = NULL; +address StubRoutines::_montgomeryMultiply = NULL; +address StubRoutines::_montgomerySquare = NULL; double (* StubRoutines::_intrinsic_log )(double) = NULL; double (* StubRoutines::_intrinsic_log10 )(double) = NULL; diff --git a/src/share/vm/runtime/stubRoutines.hpp b/src/share/vm/runtime/stubRoutines.hpp --- a/src/share/vm/runtime/stubRoutines.hpp +++ b/src/share/vm/runtime/stubRoutines.hpp @@ -199,6 +199,8 @@ static address _multiplyToLen; static address _squareToLen; static address _mulAdd; + static address _montgomeryMultiply; + static address _montgomerySquare; // These are versions of the java.lang.Math methods which perform // the same operations as the intrinsic version. They are used for @@ -360,6 +362,8 @@ static address multiplyToLen() {return _multiplyToLen; } static address squareToLen() {return _squareToLen; } static address mulAdd() {return _mulAdd; } + static address montgomeryMultiply() { return _montgomeryMultiply; } + static address montgomerySquare() { return _montgomerySquare; } static address select_fill_function(BasicType t, bool aligned, const char* &name);