# HG changeset patch
# User aph
# Date 1434472313 -3600
#      Tue Jun 16 17:31:53 2015 +0100
# Node ID fb79e92c0e7dfc8e29d87bd4490d3e9ad42f55df
# Parent  5a9d5d58e667b7a49b5d07ccfdb02c00252a9800
8046943: Leverage CPU Instructions for GHASH and RSA
Summary: Add montgomeryMultiply intrinsics
Reviewed-by: kvn

diff --git a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp
@@ -3511,6 +3511,254 @@
 }
 
 
+//------------------------------Montgomery multiplication------------------------
+//
+
+#define ASM_SUBTRACT
+
+#ifdef ASM_SUBTRACT
+// Subtract 0:b from carry:a, storing the difference in a.  Return carry.
+static unsigned long
+sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
+  long i = 0, cnt = len;
+  unsigned long tmp;
+  asm volatile("clc; "                                  // start with no borrow
+               "0: ; "                                  // loop head
+               "mov (%[b], %[i], 8), %[tmp]; "
+               "sbb %[tmp], (%[a], %[i], 8); "          // a[i] -= b[i] + borrow
+               "inc %[i]; dec %[cnt]; "                 // inc/dec do not modify CF
+               "jne 0b; "                               // bottom-tested: needs len > 0
+               "mov %[carry], %[tmp]; sbb $0, %[tmp]; " // tmp = carry - final borrow
+               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp), "+m"(a)
+               : [a]"r"(a), [b]"r"(b), "m"(b), [carry]"r"(carry)
+               : "memory");
+  return tmp;
+}
+#else // ASM_SUBTRACT
+typedef int __attribute__((mode(TI))) int128;
+
+// Subtract 0:b from carry:a.  Return carry.
+static unsigned long
+sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
+  int128 tmp = 0;
+  int i;
+  for (i = 0; i < len; i++) {
+    tmp += a[i];
+    tmp -= b[i];
+    a[i] = tmp;
+    tmp >>= 64;
+    assert(-1 <= tmp && tmp <= 0, "invariant");
+  }
+  return tmp + carry;
+}
+#endif // ! ASM_SUBTRACT
+
+// Multiply (unsigned) Long A by Long B, accumulating the double-
+// length result into the accumulator formed of T0, T1, and T2.
+#define MACC(A, B, T0, T1, T2)                                  \
+do {                                                            \
+  unsigned long hi, lo;                                         \
+  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
+           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
+           : "r"(A), "a"(B) : "cc");                            \
+ } while(0)
+
+// As above, but add twice the double-length result into the
+// accumulator.
+#define MACC2(A, B, T0, T1, T2)                                 \
+do {                                                            \
+  unsigned long hi, lo;                                         \
+  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
+           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
+           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
+           : "r"(A), "a"(B) : "cc");                            \
+ } while(0)
+
+// Fast Montgomery multiplication.  The derivation of the algorithm is
+// in  A Cryptographic Library for the Motorola DSP56000,
+// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
+
+static void __attribute__((noinline))
+montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
+                    unsigned long m[], unsigned long inv, int len) {
+  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+  int i;
+
+  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+
+  for (i = 0; i < len; i++) {              // columns 0 .. len-1
+    int j;
+    for (j = 0; j < i; j++) {
+      MACC(a[j], b[i-j], t0, t1, t2);
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    MACC(a[i], b[0], t0, t1, t2);
+    m[i] = t0 * inv;                       // chosen so that m[i]*n[0] cancels t0
+    MACC(m[i], n[0], t0, t1, t2);
+
+    assert(t0 == 0, "broken Montgomery multiply");
+
+    t0 = t1; t1 = t2; t2 = 0;              // shift the accumulator down one word
+  }
+
+  for (i = len; i < 2*len; i++) {          // columns len .. 2*len-1
+    int j;
+    for (j = i-len+1; j < len; j++) {
+      MACC(a[j], b[i-j], t0, t1, t2);
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    m[i-len] = t0;                         // overwrite m with the result word
+    t0 = t1; t1 = t2; t2 = 0;
+  }
+
+  while (t0)                               // left-over carry: subtract n until it is gone
+    t0 = sub(m, n, t0, len);
+}
+
+// Fast Montgomery squaring.  This uses asymptotically 25% fewer
+// multiplies so it should be up to 25% faster than Montgomery
+// multiplication.  However, its loop control is more complex and it
+// may actually run slower on some machines.
+
+static void __attribute__((noinline))
+montgomery_square(unsigned long a[], unsigned long n[],
+                  unsigned long m[], unsigned long inv, int len) {
+  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+  int i;
+
+  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+
+  for (i = 0; i < len; i++) {              // columns 0 .. len-1
+    int j;
+    int end = (i+1)/2;                     // j < end are the off-diagonal pairs
+    for (j = 0; j < end; j++) {
+      MACC2(a[j], a[i-j], t0, t1, t2);     // a[j]*a[i-j] is added twice
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    if ((i & 1) == 0) {                    // even column: single diagonal term
+      MACC(a[j], a[j], t0, t1, t2);
+    }
+    for (; j < i; j++) {                   // remaining m*n products of this column
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    m[i] = t0 * inv;                       // chosen so that m[i]*n[0] cancels t0
+    MACC(m[i], n[0], t0, t1, t2);
+
+    assert(t0 == 0, "broken Montgomery multiply");
+
+    t0 = t1; t1 = t2; t2 = 0;              // shift the accumulator down one word
+  }
+
+  for (i = len; i < 2*len; i++) {          // columns len .. 2*len-1
+    int start = i-len+1;
+    int end = start + (len - start)/2;     // j < end are the off-diagonal pairs
+    int j;
+    for (j = start; j < end; j++) {
+      MACC2(a[j], a[i-j], t0, t1, t2);     // a[j]*a[i-j] is added twice
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    if ((i & 1) == 0) {                    // even column: single diagonal term
+      MACC(a[j], a[j], t0, t1, t2);
+    }
+    for (; j < len; j++) {                 // remaining m*n products of this column
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    m[i-len] = t0;                         // overwrite m with the result word
+    t0 = t1; t1 = t2; t2 = 0;
+  }
+
+  while (t0)                               // left-over carry: subtract n until it is gone
+    t0 = sub(m, n, t0, len);
+}
+
+// Swap the two 32-bit halves of a longword (assumes a 64-bit unsigned long).
+static unsigned long swap(unsigned long x) {
+  return (x << 32) | (x >> 32);
+}
+
+// Copy len longwords from s to d, word-swapping as we go.  The
+// destination array is reversed.
+static void reverse_words(unsigned long *s, unsigned long *d, int len) {
+  d += len;                // start one past the end of the destination
+  while(len-- > 0) {
+    d--;                   // fill d from its last element backwards
+    *d = swap(*s);
+    s++;
+  }
+}
+
+// The threshold at which squaring is advantageous was determined
+// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
+#define MONTGOMERY_SQUARING_THRESHOLD 64
+
+void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
+                                        jint len, jlong inv,
+                                        unsigned long *scratch,
+                                        jint *m_ints) {
+  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
+  int longwords = len/2;   // the jint arrays are processed as len/2 unsigned longs
+
+  if (__builtin_expect(scratch == NULL, true)) {
+    // Make very sure we don't use so much space that the stack might
+    // overflow.  512 jints corresponds to a 16384-bit integer and
+    // will use here a total of 8k bytes of stack space.
+    int total_allocation = longwords * sizeof (unsigned long) * 4;
+    guarantee(total_allocation <= 8192, "must be");
+    scratch = (unsigned long *)__builtin_alloca(total_allocation);
+  }
+
+  // Local scratch arrays; a caller-supplied scratch must hold 4 * longwords words.
+  unsigned long
+    *a = scratch + 0 * longwords,
+    *b = scratch + 1 * longwords,
+    *n = scratch + 2 * longwords,
+    *m = scratch + 3 * longwords;
+
+  reverse_words((unsigned long *)a_ints, a, longwords);
+  reverse_words((unsigned long *)b_ints, b, longwords);
+  reverse_words((unsigned long *)n_ints, n, longwords);
+
+  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
+
+  reverse_words(m, (unsigned long *)m_ints, longwords);  // result back into m_ints
+}
+
+
+void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
+                                      jint len, jlong inv,
+                                      unsigned long *scratch,
+                                      jint *m_ints) {
+  assert(len % 2 == 0, "array length in montgomery_square must be even");
+  int longwords = len/2;   // the jint arrays are processed as len/2 unsigned longs
+
+  if (__builtin_expect(scratch == NULL, true)) {
+    // Make very sure we don't use so much space that the stack might
+    // overflow.  512 jints corresponds to a 16384-bit integer and
+    // will use here a total of 6k bytes of stack space.
+    int total_allocation = longwords * sizeof (unsigned long) * 3;
+    guarantee(total_allocation <= 8192, "must be");
+    scratch = (unsigned long *)__builtin_alloca(total_allocation);
+  }
+
+  // Local scratch arrays; a caller-supplied scratch must hold 3 * longwords words.
+  unsigned long
+    *a = scratch + 0 * longwords,
+    *n = scratch + 1 * longwords,
+    *m = scratch + 2 * longwords;
+
+  reverse_words((unsigned long *)a_ints, a, longwords);
+  reverse_words((unsigned long *)n_ints, n, longwords);
+
+  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {  // threshold is compared against len in jints
+    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
+  } else {                                     // short operands: multiply a by itself
+    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
+  }
+
+  reverse_words(m, (unsigned long *)m_ints, longwords);  // result back into m_ints
+}
+
+
 #ifdef COMPILER2
 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
 //
diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -4137,6 +4137,14 @@
     if (UseMulAddIntrinsic) {
       StubRoutines::_mulAdd = generate_mulAdd();
     }
+    if (UseMontgomeryMultiplyIntrinsic) {
+      StubRoutines::_montgomeryMultiply
+        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
+    }
+    if (UseMontgomerySquareIntrinsic) {
+      StubRoutines::_montgomerySquare
+        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
+    }
 #endif
   }
 
diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp
--- a/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/src/cpu/x86/vm/vm_version_x86.cpp
@@ -796,6 +796,12 @@
   if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
     UseMulAddIntrinsic = true;
   }
+  if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
+    UseMontgomeryMultiplyIntrinsic = true;
+  }
+  if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
+    UseMontgomerySquareIntrinsic = true;
+  }
 #else
   if (UseMultiplyToLenIntrinsic) {
     if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
@@ -803,6 +809,18 @@
     }
     FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
   }
+  if (UseMontgomeryMultiplyIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
+      warning("montgomeryMultiply intrinsic is not available in 32-bit VM");
+    }
+    FLAG_SET_DEFAULT(UseMontgomeryMultiplyIntrinsic, false);
+  }
+  if (UseMontgomerySquareIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
+      warning("montgomerySquare intrinsic is not available in 32-bit VM");
+    }
+    FLAG_SET_DEFAULT(UseMontgomerySquareIntrinsic, false);
+  }
   if (UseSquareToLenIntrinsic) {
     if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
       warning("squareToLen intrinsic is not available in 32-bit VM");
diff --git a/src/share/vm/classfile/vmSymbols.hpp b/src/share/vm/classfile/vmSymbols.hpp
--- a/src/share/vm/classfile/vmSymbols.hpp
+++ b/src/share/vm/classfile/vmSymbols.hpp
@@ -808,6 +808,14 @@
    do_name(     mulAdd_name,                                  "implMulAdd")                                             \
    do_signature(mulAdd_signature,                             "([I[IIII)I")                                             \
                                                                                                                         \
+  do_intrinsic(_montgomeryMultiply,      java_math_BigInteger, montgomeryMultiply_name, montgomeryMultiply_signature, F_R) \
+   do_name(     montgomeryMultiply_name,                             "montgomeryMultiply")                              \
+   do_signature(montgomeryMultiply_signature,                        "([I[I[IIJ[I)[I")                                  \
+                                                                                                                        \
+  do_intrinsic(_montgomerySquare,      java_math_BigInteger, montgomerySquare_name, montgomerySquare_signature, F_R)    \
+   do_name(     montgomerySquare_name,                             "montgomerySquare")                                  \
+   do_signature(montgomerySquare_signature,                        "([I[IIJ[I)[I")                                      \
+                                                                                                                        \
   /* java/lang/ref/Reference */                                                                                         \
   do_intrinsic(_Reference_get,            java_lang_ref_Reference, get_name,    void_object_signature, F_R)             \
                                                                                                                         \
diff --git a/src/share/vm/opto/c2_globals.hpp b/src/share/vm/opto/c2_globals.hpp
--- a/src/share/vm/opto/c2_globals.hpp
+++ b/src/share/vm/opto/c2_globals.hpp
@@ -671,6 +671,12 @@
   product(bool, UseMulAddIntrinsic, false,                                  \
           "Enables intrinsification of BigInteger.mulAdd()")                \
                                                                             \
+  product(bool, UseMontgomeryMultiplyIntrinsic, false,                      \
+          "Enables intrinsification of BigInteger.montgomeryMultiply()")    \
+                                                                            \
+  product(bool, UseMontgomerySquareIntrinsic, false,                        \
+          "Enables intrinsification of BigInteger.montgomerySquare()")      \
+                                                                            \
   product(bool, UseTypeSpeculation, true,                                   \
           "Speculatively propagate types from profiles")                    \
                                                                             \
diff --git a/src/share/vm/opto/escape.cpp b/src/share/vm/opto/escape.cpp
--- a/src/share/vm/opto/escape.cpp
+++ b/src/share/vm/opto/escape.cpp
@@ -974,8 +974,10 @@
                   strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
-                  strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0)
-                  ))) {
+                  strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0)
+                 ))) {
             call->dump();
             fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
           }
diff --git a/src/share/vm/opto/idealKit.hpp b/src/share/vm/opto/idealKit.hpp
--- a/src/share/vm/opto/idealKit.hpp
+++ b/src/share/vm/opto/idealKit.hpp
@@ -210,6 +210,9 @@
   Node* URShiftX(Node* l, Node* r) { return transform(new URShiftXNode(l, r)); }
   Node* ConX(jint k) { return (Node*)gvn().MakeConX(k); }
   Node* CastPX(Node* ctl, Node* p) { return transform(new CastP2XNode(ctl, p)); }
+  Node* CastII(Node* n, const Type* type, bool carry_dependency = false) { // n: int value to cast
+    return transform(new CastIINode(n, type, carry_dependency));
+  }
 
   // Memory operations
 
diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp
--- a/src/share/vm/opto/library_call.cpp
+++ b/src/share/vm/opto/library_call.cpp
@@ -293,6 +293,8 @@
   bool inline_multiplyToLen();
   bool inline_squareToLen();
   bool inline_mulAdd();
+  bool inline_montgomeryMultiply();
+  bool inline_montgomerySquare();
 
   bool inline_profileBoolean();
   bool inline_isCompileConstant();
@@ -504,6 +506,13 @@
     if (!UseMulAddIntrinsic) return NULL;
     break;
 
+  case vmIntrinsics::_montgomeryMultiply:
+     if (!UseMontgomeryMultiplyIntrinsic) return NULL;
+    break;
+  case vmIntrinsics::_montgomerySquare:
+     if (!UseMontgomerySquareIntrinsic) return NULL;
+    break;
+
   case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     if (!UseAESIntrinsics) return NULL;
@@ -929,6 +938,11 @@
   case vmIntrinsics::_mulAdd:
     return inline_mulAdd();
 
+  case vmIntrinsics::_montgomeryMultiply:
+    return inline_montgomeryMultiply();
+  case vmIntrinsics::_montgomerySquare:
+    return inline_montgomerySquare();
+
   case vmIntrinsics::_encodeISOArray:
     return inline_encodeISOArray();
 
@@ -5416,6 +5430,270 @@
   return true;
 }
 
+//-------------inline_montgomeryMultiply-----------------------------------
+bool LibraryCallKit::inline_montgomeryMultiply() {
+  address stubAddr = StubRoutines::montgomeryMultiply();
+  assert(UseMontgomeryMultiplyIntrinsic, "not implementated on this platform");
+  const char* stubName = "montgomery_multiply";
+
+  assert(callee()->signature()->size() == 7, "montgomeryMultiply has 7 parameters");
+
+  Node* a    = argument(1);
+  Node* b    = argument(2);
+  Node* n    = argument(3);
+  Node* len  = argument(4);
+  Node* inv  = argument(5);   // long: occupies slots 5 and 6
+  Node* m    = argument(7);
+
+  const Type* a_type = a->Value(&_gvn);
+  const Type* b_type = b->Value(&_gvn);
+  const TypeAryPtr* top_a = a_type->isa_aryptr();
+  const TypeAryPtr* top_b = b_type->isa_aryptr();  // each array is checked via its own type
+  const Type* n_type = n->Value(&_gvn);
+  const TypeAryPtr* top_n = n_type->isa_aryptr();
+  if (top_a == NULL || top_a->klass() == NULL ||
+      top_b == NULL || top_b->klass() == NULL ||
+      top_n == NULL || top_n->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType a_elem = a_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType b_elem = b_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType n_elem = n_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (a_elem != T_INT || b_elem != T_INT || n_elem != T_INT) {
+    return false;
+  }
+
+  // Set the original stack and the reexecute bit for the interpreter
+  // to reexecute the bytecode that invokes
+  // BigInteger.montgomeryMultiply() if deoptimization happens on the
+  // return from array allocation in the runtime.
+  { PreserveReexecuteState preexecs(this);
+    jvms()->set_should_reexecute(true);
+
+    Node* a_start = array_element_address(a, intcon(0), a_elem);
+    Node* b_start = array_element_address(b, intcon(0), b_elem);
+    Node* n_start = array_element_address(n, intcon(0), n_elem);
+
+    // Allocate the result array
+    ciKlass* klass = ciTypeArrayKlass::make(T_INT);
+    Node* klass_node = makecon(TypeKlassPtr::make(klass));
+
+    ciKlass* long_array_klass = ciTypeArrayKlass::make(T_LONG);
+    Node* long_array_klass_node = makecon(TypeKlassPtr::make(long_array_klass));
+
+    IdealKit ideal(this);
+
+#define __ ideal.
+
+    // Cast the type of len to the type of an array index.  We do it
+    // here because otherwise new_array() will generate a CastII node
+    // inside a conditional.  If that happens this CastII will not
+    // dominate its use (in the call to montgomeryMultiply()) so the
+    // compilation will fail.
+    const TypeAryPtr* ary_type = n_type->isa_aryptr();
+    const TypeInt* len_type = _gvn.find_int_type(len);
+    const TypeInt* narrow_len_type = ary_type->narrow_size_type(len_type);
+    len = __ CastII(len, narrow_len_type);
+
+    Node* one = __ ConI(1);
+    Node* zero = __ ConI(0);
+    IdealVariable need_alloc(ideal), m_alloc(ideal),
+      scratch_start(ideal), scratch_alloc(ideal);  __ declarations_done();
+
+    // need_alloc becomes non-zero when m is null or shorter than len.
+    __ set(need_alloc, zero);
+    __ set(m_alloc, m);
+    __ if_then(m, BoolTest::eq, null(), PROB_STATIC_INFREQUENT); {
+      __ increment (need_alloc, one);
+    } __ else_(); {
+      // Update graphKit memory and control from IdealKit.
+      sync_kit(ideal);
+      Node* mlen_arg = load_array_length(m);
+      // Update IdealKit memory and control from graphKit.
+      __ sync_kit(this);
+      __ if_then(mlen_arg, BoolTest::lt, len, PROB_MIN); {
+        __ increment (need_alloc, one);
+      } __ end_if();
+    } __ end_if();
+
+    __ if_then(__ value(need_alloc), BoolTest::ne, zero, PROB_STATIC_INFREQUENT); {
+      // Update graphKit memory and control from IdealKit.
+      sync_kit(ideal);
+      Node * narr = new_array(klass_node, len, 1);
+      // Update IdealKit memory and control from graphKit.
+      __ sync_kit(this);
+      __ set(m_alloc, narr);
+    } __ end_if();
+
+    // We need some scratch space.  If len is reasonably small
+    // montgomeryMultiply() will allocate scratch on its own stack; if
+    // len is large we must do it here or we risk a stack overflow.
+    __ set(scratch_start, null());
+    __ if_then(len, BoolTest::gt, intcon(512), PROB_MIN); {
+      sync_kit(ideal);  // update graphKit memory and control from IdealKit
+      // The callee splits scratch into 4 arrays of len/2 longs, i.e. 2*len longs.
+      Node* scratch_len = _gvn.transform(new LShiftINode(len, intcon(1)));
+      Node * narr = new_array(long_array_klass_node, scratch_len, 1);
+      __ sync_kit(this);  // update IdealKit memory and control from graphKit
+      __ set(scratch_alloc, narr);
+      __ set(scratch_start,
+             array_element_address(narr, intcon(0), T_LONG));
+    } __ end_if();
+
+    m = __ value(m_alloc);
+    // Can't use TypeAryPtr::INTS which uses Bottom offset.
+    _gvn.set_type(m, TypeOopPtr::make_from_klass(klass));
+
+    Node *scratch = __ value(scratch_start);
+
+    // Final sync IdealKit and GraphKit.
+    final_sync(ideal);
+#undef __
+
+    Node* m_start = array_element_address(m, intcon(0), T_INT);
+
+    Node* call = make_runtime_call(RC_LEAF|RC_NO_FP,
+                                   OptoRuntime::montgomeryMultiply_Type(),
+                                   stubAddr, stubName, TypePtr::BOTTOM,
+                                   a_start, b_start, n_start, len, inv, top(),
+                                   scratch, m_start);
+  } // original reexecute is set back here
+
+  C->set_has_split_ifs(true); // Has chance for split-if optimization
+  set_result(m);
+  return true;
+}
+
+
+bool LibraryCallKit::inline_montgomerySquare() {
+  address stubAddr = StubRoutines::montgomerySquare();
+  assert(UseMontgomerySquareIntrinsic, "not implementated on this platform");
+  const char* stubName = "montgomery_square";
+
+  assert(callee()->signature()->size() == 6, "montgomerySquare has 6 parameters");
+
+  Node* a    = argument(1);
+  Node* n    = argument(2);
+  Node* len  = argument(3);
+  Node* inv  = argument(4);   // long: occupies slots 4 and 5
+  Node* m    = argument(6);
+
+  const Type* a_type = a->Value(&_gvn);
+  const TypeAryPtr* top_a = a_type->isa_aryptr();
+  const Type* n_type = n->Value(&_gvn);  // n is checked via its own type, not a's
+  const TypeAryPtr* top_n = n_type->isa_aryptr();
+  if (top_a == NULL || top_a->klass() == NULL ||
+      top_n == NULL || top_n->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType a_elem = a_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType n_elem = n_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (a_elem != T_INT || n_elem != T_INT) {
+    return false;
+  }
+
+  // Set the original stack and the reexecute bit for the interpreter
+  // to reexecute the bytecode that invokes
+  // BigInteger.montgomerySquare() if deoptimization happens on the
+  // return from array allocation in the runtime.
+  { PreserveReexecuteState preexecs(this);
+    jvms()->set_should_reexecute(true);
+
+    Node* a_start = array_element_address(a, intcon(0), a_elem);
+    Node* n_start = array_element_address(n, intcon(0), n_elem);
+
+    // Allocate the result array
+    ciKlass* klass = ciTypeArrayKlass::make(T_INT);
+    Node* klass_node = makecon(TypeKlassPtr::make(klass));
+
+    ciKlass* long_array_klass = ciTypeArrayKlass::make(T_LONG);
+    Node* long_array_klass_node = makecon(TypeKlassPtr::make(long_array_klass));
+
+    IdealKit ideal(this);
+
+#define __ ideal.
+
+    // Cast the type of len to the type of an array index.  We do it
+    // here because otherwise new_array() will generate a CastII node
+    // inside a conditional.  If that happens this CastII will not
+    // dominate its use (in the call to montgomerySquare()) so the
+    // compilation will fail.
+    const TypeAryPtr* ary_type = n_type->isa_aryptr();
+    const TypeInt* len_type = _gvn.find_int_type(len);
+    const TypeInt* narrow_len_type = ary_type->narrow_size_type(len_type);
+    len = __ CastII(len, narrow_len_type);
+
+    Node* one = __ ConI(1);
+    Node* zero = __ ConI(0);
+    IdealVariable need_alloc(ideal), m_alloc(ideal),
+      scratch_start(ideal), scratch_alloc(ideal);  __ declarations_done();
+
+    // need_alloc becomes non-zero when m is null or shorter than len.
+    __ set(need_alloc, zero);
+    __ set(m_alloc, m);
+    __ if_then(m, BoolTest::eq, null(), PROB_STATIC_INFREQUENT); {
+      __ increment (need_alloc, one);
+    } __ else_(); {
+      // Update graphKit memory and control from IdealKit.
+      sync_kit(ideal);
+      Node* mlen_arg = load_array_length(m);
+      // Update IdealKit memory and control from graphKit.
+      __ sync_kit(this);
+      __ if_then(mlen_arg, BoolTest::lt, len, PROB_MIN); {
+        __ increment (need_alloc, one);
+      } __ end_if();
+    } __ end_if();
+
+    __ if_then(__ value(need_alloc), BoolTest::ne, zero, PROB_STATIC_INFREQUENT); {
+      // Update graphKit memory and control from IdealKit.
+      sync_kit(ideal);
+      Node * narr = new_array(klass_node, len, 1);
+      // Update IdealKit memory and control from graphKit.
+      __ sync_kit(this);
+      __ set(m_alloc, narr);
+    } __ end_if();
+
+    // We need some scratch space.  If len is reasonably small
+    // montgomerySquare() will allocate scratch on its own stack; if
+    // len is large we must do it here or we risk a stack overflow.
+    __ set(scratch_start, null());
+    __ if_then(len, BoolTest::gt, intcon(512), PROB_MIN); {
+      sync_kit(ideal);  // update graphKit memory and control from IdealKit
+      // The callee uses 3 arrays of len/2 longs (1.5*len); 2*len longs covers it.
+      Node* scratch_len = _gvn.transform(new LShiftINode(len, intcon(1)));
+      Node * narr = new_array(long_array_klass_node, scratch_len, 1);
+      __ sync_kit(this);  // update IdealKit memory and control from graphKit
+      __ set(scratch_alloc, narr);
+      __ set(scratch_start,
+             array_element_address(narr, intcon(0), T_LONG));
+    } __ end_if();
+
+    m = __ value(m_alloc);
+    // Can't use TypeAryPtr::INTS which uses Bottom offset.
+    _gvn.set_type(m, TypeOopPtr::make_from_klass(klass));
+
+    Node *scratch = __ value(scratch_start);
+
+    // Final sync IdealKit and GraphKit.
+    final_sync(ideal);
+#undef __
+
+    Node* m_start = array_element_address(m, intcon(0), T_INT);
+
+    Node* call = make_runtime_call(RC_LEAF|RC_NO_FP,
+                                   OptoRuntime::montgomerySquare_Type(),
+                                   stubAddr, stubName, TypePtr::BOTTOM,
+                                   a_start, n_start, len, inv, top(),
+                                   scratch, m_start);
+  } // original reexecute is set back here
+
+  C->set_has_split_ifs(true); // Has chance for split-if optimization
+  set_result(m);
+  return true;
+}
+
 
 /**
  * Calculate CRC32 for byte.
diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp
--- a/src/share/vm/opto/runtime.cpp
+++ b/src/share/vm/opto/runtime.cpp
@@ -100,6 +100,8 @@
 
 address OptoRuntime::_slow_arraycopy_Java                         = NULL;
 address OptoRuntime::_register_finalizer_Java                     = NULL;
+address OptoRuntime::_montgomeryMultiply_Java                     = NULL;
+address OptoRuntime::_montgomerySquare_Java                     = NULL;
 
 # ifdef ENABLE_ZAP_DEAD_LOCALS
 address OptoRuntime::_zap_dead_Java_locals_Java                   = NULL;
@@ -987,6 +989,54 @@
   return TypeFunc::make(domain, range);
 }
 
+const TypeFunc* OptoRuntime::montgomeryMultiply_Type() {
+  // create input type (domain)
+  int num_args      = 8;                // 7 arguments plus the HALF slot of the long
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // a
+  fields[argp++] = TypePtr::NOTNULL;    // b
+  fields[argp++] = TypePtr::NOTNULL;    // n
+  fields[argp++] = TypeInt::INT;        // len
+  fields[argp++] = TypeLong::LONG;      // inv
+  fields[argp++] = Type::HALF;          // second slot of inv (the caller passes top())
+  fields[argp++] = TypePtr::NOTNULL;    // scratch; NOTE(review): caller passes null for len <= 512 - confirm NOTNULL
+  fields[argp++] = TypePtr::NOTNULL;    // result
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // range; NOTE(review): made with count TypeFunc::Parms, so the entry set below looks unused (callee is void)
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = TypePtr::NOTNULL;
+
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+
+const TypeFunc* OptoRuntime::montgomerySquare_Type() {
+  // create input type (domain)
+  int num_args      = 7;                // 6 arguments plus the HALF slot of the long
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // a
+  fields[argp++] = TypePtr::NOTNULL;    // n
+  fields[argp++] = TypeInt::INT;        // len
+  fields[argp++] = TypeLong::LONG;      // inv
+  fields[argp++] = Type::HALF;          // second slot of inv (the caller passes top())
+  fields[argp++] = TypePtr::NOTNULL;    // scratch; NOTE(review): caller passes null for len <= 512 - confirm NOTNULL
+  fields[argp++] = TypePtr::NOTNULL;    // result
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // range; NOTE(review): made with count TypeFunc::Parms, so the entry set below looks unused (callee is void)
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = TypePtr::NOTNULL;
+
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
 
 
 //------------- Interpreter state access for on stack replacement
diff --git a/src/share/vm/opto/runtime.hpp b/src/share/vm/opto/runtime.hpp
--- a/src/share/vm/opto/runtime.hpp
+++ b/src/share/vm/opto/runtime.hpp
@@ -150,6 +150,9 @@
   static address _slow_arraycopy_Java;
   static address _register_finalizer_Java;
 
+  static address _montgomeryMultiply_Java;
+  static address _montgomerySquare_Java;
+
 # ifdef ENABLE_ZAP_DEAD_LOCALS
   static address _zap_dead_Java_locals_Java;
   static address _zap_dead_native_locals_Java;
@@ -248,6 +251,8 @@
 
   static address slow_arraycopy_Java()                   { return _slow_arraycopy_Java; }
   static address register_finalizer_Java()               { return _register_finalizer_Java; }
+  static address montgomeryMultiply_Java()               { return _montgomeryMultiply_Java; }
+  static address montgomerySquare_Java()                 { return _montgomerySquare_Java; }
 
 
 # ifdef ENABLE_ZAP_DEAD_LOCALS
@@ -311,6 +316,8 @@
   static const TypeFunc* digestBase_implCompressMB_Type();
 
   static const TypeFunc* multiplyToLen_Type();
+  static const TypeFunc* montgomeryMultiply_Type();
+  static const TypeFunc* montgomerySquare_Type();
 
   static const TypeFunc* squareToLen_Type();
 
diff --git a/src/share/vm/runtime/sharedRuntime.hpp b/src/share/vm/runtime/sharedRuntime.hpp
--- a/src/share/vm/runtime/sharedRuntime.hpp
+++ b/src/share/vm/runtime/sharedRuntime.hpp
@@ -145,6 +145,12 @@
   static double dsqrt(double f);
 #endif
 
+  // Montgomery multiplication
+  static void montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
+                                  jint len, jlong inv, unsigned long *scratch, jint *m_ints);
+  static void montgomery_square(jint *a_ints, jint *n_ints,
+                                jint len, jlong inv, unsigned long *scratch, jint *m_ints);
+
 #ifdef __SOFTFP__
   // C++ compiler generates soft float instructions as well as passing
   // float and double in registers.
diff --git a/src/share/vm/runtime/stubRoutines.cpp b/src/share/vm/runtime/stubRoutines.cpp
--- a/src/share/vm/runtime/stubRoutines.cpp
+++ b/src/share/vm/runtime/stubRoutines.cpp
@@ -139,6 +139,8 @@
 address StubRoutines::_multiplyToLen = NULL;
 address StubRoutines::_squareToLen = NULL;
 address StubRoutines::_mulAdd = NULL;
+address StubRoutines::_montgomeryMultiply = NULL;
+address StubRoutines::_montgomerySquare = NULL;
 
 double (* StubRoutines::_intrinsic_log   )(double) = NULL;
 double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
diff --git a/src/share/vm/runtime/stubRoutines.hpp b/src/share/vm/runtime/stubRoutines.hpp
--- a/src/share/vm/runtime/stubRoutines.hpp
+++ b/src/share/vm/runtime/stubRoutines.hpp
@@ -199,6 +199,8 @@
   static address _multiplyToLen;
   static address _squareToLen;
   static address _mulAdd;
+  static address _montgomeryMultiply;
+  static address _montgomerySquare;
 
   // These are versions of the java.lang.Math methods which perform
   // the same operations as the intrinsic version.  They are used for
@@ -360,6 +362,8 @@
   static address multiplyToLen()       {return _multiplyToLen; }
   static address squareToLen()         {return _squareToLen; }
   static address mulAdd()              {return _mulAdd; }
+  static address montgomeryMultiply()  { return _montgomeryMultiply; }
+  static address montgomerySquare()    { return _montgomerySquare; }
 
   static address select_fill_function(BasicType t, bool aligned, const char* &name);