< prev index next >

src/cpu/x86/vm/sharedRuntime_x86_64.cpp

Print this page
rev 8502 : 8130150: Implement BigInteger.montgomeryMultiply intrinsic
Summary: Add montgomeryMultiply intrinsics
Reviewed-by: kvn


   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"

  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "code/debugInfoRec.hpp"
  29 #include "code/icBuffer.hpp"
  30 #include "code/vtableStubs.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "oops/compiledICHolder.hpp"
  33 #include "prims/jvmtiRedefineClassesTrace.hpp"
  34 #include "runtime/sharedRuntime.hpp"
  35 #include "runtime/vframeArray.hpp"
  36 #include "vmreg_x86.inline.hpp"
  37 #ifdef COMPILER1
  38 #include "c1/c1_Runtime1.hpp"
  39 #endif
  40 #ifdef COMPILER2
  41 #include "opto/runtime.hpp"
  42 #endif
  43 
  44 #define __ masm->
  45 


3494 
3495   RegisterSaver::restore_live_registers(masm);
3496 
3497   // exception pending => remove activation and forward to exception handler
3498 
3499   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3500 
3501   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3502   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3503 
3504   // -------------
3505   // make sure all code is generated
3506   masm->flush();
3507 
3508   // return the  blob
3509   // frame_size_words or bytes??
3510   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3511 }
3512 
3513 




















































































































































































































































3514 #ifdef COMPILER2
3515 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3516 //
3517 //------------------------------generate_exception_blob---------------------------
3518 // creates exception blob at the end
3519 // Using exception blob, this code is jumped from a compiled method.
3520 // (see emit_exception_handler in x86_64.ad file)
3521 //
3522 // Given an exception pc at a call we call into the runtime for the
3523 // handler in this method. This handler might merely restore state
3524 // (i.e. callee save registers) unwind the frame and jump to the
3525 // exception handler for the nmethod if there is no Java level handler
3526 // for the nmethod.
3527 //
3528 // This code is entered with a jmp.
3529 //
3530 // Arguments:
3531 //   rax: exception oop
3532 //   rdx: exception pc
3533 //




   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "alloca.h"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "code/debugInfoRec.hpp"
  30 #include "code/icBuffer.hpp"
  31 #include "code/vtableStubs.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "oops/compiledICHolder.hpp"
  34 #include "prims/jvmtiRedefineClassesTrace.hpp"
  35 #include "runtime/sharedRuntime.hpp"
  36 #include "runtime/vframeArray.hpp"
  37 #include "vmreg_x86.inline.hpp"
  38 #ifdef COMPILER1
  39 #include "c1/c1_Runtime1.hpp"
  40 #endif
  41 #ifdef COMPILER2
  42 #include "opto/runtime.hpp"
  43 #endif
  44 
  45 #define __ masm->
  46 


3495 
3496   RegisterSaver::restore_live_registers(masm);
3497 
3498   // exception pending => remove activation and forward to exception handler
3499 
3500   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3501 
3502   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3503   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3504 
3505   // -------------
3506   // make sure all code is generated
3507   masm->flush();
3508 
3509   // return the  blob
3510   // frame_size_words or bytes??
3511   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3512 }
3513 
3514 
3515 //------------------------------Montgomery multiplication------------------------
3516 //
3517 
3518 #ifndef _WINDOWS
3519 
3520 #define ASM_SUBTRACT
3521 
3522 #ifdef ASM_SUBTRACT
3523 // Subtract 0:b from carry:a.  Return carry.
3524 static unsigned long
3525 sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
3526   long i = 0, cnt = len;
3527   unsigned long tmp;
3528   asm volatile("clc; "
3529                "0: ; "
3530                "mov (%[b], %[i], 8), %[tmp]; "
3531                "sbb %[tmp], (%[a], %[i], 8); "
3532                "inc %[i]; dec %[cnt]; "
3533                "jne 0b; "
3534                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3535                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3536                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3537                : "memory");
3538   return tmp;
3539 }
3540 #else // ASM_SUBTRACT
3541 typedef int __attribute__((mode(TI))) int128;
3542 
3543 // Subtract 0:b from carry:a.  Return carry.
3544 static unsigned long
3545 sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
3546   int128 tmp = 0;
3547   int i;
3548   for (i = 0; i < len; i++) {
3549     tmp += a[i];
3550     tmp -= b[i];
3551     a[i] = tmp;
3552     tmp >>= 64;
3553     assert(-1 <= tmp && tmp <= 0, "invariant");
3554   }
3555   return tmp + carry;
3556 }
3557 #endif // ! ASM_SUBTRACT
3558 
3559 // Multiply (unsigned) Long A by Long B, accumulating the double-
3560 // length result into the accumulator formed of T0, T1, and T2.
3561 #define MACC(A, B, T0, T1, T2)                                  \
3562 do {                                                            \
3563   unsigned long hi, lo;                                         \
3564   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3565            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3566            : "r"(A), "a"(B) : "cc");                            \
3567  } while(0)
3568 
3569 // As above, but add twice the double-length result into the
3570 // accumulator.
3571 #define MACC2(A, B, T0, T1, T2)                                 \
3572 do {                                                            \
3573   unsigned long hi, lo;                                         \
3574   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3575            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3576            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3577            : "r"(A), "a"(B) : "cc");                            \
3578  } while(0)
3579 
3580 // Fast Montgomery multiplication.  The derivation of the algorithm is
3581 // in  A Cryptographic Library for the Motorola DSP56000,
3582 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3583 
3584 static void __attribute__((noinline))
3585 montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
3586                     unsigned long m[], unsigned long inv, int len) {
3587   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3588   int i;
3589 
3590   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3591 
3592   for (i = 0; i < len; i++) {
3593     int j;
3594     for (j = 0; j < i; j++) {
3595       MACC(a[j], b[i-j], t0, t1, t2);
3596       MACC(m[j], n[i-j], t0, t1, t2);
3597     }
3598     MACC(a[i], b[0], t0, t1, t2);
3599     m[i] = t0 * inv;
3600     MACC(m[i], n[0], t0, t1, t2);
3601 
3602     assert(t0 == 0, "broken Montgomery multiply");
3603 
3604     t0 = t1; t1 = t2; t2 = 0;
3605   }
3606 
3607   for (i = len; i < 2*len; i++) {
3608     int j;
3609     for (j = i-len+1; j < len; j++) {
3610       MACC(a[j], b[i-j], t0, t1, t2);
3611       MACC(m[j], n[i-j], t0, t1, t2);
3612     }
3613     m[i-len] = t0;
3614     t0 = t1; t1 = t2; t2 = 0;
3615   }
3616 
3617   while (t0)
3618     t0 = sub(m, n, t0, len);
3619 }
3620 
3621 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3622 // multiplies so it should be up to 25% faster than Montgomery
3623 // multiplication.  However, its loop control is more complex and it
3624 // may actually run slower on some machines.
3625 
3626 static void __attribute__((noinline))
3627 montgomery_square(unsigned long a[], unsigned long n[],
3628                   unsigned long m[], unsigned long inv, int len) {
3629   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3630   int i;
3631 
3632   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3633 
3634   for (i = 0; i < len; i++) {
3635     int j;
3636     int end = (i+1)/2;
3637     for (j = 0; j < end; j++) {
3638       MACC2(a[j], a[i-j], t0, t1, t2);
3639       MACC(m[j], n[i-j], t0, t1, t2);
3640     }
3641     if ((i & 1) == 0) {
3642       MACC(a[j], a[j], t0, t1, t2);
3643     }
3644     for (; j < i; j++) {
3645       MACC(m[j], n[i-j], t0, t1, t2);
3646     }
3647     m[i] = t0 * inv;
3648     MACC(m[i], n[0], t0, t1, t2);
3649 
3650     assert(t0 == 0, "broken Montgomery square");
3651 
3652     t0 = t1; t1 = t2; t2 = 0;
3653   }
3654 
3655   for (i = len; i < 2*len; i++) {
3656     int start = i-len+1;
3657     int end = start + (len - start)/2;
3658     int j;
3659     for (j = start; j < end; j++) {
3660       MACC2(a[j], a[i-j], t0, t1, t2);
3661       MACC(m[j], n[i-j], t0, t1, t2);
3662     }
3663     if ((i & 1) == 0) {
3664       MACC(a[j], a[j], t0, t1, t2);
3665     }
3666     for (; j < len; j++) {
3667       MACC(m[j], n[i-j], t0, t1, t2);
3668     }
3669     m[i-len] = t0;
3670     t0 = t1; t1 = t2; t2 = 0;
3671   }
3672 
3673   while (t0)
3674     t0 = sub(m, n, t0, len);
3675 }
3676 
3677 // Swap words in a longword.
3678 static unsigned long swap(unsigned long x) {
3679   return (x << 32) | (x >> 32);
3680 }
3681 
3682 // Copy len longwords from s to d, word-swapping as we go.  The
3683 // destination array is reversed.
3684 static void reverse_words(unsigned long *s, unsigned long *d, int len) {
3685   d += len;
3686   while(len-- > 0) {
3687     d--;
3688     *d = swap(*s);
3689     s++;
3690   }
3691 }
3692 
3693 // The threshold at which squaring is advantageous was determined
3694 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3695 #define MONTGOMERY_SQUARING_THRESHOLD 64
3696 
3697 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3698                                         jint len, jlong inv,
3699                                         jint *m_ints) {
3700   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3701   int longwords = len/2;
3702 
3703   // Make very sure we don't use so much space that the stack might
3704   // overflow.  512 jints corresponds to an 16384-bit integer and
3705   // will use here a total of 8k bytes of stack space.
3706   int total_allocation = longwords * sizeof (unsigned long) * 4;
3707   guarantee(total_allocation <= 8192, "must be");
3708   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3709 
3710   // Local scratch arrays
3711   unsigned long
3712     *a = scratch + 0 * longwords,
3713     *b = scratch + 1 * longwords,
3714     *n = scratch + 2 * longwords,
3715     *m = scratch + 3 * longwords;
3716 
3717   reverse_words((unsigned long *)a_ints, a, longwords);
3718   reverse_words((unsigned long *)b_ints, b, longwords);
3719   reverse_words((unsigned long *)n_ints, n, longwords);
3720 
3721   ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
3722 
3723   reverse_words(m, (unsigned long *)m_ints, longwords);
3724 }
3725 
3726 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3727                                       jint len, jlong inv,
3728                                       jint *m_ints) {
3729   assert(len % 2 == 0, "array length in montgomery_square must be even");
3730   int longwords = len/2;
3731 
3732   // Make very sure we don't use so much space that the stack might
3733   // overflow.  512 jints corresponds to an 16384-bit integer and
3734   // will use here a total of 6k bytes of stack space.
3735   int total_allocation = longwords * sizeof (unsigned long) * 3;
3736   guarantee(total_allocation <= 8192, "must be");
3737   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3738 
3739   // Local scratch arrays
3740   unsigned long
3741     *a = scratch + 0 * longwords,
3742     *n = scratch + 1 * longwords,
3743     *m = scratch + 2 * longwords;
3744 
3745   reverse_words((unsigned long *)a_ints, a, longwords);
3746   reverse_words((unsigned long *)n_ints, n, longwords);
3747 
3748   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3749     ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
3750   } else {
3751     ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
3752   }
3753 
3754   reverse_words(m, (unsigned long *)m_ints, longwords);
3755 }
3756 
3757 #endif // WINDOWS
3758 
3759 #ifdef COMPILER2
3760 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3761 //
3762 //------------------------------generate_exception_blob---------------------------
3763 // creates exception blob at the end
3764 // Using exception blob, this code is jumped from a compiled method.
3765 // (see emit_exception_handler in x86_64.ad file)
3766 //
3767 // Given an exception pc at a call we call into the runtime for the
3768 // handler in this method. This handler might merely restore state
3769 // (i.e. callee save registers) unwind the frame and jump to the
3770 // exception handler for the nmethod if there is no Java level handler
3771 // for the nmethod.
3772 //
3773 // This code is entered with a jmp.
3774 //
3775 // Arguments:
3776 //   rax: exception oop
3777 //   rdx: exception pc
3778 //


< prev index next >