< prev index next >

src/cpu/x86/vm/sharedRuntime_x86_64.cpp

Print this page
rev 8502 : 8046943: Leverage CPU Instructions for GHASH and RSA
Summary: Add montgomeryMultiply intrinsics
Reviewed-by: kvn


   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"

  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "code/debugInfoRec.hpp"
  29 #include "code/icBuffer.hpp"
  30 #include "code/vtableStubs.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "oops/compiledICHolder.hpp"
  33 #include "prims/jvmtiRedefineClassesTrace.hpp"
  34 #include "runtime/sharedRuntime.hpp"
  35 #include "runtime/vframeArray.hpp"
  36 #include "vmreg_x86.inline.hpp"
  37 #ifdef COMPILER1
  38 #include "c1/c1_Runtime1.hpp"
  39 #endif
  40 #ifdef COMPILER2
  41 #include "opto/runtime.hpp"
  42 #endif
  43 
  44 #define __ masm->
  45 


3494 
3495   RegisterSaver::restore_live_registers(masm);
3496 
3497   // exception pending => remove activation and forward to exception handler
3498 
3499   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3500 
3501   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3502   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3503 
3504   // -------------
3505   // make sure all code is generated
3506   masm->flush();
3507 
3508   // return the  blob
3509   // frame_size_words or bytes??
3510   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3511 }
3512 
3513 
















































































































































































































































3514 #ifdef COMPILER2
3515 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3516 //
3517 //------------------------------generate_exception_blob---------------------------
3518 // creates exception blob at the end
3519 // Using exception blob, this code is jumped from a compiled method.
3520 // (see emit_exception_handler in x86_64.ad file)
3521 //
3522 // Given an exception pc at a call we call into the runtime for the
3523 // handler in this method. This handler might merely restore state
3524 // (i.e. callee save registers) unwind the frame and jump to the
3525 // exception handler for the nmethod if there is no Java level handler
3526 // for the nmethod.
3527 //
3528 // This code is entered with a jmp.
3529 //
3530 // Arguments:
3531 //   rax: exception oop
3532 //   rdx: exception pc
3533 //




   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "alloca.h"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "code/debugInfoRec.hpp"
  30 #include "code/icBuffer.hpp"
  31 #include "code/vtableStubs.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "oops/compiledICHolder.hpp"
  34 #include "prims/jvmtiRedefineClassesTrace.hpp"
  35 #include "runtime/sharedRuntime.hpp"
  36 #include "runtime/vframeArray.hpp"
  37 #include "vmreg_x86.inline.hpp"
  38 #ifdef COMPILER1
  39 #include "c1/c1_Runtime1.hpp"
  40 #endif
  41 #ifdef COMPILER2
  42 #include "opto/runtime.hpp"
  43 #endif
  44 
  45 #define __ masm->
  46 


3495 
3496   RegisterSaver::restore_live_registers(masm);
3497 
3498   // exception pending => remove activation and forward to exception handler
3499 
3500   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3501 
3502   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3503   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3504 
3505   // -------------
3506   // make sure all code is generated
3507   masm->flush();
3508 
3509   // return the  blob
3510   // frame_size_words or bytes??
3511   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3512 }
3513 
3514 
3515 //------------------------------Montgomery multiplication------------------------
3516 //
3517 
3518 #define ASM_SUBTRACT
3519 
3520 #ifdef ASM_SUBTRACT
3521 // Subtract 0:b from carry:a.  Return carry.
3522 static unsigned long
3523 sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
3524   long i = 0, cnt = len;
3525   unsigned long tmp;
3526   asm volatile("clc; "
3527                "0: ; "
3528                "mov (%[b], %[i], 8), %[tmp]; "
3529                "sbb %[tmp], (%[a], %[i], 8); "
3530                "inc %[i]; dec %[cnt]; "
3531                "jne 0b; "
3532                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3533                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3534                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3535                : "memory");
3536   return tmp;
3537 }
3538 #else // ASM_SUBTRACT
3539 typedef int __attribute__((mode(TI))) int128;
3540 
3541 // Subtract 0:b from carry:a.  Return carry.
3542 static unsigned long
3543 sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
3544   int128 tmp = 0;
3545   int i;
3546   for (i = 0; i < len; i++) {
3547     tmp += a[i];
3548     tmp -= b[i];
3549     a[i] = tmp;
3550     tmp >>= 64;
3551     assert(-1 <= tmp && tmp <= 0, "invariant");
3552   }
3553   return tmp + carry;
3554 }
3555 #endif // ! ASM_SUBTRACT
3556 
3557 // Multiply (unsigned) Long A by Long B, accumulating the double-
3558 // length result into the accumulator formed of T0, T1, and T2.
3559 #define MACC(A, B, T0, T1, T2)                                  \
3560 do {                                                            \
3561   unsigned long hi, lo;                                         \
3562   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3563            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3564            : "r"(A), "a"(B) : "cc");                            \
3565  } while(0)
3566 
3567 // As above, but add twice the double-length result into the
3568 // accumulator.
3569 #define MACC2(A, B, T0, T1, T2)                                 \
3570 do {                                                            \
3571   unsigned long hi, lo;                                         \
3572   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3573            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3574            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3575            : "r"(A), "a"(B) : "cc");                            \
3576  } while(0)
3577 
3578 // Fast Montgomery multiplication.  The derivation of the algorithm is
3579 // in  A Cryptographic Library for the Motorola DSP56000,
3580 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3581 
3582 static void __attribute__((noinline))
3583 montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
3584                     unsigned long m[], unsigned long inv, int len) {
3585   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3586   int i;
3587 
3588   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3589 
3590   for (i = 0; i < len; i++) {
3591     int j;
3592     for (j = 0; j < i; j++) {
3593       MACC(a[j], b[i-j], t0, t1, t2);
3594       MACC(m[j], n[i-j], t0, t1, t2);
3595     }
3596     MACC(a[i], b[0], t0, t1, t2);
3597     m[i] = t0 * inv;
3598     MACC(m[i], n[0], t0, t1, t2);
3599 
3600     assert(t0 == 0, "broken Montgomery multiply");
3601 
3602     t0 = t1; t1 = t2; t2 = 0;
3603   }
3604 
3605   for (i = len; i < 2*len; i++) {
3606     int j;
3607     for (j = i-len+1; j < len; j++) {
3608       MACC(a[j], b[i-j], t0, t1, t2);
3609       MACC(m[j], n[i-j], t0, t1, t2);
3610     }
3611     m[i-len] = t0;
3612     t0 = t1; t1 = t2; t2 = 0;
3613   }
3614 
3615   while (t0)
3616     t0 = sub(m, n, t0, len);
3617 }
3618 
3619 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3620 // multiplies so it should be up to 25% faster than Montgomery
3621 // multiplication.  However, its loop control is more complex and it
3622 // may actually run slower on some machines.
3623 
3624 static void __attribute__((noinline))
3625 montgomery_square(unsigned long a[], unsigned long n[],
3626                   unsigned long m[], unsigned long inv, int len) {
3627   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3628   int i;
3629 
3630   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3631 
3632   for (i = 0; i < len; i++) {
3633     int j;
3634     int end = (i+1)/2;
3635     for (j = 0; j < end; j++) {
3636       MACC2(a[j], a[i-j], t0, t1, t2);
3637       MACC(m[j], n[i-j], t0, t1, t2);
3638     }
3639     if ((i & 1) == 0) {
3640       MACC(a[j], a[j], t0, t1, t2);
3641     }
3642     for (; j < i; j++) {
3643       MACC(m[j], n[i-j], t0, t1, t2);
3644     }
3645     m[i] = t0 * inv;
3646     MACC(m[i], n[0], t0, t1, t2);
3647 
3648     assert(t0 == 0, "broken Montgomery square");
3649 
3650     t0 = t1; t1 = t2; t2 = 0;
3651   }
3652 
3653   for (i = len; i < 2*len; i++) {
3654     int start = i-len+1;
3655     int end = start + (len - start)/2;
3656     int j;
3657     for (j = start; j < end; j++) {
3658       MACC2(a[j], a[i-j], t0, t1, t2);
3659       MACC(m[j], n[i-j], t0, t1, t2);
3660     }
3661     if ((i & 1) == 0) {
3662       MACC(a[j], a[j], t0, t1, t2);
3663     }
3664     for (; j < len; j++) {
3665       MACC(m[j], n[i-j], t0, t1, t2);
3666     }
3667     m[i-len] = t0;
3668     t0 = t1; t1 = t2; t2 = 0;
3669   }
3670 
3671   while (t0)
3672     t0 = sub(m, n, t0, len);
3673 }
3674 
3675 // Swap words in a longword.
3676 static unsigned long swap(unsigned long x) {
3677   return (x << 32) | (x >> 32);
3678 }
3679 
3680 // Copy len longwords from s to d, word-swapping as we go.  The
3681 // destination array is reversed.
3682 static void reverse_words(unsigned long *s, unsigned long *d, int len) {
3683   d += len;
3684   while(len-- > 0) {
3685     d--;
3686     *d = swap(*s);
3687     s++;
3688   }
3689 }
3690 
3691 // The threshold at which squaring is advantageous was determined
3692 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3693 #define MONTGOMERY_SQUARING_THRESHOLD 64
3694 
3695 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3696                                         jint len, jlong inv,
3697                                         jint *m_ints) {
3698   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3699   int longwords = len/2;
3700 
3701   // Make very sure we don't use so much space that the stack might
3702   // overflow.  512 jints corresponds to an 16384-bit integer and
3703   // will use here a total of 8k bytes of stack space.
3704   int total_allocation = longwords * sizeof (unsigned long) * 4;
3705   guarantee(total_allocation <= 8192, "must be");
3706   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3707 
3708   // Local scratch arrays
3709   unsigned long
3710     *a = scratch + 0 * longwords,
3711     *b = scratch + 1 * longwords,
3712     *n = scratch + 2 * longwords,
3713     *m = scratch + 3 * longwords;
3714 
3715   reverse_words((unsigned long *)a_ints, a, longwords);
3716   reverse_words((unsigned long *)b_ints, b, longwords);
3717   reverse_words((unsigned long *)n_ints, n, longwords);
3718 
3719   ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
3720 
3721   reverse_words(m, (unsigned long *)m_ints, longwords);
3722 }
3723 
3724 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3725                                       jint len, jlong inv,
3726                                       jint *m_ints) {
3727   assert(len % 2 == 0, "array length in montgomery_square must be even");
3728   int longwords = len/2;
3729 
3730   // Make very sure we don't use so much space that the stack might
3731   // overflow.  512 jints corresponds to an 16384-bit integer and
3732   // will use here a total of 6k bytes of stack space.
3733   int total_allocation = longwords * sizeof (unsigned long) * 3;
3734   guarantee(total_allocation <= 8192, "must be");
3735   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3736 
3737   // Local scratch arrays
3738   unsigned long
3739     *a = scratch + 0 * longwords,
3740     *n = scratch + 1 * longwords,
3741     *m = scratch + 2 * longwords;
3742 
3743   reverse_words((unsigned long *)a_ints, a, longwords);
3744   reverse_words((unsigned long *)n_ints, n, longwords);
3745 
3746   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3747     ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
3748   } else {
3749     ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
3750   }
3751 
3752   reverse_words(m, (unsigned long *)m_ints, longwords);
3753 }
3754 
3755 #ifdef COMPILER2
3756 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3757 //
3758 //------------------------------generate_exception_blob---------------------------
3759 // creates exception blob at the end
3760 // Using exception blob, this code is jumped from a compiled method.
3761 // (see emit_exception_handler in x86_64.ad file)
3762 //
3763 // Given an exception pc at a call we call into the runtime for the
3764 // handler in this method. This handler might merely restore state
3765 // (i.e. callee save registers) unwind the frame and jump to the
3766 // exception handler for the nmethod if there is no Java level handler
3767 // for the nmethod.
3768 //
3769 // This code is entered with a jmp.
3770 //
3771 // Arguments:
3772 //   rax: exception oop
3773 //   rdx: exception pc
3774 //


< prev index next >