--- old/src/cpu/x86/vm/assembler_x86.cpp	2015-07-22 17:47:38.484625200 -0700
+++ new/src/cpu/x86/vm/assembler_x86.cpp	2015-07-22 17:47:37.642625200 -0700
@@ -2894,6 +2894,15 @@
   emit_int8(imm8);
 }
 
+void Assembler::pextrw(Register dst, XMMRegister src, int imm8) {
+  assert(VM_Version::supports_sse2(), "");
+  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F,
+                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  emit_int8((unsigned char)0xC5);             // PEXTRW r32, xmm, imm8 is 66 0F C5 /r ib (SSE2); 0F 3A 15 is the SSE4.1 form
+  emit_int8((unsigned char)(0xC0 | encode));  // register-direct ModRM; this form takes the GPR dst in reg and the XMM src in rm
+  emit_int8(imm8);                            // immediate: index of the 16-bit word of src to extract
+}
+
 void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
@@ -2912,6 +2921,15 @@
   emit_int8(imm8);
 }
 
+void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) {
+  assert(VM_Version::supports_sse2(), "");
+  const bool no_avx512bw = !VM_Version::supports_avx512bw();  // forwarded as the encoder's final argument
+  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, no_avx512bw);
+  emit_int8((unsigned char)0xC4);             // opcode byte (66-prefixed, 0F map)
+  emit_int8((unsigned char)(0xC0 | encode));  // ModRM in register-direct form
+  emit_int8(imm8);                            // trailing immediate operand
+}
+
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   if (VM_Version::supports_evex()) {
@@ -3899,6 +3917,15 @@
   }
 }
 
+void Assembler::mulpd(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  if (!VM_Version::supports_evex()) {
+    emit_simd_arith(0x59, dst, src, VEX_SIMD_66);  // no EVEX: plain emitter
+    return;
+  }
+  emit_simd_arith_q(0x59, dst, src, VEX_SIMD_66);  // EVEX available: _q emitter variant
+}
+
 void Assembler::mulps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE);
@@ -4058,6 +4085,24 @@
                  (VM_Version::supports_avx512dq() == false));
 }
 
+void Assembler::unpckhpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  if (!VM_Version::supports_evex()) {
+    emit_simd_arith(0x15, dst, src, VEX_SIMD_66);  // no EVEX: plain emitter
+    return;
+  }
+  emit_simd_arith_q(0x15, dst, src, VEX_SIMD_66);  // EVEX available: _q emitter variant
+}
+
+void Assembler::unpcklpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  if (!VM_Version::supports_evex()) {
+    emit_simd_arith(0x14, dst, src, VEX_SIMD_66);  // no EVEX: plain emitter
+    return;
+  }
+  emit_simd_arith_q(0x14, dst, src, VEX_SIMD_66);  // EVEX available: _q emitter variant
+}
+
 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
@@ -4676,6 +4721,15 @@
   emit_simd_arith(0xDB, dst, src, VEX_SIMD_66);
 }
 
+void Assembler::pandn(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  if (!VM_Version::supports_evex()) {
+    emit_simd_arith(0xDF, dst, src, VEX_SIMD_66);  // no EVEX: plain emitter
+    return;
+  }
+  emit_simd_arith_q(0xDF, dst, src, VEX_SIMD_66);  // EVEX available: _q emitter variant
+}
+
 void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
--- old/src/cpu/x86/vm/assembler_x86.hpp	2015-07-22 17:47:47.288625200 -0700
+++ new/src/cpu/x86/vm/assembler_x86.hpp	2015-07-22 17:47:46.456625200 -0700
@@ -1653,10 +1653,14 @@
   // SSE 4.1 extract
   void pextrd(Register dst, XMMRegister src, int imm8);
   void pextrq(Register dst, XMMRegister src, int imm8);
+  // SSE 2 extract
+  void pextrw(Register dst, XMMRegister src, int imm8);
 
   // SSE 4.1 insert
   void pinsrd(XMMRegister dst, Register src, int imm8);
   void pinsrq(XMMRegister dst, Register src, int imm8);
+  // SSE 2 insert
+  void pinsrw(XMMRegister dst, Register src, int imm8);
 
   // SSE4.1 packed move
   void pmovzxbw(XMMRegister dst, XMMRegister src);
@@ -1906,6 +1910,7 @@
 
   // Multiply Packed Floating-Point Values
   void mulpd(XMMRegister dst, XMMRegister src);
+  void mulpd(XMMRegister dst, Address src);
   void mulps(XMMRegister dst, XMMRegister src);
   void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -1928,6 +1933,9 @@
   void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
+  void unpckhpd(XMMRegister dst, XMMRegister src);
+  void unpcklpd(XMMRegister dst, XMMRegister src);
+
   // Bitwise Logical XOR of Packed Floating-Point Values
   void xorpd(XMMRegister dst, XMMRegister src);
   void xorps(XMMRegister dst, XMMRegister src);
@@ -2020,6 +2028,7 @@
 
   // And packed integers
   void pand(XMMRegister dst, XMMRegister src);
+  void pandn(XMMRegister dst, XMMRegister src);
   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
--- old/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	2015-07-22 17:47:55.936625200 -0700
+++ new/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	2015-07-22 17:47:55.106625200 -0700
@@ -2459,9 +2459,6 @@
         // Should consider not saving rbx, if not necessary
         __ trigfunc('t', op->as_Op2()->fpu_stack_size());
         break;
-      case lir_exp :
-        __ exp_with_fallback(op->as_Op2()->fpu_stack_size());
-        break;
       case lir_pow :
         __ pow_with_fallback(op->as_Op2()->fpu_stack_size());
         break;
--- old/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	2015-07-22 17:48:04.602625200 -0700
+++ new/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	2015-07-22 17:48:03.773625200 -0700
@@ -808,6 +808,12 @@
 
 void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
   assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
+
+  if (x->id() == vmIntrinsics::_dexp) {
+    do_ExpIntrinsic(x);
+    return;
+  }
+
   LIRItem value(x->argument_at(0), this);
 
   bool use_fpu = false;
@@ -818,7 +824,6 @@
       case vmIntrinsics::_dtan:
       case vmIntrinsics::_dlog:
       case vmIntrinsics::_dlog10:
-      case vmIntrinsics::_dexp:
       case vmIntrinsics::_dpow:
         use_fpu = true;
     }
@@ -870,7 +875,6 @@
     case vmIntrinsics::_dtan:   __ tan  (calc_input, calc_result, tmp1, tmp2);              break;
     case vmIntrinsics::_dlog:   __ log  (calc_input, calc_result, tmp1);                    break;
     case vmIntrinsics::_dlog10: __ log10(calc_input, calc_result, tmp1);                    break;
-    case vmIntrinsics::_dexp:   __ exp  (calc_input, calc_result,              tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
     case vmIntrinsics::_dpow:   __ pow  (calc_input, calc_input2, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
     default:                    ShouldNotReachHere();
   }
@@ -880,6 +884,32 @@
   }
 }
 
+void LIRGenerator::do_ExpIntrinsic(Intrinsic* x) {
+  // exp(x) is emitted as a leaf runtime call rather than as a LIR math op.
+  LIRItem arg(x->argument_at(0), this);
+  arg.set_destroys_register();
+
+  LIR_Opr calc_result = rlock_result(x);
+  LIR_Opr result_reg = result_register_for(x->type());
+
+  // Force the one double argument into its C calling convention location.
+  BasicTypeList signature(1);
+  signature.append(T_DOUBLE);
+  CallingConvention* cc = frame_map()->c_calling_convention(&signature);
+  arg.load_item_force(cc->at(0));
+
+#ifndef _LP64
+  // 32-bit: the result is read from fpu0; without SSE2 the call targets
+  // SharedRuntime::dexp instead of the stub.
+  result_reg = FrameMap::fpu0_double_opr;
+  address entry = VM_Version::supports_sse2() ? StubRoutines::dexp()
+                                              : CAST_FROM_FN_PTR(address, SharedRuntime::dexp);
+#else
+  address entry = StubRoutines::dexp();
+#endif
+  __ call_runtime_leaf(entry, getThreadTemp(), result_reg, cc->args());
+  __ move(result_reg, calc_result);
+}
 
 void LIRGenerator::do_ArrayCopy(Intrinsic* x) {
   assert(x->number_of_arguments() == 5, "wrong type");
--- old/src/cpu/x86/vm/c1_LinearScan_x86.cpp	2015-07-22 17:48:13.172625200 -0700
+++ new/src/cpu/x86/vm/c1_LinearScan_x86.cpp	2015-07-22 17:48:12.339625200 -0700
@@ -814,8 +814,7 @@
 
     case lir_tan:
     case lir_sin:
-    case lir_cos:
-    case lir_exp: {
+    case lir_cos: {
       // sin, cos and exp need two temporary fpu stack slots, so there are two temporary
       // registers (stored in right and temp of the operation).
       // the stack allocator must guarantee that the stack slots are really free,
--- old/src/cpu/x86/vm/interpreter_x86_32.cpp	2015-07-22 17:48:21.718625200 -0700
+++ new/src/cpu/x86/vm/interpreter_x86_32.cpp	2015-07-22 17:48:20.878625200 -0700
@@ -151,11 +151,15 @@
       __ pop_fTOS();
       break;
     case Interpreter::java_lang_math_exp:
-      __ exp_with_fallback(0);
-      // Store to stack to convert 80bit precision back to 64bits
-      __ push_fTOS();
-      __ pop_fTOS();
-      break;
+      __ subptr(rsp, 2*wordSize);
+      __ fstp_d(Address(rsp, 0));
+      if (VM_Version::supports_sse2()) {
+        __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dexp())));
+      } else {
+        __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::dexp)));
+      }
+      __ addptr(rsp, 2*wordSize);
+    break;
     default                              :
         ShouldNotReachHere();
   }
--- old/src/cpu/x86/vm/interpreter_x86_64.cpp	2015-07-22 17:48:30.196625200 -0700
+++ new/src/cpu/x86/vm/interpreter_x86_64.cpp	2015-07-22 17:48:29.374625200 -0700
@@ -252,6 +252,9 @@
 
   if (kind == Interpreter::java_lang_math_sqrt) {
     __ sqrtsd(xmm0, Address(rsp, wordSize));
+  } else if (kind == Interpreter::java_lang_math_exp) {
+    __ movdbl(xmm0, Address(rsp, wordSize));
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dexp())));
   } else {
     __ fld_d(Address(rsp, wordSize));
     switch (kind) {
@@ -278,9 +281,6 @@
                                               // empty stack slot)
           __ pow_with_fallback(0);
           break;
-      case Interpreter::java_lang_math_exp:
-          __ exp_with_fallback(0);
-           break;
       default                              :
           ShouldNotReachHere();
     }
--- old/src/cpu/x86/vm/macroAssembler_x86.cpp	2015-07-22 17:48:38.721625200 -0700
+++ new/src/cpu/x86/vm/macroAssembler_x86.cpp	2015-07-22 17:48:37.892625200 -0700
@@ -3027,6 +3027,15 @@
   Assembler::fldcw(as_Address(src));
 }
 
+void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
+  if (!reachable(src)) {
+    lea(rscratch1, src);                            // literal not reachable: load its address into rscratch1
+    Assembler::mulpd(dst, Address(rscratch1, 0));
+    return;
+  }
+  Assembler::mulpd(dst, as_Address(src));           // reachable: use the literal as a memory operand
+}
+
 void MacroAssembler::pow_exp_core_encoding() {
   // kills rax, rcx, rdx
   subptr(rsp,sizeof(jdouble));
@@ -3099,19 +3108,7 @@
   BLOCK_COMMENT("} fast_pow");
 }
 
-void MacroAssembler::fast_exp() {
-  // computes exp(X) = 2^(X * log2(e))
-  // if fast computation is not possible, result is NaN. Requires
-  // fallback from user of this macro.
-  // increase precision for intermediate steps of the computation
-  increase_precision();
-  fldl2e();                // Stack: log2(e) X ...
-  fmulp(1);                // Stack: (X*log2(e)) ...
-  pow_exp_core_encoding(); // Stack: exp(X) ...
-  restore_precision();
-}
-
-void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
+void MacroAssembler::pow_or_exp(int num_fpu_regs_in_use) {
   // kills rax, rcx, rdx
   // pow and exp needs 2 extra registers on the fpu stack.
   Label slow_case, done;
@@ -3123,182 +3120,164 @@
   Register tmp2 = rax;
   Register tmp3 = rcx;
 
-  if (is_exp) {
-    // Stack: X
-    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
-    fast_exp();                 // Stack: exp(X) X
-    fcmp(tmp, 0, false, false); // Stack: exp(X) X
-    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
-    jcc(Assembler::parity, slow_case);
-    // get rid of duplicate argument. Stack: exp(X)
-    if (num_fpu_regs_in_use > 0) {
-      fxch();
-      fpop();
-    } else {
-      ffree(1);
-    }
-    jmp(done);
+  // Stack: X Y
+  Label x_negative, y_not_2;
+
+  static double two = 2.0;
+  ExternalAddress two_addr((address)&two);
+
+  // constant may be too far on 64 bit
+  lea(tmp2, two_addr);
+  fld_d(Address(tmp2, 0));    // Stack: 2 X Y
+  fcmp(tmp, 2, true, false);  // Stack: X Y
+  jcc(Assembler::parity, y_not_2);
+  jcc(Assembler::notEqual, y_not_2);
+
+  fxch(); fpop();             // Stack: X
+  fmul(0);                    // Stack: X*X
+
+  jmp(done);
+
+  bind(y_not_2);
+
+  fldz();                     // Stack: 0 X Y
+  fcmp(tmp, 1, true, false);  // Stack: X Y
+  jcc(Assembler::above, x_negative);
+
+  // X >= 0
+
+  fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
+  fld_s(1);                   // Stack: X Y X Y
+  fast_pow();                 // Stack: X^Y X Y
+  fcmp(tmp, 0, false, false); // Stack: X^Y X Y
+  // X^Y not equal to itself: X^Y is NaN go to slow case.
+  jcc(Assembler::parity, slow_case);
+  // get rid of duplicate arguments. Stack: X^Y
+  if (num_fpu_regs_in_use > 0) {
+    fxch(); fpop();
+    fxch(); fpop();
   } else {
-    // Stack: X Y
-    Label x_negative, y_not_2;
+    ffree(2);
+    ffree(1);
+  }
+  jmp(done);
 
-    static double two = 2.0;
-    ExternalAddress two_addr((address)&two);
+  // X <= 0
+  bind(x_negative);
 
-    // constant maybe too far on 64 bit
-    lea(tmp2, two_addr);
-    fld_d(Address(tmp2, 0));    // Stack: 2 X Y
-    fcmp(tmp, 2, true, false);  // Stack: X Y
-    jcc(Assembler::parity, y_not_2);
-    jcc(Assembler::notEqual, y_not_2);
-
-    fxch(); fpop();             // Stack: X
-    fmul(0);                    // Stack: X*X
-
-    jmp(done);
-
-    bind(y_not_2);
-
-    fldz();                     // Stack: 0 X Y
-    fcmp(tmp, 1, true, false);  // Stack: X Y
-    jcc(Assembler::above, x_negative);
-
-    // X >= 0
-
-    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
-    fld_s(1);                   // Stack: X Y X Y
-    fast_pow();                 // Stack: X^Y X Y
-    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
-    // X^Y not equal to itself: X^Y is NaN go to slow case.
-    jcc(Assembler::parity, slow_case);
-    // get rid of duplicate arguments. Stack: X^Y
-    if (num_fpu_regs_in_use > 0) {
-      fxch(); fpop();
-      fxch(); fpop();
-    } else {
-      ffree(2);
-      ffree(1);
-    }
-    jmp(done);
-
-    // X <= 0
-    bind(x_negative);
-
-    fld_s(1);                   // Stack: Y X Y
-    frndint();                  // Stack: int(Y) X Y
-    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
-    jcc(Assembler::notEqual, slow_case);
-
-    subptr(rsp, 8);
-
-    // For X^Y, when X < 0, Y has to be an integer and the final
-    // result depends on whether it's odd or even. We just checked
-    // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
-    // integer to test its parity. If int(Y) is huge and doesn't fit
-    // in the 64 bit integer range, the integer indefinite value will
-    // end up in the gp registers. Huge numbers are all even, the
-    // integer indefinite number is even so it's fine.
+  fld_s(1);                   // Stack: Y X Y
+  frndint();                  // Stack: int(Y) X Y
+  fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
+  jcc(Assembler::notEqual, slow_case);
+
+  subptr(rsp, 8);
+
+  // For X^Y, when X < 0, Y has to be an integer and the final
+  // result depends on whether it's odd or even. We just checked
+  // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
+  // integer to test its parity. If int(Y) is huge and doesn't fit
+  // in the 64 bit integer range, the integer indefinite value will
+  // end up in the gp registers. Huge numbers are all even, the
+  // integer indefinite number is even so it's fine.
 
 #ifdef ASSERT
-    // Let's check we don't end up with an integer indefinite number
-    // when not expected. First test for huge numbers: check whether
-    // int(Y)+1 == int(Y) which is true for very large numbers and
-    // those are all even. A 64 bit integer is guaranteed to not
-    // overflow for numbers where y+1 != y (when precision is set to
-    // double precision).
-    Label y_not_huge;
+  // Let's check we don't end up with an integer indefinite number
+  // when not expected. First test for huge numbers: check whether
+  // int(Y)+1 == int(Y) which is true for very large numbers and
+  // those are all even. A 64 bit integer is guaranteed to not
+  // overflow for numbers where y+1 != y (when precision is set to
+  // double precision).
+  Label y_not_huge;
 
-    fld1();                     // Stack: 1 int(Y) X Y
-    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
+  fld1();                     // Stack: 1 int(Y) X Y
+  fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
 
 #ifdef _LP64
-    // trip to memory to force the precision down from double extended
-    // precision
-    fstp_d(Address(rsp, 0));
-    fld_d(Address(rsp, 0));
+  // trip to memory to force the precision down from double extended
+  // precision
+  fstp_d(Address(rsp, 0));
+  fld_d(Address(rsp, 0));
 #endif
 
-    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
+  fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
 #endif
 
-    // move int(Y) as 64 bit integer to thread's stack
-    fistp_d(Address(rsp,0));    // Stack: X Y
+  // move int(Y) as 64 bit integer to thread's stack
+  fistp_d(Address(rsp,0));    // Stack: X Y
 
 #ifdef ASSERT
-    jcc(Assembler::notEqual, y_not_huge);
+  jcc(Assembler::notEqual, y_not_huge);
 
-    // Y is huge so we know it's even. It may not fit in a 64 bit
-    // integer and we don't want the debug code below to see the
-    // integer indefinite value so overwrite int(Y) on the thread's
-    // stack with 0.
-    movl(Address(rsp, 0), 0);
-    movl(Address(rsp, 4), 0);
+  // Y is huge so we know it's even. It may not fit in a 64 bit
+  // integer and we don't want the debug code below to see the
+  // integer indefinite value so overwrite int(Y) on the thread's
+  // stack with 0.
+  movl(Address(rsp, 0), 0);
+  movl(Address(rsp, 4), 0);
 
-    bind(y_not_huge);
+  bind(y_not_huge);
 #endif
 
-    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
-    fld_s(1);                   // Stack: X Y X Y
-    fabs();                     // Stack: abs(X) Y X Y
-    fast_pow();                 // Stack: abs(X)^Y X Y
-    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
-    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.
-
-    pop(tmp2);
-    NOT_LP64(pop(tmp3));
-    jcc(Assembler::parity, slow_case);
+  fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
+  fld_s(1);                   // Stack: X Y X Y
+  fabs();                     // Stack: abs(X) Y X Y
+  fast_pow();                 // Stack: abs(X)^Y X Y
+  fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
+  // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.
+
+  pop(tmp2);
+  NOT_LP64(pop(tmp3));
+  jcc(Assembler::parity, slow_case);
 
 #ifdef ASSERT
-    // Check that int(Y) is not integer indefinite value (int
-    // overflow). Shouldn't happen because for values that would
-    // overflow, 1+int(Y)==Y which was tested earlier.
+  // Check that int(Y) is not integer indefinite value (int
+  // overflow). Shouldn't happen because for values that would
+  // overflow, 1+int(Y)==Y which was tested earlier.
 #ifndef _LP64
-    {
-      Label integer;
-      testl(tmp2, tmp2);
-      jcc(Assembler::notZero, integer);
-      cmpl(tmp3, 0x80000000);
-      jcc(Assembler::notZero, integer);
-      STOP("integer indefinite value shouldn't be seen here");
-      bind(integer);
-    }
+  {
+    Label integer;
+    testl(tmp2, tmp2);
+    jcc(Assembler::notZero, integer);
+    cmpl(tmp3, 0x80000000);
+    jcc(Assembler::notZero, integer);
+    STOP("integer indefinite value shouldn't be seen here");
+    bind(integer);
+  }
 #else
-    {
-      Label integer;
-      mov(tmp3, tmp2); // preserve tmp2 for parity check below
-      shlq(tmp3, 1);
-      jcc(Assembler::carryClear, integer);
-      jcc(Assembler::notZero, integer);
-      STOP("integer indefinite value shouldn't be seen here");
-      bind(integer);
-    }
+  {
+    Label integer;
+    mov(tmp3, tmp2); // preserve tmp2 for parity check below
+    shlq(tmp3, 1);
+    jcc(Assembler::carryClear, integer);
+    jcc(Assembler::notZero, integer);
+    STOP("integer indefinite value shouldn't be seen here");
+    bind(integer);
+  }
 #endif
 #endif
 
-    // get rid of duplicate arguments. Stack: X^Y
-    if (num_fpu_regs_in_use > 0) {
-      fxch(); fpop();
-      fxch(); fpop();
-    } else {
-      ffree(2);
-      ffree(1);
-    }
-
-    testl(tmp2, 1);
-    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
-    // X <= 0, Y even: X^Y = -abs(X)^Y
-
-    fchs();                     // Stack: -abs(X)^Y Y
-    jmp(done);
+  // get rid of duplicate arguments. Stack: X^Y
+  if (num_fpu_regs_in_use > 0) {
+    fxch(); fpop();
+    fxch(); fpop();
+  } else {
+    ffree(2);
+    ffree(1);
   }
 
+  testl(tmp2, 1);
+  jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
+  // X <= 0, Y odd: X^Y = -abs(X)^Y
+
+  fchs();                     // Stack: -abs(X)^Y Y
+  jmp(done);
+
   // slow case: runtime call
   bind(slow_case);
 
   fpop();                       // pop incorrect result or int(Y)
 
-  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
-                      is_exp ? 1 : 2, num_fpu_regs_in_use);
+  fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), 2, num_fpu_regs_in_use);
 
   // Come here with result in F-TOS
   bind(done);
--- old/src/cpu/x86/vm/macroAssembler_x86.hpp	2015-07-22 17:48:47.553625200 -0700
+++ new/src/cpu/x86/vm/macroAssembler_x86.hpp	2015-07-22 17:48:46.734625200 -0700
@@ -890,14 +890,14 @@
   // all corner cases and may result in NaN and require fallback to a
   // runtime call.
   void fast_pow();
-  void fast_exp();
+  void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+                Register rax, Register rcx, Register rdx, Register tmp);
   void increase_precision();
   void restore_precision();
 
-  // computes exp(x). Fallback to runtime call included.
-  void exp_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(true, num_fpu_regs_in_use); }
   // computes pow(x,y). Fallback to runtime call included.
-  void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(false, num_fpu_regs_in_use); }
+  void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(num_fpu_regs_in_use); }
 
 private:
 
@@ -908,7 +908,7 @@
   void pow_exp_core_encoding();
 
   // computes pow(x,y) or exp(x). Fallback to runtime call included.
-  void pow_or_exp(bool is_exp, int num_fpu_regs_in_use);
+  void pow_or_exp(int num_fpu_regs_in_use);
 
   // these are private because users should be doing movflt/movdbl
 
@@ -954,6 +954,10 @@
   void movsd(XMMRegister dst, Address src)     { Assembler::movsd(dst, src); }
   void movsd(XMMRegister dst, AddressLiteral src);
 
+  void mulpd(XMMRegister dst, XMMRegister src)    { Assembler::mulpd(dst, src); }
+  void mulpd(XMMRegister dst, Address src)        { Assembler::mulpd(dst, src); }
+  void mulpd(XMMRegister dst, AddressLiteral src);
+
   void mulsd(XMMRegister dst, XMMRegister src)    { Assembler::mulsd(dst, src); }
   void mulsd(XMMRegister dst, Address src)        { Assembler::mulsd(dst, src); }
   void mulsd(XMMRegister dst, AddressLiteral src);
--- old/src/cpu/x86/vm/stubGenerator_x86_32.cpp	2015-07-22 17:48:56.080625200 -0700
+++ new/src/cpu/x86/vm/stubGenerator_x86_32.cpp	2015-07-22 17:48:55.246625200 -0700
@@ -2129,14 +2129,6 @@
       __ ret(0);
     }
     {
-      StubCodeMark mark(this, "StubRoutines", "exp");
-      StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
-
-      __ fld_d(Address(rsp, 4));
-      __ exp_with_fallback(0);
-      __ ret(0);
-    }
-    {
       StubCodeMark mark(this, "StubRoutines", "pow");
       StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
 
@@ -2943,6 +2935,32 @@
     return start;
   }
 
+  // Emits the stub whose entry address the caller stores in StubRoutines::_dexp.
+  //
+  // The stub is the fast_exp() expansion wrapped in an enter()/leave() frame and
+  // followed by a return. xmm0-xmm7, rax, rcx and rdx are handed to fast_exp()
+  // directly.
+  //
+  // Returns the address of the first emitted instruction.
+  address generate_libmExp() {
+    address start = __ pc();
+
+    // General-purpose register passed as fast_exp()'s trailing 'tmp' argument
+    // (rbx in this 32-bit stub).
+    const Register tmp = rbx;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ fast_exp(xmm0, xmm1, xmm2, xmm3,
+                xmm4, xmm5, xmm6, xmm7,
+                rax, rcx, rdx, tmp);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+
   // Safefetch stubs.
   void generate_safefetch(const char* name, int size, address* entry,
                           address* fault_pc, address* continuation_pc) {
@@ -3156,6 +3174,9 @@
       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
     }
+    if (VM_Version::supports_sse2()) {
+      StubRoutines::_dexp = generate_libmExp();
+    }
   }
 
 
--- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp	2015-07-22 17:49:04.712625200 -0700
+++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp	2015-07-22 17:49:03.884625200 -0700
@@ -3016,19 +3016,6 @@
       __ ret(0);
     }
     {
-      StubCodeMark mark(this, "StubRoutines", "exp");
-      StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
-
-      __ subq(rsp, 8);
-      __ movdbl(Address(rsp, 0), xmm0);
-      __ fld_d(Address(rsp, 0));
-      __ exp_with_fallback(0);
-      __ fstp_d(Address(rsp, 0));
-      __ movdbl(xmm0, Address(rsp, 0));
-      __ addq(rsp, 8);
-      __ ret(0);
-    }
-    {
       StubCodeMark mark(this, "StubRoutines", "pow");
       StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
 
@@ -4059,6 +4046,44 @@
     return start;
   }
 
+  // Emits the stub whose entry address the caller stores in StubRoutines::_dexp.
+  //
+  // The stub is the fast_exp() expansion wrapped in an enter()/leave() frame and
+  // followed by a return. xmm0-xmm7, rax, rcx and rdx are handed to fast_exp()
+  // directly. On _WIN64 builds xmm6 and xmm7 are additionally saved before and
+  // restored after the expansion.
+  // Returns the address of the first emitted instruction.
+  address generate_libmExp() {
+    address start = __ pc();
+
+    // General-purpose register passed as fast_exp()'s trailing 'tmp' argument.
+    const Register tmp = r11;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+    // save the xmm registers which must be preserved 6-7
+    __ movdqu(xmm_save(6), as_XMMRegister(6));
+    __ movdqu(xmm_save(7), as_XMMRegister(7));
+#endif
+
+    __ fast_exp(xmm0, xmm1, xmm2, xmm3,
+                xmm4, xmm5, xmm6, xmm7,
+                rax, rcx, rdx, tmp);
+
+#ifdef _WIN64
+    // restore xmm regs belonging to calling function
+    __ movdqu(as_XMMRegister(6), xmm_save(6));
+    __ movdqu(as_XMMRegister(7), xmm_save(7));
+#endif
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
 
 #undef __
 #define __ masm->
@@ -4239,6 +4264,7 @@
       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
     }
+    StubRoutines::_dexp = generate_libmExp();
   }
 
   void generate_all() {
--- old/src/cpu/x86/vm/x86_32.ad	2015-07-22 17:49:13.416625200 -0700
+++ new/src/cpu/x86/vm/x86_32.ad	2015-07-22 17:49:12.578625200 -0700
@@ -9907,35 +9907,6 @@
   ins_pipe( pipe_slow );
 %}
 
-
-instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
-  predicate (UseSSE<=1);
-  match(Set dpr1 (ExpD dpr1));
-  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
-  format %{ "fast_exp $dpr1 -> $dpr1  // KILL $rax, $rcx, $rdx" %}
-  ins_encode %{
-    __ fast_exp();
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct expD_reg(regD dst, regD src, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
-  predicate (UseSSE>=2);
-  match(Set dst (ExpD src));
-  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
-  format %{ "fast_exp $dst -> $src  // KILL $rax, $rcx, $rdx" %}
-  ins_encode %{
-    __ subptr(rsp, 8);
-    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
-    __ fld_d(Address(rsp, 0));
-    __ fast_exp();
-    __ fstp_d(Address(rsp, 0));
-    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
-    __ addptr(rsp, 8);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   // The source Double operand on FPU stack
--- old/src/cpu/x86/vm/x86_64.ad	2015-07-22 17:49:22.413625200 -0700
+++ new/src/cpu/x86/vm/x86_64.ad	2015-07-22 17:49:21.587625200 -0700
@@ -9867,22 +9867,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct expD_reg(regD dst, regD src, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{
-  match(Set dst (ExpD src));
-  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
-  format %{ "fast_exp $dst -> $src  // KILL $rax, $rcx, $rdx" %}
-  ins_encode %{
-    __ subptr(rsp, 8);
-    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
-    __ fld_d(Address(rsp, 0));
-    __ fast_exp();
-    __ fstp_d(Address(rsp, 0));
-    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
-    __ addptr(rsp, 8);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 //----------Arithmetic Conversion Instructions---------------------------------
 
 instruct roundFloat_nop(regF dst)
--- old/src/share/vm/adlc/formssel.cpp	2015-07-22 17:49:31.340625200 -0700
+++ new/src/share/vm/adlc/formssel.cpp	2015-07-22 17:49:30.511625200 -0700
@@ -4006,7 +4006,6 @@
         strcmp(opType,"DivD")==0 ||
         strcmp(opType,"DivF")==0 ||
         strcmp(opType,"DivI")==0 ||
-        strcmp(opType,"ExpD")==0 ||
         strcmp(opType,"LogD")==0 ||
         strcmp(opType,"Log10D")==0 ||
         strcmp(opType,"ModD")==0 ||
--- old/src/share/vm/c1/c1_LIR.cpp	2015-07-22 17:49:40.021625200 -0700
+++ new/src/share/vm/c1/c1_LIR.cpp	2015-07-22 17:49:39.185625200 -0700
@@ -732,8 +732,7 @@
     case lir_sin:
     case lir_cos:
     case lir_log:
-    case lir_log10:
-    case lir_exp: {
+    case lir_log10: {
       assert(op->as_Op2() != NULL, "must be");
       LIR_Op2* op2 = (LIR_Op2*)op;
 
@@ -743,9 +742,6 @@
       // overlap with the input.
       assert(op2->_info == NULL, "not used");
       assert(op2->_tmp5->is_illegal(), "not used");
-      assert(op2->_tmp2->is_valid() == (op->code() == lir_exp), "not used");
-      assert(op2->_tmp3->is_valid() == (op->code() == lir_exp), "not used");
-      assert(op2->_tmp4->is_valid() == (op->code() == lir_exp), "not used");
       assert(op2->_opr1->is_valid(), "used");
       do_input(op2->_opr1); do_temp(op2->_opr1);
 
@@ -1775,7 +1771,6 @@
      case lir_tan:                   s = "tan";           break;
      case lir_log:                   s = "log";           break;
      case lir_log10:                 s = "log10";         break;
-     case lir_exp:                   s = "exp";           break;
      case lir_pow:                   s = "pow";           break;
      case lir_logic_and:             s = "logic_and";     break;
      case lir_logic_or:              s = "logic_or";      break;
--- old/src/share/vm/c1/c1_LIR.hpp	2015-07-22 17:49:48.582625200 -0700
+++ new/src/share/vm/c1/c1_LIR.hpp	2015-07-22 17:49:47.756625200 -0700
@@ -961,7 +961,6 @@
       , lir_tan
       , lir_log
       , lir_log10
-      , lir_exp
       , lir_pow
       , lir_logic_and
       , lir_logic_or
@@ -2199,7 +2198,6 @@
   void sin (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_sin , from, tmp1, to, tmp2)); }
   void cos (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_cos , from, tmp1, to, tmp2)); }
   void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); }
-  void exp (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5)                { append(new LIR_Op2(lir_exp , from, tmp1, to, tmp2, tmp3, tmp4, tmp5)); }
   void pow (LIR_Opr arg1, LIR_Opr arg2, LIR_Opr res, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_pow, arg1, arg2, res, tmp1, tmp2, tmp3, tmp4, tmp5)); }
 
   void add (LIR_Opr left, LIR_Opr right, LIR_Opr res)      { append(new LIR_Op2(lir_add, left, right, res)); }
--- old/src/share/vm/c1/c1_LIRAssembler.cpp	2015-07-22 17:49:57.170625200 -0700
+++ new/src/share/vm/c1/c1_LIRAssembler.cpp	2015-07-22 17:49:56.343625200 -0700
@@ -738,7 +738,6 @@
     case lir_cos:
     case lir_log:
     case lir_log10:
-    case lir_exp:
     case lir_pow:
       intrinsic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op);
       break;
--- old/src/share/vm/c1/c1_LIRGenerator.hpp	2015-07-22 17:50:05.708625200 -0700
+++ new/src/share/vm/c1/c1_LIRGenerator.hpp	2015-07-22 17:50:04.882625200 -0700
@@ -244,6 +244,7 @@
   void do_getClass(Intrinsic* x);
   void do_currentThread(Intrinsic* x);
   void do_MathIntrinsic(Intrinsic* x);
+  void do_ExpIntrinsic(Intrinsic* x);
   void do_ArrayCopy(Intrinsic* x);
   void do_CompareAndSwap(Intrinsic* x, ValueType* type);
   void do_NIOCheckIndex(Intrinsic* x);
--- old/src/share/vm/c1/c1_LinearScan.cpp	2015-07-22 17:50:14.231625200 -0700
+++ new/src/share/vm/c1/c1_LinearScan.cpp	2015-07-22 17:50:13.398625200 -0700
@@ -6588,7 +6588,6 @@
         case lir_log10:
         case lir_log:
         case lir_pow:
-        case lir_exp:
         case lir_logic_and:
         case lir_logic_or:
         case lir_logic_xor:
--- old/src/share/vm/c1/c1_Runtime1.cpp	2015-07-22 17:50:23.017625200 -0700
+++ new/src/share/vm/c1/c1_Runtime1.cpp	2015-07-22 17:50:22.188625200 -0700
@@ -317,6 +317,7 @@
   FUNCTION_CASE(entry, TRACE_TIME_METHOD);
 #endif
   FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32());
+  FUNCTION_CASE(entry, StubRoutines::dexp());
 
 #undef FUNCTION_CASE
 
--- old/src/share/vm/opto/classes.hpp	2015-07-22 17:50:31.729625200 -0700
+++ new/src/share/vm/opto/classes.hpp	2015-07-22 17:50:30.875625200 -0700
@@ -131,7 +131,6 @@
 macro(EncodeISOArray)
 macro(EncodeP)
 macro(EncodePKlass)
-macro(ExpD)
 macro(FastLock)
 macro(FastUnlock)
 macro(Goto)
--- old/src/share/vm/opto/compile.hpp	2015-07-22 17:50:40.836625200 -0700
+++ new/src/share/vm/opto/compile.hpp	2015-07-22 17:50:39.933625200 -0700
@@ -1095,7 +1095,7 @@
   bool           in_scratch_emit_size() const   { return _in_scratch_emit_size;     }
 
   enum ScratchBufferBlob {
-    MAX_inst_size       = 1024,
+    MAX_inst_size       = 32768,
     MAX_locs_size       = 128, // number of relocInfo elements
     MAX_const_size      = 128,
     MAX_stubs_size      = 128
--- old/src/share/vm/opto/library_call.cpp	2015-07-22 17:50:49.904625200 -0700
+++ new/src/share/vm/opto/library_call.cpp	2015-07-22 17:50:49.058625200 -0700
@@ -221,7 +221,6 @@
   bool inline_math_negateExactL();
   bool inline_math_subtractExactI(bool is_decrement);
   bool inline_math_subtractExactL(bool is_decrement);
-  bool inline_exp();
   bool inline_pow();
   Node* finish_pow_exp(Node* result, Node* x, Node* y, const TypeFunc* call_type, address funcAddr, const char* funcName);
   bool inline_min_max(vmIntrinsics::ID id);
@@ -1810,20 +1809,6 @@
   }
 }
 
-//------------------------------inline_exp-------------------------------------
-// Inline exp instructions, if possible.  The Intel hardware only misses
-// really odd corner cases (+/- Infinity).  Just uncommon-trap them.
-bool LibraryCallKit::inline_exp() {
-  Node* arg = round_double_node(argument(0));
-  Node* n   = _gvn.transform(new ExpDNode(C, control(), arg));
-
-  n = finish_pow_exp(n, arg, NULL, OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dexp), "EXP");
-  set_result(n);
-
-  C->set_has_split_ifs(true); // Has chance for split-if optimization
-  return true;
-}
-
 //------------------------------inline_pow-------------------------------------
 // Inline power instructions, if possible.
 bool LibraryCallKit::inline_pow() {
@@ -2051,8 +2036,9 @@
   case vmIntrinsics::_dsqrt:  return Matcher::match_rule_supported(Op_SqrtD) ? inline_math(id) : false;
   case vmIntrinsics::_dabs:   return Matcher::has_match_rule(Op_AbsD)   ? inline_math(id) : false;
 
-  case vmIntrinsics::_dexp:   return Matcher::has_match_rule(Op_ExpD)   ? inline_exp()    :
-    runtime_math(OptoRuntime::Math_D_D_Type(),  FN_PTR(SharedRuntime::dexp),  "EXP");
+  case vmIntrinsics::_dexp:   // call the generated exp stub when this platform installed one, else the shared C runtime
+    return StubRoutines::dexp() != NULL ? runtime_math(OptoRuntime::Math_D_D_Type(), StubRoutines::dexp(),  "dexp") :
+    runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dexp),  "EXP");
   case vmIntrinsics::_dpow:   return Matcher::has_match_rule(Op_PowD)   ? inline_pow()    :
     runtime_math(OptoRuntime::Math_DD_D_Type(), FN_PTR(SharedRuntime::dpow),  "POW");
 #undef FN_PTR
--- old/src/share/vm/opto/subnode.cpp	2015-07-22 17:50:58.757625200 -0700
+++ new/src/share/vm/opto/subnode.cpp	2015-07-22 17:50:57.930625200 -0700
@@ -1487,18 +1487,6 @@
 
 //=============================================================================
 //------------------------------Value------------------------------------------
-// Compute exp
-const Type *ExpDNode::Value( PhaseTransform *phase ) const {
-  const Type *t1 = phase->type( in(1) );
-  if( t1 == Type::TOP ) return Type::TOP;
-  if( t1->base() != Type::DoubleCon ) return Type::DOUBLE;
-  double d = t1->getd();
-  return TypeD::make( StubRoutines::intrinsic_exp( d ) );
-}
-
-
-//=============================================================================
-//------------------------------Value------------------------------------------
 // Compute pow
 const Type *PowDNode::Value( PhaseTransform *phase ) const {
   const Type *t1 = phase->type( in(1) );
--- old/src/share/vm/opto/subnode.hpp	2015-07-22 17:51:07.652625200 -0700
+++ new/src/share/vm/opto/subnode.hpp	2015-07-22 17:51:06.806625200 -0700
@@ -470,20 +470,6 @@
   virtual const Type *Value( PhaseTransform *phase ) const;
 };
 
-//------------------------------ExpDNode---------------------------------------
-//  Exponentiate a double
-class ExpDNode : public Node {
-public:
-  ExpDNode(Compile* C, Node *c, Node *in1) : Node(c, in1) {
-    init_flags(Flag_is_expensive);
-    C->add_expensive_node(this);
-  }
-  virtual int Opcode() const;
-  const Type *bottom_type() const { return Type::DOUBLE; }
-  virtual uint ideal_reg() const { return Op_RegD; }
-  virtual const Type *Value( PhaseTransform *phase ) const;
-};
-
 //------------------------------LogDNode---------------------------------------
 // Log_e of a double
 class LogDNode : public Node {
--- old/src/share/vm/runtime/stubRoutines.cpp	2015-07-22 17:51:16.846625200 -0700
+++ new/src/share/vm/runtime/stubRoutines.cpp	2015-07-22 17:51:16.008625200 -0700
@@ -146,9 +146,10 @@
 address StubRoutines::_montgomeryMultiply = NULL;
 address StubRoutines::_montgomerySquare = NULL;
 
+address StubRoutines::_dexp = NULL;
+
 double (* StubRoutines::_intrinsic_log   )(double) = NULL;
 double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
-double (* StubRoutines::_intrinsic_exp   )(double) = NULL;
 double (* StubRoutines::_intrinsic_pow   )(double, double) = NULL;
 double (* StubRoutines::_intrinsic_sin   )(double) = NULL;
 double (* StubRoutines::_intrinsic_cos   )(double) = NULL;
--- old/src/share/vm/runtime/stubRoutines.hpp	2015-07-22 17:51:25.546625200 -0700
+++ new/src/share/vm/runtime/stubRoutines.hpp	2015-07-22 17:51:24.700625200 -0700
@@ -205,6 +205,8 @@
   static address _montgomeryMultiply;
   static address _montgomerySquare;
 
+  static address _dexp;
+
   // These are versions of the java.lang.Math methods which perform
   // the same operations as the intrinsic version.  They are used for
   // constant folding in the compiler to ensure equivalence.  If the
@@ -213,7 +215,6 @@
   // SharedRuntime.
   static double (*_intrinsic_log)(double);
   static double (*_intrinsic_log10)(double);
-  static double (*_intrinsic_exp)(double);
   static double (*_intrinsic_pow)(double, double);
   static double (*_intrinsic_sin)(double);
   static double (*_intrinsic_cos)(double);
@@ -371,6 +372,8 @@
   static address montgomeryMultiply()  { return _montgomeryMultiply; }
   static address montgomerySquare()    { return _montgomerySquare; }
 
+  static address dexp()                { return _dexp; }  // entry of the exp stub; stays NULL (see stubRoutines.cpp) unless set elsewhere
+
   static address select_fill_function(BasicType t, bool aligned, const char* &name);
 
   static address zero_aligned_words()   { return _zero_aligned_words; }
@@ -383,10 +386,6 @@
     assert(_intrinsic_log != NULL, "must be defined");
     return _intrinsic_log10(d);
   }
-  static double  intrinsic_exp(double d) {
-    assert(_intrinsic_exp != NULL, "must be defined");
-    return _intrinsic_exp(d);
-  }
   static double  intrinsic_pow(double d, double d2) {
     assert(_intrinsic_pow != NULL, "must be defined");
     return _intrinsic_pow(d, d2);
--- old/src/share/vm/runtime/vmStructs.cpp	2015-07-22 17:51:34.174625200 -0700
+++ new/src/share/vm/runtime/vmStructs.cpp	2015-07-22 17:51:33.339625200 -0700
@@ -836,6 +836,7 @@
      static_field(StubRoutines,                _multiplyToLen,                                address)                               \
      static_field(StubRoutines,                _squareToLen,                                  address)                               \
      static_field(StubRoutines,                _mulAdd,                                       address)                               \
+     static_field(StubRoutines,                _dexp,                                         address)                               \
                                                                                                                                      \
   /*****************/                                                                                                                \
   /* SharedRuntime */                                                                                                                \
@@ -1990,7 +1991,6 @@
   declare_c2_type(TanDNode, Node)                                         \
   declare_c2_type(AtanDNode, Node)                                        \
   declare_c2_type(SqrtDNode, Node)                                        \
-  declare_c2_type(ExpDNode, Node)                                         \
   declare_c2_type(LogDNode, Node)                                         \
   declare_c2_type(Log10DNode, Node)                                       \
   declare_c2_type(PowDNode, Node)                                         \
--- /dev/null	2015-07-22 17:51:44.000000000 -0700
+++ new/src/cpu/x86/vm/macroAssembler_x86_libm.cpp	2015-07-22 17:51:42.163625200 -0700
@@ -0,0 +1,1085 @@
+/*
+* Copyright (c) 2007, 2015, Oracle and/or its affiliates. All rights reserved.
+* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+*
+* This code is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License version 2 only, as
+* published by the Free Software Foundation.
+*
+* This code is distributed in the hope that it will be useful, but WITHOUT
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+* version 2 for more details (a copy is included in the LICENSE file that
+* accompanied this code).
+*
+* You should have received a copy of the GNU General Public License version
+* 2 along with this work; if not, write to the Free Software Foundation,
+* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+*
+* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+* or visit www.oracle.com if you need additional information or have any
+* questions.
+*/
+
+/*
+* Intel Math Library (LIBM) Source Code
+* Copyright (c) 2015, Intel Corporation.
+*
+* This program is free software; you can redistribute it and/or modify it
+* under the terms and conditions of the GNU General Public License,
+* version 2, as published by the Free Software Foundation.
+*
+* This program is distributed in the hope it will be useful, but WITHOUT
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+* more details.
+*/
+
+/******************************************************************************/
+//                     ALGORITHM DESCRIPTION
+//                     ---------------------
+//
+// Description:
+//  Let K = 64 (table size).
+//        x    x/log(2)     n
+//       e  = 2          = 2 * T[j] * (1 + P(y))
+//  where
+//       x = m*log(2)/K + y,    y in [-log(2)/K..log(2)/K]
+//       m = n*K + j,           m,n,j - signed integer, j in [-K/2..K/2]
+//                  j/K
+//       values of 2   are tabulated as T[j] = T_hi[j] ( 1 + T_lo[j]).
+//
+//       P(y) is a minimax polynomial approximation of exp(y)-1
+//       on the small interval [-log(2)/K..log(2)/K] (coefficients were calculated with Maple V).
+//
+//  To avoid problems with arithmetic overflow and underflow,
+//            n                        n1  n2
+//  value of 2  is safely computed as 2 * 2 where n1 in [-BIAS/2..BIAS/2]
+//  where BIAS is a value of exponent bias.
+//
+// Special cases:
+//  exp(NaN) = NaN
+//  exp(+INF) = +INF
+//  exp(-INF) = 0
+//  exp(x) = 1 for subnormals
+//  for finite argument, only exp(0)=1 is exact
+//  For IEEE double
+//    if x >  709.782712893383973096 then exp(x) overflows
+//    if x < -745.133219101941108420 then exp(x) underflows
+//
+/******************************************************************************/
+
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+
+
+#ifdef _LP64
+// fast_exp (LP64): emits inline code that computes exp() of the double in xmm0 and leaves the result in xmm0.
+// input: xmm0
+// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+//          rax, rdx, rcx, tmp - r11
+
+void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                              XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+                              Register eax, Register ecx, Register edx, Register tmp) {
+  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
+  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
+  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
+  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;
+  assert_different_registers(tmp, eax, ecx, edx);
+  jmp(start);  // jump over the constant data that is emitted inline below
+  address cv = pc();  // 96 bytes of packed constants, loaded below at cv+0/16/32/48/64/80
+  emit_int32(1697350398);
+  emit_int32(1079448903);
+  emit_int32(1697350398);
+  emit_int32(1079448903);
+  emit_int32(4277796864);
+  emit_int32(1065758274);
+  emit_int32(4277796864);
+  emit_int32(1065758274);
+  emit_int32(3164486458);
+  emit_int32(1025308570);
+  emit_int32(3164486458);
+  emit_int32(1025308570);
+  emit_int32(4294967294);
+  emit_int32(1071644671);
+  emit_int32(4294967294);
+  emit_int32(1071644671);
+  emit_int32(3811088480);
+  emit_int32(1062650204);
+  emit_int32(1432067621);
+  emit_int32(1067799893);
+  emit_int32(3230715663);
+  emit_int32(1065423125);
+  emit_int32(1431604129);
+  emit_int32(1069897045);
+  address Shifter = pc();  // 0x4338000000000000 in both 64-bit lanes
+  emit_int32(0);
+  emit_int32(1127743488);
+  emit_int32(0);
+  emit_int32(1127743488);
+  address mmask = pc();  // 0x00000000FFFFFFC0 in both 64-bit lanes
+  emit_int32(4294967232);
+  emit_int32(0);
+  emit_int32(4294967232);
+  emit_int32(0);
+  address bias = pc();  // 0x000000000000FFC0 in both 64-bit lanes
+  emit_int32(65472);
+  emit_int32(0);
+  emit_int32(65472);
+  emit_int32(0);
+  address Tbl_addr = pc();  // 64 entries of 16 bytes each, indexed below by (eax & 63) << 4
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(235107661);
+  emit_int32(1018002367);
+  emit_int32(1048019040);
+  emit_int32(11418);
+  emit_int32(896005651);
+  emit_int32(1015861842);
+  emit_int32(3541402996);
+  emit_int32(22960);
+  emit_int32(1642514529);
+  emit_int32(1012987726);
+  emit_int32(410360776);
+  emit_int32(34629);
+  emit_int32(1568897900);
+  emit_int32(1016568486);
+  emit_int32(1828292879);
+  emit_int32(46424);
+  emit_int32(1882168529);
+  emit_int32(1010744893);
+  emit_int32(852742562);
+  emit_int32(58348);
+  emit_int32(509852888);
+  emit_int32(1017336174);
+  emit_int32(3490863952);
+  emit_int32(70401);
+  emit_int32(653277307);
+  emit_int32(1017431380);
+  emit_int32(2930322911);
+  emit_int32(82586);
+  emit_int32(1649557430);
+  emit_int32(1017729363);
+  emit_int32(1014845818);
+  emit_int32(94904);
+  emit_int32(1058231231);
+  emit_int32(1015777676);
+  emit_int32(3949972341);
+  emit_int32(107355);
+  emit_int32(1044000607);
+  emit_int32(1016786167);
+  emit_int32(828946858);
+  emit_int32(119943);
+  emit_int32(1151779725);
+  emit_int32(1015705409);
+  emit_int32(2288159958);
+  emit_int32(132667);
+  emit_int32(3819481236);
+  emit_int32(1016499965);
+  emit_int32(1853186616);
+  emit_int32(145530);
+  emit_int32(2552227826);
+  emit_int32(1015039787);
+  emit_int32(1709341917);
+  emit_int32(158533);
+  emit_int32(1829350193);
+  emit_int32(1015216097);
+  emit_int32(4112506593);
+  emit_int32(171677);
+  emit_int32(1913391795);
+  emit_int32(1015756674);
+  emit_int32(2799960843);
+  emit_int32(184965);
+  emit_int32(1303423926);
+  emit_int32(1015238005);
+  emit_int32(171030293);
+  emit_int32(198398);
+  emit_int32(1574172746);
+  emit_int32(1016061241);
+  emit_int32(2992903935);
+  emit_int32(211976);
+  emit_int32(3424156969);
+  emit_int32(1017196428);
+  emit_int32(926591434);
+  emit_int32(225703);
+  emit_int32(1938513547);
+  emit_int32(1017631273);
+  emit_int32(887463926);
+  emit_int32(239579);
+  emit_int32(2804567149);
+  emit_int32(1015390024);
+  emit_int32(1276261410);
+  emit_int32(253606);
+  emit_int32(631083525);
+  emit_int32(1017690182);
+  emit_int32(569847337);
+  emit_int32(267786);
+  emit_int32(1623370770);
+  emit_int32(1011049453);
+  emit_int32(1617004845);
+  emit_int32(282120);
+  emit_int32(3667985273);
+  emit_int32(1013894369);
+  emit_int32(3049340112);
+  emit_int32(296610);
+  emit_int32(3145379760);
+  emit_int32(1014403278);
+  emit_int32(3577096743);
+  emit_int32(311258);
+  emit_int32(2603100681);
+  emit_int32(1017152460);
+  emit_int32(1990012070);
+  emit_int32(326066);
+  emit_int32(3249202951);
+  emit_int32(1017448880);
+  emit_int32(1453150081);
+  emit_int32(341035);
+  emit_int32(419288974);
+  emit_int32(1016280325);
+  emit_int32(917841882);
+  emit_int32(356167);
+  emit_int32(3793507337);
+  emit_int32(1016095713);
+  emit_int32(3712504873);
+  emit_int32(371463);
+  emit_int32(728023093);
+  emit_int32(1016345318);
+  emit_int32(363667784);
+  emit_int32(386927);
+  emit_int32(2582678538);
+  emit_int32(1017123460);
+  emit_int32(2956612996);
+  emit_int32(402558);
+  emit_int32(7592966);
+  emit_int32(1016721543);
+  emit_int32(2186617380);
+  emit_int32(418360);
+  emit_int32(228611441);
+  emit_int32(1016696141);
+  emit_int32(1719614412);
+  emit_int32(434334);
+  emit_int32(2261665670);
+  emit_int32(1017457593);
+  emit_int32(1013258798);
+  emit_int32(450482);
+  emit_int32(544148907);
+  emit_int32(1017323666);
+  emit_int32(3907805043);
+  emit_int32(466805);
+  emit_int32(2383914918);
+  emit_int32(1017143586);
+  emit_int32(1447192520);
+  emit_int32(483307);
+  emit_int32(1176412038);
+  emit_int32(1017267372);
+  emit_int32(1944781190);
+  emit_int32(499988);
+  emit_int32(2882956373);
+  emit_int32(1013312481);
+  emit_int32(919555682);
+  emit_int32(516851);
+  emit_int32(3154077648);
+  emit_int32(1016528543);
+  emit_int32(2571947538);
+  emit_int32(533897);
+  emit_int32(348651999);
+  emit_int32(1016405780);
+  emit_int32(2604962540);
+  emit_int32(551129);
+  emit_int32(3253791412);
+  emit_int32(1015920431);
+  emit_int32(1110089947);
+  emit_int32(568549);
+  emit_int32(1509121860);
+  emit_int32(1014756995);
+  emit_int32(2568320822);
+  emit_int32(586158);
+  emit_int32(2617649212);
+  emit_int32(1017340090);
+  emit_int32(2966275556);
+  emit_int32(603959);
+  emit_int32(553214634);
+  emit_int32(1016457425);
+  emit_int32(2682146383);
+  emit_int32(621954);
+  emit_int32(730975783);
+  emit_int32(1014083580);
+  emit_int32(2191782032);
+  emit_int32(640145);
+  emit_int32(1486499517);
+  emit_int32(1016818996);
+  emit_int32(2069751140);
+  emit_int32(658534);
+  emit_int32(2595788928);
+  emit_int32(1016407932);
+  emit_int32(2990417244);
+  emit_int32(677123);
+  emit_int32(1853053619);
+  emit_int32(1015310724);
+  emit_int32(1434058175);
+  emit_int32(695915);
+  emit_int32(2462790535);
+  emit_int32(1015814775);
+  emit_int32(2572866477);
+  emit_int32(714911);
+  emit_int32(3693944214);
+  emit_int32(1017259110);
+  emit_int32(3092190714);
+  emit_int32(734114);
+  emit_int32(2979333550);
+  emit_int32(1017188654);
+  emit_int32(4076559942);
+  emit_int32(753526);
+  emit_int32(174054861);
+  emit_int32(1014300631);
+  emit_int32(2420883922);
+  emit_int32(773150);
+  emit_int32(816778419);
+  emit_int32(1014197934);
+  emit_int32(3716502172);
+  emit_int32(792987);
+  emit_int32(3507050924);
+  emit_int32(1015341199);
+  emit_int32(777507147);
+  emit_int32(813041);
+  emit_int32(1821514088);
+  emit_int32(1013410604);
+  emit_int32(3706687593);
+  emit_int32(833312);
+  emit_int32(920623539);
+  emit_int32(1016295433);
+  emit_int32(1242007931);
+  emit_int32(853805);
+  emit_int32(2789017511);
+  emit_int32(1014276997);
+  emit_int32(3707479175);
+  emit_int32(874520);
+  emit_int32(3586233004);
+  emit_int32(1015962192);
+  emit_int32(64696965);
+  emit_int32(895462);
+  emit_int32(474650514);
+  emit_int32(1016642419);
+  emit_int32(863738718);
+  emit_int32(916631);
+  emit_int32(1614448851);
+  emit_int32(1014281732);
+  emit_int32(3884662774);
+  emit_int32(938030);
+  emit_int32(2450082086);
+  emit_int32(1016164135);
+  emit_int32(2728693977);
+  emit_int32(959663);
+  emit_int32(1101668360);
+  emit_int32(1015989180);
+  emit_int32(3999357479);
+  emit_int32(981531);
+  emit_int32(835814894);
+  emit_int32(1015702697);
+  emit_int32(1533953344);
+  emit_int32(1003638);
+  emit_int32(1301400989);
+  emit_int32(1014466875);
+  emit_int32(2174652632);
+  emit_int32(1025985);
+  address ALLONES = pc();  // full 16 bytes of 1-bits: it is read with a 16-byte movdqu below
+  emit_int32(4294967295);
+  emit_int32(4294967295);
+  emit_int32(4294967295);
+  emit_int32(4294967295);
+  address ebias = pc();  // 0x3FF0000000000000 (1.0) in both 64-bit lanes
+  emit_int32(0);
+  emit_int32(1072693248);
+  emit_int32(0);
+  emit_int32(1072693248);
+  address XMAX = pc();  // 0x7FEFFFFFFFFFFFFF: largest finite double
+  emit_int32(4294967295);
+  emit_int32(2146435071);
+  address XMIN = pc();  // 0x0010000000000000: smallest normal double
+  emit_int32(0);
+  emit_int32(1048576);
+  address INF = pc();  // 0x7FF0000000000000: +infinity
+  emit_int32(0);
+  emit_int32(2146435072);
+  address ZERO = pc();  // +0.0
+  emit_int32(0);
+  emit_int32(0);
+  address ONE_val = pc();  // 0x3FF0000000000000: 1.0
+  emit_int32(0);
+  emit_int32(1072693248);
+  bind(start);
+  subq(rsp, 24);  // 24 bytes of scratch stack, released at B1_5
+  movsd(Address(rsp, 8), xmm0);  // keep the original argument at [rsp+8] for the special-case paths
+  unpcklpd(xmm0, xmm0);
+  movdqu(xmm1, InternalAddress(cv));
+  movdqu(xmm6, InternalAddress(Shifter));
+  movdqu(xmm2, InternalAddress(16+cv));
+  movdqu(xmm3, InternalAddress(32+cv));
+  pextrw(eax, xmm0, 3);  // top 16 bits of the argument (sign, exponent, high mantissa bits)
+  andl(eax, 32767);  // drop the sign bit
+  movl(edx, 16527);
+  subl(edx, eax);
+  subl(eax, 15504);
+  orl(edx, eax);
+  cmpl(edx, INT_MIN);  // sign bit set iff eax was above 16527 or below 15504
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
+  mulpd(xmm1, xmm0);
+  addpd(xmm1, xmm6);
+  movapd(xmm7, xmm1);
+  subpd(xmm1, xmm6);
+  mulpd(xmm2, xmm1);
+  movdqu(xmm4, InternalAddress(64+cv));
+  mulpd(xmm3, xmm1);
+  movdqu(xmm5, InternalAddress(80+cv));
+  subpd(xmm0, xmm2);
+  movdl(eax, xmm7);
+  movl(ecx, eax);
+  andl(ecx, 63);  // low 6 bits select the table entry ...
+  shll(ecx, 4);  // ... scaled by the 16-byte entry size
+  sarl(eax, 6);
+  movl(edx, eax);
+  movdqu(xmm6, InternalAddress(mmask));
+  pand(xmm7, xmm6);
+  movdqu(xmm6, InternalAddress(bias));
+  paddq(xmm7, xmm6);
+  psllq(xmm7, 46);
+  subpd(xmm0, xmm3);
+  lea(tmp, InternalAddress(Tbl_addr));
+  movdqu(xmm2, Address(ecx,tmp));  // 16-byte table entry at Tbl_addr + ecx
+  mulpd(xmm4, xmm0);
+  movapd(xmm6, xmm0);
+  movapd(xmm1, xmm0);
+  mulpd(xmm6, xmm6);
+  mulpd(xmm0, xmm6);
+  addpd(xmm5, xmm4);
+  mulsd(xmm0, xmm6);
+  mulpd(xmm6, InternalAddress(48+cv));
+  addsd(xmm1, xmm2);
+  unpckhpd(xmm2, xmm2);
+  mulpd(xmm0, xmm5);
+  addsd(xmm1, xmm0);
+  por(xmm2, xmm7);
+  unpckhpd(xmm0, xmm0);
+  addsd(xmm0, xmm1);
+  addsd(xmm0, xmm6);
+  addl(edx, 894);
+  cmpl(edx, 1916);
+  jcc(Assembler::above, L_2TAG_PACKET_1_0_2);  // unsigned compare: taken when edx is outside [0, 1916]
+  mulsd(xmm0, xmm2);
+  addsd(xmm0, xmm2);
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_1_0_2);
+  xorpd(xmm3, xmm3);
+  movdqu(xmm4, InternalAddress(ALLONES));
+  movl(edx, -1022);
+  subl(edx, eax);
+  movdl(xmm5, edx);
+  psllq(xmm4, xmm5);
+  movl(ecx, eax);
+  sarl(eax, 1);
+  pinsrw(xmm3, eax, 3);
+  movdqu(xmm6, InternalAddress(ebias));
+  psllq(xmm3, 4);
+  psubd(xmm2, xmm3);
+  mulsd(xmm0, xmm2);
+  cmpl(edx, 52);
+  jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
+  pand(xmm4, xmm2);
+  paddd(xmm3, xmm6);
+  subsd(xmm2, xmm4);
+  addsd(xmm0, xmm2);
+  cmpl(ecx, 1023);
+  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32768);
+  orl(edx, ecx);
+  cmpl(edx, 0);
+  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
+  movapd(xmm6, xmm0);
+  addsd(xmm0, xmm4);
+  mulsd(xmm0, xmm3);
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32752);  // isolate the exponent field of the result
+  cmpl(ecx, 0);
+  jcc(Assembler::equal, L_2TAG_PACKET_5_0_2);  // zero exponent field: result is zero or subnormal
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_5_0_2);
+  mulsd(xmm6, xmm3);
+  mulsd(xmm4, xmm3);
+  movdqu(xmm0, xmm6);
+  pxor(xmm6, xmm4);
+  psrad(xmm6, 31);
+  pshufd(xmm6, xmm6, 85);
+  psllq(xmm0, 1);
+  psrlq(xmm0, 1);
+  pxor(xmm0, xmm6);
+  psrlq(xmm6, 63);
+  paddq(xmm0, xmm6);
+  paddq(xmm0, xmm4);
+  movl(Address(rsp,0), 15);  // NOTE(review): the code stored at [rsp] is never read back in this function
+  jmp(L_2TAG_PACKET_6_0_2);
+  bind(L_2TAG_PACKET_4_0_2);
+  addsd(xmm0, xmm4);
+  mulsd(xmm0, xmm3);
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_3_0_2);
+  addsd(xmm0, xmm4);
+  mulsd(xmm0, xmm3);
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32752);
+  cmpl(ecx, 32752);  // all-ones exponent field: the result overflowed
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_2_0_2);
+  paddd(xmm3, xmm6);
+  addpd(xmm0, xmm2);
+  mulsd(xmm0, xmm3);
+  movl(Address(rsp,0), 15);
+  jmp(L_2TAG_PACKET_6_0_2);
+  bind(L_2TAG_PACKET_8_0_2);  // entered from PACKET_0 with eax = high dword of |x|, at least 0x40900000
+  cmpl(eax, 2146435072);  // 0x7FF00000 or above: argument is infinite or NaN
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_9_0_2);
+  movl(eax, Address(rsp,12));  // high dword of the original argument, sign included
+  cmpl(eax, INT_MIN);  // unsigned >= 0x80000000 means the argument is negative
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_10_0_2);
+  movsd(xmm0, InternalAddress(XMAX));
+  mulsd(xmm0, xmm0);  // XMAX * XMAX: produce the overflow result by computation
+  bind(L_2TAG_PACKET_7_0_2);
+  movl(Address(rsp,0), 14);
+  jmp(L_2TAG_PACKET_6_0_2);
+  bind(L_2TAG_PACKET_10_0_2);
+  movsd(xmm0, InternalAddress(XMIN));
+  mulsd(xmm0, xmm0);  // XMIN * XMIN: produce the underflow result by computation
+  movl(Address(rsp,0), 15);
+  jmp(L_2TAG_PACKET_6_0_2);
+  bind(L_2TAG_PACKET_9_0_2);
+  movl(edx, Address(rsp,8));  // low dword of the original argument
+  cmpl(eax, 2146435072);
+  jcc(Assembler::above, L_2TAG_PACKET_11_0_2);  // high dword above 0x7FF00000: NaN
+  cmpl(edx, 0);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);  // non-zero low mantissa bits: NaN
+  movl(eax, Address(rsp,12));
+  cmpl(eax, 2146435072);  // exactly 0x7FF00000 with sign clear: +infinity
+  jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_2);
+  movsd(xmm0, InternalAddress(INF));
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_12_0_2);  // -infinity: return +0.0
+  movsd(xmm0, InternalAddress(ZERO));
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_11_0_2);  // NaN: return x + x
+  movsd(xmm0, Address(rsp, 8));
+  addsd(xmm0, xmm0);
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_0_0_2);  // argument failed the range check at the top
+  movl(eax, Address(rsp, 12));
+  andl(eax, 2147483647);  // high dword of |x|
+  cmpl(eax, 1083179008);  // 0x40900000 is the high dword of 1024.0
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_8_0_2);
+  movsd(Address(rsp, 8), xmm0);
+  addsd(xmm0, InternalAddress(ONE_val));  // remaining case (very small |x|): return 1.0 + x
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_6_0_2);
+  movq(Address(rsp, 16), xmm0);
+  bind(B1_3);
+  movq(xmm0, Address(rsp, 16));
+  bind(L_2TAG_PACKET_13_0_2);
+  bind(B1_5);  // common exit: result is in xmm0
+  addq(rsp, 24);
+}
+#endif
+
+#ifndef _LP64
+// Registers:
+// input: (rbp + 8)
+// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+//          rax, rdx, rcx, rbx (tmp)
+
+void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                              XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+                              Register eax, Register ecx, Register edx, Register tmp) {
+  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
+  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
+  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
+  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;
+  
+  assert_different_registers(tmp, eax, ecx, edx);
+  jmp(start);
+  address static_const_table = pc();
+  emit_int32(0);
+  emit_int32(4293918720u);
+  emit_int32(0);
+  emit_int32(4293918720u);
+  emit_int32(4294967232u);
+  emit_int32(0);
+  emit_int32(4294967232u);
+  emit_int32(0);
+  emit_int32(65472u);
+  emit_int32(0);
+  emit_int32(65472u);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(1127743488u);
+  emit_int32(0);
+  emit_int32(1127743488u);
+  emit_int32(1697350398u);
+  emit_int32(1079448903u);
+  emit_int32(1697350398u);
+  emit_int32(1079448903u);
+  emit_int32(4277796864u);
+  emit_int32(1065758274u);
+  emit_int32(4277796864u);
+  emit_int32(1065758274u);
+  emit_int32(3164486458u);
+  emit_int32(1025308570u);
+  emit_int32(3164486458u);
+  emit_int32(1025308570u);
+  emit_int32(4294967294u);
+  emit_int32(1071644671u);
+  emit_int32(4294967294u);
+  emit_int32(1071644671u);
+  emit_int32(3811088480u);
+  emit_int32(1062650204u);
+  emit_int32(1432067621u);
+  emit_int32(1067799893u);
+  emit_int32(3230715663u);
+  emit_int32(1065423125u);
+  emit_int32(1431604129u);
+  emit_int32(1069897045u);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(235107661u);
+  emit_int32(1018002367u);
+  emit_int32(1048019040u);
+  emit_int32(11418u);
+  emit_int32(896005651u);
+  emit_int32(1015861842u);
+  emit_int32(3541402996u);
+  emit_int32(22960u);
+  emit_int32(1642514529u);
+  emit_int32(1012987726u);
+  emit_int32(410360776u);
+  emit_int32(34629u);
+  emit_int32(1568897900u);
+  emit_int32(1016568486u);
+  emit_int32(1828292879u);
+  emit_int32(46424u);
+  emit_int32(1882168529u);
+  emit_int32(1010744893u);
+  emit_int32(852742562u);
+  emit_int32(58348u);
+  emit_int32(509852888u);
+  emit_int32(1017336174u);
+  emit_int32(3490863952u);
+  emit_int32(70401u);
+  emit_int32(653277307u);
+  emit_int32(1017431380u);
+  emit_int32(2930322911u);
+  emit_int32(82586u);
+  emit_int32(1649557430u);
+  emit_int32(1017729363u);
+  emit_int32(1014845818u);
+  emit_int32(94904u);
+  emit_int32(1058231231u);
+  emit_int32(1015777676u);
+  emit_int32(3949972341u);
+  emit_int32(107355u);
+  emit_int32(1044000607u);
+  emit_int32(1016786167u);
+  emit_int32(828946858u);
+  emit_int32(119943u);
+  emit_int32(1151779725u);
+  emit_int32(1015705409u);
+  emit_int32(2288159958u);
+  emit_int32(132667u);
+  emit_int32(3819481236u);
+  emit_int32(1016499965u);
+  emit_int32(1853186616u);
+  emit_int32(145530u);
+  emit_int32(2552227826u);
+  emit_int32(1015039787u);
+  emit_int32(1709341917u);
+  emit_int32(158533u);
+  emit_int32(1829350193u);
+  emit_int32(1015216097u);
+  emit_int32(4112506593u);
+  emit_int32(171677u);
+  emit_int32(1913391795u);
+  emit_int32(1015756674u);
+  emit_int32(2799960843u);
+  emit_int32(184965u);
+  emit_int32(1303423926u);
+  emit_int32(1015238005u);
+  emit_int32(171030293u);
+  emit_int32(198398u);
+  emit_int32(1574172746u);
+  emit_int32(1016061241u);
+  emit_int32(2992903935u);
+  emit_int32(211976u);
+  emit_int32(3424156969u);
+  emit_int32(1017196428u);
+  emit_int32(926591434u);
+  emit_int32(225703u);
+  emit_int32(1938513547u);
+  emit_int32(1017631273u);
+  emit_int32(887463926u);
+  emit_int32(239579u);
+  emit_int32(2804567149u);
+  emit_int32(1015390024u);
+  emit_int32(1276261410u);
+  emit_int32(253606u);
+  emit_int32(631083525u);
+  emit_int32(1017690182u);
+  emit_int32(569847337u);
+  emit_int32(267786u);
+  emit_int32(1623370770u);
+  emit_int32(1011049453u);
+  emit_int32(1617004845u);
+  emit_int32(282120u);
+  emit_int32(3667985273u);
+  emit_int32(1013894369u);
+  emit_int32(3049340112u);
+  emit_int32(296610u);
+  emit_int32(3145379760u);
+  emit_int32(1014403278u);
+  emit_int32(3577096743u);
+  emit_int32(311258u);
+  emit_int32(2603100681u);
+  emit_int32(1017152460u);
+  emit_int32(1990012070u);
+  emit_int32(326066u);
+  emit_int32(3249202951u);
+  emit_int32(1017448880u);
+  emit_int32(1453150081u);
+  emit_int32(341035u);
+  emit_int32(419288974u);
+  emit_int32(1016280325u);
+  emit_int32(917841882u);
+  emit_int32(356167u);
+  emit_int32(3793507337u);
+  emit_int32(1016095713u);
+  emit_int32(3712504873u);
+  emit_int32(371463u);
+  emit_int32(728023093u);
+  emit_int32(1016345318u);
+  emit_int32(363667784u);
+  emit_int32(386927u);
+  emit_int32(2582678538u);
+  emit_int32(1017123460u);
+  emit_int32(2956612996u);
+  emit_int32(402558u);
+  emit_int32(7592966u);
+  emit_int32(1016721543u);
+  emit_int32(2186617380u);
+  emit_int32(418360u);
+  emit_int32(228611441u);
+  emit_int32(1016696141u);
+  emit_int32(1719614412u);
+  emit_int32(434334u);
+  emit_int32(2261665670u);
+  emit_int32(1017457593u);
+  emit_int32(1013258798u);
+  emit_int32(450482u);
+  emit_int32(544148907u);
+  emit_int32(1017323666u);
+  emit_int32(3907805043u);
+  emit_int32(466805u);
+  emit_int32(2383914918u);
+  emit_int32(1017143586u);
+  emit_int32(1447192520u);
+  emit_int32(483307u);
+  emit_int32(1176412038u);
+  emit_int32(1017267372u);
+  emit_int32(1944781190u);
+  emit_int32(499988u);
+  emit_int32(2882956373u);
+  emit_int32(1013312481u);
+  emit_int32(919555682u);
+  emit_int32(516851u);
+  emit_int32(3154077648u);
+  emit_int32(1016528543u);
+  emit_int32(2571947538u);
+  emit_int32(533897u);
+  emit_int32(348651999u);
+  emit_int32(1016405780u);
+  emit_int32(2604962540u);
+  emit_int32(551129u);
+  emit_int32(3253791412u);
+  emit_int32(1015920431u);
+  emit_int32(1110089947u);
+  emit_int32(568549u);
+  emit_int32(1509121860u);
+  emit_int32(1014756995u);
+  emit_int32(2568320822u);
+  emit_int32(586158u);
+  emit_int32(2617649212u);
+  emit_int32(1017340090u);
+  emit_int32(2966275556u);
+  emit_int32(603959u);
+  emit_int32(553214634u);
+  emit_int32(1016457425u);
+  emit_int32(2682146383u);
+  emit_int32(621954u);
+  emit_int32(730975783u);
+  emit_int32(1014083580u);
+  emit_int32(2191782032u);
+  emit_int32(640145u);
+  emit_int32(1486499517u);
+  emit_int32(1016818996u);
+  emit_int32(2069751140u);
+  emit_int32(658534u);
+  emit_int32(2595788928u);
+  emit_int32(1016407932u);
+  emit_int32(2990417244u);
+  emit_int32(677123u);
+  emit_int32(1853053619u);
+  emit_int32(1015310724u);
+  emit_int32(1434058175u);
+  emit_int32(695915u);
+  emit_int32(2462790535u);
+  emit_int32(1015814775u);
+  emit_int32(2572866477u);
+  emit_int32(714911u);
+  emit_int32(3693944214u);
+  emit_int32(1017259110u);
+  emit_int32(3092190714u);
+  emit_int32(734114u);
+  emit_int32(2979333550u);
+  emit_int32(1017188654u);
+  emit_int32(4076559942u);
+  emit_int32(753526u);
+  emit_int32(174054861u);
+  emit_int32(1014300631u);
+  emit_int32(2420883922u);
+  emit_int32(773150u);
+  emit_int32(816778419u);
+  emit_int32(1014197934u);
+  emit_int32(3716502172u);
+  emit_int32(792987u);
+  emit_int32(3507050924u);
+  emit_int32(1015341199u);
+  emit_int32(777507147u);
+  emit_int32(813041u);
+  emit_int32(1821514088u);
+  emit_int32(1013410604u);
+  emit_int32(3706687593u);
+  emit_int32(833312u);
+  emit_int32(920623539u);
+  emit_int32(1016295433u);
+  emit_int32(1242007931u);
+  emit_int32(853805u);
+  emit_int32(2789017511u);
+  emit_int32(1014276997u);
+  emit_int32(3707479175u);
+  emit_int32(874520u);
+  emit_int32(3586233004u);
+  emit_int32(1015962192u);
+  emit_int32(64696965u);
+  emit_int32(895462u);
+  emit_int32(474650514u);
+  emit_int32(1016642419u);
+  emit_int32(863738718u);
+  emit_int32(916631u);
+  emit_int32(1614448851u);
+  emit_int32(1014281732u);
+  emit_int32(3884662774u);
+  emit_int32(938030u);
+  emit_int32(2450082086u);
+  emit_int32(1016164135u);
+  emit_int32(2728693977u);
+  emit_int32(959663u);
+  emit_int32(1101668360u);
+  emit_int32(1015989180u);
+  emit_int32(3999357479u);
+  emit_int32(981531u);
+  emit_int32(835814894u);
+  emit_int32(1015702697u);
+  emit_int32(1533953344u);
+  emit_int32(1003638u);
+  emit_int32(1301400989u);
+  emit_int32(1014466875u);
+  emit_int32(2174652632u);
+  emit_int32(1025985u);
+  emit_int32(0);
+  emit_int32(1072693248u);
+  emit_int32(0);
+  emit_int32(2146435072u);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(4294967295u);
+  emit_int32(2146435071u);
+  emit_int32(0);
+  emit_int32(1048576u);
+  bind(start);
+  subl(rsp, 120);
+  movl(Address(rsp, 64), tmp);
+  lea(tmp, InternalAddress(static_const_table));
+  movdqu(xmm0, Address(rsp, 128));
+  unpcklpd(xmm0, xmm0);
+  movdqu(xmm1, Address(tmp, 64));
+  movdqu(xmm6, Address(tmp, 48));
+  movdqu(xmm2, Address(tmp, 80));
+  movdqu(xmm3, Address(tmp, 96));
+  pextrw(eax, xmm0, 3);
+  andl(eax, 32767);
+  movl(edx, 16527);
+  subl(edx, eax);
+  subl(eax, 15504);
+  orl(edx, eax);
+  cmpl(edx, INT_MIN);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
+  mulpd(xmm1, xmm0);
+  addpd(xmm1, xmm6);
+  movapd(xmm7, xmm1);
+  subpd(xmm1, xmm6);
+  mulpd(xmm2, xmm1);
+  movdqu(xmm4, Address(tmp, 128));
+  mulpd(xmm3, xmm1);
+  movdqu(xmm5, Address(tmp, 144));
+  subpd(xmm0, xmm2);
+  movdl(eax, xmm7);
+  movl(ecx, eax);
+  andl(ecx, 63);
+  shll(ecx, 4);
+  sarl(eax, 6);
+  movl(edx, eax);
+  movdqu(xmm6, Address(tmp, 16));
+  pand(xmm7, xmm6);
+  movdqu(xmm6, Address(tmp, 32));
+  paddq(xmm7, xmm6);
+  psllq(xmm7, 46);
+  subpd(xmm0, xmm3);
+  movdqu(xmm2, Address(tmp, ecx, Address::times_1, 160));
+  mulpd(xmm4, xmm0);
+  movapd(xmm6, xmm0);
+  movapd(xmm1, xmm0);
+  mulpd(xmm6, xmm6);
+  mulpd(xmm0, xmm6);
+  addpd(xmm5, xmm4);
+  mulsd(xmm0, xmm6);
+  mulpd(xmm6, Address(tmp, 112));
+  addsd(xmm1, xmm2);
+  unpckhpd(xmm2, xmm2);
+  mulpd(xmm0, xmm5);
+  addsd(xmm1, xmm0);
+  por(xmm2, xmm7);
+  unpckhpd(xmm0, xmm0);
+  addsd(xmm0, xmm1);
+  addsd(xmm0, xmm6);
+  addl(edx, 894);
+  cmpl(edx, 1916);
+  jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
+  mulsd(xmm0, xmm2);
+  addsd(xmm0, xmm2);
+  jmp(L_2TAG_PACKET_2_0_2);
+  bind(L_2TAG_PACKET_1_0_2);
+  fnstcw(Address(rsp, 24));
+  movzwl(edx, Address(rsp, 24));
+  orl(edx, 768);
+  movw(Address(rsp, 28), edx);
+  fldcw(Address(rsp, 28));
+  movl(edx, eax);
+  sarl(eax, 1);
+  subl(edx, eax);
+  movdqu(xmm6, Address(tmp, 0));
+  pandn(xmm6, xmm2);
+  addl(eax, 1023);
+  movdl(xmm3, eax);
+  psllq(xmm3, 52);
+  por(xmm6, xmm3);
+  addl(edx, 1023);
+  movdl(xmm4, edx);
+  psllq(xmm4, 52);
+  movsd(Address(rsp, 8), xmm0);
+  fld_d(Address(rsp, 8));
+  movsd(Address(rsp, 16), xmm6);
+  fld_d(Address(rsp, 16));
+  fmula(1);
+  faddp(1);
+  movsd(Address(rsp, 8), xmm4);
+  fld_d(Address(rsp, 8));
+  fmulp(1);
+  fstp_d(Address(rsp, 8));
+  movsd(xmm0,Address(rsp, 8));
+  fldcw(Address(rsp, 24));
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32752);
+  cmpl(ecx, 32752);
+  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
+  cmpl(ecx, 0);
+  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
+  jmp(L_2TAG_PACKET_2_0_2);
+  cmpl(ecx, INT_MIN);
+  jcc(Assembler::less, L_2TAG_PACKET_3_0_2);
+  cmpl(ecx, -1064950997);
+  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
+  jcc(Assembler::greater, L_2TAG_PACKET_4_0_2);
+  movl(edx, Address(rsp, 128));
+  cmpl(edx ,-17155601);
+  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
+  jmp(L_2TAG_PACKET_4_0_2);
+  bind(L_2TAG_PACKET_3_0_2);
+  movl(edx, 14);
+  jmp(L_2TAG_PACKET_5_0_2);
+  bind(L_2TAG_PACKET_4_0_2);
+  movl(edx, 15);
+  bind(L_2TAG_PACKET_5_0_2);
+  movsd(Address(rsp, 0), xmm0);
+  movsd(xmm0, Address(rsp, 128));
+  fld_d(Address(rsp, 0));
+  jmp(L_2TAG_PACKET_6_0_2);
+  bind(L_2TAG_PACKET_7_0_2);
+  cmpl(eax, 2146435072);
+  jcc(Assembler::greaterEqual, L_2TAG_PACKET_8_0_2);
+  movl(eax, Address(rsp, 132));
+  cmpl(eax, INT_MIN);
+  jcc(Assembler::greaterEqual, L_2TAG_PACKET_9_0_2);
+  movsd(xmm0, Address(tmp, 1208));
+  mulsd(xmm0, xmm0);
+  movl(edx, 14);
+  jmp(L_2TAG_PACKET_5_0_2);
+  bind(L_2TAG_PACKET_9_0_2);
+  movsd(xmm0, Address(tmp, 1216));
+  mulsd(xmm0, xmm0);
+  movl(edx, 15);
+  jmp(L_2TAG_PACKET_5_0_2);
+  bind(L_2TAG_PACKET_8_0_2);
+  movl(edx, Address(rsp, 128));
+  cmpl(eax, 2146435072);
+  jcc(Assembler::above, L_2TAG_PACKET_10_0_2);
+  cmpl(edx, 0);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_10_0_2);
+  movl(eax, Address(rsp, 132));
+  cmpl(eax, 2146435072);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
+  movsd(xmm0, Address(tmp, 1192));
+  jmp(L_2TAG_PACKET_2_0_2);
+  bind(L_2TAG_PACKET_11_0_2);
+  movsd(xmm0, Address(tmp, 1200));
+  jmp(L_2TAG_PACKET_2_0_2);
+  bind(L_2TAG_PACKET_10_0_2);
+  movsd(xmm0, Address(rsp, 128));
+  addsd(xmm0, xmm0);
+  jmp(L_2TAG_PACKET_2_0_2);
+  bind(L_2TAG_PACKET_0_0_2);
+  movl(eax, Address(rsp, 132));
+  andl(eax, 2147483647);
+  cmpl(eax, 1083179008);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
+  movsd(xmm0, Address(rsp, 128));
+  addsd(xmm0, Address(tmp, 1184));
+  jmp(L_2TAG_PACKET_2_0_2);
+  bind(L_2TAG_PACKET_2_0_2);
+  movsd(Address(rsp, 48), xmm0);
+  fld_d(Address(rsp, 48));
+  bind(L_2TAG_PACKET_6_0_2);
+  movl(tmp, Address(rsp, 64));
+}
+
+#endif