--- old/src/cpu/x86/vm/assembler_x86.cpp 2015-07-22 17:47:38.484625200 -0700 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2015-07-22 17:47:37.642625200 -0700 @@ -2894,6 +2894,15 @@ emit_int8(imm8); } +void Assembler::pextrw(Register dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse2(), ""); + int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F, + false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); + emit_int8((unsigned char)0xC5); // SSE2 form 66 0F C5 /r ib (0F 3A 15 is the SSE4.1 form) + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8); +} + void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A, @@ -2912,6 +2921,15 @@ emit_int8(imm8); } +void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) { + assert(VM_Version::supports_sse2(), ""); + int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F, + false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); + emit_int8((unsigned char)0xC4); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8); +} + void Assembler::pmovzxbw(XMMRegister dst, Address src) { assert(VM_Version::supports_sse4_1(), ""); if (VM_Version::supports_evex()) { @@ -3899,6 +3917,15 @@ } } +void Assembler::mulpd(XMMRegister dst, Address src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x59, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x59, dst, src, VEX_SIMD_66); + } +} + void Assembler::mulps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE); @@ -4058,6 +4085,24 @@ (VM_Version::supports_avx512dq() == false)); } +void Assembler::unpckhpd(XMMRegister dst, XMMRegister src) { + 
NOT_LP64(assert(VM_Version::supports_sse2(), "")); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x15, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x15, dst, src, VEX_SIMD_66); + } +} + +void Assembler::unpcklpd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x14, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x14, dst, src, VEX_SIMD_66); + } +} + void Assembler::xorpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { @@ -4676,6 +4721,15 @@ emit_simd_arith(0xDB, dst, src, VEX_SIMD_66); } +void Assembler::pandn(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0xDF, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0xDF, dst, src, VEX_SIMD_66); + } +} + void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len); --- old/src/cpu/x86/vm/assembler_x86.hpp 2015-07-22 17:47:47.288625200 -0700 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2015-07-22 17:47:46.456625200 -0700 @@ -1653,10 +1653,14 @@ // SSE 4.1 extract void pextrd(Register dst, XMMRegister src, int imm8); void pextrq(Register dst, XMMRegister src, int imm8); + // SSE 2 extract + void pextrw(Register dst, XMMRegister src, int imm8); // SSE 4.1 insert void pinsrd(XMMRegister dst, Register src, int imm8); void pinsrq(XMMRegister dst, Register src, int imm8); + // SSE 2 insert + void pinsrw(XMMRegister dst, Register src, int imm8); // SSE4.1 packed move void pmovzxbw(XMMRegister dst, XMMRegister src); @@ -1906,6 +1910,7 @@ // Multiply Packed Floating-Point Values void mulpd(XMMRegister dst, XMMRegister src); + void mulpd(XMMRegister dst, Address src); 
void mulps(XMMRegister dst, XMMRegister src); void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -1928,6 +1933,9 @@ void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void unpckhpd(XMMRegister dst, XMMRegister src); + void unpcklpd(XMMRegister dst, XMMRegister src); + // Bitwise Logical XOR of Packed Floating-Point Values void xorpd(XMMRegister dst, XMMRegister src); void xorps(XMMRegister dst, XMMRegister src); @@ -2020,6 +2028,7 @@ // And packed integers void pand(XMMRegister dst, XMMRegister src); + void pandn(XMMRegister dst, XMMRegister src); void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len); --- old/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp 2015-07-22 17:47:55.936625200 -0700 +++ new/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp 2015-07-22 17:47:55.106625200 -0700 @@ -2459,9 +2459,6 @@ // Should consider not saving rbx, if not necessary __ trigfunc('t', op->as_Op2()->fpu_stack_size()); break; - case lir_exp : - __ exp_with_fallback(op->as_Op2()->fpu_stack_size()); - break; case lir_pow : __ pow_with_fallback(op->as_Op2()->fpu_stack_size()); break; --- old/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp 2015-07-22 17:48:04.602625200 -0700 +++ new/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp 2015-07-22 17:48:03.773625200 -0700 @@ -808,6 +808,12 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) { assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type"); + + if (x->id() == vmIntrinsics::_dexp) { + do_ExpIntrinsic(x); + return; + } + LIRItem value(x->argument_at(0), this); bool use_fpu = false; @@ -818,7 +824,6 @@ case vmIntrinsics::_dtan: case vmIntrinsics::_dlog: case vmIntrinsics::_dlog10: - case 
vmIntrinsics::_dexp: case vmIntrinsics::_dpow: use_fpu = true; } @@ -870,7 +875,6 @@ case vmIntrinsics::_dtan: __ tan (calc_input, calc_result, tmp1, tmp2); break; case vmIntrinsics::_dlog: __ log (calc_input, calc_result, tmp1); break; case vmIntrinsics::_dlog10: __ log10(calc_input, calc_result, tmp1); break; - case vmIntrinsics::_dexp: __ exp (calc_input, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break; case vmIntrinsics::_dpow: __ pow (calc_input, calc_input2, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break; default: ShouldNotReachHere(); } @@ -880,6 +884,32 @@ } } +void LIRGenerator::do_ExpIntrinsic(Intrinsic* x) { + LIRItem value(x->argument_at(0), this); + value.set_destroys_register(); + + LIR_Opr calc_result = rlock_result(x); + LIR_Opr result_reg = result_register_for(x->type()); + + BasicTypeList signature(1); + signature.append(T_DOUBLE); + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + + value.load_item_force(cc->at(0)); + +#ifndef _LP64 + LIR_Opr tmp = FrameMap::fpu0_double_opr; + result_reg = tmp; + if (VM_Version::supports_sse2()) { + __ call_runtime_leaf(StubRoutines::dexp(), getThreadTemp(), result_reg, cc->args()); + } else { + __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dexp), getThreadTemp(), result_reg, cc->args()); + } +#else + __ call_runtime_leaf(StubRoutines::dexp(), getThreadTemp(), result_reg, cc->args()); +#endif + __ move(result_reg, calc_result); +} void LIRGenerator::do_ArrayCopy(Intrinsic* x) { assert(x->number_of_arguments() == 5, "wrong type"); --- old/src/cpu/x86/vm/c1_LinearScan_x86.cpp 2015-07-22 17:48:13.172625200 -0700 +++ new/src/cpu/x86/vm/c1_LinearScan_x86.cpp 2015-07-22 17:48:12.339625200 -0700 @@ -814,8 +814,7 @@ case lir_tan: case lir_sin: - case lir_cos: - case lir_exp: { + case lir_cos: { // sin, cos and exp need two temporary fpu stack slots, so there are two temporary // registers (stored in 
right and temp of the operation). // the stack allocator must guarantee that the stack slots are really free, --- old/src/cpu/x86/vm/interpreter_x86_32.cpp 2015-07-22 17:48:21.718625200 -0700 +++ new/src/cpu/x86/vm/interpreter_x86_32.cpp 2015-07-22 17:48:20.878625200 -0700 @@ -151,11 +151,15 @@ __ pop_fTOS(); break; case Interpreter::java_lang_math_exp: - __ exp_with_fallback(0); - // Store to stack to convert 80bit precision back to 64bits - __ push_fTOS(); - __ pop_fTOS(); - break; + __ subptr(rsp, 2*wordSize); + __ fstp_d(Address(rsp, 0)); + if (VM_Version::supports_sse2()) { + __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dexp()))); + } else { + __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::dexp))); + } + __ addptr(rsp, 2*wordSize); + break; default : ShouldNotReachHere(); } --- old/src/cpu/x86/vm/interpreter_x86_64.cpp 2015-07-22 17:48:30.196625200 -0700 +++ new/src/cpu/x86/vm/interpreter_x86_64.cpp 2015-07-22 17:48:29.374625200 -0700 @@ -252,6 +252,9 @@ if (kind == Interpreter::java_lang_math_sqrt) { __ sqrtsd(xmm0, Address(rsp, wordSize)); + } else if (kind == Interpreter::java_lang_math_exp) { + __ movdbl(xmm0, Address(rsp, wordSize)); + __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dexp()))); } else { __ fld_d(Address(rsp, wordSize)); switch (kind) { @@ -278,9 +281,6 @@ // empty stack slot) __ pow_with_fallback(0); break; - case Interpreter::java_lang_math_exp: - __ exp_with_fallback(0); - break; default : ShouldNotReachHere(); } --- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-07-22 17:48:38.721625200 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-07-22 17:48:37.892625200 -0700 @@ -3027,6 +3027,15 @@ Assembler::fldcw(as_Address(src)); } +void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) { + if (reachable(src)) { + Assembler::mulpd(dst, as_Address(src)); + } else { + lea(rscratch1, src); + Assembler::mulpd(dst, Address(rscratch1, 0)); + } +} + void 
MacroAssembler::pow_exp_core_encoding() { // kills rax, rcx, rdx subptr(rsp,sizeof(jdouble)); @@ -3099,19 +3108,7 @@ BLOCK_COMMENT("} fast_pow"); } -void MacroAssembler::fast_exp() { - // computes exp(X) = 2^(X * log2(e)) - // if fast computation is not possible, result is NaN. Requires - // fallback from user of this macro. - // increase precision for intermediate steps of the computation - increase_precision(); - fldl2e(); // Stack: log2(e) X ... - fmulp(1); // Stack: (X*log2(e)) ... - pow_exp_core_encoding(); // Stack: exp(X) ... - restore_precision(); -} - -void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) { +void MacroAssembler::pow_or_exp(int num_fpu_regs_in_use) { // kills rax, rcx, rdx // pow and exp needs 2 extra registers on the fpu stack. Label slow_case, done; @@ -3123,182 +3120,164 @@ Register tmp2 = rax; Register tmp3 = rcx; - if (is_exp) { - // Stack: X - fld_s(0); // duplicate argument for runtime call. Stack: X X - fast_exp(); // Stack: exp(X) X - fcmp(tmp, 0, false, false); // Stack: exp(X) X - // exp(X) not equal to itself: exp(X) is NaN go to slow case. - jcc(Assembler::parity, slow_case); - // get rid of duplicate argument. Stack: exp(X) - if (num_fpu_regs_in_use > 0) { - fxch(); - fpop(); - } else { - ffree(1); - } - jmp(done); + // Stack: X Y + Label x_negative, y_not_2; + + static double two = 2.0; + ExternalAddress two_addr((address)&two); + + // constant maybe too far on 64 bit + lea(tmp2, two_addr); + fld_d(Address(tmp2, 0)); // Stack: 2 X Y + fcmp(tmp, 2, true, false); // Stack: X Y + jcc(Assembler::parity, y_not_2); + jcc(Assembler::notEqual, y_not_2); + + fxch(); fpop(); // Stack: X + fmul(0); // Stack: X*X + + jmp(done); + + bind(y_not_2); + + fldz(); // Stack: 0 X Y + fcmp(tmp, 1, true, false); // Stack: X Y + jcc(Assembler::above, x_negative); + + // X >= 0 + + fld_s(1); // duplicate arguments for runtime call. 
Stack: Y X Y + fld_s(1); // Stack: X Y X Y + fast_pow(); // Stack: X^Y X Y + fcmp(tmp, 0, false, false); // Stack: X^Y X Y + // X^Y not equal to itself: X^Y is NaN go to slow case. + jcc(Assembler::parity, slow_case); + // get rid of duplicate arguments. Stack: X^Y + if (num_fpu_regs_in_use > 0) { + fxch(); fpop(); + fxch(); fpop(); } else { - // Stack: X Y - Label x_negative, y_not_2; + ffree(2); + ffree(1); + } + jmp(done); - static double two = 2.0; - ExternalAddress two_addr((address)&two); + // X <= 0 + bind(x_negative); - // constant maybe too far on 64 bit - lea(tmp2, two_addr); - fld_d(Address(tmp2, 0)); // Stack: 2 X Y - fcmp(tmp, 2, true, false); // Stack: X Y - jcc(Assembler::parity, y_not_2); - jcc(Assembler::notEqual, y_not_2); - - fxch(); fpop(); // Stack: X - fmul(0); // Stack: X*X - - jmp(done); - - bind(y_not_2); - - fldz(); // Stack: 0 X Y - fcmp(tmp, 1, true, false); // Stack: X Y - jcc(Assembler::above, x_negative); - - // X >= 0 - - fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y - fld_s(1); // Stack: X Y X Y - fast_pow(); // Stack: X^Y X Y - fcmp(tmp, 0, false, false); // Stack: X^Y X Y - // X^Y not equal to itself: X^Y is NaN go to slow case. - jcc(Assembler::parity, slow_case); - // get rid of duplicate arguments. Stack: X^Y - if (num_fpu_regs_in_use > 0) { - fxch(); fpop(); - fxch(); fpop(); - } else { - ffree(2); - ffree(1); - } - jmp(done); - - // X <= 0 - bind(x_negative); - - fld_s(1); // Stack: Y X Y - frndint(); // Stack: int(Y) X Y - fcmp(tmp, 2, false, false); // Stack: int(Y) X Y - jcc(Assembler::notEqual, slow_case); - - subptr(rsp, 8); - - // For X^Y, when X < 0, Y has to be an integer and the final - // result depends on whether it's odd or even. We just checked - // that int(Y) == Y. We move int(Y) to gp registers as a 64 bit - // integer to test its parity. If int(Y) is huge and doesn't fit - // in the 64 bit integer range, the integer indefinite value will - // end up in the gp registers. 
Huge numbers are all even, the - // integer indefinite number is even so it's fine. + fld_s(1); // Stack: Y X Y + frndint(); // Stack: int(Y) X Y + fcmp(tmp, 2, false, false); // Stack: int(Y) X Y + jcc(Assembler::notEqual, slow_case); + + subptr(rsp, 8); + + // For X^Y, when X < 0, Y has to be an integer and the final + // result depends on whether it's odd or even. We just checked + // that int(Y) == Y. We move int(Y) to gp registers as a 64 bit + // integer to test its parity. If int(Y) is huge and doesn't fit + // in the 64 bit integer range, the integer indefinite value will + // end up in the gp registers. Huge numbers are all even, the + // integer indefinite number is even so it's fine. #ifdef ASSERT - // Let's check we don't end up with an integer indefinite number - // when not expected. First test for huge numbers: check whether - // int(Y)+1 == int(Y) which is true for very large numbers and - // those are all even. A 64 bit integer is guaranteed to not - // overflow for numbers where y+1 != y (when precision is set to - // double precision). - Label y_not_huge; + // Let's check we don't end up with an integer indefinite number + // when not expected. First test for huge numbers: check whether + // int(Y)+1 == int(Y) which is true for very large numbers and + // those are all even. A 64 bit integer is guaranteed to not + // overflow for numbers where y+1 != y (when precision is set to + // double precision). 
+ Label y_not_huge; - fld1(); // Stack: 1 int(Y) X Y - fadd(1); // Stack: 1+int(Y) int(Y) X Y + fld1(); // Stack: 1 int(Y) X Y + fadd(1); // Stack: 1+int(Y) int(Y) X Y #ifdef _LP64 - // trip to memory to force the precision down from double extended - // precision - fstp_d(Address(rsp, 0)); - fld_d(Address(rsp, 0)); + // trip to memory to force the precision down from double extended + // precision + fstp_d(Address(rsp, 0)); + fld_d(Address(rsp, 0)); #endif - fcmp(tmp, 1, true, false); // Stack: int(Y) X Y + fcmp(tmp, 1, true, false); // Stack: int(Y) X Y #endif - // move int(Y) as 64 bit integer to thread's stack - fistp_d(Address(rsp,0)); // Stack: X Y + // move int(Y) as 64 bit integer to thread's stack + fistp_d(Address(rsp,0)); // Stack: X Y #ifdef ASSERT - jcc(Assembler::notEqual, y_not_huge); + jcc(Assembler::notEqual, y_not_huge); - // Y is huge so we know it's even. It may not fit in a 64 bit - // integer and we don't want the debug code below to see the - // integer indefinite value so overwrite int(Y) on the thread's - // stack with 0. - movl(Address(rsp, 0), 0); - movl(Address(rsp, 4), 0); + // Y is huge so we know it's even. It may not fit in a 64 bit + // integer and we don't want the debug code below to see the + // integer indefinite value so overwrite int(Y) on the thread's + // stack with 0. + movl(Address(rsp, 0), 0); + movl(Address(rsp, 4), 0); - bind(y_not_huge); + bind(y_not_huge); #endif - fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y - fld_s(1); // Stack: X Y X Y - fabs(); // Stack: abs(X) Y X Y - fast_pow(); // Stack: abs(X)^Y X Y - fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y - // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case. - - pop(tmp2); - NOT_LP64(pop(tmp3)); - jcc(Assembler::parity, slow_case); + fld_s(1); // duplicate arguments for runtime call. 
Stack: Y X Y + fld_s(1); // Stack: X Y X Y + fabs(); // Stack: abs(X) Y X Y + fast_pow(); // Stack: abs(X)^Y X Y + fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y + // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case. + + pop(tmp2); + NOT_LP64(pop(tmp3)); + jcc(Assembler::parity, slow_case); #ifdef ASSERT - // Check that int(Y) is not integer indefinite value (int - // overflow). Shouldn't happen because for values that would - // overflow, 1+int(Y)==Y which was tested earlier. + // Check that int(Y) is not integer indefinite value (int + // overflow). Shouldn't happen because for values that would + // overflow, 1+int(Y)==Y which was tested earlier. #ifndef _LP64 - { - Label integer; - testl(tmp2, tmp2); - jcc(Assembler::notZero, integer); - cmpl(tmp3, 0x80000000); - jcc(Assembler::notZero, integer); - STOP("integer indefinite value shouldn't be seen here"); - bind(integer); - } + { + Label integer; + testl(tmp2, tmp2); + jcc(Assembler::notZero, integer); + cmpl(tmp3, 0x80000000); + jcc(Assembler::notZero, integer); + STOP("integer indefinite value shouldn't be seen here"); + bind(integer); + } #else - { - Label integer; - mov(tmp3, tmp2); // preserve tmp2 for parity check below - shlq(tmp3, 1); - jcc(Assembler::carryClear, integer); - jcc(Assembler::notZero, integer); - STOP("integer indefinite value shouldn't be seen here"); - bind(integer); - } + { + Label integer; + mov(tmp3, tmp2); // preserve tmp2 for parity check below + shlq(tmp3, 1); + jcc(Assembler::carryClear, integer); + jcc(Assembler::notZero, integer); + STOP("integer indefinite value shouldn't be seen here"); + bind(integer); + } #endif #endif - // get rid of duplicate arguments. 
Stack: X^Y - if (num_fpu_regs_in_use > 0) { - fxch(); fpop(); - fxch(); fpop(); - } else { - ffree(2); - ffree(1); - } - - testl(tmp2, 1); - jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y - // X <= 0, Y even: X^Y = -abs(X)^Y - - fchs(); // Stack: -abs(X)^Y Y - jmp(done); + // get rid of duplicate arguments. Stack: X^Y + if (num_fpu_regs_in_use > 0) { + fxch(); fpop(); + fxch(); fpop(); + } else { + ffree(2); + ffree(1); } + testl(tmp2, 1); + jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y + // X <= 0, Y odd: X^Y = -abs(X)^Y + + fchs(); // Stack: -abs(X)^Y + jmp(done); + // slow case: runtime call bind(slow_case); fpop(); // pop incorrect result or int(Y) - fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow), - is_exp ? 1 : 2, num_fpu_regs_in_use); + fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), 2, num_fpu_regs_in_use); // Come here with result in F-TOS bind(done); --- old/src/cpu/x86/vm/macroAssembler_x86.hpp 2015-07-22 17:48:47.553625200 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.hpp 2015-07-22 17:48:46.734625200 -0700 @@ -890,14 +890,14 @@ // all corner cases and may result in NaN and require fallback to a // runtime call. void fast_pow(); - void fast_exp(); + void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, + XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, + Register rax, Register rcx, Register rdx, Register tmp); void increase_precision(); void restore_precision(); - // computes exp(x). Fallback to runtime call included. - void exp_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(true, num_fpu_regs_in_use); } // computes pow(x,y). Fallback to runtime call included. 
- void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(false, num_fpu_regs_in_use); } + void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(num_fpu_regs_in_use); } private: @@ -908,7 +908,7 @@ void pow_exp_core_encoding(); // computes pow(x,y) or exp(x). Fallback to runtime call included. - void pow_or_exp(bool is_exp, int num_fpu_regs_in_use); + void pow_or_exp(int num_fpu_regs_in_use); // these are private because users should be doing movflt/movdbl @@ -954,6 +954,10 @@ void movsd(XMMRegister dst, Address src) { Assembler::movsd(dst, src); } void movsd(XMMRegister dst, AddressLiteral src); + void mulpd(XMMRegister dst, XMMRegister src) { Assembler::mulpd(dst, src); } + void mulpd(XMMRegister dst, Address src) { Assembler::mulpd(dst, src); } + void mulpd(XMMRegister dst, AddressLiteral src); + void mulsd(XMMRegister dst, XMMRegister src) { Assembler::mulsd(dst, src); } void mulsd(XMMRegister dst, Address src) { Assembler::mulsd(dst, src); } void mulsd(XMMRegister dst, AddressLiteral src); --- old/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2015-07-22 17:48:56.080625200 -0700 +++ new/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2015-07-22 17:48:55.246625200 -0700 @@ -2129,14 +2129,6 @@ __ ret(0); } { - StubCodeMark mark(this, "StubRoutines", "exp"); - StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc(); - - __ fld_d(Address(rsp, 4)); - __ exp_with_fallback(0); - __ ret(0); - } - { StubCodeMark mark(this, "StubRoutines", "pow"); StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc(); @@ -2943,6 +2935,32 @@ return start; } + address generate_libmExp() { + address start = __ pc(); + + const XMMRegister x0 = xmm0; + const XMMRegister x1 = xmm1; + const XMMRegister x2 = xmm2; + const XMMRegister x3 = xmm3; + + const XMMRegister x4 = xmm4; + const XMMRegister x5 = xmm5; + const XMMRegister x6 = xmm6; + const XMMRegister x7 = xmm7; + + const Register tmp = rbx; + + BLOCK_COMMENT("Entry:"); + __ enter(); // required for proper stackwalking of 
RuntimeStub frame + __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + + } + + // Safefetch stubs. void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) { @@ -3156,6 +3174,9 @@ StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table; StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); } + if (VM_Version::supports_sse2()) { + StubRoutines::_dexp = generate_libmExp(); + } } --- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2015-07-22 17:49:04.712625200 -0700 +++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2015-07-22 17:49:03.884625200 -0700 @@ -3016,19 +3016,6 @@ __ ret(0); } { - StubCodeMark mark(this, "StubRoutines", "exp"); - StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc(); - - __ subq(rsp, 8); - __ movdbl(Address(rsp, 0), xmm0); - __ fld_d(Address(rsp, 0)); - __ exp_with_fallback(0); - __ fstp_d(Address(rsp, 0)); - __ movdbl(xmm0, Address(rsp, 0)); - __ addq(rsp, 8); - __ ret(0); - } - { StubCodeMark mark(this, "StubRoutines", "pow"); StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc(); @@ -4059,6 +4046,44 @@ return start; } + address generate_libmExp() { + address start = __ pc(); + + const XMMRegister x0 = xmm0; + const XMMRegister x1 = xmm1; + const XMMRegister x2 = xmm2; + const XMMRegister x3 = xmm3; + + const XMMRegister x4 = xmm4; + const XMMRegister x5 = xmm5; + const XMMRegister x6 = xmm6; + const XMMRegister x7 = xmm7; + + const Register tmp = r11; + + BLOCK_COMMENT("Entry:"); + __ enter(); // required for proper stackwalking of RuntimeStub frame + +#ifdef _WIN64 + // save the xmm registers which must be preserved 6-7 + __ movdqu(xmm_save(6), as_XMMRegister(6)); + __ movdqu(xmm_save(7), as_XMMRegister(7)); +#endif + __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); + +#ifdef _WIN64 + // restore xmm regs belonging to 
calling function + __ movdqu(as_XMMRegister(6), xmm_save(6)); + __ movdqu(as_XMMRegister(7), xmm_save(7)); +#endif + + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + + } + #undef __ #define __ masm-> @@ -4239,6 +4264,7 @@ StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table; StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); } + StubRoutines::_dexp = generate_libmExp(); } void generate_all() { --- old/src/cpu/x86/vm/x86_32.ad 2015-07-22 17:49:13.416625200 -0700 +++ new/src/cpu/x86/vm/x86_32.ad 2015-07-22 17:49:12.578625200 -0700 @@ -9907,35 +9907,6 @@ ins_pipe( pipe_slow ); %} - -instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{ - predicate (UseSSE<=1); - match(Set dpr1 (ExpD dpr1)); - effect(KILL rax, KILL rcx, KILL rdx, KILL cr); - format %{ "fast_exp $dpr1 -> $dpr1 // KILL $rax, $rcx, $rdx" %} - ins_encode %{ - __ fast_exp(); - %} - ins_pipe( pipe_slow ); -%} - -instruct expD_reg(regD dst, regD src, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{ - predicate (UseSSE>=2); - match(Set dst (ExpD src)); - effect(KILL rax, KILL rcx, KILL rdx, KILL cr); - format %{ "fast_exp $dst -> $src // KILL $rax, $rcx, $rdx" %} - ins_encode %{ - __ subptr(rsp, 8); - __ movdbl(Address(rsp, 0), $src$$XMMRegister); - __ fld_d(Address(rsp, 0)); - __ fast_exp(); - __ fstp_d(Address(rsp, 0)); - __ movdbl($dst$$XMMRegister, Address(rsp, 0)); - __ addptr(rsp, 8); - %} - ins_pipe( pipe_slow ); -%} - instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{ predicate (UseSSE<=1); // The source Double operand on FPU stack --- old/src/cpu/x86/vm/x86_64.ad 2015-07-22 17:49:22.413625200 -0700 +++ new/src/cpu/x86/vm/x86_64.ad 2015-07-22 17:49:21.587625200 -0700 @@ -9867,22 +9867,6 @@ ins_pipe( pipe_slow ); %} -instruct expD_reg(regD dst, regD src, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{ - match(Set dst (ExpD src)); - effect(KILL rax, KILL rcx, KILL rdx, 
KILL cr); - format %{ "fast_exp $dst -> $src // KILL $rax, $rcx, $rdx" %} - ins_encode %{ - __ subptr(rsp, 8); - __ movdbl(Address(rsp, 0), $src$$XMMRegister); - __ fld_d(Address(rsp, 0)); - __ fast_exp(); - __ fstp_d(Address(rsp, 0)); - __ movdbl($dst$$XMMRegister, Address(rsp, 0)); - __ addptr(rsp, 8); - %} - ins_pipe( pipe_slow ); -%} - //----------Arithmetic Conversion Instructions--------------------------------- instruct roundFloat_nop(regF dst) --- old/src/share/vm/adlc/formssel.cpp 2015-07-22 17:49:31.340625200 -0700 +++ new/src/share/vm/adlc/formssel.cpp 2015-07-22 17:49:30.511625200 -0700 @@ -4006,7 +4006,6 @@ strcmp(opType,"DivD")==0 || strcmp(opType,"DivF")==0 || strcmp(opType,"DivI")==0 || - strcmp(opType,"ExpD")==0 || strcmp(opType,"LogD")==0 || strcmp(opType,"Log10D")==0 || strcmp(opType,"ModD")==0 || --- old/src/share/vm/c1/c1_LIR.cpp 2015-07-22 17:49:40.021625200 -0700 +++ new/src/share/vm/c1/c1_LIR.cpp 2015-07-22 17:49:39.185625200 -0700 @@ -732,8 +732,7 @@ case lir_sin: case lir_cos: case lir_log: - case lir_log10: - case lir_exp: { + case lir_log10: { assert(op->as_Op2() != NULL, "must be"); LIR_Op2* op2 = (LIR_Op2*)op; @@ -743,9 +742,6 @@ // overlap with the input. 
assert(op2->_info == NULL, "not used"); assert(op2->_tmp5->is_illegal(), "not used"); - assert(op2->_tmp2->is_valid() == (op->code() == lir_exp), "not used"); - assert(op2->_tmp3->is_valid() == (op->code() == lir_exp), "not used"); - assert(op2->_tmp4->is_valid() == (op->code() == lir_exp), "not used"); assert(op2->_opr1->is_valid(), "used"); do_input(op2->_opr1); do_temp(op2->_opr1); @@ -1775,7 +1771,6 @@ case lir_tan: s = "tan"; break; case lir_log: s = "log"; break; case lir_log10: s = "log10"; break; - case lir_exp: s = "exp"; break; case lir_pow: s = "pow"; break; case lir_logic_and: s = "logic_and"; break; case lir_logic_or: s = "logic_or"; break; --- old/src/share/vm/c1/c1_LIR.hpp 2015-07-22 17:49:48.582625200 -0700 +++ new/src/share/vm/c1/c1_LIR.hpp 2015-07-22 17:49:47.756625200 -0700 @@ -961,7 +961,6 @@ , lir_tan , lir_log , lir_log10 - , lir_exp , lir_pow , lir_logic_and , lir_logic_or @@ -2199,7 +2198,6 @@ void sin (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_sin , from, tmp1, to, tmp2)); } void cos (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_cos , from, tmp1, to, tmp2)); } void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); } - void exp (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_exp , from, tmp1, to, tmp2, tmp3, tmp4, tmp5)); } void pow (LIR_Opr arg1, LIR_Opr arg2, LIR_Opr res, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_pow, arg1, arg2, res, tmp1, tmp2, tmp3, tmp4, tmp5)); } void add (LIR_Opr left, LIR_Opr right, LIR_Opr res) { append(new LIR_Op2(lir_add, left, right, res)); } --- old/src/share/vm/c1/c1_LIRAssembler.cpp 2015-07-22 17:49:57.170625200 -0700 +++ new/src/share/vm/c1/c1_LIRAssembler.cpp 2015-07-22 17:49:56.343625200 -0700 @@ -738,7 +738,6 @@ case lir_cos: case lir_log: case 
lir_log10: - case lir_exp: case lir_pow: intrinsic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op); break; --- old/src/share/vm/c1/c1_LIRGenerator.hpp 2015-07-22 17:50:05.708625200 -0700 +++ new/src/share/vm/c1/c1_LIRGenerator.hpp 2015-07-22 17:50:04.882625200 -0700 @@ -244,6 +244,7 @@ void do_getClass(Intrinsic* x); void do_currentThread(Intrinsic* x); void do_MathIntrinsic(Intrinsic* x); + void do_ExpIntrinsic(Intrinsic* x); void do_ArrayCopy(Intrinsic* x); void do_CompareAndSwap(Intrinsic* x, ValueType* type); void do_NIOCheckIndex(Intrinsic* x); --- old/src/share/vm/c1/c1_LinearScan.cpp 2015-07-22 17:50:14.231625200 -0700 +++ new/src/share/vm/c1/c1_LinearScan.cpp 2015-07-22 17:50:13.398625200 -0700 @@ -6588,7 +6588,6 @@ case lir_log10: case lir_log: case lir_pow: - case lir_exp: case lir_logic_and: case lir_logic_or: case lir_logic_xor: --- old/src/share/vm/c1/c1_Runtime1.cpp 2015-07-22 17:50:23.017625200 -0700 +++ new/src/share/vm/c1/c1_Runtime1.cpp 2015-07-22 17:50:22.188625200 -0700 @@ -317,6 +317,7 @@ FUNCTION_CASE(entry, TRACE_TIME_METHOD); #endif FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32()); + FUNCTION_CASE(entry, StubRoutines::dexp()); #undef FUNCTION_CASE --- old/src/share/vm/opto/classes.hpp 2015-07-22 17:50:31.729625200 -0700 +++ new/src/share/vm/opto/classes.hpp 2015-07-22 17:50:30.875625200 -0700 @@ -131,7 +131,6 @@ macro(EncodeISOArray) macro(EncodeP) macro(EncodePKlass) -macro(ExpD) macro(FastLock) macro(FastUnlock) macro(Goto) --- old/src/share/vm/opto/compile.hpp 2015-07-22 17:50:40.836625200 -0700 +++ new/src/share/vm/opto/compile.hpp 2015-07-22 17:50:39.933625200 -0700 @@ -1095,7 +1095,7 @@ bool in_scratch_emit_size() const { return _in_scratch_emit_size; } enum ScratchBufferBlob { - MAX_inst_size = 1024, + MAX_inst_size = 32768, MAX_locs_size = 128, // number of relocInfo elements MAX_const_size = 128, MAX_stubs_size = 128 --- old/src/share/vm/opto/library_call.cpp 2015-07-22 17:50:49.904625200 -0700 +++ 
new/src/share/vm/opto/library_call.cpp 2015-07-22 17:50:49.058625200 -0700 @@ -221,7 +221,6 @@ bool inline_math_negateExactL(); bool inline_math_subtractExactI(bool is_decrement); bool inline_math_subtractExactL(bool is_decrement); - bool inline_exp(); bool inline_pow(); Node* finish_pow_exp(Node* result, Node* x, Node* y, const TypeFunc* call_type, address funcAddr, const char* funcName); bool inline_min_max(vmIntrinsics::ID id); @@ -1810,20 +1809,6 @@ } } -//------------------------------inline_exp------------------------------------- -// Inline exp instructions, if possible. The Intel hardware only misses -// really odd corner cases (+/- Infinity). Just uncommon-trap them. -bool LibraryCallKit::inline_exp() { - Node* arg = round_double_node(argument(0)); - Node* n = _gvn.transform(new ExpDNode(C, control(), arg)); - - n = finish_pow_exp(n, arg, NULL, OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dexp), "EXP"); - set_result(n); - - C->set_has_split_ifs(true); // Has chance for split-if optimization - return true; -} - //------------------------------inline_pow------------------------------------- // Inline power instructions, if possible. bool LibraryCallKit::inline_pow() { @@ -2051,8 +2036,9 @@ case vmIntrinsics::_dsqrt: return Matcher::match_rule_supported(Op_SqrtD) ? inline_math(id) : false; case vmIntrinsics::_dabs: return Matcher::has_match_rule(Op_AbsD) ? inline_math(id) : false; - case vmIntrinsics::_dexp: return Matcher::has_match_rule(Op_ExpD) ? inline_exp() : - runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dexp), "EXP"); + case vmIntrinsics::_dexp: // call the dexp stub only where a platform generated one + return StubRoutines::dexp() != NULL ? runtime_math(OptoRuntime::Math_D_D_Type(), StubRoutines::dexp(), "dexp") : + runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dexp), "EXP"); case vmIntrinsics::_dpow: return Matcher::has_match_rule(Op_PowD) ? 
inline_pow() : runtime_math(OptoRuntime::Math_DD_D_Type(), FN_PTR(SharedRuntime::dpow), "POW"); #undef FN_PTR --- old/src/share/vm/opto/subnode.cpp 2015-07-22 17:50:58.757625200 -0700 +++ new/src/share/vm/opto/subnode.cpp 2015-07-22 17:50:57.930625200 -0700 @@ -1487,18 +1487,6 @@ //============================================================================= //------------------------------Value------------------------------------------ -// Compute exp -const Type *ExpDNode::Value( PhaseTransform *phase ) const { - const Type *t1 = phase->type( in(1) ); - if( t1 == Type::TOP ) return Type::TOP; - if( t1->base() != Type::DoubleCon ) return Type::DOUBLE; - double d = t1->getd(); - return TypeD::make( StubRoutines::intrinsic_exp( d ) ); -} - - -//============================================================================= -//------------------------------Value------------------------------------------ // Compute pow const Type *PowDNode::Value( PhaseTransform *phase ) const { const Type *t1 = phase->type( in(1) ); --- old/src/share/vm/opto/subnode.hpp 2015-07-22 17:51:07.652625200 -0700 +++ new/src/share/vm/opto/subnode.hpp 2015-07-22 17:51:06.806625200 -0700 @@ -470,20 +470,6 @@ virtual const Type *Value( PhaseTransform *phase ) const; }; -//------------------------------ExpDNode--------------------------------------- -// Exponentiate a double -class ExpDNode : public Node { -public: - ExpDNode(Compile* C, Node *c, Node *in1) : Node(c, in1) { - init_flags(Flag_is_expensive); - C->add_expensive_node(this); - } - virtual int Opcode() const; - const Type *bottom_type() const { return Type::DOUBLE; } - virtual uint ideal_reg() const { return Op_RegD; } - virtual const Type *Value( PhaseTransform *phase ) const; -}; - //------------------------------LogDNode--------------------------------------- // Log_e of a double class LogDNode : public Node { --- old/src/share/vm/runtime/stubRoutines.cpp 2015-07-22 17:51:16.846625200 -0700 +++ 
new/src/share/vm/runtime/stubRoutines.cpp 2015-07-22 17:51:16.008625200 -0700 @@ -146,9 +146,10 @@ address StubRoutines::_montgomeryMultiply = NULL; address StubRoutines::_montgomerySquare = NULL; +address StubRoutines::_dexp = NULL; + double (* StubRoutines::_intrinsic_log )(double) = NULL; double (* StubRoutines::_intrinsic_log10 )(double) = NULL; -double (* StubRoutines::_intrinsic_exp )(double) = NULL; double (* StubRoutines::_intrinsic_pow )(double, double) = NULL; double (* StubRoutines::_intrinsic_sin )(double) = NULL; double (* StubRoutines::_intrinsic_cos )(double) = NULL; --- old/src/share/vm/runtime/stubRoutines.hpp 2015-07-22 17:51:25.546625200 -0700 +++ new/src/share/vm/runtime/stubRoutines.hpp 2015-07-22 17:51:24.700625200 -0700 @@ -205,6 +205,8 @@ static address _montgomeryMultiply; static address _montgomerySquare; + static address _dexp; + // These are versions of the java.lang.Math methods which perform // the same operations as the intrinsic version. They are used for // constant folding in the compiler to ensure equivalence. If the @@ -213,7 +215,6 @@ // SharedRuntime. 
static double (*_intrinsic_log)(double); static double (*_intrinsic_log10)(double); - static double (*_intrinsic_exp)(double); static double (*_intrinsic_pow)(double, double); static double (*_intrinsic_sin)(double); static double (*_intrinsic_cos)(double); @@ -371,6 +372,8 @@ static address montgomeryMultiply() { return _montgomeryMultiply; } static address montgomerySquare() { return _montgomerySquare; } + static address dexp() {return _dexp; } + static address select_fill_function(BasicType t, bool aligned, const char* &name); static address zero_aligned_words() { return _zero_aligned_words; } @@ -383,10 +386,6 @@ assert(_intrinsic_log != NULL, "must be defined"); return _intrinsic_log10(d); } - static double intrinsic_exp(double d) { - assert(_intrinsic_exp != NULL, "must be defined"); - return _intrinsic_exp(d); - } static double intrinsic_pow(double d, double d2) { assert(_intrinsic_pow != NULL, "must be defined"); return _intrinsic_pow(d, d2); --- old/src/share/vm/runtime/vmStructs.cpp 2015-07-22 17:51:34.174625200 -0700 +++ new/src/share/vm/runtime/vmStructs.cpp 2015-07-22 17:51:33.339625200 -0700 @@ -836,6 +836,7 @@ static_field(StubRoutines, _multiplyToLen, address) \ static_field(StubRoutines, _squareToLen, address) \ static_field(StubRoutines, _mulAdd, address) \ + static_field(StubRoutines, _dexp, address) \ \ /*****************/ \ /* SharedRuntime */ \ @@ -1990,7 +1991,6 @@ declare_c2_type(TanDNode, Node) \ declare_c2_type(AtanDNode, Node) \ declare_c2_type(SqrtDNode, Node) \ - declare_c2_type(ExpDNode, Node) \ declare_c2_type(LogDNode, Node) \ declare_c2_type(Log10DNode, Node) \ declare_c2_type(PowDNode, Node) \ --- /dev/null 2015-07-22 17:51:44.000000000 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86_libm.cpp 2015-07-22 17:51:42.163625200 -0700 @@ -0,0 +1,1085 @@ +/* +* Copyright (c) 2007, 2015, Oracle and/or its affiliates. All rights reserved. +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. +* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code). +* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +*/ + +/* +* Intel Math Library (LIBM) Source Code +* Copyright (c) 2015, Intel Corporation. +* +* This program is free software; you can redistribute it and/or modify it +* under the terms and conditions of the GNU General Public License, +* version 2, as published by the Free Software Foundation. +* +* This program is distributed in the hope it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +* more details. +*/ + +/******************************************************************************/ +// ALGORITHM DESCRIPTION +// --------------------- +// +// Description: +// Let K = 64 (table size). +// (^ denotes exponentiation) +// e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) +// where +// x = m*log(2)/K + y, y in [-log(2)/K..log(2)/K] +// m = n*K + j, m,n,j - signed integer, j in [-K/2..K/2] +// values of 2^(j/K) are tabulated as +// T[j] = T_hi[j] * (1 + T_lo[j]).
+//
+// P(y) is a minimax polynomial approximation of exp(y)-1
+// on the small interval [-log(2)/K..log(2)/K] (coefficients were calculated with Maple V).
+//
+// To avoid problems with arithmetic overflow and underflow,
+// the value of 2^n is safely computed as 2^n1 * 2^n2, where n1 in [-BIAS/2..BIAS/2]
+// and BIAS is the value of the exponent bias.
+//
+// Special cases:
+// exp(NaN) = NaN
+// exp(+INF) = +INF
+// exp(-INF) = 0
+// exp(x) = 1 for subnormals
+// for finite argument, only exp(0)=1 is exact
+// For IEEE double
+// if x > 709.782712893383973096 then exp(x) overflow
+// if x < -745.133219101941108420 then exp(x) underflow
+//
+/******************************************************************************/
+
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+
+
+#ifdef _LP64
+// 64-bit variant.
+//
+// Emits code that computes exp(x) for the double held in xmm0 and leaves the
+// result in xmm0. The emitted code reserves 24 bytes of stack on entry and
+// releases them at the common exit (B1_5).
+//
+// registers,
+// input: xmm0
+// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+// rax, rdx, rcx, tmp - r11
+//
+// All constants are emitted inline into the code stream; the leading
+// jmp(start) skips over them.
+
+void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                              XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+                              Register eax, Register ecx, Register edx, Register tmp) {
+  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
+  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
+  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
+  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;
+
+  assert_different_registers(tmp, eax, ecx, edx);
+  jmp(start);
+  // cv: six 16-byte entries (each double is emitted low dword first).
+  //   cv+0  : 0x40571547652B82FE (= 64/ln(2)), duplicated in both lanes
+  //   cv+16 : 0x3F862E42FEFA0000 (high part of ln(2)/64), duplicated
+  //   cv+32 : low part of ln(2)/64, duplicated
+  //   cv+48 : 0x3FDFFFFFFFFFFFFE (just below 0.5), duplicated
+  //   cv+64, cv+80 : remaining polynomial coefficients
+  address cv = pc();
+  emit_int32(1697350398u);
+  emit_int32(1079448903u);
+  emit_int32(1697350398u);
+  emit_int32(1079448903u);
+  emit_int32(4277796864u);
+  emit_int32(1065758274u);
+  emit_int32(4277796864u);
+  emit_int32(1065758274u);
+  emit_int32(3164486458u);
+  emit_int32(1025308570u);
+  emit_int32(3164486458u);
+  emit_int32(1025308570u);
+  emit_int32(4294967294u);
+  emit_int32(1071644671u);
+  emit_int32(4294967294u);
+  emit_int32(1071644671u);
+  emit_int32(3811088480u);
+  emit_int32(1062650204u);
+  emit_int32(1432067621u);
+  emit_int32(1067799893u);
+  emit_int32(3230715663u);
+  emit_int32(1065423125u);
+  emit_int32(1431604129u);
+  emit_int32(1069897045u);
+  // Shifter: 0x4338000000000000 in both lanes (added and subtracted to round to an integer).
+  address Shifter = pc();
+  emit_int32(0);
+  emit_int32(1127743488u);
+  emit_int32(0);
+  emit_int32(1127743488u);
+  // mmask: 0xFFFFFFC0 in the low dword of each lane (clears the 6-bit table index).
+  address mmask = pc();
+  emit_int32(4294967232u);
+  emit_int32(0);
+  emit_int32(4294967232u);
+  emit_int32(0);
+  // bias: 0xFFC0 (= 1023 * 64) in the low dword of each lane.
+  address bias = pc();
+  emit_int32(65472u);
+  emit_int32(0);
+  emit_int32(65472u);
+  emit_int32(0);
+  // Tbl_addr: 64 entries of 16 bytes each, indexed by (m & 63) << 4.
+  address Tbl_addr = pc();
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(235107661u);
+  emit_int32(1018002367u);
+  emit_int32(1048019040u);
+  emit_int32(11418u);
+  emit_int32(896005651u);
+  emit_int32(1015861842u);
+  emit_int32(3541402996u);
+  emit_int32(22960u);
+  emit_int32(1642514529u);
+  emit_int32(1012987726u);
+  emit_int32(410360776u);
+  emit_int32(34629u);
+  emit_int32(1568897900u);
+  emit_int32(1016568486u);
+  emit_int32(1828292879u);
+  emit_int32(46424u);
+  emit_int32(1882168529u);
+  emit_int32(1010744893u);
+  emit_int32(852742562u);
+  emit_int32(58348u);
+  emit_int32(509852888u);
+  emit_int32(1017336174u);
+  emit_int32(3490863952u);
+  emit_int32(70401u);
+  emit_int32(653277307u);
+  emit_int32(1017431380u);
+  emit_int32(2930322911u);
+  emit_int32(82586u);
+  emit_int32(1649557430u);
+  emit_int32(1017729363u);
+  emit_int32(1014845818u);
+  emit_int32(94904u);
+  emit_int32(1058231231u);
+  emit_int32(1015777676u);
+  emit_int32(3949972341u);
+  emit_int32(107355u);
+  emit_int32(1044000607u);
+  emit_int32(1016786167u);
+  emit_int32(828946858u);
+  emit_int32(119943u);
+  emit_int32(1151779725u);
+  emit_int32(1015705409u);
+  emit_int32(2288159958u);
+  emit_int32(132667u);
+  emit_int32(3819481236u);
+  emit_int32(1016499965u);
+  emit_int32(1853186616u);
+  emit_int32(145530u);
+  emit_int32(2552227826u);
+  emit_int32(1015039787u);
+  emit_int32(1709341917u);
+  emit_int32(158533u);
+  emit_int32(1829350193u);
+  emit_int32(1015216097u);
+  emit_int32(4112506593u);
+  emit_int32(171677u);
+  emit_int32(1913391795u);
+  emit_int32(1015756674u);
+  emit_int32(2799960843u);
+  emit_int32(184965u);
+  emit_int32(1303423926u);
+  emit_int32(1015238005u);
+  emit_int32(171030293u);
+  emit_int32(198398u);
+  emit_int32(1574172746u);
+  emit_int32(1016061241u);
+  emit_int32(2992903935u);
+  emit_int32(211976u);
+  emit_int32(3424156969u);
+  emit_int32(1017196428u);
+  emit_int32(926591434u);
+  emit_int32(225703u);
+  emit_int32(1938513547u);
+  emit_int32(1017631273u);
+  emit_int32(887463926u);
+  emit_int32(239579u);
+  emit_int32(2804567149u);
+  emit_int32(1015390024u);
+  emit_int32(1276261410u);
+  emit_int32(253606u);
+  emit_int32(631083525u);
+  emit_int32(1017690182u);
+  emit_int32(569847337u);
+  emit_int32(267786u);
+  emit_int32(1623370770u);
+  emit_int32(1011049453u);
+  emit_int32(1617004845u);
+  emit_int32(282120u);
+  emit_int32(3667985273u);
+  emit_int32(1013894369u);
+  emit_int32(3049340112u);
+  emit_int32(296610u);
+  emit_int32(3145379760u);
+  emit_int32(1014403278u);
+  emit_int32(3577096743u);
+  emit_int32(311258u);
+  emit_int32(2603100681u);
+  emit_int32(1017152460u);
+  emit_int32(1990012070u);
+  emit_int32(326066u);
+  emit_int32(3249202951u);
+  emit_int32(1017448880u);
+  emit_int32(1453150081u);
+  emit_int32(341035u);
+  emit_int32(419288974u);
+  emit_int32(1016280325u);
+  emit_int32(917841882u);
+  emit_int32(356167u);
+  emit_int32(3793507337u);
+  emit_int32(1016095713u);
+  emit_int32(3712504873u);
+  emit_int32(371463u);
+  emit_int32(728023093u);
+  emit_int32(1016345318u);
+  emit_int32(363667784u);
+  emit_int32(386927u);
+  emit_int32(2582678538u);
+  emit_int32(1017123460u);
+  emit_int32(2956612996u);
+  emit_int32(402558u);
+  emit_int32(7592966u);
+  emit_int32(1016721543u);
+  emit_int32(2186617380u);
+  emit_int32(418360u);
+  emit_int32(228611441u);
+  emit_int32(1016696141u);
+  emit_int32(1719614412u);
+  emit_int32(434334u);
+  emit_int32(2261665670u);
+  emit_int32(1017457593u);
+  emit_int32(1013258798u);
+  emit_int32(450482u);
+  emit_int32(544148907u);
+  emit_int32(1017323666u);
+  emit_int32(3907805043u);
+  emit_int32(466805u);
+  emit_int32(2383914918u);
+  emit_int32(1017143586u);
+  emit_int32(1447192520u);
+  emit_int32(483307u);
+  emit_int32(1176412038u);
+  emit_int32(1017267372u);
+  emit_int32(1944781190u);
+  emit_int32(499988u);
+  emit_int32(2882956373u);
+  emit_int32(1013312481u);
+  emit_int32(919555682u);
+  emit_int32(516851u);
+  emit_int32(3154077648u);
+  emit_int32(1016528543u);
+  emit_int32(2571947538u);
+  emit_int32(533897u);
+  emit_int32(348651999u);
+  emit_int32(1016405780u);
+  emit_int32(2604962540u);
+  emit_int32(551129u);
+  emit_int32(3253791412u);
+  emit_int32(1015920431u);
+  emit_int32(1110089947u);
+  emit_int32(568549u);
+  emit_int32(1509121860u);
+  emit_int32(1014756995u);
+  emit_int32(2568320822u);
+  emit_int32(586158u);
+  emit_int32(2617649212u);
+  emit_int32(1017340090u);
+  emit_int32(2966275556u);
+  emit_int32(603959u);
+  emit_int32(553214634u);
+  emit_int32(1016457425u);
+  emit_int32(2682146383u);
+  emit_int32(621954u);
+  emit_int32(730975783u);
+  emit_int32(1014083580u);
+  emit_int32(2191782032u);
+  emit_int32(640145u);
+  emit_int32(1486499517u);
+  emit_int32(1016818996u);
+  emit_int32(2069751140u);
+  emit_int32(658534u);
+  emit_int32(2595788928u);
+  emit_int32(1016407932u);
+  emit_int32(2990417244u);
+  emit_int32(677123u);
+  emit_int32(1853053619u);
+  emit_int32(1015310724u);
+  emit_int32(1434058175u);
+  emit_int32(695915u);
+  emit_int32(2462790535u);
+  emit_int32(1015814775u);
+  emit_int32(2572866477u);
+  emit_int32(714911u);
+  emit_int32(3693944214u);
+  emit_int32(1017259110u);
+  emit_int32(3092190714u);
+  emit_int32(734114u);
+  emit_int32(2979333550u);
+  emit_int32(1017188654u);
+  emit_int32(4076559942u);
+  emit_int32(753526u);
+  emit_int32(174054861u);
+  emit_int32(1014300631u);
+  emit_int32(2420883922u);
+  emit_int32(773150u);
+  emit_int32(816778419u);
+  emit_int32(1014197934u);
+  emit_int32(3716502172u);
+  emit_int32(792987u);
+  emit_int32(3507050924u);
+  emit_int32(1015341199u);
+  emit_int32(777507147u);
+  emit_int32(813041u);
+  emit_int32(1821514088u);
+  emit_int32(1013410604u);
+  emit_int32(3706687593u);
+  emit_int32(833312u);
+  emit_int32(920623539u);
+  emit_int32(1016295433u);
+  emit_int32(1242007931u);
+  emit_int32(853805u);
+  emit_int32(2789017511u);
+  emit_int32(1014276997u);
+  emit_int32(3707479175u);
+  emit_int32(874520u);
+  emit_int32(3586233004u);
+  emit_int32(1015962192u);
+  emit_int32(64696965u);
+  emit_int32(895462u);
+  emit_int32(474650514u);
+  emit_int32(1016642419u);
+  emit_int32(863738718u);
+  emit_int32(916631u);
+  emit_int32(1614448851u);
+  emit_int32(1014281732u);
+  emit_int32(3884662774u);
+  emit_int32(938030u);
+  emit_int32(2450082086u);
+  emit_int32(1016164135u);
+  emit_int32(2728693977u);
+  emit_int32(959663u);
+  emit_int32(1101668360u);
+  emit_int32(1015989180u);
+  emit_int32(3999357479u);
+  emit_int32(981531u);
+  emit_int32(835814894u);
+  emit_int32(1015702697u);
+  emit_int32(1533953344u);
+  emit_int32(1003638u);
+  emit_int32(1301400989u);
+  emit_int32(1014466875u);
+  emit_int32(2174652632u);
+  emit_int32(1025985u);
+  // ALLONES is read with a 16-byte movdqu, so emit a full 16 bytes
+  // (previously only 12 were emitted and the load ran into ebias).
+  address ALLONES = pc();
+  emit_int32(4294967295u);
+  emit_int32(4294967295u);
+  emit_int32(4294967295u);
+  emit_int32(4294967295u);
+  // ebias: 0x3FF0000000000000 (1.0) in both lanes.
+  address ebias = pc();
+  emit_int32(0);
+  emit_int32(1072693248u);
+  emit_int32(0);
+  emit_int32(1072693248u);
+  // XMAX: 0x7FEFFFFFFFFFFFFF (largest finite double); squared to force overflow.
+  address XMAX = pc();
+  emit_int32(4294967295u);
+  emit_int32(2146435071u);
+  // XMIN: 0x0010000000000000 (smallest normal double); squared to force underflow.
+  address XMIN = pc();
+  emit_int32(0);
+  emit_int32(1048576u);
+  // INF: 0x7FF0000000000000.
+  address INF = pc();
+  emit_int32(0);
+  emit_int32(2146435072u);
+  address ZERO = pc();
+  emit_int32(0);
+  emit_int32(0);
+  // ONE_val: 0x3FF0000000000000 (1.0).
+  address ONE_val = pc();
+  emit_int32(0);
+  emit_int32(1072693248u);
+  bind(start);
+  subq(rsp, 24);
+  // Keep a copy of the argument at [rsp+8]; the special-case paths reload it.
+  movsd(Address(rsp, 8), xmm0);
+  unpcklpd(xmm0, xmm0);
+  movdqu(xmm1, InternalAddress(cv));
+  movdqu(xmm6, InternalAddress(Shifter));
+  movdqu(xmm2, InternalAddress(16+cv));
+  movdqu(xmm3, InternalAddress(32+cv));
+  // Range check on the top 15 magnitude bits of x: anything outside
+  // [15504, 16527] sets the sign bit of edx and goes to the special-case path.
+  pextrw(eax, xmm0, 3);
+  andl(eax, 32767);
+  movl(edx, 16527);
+  subl(edx, eax);
+  subl(eax, 15504);
+  orl(edx, eax);
+  cmpl(edx, INT_MIN);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
+  // Argument reduction: m = round(x * 64/ln(2)) via the shifter, y = x - m*ln(2)/64.
+  mulpd(xmm1, xmm0);
+  addpd(xmm1, xmm6);
+  movapd(xmm7, xmm1);
+  subpd(xmm1, xmm6);
+  mulpd(xmm2, xmm1);
+  movdqu(xmm4, InternalAddress(64+cv));
+  mulpd(xmm3, xmm1);
+  movdqu(xmm5, InternalAddress(80+cv));
+  subpd(xmm0, xmm2);
+  movdl(eax, xmm7);
+  // ecx = byte offset of the table entry (low 6 bits of m, times 16); eax = edx = m >> 6.
+  movl(ecx, eax);
+  andl(ecx, 63);
+  shll(ecx, 4);
+  sarl(eax, 6);
+  movl(edx, eax);
+  movdqu(xmm6, InternalAddress(mmask));
+  pand(xmm7, xmm6);
+  movdqu(xmm6, InternalAddress(bias));
+  paddq(xmm7, xmm6);
+  psllq(xmm7, 46);
+  subpd(xmm0, xmm3);
+  lea(tmp, InternalAddress(Tbl_addr));
+  movdqu(xmm2, Address(ecx,tmp));
+  mulpd(xmm4, xmm0);
+  movapd(xmm6, xmm0);
+  movapd(xmm1, xmm0);
+  mulpd(xmm6, xmm6);
+  mulpd(xmm0, xmm6);
+  addpd(xmm5, xmm4);
+  mulsd(xmm0, xmm6);
+  // The legacy SSE encoding of mulpd with a memory operand requires a 16-byte
+  // aligned address, and these constants sit at an arbitrary pc. Load the
+  // constant with movdqu into xmm3 (last read by the subpd above and rewritten
+  // before any later read) and multiply register-to-register instead.
+  movdqu(xmm3, InternalAddress(48+cv));
+  mulpd(xmm6, xmm3);
+  addsd(xmm1, xmm2);
+  unpckhpd(xmm2, xmm2);
+  mulpd(xmm0, xmm5);
+  addsd(xmm1, xmm0);
+  por(xmm2, xmm7);
+  unpckhpd(xmm0, xmm0);
+  addsd(xmm0, xmm1);
+  addsd(xmm0, xmm6);
+  // Unsigned check on (m >> 6) + 894: values above 1916 need the careful scaling path.
+  addl(edx, 894);
+  cmpl(edx, 1916);
+  jcc(Assembler::above, L_2TAG_PACKET_1_0_2);
+  mulsd(xmm0, xmm2);
+  addsd(xmm0, xmm2);
+  jmp(B1_5);
+  // Scaling path for results near the overflow/underflow thresholds.
+  bind(L_2TAG_PACKET_1_0_2);
+  xorpd(xmm3, xmm3);
+  movdqu(xmm4, InternalAddress(ALLONES));
+  movl(edx, -1022);
+  subl(edx, eax);
+  movdl(xmm5, edx);
+  psllq(xmm4, xmm5);
+  movl(ecx, eax);
+  sarl(eax, 1);
+  pinsrw(xmm3, eax, 3);
+  movdqu(xmm6, InternalAddress(ebias));
+  psllq(xmm3, 4);
+  psubd(xmm2, xmm3);
+  mulsd(xmm0, xmm2);
+  cmpl(edx, 52);
+  jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
+  pand(xmm4, xmm2);
+  paddd(xmm3, xmm6);
+  subsd(xmm2, xmm4);
+  addsd(xmm0, xmm2);
+  cmpl(ecx, 1023);
+  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32768);
+  orl(edx, ecx);
+  cmpl(edx, 0);
+  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
+  movapd(xmm6, xmm0);
+  addsd(xmm0, xmm4);
+  mulsd(xmm0, xmm3);
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32752);
+  cmpl(ecx, 0);
+  jcc(Assembler::equal, L_2TAG_PACKET_5_0_2);
+  jmp(B1_5);
+  // Result has a zero exponent field: rebuild it with integer arithmetic.
+  bind(L_2TAG_PACKET_5_0_2);
+  mulsd(xmm6, xmm3);
+  mulsd(xmm4, xmm3);
+  movdqu(xmm0, xmm6);
+  pxor(xmm6, xmm4);
+  psrad(xmm6, 31);
+  pshufd(xmm6, xmm6, 85);
+  psllq(xmm0, 1);
+  psrlq(xmm0, 1);
+  pxor(xmm0, xmm6);
+  psrlq(xmm6, 63);
+  paddq(xmm0, xmm6);
+  paddq(xmm0, xmm4);
+  movl(Address(rsp,0), 15);
+  jmp(L_2TAG_PACKET_6_0_2);
+  bind(L_2TAG_PACKET_4_0_2);
+  addsd(xmm0, xmm4);
+  mulsd(xmm0, xmm3);
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_3_0_2);
+  addsd(xmm0, xmm4);
+  mulsd(xmm0, xmm3);
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32752);
+  cmpl(ecx, 32752);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_2_0_2);
+  paddd(xmm3, xmm6);
+  addpd(xmm0, xmm2);
+  mulsd(xmm0, xmm3);
+  movl(Address(rsp,0), 15);
+  jmp(L_2TAG_PACKET_6_0_2);
+  // |x| >= 1024: eax holds the high word of x with the sign bit cleared.
+  bind(L_2TAG_PACKET_8_0_2);
+  cmpl(eax, 2146435072);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_9_0_2);
+  // Finite: an unsigned compare against INT_MIN tests the sign bit of x.
+  movl(eax, Address(rsp,12));
+  cmpl(eax, INT_MIN);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_10_0_2);
+  // Large positive x: XMAX * XMAX overflows.
+  movsd(xmm0, InternalAddress(XMAX));
+  mulsd(xmm0, xmm0);
+  bind(L_2TAG_PACKET_7_0_2);
+  movl(Address(rsp,0), 14);
+  jmp(L_2TAG_PACKET_6_0_2);
+  // Large negative x: XMIN * XMIN underflows.
+  bind(L_2TAG_PACKET_10_0_2);
+  movsd(xmm0, InternalAddress(XMIN));
+  mulsd(xmm0, xmm0);
+  movl(Address(rsp,0), 15);
+  jmp(L_2TAG_PACKET_6_0_2);
+  // Exponent field all ones: Inf or NaN.
+  bind(L_2TAG_PACKET_9_0_2);
+  movl(edx, Address(rsp,8));
+  cmpl(eax, 2146435072);
+  jcc(Assembler::above, L_2TAG_PACKET_11_0_2);
+  cmpl(edx, 0);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
+  movl(eax, Address(rsp,12));
+  cmpl(eax, 2146435072);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_2);
+  // +Inf -> +Inf
+  movsd(xmm0, InternalAddress(INF));
+  jmp(B1_5);
+  // -Inf -> 0
+  bind(L_2TAG_PACKET_12_0_2);
+  movsd(xmm0, InternalAddress(ZERO));
+  jmp(B1_5);
+  // NaN -> x + x
+  bind(L_2TAG_PACKET_11_0_2);
+  movsd(xmm0, Address(rsp, 8));
+  addsd(xmm0, xmm0);
+  jmp(B1_5);
+  // Out-of-range exponent: |x| >= 1024 (high word >= 0x40900000) is handled
+  // above; otherwise the result is computed as 1.0 + x.
+  bind(L_2TAG_PACKET_0_0_2);
+  movl(eax, Address(rsp, 12));
+  andl(eax, 2147483647);
+  cmpl(eax, 1083179008);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_8_0_2);
+  movsd(Address(rsp, 8), xmm0);
+  addsd(xmm0, InternalAddress(ONE_val));
+  jmp(B1_5);
+  bind(L_2TAG_PACKET_6_0_2);
+  movq(Address(rsp, 16), xmm0);
+  bind(B1_3);
+  movq(xmm0, Address(rsp, 16));
+  bind(L_2TAG_PACKET_13_0_2);
+  bind(B1_5);
+  addq(rsp, 24);
+}
+#endif
+
+#ifndef _LP64
+// 32-bit variant.
+//
+// Emits code that computes exp(x) and leaves the result on the x87 stack
+// (every exit path ends in fld_d). The argument is read from Address(rsp, 128)
+// after 120 bytes of frame have been reserved, i.e. 8 bytes above the incoming
+// rsp; the original header describes this as (rbp + 8), which assumes
+// rbp == rsp on entry - TODO confirm against the stub generator.
+// NOTE(review): the emitted code never adds the 120 bytes back to rsp;
+// presumably the caller's epilogue restores it - confirm.
+//
+// registers,
+// input: (rbp + 8)
+// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+// rax, rdx, rcx, rbx (tmp)
+//
+// tmp is saved at [rsp+64] on entry and restored on exit; in between it holds
+// the address of static_const_table.
+
+void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                              XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+                              Register eax, Register ecx, Register edx, Register tmp) {
+  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
+  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
+  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
+  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;
+
+  assert_different_registers(tmp, eax, ecx, edx);
+  jmp(start);
+  // Layout of static_const_table (byte offsets as used below):
+  //      0 : 0xFFF0000000000000 mask (sign + exponent bits), both lanes
+  //     16 : 0xFFFFFFC0 mask in the low dword of each lane
+  //     32 : 0xFFC0 (= 1023 * 64) in the low dword of each lane
+  //     48 : 0x4338000000000000 rounding shifter, both lanes
+  //     64 : 64/ln(2); 80/96 : high/low parts of ln(2)/64;
+  //    112 : 0x3FDFFFFFFFFFFFFE; 128/144 : remaining polynomial coefficients
+  //    160 : 64-entry table, 16 bytes per entry
+  //   1184 : 1.0   1192 : +Inf   1200 : 0.0
+  //   1208 : largest finite double   1216 : smallest normal double
+  address static_const_table = pc();
+  emit_int32(0);
+  emit_int32(4293918720u);
+  emit_int32(0);
+  emit_int32(4293918720u);
+  emit_int32(4294967232u);
+  emit_int32(0);
+  emit_int32(4294967232u);
+  emit_int32(0);
+  emit_int32(65472u);
+  emit_int32(0);
+  emit_int32(65472u);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(1127743488u);
+  emit_int32(0);
+  emit_int32(1127743488u);
+  emit_int32(1697350398u);
+  emit_int32(1079448903u);
+  emit_int32(1697350398u);
+  emit_int32(1079448903u);
+  emit_int32(4277796864u);
+  emit_int32(1065758274u);
+  emit_int32(4277796864u);
+  emit_int32(1065758274u);
+  emit_int32(3164486458u);
+  emit_int32(1025308570u);
+  emit_int32(3164486458u);
+  emit_int32(1025308570u);
+  emit_int32(4294967294u);
+  emit_int32(1071644671u);
+  emit_int32(4294967294u);
+  emit_int32(1071644671u);
+  emit_int32(3811088480u);
+  emit_int32(1062650204u);
+  emit_int32(1432067621u);
+  emit_int32(1067799893u);
+  emit_int32(3230715663u);
+  emit_int32(1065423125u);
+  emit_int32(1431604129u);
+  emit_int32(1069897045u);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(235107661u);
+  emit_int32(1018002367u);
+  emit_int32(1048019040u);
+  emit_int32(11418u);
+  emit_int32(896005651u);
+  emit_int32(1015861842u);
+  emit_int32(3541402996u);
+  emit_int32(22960u);
+  emit_int32(1642514529u);
+  emit_int32(1012987726u);
+  emit_int32(410360776u);
+  emit_int32(34629u);
+  emit_int32(1568897900u);
+  emit_int32(1016568486u);
+  emit_int32(1828292879u);
+  emit_int32(46424u);
+  emit_int32(1882168529u);
+  emit_int32(1010744893u);
+  emit_int32(852742562u);
+  emit_int32(58348u);
+  emit_int32(509852888u);
+  emit_int32(1017336174u);
+  emit_int32(3490863952u);
+  emit_int32(70401u);
+  emit_int32(653277307u);
+  emit_int32(1017431380u);
+  emit_int32(2930322911u);
+  emit_int32(82586u);
+  emit_int32(1649557430u);
+  emit_int32(1017729363u);
+  emit_int32(1014845818u);
+  emit_int32(94904u);
+  emit_int32(1058231231u);
+  emit_int32(1015777676u);
+  emit_int32(3949972341u);
+  emit_int32(107355u);
+  emit_int32(1044000607u);
+  emit_int32(1016786167u);
+  emit_int32(828946858u);
+  emit_int32(119943u);
+  emit_int32(1151779725u);
+  emit_int32(1015705409u);
+  emit_int32(2288159958u);
+  emit_int32(132667u);
+  emit_int32(3819481236u);
+  emit_int32(1016499965u);
+  emit_int32(1853186616u);
+  emit_int32(145530u);
+  emit_int32(2552227826u);
+  emit_int32(1015039787u);
+  emit_int32(1709341917u);
+  emit_int32(158533u);
+  emit_int32(1829350193u);
+  emit_int32(1015216097u);
+  emit_int32(4112506593u);
+  emit_int32(171677u);
+  emit_int32(1913391795u);
+  emit_int32(1015756674u);
+  emit_int32(2799960843u);
+  emit_int32(184965u);
+  emit_int32(1303423926u);
+  emit_int32(1015238005u);
+  emit_int32(171030293u);
+  emit_int32(198398u);
+  emit_int32(1574172746u);
+  emit_int32(1016061241u);
+  emit_int32(2992903935u);
+  emit_int32(211976u);
+  emit_int32(3424156969u);
+  emit_int32(1017196428u);
+  emit_int32(926591434u);
+  emit_int32(225703u);
+  emit_int32(1938513547u);
+  emit_int32(1017631273u);
+  emit_int32(887463926u);
+  emit_int32(239579u);
+  emit_int32(2804567149u);
+  emit_int32(1015390024u);
+  emit_int32(1276261410u);
+  emit_int32(253606u);
+  emit_int32(631083525u);
+  emit_int32(1017690182u);
+  emit_int32(569847337u);
+  emit_int32(267786u);
+  emit_int32(1623370770u);
+  emit_int32(1011049453u);
+  emit_int32(1617004845u);
+  emit_int32(282120u);
+  emit_int32(3667985273u);
+  emit_int32(1013894369u);
+  emit_int32(3049340112u);
+  emit_int32(296610u);
+  emit_int32(3145379760u);
+  emit_int32(1014403278u);
+  emit_int32(3577096743u);
+  emit_int32(311258u);
+  emit_int32(2603100681u);
+  emit_int32(1017152460u);
+  emit_int32(1990012070u);
+  emit_int32(326066u);
+  emit_int32(3249202951u);
+  emit_int32(1017448880u);
+  emit_int32(1453150081u);
+  emit_int32(341035u);
+  emit_int32(419288974u);
+  emit_int32(1016280325u);
+  emit_int32(917841882u);
+  emit_int32(356167u);
+  emit_int32(3793507337u);
+  emit_int32(1016095713u);
+  emit_int32(3712504873u);
+  emit_int32(371463u);
+  emit_int32(728023093u);
+  emit_int32(1016345318u);
+  emit_int32(363667784u);
+  emit_int32(386927u);
+  emit_int32(2582678538u);
+  emit_int32(1017123460u);
+  emit_int32(2956612996u);
+  emit_int32(402558u);
+  emit_int32(7592966u);
+  emit_int32(1016721543u);
+  emit_int32(2186617380u);
+  emit_int32(418360u);
+  emit_int32(228611441u);
+  emit_int32(1016696141u);
+  emit_int32(1719614412u);
+  emit_int32(434334u);
+  emit_int32(2261665670u);
+  emit_int32(1017457593u);
+  emit_int32(1013258798u);
+  emit_int32(450482u);
+  emit_int32(544148907u);
+  emit_int32(1017323666u);
+  emit_int32(3907805043u);
+  emit_int32(466805u);
+  emit_int32(2383914918u);
+  emit_int32(1017143586u);
+  emit_int32(1447192520u);
+  emit_int32(483307u);
+  emit_int32(1176412038u);
+  emit_int32(1017267372u);
+  emit_int32(1944781190u);
+  emit_int32(499988u);
+  emit_int32(2882956373u);
+  emit_int32(1013312481u);
+  emit_int32(919555682u);
+  emit_int32(516851u);
+  emit_int32(3154077648u);
+  emit_int32(1016528543u);
+  emit_int32(2571947538u);
+  emit_int32(533897u);
+  emit_int32(348651999u);
+  emit_int32(1016405780u);
+  emit_int32(2604962540u);
+  emit_int32(551129u);
+  emit_int32(3253791412u);
+  emit_int32(1015920431u);
+  emit_int32(1110089947u);
+  emit_int32(568549u);
+  emit_int32(1509121860u);
+  emit_int32(1014756995u);
+  emit_int32(2568320822u);
+  emit_int32(586158u);
+  emit_int32(2617649212u);
+  emit_int32(1017340090u);
+  emit_int32(2966275556u);
+  emit_int32(603959u);
+  emit_int32(553214634u);
+  emit_int32(1016457425u);
+  emit_int32(2682146383u);
+  emit_int32(621954u);
+  emit_int32(730975783u);
+  emit_int32(1014083580u);
+  emit_int32(2191782032u);
+  emit_int32(640145u);
+  emit_int32(1486499517u);
+  emit_int32(1016818996u);
+  emit_int32(2069751140u);
+  emit_int32(658534u);
+  emit_int32(2595788928u);
+  emit_int32(1016407932u);
+  emit_int32(2990417244u);
+  emit_int32(677123u);
+  emit_int32(1853053619u);
+  emit_int32(1015310724u);
+  emit_int32(1434058175u);
+  emit_int32(695915u);
+  emit_int32(2462790535u);
+  emit_int32(1015814775u);
+  emit_int32(2572866477u);
+  emit_int32(714911u);
+  emit_int32(3693944214u);
+  emit_int32(1017259110u);
+  emit_int32(3092190714u);
+  emit_int32(734114u);
+  emit_int32(2979333550u);
+  emit_int32(1017188654u);
+  emit_int32(4076559942u);
+  emit_int32(753526u);
+  emit_int32(174054861u);
+  emit_int32(1014300631u);
+  emit_int32(2420883922u);
+  emit_int32(773150u);
+  emit_int32(816778419u);
+  emit_int32(1014197934u);
+  emit_int32(3716502172u);
+  emit_int32(792987u);
+  emit_int32(3507050924u);
+  emit_int32(1015341199u);
+  emit_int32(777507147u);
+  emit_int32(813041u);
+  emit_int32(1821514088u);
+  emit_int32(1013410604u);
+  emit_int32(3706687593u);
+  emit_int32(833312u);
+  emit_int32(920623539u);
+  emit_int32(1016295433u);
+  emit_int32(1242007931u);
+  emit_int32(853805u);
+  emit_int32(2789017511u);
+  emit_int32(1014276997u);
+  emit_int32(3707479175u);
+  emit_int32(874520u);
+  emit_int32(3586233004u);
+  emit_int32(1015962192u);
+  emit_int32(64696965u);
+  emit_int32(895462u);
+  emit_int32(474650514u);
+  emit_int32(1016642419u);
+  emit_int32(863738718u);
+  emit_int32(916631u);
+  emit_int32(1614448851u);
+  emit_int32(1014281732u);
+  emit_int32(3884662774u);
+  emit_int32(938030u);
+  emit_int32(2450082086u);
+  emit_int32(1016164135u);
+  emit_int32(2728693977u);
+  emit_int32(959663u);
+  emit_int32(1101668360u);
+  emit_int32(1015989180u);
+  emit_int32(3999357479u);
+  emit_int32(981531u);
+  emit_int32(835814894u);
+  emit_int32(1015702697u);
+  emit_int32(1533953344u);
+  emit_int32(1003638u);
+  emit_int32(1301400989u);
+  emit_int32(1014466875u);
+  emit_int32(2174652632u);
+  emit_int32(1025985u);
+  emit_int32(0);
+  emit_int32(1072693248u);
+  emit_int32(0);
+  emit_int32(2146435072u);
+  emit_int32(0);
+  emit_int32(0);
+  emit_int32(4294967295u);
+  emit_int32(2146435071u);
+  emit_int32(0);
+  emit_int32(1048576u);
+  bind(start);
+  subl(rsp, 120);
+  movl(Address(rsp, 64), tmp);
+  lea(tmp, InternalAddress(static_const_table));
+  movdqu(xmm0, Address(rsp, 128));
+  unpcklpd(xmm0, xmm0);
+  movdqu(xmm1, Address(tmp, 64));
+  movdqu(xmm6, Address(tmp, 48));
+  movdqu(xmm2, Address(tmp, 80));
+  movdqu(xmm3, Address(tmp, 96));
+  // Range check on the top 15 magnitude bits of x: anything outside
+  // [15504, 16527] sets the sign bit of edx and goes to the special-case path.
+  pextrw(eax, xmm0, 3);
+  andl(eax, 32767);
+  movl(edx, 16527);
+  subl(edx, eax);
+  subl(eax, 15504);
+  orl(edx, eax);
+  cmpl(edx, INT_MIN);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
+  // Argument reduction: m = round(x * 64/ln(2)) via the shifter, y = x - m*ln(2)/64.
+  mulpd(xmm1, xmm0);
+  addpd(xmm1, xmm6);
+  movapd(xmm7, xmm1);
+  subpd(xmm1, xmm6);
+  mulpd(xmm2, xmm1);
+  movdqu(xmm4, Address(tmp, 128));
+  mulpd(xmm3, xmm1);
+  movdqu(xmm5, Address(tmp, 144));
+  subpd(xmm0, xmm2);
+  movdl(eax, xmm7);
+  // ecx = byte offset of the table entry (low 6 bits of m, times 16); eax = edx = m >> 6.
+  movl(ecx, eax);
+  andl(ecx, 63);
+  shll(ecx, 4);
+  sarl(eax, 6);
+  movl(edx, eax);
+  movdqu(xmm6, Address(tmp, 16));
+  pand(xmm7, xmm6);
+  movdqu(xmm6, Address(tmp, 32));
+  paddq(xmm7, xmm6);
+  psllq(xmm7, 46);
+  subpd(xmm0, xmm3);
+  movdqu(xmm2, Address(tmp, ecx, Address::times_1, 160));
+  mulpd(xmm4, xmm0);
+  movapd(xmm6, xmm0);
+  movapd(xmm1, xmm0);
+  mulpd(xmm6, xmm6);
+  mulpd(xmm0, xmm6);
+  addpd(xmm5, xmm4);
+  mulsd(xmm0, xmm6);
+  // The legacy SSE encoding of mulpd with a memory operand requires a 16-byte
+  // aligned address, and this table sits at an arbitrary pc. Load the constant
+  // with movdqu into xmm3 (last read by the subpd above and rewritten before
+  // any later read) and multiply register-to-register instead.
+  movdqu(xmm3, Address(tmp, 112));
+  mulpd(xmm6, xmm3);
+  addsd(xmm1, xmm2);
+  unpckhpd(xmm2, xmm2);
+  mulpd(xmm0, xmm5);
+  addsd(xmm1, xmm0);
+  por(xmm2, xmm7);
+  unpckhpd(xmm0, xmm0);
+  addsd(xmm0, xmm1);
+  addsd(xmm0, xmm6);
+  // Unsigned check on (m >> 6) + 894: values above 1916 need the careful scaling path.
+  addl(edx, 894);
+  cmpl(edx, 1916);
+  jcc(Assembler::above, L_2TAG_PACKET_1_0_2);
+  mulsd(xmm0, xmm2);
+  addsd(xmm0, xmm2);
+  jmp(L_2TAG_PACKET_2_0_2);
+  // Scaling path: the final multiply/add is done on the x87 unit with the
+  // control word temporarily modified (bits 0x300 set), then restored.
+  bind(L_2TAG_PACKET_1_0_2);
+  fnstcw(Address(rsp, 24));
+  movzwl(edx, Address(rsp, 24));
+  orl(edx, 768);
+  movw(Address(rsp, 28), edx);
+  fldcw(Address(rsp, 28));
+  movl(edx, eax);
+  sarl(eax, 1);
+  subl(edx, eax);
+  movdqu(xmm6, Address(tmp, 0));
+  pandn(xmm6, xmm2);
+  addl(eax, 1023);
+  movdl(xmm3, eax);
+  psllq(xmm3, 52);
+  por(xmm6, xmm3);
+  addl(edx, 1023);
+  movdl(xmm4, edx);
+  psllq(xmm4, 52);
+  movsd(Address(rsp, 8), xmm0);
+  fld_d(Address(rsp, 8));
+  movsd(Address(rsp, 16), xmm6);
+  fld_d(Address(rsp, 16));
+  fmula(1);
+  faddp(1);
+  movsd(Address(rsp, 8), xmm4);
+  fld_d(Address(rsp, 8));
+  fmulp(1);
+  fstp_d(Address(rsp, 8));
+  movsd(xmm0,Address(rsp, 8));
+  fldcw(Address(rsp, 24));
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32752);
+  cmpl(ecx, 32752);
+  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
+  cmpl(ecx, 0);
+  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
+  jmp(L_2TAG_PACKET_2_0_2);
+  // NOTE(review): the nine instructions below follow an unconditional jmp and
+  // no label is bound before them, so they are unreachable as written. They
+  // are kept unchanged; confirm against the original LIBM source whether a
+  // label or branch is missing.
+  cmpl(ecx, INT_MIN);
+  jcc(Assembler::less, L_2TAG_PACKET_3_0_2);
+  cmpl(ecx, -1064950997);
+  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
+  jcc(Assembler::greater, L_2TAG_PACKET_4_0_2);
+  movl(edx, Address(rsp, 128));
+  cmpl(edx ,-17155601);
+  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
+  jmp(L_2TAG_PACKET_4_0_2);
+  bind(L_2TAG_PACKET_3_0_2);
+  movl(edx, 14);
+  jmp(L_2TAG_PACKET_5_0_2);
+  bind(L_2TAG_PACKET_4_0_2);
+  movl(edx, 15);
+  bind(L_2TAG_PACKET_5_0_2);
+  movsd(Address(rsp, 0), xmm0);
+  movsd(xmm0, Address(rsp, 128));
+  fld_d(Address(rsp, 0));
+  jmp(L_2TAG_PACKET_6_0_2);
+  // |x| >= 1024: eax holds the high word of x with the sign bit cleared.
+  bind(L_2TAG_PACKET_7_0_2);
+  cmpl(eax, 2146435072);
+  jcc(Assembler::greaterEqual, L_2TAG_PACKET_8_0_2);
+  movl(eax, Address(rsp, 132));
+  cmpl(eax, INT_MIN);
+  // Must be an unsigned compare: it tests the sign bit of x. The previous
+  // signed greaterEqual was always true, so every finite |x| >= 1024 took the
+  // underflow path and large positive arguments produced 0 instead of +Inf.
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_9_0_2);
+  // Large positive x: square the largest finite double to overflow.
+  movsd(xmm0, Address(tmp, 1208));
+  mulsd(xmm0, xmm0);
+  movl(edx, 14);
+  jmp(L_2TAG_PACKET_5_0_2);
+  // Large negative x: square the smallest normal double to underflow.
+  bind(L_2TAG_PACKET_9_0_2);
+  movsd(xmm0, Address(tmp, 1216));
+  mulsd(xmm0, xmm0);
+  movl(edx, 15);
+  jmp(L_2TAG_PACKET_5_0_2);
+  // Exponent field all ones: Inf or NaN.
+  bind(L_2TAG_PACKET_8_0_2);
+  movl(edx, Address(rsp, 128));
+  cmpl(eax, 2146435072);
+  jcc(Assembler::above, L_2TAG_PACKET_10_0_2);
+  cmpl(edx, 0);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_10_0_2);
+  movl(eax, Address(rsp, 132));
+  cmpl(eax, 2146435072);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
+  // +Inf -> +Inf
+  movsd(xmm0, Address(tmp, 1192));
+  jmp(L_2TAG_PACKET_2_0_2);
+  // -Inf -> 0
+  bind(L_2TAG_PACKET_11_0_2);
+  movsd(xmm0, Address(tmp, 1200));
+  jmp(L_2TAG_PACKET_2_0_2);
+  // NaN -> x + x
+  bind(L_2TAG_PACKET_10_0_2);
+  movsd(xmm0, Address(rsp, 128));
+  addsd(xmm0, xmm0);
+  jmp(L_2TAG_PACKET_2_0_2);
+  // Out-of-range exponent: |x| >= 1024 (high word >= 0x40900000) is handled
+  // above; otherwise the result is computed as x + 1.0.
+  bind(L_2TAG_PACKET_0_0_2);
+  movl(eax, Address(rsp, 132));
+  andl(eax, 2147483647);
+  cmpl(eax, 1083179008);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
+  movsd(xmm0, Address(rsp, 128));
+  addsd(xmm0, Address(tmp, 1184));
+  jmp(L_2TAG_PACKET_2_0_2);
+  // Common exit: move the SSE result onto the x87 stack.
+  bind(L_2TAG_PACKET_2_0_2);
+  movsd(Address(rsp, 48), xmm0);
+  fld_d(Address(rsp, 48));
+  bind(L_2TAG_PACKET_6_0_2);
+  movl(tmp, Address(rsp, 64));
+}
+
+#endif