--- old/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp 2012-04-16 11:10:57.999111680 +0200 +++ new/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp 2012-04-16 11:10:57.809921622 +0200 @@ -738,7 +738,8 @@ case vmIntrinsics::_dlog: // fall through case vmIntrinsics::_dsin: // fall through case vmIntrinsics::_dtan: // fall through - case vmIntrinsics::_dcos: { + case vmIntrinsics::_dcos: // fall through + case vmIntrinsics::_dexp: { assert(x->number_of_arguments() == 1, "wrong type"); address runtime_entry = NULL; @@ -758,12 +759,23 @@ case vmIntrinsics::_dlog10: runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10); break; + case vmIntrinsics::_dexp: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dexp); + break; default: ShouldNotReachHere(); } LIR_Opr result = call_runtime(x->argument_at(0), runtime_entry, x->type(), NULL); set_result(x, result); + break; + } + case vmIntrinsics::_dpow: { + assert(x->number_of_arguments() == 2, "wrong type"); + address runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dpow); + LIR_Opr result = call_runtime(x->argument_at(0), x->argument_at(1), runtime_entry, x->type(), NULL); + set_result(x, result); + break; } } } --- old/src/cpu/sparc/vm/interpreter_sparc.cpp 2012-04-16 11:10:59.383338626 +0200 +++ new/src/cpu/sparc/vm/interpreter_sparc.cpp 2012-04-16 11:10:59.195588482 +0200 @@ -403,6 +403,8 @@ case Interpreter::java_lang_math_abs : break; case Interpreter::java_lang_math_log : break; case Interpreter::java_lang_math_log10 : break; + case Interpreter::java_lang_math_pow : break; + case Interpreter::java_lang_math_exp : break; case Interpreter::java_lang_ref_reference_get : entry_point = ((InterpreterGenerator*)this)->generate_Reference_get_entry(); break; default : ShouldNotReachHere(); break; --- old/src/cpu/x86/vm/assembler_x86.cpp 2012-04-16 11:11:00.837818722 +0200 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2012-04-16 11:11:00.576870918 +0200 @@ -3576,6 +3576,21 @@ emit_byte(0xF1); } +void Assembler::frndint() { + emit_byte(0xD9); + emit_byte(0xFC); +} + +void Assembler::f2xm1() { + emit_byte(0xD9); + emit_byte(0xF0); +} + +void Assembler::fldl2e() { + emit_byte(0xD9); + emit_byte(0xEA); +} + // SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding. static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 }; // SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding. @@ -6866,6 +6881,242 @@ Assembler::fldcw(as_Address(src)); } +void MacroAssembler::pow_exp_core_encoding() { + // kills rax, rcx, rdx + subptr(rsp,sizeof(jdouble)); + // computes 2^X. Stack: X ... + // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and + // keep it on the thread's stack to compute 2^int(X) later + // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1) + // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X)) + fld_s(0); // Stack: X X ... + frndint(); // Stack: int(X) X ... + fsuba(1); // Stack: int(X) X-int(X) ... + fistp_s(Address(rsp,0)); // move int(X) as integer to thread's stack. Stack: X-int(X) ... + f2xm1(); // Stack: 2^(X-int(X))-1 ... + fld1(); // Stack: 1 2^(X-int(X))-1 ... + faddp(1); // Stack: 2^(X-int(X)) + // computes 2^(int(X)): add exponent bias (1023) to int(X), then + // shift int(X)+1023 to exponent position. + // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11 + // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent + // values so detect them and set result to NaN. + movl(rax,Address(rsp,0)); + movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding + addl(rax, 1023); + movl(rdx,rax); + shll(rax,20); + // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN. + addl(rdx,1); + // Check that 1 < int(X)+1023+1 < 2048 + // in 3 steps: + // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048 + // 2- (int(X)+1023+1)&-2048 != 0 + // 3- (int(X)+1023+1)&-2048 != 1 + // Do 2- first because addl just updated the flags. + cmov32(Assembler::equal,rax,rcx); + cmpl(rdx,1); + cmov32(Assembler::equal,rax,rcx); + testl(rdx,rcx); + cmov32(Assembler::notEqual,rax,rcx); + movl(Address(rsp,4),rax); + movl(Address(rsp,0),0); + fmul_d(Address(rsp,0)); // Stack: 2^X ... + addptr(rsp,sizeof(jdouble)); +} + +void MacroAssembler::fast_pow() { + // computes X^Y = 2^(Y * log2(X)) + // if fast computation is not possible, result is NaN. Requires + // fallback from user of this macro. + fyl2x(); // Stack: (Y*log2(X)) ... + pow_exp_core_encoding(); // Stack: exp(X) ... +} + +void MacroAssembler::fast_exp() { + // computes exp(X) = 2^(X * log2(e)) + // if fast computation is not possible, result is NaN. Requires + // fallback from user of this macro. + fldl2e(); // Stack: log2(e) X ... + fmulp(1); // Stack: (X*log2(e)) ... + pow_exp_core_encoding(); // Stack: exp(X) ... +} + +void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) { + // kills rax, rcx, rdx + // pow and exp needs 2 extra registers on the fpu stack. + Label slow_case, done; + Register tmp = noreg; + if (!VM_Version::supports_cmov()) { + // fcmp needs a temporary so preserve rdx, + tmp = rdx; + } + Register tmp2 = rax; + NOT_LP64(Register tmp3 = rcx;) + + if (is_exp) { + // Stack: X + fld_s(0); // duplicate argument for runtime call. Stack: X X + fast_exp(); // Stack: exp(X) X + fcmp(tmp, 0, false, false); // Stack: exp(X) X + // exp(X) not equal to itself: exp(X) is NaN go to slow case. + jcc(Assembler::parity, slow_case); + // get rid of duplicate argument. Stack: exp(X) + if (num_fpu_regs_in_use > 0) { + fxch(); + fpop(); + } else { + ffree(1); + } + jmp(done); + } else { + // Stack: X Y + Label x_negative, y_odd; + + fldz(); // Stack: 0 X Y + fcmp(tmp, 1, true, false); // Stack: X Y + jcc(Assembler::above, x_negative); + + // X >= 0 + + fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y + fld_s(1); // Stack: X Y X Y + fast_pow(); // Stack: X^Y X Y + fcmp(tmp, 0, false, false); // Stack: X^Y X Y + // X^Y not equal to itself: X^Y is NaN go to slow case. + jcc(Assembler::parity, slow_case); + // get rid of duplicate arguments. Stack: X^Y + if (num_fpu_regs_in_use > 0) { + fxch(); fpop(); + fxch(); fpop(); + } else { + ffree(2); + ffree(1); + } + jmp(done); + + // X <= 0 + bind(x_negative); + + fld_s(1); // Stack: Y X Y + frndint(); // Stack: int(Y) X Y + fcmp(tmp, 2, false, false); // Stack: int(Y) X Y + jcc(Assembler::notEqual, slow_case); + + subptr(rsp, 8); + + // For X^Y, when X < 0, Y has to be an integer and the final + // result depends on whether it's odd or even. We just checked + // that int(Y) == Y. We move int(Y) to gp registers as a 64 bit + // integer to test its parity. If int(Y) is huge and doesn't fit + // in the 64 bit integer range, the integer indefinite value will + // end up in the gp registers. Huge numbers are all even, the + // integer indefinite number is even so it's fine. + +#ifdef ASSERT + // Let's check we don't end up with an integer indefinite number + // when not expected. First test for huge numbers: check whether + // int(Y)+1 == int(Y) which is true for very large numbers and + // those are all even. A 64 bit integer is guaranteed to not + // overflow for numbers where y+1 != y (when precision is set to + // double precision). + Label y_not_huge; + + fld1(); // Stack: 1 int(Y) X Y + fadd(1); // Stack: 1+int(Y) int(Y) X Y + +#ifdef _LP64 + // trip to memory to force the precision down from double extended + // precision + fstp_d(Address(rsp, 0)); + fld_d(Address(rsp, 0)); +#endif + + fcmp(tmp, 1, true, false); // Stack: int(Y) X Y +#endif + + // move int(Y) as 64 bit integer to thread's stack + fistp_d(Address(rsp,0)); // Stack: X Y + +#ifdef ASSERT + jcc(Assembler::notEqual, y_not_huge); + + // Y is huge so we know it's even. It may not fit in a 64 bit + // integer and we don't want the debug code below to see the + // integer indefinite value so overwrite int(Y) on the thread's + // stack with 0. + movl(Address(rsp, 0), 0); + movl(Address(rsp, 4), 0); + + bind(y_not_huge); +#endif + + fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y + fld_s(1); // Stack: X Y X Y + fabs(); // Stack: abs(X) Y X Y + fast_pow(); // Stack: abs(X)^Y X Y + fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y + // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case. + + pop(tmp2); + NOT_LP64(pop(tmp3)); + jcc(Assembler::parity, slow_case); + +#ifdef ASSERT + // Check that int(Y) is not integer indefinite value (int + // overflow). Shouldn't happen because for values that would + // overflow, 1+int(Y)==Y which was tested earlier. +#ifndef _LP64 + { + Label integer; + testl(tmp2, tmp2); + jcc(Assembler::notZero, integer); + cmpl(tmp3, 0x80000000); + jcc(Assembler::notZero, integer); + stop("integer indefinite value shouldn't be seen here"); + bind(integer); + } +#else + { + Label integer; + shlq(tmp2, 1); + jcc(Assembler::carryClear, integer); + jcc(Assembler::notZero, integer); + stop("integer indefinite value shouldn't be seen here"); + bind(integer); + } +#endif +#endif + + // get rid of duplicate arguments. Stack: X^Y + if (num_fpu_regs_in_use > 0) { + fxch(); fpop(); + fxch(); fpop(); + } else { + ffree(2); + ffree(1); + } + + testl(tmp2, 1); + jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y + // X <= 0, Y even: X^Y = -abs(X)^Y + + fchs(); // Stack: -abs(X)^Y Y + jmp(done); + } + + // slow case: runtime call + bind(slow_case); + + fpop(); // pop incorrect result or int(Y) + + fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow), + is_exp ? 1 : 2, num_fpu_regs_in_use); + + // Come here with result in F-TOS + bind(done); +} + void MacroAssembler::fpop() { ffree(); fincstp(); @@ -8043,6 +8294,144 @@ #endif } +void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) { + pusha(); + + // if we are coming from c1, xmm registers may be live + if (UseSSE >= 1) { + subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8)); + } + int off = 0; + if (UseSSE == 1) { + movflt(Address(rsp,off++*sizeof(jdouble)),xmm0); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm1); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm2); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm3); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm4); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm5); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm6); + movflt(Address(rsp,off++*sizeof(jdouble)),xmm7); + } else if (UseSSE >= 2) { + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7); +#ifdef _LP64 + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14); + movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15); +#endif + } + + // Preserve registers across runtime call + int incoming_argument_and_return_value_offset = -1; + if (num_fpu_regs_in_use > 1) { + // Must preserve all other FPU regs (could alternatively convert + // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash + // FPU state, but can not trust C compiler) + NEEDS_CLEANUP; + // NOTE that in this case we also push the incoming argument(s) to + // the stack and restore it later; we also use this stack slot to + // hold the return value from dsin, dcos etc. + for (int i = 0; i < num_fpu_regs_in_use; i++) { + subptr(rsp, sizeof(jdouble)); + fstp_d(Address(rsp, 0)); + } + incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1); + for (int i = nb_args-1; i >= 0; i--) { + fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble))); + } + } + + subptr(rsp, nb_args*sizeof(jdouble)); + for (int i = 0; i < nb_args; i++) { + fstp_d(Address(rsp, i*sizeof(jdouble))); + } + +#ifdef _LP64 + if (nb_args > 0) { + movdbl(xmm0, Address(rsp, 0)); + } + if (nb_args > 1) { + movdbl(xmm1, Address(rsp, sizeof(jdouble))); + } + assert(nb_args <= 2, "unsupported number of args"); +#endif // _LP64 + + // NOTE: we must not use call_VM_leaf here because that requires a + // complete interpreter frame in debug mode -- same bug as 4387334 + // MacroAssembler::call_VM_leaf_base is perfectly safe and will + // do proper 64bit abi + + NEEDS_CLEANUP; + // Need to add stack banging before this runtime call if it needs to + // be taken; however, there is no generic stack banging routine at + // the MacroAssembler level + + MacroAssembler::call_VM_leaf_base(runtime_entry, 0); + +#ifdef _LP64 + movsd(Address(rsp, 0), xmm0); + fld_d(Address(rsp, 0)); +#endif // _LP64 + addptr(rsp, sizeof(jdouble) * nb_args); + if (num_fpu_regs_in_use > 1) { + // Must save return value to stack and then restore entire FPU + // stack except incoming arguments + fstp_d(Address(rsp, incoming_argument_and_return_value_offset)); + for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) { + fld_d(Address(rsp, 0)); + addptr(rsp, sizeof(jdouble)); + } + fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble))); + addptr(rsp, sizeof(jdouble) * nb_args); + } + + off = 0; + if (UseSSE == 1) { + movflt(xmm0, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm1, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm2, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm3, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm4, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm5, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm6, Address(rsp,off++*sizeof(jdouble))); + movflt(xmm7, Address(rsp,off++*sizeof(jdouble))); + } else if (UseSSE >= 2) { + movdbl(xmm0, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm1, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm2, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm3, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm4, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm5, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm6, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm7, Address(rsp,off++*sizeof(jdouble))); +#ifdef _LP64 + movdbl(xmm8, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm9, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm10, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm11, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm12, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm13, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm14, Address(rsp,off++*sizeof(jdouble))); + movdbl(xmm15, Address(rsp,off++*sizeof(jdouble))); +#endif + } + if (UseSSE >= 1) { + addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8)); + } + popa(); +} + static const double pi_4 = 0.7853981633974483; void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) { @@ -8090,73 +8479,27 @@ // slow case: runtime call bind(slow_case); - // Preserve registers across runtime call - pusha(); - int incoming_argument_and_return_value_offset = -1; - if (num_fpu_regs_in_use > 1) { - // Must preserve all other FPU regs (could alternatively convert - // SharedRuntime::dsin and dcos into assembly routines known not to trash - // FPU state, but can not trust C compiler) - NEEDS_CLEANUP; - // NOTE that in this case we also push the incoming argument to - // the stack and restore it later; we also use this stack slot to - // hold the return value from dsin or dcos. - for (int i = 0; i < num_fpu_regs_in_use; i++) { - subptr(rsp, sizeof(jdouble)); - fstp_d(Address(rsp, 0)); - } - incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1); - fld_d(Address(rsp, incoming_argument_and_return_value_offset)); - } - subptr(rsp, sizeof(jdouble)); - fstp_d(Address(rsp, 0)); -#ifdef _LP64 - movdbl(xmm0, Address(rsp, 0)); -#endif // _LP64 - // NOTE: we must not use call_VM_leaf here because that requires a - // complete interpreter frame in debug mode -- same bug as 4387334 - // MacroAssembler::call_VM_leaf_base is perfectly safe and will - // do proper 64bit abi - - NEEDS_CLEANUP; - // Need to add stack banging before this runtime call if it needs to - // be taken; however, there is no generic stack banging routine at - // the MacroAssembler level switch(trig) { case 's': { - MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0); + fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use); } break; case 'c': { - MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0); + fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use); } break; case 't': { - MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0); + fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use); } break; default: assert(false, "bad intrinsic"); break; } -#ifdef _LP64 - movsd(Address(rsp, 0), xmm0); - fld_d(Address(rsp, 0)); -#endif // _LP64 - addptr(rsp, sizeof(jdouble)); - if (num_fpu_regs_in_use > 1) { - // Must save return value to stack and then restore entire FPU stack - fstp_d(Address(rsp, incoming_argument_and_return_value_offset)); - for (int i = 0; i < num_fpu_regs_in_use; i++) { - fld_d(Address(rsp, 0)); - addptr(rsp, sizeof(jdouble)); - } - } - popa(); // Come here with result in F-TOS bind(done); --- old/src/cpu/x86/vm/assembler_x86.hpp 2012-04-16 11:11:02.683246608 +0200 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2012-04-16 11:11:02.489885196 +0200 @@ -1148,6 +1148,9 @@ void fxsave(Address dst); void fyl2x(); + void frndint(); + void f2xm1(); + void fldl2e(); void hlt(); @@ -2387,7 +2390,28 @@ void ldmxcsr(Address src) { Assembler::ldmxcsr(src); } void ldmxcsr(AddressLiteral src); + // compute pow(x,y) and exp(x) with x86 instructions. Don't cover + // all corner cases and may result in NaN and require fallback to a + // runtime call. + void fast_pow(); + void fast_exp(); + + // computes exp(x). Fallback to runtime call included. + void exp_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(true, num_fpu_regs_in_use); } + // computes pow(x,y). Fallback to runtime call included. + void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(false, num_fpu_regs_in_use); } + private: + + // call runtime as a fallback for trig functions and pow/exp. + void fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use); + + // computes 2^(Ylog2X); Ylog2X in ST(0) + void pow_exp_core_encoding(); + + // computes pow(x,y) or exp(x). Fallback to runtime call included. + void pow_or_exp(bool is_exp, int num_fpu_regs_in_use); + // these are private because users should be doing movflt/movdbl void movss(Address dst, XMMRegister src) { Assembler::movss(dst, src); } --- old/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp 2012-04-16 11:11:04.141051510 +0200 +++ new/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp 2012-04-16 11:11:03.946871938 +0200 @@ -2433,6 +2433,12 @@ // Should consider not saving rbx, if not necessary __ trigfunc('t', op->as_Op2()->fpu_stack_size()); break; + case lir_exp : + __ exp_with_fallback(op->as_Op2()->fpu_stack_size()); + break; + case lir_pow : + __ pow_with_fallback(op->as_Op2()->fpu_stack_size()); + break; default : ShouldNotReachHere(); } } else { --- old/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp 2012-04-16 11:11:05.615188559 +0200 +++ new/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp 2012-04-16 11:11:05.426377224 +0200 @@ -823,7 +823,7 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) { - assert(x->number_of_arguments() == 1, "wrong type"); + assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type"); LIRItem value(x->argument_at(0), this); bool use_fpu = false; @@ -834,6 +834,8 @@ case vmIntrinsics::_dtan: case vmIntrinsics::_dlog: case vmIntrinsics::_dlog10: + case vmIntrinsics::_dexp: + case vmIntrinsics::_dpow: use_fpu = true; } } else { @@ -843,20 +845,37 @@ value.load_item(); LIR_Opr calc_input = value.result(); + LIR_Opr calc_input2 = NULL; + if (x->id() == vmIntrinsics::_dpow) { + LIRItem extra_arg(x->argument_at(1), this); + if (UseSSE < 2) { + extra_arg.set_destroys_register(); + } + extra_arg.load_item(); + calc_input2 = extra_arg.result(); + } LIR_Opr calc_result = rlock_result(x); - // sin and cos need two free fpu stack slots, so register two temporary operands + // sin, cos, pow and exp need two free fpu stack slots, so register + // two temporary operands LIR_Opr tmp1 = FrameMap::caller_save_fpu_reg_at(0); LIR_Opr tmp2 = FrameMap::caller_save_fpu_reg_at(1); if (use_fpu) { LIR_Opr tmp = FrameMap::fpu0_double_opr; + int tmp_start = 1; + if (calc_input2 != NULL) { + __ move(calc_input2, tmp); + tmp_start = 2; + calc_input2 = tmp; + } __ move(calc_input, tmp); calc_input = tmp; calc_result = tmp; - tmp1 = FrameMap::caller_save_fpu_reg_at(1); - tmp2 = FrameMap::caller_save_fpu_reg_at(2); + + tmp1 = FrameMap::caller_save_fpu_reg_at(tmp_start); + tmp2 = FrameMap::caller_save_fpu_reg_at(tmp_start + 1); } switch(x->id()) { @@ -867,6 +886,8 @@ case vmIntrinsics::_dtan: __ tan (calc_input, calc_result, tmp1, tmp2); break; case vmIntrinsics::_dlog: __ log (calc_input, calc_result, tmp1); break; case vmIntrinsics::_dlog10: __ log10(calc_input, calc_result, tmp1); break; + case vmIntrinsics::_dexp: __ exp (calc_input, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break; + case vmIntrinsics::_dpow: __ pow (calc_input, calc_input2, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break; default: ShouldNotReachHere(); } --- old/src/cpu/x86/vm/c1_LinearScan_x86.cpp 2012-04-16 11:11:06.968341207 +0200 +++ new/src/cpu/x86/vm/c1_LinearScan_x86.cpp 2012-04-16 11:11:06.775527610 +0200 @@ -690,8 +690,8 @@ case lir_mul_strictfp: case lir_div_strictfp: { - assert(op2->tmp_opr()->is_fpu_register(), "strict operations need temporary fpu stack slot"); - insert_free_if_dead(op2->tmp_opr()); + assert(op2->tmp1_opr()->is_fpu_register(), "strict operations need temporary fpu stack slot"); + insert_free_if_dead(op2->tmp1_opr()); assert(sim()->stack_size() <= 7, "at least one stack slot must be free"); // fall-through: continue with the normal handling of lir_mul and lir_div } @@ -787,16 +787,17 @@ case lir_log: case lir_log10: { - // log and log10 needs one temporary fpu stack slot, so there is ontemporary - // registers stored in temp of the operation. - // the stack allocator must guarantee that the stack slots are really free, - // otherwise there might be a stack overflow. + // log and log10 need one temporary fpu stack slot, so + // there is one temporary registers stored in temp of the + // operation. the stack allocator must guarantee that the stack + // slots are really free, otherwise there might be a stack + // overflow. assert(right->is_illegal(), "must be"); assert(left->is_fpu_register(), "must be"); assert(res->is_fpu_register(), "must be"); - assert(op2->tmp_opr()->is_fpu_register(), "must be"); + assert(op2->tmp1_opr()->is_fpu_register(), "must be"); - insert_free_if_dead(op2->tmp_opr()); + insert_free_if_dead(op2->tmp1_opr()); insert_free_if_dead(res, left); insert_exchange(left); do_rename(left, res); @@ -812,8 +813,9 @@ case lir_tan: case lir_sin: - case lir_cos: { - // sin and cos need two temporary fpu stack slots, so there are two temporary + case lir_cos: + case lir_exp: { + // sin, cos and exp need two temporary fpu stack slots, so there are two temporary // registers (stored in right and temp of the operation). // the stack allocator must guarantee that the stack slots are really free, // otherwise there might be a stack overflow. @@ -821,11 +823,11 @@ assert(res->is_fpu_register(), "must be"); // assert(left->is_last_use(), "old value gets destroyed"); assert(right->is_fpu_register(), "right is used as the first temporary register"); - assert(op2->tmp_opr()->is_fpu_register(), "temp is used as the second temporary register"); - assert(fpu_num(left) != fpu_num(right) && fpu_num(right) != fpu_num(op2->tmp_opr()) && fpu_num(op2->tmp_opr()) != fpu_num(res), "need distinct temp registers"); + assert(op2->tmp1_opr()->is_fpu_register(), "temp is used as the second temporary register"); + assert(fpu_num(left) != fpu_num(right) && fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers"); insert_free_if_dead(right); - insert_free_if_dead(op2->tmp_opr()); + insert_free_if_dead(op2->tmp1_opr()); insert_free_if_dead(res, left); insert_exchange(left); @@ -839,6 +841,53 @@ break; } + case lir_pow: { + // pow needs two temporary fpu stack slots, so there are two temporary + // registers (stored in tmp1 and tmp2 of the operation). + // the stack allocator must guarantee that the stack slots are really free, + // otherwise there might be a stack overflow. + assert(left->is_fpu_register(), "must be"); + assert(right->is_fpu_register(), "must be"); + assert(res->is_fpu_register(), "must be"); + + assert(op2->tmp1_opr()->is_fpu_register(), "tmp1 is the first temporary register"); + assert(op2->tmp2_opr()->is_fpu_register(), "tmp2 is the second temporary register"); + assert(fpu_num(left) != fpu_num(right) && fpu_num(left) != fpu_num(op2->tmp1_opr()) && fpu_num(left) != fpu_num(op2->tmp2_opr()) && fpu_num(left) != fpu_num(res), "need distinct temp registers"); + assert(fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(right) != fpu_num(op2->tmp2_opr()) && fpu_num(right) != fpu_num(res), "need distinct temp registers"); + assert(fpu_num(op2->tmp1_opr()) != fpu_num(op2->tmp2_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers"); + assert(fpu_num(op2->tmp2_opr()) != fpu_num(res), "need distinct temp registers"); + + insert_free_if_dead(op2->tmp1_opr()); + insert_free_if_dead(op2->tmp2_opr()); + + // Must bring both operands to top of stack with following operand ordering: + // * fpu stack before pow: ... right left + // * fpu stack after pow: ... left + + insert_free_if_dead(res, right); + + if (tos_offset(right) != 1) { + insert_exchange(right); + insert_exchange(1); + } + insert_exchange(left); + assert(tos_offset(right) == 1, "check"); + assert(tos_offset(left) == 0, "check"); + + new_left = to_fpu_stack_top(left); + new_right = to_fpu_stack(right); + + op2->set_fpu_stack_size(sim()->stack_size()); + assert(sim()->stack_size() <= 6, "at least two stack slots must be free"); + + sim()->pop(); + + do_rename(right, res); + + new_res = to_fpu_stack_top(res); + break; + } + default: { assert(false, "missed a fpu-operation"); } --- old/src/cpu/x86/vm/interpreter_x86_32.cpp 2012-04-16 11:11:08.348943032 +0200 +++ new/src/cpu/x86/vm/interpreter_x86_32.cpp 2012-04-16 11:11:08.140091522 +0200 @@ -181,6 +181,19 @@ __ push_fTOS(); __ pop_fTOS(); break; + case Interpreter::java_lang_math_pow: + __ fld_d(Address(rsp, 3*wordSize)); // second argument + __ pow_with_fallback(0); + // Store to stack to convert 80bit precision back to 64bits + __ push_fTOS(); + __ pop_fTOS(); + break; + case Interpreter::java_lang_math_exp: + __ exp_with_fallback(0); + // Store to stack to convert 80bit precision back to 64bits + __ push_fTOS(); + __ pop_fTOS(); + break; default : ShouldNotReachHere(); } --- old/src/cpu/x86/vm/interpreter_x86_64.cpp 2012-04-16 11:11:09.655301099 +0200 +++ new/src/cpu/x86/vm/interpreter_x86_64.cpp 2012-04-16 11:11:09.469702956 +0200 @@ -271,6 +271,14 @@ case Interpreter::java_lang_math_log10: __ flog10(); break; + case Interpreter::java_lang_math_pow: + __ fld_d(Address(rsp, 3*wordSize)); // second argument (one + // empty stack slot) + __ pow_with_fallback(0); + break; + case Interpreter::java_lang_math_exp: + __ exp_with_fallback(0); + break; default : ShouldNotReachHere(); } --- old/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2012-04-16 11:11:10.965066327 +0200 +++ new/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2012-04-16 11:11:10.772376580 +0200 @@ -2136,11 +2136,23 @@ __ trigfunc('t'); __ ret(0); } + { + StubCodeMark mark(this, "StubRoutines", "exp"); + StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc(); - // The intrinsic version of these seem to return the same value as - // the strict version. - StubRoutines::_intrinsic_exp = SharedRuntime::dexp; - StubRoutines::_intrinsic_pow = SharedRuntime::dpow; + __ fld_d(Address(rsp, 4)); + __ exp_with_fallback(0); + __ ret(0); + } + { + StubCodeMark mark(this, "StubRoutines", "pow"); + StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc(); + + __ fld_d(Address(rsp, 12)); + __ fld_d(Address(rsp, 4)); + __ pow_with_fallback(0); + __ ret(0); + } } public: --- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2012-04-16 11:11:12.469877955 +0200 +++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2012-04-16 11:11:12.266230753 +0200 @@ -2928,11 +2928,34 @@ __ addq(rsp, 8); __ ret(0); } + { + StubCodeMark mark(this, "StubRoutines", "exp"); + StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc(); - // The intrinsic version of these seem to return the same value as - // the strict version. - StubRoutines::_intrinsic_exp = SharedRuntime::dexp; - StubRoutines::_intrinsic_pow = SharedRuntime::dpow; + __ subq(rsp, 8); + __ movdbl(Address(rsp, 0), xmm0); + __ fld_d(Address(rsp, 0)); + __ exp_with_fallback(0); + __ fstp_d(Address(rsp, 0)); + __ movdbl(xmm0, Address(rsp, 0)); + __ addq(rsp, 8); + __ ret(0); + } + { + StubCodeMark mark(this, "StubRoutines", "pow"); + StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc(); + + __ subq(rsp, 8); + __ movdbl(Address(rsp, 0), xmm1); + __ fld_d(Address(rsp, 0)); + __ movdbl(Address(rsp, 0), xmm0); + __ fld_d(Address(rsp, 0)); + __ pow_with_fallback(0); + __ fstp_d(Address(rsp, 0)); + __ movdbl(xmm0, Address(rsp, 0)); + __ addq(rsp, 8); + __ ret(0); + } } #undef __ --- old/src/cpu/x86/vm/templateInterpreter_x86_32.cpp 2012-04-16 11:11:13.916607866 +0200 +++ new/src/cpu/x86/vm/templateInterpreter_x86_32.cpp 2012-04-16 11:11:13.723839496 +0200 @@ -1518,7 +1518,9 @@ case Interpreter::java_lang_math_abs : // fall thru case Interpreter::java_lang_math_log : // fall thru case Interpreter::java_lang_math_log10 : // fall thru - case Interpreter::java_lang_math_sqrt : entry_point = ((InterpreterGenerator*)this)->generate_math_entry(kind); break; + case Interpreter::java_lang_math_sqrt : // fall thru + case Interpreter::java_lang_math_pow : // fall thru + case Interpreter::java_lang_math_exp : entry_point = ((InterpreterGenerator*)this)->generate_math_entry(kind); break; case Interpreter::java_lang_ref_reference_get : entry_point = ((InterpreterGenerator*)this)->generate_Reference_get_entry(); break; default : ShouldNotReachHere(); break; @@ -1540,7 +1542,9 @@ case Interpreter::java_lang_math_abs : // fall thru case Interpreter::java_lang_math_log : // fall thru case Interpreter::java_lang_math_log10 : // fall thru - case Interpreter::java_lang_math_sqrt : + case Interpreter::java_lang_math_sqrt : // fall thru + case Interpreter::java_lang_math_pow : // fall thru + case Interpreter::java_lang_math_exp : return false; default: return true; --- old/src/cpu/x86/vm/templateInterpreter_x86_64.cpp 2012-04-16 11:11:15.570642980 +0200 +++ new/src/cpu/x86/vm/templateInterpreter_x86_64.cpp 2012-04-16 11:11:15.140565702 +0200 @@ -1534,7 +1534,9 @@ case Interpreter::java_lang_math_abs : // fall thru case Interpreter::java_lang_math_log : // fall thru case Interpreter::java_lang_math_log10 : // fall thru - case Interpreter::java_lang_math_sqrt : entry_point = ((InterpreterGenerator*) this)->generate_math_entry(kind); break; + case Interpreter::java_lang_math_sqrt : // fall thru + case Interpreter::java_lang_math_pow : // fall thru + case Interpreter::java_lang_math_exp : entry_point = ((InterpreterGenerator*) this)->generate_math_entry(kind); break; case Interpreter::java_lang_ref_reference_get : entry_point = ((InterpreterGenerator*)this)->generate_Reference_get_entry(); break; default : ShouldNotReachHere(); break; @@ -1558,7 +1560,9 @@ case Interpreter::java_lang_math_abs : // fall thru case Interpreter::java_lang_math_log : // fall thru case Interpreter::java_lang_math_log10 : // fall thru - case Interpreter::java_lang_math_sqrt : + case Interpreter::java_lang_math_sqrt : // fall thru + case Interpreter::java_lang_math_pow : // fall thru + case Interpreter::java_lang_math_exp : return false; default: return true; --- old/src/cpu/x86/vm/x86_32.ad 2012-04-16 11:11:16.993253025 +0200 +++ new/src/cpu/x86/vm/x86_32.ad 2012-04-16 11:11:16.755394951 +0200 @@ -2536,45 +2536,6 @@ __ fld_d(Address(rsp, 0)); %} - // Compute X^Y using Intel's fast hardware instructions, if possible. - // Otherwise return a NaN. - enc_class pow_exp_core_encoding %{ - // FPR1 holds Y*ln2(X). Compute FPR1 = 2^(Y*ln2(X)) - emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xC0); // fdup = fld st(0) Q Q - emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xFC); // frndint int(Q) Q - emit_opcode(cbuf,0xDC); emit_opcode(cbuf,0xE9); // fsub st(1) -= st(0); int(Q) frac(Q) - emit_opcode(cbuf,0xDB); // FISTP [ESP] frac(Q) - emit_opcode(cbuf,0x1C); - emit_d8(cbuf,0x24); - emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xF0); // f2xm1 2^frac(Q)-1 - emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xE8); // fld1 1 2^frac(Q)-1 - emit_opcode(cbuf,0xDE); emit_opcode(cbuf,0xC1); // faddp 2^frac(Q) - emit_opcode(cbuf,0x8B); // mov rax,[esp+0]=int(Q) - encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 0, false); - emit_opcode(cbuf,0xC7); // mov rcx,0xFFFFF800 - overflow mask - emit_rm(cbuf, 0x3, 0x0, ECX_enc); - emit_d32(cbuf,0xFFFFF800); - emit_opcode(cbuf,0x81); // add rax,1023 - the double exponent bias - emit_rm(cbuf, 0x3, 0x0, EAX_enc); - emit_d32(cbuf,1023); - emit_opcode(cbuf,0x8B); // mov rbx,eax - emit_rm(cbuf, 0x3, EBX_enc, EAX_enc); - emit_opcode(cbuf,0xC1); // shl rax,20 - Slide to exponent position - emit_rm(cbuf,0x3,0x4,EAX_enc); - emit_d8(cbuf,20); - emit_opcode(cbuf,0x85); // test rbx,ecx - check for overflow - emit_rm(cbuf, 0x3, EBX_enc, ECX_enc); - emit_opcode(cbuf,0x0F); emit_opcode(cbuf,0x45); // CMOVne rax,ecx - overflow; stuff NAN into EAX - emit_rm(cbuf, 0x3, EAX_enc, ECX_enc); - emit_opcode(cbuf,0x89); // mov [esp+4],eax - Store as part of double word - encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 4, false); - emit_opcode(cbuf,0xC7); // mov [esp+0],0 - [ESP] = (double)(1< $Y // KILL $rax, $rcx, $rdx" %} + ins_encode %{ + __ subptr(rsp, 8); + __ fld_s($X$$reg - 1); + __ fast_pow(); + __ addptr(rsp, 8); + %} ins_pipe( pipe_slow ); %} -instruct powD_reg(regD dst, regD src0, regD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx ) %{ +instruct powD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{ predicate (UseSSE>=2); match(Set dst (PowD src0 src1)); // Raise src0 to the src1'th power - effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx ); - format %{ "SUB ESP,8\t\t# Fast-path POW encoding\n\t" - "MOVSD [ESP],$src1\n\t" - "FLD FPR1,$src1\n\t" - "MOVSD [ESP],$src0\n\t" - "FLD FPR1,$src0\n\t" - "FYL2X \t\t\t# Q=Y*ln2(X)\n\t" - - "FDUP \t\t\t# Q Q\n\t" - "FRNDINT\t\t\t# int(Q) Q\n\t" - "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t" - "FISTP dword [ESP]\n\t" - "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t" - "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t" - "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead - "MOV EAX,[ESP]\t# Pick up int(Q)\n\t" - "MOV ECX,0xFFFFF800\t# Overflow mask\n\t" - "ADD EAX,1023\t\t# Double exponent bias\n\t" - "MOV EBX,EAX\t\t# Preshifted biased expo\n\t" - "SHL EAX,20\t\t# Shift exponent into place\n\t" - "TEST EBX,ECX\t\t# Check for overflow\n\t" - "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t" - "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t" - "MOV [ESP+0],0\n\t" - "FMUL ST(0),[ESP+0]\t# Scale\n\t" - - "FST_D [ESP]\n\t" - "MOVSD $dst,[ESP]\n\t" - "ADD ESP,8" - %} - ins_encode( push_stack_temp_qword, - push_xmm_to_fpr1(src1), - push_xmm_to_fpr1(src0), - Opcode(0xD9), Opcode(0xF1), // fyl2x - pow_exp_core_encoding, - Push_ResultD(dst) ); + effect(KILL rax, KILL rdx, KILL rcx, KILL cr); + format %{ "fast_pow $src0 $src1 -> $dst // KILL $rax, $rcx, $rdx" %} + ins_encode %{ + __ subptr(rsp, 8); + __ movdbl(Address(rsp, 0), $src1$$XMMRegister); + __ fld_d(Address(rsp, 0)); + __ movdbl(Address(rsp, 0), $src0$$XMMRegister); + __ fld_d(Address(rsp, 0)); + __ fast_pow(); + __ fstp_d(Address(rsp, 0)); + __ movdbl($dst$$XMMRegister, Address(rsp, 0)); + __ addptr(rsp, 8); + %} ins_pipe( pipe_slow ); %} -instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{ +instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{ predicate (UseSSE<=1); match(Set dpr1 (ExpD dpr1)); - effect(KILL rax, KILL rbx, KILL rcx); - format %{ "SUB ESP,8\t\t# Fast-path EXP encoding" - "FLDL2E \t\t\t# Ld log2(e) X\n\t" - "FMULP \t\t\t# Q=X*log2(e)\n\t" - - "FDUP \t\t\t# Q Q\n\t" - "FRNDINT\t\t\t# int(Q) Q\n\t" - "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t" - "FISTP dword [ESP]\n\t" - "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t" - "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t" - "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead - "MOV EAX,[ESP]\t# Pick up int(Q)\n\t" - "MOV ECX,0xFFFFF800\t# Overflow mask\n\t" - "ADD EAX,1023\t\t# Double exponent bias\n\t" - "MOV EBX,EAX\t\t# Preshifted biased expo\n\t" - "SHL EAX,20\t\t# Shift exponent into place\n\t" - "TEST EBX,ECX\t\t# Check for overflow\n\t" - "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t" - "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t" - "MOV [ESP+0],0\n\t" - "FMUL ST(0),[ESP+0]\t# Scale\n\t" - - "ADD ESP,8" - %} - ins_encode( push_stack_temp_qword, - Opcode(0xD9), Opcode(0xEA), // fldl2e - Opcode(0xDE), Opcode(0xC9), // fmulp - pow_exp_core_encoding, - pop_stack_temp_qword); + effect(KILL rax, KILL rcx, KILL rdx, KILL cr); + format %{ "fast_exp $dpr1 -> $dpr1 // KILL $rax, $rcx, $rdx" %} + ins_encode %{ + __ fast_exp(); + %} ins_pipe( pipe_slow ); %} -instruct expD_reg(regD dst, regD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{ +instruct expD_reg(regD dst, regD src, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{ predicate (UseSSE>=2); match(Set dst (ExpD src)); - effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx); - format %{ "SUB ESP,8\t\t# Fast-path EXP encoding\n\t" - "MOVSD [ESP],$src\n\t" - "FLDL2E \t\t\t# Ld log2(e) X\n\t" - "FMULP \t\t\t# Q=X*log2(e) X\n\t" - - "FDUP \t\t\t# Q Q\n\t" - "FRNDINT\t\t\t# int(Q) Q\n\t" - "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t" - "FISTP dword [ESP]\n\t" - "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t" - "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t" - "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead - "MOV EAX,[ESP]\t# Pick up int(Q)\n\t" - "MOV ECX,0xFFFFF800\t# Overflow mask\n\t" - "ADD EAX,1023\t\t# Double exponent bias\n\t" - "MOV EBX,EAX\t\t# Preshifted biased expo\n\t" - "SHL EAX,20\t\t# Shift exponent into place\n\t" - "TEST EBX,ECX\t\t# Check for overflow\n\t" - "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t" - "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t" - "MOV [ESP+0],0\n\t" - "FMUL ST(0),[ESP+0]\t# Scale\n\t" - - "FST_D [ESP]\n\t" - "MOVSD $dst,[ESP]\n\t" - "ADD ESP,8" - %} - ins_encode( Push_SrcD(src), - Opcode(0xD9), Opcode(0xEA), // fldl2e - Opcode(0xDE), Opcode(0xC9), // fmulp - pow_exp_core_encoding, - Push_ResultD(dst) ); + effect(KILL rax, KILL rcx, KILL rdx, KILL cr); + format %{ "fast_exp $dst -> $src // KILL $rax, $rcx, $rdx" %} + ins_encode %{ + __ subptr(rsp, 8); + __ movdbl(Address(rsp, 0), $src$$XMMRegister); + __ fld_d(Address(rsp, 0)); + __ fast_exp(); + __ fstp_d(Address(rsp, 0)); + __ movdbl($dst$$XMMRegister, Address(rsp, 0)); + __ addptr(rsp, 8); + %} ins_pipe( pipe_slow ); %} - - instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{ predicate (UseSSE<=1); // The source Double operand on FPU stack --- old/src/cpu/x86/vm/x86_64.ad 2012-04-16 11:11:18.949904128 +0200 +++ new/src/cpu/x86/vm/x86_64.ad 2012-04-16 11:11:18.717902675 +0200 @@ -9842,7 +9842,39 @@ ins_pipe( pipe_slow ); %} +instruct powD_reg(regD dst, regD src0, regD src1, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{ + match(Set dst (PowD src0 src1)); // Raise src0 to the src1'th power + effect(KILL rax, KILL rdx, KILL rcx, KILL cr); + format %{ "fast_pow $src0 $src1 -> $dst // KILL $rax, $rcx, $rdx" %} + ins_encode %{ + __ subptr(rsp, 8); + __ movdbl(Address(rsp, 0), $src1$$XMMRegister); + __ fld_d(Address(rsp, 0)); + __ movdbl(Address(rsp, 0), $src0$$XMMRegister); + __ fld_d(Address(rsp, 0)); + __ fast_pow(); + __ fstp_d(Address(rsp, 0)); + __ movdbl($dst$$XMMRegister, Address(rsp, 0)); + __ addptr(rsp, 8); + %} + ins_pipe( pipe_slow ); +%} +instruct expD_reg(regD dst, regD src, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{ + match(Set dst (ExpD src)); + effect(KILL rax, KILL rcx, KILL rdx, KILL cr); + format %{ "fast_exp $dst -> $src // KILL $rax, $rcx, $rdx" %} + ins_encode %{ + __ subptr(rsp, 8); + __ movdbl(Address(rsp, 0), $src$$XMMRegister); + __ fld_d(Address(rsp, 0)); + __ fast_exp(); + __ fstp_d(Address(rsp, 0)); + __ movdbl($dst$$XMMRegister, Address(rsp, 0)); + __ addptr(rsp, 8); + %} + ins_pipe( pipe_slow ); +%} //----------Arithmetic Conversion Instructions--------------------------------- --- old/src/share/vm/c1/c1_GraphBuilder.cpp 2012-04-16 11:11:20.749501515 +0200 +++ new/src/share/vm/c1/c1_GraphBuilder.cpp 2012-04-16 11:11:20.548378358 +0200 @@ -2947,6 +2947,8 @@ case vmIntrinsics::_dtan : // fall through case vmIntrinsics::_dlog : // fall through case vmIntrinsics::_dlog10 : // fall through + case vmIntrinsics::_dexp : // fall through + case vmIntrinsics::_dpow : // fall through { // Compiles where the root method is an intrinsic need a special // compilation environment because the bytecodes for the method @@ -2967,6 +2969,9 @@ _state = start_block->state()->copy_for_parsing(); _last = start_block; load_local(doubleType, 0); + if (scope->method()->intrinsic_id() == vmIntrinsics::_dpow) { + load_local(doubleType, 2); + } // Emit the intrinsic node. bool result = try_inline_intrinsics(scope->method()); @@ -3167,6 +3172,8 @@ case vmIntrinsics::_dtan : // fall through case vmIntrinsics::_dlog : // fall through case vmIntrinsics::_dlog10 : // fall through + case vmIntrinsics::_dexp : // fall through + case vmIntrinsics::_dpow : // fall through if (!InlineMathNatives) return false; cantrap = false; preserves_state = true; --- old/src/share/vm/c1/c1_LIR.cpp 2012-04-16 11:11:22.282637717 +0200 +++ new/src/share/vm/c1/c1_LIR.cpp 2012-04-16 11:11:22.092552533 +0200 @@ -624,11 +624,13 @@ { assert(op->as_Op2() != NULL, "must be"); LIR_Op2* op2 = (LIR_Op2*)op; + assert(op2->_tmp2->is_illegal() && op2->_tmp3->is_illegal() && + op2->_tmp4->is_illegal() && op2->_tmp5->is_illegal(), "not used"); if (op2->_info) do_info(op2->_info); if (op2->_opr1->is_valid()) do_input(op2->_opr1); if (op2->_opr2->is_valid()) do_input(op2->_opr2); - if (op2->_tmp->is_valid()) do_temp(op2->_tmp); + if (op2->_tmp1->is_valid()) do_temp(op2->_tmp1); if (op2->_result->is_valid()) do_output(op2->_result); break; @@ -641,7 +643,8 @@ assert(op->as_Op2() != NULL, "must be"); LIR_Op2* op2 = (LIR_Op2*)op; - assert(op2->_info == NULL && op2->_tmp->is_illegal(), "not used"); + assert(op2->_info == NULL && op2->_tmp1->is_illegal() && op2->_tmp2->is_illegal() && + op2->_tmp3->is_illegal() && op2->_tmp4->is_illegal() && op2->_tmp5->is_illegal(), "not used"); assert(op2->_opr1->is_valid() && op2->_opr2->is_valid() && op2->_result->is_valid(), "used"); do_input(op2->_opr1); @@ -665,10 +668,12 @@ assert(op2->_opr1->is_valid(), "used"); assert(op2->_opr2->is_valid(), "used"); assert(op2->_result->is_valid(), "used"); + assert(op2->_tmp2->is_illegal() && op2->_tmp3->is_illegal() && + op2->_tmp4->is_illegal() && op2->_tmp5->is_illegal(), "not used"); do_input(op2->_opr1); do_temp(op2->_opr1); do_input(op2->_opr2); do_temp(op2->_opr2); - if (op2->_tmp->is_valid()) do_temp(op2->_tmp); + if (op2->_tmp1->is_valid()) do_temp(op2->_tmp1); do_output(op2->_result); break; @@ -682,6 +687,8 @@ if (op2->_opr1->is_valid()) do_temp(op2->_opr1); if (op2->_opr2->is_valid()) do_input(op2->_opr2); // exception object is input parameter assert(op2->_result->is_illegal(), "no result"); + assert(op2->_tmp2->is_illegal() && op2->_tmp3->is_illegal() && + op2->_tmp4->is_illegal() && op2->_tmp5->is_illegal(), "not used"); break; } @@ -702,7 +709,8 @@ case lir_sin: case lir_cos: case lir_log: - case lir_log10: { + case lir_log10: + case lir_exp: { assert(op->as_Op2() != NULL, "must be"); LIR_Op2* op2 = (LIR_Op2*)op; @@ -711,16 +719,47 @@ // Register input operand as temp to guarantee that it doesn't // overlap with the input. assert(op2->_info == NULL, "not used"); + assert(op2->_tmp5->is_illegal(), "not used"); + assert(op2->_tmp2->is_valid() == (op->code() == lir_exp), "not used"); + assert(op2->_tmp3->is_valid() == (op->code() == lir_exp), "not used"); + assert(op2->_tmp4->is_valid() == (op->code() == lir_exp), "not used"); assert(op2->_opr1->is_valid(), "used"); do_input(op2->_opr1); do_temp(op2->_opr1); if (op2->_opr2->is_valid()) do_temp(op2->_opr2); - if (op2->_tmp->is_valid()) do_temp(op2->_tmp); + if (op2->_tmp1->is_valid()) do_temp(op2->_tmp1); + if (op2->_tmp2->is_valid()) do_temp(op2->_tmp2); + if (op2->_tmp3->is_valid()) do_temp(op2->_tmp3); + if (op2->_tmp4->is_valid()) do_temp(op2->_tmp4); if (op2->_result->is_valid()) do_output(op2->_result); break; } + case lir_pow: { + assert(op->as_Op2() != NULL, "must be"); + LIR_Op2* op2 = (LIR_Op2*)op; + + // On x86 pow needs two temporary fpu stack slots: tmp1 and + // tmp2. Register input operands as temps to guarantee that it + // doesn't overlap with the temporary slots. + assert(op2->_info == NULL, "not used"); + assert(op2->_opr1->is_valid() && op2->_opr2->is_valid(), "used"); + assert(op2->_tmp1->is_valid() && op2->_tmp2->is_valid() && op2->_tmp3->is_valid() + && op2->_tmp4->is_valid() && op2->_tmp5->is_valid(), "used"); + assert(op2->_result->is_valid(), "used"); + + do_input(op2->_opr1); do_temp(op2->_opr1); + do_input(op2->_opr2); do_temp(op2->_opr2); + do_temp(op2->_tmp1); + do_temp(op2->_tmp2); + do_temp(op2->_tmp3); + do_temp(op2->_tmp4); + do_temp(op2->_tmp5); + do_output(op2->_result); + + break; + } // LIR_Op3 case lir_idiv: @@ -1670,6 +1709,8 @@ case lir_tan: s = "tan"; break; case lir_log: s = "log"; break; case lir_log10: s = "log10"; break; + case lir_exp: s = "exp"; break; + case lir_pow: s = "pow"; break; case lir_logic_and: s = "logic_and"; break; case lir_logic_or: s = "logic_or"; break; case lir_logic_xor: s = "logic_xor"; break; @@ -1892,7 +1933,11 @@ } in_opr1()->print(out); out->print(" "); in_opr2()->print(out); out->print(" "); - if (tmp_opr()->is_valid()) { tmp_opr()->print(out); out->print(" "); } + if (tmp1_opr()->is_valid()) { tmp1_opr()->print(out); out->print(" "); } + if (tmp2_opr()->is_valid()) { tmp2_opr()->print(out); out->print(" "); } + if (tmp3_opr()->is_valid()) { tmp3_opr()->print(out); out->print(" "); } + if (tmp4_opr()->is_valid()) { tmp4_opr()->print(out); out->print(" "); } + if (tmp5_opr()->is_valid()) { tmp5_opr()->print(out); out->print(" "); } result_opr()->print(out); } --- old/src/share/vm/c1/c1_LIR.hpp 2012-04-16 11:11:23.718507909 +0200 +++ new/src/share/vm/c1/c1_LIR.hpp 2012-04-16 11:11:23.520954666 +0200 @@ -916,6 +916,8 @@ , lir_tan , lir_log , lir_log10 + , lir_exp + , lir_pow , lir_logic_and , lir_logic_or , lir_logic_xor @@ -1560,7 +1562,11 @@ LIR_Opr _opr1; LIR_Opr _opr2; BasicType _type; - LIR_Opr _tmp; + LIR_Opr _tmp1; + LIR_Opr _tmp2; + LIR_Opr _tmp3; + LIR_Opr _tmp4; + LIR_Opr _tmp5; LIR_Condition _condition; void verify() const; @@ -1573,7 +1579,11 @@ , _type(T_ILLEGAL) , _condition(condition) , _fpu_stack_size(0) - , _tmp(LIR_OprFact::illegalOpr) { + , _tmp1(LIR_OprFact::illegalOpr) + , _tmp2(LIR_OprFact::illegalOpr) + , _tmp3(LIR_OprFact::illegalOpr) + , _tmp4(LIR_OprFact::illegalOpr) + , _tmp5(LIR_OprFact::illegalOpr) { assert(code == lir_cmp, "code check"); } @@ -1584,7 +1594,11 @@ , _type(type) , _condition(condition) , _fpu_stack_size(0) - , _tmp(LIR_OprFact::illegalOpr) { + , _tmp1(LIR_OprFact::illegalOpr) + , _tmp2(LIR_OprFact::illegalOpr) + , _tmp3(LIR_OprFact::illegalOpr) + , _tmp4(LIR_OprFact::illegalOpr) + , _tmp5(LIR_OprFact::illegalOpr) { assert(code == lir_cmove, "code check"); assert(type != T_ILLEGAL, "cmove should have type"); } @@ -1597,25 +1611,38 @@ , _type(type) , _condition(lir_cond_unknown) , _fpu_stack_size(0) - , _tmp(LIR_OprFact::illegalOpr) { + , _tmp1(LIR_OprFact::illegalOpr) + , _tmp2(LIR_OprFact::illegalOpr) + , _tmp3(LIR_OprFact::illegalOpr) + , _tmp4(LIR_OprFact::illegalOpr) + , _tmp5(LIR_OprFact::illegalOpr) { assert(code != lir_cmp && is_in_range(code, begin_op2, end_op2), "code check"); } - LIR_Op2(LIR_Code code, LIR_Opr opr1, LIR_Opr opr2, LIR_Opr result, LIR_Opr tmp) + LIR_Op2(LIR_Code code, LIR_Opr opr1, LIR_Opr opr2, LIR_Opr result, LIR_Opr tmp1, LIR_Opr tmp2 = LIR_OprFact::illegalOpr, + LIR_Opr tmp3 = LIR_OprFact::illegalOpr, LIR_Opr tmp4 = LIR_OprFact::illegalOpr, LIR_Opr tmp5 = LIR_OprFact::illegalOpr) : LIR_Op(code, result, NULL) , _opr1(opr1) , _opr2(opr2) , _type(T_ILLEGAL) , _condition(lir_cond_unknown) , _fpu_stack_size(0) - , _tmp(tmp) { + , _tmp1(tmp1) + , _tmp2(tmp2) + , _tmp3(tmp3) + , _tmp4(tmp4) + , _tmp5(tmp5) { assert(code != lir_cmp && is_in_range(code, begin_op2, end_op2), "code check"); } LIR_Opr in_opr1() const { return _opr1; } LIR_Opr in_opr2() const { return _opr2; } BasicType type() const { return _type; } - LIR_Opr tmp_opr() const { return _tmp; } + LIR_Opr tmp1_opr() const { return _tmp1; } + LIR_Opr tmp2_opr() const { return _tmp2; } + LIR_Opr tmp3_opr() const { return _tmp3; } + LIR_Opr tmp4_opr() const { return _tmp4; } + LIR_Opr tmp5_opr() const { return _tmp5; } LIR_Condition condition() const { assert(code() == lir_cmp || code() == lir_cmove, "only valid for cmp and cmove"); return _condition; } @@ -2025,6 +2052,8 @@ void sin (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_sin , from, tmp1, to, tmp2)); } void cos (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_cos , from, tmp1, to, tmp2)); } void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); } + void exp (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_exp , from, tmp1, to, tmp2, tmp3, tmp4, tmp5)); } + void pow (LIR_Opr arg1, LIR_Opr arg2, LIR_Opr res, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_pow, arg1, arg2, res, tmp1, tmp2, tmp3, tmp4, tmp5)); } void add (LIR_Opr left, LIR_Opr right, LIR_Opr res) { append(new LIR_Op2(lir_add, left, right, res)); } void sub (LIR_Opr left, LIR_Opr right, LIR_Opr res, CodeEmitInfo* info = NULL) { append(new LIR_Op2(lir_sub, left, right, res, info)); } --- old/src/share/vm/c1/c1_LIRAssembler.cpp 2012-04-16 11:11:25.183388286 +0200 +++ new/src/share/vm/c1/c1_LIRAssembler.cpp 2012-04-16 11:11:24.989859822 +0200 @@ -718,7 +718,7 @@ if (op->in_opr2()->is_constant()) { shift_op(op->code(), op->in_opr1(), op->in_opr2()->as_constant_ptr()->as_jint(), op->result_opr()); } else { - shift_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op->tmp_opr()); + shift_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op->tmp1_opr()); } break; @@ -746,6 +746,8 @@ case lir_cos: case lir_log: case lir_log10: + case lir_exp: + case lir_pow: intrinsic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op); break; --- old/src/share/vm/c1/c1_LIRGenerator.cpp 2012-04-16 11:11:26.571866864 +0200 +++ new/src/share/vm/c1/c1_LIRGenerator.cpp 2012-04-16 11:11:26.359034151 +0200 @@ -2920,7 +2920,9 @@ case vmIntrinsics::_dsqrt: // fall through case vmIntrinsics::_dtan: // fall through case vmIntrinsics::_dsin : // fall through - case vmIntrinsics::_dcos : do_MathIntrinsic(x); break; + case vmIntrinsics::_dcos : // fall through + case vmIntrinsics::_dexp : // fall through + case vmIntrinsics::_dpow : do_MathIntrinsic(x); break; case vmIntrinsics::_arraycopy: do_ArrayCopy(x); break; // java.nio.Buffer.checkIndex --- old/src/share/vm/c1/c1_LinearScan.cpp 2012-04-16 11:11:28.071334690 +0200 +++ new/src/share/vm/c1/c1_LinearScan.cpp 2012-04-16 11:11:27.860024259 +0200 @@ -6579,6 +6579,8 @@ case lir_abs: case lir_log10: case lir_log: + case lir_pow: + case lir_exp: case lir_logic_and: case lir_logic_or: case lir_logic_xor: --- old/src/share/vm/interpreter/abstractInterpreter.hpp 2012-04-16 11:11:29.709824250 +0200 +++ new/src/share/vm/interpreter/abstractInterpreter.hpp 2012-04-16 11:11:29.531534921 +0200 @@ -107,6 +107,8 @@ java_lang_math_sqrt, // implementation of java.lang.Math.sqrt (x) java_lang_math_log, // implementation of java.lang.Math.log (x) java_lang_math_log10, // implementation of java.lang.Math.log10 (x) + java_lang_math_pow, // implementation of java.lang.Math.pow (x,y) + java_lang_math_exp, // implementation of java.lang.Math.exp (x) java_lang_ref_reference_get, // implementation of java.lang.ref.Reference.get() number_of_method_entries, invalid = -1 --- old/src/share/vm/interpreter/interpreter.cpp 2012-04-16 11:11:31.033993124 +0200 +++ new/src/share/vm/interpreter/interpreter.cpp 2012-04-16 11:11:30.852293081 +0200 @@ -221,6 +221,8 @@ case vmIntrinsics::_dsqrt : return java_lang_math_sqrt ; case vmIntrinsics::_dlog : return java_lang_math_log ; case vmIntrinsics::_dlog10: return java_lang_math_log10; + case vmIntrinsics::_dpow : return java_lang_math_pow ; + case vmIntrinsics::_dexp : return java_lang_math_exp ; case vmIntrinsics::_Reference_get: return java_lang_ref_reference_get; --- old/src/share/vm/interpreter/templateInterpreter.cpp 2012-04-16 11:11:32.375857625 +0200 +++ new/src/share/vm/interpreter/templateInterpreter.cpp 2012-04-16 11:11:32.199574725 +0200 @@ -370,6 +370,8 @@ method_entry(java_lang_math_sqrt ) method_entry(java_lang_math_log ) method_entry(java_lang_math_log10) + method_entry(java_lang_math_exp ) + method_entry(java_lang_math_pow ) method_entry(java_lang_ref_reference_get) // all native method kinds (must be one contiguous block) --- old/src/share/vm/opto/library_call.cpp 2012-04-16 11:11:33.752314592 +0200 +++ new/src/share/vm/opto/library_call.cpp 2012-04-16 11:11:33.518296239 +0200 @@ -1526,9 +1526,6 @@ // every again. NaN results requires StrictMath.exp handling. if (too_many_traps(Deoptimization::Reason_intrinsic)) return false; - // Do not intrinsify on older platforms which lack cmove. - if (ConditionalMoveLimit == 0) return false; - _sp += arg_size(); // restore stack pointer Node *x = pop_math_arg(); Node *result = _gvn.transform(new (C, 2) ExpDNode(0,x)); @@ -1771,15 +1768,11 @@ case vmIntrinsics::_dsqrt: return Matcher::has_match_rule(Op_SqrtD) ? inline_sqrt(id) : false; case vmIntrinsics::_dabs: return Matcher::has_match_rule(Op_AbsD) ? inline_abs(id) : false; - // These intrinsics don't work on X86. The ad implementation doesn't - // handle NaN's properly. Instead of returning infinity, the ad - // implementation returns a NaN on overflow. See bug: 6304089 - // Once the ad implementations are fixed, change the code below - // to match the intrinsics above - case vmIntrinsics::_dexp: return + Matcher::has_match_rule(Op_ExpD) ? inline_exp(id) : runtime_math(OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dexp), "EXP"); case vmIntrinsics::_dpow: return + Matcher::has_match_rule(Op_PowD) ? inline_pow(id) : runtime_math(OptoRuntime::Math_DD_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dpow), "POW"); // These intrinsics are not yet correctly implemented --- old/src/share/vm/opto/subnode.cpp 2012-04-16 11:11:35.342830622 +0200 +++ new/src/share/vm/opto/subnode.cpp 2012-04-16 11:11:35.162547941 +0200 @@ -1314,7 +1314,5 @@ if( t2->base() != Type::DoubleCon ) return Type::DOUBLE; double d1 = t1->getd(); double d2 = t2->getd(); - if( d1 < 0.0 ) return Type::DOUBLE; - if( d2 < 0.0 ) return Type::DOUBLE; return TypeD::make( StubRoutines::intrinsic_pow( d1, d2 ) ); }