src/cpu/x86/vm/assembler_x86.cpp

rev 3227 : 7133857: exp() and pow() should use the x87 ISA on x86
Summary: use x87 instructions to implement exp() and pow() in interpreter/c1/c2.
Reviewed-by:

*** 3574,3583 ****
--- 3574,3598 ----
  void Assembler::fyl2x() {
    emit_byte(0xD9);
    emit_byte(0xF1);
  }
  
+ void Assembler::frndint() {
+   emit_byte(0xD9);
+   emit_byte(0xFC);
+ }
+ 
+ void Assembler::f2xm1() {
+   emit_byte(0xD9);
+   emit_byte(0xF0);
+ }
+ 
+ void Assembler::fldl2e() {
+   emit_byte(0xD9);
+   emit_byte(0xEA);
+ }
+ 
  // SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
  static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
  // SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
  static int simd_opc[4] = { 0, 0, 0x38, 0x3A };
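Note: the new encodings match the D9-escape x87 group in the Intel SDM: FRNDINT is D9 FC, F2XM1 is D9 F0, and FLDL2E is D9 EA, alongside the existing FYL2X (D9 F1). As a quick cross-check of the semantics the new emitters rely on, here is a minimal standalone sketch using GCC/Clang extended inline asm. It assumes an x86 host and a GCC-style compiler, and is not part of the webrev:

#include <cstdio>

static double frndint(double x) {
  __asm__("frndint" : "+t"(x));        // round ST0 using the current RC bits
  return x;
}

static double f2xm1(double x) {        // domain: -1 <= x <= 1
  __asm__("f2xm1" : "+t"(x));          // ST0 = 2^ST0 - 1
  return x;
}

static double fldl2e_value() {
  double r;
  __asm__("fldl2e" : "=t"(r));         // push log2(e) onto the x87 stack
  return r;
}

int main() {
  std::printf("frndint(2.7) = %f\n", frndint(2.7));    // 3 under round-to-nearest
  std::printf("f2xm1(0.5)   = %f\n", f2xm1(0.5));      // 2^0.5 - 1 = 0.414214
  std::printf("fldl2e       = %f\n", fldl2e_value());  // 1.442695
}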
*** 6864,6873 ****
--- 6879,7059 ----
  void MacroAssembler::fldcw(AddressLiteral src) {
    Assembler::fldcw(as_Address(src));
  }
  
+ void MacroAssembler::pow_exp_core_encoding() {
+   // kills rax, rcx, rdx
+   subptr(rsp,sizeof(jdouble));
+   // computes 2^X. Stack: X ...
+   // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
+   // keep it on the thread's stack to compute 2^int(X) later,
+   // then compute 2^(X-int(X)) as (2^(X-int(X))-1)+1.
+   // The final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
+   fld_s(0);                // Stack: X X ...
+   frndint();               // Stack: int(X) X ...
+   fsuba(1);                // Stack: int(X) X-int(X) ...
+   fistp_s(Address(rsp,0)); // move int(X) as integer to thread's stack. Stack: X-int(X) ...
+   f2xm1();                 // Stack: 2^(X-int(X))-1 ...
+   fld1();                  // Stack: 1 2^(X-int(X))-1 ...
+   faddp(1);                // Stack: 2^(X-int(X))
+   // computes 2^(int(X)): add exponent bias (1023) to int(X), then
+   // shift int(X)+1023 to the exponent position.
+   // The exponent is limited to 11 bits: if int(X)+1023 does not fit
+   // in 11 bits, set the result to NaN. 0x000 and 0x7FF are reserved
+   // exponent values, so detect them and set the result to NaN as well.
+   movl(rax,Address(rsp,0));
+   movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
+   addl(rax, 1023);
+   movl(rdx,rax);
+   shll(rax,20);
+   // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
+   addl(rdx,1);
+   // Check that 1 < int(X)+1023+1 < 2048
+   // in 3 steps:
+   // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
+   // 2- (int(X)+1023+1)&-2048 != 0
+   // 3- (int(X)+1023+1)&-2048 != 1
+   // Do 2- first because addl just updated the flags.
+   cmov32(Assembler::equal,rax,rcx);
+   cmpl(rdx,1);
+   cmov32(Assembler::equal,rax,rcx);
+   testl(rdx,rcx);
+   cmov32(Assembler::notEqual,rax,rcx);
+   movl(Address(rsp,4),rax);
+   movl(Address(rsp,0),0);
+   fmul_d(Address(rsp,0));  // Stack: 2^X ...
+   addptr(rsp,sizeof(jdouble));
+ }
+ 
+ void MacroAssembler::fast_pow() {
+   // computes X^Y = 2^(Y * log2(X))
+   // if fast computation is not possible, result is NaN. Requires
+   // fallback from user of this macro.
+   fyl2x();                 // Stack: (Y*log2(X)) ...
+   pow_exp_core_encoding(); // Stack: X^Y ...
+ }
+ 
+ void MacroAssembler::fast_exp() {
+   // computes exp(X) = 2^(X * log2(e))
+   // if fast computation is not possible, result is NaN. Requires
+   // fallback from user of this macro.
+   fldl2e();                // Stack: log2(e) X ...
+   fmulp(1);                // Stack: (X*log2(e)) ...
+   pow_exp_core_encoding(); // Stack: exp(X) ...
+ }
+ 
+ void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
+   // kills rax, rcx, rdx
+   // pow and exp need 2 extra registers on the fpu stack.
+   Label slow_case, done;
+   Register tmp = noreg;
+   if (!VM_Version::supports_cmov()) {
+     // fcmp needs a temporary so preserve rdx
+     tmp = rdx;
+   }
+   Register tmp2 = rdx;
+ 
+   if (is_exp) {
+     // Stack: X
+     fld_s(0);                   // duplicate argument for runtime call. Stack: X X
+     fast_exp();                 // Stack: exp(X) X
+     fcmp(tmp, 0, false, false); // Stack: exp(X) X
+     // exp(X) not equal to itself: exp(X) is NaN, go to slow case.
+     jcc(Assembler::parity, slow_case);
+     // get rid of duplicate argument. Stack: exp(X)
+     if (num_fpu_regs_in_use > 0) {
+       fxch();
+       fpop();
+     } else {
+       ffree(1);
+     }
+     jmp(done);
+   } else {
+     // Stack: X Y
+     Label x_negative, y_odd;
+ 
+     fldz();                     // Stack: 0 X Y
+     fcmp(tmp, 1, true, false);  // Stack: X Y
+     jcc(Assembler::above, x_negative);
+ 
+     // X >= 0
+ 
+     fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
+     fld_s(1);                   // Stack: X Y X Y
+     fast_pow();                 // Stack: X^Y X Y
+     fcmp(tmp, 0, false, false); // Stack: X^Y X Y
+     // X^Y not equal to itself: X^Y is NaN, go to slow case.
+     jcc(Assembler::parity, slow_case);
+     // get rid of duplicate arguments. Stack: X^Y
+     if (num_fpu_regs_in_use > 0) {
+       fxch(); fpop();
+       fxch(); fpop();
+     } else {
+       ffree(2);
+       ffree(1);
+     }
+     jmp(done);
+ 
+     // X <= 0
+     bind(x_negative);
+ 
+     fld_s(1);                   // Stack: Y X Y
+     frndint();                  // Stack: int(Y) X Y
+     fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
+     jcc(Assembler::notEqual, slow_case);
+ 
+ #ifdef _LP64
+     subptr(rsp, 8);
+ #else
+     subptr(rsp, 4);
+ #endif
+     fistp_s(Address(rsp,0));    // Stack: X Y
+ 
+     fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
+     fld_s(1);                   // Stack: X Y X Y
+     fabs();                     // Stack: abs(X) Y X Y
+     fast_pow();                 // Stack: abs(X)^Y X Y
+     fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
+     // abs(X)^Y not equal to itself: abs(X)^Y is NaN, go to slow case.
+ 
+     pop(tmp2);
+     jcc(Assembler::parity, slow_case);
+ 
+     // test Y for integer indefinite value (int overflow)
+     cmpl(tmp2, 0x80000000);
+     jcc(Assembler::equal, slow_case);
+ 
+     // get rid of duplicate arguments. Stack: abs(X)^Y
+     if (num_fpu_regs_in_use > 0) {
+       fxch(); fpop();
+       fxch(); fpop();
+     } else {
+       ffree(2);
+       ffree(1);
+     }
+ 
+     testl(tmp2, 1);
+     jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
+     // X <= 0, Y odd: X^Y = -abs(X)^Y
+ 
+     fchs();                     // Stack: -abs(X)^Y
+     jmp(done);
+   }
+ 
+   // slow case: runtime call
+   bind(slow_case);
+ 
+   fpop();                       // pop incorrect result or int(Y)
+ 
+   fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
+                       is_exp ? 1 : 2, num_fpu_regs_in_use);
+ 
+   // Come here with result in F-TOS
+   bind(done);
+ }
+ 
  void MacroAssembler::fpop() {
    ffree();
    fincstp();
  }
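For reference, here is the arithmetic the stub performs, re-expressed as portable, standalone C++. This is only a sketch under stated assumptions (IEEE-754 doubles, round-to-nearest in effect); the helper names are mine, and the NaN bit pattern differs from the stub's, which reuses the -2048 mask (0xFFFFF800 in the high word):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// 2^k via the exponent field: write k+1023 into bits 62..52 (the stub's
// shll(rax,20) does this on the high 32-bit word), NaN when out of range.
static double two_to_int(int32_t k) {
  int32_t biased = k + 1023;               // addl(rax, 1023)
  uint64_t bits;
  if (biased <= 0 || biased >= 0x7FF) {    // reserved or overflowed exponent
    bits = 0x7FF8000000000000ULL;          // quiet NaN
  } else {
    bits = (uint64_t)biased << 52;         // exponent field, zero mantissa
  }
  double d;
  std::memcpy(&d, &bits, sizeof d);
  return d;
}

// 2^X = 2^int(X) * 2^(X-int(X)), mirroring frndint/f2xm1/fld1/faddp/fmul_d.
static double two_to(double x) {
  double i = std::nearbyint(x);            // frndint (current rounding mode)
  double f = x - i;                        // |f| <= 0.5, inside f2xm1's domain
  return two_to_int((int32_t)i) * ((std::exp2(f) - 1.0) + 1.0);
}

int main() {
  std::printf("two_to(10.5) = %g (exp2: %g)\n", two_to(10.5), std::exp2(10.5));
  // fast_pow's identity: X^Y = 2^(Y*log2(X)), fyl2x then the core encoding
  std::printf("3^2.5        = %g (pow:  %g)\n",
              two_to(2.5 * std::log2(3.0)), std::pow(3.0, 2.5));
  // fast_exp's identity: exp(X) = 2^(X*log2(e)), fldl2e/fmulp first
  std::printf("exp(1)       = %g (exp:  %g)\n",
              two_to(std::log2(std::exp(1.0))), std::exp(1.0));
  std::printf("two_to(2000) = %g (NaN fallback)\n", two_to(2000.0));
}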
*** 8041,8050 ****
--- 8227,8374 ----
    }
    adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
  #endif
  }
  
+ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
+   pusha();
+ 
+   // if we are coming from c1, xmm registers may be live
+   if (UseSSE >= 1) {
+     subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+   }
+   int off = 0;
+   if (UseSSE == 1) {
+     movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
+     movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
+     movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
+     movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
+     movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
+     movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
+     movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
+     movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
+   } else if (UseSSE >= 2) {
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
+ #ifdef _LP64
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
+     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
+ #endif
+   }
+ 
+   // Preserve registers across runtime call
+   int incoming_argument_and_return_value_offset = -1;
+   if (num_fpu_regs_in_use > 1) {
+     // Must preserve all other FPU regs (could alternatively convert
+     // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
+     // FPU state, but can not trust C compiler)
+     NEEDS_CLEANUP;
+     // NOTE that in this case we also push the incoming argument(s) to
+     // the stack and restore them later; we also use this stack area to
+     // hold the return value from dsin, dcos etc.
+     for (int i = 0; i < num_fpu_regs_in_use; i++) {
+       subptr(rsp, sizeof(jdouble));
+       fstp_d(Address(rsp, 0));
+     }
+     incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
+     for (int i = nb_args-1; i >= 0; i--) {
+       fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
+     }
+   }
+ 
+   subptr(rsp, nb_args*sizeof(jdouble));
+   for (int i = 0; i < nb_args; i++) {
+     fstp_d(Address(rsp, i*sizeof(jdouble)));
+   }
+ 
+ #ifdef _LP64
+   if (nb_args > 0) {
+     movdbl(xmm0, Address(rsp, 0));
+   }
+   if (nb_args > 1) {
+     movdbl(xmm1, Address(rsp, sizeof(jdouble)));
+   }
+   assert(nb_args <= 2, "unsupported number of args");
+ #endif // _LP64
+ 
+   // NOTE: we must not use call_VM_leaf here because that requires a
+   // complete interpreter frame in debug mode -- same bug as 4387334
+   // MacroAssembler::call_VM_leaf_base is perfectly safe and will
+   // do proper 64bit abi
+ 
+   NEEDS_CLEANUP;
+   // Need to add stack banging before this runtime call if it needs to
+   // be taken; however, there is no generic stack banging routine at
+   // the MacroAssembler level
+ 
+   MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
+ 
+ #ifdef _LP64
+   movsd(Address(rsp, 0), xmm0);
+   fld_d(Address(rsp, 0));
+ #endif // _LP64
+   addptr(rsp, sizeof(jdouble) * nb_args);
+   if (num_fpu_regs_in_use > 1) {
+     // Must save return value to stack and then restore entire FPU
+     // stack except incoming arguments
+     fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
+     for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
+       fld_d(Address(rsp, 0));
+       addptr(rsp, sizeof(jdouble));
+     }
+     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
+     addptr(rsp, sizeof(jdouble) * nb_args);
+   }
+ 
+   off = 0;
+   if (UseSSE == 1) {
+     movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
+     movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
+     movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
+     movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
+     movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
+     movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
+     movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
+     movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
+   } else if (UseSSE >= 2) {
+     movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
+ #ifdef _LP64
+     movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
+     movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
+ #endif
+   }
+   if (UseSSE >= 1) {
+     addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+   }
+   popa();
+ }
+ 
  static const double pi_4 = 0.7853981633974483;
  
  void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
    // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
    // was attempted in this code; unfortunately it appears that the
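The offset arithmetic above is easy to lose track of. The following standalone sketch (hypothetical names, not part of the patch) replays the spill/reload discipline on an emulated x87 register stack for nb_args = 2 and num_fpu_regs_in_use = 3, checking that the arguments come back in the right order, that the extra live register survives the call, and that the result ends up on top:

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int num_fpu_regs_in_use = 3, nb_args = 2;
  // Emulated x87 stack, top (ST0) at the back: ST0 = X, ST1 = Y, ST2 live.
  std::vector<double> st = { 42.0 /* ST2 */, 3.0 /* Y */, 2.0 /* X */ };

  // Spill loop: each iteration pushes a slot and fstp's ST0 into it, so the
  // spill area ends up [ST2, Y, X] from low to high address.
  std::vector<double> spill(num_fpu_regs_in_use);
  for (int i = 0; i < num_fpu_regs_in_use; i++) {
    spill[num_fpu_regs_in_use - 1 - i] = st.back();
    st.pop_back();
  }
  int iarvo = num_fpu_regs_in_use - 1;  // slot index of the incoming ST0 (X)

  // Reload the arguments last-first, so ST0 = X and ST1 = Y again.
  for (int i = nb_args - 1; i >= 0; i--) st.push_back(spill[iarvo - i]);
  assert(st.back() == 2.0 && st[st.size() - 2] == 3.0);

  // The runtime call consumes both arguments and returns X^Y.
  double x = st.back(); st.pop_back();
  double y = st.back(); st.pop_back();
  double result = std::pow(x, y);       // stands in for SharedRuntime::dpow

  // The result overwrites X's slot; the non-argument registers are reloaded
  // in order, then the result is pushed on top.
  spill[iarvo] = result;
  for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) st.push_back(spill[i]);
  st.push_back(spill[iarvo]);

  assert(st.size() == 2 && st[0] == 42.0 && st[1] == 8.0);
  std::printf("ST0 = %g (X^Y), ST1 = %g (preserved)\n", st[1], st[0]);
}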
*** 8088,8164 ****
      jmp(done);
    }
  
    // slow case: runtime call
    bind(slow_case);
  
-   // Preserve registers across runtime call
-   pusha();
-   int incoming_argument_and_return_value_offset = -1;
-   if (num_fpu_regs_in_use > 1) {
-     // Must preserve all other FPU regs (could alternatively convert
-     // SharedRuntime::dsin and dcos into assembly routines known not to trash
-     // FPU state, but can not trust C compiler)
-     NEEDS_CLEANUP;
-     // NOTE that in this case we also push the incoming argument to
-     // the stack and restore it later; we also use this stack slot to
-     // hold the return value from dsin or dcos.
-     for (int i = 0; i < num_fpu_regs_in_use; i++) {
-       subptr(rsp, sizeof(jdouble));
-       fstp_d(Address(rsp, 0));
-     }
-     incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
-     fld_d(Address(rsp, incoming_argument_and_return_value_offset));
-   }
-   subptr(rsp, sizeof(jdouble));
-   fstp_d(Address(rsp, 0));
- #ifdef _LP64
-   movdbl(xmm0, Address(rsp, 0));
- #endif // _LP64
-   // NOTE: we must not use call_VM_leaf here because that requires a
-   // complete interpreter frame in debug mode -- same bug as 4387334
-   // MacroAssembler::call_VM_leaf_base is perfectly safe and will
-   // do proper 64bit abi
- 
-   NEEDS_CLEANUP;
-   // Need to add stack banging before this runtime call if it needs to
-   // be taken; however, there is no generic stack banging routine at
-   // the MacroAssembler level
  
    switch(trig) {
    case 's':
      {
!       MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0);
      }
      break;
    case 'c':
      {
!       MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0);
      }
      break;
    case 't':
      {
!       MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0);
      }
      break;
    default:
      assert(false, "bad intrinsic");
      break;
    }
- #ifdef _LP64
-   movsd(Address(rsp, 0), xmm0);
-   fld_d(Address(rsp, 0));
- #endif // _LP64
-   addptr(rsp, sizeof(jdouble));
-   if (num_fpu_regs_in_use > 1) {
-     // Must save return value to stack and then restore entire FPU stack
-     fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
-     for (int i = 0; i < num_fpu_regs_in_use; i++) {
-       fld_d(Address(rsp, 0));
-       addptr(rsp, sizeof(jdouble));
-     }
-   }
-   popa();
  
    // Come here with result in F-TOS
    bind(done);
  
    if (tmp != noreg) {
--- 8412,8442 ----
      jmp(done);
    }
  
    // slow case: runtime call
    bind(slow_case);
  
    switch(trig) {
    case 's':
      {
!       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
      }
      break;
    case 'c':
      {
!       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
      }
      break;
    case 't':
      {
!       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
      }
      break;
    default:
      assert(false, "bad intrinsic");
      break;
    }
  
    // Come here with result in F-TOS
    bind(done);
  
    if (tmp != noreg) {
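The shape of this hunk is a pure refactor: the inline save/call/restore boilerplate that trigfunc carried for each runtime entry collapses into one call of the new helper. Reduced to portable C++ (hypothetical names, purely illustrative), the structure is:

#include <cmath>
#include <cstdio>

typedef double (*runtime_entry_t)(double);

// stands in for fp_runtime_fallback(entry, 1, num_fpu_regs_in_use):
// spill live FPU/XMM state, call the C entry, reload the result into ST0.
static double fp_fallback(runtime_entry_t entry, double st0) {
  return entry(st0);
}

static double trig_slow_case(char trig, double st0) {
  switch (trig) {
  case 's': return fp_fallback(std::sin, st0);
  case 'c': return fp_fallback(std::cos, st0);
  case 't': return fp_fallback(std::tan, st0);
  default:  return st0;  // bad intrinsic
  }
}

int main() {
  std::printf("sin(1.0) = %f\n", trig_slow_case('s', 1.0));
}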