src/cpu/x86/vm/assembler_x86.cpp
Print this page
rev 3227 : 7133857: exp() and pow() should use the x87 ISA on x86
Summary: use x87 instructions to implement exp() and pow() in interpreter/c1/c2.
Reviewed-by:
*** 3574,3583 ****
--- 3574,3598 ----
void Assembler::fyl2x() {
emit_byte(0xD9);
emit_byte(0xF1);
}
+ void Assembler::frndint() {
+ emit_byte(0xD9);
+ emit_byte(0xFC);
+ }
+
+ void Assembler::f2xm1() {
+ emit_byte(0xD9);
+ emit_byte(0xF0);
+ }
+
+ void Assembler::fldl2e() {
+ emit_byte(0xD9);
+ emit_byte(0xEA);
+ }
+
// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
static int simd_opc[4] = { 0, 0, 0x38, 0x3A };
*** 6864,6873 ****
--- 6879,7059 ----
void MacroAssembler::fldcw(AddressLiteral src) {
Assembler::fldcw(as_Address(src));
}
+ void MacroAssembler::pow_exp_core_encoding() {
+ // kills rax, rcx, rdx
+ subptr(rsp,sizeof(jdouble));
+ // computes 2^X. Stack: X ...
+ // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
+ // keep it on the thread's stack to compute 2^int(X) later
+ // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
+ // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
+ fld_s(0); // Stack: X X ...
+ frndint(); // Stack: int(X) X ...
+ fsuba(1); // Stack: int(X) X-int(X) ...
+ fistp_s(Address(rsp,0)); // move int(X) as integer to thread's stack. Stack: X-int(X) ...
+ f2xm1(); // Stack: 2^(X-int(X))-1 ...
+ fld1(); // Stack: 1 2^(X-int(X))-1 ...
+ faddp(1); // Stack: 2^(X-int(X))
+ // computes 2^(int(X)): add exponent bias (1023) to int(X), then
+ // shift int(X)+1023 to exponent position.
+ // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
+ // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
+ // values so detect them and set result to NaN.
+ movl(rax,Address(rsp,0));
+ movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
+ addl(rax, 1023);
+ movl(rdx,rax);
+ shll(rax,20);
+ // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
+ addl(rdx,1);
+ // Check that 1 < int(X)+1023+1 < 2048
+ // in 3 steps:
+ // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
+ // 2- (int(X)+1023+1)&-2048 != 0
+ // 3- (int(X)+1023+1)&-2048 != 1
+ // Do 2- first because addl just updated the flags.
+ cmov32(Assembler::equal,rax,rcx);
+ cmpl(rdx,1);
+ cmov32(Assembler::equal,rax,rcx);
+ testl(rdx,rcx);
+ cmov32(Assembler::notEqual,rax,rcx);
+ movl(Address(rsp,4),rax);
+ movl(Address(rsp,0),0);
+ fmul_d(Address(rsp,0)); // Stack: 2^X ...
+ addptr(rsp,sizeof(jdouble));
+ }
+
+ void MacroAssembler::fast_pow() {
+ // computes X^Y = 2^(Y * log2(X))
+ // if fast computation is not possible, result is NaN. Requires
+ // fallback from user of this macro.
+ fyl2x(); // Stack: (Y*log2(X)) ...
+ pow_exp_core_encoding(); // Stack: exp(X) ...
+ }
+
+ void MacroAssembler::fast_exp() {
+ // computes exp(X) = 2^(X * log2(e))
+ // if fast computation is not possible, result is NaN. Requires
+ // fallback from user of this macro.
+ fldl2e(); // Stack: log2(e) X ...
+ fmulp(1); // Stack: (X*log2(e)) ...
+ pow_exp_core_encoding(); // Stack: exp(X) ...
+ }
+
+ void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
+ // kills rax, rcx, rdx
+ // pow and exp needs 2 extra registers on the fpu stack.
+ Label slow_case, done;
+ Register tmp = noreg;
+ if (!VM_Version::supports_cmov()) {
+ // fcmp needs a temporary so preserve rdx,
+ tmp = rdx;
+ }
+ Register tmp2 = rdx;
+
+ if (is_exp) {
+ // Stack: X
+ fld_s(0); // duplicate argument for runtime call. Stack: X X
+ fast_exp(); // Stack: exp(X) X
+ fcmp(tmp, 0, false, false); // Stack: exp(X) X
+ // exp(X) not equal to itself: exp(X) is NaN go to slow case.
+ jcc(Assembler::parity, slow_case);
+ // get rid of duplicate argument. Stack: exp(X)
+ if (num_fpu_regs_in_use > 0) {
+ fxch();
+ fpop();
+ } else {
+ ffree(1);
+ }
+ jmp(done);
+ } else {
+ // Stack: X Y
+ Label x_negative, y_odd;
+
+ fldz(); // Stack: 0 X Y
+ fcmp(tmp, 1, true, false); // Stack: X Y
+ jcc(Assembler::above, x_negative);
+
+ // X >= 0
+
+ fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y
+ fld_s(1); // Stack: X Y X Y
+ fast_pow(); // Stack: X^Y X Y
+ fcmp(tmp, 0, false, false); // Stack: X^Y X Y
+ // X^Y not equal to itself: X^Y is NaN go to slow case.
+ jcc(Assembler::parity, slow_case);
+ // get rid of duplicate arguments. Stack: X^Y
+ if (num_fpu_regs_in_use > 0) {
+ fxch(); fpop();
+ fxch(); fpop();
+ } else {
+ ffree(2);
+ ffree(1);
+ }
+ jmp(done);
+
+ // X <= 0
+ bind(x_negative);
+
+ fld_s(1); // Stack: Y X Y
+ frndint(); // Stack: int(Y) X Y
+ fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
+ jcc(Assembler::notEqual, slow_case);
+
+ #ifdef _LP64
+ subptr(rsp, 8);
+ #else
+ subptr(rsp, 4);
+ #endif
+ fistp_s(Address(rsp,0)); // Stack: X Y
+
+ fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y
+ fld_s(1); // Stack: X Y X Y
+ fabs(); // Stack: abs(X) Y X Y
+ fast_pow(); // Stack: abs(X)^Y X Y
+ fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
+ // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.
+
+ pop(tmp2);
+ jcc(Assembler::parity, slow_case);
+
+ // test Y for integer indefinite value (int overflow)
+ cmpl(tmp2, 0x80000000);
+ jcc(Assembler::equal, slow_case);
+
+ // get rid of duplicate arguments. Stack: X^Y
+ if (num_fpu_regs_in_use > 0) {
+ fxch(); fpop();
+ fxch(); fpop();
+ } else {
+ ffree(2);
+ ffree(1);
+ }
+
+ testl(tmp2, 1);
+ jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
+ // X <= 0, Y even: X^Y = -abs(X)^Y
+
+ fchs(); // Stack: -abs(X)^Y Y
+ jmp(done);
+ }
+
+ // slow case: runtime call
+ bind(slow_case);
+
+ fpop(); // pop incorrect result or int(Y)
+
+ fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
+ is_exp ? 1 : 2, num_fpu_regs_in_use);
+
+ // Come here with result in F-TOS
+ bind(done);
+ }
+
void MacroAssembler::fpop() {
ffree();
fincstp();
}
*** 8041,8050 ****
--- 8227,8374 ----
}
adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
#endif
}
+ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
+ pusha();
+
+ // if we are coming from c1, xmm registers may be live
+ if (UseSSE >= 1) {
+ subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+ }
+ int off = 0;
+ if (UseSSE == 1) {
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
+ } else if (UseSSE >= 2) {
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
+ #ifdef _LP64
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
+ #endif
+ }
+
+ // Preserve registers across runtime call
+ int incoming_argument_and_return_value_offset = -1;
+ if (num_fpu_regs_in_use > 1) {
+ // Must preserve all other FPU regs (could alternatively convert
+ // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
+ // FPU state, but can not trust C compiler)
+ NEEDS_CLEANUP;
+ // NOTE that in this case we also push the incoming argument(s) to
+ // the stack and restore it later; we also use this stack slot to
+ // hold the return value from dsin, dcos etc.
+ for (int i = 0; i < num_fpu_regs_in_use; i++) {
+ subptr(rsp, sizeof(jdouble));
+ fstp_d(Address(rsp, 0));
+ }
+ incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
+ for (int i = nb_args-1; i >= 0; i--) {
+ fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
+ }
+ }
+
+ subptr(rsp, nb_args*sizeof(jdouble));
+ for (int i = 0; i < nb_args; i++) {
+ fstp_d(Address(rsp, i*sizeof(jdouble)));
+ }
+
+ #ifdef _LP64
+ if (nb_args > 0) {
+ movdbl(xmm0, Address(rsp, 0));
+ }
+ if (nb_args > 1) {
+ movdbl(xmm1, Address(rsp, sizeof(jdouble)));
+ }
+ assert(nb_args <= 2, "unsupported number of args");
+ #endif // _LP64
+
+ // NOTE: we must not use call_VM_leaf here because that requires a
+ // complete interpreter frame in debug mode -- same bug as 4387334
+ // MacroAssembler::call_VM_leaf_base is perfectly safe and will
+ // do proper 64bit abi
+
+ NEEDS_CLEANUP;
+ // Need to add stack banging before this runtime call if it needs to
+ // be taken; however, there is no generic stack banging routine at
+ // the MacroAssembler level
+
+ MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
+
+ #ifdef _LP64
+ movsd(Address(rsp, 0), xmm0);
+ fld_d(Address(rsp, 0));
+ #endif // _LP64
+ addptr(rsp, sizeof(jdouble) * nb_args);
+ if (num_fpu_regs_in_use > 1) {
+ // Must save return value to stack and then restore entire FPU
+ // stack except incoming arguments
+ fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
+ for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
+ fld_d(Address(rsp, 0));
+ addptr(rsp, sizeof(jdouble));
+ }
+ fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
+ addptr(rsp, sizeof(jdouble) * nb_args);
+ }
+
+ off = 0;
+ if (UseSSE == 1) {
+ movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
+ } else if (UseSSE >= 2) {
+ movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
+ #ifdef _LP64
+ movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
+ #endif
+ }
+ if (UseSSE >= 1) {
+ addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+ }
+ popa();
+ }
+
static const double pi_4 = 0.7853981633974483;
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
// A hand-coded argument reduction for values in fabs(pi/4, pi/2)
// was attempted in this code; unfortunately it appears that the
*** 8088,8164 ****
jmp(done);
}
// slow case: runtime call
bind(slow_case);
- // Preserve registers across runtime call
- pusha();
- int incoming_argument_and_return_value_offset = -1;
- if (num_fpu_regs_in_use > 1) {
- // Must preserve all other FPU regs (could alternatively convert
- // SharedRuntime::dsin and dcos into assembly routines known not to trash
- // FPU state, but can not trust C compiler)
- NEEDS_CLEANUP;
- // NOTE that in this case we also push the incoming argument to
- // the stack and restore it later; we also use this stack slot to
- // hold the return value from dsin or dcos.
- for (int i = 0; i < num_fpu_regs_in_use; i++) {
- subptr(rsp, sizeof(jdouble));
- fstp_d(Address(rsp, 0));
- }
- incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
- fld_d(Address(rsp, incoming_argument_and_return_value_offset));
- }
- subptr(rsp, sizeof(jdouble));
- fstp_d(Address(rsp, 0));
- #ifdef _LP64
- movdbl(xmm0, Address(rsp, 0));
- #endif // _LP64
- // NOTE: we must not use call_VM_leaf here because that requires a
- // complete interpreter frame in debug mode -- same bug as 4387334
- // MacroAssembler::call_VM_leaf_base is perfectly safe and will
- // do proper 64bit abi
-
- NEEDS_CLEANUP;
- // Need to add stack banging before this runtime call if it needs to
- // be taken; however, there is no generic stack banging routine at
- // the MacroAssembler level
switch(trig) {
case 's':
{
! MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0);
}
break;
case 'c':
{
! MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0);
}
break;
case 't':
{
! MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0);
}
break;
default:
assert(false, "bad intrinsic");
break;
}
- #ifdef _LP64
- movsd(Address(rsp, 0), xmm0);
- fld_d(Address(rsp, 0));
- #endif // _LP64
- addptr(rsp, sizeof(jdouble));
- if (num_fpu_regs_in_use > 1) {
- // Must save return value to stack and then restore entire FPU stack
- fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
- for (int i = 0; i < num_fpu_regs_in_use; i++) {
- fld_d(Address(rsp, 0));
- addptr(rsp, sizeof(jdouble));
- }
- }
- popa();
// Come here with result in F-TOS
bind(done);
if (tmp != noreg) {
--- 8412,8442 ----
jmp(done);
}
// slow case: runtime call
bind(slow_case);
switch(trig) {
case 's':
{
! fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
}
break;
case 'c':
{
! fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
}
break;
case 't':
{
! fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
}
break;
default:
assert(false, "bad intrinsic");
break;
}
// Come here with result in F-TOS
bind(done);
if (tmp != noreg) {