hotspot Sdiff src/cpu/x86/vm

src/cpu/x86/vm/x86_32.ad

rev 3227 : 7133857: exp() and pow() should use the x87 ISA on x86
Summary: use x87 instructions to implement exp() and pow() in interpreter/c1/c2.
Reviewed-by:

2519     __ movdbl(Address(rsp, 0), $src$$XMMRegister);
2520     __ fld_d(Address(rsp, 0));
2521   %}
2522 
2523   enc_class push_stack_temp_qword() %{  // Reserve an 8-byte scratch slot at the top of the stack
2524     MacroAssembler _masm(&cbuf);
2525     __ subptr(rsp, 8);  // rsp -= 8; undone by pop_stack_temp_qword
2526   %}
2527 
2528   enc_class pop_stack_temp_qword() %{  // Release the 8-byte scratch slot taken by push_stack_temp_qword
2529     MacroAssembler _masm(&cbuf);
2530     __ addptr(rsp, 8);  // rsp += 8
2531   %}
2532 
2533   enc_class push_xmm_to_fpr1(regD src) %{  // Copy an XMM double onto the x87 stack via the scratch slot at [rsp]
2534     MacroAssembler _masm(&cbuf);
2535     __ movdbl(Address(rsp, 0), $src$$XMMRegister);  // spill src to [rsp+0]; the slot must already be reserved by the caller
2536     __ fld_d(Address(rsp, 0));  // reload the same 8 bytes as an x87 push
2537   %}
2538 
2539   // Shared core of the POW and EXP fast paths, built from x87 instructions.
2540   // Given Q in FPR1 it leaves 2^Q in FPR1; on exponent overflow the scale factor is a NaN.
2541   enc_class pow_exp_core_encoding %{  // Uses the 8 bytes at [ESP] as scratch and overwrites EAX, EBX and ECX
2542     // FPR1 holds Q (Y*log2(X) for pow, X*log2(e) for exp).  Compute FPR1 = 2^Q
2543     emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xC0);  // fdup = fld st(0)          Q       Q
2544     emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xFC);  // frndint               int(Q)      Q
2545     emit_opcode(cbuf,0xDC); emit_opcode(cbuf,0xE9);  // fsub st(1) -= st(0);  int(Q) frac(Q)
2546     emit_opcode(cbuf,0xDB);                          // FISTP [ESP]           frac(Q)
2547     emit_opcode(cbuf,0x1C);
2548     emit_d8(cbuf,0x24);
2549     emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xF0);  // f2xm1                 2^frac(Q)-1
2550     emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xE8);  // fld1                  1 2^frac(Q)-1
2551     emit_opcode(cbuf,0xDE); emit_opcode(cbuf,0xC1);  // faddp                 2^frac(Q)
2552     emit_opcode(cbuf,0x8B);                          // mov rax,[esp+0]=int(Q)
2553     encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 0, false);
2554     emit_opcode(cbuf,0xC7);                          // mov rcx,0xFFFFF800 - overflow mask (every bit above the low 11)
2555     emit_rm(cbuf, 0x3, 0x0, ECX_enc);
2556     emit_d32(cbuf,0xFFFFF800);
2557     emit_opcode(cbuf,0x81);                          // add rax,1023 - the double exponent bias
2558     emit_rm(cbuf, 0x3, 0x0, EAX_enc);
2559     emit_d32(cbuf,1023);
2560     emit_opcode(cbuf,0x8B);                          // mov rbx,eax - keep the unshifted biased exponent for the range test
2561     emit_rm(cbuf, 0x3, EBX_enc, EAX_enc);
2562     emit_opcode(cbuf,0xC1);                          // shl rax,20 - Slide to exponent position
2563     emit_rm(cbuf,0x3,0x4,EAX_enc);
2564     emit_d8(cbuf,20);
2565     emit_opcode(cbuf,0x85);                          // test rbx,ecx - check for overflow
2566     emit_rm(cbuf, 0x3, EBX_enc, ECX_enc);
2567     emit_opcode(cbuf,0x0F); emit_opcode(cbuf,0x45);  // CMOVne rax,ecx - overflow; stuff NAN into EAX
2568     emit_rm(cbuf, 0x3, EAX_enc, ECX_enc);
2569     emit_opcode(cbuf,0x89);                          // mov [esp+4],eax - Store as part of double word
2570     encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 4, false);
2571     emit_opcode(cbuf,0xC7);                          // mov [esp+0],0   - [ESP] = (double)(1<<int(Q)) = 2^int(Q)
2572     encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
2573     emit_d32(cbuf,0);
2574     emit_opcode(cbuf,0xDC);                          // fmul qword st(0),[esp+0]; FPR1 = 2^int(Q)*2^frac(Q) = 2^Q
2575     encode_RegMem(cbuf, 0x1, ESP_enc, 0x4, 0, 0, false);
2576   %}
2577 
2578   enc_class Push_Result_Mod_DPR( regDPR src) %{  // Emits nothing when src is already FPR1
2579     if ($src$$reg != FPR1L_enc) {
2580       // fincstp
2581       emit_opcode (cbuf, 0xD9);
2582       emit_opcode (cbuf, 0xF7);
2583       // FXCH FPR1 with src
2584       emit_opcode(cbuf, 0xD9);
2585       emit_d8(cbuf, 0xC8-1+$src$$reg );  // second FXCH byte; the -1 presumably compensates for the fincstp above - confirm
2586       // fdecstp
2587       emit_opcode (cbuf, 0xD9);
2588       emit_opcode (cbuf, 0xF6);
2589     }
2590     // // following asm replaced with Pop_Reg_F or Pop_Mem_F
2591     // // FSTP   FPR$dst$$reg
2592     // emit_opcode( cbuf, 0xDD );
2593     // emit_d8( cbuf, 0xD8+$dst$$reg );
2594   %}
2595 
2596   enc_class fnstsw_sahf_skip_parity() %{
2597     // fnstsw ax

10083   predicate (UseSSE>=2);
10084   match(Set dst(AtanD dst src));
10085   effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
10086   format %{ "DATA   $dst,$src" %}
10087   opcode(0xD9, 0xF3);
10088   ins_encode( Push_SrcD(src),
10089               OpcP, OpcS, Push_ResultD(dst) );
10090   ins_pipe( pipe_slow );
10091 %}
10092 
10093 instruct sqrtDPR_reg(regDPR dst, regDPR src) %{  // Matches SqrtD on regDPR operands when UseSSE<=1
10094   predicate (UseSSE<=1);
10095   match(Set dst (SqrtD src));
10096   format %{ "DSQRT  $dst,$src" %}
10097   opcode(0xFA, 0xD9);  // NOTE(review): emitted below as OpcS then OpcP, presumably bytes D9 FA (FSQRT) - confirm
10098   ins_encode( Push_Reg_DPR(src),
10099               OpcS, OpcP, Pop_Reg_DPR(dst) );
10100   ins_pipe( pipe_slow );
10101 %}
10102 
10103 instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx, eFlagsReg cr) %{  // x87 fast path for PowD when UseSSE<=1
10104   predicate (UseSSE<=1);
10105   match(Set Y (PowD X Y));  // Raise X to the Yth power
10106   effect(KILL rax, KILL rbx, KILL rcx, KILL cr);  // the SUB/ADD/SHL/TEST emitted below overwrite EFLAGS
10107   format %{ "SUB    ESP,8\t\t# Fast-path POW encoding\n\t"
10108             "FLD_D  $X\n\t"
10109             "FYL2X  \t\t\t# Q=Y*ln2(X)\n\t"
10110 
10111             "FDUP   \t\t\t# Q Q\n\t"
10112             "FRNDINT\t\t\t# int(Q) Q\n\t"
10113             "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
10114             "FISTP  dword [ESP]\n\t"
10115             "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
10116             "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
10117             "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
10118             "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
10119             "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
10120             "ADD    EAX,1023\t\t# Double exponent bias\n\t"
10121             "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
10122             "SHL    EAX,20\t\t# Shift exponent into place\n\t"
10123             "TEST   EBX,ECX\t\t# Check for overflow\n\t"
10124             "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
10125             "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
10126             "MOV    [ESP+0],0\n\t"
10127             "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
10128 
10129             "ADD    ESP,8"
10130              %}
10131   ins_encode( push_stack_temp_qword,          // scratch slot used by pow_exp_core_encoding
10132               Push_Reg_DPR(X),
10133               Opcode(0xD9), Opcode(0xF1),   // fyl2x
10134               pow_exp_core_encoding,          // FPR1 = 2^Q
10135               pop_stack_temp_qword);
10136   ins_pipe( pipe_slow );
10137 %}
10138 
10139 instruct powD_reg(regD dst, regD src0, regD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx, eFlagsReg cr ) %{  // PowD for XMM operands (UseSSE>=2), computed on the x87 stack
10140   predicate (UseSSE>=2);
10141   match(Set dst (PowD src0 src1));  // Raise src0 to the src1'th power
10142   effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx, KILL cr );  // the SUB/ADD/SHL/TEST emitted below overwrite EFLAGS
10143   format %{ "SUB    ESP,8\t\t# Fast-path POW encoding\n\t"
10144             "MOVSD  [ESP],$src1\n\t"
10145             "FLD    FPR1,$src1\n\t"
10146             "MOVSD  [ESP],$src0\n\t"
10147             "FLD    FPR1,$src0\n\t"
10148             "FYL2X  \t\t\t# Q=Y*ln2(X)\n\t"
10149 
10150             "FDUP   \t\t\t# Q Q\n\t"
10151             "FRNDINT\t\t\t# int(Q) Q\n\t"
10152             "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
10153             "FISTP  dword [ESP]\n\t"
10154             "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
10155             "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
10156             "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
10157             "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
10158             "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
10159             "ADD    EAX,1023\t\t# Double exponent bias\n\t"
10160             "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
10161             "SHL    EAX,20\t\t# Shift exponent into place\n\t"
10162             "TEST   EBX,ECX\t\t# Check for overflow\n\t"
10163             "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
10164             "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
10165             "MOV    [ESP+0],0\n\t"
10166             "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
10167 
10168             "FST_D  [ESP]\n\t"
10169             "MOVSD  $dst,[ESP]\n\t"
10170             "ADD    ESP,8"
10171              %}
10172   ins_encode( push_stack_temp_qword,          // scratch slot shared by every step below
10173               push_xmm_to_fpr1(src1),         // exponent is pushed first
10174               push_xmm_to_fpr1(src0),         // base ends up on top of the x87 stack
10175               Opcode(0xD9), Opcode(0xF1),   // fyl2x
10176               pow_exp_core_encoding,          // FPR1 = 2^Q
10177               Push_ResultD(dst) );            // per the format above: store to [ESP], load into dst, release the slot
10178   ins_pipe( pipe_slow );
10179 %}
10180 
10181 
10182 instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx, eFlagsReg cr) %{  // x87 fast path for ExpD when UseSSE<=1; dpr1 is updated in place
10183   predicate (UseSSE<=1);
10184   match(Set dpr1 (ExpD dpr1));
10185   effect(KILL rax, KILL rbx, KILL rcx, KILL cr);  // the SUB/ADD/SHL/TEST emitted below overwrite EFLAGS
10186   format %{ "SUB    ESP,8\t\t# Fast-path EXP encoding\n\t"
10187             "FLDL2E \t\t\t# Ld log2(e) X\n\t"
10188             "FMULP  \t\t\t# Q=X*log2(e)\n\t"
10189 
10190             "FDUP   \t\t\t# Q Q\n\t"
10191             "FRNDINT\t\t\t# int(Q) Q\n\t"
10192             "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
10193             "FISTP  dword [ESP]\n\t"
10194             "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
10195             "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
10196             "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
10197             "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
10198             "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
10199             "ADD    EAX,1023\t\t# Double exponent bias\n\t"
10200             "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
10201             "SHL    EAX,20\t\t# Shift exponent into place\n\t"
10202             "TEST   EBX,ECX\t\t# Check for overflow\n\t"
10203             "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
10204             "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
10205             "MOV    [ESP+0],0\n\t"
10206             "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
10207 
10208             "ADD    ESP,8"
10209              %}
10210   ins_encode( push_stack_temp_qword,          // scratch slot used by pow_exp_core_encoding
10211               Opcode(0xD9), Opcode(0xEA),   // fldl2e
10212               Opcode(0xDE), Opcode(0xC9),   // fmulp
10213               pow_exp_core_encoding,          // FPR1 = 2^Q
10214               pop_stack_temp_qword);
10215   ins_pipe( pipe_slow );
10216 %}
10217 
10218 instruct expD_reg(regD dst, regD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx, eFlagsReg cr) %{  // ExpD for XMM operands (UseSSE>=2), computed on the x87 stack
10219   predicate (UseSSE>=2);
10220   match(Set dst (ExpD src));
10221   effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx, KILL cr);  // the SUB/ADD/SHL/TEST emitted below overwrite EFLAGS
10222   format %{ "SUB    ESP,8\t\t# Fast-path EXP encoding\n\t"
10223             "MOVSD  [ESP],$src\n\t"
10224             "FLDL2E \t\t\t# Ld log2(e) X\n\t"
10225             "FMULP  \t\t\t# Q=X*log2(e) X\n\t"
10226 
10227             "FDUP   \t\t\t# Q Q\n\t"
10228             "FRNDINT\t\t\t# int(Q) Q\n\t"
10229             "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
10230             "FISTP  dword [ESP]\n\t"
10231             "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
10232             "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
10233             "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
10234             "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
10235             "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
10236             "ADD    EAX,1023\t\t# Double exponent bias\n\t"
10237             "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
10238             "SHL    EAX,20\t\t# Shift exponent into place\n\t"
10239             "TEST   EBX,ECX\t\t# Check for overflow\n\t"
10240             "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
10241             "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
10242             "MOV    [ESP+0],0\n\t"
10243             "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
10244 
10245             "FST_D  [ESP]\n\t"
10246             "MOVSD  $dst,[ESP]\n\t"
10247             "ADD    ESP,8"
10248              %}
10249   ins_encode( Push_SrcD(src),                 // NOTE(review): defined outside this view; presumably reserves the slot and loads src onto the x87 stack - confirm
10250               Opcode(0xD9), Opcode(0xEA),   // fldl2e
10251               Opcode(0xDE), Opcode(0xC9),   // fmulp
10252               pow_exp_core_encoding,          // FPR1 = 2^Q
10253               Push_ResultD(dst) );            // per the format above: store to [ESP], load into dst, release the slot
10254   ins_pipe( pipe_slow );
10255 %}
10256 
10257 
10258 
10259 instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{  // Matches Log10D on the regDPR1 operand when UseSSE<=1
10260   predicate (UseSSE<=1);
10261   // The source Double operand on FPU stack
10262   match(Set dst (Log10D src));
10263   // fldlg2       ; push log_10(2) on the FPU stack; full 80-bit number
10264   // fxch         ; swap ST(0) with ST(1)
10265   // fyl2x        ; compute log_10(2) * log_2(x)
10266   format %{ "FLDLG2 \t\t\t#Log10\n\t"
10267             "FXCH   \n\t"
10268             "FYL2X  \t\t\t# Q=Log10*Log_2(x)"
10269          %}
10270   ins_encode( Opcode(0xD9), Opcode(0xEC),   // fldlg2
10271               Opcode(0xD9), Opcode(0xC9),   // fxch
10272               Opcode(0xD9), Opcode(0xF1));  // fyl2x
10273 
10274   ins_pipe( pipe_slow );
10275 %}
10276 
10277 instruct log10D_reg(regD dst, regD src, eFlagsReg cr) %{

2519     __ movdbl(Address(rsp, 0), $src$$XMMRegister);
2520     __ fld_d(Address(rsp, 0));
2521   %}
2522 
2523   enc_class push_stack_temp_qword() %{  // Reserve an 8-byte scratch slot at the top of the stack
2524     MacroAssembler _masm(&cbuf);
2525     __ subptr(rsp, 8);  // rsp -= 8; undone by pop_stack_temp_qword
2526   %}
2527 
2528   enc_class pop_stack_temp_qword() %{  // Release the 8-byte scratch slot taken by push_stack_temp_qword
2529     MacroAssembler _masm(&cbuf);
2530     __ addptr(rsp, 8);  // rsp += 8
2531   %}
2532 
2533   enc_class push_xmm_to_fpr1(regD src) %{  // Copy an XMM double onto the x87 stack via the scratch slot at [rsp]
2534     MacroAssembler _masm(&cbuf);
2535     __ movdbl(Address(rsp, 0), $src$$XMMRegister);  // spill src to [rsp+0]; the slot must already be reserved by the caller
2536     __ fld_d(Address(rsp, 0));  // reload the same 8 bytes as an x87 push
2537   %}
2538 







































2539   enc_class Push_Result_Mod_DPR( regDPR src) %{  // Emits nothing when src is already FPR1
2540     if ($src$$reg != FPR1L_enc) {
2541       // fincstp
2542       emit_opcode (cbuf, 0xD9);
2543       emit_opcode (cbuf, 0xF7);
2544       // FXCH FPR1 with src
2545       emit_opcode(cbuf, 0xD9);
2546       emit_d8(cbuf, 0xC8-1+$src$$reg );  // second FXCH byte; the -1 presumably compensates for the fincstp above - confirm
2547       // fdecstp
2548       emit_opcode (cbuf, 0xD9);
2549       emit_opcode (cbuf, 0xF6);
2550     }
2551     // // following asm replaced with Pop_Reg_F or Pop_Mem_F
2552     // // FSTP   FPR$dst$$reg
2553     // emit_opcode( cbuf, 0xDD );
2554     // emit_d8( cbuf, 0xD8+$dst$$reg );
2555   %}
2556 
2557   enc_class fnstsw_sahf_skip_parity() %{
2558     // fnstsw ax

10044   predicate (UseSSE>=2);
10045   match(Set dst(AtanD dst src));
10046   effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
10047   format %{ "DATA   $dst,$src" %}
10048   opcode(0xD9, 0xF3);
10049   ins_encode( Push_SrcD(src),
10050               OpcP, OpcS, Push_ResultD(dst) );
10051   ins_pipe( pipe_slow );
10052 %}
10053 
10054 instruct sqrtDPR_reg(regDPR dst, regDPR src) %{  // Matches SqrtD on regDPR operands when UseSSE<=1
10055   predicate (UseSSE<=1);
10056   match(Set dst (SqrtD src));
10057   format %{ "DSQRT  $dst,$src" %}
10058   opcode(0xFA, 0xD9);  // NOTE(review): emitted below as OpcS then OpcP, presumably bytes D9 FA (FSQRT) - confirm
10059   ins_encode( Push_Reg_DPR(src),
10060               OpcS, OpcP, Pop_Reg_DPR(dst) );
10061   ins_pipe( pipe_slow );
10062 %}
10063 
10064 instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{  // PowD on x87 operands when UseSSE<=1; the result replaces Y
10065   predicate (UseSSE<=1);
10066   match(Set Y (PowD X Y));  // Raise X to the Yth power
10067   effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
10068   format %{ "fast_pow $X $Y -> $Y  // KILL $rax, $rcx, $rdx" %}
10069   ins_encode %{
10070     __ subptr(rsp, 8);  // reserve 8 bytes of stack around the helper call
10071     __ fld_s($X$$reg - 1);  // NOTE(review): presumably pushes a copy of X; the -1 looks like an encoding-to-stack-index adjustment - confirm
10072     __ fast_pow();  // helper defined outside this view
10073     __ addptr(rsp, 8);  // release the 8 bytes reserved above
10074   %}






















10075   ins_pipe( pipe_slow );
10076 %}
10077 
10078 instruct powD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{  // PowD for XMM operands (UseSSE>=2), routed through the x87 stack
10079   predicate (UseSSE>=2);
10080   match(Set dst (PowD src0 src1));  // Raise src0 to the src1'th power
10081   effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
10082   format %{ "fast_pow $src0 $src1 -> $dst  // KILL $rax, $rcx, $rdx" %}
10083   ins_encode %{
10084     __ subptr(rsp, 8);  // 8-byte slot used to move doubles between XMM and x87
10085     __ movdbl(Address(rsp, 0), $src1$$XMMRegister);
10086     __ fld_d(Address(rsp, 0));  // exponent is pushed first
10087     __ movdbl(Address(rsp, 0), $src0$$XMMRegister);
10088     __ fld_d(Address(rsp, 0));  // base is pushed second, so it ends up on top
10089     __ fast_pow();  // helper defined outside this view; presumably leaves the result on top of the x87 stack - confirm
10090     __ fstp_d(Address(rsp, 0));  // store the x87 top into the slot
10091     __ movdbl($dst$$XMMRegister, Address(rsp, 0));  // reload it into dst
10092     __ addptr(rsp, 8);  // release the slot
10093   %}























10094   ins_pipe( pipe_slow );
10095 %}
10096 
10097 
10098 instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{  // ExpD on the x87 operand when UseSSE<=1; dpr1 is updated in place
10099   predicate (UseSSE<=1);
10100   match(Set dpr1 (ExpD dpr1));
10101   effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
10102   format %{ "fast_exp $dpr1 -> $dpr1  // KILL $rax, $rcx, $rdx" %}
10103   ins_encode %{
10104     __ fast_exp();  // helper defined outside this view; no stack slot is reserved here, unlike powDPR_reg
10105   %}

























10106   ins_pipe( pipe_slow );
10107 %}
10108 
10109 instruct expD_reg(regD dst, regD src, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{  // ExpD for XMM operands (UseSSE>=2), routed through the x87 stack
10110   predicate (UseSSE>=2);
10111   match(Set dst (ExpD src));
10112   effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
10113   format %{ "fast_exp $src -> $dst  // KILL $rax, $rcx, $rdx" %}
10114   ins_encode %{
10115     __ subptr(rsp, 8);  // 8-byte slot used to move doubles between XMM and x87
10116     __ movdbl(Address(rsp, 0), $src$$XMMRegister);
10117     __ fld_d(Address(rsp, 0));  // push src onto the x87 stack
10118     __ fast_exp();  // helper defined outside this view; presumably leaves the result on top of the x87 stack - confirm
10119     __ fstp_d(Address(rsp, 0));  // store the x87 top into the slot
10120     __ movdbl($dst$$XMMRegister, Address(rsp, 0));  // reload it into dst
10121     __ addptr(rsp, 8);  // release the slot

















10122   %}





10123   ins_pipe( pipe_slow );
10124 %}


10125 
10126 instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{  // Matches Log10D on the regDPR1 operand when UseSSE<=1
10127   predicate (UseSSE<=1);
10128   // The source Double operand on FPU stack
10129   match(Set dst (Log10D src));
10130   // fldlg2       ; push log_10(2) on the FPU stack; full 80-bit number
10131   // fxch         ; swap ST(0) with ST(1)
10132   // fyl2x        ; compute log_10(2) * log_2(x)
10133   format %{ "FLDLG2 \t\t\t#Log10\n\t"
10134             "FXCH   \n\t"
10135             "FYL2X  \t\t\t# Q=Log10*Log_2(x)"
10136          %}
10137   ins_encode( Opcode(0xD9), Opcode(0xEC),   // fldlg2
10138               Opcode(0xD9), Opcode(0xC9),   // fxch
10139               Opcode(0xD9), Opcode(0xF1));  // fyl2x
10140 
10141   ins_pipe( pipe_slow );
10142 %}
10143 
10144 instruct log10D_reg(regD dst, regD src, eFlagsReg cr) %{