1 /*
   2  * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "prims/jni.h"
  27 #include "runtime/interfaceSupport.hpp"
  28 #include "runtime/sharedRuntime.hpp"
  29 
  30 // This file contains copies of the fdlibm routines used by
  31 // StrictMath. It turns out that it is almost always required to use
  32 // these runtime routines; the Intel CPU doesn't meet the Java
  33 // specification for sin/cos outside a certain limited argument range,
  34 // and the SPARC CPU doesn't appear to have sin/cos instructions. It
  35 // also turns out that avoiding the indirect call through function
  36 // pointer out to libjava.so in SharedRuntime speeds these routines up
  37 // by roughly 15% on both Win32/x86 and Solaris/SPARC.
  38 
  39 // Enabling optimizations in this file causes incorrect code to be
  40 // generated; can not figure out how to turn down optimization for one
  41 // file in the IDE on Windows
  42 #ifdef WIN32
  43 # pragma optimize ( "", off )
  44 #endif
  45 
  46 /* The above workaround now causes more problems with the latest MS compiler.
  47  * Visual Studio 2010's /GS option tries to guard against buffer overruns.
  48  * /GS is on by default if you specify optimizations, which we do globally
  49  * via /W3 /O2. However the above selective turning off of optimizations means
  50  * that /GS issues a warning "4748". And since we treat warnings as errors (/WX)
  51  * then the compilation fails. There are several possible solutions
  52  * (1) Remove that pragma above as obsolete with VS2010 - requires testing.
  53  * (2) Stop treating warnings as errors - would be a backward step
  54  * (3) Disable /GS - may help performance but you lose the security checks
  55  * (4) Disable the warning with "#pragma warning( disable : 4748 )"
  56  * (5) Disable planting the code with  __declspec(safebuffers)
  57  * I've opted for (5) although we should investigate the local performance
  58  * benefits of (1) and global performance benefit of (3).
  59  */
  60 #if defined(WIN32) && (defined(_MSC_VER) && (_MSC_VER >= 1600))
  61 #define SAFEBUF __declspec(safebuffers)
  62 #else
  63 #define SAFEBUF
  64 #endif
  65 
  66 #include "runtime/sharedRuntimeMath.hpp"
  67 
  68 /*
  69  * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
  70  * double x[],y[]; int e0,nx,prec; int ipio2[];
  71  *
  72  * __kernel_rem_pio2 return the last three digits of N with
  73  *              y = x - N*pi/2
  74  * so that |y| < pi/2.
  75  *
  76  * The method is to compute the integer (mod 8) and fraction parts of
  77  * (2/pi)*x without doing the full multiplication. In general we
  78  * skip the part of the product that are known to be a huge integer (
  79  * more accurately, = 0 mod 8 ). Thus the number of operations are
  80  * independent of the exponent of the input.
  81  *
  82  * (2/pi) is represented by an array of 24-bit integers in ipio2[].
  83  *
  84  * Input parameters:
  85  *      x[]     The input value (must be positive) is broken into nx
  86  *              pieces of 24-bit integers in double precision format.
  87  *              x[i] will be the i-th 24 bit of x. The scaled exponent
  88  *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
  89  *              match x's up to 24 bits.
  90  *
  91  *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
  92  *                      e0 = ilogb(z)-23
  93  *                      z  = scalbn(z,-e0)
  94  *              for i = 0,1,2
  95  *                      x[i] = floor(z)
  96  *                      z    = (z-x[i])*2**24
  97  *
  98  *
  99  *      y[]     ouput result in an array of double precision numbers.
 100  *              The dimension of y[] is:
 101  *                      24-bit  precision       1
 102  *                      53-bit  precision       2
 103  *                      64-bit  precision       2
 104  *                      113-bit precision       3
 105  *              The actual value is the sum of them. Thus for 113-bit
 106  *              precsion, one may have to do something like:
 107  *
 108  *              long double t,w,r_head, r_tail;
 109  *              t = (long double)y[2] + (long double)y[1];
 110  *              w = (long double)y[0];
 111  *              r_head = t+w;
 112  *              r_tail = w - (r_head - t);
 113  *
 114  *      e0      The exponent of x[0]
 115  *
 116  *      nx      dimension of x[]
 117  *
 118  *      prec    an interger indicating the precision:
 119  *                      0       24  bits (single)
 120  *                      1       53  bits (double)
 121  *                      2       64  bits (extended)
 122  *                      3       113 bits (quad)
 123  *
 124  *      ipio2[]
 125  *              integer array, contains the (24*i)-th to (24*i+23)-th
 126  *              bit of 2/pi after binary point. The corresponding
 127  *              floating value is
 128  *
 129  *                      ipio2[i] * 2^(-24(i+1)).
 130  *
 131  * External function:
 132  *      double scalbn(), floor();
 133  *
 134  *
 135  * Here is the description of some local variables:
 136  *
 137  *      jk      jk+1 is the initial number of terms of ipio2[] needed
 138  *              in the computation. The recommended value is 2,3,4,
 139  *              6 for single, double, extended,and quad.
 140  *
 141  *      jz      local integer variable indicating the number of
 142  *              terms of ipio2[] used.
 143  *
 144  *      jx      nx - 1
 145  *
 146  *      jv      index for pointing to the suitable ipio2[] for the
 147  *              computation. In general, we want
 148  *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
 149  *              is an integer. Thus
 150  *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
 151  *              Hence jv = max(0,(e0-3)/24).
 152  *
 153  *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
 154  *
 155  *      q[]     double array with integral value, representing the
 156  *              24-bits chunk of the product of x and 2/pi.
 157  *
 158  *      q0      the corresponding exponent of q[0]. Note that the
 159  *              exponent for q[i] would be q0-24*i.
 160  *
 161  *      PIo2[]  double precision array, obtained by cutting pi/2
 162  *              into 24 bits chunks.
 163  *
 164  *      f[]     ipio2[] in floating point
 165  *
 166  *      iq[]    integer array by breaking up q[] in 24-bits chunk.
 167  *
 168  *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
 169  *
 170  *      ih      integer. If >0 it indicates q[] is >= 0.5, hence
 171  *              it also indicates the *sign* of the result.
 172  *
 173  */
 174 
 175 
 176 /*
 177  * Constants:
 178  * The hexadecimal values are the intended ones for the following
 179  * constants. The decimal values may be used, provided that the
 180  * compiler will convert from decimal to binary accurately enough
 181  * to produce the hexadecimal values shown.
 182  */
 183 
 184 
 185 static const int init_jk[] = {2,3,4,6}; /* initial value for jk */
 186 
 187 static const double PIo2[] = {
 188   1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
 189   7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
 190   5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
 191   3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
 192   1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
 193   1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
 194   2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
 195   2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
 196 };
 197 
 198 static const double
 199 zeroB   = 0.0,
 200 one     = 1.0,
 201 two24B  = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
 202 twon24  = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
 203 
 204 static SAFEBUF int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int *ipio2) {
 205   int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
 206   double z,fw,f[20],fq[20],q[20];
 207 
 208   /* initialize jk*/
 209   jk = init_jk[prec];
 210   jp = jk;
 211 
 212   /* determine jx,jv,q0, note that 3>q0 */
 213   jx =  nx-1;
 214   jv = (e0-3)/24; if(jv<0) jv=0;
 215   q0 =  e0-24*(jv+1);
 216 
 217   /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
 218   j = jv-jx; m = jx+jk;
 219   for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : (double) ipio2[j];
 220 
 221   /* compute q[0],q[1],...q[jk] */
 222   for (i=0;i<=jk;i++) {
 223     for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
 224   }
 225 
 226   jz = jk;
 227 recompute:
 228   /* distill q[] into iq[] reversingly */
 229   for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
 230     fw    =  (double)((int)(twon24* z));
 231     iq[i] =  (int)(z-two24B*fw);
 232     z     =  q[j-1]+fw;
 233   }
 234 
 235   /* compute n */
 236   z  = scalbnA(z,q0);           /* actual value of z */
 237   z -= 8.0*floor(z*0.125);              /* trim off integer >= 8 */
 238   n  = (int) z;
 239   z -= (double)n;
 240   ih = 0;
 241   if(q0>0) {    /* need iq[jz-1] to determine n */
 242     i  = (iq[jz-1]>>(24-q0)); n += i;
 243     iq[jz-1] -= i<<(24-q0);
 244     ih = iq[jz-1]>>(23-q0);
 245   }
 246   else if(q0==0) ih = iq[jz-1]>>23;
 247   else if(z>=0.5) ih=2;
 248 
 249   if(ih>0) {    /* q > 0.5 */
 250     n += 1; carry = 0;
 251     for(i=0;i<jz ;i++) {        /* compute 1-q */
 252       j = iq[i];
 253       if(carry==0) {
 254         if(j!=0) {
 255           carry = 1; iq[i] = 0x1000000- j;
 256         }
 257       } else  iq[i] = 0xffffff - j;
 258     }
 259     if(q0>0) {          /* rare case: chance is 1 in 12 */
 260       switch(q0) {
 261       case 1:
 262         iq[jz-1] &= 0x7fffff; break;
 263       case 2:
 264         iq[jz-1] &= 0x3fffff; break;
 265       }
 266     }
 267     if(ih==2) {
 268       z = one - z;
 269       if(carry!=0) z -= scalbnA(one,q0);
 270     }
 271   }
 272 
 273   /* check if recomputation is needed */
 274   if(z==zeroB) {
 275     j = 0;
 276     for (i=jz-1;i>=jk;i--) j |= iq[i];
 277     if(j==0) { /* need recomputation */
 278       for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
 279 
 280       for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
 281         f[jx+i] = (double) ipio2[jv+i];
 282         for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
 283         q[i] = fw;
 284       }
 285       jz += k;
 286       goto recompute;
 287     }
 288   }
 289 
 290   /* chop off zero terms */
 291   if(z==0.0) {
 292     jz -= 1; q0 -= 24;
 293     while(iq[jz]==0) { jz--; q0-=24;}
 294   } else { /* break z into 24-bit if necessary */
 295     z = scalbnA(z,-q0);
 296     if(z>=two24B) {
 297       fw = (double)((int)(twon24*z));
 298       iq[jz] = (int)(z-two24B*fw);
 299       jz += 1; q0 += 24;
 300       iq[jz] = (int) fw;
 301     } else iq[jz] = (int) z ;
 302   }
 303 
 304   /* convert integer "bit" chunk to floating-point value */
 305   fw = scalbnA(one,q0);
 306   for(i=jz;i>=0;i--) {
 307     q[i] = fw*(double)iq[i]; fw*=twon24;
 308   }
 309 
 310   /* compute PIo2[0,...,jp]*q[jz,...,0] */
 311   for(i=jz;i>=0;i--) {
 312     for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
 313     fq[jz-i] = fw;
 314   }
 315 
 316   /* compress fq[] into y[] */
 317   switch(prec) {
 318   case 0:
 319     fw = 0.0;
 320     for (i=jz;i>=0;i--) fw += fq[i];
 321     y[0] = (ih==0)? fw: -fw;
 322     break;
 323   case 1:
 324   case 2:
 325     fw = 0.0;
 326     for (i=jz;i>=0;i--) fw += fq[i];
 327     y[0] = (ih==0)? fw: -fw;
 328     fw = fq[0]-fw;
 329     for (i=1;i<=jz;i++) fw += fq[i];
 330     y[1] = (ih==0)? fw: -fw;
 331     break;
 332   case 3:       /* painful */
 333     for (i=jz;i>0;i--) {
 334       fw      = fq[i-1]+fq[i];
 335       fq[i]  += fq[i-1]-fw;
 336       fq[i-1] = fw;
 337     }
 338     for (i=jz;i>1;i--) {
 339       fw      = fq[i-1]+fq[i];
 340       fq[i]  += fq[i-1]-fw;
 341       fq[i-1] = fw;
 342     }
 343     for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
 344     if(ih==0) {
 345       y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
 346     } else {
 347       y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
 348     }
 349   }
 350   return n&7;
 351 }
 352 
 353 
 354 /*
 355  * ====================================================
 356  * Copyright (c) 1993 Oracle and/or its affiliates. All rights reserved.
 357  *
 358  * Developed at SunPro, a Sun Microsystems, Inc. business.
 359  * Permission to use, copy, modify, and distribute this
 360  * software is freely granted, provided that this notice
 361  * is preserved.
 362  * ====================================================
 363  *
 364  */
 365 
 366 /* __ieee754_rem_pio2(x,y)
 367  *
 368  * return the remainder of x rem pi/2 in y[0]+y[1]
 369  * use __kernel_rem_pio2()
 370  */
 371 
 372 /*
 373  * Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
 374  */
 375 static const int two_over_pi[] = {
 376   0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
 377   0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
 378   0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
 379   0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
 380   0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
 381   0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
 382   0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
 383   0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
 384   0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
 385   0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
 386   0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
 387 };
 388 
 389 static const int npio2_hw[] = {
 390   0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
 391   0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
 392   0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
 393   0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
 394   0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB,
 395   0x404858EB, 0x404921FB,
 396 };
 397 
 398 /*
 399  * invpio2:  53 bits of 2/pi
 400  * pio2_1:   first  33 bit of pi/2
 401  * pio2_1t:  pi/2 - pio2_1
 402  * pio2_2:   second 33 bit of pi/2
 403  * pio2_2t:  pi/2 - (pio2_1+pio2_2)
 404  * pio2_3:   third  33 bit of pi/2
 405  * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
 406  */
 407 
 408 static const double
 409 zeroA =  0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
 410 half =  5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
 411 two24A =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
 412 invpio2 =  6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 413 pio2_1  =  1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
 414 pio2_1t =  6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
 415 pio2_2  =  6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
 416 pio2_2t =  2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
 417 pio2_3  =  2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
 418 pio2_3t =  8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
 419 
 420 static SAFEBUF int __ieee754_rem_pio2(double x, double *y) {
 421   double z,w,t,r,fn;
 422   double tx[3];
 423   int e0,i,j,nx,n,ix,hx,i0;
 424 
 425   i0 = ((*(int*)&two24A)>>30)^1;        /* high word index */
 426   hx = *(i0+(int*)&x);          /* high word of x */
 427   ix = hx&0x7fffffff;
 428   if(ix<=0x3fe921fb)   /* |x| ~<= pi/4 , no need for reduction */
 429     {y[0] = x; y[1] = 0; return 0;}
 430   if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
 431     if(hx>0) {
 432       z = x - pio2_1;
 433       if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
 434         y[0] = z - pio2_1t;
 435         y[1] = (z-y[0])-pio2_1t;
 436       } else {                /* near pi/2, use 33+33+53 bit pi */
 437         z -= pio2_2;
 438         y[0] = z - pio2_2t;
 439         y[1] = (z-y[0])-pio2_2t;
 440       }
 441       return 1;
 442     } else {    /* negative x */
 443       z = x + pio2_1;
 444       if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
 445         y[0] = z + pio2_1t;
 446         y[1] = (z-y[0])+pio2_1t;
 447       } else {                /* near pi/2, use 33+33+53 bit pi */
 448         z += pio2_2;
 449         y[0] = z + pio2_2t;
 450         y[1] = (z-y[0])+pio2_2t;
 451       }
 452       return -1;
 453     }
 454   }
 455   if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
 456     t  = fabsd(x);
 457     n  = (int) (t*invpio2+half);
 458     fn = (double)n;
 459     r  = t-fn*pio2_1;
 460     w  = fn*pio2_1t;    /* 1st round good to 85 bit */
 461     if(n<32&&ix!=npio2_hw[n-1]) {
 462       y[0] = r-w;       /* quick check no cancellation */
 463     } else {
 464       j  = ix>>20;
 465       y[0] = r-w;
 466       i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
 467       if(i>16) {  /* 2nd iteration needed, good to 118 */
 468         t  = r;
 469         w  = fn*pio2_2;
 470         r  = t-w;
 471         w  = fn*pio2_2t-((t-r)-w);
 472         y[0] = r-w;
 473         i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
 474         if(i>49)  {     /* 3rd iteration need, 151 bits acc */
 475           t  = r;       /* will cover all possible cases */
 476           w  = fn*pio2_3;
 477           r  = t-w;
 478           w  = fn*pio2_3t-((t-r)-w);
 479           y[0] = r-w;
 480         }
 481       }
 482     }
 483     y[1] = (r-y[0])-w;
 484     if(hx<0)    {y[0] = -y[0]; y[1] = -y[1]; return -n;}
 485     else         return n;
 486   }
 487   /*
 488    * all other (large) arguments
 489    */
 490   if(ix>=0x7ff00000) {          /* x is inf or NaN */
 491     y[0]=y[1]=x-x; return 0;
 492   }
 493   /* set z = scalbn(|x|,ilogb(x)-23) */
 494   *(1-i0+(int*)&z) = *(1-i0+(int*)&x);
 495   e0    = (ix>>20)-1046;        /* e0 = ilogb(z)-23; */
 496   *(i0+(int*)&z) = ix - (e0<<20);
 497   for(i=0;i<2;i++) {
 498     tx[i] = (double)((int)(z));
 499     z     = (z-tx[i])*two24A;
 500   }
 501   tx[2] = z;
 502   nx = 3;
 503   while(tx[nx-1]==zeroA) nx--;  /* skip zero term */
 504   n  =  __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
 505   if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
 506   return n;
 507 }
 508 
 509 
 510 /* __kernel_sin( x, y, iy)
 511  * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
 512  * Input x is assumed to be bounded by ~pi/4 in magnitude.
 513  * Input y is the tail of x.
 514  * Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
 515  *
 516  * Algorithm
 517  *      1. Since sin(-x) = -sin(x), we need only to consider positive x.
 518  *      2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
 519  *      3. sin(x) is approximated by a polynomial of degree 13 on
 520  *         [0,pi/4]
 521  *                               3            13
 522  *              sin(x) ~ x + S1*x + ... + S6*x
 523  *         where
 524  *
 525  *      |sin(x)         2     4     6     8     10     12  |     -58
 526  *      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
 527  *      |  x                                               |
 528  *
 529  *      4. sin(x+y) = sin(x) + sin'(x')*y
 530  *                  ~ sin(x) + (1-x*x/2)*y
 531  *         For better accuracy, let
 532  *                   3      2      2      2      2
 533  *              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
 534  *         then                   3    2
 535  *              sin(x) = x + (S1*x + (x *(r-y/2)+y))
 536  */
 537 
 538 static const double
 539 S1  = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
 540 S2  =  8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
 541 S3  = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
 542 S4  =  2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
 543 S5  = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
 544 S6  =  1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
 545 
 546 static double __kernel_sin(double x, double y, int iy)
 547 {
 548         double z,r,v;
 549         int ix;
 550         ix = __HI(x)&0x7fffffff;        /* high word of x */
 551         if(ix<0x3e400000)                       /* |x| < 2**-27 */
 552            {if((int)x==0) return x;}            /* generate inexact */
 553         z       =  x*x;
 554         v       =  z*x;
 555         r       =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
 556         if(iy==0) return x+v*(S1+z*r);
 557         else      return x-((z*(half*y-v*r)-y)-v*S1);
 558 }
 559 
 560 /*
 561  * __kernel_cos( x,  y )
 562  * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
 563  * Input x is assumed to be bounded by ~pi/4 in magnitude.
 564  * Input y is the tail of x.
 565  *
 566  * Algorithm
 567  *      1. Since cos(-x) = cos(x), we need only to consider positive x.
 568  *      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
 569  *      3. cos(x) is approximated by a polynomial of degree 14 on
 570  *         [0,pi/4]
 571  *                                       4            14
 572  *              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
 573  *         where the remez error is
 574  *
 575  *      |              2     4     6     8     10    12     14 |     -58
 576  *      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
 577  *      |                                                      |
 578  *
 579  *                     4     6     8     10    12     14
 580  *      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
 581  *             cos(x) = 1 - x*x/2 + r
 582  *         since cos(x+y) ~ cos(x) - sin(x)*y
 583  *                        ~ cos(x) - x*y,
 584  *         a correction term is necessary in cos(x) and hence
 585  *              cos(x+y) = 1 - (x*x/2 - (r - x*y))
 586  *         For better accuracy when x > 0.3, let qx = |x|/4 with
 587  *         the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125.
 588  *         Then
 589  *              cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
 590  *         Note that 1-qx and (x*x/2-qx) is EXACT here, and the
 591  *         magnitude of the latter is at least a quarter of x*x/2,
 592  *         thus, reducing the rounding error in the subtraction.
 593  */
 594 
 595 static const double
 596 C1  =  4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
 597 C2  = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
 598 C3  =  2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
 599 C4  = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
 600 C5  =  2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
 601 C6  = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
 602 
 603 static double __kernel_cos(double x, double y)
 604 {
 605   double a,h,z,r,qx;
 606   int ix;
 607   ix = __HI(x)&0x7fffffff;      /* ix = |x|'s high word*/
 608   if(ix<0x3e400000) {                   /* if x < 2**27 */
 609     if(((int)x)==0) return one;         /* generate inexact */
 610   }
 611   z  = x*x;
 612   r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
 613   if(ix < 0x3FD33333)                   /* if |x| < 0.3 */
 614     return one - (0.5*z - (z*r - x*y));
 615   else {
 616     if(ix > 0x3fe90000) {               /* x > 0.78125 */
 617       qx = 0.28125;
 618     } else {
 619       __HI(qx) = ix-0x00200000; /* x/4 */
 620       __LO(qx) = 0;
 621     }
 622     h = 0.5*z-qx;
 623     a = one-qx;
 624     return a - (h - (z*r-x*y));
 625   }
 626 }
 627 
 628 /* __kernel_tan( x, y, k )
 629  * kernel tan function on [-pi/4, pi/4], pi/4 ~ 0.7854
 630  * Input x is assumed to be bounded by ~pi/4 in magnitude.
 631  * Input y is the tail of x.
 632  * Input k indicates whether tan (if k=1) or
 633  * -1/tan (if k= -1) is returned.
 634  *
 635  * Algorithm
 636  *      1. Since tan(-x) = -tan(x), we need only to consider positive x.
 637  *      2. if x < 2^-28 (hx<0x3e300000 0), return x with inexact if x!=0.
 638  *      3. tan(x) is approximated by a odd polynomial of degree 27 on
 639  *         [0,0.67434]
 640  *                               3             27
 641  *              tan(x) ~ x + T1*x + ... + T13*x
 642  *         where
 643  *
 644  *              |tan(x)         2     4            26   |     -59.2
 645  *              |----- - (1+T1*x +T2*x +.... +T13*x    )| <= 2
 646  *              |  x                                    |
 647  *
 648  *         Note: tan(x+y) = tan(x) + tan'(x)*y
 649  *                        ~ tan(x) + (1+x*x)*y
 650  *         Therefore, for better accuracy in computing tan(x+y), let
 651  *                   3      2      2       2       2
 652  *              r = x *(T2+x *(T3+x *(...+x *(T12+x *T13))))
 653  *         then
 654  *                                  3    2
 655  *              tan(x+y) = x + (T1*x + (x *(r+y)+y))
 656  *
 657  *      4. For x in [0.67434,pi/4],  let y = pi/4 - x, then
 658  *              tan(x) = tan(pi/4-y) = (1-tan(y))/(1+tan(y))
 659  *                     = 1 - 2*(tan(y) - (tan(y)^2)/(1+tan(y)))
 660  */
 661 
 662 static const double
 663 pio4  =  7.85398163397448278999e-01, /* 0x3FE921FB, 0x54442D18 */
 664 pio4lo=  3.06161699786838301793e-17, /* 0x3C81A626, 0x33145C07 */
 665 T[] =  {
 666   3.33333333333334091986e-01, /* 0x3FD55555, 0x55555563 */
 667   1.33333333333201242699e-01, /* 0x3FC11111, 0x1110FE7A */
 668   5.39682539762260521377e-02, /* 0x3FABA1BA, 0x1BB341FE */
 669   2.18694882948595424599e-02, /* 0x3F9664F4, 0x8406D637 */
 670   8.86323982359930005737e-03, /* 0x3F8226E3, 0xE96E8493 */
 671   3.59207910759131235356e-03, /* 0x3F6D6D22, 0xC9560328 */
 672   1.45620945432529025516e-03, /* 0x3F57DBC8, 0xFEE08315 */
 673   5.88041240820264096874e-04, /* 0x3F4344D8, 0xF2F26501 */
 674   2.46463134818469906812e-04, /* 0x3F3026F7, 0x1A8D1068 */
 675   7.81794442939557092300e-05, /* 0x3F147E88, 0xA03792A6 */
 676   7.14072491382608190305e-05, /* 0x3F12B80F, 0x32F0A7E9 */
 677  -1.85586374855275456654e-05, /* 0xBEF375CB, 0xDB605373 */
 678   2.59073051863633712884e-05, /* 0x3EFB2A70, 0x74BF7AD4 */
 679 };
 680 
 681 static double __kernel_tan(double x, double y, int iy)
 682 {
 683   double z,r,v,w,s;
 684   int ix,hx;
 685   hx = __HI(x);   /* high word of x */
 686   ix = hx&0x7fffffff;     /* high word of |x| */
 687   if(ix<0x3e300000) {                     /* x < 2**-28 */
 688     if((int)x==0) {                       /* generate inexact */
 689       if (((ix | __LO(x)) | (iy + 1)) == 0)
 690         return one / fabsd(x);
 691       else {
 692         if (iy == 1)
 693           return x;
 694         else {    /* compute -1 / (x+y) carefully */
 695           double a, t;
 696 
 697           z = w = x + y;
 698           __LO(z) = 0;
 699           v = y - (z - x);
 700           t = a = -one / w;
 701           __LO(t) = 0;
 702           s = one + t * z;
 703           return t + a * (s + t * v);
 704         }
 705       }
 706     }
 707   }
 708   if(ix>=0x3FE59428) {                    /* |x|>=0.6744 */
 709     if(hx<0) {x = -x; y = -y;}
 710     z = pio4-x;
 711     w = pio4lo-y;
 712     x = z+w; y = 0.0;
 713   }
 714   z       =  x*x;
 715   w       =  z*z;
 716   /* Break x^5*(T[1]+x^2*T[2]+...) into
 717    *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
 718    *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
 719    */
 720   r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
 721   v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
 722   s = z*x;
 723   r = y + z*(s*(r+v)+y);
 724   r += T[0]*s;
 725   w = x+r;
 726   if(ix>=0x3FE59428) {
 727     v = (double)iy;
 728     return (double)(1-((hx>>30)&2))*(v-2.0*(x-(w*w/(w+v)-r)));
 729   }
 730   if(iy==1) return w;
 731   else {          /* if allow error up to 2 ulp,
 732                      simply return -1.0/(x+r) here */
 733     /*  compute -1.0/(x+r) accurately */
 734     double a,t;
 735     z  = w;
 736     __LO(z) = 0;
 737     v  = r-(z - x);     /* z+v = r+x */
 738     t = a  = -1.0/w;    /* a = -1.0/w */
 739     __LO(t) = 0;
 740     s  = 1.0+t*z;
 741     return t+a*(s+t*v);
 742   }
 743 }
 744 
 745 
 746 //----------------------------------------------------------------------
 747 //
 748 // Routines for new sin/cos implementation
 749 //
 750 //----------------------------------------------------------------------
 751 
 752 /* sin(x)
 753  * Return sine function of x.
 754  *
 755  * kernel function:
 756  *      __kernel_sin            ... sine function on [-pi/4,pi/4]
 757  *      __kernel_cos            ... cose function on [-pi/4,pi/4]
 758  *      __ieee754_rem_pio2      ... argument reduction routine
 759  *
 760  * Method.
 761  *      Let S,C and T denote the sin, cos and tan respectively on
 762  *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
 763  *      in [-pi/4 , +pi/4], and let n = k mod 4.
 764  *      We have
 765  *
 766  *          n        sin(x)      cos(x)        tan(x)
 767  *     ----------------------------------------------------------
 768  *          0          S           C             T
 769  *          1          C          -S            -1/T
 770  *          2         -S          -C             T
 771  *          3         -C           S            -1/T
 772  *     ----------------------------------------------------------
 773  *
 774  * Special cases:
 775  *      Let trig be any of sin, cos, or tan.
 776  *      trig(+-INF)  is NaN, with signals;
 777  *      trig(NaN)    is that NaN;
 778  *
 779  * Accuracy:
 780  *      TRIG(x) returns trig(x) nearly rounded
 781  */
 782 
 783 JRT_LEAF(jdouble, SharedRuntime::dsin(jdouble x))
 784   double y[2],z=0.0;
 785   int n, ix;
 786 
 787   /* High word of x. */
 788   ix = __HI(x);
 789 
 790   /* |x| ~< pi/4 */
 791   ix &= 0x7fffffff;
 792   if(ix <= 0x3fe921fb) return __kernel_sin(x,z,0);
 793 
 794   /* sin(Inf or NaN) is NaN */
 795   else if (ix>=0x7ff00000) return x-x;
 796 
 797   /* argument reduction needed */
 798   else {
 799     n = __ieee754_rem_pio2(x,y);
 800     switch(n&3) {
 801     case 0: return  __kernel_sin(y[0],y[1],1);
 802     case 1: return  __kernel_cos(y[0],y[1]);
 803     case 2: return -__kernel_sin(y[0],y[1],1);
 804     default:
 805       return -__kernel_cos(y[0],y[1]);
 806     }
 807   }
 808 JRT_END
 809 
 810 /* cos(x)
 811  * Return cosine function of x.
 812  *
 813  * kernel function:
 814  *      __kernel_sin            ... sine function on [-pi/4,pi/4]
 815  *      __kernel_cos            ... cosine function on [-pi/4,pi/4]
 816  *      __ieee754_rem_pio2      ... argument reduction routine
 817  *
 818  * Method.
 819  *      Let S,C and T denote the sin, cos and tan respectively on
 820  *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
 821  *      in [-pi/4 , +pi/4], and let n = k mod 4.
 822  *      We have
 823  *
 824  *          n        sin(x)      cos(x)        tan(x)
 825  *     ----------------------------------------------------------
 826  *          0          S           C             T
 827  *          1          C          -S            -1/T
 828  *          2         -S          -C             T
 829  *          3         -C           S            -1/T
 830  *     ----------------------------------------------------------
 831  *
 832  * Special cases:
 833  *      Let trig be any of sin, cos, or tan.
 834  *      trig(+-INF)  is NaN, with signals;
 835  *      trig(NaN)    is that NaN;
 836  *
 837  * Accuracy:
 838  *      TRIG(x) returns trig(x) nearly rounded
 839  */
 840 
 841 JRT_LEAF(jdouble, SharedRuntime::dcos(jdouble x))
 842   double y[2],z=0.0;
 843   int n, ix;
 844 
 845   /* High word of x. */
 846   ix = __HI(x);
 847 
 848   /* |x| ~< pi/4 */
 849   ix &= 0x7fffffff;
 850   if(ix <= 0x3fe921fb) return __kernel_cos(x,z);
 851 
 852   /* cos(Inf or NaN) is NaN */
 853   else if (ix>=0x7ff00000) return x-x;
 854 
 855   /* argument reduction needed */
 856   else {
 857     n = __ieee754_rem_pio2(x,y);
 858     switch(n&3) {
 859     case 0: return  __kernel_cos(y[0],y[1]);
 860     case 1: return -__kernel_sin(y[0],y[1],1);
 861     case 2: return -__kernel_cos(y[0],y[1]);
 862     default:
 863       return  __kernel_sin(y[0],y[1],1);
 864     }
 865   }
 866 JRT_END
 867 
 868 /* tan(x)
 869  * Return tangent function of x.
 870  *
 871  * kernel function:
 872  *      __kernel_tan            ... tangent function on [-pi/4,pi/4]
 873  *      __ieee754_rem_pio2      ... argument reduction routine
 874  *
 875  * Method.
 876  *      Let S,C and T denote the sin, cos and tan respectively on
 877  *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
 878  *      in [-pi/4 , +pi/4], and let n = k mod 4.
 879  *      We have
 880  *
 881  *          n        sin(x)      cos(x)        tan(x)
 882  *     ----------------------------------------------------------
 883  *          0          S           C             T
 884  *          1          C          -S            -1/T
 885  *          2         -S          -C             T
 886  *          3         -C           S            -1/T
 887  *     ----------------------------------------------------------
 888  *
 889  * Special cases:
 890  *      Let trig be any of sin, cos, or tan.
 891  *      trig(+-INF)  is NaN, with signals;
 892  *      trig(NaN)    is that NaN;
 893  *
 894  * Accuracy:
 895  *      TRIG(x) returns trig(x) nearly rounded
 896  */
 897 
 898 JRT_LEAF(jdouble, SharedRuntime::dtan(jdouble x))
 899   double y[2],z=0.0;
 900   int n, ix;
 901 
 902   /* High word of x. */
 903   ix = __HI(x);
 904 
 905   /* |x| ~< pi/4 */
 906   ix &= 0x7fffffff;
 907   if(ix <= 0x3fe921fb) return __kernel_tan(x,z,1);
 908 
 909   /* tan(Inf or NaN) is NaN */
 910   else if (ix>=0x7ff00000) return x-x;            /* NaN */
 911 
 912   /* argument reduction needed */
 913   else {
 914     n = __ieee754_rem_pio2(x,y);
 915     return __kernel_tan(y[0],y[1],1-((n&1)<<1)); /*   1 -- n even
 916                                                      -1 -- n odd */
 917   }
 918 JRT_END
 919 
 920 
 921 #ifdef WIN32
 922 # pragma optimize ( "", on )
 923 #endif