Old src/share/vm/runtime/sharedRuntimeTrig.cpp

   1 /*
   2  * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "prims/jni.h"
  27 #include "runtime/interfaceSupport.hpp"
  28 #include "runtime/sharedRuntime.hpp"
  29 
  30 // This file contains copies of the fdlibm routines used by
  31 // StrictMath. It turns out that it is almost always required to use
  32 // these runtime routines; the Intel CPU doesn't meet the Java
  33 // specification for sin/cos outside a certain limited argument range,
  34 // and the SPARC CPU doesn't appear to have sin/cos instructions. It
  35 // also turns out that avoiding the indirect call through function
  36 // pointer out to libjava.so in SharedRuntime speeds these routines up
  37 // by roughly 15% on both Win32/x86 and Solaris/SPARC.
  38 
  39 // Enabling optimizations in this file causes incorrect code to be
  40 // generated; can not figure out how to turn down optimization for one
  41 // file in the IDE on Windows
  42 #ifdef WIN32
  43 # pragma optimize ( "", off )
  44 #endif
  45 
  46 /* The above workaround now causes more problems with the latest MS compiler.
  47  * Visual Studio 2010's /GS option tries to guard against buffer overruns.
  48  * /GS is on by default if you specify optimizations, which we do globally
  49  * via /W3 /O2. However the above selective turning off of optimizations means
  50  * that /GS issues a warning "4748". And since we treat warnings as errors (/WX)
  51  * then the compilation fails. There are several possible solutions
  52  * (1) Remove that pragma above as obsolete with VS2010 - requires testing.
  53  * (2) Stop treating warnings as errors - would be a backward step
  54  * (3) Disable /GS - may help performance but you lose the security checks
  55  * (4) Disable the warning with "#pragma warning( disable : 4748 )"
  56  * (5) Disable planting the code with  __declspec(safebuffers)
  57  * I've opted for (5) although we should investigate the local performance
  58  * benefits of (1) and global performance benefit of (3).
  59  */
  60 #if defined(WIN32) && (defined(_MSC_VER) && (_MSC_VER >= 1600))
  61 #define SAFEBUF __declspec(safebuffers)
  62 #else
  63 #define SAFEBUF
  64 #endif
  65 
  66 #include <math.h>
  67 
  68 // VM_LITTLE_ENDIAN is #defined appropriately in the Makefiles
  69 // [jk] this is not 100% correct because the float word order may different
  70 // from the byte order (e.g. on ARM)
  71 #ifdef VM_LITTLE_ENDIAN
  72 # define __HI(x) *(1+(int*)&x)
  73 # define __LO(x) *(int*)&x
  74 #else
  75 # define __HI(x) *(int*)&x
  76 # define __LO(x) *(1+(int*)&x)
  77 #endif
  78 
  79 static double copysignA(double x, double y) {
  80   __HI(x) = (__HI(x)&0x7fffffff)|(__HI(y)&0x80000000);
  81   return x;
  82 }
  83 
  84 /*
  85  * scalbn (double x, int n)
  86  * scalbn(x,n) returns x* 2**n  computed by  exponent
  87  * manipulation rather than by actually performing an
  88  * exponentiation or a multiplication.
  89  */
  90 
  91 static const double
  92 two54   =  1.80143985094819840000e+16, /* 0x43500000, 0x00000000 */
  93 twom54  =  5.55111512312578270212e-17, /* 0x3C900000, 0x00000000 */
  94 hugeX  = 1.0e+300,
  95 tiny   = 1.0e-300;
  96 
  97 static double scalbnA (double x, int n) {
  98   int  k,hx,lx;
  99   hx = __HI(x);
 100   lx = __LO(x);
 101   k = (hx&0x7ff00000)>>20;              /* extract exponent */
 102   if (k==0) {                           /* 0 or subnormal x */
 103     if ((lx|(hx&0x7fffffff))==0) return x; /* +-0 */
 104     x *= two54;
 105     hx = __HI(x);
 106     k = ((hx&0x7ff00000)>>20) - 54;
 107     if (n< -50000) return tiny*x;       /*underflow*/
 108   }
 109   if (k==0x7ff) return x+x;             /* NaN or Inf */
 110   k = k+n;
 111   if (k >  0x7fe) return hugeX*copysignA(hugeX,x); /* overflow  */
 112   if (k > 0)                            /* normal result */
 113     {__HI(x) = (hx&0x800fffff)|(k<<20); return x;}
 114   if (k <= -54) {
 115     if (n > 50000)      /* in case integer overflow in n+k */
 116       return hugeX*copysignA(hugeX,x);  /*overflow*/
 117     else return tiny*copysignA(tiny,x); /*underflow*/
 118   }
 119   k += 54;                              /* subnormal result */
 120   __HI(x) = (hx&0x800fffff)|(k<<20);
 121   return x*twom54;
 122 }
 123 
 124 /*
 125  * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
 126  * double x[],y[]; int e0,nx,prec; int ipio2[];
 127  *
 128  * __kernel_rem_pio2 return the last three digits of N with
 129  *              y = x - N*pi/2
 130  * so that |y| < pi/2.
 131  *
 132  * The method is to compute the integer (mod 8) and fraction parts of
 133  * (2/pi)*x without doing the full multiplication. In general we
 134  * skip the part of the product that are known to be a huge integer (
 135  * more accurately, = 0 mod 8 ). Thus the number of operations are
 136  * independent of the exponent of the input.
 137  *
 138  * (2/pi) is represented by an array of 24-bit integers in ipio2[].
 139  *
 140  * Input parameters:
 141  *      x[]     The input value (must be positive) is broken into nx
 142  *              pieces of 24-bit integers in double precision format.
 143  *              x[i] will be the i-th 24 bit of x. The scaled exponent
 144  *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
 145  *              match x's up to 24 bits.
 146  *
 147  *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
 148  *                      e0 = ilogb(z)-23
 149  *                      z  = scalbn(z,-e0)
 150  *              for i = 0,1,2
 151  *                      x[i] = floor(z)
 152  *                      z    = (z-x[i])*2**24
 153  *
 154  *
 155  *      y[]     ouput result in an array of double precision numbers.
 156  *              The dimension of y[] is:
 157  *                      24-bit  precision       1
 158  *                      53-bit  precision       2
 159  *                      64-bit  precision       2
 160  *                      113-bit precision       3
 161  *              The actual value is the sum of them. Thus for 113-bit
 162  *              precsion, one may have to do something like:
 163  *
 164  *              long double t,w,r_head, r_tail;
 165  *              t = (long double)y[2] + (long double)y[1];
 166  *              w = (long double)y[0];
 167  *              r_head = t+w;
 168  *              r_tail = w - (r_head - t);
 169  *
 170  *      e0      The exponent of x[0]
 171  *
 172  *      nx      dimension of x[]
 173  *
 174  *      prec    an interger indicating the precision:
 175  *                      0       24  bits (single)
 176  *                      1       53  bits (double)
 177  *                      2       64  bits (extended)
 178  *                      3       113 bits (quad)
 179  *
 180  *      ipio2[]
 181  *              integer array, contains the (24*i)-th to (24*i+23)-th
 182  *              bit of 2/pi after binary point. The corresponding
 183  *              floating value is
 184  *
 185  *                      ipio2[i] * 2^(-24(i+1)).
 186  *
 187  * External function:
 188  *      double scalbn(), floor();
 189  *
 190  *
 191  * Here is the description of some local variables:
 192  *
 193  *      jk      jk+1 is the initial number of terms of ipio2[] needed
 194  *              in the computation. The recommended value is 2,3,4,
 195  *              6 for single, double, extended,and quad.
 196  *
 197  *      jz      local integer variable indicating the number of
 198  *              terms of ipio2[] used.
 199  *
 200  *      jx      nx - 1
 201  *
 202  *      jv      index for pointing to the suitable ipio2[] for the
 203  *              computation. In general, we want
 204  *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
 205  *              is an integer. Thus
 206  *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
 207  *              Hence jv = max(0,(e0-3)/24).
 208  *
 209  *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
 210  *
 211  *      q[]     double array with integral value, representing the
 212  *              24-bits chunk of the product of x and 2/pi.
 213  *
 214  *      q0      the corresponding exponent of q[0]. Note that the
 215  *              exponent for q[i] would be q0-24*i.
 216  *
 217  *      PIo2[]  double precision array, obtained by cutting pi/2
 218  *              into 24 bits chunks.
 219  *
 220  *      f[]     ipio2[] in floating point
 221  *
 222  *      iq[]    integer array by breaking up q[] in 24-bits chunk.
 223  *
 224  *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
 225  *
 226  *      ih      integer. If >0 it indicats q[] is >= 0.5, hence
 227  *              it also indicates the *sign* of the result.
 228  *
 229  */
 230 
 231 
 232 /*
 233  * Constants:
 234  * The hexadecimal values are the intended ones for the following
 235  * constants. The decimal values may be used, provided that the
 236  * compiler will convert from decimal to binary accurately enough
 237  * to produce the hexadecimal values shown.
 238  */
 239 
 240 
 241 static const int init_jk[] = {2,3,4,6}; /* initial value for jk */
 242 
 243 static const double PIo2[] = {
 244   1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
 245   7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
 246   5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
 247   3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
 248   1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
 249   1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
 250   2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
 251   2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
 252 };
 253 
 254 static const double
 255 zeroB   = 0.0,
 256 one     = 1.0,
 257 two24B  = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
 258 twon24  = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
 259 
 260 static SAFEBUF int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int *ipio2) {
 261   int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
 262   double z,fw,f[20],fq[20],q[20];
 263 
 264   /* initialize jk*/
 265   jk = init_jk[prec];
 266   jp = jk;
 267 
 268   /* determine jx,jv,q0, note that 3>q0 */
 269   jx =  nx-1;
 270   jv = (e0-3)/24; if(jv<0) jv=0;
 271   q0 =  e0-24*(jv+1);
 272 
 273   /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
 274   j = jv-jx; m = jx+jk;
 275   for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : (double) ipio2[j];
 276 
 277   /* compute q[0],q[1],...q[jk] */
 278   for (i=0;i<=jk;i++) {
 279     for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
 280   }
 281 
 282   jz = jk;
 283 recompute:
 284   /* distill q[] into iq[] reversingly */
 285   for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
 286     fw    =  (double)((int)(twon24* z));
 287     iq[i] =  (int)(z-two24B*fw);
 288     z     =  q[j-1]+fw;
 289   }
 290 
 291   /* compute n */
 292   z  = scalbnA(z,q0);           /* actual value of z */
 293   z -= 8.0*floor(z*0.125);              /* trim off integer >= 8 */
 294   n  = (int) z;
 295   z -= (double)n;
 296   ih = 0;
 297   if(q0>0) {    /* need iq[jz-1] to determine n */
 298     i  = (iq[jz-1]>>(24-q0)); n += i;
 299     iq[jz-1] -= i<<(24-q0);
 300     ih = iq[jz-1]>>(23-q0);
 301   }
 302   else if(q0==0) ih = iq[jz-1]>>23;
 303   else if(z>=0.5) ih=2;
 304 
 305   if(ih>0) {    /* q > 0.5 */
 306     n += 1; carry = 0;
 307     for(i=0;i<jz ;i++) {        /* compute 1-q */
 308       j = iq[i];
 309       if(carry==0) {
 310         if(j!=0) {
 311           carry = 1; iq[i] = 0x1000000- j;
 312         }
 313       } else  iq[i] = 0xffffff - j;
 314     }
 315     if(q0>0) {          /* rare case: chance is 1 in 12 */
 316       switch(q0) {
 317       case 1:
 318         iq[jz-1] &= 0x7fffff; break;
 319       case 2:
 320         iq[jz-1] &= 0x3fffff; break;
 321       }
 322     }
 323     if(ih==2) {
 324       z = one - z;
 325       if(carry!=0) z -= scalbnA(one,q0);
 326     }
 327   }
 328 
 329   /* check if recomputation is needed */
 330   if(z==zeroB) {
 331     j = 0;
 332     for (i=jz-1;i>=jk;i--) j |= iq[i];
 333     if(j==0) { /* need recomputation */
 334       for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
 335 
 336       for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
 337         f[jx+i] = (double) ipio2[jv+i];
 338         for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
 339         q[i] = fw;
 340       }
 341       jz += k;
 342       goto recompute;
 343     }
 344   }
 345 
 346   /* chop off zero terms */
 347   if(z==0.0) {
 348     jz -= 1; q0 -= 24;
 349     while(iq[jz]==0) { jz--; q0-=24;}
 350   } else { /* break z into 24-bit if neccessary */
 351     z = scalbnA(z,-q0);
 352     if(z>=two24B) {
 353       fw = (double)((int)(twon24*z));
 354       iq[jz] = (int)(z-two24B*fw);
 355       jz += 1; q0 += 24;
 356       iq[jz] = (int) fw;
 357     } else iq[jz] = (int) z ;
 358   }
 359 
 360   /* convert integer "bit" chunk to floating-point value */
 361   fw = scalbnA(one,q0);
 362   for(i=jz;i>=0;i--) {
 363     q[i] = fw*(double)iq[i]; fw*=twon24;
 364   }
 365 
 366   /* compute PIo2[0,...,jp]*q[jz,...,0] */
 367   for(i=jz;i>=0;i--) {
 368     for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
 369     fq[jz-i] = fw;
 370   }
 371 
 372   /* compress fq[] into y[] */
 373   switch(prec) {
 374   case 0:
 375     fw = 0.0;
 376     for (i=jz;i>=0;i--) fw += fq[i];
 377     y[0] = (ih==0)? fw: -fw;
 378     break;
 379   case 1:
 380   case 2:
 381     fw = 0.0;
 382     for (i=jz;i>=0;i--) fw += fq[i];
 383     y[0] = (ih==0)? fw: -fw;
 384     fw = fq[0]-fw;
 385     for (i=1;i<=jz;i++) fw += fq[i];
 386     y[1] = (ih==0)? fw: -fw;
 387     break;
 388   case 3:       /* painful */
 389     for (i=jz;i>0;i--) {
 390       fw      = fq[i-1]+fq[i];
 391       fq[i]  += fq[i-1]-fw;
 392       fq[i-1] = fw;
 393     }
 394     for (i=jz;i>1;i--) {
 395       fw      = fq[i-1]+fq[i];
 396       fq[i]  += fq[i-1]-fw;
 397       fq[i-1] = fw;
 398     }
 399     for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
 400     if(ih==0) {
 401       y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
 402     } else {
 403       y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
 404     }
 405   }
 406   return n&7;
 407 }
 408 
 409 
 410 /*
 411  * ====================================================
 412  * Copyright (c) 1993 Oracle and/or its affilates. All rights reserved.
 413  *
 414  * Developed at SunPro, a Sun Microsystems, Inc. business.
 415  * Permission to use, copy, modify, and distribute this
 416  * software is freely granted, provided that this notice
 417  * is preserved.
 418  * ====================================================
 419  *
 420  */
 421 
 422 /* __ieee754_rem_pio2(x,y)
 423  *
 424  * return the remainder of x rem pi/2 in y[0]+y[1]
 425  * use __kernel_rem_pio2()
 426  */
 427 
 428 /*
 429  * Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
 430  */
 431 static const int two_over_pi[] = {
 432   0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
 433   0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
 434   0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
 435   0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
 436   0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
 437   0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
 438   0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
 439   0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
 440   0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
 441   0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
 442   0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
 443 };
 444 
 445 static const int npio2_hw[] = {
 446   0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
 447   0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
 448   0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
 449   0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
 450   0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB,
 451   0x404858EB, 0x404921FB,
 452 };
 453 
 454 /*
 455  * invpio2:  53 bits of 2/pi
 456  * pio2_1:   first  33 bit of pi/2
 457  * pio2_1t:  pi/2 - pio2_1
 458  * pio2_2:   second 33 bit of pi/2
 459  * pio2_2t:  pi/2 - (pio2_1+pio2_2)
 460  * pio2_3:   third  33 bit of pi/2
 461  * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
 462  */
 463 
 464 static const double
 465 zeroA =  0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
 466 half =  5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
 467 two24A =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
 468 invpio2 =  6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 469 pio2_1  =  1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
 470 pio2_1t =  6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
 471 pio2_2  =  6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
 472 pio2_2t =  2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
 473 pio2_3  =  2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
 474 pio2_3t =  8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
 475 
 476 static SAFEBUF int __ieee754_rem_pio2(double x, double *y) {
 477   double z,w,t,r,fn;
 478   double tx[3];
 479   int e0,i,j,nx,n,ix,hx,i0;
 480 
 481   i0 = ((*(int*)&two24A)>>30)^1;        /* high word index */
 482   hx = *(i0+(int*)&x);          /* high word of x */
 483   ix = hx&0x7fffffff;
 484   if(ix<=0x3fe921fb)   /* |x| ~<= pi/4 , no need for reduction */
 485     {y[0] = x; y[1] = 0; return 0;}
 486   if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
 487     if(hx>0) {
 488       z = x - pio2_1;
 489       if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
 490         y[0] = z - pio2_1t;
 491         y[1] = (z-y[0])-pio2_1t;
 492       } else {                /* near pi/2, use 33+33+53 bit pi */
 493         z -= pio2_2;
 494         y[0] = z - pio2_2t;
 495         y[1] = (z-y[0])-pio2_2t;
 496       }
 497       return 1;
 498     } else {    /* negative x */
 499       z = x + pio2_1;
 500       if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
 501         y[0] = z + pio2_1t;
 502         y[1] = (z-y[0])+pio2_1t;
 503       } else {                /* near pi/2, use 33+33+53 bit pi */
 504         z += pio2_2;
 505         y[0] = z + pio2_2t;
 506         y[1] = (z-y[0])+pio2_2t;
 507       }
 508       return -1;
 509     }
 510   }
 511   if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
 512     t  = fabsd(x);
 513     n  = (int) (t*invpio2+half);
 514     fn = (double)n;
 515     r  = t-fn*pio2_1;
 516     w  = fn*pio2_1t;    /* 1st round good to 85 bit */
 517     if(n<32&&ix!=npio2_hw[n-1]) {
 518       y[0] = r-w;       /* quick check no cancellation */
 519     } else {
 520       j  = ix>>20;
 521       y[0] = r-w;
 522       i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
 523       if(i>16) {  /* 2nd iteration needed, good to 118 */
 524         t  = r;
 525         w  = fn*pio2_2;
 526         r  = t-w;
 527         w  = fn*pio2_2t-((t-r)-w);
 528         y[0] = r-w;
 529         i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
 530         if(i>49)  {     /* 3rd iteration need, 151 bits acc */
 531           t  = r;       /* will cover all possible cases */
 532           w  = fn*pio2_3;
 533           r  = t-w;
 534           w  = fn*pio2_3t-((t-r)-w);
 535           y[0] = r-w;
 536         }
 537       }
 538     }
 539     y[1] = (r-y[0])-w;
 540     if(hx<0)    {y[0] = -y[0]; y[1] = -y[1]; return -n;}
 541     else         return n;
 542   }
 543   /*
 544    * all other (large) arguments
 545    */
 546   if(ix>=0x7ff00000) {          /* x is inf or NaN */
 547     y[0]=y[1]=x-x; return 0;
 548   }
 549   /* set z = scalbn(|x|,ilogb(x)-23) */
 550   *(1-i0+(int*)&z) = *(1-i0+(int*)&x);
 551   e0    = (ix>>20)-1046;        /* e0 = ilogb(z)-23; */
 552   *(i0+(int*)&z) = ix - (e0<<20);
 553   for(i=0;i<2;i++) {
 554     tx[i] = (double)((int)(z));
 555     z     = (z-tx[i])*two24A;
 556   }
 557   tx[2] = z;
 558   nx = 3;
 559   while(tx[nx-1]==zeroA) nx--;  /* skip zero term */
 560   n  =  __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
 561   if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
 562   return n;
 563 }
 564 
 565 
 566 /* __kernel_sin( x, y, iy)
 567  * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
 568  * Input x is assumed to be bounded by ~pi/4 in magnitude.
 569  * Input y is the tail of x.
 570  * Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
 571  *
 572  * Algorithm
 573  *      1. Since sin(-x) = -sin(x), we need only to consider positive x.
 574  *      2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
 575  *      3. sin(x) is approximated by a polynomial of degree 13 on
 576  *         [0,pi/4]
 577  *                               3            13
 578  *              sin(x) ~ x + S1*x + ... + S6*x
 579  *         where
 580  *
 581  *      |sin(x)         2     4     6     8     10     12  |     -58
 582  *      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
 583  *      |  x                                               |
 584  *
 585  *      4. sin(x+y) = sin(x) + sin'(x')*y
 586  *                  ~ sin(x) + (1-x*x/2)*y
 587  *         For better accuracy, let
 588  *                   3      2      2      2      2
 589  *              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
 590  *         then                   3    2
 591  *              sin(x) = x + (S1*x + (x *(r-y/2)+y))
 592  */
 593 
 594 static const double
 595 S1  = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
 596 S2  =  8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
 597 S3  = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
 598 S4  =  2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
 599 S5  = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
 600 S6  =  1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
 601 
 602 static double __kernel_sin(double x, double y, int iy)
 603 {
 604         double z,r,v;
 605         int ix;
 606         ix = __HI(x)&0x7fffffff;        /* high word of x */
 607         if(ix<0x3e400000)                       /* |x| < 2**-27 */
 608            {if((int)x==0) return x;}            /* generate inexact */
 609         z       =  x*x;
 610         v       =  z*x;
 611         r       =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
 612         if(iy==0) return x+v*(S1+z*r);
 613         else      return x-((z*(half*y-v*r)-y)-v*S1);
 614 }
 615 
 616 /*
 617  * __kernel_cos( x,  y )
 618  * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
 619  * Input x is assumed to be bounded by ~pi/4 in magnitude.
 620  * Input y is the tail of x.
 621  *
 622  * Algorithm
 623  *      1. Since cos(-x) = cos(x), we need only to consider positive x.
 624  *      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
 625  *      3. cos(x) is approximated by a polynomial of degree 14 on
 626  *         [0,pi/4]
 627  *                                       4            14
 628  *              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
 629  *         where the remez error is
 630  *
 631  *      |              2     4     6     8     10    12     14 |     -58
 632  *      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
 633  *      |                                                      |
 634  *
 635  *                     4     6     8     10    12     14
 636  *      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
 637  *             cos(x) = 1 - x*x/2 + r
 638  *         since cos(x+y) ~ cos(x) - sin(x)*y
 639  *                        ~ cos(x) - x*y,
 640  *         a correction term is necessary in cos(x) and hence
 641  *              cos(x+y) = 1 - (x*x/2 - (r - x*y))
 642  *         For better accuracy when x > 0.3, let qx = |x|/4 with
 643  *         the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125.
 644  *         Then
 645  *              cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
 646  *         Note that 1-qx and (x*x/2-qx) is EXACT here, and the
 647  *         magnitude of the latter is at least a quarter of x*x/2,
 648  *         thus, reducing the rounding error in the subtraction.
 649  */
 650 
 651 static const double
 652 C1  =  4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
 653 C2  = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
 654 C3  =  2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
 655 C4  = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
 656 C5  =  2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
 657 C6  = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
 658 
 659 static double __kernel_cos(double x, double y)
 660 {
 661   double a,hz,z,r,qx;
 662   int ix;
 663   ix = __HI(x)&0x7fffffff;      /* ix = |x|'s high word*/
 664   if(ix<0x3e400000) {                   /* if x < 2**27 */
 665     if(((int)x)==0) return one;         /* generate inexact */
 666   }
 667   z  = x*x;
 668   r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
 669   if(ix < 0x3FD33333)                   /* if |x| < 0.3 */
 670     return one - (0.5*z - (z*r - x*y));
 671   else {
 672     if(ix > 0x3fe90000) {               /* x > 0.78125 */
 673       qx = 0.28125;
 674     } else {
 675       __HI(qx) = ix-0x00200000; /* x/4 */
 676       __LO(qx) = 0;
 677     }
 678     hz = 0.5*z-qx;
 679     a  = one-qx;
 680     return a - (hz - (z*r-x*y));
 681   }
 682 }
 683 
 684 /* __kernel_tan( x, y, k )
 685  * kernel tan function on [-pi/4, pi/4], pi/4 ~ 0.7854
 686  * Input x is assumed to be bounded by ~pi/4 in magnitude.
 687  * Input y is the tail of x.
 688  * Input k indicates whether tan (if k=1) or
 689  * -1/tan (if k= -1) is returned.
 690  *
 691  * Algorithm
 692  *      1. Since tan(-x) = -tan(x), we need only to consider positive x.
 693  *      2. if x < 2^-28 (hx<0x3e300000 0), return x with inexact if x!=0.
 694  *      3. tan(x) is approximated by a odd polynomial of degree 27 on
 695  *         [0,0.67434]
 696  *                               3             27
 697  *              tan(x) ~ x + T1*x + ... + T13*x
 698  *         where
 699  *
 700  *              |tan(x)         2     4            26   |     -59.2
 701  *              |----- - (1+T1*x +T2*x +.... +T13*x    )| <= 2
 702  *              |  x                                    |
 703  *
 704  *         Note: tan(x+y) = tan(x) + tan'(x)*y
 705  *                        ~ tan(x) + (1+x*x)*y
 706  *         Therefore, for better accuracy in computing tan(x+y), let
 707  *                   3      2      2       2       2
 708  *              r = x *(T2+x *(T3+x *(...+x *(T12+x *T13))))
 709  *         then
 710  *                                  3    2
 711  *              tan(x+y) = x + (T1*x + (x *(r+y)+y))
 712  *
 713  *      4. For x in [0.67434,pi/4],  let y = pi/4 - x, then
 714  *              tan(x) = tan(pi/4-y) = (1-tan(y))/(1+tan(y))
 715  *                     = 1 - 2*(tan(y) - (tan(y)^2)/(1+tan(y)))
 716  */
 717 
 718 static const double
 719 pio4  =  7.85398163397448278999e-01, /* 0x3FE921FB, 0x54442D18 */
 720 pio4lo=  3.06161699786838301793e-17, /* 0x3C81A626, 0x33145C07 */
 721 T[] =  {
 722   3.33333333333334091986e-01, /* 0x3FD55555, 0x55555563 */
 723   1.33333333333201242699e-01, /* 0x3FC11111, 0x1110FE7A */
 724   5.39682539762260521377e-02, /* 0x3FABA1BA, 0x1BB341FE */
 725   2.18694882948595424599e-02, /* 0x3F9664F4, 0x8406D637 */
 726   8.86323982359930005737e-03, /* 0x3F8226E3, 0xE96E8493 */
 727   3.59207910759131235356e-03, /* 0x3F6D6D22, 0xC9560328 */
 728   1.45620945432529025516e-03, /* 0x3F57DBC8, 0xFEE08315 */
 729   5.88041240820264096874e-04, /* 0x3F4344D8, 0xF2F26501 */
 730   2.46463134818469906812e-04, /* 0x3F3026F7, 0x1A8D1068 */
 731   7.81794442939557092300e-05, /* 0x3F147E88, 0xA03792A6 */
 732   7.14072491382608190305e-05, /* 0x3F12B80F, 0x32F0A7E9 */
 733  -1.85586374855275456654e-05, /* 0xBEF375CB, 0xDB605373 */
 734   2.59073051863633712884e-05, /* 0x3EFB2A70, 0x74BF7AD4 */
 735 };
 736 
 737 static double __kernel_tan(double x, double y, int iy)
 738 {
 739   double z,r,v,w,s;
 740   int ix,hx;
 741   hx = __HI(x);   /* high word of x */
 742   ix = hx&0x7fffffff;     /* high word of |x| */
 743   if(ix<0x3e300000) {                     /* x < 2**-28 */
 744     if((int)x==0) {                       /* generate inexact */
 745       if (((ix | __LO(x)) | (iy + 1)) == 0)
 746         return one / fabsd(x);
 747       else {
 748         if (iy == 1)
 749           return x;
 750         else {    /* compute -1 / (x+y) carefully */
 751           double a, t;
 752 
 753           z = w = x + y;
 754           __LO(z) = 0;
 755           v = y - (z - x);
 756           t = a = -one / w;
 757           __LO(t) = 0;
 758           s = one + t * z;
 759           return t + a * (s + t * v);
 760         }
 761       }
 762     }
 763   }
 764   if(ix>=0x3FE59428) {                    /* |x|>=0.6744 */
 765     if(hx<0) {x = -x; y = -y;}
 766     z = pio4-x;
 767     w = pio4lo-y;
 768     x = z+w; y = 0.0;
 769   }
 770   z       =  x*x;
 771   w       =  z*z;
 772   /* Break x^5*(T[1]+x^2*T[2]+...) into
 773    *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
 774    *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
 775    */
 776   r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
 777   v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
 778   s = z*x;
 779   r = y + z*(s*(r+v)+y);
 780   r += T[0]*s;
 781   w = x+r;
 782   if(ix>=0x3FE59428) {
 783     v = (double)iy;
 784     return (double)(1-((hx>>30)&2))*(v-2.0*(x-(w*w/(w+v)-r)));
 785   }
 786   if(iy==1) return w;
 787   else {          /* if allow error up to 2 ulp,
 788                      simply return -1.0/(x+r) here */
 789     /*  compute -1.0/(x+r) accurately */
 790     double a,t;
 791     z  = w;
 792     __LO(z) = 0;
 793     v  = r-(z - x);     /* z+v = r+x */
 794     t = a  = -1.0/w;    /* a = -1.0/w */
 795     __LO(t) = 0;
 796     s  = 1.0+t*z;
 797     return t+a*(s+t*v);
 798   }
 799 }
 800 
 801 
 802 //----------------------------------------------------------------------
 803 //
 804 // Routines for new sin/cos implementation
 805 //
 806 //----------------------------------------------------------------------
 807 
 808 /* sin(x)
 809  * Return sine function of x.
 810  *
 811  * kernel function:
 812  *      __kernel_sin            ... sine function on [-pi/4,pi/4]
 813  *      __kernel_cos            ... cose function on [-pi/4,pi/4]
 814  *      __ieee754_rem_pio2      ... argument reduction routine
 815  *
 816  * Method.
 817  *      Let S,C and T denote the sin, cos and tan respectively on
 818  *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
 819  *      in [-pi/4 , +pi/4], and let n = k mod 4.
 820  *      We have
 821  *
 822  *          n        sin(x)      cos(x)        tan(x)
 823  *     ----------------------------------------------------------
 824  *          0          S           C             T
 825  *          1          C          -S            -1/T
 826  *          2         -S          -C             T
 827  *          3         -C           S            -1/T
 828  *     ----------------------------------------------------------
 829  *
 830  * Special cases:
 831  *      Let trig be any of sin, cos, or tan.
 832  *      trig(+-INF)  is NaN, with signals;
 833  *      trig(NaN)    is that NaN;
 834  *
 835  * Accuracy:
 836  *      TRIG(x) returns trig(x) nearly rounded
 837  */
 838 
 839 JRT_LEAF(jdouble, SharedRuntime::dsin(jdouble x))
 840   double y[2],z=0.0;
 841   int n, ix;
 842 
 843   /* High word of x. */
 844   ix = __HI(x);
 845 
 846   /* |x| ~< pi/4 */
 847   ix &= 0x7fffffff;
 848   if(ix <= 0x3fe921fb) return __kernel_sin(x,z,0);
 849 
 850   /* sin(Inf or NaN) is NaN */
 851   else if (ix>=0x7ff00000) return x-x;
 852 
 853   /* argument reduction needed */
 854   else {
 855     n = __ieee754_rem_pio2(x,y);
 856     switch(n&3) {
 857     case 0: return  __kernel_sin(y[0],y[1],1);
 858     case 1: return  __kernel_cos(y[0],y[1]);
 859     case 2: return -__kernel_sin(y[0],y[1],1);
 860     default:
 861       return -__kernel_cos(y[0],y[1]);
 862     }
 863   }
 864 JRT_END
 865 
 866 /* cos(x)
 867  * Return cosine function of x.
 868  *
 869  * kernel function:
 870  *      __kernel_sin            ... sine function on [-pi/4,pi/4]
 871  *      __kernel_cos            ... cosine function on [-pi/4,pi/4]
 872  *      __ieee754_rem_pio2      ... argument reduction routine
 873  *
 874  * Method.
 875  *      Let S,C and T denote the sin, cos and tan respectively on
 876  *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
 877  *      in [-pi/4 , +pi/4], and let n = k mod 4.
 878  *      We have
 879  *
 880  *          n        sin(x)      cos(x)        tan(x)
 881  *     ----------------------------------------------------------
 882  *          0          S           C             T
 883  *          1          C          -S            -1/T
 884  *          2         -S          -C             T
 885  *          3         -C           S            -1/T
 886  *     ----------------------------------------------------------
 887  *
 888  * Special cases:
 889  *      Let trig be any of sin, cos, or tan.
 890  *      trig(+-INF)  is NaN, with signals;
 891  *      trig(NaN)    is that NaN;
 892  *
 893  * Accuracy:
 894  *      TRIG(x) returns trig(x) nearly rounded
 895  */
 896 
 897 JRT_LEAF(jdouble, SharedRuntime::dcos(jdouble x))
 898   double y[2],z=0.0;
 899   int n, ix;
 900 
 901   /* High word of x. */
 902   ix = __HI(x);
 903 
 904   /* |x| ~< pi/4 */
 905   ix &= 0x7fffffff;
 906   if(ix <= 0x3fe921fb) return __kernel_cos(x,z);
 907 
 908   /* cos(Inf or NaN) is NaN */
 909   else if (ix>=0x7ff00000) return x-x;
 910 
 911   /* argument reduction needed */
 912   else {
 913     n = __ieee754_rem_pio2(x,y);
 914     switch(n&3) {
 915     case 0: return  __kernel_cos(y[0],y[1]);
 916     case 1: return -__kernel_sin(y[0],y[1],1);
 917     case 2: return -__kernel_cos(y[0],y[1]);
 918     default:
 919       return  __kernel_sin(y[0],y[1],1);
 920     }
 921   }
 922 JRT_END
 923 
 924 /* tan(x)
 925  * Return tangent function of x.
 926  *
 927  * kernel function:
 928  *      __kernel_tan            ... tangent function on [-pi/4,pi/4]
 929  *      __ieee754_rem_pio2      ... argument reduction routine
 930  *
 931  * Method.
 932  *      Let S,C and T denote the sin, cos and tan respectively on
 933  *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
 934  *      in [-pi/4 , +pi/4], and let n = k mod 4.
 935  *      We have
 936  *
 937  *          n        sin(x)      cos(x)        tan(x)
 938  *     ----------------------------------------------------------
 939  *          0          S           C             T
 940  *          1          C          -S            -1/T
 941  *          2         -S          -C             T
 942  *          3         -C           S            -1/T
 943  *     ----------------------------------------------------------
 944  *
 945  * Special cases:
 946  *      Let trig be any of sin, cos, or tan.
 947  *      trig(+-INF)  is NaN, with signals;
 948  *      trig(NaN)    is that NaN;
 949  *
 950  * Accuracy:
 951  *      TRIG(x) returns trig(x) nearly rounded
 952  */
 953 
 954 JRT_LEAF(jdouble, SharedRuntime::dtan(jdouble x))
 955   double y[2],z=0.0;
 956   int n, ix;
 957 
 958   /* High word of x. */
 959   ix = __HI(x);
 960 
 961   /* |x| ~< pi/4 */
 962   ix &= 0x7fffffff;
 963   if(ix <= 0x3fe921fb) return __kernel_tan(x,z,1);
 964 
 965   /* tan(Inf or NaN) is NaN */
 966   else if (ix>=0x7ff00000) return x-x;            /* NaN */
 967 
 968   /* argument reduction needed */
 969   else {
 970     n = __ieee754_rem_pio2(x,y);
 971     return __kernel_tan(y[0],y[1],1-((n&1)<<1)); /*   1 -- n even
 972                                                      -1 -- n odd */
 973   }
 974 JRT_END
 975 
 976 
 977 #ifdef WIN32
 978 # pragma optimize ( "", on )
 979 #endif