1 /*
   2 * Copyright (c) 2016, Intel Corporation.
   3 * Intel Math Library (LIBM) Source Code
   4 *
   5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6 *
   7 * This code is free software; you can redistribute it and/or modify it
   8 * under the terms of the GNU General Public License version 2 only, as
   9 * published by the Free Software Foundation.
  10 *
  11 * This code is distributed in the hope that it will be useful, but WITHOUT
  12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 * version 2 for more details (a copy is included in the LICENSE file that
  15 * accompanied this code).
  16 *
  17 * You should have received a copy of the GNU General Public License version
  18 * 2 along with this work; if not, write to the Free Software Foundation,
  19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20 *
  21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22 * or visit www.oracle.com if you need additional information or have any
  23 * questions.
  24 *
  25 */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/assembler.hpp"
  29 #include "asm/assembler.inline.hpp"
  30 #include "runtime/stubRoutines.hpp"
  31 #include "macroAssembler_x86.hpp"
  32 
  33 #ifdef _MSC_VER
       // ALIGNED_(x) requests x-byte alignment for the constant tables
       // declared in this file. This is the MSVC spelling.
  34 #define ALIGNED_(x) __declspec(align(x))
  35 #else
       // GNU attribute syntax, used for every compiler other than MSVC.
  36 #define ALIGNED_(x) __attribute__ ((aligned(x)))
  37 #endif
  38 
  39 /******************************************************************************/
  40 //                     ALGORITHM DESCRIPTION - COS()
  41 //                     ---------------------
  42 //
  43 //     1. RANGE REDUCTION
  44 //
  45 //     We perform an initial range reduction from X to r with
  46 //
  47 //          X =~= N * pi/32 + r
  48 //
  49 //     so that |r| <= pi/64 + epsilon. We restrict inputs to those
  50 //     where |N| <= 932560. Beyond this, the range reduction is
  51 //     insufficiently accurate. For extremely small inputs,
  52 //     denormalization can occur internally, impacting performance.
  53 //     This means that the main path is actually only taken for
  54 //     2^-252 <= |X| < 90112.
  55 //
  56 //     To avoid branches, we perform the range reduction to full
  57 //     accuracy each time.
  58 //
  59 //          X - N * (P_1 + P_2 + P_3)
  60 //
  61 //     where P_1 and P_2 are 32-bit numbers (so multiplication by N
  62 //     is exact) and P_3 is a 53-bit number. Together, these
  63 //     approximate pi well enough for all cases in the restricted
  64 //     range.
  65 //
  66 //     The main reduction sequence is:
  67 //
  68 //             y = 32/pi * x
  69 //             N = integer(y)
  70 //     (computed by adding and subtracting off SHIFTER)
  71 //
  72 //             m_1 = N * P_1
  73 //             m_2 = N * P_2
  74 //             r_1 = x - m_1
  75 //             r = r_1 - m_2
  76 //     (this r can be used for most of the calculation)
  77 //
  78 //             c_1 = r_1 - r
  79 //             m_3 = N * P_3
  80 //             c_2 = c_1 - m_2
  81 //             c = c_2 - m_3
  82 //
  83 //     2. MAIN ALGORITHM
  84 //
  85 //     The algorithm uses a table lookup based on B = M * pi / 32
  86 //     where M = N mod 64. The stored values are:
  87 //       sigma             closest power of 2 to cos(B)
  88 //       C_hl              53-bit cos(B) - sigma
  89 //       S_hi + S_lo       2 * 53-bit sin(B)
  90 //
  91 //     The computation is organized as follows:
  92 //
  93 //          sin(B + r + c) = [sin(B) + sigma * r] +
  94 //                           r * (cos(B) - sigma) +
  95 //                           sin(B) * [cos(r + c) - 1] +
  96 //                           cos(B) * [sin(r + c) - r]
  97 //
  98 //     which is approximately:
  99 //
 100 //          [S_hi + sigma * r] +
 101 //          C_hl * r +
 102 //          S_lo + S_hi * [(cos(r) - 1) - r * c] +
 103 //          (C_hl + sigma) * [(sin(r) - r) + c]
 104 //
 105 //     and this is what is actually computed. We separate this sum
 106 //     into four parts:
 107 //
 108 //          hi + med + pols + corr
 109 //
 110 //     where
 111 //
 112 //          hi       = S_hi + sigma r
 113 //          med      = C_hl * r
 114 //          pols     = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
 115 //          corr     = S_lo + c * ((C_hl + sigma) - S_hi * r)
 116 //
 117 //     3. POLYNOMIAL
 118 //
 119 //     The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
 120 //     (sin(r) - r) can be rearranged freely, since it is quite
 121 //     small, so we exploit parallelism to the fullest.
 122 //
 123 //          psc4       =   SC_4 * r_1
 124 //          msc4       =   psc4 * r
 125 //          r2         =   r * r
 126 //          msc2       =   SC_2 * r2
 127 //          r4         =   r2 * r2
 128 //          psc3       =   SC_3 + msc4
 129 //          psc1       =   SC_1 + msc2
 130 //          msc3       =   r4 * psc3
 131 //          sincospols =   psc1 + msc3
 132 //          pols       =   sincospols *
 133 //                         <S_hi * r^2 | (C_hl + sigma) * r^3>
 134 //
 135 //     4. CORRECTION TERM
 136 //
 137 //     This is where the "c" component of the range reduction is
 138 //     taken into account; recall that just "r" is used for most of
 139 //     the calculation.
 140 //
 141 //          -c   = m_3 - c_2
 142 //          -d   = S_hi * r - (C_hl + sigma)
 143 //          corr = -c * -d + S_lo
 144 //
 145 //     5. COMPENSATED SUMMATIONS
 146 //
 147 //     The two successive compensated summations add up the high
 148 //     and medium parts, leaving just the low parts to add up at
 149 //     the end.
 150 //
 151 //          rs        =  sigma * r
 152 //          res_int   =  S_hi + rs
 153 //          k_0       =  S_hi - res_int
 154 //          k_2       =  k_0 + rs
 155 //          med       =  C_hl * r
 156 //          res_hi    =  res_int + med
 157 //          k_1       =  res_int - res_hi
 158 //          k_3       =  k_1 + med
 159 //
 160 //     6. FINAL SUMMATION
 161 //
 162 //     We now add up all the small parts:
 163 //
 164 //          res_lo = pols(hi) + pols(lo) + corr + k_2 + k_3
 165 //
 166 //     Now the overall result is just:
 167 //
 168 //          res_hi + res_lo
 169 //
 170 //     7. SMALL ARGUMENTS
 171 //
 172 //     Inputs with |X| < 2^-252 are treated specially as
 173 //     1 - |x|.
 174 //
 175 // Special cases:
 176 //  cos(NaN) = quiet NaN, and raise invalid exception
 177 //  cos(INF) = NaN and raise invalid exception
 178 //  cos(0) = 1
 179 //
 180 /******************************************************************************/
 181 
 182 #ifdef _LP64
 183 // The 64 bit code is at most SSE2 compliant
 184 ALIGNED_(8) juint _ONE[] =
       // Bit pattern of the IEEE-754 double 1.0, stored as two 32-bit words
       // (low word first). The 64-bit fast_cos loads it through the ONE
       // address on the tiny-argument path, where the result is 1 - |x|.
 185 {
 186     0x00000000UL, 0x3ff00000UL
 187 };
 188 void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register r8, Register r9, Register r10, Register r11) {
       // Emits the 64-bit code sequence for cos(x) described in the algorithm
       // comment at the top of this file.
       //
       // The argument is taken from xmm0 and every exit path leaves the
       // result in xmm0; xmm1-xmm7 are scratch. No return instruction is
       // emitted here.
       //
       // NOTE(review): the body mixes the parameters eax/ecx/edx with the
       // fixed registers rax/rcx/rdx (e.g. cvttsd2sil(edx, ...) followed by
       // addq(rdx, ...)), so callers presumably have to pass rax, rcx and rdx
       // for them - confirm at the call site.
       // NOTE(review): rsi and rdi are overwritten on the large-argument path
       // and are not saved here (only rbx is pushed/popped); presumably the
       // caller preserves them where the ABI requires it - confirm.
 189 
 190   Label L_2TAG_PACKET_0_0_1, L_2TAG_PACKET_1_0_1, L_2TAG_PACKET_2_0_1, L_2TAG_PACKET_3_0_1;
 191   Label L_2TAG_PACKET_4_0_1, L_2TAG_PACKET_5_0_1, L_2TAG_PACKET_6_0_1, L_2TAG_PACKET_7_0_1;
 192   Label L_2TAG_PACKET_8_0_1, L_2TAG_PACKET_9_0_1, L_2TAG_PACKET_10_0_1, L_2TAG_PACKET_11_0_1;
 193   Label L_2TAG_PACKET_12_0_1, L_2TAG_PACKET_13_0_1, B1_2, B1_3, B1_4, B1_5, start;
 194 
 195   assert_different_registers(r8, r9, r10, r11, eax, ecx, edx);
 196 
       // Addresses of the constants used below. All of them come from
       // StubRoutines::x86 except ONE, which is the file-local _ONE array.
 197   address ONEHALF = StubRoutines::x86::_ONEHALF_addr();
 198   address P_2 = StubRoutines::x86::_P_2_addr();
 199   address SC_4 = StubRoutines::x86::_SC_4_addr();
 200   address Ctable = StubRoutines::x86::_Ctable_addr();
 201   address SC_2 = StubRoutines::x86::_SC_2_addr();
 202   address SC_3 = StubRoutines::x86::_SC_3_addr();
 203   address SC_1 = StubRoutines::x86::_SC_1_addr();
 204   address PI_INV_TABLE = StubRoutines::x86::_PI_INV_TABLE_addr();
 205   address PI_4 = (address)StubRoutines::x86::_PI_4_addr();
 206   address PI32INV = (address)StubRoutines::x86::_PI32INV_addr();
 207   address SIGN_MASK = (address)StubRoutines::x86::_SIGN_MASK_addr();
 208   address P_1 = (address)StubRoutines::x86::_P_1_addr();
 209   address P_3 = (address)StubRoutines::x86::_P_3_addr();
 210   address ONE = (address)_ONE;
 211   address NEG_ZERO = (address)StubRoutines::x86::_NEG_ZERO_addr();
 212 
 213   bind(start);
       // Prologue: save rbx and reserve 16 bytes of stack. The argument is
       // spilled to [rsp + 8] so that its high 32 bits can be read back as an
       // integer from [rsp + 12].
 214   push(rbx);
 215   subq(rsp, 16);
 216   movsd(Address(rsp, 8), xmm0);
 217 
 218   bind(B1_2);
 219   movl(eax, Address(rsp, 12));
 220   movq(xmm1, ExternalAddress(PI32INV));    //0x6dc9c883UL, 0x40245f30UL
       // Range check on the exponent and the top four mantissa bits with the
       // sign masked off (mask 0x7fff0000). After subtracting 0x30300000 a
       // single unsigned compare against 0x10c50000 rejects both tiny and
       // large inputs; per the header comment the main path covers
       // 2^-252 <= |x| < 90112.
 221   andl(eax, 2147418112);
 222   subl(eax, 808452096);
 223   cmpl(eax, 281346048);
 224   jcc(Assembler::above, L_2TAG_PACKET_0_0_1);
       // ---- Main path: range reduction (header section 1) ----
 225   mulsd(xmm1, xmm0);                       // y = x * 32/pi
 226   movdqu(xmm5, ExternalAddress(ONEHALF));    //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
 227   movq(xmm4, ExternalAddress(SIGN_MASK));    //0x00000000UL, 0x80000000UL
 228   pand(xmm4, xmm0);                        // sign bit of x
 229   por(xmm5, xmm4);                         // 0.5 carrying the sign of x
 230   addpd(xmm1, xmm5);
 231   cvttsd2sil(edx, xmm1);                   // N = trunc(y +/- 0.5)
 232   cvtsi2sdl(xmm1, edx);                    // N as a double
 233   movdqu(xmm2, ExternalAddress(P_2));    //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
 234   movq(xmm3, ExternalAddress(P_1));    //0x54400000UL, 0x3fb921fbUL
 235   mulsd(xmm3, xmm1);                       // m_1 = N * P_1
 236   unpcklpd(xmm1, xmm1);                    // N in both lanes
       // Table index: 1865232 is congruent to 16 modulo 64, so after the
       // mask the index is (N + 16) mod 64 (16 steps of pi/32 are pi/2).
       // Each Ctable entry is 32 bytes, hence the shift by 5.
 237   addq(rdx, 1865232);
 238   movdqu(xmm4, xmm0);
 239   andq(rdx, 63);
 240   movdqu(xmm5, ExternalAddress(SC_4));    //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
 241   lea(rax, ExternalAddress(Ctable));
 242   shlq(rdx, 5);
 243   addq(rax, rdx);                          // rax -> selected Ctable entry
       // The entry's doubles at offsets 0, 8, 16 and 24 are used below in the
       // roles the header calls C_hl, S_hi, S_lo and sigma respectively.
 244   mulpd(xmm2, xmm1);                       // m_2 = N * P_2 (both lanes)
 245   subsd(xmm0, xmm3);                       // r_1 = x - m_1
 246   mulsd(xmm1, ExternalAddress(P_3));    //0x2e037073UL, 0x3b63198aUL
 247   subsd(xmm4, xmm3);                       // r_1 (second copy)
 248   movq(xmm7, Address(rax, 8));             // S_hi
 249   unpcklpd(xmm0, xmm0);                    // r_1 in both lanes
 250   movdqu(xmm3, xmm4);
 251   subsd(xmm4, xmm2);                       // r = r_1 - m_2
 252   mulpd(xmm5, xmm0);                       // psc4 = SC_4 * r_1
 253   subpd(xmm0, xmm2);                       // r in both lanes
 254   movdqu(xmm6, ExternalAddress(SC_2));    //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
 255   mulsd(xmm7, xmm4);                       // S_hi * r
 256   subsd(xmm3, xmm4);                       // c_1 = r_1 - r
 257   mulpd(xmm5, xmm0);                       // msc4 = psc4 * r
 258   mulpd(xmm0, xmm0);                       // r2 = r * r
 259   subsd(xmm3, xmm2);                       // c_2 = c_1 - m_2
 260   movdqu(xmm2, Address(rax, 0));           // <C_hl | S_hi> (low | high lane)
 261   subsd(xmm1, xmm3);                       // -c = m_3 - c_2
 262   movq(xmm3, Address(rax, 24));            // sigma
 263   addsd(xmm2, xmm3);                       // low lane: C_hl + sigma
 264   subsd(xmm7, xmm2);                       // -d = S_hi * r - (C_hl + sigma)
 265   mulsd(xmm2, xmm4);                       // low lane: (C_hl + sigma) * r
 266   mulpd(xmm6, xmm0);                       // msc2 = SC_2 * r2
 267   mulsd(xmm3, xmm4);                       // rs = sigma * r
 268   mulpd(xmm2, xmm0);                       // <(C_hl + sigma) * r^3 | S_hi * r^2>
 269   mulpd(xmm0, xmm0);                       // r4 = r2 * r2
 270   addpd(xmm5, ExternalAddress(SC_3));    //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
 271   mulsd(xmm4, Address(rax, 0));            // med = C_hl * r
 272   addpd(xmm6, ExternalAddress(SC_1));    //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
 273   mulpd(xmm5, xmm0);                       // msc3 = r4 * psc3
 274   movdqu(xmm0, xmm3);                      // copy of rs
 275   addsd(xmm3, Address(rax, 8));            // res_int = S_hi + rs
 276   mulpd(xmm1, xmm7);                       // low lane: (-c) * (-d)
 277   movdqu(xmm7, xmm4);                      // copy of med
 278   addsd(xmm4, xmm3);                       // res_hi = res_int + med
 279   addpd(xmm6, xmm5);                       // sincospols = psc1 + msc3
 280   movq(xmm5, Address(rax, 8));             // S_hi
 281   subsd(xmm5, xmm3);                       // k_0 = S_hi - res_int
 282   subsd(xmm3, xmm4);                       // k_1 = res_int - res_hi
 283   addsd(xmm1, Address(rax, 16));           // corr = (-c) * (-d) + S_lo
 284   mulpd(xmm6, xmm2);                       // pols, one term per lane
 285   addsd(xmm0, xmm5);                       // k_2 = k_0 + rs
 286   addsd(xmm3, xmm7);                       // k_3 = k_1 + med
       // Final summation: res_lo is accumulated in xmm0, then res_hi is added.
 287   addsd(xmm0, xmm1);                       // k_2 + corr
 288   addsd(xmm0, xmm3);                       //   + k_3
 289   addsd(xmm0, xmm6);                       //   + pols (low lane)
 290   unpckhpd(xmm6, xmm6);
 291   addsd(xmm0, xmm6);                       //   + pols (high lane)
 292   addsd(xmm0, xmm4);                       // result = res_lo + res_hi
 293   jmp(B1_4);
 294 
 295   bind(L_2TAG_PACKET_0_0_1);
       // Out-of-range inputs. The flags are still those of the cmpl above:
       // a signed "greater" outcome sends large arguments to the next
       // section; otherwise the argument is tiny and the result is formed
       // as 1 - |x|.
 296   jcc(Assembler::greater, L_2TAG_PACKET_1_0_1);
 297   pextrw(eax, xmm0, 3);                    // top 16 bits of x
 298   andl(eax, 32767);                        // clear the sign bit
 299   pinsrw(xmm0, eax, 3);                    // xmm0 = |x|
 300   movq(xmm1, ExternalAddress(ONE));    //0x00000000UL, 0x3ff00000UL
 301   subsd(xmm1, xmm0);
 302   movdqu(xmm0, xmm1);
 303   jmp(B1_4);
 304 
 305   bind(L_2TAG_PACKET_1_0_1);
       // Large arguments. An all-ones exponent field (0x7ff0 in the top word)
       // means Inf or NaN, which is handled at L_2TAG_PACKET_2_0_1.
 306   pextrw(eax, xmm0, 3);
 307   andl(eax, 32752);
 308   cmpl(eax, 32752);
 309   jcc(Assembler::equal, L_2TAG_PACKET_2_0_1);
       // Otherwise the exponent selects a 4-byte-aligned byte offset into
       // PI_INV_TABLE, and the significand of x is multiplied by the 32-bit
       // table words at offsets 0..24 from there using 64-bit integer
       // arithmetic.
       // NOTE(review): this looks like a Payne-Hanek style reduction against
       // a table of 1/pi bits - confirm against the table definition, which
       // is not part of this file.
 310   pextrw(ecx, xmm0, 3);
 311   andl(ecx, 32752);
 312   subl(ecx, 16224);
 313   shrl(ecx, 7);
 314   andl(ecx, 65532);
 315   lea(r11, ExternalAddress(PI_INV_TABLE));
 316   addq(rcx, r11);                          // rcx -> first table word used
 317   movdq(rax, xmm0);                        // raw bits of x
 318   movl(r10, Address(rcx, 20));
 319   movl(r8, Address(rcx, 24));
 320   movl(edx, eax);                          // low 32 bits of the significand
 321   shrq(rax, 21);
 322   orl(eax, INT_MIN);                       // make the leading one explicit
 323   shrl(eax, 11);                           // upper 21 bits of the significand
       // Multiply the split significand (rax = high part, rdx = low part) by
       // the table words. Each 64-bit partial product is split into 32-bit
       // halves (movl / shrq 32) that are added into neighbouring limbs; the
       // finished limbs appear to be packed into r8 (least significant),
       // r10, and r9 with r11 (most significant).
 324   movl(r9, r10);
 325   imulq(r10, rdx);
 326   imulq(r9, rax);
 327   imulq(r8, rax);
 328   movl(rsi, Address(rcx, 16));
 329   movl(rdi, Address(rcx, 12));
 330   movl(r11, r10);
 331   shrq(r10, 32);
 332   addq(r9, r10);
 333   addq(r11, r8);
 334   movl(r8, r11);
 335   shrq(r11, 32);
 336   addq(r9, r11);
 337   movl(r10, rsi);
 338   imulq(rsi, rdx);
 339   imulq(r10, rax);
 340   movl(r11, rdi);
 341   imulq(rdi, rdx);
 342   movl(rbx, rsi);
 343   shrq(rsi, 32);
 344   addq(r9, rbx);
 345   movl(rbx, r9);
 346   shrq(r9, 32);
 347   addq(r10, rsi);
 348   addq(r10, r9);
 349   shlq(rbx, 32);
 350   orq(r8, rbx);
 351   imulq(r11, rax);
 352   movl(r9, Address(rcx, 8));
 353   movl(rsi, Address(rcx, 4));
 354   movl(rbx, rdi);
 355   shrq(rdi, 32);
 356   addq(r10, rbx);
 357   movl(rbx, r10);
 358   shrq(r10, 32);
 359   addq(r11, rdi);
 360   addq(r11, r10);
 361   movq(rdi, r9);
 362   imulq(r9, rdx);
 363   imulq(rdi, rax);
 364   movl(r10, r9);
 365   shrq(r9, 32);
 366   addq(r11, r10);
 367   movl(r10, r11);
 368   shrq(r11, 32);
 369   addq(rdi, r9);
 370   addq(rdi, r11);
 371   movq(r9, rsi);
 372   imulq(rsi, rdx);
 373   imulq(r9, rax);
 374   shlq(r10, 32);
 375   orq(r10, rbx);
 376   movl(eax, Address(rcx, 0));
 377   movl(r11, rsi);
 378   shrq(rsi, 32);
 379   addq(rdi, r11);
 380   movl(r11, rdi);
 381   shrq(rdi, 32);
 382   addq(r9, rsi);
 383   addq(r9, rdi);
 384   imulq(rdx, rax);
       // Derive the shift count: ecx = 8 * (table byte offset) + 19 minus the
       // unbiased exponent of x. rsi keeps the sign bit of x (bit 15 of the
       // top word) and rbx becomes the unbiased exponent.
 385   pextrw(rbx, xmm0, 3);
 386   lea(rdi, ExternalAddress(PI_INV_TABLE));
 387   subq(rcx, rdi);
 388   addl(ecx, ecx);
 389   addl(ecx, ecx);
 390   addl(ecx, ecx);
 391   addl(ecx, 19);
 392   movl(rsi, 32768);
 393   andl(rsi, rbx);
 394   shrl(rbx, 4);
 395   andl(rbx, 2047);
 396   subl(rbx, 1023);
 397   subl(ecx, rbx);
 398   addq(r9, rdx);
 399   movl(edx, ecx);
 400   addl(edx, 32);
 401   cmpl(ecx, 1);
 402   jcc(Assembler::less, L_2TAG_PACKET_3_0_1);
       // NOTE(review): the single-operand shll/shrl/shlq/shrq forms used from
       // here on presumably shift by the count in cl (ecx is set up right
       // before each group) - confirm against the Assembler declarations.
 403   negl(ecx);
 404   addl(ecx, 29);
 405   shll(r9);
 406   movl(rdi, r9);
 407   andl(r9, 536870911);                     // keep the low 29 bits
 408   testl(r9, 268435456);                    // bit 28 set?
 409   jcc(Assembler::notEqual, L_2TAG_PACKET_4_0_1);
 410   shrl(r9);
 411   movl(rbx, 0);                            // no sign flip
 412   shlq(r9, 32);
 413   orq(r9, r11);
 414 
 415   bind(L_2TAG_PACKET_5_0_1);
 416 
 417   bind(L_2TAG_PACKET_6_0_1);
       // A zero top limb is handled by moving the lower limbs up.
 418   cmpq(r9, 0);
 419   jcc(Assembler::equal, L_2TAG_PACKET_7_0_1);
 420 
 421   bind(L_2TAG_PACKET_8_0_1);
       // Normalize: find the highest set bit of r9 and shift the limbs so
       // that it lands at bit 29, adjusting the exponent bookkeeping in edx.
 422   bsrq(r11, r9);
 423   movl(ecx, 29);
 424   subl(ecx, r11);
 425   jcc(Assembler::lessEqual, L_2TAG_PACKET_9_0_1);
 426   shlq(r9);
 427   movq(rax, r10);
 428   shlq(r10);
 429   addl(edx, ecx);
 430   negl(ecx);
 431   addl(ecx, 64);
 432   shrq(rax);
 433   shrq(r8);
 434   orq(r9, rax);
 435   orq(r10, r8);
 436 
 437   bind(L_2TAG_PACKET_10_0_1);
       // Convert the two upper limbs to doubles and scale them by powers of
       // two whose exponent fields are built from edx and inserted into the
       // top word of a zeroed register. rsi contributes the sign of x and
       // rbx (0 or 32768) flips it. The scaled value is then multiplied by
       // the two doubles at PI_4, with a compensated sum leaving a high part
       // in xmm0 and a low part in xmm6.
 438   cvtsi2sdq(xmm0, r9);
 439   shrq(r10, 1);
 440   cvtsi2sdq(xmm3, r10);
 441   xorpd(xmm4, xmm4);
 442   shll(edx, 4);
 443   negl(edx);
 444   addl(edx, 16368);
 445   orl(edx, rsi);
 446   xorl(edx, rbx);
 447   pinsrw(xmm4, edx, 3);
 448   movq(xmm2, ExternalAddress(PI_4));    //0x40000000UL, 0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
 449   movq(xmm6, ExternalAddress(8 + PI_4));    //0x18469899UL, 0x3e64442dUL
 450   xorpd(xmm5, xmm5);
 451   subl(edx, 1008);                         // exponent field lowered by 63
 452   pinsrw(xmm5, edx, 3);
 453   mulsd(xmm0, xmm4);
 454   shll(rsi, 16);
 455   sarl(rsi, 31);                           // rsi = 0 or -1 from the sign of x
 456   mulsd(xmm3, xmm5);
 457   movdqu(xmm1, xmm0);
 458   mulsd(xmm0, xmm2);
 459   shrl(rdi, 29);
 460   addsd(xmm1, xmm3);
 461   mulsd(xmm3, xmm2);
 462   addl(rdi, rsi);
 463   xorl(rdi, rsi);                          // negate rdi when x is negative
 464   mulsd(xmm6, xmm1);
 465   movl(eax, rdi);
 466   addsd(xmm6, xmm3);
 467   movdqu(xmm2, xmm0);
 468   addsd(xmm0, xmm6);
 469   subsd(xmm2, xmm0);
 470   addsd(xmm6, xmm2);
 471 
 472   bind(L_2TAG_PACKET_11_0_1);
       // Same evaluation as the main path, applied to the reduced argument
       // in xmm0. Differences: 64-bit integer conversions are used, eax * 8
       // is added to the table index, and the low part of the reduced
       // argument (xmm6) is subtracted into the -c correction term.
 473   movq(xmm1, ExternalAddress(PI32INV));    //0x6dc9c883UL, 0x40245f30UL
 474   mulsd(xmm1, xmm0);
 475   movq(xmm5, ExternalAddress(ONEHALF));    //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
 476   movq(xmm4, ExternalAddress(SIGN_MASK));    //0x00000000UL, 0x80000000UL
 477   pand(xmm4, xmm0);
 478   por(xmm5, xmm4);
 479   addpd(xmm1, xmm5);
 480   cvttsd2siq(rdx, xmm1);
 481   cvtsi2sdq(xmm1, rdx);
 482   movq(xmm3, ExternalAddress(P_1));    //0x54400000UL, 0x3fb921fbUL
 483   movdqu(xmm2, ExternalAddress(P_2));    //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
 484   mulsd(xmm3, xmm1);
 485   unpcklpd(xmm1, xmm1);
 486   shll(eax, 3);
 487   addl(edx, 1865232);
 488   movdqu(xmm4, xmm0);
 489   addl(edx, eax);
 490   andl(edx, 63);
 491   movdqu(xmm5, ExternalAddress(SC_4));    //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
 492   lea(rax, ExternalAddress(Ctable));
 493   shll(edx, 5);
 494   addq(rax, rdx);
 495   mulpd(xmm2, xmm1);
 496   subsd(xmm0, xmm3);
 497   mulsd(xmm1, ExternalAddress(P_3));    //0x2e037073UL, 0x3b63198aUL
 498   subsd(xmm4, xmm3);
 499   movq(xmm7, Address(rax, 8));
 500   unpcklpd(xmm0, xmm0);
 501   movdqu(xmm3, xmm4);
 502   subsd(xmm4, xmm2);
 503   mulpd(xmm5, xmm0);
 504   subpd(xmm0, xmm2);
 505   mulsd(xmm7, xmm4);
 506   subsd(xmm3, xmm4);
 507   mulpd(xmm5, xmm0);
 508   mulpd(xmm0, xmm0);
 509   subsd(xmm3, xmm2);
 510   movdqu(xmm2, Address(rax, 0));
 511   subsd(xmm1, xmm3);
 512   movq(xmm3, Address(rax, 24));
 513   addsd(xmm2, xmm3);
 514   subsd(xmm7, xmm2);
 515   subsd(xmm1, xmm6);                       // fold in the low part of the reduced argument
 516   movdqu(xmm6, ExternalAddress(SC_2));    //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
 517   mulsd(xmm2, xmm4);
 518   mulpd(xmm6, xmm0);
 519   mulsd(xmm3, xmm4);
 520   mulpd(xmm2, xmm0);
 521   mulpd(xmm0, xmm0);
 522   addpd(xmm5, ExternalAddress(SC_3));    //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
 523   mulsd(xmm4, Address(rax, 0));
 524   addpd(xmm6, ExternalAddress(SC_1));    //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
 525   mulpd(xmm5, xmm0);
 526   movdqu(xmm0, xmm3);
 527   addsd(xmm3, Address(rax, 8));
 528   mulpd(xmm1, xmm7);
 529   movdqu(xmm7, xmm4);
 530   addsd(xmm4, xmm3);
 531   addpd(xmm6, xmm5);
 532   movq(xmm5, Address(rax, 8));
 533   subsd(xmm5, xmm3);
 534   subsd(xmm3, xmm4);
 535   addsd(xmm1, Address(rax, 16));
 536   mulpd(xmm6, xmm2);
 537   addsd(xmm5, xmm0);
 538   addsd(xmm3, xmm7);
 539   addsd(xmm1, xmm5);
 540   addsd(xmm1, xmm3);
 541   addsd(xmm1, xmm6);
 542   unpckhpd(xmm6, xmm6);
 543   movdqu(xmm0, xmm4);
 544   addsd(xmm1, xmm6);
 545   addsd(xmm0, xmm1);                       // result = res_hi + res_lo
 546   jmp(B1_4);
 547 
 548   bind(L_2TAG_PACKET_7_0_1);
       // r9 is zero: move the limbs up by 64 bits (r9 <- r10 <- r8 <- 0) and
       // add 64 to edx, at most twice. If every limb is zero, continue with
       // a zero reduced argument (xmm0 = xmm6 = 0).
 549   addl(edx, 64);
 550   movq(r9, r10);
 551   movq(r10, r8);
 552   movl(r8, 0);
 553   cmpq(r9, 0);
 554   jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
 555   addl(edx, 64);
 556   movq(r9, r10);
 557   movq(r10, r8);
 558   cmpq(r9, 0);
 559   jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
 560   xorpd(xmm0, xmm0);
 561   xorpd(xmm6, xmm6);
 562   jmp(L_2TAG_PACKET_11_0_1);
 563 
 564   bind(L_2TAG_PACKET_9_0_1);
       // Reached when 29 - bsr(r9) <= 0. Zero means no shift is needed; a
       // negative value means the limbs are shifted right instead of left.
 565   jcc(Assembler::equal, L_2TAG_PACKET_10_0_1);
 566   negl(ecx);
 567   shrq(r10);
 568   movq(rax, r9);
 569   shrq(r9);
 570   subl(edx, ecx);
 571   negl(ecx);
 572   addl(ecx, 64);
 573   shlq(rax);
 574   orq(r10, rax);
 575   jmp(L_2TAG_PACKET_10_0_1);
 576   bind(L_2TAG_PACKET_3_0_1);
       // Shift count below 1: combine r9 and r11 into one 64-bit limb first,
       // then shift left by the negated count and test bit 31.
 577   negl(ecx);
 578   shlq(r9, 32);
 579   orq(r9, r11);
 580   shlq(r9);
 581   movq(rdi, r9);
 582   testl(r9, INT_MIN);
 583   jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_1);
 584   shrl(r9);
 585   movl(rbx, 0);                            // no sign flip
 586   shrq(rdi, 3);
 587   jmp(L_2TAG_PACKET_6_0_1);
 588 
 589   bind(L_2TAG_PACKET_4_0_1);
       // Tested bit was set: replace the limbs r9:r10:r8 by
       // (rbx:0:0) - (r9:r10:r8) using subq/sbbq, bump rdi by 0x20000000 and
       // set rbx to 32768 so that the later xorl(edx, rbx) flips the sign
       // bit of the scale factor.
 590   shrl(r9);
 591   movl(rbx, 536870912);
 592   shrl(rbx);
 593   shlq(r9, 32);
 594   orq(r9, r11);
 595   shlq(rbx, 32);
 596   addl(rdi, 536870912);
 597   movl(rcx, 0);
 598   movl(r11, 0);
 599   subq(rcx, r8);
 600   sbbq(r11, r10);
 601   sbbq(rbx, r9);
 602   movq(r8, rcx);
 603   movq(r10, r11);
 604   movq(r9, rbx);
 605   movl(rbx, 32768);
 606   jmp(L_2TAG_PACKET_5_0_1);
 607 
 608   bind(L_2TAG_PACKET_12_0_1);
       // Same negation as in L_2TAG_PACKET_4_0_1, for the path that came
       // through L_2TAG_PACKET_3_0_1.
 609   shrl(r9);
 610   mov64(rbx, 0x100000000);
 611   shrq(rbx);
 612   movl(rcx, 0);
 613   movl(r11, 0);
 614   subq(rcx, r8);
 615   sbbq(r11, r10);
 616   sbbq(rbx, r9);
 617   movq(r8, rcx);
 618   movq(r10, r11);
 619   movq(r9, rbx);
 620   movl(rbx, 32768);
 621   shrq(rdi, 3);
 622   addl(rdi, 536870912);
 623   jmp(L_2TAG_PACKET_6_0_1);
 624 
 625   bind(L_2TAG_PACKET_2_0_1);
       // Inf or NaN: reload the original argument and multiply it by the
       // NEG_ZERO constant. The header comment specifies NaN plus the invalid
       // exception for these inputs. The product stays in xmm0 and is also
       // stored to [rsp + 0].
 626   movsd(xmm0, Address(rsp, 8));
 627   mulsd(xmm0, ExternalAddress(NEG_ZERO));    //0x00000000UL, 0x80000000UL
 628   movq(Address(rsp, 0), xmm0);
 629 
 630   bind(L_2TAG_PACKET_13_0_1);
 631 
 632   bind(B1_4);
       // Common exit: release the 16 bytes of scratch and restore rbx.
 633   addq(rsp, 16);
 634   pop(rbx);
 635 }
 636 #else
 637 // The 32 bit code is at most SSE2 compliant
 638 
 639 ALIGNED_(16) juint _static_const_table_cos[] =
       // Constant pool for the 32-bit fast_cos below, which addresses it by
       // byte offset from the table base. Every double is stored as two
       // 32-bit words, low word first.
       //
       //   0 .. 2047  64 entries of 32 bytes (four doubles each). The code
       //              reads entry offsets 0, 8, 16 and 24 in the roles the
       //              algorithm header calls C_hl, S_hi, S_lo and sigma.
       //   2048       SC_1 (two doubles)
       //   2064       SC_2 (two doubles)
       //   2080       SC_3 (two doubles)
       //   2096       SC_4 (two doubles)
       //   2112       P_2 (same value in both lanes)
       //   2128       P_1
       //   2144       P_3
       //   2160       32/pi multiplier used to compute N
       //   2176       0x4338000000000000; not referenced by the code in this
       //              file (presumably the SHIFTER mentioned in the header
       //              - TODO confirm)
       //   2192       1.0
       //   2208       -0.0 (multiplier on the Inf/NaN path)
       //   2224       sign-bit mask
       //   2240       0.5 (both lanes)
 640 {
 641     0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
 642     0x00000000UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL, 0xbf73b92eUL,
 643     0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL,
 644     0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL,
 645     0xc0000000UL, 0xbc626d19UL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL,
 646     0xbfa60beaUL, 0x2ed59f06UL, 0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL,
 647     0x00000000UL, 0x3ff00000UL, 0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL,
 648     0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL, 0x00000000UL, 0x3ff00000UL,
 649     0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL, 0x20000000UL,
 650     0x3c5e0d89UL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL, 0xbfc59267UL,
 651     0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL,
 652     0x3ff00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL,
 653     0x20000000UL, 0x3c68076aUL, 0x00000000UL, 0x3ff00000UL, 0x99fcef32UL,
 654     0x3fca8279UL, 0x667f3bcdUL, 0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL,
 655     0x00000000UL, 0x3fe00000UL, 0x94247758UL, 0x3fc133ccUL, 0x6b151741UL,
 656     0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL, 0x00000000UL, 0x3fe00000UL,
 657     0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL, 0xe0000000UL,
 658     0x3c39f630UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL, 0xbf9d4a2cUL,
 659     0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL,
 660     0x3fe00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0x3fed906bUL,
 661     0x20000000UL, 0x3c7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x76acf82dUL,
 662     0x3fa4a031UL, 0x56c62ddaUL, 0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL,
 663     0x00000000UL, 0x3fd00000UL, 0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL,
 664     0x3fef6297UL, 0x20000000UL, 0x3c756217UL, 0x00000000UL, 0x3fd00000UL,
 665     0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL, 0x3fefd88dUL, 0x40000000UL,
 666     0xbc887df6UL, 0x00000000UL, 0x3fc00000UL, 0x00000000UL, 0x00000000UL,
 667     0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
 668     0x00000000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0x3fefd88dUL,
 669     0x40000000UL, 0xbc887df6UL, 0x00000000UL, 0xbfc00000UL, 0x0e5967d5UL,
 670     0x3fac1d1fUL, 0xcff75cb0UL, 0x3fef6297UL, 0x20000000UL, 0x3c756217UL,
 671     0x00000000UL, 0xbfd00000UL, 0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL,
 672     0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL, 0x00000000UL, 0xbfd00000UL,
 673     0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL, 0x3fed906bUL, 0x20000000UL,
 674     0x3c7457e6UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL, 0x3f9d4a2cUL,
 675     0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL,
 676     0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL,
 677     0xe0000000UL, 0x3c39f630UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL,
 678     0xbfc133ccUL, 0x6b151741UL, 0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL,
 679     0x00000000UL, 0xbfe00000UL, 0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL,
 680     0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL, 0x00000000UL, 0xbfe00000UL,
 681     0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL, 0x20000000UL,
 682     0x3c68076aUL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL, 0x3fc59267UL,
 683     0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL,
 684     0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL,
 685     0x20000000UL, 0x3c5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL,
 686     0x3fb37ca1UL, 0xa6aea963UL, 0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL,
 687     0x00000000UL, 0xbff00000UL, 0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL,
 688     0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL, 0x00000000UL, 0xbff00000UL,
 689     0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL, 0xc0000000UL,
 690     0xbc626d19UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL, 0x3f73b92eUL,
 691     0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL,
 692     0xbff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
 693     0x00000000UL, 0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL,
 694     0x3f73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL,
 695     0x00000000UL, 0xbff00000UL, 0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL,
 696     0xbfc8f8b8UL, 0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0xbff00000UL,
 697     0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL,
 698     0x3c75d28dUL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL, 0x3fb37ca1UL,
 699     0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL, 0x3c672cedUL, 0x00000000UL,
 700     0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0xbfde2b5dUL,
 701     0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL,
 702     0x3fc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL,
 703     0x00000000UL, 0xbff00000UL, 0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL,
 704     0xbfe44cf3UL, 0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0xbff00000UL,
 705     0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL,
 706     0x3c8bdd34UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL, 0xbfc133ccUL,
 707     0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL, 0x3c82c5e1UL, 0x00000000UL,
 708     0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0xbfea9b66UL,
 709     0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL,
 710     0x3f9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL,
 711     0x00000000UL, 0xbfe00000UL, 0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL,
 712     0xbfed906bUL, 0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0xbfe00000UL,
 713     0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL,
 714     0xbc8760b1UL, 0x00000000UL, 0xbfd00000UL, 0x0e5967d5UL, 0x3fac1d1fUL,
 715     0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL, 0xbc756217UL, 0x00000000UL,
 716     0xbfd00000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0xbfefd88dUL,
 717     0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0xbfc00000UL, 0x00000000UL,
 718     0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x00000000UL, 0x00000000UL,
 719     0x00000000UL, 0x00000000UL, 0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL,
 720     0xbfefd88dUL, 0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0x3fc00000UL,
 721     0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL,
 722     0xbc756217UL, 0x00000000UL, 0x3fd00000UL, 0x76acf82dUL, 0x3fa4a031UL,
 723     0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL, 0xbc8760b1UL, 0x00000000UL,
 724     0x3fd00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0xbfed906bUL,
 725     0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL,
 726     0xbf9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL,
 727     0x00000000UL, 0x3fe00000UL, 0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL,
 728     0xbfea9b66UL, 0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0x3fe00000UL,
 729     0x94247758UL, 0x3fc133ccUL, 0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL,
 730     0x3c82c5e1UL, 0x00000000UL, 0x3fe00000UL, 0x99fcef32UL, 0x3fca8279UL,
 731     0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL, 0x3c8bdd34UL, 0x00000000UL,
 732     0x3fe00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0xbfe44cf3UL,
 733     0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL,
 734     0xbfc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL,
 735     0x00000000UL, 0x3ff00000UL, 0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL,
 736     0xbfde2b5dUL, 0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0x3ff00000UL,
 737     0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL,
 738     0x3c672cedUL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL, 0xbfa60beaUL,
 739     0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL, 0x3c75d28dUL, 0x00000000UL,
 740     0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0xbfc8f8b8UL,
 741     0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL,
 742     0xbf73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL,
 743     0x00000000UL, 0x3ff00000UL, 0x55555555UL, 0xbfc55555UL, 0x00000000UL,
 744     0xbfe00000UL, 0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL,
 745     0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL, 0xa556c734UL,
 746     0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL, 0x1a600000UL, 0x3d90b461UL,
 747     0x1a600000UL, 0x3d90b461UL, 0x54400000UL, 0x3fb921fbUL, 0x00000000UL,
 748     0x00000000UL, 0x2e037073UL, 0x3b63198aUL, 0x00000000UL, 0x00000000UL,
 749     0x6dc9c883UL, 0x40245f30UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
 750     0x43380000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x3ff00000UL,
 751     0x00000000UL, 0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL,
 752     0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL, 0x00000000UL,
 753     0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
 754 };
 755 //registers,
 756 // input: (rbp + 8)
 757 // scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
 758 //          rax, rdx, rcx, rbx (tmp)
 759 
 760 // Code generated by Intel C compiler for LIBM library
 761 
 762 void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
       // Emits the 32-bit code sequence for cos(x).
       //
       // The argument is read from the stack: [rsp + 128] after 120 bytes
       // have been reserved, i.e. [rsp + 8] at entry. Every path ends by
       // pushing the result onto the x87 stack (fld_d, plus fmul_d on the
       // Inf/NaN path). tmp holds the base address of
       // _static_const_table_cos; its previous value is saved at [rsp + 56]
       // and restored before leaving. xmm0-xmm7, eax and edx are scratch.
       //
       // NOTE(review): the 120 bytes reserved by subl(rsp, 120) are not
       // released in this function and no return is emitted; presumably the
       // caller's frame teardown restores rsp - confirm at the call site.
 763   Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
 764   Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
 765   Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
 766   Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;
 767 
 768   assert_different_registers(tmp, eax, ecx, edx);
 769 
 770   address static_const_table_cos = (address)_static_const_table_cos;
 771 
 772   bind(start);
 773   subl(rsp, 120);
 774   movl(Address(rsp, 56), tmp);                 // save tmp
 775   lea(tmp, ExternalAddress(static_const_table_cos));
 776   movsd(xmm0, Address(rsp, 128));              // x
       // Range check on the top 16 bits of x with the sign cleared:
       // after subtracting 0x3030, one unsigned compare against 0x10c5
       // rejects both tiny and large inputs (main path per the header
       // comment: 2^-252 <= |x| < 90112).
 777   pextrw(eax, xmm0, 3);
 778   andl(eax, 32767);
 779   subl(eax, 12336);
 780   cmpl(eax, 4293);
 781   jcc(Assembler::above, L_2TAG_PACKET_0_0_2);
       // ---- Main path: range reduction (header section 1) ----
 782   movsd(xmm1, Address(tmp, 2160));             // 32/pi
 783   mulsd(xmm1, xmm0);                           // y = x * 32/pi
 784   movdqu(xmm5, Address(tmp, 2240));            // 0.5 in both lanes
 785   movsd(xmm4, Address(tmp, 2224));             // sign mask
 786   pand(xmm4, xmm0);
 787   por(xmm5, xmm4);                             // 0.5 carrying the sign of x
 788   movsd(xmm3, Address(tmp, 2128));             // P_1
 789   movdqu(xmm2, Address(tmp, 2112));            // P_2 in both lanes
 790   addpd(xmm1, xmm5);
 791   cvttsd2sil(edx, xmm1);                       // N = trunc(y +/- 0.5)
 792   cvtsi2sdl(xmm1, edx);                        // N as a double
 793   mulsd(xmm3, xmm1);                           // m_1 = N * P_1
 794   unpcklpd(xmm1, xmm1);
       // Table index: 1865232 is congruent to 16 modulo 64, so after the
       // mask the index is (N + 16) mod 64 (16 steps of pi/32 are pi/2).
       // Each table entry is 32 bytes, hence the shift by 5.
 795   addl(edx, 1865232);
 796   movdqu(xmm4, xmm0);
 797   andl(edx, 63);
 798   movdqu(xmm5, Address(tmp, 2096));            // SC_4
 799   lea(eax, Address(tmp, 0));
 800   shll(edx, 5);
 801   addl(eax, edx);                              // eax -> selected table entry
       // Entry offsets 0, 8, 16 and 24 are used below in the roles the
       // header calls C_hl, S_hi, S_lo and sigma respectively.
 802   mulpd(xmm2, xmm1);                           // m_2 = N * P_2 (both lanes)
 803   subsd(xmm0, xmm3);                           // r_1 = x - m_1
 804   mulsd(xmm1, Address(tmp, 2144));             // m_3 = N * P_3
 805   subsd(xmm4, xmm3);                           // r_1 (second copy)
 806   movsd(xmm7, Address(eax, 8));                // S_hi
 807   unpcklpd(xmm0, xmm0);
 808   movapd(xmm3, xmm4);
 809   subsd(xmm4, xmm2);                           // r = r_1 - m_2
 810   mulpd(xmm5, xmm0);                           // psc4 = SC_4 * r_1
 811   subpd(xmm0, xmm2);                           // r in both lanes
 812   movdqu(xmm6, Address(tmp, 2064));            // SC_2
 813   mulsd(xmm7, xmm4);                           // S_hi * r
 814   subsd(xmm3, xmm4);                           // c_1 = r_1 - r
 815   mulpd(xmm5, xmm0);                           // msc4 = psc4 * r
 816   mulpd(xmm0, xmm0);                           // r2 = r * r
 817   subsd(xmm3, xmm2);                           // c_2 = c_1 - m_2
 818   movdqu(xmm2, Address(eax, 0));               // <C_hl | S_hi> (low | high lane)
 819   subsd(xmm1, xmm3);                           // -c = m_3 - c_2
 820   movsd(xmm3, Address(eax, 24));               // sigma
 821   addsd(xmm2, xmm3);                           // low lane: C_hl + sigma
 822   subsd(xmm7, xmm2);                           // -d = S_hi * r - (C_hl + sigma)
 823   mulsd(xmm2, xmm4);
 824   mulpd(xmm6, xmm0);                           // msc2 = SC_2 * r2
 825   mulsd(xmm3, xmm4);                           // rs = sigma * r
 826   mulpd(xmm2, xmm0);                           // <(C_hl + sigma) * r^3 | S_hi * r^2>
 827   mulpd(xmm0, xmm0);                           // r4 = r2 * r2
 828   addpd(xmm5, Address(tmp, 2080));             // psc3 = SC_3 + msc4
 829   mulsd(xmm4, Address(eax, 0));                // med = C_hl * r
 830   addpd(xmm6, Address(tmp, 2048));             // psc1 = SC_1 + msc2
 831   mulpd(xmm5, xmm0);                           // msc3 = r4 * psc3
 832   movapd(xmm0, xmm3);                          // copy of rs
 833   addsd(xmm3, Address(eax, 8));                // res_int = S_hi + rs
 834   mulpd(xmm1, xmm7);                           // low lane: (-c) * (-d)
 835   movapd(xmm7, xmm4);                          // copy of med
 836   addsd(xmm4, xmm3);                           // res_hi = res_int + med
 837   addpd(xmm6, xmm5);                           // sincospols = psc1 + msc3
 838   movsd(xmm5, Address(eax, 8));                // S_hi
 839   subsd(xmm5, xmm3);                           // k_0 = S_hi - res_int
 840   subsd(xmm3, xmm4);                           // k_1 = res_int - res_hi
 841   addsd(xmm1, Address(eax, 16));               // corr = (-c) * (-d) + S_lo
 842   mulpd(xmm6, xmm2);                           // pols, one term per lane
 843   addsd(xmm5, xmm0);                           // k_2 = k_0 + rs
 844   addsd(xmm3, xmm7);                           // k_3 = k_1 + med
       // Final summation: res_lo is accumulated in xmm1, then added to
       // res_hi in xmm4.
 845   addsd(xmm1, xmm5);
 846   addsd(xmm1, xmm3);
 847   addsd(xmm1, xmm6);
 848   unpckhpd(xmm6, xmm6);
 849   addsd(xmm1, xmm6);
 850   addsd(xmm4, xmm1);
       // Move the result to the x87 stack through memory.
 851   movsd(Address(rsp, 0), xmm4);
 852   fld_d(Address(rsp, 0));
 853   jmp(L_2TAG_PACKET_1_0_2);
 854 
 855   bind(L_2TAG_PACKET_0_0_2);
       // Out-of-range inputs. The flags are still those of the cmpl above:
       // a signed "greater" outcome sends large arguments to the next
       // section; otherwise the argument is tiny and the result is 1 - |x|.
 856   jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
 857   pextrw(eax, xmm0, 3);
 858   andl(eax, 32767);                            // clear the sign bit
 859   pinsrw(xmm0, eax, 3);                        // xmm0 = |x|
 860   movsd(xmm1, Address(tmp, 2192));             // 1.0
 861   subsd(xmm1, xmm0);
 862   movsd(Address(rsp, 0), xmm1);
 863   fld_d(Address(rsp, 0));
 864   jmp(L_2TAG_PACKET_1_0_2);
 865 
 866   bind(L_2TAG_PACKET_2_0_2);
       // Large arguments. An all-ones exponent field in the high word means
       // Inf or NaN, which is handled at L_2TAG_PACKET_3_0_2.
 867   movl(eax, Address(rsp, 132));
 868   andl(eax, 2146435072);
 869   cmpl(eax, 2146435072);
 870   jcc(Assembler::equal, L_2TAG_PACKET_3_0_2);
       // Otherwise call the dlibm_sin_cos_huge stub with three stack
       // arguments: x at [rsp + 0], the address of a result slot at
       // [rsp + 8] and the constant 1 at [rsp + 12]. The result slot is
       // [rsp + 8] of the outer frame, which is loaded after the call.
       // NOTE(review): the constant 1 presumably selects cosine rather than
       // sine - confirm against the stub's definition.
 871   subl(rsp, 32);
 872   movsd(Address(rsp, 0), xmm0);
 873   lea(eax, Address(rsp, 40));
 874   movl(Address(rsp, 8), eax);
 875   movl(eax, 1);
 876   movl(Address(rsp, 12), eax);
 877   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dlibm_sin_cos_huge())));
 878   addl(rsp, 32);
 879   fld_d(Address(rsp, 8));
 880   jmp(L_2TAG_PACKET_1_0_2);
 881 
 882   bind(L_2TAG_PACKET_3_0_2);
       // Inf or NaN: multiply the original argument by the -0.0 constant at
       // table offset 2208 on the x87 stack. The header comment specifies
       // NaN plus the invalid exception for these inputs.
 883   fld_d(Address(rsp, 128));
 884   fmul_d(Address(tmp, 2208));
 885 
 886   bind(L_2TAG_PACKET_1_0_2);
       // Common exit: restore the caller's value of tmp.
 887   movl(tmp, Address(rsp, 56));
 888 }
 889 #endif