1 /*
   2 * Copyright (c) 2016, Intel Corporation.
   3 * Intel Math Library (LIBM) Source Code
   4 *
   5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6 *
   7 * This code is free software; you can redistribute it and/or modify it
   8 * under the terms of the GNU General Public License version 2 only, as
   9 * published by the Free Software Foundation.
  10 *
  11 * This code is distributed in the hope that it will be useful, but WITHOUT
  12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 * version 2 for more details (a copy is included in the LICENSE file that
  15 * accompanied this code).
  16 *
  17 * You should have received a copy of the GNU General Public License version
  18 * 2 along with this work; if not, write to the Free Software Foundation,
  19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20 *
  21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22 * or visit www.oracle.com if you need additional information or have any
  23 * questions.
  24 *
  25 */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/assembler.hpp"
  29 #include "asm/assembler.inline.hpp"
  30 #include "macroAssembler_x86.hpp"
  31 #include "runtime/stubRoutines.hpp"
  32 #include "utilities/globalDefinitions.hpp"
  33 
  34 /******************************************************************************/
  35 //                     ALGORITHM DESCRIPTION - COS()
  36 //                     ---------------------
  37 //
  38 //     1. RANGE REDUCTION
  39 //
  40 //     We perform an initial range reduction from X to r with
  41 //
  42 //          X =~= N * pi/32 + r
  43 //
  44 //     so that |r| <= pi/64 + epsilon. We restrict inputs to those
  45 //     where |N| <= 932560. Beyond this, the range reduction is
  46 //     insufficiently accurate. For extremely small inputs,
  47 //     denormalization can occur internally, impacting performance.
  48 //     This means that the main path is actually only taken for
  49 //     2^-252 <= |X| < 90112.
  50 //
  51 //     To avoid branches, we perform the range reduction to full
  52 //     accuracy each time.
  53 //
  54 //          X - N * (P_1 + P_2 + P_3)
  55 //
  56 //     where P_1 and P_2 are 32-bit numbers (so multiplication by N
  57 //     is exact) and P_3 is a 53-bit number. Together, these
  58 //     approximate pi/32 well enough for all cases in the restricted
  59 //     range.
  60 //
  61 //     The main reduction sequence is:
  62 //
  63 //             y = 32/pi * x
  64 //             N = integer(y)
  65 //     (computed by adding and subtracting off SHIFTER)
  66 //
  67 //             m_1 = N * P_1
  68 //             m_2 = N * P_2
  69 //             r_1 = x - m_1
  70 //             r = r_1 - m_2
  71 //     (this r can be used for most of the calculation)
  72 //
  73 //             c_1 = r_1 - r
  74 //             m_3 = N * P_3
  75 //             c_2 = c_1 - m_2
  76 //             c = c_2 - m_3
  77 //
  78 //     2. MAIN ALGORITHM
  79 //
  80 //     The algorithm uses a table lookup based on B = M * pi / 32
  81 //     where M = N mod 64. The stored values are:
  82 //       sigma             closest power of 2 to cos(B)
  83 //       C_hl              53-bit cos(B) - sigma
  84 //       S_hi + S_lo       2 * 53-bit sin(B)
  85 //
  86 //     The computation is organized as follows:
  87 //
  88 //          sin(B + r + c) = [sin(B) + sigma * r] +
  89 //                           r * (cos(B) - sigma) +
  90 //                           sin(B) * [cos(r + c) - 1] +
  91 //                           cos(B) * [sin(r + c) - r]
  92 //
  93 //     which is approximately:
  94 //
  95 //          [S_hi + sigma * r] +
  96 //          C_hl * r +
  97 //          S_lo + S_hi * [(cos(r) - 1) - r * c] +
  98 //          (C_hl + sigma) * [(sin(r) - r) + c]
  99 //
 100 //     and this is what is actually computed. We separate this sum
 101 //     into four parts:
 102 //
 103 //          hi + med + pols + corr
 104 //
 105 //     where
 106 //
 107 //          hi       = S_hi + sigma r
 108 //          med      = C_hl * r
 109 //          pols     = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
 110 //          corr     = S_lo + c * ((C_hl + sigma) - S_hi * r)
 111 //
 112 //     3. POLYNOMIAL
 113 //
 114 //     The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
 115 //     (sin(r) - r) can be rearranged freely, since it is quite
 116 //     small, so we exploit parallelism to the fullest.
 117 //
 118 //          psc4       =   SC_4 * r_1
 119 //          msc4       =   psc4 * r
 120 //          r2         =   r * r
 121 //          msc2       =   SC_2 * r2
 122 //          r4         =   r2 * r2
 123 //          psc3       =   SC_3 + msc4
 124 //          psc1       =   SC_1 + msc2
 125 //          msc3       =   r4 * psc3
 126 //          sincospols =   psc1 + msc3
 127 //          pols       =   sincospols *
 128 //                         <S_hi * r^2 | (C_hl + sigma) * r^3>
 129 //
 130 //     4. CORRECTION TERM
 131 //
 132 //     This is where the "c" component of the range reduction is
 133 //     taken into account; recall that just "r" is used for most of
 134 //     the calculation.
 135 //
 136 //          -c   = m_3 - c_2
 137 //          -d   = S_hi * r - (C_hl + sigma)
 138 //          corr = -c * -d + S_lo
 139 //
 140 //     5. COMPENSATED SUMMATIONS
 141 //
 142 //     The two successive compensated summations add up the high
 143 //     and medium parts, leaving just the low parts to add up at
 144 //     the end.
 145 //
 146 //          rs        =  sigma * r
 147 //          res_int   =  S_hi + rs
 148 //          k_0       =  S_hi - res_int
 149 //          k_2       =  k_0 + rs
 150 //          med       =  C_hl * r
 151 //          res_hi    =  res_int + med
 152 //          k_1       =  res_int - res_hi
 153 //          k_3       =  k_1 + med
 154 //
 155 //     6. FINAL SUMMATION
 156 //
 157 //     We now add up all the small parts:
 158 //
 159 //          res_lo = pols(hi) + pols(lo) + corr + k_2 + k_3
 160 //
 161 //     Now the overall result is just:
 162 //
 163 //          res_hi + res_lo
 164 //
 165 //     7. SMALL ARGUMENTS
 166 //
 167 //     Inputs with |X| < 2^-252 are treated specially as
 168 //     1 - |x|.
 169 //
 170 // Special cases:
 171 //  cos(NaN) = quiet NaN, and raise invalid exception
 172 //  cos(INF) = NaN and raise invalid exception
 173 //  cos(0) = 1
 174 //
 175 /******************************************************************************/
 176 
 177 #ifdef _LP64
 178 // The 64 bit code is at most SSE2 compliant
 179 ATTRIBUTE_ALIGNED(8) juint _ONE[] =
 180 {
         // IEEE-754 double 1.0 (0x3ff0000000000000), stored low word first.
         // Loaded with movq by the 64-bit fast_cos for the tiny-argument
         // result 1 - |x|.
 181     0x00000000UL, 0x3ff00000UL
 182 };
 183 void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register r8, Register r9, Register r10, Register r11) {
       // Emits the 64-bit, SSE2-only instruction sequence for double-precision
       // cos(x).
       //
       // The argument is read from xmm0 and the result is left in xmm0 on every
       // path. xmm1-xmm7 are scratch.
       //
       // General-purpose registers: eax/ecx/edx and r8-r11 are the scratch
       // registers supplied by the caller (checked by assert_different_registers).
       // The body also names rax, rcx, rdx, rbx, rsi and rdi directly:
       //   - rbx is preserved by the push/pop that brackets the sequence;
       //   - rsi and rdi are overwritten on the large-argument path and are not
       //     saved here.
       // NOTE(review): the code mixes the eax/ecx/edx parameters with the fixed
       // rax/rcx/rdx registers, so it presumably relies on callers passing exactly
       // those registers - confirm against the stub generator.
       //
       // Paths, chosen from the exponent bits of x:
       //   1. main path         - fall-through after the range check below
       //   2. tiny |x|          - L_2TAG_PACKET_0_0_1: returns 1 - |x|
       //   3. large finite |x|  - L_2TAG_PACKET_1_0_1: extended range reduction
       //                          using PI_INV_TABLE, then the same evaluation
       //   4. Inf / NaN         - L_2TAG_PACKET_2_0_1: returns x * (-0.0)
       //
       // No ret is emitted: control simply falls out after B1_4 has released the
       // 16-byte scratch area and popped rbx.
 184 
 185   Label L_2TAG_PACKET_0_0_1, L_2TAG_PACKET_1_0_1, L_2TAG_PACKET_2_0_1, L_2TAG_PACKET_3_0_1;
 186   Label L_2TAG_PACKET_4_0_1, L_2TAG_PACKET_5_0_1, L_2TAG_PACKET_6_0_1, L_2TAG_PACKET_7_0_1;
 187   Label L_2TAG_PACKET_8_0_1, L_2TAG_PACKET_9_0_1, L_2TAG_PACKET_10_0_1, L_2TAG_PACKET_11_0_1;
 188   Label L_2TAG_PACKET_12_0_1, L_2TAG_PACKET_13_0_1, B1_2, B1_4, start;
 189 
 190   assert_different_registers(r8, r9, r10, r11, eax, ecx, edx);
 191 
       // Constant tables. All but ONE come from StubRoutines::x86; ONE is the
       // file-local _ONE array.
 192   address ONEHALF = StubRoutines::x86::_ONEHALF_addr();
 193   address P_2 = StubRoutines::x86::_P_2_addr();
 194   address SC_4 = StubRoutines::x86::_SC_4_addr();
 195   address Ctable = StubRoutines::x86::_Ctable_addr();
 196   address SC_2 = StubRoutines::x86::_SC_2_addr();
 197   address SC_3 = StubRoutines::x86::_SC_3_addr();
 198   address SC_1 = StubRoutines::x86::_SC_1_addr();
 199   address PI_INV_TABLE = StubRoutines::x86::_PI_INV_TABLE_addr();
 200   address PI_4 = (address)StubRoutines::x86::_PI_4_addr();
 201   address PI32INV = (address)StubRoutines::x86::_PI32INV_addr();
 202   address SIGN_MASK = (address)StubRoutines::x86::_SIGN_MASK_addr();
 203   address P_1 = (address)StubRoutines::x86::_P_1_addr();
 204   address P_3 = (address)StubRoutines::x86::_P_3_addr();
 205   address ONE = (address)_ONE;
 206   address NEG_ZERO = (address)StubRoutines::x86::_NEG_ZERO_addr();
 207 
       // Prologue: keep rbx, reserve 16 bytes and spill x at rsp + 8 so that its
       // high dword can be read back as an integer.
 208   bind(start);
 209   push(rbx);
 210   subq(rsp, 16);
 211   movsd(Address(rsp, 8), xmm0);
 212 
       // Range check on the high dword of x.
 213   bind(B1_2);
 214   movl(eax, Address(rsp, 12));             // high 32 bits of x
 215   movq(xmm1, ExternalAddress(PI32INV));    //0x6dc9c883UL, 0x40245f30UL
 216   andl(eax, 2147418112);                   // 0x7fff0000: drop sign, keep exponent + top 4 mantissa bits
 217   subl(eax, 808452096);                    // 0x30300000: lower edge of the window (header: 2^-252)
 218   cmpl(eax, 281346048);                    // 0x10c50000: width of the window (header: |x| < 90112)
 219   jcc(Assembler::above, L_2TAG_PACKET_0_0_1);   // unsigned compare: below the window wraps and is caught too
       // Main path. y = x * PI32INV, N = trunc(y + copysign(0.5, x)); the
       // remaining steps follow sections 1-6 of the algorithm description in the
       // file header, with the instructions interleaved for scheduling.
 220   mulsd(xmm1, xmm0);
 221   movdqu(xmm5, ExternalAddress(ONEHALF));    //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
 222   movq(xmm4, ExternalAddress(SIGN_MASK));    //0x00000000UL, 0x80000000UL
 223   pand(xmm4, xmm0);
 224   por(xmm5, xmm4);
 225   addpd(xmm1, xmm5);
 226   cvttsd2sil(edx, xmm1);
 227   cvtsi2sdl(xmm1, edx);
 228   movdqu(xmm2, ExternalAddress(P_2));    //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
 229   movq(xmm3, ExternalAddress(P_1));    //0x54400000UL, 0x3fb921fbUL
 230   mulsd(xmm3, xmm1);
 231   unpcklpd(xmm1, xmm1);
       // Table index: 1865232 == 64 * 29144 + 16, so after the mask the index is
       // (N + 16) mod 64. With 64 entries per period this is a quarter period
       // ahead of N - presumably how the sine-form expansion in the header
       // produces a cosine.
 232   addq(rdx, 1865232);
 233   movdqu(xmm4, xmm0);
 234   andq(rdx, 63);
 235   movdqu(xmm5, ExternalAddress(SC_4));    //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
 236   lea(rax, ExternalAddress(Ctable));
 237   shlq(rdx, 5);                            // 32 bytes per Ctable entry
 238   addq(rax, rdx);
       // rax -> selected Ctable entry. Matching the loads below against the
       // header's formulas, the four doubles are used as:
       //   +0 C_hl, +8 S_hi, +16 S_lo, +24 sigma.
 239   mulpd(xmm2, xmm1);
 240   subsd(xmm0, xmm3);
 241   mulsd(xmm1, ExternalAddress(P_3));    //0x2e037073UL, 0x3b63198aUL
 242   subsd(xmm4, xmm3);
 243   movq(xmm7, Address(rax, 8));
 244   unpcklpd(xmm0, xmm0);
 245   movdqu(xmm3, xmm4);
 246   subsd(xmm4, xmm2);
 247   mulpd(xmm5, xmm0);
 248   subpd(xmm0, xmm2);
 249   movdqu(xmm6, ExternalAddress(SC_2));    //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
 250   mulsd(xmm7, xmm4);
 251   subsd(xmm3, xmm4);
 252   mulpd(xmm5, xmm0);
 253   mulpd(xmm0, xmm0);
 254   subsd(xmm3, xmm2);
 255   movdqu(xmm2, Address(rax, 0));
 256   subsd(xmm1, xmm3);
 257   movq(xmm3, Address(rax, 24));
 258   addsd(xmm2, xmm3);
 259   subsd(xmm7, xmm2);
 260   mulsd(xmm2, xmm4);
 261   mulpd(xmm6, xmm0);
 262   mulsd(xmm3, xmm4);
 263   mulpd(xmm2, xmm0);
 264   mulpd(xmm0, xmm0);
 265   addpd(xmm5, ExternalAddress(SC_3));    //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
 266   mulsd(xmm4, Address(rax, 0));
 267   addpd(xmm6, ExternalAddress(SC_1));    //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
 268   mulpd(xmm5, xmm0);
 269   movdqu(xmm0, xmm3);
 270   addsd(xmm3, Address(rax, 8));
 271   mulpd(xmm1, xmm7);
 272   movdqu(xmm7, xmm4);
 273   addsd(xmm4, xmm3);
 274   addpd(xmm6, xmm5);
 275   movq(xmm5, Address(rax, 8));
 276   subsd(xmm5, xmm3);
 277   subsd(xmm3, xmm4);
 278   addsd(xmm1, Address(rax, 16));
 279   mulpd(xmm6, xmm2);
       // Final summation: the small terms are accumulated in xmm0 first and the
       // high part (xmm4) is added last.
 280   addsd(xmm0, xmm5);
 281   addsd(xmm3, xmm7);
 282   addsd(xmm0, xmm1);
 283   addsd(xmm0, xmm3);
 284   addsd(xmm0, xmm6);
 285   unpckhpd(xmm6, xmm6);
 286   addsd(xmm0, xmm6);
 287   addsd(xmm0, xmm4);
 288   jmp(B1_4);
 289 
       // Outside the main window. The flags are still those of the cmpl above:
       // signed 'greater' means the exponent lies above the window; otherwise
       // |x| is below it.
 290   bind(L_2TAG_PACKET_0_0_1);
 291   jcc(Assembler::greater, L_2TAG_PACKET_1_0_1);
       // Tiny |x|: clear the sign bit in the top word and return 1.0 - |x|.
 292   pextrw(eax, xmm0, 3);
 293   andl(eax, 32767);
 294   pinsrw(xmm0, eax, 3);
 295   movq(xmm1, ExternalAddress(ONE));    //0x00000000UL, 0x3ff00000UL
 296   subsd(xmm1, xmm0);
 297   movdqu(xmm0, xmm1);
 298   jmp(B1_4);
 299 
       // Large |x|. An all-ones exponent field (32752 == 0x7ff0 in the top word)
       // means Inf or NaN.
 300   bind(L_2TAG_PACKET_1_0_1);
 301   pextrw(eax, xmm0, 3);
 302   andl(eax, 32752);
 303   cmpl(eax, 32752);
 304   jcc(Assembler::equal, L_2TAG_PACKET_2_0_1);
       // Pick the starting word of PI_INV_TABLE from the exponent of x; the mask
       // 65532 keeps the byte offset a multiple of 4.
 305   pextrw(ecx, xmm0, 3);
 306   andl(ecx, 32752);
 307   subl(ecx, 16224);
 308   shrl(ecx, 7);
 309   andl(ecx, 65532);
 310   lea(r11, ExternalAddress(PI_INV_TABLE));
 311   addq(rcx, r11);
       // Split the significand of x: edx = low 32 bits, eax = the remaining high
       // bits with the implicit leading 1 forced on (the orl sets it before the
       // final shift).
 312   movdq(rax, xmm0);
 313   movl(r10, Address(rcx, 20));
 314   movl(r8, Address(rcx, 24));
 315   movl(edx, eax);
 316   shrq(rax, 21);
 317   orl(eax, INT_MIN);
 318   shrl(eax, 11);
       // Multi-word integer product of that significand with the seven 32-bit
       // table words at rcx + 0 .. rcx + 24. Partial products are split into
       // 32-bit halves and carries are propagated by hand; the result ends up in
       // r9 (most significant, completed further down), r10 and r8 (least
       // significant).
 319   movl(r9, r10);
 320   imulq(r10, rdx);
 321   imulq(r9, rax);
 322   imulq(r8, rax);
 323   movl(rsi, Address(rcx, 16));
 324   movl(rdi, Address(rcx, 12));
 325   movl(r11, r10);
 326   shrq(r10, 32);
 327   addq(r9, r10);
 328   addq(r11, r8);
 329   movl(r8, r11);
 330   shrq(r11, 32);
 331   addq(r9, r11);
 332   movl(r10, rsi);
 333   imulq(rsi, rdx);
 334   imulq(r10, rax);
 335   movl(r11, rdi);
 336   imulq(rdi, rdx);
 337   movl(rbx, rsi);
 338   shrq(rsi, 32);
 339   addq(r9, rbx);
 340   movl(rbx, r9);
 341   shrq(r9, 32);
 342   addq(r10, rsi);
 343   addq(r10, r9);
 344   shlq(rbx, 32);
 345   orq(r8, rbx);
 346   imulq(r11, rax);
 347   movl(r9, Address(rcx, 8));
 348   movl(rsi, Address(rcx, 4));
 349   movl(rbx, rdi);
 350   shrq(rdi, 32);
 351   addq(r10, rbx);
 352   movl(rbx, r10);
 353   shrq(r10, 32);
 354   addq(r11, rdi);
 355   addq(r11, r10);
 356   movq(rdi, r9);
 357   imulq(r9, rdx);
 358   imulq(rdi, rax);
 359   movl(r10, r9);
 360   shrq(r9, 32);
 361   addq(r11, r10);
 362   movl(r10, r11);
 363   shrq(r11, 32);
 364   addq(rdi, r9);
 365   addq(rdi, r11);
 366   movq(r9, rsi);
 367   imulq(rsi, rdx);
 368   imulq(r9, rax);
 369   shlq(r10, 32);
 370   orq(r10, rbx);
 371   movl(eax, Address(rcx, 0));
 372   movl(r11, rsi);
 373   shrq(rsi, 32);
 374   addq(rdi, r11);
 375   movl(r11, rdi);
 376   shrq(rdi, 32);
 377   addq(r9, rsi);
 378   addq(r9, rdi);
 379   imulq(rdx, rax);
       // ecx = 8 * (byte offset into PI_INV_TABLE) + 19 - (unbiased exponent of x).
       // rsi keeps the sign bit of x as it appears in the top word (0x8000).
 380   pextrw(rbx, xmm0, 3);
 381   lea(rdi, ExternalAddress(PI_INV_TABLE));
 382   subq(rcx, rdi);
 383   addl(ecx, ecx);
 384   addl(ecx, ecx);
 385   addl(ecx, ecx);
 386   addl(ecx, 19);
 387   movl(rsi, 32768);
 388   andl(rsi, rbx);
 389   shrl(rbx, 4);
 390   andl(rbx, 2047);
 391   subl(rbx, 1023);
 392   subl(ecx, rbx);
 393   addq(r9, rdx);
 394   movl(edx, ecx);
 395   addl(edx, 32);
 396   cmpl(ecx, 1);
 397   jcc(Assembler::less, L_2TAG_PACKET_3_0_1);
       // The single-operand shift forms used from here on take no immediate;
       // their count presumably comes from cl, which is why ecx is set up
       // immediately before each group.
 398   negl(ecx);
 399   addl(ecx, 29);
 400   shll(r9);
 401   movl(rdi, r9);                           // kept: its top 3 bits are extracted later (shrl(rdi, 29))
 402   andl(r9, 536870911);                     // 2^29 - 1
 403   testl(r9, 268435456);                    // 2^28
 404   jcc(Assembler::notEqual, L_2TAG_PACKET_4_0_1);
 405   shrl(r9);
 406   movl(rbx, 0);                            // no sign flip on this branch
 407   shlq(r9, 32);
 408   orq(r9, r11);
 409 
 410   bind(L_2TAG_PACKET_5_0_1);
 411 
       // If the top word is zero, move the lower words up first (PACKET_7).
 412   bind(L_2TAG_PACKET_6_0_1);
 413   cmpq(r9, 0);
 414   jcc(Assembler::equal, L_2TAG_PACKET_7_0_1);
 415 
       // Normalize: ecx = 29 - (index of the highest set bit of r9). A positive
       // count shifts the r9:r10:r8 chain left here; zero or negative counts are
       // handled at PACKET_9. edx tracks the accumulated scaling.
 416   bind(L_2TAG_PACKET_8_0_1);
 417   bsrq(r11, r9);
 418   movl(ecx, 29);
 419   subl(ecx, r11);
 420   jcc(Assembler::lessEqual, L_2TAG_PACKET_9_0_1);
 421   shlq(r9);
 422   movq(rax, r10);
 423   shlq(r10);
 424   addl(edx, ecx);
 425   negl(ecx);
 426   addl(ecx, 64);
 427   shrq(rax);
 428   shrq(r8);
 429   orq(r9, rax);
 430   orq(r10, r8);
 431 
       // Convert the two integer words to doubles and scale them by powers of
       // two whose exponent field is assembled in edx and inserted into word 3
       // of xmm4 / xmm5. rsi (sign of x) and rbx (0 or 0x8000 from the negation
       // branches) supply the sign bit. The scaled value is then multiplied by
       // the two-part constant at PI_4 / PI_4 + 8, giving the reduced argument
       // in xmm0 and a low-order remainder in xmm6.
 432   bind(L_2TAG_PACKET_10_0_1);
 433   cvtsi2sdq(xmm0, r9);
 434   shrq(r10, 1);
 435   cvtsi2sdq(xmm3, r10);
 436   xorpd(xmm4, xmm4);
 437   shll(edx, 4);
 438   negl(edx);
 439   addl(edx, 16368);
 440   orl(edx, rsi);
 441   xorl(edx, rbx);
 442   pinsrw(xmm4, edx, 3);
 443   movq(xmm2, ExternalAddress(PI_4));    //0x40000000UL, 0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
 444   movq(xmm6, ExternalAddress(8 + PI_4));    //0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
 445   xorpd(xmm5, xmm5);
 446   subl(edx, 1008);
 447   pinsrw(xmm5, edx, 3);
 448   mulsd(xmm0, xmm4);
 449   shll(rsi, 16);
 450   sarl(rsi, 31);                           // rsi = 0 or -1 according to the sign of x
 451   mulsd(xmm3, xmm5);
 452   movdqu(xmm1, xmm0);
 453   mulsd(xmm0, xmm2);
 454   shrl(rdi, 29);
 455   addl(rdi, rsi);
 456   mulsd(xmm3, xmm2);
 457   addl(rdi, rsi);
 458   xorl(rdi, rsi);
 459   mulsd(xmm6, xmm1);
 460   movl(eax, rdi);                          // added (times 8) to the table index below
 461   addsd(xmm6, xmm3);
 462   movdqu(xmm2, xmm0);
 463   addsd(xmm0, xmm6);
 464   subsd(xmm2, xmm0);
 465   addsd(xmm6, xmm2);
 466 
       // Same evaluation as the main path, applied to the reduced argument in
       // xmm0. Differences visible here: 64-bit conversions are used for N, eax * 8
       // is added to the table index, and xmm6 is subtracted from xmm1 before the
       // correction term is formed.
 467   bind(L_2TAG_PACKET_11_0_1);
 468   movq(xmm1, ExternalAddress(PI32INV));    //0x6dc9c883UL, 0x40245f30UL
 469   mulsd(xmm1, xmm0);
 470   movq(xmm5, ExternalAddress(ONEHALF));    //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
 471   movq(xmm4, ExternalAddress(SIGN_MASK));    //0x00000000UL, 0x80000000UL
 472   pand(xmm4, xmm0);
 473   por(xmm5, xmm4);
 474   addpd(xmm1, xmm5);
 475   cvttsd2siq(rdx, xmm1);
 476   cvtsi2sdq(xmm1, rdx);
 477   movq(xmm3, ExternalAddress(P_1));    //0x54400000UL, 0x3fb921fbUL
 478   movdqu(xmm2, ExternalAddress(P_2));    //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
 479   mulsd(xmm3, xmm1);
 480   unpcklpd(xmm1, xmm1);
 481   shll(eax, 3);
 482   addl(edx, 1865232);
 483   movdqu(xmm4, xmm0);
 484   addl(edx, eax);
 485   andl(edx, 63);
 486   movdqu(xmm5, ExternalAddress(SC_4));    //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
 487   lea(rax, ExternalAddress(Ctable));
 488   shll(edx, 5);
 489   addq(rax, rdx);
 490   mulpd(xmm2, xmm1);
 491   subsd(xmm0, xmm3);
 492   mulsd(xmm1, ExternalAddress(P_3));    //0x2e037073UL, 0x3b63198aUL
 493   subsd(xmm4, xmm3);
 494   movq(xmm7, Address(rax, 8));
 495   unpcklpd(xmm0, xmm0);
 496   movdqu(xmm3, xmm4);
 497   subsd(xmm4, xmm2);
 498   mulpd(xmm5, xmm0);
 499   subpd(xmm0, xmm2);
 500   mulsd(xmm7, xmm4);
 501   subsd(xmm3, xmm4);
 502   mulpd(xmm5, xmm0);
 503   mulpd(xmm0, xmm0);
 504   subsd(xmm3, xmm2);
 505   movdqu(xmm2, Address(rax, 0));
 506   subsd(xmm1, xmm3);
 507   movq(xmm3, Address(rax, 24));
 508   addsd(xmm2, xmm3);
 509   subsd(xmm7, xmm2);
 510   subsd(xmm1, xmm6);
 511   movdqu(xmm6, ExternalAddress(SC_2));    //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
 512   mulsd(xmm2, xmm4);
 513   mulpd(xmm6, xmm0);
 514   mulsd(xmm3, xmm4);
 515   mulpd(xmm2, xmm0);
 516   mulpd(xmm0, xmm0);
 517   addpd(xmm5, ExternalAddress(SC_3));    //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
 518   mulsd(xmm4, Address(rax, 0));
 519   addpd(xmm6, ExternalAddress(SC_1));    //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
 520   mulpd(xmm5, xmm0);
 521   movdqu(xmm0, xmm3);
 522   addsd(xmm3, Address(rax, 8));
 523   mulpd(xmm1, xmm7);
 524   movdqu(xmm7, xmm4);
 525   addsd(xmm4, xmm3);
 526   addpd(xmm6, xmm5);
 527   movq(xmm5, Address(rax, 8));
 528   subsd(xmm5, xmm3);
 529   subsd(xmm3, xmm4);
 530   addsd(xmm1, Address(rax, 16));
 531   mulpd(xmm6, xmm2);
 532   addsd(xmm5, xmm0);
 533   addsd(xmm3, xmm7);
 534   addsd(xmm1, xmm5);
 535   addsd(xmm1, xmm3);
 536   addsd(xmm1, xmm6);
 537   unpckhpd(xmm6, xmm6);
 538   movdqu(xmm0, xmm4);
 539   addsd(xmm1, xmm6);
 540   addsd(xmm0, xmm1);
 541   jmp(B1_4);
 542 
       // Top word r9 is zero: move the chain up by one 64-bit word (r9 <- r10,
       // r10 <- r8, r8 <- 0) and add 64 to edx. If r9 is still zero after a
       // second move, continue at PACKET_11 with xmm0 = xmm6 = 0.
 543   bind(L_2TAG_PACKET_7_0_1);
 544   addl(edx, 64);
 545   movq(r9, r10);
 546   movq(r10, r8);
 547   movl(r8, 0);
 548   cmpq(r9, 0);
 549   jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
 550   addl(edx, 64);
 551   movq(r9, r10);
 552   movq(r10, r8);
 553   cmpq(r9, 0);
 554   jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
 555   xorpd(xmm0, xmm0);
 556   xorpd(xmm6, xmm6);
 557   jmp(L_2TAG_PACKET_11_0_1);
 558 
       // Normalization count was <= 0 (flags from the subl at PACKET_8): zero
       // needs no shift, a negative count shifts the chain right instead.
 559   bind(L_2TAG_PACKET_9_0_1);
 560   jcc(Assembler::equal, L_2TAG_PACKET_10_0_1);
 561   negl(ecx);
 562   shrq(r10);
 563   movq(rax, r9);
 564   shrq(r9);
 565   subl(edx, ecx);
 566   negl(ecx);
 567   addl(ecx, 64);
 568   shlq(rax);
 569   orq(r10, rax);
 570   jmp(L_2TAG_PACKET_10_0_1);
       // Reached when ecx < 1 at the test above: combine r9 with r11 first, then
       // shift by the negated count. Bit 31 of the shifted value selects the
       // negating variant (PACKET_12).
 571   bind(L_2TAG_PACKET_3_0_1);
 572   negl(ecx);
 573   shlq(r9, 32);
 574   orq(r9, r11);
 575   shlq(r9);
 576   movq(rdi, r9);
 577   testl(r9, INT_MIN);
 578   jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_1);
 579   shrl(r9);
 580   movl(rbx, 0);
 581   shrq(rdi, 3);
 582   jmp(L_2TAG_PACKET_6_0_1);
 583 
       // Bit 28 was set: replace the r9:r10:r8 chain by (rbx:0:0) minus the
       // chain using a subq/sbbq borrow sequence, bump rdi by 2^29 and record a
       // sign flip in rbx (32768 is the sign bit of the top word).
 584   bind(L_2TAG_PACKET_4_0_1);
 585   shrl(r9);
 586   movl(rbx, 536870912);
 587   shrl(rbx);
 588   shlq(r9, 32);
 589   orq(r9, r11);
 590   shlq(rbx, 32);
 591   addl(rdi, 536870912);
 592   movl(rcx, 0);
 593   movl(r11, 0);
 594   subq(rcx, r8);
 595   sbbq(r11, r10);
 596   sbbq(rbx, r9);
 597   movq(r8, rcx);
 598   movq(r10, r11);
 599   movq(r9, rbx);
 600   movl(rbx, 32768);
 601   jmp(L_2TAG_PACKET_5_0_1);
 602 
       // Counterpart of PACKET_4 for the PACKET_3 branch: same borrow-chain
       // negation and sign flip, with the 2^32 constant loaded via mov64.
 603   bind(L_2TAG_PACKET_12_0_1);
 604   shrl(r9);
 605   mov64(rbx, 0x100000000);
 606   shrq(rbx);
 607   movl(rcx, 0);
 608   movl(r11, 0);
 609   subq(rcx, r8);
 610   sbbq(r11, r10);
 611   sbbq(rbx, r9);
 612   movq(r8, rcx);
 613   movq(r10, r11);
 614   movq(r9, rbx);
 615   movl(rbx, 32768);
 616   shrq(rdi, 3);
 617   addl(rdi, 536870912);
 618   jmp(L_2TAG_PACKET_6_0_1);
 619 
       // Inf or NaN: reload x and multiply by -0.0, so both inputs produce NaN
       // (see "Special cases" in the file header). The product is also stored to
       // the scratch slot at rsp + 0; nothing after this point reads it back.
 620   bind(L_2TAG_PACKET_2_0_1);
 621   movsd(xmm0, Address(rsp, 8));
 622   mulsd(xmm0, ExternalAddress(NEG_ZERO));    //0x00000000UL, 0x80000000UL
 623   movq(Address(rsp, 0), xmm0);
 624 
 625   bind(L_2TAG_PACKET_13_0_1);
 626 
       // Common exit: release the scratch area and restore rbx.
 627   bind(B1_4);
 628   addq(rsp, 16);
 629   pop(rbx);
 630 }
 631 #else
 632 // The 32 bit code is at most SSE2 compliant
 633 
 634 ATTRIBUTE_ALIGNED(16) juint _static_const_table_cos[] =
 635 {
         // Constant pool for the 32-bit fast_cos. Every double is stored as two
         // 32-bit words, low word first. Byte offsets as addressed by the code:
         //
         //      0 .. 2047  64 table entries of 32 bytes (4 doubles) each,
         //                 selected by ((N + 1865232) & 63) * 32. Matching the
         //                 loads against the algorithm description, an entry is
         //                 used as: +0 C_hl, +8 S_hi, +16 S_lo, +24 sigma.
         //   2048  SC_1 (2 doubles)      2064  SC_2 (2 doubles)
         //   2080  SC_3 (2 doubles)      2096  SC_4 (2 doubles)
         //   2112  P_2, stored twice     2128  P_1
         //   2144  P_3                   2160  PI32INV (multiplier giving y from x)
         //   2176  0x4338000000000000    (not referenced by the fast_cos code
         //                                visible here)
         //   2192  1.0                   2208  -0.0 (Inf/NaN path multiplier)
         //   2224  sign-bit mask         2240  0.5, stored twice
         //
         // The rows below hold five words each, so entry and constant boundaries
         // do not coincide with line breaks.
 636     0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
 637     0x00000000UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL, 0xbf73b92eUL,
 638     0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL,
 639     0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL,
 640     0xc0000000UL, 0xbc626d19UL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL,
 641     0xbfa60beaUL, 0x2ed59f06UL, 0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL,
 642     0x00000000UL, 0x3ff00000UL, 0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL,
 643     0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL, 0x00000000UL, 0x3ff00000UL,
 644     0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL, 0x20000000UL,
 645     0x3c5e0d89UL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL, 0xbfc59267UL,
 646     0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL,
 647     0x3ff00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL,
 648     0x20000000UL, 0x3c68076aUL, 0x00000000UL, 0x3ff00000UL, 0x99fcef32UL,
 649     0x3fca8279UL, 0x667f3bcdUL, 0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL,
 650     0x00000000UL, 0x3fe00000UL, 0x94247758UL, 0x3fc133ccUL, 0x6b151741UL,
 651     0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL, 0x00000000UL, 0x3fe00000UL,
 652     0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL, 0xe0000000UL,
 653     0x3c39f630UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL, 0xbf9d4a2cUL,
 654     0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL,
 655     0x3fe00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0x3fed906bUL,
 656     0x20000000UL, 0x3c7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x76acf82dUL,
 657     0x3fa4a031UL, 0x56c62ddaUL, 0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL,
 658     0x00000000UL, 0x3fd00000UL, 0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL,
 659     0x3fef6297UL, 0x20000000UL, 0x3c756217UL, 0x00000000UL, 0x3fd00000UL,
 660     0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL, 0x3fefd88dUL, 0x40000000UL,
 661     0xbc887df6UL, 0x00000000UL, 0x3fc00000UL, 0x00000000UL, 0x00000000UL,
 662     0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
 663     0x00000000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0x3fefd88dUL,
 664     0x40000000UL, 0xbc887df6UL, 0x00000000UL, 0xbfc00000UL, 0x0e5967d5UL,
 665     0x3fac1d1fUL, 0xcff75cb0UL, 0x3fef6297UL, 0x20000000UL, 0x3c756217UL,
 666     0x00000000UL, 0xbfd00000UL, 0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL,
 667     0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL, 0x00000000UL, 0xbfd00000UL,
 668     0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL, 0x3fed906bUL, 0x20000000UL,
 669     0x3c7457e6UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL, 0x3f9d4a2cUL,
 670     0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL,
 671     0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL,
 672     0xe0000000UL, 0x3c39f630UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL,
 673     0xbfc133ccUL, 0x6b151741UL, 0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL,
 674     0x00000000UL, 0xbfe00000UL, 0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL,
 675     0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL, 0x00000000UL, 0xbfe00000UL,
 676     0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL, 0x20000000UL,
 677     0x3c68076aUL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL, 0x3fc59267UL,
 678     0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL,
 679     0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL,
 680     0x20000000UL, 0x3c5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL,
 681     0x3fb37ca1UL, 0xa6aea963UL, 0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL,
 682     0x00000000UL, 0xbff00000UL, 0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL,
 683     0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL, 0x00000000UL, 0xbff00000UL,
 684     0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL, 0xc0000000UL,
 685     0xbc626d19UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL, 0x3f73b92eUL,
 686     0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL,
 687     0xbff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
 688     0x00000000UL, 0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL,
 689     0x3f73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL,
 690     0x00000000UL, 0xbff00000UL, 0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL,
 691     0xbfc8f8b8UL, 0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0xbff00000UL,
 692     0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL,
 693     0x3c75d28dUL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL, 0x3fb37ca1UL,
 694     0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL, 0x3c672cedUL, 0x00000000UL,
 695     0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0xbfde2b5dUL,
 696     0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL,
 697     0x3fc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL,
 698     0x00000000UL, 0xbff00000UL, 0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL,
 699     0xbfe44cf3UL, 0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0xbff00000UL,
 700     0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL,
 701     0x3c8bdd34UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL, 0xbfc133ccUL,
 702     0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL, 0x3c82c5e1UL, 0x00000000UL,
 703     0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0xbfea9b66UL,
 704     0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL,
 705     0x3f9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL,
 706     0x00000000UL, 0xbfe00000UL, 0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL,
 707     0xbfed906bUL, 0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0xbfe00000UL,
 708     0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL,
 709     0xbc8760b1UL, 0x00000000UL, 0xbfd00000UL, 0x0e5967d5UL, 0x3fac1d1fUL,
 710     0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL, 0xbc756217UL, 0x00000000UL,
 711     0xbfd00000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0xbfefd88dUL,
 712     0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0xbfc00000UL, 0x00000000UL,
 713     0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x00000000UL, 0x00000000UL,
 714     0x00000000UL, 0x00000000UL, 0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL,
 715     0xbfefd88dUL, 0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0x3fc00000UL,
 716     0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL,
 717     0xbc756217UL, 0x00000000UL, 0x3fd00000UL, 0x76acf82dUL, 0x3fa4a031UL,
 718     0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL, 0xbc8760b1UL, 0x00000000UL,
 719     0x3fd00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0xbfed906bUL,
 720     0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL,
 721     0xbf9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL,
 722     0x00000000UL, 0x3fe00000UL, 0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL,
 723     0xbfea9b66UL, 0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0x3fe00000UL,
 724     0x94247758UL, 0x3fc133ccUL, 0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL,
 725     0x3c82c5e1UL, 0x00000000UL, 0x3fe00000UL, 0x99fcef32UL, 0x3fca8279UL,
 726     0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL, 0x3c8bdd34UL, 0x00000000UL,
 727     0x3fe00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0xbfe44cf3UL,
 728     0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL,
 729     0xbfc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL,
 730     0x00000000UL, 0x3ff00000UL, 0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL,
 731     0xbfde2b5dUL, 0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0x3ff00000UL,
 732     0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL,
 733     0x3c672cedUL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL, 0xbfa60beaUL,
 734     0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL, 0x3c75d28dUL, 0x00000000UL,
 735     0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0xbfc8f8b8UL,
 736     0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL,
 737     0xbf73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL,
 738     0x00000000UL, 0x3ff00000UL, 0x55555555UL, 0xbfc55555UL, 0x00000000UL,
 739     0xbfe00000UL, 0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL,
 740     0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL, 0xa556c734UL,
 741     0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL, 0x1a600000UL, 0x3d90b461UL,
 742     0x1a600000UL, 0x3d90b461UL, 0x54400000UL, 0x3fb921fbUL, 0x00000000UL,
 743     0x00000000UL, 0x2e037073UL, 0x3b63198aUL, 0x00000000UL, 0x00000000UL,
 744     0x6dc9c883UL, 0x40245f30UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
 745     0x43380000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x3ff00000UL,
 746     0x00000000UL, 0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL,
 747     0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL, 0x00000000UL,
 748     0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
 749 };
 750 //registers,
 751 // input: (rbp + 8)
 752 // scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
 753 //          rax, rdx, rcx, rbx (tmp)
 754 
 755 // Code generated by Intel C compiler for LIBM library
 756 
 757 void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
       // Emits the 32-bit, SSE2-only instruction sequence for double-precision
       // cos(x).
       //
       // Input:  the double at rsp + 128 after this body's own 120-byte
       //         adjustment, i.e. 8 bytes above the incoming rsp.
       // Output: pushed onto the x87 stack with fld_d on every path.
       // tmp:    saved at rsp + 56, used as the base of _static_const_table_cos,
       //         and reloaded at the common exit.
       // eax, edx and xmm0-xmm7 are scratch. ecx only appears in the register
       // distinctness assertion.
       //
       // Paths (selected from the top 16 bits of x):
       //   main path        - fall-through after the range check
       //   tiny |x|         - L_2TAG_PACKET_0_0_2: 1 - |x|
       //   large finite |x| - L_2TAG_PACKET_2_0_2: calls dlibm_sin_cos_huge
       //   Inf / NaN        - L_2TAG_PACKET_3_0_2: x * (-0.0)
       //
       // NOTE(review): rsp is lowered by 120 here and never raised again in
       // this body, and no ret is emitted - presumably the enclosing stub's
       // epilogue resets rsp; confirm against the caller.
 758   Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
 759   Label start;
 760 
 761   assert_different_registers(tmp, eax, ecx, edx);
 762 
 763   address static_const_table_cos = (address)_static_const_table_cos;
 764 
 765   bind(start);
 766   subl(rsp, 120);
 767   movl(Address(rsp, 56), tmp);
 768   lea(tmp, ExternalAddress(static_const_table_cos));
 769   movsd(xmm0, Address(rsp, 128));
       // Range check on the top word of x, the 16-bit form of the window test:
       // 32767 drops the sign, 12336 == 0x3030 is the lower edge, 4293 == 0x10c5
       // the width. Unsigned 'above' catches both sides.
 770   pextrw(eax, xmm0, 3);
 771   andl(eax, 32767);
 772   subl(eax, 12336);
 773   cmpl(eax, 4293);
 774   jcc(Assembler::above, L_2TAG_PACKET_0_0_2);
       // Main path; see sections 1-6 of the algorithm description in the file
       // header. Constants are addressed by fixed byte offsets from tmp.
 775   movsd(xmm1, Address(tmp, 2160));         // PI32INV
 776   mulsd(xmm1, xmm0);
 777   movdqu(xmm5, Address(tmp, 2240));        // 0.5, 0.5
 778   movsd(xmm4, Address(tmp, 2224));         // sign-bit mask
 779   pand(xmm4, xmm0);
 780   por(xmm5, xmm4);
 781   movsd(xmm3, Address(tmp, 2128));         // P_1
 782   movdqu(xmm2, Address(tmp, 2112));        // P_2, P_2
 783   addpd(xmm1, xmm5);
 784   cvttsd2sil(edx, xmm1);                   // N = trunc(y + copysign(0.5, x))
 785   cvtsi2sdl(xmm1, edx);
 786   mulsd(xmm3, xmm1);
 787   unpcklpd(xmm1, xmm1);
 788   addl(edx, 1865232);                      // 1865232 == 64 * 29144 + 16
 789   movdqu(xmm4, xmm0);
 790   andl(edx, 63);                           // table index = (N + 16) mod 64
 791   movdqu(xmm5, Address(tmp, 2096));        // SC_4
 792   lea(eax, Address(tmp, 0));
 793   shll(edx, 5);                            // 32 bytes per table entry
 794   addl(eax, edx);
       // eax -> selected table entry (+0 C_hl, +8 S_hi, +16 S_lo, +24 sigma, as
       // used by the loads below).
 795   mulpd(xmm2, xmm1);
 796   subsd(xmm0, xmm3);
 797   mulsd(xmm1, Address(tmp, 2144));         // P_3
 798   subsd(xmm4, xmm3);
 799   movsd(xmm7, Address(eax, 8));
 800   unpcklpd(xmm0, xmm0);
 801   movapd(xmm3, xmm4);
 802   subsd(xmm4, xmm2);
 803   mulpd(xmm5, xmm0);
 804   subpd(xmm0, xmm2);
 805   movdqu(xmm6, Address(tmp, 2064));        // SC_2
 806   mulsd(xmm7, xmm4);
 807   subsd(xmm3, xmm4);
 808   mulpd(xmm5, xmm0);
 809   mulpd(xmm0, xmm0);
 810   subsd(xmm3, xmm2);
 811   movdqu(xmm2, Address(eax, 0));
 812   subsd(xmm1, xmm3);
 813   movsd(xmm3, Address(eax, 24));
 814   addsd(xmm2, xmm3);
 815   subsd(xmm7, xmm2);
 816   mulsd(xmm2, xmm4);
 817   mulpd(xmm6, xmm0);
 818   mulsd(xmm3, xmm4);
 819   mulpd(xmm2, xmm0);
 820   mulpd(xmm0, xmm0);
 821   addpd(xmm5, Address(tmp, 2080));         // SC_3
 822   mulsd(xmm4, Address(eax, 0));
 823   addpd(xmm6, Address(tmp, 2048));         // SC_1
 824   mulpd(xmm5, xmm0);
 825   movapd(xmm0, xmm3);
 826   addsd(xmm3, Address(eax, 8));
 827   mulpd(xmm1, xmm7);
 828   movapd(xmm7, xmm4);
 829   addsd(xmm4, xmm3);
 830   addpd(xmm6, xmm5);
 831   movsd(xmm5, Address(eax, 8));
 832   subsd(xmm5, xmm3);
 833   subsd(xmm3, xmm4);
 834   addsd(xmm1, Address(eax, 16));
 835   mulpd(xmm6, xmm2);
 836   addsd(xmm5, xmm0);
 837   addsd(xmm3, xmm7);
 838   addsd(xmm1, xmm5);
 839   addsd(xmm1, xmm3);
 840   addsd(xmm1, xmm6);
 841   unpckhpd(xmm6, xmm6);
 842   addsd(xmm1, xmm6);
 843   addsd(xmm4, xmm1);
       // Move the SSE result to the x87 stack through the scratch slot at rsp.
 844   movsd(Address(rsp, 0), xmm4);
 845   fld_d(Address(rsp, 0));
 846   jmp(L_2TAG_PACKET_1_0_2);
 847 
       // Outside the main window; the flags are still those of the cmpl above.
       // Signed 'greater' means above the window, otherwise |x| is tiny.
 848   bind(L_2TAG_PACKET_0_0_2);
 849   jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
       // Tiny |x|: clear the sign bit and return 1.0 - |x|.
 850   pextrw(eax, xmm0, 3);
 851   andl(eax, 32767);
 852   pinsrw(xmm0, eax, 3);
 853   movsd(xmm1, Address(tmp, 2192));         // 1.0
 854   subsd(xmm1, xmm0);
 855   movsd(Address(rsp, 0), xmm1);
 856   fld_d(Address(rsp, 0));
 857   jmp(L_2TAG_PACKET_1_0_2);
 858 
       // Large |x|: 2146435072 == 0x7ff00000 isolates the exponent field of the
       // high dword; all ones means Inf or NaN.
 859   bind(L_2TAG_PACKET_2_0_2);
 860   movl(eax, Address(rsp, 132));
 861   andl(eax, 2146435072);
 862   cmpl(eax, 2146435072);
 863   jcc(Assembler::equal, L_2TAG_PACKET_3_0_2);
       // Finite but too large for the inline reduction: call the out-of-line
       // helper. Outgoing area (32 bytes): x at +0, a pointer at +8 and the
       // integer 1 at +12. The pointer is rsp + 40 in the lowered frame, which
       // is rsp + 8 once the 32 bytes are released - the slot read by fld_d
       // below. NOTE(review): the 1 presumably selects cosine over sine -
       // confirm against dlibm_sin_cos_huge.
 864   subl(rsp, 32);
 865   movsd(Address(rsp, 0), xmm0);
 866   lea(eax, Address(rsp, 40));
 867   movl(Address(rsp, 8), eax);
 868   movl(eax, 1);
 869   movl(Address(rsp, 12), eax);
 870   call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dlibm_sin_cos_huge())));
 871   addl(rsp, 32);
 872   fld_d(Address(rsp, 8));
 873   jmp(L_2TAG_PACKET_1_0_2);
 874 
       // Inf or NaN: x * (-0.0) on the x87 stack, so both inputs produce NaN.
 875   bind(L_2TAG_PACKET_3_0_2);
 876   fld_d(Address(rsp, 128));
 877   fmul_d(Address(tmp, 2208));              // -0.0
 878 
       // Common exit: restore tmp from its save slot.
 879   bind(L_2TAG_PACKET_1_0_2);
 880   movl(tmp, Address(rsp, 56));
 881 }
 882 #endif