1 /*
   2  * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2016, Intel Corporation. All rights reserved.
   4  * Intel Math Library (LIBM) Source Code
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 package org.graalvm.compiler.lir.amd64;
  28 
  29 import static jdk.vm.ci.amd64.AMD64.r10;
  30 import static jdk.vm.ci.amd64.AMD64.r11;
  31 import static jdk.vm.ci.amd64.AMD64.r8;
  32 import static jdk.vm.ci.amd64.AMD64.r9;
  33 import static jdk.vm.ci.amd64.AMD64.rax;
  34 import static jdk.vm.ci.amd64.AMD64.rbx;
  35 import static jdk.vm.ci.amd64.AMD64.rcx;
  36 import static jdk.vm.ci.amd64.AMD64.rdi;
  37 import static jdk.vm.ci.amd64.AMD64.rdx;
  38 import static jdk.vm.ci.amd64.AMD64.rsi;
  39 import static jdk.vm.ci.amd64.AMD64.rsp;
  40 import static jdk.vm.ci.amd64.AMD64.xmm0;
  41 import static jdk.vm.ci.amd64.AMD64.xmm1;
  42 import static jdk.vm.ci.amd64.AMD64.xmm2;
  43 import static jdk.vm.ci.amd64.AMD64.xmm3;
  44 import static jdk.vm.ci.amd64.AMD64.xmm4;
  45 import static jdk.vm.ci.amd64.AMD64.xmm5;
  46 import static jdk.vm.ci.amd64.AMD64.xmm6;
  47 import static jdk.vm.ci.amd64.AMD64.xmm7;
  48 import static org.graalvm.compiler.lir.amd64.AMD64HotSpotHelper.pointerConstant;
  49 import static org.graalvm.compiler.lir.amd64.AMD64HotSpotHelper.recordExternalAddress;
  50 
  51 import org.graalvm.compiler.asm.Label;
  52 import org.graalvm.compiler.asm.amd64.AMD64Address;
  53 import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
  54 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
  55 import org.graalvm.compiler.lir.LIRInstructionClass;
  56 import org.graalvm.compiler.lir.asm.ArrayDataPointerConstant;
  57 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
  58 
  59 /**
  60  * <pre>
  61  *                     ALGORITHM DESCRIPTION - COS()
  62  *                     ---------------------
  63  *
  64  *     1. RANGE REDUCTION
  65  *
  66  *     We perform an initial range reduction from X to r with
  67  *
  68  *          X =~= N * pi/32 + r
  69  *
  70  *     so that |r| <= pi/64 + epsilon. We restrict inputs to those
  71  *     where |N| <= 932560. Beyond this, the range reduction is
  72  *     insufficiently accurate. For extremely small inputs,
  73  *     denormalization can occur internally, impacting performance.
  74  *     This means that the main path is actually only taken for
  75  *     2^-252 <= |X| < 90112.
  76  *
  77  *     To avoid branches, we perform the range reduction to full
  78  *     accuracy each time.
  79  *
  80  *          X - N * (P_1 + P_2 + P_3)
  81  *
  82  *     where P_1 and P_2 are 32-bit numbers (so multiplication by N
  83  *     is exact) and P_3 is a 53-bit number. Together, these
  84  *     approximate pi/32 well enough for all cases in the restricted
  85  *     range.
  86  *
  87  *     The main reduction sequence is:
  88  *
  89  *             y = 32/pi * x
  90  *             N = integer(y)
  91  *     (computed by adding and subtracting off SHIFTER)
  92  *
  93  *             m_1 = N * P_1
  94  *             m_2 = N * P_2
  95  *             r_1 = x - m_1
  96  *             r = r_1 - m_2
  97  *     (this r can be used for most of the calculation)
  98  *
  99  *             c_1 = r_1 - r
 100  *             m_3 = N * P_3
 101  *             c_2 = c_1 - m_2
 102  *             c = c_2 - m_3
 103  *
 104  *     2. MAIN ALGORITHM
 105  *
 106  *     The algorithm uses a table lookup based on B = M * pi / 32
 107  *     where M = N mod 64. The stored values are:
 108  *       sigma             closest power of 2 to cos(B)
 109  *       C_hl              53-bit cos(B) - sigma
 110  *       S_hi + S_lo       2 * 53-bit sin(B)
 111  *
 112  *     The computation is organized as follows:
 113  *
 114  *          sin(B + r + c) = [sin(B) + sigma * r] +
 115  *                           r * (cos(B) - sigma) +
 116  *                           sin(B) * [cos(r + c) - 1] +
 117  *                           cos(B) * [sin(r + c) - r]
 118  *
 119  *     which is approximately:
 120  *
 121  *          [S_hi + sigma * r] +
 122  *          C_hl * r +
 123  *          S_lo + S_hi * [(cos(r) - 1) - r * c] +
 124  *          (C_hl + sigma) * [(sin(r) - r) + c]
 125  *
 126  *     and this is what is actually computed. We separate this sum
 127  *     into four parts:
 128  *
 129  *          hi + med + pols + corr
 130  *
 131  *     where
 132  *
 133  *          hi       = S_hi + sigma r
 134  *          med      = C_hl * r
 135  *          pols     = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
 136  *          corr     = S_lo + c * ((C_hl + sigma) - S_hi * r)
 137  *
 138  *     3. POLYNOMIAL
 139  *
 140  *     The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
 141  *     (sin(r) - r) can be rearranged freely, since it is quite
 142  *     small, so we exploit parallelism to the fullest.
 143  *
 144  *          psc4       =   SC_4 * r_1
 145  *          msc4       =   psc4 * r
 146  *          r2         =   r * r
 147  *          msc2       =   SC_2 * r2
 148  *          r4         =   r2 * r2
 149  *          psc3       =   SC_3 + msc4
 150  *          psc1       =   SC_1 + msc2
 151  *          msc3       =   r4 * psc3
 152  *          sincospols =   psc1 + msc3
 153  *          pols       =   sincospols *
 154  *                         <S_hi * r^2 | (C_hl + sigma) * r^3>
 155  *
 156  *     4. CORRECTION TERM
 157  *
 158  *     This is where the "c" component of the range reduction is
 159  *     taken into account; recall that just "r" is used for most of
 160  *     the calculation.
 161  *
 162  *          -c   = m_3 - c_2
 163  *          -d   = S_hi * r - (C_hl + sigma)
 164  *          corr = -c * -d + S_lo
 165  *
 166  *     5. COMPENSATED SUMMATIONS
 167  *
 168  *     The two successive compensated summations add up the high
 169  *     and medium parts, leaving just the low parts to add up at
 170  *     the end.
 171  *
 172  *          rs        =  sigma * r
 173  *          res_int   =  S_hi + rs
 174  *          k_0       =  S_hi - res_int
 175  *          k_2       =  k_0 + rs
 176  *          med       =  C_hl * r
 177  *          res_hi    =  res_int + med
 178  *          k_1       =  res_int - res_hi
 179  *          k_3       =  k_1 + med
 180  *
 181  *     6. FINAL SUMMATION
 182  *
 183  *     We now add up all the small parts:
 184  *
 185  *          res_lo = pols(hi) + pols(lo) + corr + k_2 + k_3
 186  *
 187  *     Now the overall result is just:
 188  *
 189  *          res_hi + res_lo
 190  *
 191  *     7. SMALL ARGUMENTS
 192  *
 193  *     Inputs with |X| < 2^-252 are treated specially as
 194  *     1 - |x|.
 195  *
 196  * Special cases:
 197  *  cos(NaN) = quiet NaN, and raise invalid exception
 198  *  cos(INF) = NaN and raise invalid exception
 199  *  cos(0) = 1
 200  * </pre>
 201  */
 202 public final class AMD64MathCosOp extends AMD64MathIntrinsicUnaryOp {
 203 
 204     public static final LIRInstructionClass<AMD64MathCosOp> TYPE = LIRInstructionClass.create(AMD64MathCosOp.class);
 205 
 206     public AMD64MathCosOp() {
             // Passes the superclass the TYPE descriptor created for this class, followed by the
             // general-purpose and XMM registers that emitCode overwrites while it works.
             // xmm0 is not in the list: emitCode reads the argument from xmm0 and builds the
             // result in xmm0.
             // NOTE(review): presumably the superclass treats the listed registers as
             // temporaries/clobbers - confirm against AMD64MathIntrinsicUnaryOp.
 207         super(TYPE, /* GPR */ rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11,
 208                         /* XMM */ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
 209     }
 210 
 211     private ArrayDataPointerConstant onehalf = pointerConstant(16, new int[]{
                 // Two doubles, both 0x3fe0000000000000 (0.5); each double is written as two ints,
                 // low 32-bit word first. emitCode ORs the sign of x into this value and adds it to
                 // x * 32/pi before truncating, so the conversion to the integer N rounds to nearest.
 212             // @formatter:off
 213             0x00000000, 0x3fe00000, 0x00000000, 0x3fe00000
 214             // @formatter:on
 215     });
 216 
 217     private ArrayDataPointerConstant p2 = pointerConstant(16, new int[]{
                 // P_2 of the range reduction in the class comment, 0x3d90b4611a600000, stored in
                 // both 64-bit lanes (low 32-bit word first). emitCode multiplies it by N with a
                 // packed multiply to form m_2 = N * P_2.
 218             // @formatter:off
 219             0x1a600000, 0x3d90b461, 0x1a600000, 0x3d90b461
 220             // @formatter:on
 221     });
 222 
 223     private ArrayDataPointerConstant sc4 = pointerConstant(16, new int[]{
                 // SC_4 coefficient pair of the polynomial step in the class comment
                 // (psc4 = SC_4 * r_1). First double 0x3ec71de3a556c734 (about 1/9!), second double
                 // 0x3efa01a01a01a01a (about 1/8!); each is written low 32-bit word first.
 224             // @formatter:off
 225             0xa556c734, 0x3ec71de3, 0x1a01a01a, 0x3efa01a0
 226             // @formatter:on
 227     });
 228 
 229     private ArrayDataPointerConstant ctable = pointerConstant(16, new int[]{
                 // Lookup table of 64 entries, each 32 bytes = four doubles (512 ints in total, every
                 // double written low 32-bit word first). emitCode addresses an entry as
                 // ctable + 32 * (index & 63) and reads, using the names from the class comment:
                 //   offset  0: C_hl   (multiplied by r to form med)
                 //   offset  8: S_hi   (added to sigma * r)
                 //   offset 16: S_lo   (added to the correction term)
                 //   offset 24: sigma  (multiplied by r)
                 // On the main path the index is N + 1865232; since 1865232 mod 64 == 16, lookups are
                 // shifted by 16 entries relative to N.
 230             // @formatter:off
 231             0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
 232             0x00000000, 0x00000000, 0x3ff00000, 0x176d6d31, 0xbf73b92e,
 233             0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000,
 234             0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0x3fc8f8b8,
 235             0xc0000000, 0xbc626d19, 0x00000000, 0x3ff00000, 0x939d225a,
 236             0xbfa60bea, 0x2ed59f06, 0x3fd29406, 0xa0000000, 0xbc75d28d,
 237             0x00000000, 0x3ff00000, 0x866b95cf, 0xbfb37ca1, 0xa6aea963,
 238             0x3fd87de2, 0xe0000000, 0xbc672ced, 0x00000000, 0x3ff00000,
 239             0x73fa1279, 0xbfbe3a68, 0x3806f63b, 0x3fde2b5d, 0x20000000,
 240             0x3c5e0d89, 0x00000000, 0x3ff00000, 0x5bc57974, 0xbfc59267,
 241             0x39ae68c8, 0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000,
 242             0x3ff00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0x3fe44cf3,
 243             0x20000000, 0x3c68076a, 0x00000000, 0x3ff00000, 0x99fcef32,
 244             0x3fca8279, 0x667f3bcd, 0x3fe6a09e, 0x20000000, 0xbc8bdd34,
 245             0x00000000, 0x3fe00000, 0x94247758, 0x3fc133cc, 0x6b151741,
 246             0x3fe8bc80, 0x20000000, 0xbc82c5e1, 0x00000000, 0x3fe00000,
 247             0x9ae68c87, 0x3fac73b3, 0x290ea1a3, 0x3fea9b66, 0xe0000000,
 248             0x3c39f630, 0x00000000, 0x3fe00000, 0x7f909c4e, 0xbf9d4a2c,
 249             0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000,
 250             0x3fe00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0x3fed906b,
 251             0x20000000, 0x3c7457e6, 0x00000000, 0x3fe00000, 0x76acf82d,
 252             0x3fa4a031, 0x56c62dda, 0x3fee9f41, 0xe0000000, 0x3c8760b1,
 253             0x00000000, 0x3fd00000, 0x0e5967d5, 0xbfac1d1f, 0xcff75cb0,
 254             0x3fef6297, 0x20000000, 0x3c756217, 0x00000000, 0x3fd00000,
 255             0x0f592f50, 0xbf9ba165, 0xa3d12526, 0x3fefd88d, 0x40000000,
 256             0xbc887df6, 0x00000000, 0x3fc00000, 0x00000000, 0x00000000,
 257             0x00000000, 0x3ff00000, 0x00000000, 0x00000000, 0x00000000,
 258             0x00000000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0x3fefd88d,
 259             0x40000000, 0xbc887df6, 0x00000000, 0xbfc00000, 0x0e5967d5,
 260             0x3fac1d1f, 0xcff75cb0, 0x3fef6297, 0x20000000, 0x3c756217,
 261             0x00000000, 0xbfd00000, 0x76acf82d, 0xbfa4a031, 0x56c62dda,
 262             0x3fee9f41, 0xe0000000, 0x3c8760b1, 0x00000000, 0xbfd00000,
 263             0x65455a75, 0x3fbe0875, 0xcf328d46, 0x3fed906b, 0x20000000,
 264             0x3c7457e6, 0x00000000, 0xbfe00000, 0x7f909c4e, 0x3f9d4a2c,
 265             0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000,
 266             0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0x3fea9b66,
 267             0xe0000000, 0x3c39f630, 0x00000000, 0xbfe00000, 0x94247758,
 268             0xbfc133cc, 0x6b151741, 0x3fe8bc80, 0x20000000, 0xbc82c5e1,
 269             0x00000000, 0xbfe00000, 0x99fcef32, 0xbfca8279, 0x667f3bcd,
 270             0x3fe6a09e, 0x20000000, 0xbc8bdd34, 0x00000000, 0xbfe00000,
 271             0x53aba2fd, 0x3fcd0dfe, 0x25091dd6, 0x3fe44cf3, 0x20000000,
 272             0x3c68076a, 0x00000000, 0xbff00000, 0x5bc57974, 0x3fc59267,
 273             0x39ae68c8, 0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000,
 274             0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0x3fde2b5d,
 275             0x20000000, 0x3c5e0d89, 0x00000000, 0xbff00000, 0x866b95cf,
 276             0x3fb37ca1, 0xa6aea963, 0x3fd87de2, 0xe0000000, 0xbc672ced,
 277             0x00000000, 0xbff00000, 0x939d225a, 0x3fa60bea, 0x2ed59f06,
 278             0x3fd29406, 0xa0000000, 0xbc75d28d, 0x00000000, 0xbff00000,
 279             0x011469fb, 0x3f93ad06, 0x3c69a60b, 0x3fc8f8b8, 0xc0000000,
 280             0xbc626d19, 0x00000000, 0xbff00000, 0x176d6d31, 0x3f73b92e,
 281             0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000,
 282             0xbff00000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
 283             0x00000000, 0x00000000, 0x00000000, 0xbff00000, 0x176d6d31,
 284             0x3f73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718,
 285             0x00000000, 0xbff00000, 0x011469fb, 0x3f93ad06, 0x3c69a60b,
 286             0xbfc8f8b8, 0xc0000000, 0x3c626d19, 0x00000000, 0xbff00000,
 287             0x939d225a, 0x3fa60bea, 0x2ed59f06, 0xbfd29406, 0xa0000000,
 288             0x3c75d28d, 0x00000000, 0xbff00000, 0x866b95cf, 0x3fb37ca1,
 289             0xa6aea963, 0xbfd87de2, 0xe0000000, 0x3c672ced, 0x00000000,
 290             0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0xbfde2b5d,
 291             0x20000000, 0xbc5e0d89, 0x00000000, 0xbff00000, 0x5bc57974,
 292             0x3fc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd,
 293             0x00000000, 0xbff00000, 0x53aba2fd, 0x3fcd0dfe, 0x25091dd6,
 294             0xbfe44cf3, 0x20000000, 0xbc68076a, 0x00000000, 0xbff00000,
 295             0x99fcef32, 0xbfca8279, 0x667f3bcd, 0xbfe6a09e, 0x20000000,
 296             0x3c8bdd34, 0x00000000, 0xbfe00000, 0x94247758, 0xbfc133cc,
 297             0x6b151741, 0xbfe8bc80, 0x20000000, 0x3c82c5e1, 0x00000000,
 298             0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0xbfea9b66,
 299             0xe0000000, 0xbc39f630, 0x00000000, 0xbfe00000, 0x7f909c4e,
 300             0x3f9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1,
 301             0x00000000, 0xbfe00000, 0x65455a75, 0x3fbe0875, 0xcf328d46,
 302             0xbfed906b, 0x20000000, 0xbc7457e6, 0x00000000, 0xbfe00000,
 303             0x76acf82d, 0xbfa4a031, 0x56c62dda, 0xbfee9f41, 0xe0000000,
 304             0xbc8760b1, 0x00000000, 0xbfd00000, 0x0e5967d5, 0x3fac1d1f,
 305             0xcff75cb0, 0xbfef6297, 0x20000000, 0xbc756217, 0x00000000,
 306             0xbfd00000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0xbfefd88d,
 307             0x40000000, 0x3c887df6, 0x00000000, 0xbfc00000, 0x00000000,
 308             0x00000000, 0x00000000, 0xbff00000, 0x00000000, 0x00000000,
 309             0x00000000, 0x00000000, 0x0f592f50, 0xbf9ba165, 0xa3d12526,
 310             0xbfefd88d, 0x40000000, 0x3c887df6, 0x00000000, 0x3fc00000,
 311             0x0e5967d5, 0xbfac1d1f, 0xcff75cb0, 0xbfef6297, 0x20000000,
 312             0xbc756217, 0x00000000, 0x3fd00000, 0x76acf82d, 0x3fa4a031,
 313             0x56c62dda, 0xbfee9f41, 0xe0000000, 0xbc8760b1, 0x00000000,
 314             0x3fd00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0xbfed906b,
 315             0x20000000, 0xbc7457e6, 0x00000000, 0x3fe00000, 0x7f909c4e,
 316             0xbf9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1,
 317             0x00000000, 0x3fe00000, 0x9ae68c87, 0x3fac73b3, 0x290ea1a3,
 318             0xbfea9b66, 0xe0000000, 0xbc39f630, 0x00000000, 0x3fe00000,
 319             0x94247758, 0x3fc133cc, 0x6b151741, 0xbfe8bc80, 0x20000000,
 320             0x3c82c5e1, 0x00000000, 0x3fe00000, 0x99fcef32, 0x3fca8279,
 321             0x667f3bcd, 0xbfe6a09e, 0x20000000, 0x3c8bdd34, 0x00000000,
 322             0x3fe00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0xbfe44cf3,
 323             0x20000000, 0xbc68076a, 0x00000000, 0x3ff00000, 0x5bc57974,
 324             0xbfc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd,
 325             0x00000000, 0x3ff00000, 0x73fa1279, 0xbfbe3a68, 0x3806f63b,
 326             0xbfde2b5d, 0x20000000, 0xbc5e0d89, 0x00000000, 0x3ff00000,
 327             0x866b95cf, 0xbfb37ca1, 0xa6aea963, 0xbfd87de2, 0xe0000000,
 328             0x3c672ced, 0x00000000, 0x3ff00000, 0x939d225a, 0xbfa60bea,
 329             0x2ed59f06, 0xbfd29406, 0xa0000000, 0x3c75d28d, 0x00000000,
 330             0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0xbfc8f8b8,
 331             0xc0000000, 0x3c626d19, 0x00000000, 0x3ff00000, 0x176d6d31,
 332             0xbf73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718,
 333             0x00000000, 0x3ff00000
 334             // @formatter:on
 335     });
 336 
 337     private ArrayDataPointerConstant sc2 = pointerConstant(16, new int[]{
                 // SC_2 coefficient pair (msc2 = SC_2 * r2 in the class comment). First double
                 // 0x3f81111111111111 (about 1/5!), second double 0x3fa5555555555555 (about 1/4!);
                 // each is written low 32-bit word first.
 338             // @formatter:off
 339             0x11111111, 0x3f811111, 0x55555555, 0x3fa55555
 340             // @formatter:on
 341     });
 342 
 343     private ArrayDataPointerConstant sc3 = pointerConstant(16, new int[]{
                 // SC_3 coefficient pair (psc3 = SC_3 + msc4 in the class comment). First double
                 // 0xbf2a01a01a01a01a (about -1/7!), second double 0xbf56c16c16c16c17 (about -1/6!);
                 // each is written low 32-bit word first.
 344             // @formatter:off
 345             0x1a01a01a, 0xbf2a01a0, 0x16c16c17, 0xbf56c16c
 346             // @formatter:on
 347     });
 348 
 349     private ArrayDataPointerConstant sc1 = pointerConstant(16, new int[]{
                 // SC_1 coefficient pair (psc1 = SC_1 + msc2 in the class comment). First double
                 // 0xbfc5555555555555 (about -1/3!), second double 0xbfe0000000000000 (-1/2);
                 // each is written low 32-bit word first.
 350             // @formatter:off
 351             0x55555555, 0xbfc55555, 0x00000000, 0xbfe00000
 352             // @formatter:on
 353     });
 354 
 355     private ArrayDataPointerConstant piInvTable = pointerConstant(16, new int[]{
                 // 41 32-bit words used by the large-argument path of emitCode (block1). That path
                 // derives a byte offset into this table from the argument's exponent, loads seven
                 // consecutive words from there and multiplies them by the argument's significand
                 // using 32x32-bit partial products.
                 // NOTE(review): after the two leading zero words the values look like the
                 // hexadecimal expansion of 2/pi (0.A2F9836E 4E441529 ...), most significant word
                 // first - confirm against the original Intel LIBM source.
 356             // @formatter:off
 357             0x00000000, 0x00000000, 0xa2f9836e, 0x4e441529, 0xfc2757d1,
 358             0xf534ddc0, 0xdb629599, 0x3c439041, 0xfe5163ab, 0xdebbc561,
 359             0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c, 0xfe1deb1c,
 360             0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
 361             0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff,
 362             0xde05980f, 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7,
 363             0x4f463f66, 0x9e5fea2d, 0x7527bac7, 0xebe5f17b, 0x3d0739f7,
 364             0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08, 0x56033046, 0xfc7b6bab,
 365             0xf0cfbc21
 366             // @formatter:on
 367     });
 368 
 369     private ArrayDataPointerConstant pi4 = pointerConstant(8, new int[]{
                 // Two doubles, each written low 32-bit word first: 0x3fe921fb40000000 (the leading
                 // bits of pi/4) followed by 0x3e64442d18469899 (a small low-order term). The
                 // large-argument path of emitCode loads the first double with movq.
 370             // @formatter:off
 371             0x40000000, 0x3fe921fb, 0x18469899, 0x3e64442d
 372             // @formatter:on
 373     });
 374 
 375     private ArrayDataPointerConstant pi48 = pointerConstant(8, new int[]{
                 // One double, 0x3e64442d18469899 (low 32-bit word first) - the same bit pattern as
                 // the second double of pi4. The large-argument path of emitCode loads it into xmm6
                 // right after loading pi4.
 376             // @formatter:off
 377             0x18469899, 0x3e64442d
 378             // @formatter:on
 379     });
 380 
 381     private ArrayDataPointerConstant pi32Inv = pointerConstant(8, new int[]{
                 // One double, 0x40245f306dc9c883 (low 32-bit word first), approximately 32/pi.
                 // emitCode multiplies the argument by it to obtain y = 32/pi * x.
 382             // @formatter:off
 383             0x6dc9c883, 0x40245f30
 384             // @formatter:on
 385     });
 386 
 387     private ArrayDataPointerConstant signMask = pointerConstant(8, new int[]{
                 // 64-bit pattern 0x8000000000000000 (low 32-bit word first): only the sign bit of a
                 // double is set. emitCode ANDs it with the argument to isolate the argument's sign.
 388             // @formatter:off
 389             0x00000000, 0x80000000
 390             // @formatter:on
 391     });
 392 
 393     private ArrayDataPointerConstant p3 = pointerConstant(8, new int[]{
                 // P_3 of the range reduction in the class comment: the double 0x3b63198a2e037073
                 // (low 32-bit word first). emitCode multiplies N by it to form m_3 = N * P_3.
 394             // @formatter:off
 395             0x2e037073, 0x3b63198a
 396             // @formatter:on
 397     });
 398 
 399     private ArrayDataPointerConstant p1 = pointerConstant(8, new int[]{
                 // P_1 of the range reduction in the class comment: the double 0x3fb921fb54400000
                 // (low 32-bit word first), i.e. the leading bits of pi/32. emitCode multiplies it
                 // by N to form m_1 = N * P_1.
 400             // @formatter:off
 401             0x54400000, 0x3fb921fb
 402             // @formatter:on
 403     });
 404 
 405     private ArrayDataPointerConstant negZero = pointerConstant(8, new int[]{
                 // 64-bit pattern 0x8000000000000000 (low 32-bit word first), the encoding of -0.0.
                 // The contents are identical to signMask.
                 // NOTE(review): no use of this field appears in the part of emitCode covered by
                 // this review - check whether the rest of the method references it.
 406             // @formatter:off
 407             0x00000000, 0x80000000
 408             // @formatter:on
 409     });
 410 
 411     // The 64 bit code is at most SSE2 compliant
 412     private ArrayDataPointerConstant one = pointerConstant(8, new int[]{
                 // The double 1.0, 0x3ff0000000000000 (low 32-bit word first). The small-argument
                 // path of emitCode subtracts |x| from it, giving the 1 - |x| result described in
                 // the class comment.
 413             // @formatter:off
 414             0x00000000, 0x3ff00000
 415             // @formatter:on
 416     });
 417 
 418     @Override
 419     public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
 420         Label block0 = new Label();
 421         Label block1 = new Label();
 422         Label block2 = new Label();
 423         Label block3 = new Label();
 424         Label block4 = new Label();
 425         Label block5 = new Label();
 426         Label block6 = new Label();
 427         Label block7 = new Label();
 428         Label block8 = new Label();
 429         Label block9 = new Label();
 430         Label block10 = new Label();
 431         Label block11 = new Label();
 432         Label block12 = new Label();
 433         Label block13 = new Label();
 434 
 435         masm.push(rbx);
 436         masm.subq(rsp, 16);
 437         masm.movsd(new AMD64Address(rsp, 8), xmm0);
 438 
 439         masm.movl(rax, new AMD64Address(rsp, 12));
 440         masm.movq(xmm1, recordExternalAddress(crb, pi32Inv));          // 0x6dc9c883, 0x40245f30
 441         masm.andl(rax, 2147418112);
 442         masm.subl(rax, 808452096);
 443         masm.cmpl(rax, 281346048);
 444         masm.jcc(ConditionFlag.Above, block0);
 445         masm.mulsd(xmm1, xmm0);
 446         masm.movdqu(xmm5, recordExternalAddress(crb, onehalf));        // 0x00000000, 0x3fe00000,
 447                                                                        // 0x00000000, 0x3fe00000
 448         masm.movq(xmm4, recordExternalAddress(crb, signMask));         // 0x00000000, 0x80000000
 449         masm.pand(xmm4, xmm0);
 450         masm.por(xmm5, xmm4);
 451         masm.addpd(xmm1, xmm5);
 452         masm.cvttsd2sil(rdx, xmm1);
 453         masm.cvtsi2sdl(xmm1, rdx);
 454         masm.movdqu(xmm2, recordExternalAddress(crb, p2));             // 0x1a600000, 0x3d90b461,
 455                                                                        // 0x1a600000, 0x3d90b461
 456         masm.movq(xmm3, recordExternalAddress(crb, p1));               // 0x54400000, 0x3fb921fb
 457         masm.mulsd(xmm3, xmm1);
 458         masm.unpcklpd(xmm1, xmm1);
 459         masm.addq(rdx, 1865232);
 460         masm.movdqu(xmm4, xmm0);
 461         masm.andq(rdx, 63);
 462         masm.movdqu(xmm5, recordExternalAddress(crb, sc4));            // 0xa556c734, 0x3ec71de3,
 463                                                                        // 0x1a01a01a, 0x3efa01a0
 464         masm.leaq(rax, recordExternalAddress(crb, ctable));
 465         masm.shlq(rdx, 5);
 466         masm.addq(rax, rdx);
 467         masm.mulpd(xmm2, xmm1);
 468         masm.subsd(xmm0, xmm3);
 469         masm.mulsd(xmm1, recordExternalAddress(crb, p3));              // 0x2e037073, 0x3b63198a
 470         masm.subsd(xmm4, xmm3);
 471         masm.movq(xmm7, new AMD64Address(rax, 8));
 472         masm.unpcklpd(xmm0, xmm0);
 473         masm.movdqu(xmm3, xmm4);
 474         masm.subsd(xmm4, xmm2);
 475         masm.mulpd(xmm5, xmm0);
 476         masm.subpd(xmm0, xmm2);
 477         masm.movdqu(xmm6, recordExternalAddress(crb, sc2));            // 0x11111111, 0x3f811111,
 478                                                                        // 0x55555555, 0x3fa55555
 479         masm.mulsd(xmm7, xmm4);
 480         masm.subsd(xmm3, xmm4);
 481         masm.mulpd(xmm5, xmm0);
 482         masm.mulpd(xmm0, xmm0);
 483         masm.subsd(xmm3, xmm2);
 484         masm.movdqu(xmm2, new AMD64Address(rax, 0));
 485         masm.subsd(xmm1, xmm3);
 486         masm.movq(xmm3, new AMD64Address(rax, 24));
 487         masm.addsd(xmm2, xmm3);
 488         masm.subsd(xmm7, xmm2);
 489         masm.mulsd(xmm2, xmm4);
 490         masm.mulpd(xmm6, xmm0);
 491         masm.mulsd(xmm3, xmm4);
 492         masm.mulpd(xmm2, xmm0);
 493         masm.mulpd(xmm0, xmm0);
 494         masm.addpd(xmm5, recordExternalAddress(crb, sc3));             // 0x1a01a01a, 0xbf2a01a0,
 495                                                                        // 0x16c16c17, 0xbf56c16c
 496         masm.mulsd(xmm4, new AMD64Address(rax, 0));
 497         masm.addpd(xmm6, recordExternalAddress(crb, sc1));             // 0x55555555, 0xbfc55555,
 498                                                                        // 0x00000000, 0xbfe00000
 499         masm.mulpd(xmm5, xmm0);
 500         masm.movdqu(xmm0, xmm3);
 501         masm.addsd(xmm3, new AMD64Address(rax, 8));
 502         masm.mulpd(xmm1, xmm7);
 503         masm.movdqu(xmm7, xmm4);
 504         masm.addsd(xmm4, xmm3);
 505         masm.addpd(xmm6, xmm5);
 506         masm.movq(xmm5, new AMD64Address(rax, 8));
 507         masm.subsd(xmm5, xmm3);
 508         masm.subsd(xmm3, xmm4);
 509         masm.addsd(xmm1, new AMD64Address(rax, 16));
 510         masm.mulpd(xmm6, xmm2);
 511         masm.addsd(xmm0, xmm5);
 512         masm.addsd(xmm3, xmm7);
 513         masm.addsd(xmm0, xmm1);
 514         masm.addsd(xmm0, xmm3);
 515         masm.addsd(xmm0, xmm6);
 516         masm.unpckhpd(xmm6, xmm6);
 517         masm.addsd(xmm0, xmm6);
 518         masm.addsd(xmm0, xmm4);
 519         masm.jmp(block13);
 520 
 521         masm.bind(block0);
 522         masm.jcc(ConditionFlag.Greater, block1);
 523         masm.pextrw(rax, xmm0, 3);
 524         masm.andl(rax, 32767);
 525         masm.pinsrw(xmm0, rax, 3);
 526         masm.movq(xmm1, recordExternalAddress(crb, one));              // 0x00000000, 0x3ff00000
 527         masm.subsd(xmm1, xmm0);
 528         masm.movdqu(xmm0, xmm1);
 529         masm.jmp(block13);
 530 
 531         masm.bind(block1);
 532         masm.pextrw(rax, xmm0, 3);
 533         masm.andl(rax, 32752);
 534         masm.cmpl(rax, 32752);
 535         masm.jcc(ConditionFlag.Equal, block2);
 536         masm.pextrw(rcx, xmm0, 3);
 537         masm.andl(rcx, 32752);
 538         masm.subl(rcx, 16224);
 539         masm.shrl(rcx, 7);
 540         masm.andl(rcx, 65532);
 541         masm.leaq(r11, recordExternalAddress(crb, piInvTable));
 542         masm.addq(rcx, r11);
 543         masm.movdq(rax, xmm0);
 544         masm.movl(r10, new AMD64Address(rcx, 20));
 545         masm.movl(r8, new AMD64Address(rcx, 24));
 546         masm.movl(rdx, rax);
 547         masm.shrq(rax, 21);
 548         masm.orl(rax, Integer.MIN_VALUE);
 549         masm.shrl(rax, 11);
 550         masm.movl(r9, r10);
 551         masm.imulq(r10, rdx);
 552         masm.imulq(r9, rax);
 553         masm.imulq(r8, rax);
 554         masm.movl(rsi, new AMD64Address(rcx, 16));
 555         masm.movl(rdi, new AMD64Address(rcx, 12));
 556         masm.movl(r11, r10);
 557         masm.shrq(r10, 32);
 558         masm.addq(r9, r10);
 559         masm.addq(r11, r8);
 560         masm.movl(r8, r11);
 561         masm.shrq(r11, 32);
 562         masm.addq(r9, r11);
 563         masm.movl(r10, rsi);
 564         masm.imulq(rsi, rdx);
 565         masm.imulq(r10, rax);
 566         masm.movl(r11, rdi);
 567         masm.imulq(rdi, rdx);
 568         masm.movl(rbx, rsi);
 569         masm.shrq(rsi, 32);
 570         masm.addq(r9, rbx);
 571         masm.movl(rbx, r9);
 572         masm.shrq(r9, 32);
 573         masm.addq(r10, rsi);
 574         masm.addq(r10, r9);
 575         masm.shlq(rbx, 32);
 576         masm.orq(r8, rbx);
 577         masm.imulq(r11, rax);
 578         masm.movl(r9, new AMD64Address(rcx, 8));
 579         masm.movl(rsi, new AMD64Address(rcx, 4));
 580         masm.movl(rbx, rdi);
 581         masm.shrq(rdi, 32);
 582         masm.addq(r10, rbx);
 583         masm.movl(rbx, r10);
 584         masm.shrq(r10, 32);
 585         masm.addq(r11, rdi);
 586         masm.addq(r11, r10);
 587         masm.movq(rdi, r9);
 588         masm.imulq(r9, rdx);
 589         masm.imulq(rdi, rax);
 590         masm.movl(r10, r9);
 591         masm.shrq(r9, 32);
 592         masm.addq(r11, r10);
 593         masm.movl(r10, r11);
 594         masm.shrq(r11, 32);
 595         masm.addq(rdi, r9);
 596         masm.addq(rdi, r11);
 597         masm.movq(r9, rsi);
 598         masm.imulq(rsi, rdx);
 599         masm.imulq(r9, rax);
 600         masm.shlq(r10, 32);
 601         masm.orq(r10, rbx);
 602         masm.movl(rax, new AMD64Address(rcx, 0));
 603         masm.movl(r11, rsi);
 604         masm.shrq(rsi, 32);
 605         masm.addq(rdi, r11);
 606         masm.movl(r11, rdi);
 607         masm.shrq(rdi, 32);
 608         masm.addq(r9, rsi);
 609         masm.addq(r9, rdi);
 610         masm.imulq(rdx, rax);
 611         masm.pextrw(rbx, xmm0, 3);
 612         masm.leaq(rdi, recordExternalAddress(crb, piInvTable));
 613         masm.subq(rcx, rdi);
 614         masm.addl(rcx, rcx);
 615         masm.addl(rcx, rcx);
 616         masm.addl(rcx, rcx);
 617         masm.addl(rcx, 19);
 618         masm.movl(rsi, 32768);
 619         masm.andl(rsi, rbx);
 620         masm.shrl(rbx, 4);
 621         masm.andl(rbx, 2047);
 622         masm.subl(rbx, 1023);
 623         masm.subl(rcx, rbx);
 624         masm.addq(r9, rdx);
 625         masm.movl(rdx, rcx);
 626         masm.addl(rdx, 32);
 627         masm.cmpl(rcx, 1);
 628         masm.jcc(ConditionFlag.Less, block3);
 629         masm.negl(rcx);
 630         masm.addl(rcx, 29);
 631         masm.shll(r9);
 632         masm.movl(rdi, r9);
 633         masm.andl(r9, 536870911);
 634         masm.testl(r9, 268435456);
 635         masm.jcc(ConditionFlag.NotEqual, block4);
 636         masm.shrl(r9);
 637         masm.movl(rbx, 0);
 638         masm.shlq(r9, 32);
 639         masm.orq(r9, r11);
 640 
 641         masm.bind(block5);
 642 
 643         masm.bind(block6);
 644         masm.cmpq(r9, 0);
 645         masm.jcc(ConditionFlag.Equal, block7);
 646 
 647         masm.bind(block8);
 648         masm.bsrq(r11, r9);
 649         masm.movl(rcx, 29);
 650         masm.subl(rcx, r11);
 651         masm.jcc(ConditionFlag.LessEqual, block9);
 652         masm.shlq(r9);
 653         masm.movq(rax, r10);
 654         masm.shlq(r10);
 655         masm.addl(rdx, rcx);
 656         masm.negl(rcx);
 657         masm.addl(rcx, 64);
 658         masm.shrq(rax);
 659         masm.shrq(r8);
 660         masm.orq(r9, rax);
 661         masm.orq(r10, r8);
 662 
 663         masm.bind(block10);
 664         masm.cvtsi2sdq(xmm0, r9);
 665         masm.shrq(r10, 1);
 666         masm.cvtsi2sdq(xmm3, r10);
 667         masm.xorpd(xmm4, xmm4);
 668         masm.shll(rdx, 4);
 669         masm.negl(rdx);
 670         masm.addl(rdx, 16368);
 671         masm.orl(rdx, rsi);
 672         masm.xorl(rdx, rbx);
 673         masm.pinsrw(xmm4, rdx, 3);
 674         masm.movq(xmm2, recordExternalAddress(crb, pi4));              // 0x40000000, 0x3fe921fb,
 675                                                                        // 0x18469899, 0x3e64442d
 676         masm.movq(xmm6, recordExternalAddress(crb, pi48));             // 0x3fe921fb, 0x18469899,
 677                                                                        // 0x3e64442d
 678         masm.xorpd(xmm5, xmm5);
 679         masm.subl(rdx, 1008);
 680         masm.pinsrw(xmm5, rdx, 3);
 681         masm.mulsd(xmm0, xmm4);
 682         masm.shll(rsi, 16);
 683         masm.sarl(rsi, 31);
 684         masm.mulsd(xmm3, xmm5);
 685         masm.movdqu(xmm1, xmm0);
 686         masm.mulsd(xmm0, xmm2);
 687         masm.shrl(rdi, 29);
 688         masm.addsd(xmm1, xmm3);
 689         masm.mulsd(xmm3, xmm2);
 690         masm.addl(rdi, rsi);
 691         masm.xorl(rdi, rsi);
 692         masm.mulsd(xmm6, xmm1);
 693         masm.movl(rax, rdi);
 694         masm.addsd(xmm6, xmm3);
 695         masm.movdqu(xmm2, xmm0);
 696         masm.addsd(xmm0, xmm6);
 697         masm.subsd(xmm2, xmm0);
 698         masm.addsd(xmm6, xmm2);
 699 
 700         masm.bind(block11);
 701         masm.movq(xmm1, recordExternalAddress(crb, pi32Inv));          // 0x6dc9c883, 0x40245f30
 702         masm.mulsd(xmm1, xmm0);
 703         masm.movq(xmm5, recordExternalAddress(crb, onehalf));          // 0x00000000, 0x3fe00000,
 704                                                                        // 0x00000000, 0x3fe00000
 705         masm.movq(xmm4, recordExternalAddress(crb, signMask));         // 0x00000000, 0x80000000
 706         masm.pand(xmm4, xmm0);
 707         masm.por(xmm5, xmm4);
 708         masm.addpd(xmm1, xmm5);
 709         masm.cvttsd2siq(rdx, xmm1);
 710         masm.cvtsi2sdq(xmm1, rdx);
 711         masm.movq(xmm3, recordExternalAddress(crb, p1));               // 0x54400000, 0x3fb921fb
 712         masm.movdqu(xmm2, recordExternalAddress(crb, p2));             // 0x1a600000, 0x3d90b461,
 713                                                                        // 0x1a600000, 0x3d90b461
 714         masm.mulsd(xmm3, xmm1);
 715         masm.unpcklpd(xmm1, xmm1);
 716         masm.shll(rax, 3);
 717         masm.addl(rdx, 1865232);
 718         masm.movdqu(xmm4, xmm0);
 719         masm.addl(rdx, rax);
 720         masm.andl(rdx, 63);
 721         masm.movdqu(xmm5, recordExternalAddress(crb, sc4));            // 0xa556c734, 0x3ec71de3,
 722                                                                        // 0x1a01a01a, 0x3efa01a0
 723         masm.leaq(rax, recordExternalAddress(crb, ctable));
 724         masm.shll(rdx, 5);
 725         masm.addq(rax, rdx);
 726         masm.mulpd(xmm2, xmm1);
 727         masm.subsd(xmm0, xmm3);
 728         masm.mulsd(xmm1, recordExternalAddress(crb, p3));              // 0x2e037073, 0x3b63198a
 729         masm.subsd(xmm4, xmm3);
 730         masm.movq(xmm7, new AMD64Address(rax, 8));
 731         masm.unpcklpd(xmm0, xmm0);
 732         masm.movdqu(xmm3, xmm4);
 733         masm.subsd(xmm4, xmm2);
 734         masm.mulpd(xmm5, xmm0);
 735         masm.subpd(xmm0, xmm2);
 736         masm.mulsd(xmm7, xmm4);
 737         masm.subsd(xmm3, xmm4);
 738         masm.mulpd(xmm5, xmm0);
 739         masm.mulpd(xmm0, xmm0);
 740         masm.subsd(xmm3, xmm2);
 741         masm.movdqu(xmm2, new AMD64Address(rax, 0));
 742         masm.subsd(xmm1, xmm3);
 743         masm.movq(xmm3, new AMD64Address(rax, 24));
 744         masm.addsd(xmm2, xmm3);
 745         masm.subsd(xmm7, xmm2);
 746         masm.subsd(xmm1, xmm6);
 747         masm.movdqu(xmm6, recordExternalAddress(crb, sc2));            // 0x11111111, 0x3f811111,
 748                                                                        // 0x55555555, 0x3fa55555
 749         masm.mulsd(xmm2, xmm4);
 750         masm.mulpd(xmm6, xmm0);
 751         masm.mulsd(xmm3, xmm4);
 752         masm.mulpd(xmm2, xmm0);
 753         masm.mulpd(xmm0, xmm0);
 754         masm.addpd(xmm5, recordExternalAddress(crb, sc3));             // 0x1a01a01a, 0xbf2a01a0,
 755                                                                        // 0x16c16c17, 0xbf56c16c
 756         masm.mulsd(xmm4, new AMD64Address(rax, 0));
 757         masm.addpd(xmm6, recordExternalAddress(crb, sc1));             // 0x55555555, 0xbfc55555,
 758                                                                        // 0x00000000, 0xbfe00000
 759         masm.mulpd(xmm5, xmm0);
 760         masm.movdqu(xmm0, xmm3);
 761         masm.addsd(xmm3, new AMD64Address(rax, 8));
 762         masm.mulpd(xmm1, xmm7);
 763         masm.movdqu(xmm7, xmm4);
 764         masm.addsd(xmm4, xmm3);
 765         masm.addpd(xmm6, xmm5);
 766         masm.movq(xmm5, new AMD64Address(rax, 8));
 767         masm.subsd(xmm5, xmm3);
 768         masm.subsd(xmm3, xmm4);
 769         masm.addsd(xmm1, new AMD64Address(rax, 16));
 770         masm.mulpd(xmm6, xmm2);
 771         masm.addsd(xmm5, xmm0);
 772         masm.addsd(xmm3, xmm7);
 773         masm.addsd(xmm1, xmm5);
 774         masm.addsd(xmm1, xmm3);
 775         masm.addsd(xmm1, xmm6);
 776         masm.unpckhpd(xmm6, xmm6);
 777         masm.movdqu(xmm0, xmm4);
 778         masm.addsd(xmm1, xmm6);
 779         masm.addsd(xmm0, xmm1);
 780         masm.jmp(block13);
 781 
 782         masm.bind(block7);
 783         masm.addl(rdx, 64);
 784         masm.movq(r9, r10);
 785         masm.movq(r10, r8);
 786         masm.movl(r8, 0);
 787         masm.cmpq(r9, 0);
 788         masm.jcc(ConditionFlag.NotEqual, block8);
 789         masm.addl(rdx, 64);
 790         masm.movq(r9, r10);
 791         masm.movq(r10, r8);
 792         masm.cmpq(r9, 0);
 793         masm.jcc(ConditionFlag.NotEqual, block8);
 794         masm.xorpd(xmm0, xmm0);
 795         masm.xorpd(xmm6, xmm6);
 796         masm.jmp(block11);
 797 
 798         masm.bind(block9);
 799         masm.jcc(ConditionFlag.Equal, block10);
 800         masm.negl(rcx);
 801         masm.shrq(r10);
 802         masm.movq(rax, r9);
 803         masm.shrq(r9);
 804         masm.subl(rdx, rcx);
 805         masm.negl(rcx);
 806         masm.addl(rcx, 64);
 807         masm.shlq(rax);
 808         masm.orq(r10, rax);
 809         masm.jmp(block10);
 810         masm.bind(block3);
 811         masm.negl(rcx);
 812         masm.shlq(r9, 32);
 813         masm.orq(r9, r11);
 814         masm.shlq(r9);
 815         masm.movq(rdi, r9);
 816         masm.testl(r9, Integer.MIN_VALUE);
 817         masm.jcc(ConditionFlag.NotEqual, block12);
 818         masm.shrl(r9);
 819         masm.movl(rbx, 0);
 820         masm.shrq(rdi, 3);
 821         masm.jmp(block6);
 822 
 823         masm.bind(block4);
 824         masm.shrl(r9);
 825         masm.movl(rbx, 536870912);
 826         masm.shrl(rbx);
 827         masm.shlq(r9, 32);
 828         masm.orq(r9, r11);
 829         masm.shlq(rbx, 32);
 830         masm.addl(rdi, 536870912);
 831         masm.movl(rcx, 0);
 832         masm.movl(r11, 0);
 833         masm.subq(rcx, r8);
 834         masm.sbbq(r11, r10);
 835         masm.sbbq(rbx, r9);
 836         masm.movq(r8, rcx);
 837         masm.movq(r10, r11);
 838         masm.movq(r9, rbx);
 839         masm.movl(rbx, 32768);
 840         masm.jmp(block5);
 841 
 842         masm.bind(block12);
 843         masm.shrl(r9);
 844         masm.movq(rbx, 0x100000000L);
 845         masm.shrq(rbx);
 846         masm.movl(rcx, 0);
 847         masm.movl(r11, 0);
 848         masm.subq(rcx, r8);
 849         masm.sbbq(r11, r10);
 850         masm.sbbq(rbx, r9);
 851         masm.movq(r8, rcx);
 852         masm.movq(r10, r11);
 853         masm.movq(r9, rbx);
 854         masm.movl(rbx, 32768);
 855         masm.shrq(rdi, 3);
 856         masm.addl(rdi, 536870912);
 857         masm.jmp(block6);
 858 
 859         masm.bind(block2);
 860         masm.movsd(xmm0, new AMD64Address(rsp, 8));
 861         masm.mulsd(xmm0, recordExternalAddress(crb, negZero));         // 0x00000000, 0x80000000
 862         masm.movq(new AMD64Address(rsp, 0), xmm0);
 863 
 864         masm.bind(block13);
 865         masm.addq(rsp, 16);
 866         masm.pop(rbx);
 867     }
 868 
 869 }