/*
 * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, Intel Corporation. All rights reserved.
 * Intel Math Library (LIBM) Source Code
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.amd64.AMD64.r10;
import static jdk.vm.ci.amd64.AMD64.r11;
import static jdk.vm.ci.amd64.AMD64.r8;
import static jdk.vm.ci.amd64.AMD64.r9;
import static jdk.vm.ci.amd64.AMD64.rax;
import static jdk.vm.ci.amd64.AMD64.rbx;
import static jdk.vm.ci.amd64.AMD64.rcx;
import static jdk.vm.ci.amd64.AMD64.rdi;
import static jdk.vm.ci.amd64.AMD64.rdx;
import static jdk.vm.ci.amd64.AMD64.rsi;
import static jdk.vm.ci.amd64.AMD64.rsp;
import static jdk.vm.ci.amd64.AMD64.xmm0;
import static jdk.vm.ci.amd64.AMD64.xmm1;
import static jdk.vm.ci.amd64.AMD64.xmm2;
import static jdk.vm.ci.amd64.AMD64.xmm3;
import static jdk.vm.ci.amd64.AMD64.xmm4;
import static jdk.vm.ci.amd64.AMD64.xmm5;
import static jdk.vm.ci.amd64.AMD64.xmm6;
import static jdk.vm.ci.amd64.AMD64.xmm7;
import static org.graalvm.compiler.lir.amd64.AMD64HotSpotHelper.pointerConstant;
import static org.graalvm.compiler.lir.amd64.AMD64HotSpotHelper.recordExternalAddress;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.asm.ArrayDataPointerConstant;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;

/**
 * <pre>
 *                     ALGORITHM DESCRIPTION - COS()
 *                     ---------------------
 *
 *     1. RANGE REDUCTION
 *
 *     We perform an initial range reduction from X to r with
 *
 *          X =~= N * pi/32 + r
 *
 *     so that |r| <= pi/64 + epsilon. We restrict inputs to those
 *     where |N| <= 932560. Beyond this, the range reduction is
 *     insufficiently accurate. For extremely small inputs,
 *     denormalization can occur internally, impacting performance.
 *     This means that the main path is actually only taken for
 *     2^-252 <= |X| < 90112.
 *
 *     To avoid branches, we perform the range reduction to full
 *     accuracy each time.
 *
 *          X - N * (P_1 + P_2 + P_3)
 *
 *     where P_1 and P_2 are 32-bit numbers (so multiplication by N
 *     is exact) and P_3 is a 53-bit number. Together, these
 *     approximate pi well enough for all cases in the restricted
 *     range.
 *
 *     The main reduction sequence is:
 *
 *             y = 32/pi * x
 *             N = integer(y)
 *     (computed by adding and subtracting off SHIFTER)
 *
 *             m_1 = N * P_1
 *             m_2 = N * P_2
 *             r_1 = x - m_1
 *             r = r_1 - m_2
 *     (this r can be used for most of the calculation)
 *
 *             c_1 = r_1 - r
 *             m_3 = N * P_3
 *             c_2 = c_1 - m_2
 *             c = c_2 - m_3
 *
 *     2. MAIN ALGORITHM
 *
 *     The algorithm uses a table lookup based on B = M * pi / 32
 *     where M = N mod 64. The stored values are:
 *       sigma             closest power of 2 to cos(B)
 *       C_hl              53-bit cos(B) - sigma
 *       S_hi + S_lo       2 * 53-bit sin(B)
 *
 *     The computation is organized as follows:
 *
 *          sin(B + r + c) = [sin(B) + sigma * r] +
 *                           r * (cos(B) - sigma) +
 *                           sin(B) * [cos(r + c) - 1] +
 *                           cos(B) * [sin(r + c) - r]
 *
 *     which is approximately:
 *
 *          [S_hi + sigma * r] +
 *          C_hl * r +
 *          S_lo + S_hi * [(cos(r) - 1) - r * c] +
 *          (C_hl + sigma) * [(sin(r) - r) + c]
 *
 *     and this is what is actually computed. We separate this sum
 *     into four parts:
 *
 *          hi + med + pols + corr
 *
 *     where
 *
 *          hi       = S_hi + sigma r
 *          med      = C_hl * r
 *          pols     = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
 *          corr     = S_lo + c * ((C_hl + sigma) - S_hi * r)
 *
 *     3. POLYNOMIAL
 *
 *     The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
 *     (sin(r) - r) can be rearranged freely, since it is quite
 *     small, so we exploit parallelism to the fullest.
 *
 *          psc4       =   SC_4 * r_1
 *          msc4       =   psc4 * r
 *          r2         =   r * r
 *          msc2       =   SC_2 * r2
 *          r4         =   r2 * r2
 *          psc3       =   SC_3 + msc4
 *          psc1       =   SC_1 + msc2
 *          msc3       =   r4 * psc3
 *          sincospols =   psc1 + msc3
 *          pols       =   sincospols *
 *                         <S_hi * r^2 | (C_hl + sigma) * r^3>
 *
 *     4. CORRECTION TERM
 *
 *     This is where the "c" component of the range reduction is
 *     taken into account; recall that just "r" is used for most of
 *     the calculation.
 *
 *          -c   = m_3 - c_2
 *          -d   = S_hi * r - (C_hl + sigma)
 *          corr = -c * -d + S_lo
 *
 *     5. COMPENSATED SUMMATIONS
 *
 *     The two successive compensated summations add up the high
 *     and medium parts, leaving just the low parts to add up at
 *     the end.
 *
 *          rs        =  sigma * r
 *          res_int   =  S_hi + rs
 *          k_0       =  S_hi - res_int
 *          k_2       =  k_0 + rs
 *          med       =  C_hl * r
 *          res_hi    =  res_int + med
 *          k_1       =  res_int - res_hi
 *          k_3       =  k_1 + med
 *
 *     6. FINAL SUMMATION
 *
 *     We now add up all the small parts:
 *
 *          res_lo = pols(hi) + pols(lo) + corr + k_1 + k_3
 *
 *     Now the overall result is just:
 *
 *          res_hi + res_lo
 *
 *     7. SMALL ARGUMENTS
 *
 *     Inputs with |X| < 2^-252 are treated specially as
 *     1 - |x|.
 *
 * Special cases:
 *  cos(NaN) = quiet NaN, and raise invalid exception
 *  cos(INF) = NaN and raise invalid exception
 *  cos(0) = 1
 * </pre>
 *
 * <p>
 * Implementation note: the formulas above are written in terms of {@code sin(B + r + c)}. The
 * emitted code adds 1865232 to N before reducing it modulo 64, and 1865232 is congruent to 16
 * modulo 64, so the table index is offset by 16 steps of pi/32 (i.e. pi/2). NOTE(review): this
 * offset is presumably what turns the sine formula into a cosine; confirm against the Intel LIBM
 * source.
 */
public final class AMD64MathCosOp extends AMD64MathIntrinsicUnaryOp {

    public static final LIRInstructionClass<AMD64MathCosOp> TYPE = LIRInstructionClass.create(AMD64MathCosOp.class);

    /**
     * Creates the op, handing the superclass the general-purpose and XMM registers that
     * {@link #emitCode} writes to.
     *
     * NOTE(review): how the superclass uses this register list (e.g. as temporaries/clobbers) is
     * not visible in this file. xmm0 is deliberately absent from the list; it is presumably the
     * input/result register managed by the superclass - confirm.
     */
    public AMD64MathCosOp() {
        super(TYPE, /* GPR */ rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11,
                        /* XMM */ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
    }

    // All tables below are int arrays holding IEEE-754 doubles as {low word, high word} pairs.
    // NOTE(review): the first argument of pointerConstant is presumably the required alignment in
    // bytes (16 for data loaded as 128-bit operands, 8 for scalar data); the helper is not visible
    // in this file.

    // Two copies of 0.5 (0x3fe00000_00000000); combined with the sign of x it is added before
    // truncation so that the conversion to integer rounds to nearest.
    private ArrayDataPointerConstant onehalf = pointerConstant(16, new int[]{
            // @formatter:off
            0x00000000, 0x3fe00000, 0x00000000, 0x3fe00000
            // @formatter:on
    });

    // Two copies of P_2, the second piece of the split reduction constant (see class comment).
    private ArrayDataPointerConstant p2 = pointerConstant(16, new int[]{
            // @formatter:off
            0x1a600000, 0x3d90b461, 0x1a600000, 0x3d90b461
            // @formatter:on
    });

    // Polynomial coefficients SC_4: approximately {1/362880, 1/40320}.
    private ArrayDataPointerConstant sc4 = pointerConstant(16, new int[]{
            // @formatter:off
            0xa556c734, 0x3ec71de3, 0x1a01a01a, 0x3efa01a0
            // @formatter:on
    });

    // Lookup table with 64 entries of four doubles (32 bytes) each; emitCode indexes it with
    // (index << 5) and reads the doubles at byte offsets 0, 8, 16 and 24 of the selected entry.
    // NOTE(review): judging by the values, an entry appears to be {C_hl, S_hi, S_lo, sigma} in the
    // terminology of the class comment - confirm against the Intel LIBM source.
    private ArrayDataPointerConstant ctable = pointerConstant(16, new int[]{
            // @formatter:off
            0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x3ff00000, 0x176d6d31, 0xbf73b92e,
            0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000,
            0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0x3fc8f8b8,
            0xc0000000, 0xbc626d19, 0x00000000, 0x3ff00000, 0x939d225a,
            0xbfa60bea, 0x2ed59f06, 0x3fd29406, 0xa0000000, 0xbc75d28d,
            0x00000000, 0x3ff00000, 0x866b95cf, 0xbfb37ca1, 0xa6aea963,
            0x3fd87de2, 0xe0000000, 0xbc672ced, 0x00000000, 0x3ff00000,
            0x73fa1279, 0xbfbe3a68, 0x3806f63b, 0x3fde2b5d, 0x20000000,
            0x3c5e0d89, 0x00000000, 0x3ff00000, 0x5bc57974, 0xbfc59267,
            0x39ae68c8, 0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000,
            0x3ff00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0x3fe44cf3,
            0x20000000, 0x3c68076a, 0x00000000, 0x3ff00000, 0x99fcef32,
            0x3fca8279, 0x667f3bcd, 0x3fe6a09e, 0x20000000, 0xbc8bdd34,
            0x00000000, 0x3fe00000, 0x94247758, 0x3fc133cc, 0x6b151741,
            0x3fe8bc80, 0x20000000, 0xbc82c5e1, 0x00000000, 0x3fe00000,
            0x9ae68c87, 0x3fac73b3, 0x290ea1a3, 0x3fea9b66, 0xe0000000,
            0x3c39f630, 0x00000000, 0x3fe00000, 0x7f909c4e, 0xbf9d4a2c,
            0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000,
            0x3fe00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0x3fed906b,
            0x20000000, 0x3c7457e6, 0x00000000, 0x3fe00000, 0x76acf82d,
            0x3fa4a031, 0x56c62dda, 0x3fee9f41, 0xe0000000, 0x3c8760b1,
            0x00000000, 0x3fd00000, 0x0e5967d5, 0xbfac1d1f, 0xcff75cb0,
            0x3fef6297, 0x20000000, 0x3c756217, 0x00000000, 0x3fd00000,
            0x0f592f50, 0xbf9ba165, 0xa3d12526, 0x3fefd88d, 0x40000000,
            0xbc887df6, 0x00000000, 0x3fc00000, 0x00000000, 0x00000000,
            0x00000000, 0x3ff00000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0x3fefd88d,
            0x40000000, 0xbc887df6, 0x00000000, 0xbfc00000, 0x0e5967d5,
            0x3fac1d1f, 0xcff75cb0, 0x3fef6297, 0x20000000, 0x3c756217,
            0x00000000, 0xbfd00000, 0x76acf82d, 0xbfa4a031, 0x56c62dda,
            0x3fee9f41, 0xe0000000, 0x3c8760b1, 0x00000000, 0xbfd00000,
            0x65455a75, 0x3fbe0875, 0xcf328d46, 0x3fed906b, 0x20000000,
            0x3c7457e6, 0x00000000, 0xbfe00000, 0x7f909c4e, 0x3f9d4a2c,
            0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000,
            0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0x3fea9b66,
            0xe0000000, 0x3c39f630, 0x00000000, 0xbfe00000, 0x94247758,
            0xbfc133cc, 0x6b151741, 0x3fe8bc80, 0x20000000, 0xbc82c5e1,
            0x00000000, 0xbfe00000, 0x99fcef32, 0xbfca8279, 0x667f3bcd,
            0x3fe6a09e, 0x20000000, 0xbc8bdd34, 0x00000000, 0xbfe00000,
            0x53aba2fd, 0x3fcd0dfe, 0x25091dd6, 0x3fe44cf3, 0x20000000,
            0x3c68076a, 0x00000000, 0xbff00000, 0x5bc57974, 0x3fc59267,
            0x39ae68c8, 0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000,
            0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0x3fde2b5d,
            0x20000000, 0x3c5e0d89, 0x00000000, 0xbff00000, 0x866b95cf,
            0x3fb37ca1, 0xa6aea963, 0x3fd87de2, 0xe0000000, 0xbc672ced,
            0x00000000, 0xbff00000, 0x939d225a, 0x3fa60bea, 0x2ed59f06,
            0x3fd29406, 0xa0000000, 0xbc75d28d, 0x00000000, 0xbff00000,
            0x011469fb, 0x3f93ad06, 0x3c69a60b, 0x3fc8f8b8, 0xc0000000,
            0xbc626d19, 0x00000000, 0xbff00000, 0x176d6d31, 0x3f73b92e,
            0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000,
            0xbff00000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0xbff00000, 0x176d6d31,
            0x3f73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718,
            0x00000000, 0xbff00000, 0x011469fb, 0x3f93ad06, 0x3c69a60b,
            0xbfc8f8b8, 0xc0000000, 0x3c626d19, 0x00000000, 0xbff00000,
            0x939d225a, 0x3fa60bea, 0x2ed59f06, 0xbfd29406, 0xa0000000,
            0x3c75d28d, 0x00000000, 0xbff00000, 0x866b95cf, 0x3fb37ca1,
            0xa6aea963, 0xbfd87de2, 0xe0000000, 0x3c672ced, 0x00000000,
            0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0xbfde2b5d,
            0x20000000, 0xbc5e0d89, 0x00000000, 0xbff00000, 0x5bc57974,
            0x3fc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd,
            0x00000000, 0xbff00000, 0x53aba2fd, 0x3fcd0dfe, 0x25091dd6,
            0xbfe44cf3, 0x20000000, 0xbc68076a, 0x00000000, 0xbff00000,
            0x99fcef32, 0xbfca8279, 0x667f3bcd, 0xbfe6a09e, 0x20000000,
            0x3c8bdd34, 0x00000000, 0xbfe00000, 0x94247758, 0xbfc133cc,
            0x6b151741, 0xbfe8bc80, 0x20000000, 0x3c82c5e1, 0x00000000,
            0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0xbfea9b66,
            0xe0000000, 0xbc39f630, 0x00000000, 0xbfe00000, 0x7f909c4e,
            0x3f9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1,
            0x00000000, 0xbfe00000, 0x65455a75, 0x3fbe0875, 0xcf328d46,
            0xbfed906b, 0x20000000, 0xbc7457e6, 0x00000000, 0xbfe00000,
            0x76acf82d, 0xbfa4a031, 0x56c62dda, 0xbfee9f41, 0xe0000000,
            0xbc8760b1, 0x00000000, 0xbfd00000, 0x0e5967d5, 0x3fac1d1f,
            0xcff75cb0, 0xbfef6297, 0x20000000, 0xbc756217, 0x00000000,
            0xbfd00000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0xbfefd88d,
            0x40000000, 0x3c887df6, 0x00000000, 0xbfc00000, 0x00000000,
            0x00000000, 0x00000000, 0xbff00000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x0f592f50, 0xbf9ba165, 0xa3d12526,
            0xbfefd88d, 0x40000000, 0x3c887df6, 0x00000000, 0x3fc00000,
            0x0e5967d5, 0xbfac1d1f, 0xcff75cb0, 0xbfef6297, 0x20000000,
            0xbc756217, 0x00000000, 0x3fd00000, 0x76acf82d, 0x3fa4a031,
            0x56c62dda, 0xbfee9f41, 0xe0000000, 0xbc8760b1, 0x00000000,
            0x3fd00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0xbfed906b,
            0x20000000, 0xbc7457e6, 0x00000000, 0x3fe00000, 0x7f909c4e,
            0xbf9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1,
            0x00000000, 0x3fe00000, 0x9ae68c87, 0x3fac73b3, 0x290ea1a3,
            0xbfea9b66, 0xe0000000, 0xbc39f630, 0x00000000, 0x3fe00000,
            0x94247758, 0x3fc133cc, 0x6b151741, 0xbfe8bc80, 0x20000000,
            0x3c82c5e1, 0x00000000, 0x3fe00000, 0x99fcef32, 0x3fca8279,
            0x667f3bcd, 0xbfe6a09e, 0x20000000, 0x3c8bdd34, 0x00000000,
            0x3fe00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0xbfe44cf3,
            0x20000000, 0xbc68076a, 0x00000000, 0x3ff00000, 0x5bc57974,
            0xbfc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd,
            0x00000000, 0x3ff00000, 0x73fa1279, 0xbfbe3a68, 0x3806f63b,
            0xbfde2b5d, 0x20000000, 0xbc5e0d89, 0x00000000, 0x3ff00000,
            0x866b95cf, 0xbfb37ca1, 0xa6aea963, 0xbfd87de2, 0xe0000000,
            0x3c672ced, 0x00000000, 0x3ff00000, 0x939d225a, 0xbfa60bea,
            0x2ed59f06, 0xbfd29406, 0xa0000000, 0x3c75d28d, 0x00000000,
            0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0xbfc8f8b8,
            0xc0000000, 0x3c626d19, 0x00000000, 0x3ff00000, 0x176d6d31,
            0xbf73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718,
            0x00000000, 0x3ff00000
            // @formatter:on
    });

    // Polynomial coefficients SC_2: approximately {1/120, 1/24}.
    private ArrayDataPointerConstant sc2 = pointerConstant(16, new int[]{
            // @formatter:off
            0x11111111, 0x3f811111, 0x55555555, 0x3fa55555
            // @formatter:on
    });

    // Polynomial coefficients SC_3: approximately {-1/5040, -1/720}.
    private ArrayDataPointerConstant sc3 = pointerConstant(16, new int[]{
            // @formatter:off
            0x1a01a01a, 0xbf2a01a0, 0x16c16c17, 0xbf56c16c
            // @formatter:on
    });

    // Polynomial coefficients SC_1: approximately {-1/6, -1/2}.
    private ArrayDataPointerConstant sc1 = pointerConstant(16, new int[]{
            // @formatter:off
            0x55555555, 0xbfc55555, 0x00000000, 0xbfe00000
            // @formatter:on
    });

    // Table of 32-bit words used by the large-argument path of emitCode, which selects seven
    // consecutive words (byte offsets 0..24 from an exponent-dependent start).
    // NOTE(review): the words look like the binary expansion of 2/pi (0.A2F9836E4E44...) - confirm.
    private ArrayDataPointerConstant piInvTable = pointerConstant(16, new int[]{
            // @formatter:off
            0x00000000, 0x00000000, 0xa2f9836e, 0x4e441529, 0xfc2757d1,
            0xf534ddc0, 0xdb629599, 0x3c439041, 0xfe5163ab, 0xdebbc561,
            0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c, 0xfe1deb1c,
            0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
            0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff,
            0xde05980f, 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7,
            0x4f463f66, 0x9e5fea2d, 0x7527bac7, 0xebe5f17b, 0x3d0739f7,
            0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08, 0x56033046, 0xfc7b6bab,
            0xf0cfbc21
            // @formatter:on
    });

    // pi/4 split into a high double (0x3fe921fb_40000000) followed by a low correction double.
    // emitCode only loads the first double from this constant.
    private ArrayDataPointerConstant pi4 = pointerConstant(8, new int[]{
            // @formatter:off
            0x40000000, 0x3fe921fb, 0x18469899, 0x3e64442d
            // @formatter:on
    });

    // Same bit pattern as the second (low) double of pi4.
    private ArrayDataPointerConstant pi48 = pointerConstant(8, new int[]{
            // @formatter:off
            0x18469899, 0x3e64442d
            // @formatter:on
    });

    // 32/pi (approximately 10.1859).
    private ArrayDataPointerConstant pi32Inv = pointerConstant(8, new int[]{
            // @formatter:off
            0x6dc9c883, 0x40245f30
            // @formatter:on
    });

    // Only the sign bit of a double set; used to extract the sign of x.
    private ArrayDataPointerConstant signMask = pointerConstant(8, new int[]{
            // @formatter:off
            0x00000000, 0x80000000
            // @formatter:on
    });

    // P_3, the third piece of the split reduction constant (see class comment).
    private ArrayDataPointerConstant p3 = pointerConstant(8, new int[]{
            // @formatter:off
            0x2e037073, 0x3b63198a
            // @formatter:on
    });

    // P_1, the leading piece of the split reduction constant (approximately pi/32).
    private ArrayDataPointerConstant p1 = pointerConstant(8, new int[]{
            // @formatter:off
            0x54400000, 0x3fb921fb
            // @formatter:on
    });

    // -0.0 (same bit pattern as signMask); multiplied with an Inf/NaN input to produce the NaN
    // result.
    private ArrayDataPointerConstant negZero = pointerConstant(8, new int[]{
            // @formatter:off
            0x00000000, 0x80000000
            // @formatter:on
    });

    // The 64 bit code is at most SSE2 compliant
    // 1.0; used by the tiny-argument path to compute 1 - |x|.
    private ArrayDataPointerConstant one = pointerConstant(8, new int[]{
            // @formatter:off
            0x00000000, 0x3ff00000
            // @formatter:on
    });

    /**
     * Emits the machine code for cos(x). The argument is read from xmm0 and the result is left in
     * xmm0. NOTE(review): that xmm0 is the operand/result register is implied by the code below
     * but established by the superclass, which is not visible here.
     *
     * The emitted code has four paths, selected by the exponent of x:
     * <ul>
     * <li>main path (2^-252 <= |x| < 90112): reduction, table lookup and polynomial as described in
     * the class comment;</li>
     * <li>tiny arguments (block0 fall-through): returns 1 - |x|;</li>
     * <li>large finite arguments (block1 onwards): an integer multi-word reduction followed by the
     * same table/polynomial evaluation (block11);</li>
     * <li>Inf/NaN (block2): returns x * -0.0.</li>
     * </ul>
     * The instruction order is significant (several conditional jumps consume flags set earlier),
     * so instructions must not be reordered.
     *
     * @param crb compilation result builder used to record the constant tables referenced by the
     *            emitted code
     * @param masm assembler that receives the instructions
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Label block0 = new Label();
        Label block1 = new Label();
        Label block2 = new Label();
        Label block3 = new Label();
        Label block4 = new Label();
        Label block5 = new Label();
        Label block6 = new Label();
        Label block7 = new Label();
        Label block8 = new Label();
        Label block9 = new Label();
        Label block10 = new Label();
        Label block11 = new Label();
        Label block12 = new Label();
        Label block13 = new Label();

        // Prologue: save rbx, reserve 16 bytes of stack and spill x to [rsp + 8].
        masm.push(rbx);
        masm.subq(rsp, 16);
        masm.movsd(new AMD64Address(rsp, 8), xmm0);

        // rax = high 32 bits of x. Mask with 0x7FFF0000 (exponent plus top mantissa bits, sign
        // dropped), subtract 0x30300000 and compare unsigned against 0x10C50000: anything outside
        // 2^-252 <= |x| < 90112 leaves the main path.
        masm.movl(rax, new AMD64Address(rsp, 12));
        masm.movq(xmm1, recordExternalAddress(crb, pi32Inv));          // 0x6dc9c883, 0x40245f30
        masm.andl(rax, 2147418112);
        masm.subl(rax, 808452096);
        masm.cmpl(rax, 281346048);
        masm.jcc(ConditionFlag.Above, block0);
        // Main path: N = integer(x * 32/pi), rounded by adding 0.5 with the sign of x before
        // truncation.
        masm.mulsd(xmm1, xmm0);
        masm.movdqu(xmm5, recordExternalAddress(crb, onehalf));        // 0x00000000, 0x3fe00000,
                                                                       // 0x00000000, 0x3fe00000
        masm.movq(xmm4, recordExternalAddress(crb, signMask));         // 0x00000000, 0x80000000
        masm.pand(xmm4, xmm0);
        masm.por(xmm5, xmm4);
        masm.addpd(xmm1, xmm5);
        masm.cvttsd2sil(rdx, xmm1);
        masm.cvtsi2sdl(xmm1, rdx);
        masm.movdqu(xmm2, recordExternalAddress(crb, p2));             // 0x1a600000, 0x3d90b461,
                                                                       // 0x1a600000, 0x3d90b461
        masm.movq(xmm3, recordExternalAddress(crb, p1));               // 0x54400000, 0x3fb921fb
        masm.mulsd(xmm3, xmm1);
        masm.unpcklpd(xmm1, xmm1);
        // Table index = (N + 1865232) mod 64; 1865232 is congruent to 16 modulo 64.
        masm.addq(rdx, 1865232);
        masm.movdqu(xmm4, xmm0);
        masm.andq(rdx, 63);
        masm.movdqu(xmm5, recordExternalAddress(crb, sc4));            // 0xa556c734, 0x3ec71de3,
                                                                       // 0x1a01a01a, 0x3efa01a0
        // rax = address of the selected 32-byte ctable entry.
        masm.leaq(rax, recordExternalAddress(crb, ctable));
        masm.shlq(rdx, 5);
        masm.addq(rax, rdx);
        masm.mulpd(xmm2, xmm1);
        masm.subsd(xmm0, xmm3);
        masm.mulsd(xmm1, recordExternalAddress(crb, p3));              // 0x2e037073, 0x3b63198a
        masm.subsd(xmm4, xmm3);
        masm.movq(xmm7, new AMD64Address(rax, 8));
        masm.unpcklpd(xmm0, xmm0);
        masm.movdqu(xmm3, xmm4);
        masm.subsd(xmm4, xmm2);
        masm.mulpd(xmm5, xmm0);
        masm.subpd(xmm0, xmm2);
        masm.movdqu(xmm6, recordExternalAddress(crb, sc2));            // 0x11111111, 0x3f811111,
                                                                       // 0x55555555, 0x3fa55555
        masm.mulsd(xmm7, xmm4);
        masm.subsd(xmm3, xmm4);
        masm.mulpd(xmm5, xmm0);
        masm.mulpd(xmm0, xmm0);
        masm.subsd(xmm3, xmm2);
        masm.movdqu(xmm2, new AMD64Address(rax, 0));
        masm.subsd(xmm1, xmm3);
        masm.movq(xmm3, new AMD64Address(rax, 24));
        masm.addsd(xmm2, xmm3);
        masm.subsd(xmm7, xmm2);
        masm.mulsd(xmm2, xmm4);
        masm.mulpd(xmm6, xmm0);
        masm.mulsd(xmm3, xmm4);
        masm.mulpd(xmm2, xmm0);
        masm.mulpd(xmm0, xmm0);
        masm.addpd(xmm5, recordExternalAddress(crb, sc3));             // 0x1a01a01a, 0xbf2a01a0,
                                                                       // 0x16c16c17, 0xbf56c16c
        masm.mulsd(xmm4, new AMD64Address(rax, 0));
        masm.addpd(xmm6, recordExternalAddress(crb, sc1));             // 0x55555555, 0xbfc55555,
                                                                       // 0x00000000, 0xbfe00000
        masm.mulpd(xmm5, xmm0);
        masm.movdqu(xmm0, xmm3);
        masm.addsd(xmm3, new AMD64Address(rax, 8));
        masm.mulpd(xmm1, xmm7);
        masm.movdqu(xmm7, xmm4);
        masm.addsd(xmm4, xmm3);
        masm.addpd(xmm6, xmm5);
        masm.movq(xmm5, new AMD64Address(rax, 8));
        masm.subsd(xmm5, xmm3);
        masm.subsd(xmm3, xmm4);
        masm.addsd(xmm1, new AMD64Address(rax, 16));
        masm.mulpd(xmm6, xmm2);
        // Final summation of the low-order parts into xmm0, then add the high part (xmm4).
        masm.addsd(xmm0, xmm5);
        masm.addsd(xmm3, xmm7);
        masm.addsd(xmm0, xmm1);
        masm.addsd(xmm0, xmm3);
        masm.addsd(xmm0, xmm6);
        masm.unpckhpd(xmm6, xmm6);
        masm.addsd(xmm0, xmm6);
        masm.addsd(xmm0, xmm4);
        masm.jmp(block13);

        // Out-of-range exponent. The flags are still those of the cmpl above: a signed "greater"
        // means the exponent is too large (block1); otherwise the subtraction went negative, i.e.
        // |x| < 2^-252, and the result is 1 - |x|.
        masm.bind(block0);
        masm.jcc(ConditionFlag.Greater, block1);
        // Clear the sign bit of x (0x7FFF mask on the top 16-bit word) to form |x|.
        masm.pextrw(rax, xmm0, 3);
        masm.andl(rax, 32767);
        masm.pinsrw(xmm0, rax, 3);
        masm.movq(xmm1, recordExternalAddress(crb, one));              // 0x00000000, 0x3ff00000
        masm.subsd(xmm1, xmm0);
        masm.movdqu(xmm0, xmm1);
        masm.jmp(block13);

        // Large |x|. An exponent field of all ones (0x7FF0 in the top word) means Inf or NaN.
        masm.bind(block1);
        masm.pextrw(rax, xmm0, 3);
        masm.andl(rax, 32752);
        masm.cmpl(rax, 32752);
        masm.jcc(ConditionFlag.Equal, block2);
        // Derive a 4-byte-aligned byte offset into piInvTable from the exponent; rcx then points
        // at the first of the seven table words used below.
        masm.pextrw(rcx, xmm0, 3);
        masm.andl(rcx, 32752);
        masm.subl(rcx, 16224);
        masm.shrl(rcx, 7);
        masm.andl(rcx, 65532);
        masm.leaq(r11, recordExternalAddress(crb, piInvTable));
        masm.addq(rcx, r11);
        // Split the significand of x: rdx = low 32 bits, rax = upper bits with the implicit
        // leading one made explicit.
        masm.movdq(rax, xmm0);
        masm.movl(r10, new AMD64Address(rcx, 20));
        masm.movl(r8, new AMD64Address(rcx, 24));
        masm.movl(rdx, rax);
        masm.shrq(rax, 21);
        masm.orl(rax, Integer.MIN_VALUE);
        masm.shrl(rax, 11);
        // Multi-word integer product of the significand (rax:rdx) with the table words at
        // [rcx + 0] .. [rcx + 24], accumulated 32 bits at a time with explicit carry propagation.
        // NOTE(review): this appears to be a Payne-Hanek style reduction - confirm.
        masm.movl(r9, r10);
        masm.imulq(r10, rdx);
        masm.imulq(r9, rax);
        masm.imulq(r8, rax);
        masm.movl(rsi, new AMD64Address(rcx, 16));
        masm.movl(rdi, new AMD64Address(rcx, 12));
        masm.movl(r11, r10);
        masm.shrq(r10, 32);
        masm.addq(r9, r10);
        masm.addq(r11, r8);
        masm.movl(r8, r11);
        masm.shrq(r11, 32);
        masm.addq(r9, r11);
        masm.movl(r10, rsi);
        masm.imulq(rsi, rdx);
        masm.imulq(r10, rax);
        masm.movl(r11, rdi);
        masm.imulq(rdi, rdx);
        masm.movl(rbx, rsi);
        masm.shrq(rsi, 32);
        masm.addq(r9, rbx);
        masm.movl(rbx, r9);
        masm.shrq(r9, 32);
        masm.addq(r10, rsi);
        masm.addq(r10, r9);
        masm.shlq(rbx, 32);
        masm.orq(r8, rbx);
        masm.imulq(r11, rax);
        masm.movl(r9, new AMD64Address(rcx, 8));
        masm.movl(rsi, new AMD64Address(rcx, 4));
        masm.movl(rbx, rdi);
        masm.shrq(rdi, 32);
        masm.addq(r10, rbx);
        masm.movl(rbx, r10);
        masm.shrq(r10, 32);
        masm.addq(r11, rdi);
        masm.addq(r11, r10);
        masm.movq(rdi, r9);
        masm.imulq(r9, rdx);
        masm.imulq(rdi, rax);
        masm.movl(r10, r9);
        masm.shrq(r9, 32);
        masm.addq(r11, r10);
        masm.movl(r10, r11);
        masm.shrq(r11, 32);
        masm.addq(rdi, r9);
        masm.addq(rdi, r11);
        masm.movq(r9, rsi);
        masm.imulq(rsi, rdx);
        masm.imulq(r9, rax);
        masm.shlq(r10, 32);
        masm.orq(r10, rbx);
        masm.movl(rax, new AMD64Address(rcx, 0));
        masm.movl(r11, rsi);
        masm.shrq(rsi, 32);
        masm.addq(rdi, r11);
        masm.movl(r11, rdi);
        masm.shrq(rdi, 32);
        masm.addq(r9, rsi);
        masm.addq(r9, rdi);
        masm.imulq(rdx, rax);
        // rcx = 8 * (table byte offset) + 19 - (unbiased exponent of x); rsi = sign bit of x
        // (0x8000 of the top word); rbx = unbiased exponent.
        masm.pextrw(rbx, xmm0, 3);
        masm.leaq(rdi, recordExternalAddress(crb, piInvTable));
        masm.subq(rcx, rdi);
        masm.addl(rcx, rcx);
        masm.addl(rcx, rcx);
        masm.addl(rcx, rcx);
        masm.addl(rcx, 19);
        masm.movl(rsi, 32768);
        masm.andl(rsi, rbx);
        masm.shrl(rbx, 4);
        masm.andl(rbx, 2047);
        masm.subl(rbx, 1023);
        masm.subl(rcx, rbx);
        masm.addq(r9, rdx);
        masm.movl(rdx, rcx);
        masm.addl(rdx, 32);
        masm.cmpl(rcx, 1);
        masm.jcc(ConditionFlag.Less, block3);
        // The single-operand shifts below shift by the count held in rcx.
        masm.negl(rcx);
        masm.addl(rcx, 29);
        masm.shll(r9);
        masm.movl(rdi, r9);
        masm.andl(r9, 536870911);
        masm.testl(r9, 268435456);
        masm.jcc(ConditionFlag.NotEqual, block4);
        masm.shrl(r9);
        masm.movl(rbx, 0);
        masm.shlq(r9, 32);
        masm.orq(r9, r11);

        // block5 and block6 are bound at the same position.
        masm.bind(block5);

        masm.bind(block6);
        // If the top 64-bit word of the product is zero, shift the words up first (block7).
        masm.cmpq(r9, 0);
        masm.jcc(ConditionFlag.Equal, block7);

        // Normalize: locate the highest set bit of r9 and shift r9:r10:r8 left (here) or right
        // (block9) so that it ends up at bit 29; rdx tracks the accumulated shift.
        masm.bind(block8);
        masm.bsrq(r11, r9);
        masm.movl(rcx, 29);
        masm.subl(rcx, r11);
        masm.jcc(ConditionFlag.LessEqual, block9);
        masm.shlq(r9);
        masm.movq(rax, r10);
        masm.shlq(r10);
        masm.addl(rdx, rcx);
        masm.negl(rcx);
        masm.addl(rcx, 64);
        masm.shrq(rax);
        masm.shrq(r8);
        masm.orq(r9, rax);
        masm.orq(r10, r8);

        // Convert the normalized integer words to doubles, scale them by powers of two whose
        // exponent fields are built in rdx (with the sign from rsi/rbx folded in), and multiply by
        // pi/4 (high part pi4, low part pi48) to obtain the reduced argument in xmm0 with a
        // correction term in xmm6.
        masm.bind(block10);
        masm.cvtsi2sdq(xmm0, r9);
        masm.shrq(r10, 1);
        masm.cvtsi2sdq(xmm3, r10);
        masm.xorpd(xmm4, xmm4);
        masm.shll(rdx, 4);
        masm.negl(rdx);
        masm.addl(rdx, 16368);
        masm.orl(rdx, rsi);
        masm.xorl(rdx, rbx);
        masm.pinsrw(xmm4, rdx, 3);
        masm.movq(xmm2, recordExternalAddress(crb, pi4));              // 0x40000000, 0x3fe921fb,
                                                                       // 0x18469899, 0x3e64442d
        masm.movq(xmm6, recordExternalAddress(crb, pi48));             // 0x18469899, 0x3e64442d
        masm.xorpd(xmm5, xmm5);
        masm.subl(rdx, 1008);
        masm.pinsrw(xmm5, rdx, 3);
        masm.mulsd(xmm0, xmm4);
        masm.shll(rsi, 16);
        masm.sarl(rsi, 31);
        masm.mulsd(xmm3, xmm5);
        masm.movdqu(xmm1, xmm0);
        masm.mulsd(xmm0, xmm2);
        masm.shrl(rdi, 29);
        masm.addsd(xmm1, xmm3);
        masm.mulsd(xmm3, xmm2);
        masm.addl(rdi, rsi);
        masm.xorl(rdi, rsi);
        masm.mulsd(xmm6, xmm1);
        masm.movl(rax, rdi);
        masm.addsd(xmm6, xmm3);
        masm.movdqu(xmm2, xmm0);
        masm.addsd(xmm0, xmm6);
        masm.subsd(xmm2, xmm0);
        masm.addsd(xmm6, xmm2);

        // Second-stage evaluation for large arguments. Same sequence as the main path, except
        // that rax (shifted left by 3) is added to the table index and the correction term xmm6
        // from the reduction above is subtracted from the "c" term (xmm1).
        masm.bind(block11);
        masm.movq(xmm1, recordExternalAddress(crb, pi32Inv));          // 0x6dc9c883, 0x40245f30
        masm.mulsd(xmm1, xmm0);
        masm.movq(xmm5, recordExternalAddress(crb, onehalf));          // 0x00000000, 0x3fe00000,
                                                                       // 0x00000000, 0x3fe00000
        masm.movq(xmm4, recordExternalAddress(crb, signMask));         // 0x00000000, 0x80000000
        masm.pand(xmm4, xmm0);
        masm.por(xmm5, xmm4);
        masm.addpd(xmm1, xmm5);
        masm.cvttsd2siq(rdx, xmm1);
        masm.cvtsi2sdq(xmm1, rdx);
        masm.movq(xmm3, recordExternalAddress(crb, p1));               // 0x54400000, 0x3fb921fb
        masm.movdqu(xmm2, recordExternalAddress(crb, p2));             // 0x1a600000, 0x3d90b461,
                                                                       // 0x1a600000, 0x3d90b461
        masm.mulsd(xmm3, xmm1);
        masm.unpcklpd(xmm1, xmm1);
        masm.shll(rax, 3);
        masm.addl(rdx, 1865232);
        masm.movdqu(xmm4, xmm0);
        masm.addl(rdx, rax);
        masm.andl(rdx, 63);
        masm.movdqu(xmm5, recordExternalAddress(crb, sc4));            // 0xa556c734, 0x3ec71de3,
                                                                       // 0x1a01a01a, 0x3efa01a0
        masm.leaq(rax, recordExternalAddress(crb, ctable));
        masm.shll(rdx, 5);
        masm.addq(rax, rdx);
        masm.mulpd(xmm2, xmm1);
        masm.subsd(xmm0, xmm3);
        masm.mulsd(xmm1, recordExternalAddress(crb, p3));              // 0x2e037073, 0x3b63198a
        masm.subsd(xmm4, xmm3);
        masm.movq(xmm7, new AMD64Address(rax, 8));
        masm.unpcklpd(xmm0, xmm0);
        masm.movdqu(xmm3, xmm4);
        masm.subsd(xmm4, xmm2);
        masm.mulpd(xmm5, xmm0);
        masm.subpd(xmm0, xmm2);
        masm.mulsd(xmm7, xmm4);
        masm.subsd(xmm3, xmm4);
        masm.mulpd(xmm5, xmm0);
        masm.mulpd(xmm0, xmm0);
        masm.subsd(xmm3, xmm2);
        masm.movdqu(xmm2, new AMD64Address(rax, 0));
        masm.subsd(xmm1, xmm3);
        masm.movq(xmm3, new AMD64Address(rax, 24));
        masm.addsd(xmm2, xmm3);
        masm.subsd(xmm7, xmm2);
        masm.subsd(xmm1, xmm6);
        masm.movdqu(xmm6, recordExternalAddress(crb, sc2));            // 0x11111111, 0x3f811111,
                                                                       // 0x55555555, 0x3fa55555
        masm.mulsd(xmm2, xmm4);
        masm.mulpd(xmm6, xmm0);
        masm.mulsd(xmm3, xmm4);
        masm.mulpd(xmm2, xmm0);
        masm.mulpd(xmm0, xmm0);
        masm.addpd(xmm5, recordExternalAddress(crb, sc3));             // 0x1a01a01a, 0xbf2a01a0,
                                                                       // 0x16c16c17, 0xbf56c16c
        masm.mulsd(xmm4, new AMD64Address(rax, 0));
        masm.addpd(xmm6, recordExternalAddress(crb, sc1));             // 0x55555555, 0xbfc55555,
                                                                       // 0x00000000, 0xbfe00000
        masm.mulpd(xmm5, xmm0);
        masm.movdqu(xmm0, xmm3);
        masm.addsd(xmm3, new AMD64Address(rax, 8));
        masm.mulpd(xmm1, xmm7);
        masm.movdqu(xmm7, xmm4);
        masm.addsd(xmm4, xmm3);
        masm.addpd(xmm6, xmm5);
        masm.movq(xmm5, new AMD64Address(rax, 8));
        masm.subsd(xmm5, xmm3);
        masm.subsd(xmm3, xmm4);
        masm.addsd(xmm1, new AMD64Address(rax, 16));
        masm.mulpd(xmm6, xmm2);
        masm.addsd(xmm5, xmm0);
        masm.addsd(xmm3, xmm7);
        masm.addsd(xmm1, xmm5);
        masm.addsd(xmm1, xmm3);
        masm.addsd(xmm1, xmm6);
        masm.unpckhpd(xmm6, xmm6);
        masm.movdqu(xmm0, xmm4);
        masm.addsd(xmm1, xmm6);
        masm.addsd(xmm0, xmm1);
        masm.jmp(block13);

        // Top word of the product is zero: move the lower words up by 64 bits (at most twice),
        // adjusting the shift count in rdx. If everything is zero, continue at block11 with a
        // zero reduced argument and zero correction.
        masm.bind(block7);
        masm.addl(rdx, 64);
        masm.movq(r9, r10);
        masm.movq(r10, r8);
        masm.movl(r8, 0);
        masm.cmpq(r9, 0);
        masm.jcc(ConditionFlag.NotEqual, block8);
        masm.addl(rdx, 64);
        masm.movq(r9, r10);
        masm.movq(r10, r8);
        masm.cmpq(r9, 0);
        masm.jcc(ConditionFlag.NotEqual, block8);
        masm.xorpd(xmm0, xmm0);
        masm.xorpd(xmm6, xmm6);
        masm.jmp(block11);

        // Reached from block8 with the flags of "29 - bsr": equal means no shift is needed,
        // otherwise shift r9:r10 right by the (negated) count.
        masm.bind(block9);
        masm.jcc(ConditionFlag.Equal, block10);
        masm.negl(rcx);
        masm.shrq(r10);
        masm.movq(rax, r9);
        masm.shrq(r9);
        masm.subl(rdx, rcx);
        masm.negl(rcx);
        masm.addl(rcx, 64);
        masm.shlq(rax);
        masm.orq(r10, rax);
        masm.jmp(block10);
        // Variant of the post-multiplication step for rcx < 1 (see the cmpl before block5).
        masm.bind(block3);
        masm.negl(rcx);
        masm.shlq(r9, 32);
        masm.orq(r9, r11);
        masm.shlq(r9);
        masm.movq(rdi, r9);
        masm.testl(r9, Integer.MIN_VALUE);
        masm.jcc(ConditionFlag.NotEqual, block12);
        masm.shrl(r9);
        masm.movl(rbx, 0);
        masm.shrq(rdi, 3);
        masm.jmp(block6);

        // Bit 28 of the shifted word was set: replace r9:r10:r8 by (constant - r9:r10:r8) via a
        // three-word subtract with borrow, bump rdi, and set rbx = 0x8000 (later xor-ed into the
        // sign/exponent word built in block10).
        masm.bind(block4);
        masm.shrl(r9);
        masm.movl(rbx, 536870912);
        masm.shrl(rbx);
        masm.shlq(r9, 32);
        masm.orq(r9, r11);
        masm.shlq(rbx, 32);
        masm.addl(rdi, 536870912);
        masm.movl(rcx, 0);
        masm.movl(r11, 0);
        masm.subq(rcx, r8);
        masm.sbbq(r11, r10);
        masm.sbbq(rbx, r9);
        masm.movq(r8, rcx);
        masm.movq(r10, r11);
        masm.movq(r9, rbx);
        masm.movl(rbx, 32768);
        masm.jmp(block5);

        // Counterpart of block4 for the block3 variant.
        masm.bind(block12);
        masm.shrl(r9);
        masm.movq(rbx, 0x100000000L);
        masm.shrq(rbx);
        masm.movl(rcx, 0);
        masm.movl(r11, 0);
        masm.subq(rcx, r8);
        masm.sbbq(r11, r10);
        masm.sbbq(rbx, r9);
        masm.movq(r8, rcx);
        masm.movq(r10, r11);
        masm.movq(r9, rbx);
        masm.movl(rbx, 32768);
        masm.shrq(rdi, 3);
        masm.addl(rdi, 536870912);
        masm.jmp(block6);

        // Inf or NaN input: reload x from the stack and multiply by -0.0, which yields NaN (see
        // "Special cases" in the class comment). The product is also stored to [rsp + 0].
        masm.bind(block2);
        masm.movsd(xmm0, new AMD64Address(rsp, 8));
        masm.mulsd(xmm0, recordExternalAddress(crb, negZero));         // 0x00000000, 0x80000000
        masm.movq(new AMD64Address(rsp, 0), xmm0);

        // Common exit: release the stack slot and restore rbx; the result is in xmm0.
        masm.bind(block13);
        masm.addq(rsp, 16);
        masm.pop(rbx);
    }

}