1 /* 2 * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2016, Intel Corporation. All rights reserved. 4 * Intel Math Library (LIBM) Source Code 5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 6 * 7 * This code is free software; you can redistribute it and/or modify it 8 * under the terms of the GNU General Public License version 2 only, as 9 * published by the Free Software Foundation. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 */ 25 26 27 package org.graalvm.compiler.lir.amd64; 28 29 import static jdk.vm.ci.amd64.AMD64.r10; 30 import static jdk.vm.ci.amd64.AMD64.r11; 31 import static jdk.vm.ci.amd64.AMD64.r8; 32 import static jdk.vm.ci.amd64.AMD64.r9; 33 import static jdk.vm.ci.amd64.AMD64.rax; 34 import static jdk.vm.ci.amd64.AMD64.rbx; 35 import static jdk.vm.ci.amd64.AMD64.rcx; 36 import static jdk.vm.ci.amd64.AMD64.rdi; 37 import static jdk.vm.ci.amd64.AMD64.rdx; 38 import static jdk.vm.ci.amd64.AMD64.rsi; 39 import static jdk.vm.ci.amd64.AMD64.rsp; 40 import static jdk.vm.ci.amd64.AMD64.xmm0; 41 import static jdk.vm.ci.amd64.AMD64.xmm1; 42 import static jdk.vm.ci.amd64.AMD64.xmm2; 43 import static jdk.vm.ci.amd64.AMD64.xmm3; 44 import static jdk.vm.ci.amd64.AMD64.xmm4; 45 import static jdk.vm.ci.amd64.AMD64.xmm5; 46 import static jdk.vm.ci.amd64.AMD64.xmm6; 47 import static jdk.vm.ci.amd64.AMD64.xmm7; 48 import static org.graalvm.compiler.lir.amd64.AMD64HotSpotHelper.pointerConstant; 49 import static org.graalvm.compiler.lir.amd64.AMD64HotSpotHelper.recordExternalAddress; 50 51 import org.graalvm.compiler.asm.Label; 52 import org.graalvm.compiler.asm.amd64.AMD64Address; 53 import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag; 54 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler; 55 import org.graalvm.compiler.lir.LIRInstructionClass; 56 import org.graalvm.compiler.lir.asm.ArrayDataPointerConstant; 57 import org.graalvm.compiler.lir.asm.CompilationResultBuilder; 58 59 import jdk.vm.ci.amd64.AMD64; 60 61 /** 62 * <pre> 63 * ALGORITHM DESCRIPTION - SIN() 64 * --------------------- 65 * 66 * 1. RANGE REDUCTION 67 * 68 * We perform an initial range reduction from X to r with 69 * 70 * X =~= N * pi/32 + r 71 * 72 * so that |r| <= pi/64 + epsilon. We restrict inputs to those 73 * where |N| <= 932560. Beyond this, the range reduction is 74 * insufficiently accurate. For extremely small inputs, 75 * denormalization can occur internally, impacting performance. 
76 * This means that the main path is actually only taken for 77 * 2^-252 <= |X| < 90112. 78 * 79 * To avoid branches, we perform the range reduction to full 80 * accuracy each time. 81 * 82 * X - N * (P_1 + P_2 + P_3) 83 * 84 * where P_1 and P_2 are 32-bit numbers (so multiplication by N 85 * is exact) and P_3 is a 53-bit number. Together, these 86 * approximate pi well enough for all cases in the restricted 87 * range. 88 * 89 * The main reduction sequence is: 90 * 91 * y = 32/pi * x 92 * N = integer(y) 93 * (computed by adding and subtracting off SHIFTER) 94 * 95 * m_1 = N * P_1 96 * m_2 = N * P_2 97 * r_1 = x - m_1 98 * r = r_1 - m_2 99 * (this r can be used for most of the calculation) 100 * 101 * c_1 = r_1 - r 102 * m_3 = N * P_3 103 * c_2 = c_1 - m_2 104 * c = c_2 - m_3 105 * 106 * 2. MAIN ALGORITHM 107 * 108 * The algorithm uses a table lookup based on B = M * pi / 32 109 * where M = N mod 64. The stored values are: 110 * sigma closest power of 2 to cos(B) 111 * C_hl 53-bit cos(B) - sigma 112 * S_hi + S_lo 2 * 53-bit sin(B) 113 * 114 * The computation is organized as follows: 115 * 116 * sin(B + r + c) = [sin(B) + sigma * r] + 117 * r * (cos(B) - sigma) + 118 * sin(B) * [cos(r + c) - 1] + 119 * cos(B) * [sin(r + c) - r] 120 * 121 * which is approximately: 122 * 123 * [S_hi + sigma * r] + 124 * C_hl * r + 125 * S_lo + S_hi * [(cos(r) - 1) - r * c] + 126 * (C_hl + sigma) * [(sin(r) - r) + c] 127 * 128 * and this is what is actually computed. We separate this sum 129 * into four parts: 130 * 131 * hi + med + pols + corr 132 * 133 * where 134 * 135 * hi = S_hi + sigma r 136 * med = C_hl * r 137 * pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r) 138 * corr = S_lo + c * ((C_hl + sigma) - S_hi * r) 139 * 140 * 3. POLYNOMIAL 141 * 142 * The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) * 143 * (sin(r) - r) can be rearranged freely, since it is quite 144 * small, so we exploit parallelism to the fullest. 
 *
 *      psc4       = SC_4 * r_1
 *      msc4       = psc4 * r
 *      r2         = r * r
 *      msc2       = SC_2 * r2
 *      r4         = r2 * r2
 *      psc3       = SC_3 + msc4
 *      psc1       = SC_1 + msc2
 *      msc3       = r4 * psc3
 *      sincospols = psc1 + msc3
 *      pols       = sincospols *
 *                   <S_hi * r^2 | (C_hl + sigma) * r^3>
 *
 * 4. CORRECTION TERM
 *
 * This is where the "c" component of the range reduction is
 * taken into account; recall that just "r" is used for most of
 * the calculation.
 *
 *      -c   = m_3 - c_2
 *      -d   = S_hi * r - (C_hl + sigma)
 *      corr = -c * -d + S_lo
 *
 * 5. COMPENSATED SUMMATIONS
 *
 * The two successive compensated summations add up the high
 * and medium parts, leaving just the low parts to add up at
 * the end.
 *
 *      rs      = sigma * r
 *      res_int = S_hi + rs
 *      k_0     = S_hi - res_int
 *      k_2     = k_0 + rs
 *      med     = C_hl * r
 *      res_hi  = res_int + med
 *      k_1     = res_int - res_hi
 *      k_3     = k_1 + med
 *
 * 6. FINAL SUMMATION
 *
 * We now add up all the small parts:
 *
 *      res_lo = pols(hi) + pols(lo) + corr + k_2 + k_3
 *
 * Now the overall result is just:
 *
 *      res_hi + res_lo
 *
 * 7. SMALL ARGUMENTS
 *
 * If |x| < SNN (SNN meaning the smallest normal number), we
 * simply perform 0.1111111...1111 * x (a binary fraction of all
 * ones, i.e. the largest double below 1). For SNN <= |x|, we
 * do 2^-55 * (2^55 * x - x).
*
 * Special cases:
 *  sin(NaN) = quiet NaN, and raise invalid exception
 *  sin(INF) = NaN and raise invalid exception
 *  sin(+/-0) = +/-0
 * </pre>
 */
public final class AMD64MathSinOp extends AMD64MathIntrinsicUnaryOp {

    /** LIR instruction class descriptor created for this op. */
    public static final LIRInstructionClass<AMD64MathSinOp> TYPE = LIRInstructionClass.create(AMD64MathSinOp.class);

    /**
     * Creates the op and hands the listed general-purpose and XMM registers to the superclass.
     * These are exactly the registers that {@link #emitCode} writes besides xmm0.
     * NOTE(review): presumably the superclass treats them as temporaries/clobbers - confirm in
     * AMD64MathIntrinsicUnaryOp, which is not visible here.
     */
    public AMD64MathSinOp() {
        super(TYPE, /* GPR */ rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11,
                        /* XMM */ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
    }

    /** Two doubles, both 0.5; OR-ed with the sign of x to round 32/pi * x to the nearest integer. */
    private ArrayDataPointerConstant onehalf = pointerConstant(16, new int[]{
            // @formatter:off
            0x00000000, 0x3fe00000, 0x00000000, 0x3fe00000
            // @formatter:on
    });

    /** P_2 of the class comment, stored twice so it can be used as a packed operand. */
    private ArrayDataPointerConstant p2 = pointerConstant(16, new int[]{
            // @formatter:off
            0x1a600000, 0x3d90b461, 0x1a600000, 0x3d90b461
            // @formatter:on
    });

    /** SC_4 coefficient pair: low lane is approximately 1/9!, high lane approximately 1/8!. */
    private ArrayDataPointerConstant sc4 = pointerConstant(16, new int[]{
            // @formatter:off
            0xa556c734, 0x3ec71de3, 0x1a01a01a, 0x3efa01a0
            // @formatter:on
    });

    /**
     * Lookup table with 64 entries of four doubles (32 bytes) each. {@link #emitCode} selects an
     * entry with {@code (N & 63) << 5} and reads its offsets 0, 8, 16 and 24. Matching those reads
     * against the formulas in the class comment, an entry is laid out as
     * {@code C_hl, S_hi, S_lo, sigma}.
     */
    private ArrayDataPointerConstant ctable = pointerConstant(16, new int[]{
            // @formatter:off
            0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x3ff00000, 0x176d6d31, 0xbf73b92e,
            0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000,
            0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0x3fc8f8b8,
            0xc0000000, 0xbc626d19, 0x00000000, 0x3ff00000, 0x939d225a,
            0xbfa60bea, 0x2ed59f06, 0x3fd29406, 0xa0000000, 0xbc75d28d,
            0x00000000, 0x3ff00000, 0x866b95cf, 0xbfb37ca1, 0xa6aea963,
            0x3fd87de2, 0xe0000000, 0xbc672ced, 0x00000000, 0x3ff00000,
            0x73fa1279, 0xbfbe3a68, 0x3806f63b, 0x3fde2b5d, 0x20000000,
            0x3c5e0d89, 0x00000000, 0x3ff00000, 0x5bc57974, 0xbfc59267,
            0x39ae68c8, 0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000,
            0x3ff00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0x3fe44cf3,
            0x20000000, 0x3c68076a, 0x00000000, 0x3ff00000, 0x99fcef32,
            0x3fca8279, 0x667f3bcd, 0x3fe6a09e, 0x20000000, 0xbc8bdd34,
            0x00000000, 0x3fe00000, 0x94247758, 0x3fc133cc, 0x6b151741,
            0x3fe8bc80, 0x20000000, 0xbc82c5e1, 0x00000000, 0x3fe00000,
            0x9ae68c87, 0x3fac73b3, 0x290ea1a3, 0x3fea9b66, 0xe0000000,
            0x3c39f630, 0x00000000, 0x3fe00000, 0x7f909c4e, 0xbf9d4a2c,
            0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000,
            0x3fe00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0x3fed906b,
            0x20000000, 0x3c7457e6, 0x00000000, 0x3fe00000, 0x76acf82d,
            0x3fa4a031, 0x56c62dda, 0x3fee9f41, 0xe0000000, 0x3c8760b1,
            0x00000000, 0x3fd00000, 0x0e5967d5, 0xbfac1d1f, 0xcff75cb0,
            0x3fef6297, 0x20000000, 0x3c756217, 0x00000000, 0x3fd00000,
            0x0f592f50, 0xbf9ba165, 0xa3d12526, 0x3fefd88d, 0x40000000,
            0xbc887df6, 0x00000000, 0x3fc00000, 0x00000000, 0x00000000,
            0x00000000, 0x3ff00000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0x3fefd88d,
            0x40000000, 0xbc887df6, 0x00000000, 0xbfc00000, 0x0e5967d5,
            0x3fac1d1f, 0xcff75cb0, 0x3fef6297, 0x20000000, 0x3c756217,
            0x00000000, 0xbfd00000, 0x76acf82d, 0xbfa4a031, 0x56c62dda,
            0x3fee9f41, 0xe0000000, 0x3c8760b1, 0x00000000, 0xbfd00000,
            0x65455a75, 0x3fbe0875, 0xcf328d46, 0x3fed906b, 0x20000000,
            0x3c7457e6, 0x00000000, 0xbfe00000, 0x7f909c4e, 0x3f9d4a2c,
            0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000,
            0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0x3fea9b66,
            0xe0000000, 0x3c39f630, 0x00000000, 0xbfe00000, 0x94247758,
            0xbfc133cc, 0x6b151741, 0x3fe8bc80, 0x20000000, 0xbc82c5e1,
            0x00000000, 0xbfe00000, 0x99fcef32, 0xbfca8279, 0x667f3bcd,
            0x3fe6a09e, 0x20000000, 0xbc8bdd34, 0x00000000, 0xbfe00000,
            0x53aba2fd, 0x3fcd0dfe, 0x25091dd6, 0x3fe44cf3, 0x20000000,
            0x3c68076a, 0x00000000, 0xbff00000, 0x5bc57974, 0x3fc59267,
            0x39ae68c8, 0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000,
            0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0x3fde2b5d,
            0x20000000, 0x3c5e0d89, 0x00000000, 0xbff00000, 0x866b95cf,
            0x3fb37ca1, 0xa6aea963, 0x3fd87de2, 0xe0000000, 0xbc672ced,
            0x00000000, 0xbff00000, 0x939d225a, 0x3fa60bea, 0x2ed59f06,
            0x3fd29406, 0xa0000000, 0xbc75d28d, 0x00000000, 0xbff00000,
            0x011469fb, 0x3f93ad06, 0x3c69a60b, 0x3fc8f8b8, 0xc0000000,
            0xbc626d19, 0x00000000, 0xbff00000, 0x176d6d31, 0x3f73b92e,
            0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000,
            0xbff00000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0xbff00000, 0x176d6d31,
            0x3f73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718,
            0x00000000, 0xbff00000, 0x011469fb, 0x3f93ad06, 0x3c69a60b,
            0xbfc8f8b8, 0xc0000000, 0x3c626d19, 0x00000000, 0xbff00000,
            0x939d225a, 0x3fa60bea, 0x2ed59f06, 0xbfd29406, 0xa0000000,
            0x3c75d28d, 0x00000000, 0xbff00000, 0x866b95cf, 0x3fb37ca1,
            0xa6aea963, 0xbfd87de2, 0xe0000000, 0x3c672ced, 0x00000000,
            0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0xbfde2b5d,
            0x20000000, 0xbc5e0d89, 0x00000000, 0xbff00000, 0x5bc57974,
            0x3fc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd,
            0x00000000, 0xbff00000, 0x53aba2fd, 0x3fcd0dfe, 0x25091dd6,
            0xbfe44cf3, 0x20000000, 0xbc68076a, 0x00000000, 0xbff00000,
            0x99fcef32, 0xbfca8279, 0x667f3bcd, 0xbfe6a09e, 0x20000000,
            0x3c8bdd34, 0x00000000, 0xbfe00000, 0x94247758, 0xbfc133cc,
            0x6b151741, 0xbfe8bc80, 0x20000000, 0x3c82c5e1, 0x00000000,
            0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0xbfea9b66,
            0xe0000000, 0xbc39f630, 0x00000000, 0xbfe00000, 0x7f909c4e,
            0x3f9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1,
            0x00000000, 0xbfe00000, 0x65455a75, 0x3fbe0875, 0xcf328d46,
            0xbfed906b, 0x20000000, 0xbc7457e6, 0x00000000, 0xbfe00000,
            0x76acf82d, 0xbfa4a031, 0x56c62dda, 0xbfee9f41, 0xe0000000,
            0xbc8760b1, 0x00000000, 0xbfd00000, 0x0e5967d5, 0x3fac1d1f,
            0xcff75cb0, 0xbfef6297, 0x20000000, 0xbc756217, 0x00000000,
            0xbfd00000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0xbfefd88d,
            0x40000000, 0x3c887df6, 0x00000000, 0xbfc00000, 0x00000000,
            0x00000000, 0x00000000, 0xbff00000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x0f592f50, 0xbf9ba165, 0xa3d12526,
            0xbfefd88d, 0x40000000, 0x3c887df6, 0x00000000, 0x3fc00000,
            0x0e5967d5, 0xbfac1d1f, 0xcff75cb0, 0xbfef6297, 0x20000000,
            0xbc756217, 0x00000000, 0x3fd00000, 0x76acf82d, 0x3fa4a031,
            0x56c62dda, 0xbfee9f41, 0xe0000000, 0xbc8760b1, 0x00000000,
            0x3fd00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0xbfed906b,
            0x20000000, 0xbc7457e6, 0x00000000, 0x3fe00000, 0x7f909c4e,
            0xbf9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1,
            0x00000000, 0x3fe00000, 0x9ae68c87, 0x3fac73b3, 0x290ea1a3,
            0xbfea9b66, 0xe0000000, 0xbc39f630, 0x00000000, 0x3fe00000,
            0x94247758, 0x3fc133cc, 0x6b151741, 0xbfe8bc80, 0x20000000,
            0x3c82c5e1, 0x00000000, 0x3fe00000, 0x99fcef32, 0x3fca8279,
            0x667f3bcd, 0xbfe6a09e, 0x20000000, 0x3c8bdd34, 0x00000000,
            0x3fe00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0xbfe44cf3,
            0x20000000, 0xbc68076a, 0x00000000, 0x3ff00000, 0x5bc57974,
            0xbfc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd,
            0x00000000, 0x3ff00000, 0x73fa1279, 0xbfbe3a68, 0x3806f63b,
            0xbfde2b5d, 0x20000000, 0xbc5e0d89, 0x00000000, 0x3ff00000,
            0x866b95cf, 0xbfb37ca1, 0xa6aea963, 0xbfd87de2, 0xe0000000,
            0x3c672ced, 0x00000000, 0x3ff00000, 0x939d225a, 0xbfa60bea,
            0x2ed59f06, 0xbfd29406, 0xa0000000, 0x3c75d28d, 0x00000000,
            0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0xbfc8f8b8,
            0xc0000000, 0x3c626d19, 0x00000000, 0x3ff00000, 0x176d6d31,
            0xbf73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718,
            0x00000000, 0x3ff00000
            // @formatter:on
    });

    /** SC_2 coefficient pair: low lane is approximately 1/5!, high lane approximately 1/4!. */
    private ArrayDataPointerConstant sc2 = pointerConstant(16, new int[]{
            // @formatter:off
            0x11111111, 0x3f811111, 0x55555555, 0x3fa55555
            // @formatter:on
    });

    /** SC_3 coefficient pair: low lane is approximately -1/7!, high lane approximately -1/6!. */
    private ArrayDataPointerConstant sc3 = pointerConstant(16, new int[]{
            // @formatter:off
            0x1a01a01a, 0xbf2a01a0, 0x16c16c17, 0xbf56c16c
            // @formatter:on
    });

    /** SC_1 coefficient pair: low lane is approximately -1/3!, high lane is -1/2. */
    private ArrayDataPointerConstant sc1 = pointerConstant(16, new int[]{
            // @formatter:off
            0x55555555, 0xbfc55555, 0x00000000, 0xbfe00000
            // @formatter:on
    });

    /**
     * Table of 32-bit words consumed by the large-argument reduction in block1, which reads seven
     * consecutive words starting at an exponent-dependent offset. NOTE(review): the words
     * 0xa2f9836e, 0x4e441529, ... look like the binary expansion of 2/pi - confirm against the
     * upstream LIBM source.
     */
    private ArrayDataPointerConstant piInvTable = pointerConstant(16, new int[]{
            // @formatter:off
            0x00000000, 0x00000000, 0xa2f9836e, 0x4e441529, 0xfc2757d1,
            0xf534ddc0, 0xdb629599, 0x3c439041, 0xfe5163ab, 0xdebbc561,
            0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c, 0xfe1deb1c,
            0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
            0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff,
            0xde05980f, 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7,
            0x4f463f66, 0x9e5fea2d, 0x7527bac7, 0xebe5f17b, 0x3d0739f7,
            0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08, 0x56033046, 0xfc7b6bab,
            0xf0cfbc21
            // @formatter:on
    });

    /** High-order part of pi/4 (approximately 0.785), used by the large-argument path. */
    private ArrayDataPointerConstant pi4 = pointerConstant(8, new int[]{
            // @formatter:off
            0x40000000, 0x3fe921fb,
            // @formatter:on
    });
    /** Small correction (approximately 3.8e-8) that is used together with {@link #pi4}. */
    private ArrayDataPointerConstant pi48 = pointerConstant(8, new int[]{
            // @formatter:off
            0x18469899, 0x3e64442d
            // @formatter:on
    });

    /** 32/pi (approximately 10.19), the scale factor of the range reduction. */
    private ArrayDataPointerConstant pi32Inv = pointerConstant(8, new int[]{
            // @formatter:off
            0x6dc9c883, 0x40245f30
            // @formatter:on
    });

    /**
     * 1.5 * 2^52, the SHIFTER of the class comment. NOTE(review): emitCode loads it into xmm2 once,
     * but xmm2 is overwritten before it is read on every path visible in this class, so the load
     * looks dead - confirm before removing anything.
     */
    private ArrayDataPointerConstant shifter = pointerConstant(8, new int[]{
            // @formatter:off
            0x00000000, 0x43380000
            // @formatter:on
    });

    /** Only the sign bit of a double set; AND-ed with x to extract its sign. */
    private ArrayDataPointerConstant signMask = pointerConstant(8, new int[]{
            // @formatter:off
            0x00000000, 0x80000000
            // @formatter:on
    });

    /** P_3 of the class comment. */
    private ArrayDataPointerConstant p3 = pointerConstant(8, new int[]{
            // @formatter:off
            0x2e037073, 0x3b63198a
            // @formatter:on
    });

    /** Largest double below 1.0 (all significand bits set); multiplier for subnormal/zero inputs. */
    private ArrayDataPointerConstant allOnes = pointerConstant(8, new int[]{
            // @formatter:off
            0xffffffff, 0x3fefffff
            // @formatter:on
    });

    /** 2^55. */
    private ArrayDataPointerConstant twoPow55 = pointerConstant(8, new int[]{
            // @formatter:off
            0x00000000, 0x43600000
            // @formatter:on
    });

    /** 2^-55. */
    private ArrayDataPointerConstant twoPowM55 = pointerConstant(8, new int[]{
            // @formatter:off
            0x00000000, 0x3c800000
            // @formatter:on
    });

    /** P_1 of the class comment (high part of pi/32); same bits as the immediate used on the main path. */
    private ArrayDataPointerConstant p1 = pointerConstant(8, new int[]{
            // @formatter:off
            0x54400000, 0x3fb921fb
            // @formatter:on
    });

    /** -0.0 (same bit pattern as {@link #signMask}); multiplier used on the Inf/NaN path. */
    private ArrayDataPointerConstant negZero = pointerConstant(8, new int[]{
            // @formatter:off
            0x00000000, 0x80000000
            // @formatter:on
    });

    /**
     * Emits the instruction sequence for the sine routine described in the class comment.
     *
     * The argument is read from xmm0, and every exit path reaches {@code block14} with the value
     * to return in xmm0. NOTE(review): that xmm0 is both the input and the result register is
     * inferred from the code below; the actual binding is in AMD64MathIntrinsicUnaryOp, which is
     * not visible here - confirm.
     *
     * Control flow of the emitted code:
     * <ul>
     * <li>fall-through after the range check: main path for 2^-252 <= |x| < 90112</li>
     * <li>block0 / block2: tiny arguments (section 7 of the class comment)</li>
     * <li>block1, block4 - block13: large arguments, reduced with {@link #piInvTable} and then fed
     * through a second copy of the main evaluation (block12)</li>
     * <li>block3: Inf and NaN</li>
     * <li>block14: common epilogue</li>
     * </ul>
     *
     * @param crb compilation result builder, passed to recordExternalAddress for every
     *            constant-table operand
     * @param masm assembler that receives the instructions
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Label block0 = new Label();
        Label block1 = new Label();
        Label block2 = new Label();
        Label block3 = new Label();
        Label block4 = new Label();
        Label block5 = new Label();
        Label block6 = new Label();
        Label block7 = new Label();
        Label block8 = new Label();
        Label block9 = new Label();
        Label block10 = new Label();
        Label block11 = new Label();
        Label block12 = new Label();
        Label block13 = new Label();
        Label block14 = new Label();

        // Prologue: save rbx, reserve 16 bytes of stack and spill the argument to [rsp + 8].
        masm.push(AMD64.rbx);
        masm.subq(rsp, 16);
        masm.movsd(new AMD64Address(rsp, 8), xmm0);
        // rax = upper 32 bits of x (sign, exponent and the top of the significand).
        masm.movl(rax, new AMD64Address(rsp, 12));
        masm.movq(xmm1, recordExternalAddress(crb, pi32Inv)); // 0x6dc9c883, 0x40245f30
        masm.movq(xmm2, recordExternalAddress(crb, shifter)); // 0x00000000, 0x43380000
        // Range check on the high word: 2147418112 == 0x7fff0000 drops the sign bit and the low
        // 16 bits, 808452096 == 0x30300000 rebases to the lower bound 2^-252 and
        // 281346048 == 0x10c50000 is the width of the accepted window (upper bound 90112).
        // The unsigned "Above" sends both too-small (wrapped around) and too-large inputs to
        // block0, which tells them apart with a signed test on the same flags.
        masm.andl(rax, 2147418112);
        masm.subl(rax, 808452096);
        masm.cmpl(rax, 281346048);
        masm.jcc(ConditionFlag.Above, block0);

        // ---- Main path ----
        // y = 32/pi * x; N = y rounded to nearest, done here by adding 0.5 carrying the sign of x
        // and truncating (cvttsd2si). rdx = N, xmm1 = (double) N.
        masm.mulsd(xmm1, xmm0);
        masm.movdqu(xmm5, recordExternalAddress(crb, onehalf)); // 0x00000000, 0x3fe00000,
                                                                // 0x00000000, 0x3fe00000
        masm.movq(xmm4, recordExternalAddress(crb, signMask)); // 0x00000000, 0x80000000
        masm.pand(xmm4, xmm0);
        masm.por(xmm5, xmm4);
        masm.addpd(xmm1, xmm5);
        masm.cvttsd2sil(rdx, xmm1);
        masm.cvtsi2sdl(xmm1, rdx);
        masm.movdqu(xmm6, recordExternalAddress(crb, p2)); // 0x1a600000, 0x3d90b461,
                                                           // 0x1a600000, 0x3d90b461
        // P_1 is materialised as an immediate here; it has the same bits as the p1 table.
        masm.movq(r8, 0x3fb921fb54400000L);
        masm.movdq(xmm3, r8);
        masm.movdqu(xmm5, recordExternalAddress(crb, sc4)); // 0xa556c734, 0x3ec71de3,
                                                            // 0x1a01a01a, 0x3efa01a0
        // 68 == 0x44: copy the low double of xmm0 into both lanes of xmm4.
        masm.pshufd(xmm4, xmm0, 68);
        // m_1 = N * P_1
        masm.mulsd(xmm3, xmm1);
        // Broadcast N into both lanes of xmm1.
        if (masm.supports(AMD64.CPUFeature.SSE3)) {
            masm.movddup(xmm1, xmm1);
        } else {
            masm.movlhps(xmm1, xmm1);
        }
        // rax = address of ctable entry (N & 63); each entry is 32 bytes.
        masm.andl(rdx, 63);
        masm.shll(rdx, 5);
        masm.leaq(AMD64.rax, recordExternalAddress(crb, ctable));
        masm.addq(AMD64.rax, AMD64.rdx);
        // m_2 = N * P_2 (both lanes), m_3 = N * P_3, r_1 = x - m_1.
        masm.mulpd(xmm6, xmm1);
        masm.mulsd(xmm1, recordExternalAddress(crb, p3)); // 0x2e037073, 0x3b63198a
        masm.subsd(xmm4, xmm3);
        masm.movq(xmm7, new AMD64Address(AMD64.rax, 8));
        masm.subsd(xmm0, xmm3);
        if (masm.supports(AMD64.CPUFeature.SSE3)) {
            masm.movddup(xmm3, xmm4);
        } else {
            masm.movdqu(xmm3, xmm4);
            masm.movlhps(xmm3, xmm3);
        }
        // From here on the range-reduction tail, the polynomial (section 3), the correction term
        // (section 4) and the compensated sums (section 5) of the class comment are interleaved
        // instruction by instruction; the statement order is significant.
        masm.subsd(xmm4, xmm6);
        masm.pshufd(xmm0, xmm0, 68);
        masm.movdqu(xmm2, new AMD64Address(AMD64.rax, 0));
        masm.mulpd(xmm5, xmm0);
        masm.subpd(xmm0, xmm6);
        masm.mulsd(xmm7, xmm4);
        masm.subsd(xmm3, xmm4);
        masm.mulpd(xmm5, xmm0);
        masm.mulpd(xmm0, xmm0);
        masm.subsd(xmm3, xmm6);
        masm.movdqu(xmm6, recordExternalAddress(crb, sc2)); // 0x11111111, 0x3f811111,
                                                            // 0x55555555, 0x3fa55555
        masm.subsd(xmm1, xmm3);
        masm.movq(xmm3, new AMD64Address(AMD64.rax, 24));
        masm.addsd(xmm2, xmm3);
        masm.subsd(xmm7, xmm2);
        masm.mulsd(xmm2, xmm4);
        masm.mulpd(xmm6, xmm0);
        masm.mulsd(xmm3, xmm4);
        masm.mulpd(xmm2, xmm0);
        masm.mulpd(xmm0, xmm0);
        masm.addpd(xmm5, recordExternalAddress(crb, sc3)); // 0x1a01a01a, 0xbf2a01a0,
                                                           // 0x16c16c17, 0xbf56c16c
        masm.mulsd(xmm4, new AMD64Address(AMD64.rax, 0));
        masm.addpd(xmm6, recordExternalAddress(crb, sc1)); // 0x55555555, 0xbfc55555,
                                                           // 0x00000000, 0xbfe00000
        masm.mulpd(xmm5, xmm0);
        masm.movdqu(xmm0, xmm3);
        masm.addsd(xmm3, new AMD64Address(AMD64.rax, 8));
        masm.mulpd(xmm1, xmm7);
        masm.movdqu(xmm7, xmm4);
        masm.addsd(xmm4, xmm3);
        masm.addpd(xmm6, xmm5);
        masm.movq(xmm5, new AMD64Address(AMD64.rax, 8));
        masm.subsd(xmm5, xmm3);
        masm.subsd(xmm3, xmm4);
        masm.addsd(xmm1, new AMD64Address(AMD64.rax, 16));
        masm.mulpd(xmm6, xmm2);
        masm.addsd(xmm5, xmm0);
        masm.addsd(xmm3, xmm7);
        // xmm1 collects the low-order terms: xmm5 holds k_2 and xmm3 holds k_3 at this point.
        masm.addsd(xmm1, xmm5);
        masm.addsd(xmm1, xmm3);
        masm.addsd(xmm1, xmm6);
        masm.unpckhpd(xmm6, xmm6);
        // Result = res_hi (xmm4) + res_lo (xmm1).
        masm.movdqu(xmm0, xmm4);
        masm.addsd(xmm1, xmm6);
        masm.addsd(xmm0, xmm1);
        masm.jmp(block14);

        // ---- Out-of-range arguments ----
        // The flags still come from the cmpl above: signed "Greater" means the argument was too
        // large, otherwise it was too small.
        masm.bind(block0);
        masm.jcc(ConditionFlag.Greater, block1);
        // 3325 == 0xcfd is what the rebased exponent field looks like when the exponent bits of
        // x are all zero (zero or subnormal): multiply by the largest double below 1.
        masm.shrl(rax, 20);
        masm.cmpl(rax, 3325);
        masm.jcc(ConditionFlag.NotEqual, block2);
        masm.mulsd(xmm0, recordExternalAddress(crb, allOnes)); // 0xffffffff, 0x3fefffff
        masm.jmp(block14);

        // Small normal argument: 2^-55 * (2^55 * x - x) is computed in xmm3.
        // NOTE(review): xmm0 is not overwritten before the jump, so the value left in xmm0 is x
        // itself and the xmm3 result is unused here - presumably the arithmetic is kept only for
        // its floating-point status-flag side effects; confirm against the upstream source.
        masm.bind(block2);
        masm.movq(xmm3, recordExternalAddress(crb, twoPow55)); // 0x00000000, 0x43600000
        masm.mulsd(xmm3, xmm0);
        masm.subsd(xmm3, xmm0);
        masm.mulsd(xmm3, recordExternalAddress(crb, twoPowM55)); // 0x00000000, 0x3c800000
        masm.jmp(block14);

        // ---- Large arguments ----
        masm.bind(block1);
        // 32752 == 0x7ff0: exponent bits of the top 16-bit word. All ones means Inf or NaN.
        masm.pextrw(rax, xmm0, 3);
        masm.andl(rax, 32752);
        masm.cmpl(rax, 32752);
        masm.jcc(ConditionFlag.Equal, block3);
        // rcx = byte offset into piInvTable derived from the exponent: 16224 == 0x3f60 rebases
        // the exponent field, and 65532 == 0xfffc keeps the offset a multiple of 4.
        masm.pextrw(rcx, xmm0, 3);
        masm.andl(rcx, 32752);
        masm.subl(rcx, 16224);
        masm.shrl(rcx, 7);
        masm.andl(rcx, 65532);
        masm.leaq(r11, recordExternalAddress(crb, piInvTable));
        masm.addq(AMD64.rcx, r11);
        // Split the significand of x: rdx = low 32 bits, rax = upper 21 bits with the implicit
        // leading one made explicit (bit 31 set before the final shift).
        masm.movdq(AMD64.rax, xmm0);
        masm.movl(r10, new AMD64Address(AMD64.rcx, 20));
        masm.movl(r8, new AMD64Address(AMD64.rcx, 24));
        masm.movl(rdx, rax);
        masm.shrq(AMD64.rax, 21);
        masm.orl(rax, Integer.MIN_VALUE);
        masm.shrl(rax, 11);
        // Multiply both significand halves by the seven table words at [rcx + 0 .. 24] and
        // accumulate the 32-bit partial products with manual carry propagation.
        // NOTE(review): the multi-word product appears to end up in r9 (top), r10 and r8 -
        // confirm against the upstream LIBM source before relying on this description.
        masm.movl(r9, r10);
        masm.imulq(r10, AMD64.rdx);
        masm.imulq(r9, AMD64.rax);
        masm.imulq(r8, AMD64.rax);
        masm.movl(rsi, new AMD64Address(AMD64.rcx, 16));
        masm.movl(rdi, new AMD64Address(AMD64.rcx, 12));
        masm.movl(r11, r10);
        masm.shrq(r10, 32);
        masm.addq(r9, r10);
        masm.addq(r11, r8);
        masm.movl(r8, r11);
        masm.shrq(r11, 32);
        masm.addq(r9, r11);
        masm.movl(r10, rsi);
        masm.imulq(rsi, AMD64.rdx);
        masm.imulq(r10, AMD64.rax);
        masm.movl(r11, rdi);
        masm.imulq(rdi, AMD64.rdx);
        masm.movl(rbx, rsi);
        masm.shrq(rsi, 32);
        masm.addq(r9, AMD64.rbx);
        masm.movl(rbx, r9);
        masm.shrq(r9, 32);
        masm.addq(r10, rsi);
        masm.addq(r10, r9);
        masm.shlq(AMD64.rbx, 32);
        masm.orq(r8, AMD64.rbx);
        masm.imulq(r11, AMD64.rax);
        masm.movl(r9, new AMD64Address(AMD64.rcx, 8));
        masm.movl(rsi, new AMD64Address(AMD64.rcx, 4));
        masm.movl(rbx, rdi);
        masm.shrq(rdi, 32);
        masm.addq(r10, AMD64.rbx);
        masm.movl(rbx, r10);
        masm.shrq(r10, 32);
        masm.addq(r11, rdi);
        masm.addq(r11, r10);
        masm.movq(rdi, r9);
        masm.imulq(r9, AMD64.rdx);
        masm.imulq(rdi, AMD64.rax);
        masm.movl(r10, r9);
        masm.shrq(r9, 32);
        masm.addq(r11, r10);
        masm.movl(r10, r11);
        masm.shrq(r11, 32);
        masm.addq(rdi, r9);
        masm.addq(rdi, r11);
        masm.movq(r9, rsi);
        masm.imulq(rsi, AMD64.rdx);
        masm.imulq(r9, AMD64.rax);
        masm.shlq(r10, 32);
        masm.orq(r10, AMD64.rbx);
        masm.movl(rax, new AMD64Address(AMD64.rcx, 0));
        masm.movl(r11, rsi);
        masm.shrq(rsi, 32);
        masm.addq(rdi, r11);
        masm.movl(r11, rdi);
        masm.shrq(rdi, 32);
        masm.addq(r9, rsi);
        masm.addq(r9, rdi);
        masm.imulq(AMD64.rdx, AMD64.rax);
        // Turn rcx back into a table byte offset, scale it by 8 (three doublings) and combine
        // it with the unbiased exponent of x to obtain the shift count used below.
        // rsi keeps the sign bit of x (32768 == 0x8000 in the top 16-bit word).
        masm.pextrw(rbx, xmm0, 3);
        masm.leaq(rdi, recordExternalAddress(crb, piInvTable));
        masm.subq(AMD64.rcx, rdi);
        masm.addl(rcx, rcx);
        masm.addl(rcx, rcx);
        masm.addl(rcx, rcx);
        masm.addl(rcx, 19);
        masm.movl(rsi, 32768);
        masm.andl(rsi, rbx);
        masm.shrl(rbx, 4);
        masm.andl(rbx, 2047);
        masm.subl(rbx, 1023);
        masm.subl(rcx, rbx);
        masm.addq(r9, AMD64.rdx);
        masm.movl(rdx, rcx);
        masm.addl(rdx, 32);
        masm.cmpl(rcx, 1);
        masm.jcc(ConditionFlag.Less, block4);
        // NOTE(review): the shift forms without an explicit count presumably shift by the count
        // held in rcx (CL) - confirm in AMD64Assembler.
        masm.negl(rcx);
        masm.addl(rcx, 29);
        masm.shll(r9);
        masm.movl(rdi, r9);
        // 536870911 == 0x1fffffff keeps the low 29 bits; 268435456 == 0x10000000 tests bit 28.
        // When it is set, block5 replaces the multi-word value by its negation.
        masm.andl(r9, 536870911);
        masm.testl(r9, 268435456);
        masm.jcc(ConditionFlag.NotEqual, block5);
        masm.shrl(r9);
        masm.movl(rbx, 0);
        masm.shlq(r9, 32);
        masm.orq(r9, r11);

        // block6 and block7 are bound to the same position.
        masm.bind(block6);

        masm.bind(block7);

        // If the top word is zero, block8 shifts the lower words up.
        masm.cmpq(r9, 0);
        masm.jcc(ConditionFlag.Equal, block8);

        // Normalise: locate the highest set bit of r9 and shift r9:r10:r8 accordingly,
        // tracking the applied shift in rdx.
        masm.bind(block9);
        masm.bsrq(r11, r9);
        masm.movl(rcx, 29);
        masm.subl(rcx, r11);
        masm.jcc(ConditionFlag.LessEqual, block10);
        masm.shlq(r9);
        masm.movq(AMD64.rax, r10);
        masm.shlq(r10);
        masm.addl(rdx, rcx);
        masm.negl(rcx);
        masm.addl(rcx, 64);
        masm.shrq(AMD64.rax);
        masm.shrq(r8);
        masm.orq(r9, AMD64.rax);
        masm.orq(r10, r8);

        // Convert the two integer words to doubles and scale them by powers of two that are
        // built directly as exponent/sign words: 16368 == 0x3ff0 is the exponent field of 1.0,
        // 1008 == 63 << 4 lowers the second factor's exponent by 63. rsi contributes the sign of
        // x, and rbx flips it when the value was negated in block5/block13.
        masm.bind(block11);
        masm.cvtsi2sdq(xmm0, r9);
        masm.shrq(r10, 1);
        masm.cvtsi2sdq(xmm3, r10);
        masm.xorpd(xmm4, xmm4);
        masm.shll(rdx, 4);
        masm.negl(rdx);
        masm.addl(rdx, 16368);
        masm.orl(rdx, rsi);
        masm.xorl(rdx, rbx);
        masm.pinsrw(xmm4, rdx, 3);
        masm.movq(xmm2, recordExternalAddress(crb, pi4)); // 0x40000000, 0x3fe921fb
        masm.movq(xmm6, recordExternalAddress(crb, pi48)); // 0x18469899, 0x3e64442d
        masm.xorpd(xmm5, xmm5);
        masm.subl(rdx, 1008);
        masm.pinsrw(xmm5, rdx, 3);
        masm.mulsd(xmm0, xmm4);
        masm.shll(rsi, 16);
        masm.sarl(rsi, 31);
        masm.mulsd(xmm3, xmm5);
        masm.movdqu(xmm1, xmm0);
        masm.mulsd(xmm0, xmm2);
        masm.shrl(rdi, 29);
        masm.addsd(xmm1, xmm3);
        masm.mulsd(xmm3, xmm2);
        // rsi is 0 or -1 here, so add + xor negates rdi when x is negative.
        masm.addl(rdi, rsi);
        masm.xorl(rdi, rsi);
        masm.mulsd(xmm6, xmm1);
        masm.movl(rax, rdi);
        masm.addsd(xmm6, xmm3);
        // Compensated sum: xmm0 = high part of the reduced argument, xmm6 = low part.
        masm.movdqu(xmm2, xmm0);
        masm.addsd(xmm0, xmm6);
        masm.subsd(xmm2, xmm0);
        masm.addsd(xmm6, xmm2);

        // Second copy of the main evaluation, applied to the reduced argument in xmm0. It differs
        // from the first copy in that rax (times 8) is added to the table index and the low part
        // xmm6 is subtracted from the correction term.
        masm.bind(block12);
        masm.movq(xmm1, recordExternalAddress(crb, pi32Inv)); // 0x6dc9c883, 0x40245f30
        masm.mulsd(xmm1, xmm0);
        masm.movq(xmm5, recordExternalAddress(crb, onehalf)); // 0x00000000, 0x3fe00000,
                                                              // 0x00000000, 0x3fe00000
        masm.movq(xmm4, recordExternalAddress(crb, signMask)); // 0x00000000, 0x80000000
        masm.pand(xmm4, xmm0);
        masm.por(xmm5, xmm4);
        masm.addpd(xmm1, xmm5);
        masm.cvttsd2sil(rdx, xmm1);
        masm.cvtsi2sdl(xmm1, rdx);
        masm.movq(xmm3, recordExternalAddress(crb, p1)); // 0x54400000, 0x3fb921fb
        masm.movdqu(xmm2, recordExternalAddress(crb, p2)); // 0x1a600000, 0x3d90b461,
                                                           // 0x1a600000, 0x3d90b461
        masm.mulsd(xmm3, xmm1);
        masm.unpcklpd(xmm1, xmm1);
        masm.shll(rax, 3);
        masm.addl(rdx, 1865216);
        masm.movdqu(xmm4, xmm0);
        masm.addl(rdx, rax);
        masm.andl(rdx, 63);
        masm.movdqu(xmm5, recordExternalAddress(crb, sc4)); // 0xa556c734, 0x3ec71de3,
                                                            // 0x1a01a01a, 0x3efa01a0
        masm.leaq(AMD64.rax, recordExternalAddress(crb, ctable));
        masm.shll(rdx, 5);
        masm.addq(AMD64.rax, AMD64.rdx);
        masm.mulpd(xmm2, xmm1);
        masm.subsd(xmm0, xmm3);
        masm.mulsd(xmm1, recordExternalAddress(crb, p3)); // 0x2e037073, 0x3b63198a
        masm.subsd(xmm4, xmm3);
        masm.movq(xmm7, new AMD64Address(AMD64.rax, 8));
        masm.unpcklpd(xmm0, xmm0);
        masm.movdqu(xmm3, xmm4);
        masm.subsd(xmm4, xmm2);
        masm.mulpd(xmm5, xmm0);
        masm.subpd(xmm0, xmm2);
        masm.mulsd(xmm7, xmm4);
        masm.subsd(xmm3, xmm4);
        masm.mulpd(xmm5, xmm0);
        masm.mulpd(xmm0, xmm0);
        masm.subsd(xmm3, xmm2);
        masm.movdqu(xmm2, new AMD64Address(AMD64.rax, 0));
        masm.subsd(xmm1, xmm3);
        masm.movq(xmm3, new AMD64Address(AMD64.rax, 24));
        masm.addsd(xmm2, xmm3);
        masm.subsd(xmm7, xmm2);
        masm.subsd(xmm1, xmm6);
        masm.movdqu(xmm6, recordExternalAddress(crb, sc2)); // 0x11111111, 0x3f811111,
                                                            // 0x55555555, 0x3fa55555
        masm.mulsd(xmm2, xmm4);
        masm.mulpd(xmm6, xmm0);
        masm.mulsd(xmm3, xmm4);
        masm.mulpd(xmm2, xmm0);
        masm.mulpd(xmm0, xmm0);
        masm.addpd(xmm5, recordExternalAddress(crb, sc3)); // 0x1a01a01a, 0xbf2a01a0,
                                                           // 0x16c16c17, 0xbf56c16c
        masm.mulsd(xmm4, new AMD64Address(AMD64.rax, 0));
        masm.addpd(xmm6, recordExternalAddress(crb, sc1)); // 0x55555555, 0xbfc55555,
                                                           // 0x00000000, 0xbfe00000
        masm.mulpd(xmm5, xmm0);
        masm.movdqu(xmm0, xmm3);
        masm.addsd(xmm3, new AMD64Address(AMD64.rax, 8));
        masm.mulpd(xmm1, xmm7);
        masm.movdqu(xmm7, xmm4);
        masm.addsd(xmm4, xmm3);
        masm.addpd(xmm6, xmm5);
        masm.movq(xmm5, new AMD64Address(AMD64.rax, 8));
        masm.subsd(xmm5, xmm3);
        masm.subsd(xmm3, xmm4);
        masm.addsd(xmm1, new AMD64Address(AMD64.rax, 16));
        masm.mulpd(xmm6, xmm2);
        masm.addsd(xmm5, xmm0);
        masm.addsd(xmm3, xmm7);
        masm.addsd(xmm1, xmm5);
        masm.addsd(xmm1, xmm3);
        masm.addsd(xmm1, xmm6);
        masm.unpckhpd(xmm6, xmm6);
        masm.movdqu(xmm0, xmm4);
        masm.addsd(xmm1, xmm6);
        masm.addsd(xmm0, xmm1);
        masm.jmp(block14);

        // Top word was zero: move the lower words up by 64 bits (twice if necessary). If all
        // words are zero, continue at block12 with a reduced argument of 0.
        masm.bind(block8);
        masm.addl(rdx, 64);
        masm.movq(r9, r10);
        masm.movq(r10, r8);
        masm.movl(r8, 0);
        masm.cmpq(r9, 0);
        masm.jcc(ConditionFlag.NotEqual, block9);
        masm.addl(rdx, 64);
        masm.movq(r9, r10);
        masm.movq(r10, r8);
        masm.cmpq(r9, 0);
        masm.jcc(ConditionFlag.NotEqual, block9);
        masm.xorpd(xmm0, xmm0);
        masm.xorpd(xmm6, xmm6);
        masm.jmp(block12);

        // Reached with the flags of "29 - bsr(r9)": equal means no shift is needed, otherwise
        // the value is shifted right instead of left.
        masm.bind(block10);
        masm.jcc(ConditionFlag.Equal, block11);
        masm.negl(rcx);
        masm.shrq(r10);
        masm.movq(AMD64.rax, r9);
        masm.shrq(r9);
        masm.subl(rdx, rcx);
        masm.negl(rcx);
        masm.addl(rcx, 64);
        masm.shlq(AMD64.rax);
        masm.orq(r10, AMD64.rax);
        masm.jmp(block11);

        // Variant of the code after the "cmpl(rcx, 1)" test for rcx < 1; joins at block7.
        masm.bind(block4);
        masm.negl(rcx);
        masm.shlq(r9, 32);
        masm.orq(r9, r11);
        masm.shlq(r9);
        masm.movq(rdi, r9);
        masm.testl(r9, Integer.MIN_VALUE);
        masm.jcc(ConditionFlag.NotEqual, block13);
        masm.shrl(r9);
        masm.movl(rbx, 0);
        masm.shrq(rdi, 3);
        masm.jmp(block7);

        // Replace the three-word value by (constant in rbx : 0 : 0) minus r9:r10:r8 using a
        // sub/sbb borrow chain and record the sign flip in rbx (32768 == 0x8000).
        masm.bind(block5);
        masm.shrl(r9);
        masm.movl(rbx, 536870912);
        masm.shrl(rbx);
        masm.shlq(r9, 32);
        masm.orq(r9, r11);
        masm.shlq(AMD64.rbx, 32);
        masm.addl(rdi, 536870912);
        masm.movl(AMD64.rcx, 0);
        masm.movl(r11, 0);
        masm.subq(AMD64.rcx, r8);
        masm.sbbq(r11, r10);
        masm.sbbq(AMD64.rbx, r9);
        masm.movq(r8, AMD64.rcx);
        masm.movq(r10, r11);
        masm.movq(r9, AMD64.rbx);
        masm.movl(rbx, 32768);
        masm.jmp(block6);

        // Same negation as block5 for the block4 variant; joins at block7.
        masm.bind(block13);
        masm.shrl(r9);
        masm.movq(AMD64.rbx, 0x100000000L);
        masm.shrq(AMD64.rbx);
        masm.movl(AMD64.rcx, 0);
        masm.movl(r11, 0);
        masm.subq(AMD64.rcx, r8);
        masm.sbbq(r11, r10);
        masm.sbbq(AMD64.rbx, r9);
        masm.movq(r8, AMD64.rcx);
        masm.movq(r10, r11);
        masm.movq(r9, AMD64.rbx);
        masm.movl(rbx, 32768);
        masm.shrq(rdi, 3);
        masm.addl(rdi, 536870912);
        masm.jmp(block7);

        // ---- Inf / NaN ----
        // Reload the original argument and multiply it by -0.0; per the class comment this
        // yields NaN for both inputs. The store to [rsp + 0] writes into the scratch area that
        // the epilogue releases immediately afterwards.
        // NOTE(review): nothing visible here reads that slot again - looks like a dead store.
        masm.bind(block3);
        masm.movq(xmm0, new AMD64Address(rsp, 8));
        masm.mulsd(xmm0, recordExternalAddress(crb, negZero)); // 0x00000000, 0x80000000
        masm.movq(new AMD64Address(rsp, 0), xmm0);

        // ---- Epilogue: release the scratch area and restore rbx ----
        masm.bind(block14);
        masm.addq(rsp, 16);
        masm.pop(AMD64.rbx);
    }
}