1 /* Copyright (c) 2018, Cavium. All rights reserved. (By BELLSOFT) 2 * Copyright (c) 2016, Intel Corporation. 3 * Intel Math Library (LIBM) Source Code 4 * 5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 6 * 7 * This code is free software; you can redistribute it and/or modify it 8 * under the terms of the GNU General Public License version 2 only, as 9 * published by the Free Software Foundation. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 * 25 */ 26 27 #include "precompiled.hpp" 28 #include "asm/assembler.hpp" 29 #include "asm/assembler.inline.hpp" 30 #include "macroAssembler_aarch64.hpp" 31 32 // Algorithm idea is taken from x86 hotspot intrinsic and adapted for AARCH64. 33 // 34 // For mathematical background please refer to the following literature: 35 // 36 // Tang, Ping-Tak Peter. 37 // Table-driven implementation of the logarithm function 38 // in IEEE floating-point arithmetic. 39 // ACM Transactions on Mathematical Software (TOMS) 16, no. 4, 1990: 378-400. 
/******************************************************************************/
//                     ALGORITHM DESCRIPTION - LOG()
//                     ---------------------
//
//    x=2^k * mx, mx in [1,2)
//
//    Get B~1/mx based on the output of frecpe instruction (B0)
//    B = int((B0*2^7+0.5))/2^7
//
//    Reduced argument: r=B*mx-1.0 (computed accurately in high and low parts)
//
//    Result:  k*log(2) - log(B) + p(r) if |x-1| >= small value (2^-6)  and
//             p(r) is a degree 7 polynomial
//             -log(B) read from data table (high, low parts)
//             Result is formed from high and low parts
//
// Special cases:
// 1. log(NaN) = quiet NaN
// 2. log(+INF) = that INF
// 3. log(0) = -INF
// 4. log(1) = +0
// 5. log(x) = NaN if x < -0, including -INF
//
/******************************************************************************/

// Table with p(r) polynomial coefficients
// and table representation of logarithm values (hi and low parts)
//
// Layout (every double is stored as two 32-bit words, low word first):
//   words   0..15  : 8 doubles {C1_0, C1_1, C2_0, C2_1, C3_0, C3_1, C4_0, C4_1};
//                    fast_log loads these 64 bytes with one ld1 into C1..C4
//   words  16..531 : 129 {hi, low} pairs (16 bytes each) of -log(B); fast_log
//                    addresses them as (table + 64) + index * 16
// The first pair is log(2) (B = 0.5), the last pair is {+0.0, -0.0} (B = 1.0).
__attribute__ ((aligned(64))) juint _L_tbl[] =
{
  // coefficients of p(r) polynomial:
  // _coeff[]
  0x00000000UL, 0xbfd00000UL, // C1_0 = -0.25
  0x92492492UL, 0x3fc24924UL, // C1_1 = 0.14285714285714285
  0x55555555UL, 0x3fd55555UL, // C2_0 = 0.3333333333333333
  0x3d6fb175UL, 0xbfc5555eUL, // C2_1 = -0.16666772842235003
  0x00000000UL, 0xbfe00000UL, // C3_0 = -0.5
  0x9999999aUL, 0x3fc99999UL, // C3_1 = 0.2
  // _log2[]
  0xfefa3800UL, 0x3fa62e42UL, // C4_0 = 0.043321698784993146 (hi part of log(2)/16)
  0x93c76730UL, 0x3ceef357UL, // C4_1 = 3.436201886692732e-15 (low part of log(2)/16)
  // _L_tbl[] with logarithm values (hi and low parts)
  0xfefa3800UL, 0x3fe62e42UL, 0x93c76730UL, 0x3d2ef357UL, 0xaa241800UL,
  0x3fe5ee82UL, 0x0cda46beUL, 0x3d220238UL, 0x5c364800UL, 0x3fe5af40UL,
  0xac10c9fbUL, 0x3d2dfa63UL, 0x26bb8c00UL, 0x3fe5707aUL, 0xff3303ddUL,
  0x3d09980bUL, 0x26867800UL, 0x3fe5322eUL, 0x5d257531UL, 0x3d05ccc4UL,
  0x835a5000UL, 0x3fe4f45aUL, 0x6d93b8fbUL, 0xbd2e6c51UL, 0x6f970c00UL,
  0x3fe4b6fdUL, 0xed4c541cUL, 0x3cef7115UL, 0x27e8a400UL, 0x3fe47a15UL,
  0xf94d60aaUL, 0xbd22cb6aUL, 0xf2f92400UL, 0x3fe43d9fUL, 0x481051f7UL,
  0xbcfd984fUL, 0x2125cc00UL, 0x3fe4019cUL, 0x30f0c74cUL, 0xbd26ce79UL,
  0x0c36c000UL, 0x3fe3c608UL, 0x7cfe13c2UL, 0xbd02b736UL, 0x17197800UL,
  0x3fe38ae2UL, 0xbb5569a4UL, 0xbd218b7aUL, 0xad9d8c00UL, 0x3fe35028UL,
  0x9527e6acUL, 0x3d10b83fUL, 0x44340800UL, 0x3fe315daUL, 0xc5a0ed9cUL,
  0xbd274e93UL, 0x57b0e000UL, 0x3fe2dbf5UL, 0x07b9dc11UL, 0xbd17a6e5UL,
  0x6d0ec000UL, 0x3fe2a278UL, 0xe797882dUL, 0x3d206d2bUL, 0x1134dc00UL,
  0x3fe26962UL, 0x05226250UL, 0xbd0b61f1UL, 0xd8bebc00UL, 0x3fe230b0UL,
  0x6e48667bUL, 0x3d12fc06UL, 0x5fc61800UL, 0x3fe1f863UL, 0xc9fe81d3UL,
  0xbd2a7242UL, 0x49ae6000UL, 0x3fe1c078UL, 0xed70e667UL, 0x3cccacdeUL,
  0x40f23c00UL, 0x3fe188eeUL, 0xf8ab4650UL, 0x3d14cc4eUL, 0xf6f29800UL,
  0x3fe151c3UL, 0xa293ae49UL, 0xbd2edd97UL, 0x23c75c00UL, 0x3fe11af8UL,
  0xbb9ddcb2UL, 0xbd258647UL, 0x8611cc00UL, 0x3fe0e489UL, 0x07801742UL,
  0x3d1c2998UL, 0xe2d05400UL, 0x3fe0ae76UL, 0x887e7e27UL, 0x3d1f486bUL,
  0x0533c400UL, 0x3fe078bfUL, 0x41edf5fdUL, 0x3d268122UL, 0xbe760400UL,
  0x3fe04360UL, 0xe79539e0UL, 0xbd04c45fUL, 0xe5b20800UL, 0x3fe00e5aUL,
  0xb1727b1cUL, 0xbd053ba3UL, 0xaf7a4800UL, 0x3fdfb358UL, 0x3c164935UL,
  0x3d0085faUL, 0xee031800UL, 0x3fdf4aa7UL, 0x6f014a8bUL, 0x3d12cde5UL,
  0x56b41000UL, 0x3fdee2a1UL, 0x5a470251UL, 0x3d2f27f4UL, 0xc3ddb000UL,
  0x3fde7b42UL, 0x5372bd08UL, 0xbd246550UL, 0x1a272800UL, 0x3fde148aUL,
  0x07322938UL, 0xbd1326b2UL, 0x484c9800UL, 0x3fddae75UL, 0x60dc616aUL,
  0xbd1ea42dUL, 0x46def800UL, 0x3fdd4902UL, 0xe9a767a8UL, 0x3d235bafUL,
  0x18064800UL, 0x3fdce42fUL, 0x3ec7a6b0UL, 0xbd0797c3UL, 0xc7455800UL,
  0x3fdc7ff9UL, 0xc15249aeUL, 0xbd29b6ddUL, 0x693fa000UL, 0x3fdc1c60UL,
  0x7fe8e180UL, 0x3d2cec80UL, 0x1b80e000UL, 0x3fdbb961UL, 0xf40a666dUL,
  0x3d27d85bUL, 0x04462800UL, 0x3fdb56faUL, 0x2d841995UL, 0x3d109525UL,
  0x5248d000UL, 0x3fdaf529UL, 0x52774458UL, 0xbd217cc5UL, 0x3c8ad800UL,
  0x3fda93edUL, 0xbea77a5dUL, 0x3d1e36f2UL, 0x0224f800UL, 0x3fda3344UL,
  0x7f9d79f5UL, 0x3d23c645UL, 0xea15f000UL, 0x3fd9d32bUL, 0x10d0c0b0UL,
  0xbd26279eUL, 0x43135800UL, 0x3fd973a3UL, 0xa502d9f0UL, 0xbd152313UL,
  0x635bf800UL, 0x3fd914a8UL, 0x2ee6307dUL, 0xbd1766b5UL, 0xa88b3000UL,
  0x3fd8b639UL, 0xe5e70470UL, 0xbd205ae1UL, 0x776dc800UL, 0x3fd85855UL,
  0x3333778aUL, 0x3d2fd56fUL, 0x3bd81800UL, 0x3fd7fafaUL, 0xc812566aUL,
  0xbd272090UL, 0x687cf800UL, 0x3fd79e26UL, 0x2efd1778UL, 0x3d29ec7dUL,
  0x76c67800UL, 0x3fd741d8UL, 0x49dc60b3UL, 0x3d2d8b09UL, 0xe6af1800UL,
  0x3fd6e60eUL, 0x7c222d87UL, 0x3d172165UL, 0x3e9c6800UL, 0x3fd68ac8UL,
  0x2756eba0UL, 0x3d20a0d3UL, 0x0b3ab000UL, 0x3fd63003UL, 0xe731ae00UL,
  0xbd2db623UL, 0xdf596000UL, 0x3fd5d5bdUL, 0x08a465dcUL, 0xbd0a0b2aUL,
  0x53c8d000UL, 0x3fd57bf7UL, 0xee5d40efUL, 0x3d1fadedUL, 0x0738a000UL,
  0x3fd522aeUL, 0x8164c759UL, 0x3d2ebe70UL, 0x9e173000UL, 0x3fd4c9e0UL,
  0x1b0ad8a4UL, 0xbd2e2089UL, 0xc271c800UL, 0x3fd4718dUL, 0x0967d675UL,
  0xbd2f27ceUL, 0x23d5e800UL, 0x3fd419b4UL, 0xec90e09dUL, 0x3d08e436UL,
  0x77333000UL, 0x3fd3c252UL, 0xb606bd5cUL, 0x3d183b54UL, 0x76be1000UL,
  0x3fd36b67UL, 0xb0f177c8UL, 0x3d116ecdUL, 0xe1d36000UL, 0x3fd314f1UL,
  0xd3213cb8UL, 0xbd28e27aUL, 0x7cdc9000UL, 0x3fd2bef0UL, 0x4a5004f4UL,
  0x3d2a9cfaUL, 0x1134d800UL, 0x3fd26962UL, 0xdf5bb3b6UL, 0x3d2c93c1UL,
  0x6d0eb800UL, 0x3fd21445UL, 0xba46baeaUL, 0x3d0a87deUL, 0x635a6800UL,
  0x3fd1bf99UL, 0x5147bdb7UL, 0x3d2ca6edUL, 0xcbacf800UL, 0x3fd16b5cUL,
  0xf7a51681UL, 0x3d2b9acdUL, 0x8227e800UL, 0x3fd1178eUL, 0x63a5f01cUL,
  0xbd2c210eUL, 0x67616000UL, 0x3fd0c42dUL, 0x163ceae9UL, 0x3d27188bUL,
  0x604d5800UL, 0x3fd07138UL, 0x16ed4e91UL, 0x3cf89cdbUL, 0x5626c800UL,
  0x3fd01eaeUL, 0x1485e94aUL, 0xbd16f08cUL, 0x6cb3b000UL, 0x3fcf991cUL,
  0xca0cdf30UL, 0x3d1bcbecUL, 0xe4dd0000UL, 0x3fcef5adUL, 0x65bb8e11UL,
  0xbcca2115UL, 0xffe71000UL, 0x3fce530eUL, 0x6041f430UL, 0x3cc21227UL,
  0xb0d49000UL, 0x3fcdb13dUL, 0xf715b035UL, 0xbd2aff2aUL, 0xf2656000UL,
  0x3fcd1037UL, 0x75b6f6e4UL, 0xbd084a7eUL, 0xc6f01000UL, 0x3fcc6ffbUL,
  0xc5962bd2UL, 0xbcf1ec72UL, 0x383be000UL, 0x3fcbd087UL, 0x595412b6UL,
  0xbd2d4bc4UL, 0x575bd000UL, 0x3fcb31d8UL, 0x4eace1aaUL, 0xbd0c358dUL,
  0x3c8ae000UL, 0x3fca93edUL, 0x50562169UL, 0xbd287243UL, 0x07089000UL,
  0x3fc9f6c4UL, 0x6865817aUL, 0x3d29904dUL, 0xdcf70000UL, 0x3fc95a5aUL,
  0x58a0ff6fUL, 0x3d07f228UL, 0xeb390000UL, 0x3fc8beafUL, 0xaae92cd1UL,
  0xbd073d54UL, 0x6551a000UL, 0x3fc823c1UL, 0x9a631e83UL, 0x3d1e0ddbUL,
  0x85445000UL, 0x3fc7898dUL, 0x70914305UL, 0xbd1c6610UL, 0x8b757000UL,
  0x3fc6f012UL, 0xe59c21e1UL, 0xbd25118dUL, 0xbe8c1000UL, 0x3fc6574eUL,
  0x2c3c2e78UL, 0x3d19cf8bUL, 0x6b544000UL, 0x3fc5bf40UL, 0xeb68981cUL,
  0xbd127023UL, 0xe4a1b000UL, 0x3fc527e5UL, 0xe5697dc7UL, 0x3d2633e8UL,
  0x8333b000UL, 0x3fc4913dUL, 0x54fdb678UL, 0x3d258379UL, 0xa5993000UL,
  0x3fc3fb45UL, 0x7e6a354dUL, 0xbd2cd1d8UL, 0xb0159000UL, 0x3fc365fcUL,
  0x234b7289UL, 0x3cc62fa8UL, 0x0c868000UL, 0x3fc2d161UL, 0xcb81b4a1UL,
  0x3d039d6cUL, 0x2a49c000UL, 0x3fc23d71UL, 0x8fd3df5cUL, 0x3d100d23UL,
  0x7e23f000UL, 0x3fc1aa2bUL, 0x44389934UL, 0x3d2ca78eUL, 0x8227e000UL,
  0x3fc1178eUL, 0xce2d07f2UL, 0x3d21ef78UL, 0xb59e4000UL, 0x3fc08598UL,
  0x7009902cUL, 0xbd27e5ddUL, 0x39dbe000UL, 0x3fbfe891UL, 0x4fa10afdUL,
  0xbd2534d6UL, 0x830a2000UL, 0x3fbec739UL, 0xafe645e0UL, 0xbd2dc068UL,
  0x63844000UL, 0x3fbda727UL, 0x1fa71733UL, 0x3d1a8940UL, 0x01bc4000UL,
  0x3fbc8858UL, 0xc65aacd3UL, 0x3d2646d1UL, 0x8dad6000UL, 0x3fbb6ac8UL,
  0x2bf768e5UL, 0xbd139080UL, 0x40b1c000UL, 0x3fba4e76UL, 0xb94407c8UL,
  0xbd0e42b6UL, 0x5d594000UL, 0x3fb9335eUL, 0x3abd47daUL, 0x3d23115cUL,
  0x2f40e000UL, 0x3fb8197eUL, 0xf96ffdf7UL, 0x3d0f80dcUL, 0x0aeac000UL,
  0x3fb700d3UL, 0xa99ded32UL, 0x3cec1e8dUL, 0x4d97a000UL, 0x3fb5e95aUL,
  0x3c5d1d1eUL, 0xbd2c6906UL, 0x5d208000UL, 0x3fb4d311UL, 0x82f4e1efUL,
  0xbcf53a25UL, 0xa7d1e000UL, 0x3fb3bdf5UL, 0xa5db4ed7UL, 0x3d2cc85eUL,
  0xa4472000UL, 0x3fb2aa04UL, 0xae9c697dUL, 0xbd20b6e8UL, 0xd1466000UL,
  0x3fb1973bUL, 0x560d9e9bUL, 0xbd25325dUL, 0xb59e4000UL, 0x3fb08598UL,
  0x7009902cUL, 0xbd17e5ddUL, 0xc006c000UL, 0x3faeea31UL, 0x4fc93b7bUL,
  0xbd0e113eUL, 0xcdddc000UL, 0x3faccb73UL, 0x47d82807UL, 0xbd1a68f2UL,
  0xd0fb0000UL, 0x3faaaef2UL, 0x353bb42eUL, 0x3d20fc1aUL, 0x149fc000UL,
  0x3fa894aaUL, 0xd05a267dUL, 0xbd197995UL, 0xf2d4c000UL, 0x3fa67c94UL,
  0xec19afa2UL, 0xbd029efbUL, 0xd42e0000UL, 0x3fa466aeUL, 0x75bdfd28UL,
  0xbd2c1673UL, 0x2f8d0000UL, 0x3fa252f3UL, 0xe021b67bUL, 0x3d283e9aUL,
  0x89e74000UL, 0x3fa0415dUL, 0x5cf1d753UL, 0x3d0111c0UL, 0xec148000UL,
  0x3f9c63d2UL, 0x3f9eb2f3UL, 0x3d2578c6UL, 0x28c90000UL, 0x3f984925UL,
  0x325a0c34UL, 0xbd2aa0baUL, 0x25980000UL, 0x3f9432a9UL, 0x928637feUL,
  0x3d098139UL, 0x58938000UL, 0x3f902056UL, 0x06e2f7d2UL, 0xbd23dc5bUL,
  0xa3890000UL, 0x3f882448UL, 0xda74f640UL, 0xbd275577UL, 0x75890000UL,
  0x3f801015UL, 0x999d2be8UL, 0xbd10c76bUL, 0x59580000UL, 0x3f700805UL,
  0xcb31c67bUL, 0x3d2166afUL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
  0x80000000UL
};

// BEGIN dlog PSEUDO CODE:
//  double dlog(double X) {
//    // In this pseudo code _L_tbl is viewed as an array of doubles
//    // (two consecutive 32-bit table words per element).
//    // p(r) polynomial coefficients initialized from _L_tbl table
//    double C1_0 = _L_tbl[0];
//    double C1_1 = _L_tbl[1];
//    double C2_0 = _L_tbl[2];
//    double C2_1 = _L_tbl[3];
//    double C3_0 = _L_tbl[4];
//    double C3_1 = _L_tbl[5];
//    double C4_0 = _L_tbl[6];
//    double C4_1 = _L_tbl[7];
//    // NOTE: operations with coefficients above are mostly vectorized in assembly
//    // Check corner cases first
//    if (X == 1.0d || AS_LONG_BITS(X) + 0x0010000000000000 <= 0x0010000000000000) {
//      // NOTE: AS_LONG_BITS(X) + 0x0010000000000000 <= 0x0010000000000000
//      //    (signed comparison) means that X is +-0.0, X < 0 or
//      //    AS_LONG_BITS(X) >= 0x7FF0000000000000 (0x7FF* is NaN or INF)
//      if (X < 0 || X is NaN) return NaN;
//      if (X == 1.0d) return 0.0d;
//      if (X == 0.0d) return -INFINITY;
//      if (X is INFINITY) return INFINITY;
//    }
//    // double representation is 2^exponent * mantissa
//    // split X into two multipliers: 2^exponent and 1.0 * mantissa
//    // pseudo function: zeroExponent(X) return value of X with exponent == 0
//    float vtmp5 = 1/(float)(zeroExponent(X)); // reciprocal estimate
//    // pseudo function: HI16(X) returns high 16 bits of double value
//    int hiWord = HI16(X);
//    double vtmp1 = (double) 0x77F0 << 48 | mantissa(X);
//    hiWord -= 16;
//    if (AS_LONG_BITS(hiWord) > 0x8000) {
//      // SMALL_VALUE branch (X is subnormal): rescale X by 2^128 first
//      vtmp0 = vtmp1 = X * AS_DOUBLE_BITS(0x47F0000000000000);
//      hiWord = HI16(vtmp1);
//      vtmp0 = AS_DOUBLE_BITS(AS_LONG_BITS(vtmp0) | 0x3FF0000000000000);
//      vtmp5 = (double) (1/(float)vtmp0);
//      vtmp1 <<= 12;
//      vtmp1 >>= 12;
//    }
//    // MAIN branch
//    double vtmp3 = AS_LONG_BITS(vtmp1) & 0xffffe00000000000; // hi part
//    int intB0 = AS_INT_BITS(vtmp5) + 0x8000;
//    double vtmp0 = AS_DOUBLE_BITS(0xffffe00000000000 & (intB0<<29));
//    int index = (intB0 >> 16) & 0xFF;
//    // table pairs start after the 8 coefficient doubles, 2 doubles per index
//    double hiTableValue = _L_tbl[8 + 2*index]; // vtmp2[0]
//    double lowTableValue = _L_tbl[9 + 2*index]; // vtmp2[1]
//    vtmp5 = (double)((hiWord & 0x7FF0) - 0x3FE0); // 0x3FE0 = 1022 << 4
//    vtmp1 -= vtmp3; // low part
//    vtmp3 = vtmp3*vtmp0 - 1.0;
//    hiTableValue += C4_0 * vtmp5;
//    lowTableValue += C4_1 * vtmp5;
//    double r = vtmp1 * vtmp0 + vtmp3; // r = B*mx-1.0, computed in hi and low parts
//    vtmp0 = hiTableValue + r;
//    hiTableValue -= vtmp0;
//    double r2 = r*r;
//    double r3 = r2*r;
//    double p7 = C3_0*r2 + C2_0*r3 + C1_0*r2*r2 + C3_1*r3*r2 + C2_1*r3*r3
//              + C1_1*r3*r2*r2; // degree 7 polynomial
//    return p7 + (vtmp0 + ((r + hiTableValue) + lowTableValue));
//  }
//
// END dlog PSEUDO CODE

Generate log(X). X passed in register v0. Return log(X) into v0. 252 // Generator parameters: 10 temporary FPU registers and temporary general 253 // purpose registers 254 void MacroAssembler::fast_log(FloatRegister vtmp0, FloatRegister vtmp1, 255 FloatRegister vtmp2, FloatRegister vtmp3, 256 FloatRegister vtmp4, FloatRegister vtmp5, 257 FloatRegister C1, FloatRegister C2, 258 FloatRegister C3, FloatRegister C4, 259 Register tmp1, Register tmp2, Register tmp3, 260 Register tmp4, Register tmp5) { 261 Label DONE, CHECK_CORNER_CASES, SMALL_VALUE, MAIN, 262 CHECKED_CORNER_CASES, RETURN_MINF_OR_NAN; 263 const long INF_OR_NAN_PREFIX = 0x7FF0; 264 const long MINF_OR_MNAN_PREFIX = 0xFFF0; 265 const long ONE_PREFIX = 0x3FF0; 266 movz(tmp2, ONE_PREFIX, 48); 267 movz(tmp4, 0x0010, 48); 268 fmovd(rscratch1, v0); // rscratch1 = AS_LONG_BITS(X) 269 lea(rscratch2, ExternalAddress((address)_L_tbl)); 270 movz(tmp5, 0x7F); 271 add(tmp1, rscratch1, tmp4); 272 cmp(tmp2, rscratch1); 273 lsr(tmp3, rscratch1, 29); 274 ccmp(tmp1, tmp4, 0b1101 /* LE */, NE); 275 bfm(tmp3, tmp5, 41, 8); 276 fmovs(vtmp5, tmp3); 277 // Load coefficients from table. All coefficients are organized to be 278 // in specific order, because load below will load it in vectors to be used 279 // later in vector instructions. Load will be performed in parallel while 280 // branches are taken. 
C1 will contain vector of {C1_0, C1_1}, C2 = 281 // {C2_0, C2_1}, C3 = {C3_0, C3_1}, C4 = {C4_0, C4_1} 282 ld1(C1, C2, C3, C4, T2D, post(rscratch2, 64)); 283 br(LE, CHECK_CORNER_CASES); 284 bind(CHECKED_CORNER_CASES); 285 // all corner cases are handled 286 frecpe(vtmp5, vtmp5, S); // vtmp5 ~= 1/vtmp5 287 lsr(tmp2, rscratch1, 48); 288 movz(tmp4, 0x77f0, 48); 289 fmovd(vtmp4, 1.0d); 290 movz(tmp1, INF_OR_NAN_PREFIX, 48); 291 bfm(tmp4, rscratch1, 0, 51); // tmp4 = 0x77F0 << 48 | mantissa(X) 292 // vtmp1 = AS_DOUBLE_BITS(0x77F0 << 48 | mantissa(X)) == mx 293 fmovd(vtmp1, tmp4); 294 subw(tmp2, tmp2, 16); 295 subs(zr, tmp2, 0x8000); 296 br(GE, SMALL_VALUE); 297 bind(MAIN); 298 fmovs(tmp3, vtmp5); // int intB0 = AS_INT_BITS(B); 299 mov(tmp5, 0x3FE0); 300 mov(rscratch1, 0xffffe00000000000); 301 andr(tmp2, tmp2, tmp1, LSR, 48); // hiWord & 0x7FF0 302 sub(tmp2, tmp2, tmp5); // tmp2 = hiWord & 0x7FF0 - 0x3FE0 303 scvtfwd(vtmp5, tmp2); // vtmp5 = (double)tmp2; 304 addw(tmp3, tmp3, 0x8000); // tmp3 = B 305 andr(tmp4, tmp4, rscratch1); // tmp4 == hi_part(mx) 306 andr(rscratch1, rscratch1, tmp3, LSL, 29); // rscratch1 = hi_part(B) 307 ubfm(tmp3, tmp3, 16, 23); // int index = (intB0 >> 16) && 0xFF 308 ldrq(vtmp2, Address(rscratch2, tmp3, Address::lsl(4))); // vtmp2 = _L_tbl[index] 309 // AS_LONG_BITS(vtmp1) & 0xffffe00000000000 // hi_part(mx) 310 fmovd(vtmp3, tmp4); 311 fmovd(vtmp0, rscratch1); // vtmp0 = hi_part(B) 312 fsubd(vtmp1, vtmp1, vtmp3); // vtmp1 -= vtmp3; // low_part(mx) 313 fnmsub(vtmp3, vtmp3, vtmp0, vtmp4); // vtmp3 = vtmp3*vtmp0 - vtmp4 314 fmlavs(vtmp2, T2D, C4, vtmp5, 0); // vtmp2 += {C4} * vtmp5 315 // vtmp1 = r = vtmp1 * vtmp0 + vtmp3 == low_part(mx) * hi_part(B) + (hi_part(mx)*hi_part(B) - 1.0) 316 fmaddd(vtmp1, vtmp1, vtmp0, vtmp3); 317 ins(vtmp5, D, vtmp2, 0, 1); // vtmp5 = vtmp2[1]; 318 faddd(vtmp0, vtmp2, vtmp1); // vtmp0 = vtmp2 + vtmp1 319 fmlavs(C3, T2D, C2, vtmp1, 0); // {C3} += {C2}*vtmp1 320 fsubd(vtmp2, vtmp2, vtmp0); // vtmp2 -= vtmp0 321 
fmuld(vtmp3, vtmp1, vtmp1); // vtmp3 = vtmp1*vtmp1 322 faddd(C4, vtmp1, vtmp2); // C4[0] = vtmp1 + vtmp2 323 fmlavs(C3, T2D, C1, vtmp3, 0); // {C3} += {C1}*vtmp3 324 faddd(C4, C4, vtmp5); // C4 += vtmp5 325 fmuld(vtmp4, vtmp3, vtmp1); // vtmp4 = vtmp3*vtmp1 326 faddd(vtmp0, vtmp0, C4); // vtmp0 += C4 327 fmlavs(C3, T2D, vtmp4, C3, 1); // {C3} += {vtmp4}*C3[1] 328 fmaddd(vtmp0, C3, vtmp3, vtmp0); // vtmp0 = C3 * vtmp3 + vtmp0 329 ret(lr); 330 331 block_comment("if (AS_LONG_BITS(hiWord) > 0x8000)"); { 332 bind(SMALL_VALUE); 333 movz(tmp2, 0x47F0, 48); 334 fmovd(vtmp1, tmp2); 335 fmuld(vtmp0, vtmp1, v0); 336 fmovd(vtmp1, vtmp0); 337 umov(tmp2, vtmp1, S, 3); 338 orr(vtmp0, T16B, vtmp0, vtmp4); 339 ushr(vtmp5, T2D, vtmp0, 27); 340 ushr(vtmp5, T4S, vtmp5, 2); 341 frecpe(vtmp5, vtmp5, S); 342 shl(vtmp1, T2D, vtmp1, 12); 343 ushr(vtmp1, T2D, vtmp1, 12); 344 b(MAIN); 345 } 346 347 block_comment("Corner cases"); { 348 bind(RETURN_MINF_OR_NAN); 349 movz(tmp1, MINF_OR_MNAN_PREFIX, 48); 350 orr(rscratch1, rscratch1, tmp1); 351 fmovd(v0, rscratch1); 352 ret(lr); 353 bind(CHECK_CORNER_CASES); 354 movz(tmp1, INF_OR_NAN_PREFIX, 48); 355 cmp(rscratch1, zr); 356 br(LE, RETURN_MINF_OR_NAN); 357 cmp(rscratch1, tmp1); 358 br(GE, DONE); 359 cmp(rscratch1, tmp2); 360 br(NE, CHECKED_CORNER_CASES); 361 fmovd(v0, 0.0d); 362 } 363 bind(DONE); 364 ret(lr); 365 }