1 /* Copyright (c) 2018, Cavium. All rights reserved. (By BELLSOFT)
   2  * Copyright (c) 2016, Intel Corporation.
   3  * Intel Math Library (LIBM) Source Code
   4  *
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/assembler.hpp"
  29 #include "asm/assembler.inline.hpp"
  30 #include "macroAssembler_aarch64.hpp"
  31 
  32 // Algorithm idea is taken from x86 hotspot intrinsic and adapted for AARCH64.
  33 //
  34 // For mathematical background please refer to the following literature:
  35 //
  36 // Tang, Ping-Tak Peter.
  37 // Table-driven implementation of the logarithm function
  38 // in IEEE floating-point arithmetic.
  39 // ACM Transactions on Mathematical Software (TOMS) 16, no. 4, 1990: 378-400.
  40 
  41 /******************************************************************************/
  42 //                     ALGORITHM DESCRIPTION - LOG()
  43 //                     ---------------------
  44 //
  45 //    x=2^k * mx, mx in [1,2)
  46 //
  47 //    Get B~1/mx based on the output of frecpe instruction (B0)
  48 //    B = int((B0*2^7+0.5))/2^7
  49 //
  50 //    Reduced argument: r=B*mx-1.0 (computed accurately in high and low parts)
  51 //
  52 //    Result:  k*log(2) - log(B) + p(r) if |x-1| >= small value (2^-6)  and
  53 //             p(r) is a degree 7 polynomial
  54 //             -log(B) read from data table (high, low parts)
  55 //             Result is formed from high and low parts
  56 //
  57 // Special cases:
  58 // 1. log(NaN) = quiet NaN
  59 // 2. log(+INF) = that INF
  60 // 3. log(0) = -INF
  61 // 4. log(1) = +0
  62 // 5. log(x) = NaN if x < -0, including -INF
  63 //
  64 /******************************************************************************/
  65 
  66 // Table with p(r) polynomial coefficients
  67 // and table representation of logarithm values (hi and low parts)
     //
     // Layout: every double occupies two consecutive juint words, low 32 bits
     // first (see the annotated coefficient values below).
     //   words  0..15  : C1_0, C1_1, C2_0, C2_1, C3_0, C3_1, C4_0, C4_1.
     //                   fast_log loads these 64 bytes with a single ld1 so that
     //                   C1 = {C1_0, C1_1}, ..., C4 = {C4_0, C4_1}.
     //   words 16..531 : 129 {hi, low} pairs of doubles, 16 bytes per pair.
     //                   fast_log reads pair i with one 128-bit load from byte
     //                   offset 64 + 16 * i.
     // The array is 64-byte aligned.
  68 __attribute__ ((aligned(64))) juint _L_tbl[] =
  69 {
  70     // coefficients of p(r) polynomial:
  71     // _coeff[]
  72     0x00000000UL, 0xbfd00000UL, // C1_0 = -0.25
  73     0x92492492UL, 0x3fc24924UL, // C1_1 = 0.14285714285714285
  74     0x55555555UL, 0x3fd55555UL, // C2_0 = 0.3333333333333333
  75     0x3d6fb175UL, 0xbfc5555eUL, // C2_1 = -0.16666772842235003
  76     0x00000000UL, 0xbfe00000UL, // C3_0 = -0.5
  77     0x9999999aUL, 0x3fc99999UL, // C3_1 = 0.2
  78     // _log2[]
     // C4_0 (hi) and C4_1 (low) together represent log(2)/16; fast_log
     // multiplies them by 16 * (unbiased exponent of X).
  79     0xfefa3800UL, 0x3fa62e42UL, // C4_0 = 0.043321698784993146
  80     0x93c76730UL, 0x3ceef357UL, // C4_1 = 3.436201886692732e-15
  81     // _L_tbl[] with logarithm values (hi and low parts)
     // Each group of four words is one {hi, low} pair. The first pair has the
     // same mantissa bits as {C4_0, C4_1} with the exponent field larger by 4,
     // i.e. it is 16 * {C4_0, C4_1} = log(2) in hi/low form.
  82     0xfefa3800UL, 0x3fe62e42UL, 0x93c76730UL, 0x3d2ef357UL, 0xaa241800UL,
  83     0x3fe5ee82UL, 0x0cda46beUL, 0x3d220238UL, 0x5c364800UL, 0x3fe5af40UL,
  84     0xac10c9fbUL, 0x3d2dfa63UL, 0x26bb8c00UL, 0x3fe5707aUL, 0xff3303ddUL,
  85     0x3d09980bUL, 0x26867800UL, 0x3fe5322eUL, 0x5d257531UL, 0x3d05ccc4UL,
  86     0x835a5000UL, 0x3fe4f45aUL, 0x6d93b8fbUL, 0xbd2e6c51UL, 0x6f970c00UL,
  87     0x3fe4b6fdUL, 0xed4c541cUL, 0x3cef7115UL, 0x27e8a400UL, 0x3fe47a15UL,
  88     0xf94d60aaUL, 0xbd22cb6aUL, 0xf2f92400UL, 0x3fe43d9fUL, 0x481051f7UL,
  89     0xbcfd984fUL, 0x2125cc00UL, 0x3fe4019cUL, 0x30f0c74cUL, 0xbd26ce79UL,
  90     0x0c36c000UL, 0x3fe3c608UL, 0x7cfe13c2UL, 0xbd02b736UL, 0x17197800UL,
  91     0x3fe38ae2UL, 0xbb5569a4UL, 0xbd218b7aUL, 0xad9d8c00UL, 0x3fe35028UL,
  92     0x9527e6acUL, 0x3d10b83fUL, 0x44340800UL, 0x3fe315daUL, 0xc5a0ed9cUL,
  93     0xbd274e93UL, 0x57b0e000UL, 0x3fe2dbf5UL, 0x07b9dc11UL, 0xbd17a6e5UL,
  94     0x6d0ec000UL, 0x3fe2a278UL, 0xe797882dUL, 0x3d206d2bUL, 0x1134dc00UL,
  95     0x3fe26962UL, 0x05226250UL, 0xbd0b61f1UL, 0xd8bebc00UL, 0x3fe230b0UL,
  96     0x6e48667bUL, 0x3d12fc06UL, 0x5fc61800UL, 0x3fe1f863UL, 0xc9fe81d3UL,
  97     0xbd2a7242UL, 0x49ae6000UL, 0x3fe1c078UL, 0xed70e667UL, 0x3cccacdeUL,
  98     0x40f23c00UL, 0x3fe188eeUL, 0xf8ab4650UL, 0x3d14cc4eUL, 0xf6f29800UL,
  99     0x3fe151c3UL, 0xa293ae49UL, 0xbd2edd97UL, 0x23c75c00UL, 0x3fe11af8UL,
 100     0xbb9ddcb2UL, 0xbd258647UL, 0x8611cc00UL, 0x3fe0e489UL, 0x07801742UL,
 101     0x3d1c2998UL, 0xe2d05400UL, 0x3fe0ae76UL, 0x887e7e27UL, 0x3d1f486bUL,
 102     0x0533c400UL, 0x3fe078bfUL, 0x41edf5fdUL, 0x3d268122UL, 0xbe760400UL,
 103     0x3fe04360UL, 0xe79539e0UL, 0xbd04c45fUL, 0xe5b20800UL, 0x3fe00e5aUL,
 104     0xb1727b1cUL, 0xbd053ba3UL, 0xaf7a4800UL, 0x3fdfb358UL, 0x3c164935UL,
 105     0x3d0085faUL, 0xee031800UL, 0x3fdf4aa7UL, 0x6f014a8bUL, 0x3d12cde5UL,
 106     0x56b41000UL, 0x3fdee2a1UL, 0x5a470251UL, 0x3d2f27f4UL, 0xc3ddb000UL,
 107     0x3fde7b42UL, 0x5372bd08UL, 0xbd246550UL, 0x1a272800UL, 0x3fde148aUL,
 108     0x07322938UL, 0xbd1326b2UL, 0x484c9800UL, 0x3fddae75UL, 0x60dc616aUL,
 109     0xbd1ea42dUL, 0x46def800UL, 0x3fdd4902UL, 0xe9a767a8UL, 0x3d235bafUL,
 110     0x18064800UL, 0x3fdce42fUL, 0x3ec7a6b0UL, 0xbd0797c3UL, 0xc7455800UL,
 111     0x3fdc7ff9UL, 0xc15249aeUL, 0xbd29b6ddUL, 0x693fa000UL, 0x3fdc1c60UL,
 112     0x7fe8e180UL, 0x3d2cec80UL, 0x1b80e000UL, 0x3fdbb961UL, 0xf40a666dUL,
 113     0x3d27d85bUL, 0x04462800UL, 0x3fdb56faUL, 0x2d841995UL, 0x3d109525UL,
 114     0x5248d000UL, 0x3fdaf529UL, 0x52774458UL, 0xbd217cc5UL, 0x3c8ad800UL,
 115     0x3fda93edUL, 0xbea77a5dUL, 0x3d1e36f2UL, 0x0224f800UL, 0x3fda3344UL,
 116     0x7f9d79f5UL, 0x3d23c645UL, 0xea15f000UL, 0x3fd9d32bUL, 0x10d0c0b0UL,
 117     0xbd26279eUL, 0x43135800UL, 0x3fd973a3UL, 0xa502d9f0UL, 0xbd152313UL,
 118     0x635bf800UL, 0x3fd914a8UL, 0x2ee6307dUL, 0xbd1766b5UL, 0xa88b3000UL,
 119     0x3fd8b639UL, 0xe5e70470UL, 0xbd205ae1UL, 0x776dc800UL, 0x3fd85855UL,
 120     0x3333778aUL, 0x3d2fd56fUL, 0x3bd81800UL, 0x3fd7fafaUL, 0xc812566aUL,
 121     0xbd272090UL, 0x687cf800UL, 0x3fd79e26UL, 0x2efd1778UL, 0x3d29ec7dUL,
 122     0x76c67800UL, 0x3fd741d8UL, 0x49dc60b3UL, 0x3d2d8b09UL, 0xe6af1800UL,
 123     0x3fd6e60eUL, 0x7c222d87UL, 0x3d172165UL, 0x3e9c6800UL, 0x3fd68ac8UL,
 124     0x2756eba0UL, 0x3d20a0d3UL, 0x0b3ab000UL, 0x3fd63003UL, 0xe731ae00UL,
 125     0xbd2db623UL, 0xdf596000UL, 0x3fd5d5bdUL, 0x08a465dcUL, 0xbd0a0b2aUL,
 126     0x53c8d000UL, 0x3fd57bf7UL, 0xee5d40efUL, 0x3d1fadedUL, 0x0738a000UL,
 127     0x3fd522aeUL, 0x8164c759UL, 0x3d2ebe70UL, 0x9e173000UL, 0x3fd4c9e0UL,
 128     0x1b0ad8a4UL, 0xbd2e2089UL, 0xc271c800UL, 0x3fd4718dUL, 0x0967d675UL,
 129     0xbd2f27ceUL, 0x23d5e800UL, 0x3fd419b4UL, 0xec90e09dUL, 0x3d08e436UL,
 130     0x77333000UL, 0x3fd3c252UL, 0xb606bd5cUL, 0x3d183b54UL, 0x76be1000UL,
 131     0x3fd36b67UL, 0xb0f177c8UL, 0x3d116ecdUL, 0xe1d36000UL, 0x3fd314f1UL,
 132     0xd3213cb8UL, 0xbd28e27aUL, 0x7cdc9000UL, 0x3fd2bef0UL, 0x4a5004f4UL,
 133     0x3d2a9cfaUL, 0x1134d800UL, 0x3fd26962UL, 0xdf5bb3b6UL, 0x3d2c93c1UL,
 134     0x6d0eb800UL, 0x3fd21445UL, 0xba46baeaUL, 0x3d0a87deUL, 0x635a6800UL,
 135     0x3fd1bf99UL, 0x5147bdb7UL, 0x3d2ca6edUL, 0xcbacf800UL, 0x3fd16b5cUL,
 136     0xf7a51681UL, 0x3d2b9acdUL, 0x8227e800UL, 0x3fd1178eUL, 0x63a5f01cUL,
 137     0xbd2c210eUL, 0x67616000UL, 0x3fd0c42dUL, 0x163ceae9UL, 0x3d27188bUL,
 138     0x604d5800UL, 0x3fd07138UL, 0x16ed4e91UL, 0x3cf89cdbUL, 0x5626c800UL,
 139     0x3fd01eaeUL, 0x1485e94aUL, 0xbd16f08cUL, 0x6cb3b000UL, 0x3fcf991cUL,
 140     0xca0cdf30UL, 0x3d1bcbecUL, 0xe4dd0000UL, 0x3fcef5adUL, 0x65bb8e11UL,
 141     0xbcca2115UL, 0xffe71000UL, 0x3fce530eUL, 0x6041f430UL, 0x3cc21227UL,
 142     0xb0d49000UL, 0x3fcdb13dUL, 0xf715b035UL, 0xbd2aff2aUL, 0xf2656000UL,
 143     0x3fcd1037UL, 0x75b6f6e4UL, 0xbd084a7eUL, 0xc6f01000UL, 0x3fcc6ffbUL,
 144     0xc5962bd2UL, 0xbcf1ec72UL, 0x383be000UL, 0x3fcbd087UL, 0x595412b6UL,
 145     0xbd2d4bc4UL, 0x575bd000UL, 0x3fcb31d8UL, 0x4eace1aaUL, 0xbd0c358dUL,
 146     0x3c8ae000UL, 0x3fca93edUL, 0x50562169UL, 0xbd287243UL, 0x07089000UL,
 147     0x3fc9f6c4UL, 0x6865817aUL, 0x3d29904dUL, 0xdcf70000UL, 0x3fc95a5aUL,
 148     0x58a0ff6fUL, 0x3d07f228UL, 0xeb390000UL, 0x3fc8beafUL, 0xaae92cd1UL,
 149     0xbd073d54UL, 0x6551a000UL, 0x3fc823c1UL, 0x9a631e83UL, 0x3d1e0ddbUL,
 150     0x85445000UL, 0x3fc7898dUL, 0x70914305UL, 0xbd1c6610UL, 0x8b757000UL,
 151     0x3fc6f012UL, 0xe59c21e1UL, 0xbd25118dUL, 0xbe8c1000UL, 0x3fc6574eUL,
 152     0x2c3c2e78UL, 0x3d19cf8bUL, 0x6b544000UL, 0x3fc5bf40UL, 0xeb68981cUL,
 153     0xbd127023UL, 0xe4a1b000UL, 0x3fc527e5UL, 0xe5697dc7UL, 0x3d2633e8UL,
 154     0x8333b000UL, 0x3fc4913dUL, 0x54fdb678UL, 0x3d258379UL, 0xa5993000UL,
 155     0x3fc3fb45UL, 0x7e6a354dUL, 0xbd2cd1d8UL, 0xb0159000UL, 0x3fc365fcUL,
 156     0x234b7289UL, 0x3cc62fa8UL, 0x0c868000UL, 0x3fc2d161UL, 0xcb81b4a1UL,
 157     0x3d039d6cUL, 0x2a49c000UL, 0x3fc23d71UL, 0x8fd3df5cUL, 0x3d100d23UL,
 158     0x7e23f000UL, 0x3fc1aa2bUL, 0x44389934UL, 0x3d2ca78eUL, 0x8227e000UL,
 159     0x3fc1178eUL, 0xce2d07f2UL, 0x3d21ef78UL, 0xb59e4000UL, 0x3fc08598UL,
 160     0x7009902cUL, 0xbd27e5ddUL, 0x39dbe000UL, 0x3fbfe891UL, 0x4fa10afdUL,
 161     0xbd2534d6UL, 0x830a2000UL, 0x3fbec739UL, 0xafe645e0UL, 0xbd2dc068UL,
 162     0x63844000UL, 0x3fbda727UL, 0x1fa71733UL, 0x3d1a8940UL, 0x01bc4000UL,
 163     0x3fbc8858UL, 0xc65aacd3UL, 0x3d2646d1UL, 0x8dad6000UL, 0x3fbb6ac8UL,
 164     0x2bf768e5UL, 0xbd139080UL, 0x40b1c000UL, 0x3fba4e76UL, 0xb94407c8UL,
 165     0xbd0e42b6UL, 0x5d594000UL, 0x3fb9335eUL, 0x3abd47daUL, 0x3d23115cUL,
 166     0x2f40e000UL, 0x3fb8197eUL, 0xf96ffdf7UL, 0x3d0f80dcUL, 0x0aeac000UL,
 167     0x3fb700d3UL, 0xa99ded32UL, 0x3cec1e8dUL, 0x4d97a000UL, 0x3fb5e95aUL,
 168     0x3c5d1d1eUL, 0xbd2c6906UL, 0x5d208000UL, 0x3fb4d311UL, 0x82f4e1efUL,
 169     0xbcf53a25UL, 0xa7d1e000UL, 0x3fb3bdf5UL, 0xa5db4ed7UL, 0x3d2cc85eUL,
 170     0xa4472000UL, 0x3fb2aa04UL, 0xae9c697dUL, 0xbd20b6e8UL, 0xd1466000UL,
 171     0x3fb1973bUL, 0x560d9e9bUL, 0xbd25325dUL, 0xb59e4000UL, 0x3fb08598UL,
 172     0x7009902cUL, 0xbd17e5ddUL, 0xc006c000UL, 0x3faeea31UL, 0x4fc93b7bUL,
 173     0xbd0e113eUL, 0xcdddc000UL, 0x3faccb73UL, 0x47d82807UL, 0xbd1a68f2UL,
 174     0xd0fb0000UL, 0x3faaaef2UL, 0x353bb42eUL, 0x3d20fc1aUL, 0x149fc000UL,
 175     0x3fa894aaUL, 0xd05a267dUL, 0xbd197995UL, 0xf2d4c000UL, 0x3fa67c94UL,
 176     0xec19afa2UL, 0xbd029efbUL, 0xd42e0000UL, 0x3fa466aeUL, 0x75bdfd28UL,
 177     0xbd2c1673UL, 0x2f8d0000UL, 0x3fa252f3UL, 0xe021b67bUL, 0x3d283e9aUL,
 178     0x89e74000UL, 0x3fa0415dUL, 0x5cf1d753UL, 0x3d0111c0UL, 0xec148000UL,
 179     0x3f9c63d2UL, 0x3f9eb2f3UL, 0x3d2578c6UL, 0x28c90000UL, 0x3f984925UL,
 180     0x325a0c34UL, 0xbd2aa0baUL, 0x25980000UL, 0x3f9432a9UL, 0x928637feUL,
 181     0x3d098139UL, 0x58938000UL, 0x3f902056UL, 0x06e2f7d2UL, 0xbd23dc5bUL,
 182     0xa3890000UL, 0x3f882448UL, 0xda74f640UL, 0xbd275577UL, 0x75890000UL,
 183     0x3f801015UL, 0x999d2be8UL, 0xbd10c76bUL, 0x59580000UL, 0x3f700805UL,
     // The final four words form the last pair: hi = +0.0, low = -0.0.
 184     0xcb31c67bUL, 0x3d2166afUL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
 185     0x80000000UL
 186 };
 187 
 188 // BEGIN dlog PSEUDO CODE:
 189 //  double dlog(double X) {
 190 //    // p(r) polynomial coefficients initialized from _L_tbl table
 191 //    double C1_0 = _L_tbl[0];
 192 //    double C1_1 = _L_tbl[1];
 193 //    double C2_0 = _L_tbl[2];
 194 //    double C2_1 = _L_tbl[3];
 195 //    double C3_0 = _L_tbl[4];
 196 //    double C3_1 = _L_tbl[5];
 197 //    double C4_0 = _L_tbl[6];
 198 //    double C4_1 = _L_tbl[7];
 199 //    // NOTE: operations with coefficients above are mostly vectorized in assembly
 200 //    // Check corner cases first
 201 //    if (X == 1.0d || AS_LONG_BITS(X) + 0x0010000000000000 <= 0x0010000000000000) {
 202 //      // NOTE: the signed compare AS_LONG_BITS(X) + 0x0010000000000000 <= 0x0010000000000000
 203 //      //    means X <= 0 or AS_LONG_BITS(X) >= 0x7FF0000000000000 (0x7FF* is NaN or INF)
 204 //      if (X < 0 || X is NaN) return NaN;
 205 //      if (X == 1.0d) return 0.0d;
 206 //      if (X == 0.0d) return -INFINITY;
 207 //      if (X is INFINITY) return INFINITY;
 208 //    }
 209 //    // double representation is 2^exponent * mantissa
 210 //    // split X into two multipliers: 2^exponent and 1.0 * mantissa
 211 //    // pseudo function: zeroExponent(X) return value of X with exponent == 0
 212 //    float vtmp5 = 1/(float)(zeroExponent(X)); // reciprocal estimate
 213 //    // pseudo function: HI16(X) returns high 16 bits of double value
 214 //    int hiWord = HI16(X);
 215 //    double vtmp1 = AS_DOUBLE_BITS(0x77F0 << 48 | mantissa(X));
 216 //    hiWord -= 16;
 217 //    if (AS_LONG_BITS(hiWord) > 0x8000) {
 218 //      // SMALL_VALUE branch
 219 //      vtmp0 = vtmp1 = X * AS_DOUBLE_BITS(0x47F0000000000000); // X * 2^128
 220 //      hiWord = HI16(vtmp1);
 221 //      vtmp0 = AS_DOUBLE_BITS(AS_LONG_BITS(vtmp0) |= 0x3FF0000000000000);
 222 //      vtmp5 = (double) (1/(float)vtmp0);
 223 //      vtmp1 <<= 12;
 224 //      vtmp1 >>= 12;
 225 //    }
 226 //    // MAIN branch
 227 //    double vtmp3 = AS_LONG_BITS(vtmp1) & 0xffffe00000000000; // hi part
 228 //    int intB0 = AS_INT_BITS(vtmp5) + 0x8000;
 229 //    double vtmp0 = AS_DOUBLE_BITS(0xffffe00000000000 & (intB0<<29));
 230 //    int index = (intB0 >> 16) & 0xFF;
 231 //    double hiTableValue = _L_tbl[8 + 2*index]; // vtmp2[0]
 232 //    double lowTableValue = _L_tbl[8 + 2*index + 1]; // vtmp2[1]
 233 //    vtmp5 = (double)((hiWord & 0x7FF0) - 0x3FE0); // 0x3FE0 = 1022 << 4
 234 //    vtmp1 -= vtmp3; // low part
 235 //    vtmp3 = vtmp3*vtmp0 - 1.0;
 236 //    hiTableValue += C4_0 * vtmp5;
 237 //    lowTableValue += C4_1 * vtmp5;
 238 //    double r = vtmp1 * vtmp0 + vtmp3; // r = B*mx-1.0, computed in hi and low parts
 239 //    vtmp0 = hiTableValue + r;
 240 //    hiTableValue -= vtmp0;
 241 //    double r2 = r*r;
 242 //    double r3 = r2*r;
 243 //    double p7 = C3_0*r2 + C2_0*r3 + C1_0*r2*r2 + C3_1*r3*r2 + C2_1*r3*r3
 244 //              + C1_1*r3*r2*r2; // degree 7 polynomial
 245 //    return p7 + (vtmp0 + ((r + hiTableValue) + lowTableValue));
 246 //  }
 247 //
 248 // END dlog PSEUDO CODE
 249 
 250 
 251 // Generate log(X). X passed in register v0. Return log(X) into v0.
 252 // Generator parameters: 10 temporary FPU registers and  temporary general
 253 // purpose registers
 254 void MacroAssembler::fast_log(FloatRegister vtmp0, FloatRegister vtmp1,
 255                               FloatRegister vtmp2, FloatRegister vtmp3,
 256                               FloatRegister vtmp4, FloatRegister vtmp5,
 257                               FloatRegister C1, FloatRegister C2,
 258                               FloatRegister C3, FloatRegister C4,
 259                               Register tmp1, Register tmp2, Register tmp3,
 260                               Register tmp4, Register tmp5) {
 261   Label DONE, CHECK_CORNER_CASES, SMALL_VALUE, MAIN,
 262       CHECKED_CORNER_CASES, RETURN_MINF_OR_NAN;
 263   const long INF_OR_NAN_PREFIX = 0x7FF0;
 264   const long MINF_OR_MNAN_PREFIX = 0xFFF0;
 265   const long ONE_PREFIX = 0x3FF0;
 266     movz(tmp2, ONE_PREFIX, 48);
 267     movz(tmp4, 0x0010, 48);
 268     fmovd(rscratch1, v0); // rscratch1 = AS_LONG_BITS(X)
 269     lea(rscratch2, ExternalAddress((address)_L_tbl));
 270     movz(tmp5, 0x7F);
 271     add(tmp1, rscratch1, tmp4);
 272     cmp(tmp2, rscratch1);
 273     lsr(tmp3, rscratch1, 29);
 274     ccmp(tmp1, tmp4, 0b1101 /* LE */, NE);
 275     bfm(tmp3, tmp5, 41, 8);
 276     fmovs(vtmp5, tmp3);
 277     // Load coefficients from table. All coefficients are organized to be
 278     // in specific order, because load below will load it in vectors to be used
 279     // later in vector instructions. Load will be performed in parallel while
 280     // branches are taken. C1 will contain vector of {C1_0, C1_1}, C2 =
 281     // {C2_0, C2_1}, C3 = {C3_0, C3_1}, C4 = {C4_0, C4_1}
 282     ld1(C1, C2, C3, C4, T2D, post(rscratch2, 64));
 283     br(LE, CHECK_CORNER_CASES);
 284   bind(CHECKED_CORNER_CASES);
 285     // all corner cases are handled
 286     frecpe(vtmp5, vtmp5, S);                   // vtmp5 ~= 1/vtmp5
 287     lsr(tmp2, rscratch1, 48);
 288     movz(tmp4, 0x77f0, 48);
 289     fmovd(vtmp4, 1.0d);
 290     movz(tmp1, INF_OR_NAN_PREFIX, 48);
 291     bfm(tmp4, rscratch1, 0, 51);               // tmp4 = 0x77F0 << 48 | mantissa(X)
 292     // vtmp1 = AS_DOUBLE_BITS(0x77F0 << 48 | mantissa(X)) == mx
 293     fmovd(vtmp1, tmp4);
 294     subw(tmp2, tmp2, 16);
 295     subs(zr, tmp2, 0x8000);
 296     br(GE, SMALL_VALUE);
 297   bind(MAIN);
 298     fmovs(tmp3, vtmp5);                        // int intB0 = AS_INT_BITS(B);
 299     mov(tmp5, 0x3FE0);
 300     mov(rscratch1, 0xffffe00000000000);
 301     andr(tmp2, tmp2, tmp1, LSR, 48);           // hiWord & 0x7FF0
 302     sub(tmp2, tmp2, tmp5);                     // tmp2 = hiWord & 0x7FF0 - 0x3FE0
 303     scvtfwd(vtmp5, tmp2);                      // vtmp5 = (double)tmp2;
 304     addw(tmp3, tmp3, 0x8000);                  // tmp3 = B
 305     andr(tmp4, tmp4, rscratch1);               // tmp4 == hi_part(mx)
 306     andr(rscratch1, rscratch1, tmp3, LSL, 29); // rscratch1 = hi_part(B)
 307     ubfm(tmp3, tmp3, 16, 23);                  // int index = (intB0 >> 16) && 0xFF
 308     ldrq(vtmp2, Address(rscratch2, tmp3, Address::lsl(4))); // vtmp2 = _L_tbl[index]
 309     // AS_LONG_BITS(vtmp1) & 0xffffe00000000000 // hi_part(mx)
 310     fmovd(vtmp3, tmp4);
 311     fmovd(vtmp0, rscratch1);                   // vtmp0 = hi_part(B)
 312     fsubd(vtmp1, vtmp1, vtmp3);                // vtmp1 -= vtmp3; // low_part(mx)
 313     fnmsub(vtmp3, vtmp3, vtmp0, vtmp4);        // vtmp3 = vtmp3*vtmp0 - vtmp4
 314     fmlavs(vtmp2, T2D, C4, vtmp5, 0);          // vtmp2 += {C4} * vtmp5
 315     // vtmp1 = r = vtmp1 * vtmp0 + vtmp3 == low_part(mx) * hi_part(B) + (hi_part(mx)*hi_part(B) - 1.0)
 316     fmaddd(vtmp1, vtmp1, vtmp0, vtmp3);
 317     ins(vtmp5, D, vtmp2, 0, 1);                // vtmp5 = vtmp2[1];
 318     faddd(vtmp0, vtmp2, vtmp1);                // vtmp0 = vtmp2 + vtmp1
 319     fmlavs(C3, T2D, C2, vtmp1, 0);             // {C3} += {C2}*vtmp1
 320     fsubd(vtmp2, vtmp2, vtmp0);                // vtmp2 -= vtmp0
 321     fmuld(vtmp3, vtmp1, vtmp1);                // vtmp3 = vtmp1*vtmp1
 322     faddd(C4, vtmp1, vtmp2);                   // C4[0] = vtmp1 + vtmp2
 323     fmlavs(C3, T2D, C1, vtmp3, 0);             // {C3} += {C1}*vtmp3
 324     faddd(C4, C4, vtmp5);                      // C4 += vtmp5
 325     fmuld(vtmp4, vtmp3, vtmp1);                // vtmp4 = vtmp3*vtmp1
 326     faddd(vtmp0, vtmp0, C4);                   // vtmp0 += C4
 327     fmlavs(C3, T2D, vtmp4, C3, 1);             // {C3} += {vtmp4}*C3[1]
 328     fmaddd(vtmp0, C3, vtmp3, vtmp0);           // vtmp0 = C3 * vtmp3 + vtmp0
 329     ret(lr);
 330 
 331   block_comment("if (AS_LONG_BITS(hiWord) > 0x8000)"); {
 332     bind(SMALL_VALUE);
 333       movz(tmp2, 0x47F0, 48);
 334       fmovd(vtmp1, tmp2);
 335       fmuld(vtmp0, vtmp1, v0);
 336       fmovd(vtmp1, vtmp0);
 337       umov(tmp2, vtmp1, S, 3);
 338       orr(vtmp0, T16B, vtmp0, vtmp4);
 339       ushr(vtmp5, T2D, vtmp0, 27);
 340       ushr(vtmp5, T4S, vtmp5, 2);
 341       frecpe(vtmp5, vtmp5, S);
 342       shl(vtmp1, T2D, vtmp1, 12);
 343       ushr(vtmp1, T2D, vtmp1, 12);
 344       b(MAIN);
 345   }
 346 
 347   block_comment("Corner cases"); {
 348     bind(RETURN_MINF_OR_NAN);
 349       movz(tmp1, MINF_OR_MNAN_PREFIX, 48);
 350       orr(rscratch1, rscratch1, tmp1);
 351       fmovd(v0, rscratch1);
 352       ret(lr);
 353     bind(CHECK_CORNER_CASES);
 354       movz(tmp1, INF_OR_NAN_PREFIX, 48);
 355       cmp(rscratch1, zr);
 356       br(LE, RETURN_MINF_OR_NAN);
 357       cmp(rscratch1, tmp1);
 358       br(GE, DONE);
 359       cmp(rscratch1, tmp2);
 360       br(NE, CHECKED_CORNER_CASES);
 361       fmovd(v0, 0.0d);
 362   }
 363   bind(DONE);
 364     ret(lr);
 365 }