< prev index next >

src/hotspot/cpu/aarch64/macroAssembler_aarch64_log.cpp

Print this page
8248238: Adding Windows support to OpenJDK on AArch64

Summary: LP64 vs LLP64 changes to add Windows support

Contributed-by: Monica Beckwith <monica.beckwith@microsoft.com>, Ludovic Henry <luhenry@microsoft.com>
Reviewed-by:
8248238: Adding Windows support to OpenJDK on AArch64

Summary: Adding Windows support for AArch64

Contributed-by: Ludovic Henry <luhenry@microsoft.com>, Monica Beckwith <monica.beckwith@microsoft.com>
Reviewed-by:


  48 //    B = int((B0*2^7+0.5))/2^7
  49 //
  50 //    Reduced argument: r=B*mx-1.0 (computed accurately in high and low parts)
  51 //
  52 //    Result:  k*log(2) - log(B) + p(r) if |x-1| >= small value (2^-6)  and
  53 //             p(r) is a degree 7 polynomial
  54 //             -log(B) read from data table (high, low parts)
  55 //             Result is formed from high and low parts
  56 //
  57 // Special cases:
  58 // 1. log(NaN) = quiet NaN
  59 // 2. log(+INF) = that INF
  60 // 3. log(0) = -INF
  61 // 4. log(1) = +0
  62 // 5. log(x) = NaN if x < -0, including -INF
  63 //
  64 /******************************************************************************/
  65 
  66 // Table with p(r) polynomial coefficients
  67 // and table representation of logarithm values (hi and low parts)
  68 __attribute__ ((aligned(64))) juint _L_tbl[] =
  69 {
  70     // coefficients of p(r) polynomial:
  71     // _coeff[]
  72     0x00000000UL, 0xbfd00000UL, // C1_0 = -0.25
  73     0x92492492UL, 0x3fc24924UL, // C1_1 = 0.14285714285714285
  74     0x55555555UL, 0x3fd55555UL, // C2_0 = 0.3333333333333333
  75     0x3d6fb175UL, 0xbfc5555eUL, // C2_1 = -0.16666772842235003
  76     0x00000000UL, 0xbfe00000UL, // C3_0 = -0.5
  77     0x9999999aUL, 0x3fc99999UL, // C3_1 = 0.2
  78     // _log2[]
  79     0xfefa3800UL, 0x3fa62e42UL, // C4_0 = 0.043321698784993146
  80     0x93c76730UL, 0x3ceef357UL, // C4_1 = 3.436201886692732e-15
  81     // _L_tbl[] with logarithm values (hi and low parts)
  82     0xfefa3800UL, 0x3fe62e42UL, 0x93c76730UL, 0x3d2ef357UL, 0xaa241800UL,
  83     0x3fe5ee82UL, 0x0cda46beUL, 0x3d220238UL, 0x5c364800UL, 0x3fe5af40UL,
  84     0xac10c9fbUL, 0x3d2dfa63UL, 0x26bb8c00UL, 0x3fe5707aUL, 0xff3303ddUL,
  85     0x3d09980bUL, 0x26867800UL, 0x3fe5322eUL, 0x5d257531UL, 0x3d05ccc4UL,
  86     0x835a5000UL, 0x3fe4f45aUL, 0x6d93b8fbUL, 0xbd2e6c51UL, 0x6f970c00UL,
  87     0x3fe4b6fdUL, 0xed4c541cUL, 0x3cef7115UL, 0x27e8a400UL, 0x3fe47a15UL,
  88     0xf94d60aaUL, 0xbd22cb6aUL, 0xf2f92400UL, 0x3fe43d9fUL, 0x481051f7UL,


 243 //    double p7 = C3_0*r2 + C2_0*r3 + C1_0*r2*r2 + C3_1*r3*r2 + C2_1*r3*r3
 244 //              + C1_1*r3*r2*r2; // degree 7 polynomial
 245 //    return p7 + (vtmp0 + ((r + hiTableValue) + lowTableValue));
 246 //  }
 247 //
 248 // END dlog PSEUDO CODE
 249 
 250 
 251 // Generate log(X). X passed in register v0. Return log(X) into v0.
 252 // Generator parameters: 10 temporary FPU registers and 5 temporary general
 253 // purpose registers
 254 void MacroAssembler::fast_log(FloatRegister vtmp0, FloatRegister vtmp1,
 255                               FloatRegister vtmp2, FloatRegister vtmp3,
 256                               FloatRegister vtmp4, FloatRegister vtmp5,
 257                               FloatRegister C1, FloatRegister C2,
 258                               FloatRegister C3, FloatRegister C4,
 259                               Register tmp1, Register tmp2, Register tmp3,
 260                               Register tmp4, Register tmp5) {
 261   Label DONE, CHECK_CORNER_CASES, SMALL_VALUE, MAIN,
 262       CHECKED_CORNER_CASES, RETURN_MINF_OR_NAN;
 263   const long INF_OR_NAN_PREFIX = 0x7FF0;
 264   const long MINF_OR_MNAN_PREFIX = 0xFFF0;
 265   const long ONE_PREFIX = 0x3FF0;
 266     movz(tmp2, ONE_PREFIX, 48);
 267     movz(tmp4, 0x0010, 48);
 268     fmovd(rscratch1, v0); // rscratch1 = AS_LONG_BITS(X)
 269     lea(rscratch2, ExternalAddress((address)_L_tbl));
 270     movz(tmp5, 0x7F);
 271     add(tmp1, rscratch1, tmp4);
 272     cmp(tmp2, rscratch1);
 273     lsr(tmp3, rscratch1, 29);
 274     ccmp(tmp1, tmp4, 0b1101 /* LE */, NE);
 275     bfm(tmp3, tmp5, 41, 8);
 276     fmovs(vtmp5, tmp3);
 277     // Load coefficients from table. All coefficients are organized to be
 278     // in specific order, because load below will load it in vectors to be used
 279     // later in vector instructions. Load will be performed in parallel while
 280     // branches are taken. C1 will contain vector of {C1_0, C1_1}, C2 =
 281     // {C2_0, C2_1}, C3 = {C3_0, C3_1}, C4 = {C4_0, C4_1}
 282     ld1(C1, C2, C3, C4, T2D, post(rscratch2, 64));
 283     br(LE, CHECK_CORNER_CASES);
 284   bind(CHECKED_CORNER_CASES);
 285     // all corner cases are handled




  48 //    B = int((B0*2^7+0.5))/2^7
  49 //
  50 //    Reduced argument: r=B*mx-1.0 (computed accurately in high and low parts)
  51 //
  52 //    Result:  k*log(2) - log(B) + p(r) if |x-1| >= small value (2^-6)  and
  53 //             p(r) is a degree 7 polynomial
  54 //             -log(B) read from data table (high, low parts)
  55 //             Result is formed from high and low parts
  56 //
  57 // Special cases:
  58 // 1. log(NaN) = quiet NaN
  59 // 2. log(+INF) = that INF
  60 // 3. log(0) = -INF
  61 // 4. log(1) = +0
  62 // 5. log(x) = NaN if x < -0, including -INF
  63 //
  64 /******************************************************************************/
  65 
  66 // Table with p(r) polynomial coefficients
  67 // and table representation of logarithm values (hi and low parts)
  68 ATTRIBUTE_ALIGNED(64) juint _L_tbl[] =
  69 {
  70     // coefficients of p(r) polynomial:
  71     // _coeff[]
  72     0x00000000UL, 0xbfd00000UL, // C1_0 = -0.25
  73     0x92492492UL, 0x3fc24924UL, // C1_1 = 0.14285714285714285
  74     0x55555555UL, 0x3fd55555UL, // C2_0 = 0.3333333333333333
  75     0x3d6fb175UL, 0xbfc5555eUL, // C2_1 = -0.16666772842235003
  76     0x00000000UL, 0xbfe00000UL, // C3_0 = -0.5
  77     0x9999999aUL, 0x3fc99999UL, // C3_1 = 0.2
  78     // _log2[]
  79     0xfefa3800UL, 0x3fa62e42UL, // C4_0 = 0.043321698784993146
  80     0x93c76730UL, 0x3ceef357UL, // C4_1 = 3.436201886692732e-15
  81     // _L_tbl[] with logarithm values (hi and low parts)
  82     0xfefa3800UL, 0x3fe62e42UL, 0x93c76730UL, 0x3d2ef357UL, 0xaa241800UL,
  83     0x3fe5ee82UL, 0x0cda46beUL, 0x3d220238UL, 0x5c364800UL, 0x3fe5af40UL,
  84     0xac10c9fbUL, 0x3d2dfa63UL, 0x26bb8c00UL, 0x3fe5707aUL, 0xff3303ddUL,
  85     0x3d09980bUL, 0x26867800UL, 0x3fe5322eUL, 0x5d257531UL, 0x3d05ccc4UL,
  86     0x835a5000UL, 0x3fe4f45aUL, 0x6d93b8fbUL, 0xbd2e6c51UL, 0x6f970c00UL,
  87     0x3fe4b6fdUL, 0xed4c541cUL, 0x3cef7115UL, 0x27e8a400UL, 0x3fe47a15UL,
  88     0xf94d60aaUL, 0xbd22cb6aUL, 0xf2f92400UL, 0x3fe43d9fUL, 0x481051f7UL,


 243 //    double p7 = C3_0*r2 + C2_0*r3 + C1_0*r2*r2 + C3_1*r3*r2 + C2_1*r3*r3
 244 //              + C1_1*r3*r2*r2; // degree 7 polynomial
 245 //    return p7 + (vtmp0 + ((r + hiTableValue) + lowTableValue));
 246 //  }
 247 //
 248 // END dlog PSEUDO CODE
 249 
 250 
 251 // Generate log(X). X passed in register v0. Return log(X) into v0.
 252 // Generator parameters: 10 temporary FPU registers and 5 temporary general
 253 // purpose registers
 254 void MacroAssembler::fast_log(FloatRegister vtmp0, FloatRegister vtmp1,
 255                               FloatRegister vtmp2, FloatRegister vtmp3,
 256                               FloatRegister vtmp4, FloatRegister vtmp5,
 257                               FloatRegister C1, FloatRegister C2,
 258                               FloatRegister C3, FloatRegister C4,
 259                               Register tmp1, Register tmp2, Register tmp3,
 260                               Register tmp4, Register tmp5) {
 261   Label DONE, CHECK_CORNER_CASES, SMALL_VALUE, MAIN,
 262       CHECKED_CORNER_CASES, RETURN_MINF_OR_NAN;
 263   const int64_t INF_OR_NAN_PREFIX = 0x7FF0;
 264   const int64_t MINF_OR_MNAN_PREFIX = 0xFFF0;
 265   const int64_t ONE_PREFIX = 0x3FF0;
 266     movz(tmp2, ONE_PREFIX, 48);
 267     movz(tmp4, 0x0010, 48);
 268     fmovd(rscratch1, v0); // rscratch1 = AS_LONG_BITS(X)
 269     lea(rscratch2, ExternalAddress((address)_L_tbl));
 270     movz(tmp5, 0x7F);
 271     add(tmp1, rscratch1, tmp4);
 272     cmp(tmp2, rscratch1);
 273     lsr(tmp3, rscratch1, 29);
 274     ccmp(tmp1, tmp4, 0b1101 /* LE */, NE);
 275     bfm(tmp3, tmp5, 41, 8);
 276     fmovs(vtmp5, tmp3);
 277     // Load coefficients from table. All coefficients are organized to be
 278     // in specific order, because load below will load it in vectors to be used
 279     // later in vector instructions. Load will be performed in parallel while
 280     // branches are taken. C1 will contain vector of {C1_0, C1_1}, C2 =
 281     // {C2_0, C2_1}, C3 = {C3_0, C3_1}, C4 = {C4_0, C4_1}
 282     ld1(C1, C2, C3, C4, T2D, post(rscratch2, 64));
 283     br(LE, CHECK_CORNER_CASES);
 284   bind(CHECKED_CORNER_CASES);
 285     // all corner cases are handled


< prev index next >