275 bfm(tmp3, tmp5, 41, 8);
276 fmovs(vtmp5, tmp3);
277 // Load coefficients from the table. The coefficients are stored in a
278 // specific order so that the load below places them directly into the vectors
279 // used by later vector instructions. The load is performed in parallel while
280 // branches are taken. C1 will contain the vector {C1_0, C1_1}, C2 =
281 // {C2_0, C2_1}, C3 = {C3_0, C3_1}, C4 = {C4_0, C4_1}
282 ld1(C1, C2, C3, C4, T2D, post(rscratch2, 64));
283 br(LE, CHECK_CORNER_CASES);
284 bind(CHECKED_CORNER_CASES);
285 // all corner cases are handled
286 frecpe(vtmp5, vtmp5, S); // vtmp5 ~= 1/vtmp5
287 lsr(tmp2, rscratch1, 48);
288 movz(tmp4, 0x77f0, 48);
289 fmovd(vtmp4, 1.0d);
290 movz(tmp1, INF_OR_NAN_PREFIX, 48);
291 bfm(tmp4, rscratch1, 0, 51); // tmp4 = 0x77F0 << 48 | mantissa(X)
292 // vtmp1 = AS_DOUBLE_BITS(0x77F0 << 48 | mantissa(X)) == mx
293 fmovd(vtmp1, tmp4);
294 subw(tmp2, tmp2, 16);
295 cmp(tmp2, 0x8000);
296 br(GE, SMALL_VALUE);
297 bind(MAIN);
298 fmovs(tmp3, vtmp5); // int intB0 = AS_INT_BITS(B);
299 mov(tmp5, 0x3FE0);
300 mov(rscratch1, 0xffffe00000000000);
301 andr(tmp2, tmp2, tmp1, LSR, 48); // hiWord & 0x7FF0
302 sub(tmp2, tmp2, tmp5); // tmp2 = (hiWord & 0x7FF0) - 0x3FE0
303 scvtfwd(vtmp5, tmp2); // vtmp5 = (double)tmp2;
304 addw(tmp3, tmp3, 0x8000); // tmp3 = B
305 andr(tmp4, tmp4, rscratch1); // tmp4 == hi_part(mx)
306 andr(rscratch1, rscratch1, tmp3, LSL, 29); // rscratch1 = hi_part(B)
307 ubfm(tmp3, tmp3, 16, 23); // int index = (intB0 >> 16) & 0xFF
308 ldrq(vtmp2, Address(rscratch2, tmp3, Address::lsl(4))); // vtmp2 = _L_tbl[index]
309 // AS_LONG_BITS(vtmp1) & 0xffffe00000000000 // hi_part(mx)
310 fmovd(vtmp3, tmp4);
311 fmovd(vtmp0, rscratch1); // vtmp0 = hi_part(B)
312 fsubd(vtmp1, vtmp1, vtmp3); // vtmp1 -= vtmp3; // low_part(mx)
313 fnmsub(vtmp3, vtmp3, vtmp0, vtmp4); // vtmp3 = vtmp3*vtmp0 - vtmp4
314 fmlavs(vtmp2, T2D, C4, vtmp5, 0); // vtmp2 += {C4} * vtmp5
315 // vtmp1 = r = vtmp1 * vtmp0 + vtmp3 == low_part(mx) * hi_part(B) + (hi_part(mx)*hi_part(B) - 1.0)
|
275 bfm(tmp3, tmp5, 41, 8);
276 fmovs(vtmp5, tmp3);
277 // Load coefficients from the table. The coefficients are stored in a
278 // specific order so that the load below places them directly into the vectors
279 // used by later vector instructions. The load is performed in parallel while
280 // branches are taken. C1 will contain the vector {C1_0, C1_1}, C2 =
281 // {C2_0, C2_1}, C3 = {C3_0, C3_1}, C4 = {C4_0, C4_1}
282 ld1(C1, C2, C3, C4, T2D, post(rscratch2, 64));
283 br(LE, CHECK_CORNER_CASES);
284 bind(CHECKED_CORNER_CASES);
285 // all corner cases are handled
286 frecpe(vtmp5, vtmp5, S); // vtmp5 ~= 1/vtmp5
287 lsr(tmp2, rscratch1, 48);
288 movz(tmp4, 0x77f0, 48);
289 fmovd(vtmp4, 1.0d);
290 movz(tmp1, INF_OR_NAN_PREFIX, 48);
291 bfm(tmp4, rscratch1, 0, 51); // tmp4 = 0x77F0 << 48 | mantissa(X)
292 // vtmp1 = AS_DOUBLE_BITS(0x77F0 << 48 | mantissa(X)) == mx
293 fmovd(vtmp1, tmp4);
294 subw(tmp2, tmp2, 16);
295 subs(zr, tmp2, 0x8000);
296 br(GE, SMALL_VALUE);
297 bind(MAIN);
298 fmovs(tmp3, vtmp5); // int intB0 = AS_INT_BITS(B);
299 mov(tmp5, 0x3FE0);
300 mov(rscratch1, 0xffffe00000000000);
301 andr(tmp2, tmp2, tmp1, LSR, 48); // hiWord & 0x7FF0
302 sub(tmp2, tmp2, tmp5); // tmp2 = (hiWord & 0x7FF0) - 0x3FE0
303 scvtfwd(vtmp5, tmp2); // vtmp5 = (double)tmp2;
304 addw(tmp3, tmp3, 0x8000); // tmp3 = B
305 andr(tmp4, tmp4, rscratch1); // tmp4 == hi_part(mx)
306 andr(rscratch1, rscratch1, tmp3, LSL, 29); // rscratch1 = hi_part(B)
307 ubfm(tmp3, tmp3, 16, 23); // int index = (intB0 >> 16) & 0xFF
308 ldrq(vtmp2, Address(rscratch2, tmp3, Address::lsl(4))); // vtmp2 = _L_tbl[index]
309 // AS_LONG_BITS(vtmp1) & 0xffffe00000000000 // hi_part(mx)
310 fmovd(vtmp3, tmp4);
311 fmovd(vtmp0, rscratch1); // vtmp0 = hi_part(B)
312 fsubd(vtmp1, vtmp1, vtmp3); // vtmp1 -= vtmp3; // low_part(mx)
313 fnmsub(vtmp3, vtmp3, vtmp0, vtmp4); // vtmp3 = vtmp3*vtmp0 - vtmp4
314 fmlavs(vtmp2, T2D, C4, vtmp5, 0); // vtmp2 += {C4} * vtmp5
315 // vtmp1 = r = vtmp1 * vtmp0 + vtmp3 == low_part(mx) * hi_part(B) + (hi_part(mx)*hi_part(B) - 1.0)
|