672 // constants because of that (see comments in code)
673 // 4. Use of jx, which is nx-1 instead of nx
674 // Assumptions:
675 // 1. Assume |X| >= PI/4
676 // Input and output:
677 // 1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
678 // 2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
679 // NOTE: general purpose register names match local variable names in C code
680 // NOTE: fpu registers are actively reused. See comments in code about their usage
681 void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
682 Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
683 RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
684 INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
685 Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
686 RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
687 CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
688 IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
689 RECOMP_FOR1_CHECK;
690 Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
691 tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
692 jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18;
693 // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
694 // jx = nx - 1
695 lea(twoOverPiBase, ExternalAddress(two_over_pi));
696 cmpw(jv, zr);
697 addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
698 cselw(jv, jv, zr, GE);
699 fmovd(v26, 0.0);
700 addw(tmp5, jv, 1); // jv+1
701 subsw(j, jv, jx);
702 add(qBase, sp, 320); // base of q[]
703 msubw(rscratch1, i, tmp5, rscratch1); // q0 = e0-24*(jv+1)
704 // use double f[20], fq[20], q[20], iq[20] on stack, which is
705 // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
706 // will contain f[20], fq[20], q[20], iq[20]
707 // now initialize f[20] indexes 0..m (inclusive)
708 // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
709 mov(tmp5, sp);
710
711 block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
712 eorw(i, i, i);
1404 // END dsin/dcos PSEUDO CODE
1405 //
1406 // Changes between fdlibm and intrinsic:
1407 // 1. Moved ix < 2**27 from kernel_sin/kernel_cos into dsin/dcos
1408 // 2. Final switch uses equivalent bit checks (tbz/tbnz)
1409 // Input and output:
1410 // 1. Input for generated function: X = r0
1411 // 2. Input for generator: isCos = generate sin or cos, npio2_hw = address
1412 // of npio2_hw table, two_over_pi = address of two_over_pi table,
1413 // pio2 = address of pio2 table, dsin_coef = address of dsin_coef table,
1414 // dcos_coef = address of dcos_coef table
1415 // 3. Return result in v0
1416 // NOTE: general purpose register names match local variable names in C code
// Emits the machine code for the dsin (isCos == false) or dcos (isCos == true)
// intrinsic.
//
// Generator-time parameters:
//   isCos        - selects which of the two functions is emitted
//   npio2_hw, two_over_pi, pio2
//                - table addresses, only forwarded to generate__ieee754_rem_pio2
//   dsin_coef    - table address, only forwarded to generate_kernel_sin
//   dcos_coef    - table address, only forwarded to generate_kernel_cos
//
// Emitted code: reads the argument from v0 (its bits are copied into X = r0
// first) and leaves the result in v0. It overwrites r0, r1, r3, r10, rscratch1
// and rscratch2 directly; whatever the nested generators touch comes on top of
// that. Every exit path ends in a plain ret(lr).
1417 void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
1418 address two_over_pi, address pio2, address dsin_coef, address dcos_coef) {
// Upper 16 bits of a double with sign 0 and an all-ones exponent field.
// Placed at bit 48 (see movz below) it forms 0x7FF0000000000000.
1419 const int POSITIVE_INFINITY_OR_NAN_PREFIX = 0x7FF0;
1420
1421 Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE;
// n, y0 and y1 are not written in this function before the switch below;
// presumably they are produced by the code from generate__ieee754_rem_pio2
// (n in r2, y[0] in v4, y[1] in v5) - confirm against that generator.
1422 Register X = r0, absX = r1, n = r2, ix = r3;
1423 FloatRegister y0 = v4, y1 = v5;
1424 block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); {
1425 fmovd(X, v0); // X = bits of the argument
1426 mov(rscratch2, 0x3e400000); // high-word threshold for the tiny-x test below
1427 mov(rscratch1, 0x3fe921fb00000000); // pi/4. shifted to reuse later
1428 ubfm(absX, X, 0, 62); // absX = bits 0..62 of X, i.e. X without its sign bit
1429 movz(r10, POSITIVE_INFINITY_OR_NAN_PREFIX, 48); // r10 = 0x7FF0 << 48
1430 cmp(rscratch2, absX, LSR, 32); // 0x3e400000 vs. high word of |x|
1431 lsr(ix, absX, 32); // set ix (high word of |x|); emitted between the cmp and its br
1432 br(GT, TINY_X); // handle tiny x (|x| < 2^-27)
1433 cmp(ix, rscratch1, LSR, 32); // ix vs. 0x3fe921fb
1434 br(LE, EARLY_CASE); // if(ix <= 0x3fe921fb): no argument reduction, kernel is applied to x itself
1435 cmp(absX, r10);
1436 br(LT, ARG_REDUCTION); // |x| bits below 0x7FF0000000000000: reduce the argument
1437 // X is NaN or INF(i.e. 0x7FF* or 0xFFF*). Return NaN (mantissa != 0).
1438 // Set last bit unconditionally to make it NaN
1439 orr(r10, r10, 1);
1440 fmovd(v0, r10); // result = 0x7FF0000000000001
1441 ret(lr);
1442 }
1443 block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); {
1444 bind(TINY_X);
// cos: the result is the constant 1.0.
// sin: v0 has not been written since entry, so the argument itself is returned.
1445 if (isCos) {
1446 fmovd(v0, 1.0);
1447 }
1448 ret(lr);
1449 }
1450 bind(ARG_REDUCTION); /* argument reduction needed */
1451 block_comment("n = __ieee754_rem_pio2(x,y);"); {
1452 generate__ieee754_rem_pio2(npio2_hw, two_over_pi, pio2);
1453 }
1454 block_comment("switch(n&3) {case ... }"); {
// Kernel selection on bit 0 of n:
//   cos: n odd  -> sin kernel (RETURN_SIN), n even -> cos kernel (fall through)
//   sin: n even -> sin kernel (RETURN_SIN), n odd  -> cos kernel (fall through)
// Negation afterwards:
//   cos: negate when bit0(n) ^ bit1(n) is 1; absX is reused to hold n ^ (n >> 1)
//   sin: negate when bit 1 of n is 1
1455 if (isCos) {
1456 eorw(absX, n, n, LSR, 1); // absX = n ^ (n >> 1)
1457 tbnz(n, 0, RETURN_SIN);
1458 } else {
1459 tbz(n, 0, RETURN_SIN);
1460 }
// The kernel result is expected in v0, since v0 is what gets negated below.
1461 generate_kernel_cos(y0, dcos_coef);
1462 if (isCos) {
1463 tbz(absX, 0, DONE); // no negation needed
1464 } else {
1465 tbz(n, 1, DONE); // no negation needed
1466 }
1467 fnegd(v0, v0);
1468 ret(lr);
1469 bind(RETURN_SIN);
// NOTE(review): the meaning of the bool argument is not visible here. It is
// true on this path and false in EARLY_CASE, where y1 is zeroed - presumably
// it tells the kernel whether y1 carries a tail to use. Confirm.
1470 generate_kernel_sin(y0, true, dsin_coef);
1471 if (isCos) {
1472 tbz(absX, 0, DONE); // no negation needed
1473 } else {
1474 tbz(n, 1, DONE); // no negation needed
1475 }
1476 fnegd(v0, v0);
1477 ret(lr);
1478 }
1479 bind(EARLY_CASE);
// |x| is small enough that the kernel is applied to the argument in v0 directly.
1480 eor(y1, T8B, y1, y1); // y1 ^= y1, i.e. zero it
1481 if (isCos) {
1482 generate_kernel_cos(v0, dcos_coef);
1483 } else {
1484 generate_kernel_sin(v0, false, dsin_coef);
1485 }
1486 bind(DONE);
1487 ret(lr);
1488 }
|
672 // constants because of that (see comments in code)
673 // 4. Use of jx, which is nx-1 instead of nx
674 // Assumptions:
675 // 1. Assume |X| >= PI/4
676 // Input and output:
677 // 1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
678 // 2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
679 // NOTE: general purpose register names match local variable names in C code
680 // NOTE: fpu registers are actively reused. See comments in code about their usage
681 void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
682 Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
683 RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
684 INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
685 Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
686 RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
687 CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
688 IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
689 RECOMP_FOR1_CHECK;
690 Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
691 tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
692 jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r19;
693 // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
694 // jx = nx - 1
695 lea(twoOverPiBase, ExternalAddress(two_over_pi));
696 cmpw(jv, zr);
697 addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
698 cselw(jv, jv, zr, GE);
699 fmovd(v26, 0.0);
700 addw(tmp5, jv, 1); // jv+1
701 subsw(j, jv, jx);
702 add(qBase, sp, 320); // base of q[]
703 msubw(rscratch1, i, tmp5, rscratch1); // q0 = e0-24*(jv+1)
704 // use double f[20], fq[20], q[20], iq[20] on stack, which is
705 // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
706 // will contain f[20], fq[20], q[20], iq[20]
707 // now initialize f[20] indexes 0..m (inclusive)
708 // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
709 mov(tmp5, sp);
710
711 block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
712 eorw(i, i, i);
1404 // END dsin/dcos PSEUDO CODE
1405 //
1406 // Changes between fdlibm and intrinsic:
1407 // 1. Moved ix < 2**27 from kernel_sin/kernel_cos into dsin/dcos
1408 // 2. Final switch uses equivalent bit checks (tbz/tbnz)
1409 // Input and output:
1410 // 1. Input for generated function: X = r0
1411 // 2. Input for generator: isCos = generate sin or cos, npio2_hw = address
1412 // of npio2_hw table, two_over_pi = address of two_over_pi table,
1413 // pio2 = address of pio2 table, dsin_coef = address of dsin_coef table,
1414 // dcos_coef = address of dcos_coef table
1415 // 3. Return result in v0
1416 // NOTE: general purpose register names match local variable names in C code
// Emits the machine code for the dsin (isCos == false) or dcos (isCos == true)
// intrinsic.
//
// Generator-time parameters:
//   isCos        - selects which of the two functions is emitted
//   npio2_hw, two_over_pi, pio2
//                - table addresses, only forwarded to generate__ieee754_rem_pio2
//   dsin_coef    - table address, only forwarded to generate_kernel_sin
//   dcos_coef    - table address, only forwarded to generate_kernel_cos
//
// Emitted code: reads the argument from v0 (its bits are copied into X = r0
// first) and leaves the result in v0. r19 is pushed on entry and popped on
// exit. Every exit path branches to (or falls into) DONE, the single place
// where r19 is popped, leave() is emitted and the code returns.
1417 void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
1418 address two_over_pi, address pio2, address dsin_coef, address dcos_coef) {
// Upper 16 bits of a double with sign 0 and an all-ones exponent field.
// Placed at bit 48 (see movz below) it forms 0x7FF0000000000000.
1419 const int POSITIVE_INFINITY_OR_NAN_PREFIX = 0x7FF0;
1420
1421 Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE;
// n, y0 and y1 are not written in this function before the switch below;
// presumably they are produced by the code from generate__ieee754_rem_pio2
// (n in r2, y[0] in v4, y[1] in v5) - confirm against that generator.
1422 Register X = r0, absX = r1, n = r2, ix = r3;
1423 FloatRegister y0 = v4, y1 = v5;
1424
// Paired with leave() at DONE; presumably sets up / tears down a frame.
1425 enter();
1426 // r19 is used in TemplateInterpreterGenerator::generate_math_entry
// so it has to hold the same value when this code returns.
// NOTE(review): the generate__kernel_rem_pio2 revision that binds qBase = r19
// looks like the code that overwrites it - confirm it is reached from here.
1427 RegSet saved_regs = RegSet::of(r19);
1428 push (saved_regs, sp);
1429
1430 block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); {
1431 fmovd(X, v0); // X = bits of the argument
1432 mov(rscratch2, 0x3e400000); // high-word threshold for the tiny-x test below
1433 mov(rscratch1, 0x3fe921fb00000000); // pi/4. shifted to reuse later
1434 ubfm(absX, X, 0, 62); // absX = bits 0..62 of X, i.e. X without its sign bit
1435 movz(r10, POSITIVE_INFINITY_OR_NAN_PREFIX, 48); // r10 = 0x7FF0 << 48
1436 cmp(rscratch2, absX, LSR, 32); // 0x3e400000 vs. high word of |x|
1437 lsr(ix, absX, 32); // set ix (high word of |x|); emitted between the cmp and its br
1438 br(GT, TINY_X); // handle tiny x (|x| < 2^-27)
1439 cmp(ix, rscratch1, LSR, 32); // ix vs. 0x3fe921fb
1440 br(LE, EARLY_CASE); // if(ix <= 0x3fe921fb): no argument reduction, kernel is applied to x itself
1441 cmp(absX, r10);
1442 br(LT, ARG_REDUCTION); // |x| bits below 0x7FF0000000000000: reduce the argument
1443 // X is NaN or INF(i.e. 0x7FF* or 0xFFF*). Return NaN (mantissa != 0).
1444 // Set last bit unconditionally to make it NaN
1445 orr(r10, r10, 1);
1446 fmovd(v0, r10); // result = 0x7FF0000000000001
1447 b(DONE); // common epilogue restores r19
1448 }
1449 block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); {
1450 bind(TINY_X);
// cos: the result is the constant 1.0.
// sin: v0 has not been written since entry, so the argument itself is returned.
1451 if (isCos) {
1452 fmovd(v0, 1.0);
1453 }
1454 b(DONE);
1455 }
1456 bind(ARG_REDUCTION); /* argument reduction needed */
1457 block_comment("n = __ieee754_rem_pio2(x,y);"); {
1458 generate__ieee754_rem_pio2(npio2_hw, two_over_pi, pio2);
1459 }
1460 block_comment("switch(n&3) {case ... }"); {
// Kernel selection on bit 0 of n:
//   cos: n odd  -> sin kernel (RETURN_SIN), n even -> cos kernel (fall through)
//   sin: n even -> sin kernel (RETURN_SIN), n odd  -> cos kernel (fall through)
// Negation afterwards:
//   cos: negate when bit0(n) ^ bit1(n) is 1; absX is reused to hold n ^ (n >> 1)
//   sin: negate when bit 1 of n is 1
1461 if (isCos) {
1462 eorw(absX, n, n, LSR, 1); // absX = n ^ (n >> 1)
1463 tbnz(n, 0, RETURN_SIN);
1464 } else {
1465 tbz(n, 0, RETURN_SIN);
1466 }
// The kernel result is expected in v0, since v0 is what gets negated below.
1467 generate_kernel_cos(y0, dcos_coef);
1468 if (isCos) {
1469 tbz(absX, 0, DONE); // no negation needed
1470 } else {
1471 tbz(n, 1, DONE); // no negation needed
1472 }
1473 fnegd(v0, v0);
1474 b(DONE);
1475 bind(RETURN_SIN);
// NOTE(review): the meaning of the bool argument is not visible here. It is
// true on this path and false in EARLY_CASE, where y1 is zeroed - presumably
// it tells the kernel whether y1 carries a tail to use. Confirm.
1476 generate_kernel_sin(y0, true, dsin_coef);
1477 if (isCos) {
1478 tbz(absX, 0, DONE); // no negation needed
1479 } else {
1480 tbz(n, 1, DONE); // no negation needed
1481 }
1482 fnegd(v0, v0);
1483 b(DONE);
1484 }
1485 bind(EARLY_CASE);
// |x| is small enough that the kernel is applied to the argument in v0 directly.
1486 eor(y1, T8B, y1, y1); // y1 ^= y1, i.e. zero it
1487 if (isCos) {
1488 generate_kernel_cos(v0, dcos_coef);
1489 } else {
1490 generate_kernel_sin(v0, false, dsin_coef);
1491 }
// Single epilogue: undo the push and enter() from the prologue, then return.
1492 bind(DONE);
1493 pop(saved_regs, sp);
1494 leave();
1495 ret(lr);
1496 }
|