< prev index next >

src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp

Print this page
rev 60623 : 8248500: AArch64: Remove the r18 dependency on Windows AArch64
Reviewed-by:
Contributed-by: mbeckwit, luhenry, burban


 672 //         constants because of that (see comments in code)
 673 //     4. Use of jx, which is nx-1 instead of nx
 674 // Assumptions:
 675 //     1. Assume |X| >= PI/4
 676 // Input and output:
 677 //     1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
 678 //     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
 679 // NOTE: general purpose register names match local variable names in C code
 680 // NOTE: fpu registers are actively reused. See comments in code about their usage
 681 void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
 682   Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
 683       RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
 684       INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
 685       Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
 686       RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
 687       CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
 688       IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
 689       RECOMP_FOR1_CHECK;
 690   Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
 691       tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
 692       jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18;
 693     // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
 694     // jx = nx - 1
 695     lea(twoOverPiBase, ExternalAddress(two_over_pi));
 696     cmpw(jv, zr);
 697     addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
 698     cselw(jv, jv, zr, GE);
 699     fmovd(v26, 0.0);
 700     addw(tmp5, jv, 1);                    // jv+1
 701     subsw(j, jv, jx);
 702     add(qBase, sp, 320);                  // base of q[]
 703     msubw(rscratch1, i, tmp5, rscratch1); // q0 =  e0-24*(jv+1)
 704     // use double f[20], fq[20], q[20], iq[20] on stack, which is
 705     // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
 706     // will contain f[20], fq[20], q[20], iq[20]
 707     // now initialize f[20] indexes 0..m (inclusive)
 708     // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
 709     mov(tmp5, sp);
 710 
 711     block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
 712         eorw(i, i, i);


1404 // END dsin/dcos PSEUDO CODE
1405 //
1406 // Changes between fdlibm and intrinsic:
1407 //     1. Moved the |x| < 2**-27 check (ix < 0x3e400000) from kernel_sin/kernel_cos into dsin/dcos
1408 //     2. Final switch uses equivalent bit checks (tbz/tbnz)
1409 // Input and output:
1410 //     1. Input for generated function: X = r0
1411 //     2. Input for generator: isCos = generate sin or cos, npio2_hw = address
1412 //         of npio2_hw table, two_over_pi = address of two_over_pi table,
1413 //         pio2 = address of pio2 table, dsin_coef = address of dsin_coef table,
1414 //         dcos_coef = address of dcos_coef table
1415 //     3. Return result in v0
1416 // NOTE: general purpose register names match local variable names in C code
1417 void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
1418     address two_over_pi, address pio2, address dsin_coef, address dcos_coef) { // emits the dcos code when isCos is true, the dsin code otherwise
1419   const int POSITIVE_INFINITY_OR_NAN_PREFIX = 0x7FF0; // placed at bits 48..63 by the movz below
1420 
1421   Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE;
1422   Register X = r0, absX = r1, n = r2, ix = r3;        // absX (r1) is reused as scratch in the cos switch below
1423   FloatRegister y0 = v4, y1 = v5;






1424     block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); {
1425       fmovd(X, v0);                                  // X = raw bits of the argument held in v0
1426       mov(rscratch2, 0x3e400000);                    // high-word threshold for the tiny-x case (see TINY_X)
1427       mov(rscratch1, 0x3fe921fb00000000);            // pi/4. shifted to reuse later
1428       ubfm(absX, X, 0, 62);                          // absX = bits 0..62 of X, i.e. X with the sign bit cleared
1429       movz(r10, POSITIVE_INFINITY_OR_NAN_PREFIX, 48); // r10 = 0x7FF0 << 48
1430       cmp(rscratch2, absX, LSR, 32);                 // 0x3e400000 vs. high word of absX
1431       lsr(ix, absX, 32);                             // set ix
1432       br(GT, TINY_X);                                // handle tiny x (|x| < 2^-27)
1433       cmp(ix, rscratch1, LSR, 32);                   // ix vs. 0x3fe921fb (high word of pi/4)
1434       br(LE, EARLY_CASE);                            // ix <= 0x3fe921fb: run the kernel on v0 directly, no reduction
1435       cmp(absX, r10);
1436       br(LT, ARG_REDUCTION);                         // absX below 0x7FF0 << 48: reduce the argument first
1437       // X is NaN or INF(i.e. 0x7FF* or 0xFFF*). Return NaN (mantissa != 0).
1438       // Set last bit unconditionally to make it NaN
1439       orr(r10, r10, 1);
1440       fmovd(v0, r10);                                // result = (0x7FF0 << 48) | 1
1441       ret(lr);
1442     }
1443   block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); {
1444     bind(TINY_X);
1445       if (isCos) {
1446         fmovd(v0, 1.0);                              // cos: tiny x yields 1.0
1447       }
1448       ret(lr);                                       // sin: v0 is left untouched, so x itself is returned
1449   }
1450   bind(ARG_REDUCTION); /* argument reduction needed */
1451     block_comment("n = __ieee754_rem_pio2(x,y);"); {
1452       generate__ieee754_rem_pio2(npio2_hw, two_over_pi, pio2);
1453     }
1454     block_comment("switch(n&3) {case ... }"); {
1455       if (isCos) {
1456         eorw(absX, n, n, LSR, 1);                    // bit 0 of absX = n[0] ^ n[1]; tested below to decide on negation
1457         tbnz(n, 0, RETURN_SIN);                      // cos with odd n: use the sine kernel
1458       } else {
1459         tbz(n, 0, RETURN_SIN);                       // sin with even n: use the sine kernel
1460       }
1461       generate_kernel_cos(y0, dcos_coef);
1462       if (isCos) {
1463         tbz(absX, 0, DONE);                          // n[0] ^ n[1] == 0: keep the sign
1464       } else {
1465         tbz(n, 1, DONE);                             // bit 1 of n clear: keep the sign
1466       }
1467       fnegd(v0, v0);                                 // otherwise negate the kernel result
1468       ret(lr);
1469     bind(RETURN_SIN);
1470       generate_kernel_sin(y0, true, dsin_coef);
1471       if (isCos) {
1472         tbz(absX, 0, DONE);                          // n[0] ^ n[1] == 0: keep the sign
1473       } else {
1474         tbz(n, 1, DONE);                             // bit 1 of n clear: keep the sign
1475       }
1476       fnegd(v0, v0);                                 // otherwise negate the kernel result
1477       ret(lr);
1478     }
1479   bind(EARLY_CASE);
1480     eor(y1, T8B, y1, y1);                            // y1 = 0 (register xor-ed with itself)
1481     if (isCos) {
1482       generate_kernel_cos(v0, dcos_coef);            // kernel is applied to the unreduced argument in v0
1483     } else {
1484       generate_kernel_sin(v0, false, dsin_coef);     // kernel is applied to the unreduced argument in v0
1485     }
1486   bind(DONE);


1487     ret(lr);
1488 }


 672 //         constants because of that (see comments in code)
 673 //     4. Use of jx, which is nx-1 instead of nx
 674 // Assumptions:
 675 //     1. Assume |X| >= PI/4
 676 // Input and output:
 677 //     1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
 678 //     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
 679 // NOTE: general purpose register names match local variable names in C code
 680 // NOTE: fpu registers are actively reused. See comments in code about their usage
 681 void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
 682   Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
 683       RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
 684       INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
 685       Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
 686       RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
 687       CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
 688       IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
 689       RECOMP_FOR1_CHECK;
 690   Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
 691       tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
 692       jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r19;
 693     // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
 694     // jx = nx - 1
 695     lea(twoOverPiBase, ExternalAddress(two_over_pi));
 696     cmpw(jv, zr);
 697     addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
 698     cselw(jv, jv, zr, GE);
 699     fmovd(v26, 0.0);
 700     addw(tmp5, jv, 1);                    // jv+1
 701     subsw(j, jv, jx);
 702     add(qBase, sp, 320);                  // base of q[]
 703     msubw(rscratch1, i, tmp5, rscratch1); // q0 =  e0-24*(jv+1)
 704     // use double f[20], fq[20], q[20], iq[20] on stack, which is
 705     // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
 706     // will contain f[20], fq[20], q[20], iq[20]
 707     // now initialize f[20] indexes 0..m (inclusive)
 708     // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
 709     mov(tmp5, sp);
 710 
 711     block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
 712         eorw(i, i, i);


1404 // END dsin/dcos PSEUDO CODE
1405 //
1406 // Changes between fdlibm and intrinsic:
1407 //     1. Moved the |x| < 2**-27 check (ix < 0x3e400000) from kernel_sin/kernel_cos into dsin/dcos
1408 //     2. Final switch uses equivalent bit checks (tbz/tbnz)
1409 // Input and output:
1410 //     1. Input for generated function: X = r0
1411 //     2. Input for generator: isCos = generate sin or cos, npio2_hw = address
1412 //         of npio2_hw table, two_over_pi = address of two_over_pi table,
1413 //         pio2 = address of pio2 table, dsin_coef = address of dsin_coef table,
1414 //         dcos_coef = address of dcos_coef table
1415 //     3. Return result in v0
1416 // NOTE: general purpose register names match local variable names in C code
1417 void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
1418     address two_over_pi, address pio2, address dsin_coef, address dcos_coef) { // emits the dcos code when isCos is true, the dsin code otherwise
1419   const int POSITIVE_INFINITY_OR_NAN_PREFIX = 0x7FF0; // placed at bits 48..63 by the movz below
1420 
1421   Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE;
1422   Register X = r0, absX = r1, n = r2, ix = r3;        // absX (r1) is reused as scratch in the cos switch below
1423   FloatRegister y0 = v4, y1 = v5;
1424 
1425   enter();                                            // paired with leave() at DONE
1426   // r19 is used in TemplateInterpreterGenerator::generate_math_entry
1427   RegSet saved_regs = RegSet::of(r19);                // generate__kernel_rem_pio2 assigns qBase = r19
1428   push (saved_regs, sp);                              // restored by pop(saved_regs, sp) at DONE
1429 
1430     block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); {
1431       fmovd(X, v0);                                  // X = raw bits of the argument held in v0
1432       mov(rscratch2, 0x3e400000);                    // high-word threshold for the tiny-x case (see TINY_X)
1433       mov(rscratch1, 0x3fe921fb00000000);            // pi/4. shifted to reuse later
1434       ubfm(absX, X, 0, 62);                          // absX = bits 0..62 of X, i.e. X with the sign bit cleared
1435       movz(r10, POSITIVE_INFINITY_OR_NAN_PREFIX, 48); // r10 = 0x7FF0 << 48
1436       cmp(rscratch2, absX, LSR, 32);                 // 0x3e400000 vs. high word of absX
1437       lsr(ix, absX, 32);                             // set ix
1438       br(GT, TINY_X);                                // handle tiny x (|x| < 2^-27)
1439       cmp(ix, rscratch1, LSR, 32);                   // ix vs. 0x3fe921fb (high word of pi/4)
1440       br(LE, EARLY_CASE);                            // ix <= 0x3fe921fb: run the kernel on v0 directly, no reduction
1441       cmp(absX, r10);
1442       br(LT, ARG_REDUCTION);                         // absX below 0x7FF0 << 48: reduce the argument first
1443       // X is NaN or INF(i.e. 0x7FF* or 0xFFF*). Return NaN (mantissa != 0).
1444       // Set last bit unconditionally to make it NaN
1445       orr(r10, r10, 1);
1446       fmovd(v0, r10);                                // result = (0x7FF0 << 48) | 1
1447       b(DONE);                                       // leave through the common epilogue so r19 is restored
1448     }
1449   block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); {
1450     bind(TINY_X);
1451       if (isCos) {
1452         fmovd(v0, 1.0);                              // cos: tiny x yields 1.0
1453       }
1454       b(DONE);                                       // sin: v0 is left untouched, so x itself is returned
1455   }
1456   bind(ARG_REDUCTION); /* argument reduction needed */
1457     block_comment("n = __ieee754_rem_pio2(x,y);"); {
1458       generate__ieee754_rem_pio2(npio2_hw, two_over_pi, pio2);
1459     }
1460     block_comment("switch(n&3) {case ... }"); {
1461       if (isCos) {
1462         eorw(absX, n, n, LSR, 1);                    // bit 0 of absX = n[0] ^ n[1]; tested below to decide on negation
1463         tbnz(n, 0, RETURN_SIN);                      // cos with odd n: use the sine kernel
1464       } else {
1465         tbz(n, 0, RETURN_SIN);                       // sin with even n: use the sine kernel
1466       }
1467       generate_kernel_cos(y0, dcos_coef);
1468       if (isCos) {
1469         tbz(absX, 0, DONE);                          // n[0] ^ n[1] == 0: keep the sign
1470       } else {
1471         tbz(n, 1, DONE);                             // bit 1 of n clear: keep the sign
1472       }
1473       fnegd(v0, v0);                                 // otherwise negate the kernel result
1474       b(DONE);
1475     bind(RETURN_SIN);
1476       generate_kernel_sin(y0, true, dsin_coef);
1477       if (isCos) {
1478         tbz(absX, 0, DONE);                          // n[0] ^ n[1] == 0: keep the sign
1479       } else {
1480         tbz(n, 1, DONE);                             // bit 1 of n clear: keep the sign
1481       }
1482       fnegd(v0, v0);                                 // otherwise negate the kernel result
1483       b(DONE);
1484     }
1485   bind(EARLY_CASE);
1486     eor(y1, T8B, y1, y1);                            // y1 = 0 (register xor-ed with itself)
1487     if (isCos) {
1488       generate_kernel_cos(v0, dcos_coef);            // kernel is applied to the unreduced argument in v0
1489     } else {
1490       generate_kernel_sin(v0, false, dsin_coef);     // kernel is applied to the unreduced argument in v0
1491     }
1492   bind(DONE);                                        // single exit point: every path above ends up here
1493     pop(saved_regs, sp);                             // restore r19 pushed in the prologue
1494     leave();                                         // undo enter()
1495     ret(lr);
1496 }
< prev index next >