184 //}
185 //
186 // END __ieee754_rem_pio2 PSEUDO CODE
187 //
188 // Changes between fdlibm and intrinsic for __ieee754_rem_pio2:
189 // 1. The INF/NaN check for huge arguments is removed relative to the fdlibm
190 // code, because that check is already performed in the dcos/dsin code
191 // 2. Most constants are now loaded from a table instead of being initialized directly
192 // 3. Two loops are unrolled
193 // Assumptions:
194 // 1. Assume |X| >= PI/4
195 // 2. Assume rscratch1 = 0x3fe921fb00000000 (~ PI/4)
196 // 3. Assume ix = r3
197 // Input and output:
198 // 1. Input: X = r0
199 // 2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
200 // NOTE: general purpose register names match local variable names in C code
201 // NOTE: fpu registers are actively reused. See comments in code about their usage
202 void MacroAssembler::generate__ieee754_rem_pio2(address npio2_hw,
203 address two_over_pi, address pio2) {
204 const long PIO2_1t = 0x3DD0B4611A626331UL;
205 const long PIO2_2 = 0x3DD0B4611A600000UL;
206 const long PIO2_2t = 0x3BA3198A2E037073UL;
207 Label X_IS_NEGATIVE, X_IS_MEDIUM_OR_LARGE, X_IS_POSITIVE_LONG_PI, LARGE_ELSE,
208 REDUCTION_DONE, X_IS_MEDIUM_BRANCH_DONE, X_IS_LARGE, NX_SET,
209 X_IS_NEGATIVE_LONG_PI;
210 Register X = r0, n = r2, ix = r3, jv = r4, tmp5 = r5, jx = r6,
211 tmp3 = r7, iqBase = r10, ih = r11, i = r17;
212 // initializing constants first
213 // rscratch1 = 0x3fe921fb00000000 (see assumptions)
214 movk(rscratch1, 0x3ff9, 48); // was 0x3fe921fb0..0 now it's 0x3ff921fb0..0
215 mov(rscratch2, 0x4002d97c); // 3*PI/4 high word
216 movk(rscratch1, 0x5440, 16); // now rscratch1 == PIO2_1
217 fmovd(v1, rscratch1); // v1 = PIO2_1
218 cmp(rscratch2, ix);
219 br(LE, X_IS_MEDIUM_OR_LARGE);
220
221 block_comment("if(ix<0x4002d97c) {... /* |x| ~< 3pi/4 */ "); {
222 cmp(X, zr);
223 br(LT, X_IS_NEGATIVE);
224
225 block_comment("if(hx>0) {"); {
226 fsubd(v2, v0, v1); // v2 = z = x - pio2_1
672 // constants because of that (see comments in code)
673 // 4. Use of jx, which is nx-1 instead of nx
674 // Assumptions:
675 // 1. Assume |X| >= PI/4
676 // Input and output:
677 // 1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
678 // 2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
679 // NOTE: general purpose register names match local variable names in C code
680 // NOTE: fpu registers are actively reused. See comments in code about their usage
681 void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
682 Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
683 RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
684 INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
685 Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
686 RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
687 CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
688 IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
689 RECOMP_FOR1_CHECK;
690 Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
691 tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
692 jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18;
693 // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
694 // jx = nx - 1
695 lea(twoOverPiBase, ExternalAddress(two_over_pi));
696 cmpw(jv, zr);
697 addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
698 cselw(jv, jv, zr, GE);
699 fmovd(v26, 0.0);
700 addw(tmp5, jv, 1); // jv+1
701 subsw(j, jv, jx);
702 add(qBase, sp, 320); // base of q[]
703 msubw(rscratch1, i, tmp5, rscratch1); // q0 = e0-24*(jv+1)
704 // use double f[20], fq[20], q[20], iq[20] on stack, which is
705 // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
706 // will contain f[20], fq[20], q[20], iq[20]
707 // now initialize f[20] indexes 0..m (inclusive)
708 // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
709 mov(tmp5, sp);
710
711 block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
712 eorw(i, i, i);
|
184 //}
185 //
186 // END __ieee754_rem_pio2 PSEUDO CODE
187 //
188 // Changes between fdlibm and intrinsic for __ieee754_rem_pio2:
189 // 1. The INF/NaN check for huge arguments is removed relative to the fdlibm
190 // code, because that check is already performed in the dcos/dsin code
191 // 2. Most constants are now loaded from a table instead of being initialized directly
192 // 3. Two loops are unrolled
193 // Assumptions:
194 // 1. Assume |X| >= PI/4
195 // 2. Assume rscratch1 = 0x3fe921fb00000000 (~ PI/4)
196 // 3. Assume ix = r3
197 // Input and output:
198 // 1. Input: X = r0
199 // 2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
200 // NOTE: general purpose register names match local variable names in C code
201 // NOTE: fpu registers are actively reused. See comments in code about their usage
202 void MacroAssembler::generate__ieee754_rem_pio2(address npio2_hw,
203 address two_over_pi, address pio2) {
204 const int64_t PIO2_1t = 0x3DD0B4611A626331UL;
205 const int64_t PIO2_2 = 0x3DD0B4611A600000UL;
206 const int64_t PIO2_2t = 0x3BA3198A2E037073UL;
207 Label X_IS_NEGATIVE, X_IS_MEDIUM_OR_LARGE, X_IS_POSITIVE_LONG_PI, LARGE_ELSE,
208 REDUCTION_DONE, X_IS_MEDIUM_BRANCH_DONE, X_IS_LARGE, NX_SET,
209 X_IS_NEGATIVE_LONG_PI;
210 Register X = r0, n = r2, ix = r3, jv = r4, tmp5 = r5, jx = r6,
211 tmp3 = r7, iqBase = r10, ih = r11, i = r17;
212 // initializing constants first
213 // rscratch1 = 0x3fe921fb00000000 (see assumptions)
214 movk(rscratch1, 0x3ff9, 48); // was 0x3fe921fb0..0 now it's 0x3ff921fb0..0
215 mov(rscratch2, 0x4002d97c); // 3*PI/4 high word
216 movk(rscratch1, 0x5440, 16); // now rscratch1 == PIO2_1
217 fmovd(v1, rscratch1); // v1 = PIO2_1
218 cmp(rscratch2, ix);
219 br(LE, X_IS_MEDIUM_OR_LARGE);
220
221 block_comment("if(ix<0x4002d97c) {... /* |x| ~< 3pi/4 */ "); {
222 cmp(X, zr);
223 br(LT, X_IS_NEGATIVE);
224
225 block_comment("if(hx>0) {"); {
226 fsubd(v2, v0, v1); // v2 = z = x - pio2_1
672 // constants because of that (see comments in code)
673 // 4. Use of jx, which is nx-1 instead of nx
674 // Assumptions:
675 // 1. Assume |X| >= PI/4
676 // Input and output:
677 // 1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
678 // 2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
679 // NOTE: general purpose register names match local variable names in C code
680 // NOTE: fpu registers are actively reused. See comments in code about their usage
681 void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
682 Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
683 RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
684 INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
685 Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
686 RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
687 CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
688 IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
689 RECOMP_FOR1_CHECK;
690 Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
691 tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
692 jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r19;
693 // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
694 // jx = nx - 1
695 lea(twoOverPiBase, ExternalAddress(two_over_pi));
696 cmpw(jv, zr);
697 addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
698 cselw(jv, jv, zr, GE);
699 fmovd(v26, 0.0);
700 addw(tmp5, jv, 1); // jv+1
701 subsw(j, jv, jx);
702 add(qBase, sp, 320); // base of q[]
703 msubw(rscratch1, i, tmp5, rscratch1); // q0 = e0-24*(jv+1)
704 // use double f[20], fq[20], q[20], iq[20] on stack, which is
705 // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
706 // will contain f[20], fq[20], q[20], iq[20]
707 // now initialize f[20] indexes 0..m (inclusive)
708 // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
709 mov(tmp5, sp);
710
711 block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
712 eorw(i, i, i);
|