jdk-tip Sdiff src/hotspot/cpu/aarch64

src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp

8248238: Adding Windows support to OpenJDK on AArch64

Summary: LP64 vs LLP64 changes to add Windows support

Contributed-by: Monica Beckwith <monica.beckwith@microsoft.com>, Ludovic Henry <luhenry@microsoft.com>
Reviewed-by:
8248238: Adding Windows support to OpenJDK on AArch64

Summary: Adding Windows support for AArch64

Contributed-by: Ludovic Henry <luhenry@microsoft.com>, Monica Beckwith <monica.beckwith@microsoft.com>
Reviewed-by:

 184 //}
 185 //
 186 // END __ieee754_rem_pio2 PSEUDO CODE
 187 //
 188 // Changes between fdlibm and intrinsic for __ieee754_rem_pio2:
 189 //     1. INF/NaN check for huge argument is removed in comparison with fdlibm
 190 //     code, because this check is already done in dcos/dsin code
 191 //     2. Most constants are now loaded from table instead of direct initialization
 192 //     3. Two loops are unrolled
 193 // Assumptions:
 194 //     1. Assume |X| >= PI/4
 195 //     2. Assume rscratch1 = 0x3fe921fb00000000  (~ PI/4)
 196 //     3. Assume ix = r3
 197 // Input and output:
 198 //     1. Input: X = r0
 199 //     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
 200 // NOTE: general purpose register names match local variable names in C code
 201 // NOTE: fpu registers are actively reused. See comments in code about their usage
 202 void MacroAssembler::generate__ieee754_rem_pio2(address npio2_hw,
 203     address two_over_pi, address pio2) {
 204   const long PIO2_1t = 0x3DD0B4611A626331UL;
 205   const long PIO2_2  = 0x3DD0B4611A600000UL;
 206   const long PIO2_2t = 0x3BA3198A2E037073UL;
 207   Label X_IS_NEGATIVE, X_IS_MEDIUM_OR_LARGE, X_IS_POSITIVE_LONG_PI, LARGE_ELSE,
 208       REDUCTION_DONE, X_IS_MEDIUM_BRANCH_DONE, X_IS_LARGE, NX_SET,
 209       X_IS_NEGATIVE_LONG_PI;
 210   Register X = r0, n = r2, ix = r3, jv = r4, tmp5 = r5, jx = r6,
 211       tmp3 = r7, iqBase = r10, ih = r11, i = r17;
 212     // initializing constants first
 213     // rscratch1 = 0x3fe921fb00000000 (see assumptions)
 214     movk(rscratch1, 0x3ff9, 48); // was 0x3fe921fb0..0 now it's 0x3ff921fb0..0
 215     mov(rscratch2, 0x4002d97c); // 3*PI/4 high word
 216     movk(rscratch1, 0x5440, 16); // now rscratch1 == PIO2_1
 217     fmovd(v1, rscratch1); // v1 = PIO2_1
 218     cmp(rscratch2, ix);
 219     br(LE, X_IS_MEDIUM_OR_LARGE);
 220 
 221     block_comment("if(ix<0x4002d97c) {...  /* |x| ~< 3pi/4 */ "); {
 222       cmp(X, zr);
 223       br(LT, X_IS_NEGATIVE);
 224 
 225       block_comment("if(hx>0) {"); {
 226         fsubd(v2, v0, v1); // v2 = z = x - pio2_1

 672 //         constants because of that (see comments in code)
 673 //     4. Use of jx, which is nx-1 instead of nx
 674 // Assumptions:
 675 //     1. Assume |X| >= PI/4
 676 // Input and output:
 677 //     1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
 678 //     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
 679 // NOTE: general purpose register names match local variable names in C code
 680 // NOTE: fpu registers are actively reused. See comments in code about their usage
 681 void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
 682   Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
 683       RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
 684       INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
 685       Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
 686       RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
 687       CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
 688       IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
 689       RECOMP_FOR1_CHECK;
 690   Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
 691       tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
 692       jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18;
 693     // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
 694     // jx = nx - 1
 695     lea(twoOverPiBase, ExternalAddress(two_over_pi));
 696     cmpw(jv, zr);
 697     addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
 698     cselw(jv, jv, zr, GE);
 699     fmovd(v26, 0.0);
 700     addw(tmp5, jv, 1);                    // jv+1
 701     subsw(j, jv, jx);
 702     add(qBase, sp, 320);                  // base of q[]
 703     msubw(rscratch1, i, tmp5, rscratch1); // q0 =  e0-24*(jv+1)
 704     // use double f[20], fq[20], q[20] and int iq[20] on stack, which is
 705     // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses
 706     // the stack will contain f[20], fq[20], q[20], iq[20]
 707     // now initialize f[20] indexes 0..m (inclusive)
 708     // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
 709     mov(tmp5, sp);
 710 
 711     block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
 712         eorw(i, i, i);

 184 //}
 185 //
 186 // END __ieee754_rem_pio2 PSEUDO CODE
 187 //
 188 // Changes between fdlibm and intrinsic for __ieee754_rem_pio2:
 189 //     1. INF/NaN check for huge argument is removed in comparison with fdlibm
 190 //     code, because this check is already done in dcos/dsin code
 191 //     2. Most constants are now loaded from table instead of direct initialization
 192 //     3. Two loops are unrolled
 193 // Assumptions:
 194 //     1. Assume |X| >= PI/4
 195 //     2. Assume rscratch1 = 0x3fe921fb00000000  (~ PI/4)
 196 //     3. Assume ix = r3
 197 // Input and output:
 198 //     1. Input: X = r0
 199 //     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
 200 // NOTE: general purpose register names match local variable names in C code
 201 // NOTE: fpu registers are actively reused. See comments in code about their usage
 202 void MacroAssembler::generate__ieee754_rem_pio2(address npio2_hw,
 203     address two_over_pi, address pio2) {
 204   const int64_t PIO2_1t = 0x3DD0B4611A626331UL;
 205   const int64_t PIO2_2  = 0x3DD0B4611A600000UL;
 206   const int64_t PIO2_2t = 0x3BA3198A2E037073UL;
 207   Label X_IS_NEGATIVE, X_IS_MEDIUM_OR_LARGE, X_IS_POSITIVE_LONG_PI, LARGE_ELSE,
 208       REDUCTION_DONE, X_IS_MEDIUM_BRANCH_DONE, X_IS_LARGE, NX_SET,
 209       X_IS_NEGATIVE_LONG_PI;
 210   Register X = r0, n = r2, ix = r3, jv = r4, tmp5 = r5, jx = r6,
 211       tmp3 = r7, iqBase = r10, ih = r11, i = r17;
 212     // initializing constants first
 213     // rscratch1 = 0x3fe921fb00000000 (see assumptions)
 214     movk(rscratch1, 0x3ff9, 48); // was 0x3fe921fb0..0 now it's 0x3ff921fb0..0
 215     mov(rscratch2, 0x4002d97c); // 3*PI/4 high word
 216     movk(rscratch1, 0x5440, 16); // now rscratch1 == PIO2_1
 217     fmovd(v1, rscratch1); // v1 = PIO2_1
 218     cmp(rscratch2, ix);
 219     br(LE, X_IS_MEDIUM_OR_LARGE);
 220 
 221     block_comment("if(ix<0x4002d97c) {...  /* |x| ~< 3pi/4 */ "); {
 222       cmp(X, zr);
 223       br(LT, X_IS_NEGATIVE);
 224 
 225       block_comment("if(hx>0) {"); {
 226         fsubd(v2, v0, v1); // v2 = z = x - pio2_1

 672 //         constants because of that (see comments in code)
 673 //     4. Use of jx, which is nx-1 instead of nx
 674 // Assumptions:
 675 //     1. Assume |X| >= PI/4
 676 // Input and output:
 677 //     1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
 678 //     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
 679 // NOTE: general purpose register names match local variable names in C code
 680 // NOTE: fpu registers are actively reused. See comments in code about their usage
 681 void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
 682   Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
 683       RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
 684       INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
 685       Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
 686       RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
 687       CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
 688       IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
 689       RECOMP_FOR1_CHECK;
 690   Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
 691       tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
 692       jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r19;
 693     // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
 694     // jx = nx - 1
 695     lea(twoOverPiBase, ExternalAddress(two_over_pi));
 696     cmpw(jv, zr);
 697     addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
 698     cselw(jv, jv, zr, GE);
 699     fmovd(v26, 0.0);
 700     addw(tmp5, jv, 1);                    // jv+1
 701     subsw(j, jv, jx);
 702     add(qBase, sp, 320);                  // base of q[]
 703     msubw(rscratch1, i, tmp5, rscratch1); // q0 =  e0-24*(jv+1)
 704     // use double f[20], fq[20], q[20] and int iq[20] on stack, which is
 705     // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses
 706     // the stack will contain f[20], fq[20], q[20], iq[20]
 707     // now initialize f[20] indexes 0..m (inclusive)
 708     // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
 709     mov(tmp5, sp);
 710 
 711     block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
 712         eorw(i, i, i);

< prev index next >