jdk-tip Sdiff src/hotspot/cpu/aarch64

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

8248238: Adding Windows support to OpenJDK on AArch64

Summary: LP64 vs LLP64 changes to add Windows support

Contributed-by: Monica Beckwith <monica.beckwith@microsoft.com>, Ludovic Henry <luhenry@microsoft.com>
Reviewed-by:

3266     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3267 
3268     // Aliases
3269     Register adler  = c_rarg0;
3270     Register s1     = c_rarg0;
3271     Register s2     = c_rarg3;
3272     Register buff   = c_rarg1;
3273     Register len    = c_rarg2;
3274     Register nmax  = r4;
3275     Register base  = r5;
3276     Register count = r6;
3277     Register temp0 = rscratch1;
3278     Register temp1 = rscratch2;
3279     FloatRegister vbytes = v0;
3280     FloatRegister vs1acc = v1;
3281     FloatRegister vs2acc = v2;
3282     FloatRegister vtable = v3;
3283 
3284     // Max number of bytes we can process before having to take the mod
3285     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3286     unsigned long BASE = 0xfff1;
3287     unsigned long NMAX = 0x15B0;
3288 
3289     __ mov(base, BASE);
3290     __ mov(nmax, NMAX);
3291 
3292     // Load accumulation coefficients for the upper 16 bits
3293     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3294     __ ld1(vtable, __ T16B, Address(temp0));
3295 
3296     // s1 is initialized to the lower 16 bits of adler
3297     // s2 is initialized to the upper 16 bits of adler
3298     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3299     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3300 
3301     // The pipelined loop needs at least 16 elements for 1 iteration
3302     // It does check this, but it is more effective to skip to the cleanup loop
3303     __ cmp(len, (u1)16);
3304     __ br(Assembler::HS, L_nmax);
3305     __ cbz(len, L_combine);
3306 
3307     __ bind(L_simple_by1_loop);

5364       }
5365       block_comment("} // i");
5366 
5367       normalize(Rlen);
5368 
5369       mov(Ra, Pm_base);  // Save Pm_base in Ra
5370       restore_regs();  // Restore caller's Pm_base
5371 
5372       // Copy our result into caller's Pm_base
5373       reverse(Pm_base, Ra, Rlen, t0, t1);
5374 
5375       leave();
5376       bind(nothing);
5377       ret(lr);
5378 
5379       return entry;
5380     }
5381     // In C, approximately:
5382 
5383     // void
5384     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5385     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5386     //                     unsigned long inv, int len) {
5387     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5388     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5389     //   unsigned long Ra, Rb, Rn, Rm;
5390 
5391     //   int i;
5392 
5393     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5394 
5395     //   for (i = 0; i < len; i++) {
5396     //     int j;
5397 
5398     //     Pa = Pa_base;
5399     //     Pb = Pb_base + i;
5400     //     Pm = Pm_base;
5401     //     Pn = Pn_base + i;
5402 
5403     //     Ra = *Pa;
5404     //     Rb = *Pb;
5405     //     Rm = *Pm;
5406     //     Rn = *Pn;
5407 
5408     //     int iters = i;
5409     //     for (j = 0; iters--; j++) {

5577         bind(end);
5578         block_comment("} // i");
5579       }
5580 
5581       normalize(Rlen);
5582 
5583       mov(Ra, Pm_base);  // Save Pm_base in Ra
5584       restore_regs();  // Restore caller's Pm_base
5585 
5586       // Copy our result into caller's Pm_base
5587       reverse(Pm_base, Ra, Rlen, t0, t1);
5588 
5589       leave();
5590       ret(lr);
5591 
5592       return entry;
5593     }
5594     // In C, approximately:
5595 
5596     // void
5597     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5598     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5599     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5600     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5601     //   unsigned long Ra, Rb, Rn, Rm;
5602 
5603     //   int i;
5604 
5605     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5606 
5607     //   for (i = 0; i < len; i++) {
5608     //     int j;
5609 
5610     //     Pa = Pa_base;
5611     //     Pb = Pa_base + i;
5612     //     Pm = Pm_base;
5613     //     Pn = Pn_base + i;
5614 
5615     //     Ra = *Pa;
5616     //     Rb = *Pb;
5617     //     Rm = *Pm;
5618     //     Rn = *Pn;
5619 
5620     //     int iters = (i+1)/2;
5621     //     for (j = 0; iters--; j++) {

3266     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3267 
3268     // Aliases
3269     Register adler  = c_rarg0;
3270     Register s1     = c_rarg0;
3271     Register s2     = c_rarg3;
3272     Register buff   = c_rarg1;
3273     Register len    = c_rarg2;
3274     Register nmax  = r4;
3275     Register base  = r5;
3276     Register count = r6;
3277     Register temp0 = rscratch1;
3278     Register temp1 = rscratch2;
3279     FloatRegister vbytes = v0;
3280     FloatRegister vs1acc = v1;
3281     FloatRegister vs2acc = v2;
3282     FloatRegister vtable = v3;
3283 
3284     // Max number of bytes we can process before having to take the mod
3285     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3286     uint64_t BASE = 0xfff1;
3287     uint64_t NMAX = 0x15B0;
3288 
3289     __ mov(base, BASE);
3290     __ mov(nmax, NMAX);
3291 
3292     // Load accumulation coefficients for the upper 16 bits
3293     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3294     __ ld1(vtable, __ T16B, Address(temp0));
3295 
3296     // s1 is initialized to the lower 16 bits of adler
3297     // s2 is initialized to the upper 16 bits of adler
3298     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3299     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3300 
3301     // The pipelined loop needs at least 16 elements for 1 iteration
3302     // It does check this, but it is more effective to skip to the cleanup loop
3303     __ cmp(len, (u1)16);
3304     __ br(Assembler::HS, L_nmax);
3305     __ cbz(len, L_combine);
3306 
3307     __ bind(L_simple_by1_loop);

5364       }
5365       block_comment("} // i");
5366 
5367       normalize(Rlen);
5368 
5369       mov(Ra, Pm_base);  // Save Pm_base in Ra
5370       restore_regs();  // Restore caller's Pm_base
5371 
5372       // Copy our result into caller's Pm_base
5373       reverse(Pm_base, Ra, Rlen, t0, t1);
5374 
5375       leave();
5376       bind(nothing);
5377       ret(lr);
5378 
5379       return entry;
5380     }
5381     // In C, approximately:
5382 
5383     // void
5384     // montgomery_multiply(uint64_t Pa_base[], uint64_t Pb_base[],
5385     //                     uint64_t Pn_base[], uint64_t Pm_base[],
5386     //                     uint64_t inv, int len) {
5387     //   uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5388     //   uint64_t *Pa, *Pb, *Pn, *Pm;
5389     //   uint64_t Ra, Rb, Rn, Rm;
5390 
5391     //   int i;
5392 
5393     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5394 
5395     //   for (i = 0; i < len; i++) {
5396     //     int j;
5397 
5398     //     Pa = Pa_base;
5399     //     Pb = Pb_base + i;
5400     //     Pm = Pm_base;
5401     //     Pn = Pn_base + i;
5402 
5403     //     Ra = *Pa;
5404     //     Rb = *Pb;
5405     //     Rm = *Pm;
5406     //     Rn = *Pn;
5407 
5408     //     int iters = i;
5409     //     for (j = 0; iters--; j++) {

5577         bind(end);
5578         block_comment("} // i");
5579       }
5580 
5581       normalize(Rlen);
5582 
5583       mov(Ra, Pm_base);  // Save Pm_base in Ra
5584       restore_regs();  // Restore caller's Pm_base
5585 
5586       // Copy our result into caller's Pm_base
5587       reverse(Pm_base, Ra, Rlen, t0, t1);
5588 
5589       leave();
5590       ret(lr);
5591 
5592       return entry;
5593     }
5594     // In C, approximately:
5595 
5596     // void
5597     // montgomery_square(uint64_t Pa_base[], uint64_t Pn_base[],
5598     //                   uint64_t Pm_base[], uint64_t inv, int len) {
5599     //   uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5600     //   uint64_t *Pa, *Pb, *Pn, *Pm;
5601     //   uint64_t Ra, Rb, Rn, Rm;
5602 
5603     //   int i;
5604 
5605     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5606 
5607     //   for (i = 0; i < len; i++) {
5608     //     int j;
5609 
5610     //     Pa = Pa_base;
5611     //     Pb = Pa_base + i;
5612     //     Pm = Pm_base;
5613     //     Pn = Pn_base + i;
5614 
5615     //     Ra = *Pa;
5616     //     Rb = *Pb;
5617     //     Rm = *Pm;
5618     //     Rn = *Pn;
5619 
5620     //     int iters = (i+1)/2;
5621     //     for (j = 0; iters--; j++) {

< prev index next >