3266 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3267
3268 // Aliases
3269 Register adler = c_rarg0;
3270 Register s1 = c_rarg0;
3271 Register s2 = c_rarg3;
3272 Register buff = c_rarg1;
3273 Register len = c_rarg2;
3274 Register nmax = r4;
3275 Register base = r5;
3276 Register count = r6;
3277 Register temp0 = rscratch1;
3278 Register temp1 = rscratch2;
3279 FloatRegister vbytes = v0;
3280 FloatRegister vs1acc = v1;
3281 FloatRegister vs2acc = v2;
3282 FloatRegister vtable = v3;
3283
3284 // Max number of bytes we can process before having to take the mod
3285 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3286 unsigned long BASE = 0xfff1;
3287 unsigned long NMAX = 0x15B0;
3288
3289 __ mov(base, BASE);
3290 __ mov(nmax, NMAX);
3291
3292 // Load accumulation coefficients for the upper 16 bits
3293 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3294 __ ld1(vtable, __ T16B, Address(temp0));
3295
3296 // s1 is initialized to the lower 16 bits of adler
3297 // s2 is initialized to the upper 16 bits of adler
3298 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
3299 __ uxth(s1, adler); // s1 = (adler & 0xffff)
3300
3301 // The pipelined loop needs at least 16 elements for 1 iteration
3302 // It does check this, but it is more effective to skip to the cleanup loop
3303 __ cmp(len, (u1)16);
3304 __ br(Assembler::HS, L_nmax);
3305 __ cbz(len, L_combine);
3306
3307 __ bind(L_simple_by1_loop);
5364 }
5365 block_comment("} // i");
5366
5367 normalize(Rlen);
5368
5369 mov(Ra, Pm_base); // Save Pm_base in Ra
5370 restore_regs(); // Restore caller's Pm_base
5371
5372 // Copy our result into caller's Pm_base
5373 reverse(Pm_base, Ra, Rlen, t0, t1);
5374
5375 leave();
5376 bind(nothing);
5377 ret(lr);
5378
5379 return entry;
5380 }
5381 // In C, approximately:
5382
5383 // void
5384 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5385 // unsigned long Pn_base[], unsigned long Pm_base[],
5386 // unsigned long inv, int len) {
5387 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5388 // unsigned long *Pa, *Pb, *Pn, *Pm;
5389 // unsigned long Ra, Rb, Rn, Rm;
5390
5391 // int i;
5392
5393 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5394
5395 // for (i = 0; i < len; i++) {
5396 // int j;
5397
5398 // Pa = Pa_base;
5399 // Pb = Pb_base + i;
5400 // Pm = Pm_base;
5401 // Pn = Pn_base + i;
5402
5403 // Ra = *Pa;
5404 // Rb = *Pb;
5405 // Rm = *Pm;
5406 // Rn = *Pn;
5407
5408 // int iters = i;
5409 // for (j = 0; iters--; j++) {
5577 bind(end);
5578 block_comment("} // i");
5579 }
5580
5581 normalize(Rlen);
5582
5583 mov(Ra, Pm_base); // Save Pm_base in Ra
5584 restore_regs(); // Restore caller's Pm_base
5585
5586 // Copy our result into caller's Pm_base
5587 reverse(Pm_base, Ra, Rlen, t0, t1);
5588
5589 leave();
5590 ret(lr);
5591
5592 return entry;
5593 }
5594 // In C, approximately:
5595
5596 // void
5597 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5598 // unsigned long Pm_base[], unsigned long inv, int len) {
5599 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5600 // unsigned long *Pa, *Pb, *Pn, *Pm;
5601 // unsigned long Ra, Rb, Rn, Rm;
5602
5603 // int i;
5604
5605 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5606
5607 // for (i = 0; i < len; i++) {
5608 // int j;
5609
5610 // Pa = Pa_base;
5611 // Pb = Pa_base + i;
5612 // Pm = Pm_base;
5613 // Pn = Pn_base + i;
5614
5615 // Ra = *Pa;
5616 // Rb = *Pb;
5617 // Rm = *Pm;
5618 // Rn = *Pn;
5619
5620 // int iters = (i+1)/2;
5621 // for (j = 0; iters--; j++) {
|
3266 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3267
3268 // Aliases
3269 Register adler = c_rarg0;
3270 Register s1 = c_rarg0;
3271 Register s2 = c_rarg3;
3272 Register buff = c_rarg1;
3273 Register len = c_rarg2;
3274 Register nmax = r4;
3275 Register base = r5;
3276 Register count = r6;
3277 Register temp0 = rscratch1;
3278 Register temp1 = rscratch2;
3279 FloatRegister vbytes = v0;
3280 FloatRegister vs1acc = v1;
3281 FloatRegister vs2acc = v2;
3282 FloatRegister vtable = v3;
3283
3284 // Max number of bytes we can process before having to take the mod
3285 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3286 uint64_t BASE = 0xfff1;
3287 uint64_t NMAX = 0x15B0;
3288
3289 __ mov(base, BASE);
3290 __ mov(nmax, NMAX);
3291
3292 // Load accumulation coefficients for the upper 16 bits
3293 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3294 __ ld1(vtable, __ T16B, Address(temp0));
3295
3296 // s1 is initialized to the lower 16 bits of adler
3297 // s2 is initialized to the upper 16 bits of adler
3298 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
3299 __ uxth(s1, adler); // s1 = (adler & 0xffff)
3300
3301 // The pipelined loop needs at least 16 elements for 1 iteration
3302 // It does check this, but it is more effective to skip to the cleanup loop
3303 __ cmp(len, (u1)16);
3304 __ br(Assembler::HS, L_nmax);
3305 __ cbz(len, L_combine);
3306
3307 __ bind(L_simple_by1_loop);
5364 }
5365 block_comment("} // i");
5366
5367 normalize(Rlen);
5368
5369 mov(Ra, Pm_base); // Save Pm_base in Ra
5370 restore_regs(); // Restore caller's Pm_base
5371
5372 // Copy our result into caller's Pm_base
5373 reverse(Pm_base, Ra, Rlen, t0, t1);
5374
5375 leave();
5376 bind(nothing);
5377 ret(lr);
5378
5379 return entry;
5380 }
5381 // In C, approximately:
5382
5383 // void
5384 // montgomery_multiply(uint64_t Pa_base[], uint64_t Pb_base[],
5385 // uint64_t Pn_base[], uint64_t Pm_base[],
5386 // uint64_t inv, int len) {
5387 // uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5388 // uint64_t *Pa, *Pb, *Pn, *Pm;
5389 // uint64_t Ra, Rb, Rn, Rm;
5390
5391 // int i;
5392
5393 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5394
5395 // for (i = 0; i < len; i++) {
5396 // int j;
5397
5398 // Pa = Pa_base;
5399 // Pb = Pb_base + i;
5400 // Pm = Pm_base;
5401 // Pn = Pn_base + i;
5402
5403 // Ra = *Pa;
5404 // Rb = *Pb;
5405 // Rm = *Pm;
5406 // Rn = *Pn;
5407
5408 // int iters = i;
5409 // for (j = 0; iters--; j++) {
5577 bind(end);
5578 block_comment("} // i");
5579 }
5580
5581 normalize(Rlen);
5582
5583 mov(Ra, Pm_base); // Save Pm_base in Ra
5584 restore_regs(); // Restore caller's Pm_base
5585
5586 // Copy our result into caller's Pm_base
5587 reverse(Pm_base, Ra, Rlen, t0, t1);
5588
5589 leave();
5590 ret(lr);
5591
5592 return entry;
5593 }
5594 // In C, approximately:
5595
5596 // void
5597 // montgomery_square(uint64_t Pa_base[], uint64_t Pn_base[],
5598 // uint64_t Pm_base[], uint64_t inv, int len) {
5599 // uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5600 // uint64_t *Pa, *Pb, *Pn, *Pm;
5601 // uint64_t Ra, Rb, Rn, Rm;
5602
5603 // int i;
5604
5605 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5606
5607 // for (i = 0; i < len; i++) {
5608 // int j;
5609
5610 // Pa = Pa_base;
5611 // Pb = Pa_base + i;
5612 // Pm = Pm_base;
5613 // Pn = Pn_base + i;
5614
5615 // Ra = *Pa;
5616 // Rb = *Pb;
5617 // Rm = *Pm;
5618 // Rn = *Pn;
5619
5620 // int iters = (i+1)/2;
5621 // for (j = 0; iters--; j++) {
|