
src/cpu/ppc/vm/macroAssembler_ppc.cpp

rev 8631 : 8130654: ppc: implement MultiplyToLen intrinsic
Contributed-by: Peter.Januschke@sap.com


    //14:
    if (cntval & 2) {
      lwzx(R0, str1_reg, index_reg);
      lwzx(tmp2_reg, str2_reg, index_reg);
      cmpw(CCR0, R0, tmp2_reg);
      bne(CCR0, Ldone_false);
      if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
    }
    if (cntval & 1) {
      lhzx(R0, str1_reg, index_reg);
      lhzx(tmp2_reg, str2_reg, index_reg);
      cmpw(CCR0, R0, tmp2_reg);
      bne(CCR0, Ldone_false);
    }
    // fallthru: true
  }
  li(result_reg, 1);
  bind(Ldone_false);
}
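
// Editorial sketch (illustration only, not part of the patch; this and the
// sketches below assume <stdint.h> and a compiler providing unsigned
// __int128). The constant-count tail above performs one two-char word
// compare when (cntval & 2) is set, then one single-char halfword compare
// when (cntval & 1) is set.
static bool string_equals_tail_sketch(const jchar* s1, const jchar* s2,
                                      int index, int cntval) {
  if (cntval & 2) {
    // Two jchars compared at once (the lwzx word loads above).
    if (s1[index] != s2[index] || s1[index + 1] != s2[index + 1]) return false;
    if (cntval & 1) index += 2;
  }
  if (cntval & 1) {
    // The final jchar (the lhzx halfword loads above).
    if (s1[index] != s2[index]) return false;
  }
  return true;
}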

// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);
  adde(dest_hi, dest_hi, R0);
  addc(dest_lo, dest_lo, src2);
  adde(dest_hi, dest_hi, R0);
}
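
// Editorial sketch (illustration only): the two-word accumulate emitted by
// add2_with_carry above, modeled as 128-bit arithmetic.
static void add2_with_carry_sketch(uint64_t& dest_hi, uint64_t& dest_lo,
                                   uint64_t src1, uint64_t src2) {
  unsigned __int128 sum = ((unsigned __int128)dest_hi << 64) | dest_lo;
  sum += src1;  // first addc/adde pair: carry-out flows into dest_hi
  sum += src2;  // second addc/adde pair: carry-out flows into dest_hi
  dest_lo = (uint64_t)sum;
  dest_hi = (uint64_t)(sum >> 64);
}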

// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x);   // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif

  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);

  bind(L_one_y); // Load one 32 bit portion of y as (0,value).
  lwz(y_idx, 0, y);
  b(L_multiply);

  bind(L_one_x); // Load one 32 bit portion of x as (0,value).
  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}
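
// Editorial sketch (illustration only): the arithmetic of the first loop on
// 64-bit limbs, least-significant limb first. The generated code instead
// walks the most-significant-first int[] pairwise and byte-swizzles each
// pair on little-endian, but it computes the same row product.
static void first_loop_sketch(uint64_t x_limb, const uint64_t* y, int ylimbs,
                              uint64_t* z /* ylimbs + 1 limbs */) {
  uint64_t carry = 0;
  for (int i = 0; i < ylimbs; i++) {
    unsigned __int128 product = (unsigned __int128)y[i] * x_limb + carry;
    z[i]  = (uint64_t)product;          // z[kdx] = (jlong)product;
    carry = (uint64_t)(product >> 64);  // carry  = product >>> 64;
  }
  z[ylimbs] = carry;                    // z[xstart] = carry;
}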

// Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}
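
// Editorial sketch (illustration only): one multiply-accumulate step as
// emitted above, returning the new carry (product_high).
static uint64_t multiply_add_step_sketch(uint64_t x_xstart, uint64_t y_idx,
                                         uint64_t* z_slot, uint64_t carry) {
  unsigned __int128 product =
      (unsigned __int128)y_idx * x_xstart + *z_slot + carry;
  *z_slot = (uint64_t)product;        // store low half back into z
  return (uint64_t)(product >> 64);   // high half becomes the next carry
}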

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2 = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);  // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}   // multiply_128_x_128_loop
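
// Editorial sketch (illustration only): one full row of the nested loop,
// reusing multiply_add_step_sketch from above. The generated code
// additionally unrolls two limb steps per bdnz iteration and mops up the
// (idx & 0x3) remainder outside the counted loop.
static uint64_t addmul_row_sketch(uint64_t x_limb, const uint64_t* y,
                                  uint64_t* z, int ylimbs) {
  uint64_t carry = 0;
  for (int i = 0; i < ylimbs; i++) {
    carry = multiply_add_step_sketch(x_limb, y[i], &z[i], carry);
  }
  return carry;
}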

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);

  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);

  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);

  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len
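
// Editorial sketch (illustration only): the whole schoolbook product the
// intrinsic computes, on least-significant-first 64-bit limbs, combining the
// helpers sketched above (requires xlimbs >= 1). The real routine operates
// on the 32-bit, most-significant-first int[] layout used by
// java.math.BigInteger, with zlen == xlen + ylen.
static void multiply_to_len_sketch(const uint64_t* x, int xlimbs,
                                   const uint64_t* y, int ylimbs,
                                   uint64_t* z /* xlimbs + ylimbs limbs */) {
  first_loop_sketch(x[0], y, ylimbs, z);     // first loop: row 0 initializes z
  for (int i = 1; i < xlimbs; i++) {         // second loop over the rows of x
    // third loop: accumulate row i into z, carry into the next free limb
    z[i + ylimbs] = addmul_row_sketch(x[i], y, z + i, ylimbs);
  }
}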

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);

