< prev index next >

src/java.desktop/share/native/libmlib_image/mlib_ImageConv_16nw.c

Print this page
rev 14293 : remove ImageConv


 /*
  * LOAD_BUFF(buff): stores one 64-bit word at (buff + i).  The word is
  * sp[0] shifted into the upper 32 bits, OR-ed with S64TOS32(sp[chan1]).
  * The expansion uses the locals `sp`, `chan1` and `i` of the calling
  * function.
  * NOTE(review): S64TOS32 is defined outside this view; presumably it
  * masks the value to its low 32 bits -- confirm.
  */
 127 #define LOAD_BUFF(buff)                                         \
 128   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
 129 
 130 #endif /* _LITTLE_ENDIAN */
 131 #endif /* _NO_LONGLONG */
 132 
 133 /***************************************************************/
 /*
  * d64_2x32: overlays a single mlib_d64 with two mlib_s32 values so that
  * the two halves of a 64-bit word can be read or written individually
  * (i32s.i0 / i32s.i1).  The second view, f32s, has the same layout; note
  * that its members f0/f1 are also declared as mlib_s32.
  */
 134 typedef union {
 135   mlib_d64 d64;
 136   struct {
 137     mlib_s32 i0;
 138     mlib_s32 i1;
 139   } i32s;
 140   struct {
 141     mlib_s32 f0;
 142     mlib_s32 f1;
 143   } f32s;
 144 } d64_2x32;
 145 
 146 /***************************************************************/
 /*
  * BUFF_LINE: per-row element capacity of the on-stack scratch arrays.
  * The convolution routines switch to mlib_malloc() when the image width
  * exceeds this value.
  */
 147 #define BUFF_LINE 256
 148 
 149 /***************************************************************/
 /*
  * DEF_VARS(type): declares the locals shared by the convolution routines:
  *   adr_src/sl/sp, adr_dst/dl/dp  - pointers of element type `type`
  *                                   (sp and dp start as NULL),
  *   pbuff                         - scratch pointer, initialised to the
  *                                   local array `buff`, which must be
  *                                   declared before this macro is used,
  *   wid, hgt, sll, dll            - image geometry,
  *   nchannel, chan1               - channel counts,
  *   i, j, c                       - loop counters.
  * The expansion has no trailing semicolon; the caller supplies it.
  */
 150 #define DEF_VARS(type)                                          \
 151   type     *adr_src, *sl, *sp = NULL;                           \
 152   type     *adr_dst, *dl, *dp = NULL;                           \
 153   FTYPE    *pbuff = buff;                                       \
 154   mlib_s32 wid, hgt, sll, dll;                                  \
 155   mlib_s32 nchannel, chan1;                                     \
 156   mlib_s32 i, j, c
 157 
 158 /***************************************************************/
 /*
  * LOAD_KERNEL3(): declares scalef, the nine coefficients k0..k8 and the
  * pixel temporaries p00..p23, then fills k0..k8 with scalef * kern[n].
  *
  * scalef starts at DSCALE and is divided by 2^scalef_expon.  The division
  * is done in steps of 2^30 while scalef_expon > 30, so every shift
  * (1 << n) is performed with n <= 30.  The macro modifies the caller's
  * `scalef_expon` and reads `kern`.
  * NOTE(review): the final (1 << scalef_expon) assumes scalef_expon >= 0;
  * a negative value would be an invalid shift count -- confirm callers.
  */
 159 #define LOAD_KERNEL3()                                                   \
 160   FTYPE    scalef = DSCALE;                                              \
 161   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8;                           \
 162   FTYPE    p00, p01, p02, p03,                                           \
 163            p10, p11, p12, p13,                                           \
 164            p20, p21, p22, p23;                                           \
 165                                                                          \
 166   while (scalef_expon > 30) {                                            \
 167     scalef /= (1 << 30);                                                 \
 168     scalef_expon -= 30;                                                  \
 169   }                                                                      \
 170                                                                          \
 171   scalef /= (1 << scalef_expon);                                         \
 172                                                                          \
 173   /* keep kernel in regs */                                              \
 174   k0 = scalef * kern[0];  k1 = scalef * kern[1];  k2 = scalef * kern[2]; \
 175   k3 = scalef * kern[3];  k4 = scalef * kern[4];  k5 = scalef * kern[5]; \
 176   k6 = scalef * kern[6];  k7 = scalef * kern[7];  k8 = scalef * kern[8]
 177 
 178 /***************************************************************/
 /*
  * LOAD_KERNEL(SIZE): declares scalef (starting from DSCALE), divides it by
  * 2^scalef_expon (in steps of 2^30 while scalef_expon > 30) and stores
  * scalef * kern[j] into k[j] for j = 0 .. SIZE-1.
  *
  * Requires `k`, `j`, `kern` and `scalef_expon` in the calling scope and
  * modifies both `j` and `scalef_expon`.
  * NOTE(review): the final (1 << scalef_expon) assumes scalef_expon >= 0 --
  * confirm callers.
  */
 179 #define LOAD_KERNEL(SIZE)                                       \
 180   FTYPE    scalef = DSCALE;                                     \
 181                                                                 \
 182   while (scalef_expon > 30) {                                   \
 183     scalef /= (1 << 30);                                        \
 184     scalef_expon -= 30;                                         \
 185   }                                                             \
 186                                                                 \
 187   scalef /= (1 << scalef_expon);                                \
 188                                                                 \
 189   for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j]
 190 
 191 /***************************************************************/
 /*
  * GET_SRC_DST_PARAMETERS(type): fills the geometry locals from the images
  * `src` and `dst` of the calling function.  Height, width and channel
  * count are taken from src only; sll/dll are the src/dst strides divided
  * by sizeof(type) (presumably converting a byte stride into an element
  * stride -- confirm against mlib_ImageGetStride); adr_src/adr_dst are the
  * data pointers cast to `type *`.
  */
 192 #define GET_SRC_DST_PARAMETERS(type)                            \
 193   hgt = mlib_ImageGetHeight(src);                               \
 194   wid = mlib_ImageGetWidth(src);                                \
 195   nchannel = mlib_ImageGetChannels(src);                        \
 196   sll = mlib_ImageGetStride(src) / sizeof(type);                \
 197   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
 198   adr_src = (type *)mlib_ImageGetData(src);                     \
 199   adr_dst = (type *)mlib_ImageGetData(dst)
 200 
 201 /***************************************************************/
 202 #ifndef __sparc
 203 
 204 #if IMG_TYPE == 1
 205 
 206 /* Test for the presence of any "1" bit in bits
 207    8 to 31 of val. If present, then val is either
 208    negative or >255. If over/underflows of 8 bits
 209    are uncommon, then this technique can be a win,
 210    since only a single test, rather than two, is
 211    necessary to determine if clamping is needed.


 /*
  * CLAMP_STORE(dst, val): stores val into dst saturated to the signed
  * 16-bit range [MLIB_S16_MIN, MLIB_S16_MAX]; in-range values are cast to
  * mlib_s16.
  * NOTE(review): the expansion is a bare if/else-if/else chain (not wrapped
  * in do { } while (0)) and `val` is evaluated more than once without
  * parentheses.  Use it only as a full statement with side-effect-free
  * arguments, as the visible call sites do.
  */
 227 #define CLAMP_STORE(dst, val)                                   \
 228   if (val >= MLIB_S16_MAX)                                      \
 229     dst = MLIB_S16_MAX;                                         \
 230   else if (val <= MLIB_S16_MIN)                                 \
 231     dst = MLIB_S16_MIN;                                         \
 232   else                                                          \
 233     dst = (mlib_s16)val
 234 
 235 #elif IMG_TYPE == 3
 236 
 /*
  * CLAMP_STORE(dst, val): stores val into dst saturated to the unsigned
  * 16-bit range [MLIB_U16_MIN, MLIB_U16_MAX]; in-range values are cast to
  * mlib_u16.
  * NOTE(review): the expansion is a bare if/else-if/else chain (not wrapped
  * in do { } while (0)) and `val` is evaluated more than once without
  * parentheses.  Use it only as a full statement with side-effect-free
  * arguments, as the visible call sites do.
  */
 237 #define CLAMP_STORE(dst, val)                                   \
 238   if (val >= MLIB_U16_MAX)                                      \
 239     dst = MLIB_U16_MAX;                                         \
 240   else if (val <= MLIB_U16_MIN)                                 \
 241     dst = MLIB_U16_MIN;                                         \
 242   else                                                          \
 243     dst = (mlib_u16)val
 244 
 245 #endif /* IMG_TYPE == 1 */
 246 #endif /* __sparc */
 247 
 248 /***************************************************************/
 /*
  * CONV_FUNC(3x3): 3x3 convolution computed in FTYPE arithmetic.
  *
  *   dst          - destination image; only interior pixels are written:
  *                  (width - 2) x (height - 2) results starting one row and
  *                  one pixel in from the destination origin.
  *   src          - source image (supplies width, height, channel count).
  *   kern         - nine integer kernel coefficients, row by row.
  *   scalef_expon - the coefficients are scaled by DSCALE / 2^scalef_expon.
  *   cmask        - channel mask; channel c is processed only when bit
  *                  (nchannel - 1 - c) is set.
  *
  * Returns MLIB_FAILURE if the scratch buffer cannot be allocated,
  * MLIB_SUCCESS otherwise.
  *
  * Source rows are kept, converted to FTYPE, in a ring of four row buffers
  * (buff0..buff2 = the three rows in use, buff3 = the row being loaded for
  * the next pass).  D2I, FROM_S32, STORE2 and DSCALE are defined outside
  * this view.
  */
 249 #define KSIZE  3
 250 
 251 mlib_status CONV_FUNC(3x3)(mlib_image       *dst,
 252                            const mlib_image *src,
 253                            const mlib_s32   *kern,
 254                            mlib_s32         scalef_expon,
 255                            mlib_s32         cmask)
 256 {
 257   FTYPE    buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
 258   DEF_VARS(DTYPE);
 259   DTYPE *sl1;
 260   mlib_s32 chan2;
 261   mlib_s32 *buffo, *buffi;
 262   DTYPE *sl2;
 263 #ifndef __sparc
 264   mlib_s32 d0, d1;
 265 #endif /* __sparc */
 266   LOAD_KERNEL3();
 267   GET_SRC_DST_PARAMETERS(DTYPE);
 268 
       /* the stack array holds BUFF_LINE elements per row; wider images use the heap */
 269   if (wid > BUFF_LINE) {
 270     pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE)*wid);
 271 
 272     if (pbuff == NULL) return MLIB_FAILURE;
 273   }
 274 
       /* four FTYPE rows of `wid` elements, followed by two mlib_s32 areas (buffo, buffi) */
 275   buff0 = pbuff;
 276   buff1 = buff0 + wid;
 277   buff2 = buff1 + wid;
 278   buff3 = buff2 + wid;
 279   buffo = (mlib_s32*)(buff3 + wid);
 280   buffi = buffo + (wid &~ 1);
 281 
 282   chan1 = nchannel;
 283   chan2 = chan1 + chan1;
 284 
       /* from here on wid/hgt are the dimensions of the output region */
 285   wid -= (KSIZE - 1);
 286   hgt -= (KSIZE - 1);
 287 
       /* skip one row and one pixel of the destination */
 288   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
 289 
 290   for (c = 0; c < nchannel; c++) {
 291     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 292 
 293     sl = adr_src + c;
 294     dl = adr_dst + c;
 295 
 296     sl1 = sl  + sll;
 297     sl2 = sl1 + sll;
         /* preload the first three source rows of this channel as FTYPE */
 298 #ifdef __SUNPRO_C
 299 #pragma pipeloop(0)
 300 #endif /* __SUNPRO_C */
 301     for (i = 0; i < wid + (KSIZE - 1); i++) {
 302       buff0[i] = (FTYPE)sl[i*chan1];
 303       buff1[i] = (FTYPE)sl1[i*chan1];
 304       buff2[i] = (FTYPE)sl2[i*chan1];
 305     }
 306 
         /* sl now addresses the row that is loaded into buff3 during the first pass */
         /* NOTE(review): pass j reads source row (KSIZE + j) through sp; on the   */
         /* last pass that index equals the original image height -- confirm the   */
         /* memory following the last row is readable.                             */
 307     sl += KSIZE*sll;
 308 
 309     for (j = 0; j < hgt; j++) {
 310       FTYPE    s0, s1;
 311 
 312       p02 = buff0[0];
 313       p12 = buff1[0];
 314       p22 = buff2[0];
 315 
 316       p03 = buff0[1];
 317       p13 = buff1[1];
 318       p23 = buff2[1];
 319 
           /* s0/s1: contributions of the columns already seen to the next two outputs */
 320       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 321       s1 = p03 * k0 + p13 * k3 + p23 * k6;
 322 
 323       sp = sl;
 324       dp = dl;
 325 
           /* main loop: two output pixels per iteration */
 326 #ifdef __SUNPRO_C
 327 #pragma pipeloop(0)
 328 #endif /* __SUNPRO_C */
 329       for (i = 0; i <= (wid - 2); i += 2) {
 330 #ifdef __sparc
 331 #ifdef _NO_LONGLONG
 332         mlib_s32 o64_1, o64_2;
 333 #else /* _NO_LONGLONG */
 334         mlib_s64 o64;
 335 #endif /* _NO_LONGLONG */
 336 #endif /* __sparc */
 337         d64_2x32 dd;
 338 
 339         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
 340         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
 341 
             /* fetch two samples of the incoming row into buffi, then convert them into buff3 */
             /* (the word is re-read through FTYPE*, so FTYPE is presumably 64 bits wide -- confirm) */
 342         LOAD_BUFF(buffi);
 343 
 344         dd.d64 = *(FTYPE   *)(buffi + i);
 345         buff3[i    ] = (FTYPE)dd.i32s.i0;
 346         buff3[i + 1] = (FTYPE)dd.i32s.i1;
 347 
 348 #ifndef __sparc
             /* finish both sums with the two new columns */
 349         d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
 350         d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
 351 
             /* start the partial sums for the following pair */
 352         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 353         s1 = p03 * k0 + p13 * k3 + p23 * k6;
 354 
 355         dp[0    ] = FROM_S32(d0);
 356         dp[chan1] = FROM_S32(d1);
 357 
 358 #else /* __sparc */
 359 
             /* sparc: both results go through buffo and are stored with STORE2 */
 360         dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
 361         dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
 362         *(FTYPE   *)(buffo + i) = dd.d64;
 363 
 364         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 365         s1 = p03 * k0 + p13 * k3 + p23 * k6;
 366 
 367 #ifdef _NO_LONGLONG
 368 
 369         o64_1 = buffo[i];
 370         o64_2 = buffo[i+1];
 371 #if IMG_TYPE != 1
 372         STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
 373 #else
 374         STORE2(o64_1 >> 24, o64_2 >> 24);
 375 #endif /* IMG_TYPE != 1 */
 376 
 377 #else /* _NO_LONGLONG */
 378 
 379         o64 = *(mlib_s64*)(buffo + i);
 380 #if IMG_TYPE != 1
 381         STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
 382 #else
 383         STORE2(o64 >> 56, o64 >> 24);
 384 #endif /* IMG_TYPE != 1 */
 385 #endif /* _NO_LONGLONG */
 386 #endif /* __sparc */
 387 
 388         sp += chan2;
 389         dp += chan2;
 390       }
 391 
           /* leftover pixel when wid is odd: full nine-tap sum */
 392       for (; i < wid; i++) {
 393         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];
 394         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
 395         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
 396 
 397         buffi[i] = (mlib_s32)sp[0];
 398         buff3[i] = (FTYPE)buffi[i];
 399 
 400 #ifndef __sparc
 401 
 402         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
 403                  p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
 404 
 405         dp[0] = FROM_S32(d0);
 406 
 407 #else  /* __sparc */
 408 
 409         buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
 410                        p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
 411 #if IMG_TYPE != 1
 412         dp[0] = FROM_S32(buffo[i]);
 413 #else
 414         dp[0] = buffo[i] >> 24;
 415 #endif /* IMG_TYPE != 1 */
 416 #endif /* __sparc */
 417 
 418         sp += chan1;
 419         dp += chan1;
 420       }
 421 
           /* load the last two columns of the incoming row */
 422       buffi[wid] = (mlib_s32)sp[0];
 423       buff3[wid] = (FTYPE)buffi[wid];
 424       buffi[wid + 1] = (mlib_s32)sp[chan1];
 425       buff3[wid + 1] = (FTYPE)buffi[wid + 1];
 426 
 427       sl += sll;
 428       dl += dll;
 429 
           /* rotate the ring: the row just loaded becomes the bottom row of the next pass */
 430       buffT = buff0;
 431       buff0 = buff1;
 432       buff1 = buff2;
 433       buff2 = buff3;
 434       buff3 = buffT;
 435     }
 436   }
 437 
 438 #ifdef __sparc
 439 #if IMG_TYPE == 1
       /* sparc, IMG_TYPE == 1 only: post-process the written region with mlib_ImageXor80 */
 440   {
 441     mlib_s32 amask = (1 << nchannel) - 1;
 442 
 443     if ((cmask & amask) != amask) {
 444       mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
 445     } else {
 446       mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
 447     }
 448   }
 449 
 450 #endif /* IMG_TYPE == 1 */
 451 #endif /* __sparc */
 452 
       /* release the scratch buffer only if it came from mlib_malloc */
 453   if (pbuff != buff) mlib_free(pbuff);
 454 
 455   return MLIB_SUCCESS;
 456 }
 457 
 458 /***************************************************************/
 /*
  * CONV_FUNC_I(3x3): 3x3 convolution using mlib_s32 arithmetic only
  * (compiled when __sparc is not defined).
  *
  *   dst, src     - destination / source images; only interior pixels of
  *                  dst are written ((width - 2) x (height - 2) results,
  *                  starting one row and one pixel in).
  *   kern         - nine integer kernel coefficients, row by row.
  *   scalef_expon - scaling exponent; see the shift description below.
  *   cmask        - channel mask; channel c is processed only when bit
  *                  (chan1 - 1 - c) is set.
  *
  * Always returns MLIB_SUCCESS.
  *
  * Each coefficient is shifted right by shift1 (16, or 8 when
  * IMG_TYPE == 1) before use, and each accumulated sum is shifted right by
  * shift2 = scalef_expon - shift1 before being saturated by CLAMP_STORE.
  * Pixels are read straight from the source rows; no scratch buffer is
  * used.
  * NOTE(review): assumes 0 <= shift2 < 32, i.e. scalef_expon >= shift1 --
  * confirm that callers guarantee this.
  */
 459 #ifndef __sparc /* for x86, using integer multiplies is faster */
 460 
 461 mlib_status CONV_FUNC_I(3x3)(mlib_image       *dst,
 462                              const mlib_image *src,
 463                              const mlib_s32   *kern,
 464                              mlib_s32         scalef_expon,
 465                              mlib_s32         cmask)
 466 {
 467   DTYPE    *adr_src, *sl, *sp0, *sp1, *sp2;
 468   DTYPE    *adr_dst, *dl, *dp;
 469   mlib_s32 wid, hgt, sll, dll;
 470   mlib_s32 nchannel, chan1, chan2;
 471   mlib_s32 i, j, c;
 472   mlib_s32 shift1, shift2;
 473   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
 474   mlib_s32 p02, p03,
 475            p12, p13,
 476            p22, p23;
 477 
 478 #if IMG_TYPE != 1
 479   shift1 = 16;
 480 #else
 481   shift1 = 8;
 482 #endif /* IMG_TYPE != 1 */
 483 
 484   shift2 = scalef_expon - shift1;
 485 
 486   /* keep kernel in regs */
 487   k0 = kern[0] >> shift1;  k1 = kern[1] >> shift1;  k2 = kern[2] >> shift1;
 488   k3 = kern[3] >> shift1;  k4 = kern[4] >> shift1;  k5 = kern[5] >> shift1;
 489   k6 = kern[6] >> shift1;  k7 = kern[7] >> shift1;  k8 = kern[8] >> shift1;
 490 
 491   GET_SRC_DST_PARAMETERS(DTYPE);
 492 
 493   chan1 = nchannel;
 494   chan2 = chan1 + chan1;
 495 
       /* from here on wid/hgt are the dimensions of the output region */
 496   wid -= (KSIZE - 1);
 497   hgt -= (KSIZE - 1);
 498 
       /* skip one row and one pixel of the destination */
 499   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
 500 
 501   for (c = 0; c < chan1; c++) {
 502     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 503 
 504     sl = adr_src + c;
 505     dl = adr_dst + c;
 506 
 507     for (j = 0; j < hgt; j++) {
 508       mlib_s32 s0, s1;
 509       mlib_s32 pix0, pix1;
 510 
           /* sp0/sp1/sp2 walk the three source rows covered by the kernel */
 511       dp  = dl;
 512       sp0 = sl;
 513       sp1 = sp0 + sll;
 514       sp2 = sp1 + sll;
 515 
 516       p02 = sp0[0];
 517       p12 = sp1[0];
 518       p22 = sp2[0];
 519 
 520       p03 = sp0[chan1];
 521       p13 = sp1[chan1];
 522       p23 = sp2[chan1];
 523 
           /* s0/s1: contributions of the first two columns to the first two outputs */
 524       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 525       s1 = p03 * k0 + p13 * k3 + p23 * k6;
 526 
 527       sp0 += chan2;
 528       sp1 += chan2;
 529       sp2 += chan2;
 530 
           /* main loop: two output pixels per iteration */
 531 #ifdef __SUNPRO_C
 532 #pragma pipeloop(0)
 533 #endif /* __SUNPRO_C */
 534       for (i = 0; i <= (wid - 2); i += 2) {
 535         p02 = sp0[0];     p12 = sp1[0];     p22 = sp2[0];
 536         p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
 537 
 538         pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
 539         pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
 540                 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
 541 
 542         CLAMP_STORE(dp[0],     pix0);
 543         CLAMP_STORE(dp[chan1], pix1);
 544 
             /* start the partial sums for the following pair */
 545         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 546         s1 = p03 * k0 + p13 * k3 + p23 * k6;
 547 
 548         sp0 += chan2;
 549         sp1 += chan2;
 550         sp2 += chan2;
 551         dp += chan2;
 552       }
 553 
           /* odd output width: one more pixel, completed from the pending s0 */
 554       if (wid & 1) {
 555         p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
 556         pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
 557         CLAMP_STORE(dp[0], pix0);
 558       }
 559 
 560       sl += sll;
 561       dl += dll;
 562     }
 563   }
 564 
 565   return MLIB_SUCCESS;
 566 }
 567 
 568 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
 569 
 570 /***************************************************************/
 /*
  * CONV_FUNC(4x4): 4x4 convolution computed in FTYPE arithmetic.
  *
  *   dst          - destination image; only interior pixels are written:
  *                  (width - 3) x (height - 3) results starting one row and
  *                  one pixel in from the destination origin.
  *   src          - source image (supplies width, height, channel count).
  *   kern         - sixteen integer kernel coefficients, row by row.
  *   scalef_expon - the coefficients are scaled by DSCALE / 2^scalef_expon.
  *   cmask        - channel mask; channel c is processed only when bit
  *                  (nchannel - 1 - c) is set.
  *
  * Returns MLIB_FAILURE if the scratch buffer cannot be allocated,
  * MLIB_SUCCESS otherwise.
  *
  * Each output row is produced in two passes: kernel rows 0-1 are
  * accumulated into buffd, then kernel rows 2-3 are added and the result
  * is stored.  Source rows live in a ring of five FTYPE row buffers
  * (buff0..buff3 in use, buff4 receiving the next row).
  */
 571 #undef  KSIZE
 572 #define KSIZE 4
 573 
 574 mlib_status CONV_FUNC(4x4)(mlib_image       *dst,
 575                            const mlib_image *src,
 576                            const mlib_s32   *kern,
 577                            mlib_s32         scalef_expon,
 578                            mlib_s32         cmask)
 579 {
 580   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
 581   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
 582   FTYPE    k[KSIZE*KSIZE];
 583   mlib_s32 d0, d1;
 584   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7;
 585   FTYPE    p00, p01, p02, p03, p04,
 586            p10, p11, p12, p13, p14,
 587            p20, p21, p22, p23,
 588            p30, p31, p32, p33;
 589   DEF_VARS(DTYPE);
 590   DTYPE *sl1;
 591   mlib_s32 chan2;
 592   mlib_s32 *buffo, *buffi;
 593   DTYPE *sl2, *sl3;
 594   LOAD_KERNEL(KSIZE*KSIZE);
 595   GET_SRC_DST_PARAMETERS(DTYPE);
 596 
       /* the stack array holds BUFF_LINE elements per row; wider images use the heap */
 597   if (wid > BUFF_LINE) {
 598     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
 599 
 600     if (pbuff == NULL) return MLIB_FAILURE;
 601   }
 602 
       /* six FTYPE rows of `wid` elements, followed by two mlib_s32 areas (buffo, buffi) */
 603   buff0 = pbuff;
 604   buff1 = buff0 + wid;
 605   buff2 = buff1 + wid;
 606   buff3 = buff2 + wid;
 607   buff4 = buff3 + wid;
 608   buffd = buff4 + wid;
 609   buffo = (mlib_s32*)(buffd + wid);
 610   buffi = buffo + (wid &~ 1);
 611 
 612   chan1 = nchannel;
 613   chan2 = chan1 + chan1;
 614 
       /* from here on wid/hgt are the dimensions of the output region */
 615   wid -= (KSIZE - 1);
 616   hgt -= (KSIZE - 1);
 617 
       /* (KSIZE - 1)/2 == 1: skip one row and one pixel of the destination */
 618   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
 619 
 620   for (c = 0; c < nchannel; c++) {
 621     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 622 
 623     sl = adr_src + c;
 624     dl = adr_dst + c;
 625 
 626     sl1 = sl  + sll;
 627     sl2 = sl1 + sll;
 628     sl3 = sl2 + sll;
         /* preload the first four source rows of this channel as FTYPE */
 629 #ifdef __SUNPRO_C
 630 #pragma pipeloop(0)
 631 #endif /* __SUNPRO_C */
 632     for (i = 0; i < wid + (KSIZE - 1); i++) {
 633       buff0[i] = (FTYPE)sl[i*chan1];
 634       buff1[i] = (FTYPE)sl1[i*chan1];
 635       buff2[i] = (FTYPE)sl2[i*chan1];
 636       buff3[i] = (FTYPE)sl3[i*chan1];
 637     }
 638 
         /* sl now addresses the row that is loaded into buff4 during the first pass */
         /* NOTE(review): pass j reads source row (KSIZE + j) through sp; on the   */
         /* last pass that index equals the original image height -- confirm the   */
         /* memory following the last row is readable.                             */
 639     sl += KSIZE*sll;
 640 
 641     for (j = 0; j < hgt; j++) {
 642       d64_2x32 dd;
 643 
 644       /*
 645        *  First loop on two first lines of kernel
 646        */
 647       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
 648       k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
 649 
 650       sp = sl;
 651       dp = dl;
 652 
 653       p02 = buff0[0];
 654       p12 = buff1[0];
 655       p03 = buff0[1];
 656       p13 = buff1[1];
 657       p04 = buff0[2];
 658 
 659 #ifdef __SUNPRO_C
 660 #pragma pipeloop(0)
 661 #endif /* __SUNPRO_C */
 662       for (i = 0; i <= (wid - 2); i += 2) {
 663         p00 = p02; p10 = p12;
 664         p01 = p03; p11 = p13;
 665         p02 = p04; p12 = buff1[i + 2];
 666         p03 = buff0[i + 3]; p13 = buff1[i + 3];
 667         p04 = buff0[i + 4]; p14 = buff1[i + 4];
 668 
             /* fetch two samples of the incoming row into buffi, then convert them into buff4 */
 669         LOAD_BUFF(buffi);
 670 
 671         dd.d64 = *(FTYPE   *)(buffi + i);
 672         buff4[i    ] = (FTYPE)dd.i32s.i0;
 673         buff4[i + 1] = (FTYPE)dd.i32s.i1;
 674 
             /* partial sums over kernel rows 0 and 1 */
 675         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
 676                         p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
 677         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
 678                         p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
 679 
 680         sp += chan2;
 681         dp += chan2;
 682       }
 683 
 684       /*
 685        *  Second loop on two last lines of kernel
 686        */
 687       k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
 688       k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
 689 
 690       sp = sl;
 691       dp = dl;
 692 
 693       p02 = buff2[0];
 694       p12 = buff3[0];
 695       p03 = buff2[1];
 696       p13 = buff3[1];
 697       p04 = buff2[2];
 698 
 699 #ifdef __SUNPRO_C
 700 #pragma pipeloop(0)
 701 #endif /* __SUNPRO_C */
 702       for (i = 0; i <= (wid - 2); i += 2) {
 703         p00 = p02; p10 = p12;
 704         p01 = p03; p11 = p13;
 705         p02 = p04; p12 = buff3[i + 2];
 706         p03 = buff2[i + 3]; p13 = buff3[i + 3];
 707         p04 = buff2[i + 4]; p14 = buff3[i + 4];
 708 
             /* add kernel rows 2 and 3 to the stored partial sums and convert */
 709         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
 710                  p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
 711         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
 712                  p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
 713 
 714         dp[0    ] = FROM_S32(d0);
 715         dp[chan1] = FROM_S32(d1);
 716 
 717         sp += chan2;
 718         dp += chan2;
 719       }
 720 
 721       /* last pixels */
           /* (leftover pixel when wid is odd: full sixteen-tap sum straight from k[]) */
 722       for (; i < wid; i++) {
 723         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
 724         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
 725         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
 726         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
 727 
 728         buff4[i] = (FTYPE)sp[0];
 729 
 730         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
 731                        p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
 732                        p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
 733                        p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
 734 
 735         dp[0] = FROM_S32(buffo[i]);
 736 
 737         sp += chan1;
 738         dp += chan1;
 739       }
 740 
           /* load the last three columns of the incoming row */
 741       buff4[wid    ] = (FTYPE)sp[0];
 742       buff4[wid + 1] = (FTYPE)sp[chan1];
 743       buff4[wid + 2] = (FTYPE)sp[chan2];
 744 
 745       /* next line */
 746       sl += sll;
 747       dl += dll;
 748 
           /* rotate the ring: the row just loaded becomes the bottom row of the next pass */
 749       buffT = buff0;
 750       buff0 = buff1;
 751       buff1 = buff2;
 752       buff2 = buff3;
 753       buff3 = buff4;
 754       buff4 = buffT;
 755     }
 756   }
 757 
       /* release the scratch buffer only if it came from mlib_malloc */
 758   if (pbuff != buff) mlib_free(pbuff);
 759 
 760   return MLIB_SUCCESS;
 761 }
 762 
 763 /***************************************************************/
 /*
  * CONV_FUNC(5x5): 5x5 convolution computed in FTYPE arithmetic.
  *
  *   dst          - destination image; only interior pixels are written:
  *                  (width - 4) x (height - 4) results starting two rows
  *                  and two pixels in from the destination origin.
  *   src          - source image (supplies width, height, channel count).
  *   kern         - twenty-five integer kernel coefficients, row by row.
  *   scalef_expon - the coefficients are scaled by DSCALE / 2^scalef_expon.
  *   cmask        - channel mask; channel c is processed only when bit
  *                  (nchannel - 1 - c) is set.
  *
  * Returns MLIB_FAILURE if the scratch buffer cannot be allocated,
  * MLIB_SUCCESS otherwise.
  *
  * Each output row is produced in three passes: kernel rows 0-1 are
  * written to buffd, kernel rows 2-3 are added to buffd, and kernel row 4
  * is added while the result is stored.  Source rows live in a ring of six
  * FTYPE row buffers (buff0..buff4 in use, buff5 receiving the next row).
  */
 764 #undef  KSIZE
 765 #define KSIZE 5
 766 
 767 mlib_status CONV_FUNC(5x5)(mlib_image       *dst,
 768                            const mlib_image *src,
 769                            const mlib_s32   *kern,
 770                            mlib_s32         scalef_expon,
 771                            mlib_s32         cmask)
 772 {
 773   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
 774   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
 775   FTYPE    k[KSIZE*KSIZE];
 776   mlib_s32 d0, d1;
 777   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
 778   FTYPE    p00, p01, p02, p03, p04, p05,
 779            p10, p11, p12, p13, p14, p15,
 780            p20, p21, p22, p23, p24,
 781            p30, p31, p32, p33, p34,
 782            p40, p41, p42, p43, p44;
 783   DEF_VARS(DTYPE);
 784   DTYPE *sl1;
 785   mlib_s32 chan2;
 786   mlib_s32 *buffo, *buffi;
 787   DTYPE *sl2, *sl3, *sl4;
 788   LOAD_KERNEL(KSIZE*KSIZE);
 789   GET_SRC_DST_PARAMETERS(DTYPE);
 790 
       /* the stack array holds BUFF_LINE elements per row; wider images use the heap */
 791   if (wid > BUFF_LINE) {
 792     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
 793 
 794     if (pbuff == NULL) return MLIB_FAILURE;
 795   }
 796 
       /* seven FTYPE rows of `wid` elements, followed by two mlib_s32 areas (buffo, buffi) */
 797   buff0 = pbuff;
 798   buff1 = buff0 + wid;
 799   buff2 = buff1 + wid;
 800   buff3 = buff2 + wid;
 801   buff4 = buff3 + wid;
 802   buff5 = buff4 + wid;
 803   buffd = buff5 + wid;
 804   buffo = (mlib_s32*)(buffd + wid);
 805   buffi = buffo + (wid &~ 1);
 806 
 807   chan1 = nchannel;
 808   chan2 = chan1 + chan1;
 809 
       /* from here on wid/hgt are the dimensions of the output region */
 810   wid -= (KSIZE - 1);
 811   hgt -= (KSIZE - 1);
 812 
       /* (KSIZE - 1)/2 == 2: skip two rows and two pixels of the destination */
 813   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
 814 
 815   for (c = 0; c < nchannel; c++) {
 816     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 817 
 818     sl = adr_src + c;
 819     dl = adr_dst + c;
 820 
 821     sl1 = sl  + sll;
 822     sl2 = sl1 + sll;
 823     sl3 = sl2 + sll;
 824     sl4 = sl3 + sll;
         /* preload the first five source rows of this channel as FTYPE */
 825 #ifdef __SUNPRO_C
 826 #pragma pipeloop(0)
 827 #endif /* __SUNPRO_C */
 828     for (i = 0; i < wid + (KSIZE - 1); i++) {
 829       buff0[i] = (FTYPE)sl[i*chan1];
 830       buff1[i] = (FTYPE)sl1[i*chan1];
 831       buff2[i] = (FTYPE)sl2[i*chan1];
 832       buff3[i] = (FTYPE)sl3[i*chan1];
 833       buff4[i] = (FTYPE)sl4[i*chan1];
 834     }
 835 
         /* sl now addresses the row that is loaded into buff5 during the first pass */
         /* NOTE(review): pass j reads source row (KSIZE + j) through sp; on the   */
         /* last pass that index equals the original image height -- confirm the   */
         /* memory following the last row is readable.                             */
 836     sl += KSIZE*sll;
 837 
 838     for (j = 0; j < hgt; j++) {
 839       d64_2x32 dd;
 840 
 841       /*
 842        *  First loop
 843        */
           /* kernel rows 0 and 1 -> buffd; also fetches the incoming row into buffi */
 844       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
 845       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
 846 
 847       sp = sl;
 848       dp = dl;
 849 
 850       p02 = buff0[0];
 851       p12 = buff1[0];
 852       p03 = buff0[1];
 853       p13 = buff1[1];
 854       p04 = buff0[2];
 855       p14 = buff1[2];
 856 
 857 #ifdef __SUNPRO_C
 858 #pragma pipeloop(0)
 859 #endif /* __SUNPRO_C */
 860       for (i = 0; i <= (wid - 2); i += 2) {
 861         p00 = p02; p10 = p12;
 862         p01 = p03; p11 = p13;
 863         p02 = p04; p12 = p14;
 864 
 865         LOAD_BUFF(buffi);
 866 
 867         p03 = buff0[i + 3]; p13 = buff1[i + 3];
 868         p04 = buff0[i + 4]; p14 = buff1[i + 4];
 869         p05 = buff0[i + 5]; p15 = buff1[i + 5];
 870 
 871         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
 872                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
 873         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
 874                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
 875 
 876         sp += chan2;
 877         dp += chan2;
 878       }
 879 
 880       /*
 881        *  Second loop
 882        */
           /* kernel rows 2 and 3 are added to buffd; buffi is converted into buff5 */
 883       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
 884       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
 885 
 886       sp = sl;
 887       dp = dl;
 888 
 889       p02 = buff2[0];
 890       p12 = buff3[0];
 891       p03 = buff2[1];
 892       p13 = buff3[1];
 893       p04 = buff2[2];
 894       p14 = buff3[2];
 895 
 896 #ifdef __SUNPRO_C
 897 #pragma pipeloop(0)
 898 #endif /* __SUNPRO_C */
 899       for (i = 0; i <= (wid - 2); i += 2) {
 900         p00 = p02; p10 = p12;
 901         p01 = p03; p11 = p13;
 902 
 903         p02 = buff2[i + 2]; p12 = buff3[i + 2];
 904         p03 = buff2[i + 3]; p13 = buff3[i + 3];
 905         p04 = buff2[i + 4]; p14 = buff3[i + 4];
 906         p05 = buff2[i + 5]; p15 = buff3[i + 5];
 907 
 908         dd.d64 = *(FTYPE   *)(buffi + i);
 909         buff5[i    ] = (FTYPE)dd.i32s.i0;
 910         buff5[i + 1] = (FTYPE)dd.i32s.i1;
 911 
 912         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
 913                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
 914         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
 915                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
 916 
 917         sp += chan2;
 918         dp += chan2;
 919       }
 920 
 921       /*
 922        *  3 loop
 923        */
           /* kernel row 4 is added to buffd and the results are stored */
 924       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
 925 
 926       sp = sl;
 927       dp = dl;
 928 
 929       p02 = buff4[0];
 930       p03 = buff4[1];
 931       p04 = buff4[2];
 932       p05 = buff4[3];
 933 
 934 #ifdef __SUNPRO_C
 935 #pragma pipeloop(0)
 936 #endif /* __SUNPRO_C */
 937       for (i = 0; i <= (wid - 2); i += 2) {
 938         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
 939 
 940         p04 = buff4[i + 4]; p05 = buff4[i + 5];
 941 
 942         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
 943         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
 944 
 945         dp[0    ] = FROM_S32(d0);
 946         dp[chan1] = FROM_S32(d1);
 947 
 948         sp += chan2;
 949         dp += chan2;
 950       }
 951 
 952       /* last pixels */
           /* (leftover pixel when wid is odd: full twenty-five-tap sum straight from k[]) */
 953       for (; i < wid; i++) {
 954         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
 955         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
 956         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
 957         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
 958         p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
 959 
 960         p40 = buff4[i];     p41 = buff4[i + 1]; p42 = buff4[i + 2];
 961         p43 = buff4[i + 3]; p44 = buff4[i + 4];
 962 
 963         buff5[i] = (FTYPE)sp[0];
 964 
 965         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
 966                        p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
 967                        p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
 968                        p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
 969                        p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
 970 
 971         dp[0] = FROM_S32(buffo[i]);
 972 
 973         sp += chan1;
 974         dp += chan1;
 975       }
 976 
           /* load the last four columns of the incoming row */
 977       buff5[wid    ] = (FTYPE)sp[0];
 978       buff5[wid + 1] = (FTYPE)sp[chan1];
 979       buff5[wid + 2] = (FTYPE)sp[chan2];
 980       buff5[wid + 3] = (FTYPE)sp[chan2 + chan1];
 981 
 982       /* next line */
 983       sl += sll;
 984       dl += dll;
 985 
           /* rotate the ring: the row just loaded becomes the bottom row of the next pass */
 986       buffT = buff0;
 987       buff0 = buff1;
 988       buff1 = buff2;
 989       buff2 = buff3;
 990       buff3 = buff4;
 991       buff4 = buff5;
 992       buff5 = buffT;
 993     }
 994   }
 995 
       /* release the scratch buffer only if it came from mlib_malloc */
 996   if (pbuff != buff) mlib_free(pbuff);
 997 
 998   return MLIB_SUCCESS;
 999 }
1000 
1001 /***************************************************************/
1002 #ifndef __sparc /* for x86, using integer multiplies is faster */
1003 
1004 mlib_status CONV_FUNC_I(5x5)(mlib_image       *dst,
1005                              const mlib_image *src,
1006                              const mlib_s32   *kern,
1007                              mlib_s32         scalef_expon,
1008                              mlib_s32         cmask)
1009 {
1010   mlib_s32 buff[BUFF_LINE];
1011   mlib_s32 *buffd;
1012   mlib_s32 k[KSIZE*KSIZE];
1013   mlib_s32 shift1, shift2;
1014   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
1015   mlib_s32 p00, p01, p02, p03, p04, p05,
1016            p10, p11, p12, p13, p14, p15;
1017   DTYPE    *adr_src, *sl, *sp0, *sp1;
1018   DTYPE    *adr_dst, *dl, *dp;
1019   mlib_s32 *pbuff = buff;
1020   mlib_s32 wid, hgt, sll, dll;
1021   mlib_s32 nchannel, chan1, chan2, chan3, chan4;
1022   mlib_s32 i, j, c;
1023 
1024 #if IMG_TYPE != 1
1025   shift1 = 16;
1026 #else
1027   shift1 = 8;
1028 #endif /* IMG_TYPE != 1 */
1029 
1030   shift2 = scalef_expon - shift1;
1031 
1032   for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;
1033 
1034   GET_SRC_DST_PARAMETERS(DTYPE);
1035 
1036   if (wid > BUFF_LINE) {
1037     pbuff = mlib_malloc(sizeof(mlib_s32)*wid);
1038 
1039     if (pbuff == NULL) return MLIB_FAILURE;
1040   }
1041 
1042   buffd = pbuff;
1043 
1044   chan1 = nchannel;
1045   chan2 = chan1 + chan1;
1046   chan3 = chan2 + chan1;
1047   chan4 = chan3 + chan1;
1048 
1049   wid -= (KSIZE - 1);
1050   hgt -= (KSIZE - 1);
1051 
1052   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
1053 
1054   for (c = 0; c < chan1; c++) {
1055     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1056 
1057     sl = adr_src + c;
1058     dl = adr_dst + c;
1059 
1060     for (j = 0; j < hgt; j++) {
1061       mlib_s32 pix0, pix1;
1062       /*
1063        *  First loop
1064        */
1065       sp0 = sl;
1066       sp1 = sp0 + sll;
1067       dp = dl;
1068 
1069       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
1070       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
1071 
1072       p02 = sp0[0];     p12 = sp1[0];
1073       p03 = sp0[chan1]; p13 = sp1[chan1];
1074       p04 = sp0[chan2]; p14 = sp1[chan2];
1075       p05 = sp0[chan3]; p15 = sp1[chan3];
1076 
1077       sp0 += chan4;
1078       sp1 += chan4;
1079 
1080 #ifdef __SUNPRO_C
1081 #pragma pipeloop(0)
1082 #endif /* __SUNPRO_C */
1083       for (i = 0; i <= (wid - 2); i += 2) {
1084         p00 = p02; p10 = p12;
1085         p01 = p03; p11 = p13;
1086         p02 = p04; p12 = p14;
1087         p03 = p05; p13 = p15;
1088 
1089         p04 = sp0[0];     p14 = sp1[0];
1090         p05 = sp0[chan1]; p15 = sp1[chan1];
1091 
1092         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1093                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1094         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1095                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1096 
1097         sp0 += chan2;
1098         sp1 += chan2;
1099         dp += chan2;
1100       }
1101 
1102       if (wid & 1) {
1103         p00 = p02; p10 = p12;
1104         p01 = p03; p11 = p13;
1105         p02 = p04; p12 = p14;
1106         p03 = p05; p13 = p15;
1107 
1108         p04 = sp0[0];     p14 = sp1[0];
1109 
1110         buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1111                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1112       }
1113 
1114       /*
1115        *  Second loop
1116        */
1117       sp0 = sl + 2*sll;
1118       sp1 = sp0 + sll;
1119       dp = dl;
1120 
1121       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
1122       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
1123 
1124       p02 = sp0[0];     p12 = sp1[0];
1125       p03 = sp0[chan1]; p13 = sp1[chan1];
1126       p04 = sp0[chan2]; p14 = sp1[chan2];
1127       p05 = sp0[chan3]; p15 = sp1[chan3];
1128 
1129       sp0 += chan4;
1130       sp1 += chan4;
1131 
1132 #ifdef __SUNPRO_C
1133 #pragma pipeloop(0)
1134 #endif /* __SUNPRO_C */
1135       for (i = 0; i <= (wid - 2); i += 2) {
1136         p00 = p02; p10 = p12;
1137         p01 = p03; p11 = p13;
1138         p02 = p04; p12 = p14;
1139         p03 = p05; p13 = p15;
1140 
1141         p04 = sp0[0];     p14 = sp1[0];
1142         p05 = sp0[chan1]; p15 = sp1[chan1];
1143 
1144         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1145                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1146         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1147                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1148 
1149         sp0 += chan2;
1150         sp1 += chan2;
1151         dp += chan2;
1152       }
1153 
1154       if (wid & 1) {
1155         p00 = p02; p10 = p12;
1156         p01 = p03; p11 = p13;
1157         p02 = p04; p12 = p14;
1158         p03 = p05; p13 = p15;
1159 
1160         p04 = sp0[0];     p14 = sp1[0];
1161 
1162         buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1163                      p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1164       }
1165 
1166       /*
1167        *  3 loop
1168        */
1169       dp = dl;
1170       sp0 = sl + 4*sll;
1171 
1172       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1173 
1174       p02 = sp0[0];
1175       p03 = sp0[chan1];
1176       p04 = sp0[chan2];
1177       p05 = sp0[chan3];
1178 
1179       sp0 += chan2 + chan2;
1180 
1181 #ifdef __SUNPRO_C
1182 #pragma pipeloop(0)
1183 #endif /* __SUNPRO_C */
1184       for (i = 0; i <= (wid - 2); i += 2) {
1185         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1186 
1187         p04 = sp0[0]; p05 = sp0[chan1];
1188 
1189         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
1190                 p03 * k3 + p04 * k4) >> shift2;
1191         pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
1192                 p04 * k3 + p05 * k4) >> shift2;
1193 
1194         CLAMP_STORE(dp[0],     pix0);
1195         CLAMP_STORE(dp[chan1], pix1);
1196 
1197         dp  += chan2;
1198         sp0 += chan2;
1199       }
1200 
1201       if (wid & 1) {
1202         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1203 
1204         p04 = sp0[0];
1205 
1206         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
1207                 p03 * k3 + p04 * k4) >> shift2;
1208         CLAMP_STORE(dp[0],     pix0);
1209       }
1210 
1211       /* next line */
1212       sl += sll;
1213       dl += dll;
1214     }
1215   }
1216 
1217   if (pbuff != buff) mlib_free(pbuff);
1218 
1219   return MLIB_SUCCESS;
1220 }
1221 
1222 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1223 
1224 /***************************************************************/
1225 #if IMG_TYPE == 1
1226 
1227 #undef  KSIZE
1228 #define KSIZE 7
1229 
/*
 * 7x7 convolution of one image into another (this routine is only built
 * when IMG_TYPE == 1).
 *
 *   dst          - destination image
 *   src          - source image; width, height and channel count are taken
 *                  from it
 *   kern         - kernel coefficients, consumed by LOAD_KERNEL
 *   scalef_expon - scaling exponent, consumed by LOAD_KERNEL
 *   cmask        - channel mask; channel c is processed only when bit
 *                  (nchannel - 1 - c) is set
 *
 * Returns MLIB_FAILURE when the scratch buffer cannot be allocated and
 * MLIB_SUCCESS otherwise.
 *
 * Only (wid - 6) x (hgt - 6) output pixels are written, starting 3 rows
 * down and 3 pixels in from the destination origin.
 *
 * NOTE(review): LOAD_KERNEL, D2I and FROM_S32 are defined outside this
 * block; presumably LOAD_KERNEL fills k[] from kern/scalef_expon and
 * D2I/FROM_S32 convert an accumulated sum to a stored DTYPE sample -
 * confirm against their definitions.
 */
1230 mlib_status CONV_FUNC(7x7)(mlib_image       *dst,
1231                            const mlib_image *src,
1232                            const mlib_s32   *kern,
1233                            mlib_s32         scalef_expon,
1234                            mlib_s32         cmask)
1235 {
       /* buff is the on-stack scratch area used when wid <= BUFF_LINE; it is
          carved into KSIZE + 3 regions of wid elements further down. */
1236   FTYPE    buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
1237   FTYPE    k[KSIZE*KSIZE];
1238   mlib_s32 l, m, buff_ind;
1239   mlib_s32 d0, d1;
1240   FTYPE    k0, k1, k2, k3, k4, k5, k6;
1241   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
1242   DTYPE *sl2, *sl3, *sl4, *sl5, *sl6;
1243   DEF_VARS(DTYPE);
1244   DTYPE *sl1;
1245   mlib_s32 chan2;
1246   mlib_s32 *buffo, *buffi;
1247   LOAD_KERNEL(KSIZE*KSIZE);
1248   GET_SRC_DST_PARAMETERS(DTYPE);
1249 
       /* Images wider than BUFF_LINE get a heap scratch area with the same
          (KSIZE + 3)-region layout. */
1250   if (wid > BUFF_LINE) {
1251     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
1252 
1253     if (pbuff == NULL) return MLIB_FAILURE;
1254   }
1255 
       /* Regions 0..KSIZE are KSIZE + 1 row buffers of wid elements each.
          The pointer table is stored twice so that (buffs + buff_ind) always
          exposes KSIZE + 1 consecutive entries without wrapping. */
1256   for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*wid;
1257   for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
       /* The next region is the per-row accumulator buffd.  The last region is
          viewed as mlib_s32 (buffo); buffi starts (wid & ~1) mlib_s32 elements
          into it and is the staging area written by LOAD_BUFF. */
1258   buffd = buffs[KSIZE] + wid;
1259   buffo = (mlib_s32*)(buffd + wid);
1260   buffi = buffo + (wid &~ 1);
1261 
1262   chan1 = nchannel;
1263   chan2 = chan1 + chan1;
1264 
       /* From here on wid and hgt count output pixels per row and output rows. */
1265   wid -= (KSIZE - 1);
1266   hgt -= (KSIZE - 1);
1267 
       /* The first output pixel lies (KSIZE - 1)/2 rows down and
          (KSIZE - 1)/2 pixels in. */
1268   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
1269 
1270   for (c = 0; c < nchannel; c++) {
         /* Skip channels whose bit is clear in cmask. */
1271     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1272 
1273     sl = adr_src + c;
1274     dl = adr_dst + c;
1275 
1276     sl1 = sl  + sll;
1277     sl2 = sl1 + sll;
1278     sl3 = sl2 + sll;
1279     sl4 = sl3 + sll;
1280     sl5 = sl4 + sll;
1281     sl6 = sl5 + sll;
         /* Convert the first KSIZE source rows of this channel (all
            wid + KSIZE - 1 columns) to FTYPE in row buffers 0..KSIZE-1. */
1282 #ifdef __SUNPRO_C
1283 #pragma pipeloop(0)
1284 #endif /* __SUNPRO_C */
1285     for (i = 0; i < wid + (KSIZE - 1); i++) {
1286       buffs[0][i] = (FTYPE)sl[i*chan1];
1287       buffs[1][i] = (FTYPE)sl1[i*chan1];
1288       buffs[2][i] = (FTYPE)sl2[i*chan1];
1289       buffs[3][i] = (FTYPE)sl3[i*chan1];
1290       buffs[4][i] = (FTYPE)sl4[i*chan1];
1291       buffs[5][i] = (FTYPE)sl5[i*chan1];
1292       buffs[6][i] = (FTYPE)sl6[i*chan1];
1293     }
1294 
1295     buff_ind = 0;
1296 
         /* Clear the accumulator before the first output row. */
1297 #ifdef __SUNPRO_C
1298 #pragma pipeloop(0)
1299 #endif /* __SUNPRO_C */
1300     for (i = 0; i < wid; i++) buffd[i] = 0.0;
1301 
         /* sl now addresses the first source row that is not buffered yet. */
1302     sl += KSIZE*sll;
1303 
1304     for (j = 0; j < hgt; j++) {
           /* buffc[0..KSIZE-1] are the KSIZE buffered rows feeding this output
              row; buffn (= buffc[KSIZE]) is the spare buffer that receives the
              source row at sl while this output row is produced. */
1305       FTYPE    **buffc = buffs + buff_ind;
1306       FTYPE    *buffn = buffc[KSIZE];
1307       FTYPE    *pk = k;
1308 
1309       for (l = 0; l < KSIZE; l++) {
             /* This local buff shadows the function-level scratch array. */
1310         FTYPE    *buff = buffc[l];
1311         d64_2x32 dd;
1312 
1313         sp = sl;
1314         dp = dl;
1315 
             /* Prime the sliding window of samples for buffered row l. */
1316         p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1317         p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1318 
             /* The seven coefficients of kernel row l. */
1319         k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
1320         k4 = *pk++; k5 = *pk++; k6 = *pk++;
1321 
1322         if (l < (KSIZE - 1)) {
               /* Kernel rows 0..KSIZE-2: add this row's contribution for two
                  adjacent outputs per iteration into buffd. */
1323 #ifdef __SUNPRO_C
1324 #pragma pipeloop(0)
1325 #endif /* __SUNPRO_C */
1326           for (i = 0; i <= (wid - 2); i += 2) {
1327             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1328 
1329             p6 = buff[i + 6]; p7 = buff[i + 7];
1330 
1331             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1332             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1333           }
1334 
1335         } else {
               /* Last kernel row: complete the sums, store two output pixels,
                  clear the accumulator for the next output row and, in the
                  same pass, copy two samples of the source row at sp into
                  buffn. */
1336 #ifdef __SUNPRO_C
1337 #pragma pipeloop(0)
1338 #endif /* __SUNPRO_C */
1339           for (i = 0; i <= (wid - 2); i += 2) {
1340             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1341 
1342             p6 = buff[i + 6]; p7 = buff[i + 7];
1343 
                 /* LOAD_BUFF stages sp[0] and sp[chan1] at buffi + i (it uses
                    i, sp and chan1 from this scope). */
1344             LOAD_BUFF(buffi);
1345 
                 /* Read the staged pair back through the d64_2x32 overlay and
                    convert each 32-bit half to FTYPE. */
1346             dd.d64 = *(FTYPE   *)(buffi + i);
1347             buffn[i    ] = (FTYPE)dd.i32s.i0;
1348             buffn[i + 1] = (FTYPE)dd.i32s.i1;
1349 
1350             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1351             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1352 
1353             dp[0    ] = FROM_S32(d0);
1354             dp[chan1] = FROM_S32(d1);
1355 
1356             buffd[i    ] = 0.0;
1357             buffd[i + 1] = 0.0;
1358 
1359             sp += chan2;
1360             dp += chan2;
1361           }
1362         }
1363       }
1364 
1365       /* last pixels */
           /* i, sp and dp continue from where the paired loop of the last
              kernel row stopped, so this handles the pixels that loop did not
              reach (the leftover one when wid is odd).  The full KSIZE x KSIZE
              sum is recomputed from the row buffers; buffd is not read here. */
1366       for (; i < wid; i++) {
1367         FTYPE    *pk = k, s = 0;
1368         mlib_s32 d0;
1369 
1370         for (l = 0; l < KSIZE; l++) {
1371           FTYPE    *buff = buffc[l] + i;
1372 
1373           for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
1374         }
1375 
1376         d0 = D2I(s);
1377         dp[0] = FROM_S32(d0);
1378 
1379         buffn[i] = (FTYPE)sp[0];
1380 
1381         sp += chan1;
1382         dp += chan1;
1383       }
1384 
           /* Copy the remaining KSIZE - 1 right-hand samples of that source row
              into buffn. */
1385       for (l = 0; l < (KSIZE - 1); l++) buffn[wid + l] = sp[l*chan1];
1386 
1387       /* next line */
1388       sl += sll;
1389       dl += dll;
1390 
           /* Rotate the ring of KSIZE + 1 row buffers by one. */
1391       buff_ind++;
1392 
1393       if (buff_ind >= KSIZE + 1) buff_ind = 0;
1394     }
1395   }
1396 
       /* Release the scratch area only when it was heap-allocated above. */
1397   if (pbuff != buff) mlib_free(pbuff);
1398 
1399   return MLIB_SUCCESS;
1400 }
1401 
1402 #endif /* IMG_TYPE == 1 */
1403 
1404 /***************************************************************/
1405 #define MAX_KER   7
1406 #define MAX_N    15
1407 
1408 static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
1409                                      const mlib_image *src,
1410                                      const mlib_d64   *k,
1411                                      mlib_s32         n,
1412                                      mlib_s32         dn,
1413                                      mlib_s32         cmask)
1414 {
1415   FTYPE    buff[BUFF_SIZE];
1416   mlib_s32 off, kh;
1417   mlib_s32 d0, d1;
1418   const FTYPE    *pk;
1419   FTYPE    k0, k1, k2, k3;
1420   FTYPE    p0, p1, p2, p3, p4;
1421   DEF_VARS(DTYPE);
1422   DTYPE    *sl_c, *dl_c, *sl0;




 /*
  * LOAD_BUFF(buff): writes one 64-bit word at (buff + i) built from two
  * source samples: sp[0] shifted into the upper 32 bits, OR-ed with
  * S64TOS32(sp[chan1]).  The names i, sp and chan1 are taken from the scope
  * where the macro is expanded.
  *
  * NOTE(review): this is one variant inside the _LITTLE_ENDIAN /
  * _NO_LONGLONG conditionals that are closed just below; which branch it
  * belongs to is not visible here.  S64TOS32 is defined elsewhere -
  * presumably it keeps only the low 32 bits; confirm.
  */
 127 #define LOAD_BUFF(buff)                                         \
 128   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
 129 
 130 #endif /* _LITTLE_ENDIAN */
 131 #endif /* _NO_LONGLONG */
 132 
 133 /***************************************************************/
 /*
  * d64_2x32: overlays one mlib_d64 with two pairs of mlib_s32 members
  * (i32s.i0/i1 and f32s.f0/f1), so that a value stored through d64 can be
  * read back as two 32-bit integers.
  *
  * NOTE(review): which member covers which half of the 64-bit value depends
  * on the platform's byte order - confirm against the code that fills it.
  */
 134 typedef union {
 135   mlib_d64 d64;
 136   struct {
 137     mlib_s32 i0;
 138     mlib_s32 i1;
 139   } i32s;
 140   struct {
 141     mlib_s32 f0;
 142     mlib_s32 f1;
 143   } f32s;
 144 } d64_2x32;
 145 
 146 /***************************************************************/



 /*
  * DEF_VARS(type): declares the local variables shared by the convolution
  * routines for pixel type `type`:
  *   adr_src, sl, sp / adr_dst, dl, dp - source and destination pointers
  *                                       (sp and dp start out as NULL)
  *   pbuff                             - FTYPE pointer initialised to `buff`
  *   wid, hgt, sll, dll, nchannel, chan1, i, j, c - mlib_s32 working variables
  *
  * An identifier named `buff` must already be declared where the macro is
  * expanded.  The expansion has no trailing semicolon; the caller supplies it.
  */
 147 #define DEF_VARS(type)                                          \
 148   type     *adr_src, *sl, *sp = NULL;                           \
 149   type     *adr_dst, *dl, *dp = NULL;                           \
 150   FTYPE    *pbuff = buff;                                       \
 151   mlib_s32 wid, hgt, sll, dll;                                  \
 152   mlib_s32 nchannel, chan1;                                     \
 153   mlib_s32 i, j, c
 154 
 155 /***************************************************************/

































 /*
  * GET_SRC_DST_PARAMETERS(type): fills the working variables from the
  * images `src` and `dst` that are in scope where the macro is expanded.
  *   hgt, wid, nchannel - height, width and channel count, all read from src
  *   sll, dll           - source / destination stride divided by sizeof(type),
  *                        i.e. expressed in elements of `type`
  *   adr_src, adr_dst   - image data pointers cast to `type *`
  *
  * The expansion has no trailing semicolon; the caller supplies it.
  */
 156 #define GET_SRC_DST_PARAMETERS(type)                            \
 157   hgt = mlib_ImageGetHeight(src);                               \
 158   wid = mlib_ImageGetWidth(src);                                \
 159   nchannel = mlib_ImageGetChannels(src);                        \
 160   sll = mlib_ImageGetStride(src) / sizeof(type);                \
 161   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
 162   adr_src = (type *)mlib_ImageGetData(src);                     \
 163   adr_dst = (type *)mlib_ImageGetData(dst)
 164 
 165 /***************************************************************/
 166 #ifndef __sparc
 167 
 168 #if IMG_TYPE == 1
 169 
 170 /* Test for the presence of any "1" bit in bits
 171    8 to 31 of val. If present, then val is either
 172    negative or >255. If over/underflows of 8 bits
 173    are uncommon, then this technique can be a win,
 174    since only a single test, rather than two, is
 175    necessary to determine if clamping is needed.


 /*
  * CLAMP_STORE(dst, val): stores val into dst, saturated to the signed
  * 16-bit range: values >= MLIB_S16_MAX become MLIB_S16_MAX, values
  * <= MLIB_S16_MIN become MLIB_S16_MIN, anything else is cast to mlib_s16.
  *
  * Both parameters are parenthesized so that an argument containing a
  * low-precedence operator (for example `c ? a : b`) is compared and cast
  * as a whole.  val is still evaluated more than once, so it must not have
  * side effects.  The expansion is a single if/else statement without a
  * trailing semicolon; the caller supplies it.
  */
 #define CLAMP_STORE(dst, val)                                   \
   if ((val) >= MLIB_S16_MAX)                                    \
     (dst) = MLIB_S16_MAX;                                       \
   else if ((val) <= MLIB_S16_MIN)                               \
     (dst) = MLIB_S16_MIN;                                       \
   else                                                          \
     (dst) = (mlib_s16)(val)
 198 
 199 #elif IMG_TYPE == 3
 200 
 /*
  * CLAMP_STORE(dst, val): stores val into dst, saturated to the unsigned
  * 16-bit range: values >= MLIB_U16_MAX become MLIB_U16_MAX, values
  * <= MLIB_U16_MIN become MLIB_U16_MIN, anything else is cast to mlib_u16.
  *
  * Both parameters are parenthesized so that an argument containing a
  * low-precedence operator (for example `c ? a : b`) is compared and cast
  * as a whole.  val is still evaluated more than once, so it must not have
  * side effects.  The expansion is a single if/else statement without a
  * trailing semicolon; the caller supplies it.
  */
 #define CLAMP_STORE(dst, val)                                   \
   if ((val) >= MLIB_U16_MAX)                                    \
     (dst) = MLIB_U16_MAX;                                       \
   else if ((val) <= MLIB_U16_MIN)                               \
     (dst) = MLIB_U16_MIN;                                       \
   else                                                          \
     (dst) = (mlib_u16)(val)
 208 
 209 #endif /* IMG_TYPE == 1 */
 210 #endif /* __sparc */




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































 211 
 212 /***************************************************************/
 213 #define MAX_KER   7
 214 #define MAX_N    15
 215 
 216 static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
 217                                      const mlib_image *src,
 218                                      const mlib_d64   *k,
 219                                      mlib_s32         n,
 220                                      mlib_s32         dn,
 221                                      mlib_s32         cmask)
 222 {
 223   FTYPE    buff[BUFF_SIZE];
 224   mlib_s32 off, kh;
 225   mlib_s32 d0, d1;
 226   const FTYPE    *pk;
 227   FTYPE    k0, k1, k2, k3;
 228   FTYPE    p0, p1, p2, p3, p4;
 229   DEF_VARS(DTYPE);
 230   DTYPE    *sl_c, *dl_c, *sl0;


< prev index next >