< prev index next >

src/java.desktop/share/native/libmlib_image/mlib_ImageConv_16ext.c

Print this page
rev 14293 : remove ImageConv


  63 #define DSCALE            65536.0
  64 #define FROM_S32(x)       ((x) >> 16)
  65 #define S64TOS32(x)       ((x) & 0xffffffff)
  66 #define SAT_OFF
  67 
  68 #elif IMG_TYPE == 3
     /*
      * Definitions used when this file is instantiated with IMG_TYPE == 3
      * (pixel type mlib_u16).
      *
      *   DTYPE           pixel type read from src and written to dst.
      *   CONV_FUNC(KERN) / CONV_FUNC_I(KERN)
      *                   paste the kernel size into the function name and append
      *                   the shared parameter list, e.g. CONV_FUNC(3x3) becomes
      *                   mlib_conv3x3ext_u16(PARAM).
      *   CONV_FUNC_MxN / CONV_FUNC_MxN_I
      *                   same for the MxN entry points, using PARAM_MxN.
      *   DSCALE          starting value of scalef in LOAD_KERNEL3 / LOAD_KERNEL
      *                   (2^16), before it is divided by 2^scalef_expon.
      *   FROM_S32(x)     shifts x right by 16 and then flips bit 15 of the result.
      *                   NOTE(review): presumably this undoes the 2^31 bias that
      *                   SAT_OFF introduces - confirm at the D2I definition.
      *   S64TOS32(x)     identity here; the definition in the preceding branch
      *                   masks to the low 32 bits instead.
      *   SAT_OFF         NOTE(review): deliberately not wrapped in parentheses.
      *                   The preceding branch defines SAT_OFF as empty, which
      *                   suggests it is pasted directly after an operand at its
      *                   use site (not visible in this section). Check that use
      *                   before "fixing" the missing parentheses.
      */
  69 
  70 #define DTYPE             mlib_u16
  71 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
  72 #define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
  73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
  74 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
  75 #define DSCALE            65536.0
  76 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
  77 #define S64TOS32(x)       (x)
  78 #define SAT_OFF           -(1u << 31)
  79 
  80 #endif /* IMG_TYPE == 1 */
  81 
  82 /***************************************************************/
     /*
      * Kernel size minus one. KSIZE is (re)defined in front of each fixed-size
      * convolution function, so KSIZE1 tracks the function being compiled. The
      * functions use it as the number of extra columns in an extended row
      * (swid = wid + KSIZE1).
      */
  83 #define KSIZE1 (KSIZE - 1)
  84 
  85 /***************************************************************/
     /*
      * Formal parameter list shared by CONV_FUNC(KERN) and CONV_FUNC_I(KERN).
      *
      *   dst, src       destination and source images.
      *   dx_l, dx_r     number of columns on the left / right of each extended
      *                  row that are filled by repeating the nearest source pixel.
      *   dy_t, dy_b     the same for rows at the top / bottom.
      *   kern           kernel coefficients, KSIZE*KSIZE values in row-major order.
      *   scalef_expon   the coefficients are scaled by 2^-scalef_expon.
      *   cmask          channel mask; bit (nchannel - 1 - c) enables channel c.
      */
  86 #define PARAM                                                   \
  87   mlib_image       *dst,                                        \
  88   const mlib_image *src,                                        \
  89   mlib_s32         dx_l,                                        \
  90   mlib_s32         dx_r,                                        \
  91   mlib_s32         dy_t,                                        \
  92   mlib_s32         dy_b,                                        \
  93   const mlib_s32   *kern,                                       \
  94   mlib_s32         scalef_expon,                                \
  95   mlib_s32         cmask
  96 
  97 /***************************************************************/
  98 #define PARAM_MxN                                               \
  99   mlib_image       *dst,                                        \
 100   const mlib_image *src,                                        \
 101   const mlib_s32   *kernel,                                     \
 102   mlib_s32         m,                                           \
 103   mlib_s32         n,                                           \
 104   mlib_s32         dx_l,                                        \
 105   mlib_s32         dx_r,                                        \


 146 #define LOAD_BUFF(buff)                                         \
 147   buff[i    ] = sp[0];                                          \
 148   buff[i + 1] = sp[chan1]
     /*
      * LOAD_BUFF(buff) copies two consecutive samples of the current channel,
      * sp[0] and sp[chan1], into buff[i] and buff[i + 1]. It picks up sp, chan1
      * and i from the expansion site; the callers in this file pass an mlib_s32
      * buffer (buffi) and an even i.
      *
      * The _NO_LONGLONG form above uses two 32-bit stores. The two forms below
      * do the same with a single 64-bit store, placing sp[0] in the half that
      * lands at the lower address (low half on little-endian, high half on
      * big-endian). S64TOS32 is applied to the value that goes into the low
      * half.
      *
      * NOTE(review): the 64-bit forms access an mlib_s32 buffer through an
      * mlib_s64 pointer, so they depend on buff + i being suitably aligned and
      * on the compiler tolerating this type punning.
      */
 149 
 150 #else /* _NO_LONGLONG */
 151 
 152 #ifdef _LITTLE_ENDIAN
 153 
 154 #define LOAD_BUFF(buff)                                         \
 155   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
 156 
 157 #else /* _LITTLE_ENDIAN */
 158 
 159 #define LOAD_BUFF(buff)                                         \
 160   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
 161 
 162 #endif /* _LITTLE_ENDIAN */
 163 #endif /* _NO_LONGLONG */
 164 
 165 /***************************************************************/
     /* 2^24 as a float constant. Not referenced in the part of the file shown here. */
 166 #define MLIB_D2_24 16777216.0f
 167 
 168 /***************************************************************/
     /*
      * Overlay of one mlib_d64 with a pair of mlib_s32. The convolution
      * functions use it (local variable dd) to move two 32-bit integers to or
      * from an mlib_s32 staging buffer with a single 64-bit access. Which of
      * i0 / i1 holds the lower-addressed integer follows from member order.
      */
 169 typedef union {
 170   mlib_d64 d64;
 171   struct {
 172     mlib_s32 i0;
 173     mlib_s32 i1;
 174   } i32s;
 175 } d64_2x32;
 176 
 177 /***************************************************************/
     /*
      * Extended-row length (in elements) up to which the on-stack scratch array
      * is large enough; wider rows make the functions allocate with mlib_malloc.
      */
 178 #define BUFF_LINE 256
 179 
 180 /***************************************************************/
     /*
      * Declares the locals shared by the CONV_FUNC bodies; `type` is the pixel
      * type (DTYPE at the call sites).
      *
      *   adr_src, adr_dst   image base addresses (set by GET_SRC_DST_PARAMETERS).
      *   sl, sl1, dl        source / destination row pointers.
      *   sp, dp             source / destination pixel pointers inside a row.
      *   pbuff              scratch base; initialised to `buff`, so the expansion
      *                      site must have declared an array named buff already.
      *   buffi, buffo       mlib_s32 staging areas carved out of the scratch.
      *   wid, hgt           image size; sll, dll are row strides in elements.
      *   nchannel, chan1, chan2   channel count and pixel steps (1x and 2x).
      *   i, j, c            column, row and channel loop counters.
      *   swid               extended row width.
      *
      * The last declaration has no trailing semicolon; the expansion site
      * supplies it.
      */
 181 #define DEF_VARS(type)                                          \
 182   type     *adr_src, *sl, *sp, *sl1;                            \
 183   type     *adr_dst, *dl, *dp;                                  \
 184   FTYPE    *pbuff = buff;                                       \
 185   mlib_s32 *buffi, *buffo;                                      \
 186   mlib_s32 wid, hgt, sll, dll;                                  \
 187   mlib_s32 nchannel, chan1, chan2;                              \
 188   mlib_s32 i, j, c, swid
 189 
 190 /***************************************************************/
     /*
      * Declares scalef, the nine coefficients k0..k8 and the pixel temporaries
      * p00..p23, then loads the 3x3 kernel as
      *     kN = kern[N] * DSCALE / 2^scalef_expon.
      *
      * The division is done in steps of at most 2^30 so that every (1 << n)
      * stays within the range of int. The macro modifies scalef_expon and needs
      * kern and scalef_expon in scope. It mixes declarations and statements,
      * so it has to be expanded where both are allowed.
      */
 191 #define LOAD_KERNEL3()                                                   \
 192   FTYPE    scalef = DSCALE;                                              \
 193   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8;                           \
 194   FTYPE    p00, p01, p02, p03,                                           \
 195            p10, p11, p12, p13,                                           \
 196            p20, p21, p22, p23;                                           \
 197                                                                          \
 198   while (scalef_expon > 30) {                                            \
 199     scalef /= (1 << 30);                                                 \
 200     scalef_expon -= 30;                                                  \
 201   }                                                                      \
 202                                                                          \
 203   scalef /= (1 << scalef_expon);                                         \
 204                                                                          \
 205   /* keep kernel in regs */                                              \
 206   k0 = scalef * kern[0];  k1 = scalef * kern[1];  k2 = scalef * kern[2]; \
 207   k3 = scalef * kern[3];  k4 = scalef * kern[4];  k5 = scalef * kern[5]; \
 208   k6 = scalef * kern[6];  k7 = scalef * kern[7];  k8 = scalef * kern[8]
 209 
 210 /***************************************************************/
     /*
      * Generic form of LOAD_KERNEL3: declares scalef, reduces it to
      * DSCALE / 2^scalef_expon (in steps of at most 2^30 so each shift stays in
      * int range) and fills k[0 .. SIZE-1] with scalef * kern[j].
      *
      * Needs k, j, kern and scalef_expon in scope; modifies scalef_expon and
      * leaves j == SIZE.
      */
 211 #define LOAD_KERNEL(SIZE)                                       \
 212   FTYPE    scalef = DSCALE;                                     \
 213                                                                 \
 214   while (scalef_expon > 30) {                                   \
 215     scalef /= (1 << 30);                                        \
 216     scalef_expon -= 30;                                         \
 217   }                                                             \
 218                                                                 \
 219   scalef /= (1 << scalef_expon);                                \
 220                                                                 \
 221   for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j]
 222 
 223 /***************************************************************/
     /*
      * Fills the geometry locals from the two images. Height, width and channel
      * count are taken from src only. The strides are divided by sizeof(type),
      * so sll / dll count elements of `type` (the mlib strides are presumably in
      * bytes - confirm against mlib_ImageGetStride). adr_src / adr_dst are the
      * data pointers cast to `type *`.
      */
 224 #define GET_SRC_DST_PARAMETERS(type)                            \
 225   hgt = mlib_ImageGetHeight(src);                               \
 226   wid = mlib_ImageGetWidth(src);                                \
 227   nchannel = mlib_ImageGetChannels(src);                        \
 228   sll = mlib_ImageGetStride(src) / sizeof(type);                \
 229   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
 230   adr_src = (type *)mlib_ImageGetData(src);                     \
 231   adr_dst = (type *)mlib_ImageGetData(dst)
 232 
 233 /***************************************************************/
 234 #ifndef __sparc
 235 #if IMG_TYPE == 1
 236 
 237 /*
 238  * Test for the presence of any "1" bit in bits
 239    8 to 31 of val. If present, then val is either
 240    negative or >255. If over/underflows of 8 bits
 241    are uncommon, then this technique can be a win,
 242    since only a single test, rather than two, is
 243    necessary to determine if clamping is needed.


 259 #define CLAMP_STORE(dst, val)                                   \
 260   if (val >= MLIB_S16_MAX)                                      \
 261     dst = MLIB_S16_MAX;                                         \
 262   else if (val <= MLIB_S16_MIN)                                 \
 263     dst = MLIB_S16_MIN;                                         \
 264   else                                                          \
 265     dst = (mlib_s16)val
     /*
      * CLAMP_STORE(dst, val) stores val into dst, saturated to the range of the
      * pixel type: [MLIB_S16_MIN, MLIB_S16_MAX] in the variant above,
      * [MLIB_U16_MIN, MLIB_U16_MAX] in the IMG_TYPE == 3 variant below.
      *
      * NOTE(review): the macro expands to a bare if / else-if / else chain (no
      * do { } while (0) wrapper), and val is evaluated up to three times without
      * parentheses. Use it only as a complete statement with side-effect-free
      * arguments, as the integer convolution code in this file does.
      */
 266 
 267 #elif IMG_TYPE == 3
 268 
 269 #define CLAMP_STORE(dst, val)                                   \
 270   if (val >= MLIB_U16_MAX)                                      \
 271     dst = MLIB_U16_MAX;                                         \
 272   else if (val <= MLIB_U16_MIN)                                 \
 273     dst = MLIB_U16_MIN;                                         \
 274   else                                                          \
 275     dst = (mlib_u16)val
 276 
 277 #endif /* IMG_TYPE == 1 */
 278 #endif /* __sparc */
 279 
 280 /***************************************************************/
     /*
      * CONV_FUNC(3x3): 3x3 convolution with edge extension, computed in FTYPE
      * arithmetic.
      *
      * Parameters come from PARAM: the dst / src images, the number of
      * replicated columns / rows on each side (dx_l, dx_r, dy_t, dy_b), the nine
      * kernel coefficients (kern, row-major), scalef_expon and the channel mask
      * cmask.
      *
      * Scratch layout (pbuff): four row buffers buff0..buff3 of
      * swid = wid + KSIZE1 FTYPE elements each, followed by one more line that
      * is shared by the mlib_s32 staging areas buffo and buffi. buff0..buff2
      * hold the three extended source rows currently under the kernel; buff3
      * receives the next source row while the current output row is computed.
      * The four pointers are rotated after every output row.
      *
      * Returns MLIB_FAILURE if the scratch allocation fails, MLIB_SUCCESS
      * otherwise.
      */
 281 #define KSIZE  3
 282 
 283 mlib_status CONV_FUNC(3x3)
 284 {
 285   FTYPE    buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
 286   DEF_VARS(DTYPE);
 287   DTYPE *sl2;
 288 #ifndef __sparc
 289   mlib_s32 d0, d1;
 290 #endif /* __sparc */
 291   LOAD_KERNEL3();
 292   GET_SRC_DST_PARAMETERS(DTYPE);
 293 
       /* Width of one extended row: the image width plus KSIZE - 1 border columns. */
 294   swid = wid + KSIZE1;
 295 
       /* Rows too wide for the on-stack array get a heap buffer with the same layout. */
 296   if (swid > BUFF_LINE) {
 297     pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE   )*swid);
 298 
 299     if (pbuff == NULL) return MLIB_FAILURE;
 300   }
 301 
 302   buff0 = pbuff;
 303   buff1 = buff0 + swid;
 304   buff2 = buff1 + swid;
 305   buff3 = buff2 + swid;
       /*
        * The fifth line is viewed as mlib_s32. buffi starts an even number of
        * mlib_s32 after buffo, presumably to keep it 8-byte aligned for the
        * 64-bit accesses made through LOAD_BUFF and dd.
        */
 306   buffo = (mlib_s32*)(buff3 + swid);
 307   buffi = buffo + (swid &~ 1);
 308 
       /* From here on swid is the number of pixels actually read from a source row. */
 309   swid -= (dx_l + dx_r);
 310 
 311   chan1 = nchannel;
 312   chan2 = chan1 + chan1;
 313 
 314   for (c = 0; c < nchannel; c++) {
         /* Channel c is processed only if bit (nchannel - 1 - c) of cmask is set. */
 315     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 316 
 317     sl = adr_src + c;
 318     dl = adr_dst + c;
 319 
         /*
          * Choose the first three source rows. A pointer is left on the previous
          * row (i.e. that row is repeated) when the dy_t / dy_b tests fail.
          */
 320     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
 321     else sl1 = sl;
 322 
 323     if ((hgt - dy_b) > 0) sl2 = sl1 + sll;
 324     else sl2 = sl1;
 325 
         /* Left border: repeat the first pixel of each row dx_l times. */
 326     for (i = 0; i < dx_l; i++) {
 327       buff0[i] = (FTYPE)sl[0];
 328       buff1[i] = (FTYPE)sl1[0];
 329       buff2[i] = (FTYPE)sl2[0];
 330     }
 331 
 332 #ifdef __SUNPRO_C
 333 #pragma pipeloop(0)
 334 #endif /* __SUNPRO_C */
         /* Interior: copy swid pixels of this channel, converting to FTYPE. */
 335     for (i = 0; i < swid; i++) {
 336       buff0[i + dx_l] = (FTYPE)sl[i*chan1];
 337       buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
 338       buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
 339     }
 340 
         /* Right border: repeat the last copied pixel dx_r times. */
 341     for (i = 0; i < dx_r; i++) {
 342       buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
 343       buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
 344       buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
 345     }
 346 
         /* sl now addresses the source row that is loaded into buff3 during row 0. */
 347     if ((hgt - dy_b) > 1) sl = sl2 + sll;
 348     else sl = sl2;
 349 
 350     for (j = 0; j < hgt; j++) {
 351       FTYPE    s0, s1;
 352 
 353       p02 = buff0[0];
 354       p12 = buff1[0];
 355       p22 = buff2[0];
 356 
 357       p03 = buff0[1];
 358       p13 = buff1[1];
 359       p23 = buff2[1];
 360 
           /*
            * Partial sums carried between iterations: s0 holds the first two
            * kernel columns of output pixel i, s1 the first kernel column of
            * output pixel i + 1.
            */
 361       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 362       s1 = p03 * k0 + p13 * k3 + p23 * k6;
 363 
 364       sp = sl;
 365       dp = dl;
 366 
 367 #ifdef __SUNPRO_C
 368 #pragma pipeloop(0)
 369 #endif /* __SUNPRO_C */
           /*
            * Main loop: two output pixels per iteration. In the same pass two
            * pixels of the next source row are staged in buffi (LOAD_BUFF) and
            * converted into buff3.
            */
 370       for (i = 0; i <= (wid - 2); i += 2) {
 371 #ifdef __sparc
 372 #ifdef _NO_LONGLONG
 373         mlib_s32 o64_1, o64_2;
 374 #else /* _NO_LONGLONG */
 375         mlib_s64 o64;
 376 #endif /* _NO_LONGLONG */
 377 #endif /* __sparc */
 378         d64_2x32 dd;
 379 
 380         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
 381         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
 382 
 383         LOAD_BUFF(buffi);
 384 
             /* Read the two staged integers back with one 64-bit load and widen them. */
 385         dd.d64 = *(FTYPE   *)(buffi + i);
 386         buff3[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 387         buff3[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 388 
 389 #ifndef __sparc
 390 
             /* Complete both sums with the newly loaded columns and convert via D2I. */
 391         d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
 392         d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
 393 
 394         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 395         s1 = p03 * k0 + p13 * k3 + p23 * k6;
 396 
 397         dp[0    ] = FROM_S32(d0);
 398         dp[chan1] = FROM_S32(d1);
 399 
 400 #else /* __sparc */
 401 
             /* sparc path: results go through buffo and are written with STORE2. */
 402         dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
 403         dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
 404         *(FTYPE   *)(buffo + i) = dd.d64;
 405 
 406         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 407         s1 = p03 * k0 + p13 * k3 + p23 * k6;
 408 
 409 #ifdef _NO_LONGLONG
 410 
 411         o64_1 = buffo[i];
 412         o64_2 = buffo[i+1];
 413 #if IMG_TYPE != 1
 414         STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
 415 #else
 416         STORE2(o64_1 >> 24, o64_2 >> 24);
 417 #endif /* IMG_TYPE != 1 */
 418 
 419 #else /* _NO_LONGLONG */
 420 
 421         o64 = *(mlib_s64*)(buffo + i);
 422 #if IMG_TYPE != 1
 423         STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
 424 #else
 425         STORE2(o64 >> 56, o64 >> 24);
 426 #endif /* IMG_TYPE != 1 */
 427 #endif /* _NO_LONGLONG */
 428 #endif /* __sparc */
 429 
 430         sp += chan2;
 431         dp += chan2;
 432       }
 433 
           /* Odd width: the last output pixel is computed with the full 9-tap sum. */
 434       for (; i < wid; i++) {
 435         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];
 436         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
 437         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
 438 
 439         buffi[i] = (mlib_s32)sp[0];
 440         buff3[i + dx_l] = (FTYPE)buffi[i];
 441 
 442 #ifndef __sparc
 443 
 444         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
 445                  p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
 446 
 447         dp[0] = FROM_S32(d0);
 448 
 449 #else  /* __sparc */
 450 
 451         buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
 452                        p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
 453 #if IMG_TYPE != 1
 454         dp[0] = FROM_S32(buffo[i]);
 455 #else
 456         dp[0] = buffo[i] >> 24;
 457 #endif /* IMG_TYPE != 1 */
 458 #endif /* __sparc */
 459 
 460         sp += chan1;
 461         dp += chan1;
 462       }
 463 
           /* Load whatever is left of the next source row (no output is produced here). */
 464       for (; i < swid; i++) {
 465         buffi[i] = (mlib_s32)sp[0];
 466         buff3[i + dx_l] = (FTYPE)buffi[i];
 467         sp += chan1;
 468       }
 469 
           /* Extend the freshly loaded row on the left and on the right. */
 470       for (i = 0; i < dx_l; i++) buff3[i] = buff3[dx_l];
 471       for (i = 0; i < dx_r; i++) buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
 472 
           /* Stop advancing the source row near the bottom so the last row is repeated. */
 473       if (j < hgt - dy_b - 2) sl += sll;
 474       dl += dll;
 475 
           /* Rotate the row buffers: the oldest row becomes the next load target. */
 476       buffT = buff0;
 477       buff0 = buff1;
 478       buff1 = buff2;
 479       buff2 = buff3;
 480       buff3 = buffT;
 481     }
 482   }
 483 
 484 #ifdef __sparc
 485 #if IMG_TYPE == 1
       /*
        * sparc / IMG_TYPE == 1 only: post-pass over dst with mlib_ImageXor80
        * (some channels masked out) or mlib_ImageXor80_aa (all channels enabled).
        */
 486   {
 487     mlib_s32 amask = (1 << nchannel) - 1;
 488 
 489     if ((cmask & amask) != amask) {
 490       mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
 491     } else {
 492       mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
 493     }
 494   }
 495 
 496 #endif /* IMG_TYPE == 1 */
 497 #endif /* __sparc */
 498 
       /* Release the scratch only if it was heap-allocated above. */
 499   if (pbuff != buff) mlib_free(pbuff);
 500 
 501   return MLIB_SUCCESS;
 502 }
 503 
 504 /***************************************************************/
     /*
      * CONV_FUNC_I(3x3): integer-arithmetic 3x3 convolution with edge
      * extension. Compiled only when __sparc is not defined.
      *
      * Unlike CONV_FUNC(3x3) it uses no row buffers: it reads the three source
      * rows directly and handles the left / right borders by how the column
      * pointers are advanced. The kernel coefficients are pre-shifted right by
      * shift1 (16, or 8 when IMG_TYPE == 1); every sum is then shifted right by
      * shift2 = scalef_expon - shift1 and stored with CLAMP_STORE.
      *
      * NOTE(review): shift2 is used as a shift count without a range check;
      * presumably the caller only selects this function when
      * scalef_expon >= shift1 - confirm.
      *
      * Always returns MLIB_SUCCESS.
      */
 505 #ifndef __sparc /* for x86, using integer multiplies is faster */
 506 
 507 mlib_status CONV_FUNC_I(3x3)
 508 {
 509   DTYPE    *adr_src, *sl, *sp0, *sp1, *sp2, *sp_1, *sp_2;
 510   DTYPE    *adr_dst, *dl, *dp;
 511   mlib_s32 wid, hgt, sll, dll;
 512   mlib_s32 nchannel, chan1, chan2, delta_chan;
 513   mlib_s32 i, j, c;
 514   mlib_s32 shift1, shift2;
 515   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
 516   mlib_s32 p02, p03,
 517            p12, p13,
 518            p22, p23;
 519 
 520 #if IMG_TYPE != 1
 521   shift1 = 16;
 522 #else
 523   shift1 = 8;
 524 #endif /* IMG_TYPE != 1 */
 525 
 526   shift2 = scalef_expon - shift1;
 527 
 528   /* keep kernel in regs */
 529   k0 = kern[0] >> shift1;  k1 = kern[1] >> shift1;  k2 = kern[2] >> shift1;
 530   k3 = kern[3] >> shift1;  k4 = kern[4] >> shift1;  k5 = kern[5] >> shift1;
 531   k6 = kern[6] >> shift1;  k7 = kern[7] >> shift1;  k8 = kern[8] >> shift1;
 532 
 533   GET_SRC_DST_PARAMETERS(DTYPE);
 534 
 535   chan1 = nchannel;
 536   chan2 = chan1 + chan1;
       /*
        * delta_chan is the distance from the first to the second kernel column
        * at the start of a row: one pixel when the second column is a separate
        * source pixel, 0 when the first pixel is repeated.
        */
 537   delta_chan = 0;
 538 
 539   if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan = chan1;
 540 
 541   for (c = 0; c < chan1; c++) {
         /* Channel c is processed only if bit (chan1 - 1 - c) of cmask is set. */
 542     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 543 
 544     sl = adr_src + c;
 545     dl = adr_dst + c;
 546 
         /*
          * sp_1, sp_2 and sl are the three source rows for the first output
          * row; a row pointer is not advanced when that row is repeated.
          */
 547     sp_1 = sl;
 548 
 549     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll;
 550     sp_2 = sl;
 551 
 552     if ((hgt - dy_b) > 0) sl += sll;
 553 
 554     for (j = 0; j < hgt; j++) {
 555       mlib_s32 s0, s1;
 556       mlib_s32 pix0, pix1;
 557 
           /* Slide the three-row window down by one: (sp_1, sp_2, sl) -> (sp0, sp1, sp2). */
 558       dp  = dl;
 559       sp0 = sp_1;
 560       sp_1 = sp_2;
 561       sp_2 = sl;
 562 
 563       sp1 = sp_1;
 564       sp2 = sp_2;
 565 
 566       p02 = sp0[0];
 567       p12 = sp1[0];
 568       p22 = sp2[0];
 569 
 570       p03 = sp0[delta_chan];
 571       p13 = sp1[delta_chan];
 572       p23 = sp2[delta_chan];
 573 
           /*
            * Partial sums carried between iterations: s0 holds the first two
            * kernel columns of the next output pixel, s1 the first kernel column
            * of the one after it.
            */
 574       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 575       s1 = p03 * k0 + p13 * k3 + p23 * k6;
 576 
 577       sp0 += (chan1 + delta_chan);
 578       sp1 += (chan1 + delta_chan);
 579       sp2 += (chan1 + delta_chan);
 580 
 581 #ifdef __SUNPRO_C
 582 #pragma pipeloop(0)
 583 #endif /* __SUNPRO_C */
           /* Main loop: two output pixels per iteration, up to the right border. */
 584       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
 585         p02 = sp0[0];     p12 = sp1[0];     p22 = sp2[0];
 586         p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
 587 
 588         pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
 589         pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
 590                 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
 591 
 592         CLAMP_STORE(dp[0],     pix0);
 593         CLAMP_STORE(dp[chan1], pix1);
 594 
 595         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 596         s1 = p03 * k0 + p13 * k3 + p23 * k6;
 597 
 598         sp0 += chan2;
 599         sp1 += chan2;
 600         sp2 += chan2;
 601         dp += chan2;
 602       }
 603 
           /* Switch to one pixel per iteration; p02/p12/p22 become the previous column. */
 604       p02 = p03; p12 = p13; p22 = p23;
 605 
 606       for (; i < wid - dx_r; i++) {
 607         p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0];
 608         pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2;
 609         CLAMP_STORE(dp[0], pix0);
 610         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 611         p02 = p03; p12 = p13; p22 = p23;
 612         sp0 += chan1;
 613         sp1 += chan1;
 614         sp2 += chan1;
 615         dp += chan1;
 616       }
 617 
           /*
            * Right border: step back to the last pixel that was read and keep
            * re-reading it (the source pointers are not advanced in this loop).
            */
 618       sp0 -= chan1;
 619       sp1 -= chan1;
 620       sp2 -= chan1;
 621 
 622       for (; i < wid; i++) {
 623         p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0];
 624         pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2;
 625         CLAMP_STORE(dp[0], pix0);
 626         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 627         p02 = p03; p12 = p13; p22 = p23;
 628         dp += chan1;
 629       }
 630 
           /* Stop advancing the source row near the bottom so the last row is repeated. */
 631       if (j < hgt - dy_b - 1) sl += sll;
 632       dl += dll;
 633     }
 634   }
 635 
 636   return MLIB_SUCCESS;
 637 }
 638 
 639 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
 640 
 641 /***************************************************************/
     /*
      * CONV_FUNC(4x4): 4x4 convolution with edge extension, computed in FTYPE
      * arithmetic.
      *
      * Parameters come from PARAM (see CONV_FUNC(3x3) for the same list with a
      * 3x3 kernel); kern holds 16 coefficients in row-major order.
      *
      * Scratch layout (pbuff): five row buffers buff0..buff4 of
      * swid = wid + KSIZE1 FTYPE elements each, one line of partial sums
      * (buffd), and one more line shared by the mlib_s32 areas buffo and buffi.
      * buff0..buff3 hold the four extended source rows under the kernel; buff4
      * receives the next source row. Each output row is produced in two passes:
      * kernel rows 0-1 go into buffd, then kernel rows 2-3 are added and the
      * result is converted and stored.
      *
      * Returns MLIB_FAILURE if the scratch allocation fails, MLIB_SUCCESS
      * otherwise.
      */
 642 #undef  KSIZE
 643 #define KSIZE 4
 644 
 645 mlib_status CONV_FUNC(4x4)
 646 {
 647   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
 648   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
 649   FTYPE    k[KSIZE*KSIZE];
 650   mlib_s32 d0, d1;
 651   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7;
 652   FTYPE    p00, p01, p02, p03, p04,
 653            p10, p11, p12, p13, p14,
 654            p20, p21, p22, p23,
 655            p30, p31, p32, p33;
 656   DEF_VARS(DTYPE);
 657   DTYPE *sl2, *sl3;
 658   LOAD_KERNEL(KSIZE*KSIZE);
 659   GET_SRC_DST_PARAMETERS(DTYPE);
 660 
       /* Width of one extended row: the image width plus KSIZE - 1 border columns. */
 661   swid = wid + KSIZE1;
 662 
       /* Rows too wide for the on-stack array get a heap buffer with the same layout. */
 663   if (swid > BUFF_LINE) {
 664     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE   )*swid);
 665 
 666     if (pbuff == NULL) return MLIB_FAILURE;
 667   }
 668 
 669   buff0 = pbuff;
 670   buff1 = buff0 + swid;
 671   buff2 = buff1 + swid;
 672   buff3 = buff2 + swid;
 673   buff4 = buff3 + swid;
 674   buffd = buff4 + swid;
       /*
        * The last line is viewed as mlib_s32. buffi starts an even number of
        * mlib_s32 after buffo, presumably to keep it 8-byte aligned for the
        * 64-bit accesses made through LOAD_BUFF and dd.
        */
 675   buffo = (mlib_s32*)(buffd + swid);
 676   buffi = buffo + (swid &~ 1);
 677 
       /* From here on swid is the number of pixels actually read from a source row. */
 678   swid -= (dx_l + dx_r);
 679 
 680   chan1 = nchannel;
 681   chan2 = chan1 + chan1;
 682 
 683   for (c = 0; c < nchannel; c++) {
         /* Channel c is processed only if bit (nchannel - 1 - c) of cmask is set. */
 684     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 685 
 686     sl = adr_src + c;
 687     dl = adr_dst + c;
 688 
         /*
          * Choose the first four source rows. A pointer is left on the previous
          * row (i.e. that row is repeated) when the dy_t / dy_b tests fail.
          */
 689     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
 690     else sl1 = sl;
 691 
 692     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
 693     else sl2 = sl1;
 694 
 695     if ((hgt - dy_b) > 0) sl3 = sl2 + sll;
 696     else sl3 = sl2;
 697 
         /* Left border: repeat the first pixel of each row dx_l times. */
 698     for (i = 0; i < dx_l; i++) {
 699       buff0[i] = (FTYPE)sl[0];
 700       buff1[i] = (FTYPE)sl1[0];
 701       buff2[i] = (FTYPE)sl2[0];
 702       buff3[i] = (FTYPE)sl3[0];
 703     }
 704 
 705 #ifdef __SUNPRO_C
 706 #pragma pipeloop(0)
 707 #endif /* __SUNPRO_C */
         /* Interior: copy swid pixels of this channel, converting to FTYPE. */
 708     for (i = 0; i < swid; i++) {
 709       buff0[i + dx_l] = (FTYPE)sl[i*chan1];
 710       buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
 711       buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
 712       buff3[i + dx_l] = (FTYPE)sl3[i*chan1];
 713     }
 714 
         /* Right border: repeat the last copied pixel dx_r times. */
 715     for (i = 0; i < dx_r; i++) {
 716       buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
 717       buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
 718       buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
 719       buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
 720     }
 721 
         /* sl now addresses the source row that is loaded into buff4 during row 0. */
 722     if ((hgt - dy_b) > 1) sl = sl3 + sll;
 723     else sl = sl3;
 724 
 725     for (j = 0; j < hgt; j++) {
 726       d64_2x32 dd;
 727 
 728       /*
 729        *  First loop on two first lines of kernel
 730        */
 731       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
 732       k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
 733 
 734       sp = sl;
 735       dp = dl;
 736 
 737       p02 = buff0[0];
 738       p12 = buff1[0];
 739       p03 = buff0[1];
 740       p13 = buff1[1];
 741       p04 = buff0[2];
 742 
 743 #ifdef __SUNPRO_C
 744 #pragma pipeloop(0)
 745 #endif /* __SUNPRO_C */
           /*
            * Pass 1: two columns per iteration. Stores the contribution of kernel
            * rows 0-1 in buffd and, in the same pass, stages two pixels of the
            * next source row in buffi and converts them into buff4.
            */
 746       for (i = 0; i <= (wid - 2); i += 2) {
 747         p00 = p02; p10 = p12;
 748         p01 = p03; p11 = p13;
 749         p02 = p04; p12 = buff1[i + 2];
 750         p03 = buff0[i + 3]; p13 = buff1[i + 3];
 751         p04 = buff0[i + 4]; p14 = buff1[i + 4];
 752 
 753         LOAD_BUFF(buffi);
 754 
             /* Read the two staged integers back with one 64-bit load and widen them. */
 755         dd.d64 = *(FTYPE   *)(buffi + i);
 756         buff4[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 757         buff4[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 758 
 759         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
 760                         p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
 761         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
 762                         p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
 763 
 764         sp += chan2;
 765       }
 766 
 767       /*
 768        *  Second loop on two last lines of kernel
 769        */
 770       k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
 771       k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
 772 
 773       p02 = buff2[0];
 774       p12 = buff3[0];
 775       p03 = buff2[1];
 776       p13 = buff3[1];
 777       p04 = buff2[2];
 778 
 779 #ifdef __SUNPRO_C
 780 #pragma pipeloop(0)
 781 #endif /* __SUNPRO_C */
           /*
            * Pass 2: adds kernel rows 2-3 to the partial sums from buffd, converts
            * with D2I and writes two output pixels per iteration.
            */
 782       for (i = 0; i <= (wid - 2); i += 2) {
 783         p00 = p02; p10 = p12;
 784         p01 = p03; p11 = p13;
 785         p02 = p04; p12 = buff3[i + 2];
 786         p03 = buff2[i + 3]; p13 = buff3[i + 3];
 787         p04 = buff2[i + 4]; p14 = buff3[i + 4];
 788 
 789         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
 790                  p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
 791         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
 792                  p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
 793 
 794         dp[0    ] = FROM_S32(d0);
 795         dp[chan1] = FROM_S32(d1);
 796 
 797         dp += chan2;
 798       }
 799 
 800       /* last pixels */
           /* Odd width: the remaining output pixel uses the full 16-tap sum from k[]. */
 801       for (; i < wid; i++) {
 802         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
 803         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
 804         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
 805         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
 806 
 807         buff4[i + dx_l] = (FTYPE)sp[0];
 808 
 809         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
 810                        p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
 811                        p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
 812                        p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
 813 
 814         dp[0] = FROM_S32(buffo[i]);
 815 
 816         sp += chan1;
 817         dp += chan1;
 818       }
 819 
           /* Load whatever is left of the next source row (no output is produced here). */
 820       for (; i < swid; i++) {
 821         buff4[i + dx_l] = (FTYPE)sp[0];
 822         sp += chan1;
 823       }
 824 
           /* Extend the freshly loaded row on the left and on the right. */
 825       for (i = 0; i < dx_l; i++) buff4[i] = buff4[dx_l];
 826       for (i = 0; i < dx_r; i++) buff4[swid + dx_l + i] = buff4[swid + dx_l - 1];
 827 
 828       /* next line */
 829 
           /* Stop advancing the source row near the bottom so the last row is repeated. */
 830       if (j < hgt - dy_b - 2) sl += sll;
 831       dl += dll;
 832 
           /* Rotate the row buffers: the oldest row becomes the next load target. */
 833       buffT = buff0;
 834       buff0 = buff1;
 835       buff1 = buff2;
 836       buff2 = buff3;
 837       buff3 = buff4;
 838       buff4 = buffT;
 839     }
 840   }
 841 
       /* Release the scratch only if it was heap-allocated above. */
 842   if (pbuff != buff) mlib_free(pbuff);
 843 
 844   return MLIB_SUCCESS;
 845 }
 846 
 847 /***************************************************************/
 848 #undef  KSIZE
 849 #define KSIZE 5
 850 
 851 mlib_status CONV_FUNC(5x5)
 852 {
 853   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
 854   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
 855   FTYPE    k[KSIZE*KSIZE];
 856   mlib_s32 d0, d1;
 857   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
 858   FTYPE    p00, p01, p02, p03, p04, p05,
 859            p10, p11, p12, p13, p14, p15,
 860            p20, p21, p22, p23, p24,
 861            p30, p31, p32, p33, p34,
 862            p40, p41, p42, p43, p44;
 863   DEF_VARS(DTYPE);
 864   DTYPE *sl2, *sl3, *sl4;
 865   LOAD_KERNEL(KSIZE*KSIZE);
 866   GET_SRC_DST_PARAMETERS(DTYPE);
 867 
 868   swid = wid + KSIZE1;
 869 
 870   if (swid > BUFF_LINE) {
 871     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE   )*swid);
 872 
 873     if (pbuff == NULL) return MLIB_FAILURE;
 874   }
 875 
 876   buff0 = pbuff;
 877   buff1 = buff0 + swid;
 878   buff2 = buff1 + swid;
 879   buff3 = buff2 + swid;
 880   buff4 = buff3 + swid;
 881   buff5 = buff4 + swid;
 882   buffd = buff5 + swid;
 883   buffo = (mlib_s32*)(buffd + swid);
 884   buffi = buffo + (swid &~ 1);
 885 
 886   swid -= (dx_l + dx_r);
 887 
 888   chan1 = nchannel;
 889   chan2 = chan1 + chan1;
 890 
 891   for (c = 0; c < nchannel; c++) {
 892     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 893 
 894     sl = adr_src + c;
 895     dl = adr_dst + c;
 896 
 897     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
 898     else sl1 = sl;
 899 
 900     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
 901     else sl2 = sl1;
 902 
 903     if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll;
 904     else sl3 = sl2;
 905 
 906     if ((hgt - dy_b) > 0) sl4 = sl3 + sll;
 907     else sl4 = sl3;
 908 
 909     for (i = 0; i < dx_l; i++) {
 910       buff0[i] = (FTYPE)sl[0];
 911       buff1[i] = (FTYPE)sl1[0];
 912       buff2[i] = (FTYPE)sl2[0];
 913       buff3[i] = (FTYPE)sl3[0];
 914       buff4[i] = (FTYPE)sl4[0];
 915     }
 916 
 917 #ifdef __SUNPRO_C
 918 #pragma pipeloop(0)
 919 #endif /* __SUNPRO_C */
 920     for (i = 0; i < swid; i++) {
 921       buff0[i + dx_l] = (FTYPE)sl[i*chan1];
 922       buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
 923       buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
 924       buff3[i + dx_l] = (FTYPE)sl3[i*chan1];
 925       buff4[i + dx_l] = (FTYPE)sl4[i*chan1];
 926     }
 927 
 928     for (i = 0; i < dx_r; i++) {
 929       buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
 930       buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
 931       buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
 932       buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
 933       buff4[swid + dx_l + i] = buff4[swid + dx_l - 1];
 934     }
 935 
 936     if ((hgt - dy_b) > 1) sl = sl4 + sll;
 937     else sl = sl4;
 938 
 939     for (j = 0; j < hgt; j++) {
 940       d64_2x32 dd;
 941 
 942       /*
 943        *  First loop
 944        */
 945       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
 946       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
 947 
 948       sp = sl;
 949       dp = dl;
 950 
 951       p02 = buff0[0];
 952       p12 = buff1[0];
 953       p03 = buff0[1];
 954       p13 = buff1[1];
 955       p04 = buff0[2];
 956       p14 = buff1[2];
 957 
 958 #ifdef __SUNPRO_C
 959 #pragma pipeloop(0)
 960 #endif /* __SUNPRO_C */
 961       for (i = 0; i <= (wid - 2); i += 2) {
 962         p00 = p02; p10 = p12;
 963         p01 = p03; p11 = p13;
 964         p02 = p04; p12 = p14;
 965 
 966         LOAD_BUFF(buffi);
 967 
 968         p03 = buff0[i + 3]; p13 = buff1[i + 3];
 969         p04 = buff0[i + 4]; p14 = buff1[i + 4];
 970         p05 = buff0[i + 5]; p15 = buff1[i + 5];
 971 
 972         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
 973                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
 974         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
 975                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
 976 
 977         sp += chan2;
 978       }
 979 
 980       /*
 981        *  Second loop
 982        */
 983       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
 984       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
 985 
 986       p02 = buff2[0];
 987       p12 = buff3[0];
 988       p03 = buff2[1];
 989       p13 = buff3[1];
 990 
 991 #ifdef __SUNPRO_C
 992 #pragma pipeloop(0)
 993 #endif /* __SUNPRO_C */
 994       for (i = 0; i <= (wid - 2); i += 2) {
 995         p00 = p02; p10 = p12;
 996         p01 = p03; p11 = p13;
 997 
 998         p02 = buff2[i + 2]; p12 = buff3[i + 2];
 999         p03 = buff2[i + 3]; p13 = buff3[i + 3];
1000         p04 = buff2[i + 4]; p14 = buff3[i + 4];
1001         p05 = buff2[i + 5]; p15 = buff3[i + 5];
1002 
1003         dd.d64 = *(FTYPE   *)(buffi + i);
1004         buff5[i + dx_l    ] = (FTYPE)dd.i32s.i0;
1005         buff5[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
1006 
1007         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1008                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1009         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1010                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1011       }
1012 
1013       /*
1014        *  3 loop
1015        */
1016       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1017 
1018       p02 = buff4[0];
1019       p03 = buff4[1];
1020       p04 = buff4[2];
1021       p05 = buff4[3];
1022 
1023 #ifdef __SUNPRO_C
1024 #pragma pipeloop(0)
1025 #endif /* __SUNPRO_C */
1026       for (i = 0; i <= (wid - 2); i += 2) {
1027         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1028 
1029         p04 = buff4[i + 4]; p05 = buff4[i + 5];
1030 
1031         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
1032         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
1033 
1034         dp[0    ] = FROM_S32(d0);
1035         dp[chan1] = FROM_S32(d1);
1036 
1037         dp += chan2;
1038       }
1039 
1040       /* last pixels */
1041       for (; i < wid; i++) {
1042         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
1043         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
1044         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
1045         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
1046         p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
1047 
1048         p40 = buff4[i];     p41 = buff4[i + 1]; p42 = buff4[i + 2];
1049         p43 = buff4[i + 3]; p44 = buff4[i + 4];
1050 
1051         buff5[i + dx_l] = (FTYPE)sp[0];
1052 
1053         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
1054                        p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
1055                        p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
1056                        p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
1057                        p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
1058 
1059         dp[0] = FROM_S32(buffo[i]);
1060 
1061         sp += chan1;
1062         dp += chan1;
1063       }
1064 
1065       for (; i < swid; i++) {
1066         buff5[i + dx_l] = (FTYPE)sp[0];
1067         sp += chan1;
1068       }
1069 
1070       for (i = 0; i < dx_l; i++) buff5[i] = buff5[dx_l];
1071       for (i = 0; i < dx_r; i++) buff5[swid + dx_l + i] = buff5[swid + dx_l - 1];
1072 
1073       /* next line */
1074 
1075       if (j < hgt - dy_b - 2) sl += sll;
1076       dl += dll;
1077 
1078       buffT = buff0;
1079       buff0 = buff1;
1080       buff1 = buff2;
1081       buff2 = buff3;
1082       buff3 = buff4;
1083       buff4 = buff5;
1084       buff5 = buffT;
1085     }
1086   }
1087 
1088   if (pbuff != buff) mlib_free(pbuff);
1089 
1090   return MLIB_SUCCESS;
1091 }
1092 
1093 /***************************************************************/
1094 #ifndef __sparc /* for x86, using integer multiplies is faster */
1095 
/*
 * Integer-arithmetic 5x5 convolution, "ext" variant (name and parameter list
 * are generated by the CONV_FUNC_I / PARAM macros).
 *
 * Parameters (from PARAM):
 *   dst, src               - destination and source images
 *   dx_l, dx_r, dy_t, dy_b - left / right / top / bottom edge extents; they
 *                            decide where a source sample or source row is
 *                            read again instead of advancing to the next one
 *   kern                   - KSIZE*KSIZE integer kernel coefficients
 *   scalef_expon           - total right shift applied to the weighted sum
 *   cmask                  - channel mask; bit (nchannel - 1 - c) enables
 *                            channel c
 *
 * Each output row is produced in three passes over a per-row accumulator
 * (buffd): kernel rows 0-1, kernel rows 2-3, then kernel row 4 followed by
 * the final shift and CLAMP_STORE.
 *
 * Returns MLIB_FAILURE if the accumulator cannot be allocated,
 * MLIB_SUCCESS otherwise.
 */
1096 mlib_status CONV_FUNC_I(5x5)
1097 {
1098   mlib_s32 buff[BUFF_LINE];
1099   mlib_s32 *buffd;
1100   mlib_s32 k[KSIZE*KSIZE];
1101   mlib_s32 shift1, shift2;
1102   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
1103   mlib_s32 p00, p01, p02, p03, p04, p05,
1104            p10, p11, p12, p13, p14, p15;
1105   DTYPE    *adr_src, *sl, *sp0, *sp1, *sp2, *sp3, *sp4;
1106   DTYPE    *sp_1, *sp_2, *sp_3, *sp_4;
1107   DTYPE    *adr_dst, *dl, *dp;
1108   mlib_s32 *pbuff = buff;
1109   mlib_s32 wid, hgt, sll, dll;
1110   mlib_s32 nchannel, chan1, chan2, chan4;
1111   mlib_s32 delta_chan1, delta_chan2, delta_chan3;
1112   mlib_s32 i, j, c;
1113 
       /* shift1: number of low bits dropped from every kernel coefficient
          before multiplying (16 unless IMG_TYPE == 1, then 8). */
1114 #if IMG_TYPE != 1
1115   shift1 = 16;
1116 #else
1117   shift1 = 8;
1118 #endif /* IMG_TYPE != 1 */
1119 
       /* shift2: the rest of scalef_expon, applied to the accumulated sum. */
1120   shift2 = scalef_expon - shift1;
1121 
1122   for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;
1123 
1124   GET_SRC_DST_PARAMETERS(DTYPE);
1125 
       /* buffd needs one mlib_s32 per output column; use the heap when the
          row is wider than the on-stack buffer. */
1126   if (wid > BUFF_LINE) {
1127     pbuff = mlib_malloc(sizeof(mlib_s32)*wid);
1128 
1129     if (pbuff == NULL) return MLIB_FAILURE;
1130   }
1131 
1132   buffd = pbuff;
1133 
1134   chan1 = nchannel;
1135   chan2 = chan1 + chan1;
1136 
       /* delta_chanN is the offset of the N-th sample of a row relative to
          its first sample. It grows by one pixel (chan1) only while N lies
          strictly between dx_l and wid + KSIZE1 - dx_r; otherwise the
          previous offset is kept, so the same sample is read again. */
1137   if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan1 = chan1;
1138   else delta_chan1 = 0;
1139 
1140   if ((2 > dx_l) && (2 < wid + KSIZE1 - dx_r)) delta_chan2 = delta_chan1 + chan1;
1141   else delta_chan2 = delta_chan1;
1142 
1143   if ((3 > dx_l) && (3 < wid + KSIZE1 - dx_r)) delta_chan3 = delta_chan2 + chan1;
1144   else delta_chan3 = delta_chan2;
1145 
       /* Offset of the sample that follows the four preloaded ones; the
          column loops start reading here. */
1146   chan4 = chan1 + delta_chan3;
1147 
1148   for (c = 0; c < chan1; c++) {
         /* Skip channels whose bit is not set in cmask. */
1149     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1150 
1151     sl = adr_src + c;
1152     dl = adr_dst + c;
1153 
         /* Prime the source-row pointers. A pointer moves down one row only
            when its row index lies strictly between dy_t and
            hgt + KSIZE1 - dy_b; otherwise it repeats the previous row. */
1154     sp_1 = sl;
1155 
1156     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll;
1157     sp_2 = sl;
1158 
1159     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl += sll;
1160     sp_3 = sl;
1161 
1162     if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl += sll;
1163     sp_4 = sl;
1164 
1165     if ((hgt - dy_b) > 0) sl += sll;
1166 
1167     for (j = 0; j < hgt; j++) {
1168       mlib_s32 pix0, pix1;
1169 
           /* Slide the five-row window down by one row: sp0..sp4 are the
              source rows feeding this output row, sl is the newest one. */
1170       dp  = dl;
1171       sp0 = sp_1;
1172       sp_1 = sp_2;
1173       sp_2 = sp_3;
1174       sp_3 = sp_4;
1175       sp_4 = sl;
1176 
1177       sp1 = sp_1;
1178       sp2 = sp_2;
1179       sp3 = sp_3;
1180       sp4 = sp_4;
1181 
1182       /*
1183        *  First loop
1184        */
           /* Kernel rows 0 and 1 (k[0..9]): initialise buffd[] with their
              partial sums. */
1185 
1186       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
1187       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
1188 
           /* Preload the first four samples of both rows. */
1189       p02 = sp0[0];           p12 = sp1[0];
1190       p03 = sp0[delta_chan1]; p13 = sp1[delta_chan1];
1191       p04 = sp0[delta_chan2]; p14 = sp1[delta_chan2];
1192       p05 = sp0[delta_chan3]; p15 = sp1[delta_chan3];
1193 
1194       sp0 += chan4;
1195       sp1 += chan4;
1196 
           /* Main loop: two output columns per iteration. */
1197 #ifdef __SUNPRO_C
1198 #pragma pipeloop(0)
1199 #endif /* __SUNPRO_C */
1200       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1201         p00 = p02; p10 = p12;
1202         p01 = p03; p11 = p13;
1203         p02 = p04; p12 = p14;
1204         p03 = p05; p13 = p15;
1205 
1206         p04 = sp0[0];     p14 = sp1[0];
1207         p05 = sp0[chan1]; p15 = sp1[chan1];
1208 
1209         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1210                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1211         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1212                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1213 
1214         sp0 += chan2;
1215         sp1 += chan2;
1216       }
1217 
           /* Re-align the window so the single-column loops below can shift
              it by one sample per iteration. */
1218       p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1219       p11 = p12; p12 = p13; p13 = p14; p14 = p15;
1220 
           /* Leftover column before the right-edge region. */
1221       for (; i < wid - dx_r; i++) {
1222         p00 = p01; p10 = p11;
1223         p01 = p02; p11 = p12;
1224         p02 = p03; p12 = p13;
1225         p03 = p04; p13 = p14;
1226 
1227         p04 = sp0[0];     p14 = sp1[0];
1228 
1229         buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1230                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1231 
1232         sp0 += chan1;
1233         sp1 += chan1;
1234       }
1235 
           /* Step back to the most recently read sample; the remaining
              columns keep re-reading it because the pointers are no longer
              advanced. */
1236       sp0 -= chan1;
1237       sp1 -= chan1;
1238 
1239       for (; i < wid; i++) {
1240         p00 = p01; p10 = p11;
1241         p01 = p02; p11 = p12;
1242         p02 = p03; p12 = p13;
1243         p03 = p04; p13 = p14;
1244 
1245         p04 = sp0[0];     p14 = sp1[0];
1246 
1247         buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1248                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1249       }
1250 
1251       /*
1252        *  Second loop
1253        */
           /* Kernel rows 2 and 3 (k[10..19]): same traversal as above, but
              the partial sums are added to buffd[]. */
1254 
1255       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
1256       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
1257 
1258       p02 = sp2[0];           p12 = sp3[0];
1259       p03 = sp2[delta_chan1]; p13 = sp3[delta_chan1];
1260       p04 = sp2[delta_chan2]; p14 = sp3[delta_chan2];
1261       p05 = sp2[delta_chan3]; p15 = sp3[delta_chan3];
1262 
1263       sp2 += chan4;
1264       sp3 += chan4;
1265 
1266 #ifdef __SUNPRO_C
1267 #pragma pipeloop(0)
1268 #endif /* __SUNPRO_C */
1269       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1270         p00 = p02; p10 = p12;
1271         p01 = p03; p11 = p13;
1272         p02 = p04; p12 = p14;
1273         p03 = p05; p13 = p15;
1274 
1275         p04 = sp2[0];     p14 = sp3[0];
1276         p05 = sp2[chan1]; p15 = sp3[chan1];
1277 
1278         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1279                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1280         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1281                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1282 
1283         sp2 += chan2;
1284         sp3 += chan2;
1285       }
1286 
1287       p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1288       p11 = p12; p12 = p13; p13 = p14; p14 = p15;
1289 
1290       for (; i < wid - dx_r; i++) {
1291         p00 = p01; p10 = p11;
1292         p01 = p02; p11 = p12;
1293         p02 = p03; p12 = p13;
1294         p03 = p04; p13 = p14;
1295 
1296         p04 = sp2[0];     p14 = sp3[0];
1297 
1298         buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1299                      p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1300 
1301         sp2 += chan1;
1302         sp3 += chan1;
1303       }
1304 
1305       sp2 -= chan1;
1306       sp3 -= chan1;
1307 
1308       for (; i < wid; i++) {
1309         p00 = p01; p10 = p11;
1310         p01 = p02; p11 = p12;
1311         p02 = p03; p12 = p13;
1312         p03 = p04; p13 = p14;
1313 
1314         p04 = sp2[0];     p14 = sp3[0];
1315 
1316         buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1317                      p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1318       }
1319 
1320       /*
1321        *  Third loop
1322        */
           /* Kernel row 4 (k[20..24]): add the last row to the accumulated
              sum, shift right by shift2 and store through CLAMP_STORE. */
1323 
1324       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1325 
1326       p02 = sp4[0];
1327       p03 = sp4[delta_chan1];
1328       p04 = sp4[delta_chan2];
1329       p05 = sp4[delta_chan3];
1330 
1331       sp4 += chan4;
1332 
1333 #ifdef __SUNPRO_C
1334 #pragma pipeloop(0)
1335 #endif /* __SUNPRO_C */
1336       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1337         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1338 
1339         p04 = sp4[0]; p05 = sp4[chan1];
1340 
1341         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
1342                 p03 * k3 + p04 * k4) >> shift2;
1343         pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
1344                 p04 * k3 + p05 * k4) >> shift2;
1345 
1346         CLAMP_STORE(dp[0],     pix0);
1347         CLAMP_STORE(dp[chan1], pix1);
1348 
1349         dp  += chan2;
1350         sp4 += chan2;
1351       }
1352 
1353       p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1354 
1355       for (; i < wid - dx_r; i++) {
1356         p00 = p01; p01 = p02; p02 = p03; p03 = p04;
1357 
1358         p04 = sp4[0];
1359 
1360         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
1361                 p03 * k3 + p04 * k4) >> shift2;
1362         CLAMP_STORE(dp[0],     pix0);
1363 
1364         dp  += chan1;
1365         sp4 += chan1;
1366       }
1367 
1368       sp4 -= chan1;
1369 
1370       for (; i < wid; i++) {
1371         p00 = p01; p01 = p02; p02 = p03; p03 = p04;
1372 
1373         p04 = sp4[0];
1374 
1375         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
1376                 p03 * k3 + p04 * k4) >> shift2;
1377         CLAMP_STORE(dp[0],     pix0);
1378 
1379         dp  += chan1;
1380       }
1381 
1382       /* next line */
1383 
           /* sl stops advancing for the last dy_b + 1 output rows, so those
              rows receive the same source row again. */
1384       if (j < hgt - dy_b - 1) sl += sll;
1385       dl += dll;
1386     }
1387   }
1388 
       /* Release the accumulator only if it was heap-allocated. */
1389   if (pbuff != buff) mlib_free(pbuff);
1390 
1391   return MLIB_SUCCESS;
1392 }
1393 
1394 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1395 
1396 /***************************************************************/
1397 #if IMG_TYPE == 1
1398 
1399 #undef  KSIZE
1400 #define KSIZE 7
1401 
/*
 * Floating-point 7x7 convolution, "ext" variant (name and parameter list are
 * generated by the CONV_FUNC / PARAM macros).
 *
 * Parameters (from PARAM):
 *   dst, src               - destination and source images
 *   dx_l, dx_r, dy_t, dy_b - left / right / top / bottom edge extents; the
 *                            row buffers are padded by repeating the first
 *                            and last loaded sample, and source rows are
 *                            repeated at the top and bottom
 *   kern, scalef_expon     - kernel and scale; consumed by LOAD_KERNEL, which
 *                            is defined outside this block (presumably it
 *                            fills k[] - confirm against its definition)
 *   cmask                  - channel mask; bit (nchannel - 1 - c) enables
 *                            channel c
 *
 * Work area layout (all offsets are multiples of the padded width
 * swid = wid + KSIZE1):
 *   buffs[0..KSIZE]  - KSIZE + 1 row buffers used as a ring
 *   buffd            - per-column accumulator
 *   buffo, buffi     - mlib_s32 scratch placed after buffd
 *
 * Returns MLIB_FAILURE if the work area cannot be allocated,
 * MLIB_SUCCESS otherwise.
 */
1402 mlib_status CONV_FUNC(7x7)
1403 {
1404   FTYPE    buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
1405   FTYPE    k[KSIZE*KSIZE];
1406   mlib_s32 l, m, buff_ind;
1407   mlib_s32 d0, d1;
1408   FTYPE    k0, k1, k2, k3, k4, k5, k6;
1409   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
1410   DTYPE *sl2, *sl3, *sl4, *sl5, *sl6;
1411   DEF_VARS(DTYPE);
1412   LOAD_KERNEL(KSIZE*KSIZE);
1413   GET_SRC_DST_PARAMETERS(DTYPE);
1414 
       /* Padded row length: wid output columns need wid + KSIZE1 samples. */
1415   swid = wid + KSIZE1;
1416 
       /* Every sub-buffer below is laid out in strides of swid, so both the
          heap-fallback test and the allocation size must use swid. Sizing
          them by wid lets the row buffers run past the end of the work
          area. */
1417   if (swid > BUFF_LINE) {
1418     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE   )*swid);
1419 
1420     if (pbuff == NULL) return MLIB_FAILURE;
1421   }
1422 
       /* The row-buffer pointers are stored twice so that buffs + buff_ind
          can always be indexed 0..KSIZE without wrapping. */
1423   for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*swid;
1424   for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
1425   buffd = buffs[KSIZE] + swid;
1426   buffo = (mlib_s32*)(buffd + swid);
1427   buffi = buffo + (swid &~ 1);
1428 
       /* From here on swid is the number of samples actually read from a
          source row; the dx_l + dx_r padding samples are synthesised. */
1429   swid -= (dx_l + dx_r);
1430 
1431   chan1 = nchannel;
1432   chan2 = chan1 + chan1;
1433 
1434   for (c = 0; c < nchannel; c++) {
         /* Skip channels whose bit is not set in cmask. */
1435     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1436 
1437     sl = adr_src + c;
1438     dl = adr_dst + c;
1439 
         /* Pick the first seven source rows. A row pointer moves down only
            when its row index lies strictly between dy_t and
            hgt + KSIZE1 - dy_b; otherwise it repeats the previous row. */
1440     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
1441     else sl1 = sl;
1442 
1443     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
1444     else sl2 = sl1;
1445 
1446     if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll;
1447     else sl3 = sl2;
1448 
1449     if ((4 > dy_t) && (4 < hgt + KSIZE1 - dy_b)) sl4 = sl3 + sll;
1450     else sl4 = sl3;
1451 
1452     if ((5 > dy_t) && (5 < hgt + KSIZE1 - dy_b)) sl5 = sl4 + sll;
1453     else sl5 = sl4;
1454 
1455     if ((hgt - dy_b) > 0) sl6 = sl5 + sll;
1456     else sl6 = sl5;
1457 
         /* Left padding: repeat the first sample of each row dx_l times. */
1458     for (i = 0; i < dx_l; i++) {
1459       buffs[0][i] = (FTYPE)sl[0];
1460       buffs[1][i] = (FTYPE)sl1[0];
1461       buffs[2][i] = (FTYPE)sl2[0];
1462       buffs[3][i] = (FTYPE)sl3[0];
1463       buffs[4][i] = (FTYPE)sl4[0];
1464       buffs[5][i] = (FTYPE)sl5[0];
1465       buffs[6][i] = (FTYPE)sl6[0];
1466     }
1467 
         /* Copy the samples of this channel into the row buffers. */
1468 #ifdef __SUNPRO_C
1469 #pragma pipeloop(0)
1470 #endif /* __SUNPRO_C */
1471     for (i = 0; i < swid; i++) {
1472       buffs[0][i + dx_l] = (FTYPE)sl[i*chan1];
1473       buffs[1][i + dx_l] = (FTYPE)sl1[i*chan1];
1474       buffs[2][i + dx_l] = (FTYPE)sl2[i*chan1];
1475       buffs[3][i + dx_l] = (FTYPE)sl3[i*chan1];
1476       buffs[4][i + dx_l] = (FTYPE)sl4[i*chan1];
1477       buffs[5][i + dx_l] = (FTYPE)sl5[i*chan1];
1478       buffs[6][i + dx_l] = (FTYPE)sl6[i*chan1];
1479     }
1480 
         /* Right padding: repeat the last loaded sample dx_r times. */
1481     for (i = 0; i < dx_r; i++) {
1482       buffs[0][swid + dx_l + i] = buffs[0][swid + dx_l - 1];
1483       buffs[1][swid + dx_l + i] = buffs[1][swid + dx_l - 1];
1484       buffs[2][swid + dx_l + i] = buffs[2][swid + dx_l - 1];
1485       buffs[3][swid + dx_l + i] = buffs[3][swid + dx_l - 1];
1486       buffs[4][swid + dx_l + i] = buffs[4][swid + dx_l - 1];
1487       buffs[5][swid + dx_l + i] = buffs[5][swid + dx_l - 1];
1488       buffs[6][swid + dx_l + i] = buffs[6][swid + dx_l - 1];
1489     }
1490 
1491     buff_ind = 0;
1492 
1493 #ifdef __SUNPRO_C
1494 #pragma pipeloop(0)
1495 #endif /* __SUNPRO_C */
1496     for (i = 0; i < wid; i++) buffd[i] = 0.0;
1497 
         /* sl now addresses the next source row to be loaded into the ring. */
1498     if ((hgt - dy_b) > 1) sl = sl6 + sll;
1499     else sl = sl6;
1500 
1501     for (j = 0; j < hgt; j++) {
           /* buffc: the KSIZE row buffers feeding this output row;
              buffn: the spare buffer that receives the next source row. */
1502       FTYPE    **buffc = buffs + buff_ind;
1503       FTYPE    *buffn = buffc[KSIZE];
1504       FTYPE    *pk = k;
1505 
1506       for (l = 0; l < KSIZE; l++) {
1507         FTYPE    *buff = buffc[l];
1508         d64_2x32 dd;
1509 
1510         sp = sl;
1511         dp = dl;
1512 
1513         p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1514         p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1515 
1516         k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
1517         k4 = *pk++; k5 = *pk++; k6 = *pk++;
1518 
1519         if (l < (KSIZE - 1)) {
               /* Kernel rows 0..KSIZE-2: accumulate into buffd[], two
                  columns per iteration. */
1520 #ifdef __SUNPRO_C
1521 #pragma pipeloop(0)
1522 #endif /* __SUNPRO_C */
1523           for (i = 0; i <= (wid - 2); i += 2) {
1524             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1525 
1526             p6 = buff[i + 6]; p7 = buff[i + 7];
1527 
1528             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1529             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1530           }
1531 
1532         } else {
               /* Last kernel row: finish the sum, store the result, clear
                  buffd[] for the next output row, and load two samples of
                  the next source row into buffn through the buffi scratch. */
1533 #ifdef __SUNPRO_C
1534 #pragma pipeloop(0)
1535 #endif /* __SUNPRO_C */
1536           for (i = 0; i <= (wid - 2); i += 2) {
1537             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1538 
1539             p6 = buff[i + 6]; p7 = buff[i + 7];
1540 
1541             LOAD_BUFF(buffi);
1542 
1543             dd.d64 = *(FTYPE   *)(buffi + i);
1544             buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
1545             buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
1546 
1547             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1548             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1549 
1550             dp[0    ] = FROM_S32(d0);
1551             dp[chan1] = FROM_S32(d1);
1552 
1553             buffd[i    ] = 0.0;
1554             buffd[i + 1] = 0.0;
1555 
1556             sp += chan2;
1557             dp += chan2;
1558           }
1559         }
1560       }
1561 
1562       /* last pixels */
           /* Odd trailing column: i carries over from the paired loop above;
              the full KSIZE x KSIZE sum is evaluated directly. */
1563       for (; i < wid; i++) {
1564         FTYPE    *pk = k, s = 0;
1565         mlib_s32 d0;
1566 
1567         for (l = 0; l < KSIZE; l++) {
1568           FTYPE    *buff = buffc[l] + i;
1569 
1570           for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
1571         }
1572 
1573         d0 = D2I(s);
1574         dp[0] = FROM_S32(d0);
1575 
1576         buffn[i + dx_l] = (FTYPE)sp[0];
1577 
1578         sp += chan1;
1579         dp += chan1;
1580       }
1581 
           /* Load the rest of the next source row beyond the output width. */
1582       for (; i < swid; i++) {
1583         buffn[i + dx_l] = (FTYPE)sp[0];
1584         sp += chan1;
1585       }
1586 
           /* Pad the freshly loaded row on both sides. */
1587       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1588       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1589 
1590       /* next line */
1591 
           /* sl stops advancing near the bottom edge, so the last source row
              is loaded again for the remaining output rows. */
1592       if (j < hgt - dy_b - 2) sl += sll;
1593       dl += dll;
1594 
           /* Rotate the ring of row buffers by one. */
1595       buff_ind++;
1596 
1597       if (buff_ind >= KSIZE + 1) buff_ind = 0;
1598     }
1599   }
1600 
       /* Release the work area only if it was heap-allocated. */
1601   if (pbuff != buff) mlib_free(pbuff);
1602 
1603   return MLIB_SUCCESS;
1604 }
1605 
1606 #endif /* IMG_TYPE == 1 */
1607 
1608 /***************************************************************/
1609 #define MAX_KER   7
1610 #define MAX_N    15
1611 #define BUFF_SIZE   1600
1612 #define CACHE_SIZE  (64*1024)
1613 
1614 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
1615                                          const mlib_image *src,
1616                                          const mlib_d64   *k,
1617                                          mlib_s32         n,
1618                                          mlib_s32         dy_t,
1619                                          mlib_s32         dy_b,
1620                                          mlib_s32         cmask)
1621 {
1622   DTYPE    *adr_src, *sl;
1623   DTYPE    *adr_dst, *dl, *dp;
1624   FTYPE    buff[BUFF_SIZE];
1625   FTYPE    *buffd;
1626   FTYPE    *pbuff = buff;




  63 #define DSCALE            65536.0
  64 #define FROM_S32(x)       ((x) >> 16)
  65 #define S64TOS32(x)       ((x) & 0xffffffff)
  66 #define SAT_OFF
  67 
  68 #elif IMG_TYPE == 3
  69 
  70 #define DTYPE             mlib_u16
  71 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
  72 #define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
  73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
  74 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
  75 #define DSCALE            65536.0
  76 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
  77 #define S64TOS32(x)       (x)
  78 #define SAT_OFF           -(1u << 31)
  79 
  80 #endif /* IMG_TYPE == 1 */
  81 
  82 /***************************************************************/



  /* Formal parameter list shared by the fixed-size convolution entry points;
     it is pasted into the prototypes produced by CONV_FUNC and CONV_FUNC_I. */
  83 #define PARAM                                                   \
  84   mlib_image       *dst,                                        \
  85   const mlib_image *src,                                        \
  86   mlib_s32         dx_l,                                        \
  87   mlib_s32         dx_r,                                        \
  88   mlib_s32         dy_t,                                        \
  89   mlib_s32         dy_b,                                        \
  90   const mlib_s32   *kern,                                       \
  91   mlib_s32         scalef_expon,                                \
  92   mlib_s32         cmask
  93 
  94 /***************************************************************/
  95 #define PARAM_MxN                                               \
  96   mlib_image       *dst,                                        \
  97   const mlib_image *src,                                        \
  98   const mlib_s32   *kernel,                                     \
  99   mlib_s32         m,                                           \
 100   mlib_s32         n,                                           \
 101   mlib_s32         dx_l,                                        \
 102   mlib_s32         dx_r,                                        \


 /* Copies two consecutive samples of the current channel (sp[0] and
    sp[chan1]) into buff[i] and buff[i + 1] with two separate stores.
    Relies on i, sp and chan1 being in scope at the point of use. */
 143 #define LOAD_BUFF(buff)                                         \
 144   buff[i    ] = sp[0];                                          \
 145   buff[i + 1] = sp[chan1]
 146 
 147 #else /* _NO_LONGLONG */
 148 
 149 #ifdef _LITTLE_ENDIAN
 150 
 /* Little-endian variant: packs sp[0] into the low 32 bits and sp[chan1]
    into the high 32 bits of one 64-bit store at buff + i. Relies on i, sp
    and chan1 being in scope.
    NOTE(review): this assumes buff points to 32-bit elements and that
    buff + i is suitably aligned for mlib_s64 - confirm at the call sites. */
 151 #define LOAD_BUFF(buff)                                         \
 152   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
 153 
 154 #else /* _LITTLE_ENDIAN */
 155 
 /* Big-endian variant: packs sp[0] into the high 32 bits and sp[chan1] into
    the low 32 bits of one 64-bit store at buff + i. Relies on i, sp and
    chan1 being in scope.
    NOTE(review): this assumes buff points to 32-bit elements and that
    buff + i is suitably aligned for mlib_s64 - confirm at the call sites. */
 156 #define LOAD_BUFF(buff)                                         \
 157   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
 158 
 159 #endif /* _LITTLE_ENDIAN */
 160 #endif /* _NO_LONGLONG */
 161 
 162 /***************************************************************/



 /* Overlays one mlib_d64 with a pair of mlib_s32 values, so that a single
    64-bit load (d64) can be taken apart into two 32-bit integers (i0, i1). */
 163 typedef union {
 164   mlib_d64 d64;
 165   struct {
 166     mlib_s32 i0;
 167     mlib_s32 i1;
 168   } i32s;
 169 } d64_2x32;
 170 
 171 /***************************************************************/














































 /* Fetches the image geometry into locals that the expanding function must
    already declare: hgt, wid and nchannel are read from src; sll and dll
    are the src and dst strides divided by sizeof(type), i.e. expressed in
    elements of `type`; adr_src and adr_dst are the data pointers cast to
    type *. Note that height, width and channel count are taken from src
    only. */
 172 #define GET_SRC_DST_PARAMETERS(type)                            \
 173   hgt = mlib_ImageGetHeight(src);                               \
 174   wid = mlib_ImageGetWidth(src);                                \
 175   nchannel = mlib_ImageGetChannels(src);                        \
 176   sll = mlib_ImageGetStride(src) / sizeof(type);                \
 177   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
 178   adr_src = (type *)mlib_ImageGetData(src);                     \
 179   adr_dst = (type *)mlib_ImageGetData(dst)
 180 
 181 /***************************************************************/
 182 #ifndef __sparc
 183 #if IMG_TYPE == 1
 184 
 185 /*
 186  * Test for the presence of any "1" bit in bits
 187    8 to 31 of val. If present, then val is either
 188    negative or >255. If over/underflows of 8 bits
 189    are uncommon, then this technique can be a win,
 190    since only a single test, rather than two, is
 191    necessary to determine if clamping is needed.


 /* Stores val into dst saturated to [MLIB_S16_MIN, MLIB_S16_MAX]; in-range
    values are converted with a cast to mlib_s16.
    The macro expands to a bare if / else-if / else chain and evaluates val
    up to three times, so use it only as a complete statement and pass a
    side-effect-free expression. */
 207 #define CLAMP_STORE(dst, val)                                   \
 208   if (val >= MLIB_S16_MAX)                                      \
 209     dst = MLIB_S16_MAX;                                         \
 210   else if (val <= MLIB_S16_MIN)                                 \
 211     dst = MLIB_S16_MIN;                                         \
 212   else                                                          \
 213     dst = (mlib_s16)val
 214 
 215 #elif IMG_TYPE == 3
 216 
 /* Stores val into dst saturated to [MLIB_U16_MIN, MLIB_U16_MAX]; in-range
    values are converted with a cast to mlib_u16.
    The macro expands to a bare if / else-if / else chain and evaluates val
    up to three times, so use it only as a complete statement and pass a
    side-effect-free expression. */
 217 #define CLAMP_STORE(dst, val)                                   \
 218   if (val >= MLIB_U16_MAX)                                      \
 219     dst = MLIB_U16_MAX;                                         \
 220   else if (val <= MLIB_U16_MIN)                                 \
 221     dst = MLIB_U16_MIN;                                         \
 222   else                                                          \
 223     dst = (mlib_u16)val
 224 
 225 #endif /* IMG_TYPE == 1 */
 226 #endif /* __sparc */
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































 227 
 228 /***************************************************************/
 229 #define MAX_KER   7
 230 #define MAX_N    15
 231 #define BUFF_SIZE   1600
 232 #define CACHE_SIZE  (64*1024)
 233 
 234 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
 235                                          const mlib_image *src,
 236                                          const mlib_d64   *k,
 237                                          mlib_s32         n,
 238                                          mlib_s32         dy_t,
 239                                          mlib_s32         dy_b,
 240                                          mlib_s32         cmask)
 241 {
 242   DTYPE    *adr_src, *sl;
 243   DTYPE    *adr_dst, *dl, *dp;
 244   FTYPE    buff[BUFF_SIZE];
 245   FTYPE    *buffd;
 246   FTYPE    *pbuff = buff;


< prev index next >