1 /*
   2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
  30  *   MLIB_EDGE_SRC_EXTEND mask
  31  */
  32 
  33 #include "mlib_image.h"
  34 #include "mlib_ImageConv.h"
  35 #include "mlib_c_ImageConv.h"
  36 
  37 /*
  38  * This define switches between functions of different data types
  39  */
  40 
/*
 * Element type selector for this translation unit:
 *   1 - mlib_u8, 2 - mlib_s16, 3 - mlib_u16 (see the #if chain below).
 */
#define IMG_TYPE 3

/***************************************************************/
/*
 * Each branch below defines the same set of names for one element type:
 *
 *   DTYPE            element type of the source/destination pixels
 *   CONV_FUNC(KERN)  name + parameter list (PARAM) of a fixed-size
 *   CONV_FUNC_I(KERN)  kernel entry point; KERN is pasted into the name
 *   CONV_FUNC_MxN    name + parameter list (PARAM_MxN) of the general
 *   CONV_FUNC_MxN_I    M x N entry points
 *   DSCALE           initial value of the floating-point kernel scale
 *                    factor (2^24 for 8-bit data, 2^16 for 16-bit data)
 *   FROM_S32(x)      turns the clamped 32-bit result back into a pixel:
 *                    shifts out the DSCALE fraction bits and, for the
 *                    unsigned types, flips the top bit of the pixel
 *   S64TOS32(x)      applied to the low 32-bit half in LOAD_BUFF; for
 *                    mlib_s16 it masks off the sign extension, for the
 *                    unsigned types it is the identity
 *   SAT_OFF          text appended to the argument of D2I; for the
 *                    unsigned types it subtracts 2^31 (as unsigned
 *                    arithmetic -(1u << 31) equals 2147483648u) so the
 *                    value is centred on the mlib_s32 range before
 *                    clamping, and the top-bit flip in FROM_S32 undoes
 *                    that offset; empty for mlib_s16
 */
#if IMG_TYPE == 1

#define DTYPE             mlib_u8
#define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
#define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
#define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
#define DSCALE            (1 << 24)
#define FROM_S32(x)       (((x) >> 24) ^ 128)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#elif IMG_TYPE == 2

#define DTYPE             mlib_s16
#define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
#define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
#define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
#define DSCALE            65536.0
#define FROM_S32(x)       ((x) >> 16)
#define S64TOS32(x)       ((x) & 0xffffffff)
#define SAT_OFF

#elif IMG_TYPE == 3

#define DTYPE             mlib_u16
#define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
#define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
#define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
#define DSCALE            65536.0
#define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#endif /* IMG_TYPE == 1 */
  81 
  82 /***************************************************************/
/*
 * Parameter list shared by the fixed-size kernel entry points
 * (pasted in through CONV_FUNC / CONV_FUNC_I):
 *   dst, src             destination and source images
 *   dx_l, dx_r           edge widths on the left / right
 *   dy_t, dy_b           edge heights on the top / bottom
 *   kern                 integer kernel coefficients
 *   scalef_expon         scaling exponent of the kernel
 *   cmask                channel selection mask
 */
#define PARAM                                                   \
  mlib_image       *dst,                                        \
  const mlib_image *src,                                        \
  mlib_s32         dx_l,                                        \
  mlib_s32         dx_r,                                        \
  mlib_s32         dy_t,                                        \
  mlib_s32         dy_b,                                        \
  const mlib_s32   *kern,                                       \
  mlib_s32         scalef_expon,                                \
  mlib_s32         cmask

/***************************************************************/
/*
 * Parameter list of the general M x N entry points
 * (pasted in through CONV_FUNC_MxN / CONV_FUNC_MxN_I):
 *   dst, src             destination and source images
 *   kernel               m*n integer kernel coefficients, row by row
 *   m, n                 kernel width and height
 *   dx_l, dx_r           edge widths on the left / right
 *   dy_t, dy_b           edge heights on the top / bottom
 *   scale                scaling exponent of the kernel
 *   cmask                channel selection mask
 */
#define PARAM_MxN                                               \
  mlib_image       *dst,                                        \
  const mlib_image *src,                                        \
  const mlib_s32   *kernel,                                     \
  mlib_s32         m,                                           \
  mlib_s32         n,                                           \
  mlib_s32         dx_l,                                        \
  mlib_s32         dx_r,                                        \
  mlib_s32         dy_t,                                        \
  mlib_s32         dy_b,                                        \
  mlib_s32         scale,                                       \
  mlib_s32         cmask
 107 
 108 /***************************************************************/
/* Floating-point type used for kernel coefficients and accumulators. */
#define FTYPE mlib_d64

/*
 * CLAMP_S32(x): convert a floating-point value to mlib_s32.
 * Without MLIB_USE_FTOI_CLAMPING the value is saturated explicitly to
 * [MLIB_S32_MIN, MLIB_S32_MAX] before the cast (note that x is evaluated
 * up to three times).  With MLIB_USE_FTOI_CLAMPING a plain cast is used
 * -- presumably the platform's float-to-int conversion already saturates;
 * confirm for the target before defining it.
 */
#ifndef MLIB_USE_FTOI_CLAMPING

#define CLAMP_S32(x)                                            \
  (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))

#else

#define CLAMP_S32(x) ((mlib_s32)(x))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/*
 * D2I(x): double -> clamped mlib_s32.  SAT_OFF is pasted textually after
 * (x): it is either empty or "-(1u << 31)", in which case 2^31 is
 * subtracted from x before clamping.
 */
#define D2I(x) CLAMP_S32((x) SAT_OFF)
 124 
 125 /***************************************************************/
/*
 * STORE2(res0, res1): store two results to dp[0] and dp[chan1].
 * On little-endian builds the two values are stored in swapped order.
 * Expands to two statements without a trailing semicolon; relies on
 * dp and chan1 being in scope at the point of use.
 */
#ifdef _LITTLE_ENDIAN

#define STORE2(res0, res1)                                      \
  dp[0    ] = res1;                                             \
  dp[chan1] = res0

#else

#define STORE2(res0, res1)                                      \
  dp[0    ] = res0;                                             \
  dp[chan1] = res1

#endif /* _LITTLE_ENDIAN */

/***************************************************************/
/*
 * LOAD_BUFF(buff): copy the two source pixels sp[0] and sp[chan1] into
 * the 32-bit integer buffer at buff[i] and buff[i + 1].  Relies on i, sp
 * and chan1 being in scope at the point of use.
 *
 * With _NO_LONGLONG the two elements are stored separately.  Otherwise
 * both are packed into one mlib_s64 and written with a single 64-bit
 * store; the packing order depends on endianness so that in memory
 * buff[i] still receives sp[0] and buff[i + 1] receives sp[chan1].
 * S64TOS32 is applied to the half that lands in the low 32 bits so that
 * a sign-extended negative value cannot spill into the high half.
 * NOTE(review): the 64-bit variant writes through an mlib_s64 pointer
 * formed from buff + i, so it assumes that address is suitably aligned.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff)                                         \
  buff[i    ] = sp[0];                                          \
  buff[i + 1] = sp[chan1]

#else /* _NO_LONGLONG */

#ifdef _LITTLE_ENDIAN

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])

#else /* _LITTLE_ENDIAN */

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])

#endif /* _LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */
 161 
 162 /***************************************************************/
 163 typedef union {
 164   mlib_d64 d64;
 165   struct {
 166     mlib_s32 i0;
 167     mlib_s32 i1;
 168   } i32s;
 169 } d64_2x32;
 170 
 171 /***************************************************************/
/*
 * DEF_VARS(type): declares the common set of local variables used by the
 * convolution routines (source/destination line and pixel pointers,
 * buffer pointers, image geometry, channel steps and loop counters).
 * pbuff is initialised from a variable named buff, which therefore has
 * to be declared before this macro is expanded.
 */
#define DEF_VARS(type)                                          \
  type     *adr_src, *sl, *sp, *sl1;                            \
  type     *adr_dst, *dl, *dp;                                  \
  FTYPE    *pbuff = buff;                                       \
  mlib_s32 *buffi, *buffo;                                      \
  mlib_s32 wid, hgt, sll, dll;                                  \
  mlib_s32 nchannel, chan1, chan2;                              \
  mlib_s32 i, j, c, swid

/***************************************************************/
/*
 * GET_SRC_DST_PARAMETERS(type): fills the geometry locals from the
 * images named src and dst:
 *   hgt, wid, nchannel  height, width and channel count, all taken
 *                       from src
 *   sll, dll            source / destination strides divided by
 *                       sizeof(type), i.e. expressed in elements
 *                       (assuming mlib_ImageGetStride reports bytes)
 *   adr_src, adr_dst    data pointers cast to type *
 */
#define GET_SRC_DST_PARAMETERS(type)                            \
  hgt = mlib_ImageGetHeight(src);                               \
  wid = mlib_ImageGetWidth(src);                                \
  nchannel = mlib_ImageGetChannels(src);                        \
  sll = mlib_ImageGetStride(src) / sizeof(type);                \
  dll = mlib_ImageGetStride(dst) / sizeof(type);                \
  adr_src = (type *)mlib_ImageGetData(src);                     \
  adr_dst = (type *)mlib_ImageGetData(dst)
 190 
 191 /***************************************************************/
/*
 * CLAMP_STORE(dst, val): saturate the integer val to the range of the
 * element type and store it in dst.  All variants expand to a bare
 * if/else statement (val is evaluated more than once), so they must be
 * used as a complete statement.
 */
#if IMG_TYPE == 1

/*
 * Test for the presence of any "1" bit in bits
   8 to 31 of val. If present, then val is either
   negative or >255. If over/underflows of 8 bits
   are uncommon, then this technique can be a win,
   since only a single test, rather than two, is
   necessary to determine if clamping is needed.
   On the other hand, if over/underflows are common,
   it adds an extra test.
*/
#define CLAMP_STORE(dst, val)                                   \
  if (val & 0xffffff00) {                                       \
    if (val < MLIB_U8_MIN)                                      \
      dst = MLIB_U8_MIN;                                        \
    else                                                        \
      dst = MLIB_U8_MAX;                                        \
  } else {                                                      \
    dst = (mlib_u8)val;                                         \
  }

#elif IMG_TYPE == 2

/* Signed 16-bit: two-sided range test against MLIB_S16_MIN / MLIB_S16_MAX. */
#define CLAMP_STORE(dst, val)                                   \
  if (val >= MLIB_S16_MAX)                                      \
    dst = MLIB_S16_MAX;                                         \
  else if (val <= MLIB_S16_MIN)                                 \
    dst = MLIB_S16_MIN;                                         \
  else                                                          \
    dst = (mlib_s16)val

#elif IMG_TYPE == 3

/* Unsigned 16-bit: two-sided range test against MLIB_U16_MIN / MLIB_U16_MAX. */
#define CLAMP_STORE(dst, val)                                   \
  if (val >= MLIB_U16_MAX)                                      \
    dst = MLIB_U16_MAX;                                         \
  else if (val <= MLIB_U16_MIN)                                 \
    dst = MLIB_U16_MIN;                                         \
  else                                                          \
    dst = (mlib_u16)val

#endif /* IMG_TYPE == 1 */
 235 
 236 /***************************************************************/
/* Largest number of kernel taps handled by one unrolled pass over a row. */
#define MAX_KER   7
/* Largest kernel height for which the on-stack row-pointer table suffices. */
#define MAX_N    15
/* Number of elements in the on-stack work buffers; larger needs use mlib_malloc. */
#define BUFF_SIZE   1600
/* Byte budget used by the 1xN routine to choose how many rows to process per strip. */
#define CACHE_SIZE  (64*1024)
 241 
/*
 * Vertical convolution with a kernel that is 1 column wide and n rows
 * high; called by CONV_FUNC_MxN for the m == 1 case.
 *
 *   dst, src   destination and source images
 *   k          n kernel coefficients, already converted to floating
 *              point and scaled by the caller
 *   n          kernel height
 *   dy_t       number of extra rows above the source, filled with
 *              copies of the sample at sl[0]
 *   dy_b       number of extra rows below the source, filled with
 *              copies of the sample in row (shgt - 1)
 *   cmask      channel mask; channel c is processed when bit
 *              (nchannel - 1 - c) is set
 *
 * Returns MLIB_FAILURE when the temporary buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 *
 * The image is processed in strips of at most max_hsize rows.  Inside a
 * strip each column is copied into a contiguous floating-point buffer
 * (sbuff), convolved four kernel taps at a time into an accumulator
 * (buffd), and the final 1..4 taps are combined with the conversion and
 * store of the result.
 */
static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
                                         const mlib_image *src,
                                         const mlib_d64   *k,
                                         mlib_s32         n,
                                         mlib_s32         dy_t,
                                         mlib_s32         dy_b,
                                         mlib_s32         cmask)
{
  DTYPE    *adr_src, *sl;
  DTYPE    *adr_dst, *dl, *dp;
  FTYPE    buff[BUFF_SIZE];
  FTYPE    *buffd;
  FTYPE    *pbuff = buff;
  const FTYPE    *pk;
  FTYPE    k0, k1, k2, k3;
  FTYPE    p0, p1, p2, p3, p4;
  FTYPE    *sbuff;
  mlib_s32 l, k_off, off, bsize;
  mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
  mlib_s32 d0, d1, ii;
  mlib_s32 wid, hgt, sll, dll;
  mlib_s32 nchannel;
  mlib_s32 i, j, c;
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* Rows per strip: as many source lines as fit into CACHE_SIZE bytes,
     less the n - 1 extra rows a strip needs; clamped to [1, hgt] below. */
  max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);

  if (max_hsize < 1) max_hsize = 1;
  if (max_hsize > hgt) max_hsize = hgt;

  /* Input rows needed for hgt output rows, and for one full strip. */
  shgt = hgt + (n - 1);
  smax_hsize = max_hsize + (n - 1);

  /* Two areas of (smax_hsize + 1) elements: column samples + accumulator. */
  bsize = 2 * (smax_hsize + 1);

  if (bsize > BUFF_SIZE) {
    pbuff = mlib_malloc(sizeof(FTYPE)*bsize);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  sbuff = pbuff;
  buffd = sbuff + smax_hsize;

  /* From here on shgt excludes the synthesized top and bottom rows. */
  shgt -= (dy_t + dy_b);
  /* Index (in extended rows) of the first row of the current strip. */
  k_off = 0;

  for (l = 0; l < hgt; l += hsize) {
    hsize = hgt - l;

    if (hsize > max_hsize) hsize = max_hsize;

    /* Number of extended input rows needed by this strip. */
    smax_hsize = hsize + (n - 1);

    for (c = 0; c < nchannel; c++) {
      if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

      sl = adr_src + c;
      dl = adr_dst + c;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i < hsize; i++) buffd[i] = 0.0;

      for (j = 0; j < wid; j++) {
        /* Shadows the function-level buff array inside this loop body. */
        FTYPE    *buff = sbuff;

        /* Gather the column into sbuff: top edge rows first ... */
        for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
          sbuff[i - k_off] = (FTYPE)sl[0];
        }

        /* ... then the rows actually read from the source ... */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
        for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
          sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
        }

        /* ... then the bottom edge rows, all copies of row shgt - 1. */
        for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
          sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
        }

        pk = k;

        /* Accumulate groups of four taps while more than four remain;
           two outputs are produced per iteration, so buffd[hsize] may
           also be written when hsize is odd. */
        for (off = 0; off < (n - 4); off += 4) {

          p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i < hsize; i += 2) {
            p0 = p2; p1 = p3; p2 = p4;

            p3 = buff[i + 3]; p4 = buff[i + 4];

            buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
            buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
          }

          pk += 4;
          buff += 4;
        }

        dp = dl;
        /* Taps left for the final pass (the branches below cover 4, 3,
           2 and, by default, 1). */
        kh = n - off;

        /* Final pass: add the last kh taps to the accumulator, convert
           and store two rows per iteration, and reset the accumulator
           for the next column.  An odd last row is handled separately. */
        if (kh == 4) {
          p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (hsize - 2); i += 2) {
            p0 = p2; p1 = p3; p2 = p4;

            p3 = buff[i + 3]; p4 = buff[i + 4];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            buffd[i    ] = 0.0;
            buffd[i + 1] = 0.0;

            dp += 2*dll;
          }

          if (i < hsize) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = buff[i + 3];
            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
            dp[0] = FROM_S32(d0);
            buffd[i] = 0.0;
          }

        } else if (kh == 3) {

          p2 = buff[0]; p3 = buff[1];
          k0 = pk[0]; k1 = pk[1]; k2 = pk[2];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (hsize - 2); i += 2) {
            p0 = p2; p1 = p3;

            p2 = buff[i + 2]; p3 = buff[i + 3];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            buffd[i    ] = 0.0;
            buffd[i + 1] = 0.0;

            dp += 2*dll;
          }

          if (i < hsize) {
            p0 = p2; p1 = p3;
            p2 = buff[i + 2];
            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
            dp[0] = FROM_S32(d0);

            buffd[i] = 0.0;
          }

        } else if (kh == 2) {

          p2 = buff[0];
          k0 = pk[0]; k1 = pk[1];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (hsize - 2); i += 2) {
            p0 = p2;

            p1 = buff[i + 1]; p2 = buff[i + 2];

            d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
            d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            buffd[i    ] = 0.0;
            buffd[i + 1] = 0.0;

            dp += 2*dll;
          }

          if (i < hsize) {
            p0 = p2;
            p1 = buff[i + 1];
            d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
            dp[0] = FROM_S32(d0);

            buffd[i] = 0.0;
          }

        } else /* kh == 1 */{

          k0 = pk[0];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (hsize - 2); i += 2) {
            p0 = buff[i]; p1 = buff[i + 1];

            d0 = D2I(p0*k0 + buffd[i    ]);
            d1 = D2I(p1*k0 + buffd[i + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            buffd[i    ] = 0.0;
            buffd[i + 1] = 0.0;

            dp += 2*dll;
          }

          if (i < hsize) {
            p0 = buff[i];
            d0 = D2I(p0*k0 + buffd[i]);
            dp[0] = FROM_S32(d0);

            buffd[i] = 0.0;
          }
        }

        /* next line */
        /* (advances by one pixel: the "line" here is the next column) */
        sl += nchannel;
        dl += nchannel;
      }
    }

    /* Move on to the next strip of rows. */
    k_off += max_hsize;
    adr_dst += max_hsize*dll;
  }

  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
 496 
 497 /***************************************************************/
/*
 * General m x n convolution with edges extended from the source image,
 * computed in floating point.  The name and the parameter list come from
 * the CONV_FUNC_MxN / PARAM_MxN macros.
 *
 *   dst, src     destination and source images
 *   kernel       m*n integer coefficients, row by row
 *   m, n         kernel width and height
 *   dx_l, dx_r   number of columns replicated on the left / right
 *   dy_t, dy_b   number of rows replicated on the top / bottom
 *   scale        kernel scaling exponent; coefficients are multiplied
 *                by DSCALE / 2^scale
 *   cmask        channel mask; channel c is processed when bit
 *                (nchannel - 1 - c) is set
 *
 * Returns MLIB_FAILURE when a temporary buffer cannot be allocated;
 * otherwise the status is MLIB_SUCCESS or, for m == 1, whatever
 * mlib_ImageConv1xN_ext returns.
 *
 * Outline:
 *   1. Convert the kernel to scaled floating point.
 *   2. Delegate m == 1 to mlib_ImageConv1xN_ext.
 *   3. Set up n + 1 row buffers of swid elements that are used as a
 *      ring, one accumulator row (buffd) and one integer staging row
 *      (buffi).
 *   4. Per selected channel: preload n edge-extended rows, then for each
 *      output row accumulate all kernel segments, store the result in the
 *      last segment's pass and, in that same pass, load the next source
 *      row into the free ring slot.
 *
 * NOTE(review): FREE_AND_RETURN_STATUS is not defined in this part of the
 * file; it presumably releases pbuff and k when they were heap-allocated
 * and returns status -- confirm against its definition.
 */
mlib_status CONV_FUNC_MxN
{
  DTYPE    *adr_src, *sl, *sp = NULL;
  DTYPE    *adr_dst, *dl, *dp = NULL;
  FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
  FTYPE    **buffs = buffs_arr, *buffd;
  FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
  FTYPE    *pbuff = buff;
  FTYPE    k0, k1, k2, k3, k4, k5, k6;
  FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
  mlib_s32 *buffi;
  mlib_s32 mn, l, off, kw, bsize, buff_ind;
  mlib_s32 d0, d1;
  mlib_s32 wid, hgt, sll, dll;
  mlib_s32 nchannel, chan1, chan2;
  mlib_s32 i, j, c, swid;
  d64_2x32 dd;
  mlib_status status = MLIB_SUCCESS;

  GET_SRC_DST_PARAMETERS(DTYPE);

  /* A scale above 30 is applied in two steps so that the shift count in
     (1 << scale) below stays at most 30 for scale values up to 60. */
  if (scale > 30) {
    fscale *= 1.0/(1 << 30);
    scale -= 30;
  }

  fscale /= (1 << scale);

  mn = m*n;

  /* Kernels with more than 256 coefficients do not fit into akernel. */
  if (mn > 256) {
    k = mlib_malloc(mn*sizeof(mlib_d64));

    if (k == NULL) return MLIB_FAILURE;
  }

  for (i = 0; i < mn; i++) {
    k[i] = kernel[i]*fscale;
  }

  /* Single-column kernels use the dedicated vertical routine. */
  if (m == 1) {
    status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
    FREE_AND_RETURN_STATUS;
  }

  /* Width of one edge-extended row. */
  swid = wid + (m - 1);

  /* n + 1 ring rows, one accumulator row and one row for buffi. */
  bsize = (n + 3)*swid;

  if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
    /* The row-pointer table is placed right behind the row storage. */
    pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));

    if (pbuff == NULL) {
      status = MLIB_FAILURE;
      FREE_AND_RETURN_STATUS;
    }
    buffs = (FTYPE   **)(pbuff + bsize);
  }

  /* The table holds the n + 1 row pointers twice in a row, so that
     buffs + buff_ind can always be indexed with 0..n without wrapping. */
  for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
  for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
  buffd = buffs[n] + swid;
  buffi = (mlib_s32*)(buffd + swid);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  /* From here on swid is the number of samples read from a source row. */
  swid -= (dx_l + dx_r);

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* Preload the first n extended rows of this channel. */
    for (l = 0; l < n; l++) {
      /* Shadows the function-level buff array inside this loop body. */
      FTYPE    *buff = buffs[l];

      /* Left edge: copies of the first sample of the row. */
      for (i = 0; i < dx_l; i++) {
        buff[i] = (FTYPE)sl[0];
      }

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i < swid; i++) {
        buff[i + dx_l] = (FTYPE)sl[i*chan1];
      }

      /* Right edge: copies of the last sample of the row. */
      for (i = 0; i < dx_r; i++) {
        buff[swid + dx_l + i] = buff[swid + dx_l - 1];
      }

      /* The source pointer stays on the same row while l < dy_t (top
         edge rows repeat it) and stops advancing at the lower limit. */
      if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
    }

    buff_ind = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0.0;

    for (j = 0; j < hgt; j++) {
      /* buffc[0..n-1] are the n rows under the kernel, buffc[n] (buffn)
         is the free ring slot that receives the next source row. */
      FTYPE    **buffc = buffs + buff_ind;
      FTYPE    *buffn = buffc[n];
      FTYPE    *pk = k;

      for (l = 0; l < n; l++) {
        FTYPE    *buff_l = buffc[l];

        /* A kernel row is split into segments of 2..MAX_KER taps:
           MAX_KER while more than 2*MAX_KER remain, otherwise half of
           the remainder when it still exceeds MAX_KER. */
        for (off = 0; off < m;) {
          FTYPE    *buff = buff_l + off;

          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER; else
            if (kw > MAX_KER) kw = kw/2;
          off += kw;

          sp = sl;
          dp = dl;

          /*
           * Every branch below has the same shape.  While segments remain
           * (l < n - 1 || off < m) the partial sums are only added to
           * buffd.  In the pass for the very last segment the sums are
           * completed, converted with D2I / FROM_S32 and stored, buffd is
           * cleared for the next row, and two samples of the next source
           * row are staged through buffi (LOAD_BUFF), read back as a pair
           * via the d64_2x32 union and written to buffn as FTYPE.
           * Two output pixels are handled per iteration; an odd last
           * pixel is left to the "last pixels" loop further down.
           */
          if (kw == 7) {

            p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
            p5 = buff[3]; p6 = buff[4]; p7 = buff[5];

            k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
            k4 = pk[4]; k5 = pk[5]; k6 = pk[6];

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

                p6 = buff[i + 6]; p7 = buff[i + 7];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

                p6 = buff[i + 6]; p7 = buff[i + 7];

                LOAD_BUFF(buffi);

                dd.d64 = *(FTYPE   *)(buffi + i);
                buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
                buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {

            p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
            p5 = buff[3]; p6 = buff[4];

            k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
            k4 = pk[4]; k5 = pk[5];

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = buff[i + 5]; p6 = buff[i + 6];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = buff[i + 5]; p6 = buff[i + 6];

                LOAD_BUFF(buffi);

                dd.d64 = *(FTYPE   *)(buffi + i);
                buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
                buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {

            p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
            p5 = buff[3];

            k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
            k4 = pk[4];

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = buff[i + 4]; p5 = buff[i + 5];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = buff[i + 4]; p5 = buff[i + 5];

                LOAD_BUFF(buffi);

                dd.d64 = *(FTYPE   *)(buffi + i);
                buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
                buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            p2 = buff[0]; p3 = buff[1]; p4 = buff[2];

            k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = buff[i + 3]; p4 = buff[i + 4];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = buff[i + 3]; p4 = buff[i + 4];

                LOAD_BUFF(buffi);

                dd.d64 = *(FTYPE   *)(buffi + i);
                buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
                buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {

            p2 = buff[0]; p3 = buff[1];
            k0 = pk[0]; k1 = pk[1]; k2 = pk[2];

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = buff[i + 2]; p3 = buff[i + 3];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = buff[i + 2]; p3 = buff[i + 3];

                LOAD_BUFF(buffi);

                dd.d64 = *(FTYPE   *)(buffi + i);
                buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
                buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else /* if (kw == 2) */ {

            p2 = buff[0];
            k0 = pk[0]; k1 = pk[1];

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = buff[i + 1]; p2 = buff[i + 2];

                buffd[i    ] += p0*k0 + p1*k1;
                buffd[i + 1] += p1*k0 + p2*k1;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = buff[i + 1]; p2 = buff[i + 2];

                LOAD_BUFF(buffi);

                dd.d64 = *(FTYPE   *)(buffi + i);
                buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
                buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;

                d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }
          }

          pk += kw;
        }
      }

      /* last pixels */
      /* i, sp and dp continue from where the final unrolled loop stopped;
         a remaining pixel (odd wid) is computed directly from the full
         kernel rather than from buffd. */
      for (; i < wid; i++) {
        FTYPE    *pk = k, s = 0;
        mlib_s32 x, d0;

        for (l = 0; l < n; l++) {
          FTYPE    *buff = buffc[l] + i;

          for (x = 0; x < m; x++) s += buff[x] * (*pk++);
        }

        d0 = D2I(s);
        dp[0] = FROM_S32(d0);

        buffn[i + dx_l] = (FTYPE)sp[0];

        sp += chan1;
        dp += chan1;
      }

      /* Load the rest of the next source row beyond the output width. */
      for (; i < swid; i++) {
        buffn[i + dx_l] = (FTYPE)sp[0];
        sp += chan1;
      }

      /* Replicate the first / last loaded sample into the edge columns. */
      for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
      for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];

      /* next line */

      /* The source pointer stops advancing near the bottom, so the
         remaining rows are loaded from the same source line again. */
      if (j < hgt - dy_b - 2) sl += sll;
      dl += dll;

      /* Rotate the ring of row buffers by one slot. */
      buff_ind++;

      if (buff_ind >= n + 1) buff_ind = 0;
    }
  }

  FREE_AND_RETURN_STATUS;
}
 959 
 960 /***************************************************************/
/*
 * STORE_RES(res, x): finish one output sample.
 *
 * Shifts the accumulated integer sum x right by shift2 bits (x itself is
 * modified) and then hands it to CLAMP_STORE to be written into res.
 * CLAMP_STORE is defined elsewhere; presumably it saturates the value to
 * the range of DTYPE -- confirm against its definition.
 *
 * NOTE: the expansion consists of more than one statement and refers to a
 * variable named shift2 that must be in scope at the point of use, so the
 * macro may only be used as a complete statement inside a braced block
 * (never as the unbraced body of an if/for/while).
 */
#define STORE_RES(res, x)                                       \
  x >>= shift2;                                                 \
  CLAMP_STORE(res, x)
 964 
 965 mlib_status CONV_FUNC_MxN_I
 966 {
 967   DTYPE    *adr_src, *sl, *sp = NULL;
 968   DTYPE    *adr_dst, *dl, *dp = NULL;
 969   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 970   mlib_s32 *pbuff = buff;
 971   mlib_s32 **buffs = buffs_arr, *buffd;
 972   mlib_s32 l, off, kw, bsize, buff_ind;
 973   mlib_s32 d0, d1, shift1, shift2;
 974   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
 975   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
 976   mlib_s32 wid, hgt, sll, dll;
 977   mlib_s32 nchannel, chan1;
 978   mlib_s32 i, j, c, swid;
 979   mlib_s32 chan2;
 980   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
 981   GET_SRC_DST_PARAMETERS(DTYPE);
 982 
 983 #if IMG_TYPE != 1
 984   shift1 = 16;
 985 #else
 986   shift1 = 8;
 987 #endif /* IMG_TYPE != 1 */
 988   shift2 = scale - shift1;
 989 
 990   chan1 = nchannel;
 991   chan2 = chan1 + chan1;
 992 
 993   swid = wid + (m - 1);
 994 
 995   bsize = (n + 2)*swid;
 996 
 997   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 998     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
 999 
1000     if (pbuff == NULL) return MLIB_FAILURE;
1001     buffs = (mlib_s32 **)(pbuff + bsize);
1002   }
1003 
1004   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
1005   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1006   buffd = buffs[n] + swid;
1007 
1008   if (m*n > MAX_N*MAX_N) {
1009     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
1010 
1011     if (k == NULL) {
1012       if (pbuff != buff) mlib_free(pbuff);
1013       return MLIB_FAILURE;
1014     }
1015   }
1016 
1017   for (i = 0; i < m*n; i++) {
1018     k[i] = kernel[i] >> shift1;
1019   }
1020 
1021   swid -= (dx_l + dx_r);
1022 
1023   for (c = 0; c < nchannel; c++) {
1024     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1025 
1026     sl = adr_src + c;
1027     dl = adr_dst + c;
1028 
1029     for (l = 0; l < n; l++) {
1030       mlib_s32  *buff = buffs[l];
1031 
1032       for (i = 0; i < dx_l; i++) {
1033         buff[i] = (mlib_s32)sl[0];
1034       }
1035 
1036 #ifdef __SUNPRO_C
1037 #pragma pipeloop(0)
1038 #endif /* __SUNPRO_C */
1039       for (i = 0; i < swid; i++) {
1040         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
1041       }
1042 
1043       for (i = 0; i < dx_r; i++) {
1044         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
1045       }
1046 
1047       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
1048     }
1049 
1050     buff_ind = 0;
1051 
1052 #ifdef __SUNPRO_C
1053 #pragma pipeloop(0)
1054 #endif /* __SUNPRO_C */
1055     for (i = 0; i < wid; i++) buffd[i] = 0;
1056 
1057     for (j = 0; j < hgt; j++) {
1058       mlib_s32 **buffc = buffs + buff_ind;
1059       mlib_s32 *buffn = buffc[n];
1060       mlib_s32 *pk = k;
1061 
1062       for (l = 0; l < n; l++) {
1063         mlib_s32  *buff_l = buffc[l];
1064 
1065         for (off = 0; off < m;) {
1066           mlib_s32 *buff = buff_l + off;
1067 
1068           sp = sl;
1069           dp = dl;
1070 
1071           kw = m - off;
1072 
1073           if (kw > 2*MAX_KER) kw = MAX_KER; else
1074             if (kw > MAX_KER) kw = kw/2;
1075           off += kw;
1076 
1077           if (kw == 7) {
1078 
1079             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1080             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1081 
1082             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1083             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1084 
1085             if (l < (n - 1) || off < m) {
1086 #ifdef __SUNPRO_C
1087 #pragma pipeloop(0)
1088 #endif /* __SUNPRO_C */
1089               for (i = 0; i <= (wid - 2); i += 2) {
1090                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1091 
1092                 p6 = buff[i + 6]; p7 = buff[i + 7];
1093 
1094                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1095                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1096               }
1097 
1098             } else {
1099 #ifdef __SUNPRO_C
1100 #pragma pipeloop(0)
1101 #endif /* __SUNPRO_C */
1102               for (i = 0; i <= (wid - 2); i += 2) {
1103                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1104 
1105                 p6 = buff[i + 6]; p7 = buff[i + 7];
1106 
1107                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1108                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1109 
1110                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1111                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1112 
1113                 STORE_RES(dp[0    ], d0);
1114                 STORE_RES(dp[chan1], d1);
1115 
1116                 buffd[i    ] = 0;
1117                 buffd[i + 1] = 0;
1118 
1119                 sp += chan2;
1120                 dp += chan2;
1121               }
1122             }
1123 
1124           } else if (kw == 6) {
1125 
1126             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1127             p5 = buff[3]; p6 = buff[4];
1128 
1129             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1130             k4 = pk[4]; k5 = pk[5];
1131 
1132             if (l < (n - 1) || off < m) {
1133 #ifdef __SUNPRO_C
1134 #pragma pipeloop(0)
1135 #endif /* __SUNPRO_C */
1136               for (i = 0; i <= (wid - 2); i += 2) {
1137                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1138 
1139                 p5 = buff[i + 5]; p6 = buff[i + 6];
1140 
1141                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1142                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1143               }
1144 
1145             } else {
1146 #ifdef __SUNPRO_C
1147 #pragma pipeloop(0)
1148 #endif /* __SUNPRO_C */
1149               for (i = 0; i <= (wid - 2); i += 2) {
1150                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1151 
1152                 p5 = buff[i + 5]; p6 = buff[i + 6];
1153 
1154                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1155                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1156 
1157                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1158                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1159 
1160                 STORE_RES(dp[0    ], d0);
1161                 STORE_RES(dp[chan1], d1);
1162 
1163                 buffd[i    ] = 0;
1164                 buffd[i + 1] = 0;
1165 
1166                 sp += chan2;
1167                 dp += chan2;
1168               }
1169             }
1170 
1171           } else if (kw == 5) {
1172 
1173             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1174             p5 = buff[3];
1175 
1176             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1177             k4 = pk[4];
1178 
1179             if (l < (n - 1) || off < m) {
1180 #ifdef __SUNPRO_C
1181 #pragma pipeloop(0)
1182 #endif /* __SUNPRO_C */
1183               for (i = 0; i <= (wid - 2); i += 2) {
1184                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1185 
1186                 p4 = buff[i + 4]; p5 = buff[i + 5];
1187 
1188                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1189                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1190               }
1191 
1192             } else {
1193 #ifdef __SUNPRO_C
1194 #pragma pipeloop(0)
1195 #endif /* __SUNPRO_C */
1196               for (i = 0; i <= (wid - 2); i += 2) {
1197                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1198 
1199                 p4 = buff[i + 4]; p5 = buff[i + 5];
1200 
1201                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1202                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1203 
1204                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1205                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1206 
1207                 STORE_RES(dp[0    ], d0);
1208                 STORE_RES(dp[chan1], d1);
1209 
1210                 buffd[i    ] = 0;
1211                 buffd[i + 1] = 0;
1212 
1213                 sp += chan2;
1214                 dp += chan2;
1215               }
1216             }
1217 
1218           } else if (kw == 4) {
1219 
1220             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1221 
1222             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1223 
1224             if (l < (n - 1) || off < m) {
1225 #ifdef __SUNPRO_C
1226 #pragma pipeloop(0)
1227 #endif /* __SUNPRO_C */
1228               for (i = 0; i <= (wid - 2); i += 2) {
1229                 p0 = p2; p1 = p3; p2 = p4;
1230 
1231                 p3 = buff[i + 3]; p4 = buff[i + 4];
1232 
1233                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1234                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1235               }
1236 
1237             } else {
1238 #ifdef __SUNPRO_C
1239 #pragma pipeloop(0)
1240 #endif /* __SUNPRO_C */
1241               for (i = 0; i <= (wid - 2); i += 2) {
1242                 p0 = p2; p1 = p3; p2 = p4;
1243 
1244                 p3 = buff[i + 3]; p4 = buff[i + 4];
1245 
1246                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1247                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1248 
1249                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1250                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1251 
1252                 STORE_RES(dp[0    ], d0);
1253                 STORE_RES(dp[chan1], d1);
1254 
1255                 buffd[i    ] = 0;
1256                 buffd[i + 1] = 0;
1257 
1258                 sp += chan2;
1259                 dp += chan2;
1260               }
1261             }
1262 
1263           } else if (kw == 3) {
1264 
1265             p2 = buff[0]; p3 = buff[1];
1266             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1267 
1268             if (l < (n - 1) || off < m) {
1269 #ifdef __SUNPRO_C
1270 #pragma pipeloop(0)
1271 #endif /* __SUNPRO_C */
1272               for (i = 0; i <= (wid - 2); i += 2) {
1273                 p0 = p2; p1 = p3;
1274 
1275                 p2 = buff[i + 2]; p3 = buff[i + 3];
1276 
1277                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1278                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1279               }
1280 
1281             } else {
1282 #ifdef __SUNPRO_C
1283 #pragma pipeloop(0)
1284 #endif /* __SUNPRO_C */
1285               for (i = 0; i <= (wid - 2); i += 2) {
1286                 p0 = p2; p1 = p3;
1287 
1288                 p2 = buff[i + 2]; p3 = buff[i + 3];
1289 
1290                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1291                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1292 
1293                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1294                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1295 
1296                 STORE_RES(dp[0    ], d0);
1297                 STORE_RES(dp[chan1], d1);
1298 
1299                 buffd[i    ] = 0;
1300                 buffd[i + 1] = 0;
1301 
1302                 sp += chan2;
1303                 dp += chan2;
1304               }
1305             }
1306 
1307           } else if (kw == 2) {
1308 
1309             p2 = buff[0];
1310             k0 = pk[0]; k1 = pk[1];
1311 
1312             if (l < (n - 1) || off < m) {
1313 #ifdef __SUNPRO_C
1314 #pragma pipeloop(0)
1315 #endif /* __SUNPRO_C */
1316               for (i = 0; i <= (wid - 2); i += 2) {
1317                 p0 = p2;
1318 
1319                 p1 = buff[i + 1]; p2 = buff[i + 2];
1320 
1321                 buffd[i    ] += p0*k0 + p1*k1;
1322                 buffd[i + 1] += p1*k0 + p2*k1;
1323               }
1324 
1325             } else {
1326 #ifdef __SUNPRO_C
1327 #pragma pipeloop(0)
1328 #endif /* __SUNPRO_C */
1329               for (i = 0; i <= (wid - 2); i += 2) {
1330                 p0 = p2;
1331 
1332                 p1 = buff[i + 1]; p2 = buff[i + 2];
1333 
1334                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1335                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1336 
1337                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1338                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1339 
1340                 STORE_RES(dp[0    ], d0);
1341                 STORE_RES(dp[chan1], d1);
1342 
1343                 buffd[i    ] = 0;
1344                 buffd[i + 1] = 0;
1345 
1346                 sp += chan2;
1347                 dp += chan2;
1348               }
1349             }
1350 
1351           } else /* kw == 1 */{
1352 
1353             k0 = pk[0];
1354 
1355             if (l < (n - 1) || off < m) {
1356 #ifdef __SUNPRO_C
1357 #pragma pipeloop(0)
1358 #endif /* __SUNPRO_C */
1359               for (i = 0; i <= (wid - 2); i += 2) {
1360                 p0 = buff[i]; p1 = buff[i + 1];
1361 
1362                 buffd[i    ] += p0*k0;
1363                 buffd[i + 1] += p1*k0;
1364               }
1365 
1366             } else {
1367 #ifdef __SUNPRO_C
1368 #pragma pipeloop(0)
1369 #endif /* __SUNPRO_C */
1370               for (i = 0; i <= (wid - 2); i += 2) {
1371                 p0 = buff[i]; p1 = buff[i + 1];
1372 
1373                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1374                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1375 
1376                 d0 = (p0*k0 + buffd[i    ]);
1377                 d1 = (p1*k0 + buffd[i + 1]);
1378 
1379                 STORE_RES(dp[0    ], d0);
1380                 STORE_RES(dp[chan1], d1);
1381 
1382                 buffd[i    ] = 0;
1383                 buffd[i + 1] = 0;
1384 
1385                 sp += chan2;
1386                 dp += chan2;
1387               }
1388             }
1389           }
1390 
1391           pk += kw;
1392         }
1393       }
1394 
1395       /* last pixels */
1396       for (; i < wid; i++) {
1397         mlib_s32 *pk = k, x, s = 0;
1398 
1399         for (l = 0; l < n; l++) {
1400           mlib_s32 *buff = buffc[l] + i;
1401 
1402           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1403         }
1404 
1405         STORE_RES(dp[0], s);
1406 
1407         buffn[i + dx_l] = (mlib_s32)sp[0];
1408 
1409         sp += chan1;
1410         dp += chan1;
1411       }
1412 
1413       for (; i < swid; i++) {
1414         buffn[i + dx_l] = (mlib_s32)sp[0];
1415         sp += chan1;
1416       }
1417 
1418       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1419       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1420 
1421       /* next line */
1422 
1423       if (j < hgt - dy_b - 2) sl += sll;
1424       dl += dll;
1425 
1426       buff_ind++;
1427 
1428       if (buff_ind >= n + 1) buff_ind = 0;
1429     }
1430   }
1431 
1432   if (pbuff != buff) mlib_free(pbuff);
1433   if (k != k_locl) mlib_free(k);
1434 
1435   return MLIB_SUCCESS;
1436 }
1437 
1438 /***************************************************************/