1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
  30  *   MLIB_EDGE_SRC_EXTEND mask
  31  */
  32 
  33 #include "mlib_image.h"
  34 #include "mlib_ImageConv.h"
  35 #include "mlib_c_ImageConv.h"
  36 
  37 /*
  38  * This define switches between functions of different data types
  39  */
  40 
  41 #define IMG_TYPE 2
  42 
  43 /***************************************************************/
  44 #if IMG_TYPE == 1
  45 
  46 #define DTYPE             mlib_u8
  47 #define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
  48 #define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
  49 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
  50 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
  51 #define DSCALE            (1 << 24)
  52 #define FROM_S32(x)       (((x) >> 24) ^ 128)
  53 #define S64TOS32(x)       (x)
  54 #define SAT_OFF           -(1u << 31)
  55 
  56 #elif IMG_TYPE == 2
  57 
  58 #define DTYPE             mlib_s16
  59 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
  60 #define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
  61 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
  62 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
  63 #define DSCALE            65536.0
  64 #define FROM_S32(x)       ((x) >> 16)
  65 #define S64TOS32(x)       ((x) & 0xffffffff)
  66 #define SAT_OFF
  67 
  68 #elif IMG_TYPE == 3
  69 
  70 #define DTYPE             mlib_u16
  71 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
  72 #define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
  73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
  74 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
  75 #define DSCALE            65536.0
  76 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
  77 #define S64TOS32(x)       (x)
  78 #define SAT_OFF           -(1u << 31)
  79 
  80 #endif /* IMG_TYPE == 1 */
  81 
  82 /***************************************************************/
  83 #define PARAM                                                   \
  84   mlib_image       *dst,                                        \
  85   const mlib_image *src,                                        \
  86   mlib_s32         dx_l,                                        \
  87   mlib_s32         dx_r,                                        \
  88   mlib_s32         dy_t,                                        \
  89   mlib_s32         dy_b,                                        \
  90   const mlib_s32   *kern,                                       \
  91   mlib_s32         scalef_expon,                                \
  92   mlib_s32         cmask
  93 
  94 /***************************************************************/
  95 #define PARAM_MxN                                               \
  96   mlib_image       *dst,                                        \
  97   const mlib_image *src,                                        \
  98   const mlib_s32   *kernel,                                     \
  99   mlib_s32         m,                                           \
 100   mlib_s32         n,                                           \
 101   mlib_s32         dx_l,                                        \
 102   mlib_s32         dx_r,                                        \
 103   mlib_s32         dy_t,                                        \
 104   mlib_s32         dy_b,                                        \
 105   mlib_s32         scale,                                       \
 106   mlib_s32         cmask
 107 
 108 /***************************************************************/
 109 #define FTYPE mlib_d64
 110 
 111 #ifndef MLIB_USE_FTOI_CLAMPING
 112 
 113 #define CLAMP_S32(x)                                            \
 114   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))
 115 
 116 #else
 117 
 118 #define CLAMP_S32(x) ((mlib_s32)(x))
 119 
 120 #endif /* MLIB_USE_FTOI_CLAMPING */
 121 
 122 /***************************************************************/
 123 #define D2I(x) CLAMP_S32((x) SAT_OFF)
 124 
 125 /***************************************************************/
 126 #ifdef _LITTLE_ENDIAN
 127 
 128 #define STORE2(res0, res1)                                      \
 129   dp[0    ] = res1;                                             \
 130   dp[chan1] = res0
 131 
 132 #else
 133 
 134 #define STORE2(res0, res1)                                      \
 135   dp[0    ] = res0;                                             \
 136   dp[chan1] = res1
 137 
 138 #endif /* _LITTLE_ENDIAN */
 139 
 140 /***************************************************************/
 141 #ifdef _NO_LONGLONG
 142 
 143 #define LOAD_BUFF(buff)                                         \
 144   buff[i    ] = sp[0];                                          \
 145   buff[i + 1] = sp[chan1]
 146 
 147 #else /* _NO_LONGLONG */
 148 
 149 #ifdef _LITTLE_ENDIAN
 150 
 151 #define LOAD_BUFF(buff)                                         \
 152   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
 153 
 154 #else /* _LITTLE_ENDIAN */
 155 
 156 #define LOAD_BUFF(buff)                                         \
 157   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
 158 
 159 #endif /* _LITTLE_ENDIAN */
 160 #endif /* _NO_LONGLONG */
 161 
 162 /***************************************************************/
 163 typedef union {
 164   mlib_d64 d64;
 165   struct {
 166     mlib_s32 i0;
 167     mlib_s32 i1;
 168   } i32s;
 169 } d64_2x32;
 170 
 171 /***************************************************************/
 172 #define GET_SRC_DST_PARAMETERS(type)                            \
 173   hgt = mlib_ImageGetHeight(src);                               \
 174   wid = mlib_ImageGetWidth(src);                                \
 175   nchannel = mlib_ImageGetChannels(src);                        \
 176   sll = mlib_ImageGetStride(src) / sizeof(type);                \
 177   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
 178   adr_src = (type *)mlib_ImageGetData(src);                     \
 179   adr_dst = (type *)mlib_ImageGetData(dst)
 180 
 181 /***************************************************************/
 182 #if IMG_TYPE == 1
 183 
 184 /*
 185  * Test for the presence of any "1" bit in bits
 186    8 to 31 of val. If present, then val is either
 187    negative or >255. If over/underflows of 8 bits
 188    are uncommon, then this technique can be a win,
 189    since only a single test, rather than two, is
 190    necessary to determine if clamping is needed.
 191    On the other hand, if over/underflows are common,
 192    it adds an extra test.
 193 */
 194 #define CLAMP_STORE(dst, val)                                   \
 195   if (val & 0xffffff00) {                                       \
 196     if (val < MLIB_U8_MIN)                                      \
 197       dst = MLIB_U8_MIN;                                        \
 198     else                                                        \
 199       dst = MLIB_U8_MAX;                                        \
 200   } else {                                                      \
 201     dst = (mlib_u8)val;                                         \
 202   }
 203 
 204 #elif IMG_TYPE == 2
 205 
 206 #define CLAMP_STORE(dst, val)                                   \
 207   if (val >= MLIB_S16_MAX)                                      \
 208     dst = MLIB_S16_MAX;                                         \
 209   else if (val <= MLIB_S16_MIN)                                 \
 210     dst = MLIB_S16_MIN;                                         \
 211   else                                                          \
 212     dst = (mlib_s16)val
 213 
 214 #elif IMG_TYPE == 3
 215 
 216 #define CLAMP_STORE(dst, val)                                   \
 217   if (val >= MLIB_U16_MAX)                                      \
 218     dst = MLIB_U16_MAX;                                         \
 219   else if (val <= MLIB_U16_MIN)                                 \
 220     dst = MLIB_U16_MIN;                                         \
 221   else                                                          \
 222     dst = (mlib_u16)val
 223 
 224 #endif /* IMG_TYPE == 1 */
 225 
 226 /***************************************************************/
 227 #define MAX_KER   7
 228 #define MAX_N    15
 229 #define BUFF_SIZE   1600
 230 #define CACHE_SIZE  (64*1024)
 231 
 232 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
 233                                          const mlib_image *src,
 234                                          const mlib_d64   *k,
 235                                          mlib_s32         n,
 236                                          mlib_s32         dy_t,
 237                                          mlib_s32         dy_b,
 238                                          mlib_s32         cmask)
 239 {
 240   DTYPE    *adr_src, *sl;
 241   DTYPE    *adr_dst, *dl, *dp;
 242   FTYPE    buff[BUFF_SIZE];
 243   FTYPE    *buffd;
 244   FTYPE    *pbuff = buff;
 245   const FTYPE    *pk;
 246   FTYPE    k0, k1, k2, k3;
 247   FTYPE    p0, p1, p2, p3, p4;
 248   FTYPE    *sbuff;
 249   mlib_s32 l, k_off, off, bsize;
 250   mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
 251   mlib_s32 d0, d1, ii;
 252   mlib_s32 wid, hgt, sll, dll;
 253   mlib_s32 nchannel;
 254   mlib_s32 i, j, c;
 255   GET_SRC_DST_PARAMETERS(DTYPE);
 256 
 257   max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);
 258 
 259   if (max_hsize < 1) max_hsize = 1;
 260   if (max_hsize > hgt) max_hsize = hgt;
 261 
 262   shgt = hgt + (n - 1);
 263   smax_hsize = max_hsize + (n - 1);
 264 
 265   bsize = 2 * (smax_hsize + 1);
 266 
 267   if (bsize > BUFF_SIZE) {
 268     pbuff = mlib_malloc(sizeof(FTYPE)*bsize);
 269 
 270     if (pbuff == NULL) return MLIB_FAILURE;
 271   }
 272 
 273   sbuff = pbuff;
 274   buffd = sbuff + smax_hsize;
 275 
 276   shgt -= (dy_t + dy_b);
 277   k_off = 0;
 278 
 279   for (l = 0; l < hgt; l += hsize) {
 280     hsize = hgt - l;
 281 
 282     if (hsize > max_hsize) hsize = max_hsize;
 283 
 284     smax_hsize = hsize + (n - 1);
 285 
 286     for (c = 0; c < nchannel; c++) {
 287       if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 288 
 289       sl = adr_src + c;
 290       dl = adr_dst + c;
 291 
 292       for (i = 0; i < hsize; i++) buffd[i] = 0.0;
 293 
 294       for (j = 0; j < wid; j++) {
 295         FTYPE    *buff = sbuff;
 296 
 297         for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
 298           sbuff[i - k_off] = (FTYPE)sl[0];
 299         }
 300 
 301         for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
 302           sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
 303         }
 304 
 305         for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
 306           sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
 307         }
 308 
 309         pk = k;
 310 
 311         for (off = 0; off < (n - 4); off += 4) {
 312 
 313           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 314           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 315 
 316           for (i = 0; i < hsize; i += 2) {
 317             p0 = p2; p1 = p3; p2 = p4;
 318 
 319             p3 = buff[i + 3]; p4 = buff[i + 4];
 320 
 321             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 322             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 323           }
 324 
 325           pk += 4;
 326           buff += 4;
 327         }
 328 
 329         dp = dl;
 330         kh = n - off;
 331 
 332         if (kh == 4) {
 333           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 334           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 335 
 336           for (i = 0; i <= (hsize - 2); i += 2) {
 337             p0 = p2; p1 = p3; p2 = p4;
 338 
 339             p3 = buff[i + 3]; p4 = buff[i + 4];
 340 
 341             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 342             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 343 
 344             dp[0  ] = FROM_S32(d0);
 345             dp[dll] = FROM_S32(d1);
 346 
 347             buffd[i    ] = 0.0;
 348             buffd[i + 1] = 0.0;
 349 
 350             dp += 2*dll;
 351           }
 352 
 353           if (i < hsize) {
 354             p0 = p2; p1 = p3; p2 = p4;
 355             p3 = buff[i + 3];
 356             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
 357             dp[0] = FROM_S32(d0);
 358             buffd[i] = 0.0;
 359           }
 360 
 361         } else if (kh == 3) {
 362 
 363           p2 = buff[0]; p3 = buff[1];
 364           k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 365 
 366           for (i = 0; i <= (hsize - 2); i += 2) {
 367             p0 = p2; p1 = p3;
 368 
 369             p2 = buff[i + 2]; p3 = buff[i + 3];
 370 
 371             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 372             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 373 
 374             dp[0  ] = FROM_S32(d0);
 375             dp[dll] = FROM_S32(d1);
 376 
 377             buffd[i    ] = 0.0;
 378             buffd[i + 1] = 0.0;
 379 
 380             dp += 2*dll;
 381           }
 382 
 383           if (i < hsize) {
 384             p0 = p2; p1 = p3;
 385             p2 = buff[i + 2];
 386             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
 387             dp[0] = FROM_S32(d0);
 388 
 389             buffd[i] = 0.0;
 390           }
 391 
 392         } else if (kh == 2) {
 393 
 394           p2 = buff[0];
 395           k0 = pk[0]; k1 = pk[1];
 396 
 397           for (i = 0; i <= (hsize - 2); i += 2) {
 398             p0 = p2;
 399 
 400             p1 = buff[i + 1]; p2 = buff[i + 2];
 401 
 402             d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 403             d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 404 
 405             dp[0  ] = FROM_S32(d0);
 406             dp[dll] = FROM_S32(d1);
 407 
 408             buffd[i    ] = 0.0;
 409             buffd[i + 1] = 0.0;
 410 
 411             dp += 2*dll;
 412           }
 413 
 414           if (i < hsize) {
 415             p0 = p2;
 416             p1 = buff[i + 1];
 417             d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
 418             dp[0] = FROM_S32(d0);
 419 
 420             buffd[i] = 0.0;
 421           }
 422 
 423         } else /* kh == 1 */{
 424 
 425           k0 = pk[0];
 426 
 427           for (i = 0; i <= (hsize - 2); i += 2) {
 428             p0 = buff[i]; p1 = buff[i + 1];
 429 
 430             d0 = D2I(p0*k0 + buffd[i    ]);
 431             d1 = D2I(p1*k0 + buffd[i + 1]);
 432 
 433             dp[0  ] = FROM_S32(d0);
 434             dp[dll] = FROM_S32(d1);
 435 
 436             buffd[i    ] = 0.0;
 437             buffd[i + 1] = 0.0;
 438 
 439             dp += 2*dll;
 440           }
 441 
 442           if (i < hsize) {
 443             p0 = buff[i];
 444             d0 = D2I(p0*k0 + buffd[i]);
 445             dp[0] = FROM_S32(d0);
 446 
 447             buffd[i] = 0.0;
 448           }
 449         }
 450 
 451         /* next line */
 452         sl += nchannel;
 453         dl += nchannel;
 454       }
 455     }
 456 
 457     k_off += max_hsize;
 458     adr_dst += max_hsize*dll;
 459   }
 460 
 461   if (pbuff != buff) mlib_free(pbuff);
 462 
 463   return MLIB_SUCCESS;
 464 }
 465 
 466 /***************************************************************/
 467 mlib_status CONV_FUNC_MxN
 468 {
 469   DTYPE    *adr_src, *sl, *sp = NULL;
 470   DTYPE    *adr_dst, *dl, *dp = NULL;
 471   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 472   FTYPE    **buffs = buffs_arr, *buffd;
 473   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
 474   FTYPE    *pbuff = buff;
 475   FTYPE    k0, k1, k2, k3, k4, k5, k6;
 476   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
 477   mlib_s32 *buffi;
 478   mlib_s32 mn, l, off, kw, bsize, buff_ind;
 479   mlib_s32 d0, d1;
 480   mlib_s32 wid, hgt, sll, dll;
 481   mlib_s32 nchannel, chan1, chan2;
 482   mlib_s32 i, j, c, swid;
 483   d64_2x32 dd;
 484   mlib_status status = MLIB_SUCCESS;
 485 
 486   GET_SRC_DST_PARAMETERS(DTYPE);
 487 
 488   if (scale > 30) {
 489     fscale *= 1.0/(1 << 30);
 490     scale -= 30;
 491   }
 492 
 493   fscale /= (1 << scale);
 494 
 495   mn = m*n;
 496 
 497   if (mn > 256) {
 498     k = mlib_malloc(mn*sizeof(mlib_d64));
 499 
 500     if (k == NULL) return MLIB_FAILURE;
 501   }
 502 
 503   for (i = 0; i < mn; i++) {
 504     k[i] = kernel[i]*fscale;
 505   }
 506 
 507   if (m == 1) {
 508     status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
 509     FREE_AND_RETURN_STATUS;
 510   }
 511 
 512   swid = wid + (m - 1);
 513 
 514   bsize = (n + 3)*swid;
 515 
 516   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 517     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
 518 
 519     if (pbuff == NULL) {
 520       status = MLIB_FAILURE;
 521       FREE_AND_RETURN_STATUS;
 522     }
 523     buffs = (FTYPE   **)(pbuff + bsize);
 524   }
 525 
 526   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 527   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
 528   buffd = buffs[n] + swid;
 529   buffi = (mlib_s32*)(buffd + swid);
 530 
 531   chan1 = nchannel;
 532   chan2 = chan1 + chan1;
 533 
 534   swid -= (dx_l + dx_r);
 535 
 536   for (c = 0; c < nchannel; c++) {
 537     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 538 
 539     sl = adr_src + c;
 540     dl = adr_dst + c;
 541 
 542     for (l = 0; l < n; l++) {
 543       FTYPE    *buff = buffs[l];
 544 
 545       for (i = 0; i < dx_l; i++) {
 546         buff[i] = (FTYPE)sl[0];
 547       }
 548 
 549       for (i = 0; i < swid; i++) {
 550         buff[i + dx_l] = (FTYPE)sl[i*chan1];
 551       }
 552 
 553       for (i = 0; i < dx_r; i++) {
 554         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
 555       }
 556 
 557       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
 558     }
 559 
 560     buff_ind = 0;
 561 
 562     for (i = 0; i < wid; i++) buffd[i] = 0.0;
 563 
 564     for (j = 0; j < hgt; j++) {
 565       FTYPE    **buffc = buffs + buff_ind;
 566       FTYPE    *buffn = buffc[n];
 567       FTYPE    *pk = k;
 568 
 569       for (l = 0; l < n; l++) {
 570         FTYPE    *buff_l = buffc[l];
 571 
 572         for (off = 0; off < m;) {
 573           FTYPE    *buff = buff_l + off;
 574 
 575           kw = m - off;
 576 
 577           if (kw > 2*MAX_KER) kw = MAX_KER; else
 578             if (kw > MAX_KER) kw = kw/2;
 579           off += kw;
 580 
 581           sp = sl;
 582           dp = dl;
 583 
 584           if (kw == 7) {
 585 
 586             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 587             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
 588 
 589             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 590             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
 591 
 592             if (l < (n - 1) || off < m) {
 593               for (i = 0; i <= (wid - 2); i += 2) {
 594                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 595 
 596                 p6 = buff[i + 6]; p7 = buff[i + 7];
 597 
 598                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
 599                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
 600               }
 601 
 602             } else {
 603               for (i = 0; i <= (wid - 2); i += 2) {
 604                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 605 
 606                 p6 = buff[i + 6]; p7 = buff[i + 7];
 607 
 608                 LOAD_BUFF(buffi);
 609 
 610                 dd.d64 = *(FTYPE   *)(buffi + i);
 611                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 612                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 613 
 614                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
 615                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
 616 
 617                 dp[0    ] = FROM_S32(d0);
 618                 dp[chan1] = FROM_S32(d1);
 619 
 620                 buffd[i    ] = 0.0;
 621                 buffd[i + 1] = 0.0;
 622 
 623                 sp += chan2;
 624                 dp += chan2;
 625               }
 626             }
 627 
 628           } else if (kw == 6) {
 629 
 630             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 631             p5 = buff[3]; p6 = buff[4];
 632 
 633             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 634             k4 = pk[4]; k5 = pk[5];
 635 
 636             if (l < (n - 1) || off < m) {
 637               for (i = 0; i <= (wid - 2); i += 2) {
 638                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 639 
 640                 p5 = buff[i + 5]; p6 = buff[i + 6];
 641 
 642                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
 643                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
 644               }
 645 
 646             } else {
 647               for (i = 0; i <= (wid - 2); i += 2) {
 648                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 649 
 650                 p5 = buff[i + 5]; p6 = buff[i + 6];
 651 
 652                 LOAD_BUFF(buffi);
 653 
 654                 dd.d64 = *(FTYPE   *)(buffi + i);
 655                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 656                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 657 
 658                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
 659                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
 660 
 661                 dp[0    ] = FROM_S32(d0);
 662                 dp[chan1] = FROM_S32(d1);
 663 
 664                 buffd[i    ] = 0.0;
 665                 buffd[i + 1] = 0.0;
 666 
 667                 sp += chan2;
 668                 dp += chan2;
 669               }
 670             }
 671 
 672           } else if (kw == 5) {
 673 
 674             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 675             p5 = buff[3];
 676 
 677             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 678             k4 = pk[4];
 679 
 680             if (l < (n - 1) || off < m) {
 681               for (i = 0; i <= (wid - 2); i += 2) {
 682                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 683 
 684                 p4 = buff[i + 4]; p5 = buff[i + 5];
 685 
 686                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
 687                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
 688               }
 689 
 690             } else {
 691               for (i = 0; i <= (wid - 2); i += 2) {
 692                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 693 
 694                 p4 = buff[i + 4]; p5 = buff[i + 5];
 695 
 696                 LOAD_BUFF(buffi);
 697 
 698                 dd.d64 = *(FTYPE   *)(buffi + i);
 699                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 700                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 701 
 702                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
 703                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
 704 
 705                 dp[0    ] = FROM_S32(d0);
 706                 dp[chan1] = FROM_S32(d1);
 707 
 708                 buffd[i    ] = 0.0;
 709                 buffd[i + 1] = 0.0;
 710 
 711                 sp += chan2;
 712                 dp += chan2;
 713               }
 714             }
 715 
 716           } else if (kw == 4) {
 717 
 718             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 719 
 720             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 721 
 722             if (l < (n - 1) || off < m) {
 723               for (i = 0; i <= (wid - 2); i += 2) {
 724                 p0 = p2; p1 = p3; p2 = p4;
 725 
 726                 p3 = buff[i + 3]; p4 = buff[i + 4];
 727 
 728                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 729                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 730               }
 731 
 732             } else {
 733               for (i = 0; i <= (wid - 2); i += 2) {
 734                 p0 = p2; p1 = p3; p2 = p4;
 735 
 736                 p3 = buff[i + 3]; p4 = buff[i + 4];
 737 
 738                 LOAD_BUFF(buffi);
 739 
 740                 dd.d64 = *(FTYPE   *)(buffi + i);
 741                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 742                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 743 
 744                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 745                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 746 
 747                 dp[0    ] = FROM_S32(d0);
 748                 dp[chan1] = FROM_S32(d1);
 749 
 750                 buffd[i    ] = 0.0;
 751                 buffd[i + 1] = 0.0;
 752 
 753                 sp += chan2;
 754                 dp += chan2;
 755               }
 756             }
 757 
 758           } else if (kw == 3) {
 759 
 760             p2 = buff[0]; p3 = buff[1];
 761             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 762 
 763             if (l < (n - 1) || off < m) {
 764               for (i = 0; i <= (wid - 2); i += 2) {
 765                 p0 = p2; p1 = p3;
 766 
 767                 p2 = buff[i + 2]; p3 = buff[i + 3];
 768 
 769                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
 770                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
 771               }
 772 
 773             } else {
 774               for (i = 0; i <= (wid - 2); i += 2) {
 775                 p0 = p2; p1 = p3;
 776 
 777                 p2 = buff[i + 2]; p3 = buff[i + 3];
 778 
 779                 LOAD_BUFF(buffi);
 780 
 781                 dd.d64 = *(FTYPE   *)(buffi + i);
 782                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 783                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 784 
 785                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 786                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 787 
 788                 dp[0    ] = FROM_S32(d0);
 789                 dp[chan1] = FROM_S32(d1);
 790 
 791                 buffd[i    ] = 0.0;
 792                 buffd[i + 1] = 0.0;
 793 
 794                 sp += chan2;
 795                 dp += chan2;
 796               }
 797             }
 798 
 799           } else /* if (kw == 2) */ {
 800 
 801             p2 = buff[0];
 802             k0 = pk[0]; k1 = pk[1];
 803 
 804             if (l < (n - 1) || off < m) {
 805               for (i = 0; i <= (wid - 2); i += 2) {
 806                 p0 = p2;
 807 
 808                 p1 = buff[i + 1]; p2 = buff[i + 2];
 809 
 810                 buffd[i    ] += p0*k0 + p1*k1;
 811                 buffd[i + 1] += p1*k0 + p2*k1;
 812               }
 813 
 814             } else {
 815               for (i = 0; i <= (wid - 2); i += 2) {
 816                 p0 = p2;
 817 
 818                 p1 = buff[i + 1]; p2 = buff[i + 2];
 819 
 820                 LOAD_BUFF(buffi);
 821 
 822                 dd.d64 = *(FTYPE   *)(buffi + i);
 823                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 824                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 825 
 826                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 827                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 828 
 829                 dp[0    ] = FROM_S32(d0);
 830                 dp[chan1] = FROM_S32(d1);
 831 
 832                 buffd[i    ] = 0.0;
 833                 buffd[i + 1] = 0.0;
 834 
 835                 sp += chan2;
 836                 dp += chan2;
 837               }
 838             }
 839           }
 840 
 841           pk += kw;
 842         }
 843       }
 844 
 845       /* last pixels */
 846       for (; i < wid; i++) {
 847         FTYPE    *pk = k, s = 0;
 848         mlib_s32 x, d0;
 849 
 850         for (l = 0; l < n; l++) {
 851           FTYPE    *buff = buffc[l] + i;
 852 
 853           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
 854         }
 855 
 856         d0 = D2I(s);
 857         dp[0] = FROM_S32(d0);
 858 
 859         buffn[i + dx_l] = (FTYPE)sp[0];
 860 
 861         sp += chan1;
 862         dp += chan1;
 863       }
 864 
 865       for (; i < swid; i++) {
 866         buffn[i + dx_l] = (FTYPE)sp[0];
 867         sp += chan1;
 868       }
 869 
 870       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
 871       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
 872 
 873       /* next line */
 874 
 875       if (j < hgt - dy_b - 2) sl += sll;
 876       dl += dll;
 877 
 878       buff_ind++;
 879 
 880       if (buff_ind >= n + 1) buff_ind = 0;
 881     }
 882   }
 883 
 884   FREE_AND_RETURN_STATUS;
 885 }
 886 
 887 /***************************************************************/
 888 /* for x86, using integer multiplies is faster */
 889 
 890 #define STORE_RES(res, x)                                       \
 891   x >>= shift2;                                                 \
 892   CLAMP_STORE(res, x)
 893 
 894 mlib_status CONV_FUNC_MxN_I
 895 {
 896   DTYPE    *adr_src, *sl, *sp = NULL;
 897   DTYPE    *adr_dst, *dl, *dp = NULL;
 898   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 899   mlib_s32 *pbuff = buff;
 900   mlib_s32 **buffs = buffs_arr, *buffd;
 901   mlib_s32 l, off, kw, bsize, buff_ind;
 902   mlib_s32 d0, d1, shift1, shift2;
 903   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
 904   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
 905   mlib_s32 wid, hgt, sll, dll;
 906   mlib_s32 nchannel, chan1;
 907   mlib_s32 i, j, c, swid;
 908   mlib_s32 chan2;
 909   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
 910   GET_SRC_DST_PARAMETERS(DTYPE);
 911 
 912 #if IMG_TYPE != 1
 913   shift1 = 16;
 914 #else
 915   shift1 = 8;
 916 #endif /* IMG_TYPE != 1 */
 917   shift2 = scale - shift1;
 918 
 919   chan1 = nchannel;
 920   chan2 = chan1 + chan1;
 921 
 922   swid = wid + (m - 1);
 923 
 924   bsize = (n + 2)*swid;
 925 
 926   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 927     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
 928 
 929     if (pbuff == NULL) return MLIB_FAILURE;
 930     buffs = (mlib_s32 **)(pbuff + bsize);
 931   }
 932 
 933   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 934   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
 935   buffd = buffs[n] + swid;
 936 
 937   if (m*n > MAX_N*MAX_N) {
 938     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
 939 
 940     if (k == NULL) {
 941       if (pbuff != buff) mlib_free(pbuff);
 942       return MLIB_FAILURE;
 943     }
 944   }
 945 
 946   for (i = 0; i < m*n; i++) {
 947     k[i] = kernel[i] >> shift1;
 948   }
 949 
 950   swid -= (dx_l + dx_r);
 951 
 952   for (c = 0; c < nchannel; c++) {
 953     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 954 
 955     sl = adr_src + c;
 956     dl = adr_dst + c;
 957 
 958     for (l = 0; l < n; l++) {
 959       mlib_s32  *buff = buffs[l];
 960 
 961       for (i = 0; i < dx_l; i++) {
 962         buff[i] = (mlib_s32)sl[0];
 963       }
 964 
 965       for (i = 0; i < swid; i++) {
 966         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
 967       }
 968 
 969       for (i = 0; i < dx_r; i++) {
 970         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
 971       }
 972 
 973       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
 974     }
 975 
 976     buff_ind = 0;
 977 
 978     for (i = 0; i < wid; i++) buffd[i] = 0;
 979 
 980     for (j = 0; j < hgt; j++) {
 981       mlib_s32 **buffc = buffs + buff_ind;
 982       mlib_s32 *buffn = buffc[n];
 983       mlib_s32 *pk = k;
 984 
 985       for (l = 0; l < n; l++) {
 986         mlib_s32  *buff_l = buffc[l];
 987 
 988         for (off = 0; off < m;) {
 989           mlib_s32 *buff = buff_l + off;
 990 
 991           sp = sl;
 992           dp = dl;
 993 
 994           kw = m - off;
 995 
 996           if (kw > 2*MAX_KER) kw = MAX_KER; else
 997             if (kw > MAX_KER) kw = kw/2;
 998           off += kw;
 999 
1000           if (kw == 7) {
1001 
1002             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1003             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1004 
1005             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1006             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1007 
1008             if (l < (n - 1) || off < m) {
1009               for (i = 0; i <= (wid - 2); i += 2) {
1010                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1011 
1012                 p6 = buff[i + 6]; p7 = buff[i + 7];
1013 
1014                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1015                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1016               }
1017 
1018             } else {
1019               for (i = 0; i <= (wid - 2); i += 2) {
1020                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1021 
1022                 p6 = buff[i + 6]; p7 = buff[i + 7];
1023 
1024                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1025                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1026 
1027                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1028                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1029 
1030                 STORE_RES(dp[0    ], d0);
1031                 STORE_RES(dp[chan1], d1);
1032 
1033                 buffd[i    ] = 0;
1034                 buffd[i + 1] = 0;
1035 
1036                 sp += chan2;
1037                 dp += chan2;
1038               }
1039             }
1040 
1041           } else if (kw == 6) {
1042 
1043             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1044             p5 = buff[3]; p6 = buff[4];
1045 
1046             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1047             k4 = pk[4]; k5 = pk[5];
1048 
1049             if (l < (n - 1) || off < m) {
1050               for (i = 0; i <= (wid - 2); i += 2) {
1051                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1052 
1053                 p5 = buff[i + 5]; p6 = buff[i + 6];
1054 
1055                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1056                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1057               }
1058 
1059             } else {
1060               for (i = 0; i <= (wid - 2); i += 2) {
1061                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1062 
1063                 p5 = buff[i + 5]; p6 = buff[i + 6];
1064 
1065                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1066                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1067 
1068                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1069                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1070 
1071                 STORE_RES(dp[0    ], d0);
1072                 STORE_RES(dp[chan1], d1);
1073 
1074                 buffd[i    ] = 0;
1075                 buffd[i + 1] = 0;
1076 
1077                 sp += chan2;
1078                 dp += chan2;
1079               }
1080             }
1081 
1082           } else if (kw == 5) {
1083 
1084             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1085             p5 = buff[3];
1086 
1087             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1088             k4 = pk[4];
1089 
1090             if (l < (n - 1) || off < m) {
1091               for (i = 0; i <= (wid - 2); i += 2) {
1092                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1093 
1094                 p4 = buff[i + 4]; p5 = buff[i + 5];
1095 
1096                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1097                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1098               }
1099 
1100             } else {
1101               for (i = 0; i <= (wid - 2); i += 2) {
1102                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1103 
1104                 p4 = buff[i + 4]; p5 = buff[i + 5];
1105 
1106                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1107                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1108 
1109                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1110                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1111 
1112                 STORE_RES(dp[0    ], d0);
1113                 STORE_RES(dp[chan1], d1);
1114 
1115                 buffd[i    ] = 0;
1116                 buffd[i + 1] = 0;
1117 
1118                 sp += chan2;
1119                 dp += chan2;
1120               }
1121             }
1122 
1123           } else if (kw == 4) {
1124 
1125             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1126 
1127             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1128 
1129             if (l < (n - 1) || off < m) {
1130               for (i = 0; i <= (wid - 2); i += 2) {
1131                 p0 = p2; p1 = p3; p2 = p4;
1132 
1133                 p3 = buff[i + 3]; p4 = buff[i + 4];
1134 
1135                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1136                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1137               }
1138 
1139             } else {
1140               for (i = 0; i <= (wid - 2); i += 2) {
1141                 p0 = p2; p1 = p3; p2 = p4;
1142 
1143                 p3 = buff[i + 3]; p4 = buff[i + 4];
1144 
1145                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1146                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1147 
1148                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1149                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1150 
1151                 STORE_RES(dp[0    ], d0);
1152                 STORE_RES(dp[chan1], d1);
1153 
1154                 buffd[i    ] = 0;
1155                 buffd[i + 1] = 0;
1156 
1157                 sp += chan2;
1158                 dp += chan2;
1159               }
1160             }
1161 
1162           } else if (kw == 3) {
1163 
1164             p2 = buff[0]; p3 = buff[1];
1165             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1166 
1167             if (l < (n - 1) || off < m) {
1168               for (i = 0; i <= (wid - 2); i += 2) {
1169                 p0 = p2; p1 = p3;
1170 
1171                 p2 = buff[i + 2]; p3 = buff[i + 3];
1172 
1173                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1174                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1175               }
1176 
1177             } else {
1178               for (i = 0; i <= (wid - 2); i += 2) {
1179                 p0 = p2; p1 = p3;
1180 
1181                 p2 = buff[i + 2]; p3 = buff[i + 3];
1182 
1183                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1184                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1185 
1186                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1187                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1188 
1189                 STORE_RES(dp[0    ], d0);
1190                 STORE_RES(dp[chan1], d1);
1191 
1192                 buffd[i    ] = 0;
1193                 buffd[i + 1] = 0;
1194 
1195                 sp += chan2;
1196                 dp += chan2;
1197               }
1198             }
1199 
1200           } else if (kw == 2) {
1201 
1202             p2 = buff[0];
1203             k0 = pk[0]; k1 = pk[1];
1204 
1205             if (l < (n - 1) || off < m) {
1206               for (i = 0; i <= (wid - 2); i += 2) {
1207                 p0 = p2;
1208 
1209                 p1 = buff[i + 1]; p2 = buff[i + 2];
1210 
1211                 buffd[i    ] += p0*k0 + p1*k1;
1212                 buffd[i + 1] += p1*k0 + p2*k1;
1213               }
1214 
1215             } else {
1216               for (i = 0; i <= (wid - 2); i += 2) {
1217                 p0 = p2;
1218 
1219                 p1 = buff[i + 1]; p2 = buff[i + 2];
1220 
1221                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1222                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1223 
1224                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1225                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1226 
1227                 STORE_RES(dp[0    ], d0);
1228                 STORE_RES(dp[chan1], d1);
1229 
1230                 buffd[i    ] = 0;
1231                 buffd[i + 1] = 0;
1232 
1233                 sp += chan2;
1234                 dp += chan2;
1235               }
1236             }
1237 
1238           } else /* kw == 1 */{
1239 
1240             k0 = pk[0];
1241 
1242             if (l < (n - 1) || off < m) {
1243               for (i = 0; i <= (wid - 2); i += 2) {
1244                 p0 = buff[i]; p1 = buff[i + 1];
1245 
1246                 buffd[i    ] += p0*k0;
1247                 buffd[i + 1] += p1*k0;
1248               }
1249 
1250             } else {
1251               for (i = 0; i <= (wid - 2); i += 2) {
1252                 p0 = buff[i]; p1 = buff[i + 1];
1253 
1254                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1255                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1256 
1257                 d0 = (p0*k0 + buffd[i    ]);
1258                 d1 = (p1*k0 + buffd[i + 1]);
1259 
1260                 STORE_RES(dp[0    ], d0);
1261                 STORE_RES(dp[chan1], d1);
1262 
1263                 buffd[i    ] = 0;
1264                 buffd[i + 1] = 0;
1265 
1266                 sp += chan2;
1267                 dp += chan2;
1268               }
1269             }
1270           }
1271 
1272           pk += kw;
1273         }
1274       }
1275 
1276       /* last pixels */
1277       for (; i < wid; i++) {
1278         mlib_s32 *pk = k, x, s = 0;
1279 
1280         for (l = 0; l < n; l++) {
1281           mlib_s32 *buff = buffc[l] + i;
1282 
1283           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1284         }
1285 
1286         STORE_RES(dp[0], s);
1287 
1288         buffn[i + dx_l] = (mlib_s32)sp[0];
1289 
1290         sp += chan1;
1291         dp += chan1;
1292       }
1293 
1294       for (; i < swid; i++) {
1295         buffn[i + dx_l] = (mlib_s32)sp[0];
1296         sp += chan1;
1297       }
1298 
1299       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1300       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1301 
1302       /* next line */
1303 
1304       if (j < hgt - dy_b - 2) sl += sll;
1305       dl += dll;
1306 
1307       buff_ind++;
1308 
1309       if (buff_ind >= n + 1) buff_ind = 0;
1310     }
1311   }
1312 
1313   if (pbuff != buff) mlib_free(pbuff);
1314   if (k != k_locl) mlib_free(k);
1315 
1316   return MLIB_SUCCESS;
1317 }
1318 
1319 /***************************************************************/