1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
  30  *   MLIB_EDGE_SRC_EXTEND mask
  31  */
  32 
  33 #include "mlib_image.h"
  34 #include "mlib_ImageConv.h"
  35 #include "mlib_c_ImageConv.h"
  36 
  37 /*
  38  * This define switches between functions of different data types
  39  */
  40 
  41 #define IMG_TYPE 3
  42 
  43 /***************************************************************/
  44 #if IMG_TYPE == 1
  45 
  46 #define DTYPE             mlib_u8
  47 #define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
  48 #define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
  49 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
  50 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
  51 #define DSCALE            (1 << 24)
  52 #define FROM_S32(x)       (((x) >> 24) ^ 128)
  53 #define S64TOS32(x)       (x)
  54 #define SAT_OFF           -(1u << 31)
  55 
  56 #elif IMG_TYPE == 2
  57 
  58 #define DTYPE             mlib_s16
  59 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
  60 #define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
  61 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
  62 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
  63 #define DSCALE            65536.0
  64 #define FROM_S32(x)       ((x) >> 16)
  65 #define S64TOS32(x)       ((x) & 0xffffffff)
  66 #define SAT_OFF
  67 
  68 #elif IMG_TYPE == 3
  69 
  70 #define DTYPE             mlib_u16
  71 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
  72 #define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
  73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
  74 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
  75 #define DSCALE            65536.0
  76 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
  77 #define S64TOS32(x)       (x)
  78 #define SAT_OFF           -(1u << 31)
  79 
  80 #endif /* IMG_TYPE == 1 */
  81 
  82 /***************************************************************/
  83 #define PARAM                                                   \
  84   mlib_image       *dst,                                        \
  85   const mlib_image *src,                                        \
  86   mlib_s32         dx_l,                                        \
  87   mlib_s32         dx_r,                                        \
  88   mlib_s32         dy_t,                                        \
  89   mlib_s32         dy_b,                                        \
  90   const mlib_s32   *kern,                                       \
  91   mlib_s32         scalef_expon,                                \
  92   mlib_s32         cmask
  93 
  94 /***************************************************************/
  95 #define PARAM_MxN                                               \
  96   mlib_image       *dst,                                        \
  97   const mlib_image *src,                                        \
  98   const mlib_s32   *kernel,                                     \
  99   mlib_s32         m,                                           \
 100   mlib_s32         n,                                           \
 101   mlib_s32         dx_l,                                        \
 102   mlib_s32         dx_r,                                        \
 103   mlib_s32         dy_t,                                        \
 104   mlib_s32         dy_b,                                        \
 105   mlib_s32         scale,                                       \
 106   mlib_s32         cmask
 107 
 108 /***************************************************************/
 109 #define FTYPE mlib_d64
 110 
 111 #ifndef MLIB_USE_FTOI_CLAMPING
 112 
 113 #define CLAMP_S32(x)                                            \
 114   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))
 115 
 116 #else
 117 
 118 #define CLAMP_S32(x) ((mlib_s32)(x))
 119 
 120 #endif /* MLIB_USE_FTOI_CLAMPING */
 121 
 122 /***************************************************************/
 123 #define D2I(x) CLAMP_S32((x) SAT_OFF)
 124 
 125 /***************************************************************/
 126 #ifdef _LITTLE_ENDIAN
 127 
 128 #define STORE2(res0, res1)                                      \
 129   dp[0    ] = res1;                                             \
 130   dp[chan1] = res0
 131 
 132 #else
 133 
 134 #define STORE2(res0, res1)                                      \
 135   dp[0    ] = res0;                                             \
 136   dp[chan1] = res1
 137 
 138 #endif /* _LITTLE_ENDIAN */
 139 
 140 /***************************************************************/
 141 #ifdef _NO_LONGLONG
 142 
 143 #define LOAD_BUFF(buff)                                         \
 144   buff[i    ] = sp[0];                                          \
 145   buff[i + 1] = sp[chan1]
 146 
 147 #else /* _NO_LONGLONG */
 148 
 149 #ifdef _LITTLE_ENDIAN
 150 
 151 #define LOAD_BUFF(buff)                                         \
 152   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
 153 
 154 #else /* _LITTLE_ENDIAN */
 155 
 156 #define LOAD_BUFF(buff)                                         \
 157   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
 158 
 159 #endif /* _LITTLE_ENDIAN */
 160 #endif /* _NO_LONGLONG */
 161 
 162 /***************************************************************/
 163 typedef union {
 164   mlib_d64 d64;
 165   struct {
 166     mlib_s32 i0;
 167     mlib_s32 i1;
 168   } i32s;
 169 } d64_2x32;
 170 
 171 /***************************************************************/
 172 #define DEF_VARS(type)                                          \
 173   type     *adr_src, *sl, *sp, *sl1;                            \
 174   type     *adr_dst, *dl, *dp;                                  \
 175   FTYPE    *pbuff = buff;                                       \
 176   mlib_s32 *buffi, *buffo;                                      \
 177   mlib_s32 wid, hgt, sll, dll;                                  \
 178   mlib_s32 nchannel, chan1, chan2;                              \
 179   mlib_s32 i, j, c, swid
 180 
 181 /***************************************************************/
 182 #define GET_SRC_DST_PARAMETERS(type)                            \
 183   hgt = mlib_ImageGetHeight(src);                               \
 184   wid = mlib_ImageGetWidth(src);                                \
 185   nchannel = mlib_ImageGetChannels(src);                        \
 186   sll = mlib_ImageGetStride(src) / sizeof(type);                \
 187   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
 188   adr_src = (type *)mlib_ImageGetData(src);                     \
 189   adr_dst = (type *)mlib_ImageGetData(dst)
 190 
 191 /***************************************************************/
 192 #if IMG_TYPE == 1
 193 
 194 /*
 195  * Test for the presence of any "1" bit in bits
 196    8 to 31 of val. If present, then val is either
 197    negative or >255. If over/underflows of 8 bits
 198    are uncommon, then this technique can be a win,
 199    since only a single test, rather than two, is
 200    necessary to determine if clamping is needed.
 201    On the other hand, if over/underflows are common,
 202    it adds an extra test.
 203 */
 204 #define CLAMP_STORE(dst, val)                                   \
 205   if (val & 0xffffff00) {                                       \
 206     if (val < MLIB_U8_MIN)                                      \
 207       dst = MLIB_U8_MIN;                                        \
 208     else                                                        \
 209       dst = MLIB_U8_MAX;                                        \
 210   } else {                                                      \
 211     dst = (mlib_u8)val;                                         \
 212   }
 213 
 214 #elif IMG_TYPE == 2
 215 
 216 #define CLAMP_STORE(dst, val)                                   \
 217   if (val >= MLIB_S16_MAX)                                      \
 218     dst = MLIB_S16_MAX;                                         \
 219   else if (val <= MLIB_S16_MIN)                                 \
 220     dst = MLIB_S16_MIN;                                         \
 221   else                                                          \
 222     dst = (mlib_s16)val
 223 
 224 #elif IMG_TYPE == 3
 225 
 226 #define CLAMP_STORE(dst, val)                                   \
 227   if (val >= MLIB_U16_MAX)                                      \
 228     dst = MLIB_U16_MAX;                                         \
 229   else if (val <= MLIB_U16_MIN)                                 \
 230     dst = MLIB_U16_MIN;                                         \
 231   else                                                          \
 232     dst = (mlib_u16)val
 233 
 234 #endif /* IMG_TYPE == 1 */
 235 
 236 /***************************************************************/
 237 #define MAX_KER   7
 238 #define MAX_N    15
 239 #define BUFF_SIZE   1600
 240 #define CACHE_SIZE  (64*1024)
 241 
 242 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
 243                                          const mlib_image *src,
 244                                          const mlib_d64   *k,
 245                                          mlib_s32         n,
 246                                          mlib_s32         dy_t,
 247                                          mlib_s32         dy_b,
 248                                          mlib_s32         cmask)
 249 {
 250   DTYPE    *adr_src, *sl;
 251   DTYPE    *adr_dst, *dl, *dp;
 252   FTYPE    buff[BUFF_SIZE];
 253   FTYPE    *buffd;
 254   FTYPE    *pbuff = buff;
 255   const FTYPE    *pk;
 256   FTYPE    k0, k1, k2, k3;
 257   FTYPE    p0, p1, p2, p3, p4;
 258   FTYPE    *sbuff;
 259   mlib_s32 l, k_off, off, bsize;
 260   mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
 261   mlib_s32 d0, d1, ii;
 262   mlib_s32 wid, hgt, sll, dll;
 263   mlib_s32 nchannel;
 264   mlib_s32 i, j, c;
 265   GET_SRC_DST_PARAMETERS(DTYPE);
 266 
 267   max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);
 268 
 269   if (max_hsize < 1) max_hsize = 1;
 270   if (max_hsize > hgt) max_hsize = hgt;
 271 
 272   shgt = hgt + (n - 1);
 273   smax_hsize = max_hsize + (n - 1);
 274 
 275   bsize = 2 * (smax_hsize + 1);
 276 
 277   if (bsize > BUFF_SIZE) {
 278     pbuff = mlib_malloc(sizeof(FTYPE)*bsize);
 279 
 280     if (pbuff == NULL) return MLIB_FAILURE;
 281   }
 282 
 283   sbuff = pbuff;
 284   buffd = sbuff + smax_hsize;
 285 
 286   shgt -= (dy_t + dy_b);
 287   k_off = 0;
 288 
 289   for (l = 0; l < hgt; l += hsize) {
 290     hsize = hgt - l;
 291 
 292     if (hsize > max_hsize) hsize = max_hsize;
 293 
 294     smax_hsize = hsize + (n - 1);
 295 
 296     for (c = 0; c < nchannel; c++) {
 297       if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 298 
 299       sl = adr_src + c;
 300       dl = adr_dst + c;
 301 
 302       for (i = 0; i < hsize; i++) buffd[i] = 0.0;
 303 
 304       for (j = 0; j < wid; j++) {
 305         FTYPE    *buff = sbuff;
 306 
 307         for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
 308           sbuff[i - k_off] = (FTYPE)sl[0];
 309         }
 310 
 311         for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
 312           sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
 313         }
 314 
 315         for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
 316           sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
 317         }
 318 
 319         pk = k;
 320 
 321         for (off = 0; off < (n - 4); off += 4) {
 322 
 323           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 324           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 325 
 326           for (i = 0; i < hsize; i += 2) {
 327             p0 = p2; p1 = p3; p2 = p4;
 328 
 329             p3 = buff[i + 3]; p4 = buff[i + 4];
 330 
 331             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 332             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 333           }
 334 
 335           pk += 4;
 336           buff += 4;
 337         }
 338 
 339         dp = dl;
 340         kh = n - off;
 341 
 342         if (kh == 4) {
 343           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 344           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 345 
 346           for (i = 0; i <= (hsize - 2); i += 2) {
 347             p0 = p2; p1 = p3; p2 = p4;
 348 
 349             p3 = buff[i + 3]; p4 = buff[i + 4];
 350 
 351             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 352             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 353 
 354             dp[0  ] = FROM_S32(d0);
 355             dp[dll] = FROM_S32(d1);
 356 
 357             buffd[i    ] = 0.0;
 358             buffd[i + 1] = 0.0;
 359 
 360             dp += 2*dll;
 361           }
 362 
 363           if (i < hsize) {
 364             p0 = p2; p1 = p3; p2 = p4;
 365             p3 = buff[i + 3];
 366             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
 367             dp[0] = FROM_S32(d0);
 368             buffd[i] = 0.0;
 369           }
 370 
 371         } else if (kh == 3) {
 372 
 373           p2 = buff[0]; p3 = buff[1];
 374           k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 375 
 376           for (i = 0; i <= (hsize - 2); i += 2) {
 377             p0 = p2; p1 = p3;
 378 
 379             p2 = buff[i + 2]; p3 = buff[i + 3];
 380 
 381             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 382             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 383 
 384             dp[0  ] = FROM_S32(d0);
 385             dp[dll] = FROM_S32(d1);
 386 
 387             buffd[i    ] = 0.0;
 388             buffd[i + 1] = 0.0;
 389 
 390             dp += 2*dll;
 391           }
 392 
 393           if (i < hsize) {
 394             p0 = p2; p1 = p3;
 395             p2 = buff[i + 2];
 396             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
 397             dp[0] = FROM_S32(d0);
 398 
 399             buffd[i] = 0.0;
 400           }
 401 
 402         } else if (kh == 2) {
 403 
 404           p2 = buff[0];
 405           k0 = pk[0]; k1 = pk[1];
 406 
 407           for (i = 0; i <= (hsize - 2); i += 2) {
 408             p0 = p2;
 409 
 410             p1 = buff[i + 1]; p2 = buff[i + 2];
 411 
 412             d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 413             d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 414 
 415             dp[0  ] = FROM_S32(d0);
 416             dp[dll] = FROM_S32(d1);
 417 
 418             buffd[i    ] = 0.0;
 419             buffd[i + 1] = 0.0;
 420 
 421             dp += 2*dll;
 422           }
 423 
 424           if (i < hsize) {
 425             p0 = p2;
 426             p1 = buff[i + 1];
 427             d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
 428             dp[0] = FROM_S32(d0);
 429 
 430             buffd[i] = 0.0;
 431           }
 432 
 433         } else /* kh == 1 */{
 434 
 435           k0 = pk[0];
 436 
 437           for (i = 0; i <= (hsize - 2); i += 2) {
 438             p0 = buff[i]; p1 = buff[i + 1];
 439 
 440             d0 = D2I(p0*k0 + buffd[i    ]);
 441             d1 = D2I(p1*k0 + buffd[i + 1]);
 442 
 443             dp[0  ] = FROM_S32(d0);
 444             dp[dll] = FROM_S32(d1);
 445 
 446             buffd[i    ] = 0.0;
 447             buffd[i + 1] = 0.0;
 448 
 449             dp += 2*dll;
 450           }
 451 
 452           if (i < hsize) {
 453             p0 = buff[i];
 454             d0 = D2I(p0*k0 + buffd[i]);
 455             dp[0] = FROM_S32(d0);
 456 
 457             buffd[i] = 0.0;
 458           }
 459         }
 460 
 461         /* next line */
 462         sl += nchannel;
 463         dl += nchannel;
 464       }
 465     }
 466 
 467     k_off += max_hsize;
 468     adr_dst += max_hsize*dll;
 469   }
 470 
 471   if (pbuff != buff) mlib_free(pbuff);
 472 
 473   return MLIB_SUCCESS;
 474 }
 475 
 476 /***************************************************************/
 477 mlib_status CONV_FUNC_MxN
 478 {
 479   DTYPE    *adr_src, *sl, *sp = NULL;
 480   DTYPE    *adr_dst, *dl, *dp = NULL;
 481   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 482   FTYPE    **buffs = buffs_arr, *buffd;
 483   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
 484   FTYPE    *pbuff = buff;
 485   FTYPE    k0, k1, k2, k3, k4, k5, k6;
 486   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
 487   mlib_s32 *buffi;
 488   mlib_s32 mn, l, off, kw, bsize, buff_ind;
 489   mlib_s32 d0, d1;
 490   mlib_s32 wid, hgt, sll, dll;
 491   mlib_s32 nchannel, chan1, chan2;
 492   mlib_s32 i, j, c, swid;
 493   d64_2x32 dd;
 494   mlib_status status = MLIB_SUCCESS;
 495 
 496   GET_SRC_DST_PARAMETERS(DTYPE);
 497 
 498   if (scale > 30) {
 499     fscale *= 1.0/(1 << 30);
 500     scale -= 30;
 501   }
 502 
 503   fscale /= (1 << scale);
 504 
 505   mn = m*n;
 506 
 507   if (mn > 256) {
 508     k = mlib_malloc(mn*sizeof(mlib_d64));
 509 
 510     if (k == NULL) return MLIB_FAILURE;
 511   }
 512 
 513   for (i = 0; i < mn; i++) {
 514     k[i] = kernel[i]*fscale;
 515   }
 516 
 517   if (m == 1) {
 518     status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
 519     FREE_AND_RETURN_STATUS;
 520   }
 521 
 522   swid = wid + (m - 1);
 523 
 524   bsize = (n + 3)*swid;
 525 
 526   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 527     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
 528 
 529     if (pbuff == NULL) {
 530       status = MLIB_FAILURE;
 531       FREE_AND_RETURN_STATUS;
 532     }
 533     buffs = (FTYPE   **)(pbuff + bsize);
 534   }
 535 
 536   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 537   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
 538   buffd = buffs[n] + swid;
 539   buffi = (mlib_s32*)(buffd + swid);
 540 
 541   chan1 = nchannel;
 542   chan2 = chan1 + chan1;
 543 
 544   swid -= (dx_l + dx_r);
 545 
 546   for (c = 0; c < nchannel; c++) {
 547     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 548 
 549     sl = adr_src + c;
 550     dl = adr_dst + c;
 551 
 552     for (l = 0; l < n; l++) {
 553       FTYPE    *buff = buffs[l];
 554 
 555       for (i = 0; i < dx_l; i++) {
 556         buff[i] = (FTYPE)sl[0];
 557       }
 558 
 559       for (i = 0; i < swid; i++) {
 560         buff[i + dx_l] = (FTYPE)sl[i*chan1];
 561       }
 562 
 563       for (i = 0; i < dx_r; i++) {
 564         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
 565       }
 566 
 567       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
 568     }
 569 
 570     buff_ind = 0;
 571 
 572     for (i = 0; i < wid; i++) buffd[i] = 0.0;
 573 
 574     for (j = 0; j < hgt; j++) {
 575       FTYPE    **buffc = buffs + buff_ind;
 576       FTYPE    *buffn = buffc[n];
 577       FTYPE    *pk = k;
 578 
 579       for (l = 0; l < n; l++) {
 580         FTYPE    *buff_l = buffc[l];
 581 
 582         for (off = 0; off < m;) {
 583           FTYPE    *buff = buff_l + off;
 584 
 585           kw = m - off;
 586 
 587           if (kw > 2*MAX_KER) kw = MAX_KER; else
 588             if (kw > MAX_KER) kw = kw/2;
 589           off += kw;
 590 
 591           sp = sl;
 592           dp = dl;
 593 
 594           if (kw == 7) {
 595 
 596             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 597             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
 598 
 599             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 600             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
 601 
 602             if (l < (n - 1) || off < m) {
 603               for (i = 0; i <= (wid - 2); i += 2) {
 604                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 605 
 606                 p6 = buff[i + 6]; p7 = buff[i + 7];
 607 
 608                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
 609                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
 610               }
 611 
 612             } else {
 613               for (i = 0; i <= (wid - 2); i += 2) {
 614                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 615 
 616                 p6 = buff[i + 6]; p7 = buff[i + 7];
 617 
 618                 LOAD_BUFF(buffi);
 619 
 620                 dd.d64 = *(FTYPE   *)(buffi + i);
 621                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 622                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 623 
 624                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
 625                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
 626 
 627                 dp[0    ] = FROM_S32(d0);
 628                 dp[chan1] = FROM_S32(d1);
 629 
 630                 buffd[i    ] = 0.0;
 631                 buffd[i + 1] = 0.0;
 632 
 633                 sp += chan2;
 634                 dp += chan2;
 635               }
 636             }
 637 
 638           } else if (kw == 6) {
 639 
 640             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 641             p5 = buff[3]; p6 = buff[4];
 642 
 643             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 644             k4 = pk[4]; k5 = pk[5];
 645 
 646             if (l < (n - 1) || off < m) {
 647               for (i = 0; i <= (wid - 2); i += 2) {
 648                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 649 
 650                 p5 = buff[i + 5]; p6 = buff[i + 6];
 651 
 652                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
 653                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
 654               }
 655 
 656             } else {
 657               for (i = 0; i <= (wid - 2); i += 2) {
 658                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 659 
 660                 p5 = buff[i + 5]; p6 = buff[i + 6];
 661 
 662                 LOAD_BUFF(buffi);
 663 
 664                 dd.d64 = *(FTYPE   *)(buffi + i);
 665                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 666                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 667 
 668                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
 669                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
 670 
 671                 dp[0    ] = FROM_S32(d0);
 672                 dp[chan1] = FROM_S32(d1);
 673 
 674                 buffd[i    ] = 0.0;
 675                 buffd[i + 1] = 0.0;
 676 
 677                 sp += chan2;
 678                 dp += chan2;
 679               }
 680             }
 681 
 682           } else if (kw == 5) {
 683 
 684             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 685             p5 = buff[3];
 686 
 687             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 688             k4 = pk[4];
 689 
 690             if (l < (n - 1) || off < m) {
 691               for (i = 0; i <= (wid - 2); i += 2) {
 692                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 693 
 694                 p4 = buff[i + 4]; p5 = buff[i + 5];
 695 
 696                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
 697                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
 698               }
 699 
 700             } else {
 701               for (i = 0; i <= (wid - 2); i += 2) {
 702                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 703 
 704                 p4 = buff[i + 4]; p5 = buff[i + 5];
 705 
 706                 LOAD_BUFF(buffi);
 707 
 708                 dd.d64 = *(FTYPE   *)(buffi + i);
 709                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 710                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 711 
 712                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
 713                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
 714 
 715                 dp[0    ] = FROM_S32(d0);
 716                 dp[chan1] = FROM_S32(d1);
 717 
 718                 buffd[i    ] = 0.0;
 719                 buffd[i + 1] = 0.0;
 720 
 721                 sp += chan2;
 722                 dp += chan2;
 723               }
 724             }
 725 
 726           } else if (kw == 4) {
 727 
 728             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 729 
 730             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 731 
 732             if (l < (n - 1) || off < m) {
 733               for (i = 0; i <= (wid - 2); i += 2) {
 734                 p0 = p2; p1 = p3; p2 = p4;
 735 
 736                 p3 = buff[i + 3]; p4 = buff[i + 4];
 737 
 738                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 739                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 740               }
 741 
 742             } else {
 743               for (i = 0; i <= (wid - 2); i += 2) {
 744                 p0 = p2; p1 = p3; p2 = p4;
 745 
 746                 p3 = buff[i + 3]; p4 = buff[i + 4];
 747 
 748                 LOAD_BUFF(buffi);
 749 
 750                 dd.d64 = *(FTYPE   *)(buffi + i);
 751                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 752                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 753 
 754                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 755                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 756 
 757                 dp[0    ] = FROM_S32(d0);
 758                 dp[chan1] = FROM_S32(d1);
 759 
 760                 buffd[i    ] = 0.0;
 761                 buffd[i + 1] = 0.0;
 762 
 763                 sp += chan2;
 764                 dp += chan2;
 765               }
 766             }
 767 
 768           } else if (kw == 3) {
 769 
 770             p2 = buff[0]; p3 = buff[1];
 771             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 772 
 773             if (l < (n - 1) || off < m) {
 774               for (i = 0; i <= (wid - 2); i += 2) {
 775                 p0 = p2; p1 = p3;
 776 
 777                 p2 = buff[i + 2]; p3 = buff[i + 3];
 778 
 779                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
 780                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
 781               }
 782 
 783             } else {
 784               for (i = 0; i <= (wid - 2); i += 2) {
 785                 p0 = p2; p1 = p3;
 786 
 787                 p2 = buff[i + 2]; p3 = buff[i + 3];
 788 
 789                 LOAD_BUFF(buffi);
 790 
 791                 dd.d64 = *(FTYPE   *)(buffi + i);
 792                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 793                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 794 
 795                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 796                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 797 
 798                 dp[0    ] = FROM_S32(d0);
 799                 dp[chan1] = FROM_S32(d1);
 800 
 801                 buffd[i    ] = 0.0;
 802                 buffd[i + 1] = 0.0;
 803 
 804                 sp += chan2;
 805                 dp += chan2;
 806               }
 807             }
 808 
 809           } else /* if (kw == 2) */ {
 810 
 811             p2 = buff[0];
 812             k0 = pk[0]; k1 = pk[1];
 813 
 814             if (l < (n - 1) || off < m) {
 815               for (i = 0; i <= (wid - 2); i += 2) {
 816                 p0 = p2;
 817 
 818                 p1 = buff[i + 1]; p2 = buff[i + 2];
 819 
 820                 buffd[i    ] += p0*k0 + p1*k1;
 821                 buffd[i + 1] += p1*k0 + p2*k1;
 822               }
 823 
 824             } else {
 825               for (i = 0; i <= (wid - 2); i += 2) {
 826                 p0 = p2;
 827 
 828                 p1 = buff[i + 1]; p2 = buff[i + 2];
 829 
 830                 LOAD_BUFF(buffi);
 831 
 832                 dd.d64 = *(FTYPE   *)(buffi + i);
 833                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 834                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 835 
 836                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 837                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 838 
 839                 dp[0    ] = FROM_S32(d0);
 840                 dp[chan1] = FROM_S32(d1);
 841 
 842                 buffd[i    ] = 0.0;
 843                 buffd[i + 1] = 0.0;
 844 
 845                 sp += chan2;
 846                 dp += chan2;
 847               }
 848             }
 849           }
 850 
 851           pk += kw;
 852         }
 853       }
 854 
 855       /* last pixels */
 856       for (; i < wid; i++) {
 857         FTYPE    *pk = k, s = 0;
 858         mlib_s32 x, d0;
 859 
 860         for (l = 0; l < n; l++) {
 861           FTYPE    *buff = buffc[l] + i;
 862 
 863           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
 864         }
 865 
 866         d0 = D2I(s);
 867         dp[0] = FROM_S32(d0);
 868 
 869         buffn[i + dx_l] = (FTYPE)sp[0];
 870 
 871         sp += chan1;
 872         dp += chan1;
 873       }
 874 
 875       for (; i < swid; i++) {
 876         buffn[i + dx_l] = (FTYPE)sp[0];
 877         sp += chan1;
 878       }
 879 
 880       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
 881       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
 882 
 883       /* next line */
 884 
 885       if (j < hgt - dy_b - 2) sl += sll;
 886       dl += dll;
 887 
 888       buff_ind++;
 889 
 890       if (buff_ind >= n + 1) buff_ind = 0;
 891     }
 892   }
 893 
 894   FREE_AND_RETURN_STATUS;
 895 }
 896 
 897 /***************************************************************/
 898 #define STORE_RES(res, x)                                       \
 899   x >>= shift2;                                                 \
 900   CLAMP_STORE(res, x)
 901 
 902 mlib_status CONV_FUNC_MxN_I
 903 {
 904   DTYPE    *adr_src, *sl, *sp = NULL;
 905   DTYPE    *adr_dst, *dl, *dp = NULL;
 906   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 907   mlib_s32 *pbuff = buff;
 908   mlib_s32 **buffs = buffs_arr, *buffd;
 909   mlib_s32 l, off, kw, bsize, buff_ind;
 910   mlib_s32 d0, d1, shift1, shift2;
 911   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
 912   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
 913   mlib_s32 wid, hgt, sll, dll;
 914   mlib_s32 nchannel, chan1;
 915   mlib_s32 i, j, c, swid;
 916   mlib_s32 chan2;
 917   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
 918   GET_SRC_DST_PARAMETERS(DTYPE);
 919 
 920 #if IMG_TYPE != 1
 921   shift1 = 16;
 922 #else
 923   shift1 = 8;
 924 #endif /* IMG_TYPE != 1 */
 925   shift2 = scale - shift1;
 926 
 927   chan1 = nchannel;
 928   chan2 = chan1 + chan1;
 929 
 930   swid = wid + (m - 1);
 931 
 932   bsize = (n + 2)*swid;
 933 
 934   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 935     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
 936 
 937     if (pbuff == NULL) return MLIB_FAILURE;
 938     buffs = (mlib_s32 **)(pbuff + bsize);
 939   }
 940 
 941   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 942   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
 943   buffd = buffs[n] + swid;
 944 
 945   if (m*n > MAX_N*MAX_N) {
 946     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
 947 
 948     if (k == NULL) {
 949       if (pbuff != buff) mlib_free(pbuff);
 950       return MLIB_FAILURE;
 951     }
 952   }
 953 
 954   for (i = 0; i < m*n; i++) {
 955     k[i] = kernel[i] >> shift1;
 956   }
 957 
 958   swid -= (dx_l + dx_r);
 959 
 960   for (c = 0; c < nchannel; c++) {
 961     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 962 
 963     sl = adr_src + c;
 964     dl = adr_dst + c;
 965 
 966     for (l = 0; l < n; l++) {
 967       mlib_s32  *buff = buffs[l];
 968 
 969       for (i = 0; i < dx_l; i++) {
 970         buff[i] = (mlib_s32)sl[0];
 971       }
 972 
 973       for (i = 0; i < swid; i++) {
 974         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
 975       }
 976 
 977       for (i = 0; i < dx_r; i++) {
 978         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
 979       }
 980 
 981       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
 982     }
 983 
 984     buff_ind = 0;
 985 
 986     for (i = 0; i < wid; i++) buffd[i] = 0;
 987 
 988     for (j = 0; j < hgt; j++) {
 989       mlib_s32 **buffc = buffs + buff_ind;
 990       mlib_s32 *buffn = buffc[n];
 991       mlib_s32 *pk = k;
 992 
 993       for (l = 0; l < n; l++) {
 994         mlib_s32  *buff_l = buffc[l];
 995 
 996         for (off = 0; off < m;) {
 997           mlib_s32 *buff = buff_l + off;
 998 
 999           sp = sl;
1000           dp = dl;
1001 
1002           kw = m - off;
1003 
1004           if (kw > 2*MAX_KER) kw = MAX_KER; else
1005             if (kw > MAX_KER) kw = kw/2;
1006           off += kw;
1007 
1008           if (kw == 7) {
1009 
1010             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1011             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1012 
1013             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1014             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1015 
1016             if (l < (n - 1) || off < m) {
1017               for (i = 0; i <= (wid - 2); i += 2) {
1018                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1019 
1020                 p6 = buff[i + 6]; p7 = buff[i + 7];
1021 
1022                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1023                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1024               }
1025 
1026             } else {
1027               for (i = 0; i <= (wid - 2); i += 2) {
1028                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1029 
1030                 p6 = buff[i + 6]; p7 = buff[i + 7];
1031 
1032                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1033                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1034 
1035                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1036                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1037 
1038                 STORE_RES(dp[0    ], d0);
1039                 STORE_RES(dp[chan1], d1);
1040 
1041                 buffd[i    ] = 0;
1042                 buffd[i + 1] = 0;
1043 
1044                 sp += chan2;
1045                 dp += chan2;
1046               }
1047             }
1048 
1049           } else if (kw == 6) {
1050 
1051             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1052             p5 = buff[3]; p6 = buff[4];
1053 
1054             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1055             k4 = pk[4]; k5 = pk[5];
1056 
1057             if (l < (n - 1) || off < m) {
1058               for (i = 0; i <= (wid - 2); i += 2) {
1059                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1060 
1061                 p5 = buff[i + 5]; p6 = buff[i + 6];
1062 
1063                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1064                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1065               }
1066 
1067             } else {
1068               for (i = 0; i <= (wid - 2); i += 2) {
1069                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1070 
1071                 p5 = buff[i + 5]; p6 = buff[i + 6];
1072 
1073                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1074                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1075 
1076                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1077                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1078 
1079                 STORE_RES(dp[0    ], d0);
1080                 STORE_RES(dp[chan1], d1);
1081 
1082                 buffd[i    ] = 0;
1083                 buffd[i + 1] = 0;
1084 
1085                 sp += chan2;
1086                 dp += chan2;
1087               }
1088             }
1089 
1090           } else if (kw == 5) {
1091 
1092             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1093             p5 = buff[3];
1094 
1095             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1096             k4 = pk[4];
1097 
1098             if (l < (n - 1) || off < m) {
1099               for (i = 0; i <= (wid - 2); i += 2) {
1100                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1101 
1102                 p4 = buff[i + 4]; p5 = buff[i + 5];
1103 
1104                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1105                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1106               }
1107 
1108             } else {
1109               for (i = 0; i <= (wid - 2); i += 2) {
1110                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1111 
1112                 p4 = buff[i + 4]; p5 = buff[i + 5];
1113 
1114                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1115                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1116 
1117                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1118                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1119 
1120                 STORE_RES(dp[0    ], d0);
1121                 STORE_RES(dp[chan1], d1);
1122 
1123                 buffd[i    ] = 0;
1124                 buffd[i + 1] = 0;
1125 
1126                 sp += chan2;
1127                 dp += chan2;
1128               }
1129             }
1130 
1131           } else if (kw == 4) {
1132 
1133             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1134 
1135             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1136 
1137             if (l < (n - 1) || off < m) {
1138               for (i = 0; i <= (wid - 2); i += 2) {
1139                 p0 = p2; p1 = p3; p2 = p4;
1140 
1141                 p3 = buff[i + 3]; p4 = buff[i + 4];
1142 
1143                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1144                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1145               }
1146 
1147             } else {
1148               for (i = 0; i <= (wid - 2); i += 2) {
1149                 p0 = p2; p1 = p3; p2 = p4;
1150 
1151                 p3 = buff[i + 3]; p4 = buff[i + 4];
1152 
1153                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1154                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1155 
1156                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1157                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1158 
1159                 STORE_RES(dp[0    ], d0);
1160                 STORE_RES(dp[chan1], d1);
1161 
1162                 buffd[i    ] = 0;
1163                 buffd[i + 1] = 0;
1164 
1165                 sp += chan2;
1166                 dp += chan2;
1167               }
1168             }
1169 
1170           } else if (kw == 3) {
1171 
1172             p2 = buff[0]; p3 = buff[1];
1173             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1174 
1175             if (l < (n - 1) || off < m) {
1176               for (i = 0; i <= (wid - 2); i += 2) {
1177                 p0 = p2; p1 = p3;
1178 
1179                 p2 = buff[i + 2]; p3 = buff[i + 3];
1180 
1181                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1182                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1183               }
1184 
1185             } else {
1186               for (i = 0; i <= (wid - 2); i += 2) {
1187                 p0 = p2; p1 = p3;
1188 
1189                 p2 = buff[i + 2]; p3 = buff[i + 3];
1190 
1191                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1192                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1193 
1194                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1195                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1196 
1197                 STORE_RES(dp[0    ], d0);
1198                 STORE_RES(dp[chan1], d1);
1199 
1200                 buffd[i    ] = 0;
1201                 buffd[i + 1] = 0;
1202 
1203                 sp += chan2;
1204                 dp += chan2;
1205               }
1206             }
1207 
1208           } else if (kw == 2) {
1209 
1210             p2 = buff[0];
1211             k0 = pk[0]; k1 = pk[1];
1212 
1213             if (l < (n - 1) || off < m) {
1214               for (i = 0; i <= (wid - 2); i += 2) {
1215                 p0 = p2;
1216 
1217                 p1 = buff[i + 1]; p2 = buff[i + 2];
1218 
1219                 buffd[i    ] += p0*k0 + p1*k1;
1220                 buffd[i + 1] += p1*k0 + p2*k1;
1221               }
1222 
1223             } else {
1224               for (i = 0; i <= (wid - 2); i += 2) {
1225                 p0 = p2;
1226 
1227                 p1 = buff[i + 1]; p2 = buff[i + 2];
1228 
1229                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1230                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1231 
1232                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1233                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1234 
1235                 STORE_RES(dp[0    ], d0);
1236                 STORE_RES(dp[chan1], d1);
1237 
1238                 buffd[i    ] = 0;
1239                 buffd[i + 1] = 0;
1240 
1241                 sp += chan2;
1242                 dp += chan2;
1243               }
1244             }
1245 
1246           } else /* kw == 1 */{
1247 
1248             k0 = pk[0];
1249 
1250             if (l < (n - 1) || off < m) {
1251               for (i = 0; i <= (wid - 2); i += 2) {
1252                 p0 = buff[i]; p1 = buff[i + 1];
1253 
1254                 buffd[i    ] += p0*k0;
1255                 buffd[i + 1] += p1*k0;
1256               }
1257 
1258             } else {
1259               for (i = 0; i <= (wid - 2); i += 2) {
1260                 p0 = buff[i]; p1 = buff[i + 1];
1261 
1262                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1263                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1264 
1265                 d0 = (p0*k0 + buffd[i    ]);
1266                 d1 = (p1*k0 + buffd[i + 1]);
1267 
1268                 STORE_RES(dp[0    ], d0);
1269                 STORE_RES(dp[chan1], d1);
1270 
1271                 buffd[i    ] = 0;
1272                 buffd[i + 1] = 0;
1273 
1274                 sp += chan2;
1275                 dp += chan2;
1276               }
1277             }
1278           }
1279 
1280           pk += kw;
1281         }
1282       }
1283 
1284       /* last pixels */
1285       for (; i < wid; i++) {
1286         mlib_s32 *pk = k, x, s = 0;
1287 
1288         for (l = 0; l < n; l++) {
1289           mlib_s32 *buff = buffc[l] + i;
1290 
1291           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1292         }
1293 
1294         STORE_RES(dp[0], s);
1295 
1296         buffn[i + dx_l] = (mlib_s32)sp[0];
1297 
1298         sp += chan1;
1299         dp += chan1;
1300       }
1301 
1302       for (; i < swid; i++) {
1303         buffn[i + dx_l] = (mlib_s32)sp[0];
1304         sp += chan1;
1305       }
1306 
1307       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1308       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1309 
1310       /* next line */
1311 
1312       if (j < hgt - dy_b - 2) sl += sll;
1313       dl += dll;
1314 
1315       buff_ind++;
1316 
1317       if (buff_ind >= n + 1) buff_ind = 0;
1318     }
1319   }
1320 
1321   if (pbuff != buff) mlib_free(pbuff);
1322   if (k != k_locl) mlib_free(k);
1323 
1324   return MLIB_SUCCESS;
1325 }
1326 
1327 /***************************************************************/