1 /*
   2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
  30  *   MLIB_EDGE_SRC_EXTEND mask
  31  */
  32 
  33 #include "mlib_image.h"
  34 #include "mlib_ImageConv.h"
  35 #include "mlib_c_ImageConv.h"
  36 
/*
 * IMG_TYPE selects the pixel type this translation unit is built for:
 *   1 - mlib_u8, 2 - mlib_s16, 3 - mlib_u16.
 *
 * Each branch below defines the same set of macros:
 *   DTYPE            pixel element type
 *   CONV_FUNC*       function name plus parameter list (PARAM / PARAM_MxN)
 *                    used to declare the type-specific entry points
 *   DSCALE           initial value of the kernel scale factor in CONV_FUNC_MxN
 *   FROM_S32(x)      takes the pixel from the upper bits of a clamped mlib_s32
 *   S64TOS32(x)      applied to the low-word operand in LOAD_BUFF
 *   SAT_OFF          text appended to the operand of D2I before clamping
 *
 * For the unsigned types SAT_OFF subtracts 2^31 before the clamp and
 * FROM_S32 flips the top bit of the shifted result (^ 128, ^ 0x8000),
 * which undoes that offset.  For mlib_s16 SAT_OFF is empty and S64TOS32
 * masks to 32 bits so a sign-extended sample cannot spill into the upper
 * word built by LOAD_BUFF.
 */

#define IMG_TYPE 1

/***************************************************************/
#if IMG_TYPE == 1

/* mlib_u8: result carried in the top 8 bits of an mlib_s32 */
#define DTYPE             mlib_u8
#define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
#define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
#define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
#define DSCALE            (1 << 24)
#define FROM_S32(x)       (((x) >> 24) ^ 128)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#elif IMG_TYPE == 2

/* mlib_s16: result carried in the top 16 bits of an mlib_s32 */
#define DTYPE             mlib_s16
#define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
#define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
#define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
#define DSCALE            65536.0
#define FROM_S32(x)       ((x) >> 16)
#define S64TOS32(x)       ((x) & 0xffffffff)
#define SAT_OFF

#elif IMG_TYPE == 3

/* mlib_u16: as mlib_s16, plus the 2^31 offset / top-bit flip */
#define DTYPE             mlib_u16
#define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
#define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
#define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
#define DSCALE            65536.0
#define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#endif /* IMG_TYPE == 1 */
  81 
  82 /***************************************************************/
/*
 * Parameter list pasted into the CONV_FUNC(KERN) / CONV_FUNC_I(KERN)
 * function names.  The functions using it are not defined in this part
 * of the file.
 * NOTE(review): dx_l/dx_r/dy_t/dy_b presumably have the same meaning as
 * in PARAM_MxN - confirm against the fixed-size kernels.
 */
#define PARAM                                                   \
  mlib_image       *dst,                                        \
  const mlib_image *src,                                        \
  mlib_s32         dx_l,                                        \
  mlib_s32         dx_r,                                        \
  mlib_s32         dy_t,                                        \
  mlib_s32         dy_b,                                        \
  const mlib_s32   *kern,                                       \
  mlib_s32         scalef_expon,                                \
  mlib_s32         cmask

/***************************************************************/
/*
 * Parameter list of the general M x N functions (CONV_FUNC_MxN,
 * CONV_FUNC_MxN_I), as used by CONV_FUNC_MxN below:
 *   dst, src     destination and source images
 *   kernel       m*n integer coefficients, read row by row
 *   m, n         kernel width and height
 *   dx_l, dx_r   number of columns synthesized on the left / right by
 *                repeating the first / last available source sample
 *   dy_t, dy_b   number of rows synthesized at the top / bottom by
 *                repeating the first / last available source row
 *   scale        coefficients are divided by 2^scale
 *   cmask        channel mask; bit (nchannel - 1 - c) enables channel c
 */
#define PARAM_MxN                                               \
  mlib_image       *dst,                                        \
  const mlib_image *src,                                        \
  const mlib_s32   *kernel,                                     \
  mlib_s32         m,                                           \
  mlib_s32         n,                                           \
  mlib_s32         dx_l,                                        \
  mlib_s32         dx_r,                                        \
  mlib_s32         dy_t,                                        \
  mlib_s32         dy_b,                                        \
  mlib_s32         scale,                                       \
  mlib_s32         cmask
 107 
 108 /***************************************************************/
/* Floating-point type used for the kernel, row buffers and accumulators. */
#define FTYPE mlib_d64

#ifndef MLIB_USE_FTOI_CLAMPING

/*
 * Saturating conversion of a floating-point value to mlib_s32.
 * The argument is evaluated up to three times, so it must be free of
 * side effects.
 */
#define CLAMP_S32(x)                                            \
  (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))

#else

/*
 * Plain cast.
 * NOTE(review): presumably selected on platforms whose float-to-int
 * conversion already saturates - confirm where MLIB_USE_FTOI_CLAMPING
 * is defined.
 */
#define CLAMP_S32(x) ((mlib_s32)(x))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/*
 * Convert an accumulated sum to mlib_s32.  SAT_OFF is pasted directly
 * after the operand: it is empty for mlib_s16 and "-(1u << 31)" for the
 * unsigned types, so the offset is applied before clamping.
 */
#define D2I(x) CLAMP_S32((x) SAT_OFF)
 124 
 125 /***************************************************************/
/*
 * LOAD_BUFF(buff): store the two source samples sp[0] and sp[chan1] at
 * buff[i] and buff[i + 1].  Relies on the caller's locals i, sp and
 * chan1.
 *
 * Without 64-bit integers the two elements are stored separately (this
 * variant expands to two statements).  Otherwise both are packed into
 * one mlib_s64 and written with a single store; the halves are swapped
 * according to _LITTLE_ENDIAN so that sp[0] always lands at buff[i].
 * S64TOS32 keeps the low-word operand from disturbing the upper word.
 *
 * NOTE(review): the 64-bit variants cast (buff + i) to mlib_s64 *, which
 * assumes that address is suitably aligned for a 64-bit store and that
 * buff points to 32-bit elements - confirm for any new caller.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff)                                         \
  buff[i    ] = sp[0];                                          \
  buff[i + 1] = sp[chan1]

#else /* _NO_LONGLONG */

#ifdef _LITTLE_ENDIAN

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])

#else /* _LITTLE_ENDIAN */

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])

#endif /* _LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */
 146 
 147 /***************************************************************/
/*
 * Overlay of one mlib_d64 with two consecutive mlib_s32 values.
 * CONV_FUNC_MxN loads a pair of integers written by LOAD_BUFF through
 * the d64 member and reads them back as i32s.i0 / i32s.i1.
 */
typedef union {
  mlib_d64 d64;
  struct {
    mlib_s32 i0;   /* first (lower-address) 32-bit word */
    mlib_s32 i1;   /* second 32-bit word */
  } i32s;
} d64_2x32;
 155 
 156 /***************************************************************/
/*
 * Fill the caller's locals hgt, wid, nchannel, sll, dll, adr_src and
 * adr_dst (all must already be declared) from the images src and dst.
 * Height, width and channel count are taken from src only.  The strides
 * are divided by sizeof(type), i.e. sll/dll count elements of `type`.
 * NOTE(review): this presumes mlib_ImageGetStride returns bytes - confirm.
 */
#define GET_SRC_DST_PARAMETERS(type)                            \
  hgt = mlib_ImageGetHeight(src);                               \
  wid = mlib_ImageGetWidth(src);                                \
  nchannel = mlib_ImageGetChannels(src);                        \
  sll = mlib_ImageGetStride(src) / sizeof(type);                \
  dll = mlib_ImageGetStride(dst) / sizeof(type);                \
  adr_src = (type *)mlib_ImageGetData(src);                     \
  adr_dst = (type *)mlib_ImageGetData(dst)
 165 
 166 /***************************************************************/
/*
 * CLAMP_STORE(dst, val): store the integer val into dst, saturated to
 * the range of the pixel type.  val is not parenthesized and is
 * evaluated more than once, so pass a plain variable.  The expansion is
 * a bare if/else statement; take care when using it as the body of an
 * unbraced if.
 */
#if IMG_TYPE == 1

/*
 * Test for the presence of any "1" bit in bits 8 to 31 of val.
 * If one is present, val is either negative or > 255.
 * If over/underflows of 8 bits are uncommon, this technique can be a
 * win, since only a single test, rather than two, is necessary to
 * determine whether clamping is needed.  On the other hand, if
 * over/underflows are common, it adds an extra test.
 */
#define CLAMP_STORE(dst, val)                                   \
  if (val & 0xffffff00) {                                       \
    if (val < MLIB_U8_MIN)                                      \
      dst = MLIB_U8_MIN;                                        \
    else                                                        \
      dst = MLIB_U8_MAX;                                        \
  } else {                                                      \
    dst = (mlib_u8)val;                                         \
  }

#elif IMG_TYPE == 2

/* Two-sided range check; the last branch has no trailing semicolon. */
#define CLAMP_STORE(dst, val)                                   \
  if (val >= MLIB_S16_MAX)                                      \
    dst = MLIB_S16_MAX;                                         \
  else if (val <= MLIB_S16_MIN)                                 \
    dst = MLIB_S16_MIN;                                         \
  else                                                          \
    dst = (mlib_s16)val

#elif IMG_TYPE == 3

/* Two-sided range check; the last branch has no trailing semicolon. */
#define CLAMP_STORE(dst, val)                                   \
  if (val >= MLIB_U16_MAX)                                      \
    dst = MLIB_U16_MAX;                                         \
  else if (val <= MLIB_U16_MIN)                                 \
    dst = MLIB_U16_MIN;                                         \
  else                                                          \
    dst = (mlib_u16)val

#endif /* IMG_TYPE == 1 */
 210 
 211 /***************************************************************/
/* Largest number of kernel taps handled by one unrolled pass over a row. */
#define MAX_KER   7
/* Largest kernel height served by the on-stack row-pointer tables. */
#define MAX_N    15
/* Element count of the on-stack work buffers; larger needs use the heap. */
#define BUFF_SIZE   1600
/* Byte budget used by mlib_ImageConv1xN_ext to bound the rows per strip. */
#define CACHE_SIZE  (64*1024)
 216 
 217 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
 218                                          const mlib_image *src,
 219                                          const mlib_d64   *k,
 220                                          mlib_s32         n,
 221                                          mlib_s32         dy_t,
 222                                          mlib_s32         dy_b,
 223                                          mlib_s32         cmask)
 224 {
 225   DTYPE    *adr_src, *sl;
 226   DTYPE    *adr_dst, *dl, *dp;
 227   FTYPE    buff[BUFF_SIZE];
 228   FTYPE    *buffd;
 229   FTYPE    *pbuff = buff;
 230   const FTYPE    *pk;
 231   FTYPE    k0, k1, k2, k3;
 232   FTYPE    p0, p1, p2, p3, p4;
 233   FTYPE    *sbuff;
 234   mlib_s32 l, k_off, off, bsize;
 235   mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
 236   mlib_s32 d0, d1, ii;
 237   mlib_s32 wid, hgt, sll, dll;
 238   mlib_s32 nchannel;
 239   mlib_s32 i, j, c;
 240   GET_SRC_DST_PARAMETERS(DTYPE);
 241 
 242   max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);
 243 
 244   if (max_hsize < 1) max_hsize = 1;
 245   if (max_hsize > hgt) max_hsize = hgt;
 246 
 247   shgt = hgt + (n - 1);
 248   smax_hsize = max_hsize + (n - 1);
 249 
 250   bsize = 2 * (smax_hsize + 1);
 251 
 252   if (bsize > BUFF_SIZE) {
 253     pbuff = mlib_malloc(sizeof(FTYPE)*bsize);
 254 
 255     if (pbuff == NULL) return MLIB_FAILURE;
 256   }
 257 
 258   sbuff = pbuff;
 259   buffd = sbuff + smax_hsize;
 260 
 261   shgt -= (dy_t + dy_b);
 262   k_off = 0;
 263 
 264   for (l = 0; l < hgt; l += hsize) {
 265     hsize = hgt - l;
 266 
 267     if (hsize > max_hsize) hsize = max_hsize;
 268 
 269     smax_hsize = hsize + (n - 1);
 270 
 271     for (c = 0; c < nchannel; c++) {
 272       if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 273 
 274       sl = adr_src + c;
 275       dl = adr_dst + c;
 276 
 277 #ifdef __SUNPRO_C
 278 #pragma pipeloop(0)
 279 #endif /* __SUNPRO_C */
 280       for (i = 0; i < hsize; i++) buffd[i] = 0.0;
 281 
 282       for (j = 0; j < wid; j++) {
 283         FTYPE    *buff = sbuff;
 284 
 285         for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
 286           sbuff[i - k_off] = (FTYPE)sl[0];
 287         }
 288 
 289 #ifdef __SUNPRO_C
 290 #pragma pipeloop(0)
 291 #endif /* __SUNPRO_C */
 292         for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
 293           sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
 294         }
 295 
 296         for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
 297           sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
 298         }
 299 
 300         pk = k;
 301 
 302         for (off = 0; off < (n - 4); off += 4) {
 303 
 304           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 305           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 306 
 307 #ifdef __SUNPRO_C
 308 #pragma pipeloop(0)
 309 #endif /* __SUNPRO_C */
 310           for (i = 0; i < hsize; i += 2) {
 311             p0 = p2; p1 = p3; p2 = p4;
 312 
 313             p3 = buff[i + 3]; p4 = buff[i + 4];
 314 
 315             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 316             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 317           }
 318 
 319           pk += 4;
 320           buff += 4;
 321         }
 322 
 323         dp = dl;
 324         kh = n - off;
 325 
 326         if (kh == 4) {
 327           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 328           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 329 
 330 #ifdef __SUNPRO_C
 331 #pragma pipeloop(0)
 332 #endif /* __SUNPRO_C */
 333           for (i = 0; i <= (hsize - 2); i += 2) {
 334             p0 = p2; p1 = p3; p2 = p4;
 335 
 336             p3 = buff[i + 3]; p4 = buff[i + 4];
 337 
 338             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 339             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 340 
 341             dp[0  ] = FROM_S32(d0);
 342             dp[dll] = FROM_S32(d1);
 343 
 344             buffd[i    ] = 0.0;
 345             buffd[i + 1] = 0.0;
 346 
 347             dp += 2*dll;
 348           }
 349 
 350           if (i < hsize) {
 351             p0 = p2; p1 = p3; p2 = p4;
 352             p3 = buff[i + 3];
 353             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
 354             dp[0] = FROM_S32(d0);
 355             buffd[i] = 0.0;
 356           }
 357 
 358         } else if (kh == 3) {
 359 
 360           p2 = buff[0]; p3 = buff[1];
 361           k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 362 
 363 #ifdef __SUNPRO_C
 364 #pragma pipeloop(0)
 365 #endif /* __SUNPRO_C */
 366           for (i = 0; i <= (hsize - 2); i += 2) {
 367             p0 = p2; p1 = p3;
 368 
 369             p2 = buff[i + 2]; p3 = buff[i + 3];
 370 
 371             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 372             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 373 
 374             dp[0  ] = FROM_S32(d0);
 375             dp[dll] = FROM_S32(d1);
 376 
 377             buffd[i    ] = 0.0;
 378             buffd[i + 1] = 0.0;
 379 
 380             dp += 2*dll;
 381           }
 382 
 383           if (i < hsize) {
 384             p0 = p2; p1 = p3;
 385             p2 = buff[i + 2];
 386             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
 387             dp[0] = FROM_S32(d0);
 388 
 389             buffd[i] = 0.0;
 390           }
 391 
 392         } else if (kh == 2) {
 393 
 394           p2 = buff[0];
 395           k0 = pk[0]; k1 = pk[1];
 396 
 397 #ifdef __SUNPRO_C
 398 #pragma pipeloop(0)
 399 #endif /* __SUNPRO_C */
 400           for (i = 0; i <= (hsize - 2); i += 2) {
 401             p0 = p2;
 402 
 403             p1 = buff[i + 1]; p2 = buff[i + 2];
 404 
 405             d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 406             d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 407 
 408             dp[0  ] = FROM_S32(d0);
 409             dp[dll] = FROM_S32(d1);
 410 
 411             buffd[i    ] = 0.0;
 412             buffd[i + 1] = 0.0;
 413 
 414             dp += 2*dll;
 415           }
 416 
 417           if (i < hsize) {
 418             p0 = p2;
 419             p1 = buff[i + 1];
 420             d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
 421             dp[0] = FROM_S32(d0);
 422 
 423             buffd[i] = 0.0;
 424           }
 425 
 426         } else /* kh == 1 */{
 427 
 428           k0 = pk[0];
 429 
 430 #ifdef __SUNPRO_C
 431 #pragma pipeloop(0)
 432 #endif /* __SUNPRO_C */
 433           for (i = 0; i <= (hsize - 2); i += 2) {
 434             p0 = buff[i]; p1 = buff[i + 1];
 435 
 436             d0 = D2I(p0*k0 + buffd[i    ]);
 437             d1 = D2I(p1*k0 + buffd[i + 1]);
 438 
 439             dp[0  ] = FROM_S32(d0);
 440             dp[dll] = FROM_S32(d1);
 441 
 442             buffd[i    ] = 0.0;
 443             buffd[i + 1] = 0.0;
 444 
 445             dp += 2*dll;
 446           }
 447 
 448           if (i < hsize) {
 449             p0 = buff[i];
 450             d0 = D2I(p0*k0 + buffd[i]);
 451             dp[0] = FROM_S32(d0);
 452 
 453             buffd[i] = 0.0;
 454           }
 455         }
 456 
 457         /* next line */
 458         sl += nchannel;
 459         dl += nchannel;
 460       }
 461     }
 462 
 463     k_off += max_hsize;
 464     adr_dst += max_hsize*dll;
 465   }
 466 
 467   if (pbuff != buff) mlib_free(pbuff);
 468 
 469   return MLIB_SUCCESS;
 470 }
 471 
 472 /***************************************************************/
 473 mlib_status CONV_FUNC_MxN
 474 {
 475   DTYPE    *adr_src, *sl, *sp = NULL;
 476   DTYPE    *adr_dst, *dl, *dp = NULL;
 477   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 478   FTYPE    **buffs = buffs_arr, *buffd;
 479   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
 480   FTYPE    *pbuff = buff;
 481   FTYPE    k0, k1, k2, k3, k4, k5, k6;
 482   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
 483   mlib_s32 *buffi;
 484   mlib_s32 mn, l, off, kw, bsize, buff_ind;
 485   mlib_s32 d0, d1;
 486   mlib_s32 wid, hgt, sll, dll;
 487   mlib_s32 nchannel, chan1, chan2;
 488   mlib_s32 i, j, c, swid;
 489   d64_2x32 dd;
 490   mlib_status status = MLIB_SUCCESS;
 491 
 492   GET_SRC_DST_PARAMETERS(DTYPE);
 493 
 494   if (scale > 30) {
 495     fscale *= 1.0/(1 << 30);
 496     scale -= 30;
 497   }
 498 
 499   fscale /= (1 << scale);
 500 
 501   mn = m*n;
 502 
 503   if (mn > 256) {
 504     k = mlib_malloc(mn*sizeof(mlib_d64));
 505 
 506     if (k == NULL) return MLIB_FAILURE;
 507   }
 508 
 509   for (i = 0; i < mn; i++) {
 510     k[i] = kernel[i]*fscale;
 511   }
 512 
 513   if (m == 1) {
 514     status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
 515     FREE_AND_RETURN_STATUS;
 516   }
 517 
 518   swid = wid + (m - 1);
 519 
 520   bsize = (n + 3)*swid;
 521 
 522   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 523     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
 524 
 525     if (pbuff == NULL) {
 526       status = MLIB_FAILURE;
 527       FREE_AND_RETURN_STATUS;
 528     }
 529     buffs = (FTYPE   **)(pbuff + bsize);
 530   }
 531 
 532   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 533   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
 534   buffd = buffs[n] + swid;
 535   buffi = (mlib_s32*)(buffd + swid);
 536 
 537   chan1 = nchannel;
 538   chan2 = chan1 + chan1;
 539 
 540   swid -= (dx_l + dx_r);
 541 
 542   for (c = 0; c < nchannel; c++) {
 543     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 544 
 545     sl = adr_src + c;
 546     dl = adr_dst + c;
 547 
 548     for (l = 0; l < n; l++) {
 549       FTYPE    *buff = buffs[l];
 550 
 551       for (i = 0; i < dx_l; i++) {
 552         buff[i] = (FTYPE)sl[0];
 553       }
 554 
 555 #ifdef __SUNPRO_C
 556 #pragma pipeloop(0)
 557 #endif /* __SUNPRO_C */
 558       for (i = 0; i < swid; i++) {
 559         buff[i + dx_l] = (FTYPE)sl[i*chan1];
 560       }
 561 
 562       for (i = 0; i < dx_r; i++) {
 563         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
 564       }
 565 
 566       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
 567     }
 568 
 569     buff_ind = 0;
 570 
 571 #ifdef __SUNPRO_C
 572 #pragma pipeloop(0)
 573 #endif /* __SUNPRO_C */
 574     for (i = 0; i < wid; i++) buffd[i] = 0.0;
 575 
 576     for (j = 0; j < hgt; j++) {
 577       FTYPE    **buffc = buffs + buff_ind;
 578       FTYPE    *buffn = buffc[n];
 579       FTYPE    *pk = k;
 580 
 581       for (l = 0; l < n; l++) {
 582         FTYPE    *buff_l = buffc[l];
 583 
 584         for (off = 0; off < m;) {
 585           FTYPE    *buff = buff_l + off;
 586 
 587           kw = m - off;
 588 
 589           if (kw > 2*MAX_KER) kw = MAX_KER; else
 590             if (kw > MAX_KER) kw = kw/2;
 591           off += kw;
 592 
 593           sp = sl;
 594           dp = dl;
 595 
 596           if (kw == 7) {
 597 
 598             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 599             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
 600 
 601             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 602             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
 603 
 604             if (l < (n - 1) || off < m) {
 605 #ifdef __SUNPRO_C
 606 #pragma pipeloop(0)
 607 #endif /* __SUNPRO_C */
 608               for (i = 0; i <= (wid - 2); i += 2) {
 609                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 610 
 611                 p6 = buff[i + 6]; p7 = buff[i + 7];
 612 
 613                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
 614                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
 615               }
 616 
 617             } else {
 618 #ifdef __SUNPRO_C
 619 #pragma pipeloop(0)
 620 #endif /* __SUNPRO_C */
 621               for (i = 0; i <= (wid - 2); i += 2) {
 622                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 623 
 624                 p6 = buff[i + 6]; p7 = buff[i + 7];
 625 
 626                 LOAD_BUFF(buffi);
 627 
 628                 dd.d64 = *(FTYPE   *)(buffi + i);
 629                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 630                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 631 
 632                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
 633                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
 634 
 635                 dp[0    ] = FROM_S32(d0);
 636                 dp[chan1] = FROM_S32(d1);
 637 
 638                 buffd[i    ] = 0.0;
 639                 buffd[i + 1] = 0.0;
 640 
 641                 sp += chan2;
 642                 dp += chan2;
 643               }
 644             }
 645 
 646           } else if (kw == 6) {
 647 
 648             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 649             p5 = buff[3]; p6 = buff[4];
 650 
 651             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 652             k4 = pk[4]; k5 = pk[5];
 653 
 654             if (l < (n - 1) || off < m) {
 655 #ifdef __SUNPRO_C
 656 #pragma pipeloop(0)
 657 #endif /* __SUNPRO_C */
 658               for (i = 0; i <= (wid - 2); i += 2) {
 659                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 660 
 661                 p5 = buff[i + 5]; p6 = buff[i + 6];
 662 
 663                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
 664                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
 665               }
 666 
 667             } else {
 668 #ifdef __SUNPRO_C
 669 #pragma pipeloop(0)
 670 #endif /* __SUNPRO_C */
 671               for (i = 0; i <= (wid - 2); i += 2) {
 672                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 673 
 674                 p5 = buff[i + 5]; p6 = buff[i + 6];
 675 
 676                 LOAD_BUFF(buffi);
 677 
 678                 dd.d64 = *(FTYPE   *)(buffi + i);
 679                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 680                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 681 
 682                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
 683                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
 684 
 685                 dp[0    ] = FROM_S32(d0);
 686                 dp[chan1] = FROM_S32(d1);
 687 
 688                 buffd[i    ] = 0.0;
 689                 buffd[i + 1] = 0.0;
 690 
 691                 sp += chan2;
 692                 dp += chan2;
 693               }
 694             }
 695 
 696           } else if (kw == 5) {
 697 
 698             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 699             p5 = buff[3];
 700 
 701             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 702             k4 = pk[4];
 703 
 704             if (l < (n - 1) || off < m) {
 705 #ifdef __SUNPRO_C
 706 #pragma pipeloop(0)
 707 #endif /* __SUNPRO_C */
 708               for (i = 0; i <= (wid - 2); i += 2) {
 709                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 710 
 711                 p4 = buff[i + 4]; p5 = buff[i + 5];
 712 
 713                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
 714                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
 715               }
 716 
 717             } else {
 718 #ifdef __SUNPRO_C
 719 #pragma pipeloop(0)
 720 #endif /* __SUNPRO_C */
 721               for (i = 0; i <= (wid - 2); i += 2) {
 722                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 723 
 724                 p4 = buff[i + 4]; p5 = buff[i + 5];
 725 
 726                 LOAD_BUFF(buffi);
 727 
 728                 dd.d64 = *(FTYPE   *)(buffi + i);
 729                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 730                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 731 
 732                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
 733                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
 734 
 735                 dp[0    ] = FROM_S32(d0);
 736                 dp[chan1] = FROM_S32(d1);
 737 
 738                 buffd[i    ] = 0.0;
 739                 buffd[i + 1] = 0.0;
 740 
 741                 sp += chan2;
 742                 dp += chan2;
 743               }
 744             }
 745 
 746           } else if (kw == 4) {
 747 
 748             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 749 
 750             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 751 
 752             if (l < (n - 1) || off < m) {
 753 #ifdef __SUNPRO_C
 754 #pragma pipeloop(0)
 755 #endif /* __SUNPRO_C */
 756               for (i = 0; i <= (wid - 2); i += 2) {
 757                 p0 = p2; p1 = p3; p2 = p4;
 758 
 759                 p3 = buff[i + 3]; p4 = buff[i + 4];
 760 
 761                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 762                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 763               }
 764 
 765             } else {
 766 #ifdef __SUNPRO_C
 767 #pragma pipeloop(0)
 768 #endif /* __SUNPRO_C */
 769               for (i = 0; i <= (wid - 2); i += 2) {
 770                 p0 = p2; p1 = p3; p2 = p4;
 771 
 772                 p3 = buff[i + 3]; p4 = buff[i + 4];
 773 
 774                 LOAD_BUFF(buffi);
 775 
 776                 dd.d64 = *(FTYPE   *)(buffi + i);
 777                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 778                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 779 
 780                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 781                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 782 
 783                 dp[0    ] = FROM_S32(d0);
 784                 dp[chan1] = FROM_S32(d1);
 785 
 786                 buffd[i    ] = 0.0;
 787                 buffd[i + 1] = 0.0;
 788 
 789                 sp += chan2;
 790                 dp += chan2;
 791               }
 792             }
 793 
 794           } else if (kw == 3) {
 795 
 796             p2 = buff[0]; p3 = buff[1];
 797             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 798 
 799             if (l < (n - 1) || off < m) {
 800 #ifdef __SUNPRO_C
 801 #pragma pipeloop(0)
 802 #endif /* __SUNPRO_C */
 803               for (i = 0; i <= (wid - 2); i += 2) {
 804                 p0 = p2; p1 = p3;
 805 
 806                 p2 = buff[i + 2]; p3 = buff[i + 3];
 807 
 808                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
 809                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
 810               }
 811 
 812             } else {
 813 #ifdef __SUNPRO_C
 814 #pragma pipeloop(0)
 815 #endif /* __SUNPRO_C */
 816               for (i = 0; i <= (wid - 2); i += 2) {
 817                 p0 = p2; p1 = p3;
 818 
 819                 p2 = buff[i + 2]; p3 = buff[i + 3];
 820 
 821                 LOAD_BUFF(buffi);
 822 
 823                 dd.d64 = *(FTYPE   *)(buffi + i);
 824                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 825                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 826 
 827                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 828                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 829 
 830                 dp[0    ] = FROM_S32(d0);
 831                 dp[chan1] = FROM_S32(d1);
 832 
 833                 buffd[i    ] = 0.0;
 834                 buffd[i + 1] = 0.0;
 835 
 836                 sp += chan2;
 837                 dp += chan2;
 838               }
 839             }
 840 
 841           } else /* if (kw == 2) */ {
 842 
 843             p2 = buff[0];
 844             k0 = pk[0]; k1 = pk[1];
 845 
 846             if (l < (n - 1) || off < m) {
 847 #ifdef __SUNPRO_C
 848 #pragma pipeloop(0)
 849 #endif /* __SUNPRO_C */
 850               for (i = 0; i <= (wid - 2); i += 2) {
 851                 p0 = p2;
 852 
 853                 p1 = buff[i + 1]; p2 = buff[i + 2];
 854 
 855                 buffd[i    ] += p0*k0 + p1*k1;
 856                 buffd[i + 1] += p1*k0 + p2*k1;
 857               }
 858 
 859             } else {
 860 #ifdef __SUNPRO_C
 861 #pragma pipeloop(0)
 862 #endif /* __SUNPRO_C */
 863               for (i = 0; i <= (wid - 2); i += 2) {
 864                 p0 = p2;
 865 
 866                 p1 = buff[i + 1]; p2 = buff[i + 2];
 867 
 868                 LOAD_BUFF(buffi);
 869 
 870                 dd.d64 = *(FTYPE   *)(buffi + i);
 871                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 872                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 873 
 874                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 875                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 876 
 877                 dp[0    ] = FROM_S32(d0);
 878                 dp[chan1] = FROM_S32(d1);
 879 
 880                 buffd[i    ] = 0.0;
 881                 buffd[i + 1] = 0.0;
 882 
 883                 sp += chan2;
 884                 dp += chan2;
 885               }
 886             }
 887           }
 888 
 889           pk += kw;
 890         }
 891       }
 892 
 893       /* last pixels */
 894       for (; i < wid; i++) {
 895         FTYPE    *pk = k, s = 0;
 896         mlib_s32 x, d0;
 897 
 898         for (l = 0; l < n; l++) {
 899           FTYPE    *buff = buffc[l] + i;
 900 
 901           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
 902         }
 903 
 904         d0 = D2I(s);
 905         dp[0] = FROM_S32(d0);
 906 
 907         buffn[i + dx_l] = (FTYPE)sp[0];
 908 
 909         sp += chan1;
 910         dp += chan1;
 911       }
 912 
 913       for (; i < swid; i++) {
 914         buffn[i + dx_l] = (FTYPE)sp[0];
 915         sp += chan1;
 916       }
 917 
 918       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
 919       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
 920 
 921       /* next line */
 922 
 923       if (j < hgt - dy_b - 2) sl += sll;
 924       dl += dll;
 925 
 926       buff_ind++;
 927 
 928       if (buff_ind >= n + 1) buff_ind = 0;
 929     }
 930   }
 931 
 932   FREE_AND_RETURN_STATUS;
 933 }
 934 
 935 /***************************************************************/
/* for x86, using integer multiplies is faster */

/*
 * STORE_RES(res, x): shift the integer sum x right by the caller's local
 * shift2 (x itself is modified) and store it, saturated, into res via
 * CLAMP_STORE.  Expands to two statements, so it must not be used as the
 * body of an unbraced if/else or loop.
 */
#define STORE_RES(res, x)                                       \
  x >>= shift2;                                                 \
  CLAMP_STORE(res, x)
 941 
 /*
  * Integer-arithmetic MxN convolution over an edge-extended source.
  *
  * CONV_FUNC_MxN_I is a macro defined outside this part of the file; it
  * presumably expands to the function name plus its parameter list. The
  * body uses kernel, scale, m, n, dx_l, dx_r, dy_t, dy_b and cmask without
  * declaring them, so they are presumably parameters supplied by that macro.
  * GET_SRC_DST_PARAMETERS presumably initialises adr_src, adr_dst, wid, hgt,
  * sll, dll and nchannel, which are likewise used without a visible
  * assignment -- TODO confirm against the macro definitions.
  *
  * Working storage:
  *   - n + 1 row buffers of mlib_s32 samples (n rows of the current window
  *     plus one row being filled for the next output row), used as a ring
  *     indexed by buff_ind;
  *   - buffd, one row of partial sums, one entry per output pixel;
  *   - k, a copy of the kernel with every coefficient shifted right by
  *     shift1.
  * The stack arrays are used when they are large enough, otherwise the
  * storage is taken from mlib_malloc and released before returning.
  *
  * For every selected channel and every output row, each kernel row is
  * processed in chunks of kw coefficients, two output pixels per iteration.
  * All chunks except the very last one only add into buffd; the last chunk
  * adds its own contribution, shifts the sum right by shift2, stores it
  * through CLAMP_STORE, clears buffd and at the same time copies the next
  * source row into the spare row buffer.
  *
  * Returns MLIB_FAILURE if an allocation fails, MLIB_SUCCESS otherwise.
  */
 942 mlib_status CONV_FUNC_MxN_I
 943 {
 944   DTYPE    *adr_src, *sl, *sp = NULL;
 945   DTYPE    *adr_dst, *dl, *dp = NULL;
       /* stack storage, used when the image/kernel are small enough */
 946   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 947   mlib_s32 *pbuff = buff;
 948   mlib_s32 **buffs = buffs_arr, *buffd;
 949   mlib_s32 l, off, kw, bsize, buff_ind;
 950   mlib_s32 d0, d1, shift1, shift2;
 951   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
 952   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
 953   mlib_s32 wid, hgt, sll, dll;
 954   mlib_s32 nchannel, chan1;
 955   mlib_s32 i, j, c, swid;
 956   mlib_s32 chan2;
 957   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
 958   GET_SRC_DST_PARAMETERS(DTYPE);
 959 
       /*
        * shift1 bits are dropped from every kernel coefficient up front;
        * the remaining shift2 bits are dropped from each final sum in
        * STORE_RES.
        * NOTE(review): shift2 is only a valid shift count if
        * scale >= shift1; presumably the caller guarantees this -- confirm.
        */
 960 #if IMG_TYPE != 1
 961   shift1 = 16;
 962 #else
 963   shift1 = 8;
 964 #endif /* IMG_TYPE != 1 */
 965   shift2 = scale - shift1;
 966 
       /* element strides for one pixel and for two pixels of one channel */
 967   chan1 = nchannel;
 968   chan2 = chan1 + chan1;
 969 
       /* width of one row buffer: output width plus the kernel overhang */
 970   swid = wid + (m - 1);
 971 
       /* n + 1 row buffers plus the accumulator row buffd */
 972   bsize = (n + 2)*swid;
 973 
 974   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
         /*
          * One allocation holds the sample storage followed by the table of
          * 2*(n + 1) row pointers.
          * NOTE(review): pbuff + bsize is only guaranteed to be aligned for
          * mlib_s32; if bsize is odd and pointers are wider than mlib_s32
          * the pointer table may be misaligned -- confirm.
          */
 975     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
 976 
 977     if (pbuff == NULL) return MLIB_FAILURE;
 978     buffs = (mlib_s32 **)(pbuff + bsize);
 979   }
 980 
       /*
        * The row-pointer table is written twice back to back so that
        * buffs + buff_ind always exposes n + 1 consecutive entries without
        * an explicit wrap-around.
        */
 981   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 982   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
       /* the accumulator row sits right after the last row buffer */
 983   buffd = buffs[n] + swid;
 984 
 985   if (m*n > MAX_N*MAX_N) {
 986     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
 987 
 988     if (k == NULL) {
 989       if (pbuff != buff) mlib_free(pbuff);
 990       return MLIB_FAILURE;
 991     }
 992   }
 993 
       /* local kernel copy with shift1 low bits removed */
 994   for (i = 0; i < m*n; i++) {
 995     k[i] = kernel[i] >> shift1;
 996   }
 997 
       /*
        * From here on swid is the number of samples actually read from a
        * source row; the dx_l / dx_r samples on either side are produced by
        * replication.
        */
 998   swid -= (dx_l + dx_r);
 999 
1000   for (c = 0; c < nchannel; c++) {
         /* channel c is processed only if bit (nchannel - 1 - c) of cmask is set */
1001     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1002 
1003     sl = adr_src + c;
1004     dl = adr_dst + c;
1005 
         /* pre-load the first n rows of the window */
1006     for (l = 0; l < n; l++) {
1007       mlib_s32  *buff = buffs[l];
1008 
           /* left extension: repeat the first source sample */
1009       for (i = 0; i < dx_l; i++) {
1010         buff[i] = (mlib_s32)sl[0];
1011       }
1012 
1013 #ifdef __SUNPRO_C
1014 #pragma pipeloop(0)
1015 #endif /* __SUNPRO_C */
1016       for (i = 0; i < swid; i++) {
1017         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
1018       }
1019 
           /* right extension: repeat the last sample copied above */
1020       for (i = 0; i < dx_r; i++) {
1021         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
1022       }
1023 
           /*
            * sl only moves on inside this range of l; outside it the same
            * source row is loaded again, which yields the top and bottom
            * extension.
            */
1024       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
1025     }
1026 
1027     buff_ind = 0;
1028 
1029 #ifdef __SUNPRO_C
1030 #pragma pipeloop(0)
1031 #endif /* __SUNPRO_C */
1032     for (i = 0; i < wid; i++) buffd[i] = 0;
1033 
1034     for (j = 0; j < hgt; j++) {
           /* buffc[0..n-1]: rows of the current window; buffn: row to fill for the next j */
1035       mlib_s32 **buffc = buffs + buff_ind;
1036       mlib_s32 *buffn = buffc[n];
1037       mlib_s32 *pk = k;
1038 
1039       for (l = 0; l < n; l++) {
1040         mlib_s32  *buff_l = buffc[l];
1041 
             /* walk kernel row l in chunks of kw coefficients starting at column off */
1042         for (off = 0; off < m;) {
1043           mlib_s32 *buff = buff_l + off;
1044 
1045           sp = sl;
1046           dp = dl;
1047 
1048           kw = m - off;
1049 
               /*
                * Chunk width: MAX_KER while more than 2*MAX_KER coefficients
                * remain, half of the remainder while more than MAX_KER
                * remain, otherwise everything that is left.  The branches
                * below cover kw = 7 down to 1, so MAX_KER is presumably 7 --
                * TODO confirm.
                */
1050           if (kw > 2*MAX_KER) kw = MAX_KER; else
1051             if (kw > MAX_KER) kw = kw/2;
1052           off += kw;
1053 
               /*
                * Every kw branch has the same two variants:
                *  - (l < n - 1 || off < m): more chunks follow, so only add
                *    this chunk's products into buffd;
                *  - otherwise this is the last chunk of the last kernel row:
                *    add buffd, store the result through STORE_RES, clear
                *    buffd and copy the next source row into buffn.
                * The p* variables are rotated so that each buffered sample
                * is loaded only once per pair of output pixels.
                */
1054           if (kw == 7) {
1055 
1056             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1057             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1058 
1059             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1060             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1061 
1062             if (l < (n - 1) || off < m) {
1063 #ifdef __SUNPRO_C
1064 #pragma pipeloop(0)
1065 #endif /* __SUNPRO_C */
1066               for (i = 0; i <= (wid - 2); i += 2) {
1067                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1068 
1069                 p6 = buff[i + 6]; p7 = buff[i + 7];
1070 
1071                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1072                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1073               }
1074 
1075             } else {
1076 #ifdef __SUNPRO_C
1077 #pragma pipeloop(0)
1078 #endif /* __SUNPRO_C */
1079               for (i = 0; i <= (wid - 2); i += 2) {
1080                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1081 
1082                 p6 = buff[i + 6]; p7 = buff[i + 7];
1083 
1084                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1085                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1086 
1087                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1088                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1089 
1090                 STORE_RES(dp[0    ], d0);
1091                 STORE_RES(dp[chan1], d1);
1092 
1093                 buffd[i    ] = 0;
1094                 buffd[i + 1] = 0;
1095 
1096                 sp += chan2;
1097                 dp += chan2;
1098               }
1099             }
1100 
1101           } else if (kw == 6) {
1102 
1103             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1104             p5 = buff[3]; p6 = buff[4];
1105 
1106             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1107             k4 = pk[4]; k5 = pk[5];
1108 
1109             if (l < (n - 1) || off < m) {
1110 #ifdef __SUNPRO_C
1111 #pragma pipeloop(0)
1112 #endif /* __SUNPRO_C */
1113               for (i = 0; i <= (wid - 2); i += 2) {
1114                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1115 
1116                 p5 = buff[i + 5]; p6 = buff[i + 6];
1117 
1118                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1119                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1120               }
1121 
1122             } else {
1123 #ifdef __SUNPRO_C
1124 #pragma pipeloop(0)
1125 #endif /* __SUNPRO_C */
1126               for (i = 0; i <= (wid - 2); i += 2) {
1127                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1128 
1129                 p5 = buff[i + 5]; p6 = buff[i + 6];
1130 
1131                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1132                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1133 
1134                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1135                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1136 
1137                 STORE_RES(dp[0    ], d0);
1138                 STORE_RES(dp[chan1], d1);
1139 
1140                 buffd[i    ] = 0;
1141                 buffd[i + 1] = 0;
1142 
1143                 sp += chan2;
1144                 dp += chan2;
1145               }
1146             }
1147 
1148           } else if (kw == 5) {
1149 
1150             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1151             p5 = buff[3];
1152 
1153             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1154             k4 = pk[4];
1155 
1156             if (l < (n - 1) || off < m) {
1157 #ifdef __SUNPRO_C
1158 #pragma pipeloop(0)
1159 #endif /* __SUNPRO_C */
1160               for (i = 0; i <= (wid - 2); i += 2) {
1161                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1162 
1163                 p4 = buff[i + 4]; p5 = buff[i + 5];
1164 
1165                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1166                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1167               }
1168 
1169             } else {
1170 #ifdef __SUNPRO_C
1171 #pragma pipeloop(0)
1172 #endif /* __SUNPRO_C */
1173               for (i = 0; i <= (wid - 2); i += 2) {
1174                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1175 
1176                 p4 = buff[i + 4]; p5 = buff[i + 5];
1177 
1178                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1179                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1180 
1181                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1182                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1183 
1184                 STORE_RES(dp[0    ], d0);
1185                 STORE_RES(dp[chan1], d1);
1186 
1187                 buffd[i    ] = 0;
1188                 buffd[i + 1] = 0;
1189 
1190                 sp += chan2;
1191                 dp += chan2;
1192               }
1193             }
1194 
1195           } else if (kw == 4) {
1196 
1197             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1198 
1199             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1200 
1201             if (l < (n - 1) || off < m) {
1202 #ifdef __SUNPRO_C
1203 #pragma pipeloop(0)
1204 #endif /* __SUNPRO_C */
1205               for (i = 0; i <= (wid - 2); i += 2) {
1206                 p0 = p2; p1 = p3; p2 = p4;
1207 
1208                 p3 = buff[i + 3]; p4 = buff[i + 4];
1209 
1210                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1211                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1212               }
1213 
1214             } else {
1215 #ifdef __SUNPRO_C
1216 #pragma pipeloop(0)
1217 #endif /* __SUNPRO_C */
1218               for (i = 0; i <= (wid - 2); i += 2) {
1219                 p0 = p2; p1 = p3; p2 = p4;
1220 
1221                 p3 = buff[i + 3]; p4 = buff[i + 4];
1222 
1223                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1224                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1225 
1226                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1227                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1228 
1229                 STORE_RES(dp[0    ], d0);
1230                 STORE_RES(dp[chan1], d1);
1231 
1232                 buffd[i    ] = 0;
1233                 buffd[i + 1] = 0;
1234 
1235                 sp += chan2;
1236                 dp += chan2;
1237               }
1238             }
1239 
1240           } else if (kw == 3) {
1241 
1242             p2 = buff[0]; p3 = buff[1];
1243             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1244 
1245             if (l < (n - 1) || off < m) {
1246 #ifdef __SUNPRO_C
1247 #pragma pipeloop(0)
1248 #endif /* __SUNPRO_C */
1249               for (i = 0; i <= (wid - 2); i += 2) {
1250                 p0 = p2; p1 = p3;
1251 
1252                 p2 = buff[i + 2]; p3 = buff[i + 3];
1253 
1254                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1255                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1256               }
1257 
1258             } else {
1259 #ifdef __SUNPRO_C
1260 #pragma pipeloop(0)
1261 #endif /* __SUNPRO_C */
1262               for (i = 0; i <= (wid - 2); i += 2) {
1263                 p0 = p2; p1 = p3;
1264 
1265                 p2 = buff[i + 2]; p3 = buff[i + 3];
1266 
1267                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1268                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1269 
1270                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1271                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1272 
1273                 STORE_RES(dp[0    ], d0);
1274                 STORE_RES(dp[chan1], d1);
1275 
1276                 buffd[i    ] = 0;
1277                 buffd[i + 1] = 0;
1278 
1279                 sp += chan2;
1280                 dp += chan2;
1281               }
1282             }
1283 
1284           } else if (kw == 2) {
1285 
1286             p2 = buff[0];
1287             k0 = pk[0]; k1 = pk[1];
1288 
1289             if (l < (n - 1) || off < m) {
1290 #ifdef __SUNPRO_C
1291 #pragma pipeloop(0)
1292 #endif /* __SUNPRO_C */
1293               for (i = 0; i <= (wid - 2); i += 2) {
1294                 p0 = p2;
1295 
1296                 p1 = buff[i + 1]; p2 = buff[i + 2];
1297 
1298                 buffd[i    ] += p0*k0 + p1*k1;
1299                 buffd[i + 1] += p1*k0 + p2*k1;
1300               }
1301 
1302             } else {
1303 #ifdef __SUNPRO_C
1304 #pragma pipeloop(0)
1305 #endif /* __SUNPRO_C */
1306               for (i = 0; i <= (wid - 2); i += 2) {
1307                 p0 = p2;
1308 
1309                 p1 = buff[i + 1]; p2 = buff[i + 2];
1310 
1311                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1312                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1313 
1314                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1315                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1316 
1317                 STORE_RES(dp[0    ], d0);
1318                 STORE_RES(dp[chan1], d1);
1319 
1320                 buffd[i    ] = 0;
1321                 buffd[i + 1] = 0;
1322 
1323                 sp += chan2;
1324                 dp += chan2;
1325               }
1326             }
1327 
1328           } else /* kw == 1 */{
1329 
1330             k0 = pk[0];
1331 
1332             if (l < (n - 1) || off < m) {
1333 #ifdef __SUNPRO_C
1334 #pragma pipeloop(0)
1335 #endif /* __SUNPRO_C */
1336               for (i = 0; i <= (wid - 2); i += 2) {
1337                 p0 = buff[i]; p1 = buff[i + 1];
1338 
1339                 buffd[i    ] += p0*k0;
1340                 buffd[i + 1] += p1*k0;
1341               }
1342 
1343             } else {
1344 #ifdef __SUNPRO_C
1345 #pragma pipeloop(0)
1346 #endif /* __SUNPRO_C */
1347               for (i = 0; i <= (wid - 2); i += 2) {
1348                 p0 = buff[i]; p1 = buff[i + 1];
1349 
1350                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1351                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1352 
1353                 d0 = (p0*k0 + buffd[i    ]);
1354                 d1 = (p1*k0 + buffd[i + 1]);
1355 
1356                 STORE_RES(dp[0    ], d0);
1357                 STORE_RES(dp[chan1], d1);
1358 
1359                 buffd[i    ] = 0;
1360                 buffd[i + 1] = 0;
1361 
1362                 sp += chan2;
1363                 dp += chan2;
1364               }
1365             }
1366           }
1367 
               /* step to the coefficients of the next chunk */
1368           pk += kw;
1369         }
1370       }
1371 
1372       /* last pixels */
           /*
            * i, sp and dp carry over from the final unrolled loop above.
            * The pairwise loops stop at i <= wid - 2, so a left-over pixel
            * has nothing accumulated in buffd; its sum is computed here
            * directly over the whole m x n kernel.  The local pk shadows the
            * outer one, and STORE_RES shifts s in place.
            */
1373       for (; i < wid; i++) {
1374         mlib_s32 *pk = k, x, s = 0;
1375 
1376         for (l = 0; l < n; l++) {
1377           mlib_s32 *buff = buffc[l] + i;
1378 
1379           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1380         }
1381 
1382         STORE_RES(dp[0], s);
1383 
1384         buffn[i + dx_l] = (mlib_s32)sp[0];
1385 
1386         sp += chan1;
1387         dp += chan1;
1388       }
1389 
           /* copy the remaining source samples of the next row into buffn */
1390       for (; i < swid; i++) {
1391         buffn[i + dx_l] = (mlib_s32)sp[0];
1392         sp += chan1;
1393       }
1394 
           /* left / right extension of the freshly loaded row by replication */
1395       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1396       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1397 
1398       /* next line */
1399 
           /* once j reaches hgt - dy_b - 2 the source pointer stops moving, so the same row is reloaded */
1400       if (j < hgt - dy_b - 2) sl += sll;
1401       dl += dll;
1402 
           /* rotate the ring of n + 1 row buffers */
1403       buff_ind++;
1404 
1405       if (buff_ind >= n + 1) buff_ind = 0;
1406     }
1407   }
1408 
       /* release heap storage only if the stack arrays were not used */
1409   if (pbuff != buff) mlib_free(pbuff);
1410   if (k != k_locl) mlib_free(k);
1411 
1412   return MLIB_SUCCESS;
1413 }
1414 
1415 /***************************************************************/