1 /*
   2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
  30  *   MLIB_EDGE_SRC_EXTEND mask
  31  */
  32 
  33 #include "mlib_image.h"
  34 #include "mlib_ImageConv.h"
  35 #include "mlib_c_ImageConv.h"
  36 
  37 /*
  38  * This define switches between functions of different data types
  39  */
  40 
  /*
   * IMG_TYPE selects which block of macros below is active:
   *   1 -> DTYPE is mlib_u8, 2 -> mlib_s16, 3 -> mlib_u16.
   *
   * Macros defined per type:
   *   DTYPE            pixel element type used for source and destination.
   *   CONV_FUNC(KERN), CONV_FUNC_MxN, CONV_FUNC_I(KERN), CONV_FUNC_MxN_I
   *                    expand to a function name followed by its parameter
   *                    list (PARAM or PARAM_MxN), so "mlib_status CONV_FUNC_MxN"
   *                    forms a complete function header.
   *   DSCALE           initial value of the floating-point scale factor
   *                    (fscale) applied to the kernel.
   *   FROM_S32(x)      turns a clamped 32-bit result into the stored pixel
   *                    value: a right shift, plus for types 1 and 3 an XOR
   *                    that flips the top bit of the stored value.
   *   S64TOS32(x)      applied by LOAD_BUFF to the operand that fills the low
   *                    32 bits of a 64-bit word; for type 2 it masks the
   *                    value to 32 bits, for types 1 and 3 it is the identity.
   *   SAT_OFF          tokens appended to the argument of D2I before
   *                    clamping; for types 1 and 3 this subtracts 2^31
   *                    (which the XOR in FROM_S32 compensates for after the
   *                    shift), for type 2 it is empty.
   */
  41 #define IMG_TYPE 3
  42 
  43 /***************************************************************/
  44 #if IMG_TYPE == 1
  45 
  46 #define DTYPE             mlib_u8
  47 #define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
  48 #define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
  49 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
  50 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
  51 #define DSCALE            (1 << 24)
  52 #define FROM_S32(x)       (((x) >> 24) ^ 128)
  53 #define S64TOS32(x)       (x)
  54 #define SAT_OFF           -(1u << 31)
  55 
  56 #elif IMG_TYPE == 2
  57 
  58 #define DTYPE             mlib_s16
  59 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
  60 #define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
  61 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
  62 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
  63 #define DSCALE            65536.0
  64 #define FROM_S32(x)       ((x) >> 16)
  65 #define S64TOS32(x)       ((x) & 0xffffffff)
  66 #define SAT_OFF
  67 
  68 #elif IMG_TYPE == 3
  69 
  70 #define DTYPE             mlib_u16
  71 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
  72 #define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
  73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
  74 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
  75 #define DSCALE            65536.0
  76 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
  77 #define S64TOS32(x)       (x)
  78 #define SAT_OFF           -(1u << 31)
  79 
  80 #endif /* IMG_TYPE == 1 */
  81 
  82 /***************************************************************/
  /*
   * PARAM: parameter list shared by the fixed-size kernel functions named by
   * CONV_FUNC(KERN) and CONV_FUNC_I(KERN).  Those functions are not defined
   * in this part of the file; dx_l/dx_r/dy_t/dy_b and cmask presumably have
   * the same meaning as in PARAM_MxN below - confirm against their bodies.
   */
  83 #define PARAM                                                   \
  84   mlib_image       *dst,                                        \
  85   const mlib_image *src,                                        \
  86   mlib_s32         dx_l,                                        \
  87   mlib_s32         dx_r,                                        \
  88   mlib_s32         dy_t,                                        \
  89   mlib_s32         dy_b,                                        \
  90   const mlib_s32   *kern,                                       \
  91   mlib_s32         scalef_expon,                                \
  92   mlib_s32         cmask
  93 
  94 /***************************************************************/
  /*
   * PARAM_MxN: parameter list of CONV_FUNC_MxN and CONV_FUNC_MxN_I.
   *   kernel      m*n integer coefficients, read row by row
   *   m, n        kernel width and height
   *   dx_l, dx_r  number of columns filled by repeating the leftmost /
   *               rightmost loaded sample of each row
   *   dy_t, dy_b  number of rows taken from the first / last source row
   *   scale       the kernel is multiplied by DSCALE / 2^scale
   *   cmask       channel c is processed when bit (nchannel - 1 - c) is set
   */
  95 #define PARAM_MxN                                               \
  96   mlib_image       *dst,                                        \
  97   const mlib_image *src,                                        \
  98   const mlib_s32   *kernel,                                     \
  99   mlib_s32         m,                                           \
 100   mlib_s32         n,                                           \
 101   mlib_s32         dx_l,                                        \
 102   mlib_s32         dx_r,                                        \
 103   mlib_s32         dy_t,                                        \
 104   mlib_s32         dy_b,                                        \
 105   mlib_s32         scale,                                       \
 106   mlib_s32         cmask
 107 
 108 /***************************************************************/
 /* FTYPE: floating-point type used for row buffers, kernels and sums. */
 109 #define FTYPE mlib_d64
 110 
 /*
  * CLAMP_S32(x): convert x to mlib_s32.  Without MLIB_USE_FTOI_CLAMPING the
  * value is saturated explicitly to [MLIB_S32_MIN, MLIB_S32_MAX]; with it, a
  * plain cast is used (presumably because the target's float-to-int
  * conversion already saturates - confirm).  Note that x is evaluated more
  * than once in the explicit form.
  */
 111 #ifndef MLIB_USE_FTOI_CLAMPING
 112 
 113 #define CLAMP_S32(x)                                            \
 114   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))
 115 
 116 #else
 117 
 118 #define CLAMP_S32(x) ((mlib_s32)(x))
 119 
 120 #endif /* MLIB_USE_FTOI_CLAMPING */
 121 
 122 /***************************************************************/
 /*
  * D2I(x): floating-point sum to 32-bit integer.  SAT_OFF is pasted directly
  * after the parenthesised argument, so when it is "-(1u << 31)" the macro
  * computes x - 2^31 before clamping; when it is empty x is clamped as is.
  */
 123 #define D2I(x) CLAMP_S32((x) SAT_OFF)
 124 
 125 /***************************************************************/
 /*
  * STORE2(res0, res1): store two values to dp[0] and dp[chan1]; which
  * argument goes to which slot depends on _LITTLE_ENDIAN.  Expands to two
  * statements and relies on dp and chan1 being in scope.
  */
 126 #ifdef _LITTLE_ENDIAN
 127 
 128 #define STORE2(res0, res1)                                      \
 129   dp[0    ] = res1;                                             \
 130   dp[chan1] = res0
 131 
 132 #else
 133 
 134 #define STORE2(res0, res1)                                      \
 135   dp[0    ] = res0;                                             \
 136   dp[chan1] = res1
 137 
 138 #endif /* _LITTLE_ENDIAN */
 139 
 140 /***************************************************************/
 /*
  * LOAD_BUFF(buff): copy the two source samples sp[0] and sp[chan1] into
  * buff[i] and buff[i + 1].  Relies on i, sp and chan1 being in scope.
  *
  * With _NO_LONGLONG the two elements are written separately.  Otherwise
  * both are written with a single 64-bit store through an mlib_s64 pointer;
  * the halves are ordered by endianness so that, for a buffer of 32-bit
  * elements, buff[i] still receives sp[0] and buff[i + 1] receives
  * sp[chan1].  S64TOS32 is applied to the sample placed in the low half.
  */
 141 #ifdef _NO_LONGLONG
 142 
 143 #define LOAD_BUFF(buff)                                         \
 144   buff[i    ] = sp[0];                                          \
 145   buff[i + 1] = sp[chan1]
 146 
 147 #else /* _NO_LONGLONG */
 148 
 149 #ifdef _LITTLE_ENDIAN
 150 
 151 #define LOAD_BUFF(buff)                                         \
 152   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
 153 
 154 #else /* _LITTLE_ENDIAN */
 155 
 156 #define LOAD_BUFF(buff)                                         \
 157   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
 158 
 159 #endif /* _LITTLE_ENDIAN */
 160 #endif /* _NO_LONGLONG */
 161 
 162 /***************************************************************/
 /*
  * d64_2x32: overlays one mlib_d64 with two consecutive mlib_s32 fields.
  * CONV_FUNC_MxN loads d64 from its integer staging buffer and then reads
  * i0 and i1 to obtain the two integers stored there by LOAD_BUFF.
  */
 163 typedef union {
 164   mlib_d64 d64;
 165   struct {
 166     mlib_s32 i0;
 167     mlib_s32 i1;
 168   } i32s;
 169 } d64_2x32;
 170 
 171 /***************************************************************/
 /*
  * DEF_VARS(type): declares a common set of locals (source/destination
  * pointers of the given element type, buffer pointers, sizes, strides and
  * loop counters).  pbuff is initialised from buff, so an object named buff
  * must already be declared where the macro is used.
  */
 172 #define DEF_VARS(type)                                          \
 173   type     *adr_src, *sl, *sp, *sl1;                            \
 174   type     *adr_dst, *dl, *dp;                                  \
 175   FTYPE    *pbuff = buff;                                       \
 176   mlib_s32 *buffi, *buffo;                                      \
 177   mlib_s32 wid, hgt, sll, dll;                                  \
 178   mlib_s32 nchannel, chan1, chan2;                              \
 179   mlib_s32 i, j, c, swid
 180 
 181 /***************************************************************/
 /*
  * GET_SRC_DST_PARAMETERS(type): fills hgt, wid and nchannel from src, sll
  * and dll from the strides of src and dst divided by sizeof(type)
  * (presumably converting a byte stride to an element stride - confirm
  * against mlib_ImageGetStride), and adr_src / adr_dst from the data
  * pointers.  Note that the dimensions come from src, not dst.
  */
 182 #define GET_SRC_DST_PARAMETERS(type)                            \
 183   hgt = mlib_ImageGetHeight(src);                               \
 184   wid = mlib_ImageGetWidth(src);                                \
 185   nchannel = mlib_ImageGetChannels(src);                        \
 186   sll = mlib_ImageGetStride(src) / sizeof(type);                \
 187   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
 188   adr_src = (type *)mlib_ImageGetData(src);                     \
 189   adr_dst = (type *)mlib_ImageGetData(dst)
 190 
 191 /***************************************************************/
 /*
  * CLAMP_STORE(dst, val): store val into dst, saturated to the range of the
  * pixel type selected by IMG_TYPE.  Only defined when __sparc is not
  * defined; it is used by STORE_RES in the integer code path.
  *
  * The expansion is a bare if/else statement (a braced one for IMG_TYPE 1),
  * and val is evaluated several times, so it must be used as a complete
  * statement with a side-effect-free val.
  */
 192 #ifndef __sparc
 193 #if IMG_TYPE == 1
 194 
 195 /*
 196  * Test for the presence of any "1" bit in bits
 197    8 to 31 of val. If present, then val is either
 198    negative or >255. If over/underflows of 8 bits
 199    are uncommon, then this technique can be a win,
 200    since only a single test, rather than two, is
 201    necessary to determine if clamping is needed.
 202    On the other hand, if over/underflows are common,
 203    it adds an extra test.
 204 */
 205 #define CLAMP_STORE(dst, val)                                   \
 206   if (val & 0xffffff00) {                                       \
 207     if (val < MLIB_U8_MIN)                                      \
 208       dst = MLIB_U8_MIN;                                        \
 209     else                                                        \
 210       dst = MLIB_U8_MAX;                                        \
 211   } else {                                                      \
 212     dst = (mlib_u8)val;                                         \
 213   }
 214 
 215 #elif IMG_TYPE == 2
 216 
 217 #define CLAMP_STORE(dst, val)                                   \
 218   if (val >= MLIB_S16_MAX)                                      \
 219     dst = MLIB_S16_MAX;                                         \
 220   else if (val <= MLIB_S16_MIN)                                 \
 221     dst = MLIB_S16_MIN;                                         \
 222   else                                                          \
 223     dst = (mlib_s16)val
 224 
 225 #elif IMG_TYPE == 3
 226 
 227 #define CLAMP_STORE(dst, val)                                   \
 228   if (val >= MLIB_U16_MAX)                                      \
 229     dst = MLIB_U16_MAX;                                         \
 230   else if (val <= MLIB_U16_MIN)                                 \
 231     dst = MLIB_U16_MIN;                                         \
 232   else                                                          \
 233     dst = (mlib_u16)val
 234 
 235 #endif /* IMG_TYPE == 1 */
 236 #endif /* __sparc */
 237 
 238 /***************************************************************/
 /*
  * Size limits used by the routines below:
  *   MAX_KER     largest number of kernel columns handled in one pass over a
  *               row in the MxN routines; wider kernels are split.
  *   MAX_N       largest kernel height for which the on-stack table of row
  *               buffer pointers (2*(MAX_N + 1) entries) is used.
  *   BUFF_SIZE   element count of the on-stack work buffer; larger needs
  *               are met with mlib_malloc.
  *   CACHE_SIZE  byte budget from which mlib_ImageConv1xN_ext derives how
  *               many rows it processes per band.
  */
 239 #define MAX_KER   7
 240 #define MAX_N    15
 241 #define BUFF_SIZE   1600
 242 #define CACHE_SIZE  (64*1024)
 243 
 /*
  * mlib_ImageConv1xN_ext - convolve every selected channel with a kernel
  * that is one column wide and n rows tall, extending the source at the top
  * and bottom by repeating edge rows.
  *
  *   dst, src  destination and source images; width, height and channel
  *             count are all read from src
  *   k         n kernel coefficients as doubles (the caller in this file
  *             passes them already multiplied by its scale factor)
  *   n         kernel height
  *   dy_t      number of leading column samples taken from source row 0
  *   dy_b      number of trailing column samples taken from source row
  *             shgt - 1
  *   cmask     channel c is processed when bit (nchannel - 1 - c) is set
  *
  * Returns MLIB_FAILURE if the work buffer cannot be allocated, otherwise
  * MLIB_SUCCESS.
  *
  * The image is processed in bands of at most max_hsize rows.  For every
  * column of a band the hsize + n - 1 samples it needs are copied into sbuff
  * as doubles; the kernel is then applied four taps at a time with partial
  * sums kept in buffd, and the last group of 1..4 taps converts the sums
  * with D2I / FROM_S32 and writes them down the destination column.
  */
 244 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
 245                                          const mlib_image *src,
 246                                          const mlib_d64   *k,
 247                                          mlib_s32         n,
 248                                          mlib_s32         dy_t,
 249                                          mlib_s32         dy_b,
 250                                          mlib_s32         cmask)
 251 {
 252   DTYPE    *adr_src, *sl;
 253   DTYPE    *adr_dst, *dl, *dp;
 254   FTYPE    buff[BUFF_SIZE];
 255   FTYPE    *buffd;
 256   FTYPE    *pbuff = buff;
 257   const FTYPE    *pk;
 258   FTYPE    k0, k1, k2, k3;
 259   FTYPE    p0, p1, p2, p3, p4;
 260   FTYPE    *sbuff;
 261   mlib_s32 l, k_off, off, bsize;
 262   mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
 263   mlib_s32 d0, d1, ii;
 264   mlib_s32 wid, hgt, sll, dll;
 265   mlib_s32 nchannel;
 266   mlib_s32 i, j, c;
 267   GET_SRC_DST_PARAMETERS(DTYPE);
 268 
       /* rows per band: what fits the CACHE_SIZE budget at this row stride,
          less the n - 1 extra samples each column needs; kept in [1, hgt] */
 269   max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);
 270 
 271   if (max_hsize < 1) max_hsize = 1;
 272   if (max_hsize > hgt) max_hsize = hgt;
 273 
 274   shgt = hgt + (n - 1);
 275   smax_hsize = max_hsize + (n - 1);
 276 
       /* one allocation holds the column samples (sbuff) followed by the
          partial sums (buffd) */
 277   bsize = 2 * (smax_hsize + 1);
 278 
 279   if (bsize > BUFF_SIZE) {
 280     pbuff = mlib_malloc(sizeof(FTYPE)*bsize);
 281 
 282     if (pbuff == NULL) return MLIB_FAILURE;
 283   }
 284 
 285   sbuff = pbuff;
 286   buffd = sbuff + smax_hsize;
 287 
       /* shgt becomes the number of distinct source rows that are read;
          k_off is the index of the first column sample of the current band */
 288   shgt -= (dy_t + dy_b);
 289   k_off = 0;
 290 
 291   for (l = 0; l < hgt; l += hsize) {
 292     hsize = hgt - l;
 293 
 294     if (hsize > max_hsize) hsize = max_hsize;
 295 
         /* samples needed per column for this band */
 296     smax_hsize = hsize + (n - 1);
 297 
 298     for (c = 0; c < nchannel; c++) {
 299       if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 300 
 301       sl = adr_src + c;
 302       dl = adr_dst + c;
 303 
 304 #ifdef __SUNPRO_C
 305 #pragma pipeloop(0)
 306 #endif /* __SUNPRO_C */
 307       for (i = 0; i < hsize; i++) buffd[i] = 0.0;
 308 
 309       for (j = 0; j < wid; j++) {
             /* this local buff hides the function-level array of the same
                name for the rest of the loop body */
 310         FTYPE    *buff = sbuff;
 311 
             /* gather the column: samples with index below dy_t repeat
                source row 0 ... */
 312         for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
 313           sbuff[i - k_off] = (FTYPE)sl[0];
 314         }
 315 
             /* ... the following shgt samples come from rows i - dy_t ... */
 316 #ifdef __SUNPRO_C
 317 #pragma pipeloop(0)
 318 #endif /* __SUNPRO_C */
 319         for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
 320           sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
 321         }
 322 
             /* ... and the last dy_b samples repeat row shgt - 1 */
 323         for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
 324           sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
 325         }
 326 
 327         pk = k;
 328 
             /* all but the last 1..4 taps: accumulate four taps per pass
                into buffd, two outputs per iteration */
 329         for (off = 0; off < (n - 4); off += 4) {
 330 
 331           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 332           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 333 
 334 #ifdef __SUNPRO_C
 335 #pragma pipeloop(0)
 336 #endif /* __SUNPRO_C */
 337           for (i = 0; i < hsize; i += 2) {
 338             p0 = p2; p1 = p3; p2 = p4;
 339 
 340             p3 = buff[i + 3]; p4 = buff[i + 4];
 341 
 342             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 343             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 344           }
 345 
 346           pk += 4;
 347           buff += 4;
 348         }
 349 
             /* final kh taps: add them to the partial sums, convert, store
                down the destination column (stride dll) and reset buffd for
                the next column */
 350         dp = dl;
 351         kh = n - off;
 352 
 353         if (kh == 4) {
 354           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 355           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 356 
 357 #ifdef __SUNPRO_C
 358 #pragma pipeloop(0)
 359 #endif /* __SUNPRO_C */
 360           for (i = 0; i <= (hsize - 2); i += 2) {
 361             p0 = p2; p1 = p3; p2 = p4;
 362 
 363             p3 = buff[i + 3]; p4 = buff[i + 4];
 364 
 365             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 366             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 367 
 368             dp[0  ] = FROM_S32(d0);
 369             dp[dll] = FROM_S32(d1);
 370 
 371             buffd[i    ] = 0.0;
 372             buffd[i + 1] = 0.0;
 373 
 374             dp += 2*dll;
 375           }
 376 
               /* odd band height: one output row is left over */
 377           if (i < hsize) {
 378             p0 = p2; p1 = p3; p2 = p4;
 379             p3 = buff[i + 3];
 380             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
 381             dp[0] = FROM_S32(d0);
 382             buffd[i] = 0.0;
 383           }
 384 
 385         } else if (kh == 3) {
 386 
 387           p2 = buff[0]; p3 = buff[1];
 388           k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 389 
 390 #ifdef __SUNPRO_C
 391 #pragma pipeloop(0)
 392 #endif /* __SUNPRO_C */
 393           for (i = 0; i <= (hsize - 2); i += 2) {
 394             p0 = p2; p1 = p3;
 395 
 396             p2 = buff[i + 2]; p3 = buff[i + 3];
 397 
 398             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 399             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 400 
 401             dp[0  ] = FROM_S32(d0);
 402             dp[dll] = FROM_S32(d1);
 403 
 404             buffd[i    ] = 0.0;
 405             buffd[i + 1] = 0.0;
 406 
 407             dp += 2*dll;
 408           }
 409 
 410           if (i < hsize) {
 411             p0 = p2; p1 = p3;
 412             p2 = buff[i + 2];
 413             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
 414             dp[0] = FROM_S32(d0);
 415 
 416             buffd[i] = 0.0;
 417           }
 418 
 419         } else if (kh == 2) {
 420 
 421           p2 = buff[0];
 422           k0 = pk[0]; k1 = pk[1];
 423 
 424 #ifdef __SUNPRO_C
 425 #pragma pipeloop(0)
 426 #endif /* __SUNPRO_C */
 427           for (i = 0; i <= (hsize - 2); i += 2) {
 428             p0 = p2;
 429 
 430             p1 = buff[i + 1]; p2 = buff[i + 2];
 431 
 432             d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 433             d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 434 
 435             dp[0  ] = FROM_S32(d0);
 436             dp[dll] = FROM_S32(d1);
 437 
 438             buffd[i    ] = 0.0;
 439             buffd[i + 1] = 0.0;
 440 
 441             dp += 2*dll;
 442           }
 443 
 444           if (i < hsize) {
 445             p0 = p2;
 446             p1 = buff[i + 1];
 447             d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
 448             dp[0] = FROM_S32(d0);
 449 
 450             buffd[i] = 0.0;
 451           }
 452 
 453         } else /* kh == 1 */{
 454 
 455           k0 = pk[0];
 456 
 457 #ifdef __SUNPRO_C
 458 #pragma pipeloop(0)
 459 #endif /* __SUNPRO_C */
 460           for (i = 0; i <= (hsize - 2); i += 2) {
 461             p0 = buff[i]; p1 = buff[i + 1];
 462 
 463             d0 = D2I(p0*k0 + buffd[i    ]);
 464             d1 = D2I(p1*k0 + buffd[i + 1]);
 465 
 466             dp[0  ] = FROM_S32(d0);
 467             dp[dll] = FROM_S32(d1);
 468 
 469             buffd[i    ] = 0.0;
 470             buffd[i + 1] = 0.0;
 471 
 472             dp += 2*dll;
 473           }
 474 
 475           if (i < hsize) {
 476             p0 = buff[i];
 477             d0 = D2I(p0*k0 + buffd[i]);
 478             dp[0] = FROM_S32(d0);
 479 
 480             buffd[i] = 0.0;
 481           }
 482         }
 483 
 484         /* next line */
             /* (sl and dl step by nchannel, i.e. to the next pixel of the
                same channel in the row) */
 485         sl += nchannel;
 486         dl += nchannel;
 487       }
 488     }
 489 
         /* next band: the source is addressed through k_off, the destination
            through the advanced base pointer */
 490     k_off += max_hsize;
 491     adr_dst += max_hsize*dll;
 492   }
 493 
 494   if (pbuff != buff) mlib_free(pbuff);
 495 
 496   return MLIB_SUCCESS;
 497 }
 498 
 499 /***************************************************************/
 /*
  * CONV_FUNC_MxN - floating-point MxN convolution with edge extension.  With
  * IMG_TYPE 3 the header expands to mlib_convMxNext_u16(PARAM_MxN).
  *
  *   dst, src    destination and source images; dimensions come from src
  *   kernel      m*n integer coefficients, read row by row
  *   m, n        kernel width and height
  *   dx_l, dx_r  columns filled by repeating the first / last loaded sample
  *               of each buffered row
  *   dy_t, dy_b  rows taken from the first / last source row
  *   scale       the kernel is multiplied by DSCALE / 2^scale
  *   cmask       channel c is processed when bit (nchannel - 1 - c) is set
  *
  * Returns MLIB_FAILURE when an allocation fails; otherwise the status set
  * before FREE_AND_RETURN_STATUS is reached (MLIB_SUCCESS, or the result of
  * mlib_ImageConv1xN_ext when m == 1).
  *
  * NOTE(review): FREE_AND_RETURN_STATUS is not defined in this part of the
  * file.  It is used as a complete statement without a semicolon and is
  * presumably a macro that frees pbuff and k when they were heap-allocated
  * and returns status - confirm.
  *
  * Method: n + 1 row buffers of doubles hold n edge-extended source rows
  * plus one spare.  For each output row the kernel is applied row by row in
  * horizontal groups of at most MAX_KER taps, accumulating into buffd.  The
  * very last group of the last kernel row also produces the output pixels
  * and, in the same loop, copies the next source row into the spare buffer.
  */
 500 mlib_status CONV_FUNC_MxN
 501 {
 502   DTYPE    *adr_src, *sl, *sp = NULL;
 503   DTYPE    *adr_dst, *dl, *dp = NULL;
 504   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 505   FTYPE    **buffs = buffs_arr, *buffd;
 506   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
 507   FTYPE    *pbuff = buff;
 508   FTYPE    k0, k1, k2, k3, k4, k5, k6;
 509   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
 510   mlib_s32 *buffi;
 511   mlib_s32 mn, l, off, kw, bsize, buff_ind;
 512   mlib_s32 d0, d1;
 513   mlib_s32 wid, hgt, sll, dll;
 514   mlib_s32 nchannel, chan1, chan2;
 515   mlib_s32 i, j, c, swid;
 516   d64_2x32 dd;
 517   mlib_status status = MLIB_SUCCESS;
 518 
 519   GET_SRC_DST_PARAMETERS(DTYPE);
 520 
       /* fscale = DSCALE / 2^scale, done in two steps when scale > 30 so the
          integer shift below never exceeds 30 for scale up to 60 */
 521   if (scale > 30) {
 522     fscale *= 1.0/(1 << 30);
 523     scale -= 30;
 524   }
 525 
 526   fscale /= (1 << scale);
 527 
 528   mn = m*n;
 529 
       /* kernels with more than 256 coefficients do not fit akernel */
 530   if (mn > 256) {
 531     k = mlib_malloc(mn*sizeof(mlib_d64));
 532 
 533     if (k == NULL) return MLIB_FAILURE;
 534   }
 535 
       /* floating-point copy of the kernel with the scale folded in */
 536   for (i = 0; i < mn; i++) {
 537     k[i] = kernel[i]*fscale;
 538   }
 539 
       /* one-column kernels are handled by the dedicated vertical routine */
 540   if (m == 1) {
 541     status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
 542     FREE_AND_RETURN_STATUS
 543   }
 544 
       /* swid: length of one edge-extended row */
 545   swid = wid + (m - 1);
 546 
       /* work area of n + 3 rows: n + 1 row buffers, the accumulator row
          buffd, and one row's worth of space for the integer staging
          buffer buffi */
 547   bsize = (n + 3)*swid;
 548 
 549   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 550     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
 551 
 552     if (pbuff == NULL) {
 553       status = MLIB_FAILURE;
 554       FREE_AND_RETURN_STATUS
 555     }
         /* the pointer table lives right after the work area */
 556     buffs = (FTYPE   **)(pbuff + bsize);
 557   }
 558 
       /* the n + 1 row pointers are listed twice so that any window of
          n + 1 consecutive entries starting at buff_ind (0..n) is valid */
 559   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 560   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
 561   buffd = buffs[n] + swid;
 562   buffi = (mlib_s32*)(buffd + swid);
 563 
 564   chan1 = nchannel;
 565   chan2 = chan1 + chan1;
 566 
       /* from here on swid is the number of samples read from each source
          row; the remaining dx_l + dx_r are produced by replication */
 567   swid -= (dx_l + dx_r);
 568 
 569   for (c = 0; c < nchannel; c++) {
 570     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 571 
 572     sl = adr_src + c;
 573     dl = adr_dst + c;
 574 
         /* preload the first n row buffers, extending each row on the left
            and right */
 575     for (l = 0; l < n; l++) {
           /* this local buff hides the function-level array */
 576       FTYPE    *buff = buffs[l];
 577 
 578       for (i = 0; i < dx_l; i++) {
 579         buff[i] = (FTYPE)sl[0];
 580       }
 581 
 582 #ifdef __SUNPRO_C
 583 #pragma pipeloop(0)
 584 #endif /* __SUNPRO_C */
 585       for (i = 0; i < swid; i++) {
 586         buff[i + dx_l] = (FTYPE)sl[i*chan1];
 587       }
 588 
 589       for (i = 0; i < dx_r; i++) {
 590         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
 591       }
 592 
           /* sl stays on the first source row for the first dy_t buffers
              and stops advancing once l reaches hgt + n - dy_b - 2 */
 593       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
 594     }
 595 
 596     buff_ind = 0;
 597 
 598 #ifdef __SUNPRO_C
 599 #pragma pipeloop(0)
 600 #endif /* __SUNPRO_C */
 601     for (i = 0; i < wid; i++) buffd[i] = 0.0;
 602 
 603     for (j = 0; j < hgt; j++) {
           /* buffc[0..n-1]: the rows under the kernel for output row j;
              buffn: the spare buffer that receives the next source row */
 604       FTYPE    **buffc = buffs + buff_ind;
 605       FTYPE    *buffn = buffc[n];
 606       FTYPE    *pk = k;
 607 
 608       for (l = 0; l < n; l++) {
 609         FTYPE    *buff_l = buffc[l];
 610 
 611         for (off = 0; off < m;) {
 612           FTYPE    *buff = buff_l + off;
 613 
               /* width of this group of taps: MAX_KER while more than
                  2*MAX_KER remain, half of the remainder while more than
                  MAX_KER remain, otherwise all that is left */
 614           kw = m - off;
 615 
 616           if (kw > 2*MAX_KER) kw = MAX_KER; else
 617             if (kw > MAX_KER) kw = kw/2;
 618           off += kw;
 619 
 620           sp = sl;
 621           dp = dl;
 622 
               /*
                * Every kw case below has the same two branches.  While more
                * taps follow (l < n - 1 or off < m) the group is only
                * accumulated into buffd.  For the final group the loop also
                * (a) moves two samples of the source row at sl through
                * buffi into buffn, and (b) adds the group to buffd,
                * converts with D2I / FROM_S32, stores two output pixels and
                * clears the two accumulators.  Both loops handle pixels in
                * pairs and stop at i <= wid - 2.
                */
 623           if (kw == 7) {
 624 
 625             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 626             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
 627 
 628             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 629             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
 630 
 631             if (l < (n - 1) || off < m) {
 632 #ifdef __SUNPRO_C
 633 #pragma pipeloop(0)
 634 #endif /* __SUNPRO_C */
 635               for (i = 0; i <= (wid - 2); i += 2) {
 636                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 637 
 638                 p6 = buff[i + 6]; p7 = buff[i + 7];
 639 
 640                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
 641                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
 642               }
 643 
 644             } else {
 645 #ifdef __SUNPRO_C
 646 #pragma pipeloop(0)
 647 #endif /* __SUNPRO_C */
 648               for (i = 0; i <= (wid - 2); i += 2) {
 649                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 650 
 651                 p6 = buff[i + 6]; p7 = buff[i + 7];
 652 
                     /* stage sp[0] and sp[chan1] as integers, then read them
                        back through the union and store them as doubles */
 653                 LOAD_BUFF(buffi);
 654 
 655                 dd.d64 = *(FTYPE   *)(buffi + i);
 656                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 657                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 658 
 659                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
 660                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
 661 
 662                 dp[0    ] = FROM_S32(d0);
 663                 dp[chan1] = FROM_S32(d1);
 664 
 665                 buffd[i    ] = 0.0;
 666                 buffd[i + 1] = 0.0;
 667 
 668                 sp += chan2;
 669                 dp += chan2;
 670               }
 671             }
 672 
 673           } else if (kw == 6) {
 674 
 675             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 676             p5 = buff[3]; p6 = buff[4];
 677 
 678             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 679             k4 = pk[4]; k5 = pk[5];
 680 
 681             if (l < (n - 1) || off < m) {
 682 #ifdef __SUNPRO_C
 683 #pragma pipeloop(0)
 684 #endif /* __SUNPRO_C */
 685               for (i = 0; i <= (wid - 2); i += 2) {
 686                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 687 
 688                 p5 = buff[i + 5]; p6 = buff[i + 6];
 689 
 690                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
 691                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
 692               }
 693 
 694             } else {
 695 #ifdef __SUNPRO_C
 696 #pragma pipeloop(0)
 697 #endif /* __SUNPRO_C */
 698               for (i = 0; i <= (wid - 2); i += 2) {
 699                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 700 
 701                 p5 = buff[i + 5]; p6 = buff[i + 6];
 702 
 703                 LOAD_BUFF(buffi);
 704 
 705                 dd.d64 = *(FTYPE   *)(buffi + i);
 706                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 707                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 708 
 709                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
 710                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
 711 
 712                 dp[0    ] = FROM_S32(d0);
 713                 dp[chan1] = FROM_S32(d1);
 714 
 715                 buffd[i    ] = 0.0;
 716                 buffd[i + 1] = 0.0;
 717 
 718                 sp += chan2;
 719                 dp += chan2;
 720               }
 721             }
 722 
 723           } else if (kw == 5) {
 724 
 725             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 726             p5 = buff[3];
 727 
 728             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 729             k4 = pk[4];
 730 
 731             if (l < (n - 1) || off < m) {
 732 #ifdef __SUNPRO_C
 733 #pragma pipeloop(0)
 734 #endif /* __SUNPRO_C */
 735               for (i = 0; i <= (wid - 2); i += 2) {
 736                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 737 
 738                 p4 = buff[i + 4]; p5 = buff[i + 5];
 739 
 740                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
 741                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
 742               }
 743 
 744             } else {
 745 #ifdef __SUNPRO_C
 746 #pragma pipeloop(0)
 747 #endif /* __SUNPRO_C */
 748               for (i = 0; i <= (wid - 2); i += 2) {
 749                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 750 
 751                 p4 = buff[i + 4]; p5 = buff[i + 5];
 752 
 753                 LOAD_BUFF(buffi);
 754 
 755                 dd.d64 = *(FTYPE   *)(buffi + i);
 756                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 757                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 758 
 759                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
 760                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
 761 
 762                 dp[0    ] = FROM_S32(d0);
 763                 dp[chan1] = FROM_S32(d1);
 764 
 765                 buffd[i    ] = 0.0;
 766                 buffd[i + 1] = 0.0;
 767 
 768                 sp += chan2;
 769                 dp += chan2;
 770               }
 771             }
 772 
 773           } else if (kw == 4) {
 774 
 775             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 776 
 777             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 778 
 779             if (l < (n - 1) || off < m) {
 780 #ifdef __SUNPRO_C
 781 #pragma pipeloop(0)
 782 #endif /* __SUNPRO_C */
 783               for (i = 0; i <= (wid - 2); i += 2) {
 784                 p0 = p2; p1 = p3; p2 = p4;
 785 
 786                 p3 = buff[i + 3]; p4 = buff[i + 4];
 787 
 788                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 789                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 790               }
 791 
 792             } else {
 793 #ifdef __SUNPRO_C
 794 #pragma pipeloop(0)
 795 #endif /* __SUNPRO_C */
 796               for (i = 0; i <= (wid - 2); i += 2) {
 797                 p0 = p2; p1 = p3; p2 = p4;
 798 
 799                 p3 = buff[i + 3]; p4 = buff[i + 4];
 800 
 801                 LOAD_BUFF(buffi);
 802 
 803                 dd.d64 = *(FTYPE   *)(buffi + i);
 804                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 805                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 806 
 807                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 808                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 809 
 810                 dp[0    ] = FROM_S32(d0);
 811                 dp[chan1] = FROM_S32(d1);
 812 
 813                 buffd[i    ] = 0.0;
 814                 buffd[i + 1] = 0.0;
 815 
 816                 sp += chan2;
 817                 dp += chan2;
 818               }
 819             }
 820 
 821           } else if (kw == 3) {
 822 
 823             p2 = buff[0]; p3 = buff[1];
 824             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 825 
 826             if (l < (n - 1) || off < m) {
 827 #ifdef __SUNPRO_C
 828 #pragma pipeloop(0)
 829 #endif /* __SUNPRO_C */
 830               for (i = 0; i <= (wid - 2); i += 2) {
 831                 p0 = p2; p1 = p3;
 832 
 833                 p2 = buff[i + 2]; p3 = buff[i + 3];
 834 
 835                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
 836                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
 837               }
 838 
 839             } else {
 840 #ifdef __SUNPRO_C
 841 #pragma pipeloop(0)
 842 #endif /* __SUNPRO_C */
 843               for (i = 0; i <= (wid - 2); i += 2) {
 844                 p0 = p2; p1 = p3;
 845 
 846                 p2 = buff[i + 2]; p3 = buff[i + 3];
 847 
 848                 LOAD_BUFF(buffi);
 849 
 850                 dd.d64 = *(FTYPE   *)(buffi + i);
 851                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 852                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 853 
 854                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 855                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 856 
 857                 dp[0    ] = FROM_S32(d0);
 858                 dp[chan1] = FROM_S32(d1);
 859 
 860                 buffd[i    ] = 0.0;
 861                 buffd[i + 1] = 0.0;
 862 
 863                 sp += chan2;
 864                 dp += chan2;
 865               }
 866             }
 867 
 868           } else /* if (kw == 2) */ {
 869 
 870             p2 = buff[0];
 871             k0 = pk[0]; k1 = pk[1];
 872 
 873             if (l < (n - 1) || off < m) {
 874 #ifdef __SUNPRO_C
 875 #pragma pipeloop(0)
 876 #endif /* __SUNPRO_C */
 877               for (i = 0; i <= (wid - 2); i += 2) {
 878                 p0 = p2;
 879 
 880                 p1 = buff[i + 1]; p2 = buff[i + 2];
 881 
 882                 buffd[i    ] += p0*k0 + p1*k1;
 883                 buffd[i + 1] += p1*k0 + p2*k1;
 884               }
 885 
 886             } else {
 887 #ifdef __SUNPRO_C
 888 #pragma pipeloop(0)
 889 #endif /* __SUNPRO_C */
 890               for (i = 0; i <= (wid - 2); i += 2) {
 891                 p0 = p2;
 892 
 893                 p1 = buff[i + 1]; p2 = buff[i + 2];
 894 
 895                 LOAD_BUFF(buffi);
 896 
 897                 dd.d64 = *(FTYPE   *)(buffi + i);
 898                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 899                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 900 
 901                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 902                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 903 
 904                 dp[0    ] = FROM_S32(d0);
 905                 dp[chan1] = FROM_S32(d1);
 906 
 907                 buffd[i    ] = 0.0;
 908                 buffd[i + 1] = 0.0;
 909 
 910                 sp += chan2;
 911                 dp += chan2;
 912               }
 913             }
 914           }
 915 
 916           pk += kw;
 917         }
 918       }
 919 
 920       /* last pixels */
           /* i, sp and dp continue from the final group above.  The paired
              loops leave one column over when wid is odd; it is computed
              directly from all m*n taps rather than from buffd. */
 921       for (; i < wid; i++) {
 922         FTYPE    *pk = k, s = 0;
 923         mlib_s32 x, d0;
 924 
 925         for (l = 0; l < n; l++) {
 926           FTYPE    *buff = buffc[l] + i;
 927 
 928           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
 929         }
 930 
 931         d0 = D2I(s);
 932         dp[0] = FROM_S32(d0);
 933 
 934         buffn[i + dx_l] = (FTYPE)sp[0];
 935 
 936         sp += chan1;
 937         dp += chan1;
 938       }
 939 
           /* copy the rest of the source row into buffn ... */
 940       for (; i < swid; i++) {
 941         buffn[i + dx_l] = (FTYPE)sp[0];
 942         sp += chan1;
 943       }
 944 
           /* ... and replicate its first and last samples into the margins */
 945       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
 946       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
 947 
 948       /* next line */
 949 
           /* sl stops advancing once j reaches hgt - dy_b - 2, so the same
              source row keeps being loaded for the remaining output rows */
 950       if (j < hgt - dy_b - 2) sl += sll;
 951       dl += dll;
 952 
           /* rotate the window over the n + 1 row buffers */
 953       buff_ind++;
 954 
 955       if (buff_ind >= n + 1) buff_ind = 0;
 956     }
 957   }
 958 
 959   FREE_AND_RETURN_STATUS
 960 }
 961 
 962 /***************************************************************/
 963 #ifndef __sparc /* for x86, using integer multiplies is faster */
 964 
 965 #define STORE_RES(res, x)                                       \
 966   x >>= shift2;                                                 \
 967   CLAMP_STORE(res, x)
 968 
 969 mlib_status CONV_FUNC_MxN_I
 970 {
 971   DTYPE    *adr_src, *sl, *sp = NULL;
 972   DTYPE    *adr_dst, *dl, *dp = NULL;
 973   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 974   mlib_s32 *pbuff = buff;
 975   mlib_s32 **buffs = buffs_arr, *buffd;
 976   mlib_s32 l, off, kw, bsize, buff_ind;
 977   mlib_s32 d0, d1, shift1, shift2;
 978   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
 979   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
 980   mlib_s32 wid, hgt, sll, dll;
 981   mlib_s32 nchannel, chan1;
 982   mlib_s32 i, j, c, swid;
 983   mlib_s32 chan2;
 984   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
 985   GET_SRC_DST_PARAMETERS(DTYPE);
 986 
 987 #if IMG_TYPE != 1
 988   shift1 = 16;
 989 #else
 990   shift1 = 8;
 991 #endif /* IMG_TYPE != 1 */
 992   shift2 = scale - shift1;
 993 
 994   chan1 = nchannel;
 995   chan2 = chan1 + chan1;
 996 
 997   swid = wid + (m - 1);
 998 
 999   bsize = (n + 2)*swid;
1000 
1001   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1002     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
1003 
1004     if (pbuff == NULL) return MLIB_FAILURE;
1005     buffs = (mlib_s32 **)(pbuff + bsize);
1006   }
1007 
1008   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
1009   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1010   buffd = buffs[n] + swid;
1011 
1012   if (m*n > MAX_N*MAX_N) {
1013     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
1014 
1015     if (k == NULL) {
1016       if (pbuff != buff) mlib_free(pbuff);
1017       return MLIB_FAILURE;
1018     }
1019   }
1020 
1021   for (i = 0; i < m*n; i++) {
1022     k[i] = kernel[i] >> shift1;
1023   }
1024 
1025   swid -= (dx_l + dx_r);
1026 
1027   for (c = 0; c < nchannel; c++) {
1028     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1029 
1030     sl = adr_src + c;
1031     dl = adr_dst + c;
1032 
1033     for (l = 0; l < n; l++) {
1034       mlib_s32  *buff = buffs[l];
1035 
1036       for (i = 0; i < dx_l; i++) {
1037         buff[i] = (mlib_s32)sl[0];
1038       }
1039 
1040 #ifdef __SUNPRO_C
1041 #pragma pipeloop(0)
1042 #endif /* __SUNPRO_C */
1043       for (i = 0; i < swid; i++) {
1044         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
1045       }
1046 
1047       for (i = 0; i < dx_r; i++) {
1048         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
1049       }
1050 
1051       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
1052     }
1053 
1054     buff_ind = 0;
1055 
1056 #ifdef __SUNPRO_C
1057 #pragma pipeloop(0)
1058 #endif /* __SUNPRO_C */
1059     for (i = 0; i < wid; i++) buffd[i] = 0;
1060 
1061     for (j = 0; j < hgt; j++) {
1062       mlib_s32 **buffc = buffs + buff_ind;
1063       mlib_s32 *buffn = buffc[n];
1064       mlib_s32 *pk = k;
1065 
1066       for (l = 0; l < n; l++) {
1067         mlib_s32  *buff_l = buffc[l];
1068 
1069         for (off = 0; off < m;) {
1070           mlib_s32 *buff = buff_l + off;
1071 
1072           sp = sl;
1073           dp = dl;
1074 
1075           kw = m - off;
1076 
1077           if (kw > 2*MAX_KER) kw = MAX_KER; else
1078             if (kw > MAX_KER) kw = kw/2;
1079           off += kw;
1080 
1081           if (kw == 7) {
1082 
1083             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1084             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1085 
1086             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1087             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1088 
1089             if (l < (n - 1) || off < m) {
1090 #ifdef __SUNPRO_C
1091 #pragma pipeloop(0)
1092 #endif /* __SUNPRO_C */
1093               for (i = 0; i <= (wid - 2); i += 2) {
1094                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1095 
1096                 p6 = buff[i + 6]; p7 = buff[i + 7];
1097 
1098                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1099                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1100               }
1101 
1102             } else {
1103 #ifdef __SUNPRO_C
1104 #pragma pipeloop(0)
1105 #endif /* __SUNPRO_C */
1106               for (i = 0; i <= (wid - 2); i += 2) {
1107                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1108 
1109                 p6 = buff[i + 6]; p7 = buff[i + 7];
1110 
1111                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1112                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1113 
1114                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1115                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1116 
1117                 STORE_RES(dp[0    ], d0);
1118                 STORE_RES(dp[chan1], d1);
1119 
1120                 buffd[i    ] = 0;
1121                 buffd[i + 1] = 0;
1122 
1123                 sp += chan2;
1124                 dp += chan2;
1125               }
1126             }
1127 
1128           } else if (kw == 6) {
1129 
1130             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1131             p5 = buff[3]; p6 = buff[4];
1132 
1133             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1134             k4 = pk[4]; k5 = pk[5];
1135 
1136             if (l < (n - 1) || off < m) {
1137 #ifdef __SUNPRO_C
1138 #pragma pipeloop(0)
1139 #endif /* __SUNPRO_C */
1140               for (i = 0; i <= (wid - 2); i += 2) {
1141                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1142 
1143                 p5 = buff[i + 5]; p6 = buff[i + 6];
1144 
1145                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1146                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1147               }
1148 
1149             } else {
1150 #ifdef __SUNPRO_C
1151 #pragma pipeloop(0)
1152 #endif /* __SUNPRO_C */
1153               for (i = 0; i <= (wid - 2); i += 2) {
1154                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1155 
1156                 p5 = buff[i + 5]; p6 = buff[i + 6];
1157 
1158                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1159                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1160 
1161                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1162                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1163 
1164                 STORE_RES(dp[0    ], d0);
1165                 STORE_RES(dp[chan1], d1);
1166 
1167                 buffd[i    ] = 0;
1168                 buffd[i + 1] = 0;
1169 
1170                 sp += chan2;
1171                 dp += chan2;
1172               }
1173             }
1174 
1175           } else if (kw == 5) {
1176 
1177             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1178             p5 = buff[3];
1179 
1180             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1181             k4 = pk[4];
1182 
1183             if (l < (n - 1) || off < m) {
1184 #ifdef __SUNPRO_C
1185 #pragma pipeloop(0)
1186 #endif /* __SUNPRO_C */
1187               for (i = 0; i <= (wid - 2); i += 2) {
1188                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1189 
1190                 p4 = buff[i + 4]; p5 = buff[i + 5];
1191 
1192                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1193                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1194               }
1195 
1196             } else {
1197 #ifdef __SUNPRO_C
1198 #pragma pipeloop(0)
1199 #endif /* __SUNPRO_C */
1200               for (i = 0; i <= (wid - 2); i += 2) {
1201                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1202 
1203                 p4 = buff[i + 4]; p5 = buff[i + 5];
1204 
1205                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1206                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1207 
1208                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1209                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1210 
1211                 STORE_RES(dp[0    ], d0);
1212                 STORE_RES(dp[chan1], d1);
1213 
1214                 buffd[i    ] = 0;
1215                 buffd[i + 1] = 0;
1216 
1217                 sp += chan2;
1218                 dp += chan2;
1219               }
1220             }
1221 
1222           } else if (kw == 4) {
1223 
1224             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1225 
1226             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1227 
1228             if (l < (n - 1) || off < m) {
1229 #ifdef __SUNPRO_C
1230 #pragma pipeloop(0)
1231 #endif /* __SUNPRO_C */
1232               for (i = 0; i <= (wid - 2); i += 2) {
1233                 p0 = p2; p1 = p3; p2 = p4;
1234 
1235                 p3 = buff[i + 3]; p4 = buff[i + 4];
1236 
1237                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1238                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1239               }
1240 
1241             } else {
1242 #ifdef __SUNPRO_C
1243 #pragma pipeloop(0)
1244 #endif /* __SUNPRO_C */
1245               for (i = 0; i <= (wid - 2); i += 2) {
1246                 p0 = p2; p1 = p3; p2 = p4;
1247 
1248                 p3 = buff[i + 3]; p4 = buff[i + 4];
1249 
1250                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1251                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1252 
1253                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1254                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1255 
1256                 STORE_RES(dp[0    ], d0);
1257                 STORE_RES(dp[chan1], d1);
1258 
1259                 buffd[i    ] = 0;
1260                 buffd[i + 1] = 0;
1261 
1262                 sp += chan2;
1263                 dp += chan2;
1264               }
1265             }
1266 
1267           } else if (kw == 3) {
1268 
1269             p2 = buff[0]; p3 = buff[1];
1270             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1271 
1272             if (l < (n - 1) || off < m) {
1273 #ifdef __SUNPRO_C
1274 #pragma pipeloop(0)
1275 #endif /* __SUNPRO_C */
1276               for (i = 0; i <= (wid - 2); i += 2) {
1277                 p0 = p2; p1 = p3;
1278 
1279                 p2 = buff[i + 2]; p3 = buff[i + 3];
1280 
1281                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1282                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1283               }
1284 
1285             } else {
1286 #ifdef __SUNPRO_C
1287 #pragma pipeloop(0)
1288 #endif /* __SUNPRO_C */
1289               for (i = 0; i <= (wid - 2); i += 2) {
1290                 p0 = p2; p1 = p3;
1291 
1292                 p2 = buff[i + 2]; p3 = buff[i + 3];
1293 
1294                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1295                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1296 
1297                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1298                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1299 
1300                 STORE_RES(dp[0    ], d0);
1301                 STORE_RES(dp[chan1], d1);
1302 
1303                 buffd[i    ] = 0;
1304                 buffd[i + 1] = 0;
1305 
1306                 sp += chan2;
1307                 dp += chan2;
1308               }
1309             }
1310 
1311           } else if (kw == 2) {
1312 
1313             p2 = buff[0];
1314             k0 = pk[0]; k1 = pk[1];
1315 
1316             if (l < (n - 1) || off < m) {
1317 #ifdef __SUNPRO_C
1318 #pragma pipeloop(0)
1319 #endif /* __SUNPRO_C */
1320               for (i = 0; i <= (wid - 2); i += 2) {
1321                 p0 = p2;
1322 
1323                 p1 = buff[i + 1]; p2 = buff[i + 2];
1324 
1325                 buffd[i    ] += p0*k0 + p1*k1;
1326                 buffd[i + 1] += p1*k0 + p2*k1;
1327               }
1328 
1329             } else {
1330 #ifdef __SUNPRO_C
1331 #pragma pipeloop(0)
1332 #endif /* __SUNPRO_C */
1333               for (i = 0; i <= (wid - 2); i += 2) {
1334                 p0 = p2;
1335 
1336                 p1 = buff[i + 1]; p2 = buff[i + 2];
1337 
1338                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1339                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1340 
1341                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1342                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1343 
1344                 STORE_RES(dp[0    ], d0);
1345                 STORE_RES(dp[chan1], d1);
1346 
1347                 buffd[i    ] = 0;
1348                 buffd[i + 1] = 0;
1349 
1350                 sp += chan2;
1351                 dp += chan2;
1352               }
1353             }
1354 
1355           } else /* kw == 1 */{
1356 
1357             k0 = pk[0];
1358 
1359             if (l < (n - 1) || off < m) {
1360 #ifdef __SUNPRO_C
1361 #pragma pipeloop(0)
1362 #endif /* __SUNPRO_C */
1363               for (i = 0; i <= (wid - 2); i += 2) {
1364                 p0 = buff[i]; p1 = buff[i + 1];
1365 
1366                 buffd[i    ] += p0*k0;
1367                 buffd[i + 1] += p1*k0;
1368               }
1369 
1370             } else {
1371 #ifdef __SUNPRO_C
1372 #pragma pipeloop(0)
1373 #endif /* __SUNPRO_C */
1374               for (i = 0; i <= (wid - 2); i += 2) {
1375                 p0 = buff[i]; p1 = buff[i + 1];
1376 
1377                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1378                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1379 
1380                 d0 = (p0*k0 + buffd[i    ]);
1381                 d1 = (p1*k0 + buffd[i + 1]);
1382 
1383                 STORE_RES(dp[0    ], d0);
1384                 STORE_RES(dp[chan1], d1);
1385 
1386                 buffd[i    ] = 0;
1387                 buffd[i + 1] = 0;
1388 
1389                 sp += chan2;
1390                 dp += chan2;
1391               }
1392             }
1393           }
1394 
1395           pk += kw;
1396         }
1397       }
1398 
1399       /* last pixels */
1400       for (; i < wid; i++) {
1401         mlib_s32 *pk = k, x, s = 0;
1402 
1403         for (l = 0; l < n; l++) {
1404           mlib_s32 *buff = buffc[l] + i;
1405 
1406           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1407         }
1408 
1409         STORE_RES(dp[0], s);
1410 
1411         buffn[i + dx_l] = (mlib_s32)sp[0];
1412 
1413         sp += chan1;
1414         dp += chan1;
1415       }
1416 
1417       for (; i < swid; i++) {
1418         buffn[i + dx_l] = (mlib_s32)sp[0];
1419         sp += chan1;
1420       }
1421 
1422       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1423       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1424 
1425       /* next line */
1426 
1427       if (j < hgt - dy_b - 2) sl += sll;
1428       dl += dll;
1429 
1430       buff_ind++;
1431 
1432       if (buff_ind >= n + 1) buff_ind = 0;
1433     }
1434   }
1435 
1436   if (pbuff != buff) mlib_free(pbuff);
1437   if (k != k_locl) mlib_free(k);
1438 
1439   return MLIB_SUCCESS;
1440 }
1441 
1442 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1443 
1444 /***************************************************************/