1 /*
   2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
  30  *   MLIB_EDGE_SRC_EXTEND mask
  31  */
  32 
  33 #include "mlib_image.h"
  34 #include "mlib_ImageConv.h"
  35 #include "mlib_c_ImageConv.h"
  36 
/*
 * This define switches between functions of different data types:
 *   1 - mlib_u8, 2 - mlib_s16, 3 - mlib_u16 (see DTYPE in each branch).
 *
 * Every branch defines the same macro set:
 *   DTYPE            pixel type of the source and destination images
 *   CONV_FUNC*       function name plus parameter list (PARAM / PARAM_MxN)
 *                    of the entry points; the _I variants name the
 *                    integer-arithmetic versions
 *   DSCALE           initial value of the factor the kernel coefficients
 *                    are multiplied by before the convolution
 *   FROM_S32(x)      turns the 32-bit value produced by D2I into a DTYPE
 *                    value by dropping the low bits; for the unsigned
 *                    types the XOR additionally flips the top result bit
 *   S64TOS32(x)      applied to the low operand when LOAD_BUFF packs two
 *                    pixels into one 64-bit word; the signed type masks to
 *                    32 bits so sign extension cannot reach the upper half
 *   SAT_OFF          text appended to the operand inside D2I: empty for
 *                    the signed type, "minus 2^31" for the unsigned types
 */

#define IMG_TYPE 2

/***************************************************************/
#if IMG_TYPE == 1

#define DTYPE             mlib_u8
#define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
#define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
#define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
#define DSCALE            (1 << 24)
#define FROM_S32(x)       (((x) >> 24) ^ 128)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#elif IMG_TYPE == 2

#define DTYPE             mlib_s16
#define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
#define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
#define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
#define DSCALE            65536.0
#define FROM_S32(x)       ((x) >> 16)
#define S64TOS32(x)       ((x) & 0xffffffff)
#define SAT_OFF

#elif IMG_TYPE == 3

#define DTYPE             mlib_u16
#define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
#define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
#define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
#define DSCALE            65536.0
#define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#endif /* IMG_TYPE == 1 */
  81 
/***************************************************************/
/*
 * Parameter list shared by the fixed-size kernel entry points
 * (CONV_FUNC and CONV_FUNC_I).
 * NOTE(review): none of those functions is visible in this part of the
 * file; dx_l/dx_r/dy_t/dy_b and cmask presumably have the same meaning
 * as in PARAM_MxN below - confirm against the implementations.
 */
#define PARAM                                                   \
  mlib_image       *dst,                                        \
  const mlib_image *src,                                        \
  mlib_s32         dx_l,                                        \
  mlib_s32         dx_r,                                        \
  mlib_s32         dy_t,                                        \
  mlib_s32         dy_b,                                        \
  const mlib_s32   *kern,                                       \
  mlib_s32         scalef_expon,                                \
  mlib_s32         cmask

/***************************************************************/
/*
 * Parameter list of the general M x N entry points
 * (CONV_FUNC_MxN and CONV_FUNC_MxN_I):
 *   dst, src     destination and source images
 *   kernel       m*n integer kernel coefficients
 *   m, n         kernel width and height
 *   dx_l, dx_r   number of columns replicated at the left / right edge
 *   dy_t, dy_b   number of rows replicated at the top / bottom edge
 *   scale        the coefficients are divided by 2^scale
 *   cmask        channel mask; bit (nchannel - 1 - c) enables channel c
 */
#define PARAM_MxN                                               \
  mlib_image       *dst,                                        \
  const mlib_image *src,                                        \
  const mlib_s32   *kernel,                                     \
  mlib_s32         m,                                           \
  mlib_s32         n,                                           \
  mlib_s32         dx_l,                                        \
  mlib_s32         dx_r,                                        \
  mlib_s32         dy_t,                                        \
  mlib_s32         dy_b,                                        \
  mlib_s32         scale,                                       \
  mlib_s32         cmask
 107 
/***************************************************************/
/* Floating-point type used for the scaled kernel and all row buffers. */
#define FTYPE mlib_d64

#ifndef MLIB_USE_FTOI_CLAMPING

/*
 * Saturating conversion of a floating-point value to mlib_s32: the range
 * checks are done before the cast, so an out-of-range value never reaches
 * the cast itself. Note that x is evaluated up to three times.
 */
#define CLAMP_S32(x)                                            \
  (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))

#else

/*
 * Plain cast.
 * NOTE(review): presumably selected on platforms whose float-to-int
 * conversion already saturates - confirm where MLIB_USE_FTOI_CLAMPING
 * is defined.
 */
#define CLAMP_S32(x) ((mlib_s32)(x))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/*
 * Converts an accumulated sum to a clamped 32-bit integer. SAT_OFF is
 * pasted textually after the operand, so for the unsigned pixel types
 * 2^31 is subtracted before clamping; for the signed type nothing is added.
 */
#define D2I(x) CLAMP_S32((x) SAT_OFF)
 124 
/***************************************************************/
/*
 * STORE2 writes two results to dp[0] and dp[chan1]; dp and chan1 are
 * taken from the enclosing scope. On little-endian targets the two
 * arguments are stored in swapped order.
 * NOTE(review): presumably the callers obtain res0/res1 from a packed
 * 64-bit value whose halves are endian dependent - confirm at the call
 * sites (none is visible in this part of the file).
 */
#ifdef _LITTLE_ENDIAN

#define STORE2(res0, res1)                                      \
  dp[0    ] = res1;                                             \
  dp[chan1] = res0

#else

#define STORE2(res0, res1)                                      \
  dp[0    ] = res0;                                             \
  dp[chan1] = res1

#endif /* _LITTLE_ENDIAN */

/***************************************************************/
/*
 * LOAD_BUFF copies the two pixels sp[0] and sp[chan1] into buff[i] and
 * buff[i + 1]; sp, chan1 and i are taken from the enclosing scope.
 *
 * With _NO_LONGLONG the two elements are assigned separately. Otherwise
 * both pixels are packed into one mlib_s64 and written with a single
 * 64-bit store through a cast pointer; the operand order is chosen per
 * endianness so that sp[0] ends up in the first 32-bit half in memory.
 * S64TOS32 keeps the low operand from disturbing the upper half.
 * The 64-bit variants require buff + i to be suitably aligned for
 * mlib_s64.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff)                                         \
  buff[i    ] = sp[0];                                          \
  buff[i + 1] = sp[chan1]

#else /* _NO_LONGLONG */

#ifdef _LITTLE_ENDIAN

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])

#else /* _LITTLE_ENDIAN */

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])

#endif /* _LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */
 161 
/***************************************************************/
/*
 * Overlay of one mlib_d64 with two mlib_s32 values. Used to read a pair
 * of 32-bit integers (i0 first in memory, then i1) that was fetched as a
 * single 64-bit quantity.
 */
typedef union {
  mlib_d64 d64;
  struct {
    mlib_s32 i0;
    mlib_s32 i1;
  } i32s;
} d64_2x32;
 170 
/***************************************************************/
/*
 * Fills the locals hgt, wid, nchannel, sll, dll, adr_src and adr_dst of
 * the enclosing function (which must also have src and dst in scope).
 * Height, width and channel count are read from src only. The strides
 * returned by mlib_ImageGetStride are divided by sizeof(type), so sll
 * and dll are expressed in elements of 'type'.
 */
#define GET_SRC_DST_PARAMETERS(type)                            \
  hgt = mlib_ImageGetHeight(src);                               \
  wid = mlib_ImageGetWidth(src);                                \
  nchannel = mlib_ImageGetChannels(src);                        \
  sll = mlib_ImageGetStride(src) / sizeof(type);                \
  dll = mlib_ImageGetStride(dst) / sizeof(type);                \
  adr_src = (type *)mlib_ImageGetData(src);                     \
  adr_dst = (type *)mlib_ImageGetData(dst)
 180 
/***************************************************************/
/*
 * CLAMP_STORE(dst, val) stores the integer val into dst, saturated to the
 * range of the current pixel type. Only defined when __sparc is not set.
 *
 * All three variants expand to a bare if/else statement (not wrapped in
 * do { } while (0)) and evaluate val more than once, so they must be used
 * as a stand-alone statement with a side-effect-free val.
 */
#ifndef __sparc
#if IMG_TYPE == 1

/*
 * Test for the presence of any "1" bit in bits
 * 8 to 31 of val. If present, then val is either
 * negative or >255. If over/underflows of 8 bits
 * are uncommon, then this technique can be a win,
 * since only a single test, rather than two, is
 * necessary to determine if clamping is needed.
 * On the other hand, if over/underflows are common,
 * it adds an extra test.
 */
#define CLAMP_STORE(dst, val)                                   \
  if (val & 0xffffff00) {                                       \
    if (val < MLIB_U8_MIN)                                      \
      dst = MLIB_U8_MIN;                                        \
    else                                                        \
      dst = MLIB_U8_MAX;                                        \
  } else {                                                      \
    dst = (mlib_u8)val;                                         \
  }

#elif IMG_TYPE == 2

/* Two-sided range check against the mlib_s16 limits. */
#define CLAMP_STORE(dst, val)                                   \
  if (val >= MLIB_S16_MAX)                                      \
    dst = MLIB_S16_MAX;                                         \
  else if (val <= MLIB_S16_MIN)                                 \
    dst = MLIB_S16_MIN;                                         \
  else                                                          \
    dst = (mlib_s16)val

#elif IMG_TYPE == 3

/* Two-sided range check against the mlib_u16 limits. */
#define CLAMP_STORE(dst, val)                                   \
  if (val >= MLIB_U16_MAX)                                      \
    dst = MLIB_U16_MAX;                                         \
  else if (val <= MLIB_U16_MIN)                                 \
    dst = MLIB_U16_MIN;                                         \
  else                                                          \
    dst = (mlib_u16)val

#endif /* IMG_TYPE == 1 */
#endif /* __sparc */
 227 
/***************************************************************/
/* Largest number of kernel taps of one row handled in a single pass. */
#define MAX_KER   7
/* Largest kernel height for which the on-stack row-pointer table suffices. */
#define MAX_N    15
/* Element count of the on-stack work buffers; larger needs use mlib_malloc. */
#define BUFF_SIZE   1600
/* Byte budget used by the 1xN routine to choose how many rows form one band. */
#define CACHE_SIZE  (64*1024)
 233 
/*
 * Convolution with a one-column kernel (1 x n) with replicated edge rows.
 * Called by the M x N entry point when m == 1.
 *
 *   dst, src    destination and source images; width, height and channel
 *               count are read from src
 *   k           n kernel coefficients (already scaled by the caller)
 *   n           kernel height
 *   dy_t, dy_b  number of rows replicated above the first / below the
 *               last source row
 *   cmask       channel mask; bit (nchannel - 1 - c) enables channel c
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the work buffer cannot be
 * allocated.
 *
 * The image is processed in bands of at most max_hsize rows. For every
 * column of a band the needed source pixels are copied into sbuff
 * (including the replicated edge rows); the kernel is then applied four
 * taps at a time with partial sums kept in buffd, and the last 1..4 taps
 * are combined with the conversion to DTYPE and the store to dst.
 */
static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
                                         const mlib_image *src,
                                         const mlib_d64   *k,
                                         mlib_s32         n,
                                         mlib_s32         dy_t,
                                         mlib_s32         dy_b,
                                         mlib_s32         cmask)
{
  DTYPE    *adr_src, *sl;
  DTYPE    *adr_dst, *dl, *dp;
  FTYPE    buff[BUFF_SIZE];
  FTYPE    *buffd;
  FTYPE    *pbuff = buff;
  const FTYPE    *pk;
  FTYPE    k0, k1, k2, k3;
  FTYPE    p0, p1, p2, p3, p4;
  FTYPE    *sbuff;
  mlib_s32 l, k_off, off, bsize;
  mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
  mlib_s32 d0, d1, ii;
  mlib_s32 wid, hgt, sll, dll;
  mlib_s32 nchannel;
  mlib_s32 i, j, c;
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* rows per band, derived from CACHE_SIZE and the source line stride */
  max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);

  if (max_hsize < 1) max_hsize = 1;
  if (max_hsize > hgt) max_hsize = hgt;

  /* a band of h output rows needs h + (n - 1) input rows */
  shgt = hgt + (n - 1);
  smax_hsize = max_hsize + (n - 1);

  /* room for sbuff (column copy) and buffd (partial sums), with slack */
  bsize = 2 * (smax_hsize + 1);

  if (bsize > BUFF_SIZE) {
    pbuff = mlib_malloc(sizeof(FTYPE)*bsize);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  sbuff = pbuff;
  buffd = sbuff + smax_hsize;

  /* from here on shgt is the number of real (non-replicated) source rows */
  shgt -= (dy_t + dy_b);
  k_off = 0;

  for (l = 0; l < hgt; l += hsize) {
    hsize = hgt - l;

    if (hsize > max_hsize) hsize = max_hsize;

    smax_hsize = hsize + (n - 1);

    for (c = 0; c < nchannel; c++) {
      /* skip channels that are not selected in cmask */
      if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

      sl = adr_src + c;
      dl = adr_dst + c;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i < hsize; i++) buffd[i] = 0.0;

      for (j = 0; j < wid; j++) {
        /* local cursor into sbuff; shadows the outer buff array */
        FTYPE    *buff = sbuff;

        /* rows above the image: replicate the first source row */
        for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
          sbuff[i - k_off] = (FTYPE)sl[0];
        }

        /* rows inside the image */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
        for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
          sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
        }

        /* rows below the image: replicate the last source row */
        for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
          sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
        }

        pk = k;

        /* all groups of four taps except the last 1..4 taps: accumulate only */
        for (off = 0; off < (n - 4); off += 4) {

          p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];

          /*
           * Two outputs per iteration; p0..p4 slide over the column.
           * For odd hsize the last iteration also updates buffd[hsize],
           * which lies inside the buffer and is never read back.
           */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i < hsize; i += 2) {
            p0 = p2; p1 = p3; p2 = p4;

            p3 = buff[i + 3]; p4 = buff[i + 4];

            buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
            buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
          }

          pk += 4;
          buff += 4;
        }

        dp = dl;
        /* remaining taps (1..4): finish the sums, convert, store, reset buffd */
        kh = n - off;

        if (kh == 4) {
          p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (hsize - 2); i += 2) {
            p0 = p2; p1 = p3; p2 = p4;

            p3 = buff[i + 3]; p4 = buff[i + 4];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            buffd[i    ] = 0.0;
            buffd[i + 1] = 0.0;

            dp += 2*dll;
          }

          /* odd band height: one output row is left */
          if (i < hsize) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = buff[i + 3];
            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
            dp[0] = FROM_S32(d0);
            buffd[i] = 0.0;
          }

        } else if (kh == 3) {

          p2 = buff[0]; p3 = buff[1];
          k0 = pk[0]; k1 = pk[1]; k2 = pk[2];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (hsize - 2); i += 2) {
            p0 = p2; p1 = p3;

            p2 = buff[i + 2]; p3 = buff[i + 3];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            buffd[i    ] = 0.0;
            buffd[i + 1] = 0.0;

            dp += 2*dll;
          }

          /* odd band height: one output row is left */
          if (i < hsize) {
            p0 = p2; p1 = p3;
            p2 = buff[i + 2];
            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
            dp[0] = FROM_S32(d0);

            buffd[i] = 0.0;
          }

        } else if (kh == 2) {

          p2 = buff[0];
          k0 = pk[0]; k1 = pk[1];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (hsize - 2); i += 2) {
            p0 = p2;

            p1 = buff[i + 1]; p2 = buff[i + 2];

            d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
            d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            buffd[i    ] = 0.0;
            buffd[i + 1] = 0.0;

            dp += 2*dll;
          }

          /* odd band height: one output row is left */
          if (i < hsize) {
            p0 = p2;
            p1 = buff[i + 1];
            d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
            dp[0] = FROM_S32(d0);

            buffd[i] = 0.0;
          }

        } else /* kh == 1 */{

          k0 = pk[0];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (hsize - 2); i += 2) {
            p0 = buff[i]; p1 = buff[i + 1];

            d0 = D2I(p0*k0 + buffd[i    ]);
            d1 = D2I(p1*k0 + buffd[i + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            buffd[i    ] = 0.0;
            buffd[i + 1] = 0.0;

            dp += 2*dll;
          }

          /* odd band height: one output row is left */
          if (i < hsize) {
            p0 = buff[i];
            d0 = D2I(p0*k0 + buffd[i]);
            dp[0] = FROM_S32(d0);

            buffd[i] = 0.0;
          }
        }

        /* next line */
        /* (advances to the next pixel column of the same channel) */
        sl += nchannel;
        dl += nchannel;
      }
    }

    /* next band: the source is addressed through k_off, dst by pointer */
    k_off += max_hsize;
    adr_dst += max_hsize*dll;
  }

  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
 488 
 489 /***************************************************************/
 490 mlib_status CONV_FUNC_MxN
 491 {
 492   DTYPE    *adr_src, *sl, *sp = NULL;
 493   DTYPE    *adr_dst, *dl, *dp = NULL;
 494   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 495   FTYPE    **buffs = buffs_arr, *buffd;
 496   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
 497   FTYPE    *pbuff = buff;
 498   FTYPE    k0, k1, k2, k3, k4, k5, k6;
 499   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
 500   mlib_s32 *buffi;
 501   mlib_s32 mn, l, off, kw, bsize, buff_ind;
 502   mlib_s32 d0, d1;
 503   mlib_s32 wid, hgt, sll, dll;
 504   mlib_s32 nchannel, chan1, chan2;
 505   mlib_s32 i, j, c, swid;
 506   d64_2x32 dd;
 507   mlib_status status = MLIB_SUCCESS;
 508 
 509   GET_SRC_DST_PARAMETERS(DTYPE);
 510 
 511   if (scale > 30) {
 512     fscale *= 1.0/(1 << 30);
 513     scale -= 30;
 514   }
 515 
 516   fscale /= (1 << scale);
 517 
 518   mn = m*n;
 519 
 520   if (mn > 256) {
 521     k = mlib_malloc(mn*sizeof(mlib_d64));
 522 
 523     if (k == NULL) return MLIB_FAILURE;
 524   }
 525 
 526   for (i = 0; i < mn; i++) {
 527     k[i] = kernel[i]*fscale;
 528   }
 529 
 530   if (m == 1) {
 531     status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
 532     FREE_AND_RETURN_STATUS
 533   }
 534 
 535   swid = wid + (m - 1);
 536 
 537   bsize = (n + 3)*swid;
 538 
 539   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 540     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
 541 
 542     if (pbuff == NULL) {
 543       status = MLIB_FAILURE;
 544       FREE_AND_RETURN_STATUS
 545     }
 546     buffs = (FTYPE   **)(pbuff + bsize);
 547   }
 548 
 549   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 550   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
 551   buffd = buffs[n] + swid;
 552   buffi = (mlib_s32*)(buffd + swid);
 553 
 554   chan1 = nchannel;
 555   chan2 = chan1 + chan1;
 556 
 557   swid -= (dx_l + dx_r);
 558 
 559   for (c = 0; c < nchannel; c++) {
 560     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 561 
 562     sl = adr_src + c;
 563     dl = adr_dst + c;
 564 
 565     for (l = 0; l < n; l++) {
 566       FTYPE    *buff = buffs[l];
 567 
 568       for (i = 0; i < dx_l; i++) {
 569         buff[i] = (FTYPE)sl[0];
 570       }
 571 
 572 #ifdef __SUNPRO_C
 573 #pragma pipeloop(0)
 574 #endif /* __SUNPRO_C */
 575       for (i = 0; i < swid; i++) {
 576         buff[i + dx_l] = (FTYPE)sl[i*chan1];
 577       }
 578 
 579       for (i = 0; i < dx_r; i++) {
 580         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
 581       }
 582 
 583       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
 584     }
 585 
 586     buff_ind = 0;
 587 
 588 #ifdef __SUNPRO_C
 589 #pragma pipeloop(0)
 590 #endif /* __SUNPRO_C */
 591     for (i = 0; i < wid; i++) buffd[i] = 0.0;
 592 
 593     for (j = 0; j < hgt; j++) {
 594       FTYPE    **buffc = buffs + buff_ind;
 595       FTYPE    *buffn = buffc[n];
 596       FTYPE    *pk = k;
 597 
 598       for (l = 0; l < n; l++) {
 599         FTYPE    *buff_l = buffc[l];
 600 
 601         for (off = 0; off < m;) {
 602           FTYPE    *buff = buff_l + off;
 603 
 604           kw = m - off;
 605 
 606           if (kw > 2*MAX_KER) kw = MAX_KER; else
 607             if (kw > MAX_KER) kw = kw/2;
 608           off += kw;
 609 
 610           sp = sl;
 611           dp = dl;
 612 
 613           if (kw == 7) {
 614 
 615             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 616             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
 617 
 618             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 619             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
 620 
 621             if (l < (n - 1) || off < m) {
 622 #ifdef __SUNPRO_C
 623 #pragma pipeloop(0)
 624 #endif /* __SUNPRO_C */
 625               for (i = 0; i <= (wid - 2); i += 2) {
 626                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 627 
 628                 p6 = buff[i + 6]; p7 = buff[i + 7];
 629 
 630                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
 631                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
 632               }
 633 
 634             } else {
 635 #ifdef __SUNPRO_C
 636 #pragma pipeloop(0)
 637 #endif /* __SUNPRO_C */
 638               for (i = 0; i <= (wid - 2); i += 2) {
 639                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 640 
 641                 p6 = buff[i + 6]; p7 = buff[i + 7];
 642 
 643                 LOAD_BUFF(buffi);
 644 
 645                 dd.d64 = *(FTYPE   *)(buffi + i);
 646                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 647                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 648 
 649                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
 650                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
 651 
 652                 dp[0    ] = FROM_S32(d0);
 653                 dp[chan1] = FROM_S32(d1);
 654 
 655                 buffd[i    ] = 0.0;
 656                 buffd[i + 1] = 0.0;
 657 
 658                 sp += chan2;
 659                 dp += chan2;
 660               }
 661             }
 662 
 663           } else if (kw == 6) {
 664 
 665             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 666             p5 = buff[3]; p6 = buff[4];
 667 
 668             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 669             k4 = pk[4]; k5 = pk[5];
 670 
 671             if (l < (n - 1) || off < m) {
 672 #ifdef __SUNPRO_C
 673 #pragma pipeloop(0)
 674 #endif /* __SUNPRO_C */
 675               for (i = 0; i <= (wid - 2); i += 2) {
 676                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 677 
 678                 p5 = buff[i + 5]; p6 = buff[i + 6];
 679 
 680                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
 681                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
 682               }
 683 
 684             } else {
 685 #ifdef __SUNPRO_C
 686 #pragma pipeloop(0)
 687 #endif /* __SUNPRO_C */
 688               for (i = 0; i <= (wid - 2); i += 2) {
 689                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 690 
 691                 p5 = buff[i + 5]; p6 = buff[i + 6];
 692 
 693                 LOAD_BUFF(buffi);
 694 
 695                 dd.d64 = *(FTYPE   *)(buffi + i);
 696                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 697                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 698 
 699                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
 700                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
 701 
 702                 dp[0    ] = FROM_S32(d0);
 703                 dp[chan1] = FROM_S32(d1);
 704 
 705                 buffd[i    ] = 0.0;
 706                 buffd[i + 1] = 0.0;
 707 
 708                 sp += chan2;
 709                 dp += chan2;
 710               }
 711             }
 712 
 713           } else if (kw == 5) {
 714 
 715             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 716             p5 = buff[3];
 717 
 718             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 719             k4 = pk[4];
 720 
 721             if (l < (n - 1) || off < m) {
 722 #ifdef __SUNPRO_C
 723 #pragma pipeloop(0)
 724 #endif /* __SUNPRO_C */
 725               for (i = 0; i <= (wid - 2); i += 2) {
 726                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 727 
 728                 p4 = buff[i + 4]; p5 = buff[i + 5];
 729 
 730                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
 731                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
 732               }
 733 
 734             } else {
 735 #ifdef __SUNPRO_C
 736 #pragma pipeloop(0)
 737 #endif /* __SUNPRO_C */
 738               for (i = 0; i <= (wid - 2); i += 2) {
 739                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 740 
 741                 p4 = buff[i + 4]; p5 = buff[i + 5];
 742 
 743                 LOAD_BUFF(buffi);
 744 
 745                 dd.d64 = *(FTYPE   *)(buffi + i);
 746                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 747                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 748 
 749                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
 750                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
 751 
 752                 dp[0    ] = FROM_S32(d0);
 753                 dp[chan1] = FROM_S32(d1);
 754 
 755                 buffd[i    ] = 0.0;
 756                 buffd[i + 1] = 0.0;
 757 
 758                 sp += chan2;
 759                 dp += chan2;
 760               }
 761             }
 762 
 763           } else if (kw == 4) {
 764 
 765             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 766 
 767             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 768 
 769             if (l < (n - 1) || off < m) {
 770 #ifdef __SUNPRO_C
 771 #pragma pipeloop(0)
 772 #endif /* __SUNPRO_C */
 773               for (i = 0; i <= (wid - 2); i += 2) {
 774                 p0 = p2; p1 = p3; p2 = p4;
 775 
 776                 p3 = buff[i + 3]; p4 = buff[i + 4];
 777 
 778                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 779                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 780               }
 781 
 782             } else {
 783 #ifdef __SUNPRO_C
 784 #pragma pipeloop(0)
 785 #endif /* __SUNPRO_C */
 786               for (i = 0; i <= (wid - 2); i += 2) {
 787                 p0 = p2; p1 = p3; p2 = p4;
 788 
 789                 p3 = buff[i + 3]; p4 = buff[i + 4];
 790 
 791                 LOAD_BUFF(buffi);
 792 
 793                 dd.d64 = *(FTYPE   *)(buffi + i);
 794                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 795                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 796 
 797                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 798                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 799 
 800                 dp[0    ] = FROM_S32(d0);
 801                 dp[chan1] = FROM_S32(d1);
 802 
 803                 buffd[i    ] = 0.0;
 804                 buffd[i + 1] = 0.0;
 805 
 806                 sp += chan2;
 807                 dp += chan2;
 808               }
 809             }
 810 
 811           } else if (kw == 3) {
 812 
 813             p2 = buff[0]; p3 = buff[1];
 814             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 815 
 816             if (l < (n - 1) || off < m) {
 817 #ifdef __SUNPRO_C
 818 #pragma pipeloop(0)
 819 #endif /* __SUNPRO_C */
 820               for (i = 0; i <= (wid - 2); i += 2) {
 821                 p0 = p2; p1 = p3;
 822 
 823                 p2 = buff[i + 2]; p3 = buff[i + 3];
 824 
 825                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
 826                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
 827               }
 828 
 829             } else {
 830 #ifdef __SUNPRO_C
 831 #pragma pipeloop(0)
 832 #endif /* __SUNPRO_C */
 833               for (i = 0; i <= (wid - 2); i += 2) {
 834                 p0 = p2; p1 = p3;
 835 
 836                 p2 = buff[i + 2]; p3 = buff[i + 3];
 837 
 838                 LOAD_BUFF(buffi);
 839 
 840                 dd.d64 = *(FTYPE   *)(buffi + i);
 841                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 842                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 843 
 844                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 845                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 846 
 847                 dp[0    ] = FROM_S32(d0);
 848                 dp[chan1] = FROM_S32(d1);
 849 
 850                 buffd[i    ] = 0.0;
 851                 buffd[i + 1] = 0.0;
 852 
 853                 sp += chan2;
 854                 dp += chan2;
 855               }
 856             }
 857 
 858           } else /* if (kw == 2) */ {
 859 
 860             p2 = buff[0];
 861             k0 = pk[0]; k1 = pk[1];
 862 
 863             if (l < (n - 1) || off < m) {
 864 #ifdef __SUNPRO_C
 865 #pragma pipeloop(0)
 866 #endif /* __SUNPRO_C */
 867               for (i = 0; i <= (wid - 2); i += 2) {
 868                 p0 = p2;
 869 
 870                 p1 = buff[i + 1]; p2 = buff[i + 2];
 871 
 872                 buffd[i    ] += p0*k0 + p1*k1;
 873                 buffd[i + 1] += p1*k0 + p2*k1;
 874               }
 875 
 876             } else {
 877 #ifdef __SUNPRO_C
 878 #pragma pipeloop(0)
 879 #endif /* __SUNPRO_C */
 880               for (i = 0; i <= (wid - 2); i += 2) {
 881                 p0 = p2;
 882 
 883                 p1 = buff[i + 1]; p2 = buff[i + 2];
 884 
 885                 LOAD_BUFF(buffi);
 886 
 887                 dd.d64 = *(FTYPE   *)(buffi + i);
 888                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 889                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 890 
 891                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 892                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 893 
 894                 dp[0    ] = FROM_S32(d0);
 895                 dp[chan1] = FROM_S32(d1);
 896 
 897                 buffd[i    ] = 0.0;
 898                 buffd[i + 1] = 0.0;
 899 
 900                 sp += chan2;
 901                 dp += chan2;
 902               }
 903             }
 904           }
 905 
 906           pk += kw;
 907         }
 908       }
 909 
 910       /* last pixels */
 911       for (; i < wid; i++) {
 912         FTYPE    *pk = k, s = 0;
 913         mlib_s32 x, d0;
 914 
 915         for (l = 0; l < n; l++) {
 916           FTYPE    *buff = buffc[l] + i;
 917 
 918           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
 919         }
 920 
 921         d0 = D2I(s);
 922         dp[0] = FROM_S32(d0);
 923 
 924         buffn[i + dx_l] = (FTYPE)sp[0];
 925 
 926         sp += chan1;
 927         dp += chan1;
 928       }
 929 
 930       for (; i < swid; i++) {
 931         buffn[i + dx_l] = (FTYPE)sp[0];
 932         sp += chan1;
 933       }
 934 
 935       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
 936       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
 937 
 938       /* next line */
 939 
 940       if (j < hgt - dy_b - 2) sl += sll;
 941       dl += dll;
 942 
 943       buff_ind++;
 944 
 945       if (buff_ind >= n + 1) buff_ind = 0;
 946     }
 947   }
 948 
 949   FREE_AND_RETURN_STATUS
 950 }
 951 
 952 /***************************************************************/
 953 #ifndef __sparc /* for x86, using integer multiplies is faster */
 954 
 955 #define STORE_RES(res, x)                                       \
 956   x >>= shift2;                                                 \
 957   CLAMP_STORE(res, x)
 958 
 959 mlib_status CONV_FUNC_MxN_I
 960 {
 961   DTYPE    *adr_src, *sl, *sp = NULL;
 962   DTYPE    *adr_dst, *dl, *dp = NULL;
 963   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 964   mlib_s32 *pbuff = buff;
 965   mlib_s32 **buffs = buffs_arr, *buffd;
 966   mlib_s32 l, off, kw, bsize, buff_ind;
 967   mlib_s32 d0, d1, shift1, shift2;
 968   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
 969   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
 970   mlib_s32 wid, hgt, sll, dll;
 971   mlib_s32 nchannel, chan1;
 972   mlib_s32 i, j, c, swid;
 973   mlib_s32 chan2;
 974   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
 975   GET_SRC_DST_PARAMETERS(DTYPE);
 976 
 977 #if IMG_TYPE != 1
 978   shift1 = 16;
 979 #else
 980   shift1 = 8;
 981 #endif /* IMG_TYPE != 1 */
 982   shift2 = scale - shift1;
 983 
 984   chan1 = nchannel;
 985   chan2 = chan1 + chan1;
 986 
 987   swid = wid + (m - 1);
 988 
 989   bsize = (n + 2)*swid;
 990 
 991   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 992     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
 993 
 994     if (pbuff == NULL) return MLIB_FAILURE;
 995     buffs = (mlib_s32 **)(pbuff + bsize);
 996   }
 997 
 998   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 999   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1000   buffd = buffs[n] + swid;
1001 
1002   if (m*n > MAX_N*MAX_N) {
1003     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
1004 
1005     if (k == NULL) {
1006       if (pbuff != buff) mlib_free(pbuff);
1007       return MLIB_FAILURE;
1008     }
1009   }
1010 
1011   for (i = 0; i < m*n; i++) {
1012     k[i] = kernel[i] >> shift1;
1013   }
1014 
1015   swid -= (dx_l + dx_r);
1016 
1017   for (c = 0; c < nchannel; c++) {
1018     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1019 
1020     sl = adr_src + c;
1021     dl = adr_dst + c;
1022 
1023     for (l = 0; l < n; l++) {
1024       mlib_s32  *buff = buffs[l];
1025 
1026       for (i = 0; i < dx_l; i++) {
1027         buff[i] = (mlib_s32)sl[0];
1028       }
1029 
1030 #ifdef __SUNPRO_C
1031 #pragma pipeloop(0)
1032 #endif /* __SUNPRO_C */
1033       for (i = 0; i < swid; i++) {
1034         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
1035       }
1036 
1037       for (i = 0; i < dx_r; i++) {
1038         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
1039       }
1040 
1041       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
1042     }
1043 
1044     buff_ind = 0;
1045 
1046 #ifdef __SUNPRO_C
1047 #pragma pipeloop(0)
1048 #endif /* __SUNPRO_C */
1049     for (i = 0; i < wid; i++) buffd[i] = 0;
1050 
1051     for (j = 0; j < hgt; j++) {
1052       mlib_s32 **buffc = buffs + buff_ind;
1053       mlib_s32 *buffn = buffc[n];
1054       mlib_s32 *pk = k;
1055 
1056       for (l = 0; l < n; l++) {
1057         mlib_s32  *buff_l = buffc[l];
1058 
1059         for (off = 0; off < m;) {
1060           mlib_s32 *buff = buff_l + off;
1061 
1062           sp = sl;
1063           dp = dl;
1064 
1065           kw = m - off;
1066 
1067           if (kw > 2*MAX_KER) kw = MAX_KER; else
1068             if (kw > MAX_KER) kw = kw/2;
1069           off += kw;
1070 
1071           if (kw == 7) {
1072 
1073             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1074             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1075 
1076             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1077             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1078 
1079             if (l < (n - 1) || off < m) {
1080 #ifdef __SUNPRO_C
1081 #pragma pipeloop(0)
1082 #endif /* __SUNPRO_C */
1083               for (i = 0; i <= (wid - 2); i += 2) {
1084                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1085 
1086                 p6 = buff[i + 6]; p7 = buff[i + 7];
1087 
1088                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1089                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1090               }
1091 
1092             } else {
1093 #ifdef __SUNPRO_C
1094 #pragma pipeloop(0)
1095 #endif /* __SUNPRO_C */
1096               for (i = 0; i <= (wid - 2); i += 2) {
1097                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1098 
1099                 p6 = buff[i + 6]; p7 = buff[i + 7];
1100 
1101                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1102                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1103 
1104                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1105                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1106 
1107                 STORE_RES(dp[0    ], d0);
1108                 STORE_RES(dp[chan1], d1);
1109 
1110                 buffd[i    ] = 0;
1111                 buffd[i + 1] = 0;
1112 
1113                 sp += chan2;
1114                 dp += chan2;
1115               }
1116             }
1117 
1118           } else if (kw == 6) {
1119 
1120             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1121             p5 = buff[3]; p6 = buff[4];
1122 
1123             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1124             k4 = pk[4]; k5 = pk[5];
1125 
1126             if (l < (n - 1) || off < m) {
1127 #ifdef __SUNPRO_C
1128 #pragma pipeloop(0)
1129 #endif /* __SUNPRO_C */
1130               for (i = 0; i <= (wid - 2); i += 2) {
1131                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1132 
1133                 p5 = buff[i + 5]; p6 = buff[i + 6];
1134 
1135                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1136                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1137               }
1138 
1139             } else {
1140 #ifdef __SUNPRO_C
1141 #pragma pipeloop(0)
1142 #endif /* __SUNPRO_C */
1143               for (i = 0; i <= (wid - 2); i += 2) {
1144                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1145 
1146                 p5 = buff[i + 5]; p6 = buff[i + 6];
1147 
1148                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1149                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1150 
1151                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1152                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1153 
1154                 STORE_RES(dp[0    ], d0);
1155                 STORE_RES(dp[chan1], d1);
1156 
1157                 buffd[i    ] = 0;
1158                 buffd[i + 1] = 0;
1159 
1160                 sp += chan2;
1161                 dp += chan2;
1162               }
1163             }
1164 
1165           } else if (kw == 5) {
1166 
1167             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1168             p5 = buff[3];
1169 
1170             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1171             k4 = pk[4];
1172 
1173             if (l < (n - 1) || off < m) {
1174 #ifdef __SUNPRO_C
1175 #pragma pipeloop(0)
1176 #endif /* __SUNPRO_C */
1177               for (i = 0; i <= (wid - 2); i += 2) {
1178                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1179 
1180                 p4 = buff[i + 4]; p5 = buff[i + 5];
1181 
1182                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1183                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1184               }
1185 
1186             } else {
1187 #ifdef __SUNPRO_C
1188 #pragma pipeloop(0)
1189 #endif /* __SUNPRO_C */
1190               for (i = 0; i <= (wid - 2); i += 2) {
1191                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1192 
1193                 p4 = buff[i + 4]; p5 = buff[i + 5];
1194 
1195                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1196                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1197 
1198                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1199                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1200 
1201                 STORE_RES(dp[0    ], d0);
1202                 STORE_RES(dp[chan1], d1);
1203 
1204                 buffd[i    ] = 0;
1205                 buffd[i + 1] = 0;
1206 
1207                 sp += chan2;
1208                 dp += chan2;
1209               }
1210             }
1211 
1212           } else if (kw == 4) {
1213 
1214             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1215 
1216             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1217 
1218             if (l < (n - 1) || off < m) {
1219 #ifdef __SUNPRO_C
1220 #pragma pipeloop(0)
1221 #endif /* __SUNPRO_C */
1222               for (i = 0; i <= (wid - 2); i += 2) {
1223                 p0 = p2; p1 = p3; p2 = p4;
1224 
1225                 p3 = buff[i + 3]; p4 = buff[i + 4];
1226 
1227                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1228                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1229               }
1230 
1231             } else {
1232 #ifdef __SUNPRO_C
1233 #pragma pipeloop(0)
1234 #endif /* __SUNPRO_C */
1235               for (i = 0; i <= (wid - 2); i += 2) {
1236                 p0 = p2; p1 = p3; p2 = p4;
1237 
1238                 p3 = buff[i + 3]; p4 = buff[i + 4];
1239 
1240                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1241                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1242 
1243                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1244                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1245 
1246                 STORE_RES(dp[0    ], d0);
1247                 STORE_RES(dp[chan1], d1);
1248 
1249                 buffd[i    ] = 0;
1250                 buffd[i + 1] = 0;
1251 
1252                 sp += chan2;
1253                 dp += chan2;
1254               }
1255             }
1256 
1257           } else if (kw == 3) {
1258 
1259             p2 = buff[0]; p3 = buff[1];
1260             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1261 
1262             if (l < (n - 1) || off < m) {
1263 #ifdef __SUNPRO_C
1264 #pragma pipeloop(0)
1265 #endif /* __SUNPRO_C */
1266               for (i = 0; i <= (wid - 2); i += 2) {
1267                 p0 = p2; p1 = p3;
1268 
1269                 p2 = buff[i + 2]; p3 = buff[i + 3];
1270 
1271                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1272                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1273               }
1274 
1275             } else {
1276 #ifdef __SUNPRO_C
1277 #pragma pipeloop(0)
1278 #endif /* __SUNPRO_C */
1279               for (i = 0; i <= (wid - 2); i += 2) {
1280                 p0 = p2; p1 = p3;
1281 
1282                 p2 = buff[i + 2]; p3 = buff[i + 3];
1283 
1284                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1285                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1286 
1287                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1288                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1289 
1290                 STORE_RES(dp[0    ], d0);
1291                 STORE_RES(dp[chan1], d1);
1292 
1293                 buffd[i    ] = 0;
1294                 buffd[i + 1] = 0;
1295 
1296                 sp += chan2;
1297                 dp += chan2;
1298               }
1299             }
1300 
1301           } else if (kw == 2) {
1302 
1303             p2 = buff[0];
1304             k0 = pk[0]; k1 = pk[1];
1305 
1306             if (l < (n - 1) || off < m) {
1307 #ifdef __SUNPRO_C
1308 #pragma pipeloop(0)
1309 #endif /* __SUNPRO_C */
1310               for (i = 0; i <= (wid - 2); i += 2) {
1311                 p0 = p2;
1312 
1313                 p1 = buff[i + 1]; p2 = buff[i + 2];
1314 
1315                 buffd[i    ] += p0*k0 + p1*k1;
1316                 buffd[i + 1] += p1*k0 + p2*k1;
1317               }
1318 
1319             } else {
1320 #ifdef __SUNPRO_C
1321 #pragma pipeloop(0)
1322 #endif /* __SUNPRO_C */
1323               for (i = 0; i <= (wid - 2); i += 2) {
1324                 p0 = p2;
1325 
1326                 p1 = buff[i + 1]; p2 = buff[i + 2];
1327 
1328                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1329                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1330 
1331                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1332                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1333 
1334                 STORE_RES(dp[0    ], d0);
1335                 STORE_RES(dp[chan1], d1);
1336 
1337                 buffd[i    ] = 0;
1338                 buffd[i + 1] = 0;
1339 
1340                 sp += chan2;
1341                 dp += chan2;
1342               }
1343             }
1344 
1345           } else /* kw == 1 */{
1346 
1347             k0 = pk[0];
1348 
1349             if (l < (n - 1) || off < m) {
1350 #ifdef __SUNPRO_C
1351 #pragma pipeloop(0)
1352 #endif /* __SUNPRO_C */
1353               for (i = 0; i <= (wid - 2); i += 2) {
1354                 p0 = buff[i]; p1 = buff[i + 1];
1355 
1356                 buffd[i    ] += p0*k0;
1357                 buffd[i + 1] += p1*k0;
1358               }
1359 
1360             } else {
1361 #ifdef __SUNPRO_C
1362 #pragma pipeloop(0)
1363 #endif /* __SUNPRO_C */
1364               for (i = 0; i <= (wid - 2); i += 2) {
1365                 p0 = buff[i]; p1 = buff[i + 1];
1366 
1367                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1368                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1369 
1370                 d0 = (p0*k0 + buffd[i    ]);
1371                 d1 = (p1*k0 + buffd[i + 1]);
1372 
1373                 STORE_RES(dp[0    ], d0);
1374                 STORE_RES(dp[chan1], d1);
1375 
1376                 buffd[i    ] = 0;
1377                 buffd[i + 1] = 0;
1378 
1379                 sp += chan2;
1380                 dp += chan2;
1381               }
1382             }
1383           }
1384 
1385           pk += kw;
1386         }
1387       }
1388 
1389       /* last pixels */
1390       for (; i < wid; i++) {
1391         mlib_s32 *pk = k, x, s = 0;
1392 
1393         for (l = 0; l < n; l++) {
1394           mlib_s32 *buff = buffc[l] + i;
1395 
1396           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1397         }
1398 
1399         STORE_RES(dp[0], s);
1400 
1401         buffn[i + dx_l] = (mlib_s32)sp[0];
1402 
1403         sp += chan1;
1404         dp += chan1;
1405       }
1406 
1407       for (; i < swid; i++) {
1408         buffn[i + dx_l] = (mlib_s32)sp[0];
1409         sp += chan1;
1410       }
1411 
1412       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1413       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1414 
1415       /* next line */
1416 
1417       if (j < hgt - dy_b - 2) sl += sll;
1418       dl += dll;
1419 
1420       buff_ind++;
1421 
1422       if (buff_ind >= n + 1) buff_ind = 0;
1423     }
1424   }
1425 
1426   if (pbuff != buff) mlib_free(pbuff);
1427   if (k != k_locl) mlib_free(k);
1428 
1429   return MLIB_SUCCESS;
1430 }
1431 
1432 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1433 
1434 /***************************************************************/