1 /*
   2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
  30  *   MLIB_EDGE_SRC_EXTEND mask
  31  */
  32 
  33 #include "mlib_image.h"
  34 #include "mlib_ImageConv.h"
  35 #include "mlib_c_ImageConv.h"
  36 
  37 /*
  38  * This define switches between functions of different data types
  39  */
  40 
  41 #define IMG_TYPE 2
  42 
  43 /***************************************************************/
  44 #if IMG_TYPE == 1
  45 
  46 #define DTYPE             mlib_u8
  47 #define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
  48 #define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
  49 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
  50 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
  51 #define DSCALE            (1 << 24)
  52 #define FROM_S32(x)       (((x) >> 24) ^ 128)
  53 #define S64TOS32(x)       (x)
  54 #define SAT_OFF           -(1u << 31)
  55 
  56 #elif IMG_TYPE == 2
  57 
  58 #define DTYPE             mlib_s16
  59 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
  60 #define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
  61 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
  62 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
  63 #define DSCALE            65536.0
  64 #define FROM_S32(x)       ((x) >> 16)
  65 #define S64TOS32(x)       ((x) & 0xffffffff)
  66 #define SAT_OFF
  67 
  68 #elif IMG_TYPE == 3
  69 
  70 #define DTYPE             mlib_u16
  71 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
  72 #define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
  73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
  74 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
  75 #define DSCALE            65536.0
  76 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
  77 #define S64TOS32(x)       (x)
  78 #define SAT_OFF           -(1u << 31)
  79 
  80 #endif /* IMG_TYPE == 1 */
  81 
  82 /***************************************************************/
  83 #define PARAM                                                   \
  84   mlib_image       *dst,                                        \
  85   const mlib_image *src,                                        \
  86   mlib_s32         dx_l,                                        \
  87   mlib_s32         dx_r,                                        \
  88   mlib_s32         dy_t,                                        \
  89   mlib_s32         dy_b,                                        \
  90   const mlib_s32   *kern,                                       \
  91   mlib_s32         scalef_expon,                                \
  92   mlib_s32         cmask
  93 
  94 /***************************************************************/
  95 #define PARAM_MxN                                               \
  96   mlib_image       *dst,                                        \
  97   const mlib_image *src,                                        \
  98   const mlib_s32   *kernel,                                     \
  99   mlib_s32         m,                                           \
 100   mlib_s32         n,                                           \
 101   mlib_s32         dx_l,                                        \
 102   mlib_s32         dx_r,                                        \
 103   mlib_s32         dy_t,                                        \
 104   mlib_s32         dy_b,                                        \
 105   mlib_s32         scale,                                       \
 106   mlib_s32         cmask
 107 
 108 /***************************************************************/
 109 #define FTYPE mlib_d64
 110 
 111 #ifndef MLIB_USE_FTOI_CLAMPING
 112 
 113 #define CLAMP_S32(x)                                            \
 114   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))
 115 
 116 #else
 117 
 118 #define CLAMP_S32(x) ((mlib_s32)(x))
 119 
 120 #endif /* MLIB_USE_FTOI_CLAMPING */
 121 
 122 /***************************************************************/
 123 #define D2I(x) CLAMP_S32((x) SAT_OFF)
 124 
 125 /***************************************************************/
 126 #ifdef _LITTLE_ENDIAN
 127 
 128 #define STORE2(res0, res1)                                      \
 129   dp[0    ] = res1;                                             \
 130   dp[chan1] = res0
 131 
 132 #else
 133 
 134 #define STORE2(res0, res1)                                      \
 135   dp[0    ] = res0;                                             \
 136   dp[chan1] = res1
 137 
 138 #endif /* _LITTLE_ENDIAN */
 139 
 140 /***************************************************************/
 141 #ifdef _NO_LONGLONG
 142 
 143 #define LOAD_BUFF(buff)                                         \
 144   buff[i    ] = sp[0];                                          \
 145   buff[i + 1] = sp[chan1]
 146 
 147 #else /* _NO_LONGLONG */
 148 
 149 #ifdef _LITTLE_ENDIAN
 150 
 151 #define LOAD_BUFF(buff)                                         \
 152   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
 153 
 154 #else /* _LITTLE_ENDIAN */
 155 
 156 #define LOAD_BUFF(buff)                                         \
 157   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
 158 
 159 #endif /* _LITTLE_ENDIAN */
 160 #endif /* _NO_LONGLONG */
 161 
 162 /***************************************************************/
 163 typedef union {
 164   mlib_d64 d64;
 165   struct {
 166     mlib_s32 i0;
 167     mlib_s32 i1;
 168   } i32s;
 169 } d64_2x32;
 170 
 171 /***************************************************************/
 172 #define GET_SRC_DST_PARAMETERS(type)                            \
 173   hgt = mlib_ImageGetHeight(src);                               \
 174   wid = mlib_ImageGetWidth(src);                                \
 175   nchannel = mlib_ImageGetChannels(src);                        \
 176   sll = mlib_ImageGetStride(src) / sizeof(type);                \
 177   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
 178   adr_src = (type *)mlib_ImageGetData(src);                     \
 179   adr_dst = (type *)mlib_ImageGetData(dst)
 180 
 181 /***************************************************************/
 182 #if IMG_TYPE == 1
 183 
 184 /*
 185  * Test for the presence of any "1" bit in bits
 186    8 to 31 of val. If present, then val is either
 187    negative or >255. If over/underflows of 8 bits
 188    are uncommon, then this technique can be a win,
 189    since only a single test, rather than two, is
 190    necessary to determine if clamping is needed.
 191    On the other hand, if over/underflows are common,
 192    it adds an extra test.
 193 */
 194 #define CLAMP_STORE(dst, val)                                   \
 195   if (val & 0xffffff00) {                                       \
 196     if (val < MLIB_U8_MIN)                                      \
 197       dst = MLIB_U8_MIN;                                        \
 198     else                                                        \
 199       dst = MLIB_U8_MAX;                                        \
 200   } else {                                                      \
 201     dst = (mlib_u8)val;                                         \
 202   }
 203 
 204 #elif IMG_TYPE == 2
 205 
 206 #define CLAMP_STORE(dst, val)                                   \
 207   if (val >= MLIB_S16_MAX)                                      \
 208     dst = MLIB_S16_MAX;                                         \
 209   else if (val <= MLIB_S16_MIN)                                 \
 210     dst = MLIB_S16_MIN;                                         \
 211   else                                                          \
 212     dst = (mlib_s16)val
 213 
 214 #elif IMG_TYPE == 3
 215 
 216 #define CLAMP_STORE(dst, val)                                   \
 217   if (val >= MLIB_U16_MAX)                                      \
 218     dst = MLIB_U16_MAX;                                         \
 219   else if (val <= MLIB_U16_MIN)                                 \
 220     dst = MLIB_U16_MIN;                                         \
 221   else                                                          \
 222     dst = (mlib_u16)val
 223 
 224 #endif /* IMG_TYPE == 1 */
 225 
 226 /***************************************************************/
 227 #define MAX_KER   7
 228 #define MAX_N    15
 229 #define BUFF_SIZE   1600
 230 #define CACHE_SIZE  (64*1024)
 231 
 232 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
 233                                          const mlib_image *src,
 234                                          const mlib_d64   *k,
 235                                          mlib_s32         n,
 236                                          mlib_s32         dy_t,
 237                                          mlib_s32         dy_b,
 238                                          mlib_s32         cmask)
 239 {
 240   DTYPE    *adr_src, *sl;
 241   DTYPE    *adr_dst, *dl, *dp;
 242   FTYPE    buff[BUFF_SIZE];
 243   FTYPE    *buffd;
 244   FTYPE    *pbuff = buff;
 245   const FTYPE    *pk;
 246   FTYPE    k0, k1, k2, k3;
 247   FTYPE    p0, p1, p2, p3, p4;
 248   FTYPE    *sbuff;
 249   mlib_s32 l, k_off, off, bsize;
 250   mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
 251   mlib_s32 d0, d1, ii;
 252   mlib_s32 wid, hgt, sll, dll;
 253   mlib_s32 nchannel;
 254   mlib_s32 i, j, c;
 255   GET_SRC_DST_PARAMETERS(DTYPE);
 256 
 257   max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);
 258 
 259   if (max_hsize < 1) max_hsize = 1;
 260   if (max_hsize > hgt) max_hsize = hgt;
 261 
 262   shgt = hgt + (n - 1);
 263   smax_hsize = max_hsize + (n - 1);
 264 
 265   bsize = 2 * (smax_hsize + 1);
 266 
 267   if (bsize > BUFF_SIZE) {
 268     pbuff = mlib_malloc(sizeof(FTYPE)*bsize);
 269 
 270     if (pbuff == NULL) return MLIB_FAILURE;
 271   }
 272 
 273   sbuff = pbuff;
 274   buffd = sbuff + smax_hsize;
 275 
 276   shgt -= (dy_t + dy_b);
 277   k_off = 0;
 278 
 279   for (l = 0; l < hgt; l += hsize) {
 280     hsize = hgt - l;
 281 
 282     if (hsize > max_hsize) hsize = max_hsize;
 283 
 284     smax_hsize = hsize + (n - 1);
 285 
 286     for (c = 0; c < nchannel; c++) {
 287       if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 288 
 289       sl = adr_src + c;
 290       dl = adr_dst + c;
 291 
 292 #ifdef __SUNPRO_C
 293 #pragma pipeloop(0)
 294 #endif /* __SUNPRO_C */
 295       for (i = 0; i < hsize; i++) buffd[i] = 0.0;
 296 
 297       for (j = 0; j < wid; j++) {
 298         FTYPE    *buff = sbuff;
 299 
 300         for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
 301           sbuff[i - k_off] = (FTYPE)sl[0];
 302         }
 303 
 304 #ifdef __SUNPRO_C
 305 #pragma pipeloop(0)
 306 #endif /* __SUNPRO_C */
 307         for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
 308           sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
 309         }
 310 
 311         for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
 312           sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
 313         }
 314 
 315         pk = k;
 316 
 317         for (off = 0; off < (n - 4); off += 4) {
 318 
 319           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 320           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 321 
 322 #ifdef __SUNPRO_C
 323 #pragma pipeloop(0)
 324 #endif /* __SUNPRO_C */
 325           for (i = 0; i < hsize; i += 2) {
 326             p0 = p2; p1 = p3; p2 = p4;
 327 
 328             p3 = buff[i + 3]; p4 = buff[i + 4];
 329 
 330             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 331             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 332           }
 333 
 334           pk += 4;
 335           buff += 4;
 336         }
 337 
 338         dp = dl;
 339         kh = n - off;
 340 
 341         if (kh == 4) {
 342           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 343           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 344 
 345 #ifdef __SUNPRO_C
 346 #pragma pipeloop(0)
 347 #endif /* __SUNPRO_C */
 348           for (i = 0; i <= (hsize - 2); i += 2) {
 349             p0 = p2; p1 = p3; p2 = p4;
 350 
 351             p3 = buff[i + 3]; p4 = buff[i + 4];
 352 
 353             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 354             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 355 
 356             dp[0  ] = FROM_S32(d0);
 357             dp[dll] = FROM_S32(d1);
 358 
 359             buffd[i    ] = 0.0;
 360             buffd[i + 1] = 0.0;
 361 
 362             dp += 2*dll;
 363           }
 364 
 365           if (i < hsize) {
 366             p0 = p2; p1 = p3; p2 = p4;
 367             p3 = buff[i + 3];
 368             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
 369             dp[0] = FROM_S32(d0);
 370             buffd[i] = 0.0;
 371           }
 372 
 373         } else if (kh == 3) {
 374 
 375           p2 = buff[0]; p3 = buff[1];
 376           k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 377 
 378 #ifdef __SUNPRO_C
 379 #pragma pipeloop(0)
 380 #endif /* __SUNPRO_C */
 381           for (i = 0; i <= (hsize - 2); i += 2) {
 382             p0 = p2; p1 = p3;
 383 
 384             p2 = buff[i + 2]; p3 = buff[i + 3];
 385 
 386             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 387             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 388 
 389             dp[0  ] = FROM_S32(d0);
 390             dp[dll] = FROM_S32(d1);
 391 
 392             buffd[i    ] = 0.0;
 393             buffd[i + 1] = 0.0;
 394 
 395             dp += 2*dll;
 396           }
 397 
 398           if (i < hsize) {
 399             p0 = p2; p1 = p3;
 400             p2 = buff[i + 2];
 401             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
 402             dp[0] = FROM_S32(d0);
 403 
 404             buffd[i] = 0.0;
 405           }
 406 
 407         } else if (kh == 2) {
 408 
 409           p2 = buff[0];
 410           k0 = pk[0]; k1 = pk[1];
 411 
 412 #ifdef __SUNPRO_C
 413 #pragma pipeloop(0)
 414 #endif /* __SUNPRO_C */
 415           for (i = 0; i <= (hsize - 2); i += 2) {
 416             p0 = p2;
 417 
 418             p1 = buff[i + 1]; p2 = buff[i + 2];
 419 
 420             d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 421             d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 422 
 423             dp[0  ] = FROM_S32(d0);
 424             dp[dll] = FROM_S32(d1);
 425 
 426             buffd[i    ] = 0.0;
 427             buffd[i + 1] = 0.0;
 428 
 429             dp += 2*dll;
 430           }
 431 
 432           if (i < hsize) {
 433             p0 = p2;
 434             p1 = buff[i + 1];
 435             d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
 436             dp[0] = FROM_S32(d0);
 437 
 438             buffd[i] = 0.0;
 439           }
 440 
 441         } else /* kh == 1 */{
 442 
 443           k0 = pk[0];
 444 
 445 #ifdef __SUNPRO_C
 446 #pragma pipeloop(0)
 447 #endif /* __SUNPRO_C */
 448           for (i = 0; i <= (hsize - 2); i += 2) {
 449             p0 = buff[i]; p1 = buff[i + 1];
 450 
 451             d0 = D2I(p0*k0 + buffd[i    ]);
 452             d1 = D2I(p1*k0 + buffd[i + 1]);
 453 
 454             dp[0  ] = FROM_S32(d0);
 455             dp[dll] = FROM_S32(d1);
 456 
 457             buffd[i    ] = 0.0;
 458             buffd[i + 1] = 0.0;
 459 
 460             dp += 2*dll;
 461           }
 462 
 463           if (i < hsize) {
 464             p0 = buff[i];
 465             d0 = D2I(p0*k0 + buffd[i]);
 466             dp[0] = FROM_S32(d0);
 467 
 468             buffd[i] = 0.0;
 469           }
 470         }
 471 
 472         /* next line */
 473         sl += nchannel;
 474         dl += nchannel;
 475       }
 476     }
 477 
 478     k_off += max_hsize;
 479     adr_dst += max_hsize*dll;
 480   }
 481 
 482   if (pbuff != buff) mlib_free(pbuff);
 483 
 484   return MLIB_SUCCESS;
 485 }
 486 
 487 /***************************************************************/
 488 mlib_status CONV_FUNC_MxN
 489 {
 490   DTYPE    *adr_src, *sl, *sp = NULL;
 491   DTYPE    *adr_dst, *dl, *dp = NULL;
 492   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 493   FTYPE    **buffs = buffs_arr, *buffd;
 494   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
 495   FTYPE    *pbuff = buff;
 496   FTYPE    k0, k1, k2, k3, k4, k5, k6;
 497   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
 498   mlib_s32 *buffi;
 499   mlib_s32 mn, l, off, kw, bsize, buff_ind;
 500   mlib_s32 d0, d1;
 501   mlib_s32 wid, hgt, sll, dll;
 502   mlib_s32 nchannel, chan1, chan2;
 503   mlib_s32 i, j, c, swid;
 504   d64_2x32 dd;
 505   mlib_status status = MLIB_SUCCESS;
 506 
 507   GET_SRC_DST_PARAMETERS(DTYPE);
 508 
 509   if (scale > 30) {
 510     fscale *= 1.0/(1 << 30);
 511     scale -= 30;
 512   }
 513 
 514   fscale /= (1 << scale);
 515 
 516   mn = m*n;
 517 
 518   if (mn > 256) {
 519     k = mlib_malloc(mn*sizeof(mlib_d64));
 520 
 521     if (k == NULL) return MLIB_FAILURE;
 522   }
 523 
 524   for (i = 0; i < mn; i++) {
 525     k[i] = kernel[i]*fscale;
 526   }
 527 
 528   if (m == 1) {
 529     status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
 530     FREE_AND_RETURN_STATUS;
 531   }
 532 
 533   swid = wid + (m - 1);
 534 
 535   bsize = (n + 3)*swid;
 536 
 537   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 538     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
 539 
 540     if (pbuff == NULL) {
 541       status = MLIB_FAILURE;
 542       FREE_AND_RETURN_STATUS;
 543     }
 544     buffs = (FTYPE   **)(pbuff + bsize);
 545   }
 546 
 547   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 548   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
 549   buffd = buffs[n] + swid;
 550   buffi = (mlib_s32*)(buffd + swid);
 551 
 552   chan1 = nchannel;
 553   chan2 = chan1 + chan1;
 554 
 555   swid -= (dx_l + dx_r);
 556 
 557   for (c = 0; c < nchannel; c++) {
 558     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 559 
 560     sl = adr_src + c;
 561     dl = adr_dst + c;
 562 
 563     for (l = 0; l < n; l++) {
 564       FTYPE    *buff = buffs[l];
 565 
 566       for (i = 0; i < dx_l; i++) {
 567         buff[i] = (FTYPE)sl[0];
 568       }
 569 
 570 #ifdef __SUNPRO_C
 571 #pragma pipeloop(0)
 572 #endif /* __SUNPRO_C */
 573       for (i = 0; i < swid; i++) {
 574         buff[i + dx_l] = (FTYPE)sl[i*chan1];
 575       }
 576 
 577       for (i = 0; i < dx_r; i++) {
 578         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
 579       }
 580 
 581       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
 582     }
 583 
 584     buff_ind = 0;
 585 
 586 #ifdef __SUNPRO_C
 587 #pragma pipeloop(0)
 588 #endif /* __SUNPRO_C */
 589     for (i = 0; i < wid; i++) buffd[i] = 0.0;
 590 
 591     for (j = 0; j < hgt; j++) {
 592       FTYPE    **buffc = buffs + buff_ind;
 593       FTYPE    *buffn = buffc[n];
 594       FTYPE    *pk = k;
 595 
 596       for (l = 0; l < n; l++) {
 597         FTYPE    *buff_l = buffc[l];
 598 
 599         for (off = 0; off < m;) {
 600           FTYPE    *buff = buff_l + off;
 601 
 602           kw = m - off;
 603 
 604           if (kw > 2*MAX_KER) kw = MAX_KER; else
 605             if (kw > MAX_KER) kw = kw/2;
 606           off += kw;
 607 
 608           sp = sl;
 609           dp = dl;
 610 
 611           if (kw == 7) {
 612 
 613             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 614             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
 615 
 616             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 617             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
 618 
 619             if (l < (n - 1) || off < m) {
 620 #ifdef __SUNPRO_C
 621 #pragma pipeloop(0)
 622 #endif /* __SUNPRO_C */
 623               for (i = 0; i <= (wid - 2); i += 2) {
 624                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 625 
 626                 p6 = buff[i + 6]; p7 = buff[i + 7];
 627 
 628                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
 629                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
 630               }
 631 
 632             } else {
 633 #ifdef __SUNPRO_C
 634 #pragma pipeloop(0)
 635 #endif /* __SUNPRO_C */
 636               for (i = 0; i <= (wid - 2); i += 2) {
 637                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 638 
 639                 p6 = buff[i + 6]; p7 = buff[i + 7];
 640 
 641                 LOAD_BUFF(buffi);
 642 
 643                 dd.d64 = *(FTYPE   *)(buffi + i);
 644                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 645                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 646 
 647                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
 648                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
 649 
 650                 dp[0    ] = FROM_S32(d0);
 651                 dp[chan1] = FROM_S32(d1);
 652 
 653                 buffd[i    ] = 0.0;
 654                 buffd[i + 1] = 0.0;
 655 
 656                 sp += chan2;
 657                 dp += chan2;
 658               }
 659             }
 660 
 661           } else if (kw == 6) {
 662 
 663             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 664             p5 = buff[3]; p6 = buff[4];
 665 
 666             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 667             k4 = pk[4]; k5 = pk[5];
 668 
 669             if (l < (n - 1) || off < m) {
 670 #ifdef __SUNPRO_C
 671 #pragma pipeloop(0)
 672 #endif /* __SUNPRO_C */
 673               for (i = 0; i <= (wid - 2); i += 2) {
 674                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 675 
 676                 p5 = buff[i + 5]; p6 = buff[i + 6];
 677 
 678                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
 679                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
 680               }
 681 
 682             } else {
 683 #ifdef __SUNPRO_C
 684 #pragma pipeloop(0)
 685 #endif /* __SUNPRO_C */
 686               for (i = 0; i <= (wid - 2); i += 2) {
 687                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 688 
 689                 p5 = buff[i + 5]; p6 = buff[i + 6];
 690 
 691                 LOAD_BUFF(buffi);
 692 
 693                 dd.d64 = *(FTYPE   *)(buffi + i);
 694                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 695                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 696 
 697                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
 698                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
 699 
 700                 dp[0    ] = FROM_S32(d0);
 701                 dp[chan1] = FROM_S32(d1);
 702 
 703                 buffd[i    ] = 0.0;
 704                 buffd[i + 1] = 0.0;
 705 
 706                 sp += chan2;
 707                 dp += chan2;
 708               }
 709             }
 710 
 711           } else if (kw == 5) {
 712 
 713             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 714             p5 = buff[3];
 715 
 716             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 717             k4 = pk[4];
 718 
 719             if (l < (n - 1) || off < m) {
 720 #ifdef __SUNPRO_C
 721 #pragma pipeloop(0)
 722 #endif /* __SUNPRO_C */
 723               for (i = 0; i <= (wid - 2); i += 2) {
 724                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 725 
 726                 p4 = buff[i + 4]; p5 = buff[i + 5];
 727 
 728                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
 729                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
 730               }
 731 
 732             } else {
 733 #ifdef __SUNPRO_C
 734 #pragma pipeloop(0)
 735 #endif /* __SUNPRO_C */
 736               for (i = 0; i <= (wid - 2); i += 2) {
 737                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 738 
 739                 p4 = buff[i + 4]; p5 = buff[i + 5];
 740 
 741                 LOAD_BUFF(buffi);
 742 
 743                 dd.d64 = *(FTYPE   *)(buffi + i);
 744                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 745                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 746 
 747                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
 748                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
 749 
 750                 dp[0    ] = FROM_S32(d0);
 751                 dp[chan1] = FROM_S32(d1);
 752 
 753                 buffd[i    ] = 0.0;
 754                 buffd[i + 1] = 0.0;
 755 
 756                 sp += chan2;
 757                 dp += chan2;
 758               }
 759             }
 760 
 761           } else if (kw == 4) {
 762 
 763             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 764 
 765             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 766 
 767             if (l < (n - 1) || off < m) {
 768 #ifdef __SUNPRO_C
 769 #pragma pipeloop(0)
 770 #endif /* __SUNPRO_C */
 771               for (i = 0; i <= (wid - 2); i += 2) {
 772                 p0 = p2; p1 = p3; p2 = p4;
 773 
 774                 p3 = buff[i + 3]; p4 = buff[i + 4];
 775 
 776                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 777                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 778               }
 779 
 780             } else {
 781 #ifdef __SUNPRO_C
 782 #pragma pipeloop(0)
 783 #endif /* __SUNPRO_C */
 784               for (i = 0; i <= (wid - 2); i += 2) {
 785                 p0 = p2; p1 = p3; p2 = p4;
 786 
 787                 p3 = buff[i + 3]; p4 = buff[i + 4];
 788 
 789                 LOAD_BUFF(buffi);
 790 
 791                 dd.d64 = *(FTYPE   *)(buffi + i);
 792                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 793                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 794 
 795                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 796                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 797 
 798                 dp[0    ] = FROM_S32(d0);
 799                 dp[chan1] = FROM_S32(d1);
 800 
 801                 buffd[i    ] = 0.0;
 802                 buffd[i + 1] = 0.0;
 803 
 804                 sp += chan2;
 805                 dp += chan2;
 806               }
 807             }
 808 
 809           } else if (kw == 3) {
 810 
 811             p2 = buff[0]; p3 = buff[1];
 812             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
 813 
 814             if (l < (n - 1) || off < m) {
 815 #ifdef __SUNPRO_C
 816 #pragma pipeloop(0)
 817 #endif /* __SUNPRO_C */
 818               for (i = 0; i <= (wid - 2); i += 2) {
 819                 p0 = p2; p1 = p3;
 820 
 821                 p2 = buff[i + 2]; p3 = buff[i + 3];
 822 
 823                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
 824                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
 825               }
 826 
 827             } else {
 828 #ifdef __SUNPRO_C
 829 #pragma pipeloop(0)
 830 #endif /* __SUNPRO_C */
 831               for (i = 0; i <= (wid - 2); i += 2) {
 832                 p0 = p2; p1 = p3;
 833 
 834                 p2 = buff[i + 2]; p3 = buff[i + 3];
 835 
 836                 LOAD_BUFF(buffi);
 837 
 838                 dd.d64 = *(FTYPE   *)(buffi + i);
 839                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 840                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 841 
 842                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 843                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 844 
 845                 dp[0    ] = FROM_S32(d0);
 846                 dp[chan1] = FROM_S32(d1);
 847 
 848                 buffd[i    ] = 0.0;
 849                 buffd[i + 1] = 0.0;
 850 
 851                 sp += chan2;
 852                 dp += chan2;
 853               }
 854             }
 855 
 856           } else /* if (kw == 2) */ {
 857 
 858             p2 = buff[0];
 859             k0 = pk[0]; k1 = pk[1];
 860 
 861             if (l < (n - 1) || off < m) {
 862 #ifdef __SUNPRO_C
 863 #pragma pipeloop(0)
 864 #endif /* __SUNPRO_C */
 865               for (i = 0; i <= (wid - 2); i += 2) {
 866                 p0 = p2;
 867 
 868                 p1 = buff[i + 1]; p2 = buff[i + 2];
 869 
 870                 buffd[i    ] += p0*k0 + p1*k1;
 871                 buffd[i + 1] += p1*k0 + p2*k1;
 872               }
 873 
 874             } else {
 875 #ifdef __SUNPRO_C
 876 #pragma pipeloop(0)
 877 #endif /* __SUNPRO_C */
 878               for (i = 0; i <= (wid - 2); i += 2) {
 879                 p0 = p2;
 880 
 881                 p1 = buff[i + 1]; p2 = buff[i + 2];
 882 
 883                 LOAD_BUFF(buffi);
 884 
 885                 dd.d64 = *(FTYPE   *)(buffi + i);
 886                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 887                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 888 
 889                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 890                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 891 
 892                 dp[0    ] = FROM_S32(d0);
 893                 dp[chan1] = FROM_S32(d1);
 894 
 895                 buffd[i    ] = 0.0;
 896                 buffd[i + 1] = 0.0;
 897 
 898                 sp += chan2;
 899                 dp += chan2;
 900               }
 901             }
 902           }
 903 
 904           pk += kw;
 905         }
 906       }
 907 
 908       /* last pixels */
 909       for (; i < wid; i++) {
 910         FTYPE    *pk = k, s = 0;
 911         mlib_s32 x, d0;
 912 
 913         for (l = 0; l < n; l++) {
 914           FTYPE    *buff = buffc[l] + i;
 915 
 916           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
 917         }
 918 
 919         d0 = D2I(s);
 920         dp[0] = FROM_S32(d0);
 921 
 922         buffn[i + dx_l] = (FTYPE)sp[0];
 923 
 924         sp += chan1;
 925         dp += chan1;
 926       }
 927 
 928       for (; i < swid; i++) {
 929         buffn[i + dx_l] = (FTYPE)sp[0];
 930         sp += chan1;
 931       }
 932 
 933       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
 934       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
 935 
 936       /* next line */
 937 
 938       if (j < hgt - dy_b - 2) sl += sll;
 939       dl += dll;
 940 
 941       buff_ind++;
 942 
 943       if (buff_ind >= n + 1) buff_ind = 0;
 944     }
 945   }
 946 
 947   FREE_AND_RETURN_STATUS;
 948 }
 949 
 950 /***************************************************************/
 951 /* for x86, using integer multiplies is faster */
 952 
 953 #define STORE_RES(res, x)                                       \
 954   x >>= shift2;                                                 \
 955   CLAMP_STORE(res, x)
 956 
 957 mlib_status CONV_FUNC_MxN_I
 958 {
 959   DTYPE    *adr_src, *sl, *sp = NULL;
 960   DTYPE    *adr_dst, *dl, *dp = NULL;
 961   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 962   mlib_s32 *pbuff = buff;
 963   mlib_s32 **buffs = buffs_arr, *buffd;
 964   mlib_s32 l, off, kw, bsize, buff_ind;
 965   mlib_s32 d0, d1, shift1, shift2;
 966   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
 967   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
 968   mlib_s32 wid, hgt, sll, dll;
 969   mlib_s32 nchannel, chan1;
 970   mlib_s32 i, j, c, swid;
 971   mlib_s32 chan2;
 972   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
 973   GET_SRC_DST_PARAMETERS(DTYPE);
 974 
 975 #if IMG_TYPE != 1
 976   shift1 = 16;
 977 #else
 978   shift1 = 8;
 979 #endif /* IMG_TYPE != 1 */
 980   shift2 = scale - shift1;
 981 
 982   chan1 = nchannel;
 983   chan2 = chan1 + chan1;
 984 
 985   swid = wid + (m - 1);
 986 
 987   bsize = (n + 2)*swid;
 988 
 989   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 990     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
 991 
 992     if (pbuff == NULL) return MLIB_FAILURE;
 993     buffs = (mlib_s32 **)(pbuff + bsize);
 994   }
 995 
 996   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
 997   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
 998   buffd = buffs[n] + swid;
 999 
1000   if (m*n > MAX_N*MAX_N) {
1001     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
1002 
1003     if (k == NULL) {
1004       if (pbuff != buff) mlib_free(pbuff);
1005       return MLIB_FAILURE;
1006     }
1007   }
1008 
1009   for (i = 0; i < m*n; i++) {
1010     k[i] = kernel[i] >> shift1;
1011   }
1012 
1013   swid -= (dx_l + dx_r);
1014 
1015   for (c = 0; c < nchannel; c++) {
1016     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1017 
1018     sl = adr_src + c;
1019     dl = adr_dst + c;
1020 
1021     for (l = 0; l < n; l++) {
1022       mlib_s32  *buff = buffs[l];
1023 
1024       for (i = 0; i < dx_l; i++) {
1025         buff[i] = (mlib_s32)sl[0];
1026       }
1027 
1028 #ifdef __SUNPRO_C
1029 #pragma pipeloop(0)
1030 #endif /* __SUNPRO_C */
1031       for (i = 0; i < swid; i++) {
1032         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
1033       }
1034 
1035       for (i = 0; i < dx_r; i++) {
1036         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
1037       }
1038 
1039       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
1040     }
1041 
1042     buff_ind = 0;
1043 
1044 #ifdef __SUNPRO_C
1045 #pragma pipeloop(0)
1046 #endif /* __SUNPRO_C */
1047     for (i = 0; i < wid; i++) buffd[i] = 0;
1048 
1049     for (j = 0; j < hgt; j++) {
1050       mlib_s32 **buffc = buffs + buff_ind;
1051       mlib_s32 *buffn = buffc[n];
1052       mlib_s32 *pk = k;
1053 
1054       for (l = 0; l < n; l++) {
1055         mlib_s32  *buff_l = buffc[l];
1056 
1057         for (off = 0; off < m;) {
1058           mlib_s32 *buff = buff_l + off;
1059 
1060           sp = sl;
1061           dp = dl;
1062 
1063           kw = m - off;
1064 
1065           if (kw > 2*MAX_KER) kw = MAX_KER; else
1066             if (kw > MAX_KER) kw = kw/2;
1067           off += kw;
1068 
1069           if (kw == 7) {
1070 
1071             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1072             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1073 
1074             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1075             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1076 
1077             if (l < (n - 1) || off < m) {
1078 #ifdef __SUNPRO_C
1079 #pragma pipeloop(0)
1080 #endif /* __SUNPRO_C */
1081               for (i = 0; i <= (wid - 2); i += 2) {
1082                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1083 
1084                 p6 = buff[i + 6]; p7 = buff[i + 7];
1085 
1086                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1087                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1088               }
1089 
1090             } else {
1091 #ifdef __SUNPRO_C
1092 #pragma pipeloop(0)
1093 #endif /* __SUNPRO_C */
1094               for (i = 0; i <= (wid - 2); i += 2) {
1095                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1096 
1097                 p6 = buff[i + 6]; p7 = buff[i + 7];
1098 
1099                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1100                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1101 
1102                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1103                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1104 
1105                 STORE_RES(dp[0    ], d0);
1106                 STORE_RES(dp[chan1], d1);
1107 
1108                 buffd[i    ] = 0;
1109                 buffd[i + 1] = 0;
1110 
1111                 sp += chan2;
1112                 dp += chan2;
1113               }
1114             }
1115 
1116           } else if (kw == 6) {
1117 
1118             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1119             p5 = buff[3]; p6 = buff[4];
1120 
1121             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1122             k4 = pk[4]; k5 = pk[5];
1123 
1124             if (l < (n - 1) || off < m) {
1125 #ifdef __SUNPRO_C
1126 #pragma pipeloop(0)
1127 #endif /* __SUNPRO_C */
1128               for (i = 0; i <= (wid - 2); i += 2) {
1129                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1130 
1131                 p5 = buff[i + 5]; p6 = buff[i + 6];
1132 
1133                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1134                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1135               }
1136 
1137             } else {
1138 #ifdef __SUNPRO_C
1139 #pragma pipeloop(0)
1140 #endif /* __SUNPRO_C */
1141               for (i = 0; i <= (wid - 2); i += 2) {
1142                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1143 
1144                 p5 = buff[i + 5]; p6 = buff[i + 6];
1145 
1146                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1147                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1148 
1149                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1150                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1151 
1152                 STORE_RES(dp[0    ], d0);
1153                 STORE_RES(dp[chan1], d1);
1154 
1155                 buffd[i    ] = 0;
1156                 buffd[i + 1] = 0;
1157 
1158                 sp += chan2;
1159                 dp += chan2;
1160               }
1161             }
1162 
1163           } else if (kw == 5) {
1164 
1165             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1166             p5 = buff[3];
1167 
1168             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1169             k4 = pk[4];
1170 
1171             if (l < (n - 1) || off < m) {
1172 #ifdef __SUNPRO_C
1173 #pragma pipeloop(0)
1174 #endif /* __SUNPRO_C */
1175               for (i = 0; i <= (wid - 2); i += 2) {
1176                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1177 
1178                 p4 = buff[i + 4]; p5 = buff[i + 5];
1179 
1180                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1181                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1182               }
1183 
1184             } else {
1185 #ifdef __SUNPRO_C
1186 #pragma pipeloop(0)
1187 #endif /* __SUNPRO_C */
1188               for (i = 0; i <= (wid - 2); i += 2) {
1189                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1190 
1191                 p4 = buff[i + 4]; p5 = buff[i + 5];
1192 
1193                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1194                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1195 
1196                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1197                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1198 
1199                 STORE_RES(dp[0    ], d0);
1200                 STORE_RES(dp[chan1], d1);
1201 
1202                 buffd[i    ] = 0;
1203                 buffd[i + 1] = 0;
1204 
1205                 sp += chan2;
1206                 dp += chan2;
1207               }
1208             }
1209 
1210           } else if (kw == 4) {
1211 
1212             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1213 
1214             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1215 
1216             if (l < (n - 1) || off < m) {
1217 #ifdef __SUNPRO_C
1218 #pragma pipeloop(0)
1219 #endif /* __SUNPRO_C */
1220               for (i = 0; i <= (wid - 2); i += 2) {
1221                 p0 = p2; p1 = p3; p2 = p4;
1222 
1223                 p3 = buff[i + 3]; p4 = buff[i + 4];
1224 
1225                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1226                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1227               }
1228 
1229             } else {
1230 #ifdef __SUNPRO_C
1231 #pragma pipeloop(0)
1232 #endif /* __SUNPRO_C */
1233               for (i = 0; i <= (wid - 2); i += 2) {
1234                 p0 = p2; p1 = p3; p2 = p4;
1235 
1236                 p3 = buff[i + 3]; p4 = buff[i + 4];
1237 
1238                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1239                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1240 
1241                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1242                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1243 
1244                 STORE_RES(dp[0    ], d0);
1245                 STORE_RES(dp[chan1], d1);
1246 
1247                 buffd[i    ] = 0;
1248                 buffd[i + 1] = 0;
1249 
1250                 sp += chan2;
1251                 dp += chan2;
1252               }
1253             }
1254 
1255           } else if (kw == 3) {
1256 
1257             p2 = buff[0]; p3 = buff[1];
1258             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1259 
1260             if (l < (n - 1) || off < m) {
1261 #ifdef __SUNPRO_C
1262 #pragma pipeloop(0)
1263 #endif /* __SUNPRO_C */
1264               for (i = 0; i <= (wid - 2); i += 2) {
1265                 p0 = p2; p1 = p3;
1266 
1267                 p2 = buff[i + 2]; p3 = buff[i + 3];
1268 
1269                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1270                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1271               }
1272 
1273             } else {
1274 #ifdef __SUNPRO_C
1275 #pragma pipeloop(0)
1276 #endif /* __SUNPRO_C */
1277               for (i = 0; i <= (wid - 2); i += 2) {
1278                 p0 = p2; p1 = p3;
1279 
1280                 p2 = buff[i + 2]; p3 = buff[i + 3];
1281 
1282                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1283                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1284 
1285                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1286                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1287 
1288                 STORE_RES(dp[0    ], d0);
1289                 STORE_RES(dp[chan1], d1);
1290 
1291                 buffd[i    ] = 0;
1292                 buffd[i + 1] = 0;
1293 
1294                 sp += chan2;
1295                 dp += chan2;
1296               }
1297             }
1298 
1299           } else if (kw == 2) {
1300 
1301             p2 = buff[0];
1302             k0 = pk[0]; k1 = pk[1];
1303 
1304             if (l < (n - 1) || off < m) {
1305 #ifdef __SUNPRO_C
1306 #pragma pipeloop(0)
1307 #endif /* __SUNPRO_C */
1308               for (i = 0; i <= (wid - 2); i += 2) {
1309                 p0 = p2;
1310 
1311                 p1 = buff[i + 1]; p2 = buff[i + 2];
1312 
1313                 buffd[i    ] += p0*k0 + p1*k1;
1314                 buffd[i + 1] += p1*k0 + p2*k1;
1315               }
1316 
1317             } else {
1318 #ifdef __SUNPRO_C
1319 #pragma pipeloop(0)
1320 #endif /* __SUNPRO_C */
1321               for (i = 0; i <= (wid - 2); i += 2) {
1322                 p0 = p2;
1323 
1324                 p1 = buff[i + 1]; p2 = buff[i + 2];
1325 
1326                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1327                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1328 
1329                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1330                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1331 
1332                 STORE_RES(dp[0    ], d0);
1333                 STORE_RES(dp[chan1], d1);
1334 
1335                 buffd[i    ] = 0;
1336                 buffd[i + 1] = 0;
1337 
1338                 sp += chan2;
1339                 dp += chan2;
1340               }
1341             }
1342 
1343           } else /* kw == 1 */{
1344 
1345             k0 = pk[0];
1346 
1347             if (l < (n - 1) || off < m) {
1348 #ifdef __SUNPRO_C
1349 #pragma pipeloop(0)
1350 #endif /* __SUNPRO_C */
1351               for (i = 0; i <= (wid - 2); i += 2) {
1352                 p0 = buff[i]; p1 = buff[i + 1];
1353 
1354                 buffd[i    ] += p0*k0;
1355                 buffd[i + 1] += p1*k0;
1356               }
1357 
1358             } else {
1359 #ifdef __SUNPRO_C
1360 #pragma pipeloop(0)
1361 #endif /* __SUNPRO_C */
1362               for (i = 0; i <= (wid - 2); i += 2) {
1363                 p0 = buff[i]; p1 = buff[i + 1];
1364 
1365                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
1366                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
1367 
1368                 d0 = (p0*k0 + buffd[i    ]);
1369                 d1 = (p1*k0 + buffd[i + 1]);
1370 
1371                 STORE_RES(dp[0    ], d0);
1372                 STORE_RES(dp[chan1], d1);
1373 
1374                 buffd[i    ] = 0;
1375                 buffd[i + 1] = 0;
1376 
1377                 sp += chan2;
1378                 dp += chan2;
1379               }
1380             }
1381           }
1382 
1383           pk += kw;
1384         }
1385       }
1386 
1387       /* last pixels */
1388       for (; i < wid; i++) {
1389         mlib_s32 *pk = k, x, s = 0;
1390 
1391         for (l = 0; l < n; l++) {
1392           mlib_s32 *buff = buffc[l] + i;
1393 
1394           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1395         }
1396 
1397         STORE_RES(dp[0], s);
1398 
1399         buffn[i + dx_l] = (mlib_s32)sp[0];
1400 
1401         sp += chan1;
1402         dp += chan1;
1403       }
1404 
1405       for (; i < swid; i++) {
1406         buffn[i + dx_l] = (mlib_s32)sp[0];
1407         sp += chan1;
1408       }
1409 
1410       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1411       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1412 
1413       /* next line */
1414 
1415       if (j < hgt - dy_b - 2) sl += sll;
1416       dl += dll;
1417 
1418       buff_ind++;
1419 
1420       if (buff_ind >= n + 1) buff_ind = 0;
1421     }
1422   }
1423 
1424   if (pbuff != buff) mlib_free(pbuff);
1425   if (k != k_locl) mlib_free(k);
1426 
1427   return MLIB_SUCCESS;
1428 }
1429 
1430 /***************************************************************/