1 /*
   2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
  30  *   MLIB_EDGE_SRC_EXTEND mask
  31  */
  32 
  33 #include "mlib_image.h"
  34 #include "mlib_ImageConv.h"
  35 #include "mlib_c_ImageConv.h"
  36 
  37 /*
  38  * This define switches between functions of different data types
  39  */
     /*
      * IMG_TYPE selects the pixel type this file is compiled for:
      *   1 - mlib_u8, 2 - mlib_s16, 3 - mlib_u16 (the value set below).
      * For the selected type the block below defines:
      *   DTYPE            pixel type of source and destination
      *   CONV_FUNC*       name plus parameter list (PARAM / PARAM_MxN) of the
      *                    convolution entry points; the _I variants are the
      *                    mlib_i_* integer implementations
      *   DSCALE           initial value of the kernel scale factor
      *   FROM_S32(x)      converts the clamped 32-bit result to a pixel value
      *                    (shift down, and for the unsigned types flip the
      *                    top bit)
      *   S64TOS32(x)      keeps the low 32 bits of a widened pixel (only the
      *                    signed 16-bit type needs the mask)
      *   SAT_OFF          text appended to the sum inside D2I; for the
      *                    unsigned types it subtracts 2^31 before clamping
      */
  40 
  41 #define IMG_TYPE 3
  42 
  43 /***************************************************************/
  44 #if IMG_TYPE == 1
  45 
  46 #define DTYPE             mlib_u8
  47 #define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
  48 #define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
  49 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
  50 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
  51 #define DSCALE            (1 << 24)
  52 #define FROM_S32(x)       (((x) >> 24) ^ 128)
  53 #define S64TOS32(x)       (x)
  54 #define SAT_OFF           -(1u << 31)
  55 
  56 #elif IMG_TYPE == 2
  57 
  58 #define DTYPE             mlib_s16
  59 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
  60 #define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
  61 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
  62 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
  63 #define DSCALE            65536.0
  64 #define FROM_S32(x)       ((x) >> 16)
  65 #define S64TOS32(x)       ((x) & 0xffffffff)
  66 #define SAT_OFF
  67 
  68 #elif IMG_TYPE == 3
  69 
  70 #define DTYPE             mlib_u16
  71 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
  72 #define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
  73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
  74 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
  75 #define DSCALE            65536.0
  76 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
  77 #define S64TOS32(x)       (x)
  78 #define SAT_OFF           -(1u << 31)
  79 
  80 #endif /* IMG_TYPE == 1 */
  81 
  82 /***************************************************************/
     /*
      * KSIZE1 is the kernel size minus one, i.e. the number of extra columns a
      * padded row needs beyond the output width. KSIZE itself is (re)defined
      * immediately before each fixed-size function further down.
      */
  83 #define KSIZE1 (KSIZE - 1)
  84 
  85 /***************************************************************/
     /*
      * Parameter list shared by the fixed-size (3x3, 4x4, ...) functions.
      * In the functions below dx_l/dx_r are used as the number of columns
      * filled by replicating the first/last source pixel of a row, and
      * dy_t/dy_b decide whether the source row pointer advances at the top
      * and bottom; cmask selects the channels to process.
      */
  86 #define PARAM                                                   \
  87   mlib_image       *dst,                                        \
  88   const mlib_image *src,                                        \
  89   mlib_s32         dx_l,                                        \
  90   mlib_s32         dx_r,                                        \
  91   mlib_s32         dy_t,                                        \
  92   mlib_s32         dy_b,                                        \
  93   const mlib_s32   *kern,                                       \
  94   mlib_s32         scalef_expon,                                \
  95   mlib_s32         cmask
  96 
  97 /***************************************************************/
     /*
      * Parameter list of the general MxN functions: same images, padding and
      * channel mask as PARAM, plus the kernel dimensions m and n. Note that
      * the kernel and scale parameters are named differently here
      * (kernel/scale instead of kern/scalef_expon).
      */
  98 #define PARAM_MxN                                               \
  99   mlib_image       *dst,                                        \
 100   const mlib_image *src,                                        \
 101   const mlib_s32   *kernel,                                     \
 102   mlib_s32         m,                                           \
 103   mlib_s32         n,                                           \
 104   mlib_s32         dx_l,                                        \
 105   mlib_s32         dx_r,                                        \
 106   mlib_s32         dy_t,                                        \
 107   mlib_s32         dy_b,                                        \
 108   mlib_s32         scale,                                       \
 109   mlib_s32         cmask
 110 
 111 /***************************************************************/
     /* Floating-point type used for the scaled kernel and the row buffers. */
 112 #define FTYPE mlib_d64
 113 
     /*
      * CLAMP_S32(x) converts a floating-point value to mlib_s32.
      * Without MLIB_USE_FTOI_CLAMPING the value is saturated explicitly to
      * [MLIB_S32_MIN, MLIB_S32_MAX]; note that x is evaluated up to three
      * times. With MLIB_USE_FTOI_CLAMPING it is a plain cast.
      * NOTE(review): presumably that option is meant for targets whose
      * float-to-int conversion already saturates - confirm.
      */
 114 #ifndef MLIB_USE_FTOI_CLAMPING
 115 
 116 #define CLAMP_S32(x)                                            \
 117   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))
 118 
 119 #else
 120 
 121 #define CLAMP_S32(x) ((mlib_s32)(x))
 122 
 123 #endif /* MLIB_USE_FTOI_CLAMPING */
 124 
 125 /***************************************************************/
     /*
      * D2I(x): apply the per-type offset SAT_OFF (textually appended to the
      * expression, so it is either empty or "- 2^31") and clamp to mlib_s32.
      * FROM_S32 later shifts the result down and undoes the offset.
      */
 126 #define D2I(x) CLAMP_S32((x) SAT_OFF)
 127 
 128 /***************************************************************/
     /*
      * STORE2(res0, res1) writes two results to dp[0] and dp[chan1]; which
      * result goes first depends on _LITTLE_ENDIAN. It relies on the local
      * variables dp and chan1 of the expansion site and expands to two
      * statements (the caller supplies the final semicolon).
      */
 129 #ifdef _LITTLE_ENDIAN
 130 
 131 #define STORE2(res0, res1)                                      \
 132   dp[0    ] = res1;                                             \
 133   dp[chan1] = res0
 134 
 135 #else
 136 
 137 #define STORE2(res0, res1)                                      \
 138   dp[0    ] = res0;                                             \
 139   dp[chan1] = res1
 140 
 141 #endif /* _LITTLE_ENDIAN */
 142 
 143 /***************************************************************/
     /*
      * LOAD_BUFF(buff) copies the two source pixels sp[0] and sp[chan1] into
      * the 32-bit slots buff[i] and buff[i + 1]. It relies on the local
      * variables sp, chan1 and i of the expansion site.
      * With _NO_LONGLONG this is two separate stores; otherwise both pixels
      * are packed into one mlib_s64 (S64TOS32 masks the low half) and written
      * with a single 64-bit store, the halves being ordered per endianness.
      */
 144 #ifdef _NO_LONGLONG
 145 
 146 #define LOAD_BUFF(buff)                                         \
 147   buff[i    ] = sp[0];                                          \
 148   buff[i + 1] = sp[chan1]
 149 
 150 #else /* _NO_LONGLONG */
 151 
 152 #ifdef _LITTLE_ENDIAN
 153 
 154 #define LOAD_BUFF(buff)                                         \
 155   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
 156 
 157 #else /* _LITTLE_ENDIAN */
 158 
 159 #define LOAD_BUFF(buff)                                         \
 160   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
 161 
 162 #endif /* _LITTLE_ENDIAN */
 163 #endif /* _NO_LONGLONG */
 164 
 165 /***************************************************************/
     /* 2^24 as a float constant. */
 166 #define MLIB_D2_24 16777216.0f
 167 
 168 /***************************************************************/
     /*
      * Overlay of one mlib_d64 with two mlib_s32 values. The functions below
      * use it to move a pair of 32-bit integers to or from the integer
      * scratch buffers with a single 64-bit access.
      */
 169 typedef union {
 170   mlib_d64 d64;
 171   struct {
 172     mlib_s32 i0;
 173     mlib_s32 i1;
 174   } i32s;
 175 } d64_2x32;
 176 
 177 /***************************************************************/
     /*
      * Row length, in elements, of the on-stack scratch buffer. Rows that
      * need more than this many elements are allocated with mlib_malloc.
      */
 178 #define BUFF_LINE 256
 179 
 180 /***************************************************************/
     /*
      * DEF_VARS(type) declares the locals shared by the floating-point
      * functions: source/destination base, row and pixel pointers, the
      * scratch buffer pointer pbuff (initialised to the local array "buff",
      * which must already be declared), the integer scratch views
      * buffi/buffo, image geometry, channel strides and loop counters.
      */
 181 #define DEF_VARS(type)                                          \
 182   type     *adr_src, *sl, *sp, *sl1;                            \
 183   type     *adr_dst, *dl, *dp;                                  \
 184   FTYPE    *pbuff = buff;                                       \
 185   mlib_s32 *buffi, *buffo;                                      \
 186   mlib_s32 wid, hgt, sll, dll;                                  \
 187   mlib_s32 nchannel, chan1, chan2;                              \
 188   mlib_s32 i, j, c, swid
 189 
 190 /***************************************************************/
     /*
      * LOAD_KERNEL3() declares the 3x3 working variables and loads the nine
      * kernel coefficients into k0..k8, each multiplied by
      * DSCALE / 2^scalef_expon. The division is done in steps of at most 2^30
      * so that the integer shift (1 << n) is never asked for more than 30
      * bits. The macro modifies the scalef_expon parameter.
      */
 191 #define LOAD_KERNEL3()                                                   \
 192   FTYPE    scalef = DSCALE;                                              \
 193   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8;                           \
 194   FTYPE    p00, p01, p02, p03,                                           \
 195            p10, p11, p12, p13,                                           \
 196            p20, p21, p22, p23;                                           \
 197                                                                          \
 198   while (scalef_expon > 30) {                                            \
 199     scalef /= (1 << 30);                                                 \
 200     scalef_expon -= 30;                                                  \
 201   }                                                                      \
 202                                                                          \
 203   scalef /= (1 << scalef_expon);                                         \
 204                                                                          \
 205   /* keep kernel in regs */                                              \
 206   k0 = scalef * kern[0];  k1 = scalef * kern[1];  k2 = scalef * kern[2]; \
 207   k3 = scalef * kern[3];  k4 = scalef * kern[4];  k5 = scalef * kern[5]; \
 208   k6 = scalef * kern[6];  k7 = scalef * kern[7];  k8 = scalef * kern[8]
 209 
 210 /***************************************************************/
     /*
      * LOAD_KERNEL(SIZE) computes the same scale factor as LOAD_KERNEL3 and
      * stores the SIZE scaled coefficients in the caller's array k[]. It uses
      * the caller's loop counter j and modifies scalef_expon.
      */
 211 #define LOAD_KERNEL(SIZE)                                       \
 212   FTYPE    scalef = DSCALE;                                     \
 213                                                                 \
 214   while (scalef_expon > 30) {                                   \
 215     scalef /= (1 << 30);                                        \
 216     scalef_expon -= 30;                                         \
 217   }                                                             \
 218                                                                 \
 219   scalef /= (1 << scalef_expon);                                \
 220                                                                 \
 221   for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j]
 222 
 223 /***************************************************************/
     /*
      * GET_SRC_DST_PARAMETERS(type) reads the image geometry into the locals
      * declared by DEF_VARS. Height, width and channel count are taken from
      * the source image; the strides returned by mlib_ImageGetStride are
      * divided by sizeof(type) so that sll/dll count elements, not bytes.
      */
 224 #define GET_SRC_DST_PARAMETERS(type)                            \
 225   hgt = mlib_ImageGetHeight(src);                               \
 226   wid = mlib_ImageGetWidth(src);                                \
 227   nchannel = mlib_ImageGetChannels(src);                        \
 228   sll = mlib_ImageGetStride(src) / sizeof(type);                \
 229   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
 230   adr_src = (type *)mlib_ImageGetData(src);                     \
 231   adr_dst = (type *)mlib_ImageGetData(dst)
 232 
 233 /***************************************************************/
     /*
      * CLAMP_STORE(dst, val) stores the integer val into the pixel dst,
      * saturated to the range of the pixel type. It is defined only for
      * non-sparc builds and is used by the integer convolution variant.
      * The arguments are not parenthesised and are evaluated more than once,
      * and the expansion is a bare if/else statement, so pass simple lvalues
      * and variables and do not use it as the body of an unbraced if.
      */
 234 #ifndef __sparc
 235 #if IMG_TYPE == 1
 236 
 237 /*
 238  * Test for the presence of any "1" bit in bits
 239    8 to 31 of val. If present, then val is either
 240    negative or >255. If over/underflows of 8 bits
 241    are uncommon, then this technique can be a win,
 242    since only a single test, rather than two, is
 243    necessary to determine if clamping is needed.
 244    On the other hand, if over/underflows are common,
 245    it adds an extra test.
 246 */
 247 #define CLAMP_STORE(dst, val)                                   \
 248   if (val & 0xffffff00) {                                       \
 249     if (val < MLIB_U8_MIN)                                      \
 250       dst = MLIB_U8_MIN;                                        \
 251     else                                                        \
 252       dst = MLIB_U8_MAX;                                        \
 253   } else {                                                      \
 254     dst = (mlib_u8)val;                                         \
 255   }
 256 
 257 #elif IMG_TYPE == 2
 258 
     /* Signed 16-bit: saturate to [MLIB_S16_MIN, MLIB_S16_MAX]. */
 259 #define CLAMP_STORE(dst, val)                                   \
 260   if (val >= MLIB_S16_MAX)                                      \
 261     dst = MLIB_S16_MAX;                                         \
 262   else if (val <= MLIB_S16_MIN)                                 \
 263     dst = MLIB_S16_MIN;                                         \
 264   else                                                          \
 265     dst = (mlib_s16)val
 266 
 267 #elif IMG_TYPE == 3
 268 
     /* Unsigned 16-bit: saturate to [MLIB_U16_MIN, MLIB_U16_MAX]. */
 269 #define CLAMP_STORE(dst, val)                                   \
 270   if (val >= MLIB_U16_MAX)                                      \
 271     dst = MLIB_U16_MAX;                                         \
 272   else if (val <= MLIB_U16_MIN)                                 \
 273     dst = MLIB_U16_MIN;                                         \
 274   else                                                          \
 275     dst = (mlib_u16)val
 276 
 277 #endif /* IMG_TYPE == 1 */
 278 #endif /* __sparc */
 279 
 280 /***************************************************************/
     /*
      * 3x3 convolution with replicated (source-extended) edges,
      * floating-point implementation.
      *
      *   dst, src       destination and source images of DTYPE pixels
      *   dx_l, dx_r     number of columns of each padded row that are filled
      *                  by replicating the first / last source pixel
      *   dy_t, dy_b     top / bottom padding; they control when the source
      *                  row pointer is advanced, so edge rows are reused
      *   kern           nine integer coefficients in row-major order
      *   scalef_expon   the kernel is scaled by DSCALE / 2^scalef_expon
      *   cmask          channel mask; bit (nchannel - 1 - c) enables channel c
      *
      * Three padded source rows are kept as doubles in rotating row buffers
      * (buff0..buff2) while the next row is loaded into buff3 during the
      * computation of the current output row.
      *
      * Returns MLIB_FAILURE if the scratch buffer cannot be allocated,
      * MLIB_SUCCESS otherwise.
      */
 281 #define KSIZE  3
 282 
 283 mlib_status CONV_FUNC(3x3)
 284 {
 285   FTYPE    buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
 286   DEF_VARS(DTYPE);
 287   DTYPE *sl2;
 288 #ifndef __sparc
 289   mlib_s32 d0, d1;
 290 #endif /* __sparc */
 291   LOAD_KERNEL3();
 292   GET_SRC_DST_PARAMETERS(DTYPE);
 293 
       /* Length of one padded row: output width plus KSIZE - 1 columns. */
 294   swid = wid + KSIZE1;
 295 
       /* Rows too long for the on-stack buffer use a heap scratch area. */
 296   if (swid > BUFF_LINE) {
 297     pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE   )*swid);
 298 
 299     if (pbuff == NULL) return MLIB_FAILURE;
 300   }
 301 
       /*
        * Carve up the scratch area: four rows of swid doubles, then one more
        * row reinterpreted as 32-bit integers and split between buffo
        * (results) and buffi (freshly loaded source pixels).
        */
 302   buff0 = pbuff;
 303   buff1 = buff0 + swid;
 304   buff2 = buff1 + swid;
 305   buff3 = buff2 + swid;
 306   buffo = (mlib_s32*)(buff3 + swid);
 307   buffi = buffo + (swid &~ 1);
 308 
       /*
        * From here on swid is the number of pixels actually read from a
        * source row; the remaining dx_l + dx_r columns are replicated.
        */
 309   swid -= (dx_l + dx_r);
 310 
       /* Element strides for one and two pixels of the same channel. */
 311   chan1 = nchannel;
 312   chan2 = chan1 + chan1;
 313 
 314   for (c = 0; c < nchannel; c++) {
         /* Skip channels that are not selected by cmask. */
 315     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 316 
 317     sl = adr_src + c;
 318     dl = adr_dst + c;
 319 
         /*
          * Pick the first three source rows. A pointer only moves on to the
          * next row when the conditions on dy_t / dy_b allow it; otherwise
          * the previous row is reused.
          */
 320     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
 321     else sl1 = sl;
 322 
 323     if ((hgt - dy_b) > 0) sl2 = sl1 + sll;
 324     else sl2 = sl1;
 325 
         /* Prime the three row buffers: left padding, source pixels, right padding. */
 326     for (i = 0; i < dx_l; i++) {
 327       buff0[i] = (FTYPE)sl[0];
 328       buff1[i] = (FTYPE)sl1[0];
 329       buff2[i] = (FTYPE)sl2[0];
 330     }
 331 
 332 #ifdef __SUNPRO_C
 333 #pragma pipeloop(0)
 334 #endif /* __SUNPRO_C */
 335     for (i = 0; i < swid; i++) {
 336       buff0[i + dx_l] = (FTYPE)sl[i*chan1];
 337       buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
 338       buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
 339     }
 340 
 341     for (i = 0; i < dx_r; i++) {
 342       buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
 343       buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
 344       buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
 345     }
 346 
         /* sl now addresses the row that is loaded while the first output row is computed. */
 347     if ((hgt - dy_b) > 1) sl = sl2 + sll;
 348     else sl = sl2;
 349 
 350     for (j = 0; j < hgt; j++) {
 351       FTYPE    s0, s1;
 352 
           /*
            * Start the pipeline: s0 holds the contribution of the two left
            * kernel columns to output pixel 0, s1 the contribution of the
            * left kernel column to output pixel 1.
            */
 353       p02 = buff0[0];
 354       p12 = buff1[0];
 355       p22 = buff2[0];
 356 
 357       p03 = buff0[1];
 358       p13 = buff1[1];
 359       p23 = buff2[1];
 360 
 361       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 362       s1 = p03 * k0 + p13 * k3 + p23 * k6;
 363 
 364       sp = sl;
 365       dp = dl;
 366 
           /* Main loop: two output pixels per iteration. */
 367 #ifdef __SUNPRO_C
 368 #pragma pipeloop(0)
 369 #endif /* __SUNPRO_C */
 370       for (i = 0; i <= (wid - 2); i += 2) {
 371 #ifdef __sparc
 372 #ifdef _NO_LONGLONG
 373         mlib_s32 o64_1, o64_2;
 374 #else /* _NO_LONGLONG */
 375         mlib_s64 o64;
 376 #endif /* _NO_LONGLONG */
 377 #endif /* __sparc */
 378         d64_2x32 dd;
 379 
 380         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
 381         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
 382 
             /*
              * Load two pixels of the next source row into buffi and convert
              * them to doubles in buff3, behind the left padding.
              */
 383         LOAD_BUFF(buffi);
 384 
 385         dd.d64 = *(FTYPE   *)(buffi + i);
 386         buff3[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 387         buff3[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 388 
 389 #ifndef __sparc
 390 
             /* Complete both sums, clamp to 32 bits, then restart s0/s1 for the next pair. */
 391         d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
 392         d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
 393 
 394         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 395         s1 = p03 * k0 + p13 * k3 + p23 * k6;
 396 
 397         dp[0    ] = FROM_S32(d0);
 398         dp[chan1] = FROM_S32(d1);
 399 
 400 #else /* __sparc */
 401 
             /* Same computation; the two results pass through buffo as one 64-bit value. */
 402         dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
 403         dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
 404         *(FTYPE   *)(buffo + i) = dd.d64;
 405 
 406         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 407         s1 = p03 * k0 + p13 * k3 + p23 * k6;
 408 
 409 #ifdef _NO_LONGLONG
 410 
 411         o64_1 = buffo[i];
 412         o64_2 = buffo[i+1];
 413 #if IMG_TYPE != 1
 414         STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
 415 #else
 416         STORE2(o64_1 >> 24, o64_2 >> 24);
 417 #endif /* IMG_TYPE != 1 */
 418 
 419 #else /* _NO_LONGLONG */
 420 
 421         o64 = *(mlib_s64*)(buffo + i);
 422 #if IMG_TYPE != 1
 423         STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
 424 #else
 425         STORE2(o64 >> 56, o64 >> 24);
 426 #endif /* IMG_TYPE != 1 */
 427 #endif /* _NO_LONGLONG */
 428 #endif /* __sparc */
 429 
 430         sp += chan2;
 431         dp += chan2;
 432       }
 433 
           /* Remaining output pixel when wid is odd: full nine-tap sum. */
 434       for (; i < wid; i++) {
 435         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];
 436         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
 437         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
 438 
 439         buffi[i] = (mlib_s32)sp[0];
 440         buff3[i + dx_l] = (FTYPE)buffi[i];
 441 
 442 #ifndef __sparc
 443 
 444         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
 445                  p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
 446 
 447         dp[0] = FROM_S32(d0);
 448 
 449 #else  /* __sparc */
 450 
 451         buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
 452                        p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
 453 #if IMG_TYPE != 1
 454         dp[0] = FROM_S32(buffo[i]);
 455 #else
 456         dp[0] = buffo[i] >> 24;
 457 #endif /* IMG_TYPE != 1 */
 458 #endif /* __sparc */
 459 
 460         sp += chan1;
 461         dp += chan1;
 462       }
 463 
           /* Load whatever is left of the next source row. */
 464       for (; i < swid; i++) {
 465         buffi[i] = (mlib_s32)sp[0];
 466         buff3[i + dx_l] = (FTYPE)buffi[i];
 467         sp += chan1;
 468       }
 469 
           /* Replicate the first and last loaded pixels into the padding columns. */
 470       for (i = 0; i < dx_l; i++) buff3[i] = buff3[dx_l];
 471       for (i = 0; i < dx_r; i++) buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
 472 
           /* Near the bottom the source pointer stops advancing, so the last row is reused. */
 473       if (j < hgt - dy_b - 2) sl += sll;
 474       dl += dll;
 475 
           /* Rotate the row buffers: the oldest row becomes the next load target. */
 476       buffT = buff0;
 477       buff0 = buff1;
 478       buff1 = buff2;
 479       buff2 = buff3;
 480       buff3 = buffT;
 481     }
 482   }
 483 
       /*
        * sparc, 8-bit pixels only: the stores above used a plain >> 24 instead
        * of FROM_S32, so a separate pass over the destination is run here.
        * NOTE(review): presumably mlib_ImageXor80* flips the top bit of every
        * processed pixel, which is what FROM_S32 does elsewhere - confirm.
        */
 484 #ifdef __sparc
 485 #if IMG_TYPE == 1
 486   {
 487     mlib_s32 amask = (1 << nchannel) - 1;
 488 
 489     if ((cmask & amask) != amask) {
 490       mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
 491     } else {
 492       mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
 493     }
 494   }
 495 
 496 #endif /* IMG_TYPE == 1 */
 497 #endif /* __sparc */
 498 
       /* Release the scratch area if it was heap-allocated. */
 499   if (pbuff != buff) mlib_free(pbuff);
 500 
 501   return MLIB_SUCCESS;
 502 }
 503 
 504 /***************************************************************/
 505 #ifndef __sparc /* for x86, using integer multiplies is faster */
 506 
     /*
      * 3x3 convolution with replicated (source-extended) edges, integer
      * implementation. Same parameters as the floating-point 3x3 function.
      *
      * The kernel coefficients are shifted right by shift1 (16, or 8 for
      * 8-bit pixels) before use, the products are accumulated in 32-bit
      * integers, and each sum is shifted right by
      * shift2 = scalef_expon - shift1 and saturated by CLAMP_STORE.
      * Pixels are read directly from the source image; no row buffers are
      * used. Always returns MLIB_SUCCESS.
      */
 507 mlib_status CONV_FUNC_I(3x3)
 508 {
 509   DTYPE    *adr_src, *sl, *sp0, *sp1, *sp2, *sp_1, *sp_2;
 510   DTYPE    *adr_dst, *dl, *dp;
 511   mlib_s32 wid, hgt, sll, dll;
 512   mlib_s32 nchannel, chan1, chan2, delta_chan;
 513   mlib_s32 i, j, c;
 514   mlib_s32 shift1, shift2;
 515   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
 516   mlib_s32 p02, p03,
 517            p12, p13,
 518            p22, p23;
 519 
 520 #if IMG_TYPE != 1
 521   shift1 = 16;
 522 #else
 523   shift1 = 8;
 524 #endif /* IMG_TYPE != 1 */
 525 
 526   shift2 = scalef_expon - shift1;
 527 
 528   /* keep kernel in regs */
 529   k0 = kern[0] >> shift1;  k1 = kern[1] >> shift1;  k2 = kern[2] >> shift1;
 530   k3 = kern[3] >> shift1;  k4 = kern[4] >> shift1;  k5 = kern[5] >> shift1;
 531   k6 = kern[6] >> shift1;  k7 = kern[7] >> shift1;  k8 = kern[8] >> shift1;
 532 
 533   GET_SRC_DST_PARAMETERS(DTYPE);
 534 
 535   chan1 = nchannel;
 536   chan2 = chan1 + chan1;
       /*
        * delta_chan is the offset from the first to the second kernel column
        * at the start of a row: zero when the first column is replicated,
        * one pixel otherwise.
        */
 537   delta_chan = 0;
 538 
 539   if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan = chan1;
 540 
 541   for (c = 0; c < chan1; c++) {
         /* Skip channels that are not selected by cmask. */
 542     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 543 
 544     sl = adr_src + c;
 545     dl = adr_dst + c;
 546 
         /*
          * sp_1, sp_2 and sl address the three source rows under the kernel;
          * a row is repeated whenever the dy_t / dy_b conditions prevent the
          * pointer from advancing.
          */
 547     sp_1 = sl;
 548 
 549     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll;
 550     sp_2 = sl;
 551 
 552     if ((hgt - dy_b) > 0) sl += sll;
 553 
 554     for (j = 0; j < hgt; j++) {
 555       mlib_s32 s0, s1;
 556       mlib_s32 pix0, pix1;
 557 
           /* Slide the three-row window down by one row. */
 558       dp  = dl;
 559       sp0 = sp_1;
 560       sp_1 = sp_2;
 561       sp_2 = sl;
 562 
 563       sp1 = sp_1;
 564       sp2 = sp_2;
 565 
           /*
            * Start the pipeline: s0 holds the contribution of the two left
            * kernel columns to output pixel 0, s1 the contribution of the
            * left kernel column to output pixel 1.
            */
 566       p02 = sp0[0];
 567       p12 = sp1[0];
 568       p22 = sp2[0];
 569 
 570       p03 = sp0[delta_chan];
 571       p13 = sp1[delta_chan];
 572       p23 = sp2[delta_chan];
 573 
 574       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 575       s1 = p03 * k0 + p13 * k3 + p23 * k6;
 576 
 577       sp0 += (chan1 + delta_chan);
 578       sp1 += (chan1 + delta_chan);
 579       sp2 += (chan1 + delta_chan);
 580 
           /* Main loop: two output pixels per iteration, up to the right padding. */
 581 #ifdef __SUNPRO_C
 582 #pragma pipeloop(0)
 583 #endif /* __SUNPRO_C */
 584       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
 585         p02 = sp0[0];     p12 = sp1[0];     p22 = sp2[0];
 586         p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
 587 
 588         pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
 589         pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
 590                 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
 591 
 592         CLAMP_STORE(dp[0],     pix0);
 593         CLAMP_STORE(dp[chan1], pix1);
 594 
 595         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 596         s1 = p03 * k0 + p13 * k3 + p23 * k6;
 597 
 598         sp0 += chan2;
 599         sp1 += chan2;
 600         sp2 += chan2;
 601         dp += chan2;
 602       }
 603 
           /* Switch to one pixel per iteration; s0 already covers the two left columns. */
 604       p02 = p03; p12 = p13; p22 = p23;
 605 
 606       for (; i < wid - dx_r; i++) {
 607         p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0];
 608         pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2;
 609         CLAMP_STORE(dp[0], pix0);
 610         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 611         p02 = p03; p12 = p13; p22 = p23;
 612         sp0 += chan1;
 613         sp1 += chan1;
 614         sp2 += chan1;
 615         dp += chan1;
 616       }
 617 
           /*
            * Step back to the last source column read; the loop below does not
            * advance the source pointers, so that column is reused for the
            * remaining dx_r output pixels.
            */
 618       sp0 -= chan1;
 619       sp1 -= chan1;
 620       sp2 -= chan1;
 621 
 622       for (; i < wid; i++) {
 623         p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0];
 624         pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2;
 625         CLAMP_STORE(dp[0], pix0);
 626         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 627         p02 = p03; p12 = p13; p22 = p23;
 628         dp += chan1;
 629       }
 630 
           /* Near the bottom the source pointer stops advancing, so the last row is reused. */
 631       if (j < hgt - dy_b - 1) sl += sll;
 632       dl += dll;
 633     }
 634   }
 635 
 636   return MLIB_SUCCESS;
 637 }
 638 
 639 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
 640 
 641 /***************************************************************/
     /*
      * 4x4 convolution with replicated (source-extended) edges,
      * floating-point implementation. Parameters are those of PARAM; kern
      * holds sixteen integer coefficients in row-major order.
      *
      * Four padded source rows are kept as doubles in rotating row buffers
      * (buff0..buff3) while the next row is loaded into buff4. Each output
      * row is computed in two passes of two kernel rows each; the partial
      * sums of the first pass are kept in buffd.
      *
      * Returns MLIB_FAILURE if the scratch buffer cannot be allocated,
      * MLIB_SUCCESS otherwise.
      */
 642 #undef  KSIZE
 643 #define KSIZE 4
 644 
 645 mlib_status CONV_FUNC(4x4)
 646 {
 647   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
 648   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
 649   FTYPE    k[KSIZE*KSIZE];
 650   mlib_s32 d0, d1;
 651   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7;
 652   FTYPE    p00, p01, p02, p03, p04,
 653            p10, p11, p12, p13, p14,
 654            p20, p21, p22, p23,
 655            p30, p31, p32, p33;
 656   DEF_VARS(DTYPE);
 657   DTYPE *sl2, *sl3;
 658   LOAD_KERNEL(KSIZE*KSIZE);
 659   GET_SRC_DST_PARAMETERS(DTYPE);
 660 
       /* Length of one padded row: output width plus KSIZE - 1 columns. */
 661   swid = wid + KSIZE1;
 662 
       /* Rows too long for the on-stack buffer use a heap scratch area. */
 663   if (swid > BUFF_LINE) {
 664     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE   )*swid);
 665 
 666     if (pbuff == NULL) return MLIB_FAILURE;
 667   }
 668 
       /*
        * Carve up the scratch area: five source rows, one row of partial sums
        * (buffd), and one row reinterpreted as 32-bit integers and split
        * between buffo and buffi.
        */
 669   buff0 = pbuff;
 670   buff1 = buff0 + swid;
 671   buff2 = buff1 + swid;
 672   buff3 = buff2 + swid;
 673   buff4 = buff3 + swid;
 674   buffd = buff4 + swid;
 675   buffo = (mlib_s32*)(buffd + swid);
 676   buffi = buffo + (swid &~ 1);
 677 
       /*
        * From here on swid is the number of pixels actually read from a
        * source row; the remaining dx_l + dx_r columns are replicated.
        */
 678   swid -= (dx_l + dx_r);
 679 
 680   chan1 = nchannel;
 681   chan2 = chan1 + chan1;
 682 
 683   for (c = 0; c < nchannel; c++) {
         /* Skip channels that are not selected by cmask. */
 684     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 685 
 686     sl = adr_src + c;
 687     dl = adr_dst + c;
 688 
         /*
          * Pick the first four source rows. A pointer only moves on to the
          * next row when the conditions on dy_t / dy_b allow it; otherwise
          * the previous row is reused.
          */
 689     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
 690     else sl1 = sl;
 691 
 692     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
 693     else sl2 = sl1;
 694 
 695     if ((hgt - dy_b) > 0) sl3 = sl2 + sll;
 696     else sl3 = sl2;
 697 
         /* Prime the four row buffers: left padding, source pixels, right padding. */
 698     for (i = 0; i < dx_l; i++) {
 699       buff0[i] = (FTYPE)sl[0];
 700       buff1[i] = (FTYPE)sl1[0];
 701       buff2[i] = (FTYPE)sl2[0];
 702       buff3[i] = (FTYPE)sl3[0];
 703     }
 704 
 705 #ifdef __SUNPRO_C
 706 #pragma pipeloop(0)
 707 #endif /* __SUNPRO_C */
 708     for (i = 0; i < swid; i++) {
 709       buff0[i + dx_l] = (FTYPE)sl[i*chan1];
 710       buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
 711       buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
 712       buff3[i + dx_l] = (FTYPE)sl3[i*chan1];
 713     }
 714 
 715     for (i = 0; i < dx_r; i++) {
 716       buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
 717       buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
 718       buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
 719       buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
 720     }
 721 
         /* sl now addresses the row that is loaded while the first output row is computed. */
 722     if ((hgt - dy_b) > 1) sl = sl3 + sll;
 723     else sl = sl3;
 724 
 725     for (j = 0; j < hgt; j++) {
 726       d64_2x32 dd;
 727 
 728       /*
 729        *  First loop on two first lines of kernel
 730        */
 731       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
 732       k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
 733 
 734       sp = sl;
 735       dp = dl;
 736 
 737       p02 = buff0[0];
 738       p12 = buff1[0];
 739       p03 = buff0[1];
 740       p13 = buff1[1];
 741       p04 = buff0[2];
 742 
           /*
            * Two partial sums per iteration go to buffd; the same loop also
            * loads two pixels of the next source row into buff4.
            */
 743 #ifdef __SUNPRO_C
 744 #pragma pipeloop(0)
 745 #endif /* __SUNPRO_C */
 746       for (i = 0; i <= (wid - 2); i += 2) {
 747         p00 = p02; p10 = p12;
 748         p01 = p03; p11 = p13;
 749         p02 = p04; p12 = buff1[i + 2];
 750         p03 = buff0[i + 3]; p13 = buff1[i + 3];
 751         p04 = buff0[i + 4]; p14 = buff1[i + 4];
 752 
 753         LOAD_BUFF(buffi);
 754 
 755         dd.d64 = *(FTYPE   *)(buffi + i);
 756         buff4[i + dx_l    ] = (FTYPE)dd.i32s.i0;
 757         buff4[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
 758 
 759         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
 760                         p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
 761         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
 762                         p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
 763 
 764         sp += chan2;
 765       }
 766 
 767       /*
 768        *  Second loop on two last lines of kernel
 769        */
 770       k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
 771       k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
 772 
 773       p02 = buff2[0];
 774       p12 = buff3[0];
 775       p03 = buff2[1];
 776       p13 = buff3[1];
 777       p04 = buff2[2];
 778 
           /* Add the partial sums from buffd, clamp, convert and store two pixels. */
 779 #ifdef __SUNPRO_C
 780 #pragma pipeloop(0)
 781 #endif /* __SUNPRO_C */
 782       for (i = 0; i <= (wid - 2); i += 2) {
 783         p00 = p02; p10 = p12;
 784         p01 = p03; p11 = p13;
 785         p02 = p04; p12 = buff3[i + 2];
 786         p03 = buff2[i + 3]; p13 = buff3[i + 3];
 787         p04 = buff2[i + 4]; p14 = buff3[i + 4];
 788 
 789         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
 790                  p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
 791         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
 792                  p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
 793 
 794         dp[0    ] = FROM_S32(d0);
 795         dp[chan1] = FROM_S32(d1);
 796 
 797         dp += chan2;
 798       }
 799 
 800       /* last pixels */
           /* Odd width: the remaining pixel is computed in one step with all sixteen taps. */
 801       for (; i < wid; i++) {
 802         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
 803         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
 804         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
 805         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
 806 
 807         buff4[i + dx_l] = (FTYPE)sp[0];
 808 
 809         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
 810                        p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
 811                        p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
 812                        p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
 813 
 814         dp[0] = FROM_S32(buffo[i]);
 815 
 816         sp += chan1;
 817         dp += chan1;
 818       }
 819 
           /* Load whatever is left of the next source row. */
 820       for (; i < swid; i++) {
 821         buff4[i + dx_l] = (FTYPE)sp[0];
 822         sp += chan1;
 823       }
 824 
           /* Replicate the first and last loaded pixels into the padding columns. */
 825       for (i = 0; i < dx_l; i++) buff4[i] = buff4[dx_l];
 826       for (i = 0; i < dx_r; i++) buff4[swid + dx_l + i] = buff4[swid + dx_l - 1];
 827 
 828       /* next line */
 829 
           /* Near the bottom the source pointer stops advancing, so the last row is reused. */
 830       if (j < hgt - dy_b - 2) sl += sll;
 831       dl += dll;
 832 
           /* Rotate the row buffers: the oldest row becomes the next load target. */
 833       buffT = buff0;
 834       buff0 = buff1;
 835       buff1 = buff2;
 836       buff2 = buff3;
 837       buff3 = buff4;
 838       buff4 = buffT;
 839     }
 840   }
 841 
       /* Release the scratch area if it was heap-allocated. */
 842   if (pbuff != buff) mlib_free(pbuff);
 843 
 844   return MLIB_SUCCESS;
 845 }
 846 
 847 /***************************************************************/
 848 #undef  KSIZE
 849 #define KSIZE 5
 850 
 851 mlib_status CONV_FUNC(5x5)
 852 {
 853   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
 854   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
 855   FTYPE    k[KSIZE*KSIZE];
 856   mlib_s32 d0, d1;
 857   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
 858   FTYPE    p00, p01, p02, p03, p04, p05,
 859            p10, p11, p12, p13, p14, p15,
 860            p20, p21, p22, p23, p24,
 861            p30, p31, p32, p33, p34,
 862            p40, p41, p42, p43, p44;
 863   DEF_VARS(DTYPE);
 864   DTYPE *sl2, *sl3, *sl4;
 865   LOAD_KERNEL(KSIZE*KSIZE);
 866   GET_SRC_DST_PARAMETERS(DTYPE);
 867 
 868   swid = wid + KSIZE1;
 869 
 870   if (swid > BUFF_LINE) {
 871     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE   )*swid);
 872 
 873     if (pbuff == NULL) return MLIB_FAILURE;
 874   }
 875 
 876   buff0 = pbuff;
 877   buff1 = buff0 + swid;
 878   buff2 = buff1 + swid;
 879   buff3 = buff2 + swid;
 880   buff4 = buff3 + swid;
 881   buff5 = buff4 + swid;
 882   buffd = buff5 + swid;
 883   buffo = (mlib_s32*)(buffd + swid);
 884   buffi = buffo + (swid &~ 1);
 885 
 886   swid -= (dx_l + dx_r);
 887 
 888   chan1 = nchannel;
 889   chan2 = chan1 + chan1;
 890 
 891   for (c = 0; c < nchannel; c++) {
 892     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 893 
 894     sl = adr_src + c;
 895     dl = adr_dst + c;
 896 
 897     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
 898     else sl1 = sl;
 899 
 900     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
 901     else sl2 = sl1;
 902 
 903     if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll;
 904     else sl3 = sl2;
 905 
 906     if ((hgt - dy_b) > 0) sl4 = sl3 + sll;
 907     else sl4 = sl3;
 908 
 909     for (i = 0; i < dx_l; i++) {
 910       buff0[i] = (FTYPE)sl[0];
 911       buff1[i] = (FTYPE)sl1[0];
 912       buff2[i] = (FTYPE)sl2[0];
 913       buff3[i] = (FTYPE)sl3[0];
 914       buff4[i] = (FTYPE)sl4[0];
 915     }
 916 
 917 #ifdef __SUNPRO_C
 918 #pragma pipeloop(0)
 919 #endif /* __SUNPRO_C */
 920     for (i = 0; i < swid; i++) {
 921       buff0[i + dx_l] = (FTYPE)sl[i*chan1];
 922       buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
 923       buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
 924       buff3[i + dx_l] = (FTYPE)sl3[i*chan1];
 925       buff4[i + dx_l] = (FTYPE)sl4[i*chan1];
 926     }
 927 
 928     for (i = 0; i < dx_r; i++) {
 929       buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
 930       buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
 931       buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
 932       buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
 933       buff4[swid + dx_l + i] = buff4[swid + dx_l - 1];
 934     }
 935 
 936     if ((hgt - dy_b) > 1) sl = sl4 + sll;
 937     else sl = sl4;
 938 
 939     for (j = 0; j < hgt; j++) {
 940       d64_2x32 dd;
 941 
 942       /*
 943        *  First loop
 944        */
 945       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
 946       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
 947 
 948       sp = sl;
 949       dp = dl;
 950 
 951       p02 = buff0[0];
 952       p12 = buff1[0];
 953       p03 = buff0[1];
 954       p13 = buff1[1];
 955       p04 = buff0[2];
 956       p14 = buff1[2];
 957 
 958 #ifdef __SUNPRO_C
 959 #pragma pipeloop(0)
 960 #endif /* __SUNPRO_C */
 961       for (i = 0; i <= (wid - 2); i += 2) {
 962         p00 = p02; p10 = p12;
 963         p01 = p03; p11 = p13;
 964         p02 = p04; p12 = p14;
 965 
 966         LOAD_BUFF(buffi);
 967 
 968         p03 = buff0[i + 3]; p13 = buff1[i + 3];
 969         p04 = buff0[i + 4]; p14 = buff1[i + 4];
 970         p05 = buff0[i + 5]; p15 = buff1[i + 5];
 971 
 972         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
 973                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
 974         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
 975                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
 976 
 977         sp += chan2;
 978       }
 979 
 980       /*
 981        *  Second loop
 982        */
 983       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
 984       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
 985 
 986       p02 = buff2[0];
 987       p12 = buff3[0];
 988       p03 = buff2[1];
 989       p13 = buff3[1];
 990 
 991 #ifdef __SUNPRO_C
 992 #pragma pipeloop(0)
 993 #endif /* __SUNPRO_C */
 994       for (i = 0; i <= (wid - 2); i += 2) {
 995         p00 = p02; p10 = p12;
 996         p01 = p03; p11 = p13;
 997 
 998         p02 = buff2[i + 2]; p12 = buff3[i + 2];
 999         p03 = buff2[i + 3]; p13 = buff3[i + 3];
1000         p04 = buff2[i + 4]; p14 = buff3[i + 4];
1001         p05 = buff2[i + 5]; p15 = buff3[i + 5];
1002 
1003         dd.d64 = *(FTYPE   *)(buffi + i);
1004         buff5[i + dx_l    ] = (FTYPE)dd.i32s.i0;
1005         buff5[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
1006 
1007         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1008                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1009         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1010                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1011       }
1012 
1013       /*
1014        *  3 loop
1015        */
1016       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1017 
1018       p02 = buff4[0];
1019       p03 = buff4[1];
1020       p04 = buff4[2];
1021       p05 = buff4[3];
1022 
1023 #ifdef __SUNPRO_C
1024 #pragma pipeloop(0)
1025 #endif /* __SUNPRO_C */
1026       for (i = 0; i <= (wid - 2); i += 2) {
1027         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1028 
1029         p04 = buff4[i + 4]; p05 = buff4[i + 5];
1030 
1031         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
1032         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
1033 
1034         dp[0    ] = FROM_S32(d0);
1035         dp[chan1] = FROM_S32(d1);
1036 
1037         dp += chan2;
1038       }
1039 
1040       /* last pixels */
1041       for (; i < wid; i++) {
1042         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
1043         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
1044         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
1045         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
1046         p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
1047 
1048         p40 = buff4[i];     p41 = buff4[i + 1]; p42 = buff4[i + 2];
1049         p43 = buff4[i + 3]; p44 = buff4[i + 4];
1050 
1051         buff5[i + dx_l] = (FTYPE)sp[0];
1052 
1053         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
1054                        p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
1055                        p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
1056                        p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
1057                        p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
1058 
1059         dp[0] = FROM_S32(buffo[i]);
1060 
1061         sp += chan1;
1062         dp += chan1;
1063       }
1064 
1065       for (; i < swid; i++) {
1066         buff5[i + dx_l] = (FTYPE)sp[0];
1067         sp += chan1;
1068       }
1069 
1070       for (i = 0; i < dx_l; i++) buff5[i] = buff5[dx_l];
1071       for (i = 0; i < dx_r; i++) buff5[swid + dx_l + i] = buff5[swid + dx_l - 1];
1072 
1073       /* next line */
1074 
1075       if (j < hgt - dy_b - 2) sl += sll;
1076       dl += dll;
1077 
1078       buffT = buff0;
1079       buff0 = buff1;
1080       buff1 = buff2;
1081       buff2 = buff3;
1082       buff3 = buff4;
1083       buff4 = buff5;
1084       buff5 = buffT;
1085     }
1086   }
1087 
1088   if (pbuff != buff) mlib_free(pbuff);
1089 
1090   return MLIB_SUCCESS;
1091 }
1092 
1093 /***************************************************************/
1094 #ifndef __sparc /* for x86, using integer multiplies is faster */
1095 
1096 mlib_status CONV_FUNC_I(5x5)
1097 {
1098   mlib_s32 buff[BUFF_LINE];
1099   mlib_s32 *buffd;
1100   mlib_s32 k[KSIZE*KSIZE];
1101   mlib_s32 shift1, shift2;
1102   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
1103   mlib_s32 p00, p01, p02, p03, p04, p05,
1104            p10, p11, p12, p13, p14, p15;
1105   DTYPE    *adr_src, *sl, *sp0, *sp1, *sp2, *sp3, *sp4;
1106   DTYPE    *sp_1, *sp_2, *sp_3, *sp_4;
1107   DTYPE    *adr_dst, *dl, *dp;
1108   mlib_s32 *pbuff = buff;
1109   mlib_s32 wid, hgt, sll, dll;
1110   mlib_s32 nchannel, chan1, chan2, chan4;
1111   mlib_s32 delta_chan1, delta_chan2, delta_chan3;
1112   mlib_s32 i, j, c;
1113 
1114 #if IMG_TYPE != 1
1115   shift1 = 16;
1116 #else
1117   shift1 = 8;
1118 #endif /* IMG_TYPE != 1 */
1119 
1120   shift2 = scalef_expon - shift1;
1121 
1122   for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;
1123 
1124   GET_SRC_DST_PARAMETERS(DTYPE);
1125 
1126   if (wid > BUFF_LINE) {
1127     pbuff = mlib_malloc(sizeof(mlib_s32)*wid);
1128 
1129     if (pbuff == NULL) return MLIB_FAILURE;
1130   }
1131 
1132   buffd = pbuff;
1133 
1134   chan1 = nchannel;
1135   chan2 = chan1 + chan1;
1136 
1137   if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan1 = chan1;
1138   else delta_chan1 = 0;
1139 
1140   if ((2 > dx_l) && (2 < wid + KSIZE1 - dx_r)) delta_chan2 = delta_chan1 + chan1;
1141   else delta_chan2 = delta_chan1;
1142 
1143   if ((3 > dx_l) && (3 < wid + KSIZE1 - dx_r)) delta_chan3 = delta_chan2 + chan1;
1144   else delta_chan3 = delta_chan2;
1145 
1146   chan4 = chan1 + delta_chan3;
1147 
1148   for (c = 0; c < chan1; c++) {
1149     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1150 
1151     sl = adr_src + c;
1152     dl = adr_dst + c;
1153 
1154     sp_1 = sl;
1155 
1156     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll;
1157     sp_2 = sl;
1158 
1159     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl += sll;
1160     sp_3 = sl;
1161 
1162     if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl += sll;
1163     sp_4 = sl;
1164 
1165     if ((hgt - dy_b) > 0) sl += sll;
1166 
1167     for (j = 0; j < hgt; j++) {
1168       mlib_s32 pix0, pix1;
1169 
1170       dp  = dl;
1171       sp0 = sp_1;
1172       sp_1 = sp_2;
1173       sp_2 = sp_3;
1174       sp_3 = sp_4;
1175       sp_4 = sl;
1176 
1177       sp1 = sp_1;
1178       sp2 = sp_2;
1179       sp3 = sp_3;
1180       sp4 = sp_4;
1181 
1182       /*
1183        *  First loop
1184        */
1185 
1186       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
1187       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
1188 
1189       p02 = sp0[0];           p12 = sp1[0];
1190       p03 = sp0[delta_chan1]; p13 = sp1[delta_chan1];
1191       p04 = sp0[delta_chan2]; p14 = sp1[delta_chan2];
1192       p05 = sp0[delta_chan3]; p15 = sp1[delta_chan3];
1193 
1194       sp0 += chan4;
1195       sp1 += chan4;
1196 
1197 #ifdef __SUNPRO_C
1198 #pragma pipeloop(0)
1199 #endif /* __SUNPRO_C */
1200       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1201         p00 = p02; p10 = p12;
1202         p01 = p03; p11 = p13;
1203         p02 = p04; p12 = p14;
1204         p03 = p05; p13 = p15;
1205 
1206         p04 = sp0[0];     p14 = sp1[0];
1207         p05 = sp0[chan1]; p15 = sp1[chan1];
1208 
1209         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1210                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1211         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1212                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1213 
1214         sp0 += chan2;
1215         sp1 += chan2;
1216       }
1217 
1218       p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1219       p11 = p12; p12 = p13; p13 = p14; p14 = p15;
1220 
1221       for (; i < wid - dx_r; i++) {
1222         p00 = p01; p10 = p11;
1223         p01 = p02; p11 = p12;
1224         p02 = p03; p12 = p13;
1225         p03 = p04; p13 = p14;
1226 
1227         p04 = sp0[0];     p14 = sp1[0];
1228 
1229         buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1230                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1231 
1232         sp0 += chan1;
1233         sp1 += chan1;
1234       }
1235 
1236       sp0 -= chan1;
1237       sp1 -= chan1;
1238 
1239       for (; i < wid; i++) {
1240         p00 = p01; p10 = p11;
1241         p01 = p02; p11 = p12;
1242         p02 = p03; p12 = p13;
1243         p03 = p04; p13 = p14;
1244 
1245         p04 = sp0[0];     p14 = sp1[0];
1246 
1247         buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1248                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1249       }
1250 
1251       /*
1252        *  Second loop
1253        */
1254 
1255       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
1256       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
1257 
1258       p02 = sp2[0];           p12 = sp3[0];
1259       p03 = sp2[delta_chan1]; p13 = sp3[delta_chan1];
1260       p04 = sp2[delta_chan2]; p14 = sp3[delta_chan2];
1261       p05 = sp2[delta_chan3]; p15 = sp3[delta_chan3];
1262 
1263       sp2 += chan4;
1264       sp3 += chan4;
1265 
1266 #ifdef __SUNPRO_C
1267 #pragma pipeloop(0)
1268 #endif /* __SUNPRO_C */
1269       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1270         p00 = p02; p10 = p12;
1271         p01 = p03; p11 = p13;
1272         p02 = p04; p12 = p14;
1273         p03 = p05; p13 = p15;
1274 
1275         p04 = sp2[0];     p14 = sp3[0];
1276         p05 = sp2[chan1]; p15 = sp3[chan1];
1277 
1278         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1279                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1280         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1281                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1282 
1283         sp2 += chan2;
1284         sp3 += chan2;
1285       }
1286 
1287       p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1288       p11 = p12; p12 = p13; p13 = p14; p14 = p15;
1289 
1290       for (; i < wid - dx_r; i++) {
1291         p00 = p01; p10 = p11;
1292         p01 = p02; p11 = p12;
1293         p02 = p03; p12 = p13;
1294         p03 = p04; p13 = p14;
1295 
1296         p04 = sp2[0];     p14 = sp3[0];
1297 
1298         buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1299                      p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1300 
1301         sp2 += chan1;
1302         sp3 += chan1;
1303       }
1304 
1305       sp2 -= chan1;
1306       sp3 -= chan1;
1307 
1308       for (; i < wid; i++) {
1309         p00 = p01; p10 = p11;
1310         p01 = p02; p11 = p12;
1311         p02 = p03; p12 = p13;
1312         p03 = p04; p13 = p14;
1313 
1314         p04 = sp2[0];     p14 = sp3[0];
1315 
1316         buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1317                      p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1318       }
1319 
1320       /*
1321        *  3 loop
1322        */
1323 
1324       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1325 
1326       p02 = sp4[0];
1327       p03 = sp4[delta_chan1];
1328       p04 = sp4[delta_chan2];
1329       p05 = sp4[delta_chan3];
1330 
1331       sp4 += chan4;
1332 
1333 #ifdef __SUNPRO_C
1334 #pragma pipeloop(0)
1335 #endif /* __SUNPRO_C */
1336       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1337         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1338 
1339         p04 = sp4[0]; p05 = sp4[chan1];
1340 
1341         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
1342                 p03 * k3 + p04 * k4) >> shift2;
1343         pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
1344                 p04 * k3 + p05 * k4) >> shift2;
1345 
1346         CLAMP_STORE(dp[0],     pix0);
1347         CLAMP_STORE(dp[chan1], pix1);
1348 
1349         dp  += chan2;
1350         sp4 += chan2;
1351       }
1352 
1353       p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1354 
1355       for (; i < wid - dx_r; i++) {
1356         p00 = p01; p01 = p02; p02 = p03; p03 = p04;
1357 
1358         p04 = sp4[0];
1359 
1360         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
1361                 p03 * k3 + p04 * k4) >> shift2;
1362         CLAMP_STORE(dp[0],     pix0);
1363 
1364         dp  += chan1;
1365         sp4 += chan1;
1366       }
1367 
1368       sp4 -= chan1;
1369 
1370       for (; i < wid; i++) {
1371         p00 = p01; p01 = p02; p02 = p03; p03 = p04;
1372 
1373         p04 = sp4[0];
1374 
1375         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
1376                 p03 * k3 + p04 * k4) >> shift2;
1377         CLAMP_STORE(dp[0],     pix0);
1378 
1379         dp  += chan1;
1380       }
1381 
1382       /* next line */
1383 
1384       if (j < hgt - dy_b - 1) sl += sll;
1385       dl += dll;
1386     }
1387   }
1388 
1389   if (pbuff != buff) mlib_free(pbuff);
1390 
1391   return MLIB_SUCCESS;
1392 }
1393 
1394 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1395 
1396 /***************************************************************/
1397 #if IMG_TYPE == 1
1398 
1399 #undef  KSIZE
1400 #define KSIZE 7
1401 
1402 mlib_status CONV_FUNC(7x7)
1403 {
1404   FTYPE    buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
1405   FTYPE    k[KSIZE*KSIZE];
1406   mlib_s32 l, m, buff_ind;
1407   mlib_s32 d0, d1;
1408   FTYPE    k0, k1, k2, k3, k4, k5, k6;
1409   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
1410   DTYPE *sl2, *sl3, *sl4, *sl5, *sl6;
1411   DEF_VARS(DTYPE);
1412   LOAD_KERNEL(KSIZE*KSIZE);
1413   GET_SRC_DST_PARAMETERS(DTYPE);
1414 
1415   swid = wid + KSIZE1;
1416 
1417   if (wid > BUFF_LINE) {
1418     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE   )*wid);
1419 
1420     if (pbuff == NULL) return MLIB_FAILURE;
1421   }
1422 
1423   for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*swid;
1424   for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
1425   buffd = buffs[KSIZE] + swid;
1426   buffo = (mlib_s32*)(buffd + swid);
1427   buffi = buffo + (swid &~ 1);
1428 
1429   swid -= (dx_l + dx_r);
1430 
1431   chan1 = nchannel;
1432   chan2 = chan1 + chan1;
1433 
1434   for (c = 0; c < nchannel; c++) {
1435     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1436 
1437     sl = adr_src + c;
1438     dl = adr_dst + c;
1439 
1440     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
1441     else sl1 = sl;
1442 
1443     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
1444     else sl2 = sl1;
1445 
1446     if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll;
1447     else sl3 = sl2;
1448 
1449     if ((4 > dy_t) && (4 < hgt + KSIZE1 - dy_b)) sl4 = sl3 + sll;
1450     else sl4 = sl3;
1451 
1452     if ((5 > dy_t) && (5 < hgt + KSIZE1 - dy_b)) sl5 = sl4 + sll;
1453     else sl5 = sl4;
1454 
1455     if ((hgt - dy_b) > 0) sl6 = sl5 + sll;
1456     else sl6 = sl5;
1457 
1458     for (i = 0; i < dx_l; i++) {
1459       buffs[0][i] = (FTYPE)sl[0];
1460       buffs[1][i] = (FTYPE)sl1[0];
1461       buffs[2][i] = (FTYPE)sl2[0];
1462       buffs[3][i] = (FTYPE)sl3[0];
1463       buffs[4][i] = (FTYPE)sl4[0];
1464       buffs[5][i] = (FTYPE)sl5[0];
1465       buffs[6][i] = (FTYPE)sl6[0];
1466     }
1467 
1468 #ifdef __SUNPRO_C
1469 #pragma pipeloop(0)
1470 #endif /* __SUNPRO_C */
1471     for (i = 0; i < swid; i++) {
1472       buffs[0][i + dx_l] = (FTYPE)sl[i*chan1];
1473       buffs[1][i + dx_l] = (FTYPE)sl1[i*chan1];
1474       buffs[2][i + dx_l] = (FTYPE)sl2[i*chan1];
1475       buffs[3][i + dx_l] = (FTYPE)sl3[i*chan1];
1476       buffs[4][i + dx_l] = (FTYPE)sl4[i*chan1];
1477       buffs[5][i + dx_l] = (FTYPE)sl5[i*chan1];
1478       buffs[6][i + dx_l] = (FTYPE)sl6[i*chan1];
1479     }
1480 
1481     for (i = 0; i < dx_r; i++) {
1482       buffs[0][swid + dx_l + i] = buffs[0][swid + dx_l - 1];
1483       buffs[1][swid + dx_l + i] = buffs[1][swid + dx_l - 1];
1484       buffs[2][swid + dx_l + i] = buffs[2][swid + dx_l - 1];
1485       buffs[3][swid + dx_l + i] = buffs[3][swid + dx_l - 1];
1486       buffs[4][swid + dx_l + i] = buffs[4][swid + dx_l - 1];
1487       buffs[5][swid + dx_l + i] = buffs[5][swid + dx_l - 1];
1488       buffs[6][swid + dx_l + i] = buffs[6][swid + dx_l - 1];
1489     }
1490 
1491     buff_ind = 0;
1492 
1493 #ifdef __SUNPRO_C
1494 #pragma pipeloop(0)
1495 #endif /* __SUNPRO_C */
1496     for (i = 0; i < wid; i++) buffd[i] = 0.0;
1497 
1498     if ((hgt - dy_b) > 1) sl = sl6 + sll;
1499     else sl = sl6;
1500 
1501     for (j = 0; j < hgt; j++) {
1502       FTYPE    **buffc = buffs + buff_ind;
1503       FTYPE    *buffn = buffc[KSIZE];
1504       FTYPE    *pk = k;
1505 
1506       for (l = 0; l < KSIZE; l++) {
1507         FTYPE    *buff = buffc[l];
1508         d64_2x32 dd;
1509 
1510         sp = sl;
1511         dp = dl;
1512 
1513         p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1514         p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1515 
1516         k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
1517         k4 = *pk++; k5 = *pk++; k6 = *pk++;
1518 
1519         if (l < (KSIZE - 1)) {
1520 #ifdef __SUNPRO_C
1521 #pragma pipeloop(0)
1522 #endif /* __SUNPRO_C */
1523           for (i = 0; i <= (wid - 2); i += 2) {
1524             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1525 
1526             p6 = buff[i + 6]; p7 = buff[i + 7];
1527 
1528             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1529             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1530           }
1531 
1532         } else {
1533 #ifdef __SUNPRO_C
1534 #pragma pipeloop(0)
1535 #endif /* __SUNPRO_C */
1536           for (i = 0; i <= (wid - 2); i += 2) {
1537             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1538 
1539             p6 = buff[i + 6]; p7 = buff[i + 7];
1540 
1541             LOAD_BUFF(buffi);
1542 
1543             dd.d64 = *(FTYPE   *)(buffi + i);
1544             buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
1545             buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
1546 
1547             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
1548             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1549 
1550             dp[0    ] = FROM_S32(d0);
1551             dp[chan1] = FROM_S32(d1);
1552 
1553             buffd[i    ] = 0.0;
1554             buffd[i + 1] = 0.0;
1555 
1556             sp += chan2;
1557             dp += chan2;
1558           }
1559         }
1560       }
1561 
1562       /* last pixels */
1563       for (; i < wid; i++) {
1564         FTYPE    *pk = k, s = 0;
1565         mlib_s32 d0;
1566 
1567         for (l = 0; l < KSIZE; l++) {
1568           FTYPE    *buff = buffc[l] + i;
1569 
1570           for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
1571         }
1572 
1573         d0 = D2I(s);
1574         dp[0] = FROM_S32(d0);
1575 
1576         buffn[i + dx_l] = (FTYPE)sp[0];
1577 
1578         sp += chan1;
1579         dp += chan1;
1580       }
1581 
1582       for (; i < swid; i++) {
1583         buffn[i + dx_l] = (FTYPE)sp[0];
1584         sp += chan1;
1585       }
1586 
1587       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1588       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1589 
1590       /* next line */
1591 
1592       if (j < hgt - dy_b - 2) sl += sll;
1593       dl += dll;
1594 
1595       buff_ind++;
1596 
1597       if (buff_ind >= KSIZE + 1) buff_ind = 0;
1598     }
1599   }
1600 
1601   if (pbuff != buff) mlib_free(pbuff);
1602 
1603   return MLIB_SUCCESS;
1604 }
1605 
1606 #endif /* IMG_TYPE == 1 */
1607 
1608 /***************************************************************/
1609 #define MAX_KER   7
1610 #define MAX_N    15
1611 #define BUFF_SIZE   1600
1612 #define CACHE_SIZE  (64*1024)
1613 
1614 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
1615                                          const mlib_image *src,
1616                                          const mlib_d64   *k,
1617                                          mlib_s32         n,
1618                                          mlib_s32         dy_t,
1619                                          mlib_s32         dy_b,
1620                                          mlib_s32         cmask)
1621 {
1622   DTYPE    *adr_src, *sl;
1623   DTYPE    *adr_dst, *dl, *dp;
1624   FTYPE    buff[BUFF_SIZE];
1625   FTYPE    *buffd;
1626   FTYPE    *pbuff = buff;
1627   const FTYPE    *pk;
1628   FTYPE    k0, k1, k2, k3;
1629   FTYPE    p0, p1, p2, p3, p4;
1630   FTYPE    *sbuff;
1631   mlib_s32 l, k_off, off, bsize;
1632   mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
1633   mlib_s32 d0, d1, ii;
1634   mlib_s32 wid, hgt, sll, dll;
1635   mlib_s32 nchannel;
1636   mlib_s32 i, j, c;
1637   GET_SRC_DST_PARAMETERS(DTYPE);
1638 
1639   max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);
1640 
1641   if (max_hsize < 1) max_hsize = 1;
1642   if (max_hsize > hgt) max_hsize = hgt;
1643 
1644   shgt = hgt + (n - 1);
1645   smax_hsize = max_hsize + (n - 1);
1646 
1647   bsize = 2 * (smax_hsize + 1);
1648 
1649   if (bsize > BUFF_SIZE) {
1650     pbuff = mlib_malloc(sizeof(FTYPE)*bsize);
1651 
1652     if (pbuff == NULL) return MLIB_FAILURE;
1653   }
1654 
1655   sbuff = pbuff;
1656   buffd = sbuff + smax_hsize;
1657 
1658   shgt -= (dy_t + dy_b);
1659   k_off = 0;
1660 
1661   for (l = 0; l < hgt; l += hsize) {
1662     hsize = hgt - l;
1663 
1664     if (hsize > max_hsize) hsize = max_hsize;
1665 
1666     smax_hsize = hsize + (n - 1);
1667 
1668     for (c = 0; c < nchannel; c++) {
1669       if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1670 
1671       sl = adr_src + c;
1672       dl = adr_dst + c;
1673 
1674 #ifdef __SUNPRO_C
1675 #pragma pipeloop(0)
1676 #endif /* __SUNPRO_C */
1677       for (i = 0; i < hsize; i++) buffd[i] = 0.0;
1678 
1679       for (j = 0; j < wid; j++) {
1680         FTYPE    *buff = sbuff;
1681 
1682         for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
1683           sbuff[i - k_off] = (FTYPE)sl[0];
1684         }
1685 
1686 #ifdef __SUNPRO_C
1687 #pragma pipeloop(0)
1688 #endif /* __SUNPRO_C */
1689         for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
1690           sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
1691         }
1692 
1693         for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
1694           sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
1695         }
1696 
1697         pk = k;
1698 
1699         for (off = 0; off < (n - 4); off += 4) {
1700 
1701           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1702           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1703 
1704 #ifdef __SUNPRO_C
1705 #pragma pipeloop(0)
1706 #endif /* __SUNPRO_C */
1707           for (i = 0; i < hsize; i += 2) {
1708             p0 = p2; p1 = p3; p2 = p4;
1709 
1710             p3 = buff[i + 3]; p4 = buff[i + 4];
1711 
1712             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1713             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1714           }
1715 
1716           pk += 4;
1717           buff += 4;
1718         }
1719 
1720         dp = dl;
1721         kh = n - off;
1722 
1723         if (kh == 4) {
1724           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1725           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1726 
1727 #ifdef __SUNPRO_C
1728 #pragma pipeloop(0)
1729 #endif /* __SUNPRO_C */
1730           for (i = 0; i <= (hsize - 2); i += 2) {
1731             p0 = p2; p1 = p3; p2 = p4;
1732 
1733             p3 = buff[i + 3]; p4 = buff[i + 4];
1734 
1735             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1736             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1737 
1738             dp[0  ] = FROM_S32(d0);
1739             dp[dll] = FROM_S32(d1);
1740 
1741             buffd[i    ] = 0.0;
1742             buffd[i + 1] = 0.0;
1743 
1744             dp += 2*dll;
1745           }
1746 
1747           if (i < hsize) {
1748             p0 = p2; p1 = p3; p2 = p4;
1749             p3 = buff[i + 3];
1750             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
1751             dp[0] = FROM_S32(d0);
1752             buffd[i] = 0.0;
1753           }
1754 
1755         } else if (kh == 3) {
1756 
1757           p2 = buff[0]; p3 = buff[1];
1758           k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1759 
1760 #ifdef __SUNPRO_C
1761 #pragma pipeloop(0)
1762 #endif /* __SUNPRO_C */
1763           for (i = 0; i <= (hsize - 2); i += 2) {
1764             p0 = p2; p1 = p3;
1765 
1766             p2 = buff[i + 2]; p3 = buff[i + 3];
1767 
1768             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1769             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1770 
1771             dp[0  ] = FROM_S32(d0);
1772             dp[dll] = FROM_S32(d1);
1773 
1774             buffd[i    ] = 0.0;
1775             buffd[i + 1] = 0.0;
1776 
1777             dp += 2*dll;
1778           }
1779 
1780           if (i < hsize) {
1781             p0 = p2; p1 = p3;
1782             p2 = buff[i + 2];
1783             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
1784             dp[0] = FROM_S32(d0);
1785 
1786             buffd[i] = 0.0;
1787           }
1788 
1789         } else if (kh == 2) {
1790 
1791           p2 = buff[0];
1792           k0 = pk[0]; k1 = pk[1];
1793 
1794 #ifdef __SUNPRO_C
1795 #pragma pipeloop(0)
1796 #endif /* __SUNPRO_C */
1797           for (i = 0; i <= (hsize - 2); i += 2) {
1798             p0 = p2;
1799 
1800             p1 = buff[i + 1]; p2 = buff[i + 2];
1801 
1802             d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
1803             d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
1804 
1805             dp[0  ] = FROM_S32(d0);
1806             dp[dll] = FROM_S32(d1);
1807 
1808             buffd[i    ] = 0.0;
1809             buffd[i + 1] = 0.0;
1810 
1811             dp += 2*dll;
1812           }
1813 
1814           if (i < hsize) {
1815             p0 = p2;
1816             p1 = buff[i + 1];
1817             d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
1818             dp[0] = FROM_S32(d0);
1819 
1820             buffd[i] = 0.0;
1821           }
1822 
1823         } else /* kh == 1 */{
1824 
1825           k0 = pk[0];
1826 
1827 #ifdef __SUNPRO_C
1828 #pragma pipeloop(0)
1829 #endif /* __SUNPRO_C */
1830           for (i = 0; i <= (hsize - 2); i += 2) {
1831             p0 = buff[i]; p1 = buff[i + 1];
1832 
1833             d0 = D2I(p0*k0 + buffd[i    ]);
1834             d1 = D2I(p1*k0 + buffd[i + 1]);
1835 
1836             dp[0  ] = FROM_S32(d0);
1837             dp[dll] = FROM_S32(d1);
1838 
1839             buffd[i    ] = 0.0;
1840             buffd[i + 1] = 0.0;
1841 
1842             dp += 2*dll;
1843           }
1844 
1845           if (i < hsize) {
1846             p0 = buff[i];
1847             d0 = D2I(p0*k0 + buffd[i]);
1848             dp[0] = FROM_S32(d0);
1849 
1850             buffd[i] = 0.0;
1851           }
1852         }
1853 
1854         /* next line */
1855         sl += nchannel;
1856         dl += nchannel;
1857       }
1858     }
1859 
1860     k_off += max_hsize;
1861     adr_dst += max_hsize*dll;
1862   }
1863 
1864   if (pbuff != buff) mlib_free(pbuff);
1865 
1866   return MLIB_SUCCESS;
1867 }
1868 
1869 /***************************************************************/
1870 mlib_status CONV_FUNC_MxN
1871 {
1872   DTYPE    *adr_src, *sl, *sp = NULL;
1873   DTYPE    *adr_dst, *dl, *dp = NULL;
1874   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
1875   FTYPE    **buffs = buffs_arr, *buffd;
1876   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
1877   FTYPE    *pbuff = buff;
1878   FTYPE    k0, k1, k2, k3, k4, k5, k6;
1879   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
1880   mlib_s32 *buffi;
1881   mlib_s32 mn, l, off, kw, bsize, buff_ind;
1882   mlib_s32 d0, d1;
1883   mlib_s32 wid, hgt, sll, dll;
1884   mlib_s32 nchannel, chan1, chan2;
1885   mlib_s32 i, j, c, swid;
1886   d64_2x32 dd;
1887   mlib_status status = MLIB_SUCCESS;
1888 
1889   GET_SRC_DST_PARAMETERS(DTYPE);
1890 
1891   if (scale > 30) {
1892     fscale *= 1.0/(1 << 30);
1893     scale -= 30;
1894   }
1895 
1896   fscale /= (1 << scale);
1897 
1898   mn = m*n;
1899 
1900   if (mn > 256) {
1901     k = mlib_malloc(mn*sizeof(mlib_d64));
1902 
1903     if (k == NULL) return MLIB_FAILURE;
1904   }
1905 
1906   for (i = 0; i < mn; i++) {
1907     k[i] = kernel[i]*fscale;
1908   }
1909 
1910   if (m == 1) {
1911     status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
1912     FREE_AND_RETURN_STATUS;
1913   }
1914 
1915   swid = wid + (m - 1);
1916 
1917   bsize = (n + 3)*swid;
1918 
1919   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1920     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
1921 
1922     if (pbuff == NULL) {
1923       status = MLIB_FAILURE;
1924       FREE_AND_RETURN_STATUS;
1925     }
1926     buffs = (FTYPE   **)(pbuff + bsize);
1927   }
1928 
1929   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
1930   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1931   buffd = buffs[n] + swid;
1932   buffi = (mlib_s32*)(buffd + swid);
1933 
1934   chan1 = nchannel;
1935   chan2 = chan1 + chan1;
1936 
1937   swid -= (dx_l + dx_r);
1938 
1939   for (c = 0; c < nchannel; c++) {
1940     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1941 
1942     sl = adr_src + c;
1943     dl = adr_dst + c;
1944 
1945     for (l = 0; l < n; l++) {
1946       FTYPE    *buff = buffs[l];
1947 
1948       for (i = 0; i < dx_l; i++) {
1949         buff[i] = (FTYPE)sl[0];
1950       }
1951 
1952 #ifdef __SUNPRO_C
1953 #pragma pipeloop(0)
1954 #endif /* __SUNPRO_C */
1955       for (i = 0; i < swid; i++) {
1956         buff[i + dx_l] = (FTYPE)sl[i*chan1];
1957       }
1958 
1959       for (i = 0; i < dx_r; i++) {
1960         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
1961       }
1962 
1963       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
1964     }
1965 
1966     buff_ind = 0;
1967 
1968 #ifdef __SUNPRO_C
1969 #pragma pipeloop(0)
1970 #endif /* __SUNPRO_C */
1971     for (i = 0; i < wid; i++) buffd[i] = 0.0;
1972 
1973     for (j = 0; j < hgt; j++) {
1974       FTYPE    **buffc = buffs + buff_ind;
1975       FTYPE    *buffn = buffc[n];
1976       FTYPE    *pk = k;
1977 
1978       for (l = 0; l < n; l++) {
1979         FTYPE    *buff_l = buffc[l];
1980 
1981         for (off = 0; off < m;) {
1982           FTYPE    *buff = buff_l + off;
1983 
1984           kw = m - off;
1985 
1986           if (kw > 2*MAX_KER) kw = MAX_KER; else
1987             if (kw > MAX_KER) kw = kw/2;
1988           off += kw;
1989 
1990           sp = sl;
1991           dp = dl;
1992 
1993           if (kw == 7) {
1994 
1995             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1996             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1997 
1998             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1999             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
2000 
2001             if (l < (n - 1) || off < m) {
2002 #ifdef __SUNPRO_C
2003 #pragma pipeloop(0)
2004 #endif /* __SUNPRO_C */
2005               for (i = 0; i <= (wid - 2); i += 2) {
2006                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2007 
2008                 p6 = buff[i + 6]; p7 = buff[i + 7];
2009 
2010                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
2011                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
2012               }
2013 
2014             } else {
2015 #ifdef __SUNPRO_C
2016 #pragma pipeloop(0)
2017 #endif /* __SUNPRO_C */
2018               for (i = 0; i <= (wid - 2); i += 2) {
2019                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2020 
2021                 p6 = buff[i + 6]; p7 = buff[i + 7];
2022 
2023                 LOAD_BUFF(buffi);
2024 
2025                 dd.d64 = *(FTYPE   *)(buffi + i);
2026                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
2027                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2028 
2029                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
2030                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
2031 
2032                 dp[0    ] = FROM_S32(d0);
2033                 dp[chan1] = FROM_S32(d1);
2034 
2035                 buffd[i    ] = 0.0;
2036                 buffd[i + 1] = 0.0;
2037 
2038                 sp += chan2;
2039                 dp += chan2;
2040               }
2041             }
2042 
2043           } else if (kw == 6) {
2044 
2045             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2046             p5 = buff[3]; p6 = buff[4];
2047 
2048             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2049             k4 = pk[4]; k5 = pk[5];
2050 
2051             if (l < (n - 1) || off < m) {
2052 #ifdef __SUNPRO_C
2053 #pragma pipeloop(0)
2054 #endif /* __SUNPRO_C */
2055               for (i = 0; i <= (wid - 2); i += 2) {
2056                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2057 
2058                 p5 = buff[i + 5]; p6 = buff[i + 6];
2059 
2060                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
2061                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
2062               }
2063 
2064             } else {
2065 #ifdef __SUNPRO_C
2066 #pragma pipeloop(0)
2067 #endif /* __SUNPRO_C */
2068               for (i = 0; i <= (wid - 2); i += 2) {
2069                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2070 
2071                 p5 = buff[i + 5]; p6 = buff[i + 6];
2072 
2073                 LOAD_BUFF(buffi);
2074 
2075                 dd.d64 = *(FTYPE   *)(buffi + i);
2076                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
2077                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2078 
2079                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
2080                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
2081 
2082                 dp[0    ] = FROM_S32(d0);
2083                 dp[chan1] = FROM_S32(d1);
2084 
2085                 buffd[i    ] = 0.0;
2086                 buffd[i + 1] = 0.0;
2087 
2088                 sp += chan2;
2089                 dp += chan2;
2090               }
2091             }
2092 
2093           } else if (kw == 5) {
2094 
2095             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2096             p5 = buff[3];
2097 
2098             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2099             k4 = pk[4];
2100 
2101             if (l < (n - 1) || off < m) {
2102 #ifdef __SUNPRO_C
2103 #pragma pipeloop(0)
2104 #endif /* __SUNPRO_C */
2105               for (i = 0; i <= (wid - 2); i += 2) {
2106                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2107 
2108                 p4 = buff[i + 4]; p5 = buff[i + 5];
2109 
2110                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
2111                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
2112               }
2113 
2114             } else {
2115 #ifdef __SUNPRO_C
2116 #pragma pipeloop(0)
2117 #endif /* __SUNPRO_C */
2118               for (i = 0; i <= (wid - 2); i += 2) {
2119                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2120 
2121                 p4 = buff[i + 4]; p5 = buff[i + 5];
2122 
2123                 LOAD_BUFF(buffi);
2124 
2125                 dd.d64 = *(FTYPE   *)(buffi + i);
2126                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
2127                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2128 
2129                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
2130                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
2131 
2132                 dp[0    ] = FROM_S32(d0);
2133                 dp[chan1] = FROM_S32(d1);
2134 
2135                 buffd[i    ] = 0.0;
2136                 buffd[i + 1] = 0.0;
2137 
2138                 sp += chan2;
2139                 dp += chan2;
2140               }
2141             }
2142 
2143           } else if (kw == 4) {
2144 
2145             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2146 
2147             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2148 
2149             if (l < (n - 1) || off < m) {
2150 #ifdef __SUNPRO_C
2151 #pragma pipeloop(0)
2152 #endif /* __SUNPRO_C */
2153               for (i = 0; i <= (wid - 2); i += 2) {
2154                 p0 = p2; p1 = p3; p2 = p4;
2155 
2156                 p3 = buff[i + 3]; p4 = buff[i + 4];
2157 
2158                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
2159                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
2160               }
2161 
2162             } else {
2163 #ifdef __SUNPRO_C
2164 #pragma pipeloop(0)
2165 #endif /* __SUNPRO_C */
2166               for (i = 0; i <= (wid - 2); i += 2) {
2167                 p0 = p2; p1 = p3; p2 = p4;
2168 
2169                 p3 = buff[i + 3]; p4 = buff[i + 4];
2170 
2171                 LOAD_BUFF(buffi);
2172 
2173                 dd.d64 = *(FTYPE   *)(buffi + i);
2174                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
2175                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2176 
2177                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
2178                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
2179 
2180                 dp[0    ] = FROM_S32(d0);
2181                 dp[chan1] = FROM_S32(d1);
2182 
2183                 buffd[i    ] = 0.0;
2184                 buffd[i + 1] = 0.0;
2185 
2186                 sp += chan2;
2187                 dp += chan2;
2188               }
2189             }
2190 
2191           } else if (kw == 3) {
2192 
2193             p2 = buff[0]; p3 = buff[1];
2194             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
2195 
2196             if (l < (n - 1) || off < m) {
2197 #ifdef __SUNPRO_C
2198 #pragma pipeloop(0)
2199 #endif /* __SUNPRO_C */
2200               for (i = 0; i <= (wid - 2); i += 2) {
2201                 p0 = p2; p1 = p3;
2202 
2203                 p2 = buff[i + 2]; p3 = buff[i + 3];
2204 
2205                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
2206                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
2207               }
2208 
2209             } else {
2210 #ifdef __SUNPRO_C
2211 #pragma pipeloop(0)
2212 #endif /* __SUNPRO_C */
2213               for (i = 0; i <= (wid - 2); i += 2) {
2214                 p0 = p2; p1 = p3;
2215 
2216                 p2 = buff[i + 2]; p3 = buff[i + 3];
2217 
2218                 LOAD_BUFF(buffi);
2219 
2220                 dd.d64 = *(FTYPE   *)(buffi + i);
2221                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
2222                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2223 
2224                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
2225                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
2226 
2227                 dp[0    ] = FROM_S32(d0);
2228                 dp[chan1] = FROM_S32(d1);
2229 
2230                 buffd[i    ] = 0.0;
2231                 buffd[i + 1] = 0.0;
2232 
2233                 sp += chan2;
2234                 dp += chan2;
2235               }
2236             }
2237 
2238           } else /* if (kw == 2) */ {
2239 
2240             p2 = buff[0];
2241             k0 = pk[0]; k1 = pk[1];
2242 
2243             if (l < (n - 1) || off < m) {
2244 #ifdef __SUNPRO_C
2245 #pragma pipeloop(0)
2246 #endif /* __SUNPRO_C */
2247               for (i = 0; i <= (wid - 2); i += 2) {
2248                 p0 = p2;
2249 
2250                 p1 = buff[i + 1]; p2 = buff[i + 2];
2251 
2252                 buffd[i    ] += p0*k0 + p1*k1;
2253                 buffd[i + 1] += p1*k0 + p2*k1;
2254               }
2255 
2256             } else {
2257 #ifdef __SUNPRO_C
2258 #pragma pipeloop(0)
2259 #endif /* __SUNPRO_C */
2260               for (i = 0; i <= (wid - 2); i += 2) {
2261                 p0 = p2;
2262 
2263                 p1 = buff[i + 1]; p2 = buff[i + 2];
2264 
2265                 LOAD_BUFF(buffi);
2266 
2267                 dd.d64 = *(FTYPE   *)(buffi + i);
2268                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
2269                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2270 
2271                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
2272                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
2273 
2274                 dp[0    ] = FROM_S32(d0);
2275                 dp[chan1] = FROM_S32(d1);
2276 
2277                 buffd[i    ] = 0.0;
2278                 buffd[i + 1] = 0.0;
2279 
2280                 sp += chan2;
2281                 dp += chan2;
2282               }
2283             }
2284           }
2285 
2286           pk += kw;
2287         }
2288       }
2289 
2290       /* last pixels */
2291       for (; i < wid; i++) {
2292         FTYPE    *pk = k, s = 0;
2293         mlib_s32 x, d0;
2294 
2295         for (l = 0; l < n; l++) {
2296           FTYPE    *buff = buffc[l] + i;
2297 
2298           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
2299         }
2300 
2301         d0 = D2I(s);
2302         dp[0] = FROM_S32(d0);
2303 
2304         buffn[i + dx_l] = (FTYPE)sp[0];
2305 
2306         sp += chan1;
2307         dp += chan1;
2308       }
2309 
2310       for (; i < swid; i++) {
2311         buffn[i + dx_l] = (FTYPE)sp[0];
2312         sp += chan1;
2313       }
2314 
2315       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
2316       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
2317 
2318       /* next line */
2319 
2320       if (j < hgt - dy_b - 2) sl += sll;
2321       dl += dll;
2322 
2323       buff_ind++;
2324 
2325       if (buff_ind >= n + 1) buff_ind = 0;
2326     }
2327   }
2328 
2329   FREE_AND_RETURN_STATUS;
2330 }
2331 
2332 /***************************************************************/
2333 #ifndef __sparc /* for x86, using integer multiplies is faster */
2334 
/*
 * Shift the accumulated sum x right by shift2 (x itself is modified) and
 * store it into res through CLAMP_STORE.  Wrapped in do { } while (0) so the
 * two statements always act as a single statement at the point of use.
 */
#define STORE_RES(res, x)                                       \
  do {                                                          \
    x >>= shift2;                                               \
    CLAMP_STORE(res, x);                                        \
  } while (0)
2338 
2339 mlib_status CONV_FUNC_MxN_I
2340 {
2341   DTYPE    *adr_src, *sl, *sp = NULL;
2342   DTYPE    *adr_dst, *dl, *dp = NULL;
2343   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
2344   mlib_s32 *pbuff = buff;
2345   mlib_s32 **buffs = buffs_arr, *buffd;
2346   mlib_s32 l, off, kw, bsize, buff_ind;
2347   mlib_s32 d0, d1, shift1, shift2;
2348   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
2349   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
2350   mlib_s32 wid, hgt, sll, dll;
2351   mlib_s32 nchannel, chan1;
2352   mlib_s32 i, j, c, swid;
2353   mlib_s32 chan2;
2354   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
2355   GET_SRC_DST_PARAMETERS(DTYPE);
2356 
2357 #if IMG_TYPE != 1
2358   shift1 = 16;
2359 #else
2360   shift1 = 8;
2361 #endif /* IMG_TYPE != 1 */
2362   shift2 = scale - shift1;
2363 
2364   chan1 = nchannel;
2365   chan2 = chan1 + chan1;
2366 
2367   swid = wid + (m - 1);
2368 
2369   bsize = (n + 2)*swid;
2370 
2371   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
2372     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
2373 
2374     if (pbuff == NULL) return MLIB_FAILURE;
2375     buffs = (mlib_s32 **)(pbuff + bsize);
2376   }
2377 
2378   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
2379   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
2380   buffd = buffs[n] + swid;
2381 
2382   if (m*n > MAX_N*MAX_N) {
2383     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
2384 
2385     if (k == NULL) {
2386       if (pbuff != buff) mlib_free(pbuff);
2387       return MLIB_FAILURE;
2388     }
2389   }
2390 
2391   for (i = 0; i < m*n; i++) {
2392     k[i] = kernel[i] >> shift1;
2393   }
2394 
2395   swid -= (dx_l + dx_r);
2396 
2397   for (c = 0; c < nchannel; c++) {
2398     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
2399 
2400     sl = adr_src + c;
2401     dl = adr_dst + c;
2402 
2403     for (l = 0; l < n; l++) {
2404       mlib_s32  *buff = buffs[l];
2405 
2406       for (i = 0; i < dx_l; i++) {
2407         buff[i] = (mlib_s32)sl[0];
2408       }
2409 
2410 #ifdef __SUNPRO_C
2411 #pragma pipeloop(0)
2412 #endif /* __SUNPRO_C */
2413       for (i = 0; i < swid; i++) {
2414         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
2415       }
2416 
2417       for (i = 0; i < dx_r; i++) {
2418         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
2419       }
2420 
2421       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
2422     }
2423 
2424     buff_ind = 0;
2425 
2426 #ifdef __SUNPRO_C
2427 #pragma pipeloop(0)
2428 #endif /* __SUNPRO_C */
2429     for (i = 0; i < wid; i++) buffd[i] = 0;
2430 
2431     for (j = 0; j < hgt; j++) {
2432       mlib_s32 **buffc = buffs + buff_ind;
2433       mlib_s32 *buffn = buffc[n];
2434       mlib_s32 *pk = k;
2435 
2436       for (l = 0; l < n; l++) {
2437         mlib_s32  *buff_l = buffc[l];
2438 
2439         for (off = 0; off < m;) {
2440           mlib_s32 *buff = buff_l + off;
2441 
2442           sp = sl;
2443           dp = dl;
2444 
2445           kw = m - off;
2446 
2447           if (kw > 2*MAX_KER) kw = MAX_KER; else
2448             if (kw > MAX_KER) kw = kw/2;
2449           off += kw;
2450 
2451           if (kw == 7) {
2452 
2453             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2454             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
2455 
2456             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2457             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
2458 
2459             if (l < (n - 1) || off < m) {
2460 #ifdef __SUNPRO_C
2461 #pragma pipeloop(0)
2462 #endif /* __SUNPRO_C */
2463               for (i = 0; i <= (wid - 2); i += 2) {
2464                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2465 
2466                 p6 = buff[i + 6]; p7 = buff[i + 7];
2467 
2468                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
2469                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
2470               }
2471 
2472             } else {
2473 #ifdef __SUNPRO_C
2474 #pragma pipeloop(0)
2475 #endif /* __SUNPRO_C */
2476               for (i = 0; i <= (wid - 2); i += 2) {
2477                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2478 
2479                 p6 = buff[i + 6]; p7 = buff[i + 7];
2480 
2481                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
2482                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2483 
2484                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
2485                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
2486 
2487                 STORE_RES(dp[0    ], d0);
2488                 STORE_RES(dp[chan1], d1);
2489 
2490                 buffd[i    ] = 0;
2491                 buffd[i + 1] = 0;
2492 
2493                 sp += chan2;
2494                 dp += chan2;
2495               }
2496             }
2497 
2498           } else if (kw == 6) {
2499 
2500             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2501             p5 = buff[3]; p6 = buff[4];
2502 
2503             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2504             k4 = pk[4]; k5 = pk[5];
2505 
2506             if (l < (n - 1) || off < m) {
2507 #ifdef __SUNPRO_C
2508 #pragma pipeloop(0)
2509 #endif /* __SUNPRO_C */
2510               for (i = 0; i <= (wid - 2); i += 2) {
2511                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2512 
2513                 p5 = buff[i + 5]; p6 = buff[i + 6];
2514 
2515                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
2516                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
2517               }
2518 
2519             } else {
2520 #ifdef __SUNPRO_C
2521 #pragma pipeloop(0)
2522 #endif /* __SUNPRO_C */
2523               for (i = 0; i <= (wid - 2); i += 2) {
2524                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2525 
2526                 p5 = buff[i + 5]; p6 = buff[i + 6];
2527 
2528                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
2529                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2530 
2531                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
2532                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
2533 
2534                 STORE_RES(dp[0    ], d0);
2535                 STORE_RES(dp[chan1], d1);
2536 
2537                 buffd[i    ] = 0;
2538                 buffd[i + 1] = 0;
2539 
2540                 sp += chan2;
2541                 dp += chan2;
2542               }
2543             }
2544 
2545           } else if (kw == 5) {
2546 
2547             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2548             p5 = buff[3];
2549 
2550             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2551             k4 = pk[4];
2552 
2553             if (l < (n - 1) || off < m) {
2554 #ifdef __SUNPRO_C
2555 #pragma pipeloop(0)
2556 #endif /* __SUNPRO_C */
2557               for (i = 0; i <= (wid - 2); i += 2) {
2558                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2559 
2560                 p4 = buff[i + 4]; p5 = buff[i + 5];
2561 
2562                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
2563                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
2564               }
2565 
2566             } else {
2567 #ifdef __SUNPRO_C
2568 #pragma pipeloop(0)
2569 #endif /* __SUNPRO_C */
2570               for (i = 0; i <= (wid - 2); i += 2) {
2571                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2572 
2573                 p4 = buff[i + 4]; p5 = buff[i + 5];
2574 
2575                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
2576                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2577 
2578                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
2579                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
2580 
2581                 STORE_RES(dp[0    ], d0);
2582                 STORE_RES(dp[chan1], d1);
2583 
2584                 buffd[i    ] = 0;
2585                 buffd[i + 1] = 0;
2586 
2587                 sp += chan2;
2588                 dp += chan2;
2589               }
2590             }
2591 
2592           } else if (kw == 4) {
2593 
2594             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2595 
2596             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2597 
2598             if (l < (n - 1) || off < m) {
2599 #ifdef __SUNPRO_C
2600 #pragma pipeloop(0)
2601 #endif /* __SUNPRO_C */
2602               for (i = 0; i <= (wid - 2); i += 2) {
2603                 p0 = p2; p1 = p3; p2 = p4;
2604 
2605                 p3 = buff[i + 3]; p4 = buff[i + 4];
2606 
2607                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
2608                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
2609               }
2610 
2611             } else {
2612 #ifdef __SUNPRO_C
2613 #pragma pipeloop(0)
2614 #endif /* __SUNPRO_C */
2615               for (i = 0; i <= (wid - 2); i += 2) {
2616                 p0 = p2; p1 = p3; p2 = p4;
2617 
2618                 p3 = buff[i + 3]; p4 = buff[i + 4];
2619 
2620                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
2621                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2622 
2623                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
2624                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
2625 
2626                 STORE_RES(dp[0    ], d0);
2627                 STORE_RES(dp[chan1], d1);
2628 
2629                 buffd[i    ] = 0;
2630                 buffd[i + 1] = 0;
2631 
2632                 sp += chan2;
2633                 dp += chan2;
2634               }
2635             }
2636 
2637           } else if (kw == 3) {
2638 
2639             p2 = buff[0]; p3 = buff[1];
2640             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
2641 
2642             if (l < (n - 1) || off < m) {
2643 #ifdef __SUNPRO_C
2644 #pragma pipeloop(0)
2645 #endif /* __SUNPRO_C */
2646               for (i = 0; i <= (wid - 2); i += 2) {
2647                 p0 = p2; p1 = p3;
2648 
2649                 p2 = buff[i + 2]; p3 = buff[i + 3];
2650 
2651                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
2652                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
2653               }
2654 
2655             } else {
2656 #ifdef __SUNPRO_C
2657 #pragma pipeloop(0)
2658 #endif /* __SUNPRO_C */
2659               for (i = 0; i <= (wid - 2); i += 2) {
2660                 p0 = p2; p1 = p3;
2661 
2662                 p2 = buff[i + 2]; p3 = buff[i + 3];
2663 
2664                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
2665                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2666 
2667                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
2668                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
2669 
2670                 STORE_RES(dp[0    ], d0);
2671                 STORE_RES(dp[chan1], d1);
2672 
2673                 buffd[i    ] = 0;
2674                 buffd[i + 1] = 0;
2675 
2676                 sp += chan2;
2677                 dp += chan2;
2678               }
2679             }
2680 
2681           } else if (kw == 2) {
2682 
2683             p2 = buff[0];
2684             k0 = pk[0]; k1 = pk[1];
2685 
2686             if (l < (n - 1) || off < m) {
2687 #ifdef __SUNPRO_C
2688 #pragma pipeloop(0)
2689 #endif /* __SUNPRO_C */
2690               for (i = 0; i <= (wid - 2); i += 2) {
2691                 p0 = p2;
2692 
2693                 p1 = buff[i + 1]; p2 = buff[i + 2];
2694 
2695                 buffd[i    ] += p0*k0 + p1*k1;
2696                 buffd[i + 1] += p1*k0 + p2*k1;
2697               }
2698 
2699             } else {
2700 #ifdef __SUNPRO_C
2701 #pragma pipeloop(0)
2702 #endif /* __SUNPRO_C */
2703               for (i = 0; i <= (wid - 2); i += 2) {
2704                 p0 = p2;
2705 
2706                 p1 = buff[i + 1]; p2 = buff[i + 2];
2707 
2708                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
2709                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2710 
2711                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
2712                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
2713 
2714                 STORE_RES(dp[0    ], d0);
2715                 STORE_RES(dp[chan1], d1);
2716 
2717                 buffd[i    ] = 0;
2718                 buffd[i + 1] = 0;
2719 
2720                 sp += chan2;
2721                 dp += chan2;
2722               }
2723             }
2724 
2725           } else /* kw == 1 */{
2726 
2727             k0 = pk[0];
2728 
2729             if (l < (n - 1) || off < m) {
2730 #ifdef __SUNPRO_C
2731 #pragma pipeloop(0)
2732 #endif /* __SUNPRO_C */
2733               for (i = 0; i <= (wid - 2); i += 2) {
2734                 p0 = buff[i]; p1 = buff[i + 1];
2735 
2736                 buffd[i    ] += p0*k0;
2737                 buffd[i + 1] += p1*k0;
2738               }
2739 
2740             } else {
2741 #ifdef __SUNPRO_C
2742 #pragma pipeloop(0)
2743 #endif /* __SUNPRO_C */
2744               for (i = 0; i <= (wid - 2); i += 2) {
2745                 p0 = buff[i]; p1 = buff[i + 1];
2746 
2747                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
2748                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2749 
2750                 d0 = (p0*k0 + buffd[i    ]);
2751                 d1 = (p1*k0 + buffd[i + 1]);
2752 
2753                 STORE_RES(dp[0    ], d0);
2754                 STORE_RES(dp[chan1], d1);
2755 
2756                 buffd[i    ] = 0;
2757                 buffd[i + 1] = 0;
2758 
2759                 sp += chan2;
2760                 dp += chan2;
2761               }
2762             }
2763           }
2764 
2765           pk += kw;
2766         }
2767       }
2768 
2769       /* last pixels */
2770       for (; i < wid; i++) {
2771         mlib_s32 *pk = k, x, s = 0;
2772 
2773         for (l = 0; l < n; l++) {
2774           mlib_s32 *buff = buffc[l] + i;
2775 
2776           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
2777         }
2778 
2779         STORE_RES(dp[0], s);
2780 
2781         buffn[i + dx_l] = (mlib_s32)sp[0];
2782 
2783         sp += chan1;
2784         dp += chan1;
2785       }
2786 
2787       for (; i < swid; i++) {
2788         buffn[i + dx_l] = (mlib_s32)sp[0];
2789         sp += chan1;
2790       }
2791 
2792       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
2793       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
2794 
2795       /* next line */
2796 
2797       if (j < hgt - dy_b - 2) sl += sll;
2798       dl += dll;
2799 
2800       buff_ind++;
2801 
2802       if (buff_ind >= n + 1) buff_ind = 0;
2803     }
2804   }
2805 
2806   if (pbuff != buff) mlib_free(pbuff);
2807   if (k != k_locl) mlib_free(k);
2808 
2809   return MLIB_SUCCESS;
2810 }
2811 
2812 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
2813 
2814 /***************************************************************/