1 /*
   2  * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *   Internal functions for mlib_ImageConv* on S32 type and
  30  *   MLIB_EDGE_DST_NO_WRITE mask
  31  *
  32  */
  33 
  34 #include "mlib_image.h"
  35 #include "mlib_ImageConv.h"
  36 
  37 /***************************************************************/
  38 #define BUFF_LINE  256
  39 
  40 #define CACHE_SIZE (64*1024)
  41 
  42 /***************************************************************/
  43 #define CONV_FUNC(KERN) mlib_conv##KERN##nw_s32
  44 
  45 /***************************************************************/
  46 #ifndef MLIB_USE_FTOI_CLAMPING
  47 
  48 #define CLAMP_S32(dst, src)                                       \
  49   if (src > (mlib_d64)MLIB_S32_MAX) src = (mlib_d64)MLIB_S32_MAX; \
  50   if (src < (mlib_d64)MLIB_S32_MIN) src = (mlib_d64)MLIB_S32_MIN; \
  51   dst = (mlib_s32)src
  52 
  53 #else
  54 
  55 #define CLAMP_S32(dst, src) dst = (mlib_s32)(src)
  56 
  57 #endif /* MLIB_USE_FTOI_CLAMPING */
  58 
  59 /***************************************************************/
  60 #define GET_SRC_DST_PARAMETERS(type)                            \
  61   mlib_s32 hgt = mlib_ImageGetHeight(src);                      \
  62   mlib_s32 wid = mlib_ImageGetWidth(src);                       \
  63   mlib_s32 sll = mlib_ImageGetStride(src) / sizeof(type);       \
  64   mlib_s32 dll = mlib_ImageGetStride(dst) / sizeof(type);       \
  65   type*    adr_src = mlib_ImageGetData(src);                    \
  66   type*    adr_dst = mlib_ImageGetData(dst);                    \
  67   mlib_s32 chan1 = mlib_ImageGetChannels(src)
  68 /*  mlib_s32 chan2 = chan1 + chan1 */
  69 
  70 /***************************************************************/
  71 #define DEF_VARS(type)                                          \
  72   GET_SRC_DST_PARAMETERS(type);                                 \
  73   type     *sl, *sp, *sl1, *dl, *dp;                            \
  74   mlib_d64 *pbuff = buff, *buff0, *buff1, *buff2, *buffT;       \
  75   mlib_s32 i, j, c;                                             \
  76   mlib_d64 scalef, d0, d1
  77 
  78 /***************************************************************/
  79 #define DEF_VARS_MxN(type)                                      \
  80   GET_SRC_DST_PARAMETERS(type);                                 \
  81   type     *sl, *sp = NULL, *dl, *dp = NULL;                    \
  82   mlib_d64 *pbuff = buff;                                       \
  83   mlib_s32 i, j, c
  84 
  85 /***************************************************************/
  86 #define CALC_SCALE()                                            \
  87   scalef = 1.0;                                                 \
  88   while (scalef_expon > 30) {                                   \
  89     scalef /= (1 << 30);                                        \
  90     scalef_expon -= 30;                                         \
  91   }                                                             \
  92                                                                 \
  93   scalef /= (1 << scalef_expon)
  94 
  95 /***************************************************************/
  96 #undef  KSIZE
  97 #define KSIZE 2
  98 
  99 mlib_status CONV_FUNC(2x2)(mlib_image       *dst,
 100                            const mlib_image *src,
 101                            const mlib_s32   *kern,
 102                            mlib_s32         scalef_expon,
 103                            mlib_s32         cmask)
 104 {
 105   mlib_d64 buff[(KSIZE + 1)*BUFF_LINE];
 106   mlib_d64 k0, k1, k2, k3;
 107   mlib_d64 p00, p01, p02, p03,
 108            p10, p11, p12, p13;
 109   mlib_d64 d2;
 110   DEF_VARS(mlib_s32);
 111   mlib_s32 chan2 = chan1 + chan1;
 112   mlib_s32 chan3 = chan1 + chan2;
 113 
 114   if (wid > BUFF_LINE) {
 115     pbuff = mlib_malloc((KSIZE + 1)*sizeof(mlib_d64)*wid);
 116 
 117     if (pbuff == NULL) return MLIB_FAILURE;
 118   }
 119 
 120   buff0 = pbuff;
 121   buff1 = buff0 + wid;
 122   buff2 = buff1 + wid;
 123 
 124   wid -= (KSIZE - 1);
 125   hgt -= (KSIZE - 1);
 126 
 127   /* keep kernel in regs */
 128   CALC_SCALE();
 129   k0 = scalef * kern[0];  k1 = scalef * kern[1];
 130   k2 = scalef * kern[2];  k3 = scalef * kern[3];
 131 
 132   for (c = 0; c < chan1; c++) {
 133     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 134 
 135     sl = adr_src + c;
 136     dl = adr_dst + c;
 137 
 138     sl1 = sl + sll;
 139 #ifdef __SUNPRO_C
 140 #pragma pipeloop(0)
 141 #endif /* __SUNPRO_C */
 142     for (i = 0; i < wid + (KSIZE - 1); i++) {
 143       buff0[i] = (mlib_d64)sl[i*chan1];
 144       buff1[i] = (mlib_d64)sl1[i*chan1];
 145     }
 146 
 147     sl += KSIZE*sll;
 148 
 149     for (j = 0; j < hgt; j++) {
 150       p03 = buff0[0];
 151       p13 = buff1[0];
 152 
 153       sp = sl;
 154       dp = dl;
 155 
 156 #ifdef __SUNPRO_C
 157 #pragma pipeloop(0)
 158 #endif /* __SUNPRO_C */
 159       for (i = 0; i <= (wid - 3); i += 3) {
 160 
 161         p00 = p03; p10 = p13;
 162 
 163         p01 = buff0[i + 1]; p11 = buff1[i + 1];
 164         p02 = buff0[i + 2]; p12 = buff1[i + 2];
 165         p03 = buff0[i + 3]; p13 = buff1[i + 3];
 166 
 167         buff2[i    ] = (mlib_d64)sp[0];
 168         buff2[i + 1] = (mlib_d64)sp[chan1];
 169         buff2[i + 2] = (mlib_d64)sp[chan2];
 170 
 171         d0 = p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3;
 172         d1 = p01 * k0 + p02 * k1 + p11 * k2 + p12 * k3;
 173         d2 = p02 * k0 + p03 * k1 + p12 * k2 + p13 * k3;
 174 
 175         CLAMP_S32(dp[0    ], d0);
 176         CLAMP_S32(dp[chan1], d1);
 177         CLAMP_S32(dp[chan2], d2);
 178 
 179         sp += chan3;
 180         dp += chan3;
 181       }
 182 
 183       for (; i < wid; i++) {
 184         p00 = buff0[i];     p10 = buff1[i];
 185         p01 = buff0[i + 1]; p11 = buff1[i + 1];
 186 
 187         buff2[i] = (mlib_d64)sp[0];
 188 
 189         d0 = p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3;
 190         CLAMP_S32(dp[0], d0);
 191 
 192         sp += chan1;
 193         dp += chan1;
 194       }
 195 
 196       buff2[wid] = (mlib_d64)sp[0];
 197 
 198       sl += sll;
 199       dl += dll;
 200 
 201       buffT = buff0;
 202       buff0 = buff1;
 203       buff1 = buff2;
 204       buff2 = buffT;
 205     }
 206   }
 207 
 208   if (pbuff != buff) mlib_free(pbuff);
 209 
 210   return MLIB_SUCCESS;
 211 }
 212 
 213 /***************************************************************/
 214 #undef  KSIZE
 215 #define KSIZE 3
 216 
 217 mlib_status CONV_FUNC(3x3)(mlib_image       *dst,
 218                            const mlib_image *src,
 219                            const mlib_s32   *kern,
 220                            mlib_s32         scalef_expon,
 221                            mlib_s32         cmask)
 222 {
 223   mlib_d64 buff[(KSIZE + 1)*BUFF_LINE], *buff3;
 224   mlib_d64 k0, k1, k2, k3, k4, k5, k6, k7, k8;
 225   mlib_d64 p00, p01, p02, p03,
 226            p10, p11, p12, p13,
 227            p20, p21, p22, p23;
 228   mlib_s32 *sl2;
 229   DEF_VARS(mlib_s32);
 230   mlib_s32 chan2 = chan1 + chan1;
 231 
 232   if (wid > BUFF_LINE) {
 233     pbuff = mlib_malloc((KSIZE + 1)*sizeof(mlib_d64)*wid);
 234 
 235     if (pbuff == NULL) return MLIB_FAILURE;
 236   }
 237 
 238   buff0 = pbuff;
 239   buff1 = buff0 + wid;
 240   buff2 = buff1 + wid;
 241   buff3 = buff2 + wid;
 242 
 243   wid -= (KSIZE - 1);
 244   hgt -= (KSIZE - 1);
 245 
 246   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
 247 
 248   CALC_SCALE();
 249   k0 = scalef * kern[0];  k1 = scalef * kern[1];  k2 = scalef * kern[2];
 250   k3 = scalef * kern[3];  k4 = scalef * kern[4];  k5 = scalef * kern[5];
 251   k6 = scalef * kern[6];  k7 = scalef * kern[7];  k8 = scalef * kern[8];
 252 
 253   for (c = 0; c < chan1; c++) {
 254     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 255 
 256     sl = adr_src + c;
 257     dl = adr_dst + c;
 258 
 259     sl1 = sl  + sll;
 260     sl2 = sl1 + sll;
 261 #ifdef __SUNPRO_C
 262 #pragma pipeloop(0)
 263 #endif /* __SUNPRO_C */
 264     for (i = 0; i < wid + (KSIZE - 1); i++) {
 265       buff0[i] = (mlib_d64)sl[i*chan1];
 266       buff1[i] = (mlib_d64)sl1[i*chan1];
 267       buff2[i] = (mlib_d64)sl2[i*chan1];
 268     }
 269 
 270     sl += KSIZE*sll;
 271 
 272     for (j = 0; j < hgt; j++) {
 273       mlib_d64 s0, s1;
 274 
 275       p02 = buff0[0];
 276       p12 = buff1[0];
 277       p22 = buff2[0];
 278 
 279       p03 = buff0[1];
 280       p13 = buff1[1];
 281       p23 = buff2[1];
 282 
 283       sp = sl;
 284       dp = dl;
 285 
 286       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 287       s1 = p03 * k0 + p13 * k3 + p23 * k6;
 288 
 289 #ifdef __SUNPRO_C
 290 #pragma pipeloop(0)
 291 #endif /* __SUNPRO_C */
 292       for (i = 0; i <= (wid - 2); i += 2) {
 293         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
 294         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
 295 
 296         buff3[i    ] = (mlib_d64)sp[0];
 297         buff3[i + 1] = (mlib_d64)sp[chan1];
 298 
 299         d0 = s0 + p02 * k2 + p12 * k5 + p22 * k8;
 300         d1 = s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8;
 301 
 302         CLAMP_S32(dp[0    ], d0);
 303         CLAMP_S32(dp[chan1], d1);
 304 
 305         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
 306         s1 = p03 * k0 + p13 * k3 + p23 * k6;
 307 
 308         sp += chan2;
 309         dp += chan2;
 310       }
 311 
 312       for (; i < wid; i++) {
 313         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];
 314         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
 315         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
 316 
 317         buff3[i] = (mlib_d64)sp[0];
 318 
 319         d0 = (p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
 320               p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
 321 
 322         CLAMP_S32(dp[0], d0);
 323 
 324         sp += chan1;
 325         dp += chan1;
 326       }
 327 
 328       buff3[wid    ] = (mlib_d64)sp[0];
 329       buff3[wid + 1] = (mlib_d64)sp[chan1];
 330 
 331       sl += sll;
 332       dl += dll;
 333 
 334       buffT = buff0;
 335       buff0 = buff1;
 336       buff1 = buff2;
 337       buff2 = buff3;
 338       buff3 = buffT;
 339     }
 340   }
 341 
 342   if (pbuff != buff) mlib_free(pbuff);
 343 
 344   return MLIB_SUCCESS;
 345 }
 346 
 347 /***************************************************************/
 348 #undef  KSIZE
 349 #define KSIZE 4
 350 
 351 mlib_status CONV_FUNC(4x4)(mlib_image       *dst,
 352                            const mlib_image *src,
 353                            const mlib_s32   *kern,
 354                            mlib_s32         scalef_expon,
 355                            mlib_s32         cmask)
 356 {
 357   mlib_d64 buff[(KSIZE + 2)*BUFF_LINE], *buff3, *buff4, *buff5;
 358   mlib_d64 k[KSIZE*KSIZE];
 359   mlib_d64 k0, k1, k2, k3, k4, k5, k6, k7;
 360   mlib_d64 p00, p01, p02, p03, p04,
 361            p10, p11, p12, p13, p14,
 362            p20, p21, p22, p23,
 363            p30, p31, p32, p33;
 364   mlib_s32 *sl2, *sl3;
 365   DEF_VARS(mlib_s32);
 366   mlib_s32 chan2 = chan1 + chan1;
 367 
 368   if (wid > BUFF_LINE) {
 369     pbuff = mlib_malloc((KSIZE + 2)*sizeof(mlib_d64)*wid);
 370 
 371     if (pbuff == NULL) return MLIB_FAILURE;
 372   }
 373 
 374   buff0 = pbuff;
 375   buff1 = buff0 + wid;
 376   buff2 = buff1 + wid;
 377   buff3 = buff2 + wid;
 378   buff4 = buff3 + wid;
 379   buff5 = buff4 + wid;
 380 
 381   wid -= (KSIZE - 1);
 382   hgt -= (KSIZE - 1);
 383 
 384   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
 385 
 386   CALC_SCALE();
 387   for (j = 0; j < 16; j++) k[j] = scalef * kern[j];
 388 
 389   for (c = 0; c < chan1; c++) {
 390     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 391 
 392     sl = adr_src + c;
 393     dl = adr_dst + c;
 394 
 395     sl1 = sl  + sll;
 396     sl2 = sl1 + sll;
 397     sl3 = sl2 + sll;
 398 #ifdef __SUNPRO_C
 399 #pragma pipeloop(0)
 400 #endif /* __SUNPRO_C */
 401     for (i = 0; i < wid + (KSIZE - 1); i++) {
 402       buff0[i] = (mlib_d64)sl[i*chan1];
 403       buff1[i] = (mlib_d64)sl1[i*chan1];
 404       buff2[i] = (mlib_d64)sl2[i*chan1];
 405       buff3[i] = (mlib_d64)sl3[i*chan1];
 406     }
 407 
 408     sl += KSIZE*sll;
 409 
 410     for (j = 0; j < hgt; j++) {
 411       /*
 412        *  First loop on two first lines of kernel
 413        */
 414       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
 415       k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
 416 
 417       sp = sl;
 418       dp = dl;
 419 
 420       p02 = buff0[0];
 421       p12 = buff1[0];
 422       p03 = buff0[1];
 423       p13 = buff1[1];
 424       p04 = buff0[2];
 425 
 426 #ifdef __SUNPRO_C
 427 #pragma pipeloop(0)
 428 #endif /* __SUNPRO_C */
 429       for (i = 0; i <= (wid - 2); i += 2) {
 430         p00 = p02; p10 = p12;
 431         p01 = p03; p11 = p13;
 432         p02 = p04; p12 = buff1[i + 2];
 433         p03 = buff0[i + 3]; p13 = buff1[i + 3];
 434         p04 = buff0[i + 4]; p14 = buff1[i + 4];
 435 
 436         buff4[i] = (mlib_d64)sp[0];
 437         buff4[i + 1] = (mlib_d64)sp[chan1];
 438 
 439         buff5[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
 440                         p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
 441         buff5[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
 442                         p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
 443 
 444         sp += chan2;
 445         dp += chan2;
 446       }
 447 
 448       /*
 449        *  Second loop on two last lines of kernel
 450        */
 451       k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
 452       k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
 453 
 454       sp = sl;
 455       dp = dl;
 456 
 457       p02 = buff2[0];
 458       p12 = buff3[0];
 459       p03 = buff2[1];
 460       p13 = buff3[1];
 461       p04 = buff2[2];
 462 
 463 #ifdef __SUNPRO_C
 464 #pragma pipeloop(0)
 465 #endif /* __SUNPRO_C */
 466       for (i = 0; i <= (wid - 2); i += 2) {
 467         p00 = p02; p10 = p12;
 468         p01 = p03; p11 = p13;
 469         p02 = p04; p12 = buff3[i + 2];
 470         p03 = buff2[i + 3]; p13 = buff3[i + 3];
 471         p04 = buff2[i + 4]; p14 = buff3[i + 4];
 472 
 473         d0 = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
 474               p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buff5[i]);
 475         d1 = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
 476               p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buff5[i + 1]);
 477 
 478         CLAMP_S32(dp[0    ], d0);
 479         CLAMP_S32(dp[chan1], d1);
 480 
 481         sp += chan2;
 482         dp += chan2;
 483       }
 484 
 485       /* last pixels */
 486       for (; i < wid; i++) {
 487         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
 488         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
 489         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
 490         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
 491 
 492         buff4[i] = (mlib_d64)sp[0];
 493 
 494         d0 = (p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
 495               p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
 496               p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
 497               p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
 498 
 499         CLAMP_S32(dp[0], d0);
 500 
 501         sp += chan1;
 502         dp += chan1;
 503       }
 504 
 505       buff4[wid    ] = (mlib_d64)sp[0];
 506       buff4[wid + 1] = (mlib_d64)sp[chan1];
 507       buff4[wid + 2] = (mlib_d64)sp[chan2];
 508 
 509       /* next line */
 510       sl += sll;
 511       dl += dll;
 512 
 513       buffT = buff0;
 514       buff0 = buff1;
 515       buff1 = buff2;
 516       buff2 = buff3;
 517       buff3 = buff4;
 518       buff4 = buffT;
 519     }
 520   }
 521 
 522   if (pbuff != buff) mlib_free(pbuff);
 523 
 524   return MLIB_SUCCESS;
 525 }
 526 
 527 /***************************************************************/
 528 #undef  KSIZE
 529 #define KSIZE 5
 530 
 531 mlib_status CONV_FUNC(5x5)(mlib_image       *dst,
 532                            const mlib_image *src,
 533                            const mlib_s32   *kern,
 534                            mlib_s32         scalef_expon,
 535                            mlib_s32         cmask)
 536 {
 537   mlib_d64 buff[(KSIZE + 2)*BUFF_LINE], *buff3, *buff4, *buff5, *buff6;
 538   mlib_d64 k[KSIZE*KSIZE];
 539   mlib_d64 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
 540   mlib_d64 p00, p01, p02, p03, p04, p05,
 541            p10, p11, p12, p13, p14, p15,
 542            p20, p21, p22, p23, p24,
 543            p30, p31, p32, p33, p34,
 544            p40, p41, p42, p43, p44;
 545   mlib_s32 *sl2, *sl3, *sl4;
 546   DEF_VARS(mlib_s32);
 547   mlib_s32 chan2 = chan1 + chan1;
 548   mlib_s32 chan3 = chan1 + chan2;
 549 
 550   if (wid > BUFF_LINE) {
 551     pbuff = mlib_malloc((KSIZE + 2)*sizeof(mlib_d64)*wid);
 552 
 553     if (pbuff == NULL) return MLIB_FAILURE;
 554   }
 555 
 556   buff0 = pbuff;
 557   buff1 = buff0 + wid;
 558   buff2 = buff1 + wid;
 559   buff3 = buff2 + wid;
 560   buff4 = buff3 + wid;
 561   buff5 = buff4 + wid;
 562   buff6 = buff5 + wid;
 563 
 564   wid -= (KSIZE - 1);
 565   hgt -= (KSIZE - 1);
 566 
 567   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
 568 
 569   CALC_SCALE();
 570   for (j = 0; j < 25; j++) k[j] = scalef * kern[j];
 571 
 572   for (c = 0; c < chan1; c++) {
 573     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 574 
 575     sl = adr_src + c;
 576     dl = adr_dst + c;
 577 
 578     sl1 = sl  + sll;
 579     sl2 = sl1 + sll;
 580     sl3 = sl2 + sll;
 581     sl4 = sl3 + sll;
 582 #ifdef __SUNPRO_C
 583 #pragma pipeloop(0)
 584 #endif /* __SUNPRO_C */
 585     for (i = 0; i < wid + (KSIZE - 1); i++) {
 586       buff0[i] = (mlib_d64)sl[i*chan1];
 587       buff1[i] = (mlib_d64)sl1[i*chan1];
 588       buff2[i] = (mlib_d64)sl2[i*chan1];
 589       buff3[i] = (mlib_d64)sl3[i*chan1];
 590       buff4[i] = (mlib_d64)sl4[i*chan1];
 591     }
 592 
 593     sl += KSIZE*sll;
 594 
 595     for (j = 0; j < hgt; j++) {
 596       /*
 597        *  First loop
 598        */
 599       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
 600       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
 601 
 602       sp = sl;
 603       dp = dl;
 604 
 605       p02 = buff0[0];
 606       p12 = buff1[0];
 607       p03 = buff0[1];
 608       p13 = buff1[1];
 609       p04 = buff0[2];
 610       p14 = buff1[2];
 611 
 612 #ifdef __SUNPRO_C
 613 #pragma pipeloop(0)
 614 #endif /* __SUNPRO_C */
 615       for (i = 0; i <= (wid - 2); i += 2) {
 616         p00 = p02; p10 = p12;
 617         p01 = p03; p11 = p13;
 618         p02 = p04; p12 = p14;
 619 
 620         p03 = buff0[i + 3]; p13 = buff1[i + 3];
 621         p04 = buff0[i + 4]; p14 = buff1[i + 4];
 622         p05 = buff0[i + 5]; p15 = buff1[i + 5];
 623 
 624         buff6[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
 625                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
 626         buff6[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
 627                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
 628 
 629         sp += chan2;
 630         dp += chan2;
 631       }
 632 
 633       /*
 634        *  Second loop
 635        */
 636       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
 637       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
 638 
 639       sp = sl;
 640       dp = dl;
 641 
 642       p02 = buff2[0];
 643       p12 = buff3[0];
 644       p03 = buff2[1];
 645       p13 = buff3[1];
 646 
 647 #ifdef __SUNPRO_C
 648 #pragma pipeloop(0)
 649 #endif /* __SUNPRO_C */
 650       for (i = 0; i <= (wid - 2); i += 2) {
 651         p00 = p02; p10 = p12;
 652         p01 = p03; p11 = p13;
 653 
 654         p02 = buff2[i + 2]; p12 = buff3[i + 2];
 655         p03 = buff2[i + 3]; p13 = buff3[i + 3];
 656         p04 = buff2[i + 4]; p14 = buff3[i + 4];
 657         p05 = buff2[i + 5]; p15 = buff3[i + 5];
 658 
 659         buff6[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
 660                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
 661         buff6[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
 662                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
 663 
 664         sp += chan2;
 665         dp += chan2;
 666       }
 667 
 668       /*
 669        *  3 loop
 670        */
 671       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
 672 
 673       sp = sl;
 674       dp = dl;
 675 
 676       p02 = buff4[0];
 677       p03 = buff4[1];
 678       p04 = buff4[2];
 679       p05 = buff4[3];
 680 
 681 #ifdef __SUNPRO_C
 682 #pragma pipeloop(0)
 683 #endif /* __SUNPRO_C */
 684       for (i = 0; i <= (wid - 2); i += 2) {
 685         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
 686 
 687         p04 = buff4[i + 4]; p05 = buff4[i + 5];
 688 
 689         buff5[i    ] = (mlib_d64)sp[0];
 690         buff5[i + 1] = (mlib_d64)sp[chan1];
 691 
 692         d0 = p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buff6[i];
 693         d1 = p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buff6[i + 1];
 694 
 695         CLAMP_S32(dp[0    ], d0);
 696         CLAMP_S32(dp[chan1], d1);
 697 
 698         sp += chan2;
 699         dp += chan2;
 700       }
 701 
 702       /* last pixels */
 703       for (; i < wid; i++) {
 704         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
 705         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
 706         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
 707         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
 708         p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
 709 
 710         p40 = buff4[i];        p41 = buff4[i + 1]; p42 = buff4[i + 2];
 711         p43 = buff4[i + 3]; p44 = buff4[i + 4];
 712 
 713         buff5[i] = (mlib_d64)sp[0];
 714 
 715         d0 = (p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
 716               p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
 717               p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
 718               p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
 719               p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
 720 
 721         CLAMP_S32(dp[0], d0);
 722 
 723         sp += chan1;
 724         dp += chan1;
 725       }
 726 
 727       buff5[wid    ] = (mlib_d64)sp[0];
 728       buff5[wid + 1] = (mlib_d64)sp[chan1];
 729       buff5[wid + 2] = (mlib_d64)sp[chan2];
 730       buff5[wid + 3] = (mlib_d64)sp[chan3];
 731 
 732       /* next line */
 733       sl += sll;
 734       dl += dll;
 735 
 736       buffT = buff0;
 737       buff0 = buff1;
 738       buff1 = buff2;
 739       buff2 = buff3;
 740       buff3 = buff4;
 741       buff4 = buff5;
 742       buff5 = buffT;
 743     }
 744   }
 745 
 746   if (pbuff != buff) mlib_free(pbuff);
 747 
 748   return MLIB_SUCCESS;
 749 }
 750 
 751 /***************************************************************/
 752 #undef  KSIZE
 753 #define KSIZE 7
 754 
 755 mlib_status CONV_FUNC(7x7)(mlib_image       *dst,
 756                            const mlib_image *src,
 757                            const mlib_s32   *kern,
 758                            mlib_s32         scalef_expon,
 759                            mlib_s32         cmask)
 760 {
 761   mlib_d64 buff[(KSIZE + 2)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
 762   mlib_d64 k[KSIZE*KSIZE];
 763   mlib_d64 k0, k1, k2, k3, k4, k5, k6;
 764   mlib_d64 p0, p1, p2, p3, p4, p5, p6, p7;
 765   mlib_d64 d0, d1;
 766   mlib_s32 l, m, buff_ind, *sl2, *sl3, *sl4, *sl5, *sl6;
 767   mlib_d64 scalef;
 768   DEF_VARS_MxN(mlib_s32);
 769   mlib_s32 chan2 = chan1 + chan1;
 770   mlib_s32 *sl1;
 771 
 772   if (wid > BUFF_LINE) {
 773     pbuff = mlib_malloc((KSIZE + 2)*sizeof(mlib_d64)*wid);
 774 
 775     if (pbuff == NULL) return MLIB_FAILURE;
 776   }
 777 
 778   for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*wid;
 779   for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
 780   buffd = buffs[KSIZE] + wid;
 781 
 782   wid -= (KSIZE - 1);
 783   hgt -= (KSIZE - 1);
 784 
 785   adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
 786 
 787   CALC_SCALE();
 788   for (j = 0; j < 49; j++) k[j] = scalef * kern[j];
 789 
 790   for (c = 0; c < chan1; c++) {
 791     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 792 
 793     sl = adr_src + c;
 794     dl = adr_dst + c;
 795 
 796     sl1 = sl  + sll;
 797     sl2 = sl1 + sll;
 798     sl3 = sl2 + sll;
 799     sl4 = sl3 + sll;
 800     sl5 = sl4 + sll;
 801     sl6 = sl5 + sll;
 802 #ifdef __SUNPRO_C
 803 #pragma pipeloop(0)
 804 #endif /* __SUNPRO_C */
 805     for (i = 0; i < wid + (KSIZE - 1); i++) {
 806       buffs[0][i] = (mlib_d64)sl[i*chan1];
 807       buffs[1][i] = (mlib_d64)sl1[i*chan1];
 808       buffs[2][i] = (mlib_d64)sl2[i*chan1];
 809       buffs[3][i] = (mlib_d64)sl3[i*chan1];
 810       buffs[4][i] = (mlib_d64)sl4[i*chan1];
 811       buffs[5][i] = (mlib_d64)sl5[i*chan1];
 812       buffs[6][i] = (mlib_d64)sl6[i*chan1];
 813     }
 814 
 815     buff_ind = 0;
 816 
 817 #ifdef __SUNPRO_C
 818 #pragma pipeloop(0)
 819 #endif /* __SUNPRO_C */
 820     for (i = 0; i < wid; i++) buffd[i] = 0.0;
 821 
 822     sl += KSIZE*sll;
 823 
 824     for (j = 0; j < hgt; j++) {
 825       mlib_d64 **buffc = buffs + buff_ind;
 826       mlib_d64 *buffn = buffc[KSIZE];
 827       mlib_d64 *pk = k;
 828 
 829       for (l = 0; l < KSIZE; l++) {
 830         mlib_d64 *buff = buffc[l];
 831 
 832         sp = sl;
 833         dp = dl;
 834 
 835         p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 836         p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
 837 
 838         k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
 839         k4 = *pk++; k5 = *pk++; k6 = *pk++;
 840 
 841         if (l < (KSIZE - 1)) {
 842 #ifdef __SUNPRO_C
 843 #pragma pipeloop(0)
 844 #endif /* __SUNPRO_C */
 845           for (i = 0; i <= (wid - 2); i += 2) {
 846             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 847 
 848             p6 = buff[i + 6]; p7 = buff[i + 7];
 849 
 850             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
 851             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
 852           }
 853 
 854         } else {
 855 #ifdef __SUNPRO_C
 856 #pragma pipeloop(0)
 857 #endif /* __SUNPRO_C */
 858           for (i = 0; i <= (wid - 2); i += 2) {
 859             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 860 
 861             p6 = buff[i + 6]; p7 = buff[i + 7];
 862 
 863             buffn[i    ] = (mlib_d64)sp[0];
 864             buffn[i + 1] = (mlib_d64)sp[chan1];
 865 
 866             d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ];
 867             d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1];
 868 
 869             CLAMP_S32(dp[0    ], d0);
 870             CLAMP_S32(dp[chan1], d1);
 871 
 872             buffd[i    ] = 0.0;
 873             buffd[i + 1] = 0.0;
 874 
 875             sp += chan2;
 876             dp += chan2;
 877           }
 878         }
 879       }
 880 
 881       /* last pixels */
 882       for (; i < wid; i++) {
 883         mlib_d64 *pk = k, s = 0;
 884 
 885         for (l = 0; l < KSIZE; l++) {
 886           mlib_d64 *buff = buffc[l] + i;
 887 
 888           for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
 889         }
 890 
 891         CLAMP_S32(dp[0], s);
 892 
 893         buffn[i] = (mlib_d64)sp[0];
 894 
 895         sp += chan1;
 896         dp += chan1;
 897       }
 898 
 899       for (l = 0; l < (KSIZE - 1); l++) buffn[wid + l] = sp[l*chan1];
 900 
 901       /* next line */
 902       sl += sll;
 903       dl += dll;
 904 
 905       buff_ind++;
 906 
 907       if (buff_ind >= KSIZE + 1) buff_ind = 0;
 908     }
 909   }
 910 
 911   if (pbuff != buff) mlib_free(pbuff);
 912 
 913   return MLIB_SUCCESS;
 914 }
 915 
 916 /***************************************************************/
 917 #define FTYPE  mlib_d64
 918 #define DTYPE  mlib_s32
 919 
 920 #define BUFF_SIZE  1600
 921 
 922 static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
 923                                      const mlib_image *src,
 924                                      const mlib_d64   *k,
 925                                      mlib_s32         n,
 926                                      mlib_s32         dn,
 927                                      mlib_s32         cmask)
 928 {
 929   FTYPE    buff[BUFF_SIZE];
 930   mlib_s32 off, kh;
 931   const FTYPE    *pk;
 932   FTYPE    k0, k1, k2, k3, d0, d1;
 933   FTYPE    p0, p1, p2, p3, p4;
 934   DTYPE    *sl_c, *dl_c, *sl0;
 935   mlib_s32 l, hsize, max_hsize;
 936   DEF_VARS_MxN(DTYPE);
 937 
 938   hgt -= (n - 1);
 939   adr_dst += dn*dll;
 940 
 941   max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
 942 
 943   if (!max_hsize) max_hsize = 1;
 944 
 945   if (max_hsize > BUFF_SIZE) {
 946     pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
 947   }
 948 
 949   sl_c = adr_src;
 950   dl_c = adr_dst;
 951 
 952   for (l = 0; l < hgt; l += hsize) {
 953     hsize = hgt - l;
 954 
 955     if (hsize > max_hsize) hsize = max_hsize;
 956 
 957     for (c = 0; c < chan1; c++) {
 958     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 959 
 960       sl = sl_c + c;
 961       dl = dl_c + c;
 962 
 963 #ifdef __SUNPRO_C
 964 #pragma pipeloop(0)
 965 #endif /* __SUNPRO_C */
 966       for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
 967 
 968       for (i = 0; i < wid; i++) {
 969         sl0 = sl;
 970 
 971         for (off = 0; off < (n - 4); off += 4) {
 972           pk = k + off;
 973           sp = sl0;
 974 
 975           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 976           p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
 977           sp += 3*sll;
 978 
 979 #ifdef __SUNPRO_C
 980 #pragma pipeloop(0)
 981 #endif /* __SUNPRO_C */
 982           for (j = 0; j < hsize; j += 2) {
 983             p0 = p2; p1 = p3; p2 = p4;
 984             p3 = sp[0];
 985             p4 = sp[sll];
 986 
 987             pbuff[j    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 988             pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 989 
 990             sp += 2*sll;
 991           }
 992 
 993           sl0 += 4*sll;
 994         }
 995 
 996         pk = k + off;
 997         sp = sl0;
 998 
 999         k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1000         p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1001 
1002         dp = dl;
1003         kh = n - off;
1004 
1005         if (kh == 4) {
1006           sp += 3*sll;
1007 
1008 #ifdef __SUNPRO_C
1009 #pragma pipeloop(0)
1010 #endif /* __SUNPRO_C */
1011           for (j = 0; j <= (hsize - 2); j += 2) {
1012             p0 = p2; p1 = p3; p2 = p4;
1013             p3 = sp[0];
1014             p4 = sp[sll];
1015 
1016             d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j];
1017             d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1];
1018             CLAMP_S32(dp[0  ], d0);
1019             CLAMP_S32(dp[dll], d1);
1020 
1021             pbuff[j] = 0;
1022             pbuff[j + 1] = 0;
1023 
1024             sp += 2*sll;
1025             dp += 2*dll;
1026           }
1027 
1028           if (j < hsize) {
1029             p0 = p2; p1 = p3; p2 = p4;
1030             p3 = sp[0];
1031 
1032             d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j];
1033             CLAMP_S32(dp[0], d0);
1034 
1035             pbuff[j] = 0;
1036           }
1037 
1038         } else if (kh == 3) {
1039           sp += 2*sll;
1040 
1041 #ifdef __SUNPRO_C
1042 #pragma pipeloop(0)
1043 #endif /* __SUNPRO_C */
1044           for (j = 0; j <= (hsize - 2); j += 2) {
1045             p0 = p2; p1 = p3;
1046             p2 = sp[0];
1047             p3 = sp[sll];
1048 
1049             d0 = p0*k0 + p1*k1 + p2*k2 + pbuff[j];
1050             d1 = p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1];
1051             CLAMP_S32(dp[0  ], d0);
1052             CLAMP_S32(dp[dll], d1);
1053 
1054             pbuff[j] = 0;
1055             pbuff[j + 1] = 0;
1056 
1057             sp += 2*sll;
1058             dp += 2*dll;
1059           }
1060 
1061           if (j < hsize) {
1062             p0 = p2; p1 = p3;
1063             p2 = sp[0];
1064 
1065             d0 = p0*k0 + p1*k1 + p2*k2 + pbuff[j];
1066             CLAMP_S32(dp[0], d0);
1067 
1068             pbuff[j] = 0;
1069           }
1070 
1071         } else if (kh == 2) {
1072           sp += sll;
1073 
1074 #ifdef __SUNPRO_C
1075 #pragma pipeloop(0)
1076 #endif /* __SUNPRO_C */
1077           for (j = 0; j <= (hsize - 2); j += 2) {
1078             p0 = p2;
1079             p1 = sp[0];
1080             p2 = sp[sll];
1081 
1082             d0 = p0*k0 + p1*k1 + pbuff[j];
1083             d1 = p1*k0 + p2*k1 + pbuff[j + 1];
1084             CLAMP_S32(dp[0  ], d0);
1085             CLAMP_S32(dp[dll], d1);
1086 
1087             pbuff[j] = 0;
1088             pbuff[j + 1] = 0;
1089 
1090             sp += 2*sll;
1091             dp += 2*dll;
1092           }
1093 
1094           if (j < hsize) {
1095             p0 = p2;
1096             p1 = sp[0];
1097 
1098             d0 = p0*k0 + p1*k1 + pbuff[j];
1099             CLAMP_S32(dp[0], d0);
1100 
1101             pbuff[j] = 0;
1102           }
1103 
1104         } else /* if (kh == 1) */ {
1105 #ifdef __SUNPRO_C
1106 #pragma pipeloop(0)
1107 #endif /* __SUNPRO_C */
1108           for (j = 0; j < hsize; j++) {
1109             p0 = sp[0];
1110 
1111             d0 = p0*k0 + pbuff[j];
1112             CLAMP_S32(dp[0], d0);
1113 
1114             pbuff[j] = 0;
1115 
1116             sp += sll;
1117             dp += dll;
1118           }
1119         }
1120 
1121         sl += chan1;
1122         dl += chan1;
1123       }
1124     }
1125 
1126     sl_c += max_hsize*sll;
1127     dl_c += max_hsize*dll;
1128   }
1129 
1130   if (pbuff != buff) mlib_free(pbuff);
1131 
1132   return MLIB_SUCCESS;
1133 }
1134 
1135 /***************************************************************/
/* Largest number of kernel taps handled by one unrolled pass over a row
   in the MxN convolution below */
#define MAX_KER 7

/* Largest kernel height for which the on-stack row-pointer table
   (2*(MAX_N + 1) entries) of the MxN convolution is large enough */
#define MAX_N     15

/* Number of mlib_d64 elements in the on-stack scratch buffer of the
   MxN convolution below (replaces the value used by mlib_ImageConv1xN) */
#undef  BUFF_SIZE
#define BUFF_SIZE 1500
1142 
1143 mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
1144                            const mlib_image *src,
1145                            const mlib_s32   *kernel,
1146                            mlib_s32         m,
1147                            mlib_s32         n,
1148                            mlib_s32         dm,
1149                            mlib_s32         dn,
1150                            mlib_s32         scale,
1151                            mlib_s32         cmask)
1152 {
1153   mlib_d64  buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
1154   mlib_d64  **buffs = buffs_arr, *buffd;
1155   mlib_d64  akernel[256], *k = akernel, fscale = 1.0;
1156   mlib_s32  l, off, kw, bsize, buff_ind, mn;
1157   mlib_d64  d0, d1;
1158   mlib_d64  k0, k1, k2, k3, k4, k5, k6;
1159   mlib_d64  p0, p1, p2, p3, p4, p5, p6, p7;
1160   DEF_VARS_MxN(mlib_s32);
1161   mlib_s32 chan2 = chan1 + chan1;
1162 
1163   mlib_status status = MLIB_SUCCESS;
1164 
1165   if (scale > 30) {
1166     fscale *= 1.0/(1 << 30);
1167     scale -= 30;
1168   }
1169 
1170   fscale /= (1 << scale);
1171 
1172   mn = m*n;
1173 
1174   if (mn > 256) {
1175     k = mlib_malloc(mn*sizeof(mlib_d64));
1176 
1177     if (k == NULL) return MLIB_FAILURE;
1178   }
1179 
1180   for (i = 0; i < mn; i++) {
1181     k[i] = kernel[i]*fscale;
1182   }
1183 
1184   if (m == 1) {
1185     status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
1186     FREE_AND_RETURN_STATUS;
1187   }
1188 
1189   bsize = (n + 2)*wid;
1190 
1191   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1192     pbuff = mlib_malloc(sizeof(mlib_d64)*bsize + sizeof(mlib_d64*)*2*(n + 1));
1193 
1194     if (pbuff == NULL) {
1195       status = MLIB_FAILURE;
1196       FREE_AND_RETURN_STATUS;
1197     }
1198     buffs = (mlib_d64**)(pbuff + bsize);
1199   }
1200 
1201   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
1202   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1203   buffd = buffs[n] + wid;
1204 
1205   wid -= (m - 1);
1206   hgt -= (n - 1);
1207   adr_dst += dn*dll + dm*chan1;
1208 
1209   for (c = 0; c < chan1; c++) {
1210     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1211 
1212     sl = adr_src + c;
1213     dl = adr_dst + c;
1214 
1215     for (l = 0; l < n; l++) {
1216       mlib_d64 *buff = buffs[l];
1217 
1218 #ifdef __SUNPRO_C
1219 #pragma pipeloop(0)
1220 #endif /* __SUNPRO_C */
1221       for (i = 0; i < wid + (m - 1); i++) {
1222         buff[i] = (mlib_d64)sl[i*chan1];
1223       }
1224 
1225       sl += sll;
1226     }
1227 
1228     buff_ind = 0;
1229 
1230 #ifdef __SUNPRO_C
1231 #pragma pipeloop(0)
1232 #endif /* __SUNPRO_C */
1233     for (i = 0; i < wid; i++) buffd[i] = 0.0;
1234 
1235     for (j = 0; j < hgt; j++) {
1236       mlib_d64 **buffc = buffs + buff_ind;
1237       mlib_d64 *buffn = buffc[n];
1238       mlib_d64 *pk = k;
1239 
1240       for (l = 0; l < n; l++) {
1241         mlib_d64 *buff_l = buffc[l];
1242 
1243         for (off = 0; off < m;) {
1244           mlib_d64 *buff = buff_l + off;
1245 
1246           kw = m - off;
1247 
1248           if (kw > 2*MAX_KER) kw = MAX_KER; else
1249             if (kw > MAX_KER) kw = kw/2;
1250           off += kw;
1251 
1252           sp = sl;
1253           dp = dl;
1254 
1255           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1256           p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1257 
1258           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1259           k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1260           pk += kw;
1261 
1262           if (kw == 7) {
1263 
1264             if (l < (n - 1) || off < m) {
1265 #ifdef __SUNPRO_C
1266 #pragma pipeloop(0)
1267 #endif /* __SUNPRO_C */
1268               for (i = 0; i <= (wid - 2); i += 2) {
1269                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1270 
1271                 p6 = buff[i + 6]; p7 = buff[i + 7];
1272 
1273                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1274                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1275               }
1276 
1277             } else {
1278 #ifdef __SUNPRO_C
1279 #pragma pipeloop(0)
1280 #endif /* __SUNPRO_C */
1281               for (i = 0; i <= (wid - 2); i += 2) {
1282                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1283 
1284                 p6 = buff[i + 6]; p7 = buff[i + 7];
1285 
1286                 buffn[i    ] = (mlib_d64)sp[0];
1287                 buffn[i + 1] = (mlib_d64)sp[chan1];
1288 
1289                 d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ];
1290                 d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1];
1291 
1292                 CLAMP_S32(dp[0],     d0);
1293                 CLAMP_S32(dp[chan1], d1);
1294 
1295                 buffd[i    ] = 0.0;
1296                 buffd[i + 1] = 0.0;
1297 
1298                 sp += chan2;
1299                 dp += chan2;
1300               }
1301             }
1302 
1303           } else if (kw == 6) {
1304 
1305             if (l < (n - 1) || off < m) {
1306 #ifdef __SUNPRO_C
1307 #pragma pipeloop(0)
1308 #endif /* __SUNPRO_C */
1309               for (i = 0; i <= (wid - 2); i += 2) {
1310                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1311 
1312                 p5 = buff[i + 5]; p6 = buff[i + 6];
1313 
1314                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1315                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1316               }
1317 
1318             } else {
1319 #ifdef __SUNPRO_C
1320 #pragma pipeloop(0)
1321 #endif /* __SUNPRO_C */
1322               for (i = 0; i <= (wid - 2); i += 2) {
1323                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1324 
1325                 p5 = buff[i + 5]; p6 = buff[i + 6];
1326 
1327                 buffn[i    ] = (mlib_d64)sp[0];
1328                 buffn[i + 1] = (mlib_d64)sp[chan1];
1329 
1330                 d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ];
1331                 d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1];
1332 
1333                 CLAMP_S32(dp[0],     d0);
1334                 CLAMP_S32(dp[chan1], d1);
1335 
1336                 buffd[i    ] = 0.0;
1337                 buffd[i + 1] = 0.0;
1338 
1339                 sp += chan2;
1340                 dp += chan2;
1341               }
1342             }
1343 
1344           } else if (kw == 5) {
1345 
1346             if (l < (n - 1) || off < m) {
1347 #ifdef __SUNPRO_C
1348 #pragma pipeloop(0)
1349 #endif /* __SUNPRO_C */
1350               for (i = 0; i <= (wid - 2); i += 2) {
1351                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1352 
1353                 p4 = buff[i + 4]; p5 = buff[i + 5];
1354 
1355                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1356                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1357               }
1358 
1359             } else {
1360 #ifdef __SUNPRO_C
1361 #pragma pipeloop(0)
1362 #endif /* __SUNPRO_C */
1363               for (i = 0; i <= (wid - 2); i += 2) {
1364                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1365 
1366                 p4 = buff[i + 4]; p5 = buff[i + 5];
1367 
1368                 buffn[i    ] = (mlib_d64)sp[0];
1369                 buffn[i + 1] = (mlib_d64)sp[chan1];
1370 
1371                 d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ];
1372                 d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1];
1373 
1374                 CLAMP_S32(dp[0],     d0);
1375                 CLAMP_S32(dp[chan1], d1);
1376 
1377                 buffd[i    ] = 0.0;
1378                 buffd[i + 1] = 0.0;
1379 
1380                 sp += chan2;
1381                 dp += chan2;
1382               }
1383             }
1384 
1385           } else if (kw == 4) {
1386 
1387             if (l < (n - 1) || off < m) {
1388 #ifdef __SUNPRO_C
1389 #pragma pipeloop(0)
1390 #endif /* __SUNPRO_C */
1391               for (i = 0; i <= (wid - 2); i += 2) {
1392                 p0 = p2; p1 = p3; p2 = p4;
1393 
1394                 p3 = buff[i + 3]; p4 = buff[i + 4];
1395 
1396                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1397                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1398               }
1399 
1400             } else {
1401 #ifdef __SUNPRO_C
1402 #pragma pipeloop(0)
1403 #endif /* __SUNPRO_C */
1404               for (i = 0; i <= (wid - 2); i += 2) {
1405                 p0 = p2; p1 = p3; p2 = p4;
1406 
1407                 p3 = buff[i + 3]; p4 = buff[i + 4];
1408 
1409                 buffn[i    ] = (mlib_d64)sp[0];
1410                 buffn[i + 1] = (mlib_d64)sp[chan1];
1411 
1412                 d0 = p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ];
1413                 d1 = p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1];
1414 
1415                 CLAMP_S32(dp[0],     d0);
1416                 CLAMP_S32(dp[chan1], d1);
1417 
1418                 buffd[i    ] = 0.0;
1419                 buffd[i + 1] = 0.0;
1420 
1421                 sp += chan2;
1422                 dp += chan2;
1423               }
1424             }
1425 
1426           } else if (kw == 3) {
1427 
1428             if (l < (n - 1) || off < m) {
1429 #ifdef __SUNPRO_C
1430 #pragma pipeloop(0)
1431 #endif /* __SUNPRO_C */
1432               for (i = 0; i <= (wid - 2); i += 2) {
1433                 p0 = p2; p1 = p3;
1434 
1435                 p2 = buff[i + 2]; p3 = buff[i + 3];
1436 
1437                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1438                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1439               }
1440 
1441             } else {
1442 #ifdef __SUNPRO_C
1443 #pragma pipeloop(0)
1444 #endif /* __SUNPRO_C */
1445               for (i = 0; i <= (wid - 2); i += 2) {
1446                 p0 = p2; p1 = p3;
1447 
1448                 p2 = buff[i + 2]; p3 = buff[i + 3];
1449 
1450                 buffn[i    ] = (mlib_d64)sp[0];
1451                 buffn[i + 1] = (mlib_d64)sp[chan1];
1452 
1453                 d0 = p0*k0 + p1*k1 + p2*k2 + buffd[i    ];
1454                 d1 = p1*k0 + p2*k1 + p3*k2 + buffd[i + 1];
1455 
1456                 CLAMP_S32(dp[0],     d0);
1457                 CLAMP_S32(dp[chan1], d1);
1458 
1459                 buffd[i    ] = 0.0;
1460                 buffd[i + 1] = 0.0;
1461 
1462                 sp += chan2;
1463                 dp += chan2;
1464               }
1465             }
1466 
1467           } else { /* kw == 2 */
1468 
1469             if (l < (n - 1) || off < m) {
1470 #ifdef __SUNPRO_C
1471 #pragma pipeloop(0)
1472 #endif /* __SUNPRO_C */
1473               for (i = 0; i <= (wid - 2); i += 2) {
1474                 p0 = p2;
1475 
1476                 p1 = buff[i + 1]; p2 = buff[i + 2];
1477 
1478                 buffd[i    ] += p0*k0 + p1*k1;
1479                 buffd[i + 1] += p1*k0 + p2*k1;
1480               }
1481 
1482             } else {
1483 #ifdef __SUNPRO_C
1484 #pragma pipeloop(0)
1485 #endif /* __SUNPRO_C */
1486               for (i = 0; i <= (wid - 2); i += 2) {
1487                 p0 = p2;
1488 
1489                 p1 = buff[i + 1]; p2 = buff[i + 2];
1490 
1491                 buffn[i    ] = (mlib_d64)sp[0];
1492                 buffn[i + 1] = (mlib_d64)sp[chan1];
1493 
1494                 d0 = p0*k0 + p1*k1 + buffd[i    ];
1495                 d1 = p1*k0 + p2*k1 + buffd[i + 1];
1496 
1497                 CLAMP_S32(dp[0],     d0);
1498                 CLAMP_S32(dp[chan1], d1);
1499 
1500                 buffd[i    ] = 0.0;
1501                 buffd[i + 1] = 0.0;
1502 
1503                 sp += chan2;
1504                 dp += chan2;
1505               }
1506             }
1507           }
1508         }
1509       }
1510 
1511       /* last pixels */
1512       for (; i < wid; i++) {
1513         mlib_d64 *pk = k, s = 0;
1514         mlib_s32 x;
1515 
1516         for (l = 0; l < n; l++) {
1517           mlib_d64 *buff = buffc[l] + i;
1518 
1519           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
1520         }
1521 
1522         CLAMP_S32(dp[0], s);
1523 
1524         buffn[i] = (mlib_d64)sp[0];
1525 
1526         sp += chan1;
1527         dp += chan1;
1528       }
1529 
1530       for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];
1531 
1532       /* next line */
1533       sl += sll;
1534       dl += dll;
1535 
1536       buff_ind++;
1537 
1538       if (buff_ind >= n + 1) buff_ind = 0;
1539     }
1540   }
1541 
1542   FREE_AND_RETURN_STATUS;
1543 }
1544 
1545 /***************************************************************/