1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 types and
  30  *   MLIB_EDGE_DST_NO_WRITE mask
  31  */
  32 
  33 #include "mlib_image.h"
  34 #include "mlib_ImageConv.h"
  35 #include "mlib_c_ImageConv.h"
  36 
  37 /*
  38   This define switches between functions of different data types
       (1 = mlib_u8, 2 = mlib_s16, 3 = mlib_u16). It is fixed to 1 here, so
       only the "IMG_TYPE == 1" branches of this file are compiled.
  39 */
  40 #define IMG_TYPE 1
  41 
  42 /***************************************************************/
     /*
       Per-type settings selected by IMG_TYPE:
         DTYPE        - pixel type of the source and destination images
         CONV_FUNC    - builds the name of the floating-point convolution
                        entry point, e.g. CONV_FUNC(MxN)
         CONV_FUNC_I  - builds the name of the integer convolution entry point
         DSCALE       - factor folded into the kernel coefficients so that the
                        pixel value ends up in the top bits of a 32-bit result
         FROM_S32     - extracts the pixel value from that 32-bit result
         S64TOS32     - applied to the low half when LOAD_BUFF packs two
                        samples into one 64-bit word
         SAT_OFF      - text pasted after the argument of D2I. For the unsigned
                        types it expands to a binary subtraction of 2^31, and
                        FROM_S32 flips the top bit back with an XOR; for the
                        signed type it is empty.
     */
  43 #if IMG_TYPE == 1
  44 
  45 #define DTYPE             mlib_u8
  46 #define CONV_FUNC(KERN)   mlib_c_conv##KERN##nw_u8
  47 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u8
  48 #define DSCALE            (1 << 24)
  49 #define FROM_S32(x)       (((x) >> 24) ^ 128)
  50 #define S64TOS32(x)       (x)
  51 #define SAT_OFF           -(1u << 31)
  52 
  53 #elif IMG_TYPE == 2
  54 
  55 #define DTYPE             mlib_s16
  56 #define CONV_FUNC(KERN)   mlib_conv##KERN##nw_s16
  57 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s16
  58 #define DSCALE            65536.0
  59 #define FROM_S32(x)       ((x) >> 16)
  60 #define S64TOS32(x)       ((x) & 0xffffffff)
  61 #define SAT_OFF
  62 
  63 #elif IMG_TYPE == 3
  64 
  65 #define DTYPE             mlib_u16
  66 #define CONV_FUNC(KERN)   mlib_conv##KERN##nw_u16
  67 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u16
  68 #define DSCALE            65536.0
  69 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
  70 #define S64TOS32(x)       (x)
  71 #define SAT_OFF           -(1u << 31)
  72 
  73 #endif /* IMG_TYPE == 1 */
  74 
  75 /***************************************************************/
     /* Element count of the on-stack scratch arrays (buff[BUFF_SIZE]) used by
        the functions below; larger requirements are served by mlib_malloc. */
  76 #define BUFF_SIZE   1600
  77 
     /* Byte budget used by mlib_ImageConv1xN to bound how many rows it
        processes per strip (CACHE_SIZE / sizeof(DTYPE) / row stride). */
  78 #define CACHE_SIZE  (64*1024)
  79 
  80 /***************************************************************/
     /* Floating-point type of the scaled kernel and of the accumulators. */
  81 #define FTYPE mlib_d64
  82 
     /*
       CLAMP_S32(x) converts x to mlib_s32. Without MLIB_USE_FTOI_CLAMPING the
       value is saturated explicitly to [MLIB_S32_MIN, MLIB_S32_MAX]; note that
       x is evaluated up to three times. With MLIB_USE_FTOI_CLAMPING it is a
       plain cast - presumably for targets whose float-to-int conversion
       already saturates; TODO confirm.
     */
  83 #ifndef MLIB_USE_FTOI_CLAMPING
  84 
  85 #define CLAMP_S32(x)                                            \
  86   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))
  87 
  88 #else
  89 
  90 #define CLAMP_S32(x) ((mlib_s32)(x))
  91 
  92 #endif /* MLIB_USE_FTOI_CLAMPING */
  93 
  94 /***************************************************************/
     /* Accumulator -> mlib_s32: SAT_OFF is pasted directly after (x), so for
        the unsigned pixel types this is CLAMP_S32((x) - 2^31). */
  95 #define D2I(x) CLAMP_S32((x) SAT_OFF)
  96 
  97 /***************************************************************/
     /*
       STORE2 writes two results to dp[0] and dp[chan1]; which argument lands
       in which slot is swapped on little-endian builds. It needs dp and chan1
       in the caller's scope and expands to two statements. No use of it is
       visible in this part of the file.
     */
  98 #ifdef _LITTLE_ENDIAN
  99 
 100 #define STORE2(res0, res1)                                      \
 101   dp[0    ] = res1;                                             \
 102   dp[chan1] = res0
 103 
 104 #else
 105 
 106 #define STORE2(res0, res1)                                      \
 107   dp[0    ] = res0;                                             \
 108   dp[chan1] = res1
 109 
 110 #endif /* _LITTLE_ENDIAN */
 111 
 112 /***************************************************************/
     /*
       LOAD_BUFF(buff) copies the two source samples sp[0] and sp[chan1] into
       buff[i] and buff[i + 1]. It needs i, sp and chan1 in the caller's scope.
       With _NO_LONGLONG it is two plain stores. Otherwise both samples are
       packed into one mlib_s64 and written with a single store at buff + i;
       the halves are ordered per endianness so that, for a buffer of 32-bit
       elements, sp[0] still lands in element i and sp[chan1] in element i + 1.
       NOTE(review): the 64-bit variant casts buff + i to mlib_s64*, so it
       assumes that address is suitably aligned - confirm at each call site.
     */
 113 #ifdef _NO_LONGLONG
 114 
 115 #define LOAD_BUFF(buff)                                         \
 116   buff[i    ] = sp[0];                                          \
 117   buff[i + 1] = sp[chan1]
 118 
 119 #else /* _NO_LONGLONG */
 120 
 121 #ifdef _LITTLE_ENDIAN
 122 
 123 #define LOAD_BUFF(buff)                                         \
 124   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
 125 
 126 #else /* _LITTLE_ENDIAN */
 127 
 128 #define LOAD_BUFF(buff)                                         \
 129   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
 130 
 131 #endif /* _LITTLE_ENDIAN */
 132 #endif /* _NO_LONGLONG */
 133 
 134 /***************************************************************/
     /*
       Overlay of one mlib_d64 with two mlib_s32 values. CONV_FUNC(MxN) uses it
       to read back, through d64 and i32s.i0 / i32s.i1, the pair of samples
       that LOAD_BUFF stored. f32s has the same two mlib_s32 members as i32s;
       no use of it is visible in this part of the file.
     */
 135 typedef union {
 136   mlib_d64 d64;
 137   struct {
 138     mlib_s32 i0;
 139     mlib_s32 i1;
 140   } i32s;
 141   struct {
 142     mlib_s32 f0;
 143     mlib_s32 f1;
 144   } f32s;
 145 } d64_2x32;
 146 
 147 /***************************************************************/
     /*
       DEF_VARS(type) declares the locals shared by the convolution functions:
       source/destination base, line and pixel pointers, image geometry and
       the loop counters i, j, c. pbuff is initialised to buff, so an array
       named buff must already be declared where the macro is used.
     */
 148 #define DEF_VARS(type)                                          \
 149   type     *adr_src, *sl, *sp = NULL;                           \
 150   type     *adr_dst, *dl, *dp = NULL;                           \
 151   FTYPE    *pbuff = buff;                                       \
 152   mlib_s32 wid, hgt, sll, dll;                                  \
 153   mlib_s32 nchannel, chan1;                                     \
 154   mlib_s32 i, j, c
 155 
 156 /***************************************************************/
     /*
       GET_SRC_DST_PARAMETERS(type) fills the DEF_VARS locals from the images
       named src and dst in the caller's scope. Height, width and channel count
       are taken from src only. The strides are divided by sizeof(type), i.e.
       sll / dll are line strides counted in pixels components of that type
       (the image stride itself is presumably in bytes - TODO confirm).
     */
 157 #define GET_SRC_DST_PARAMETERS(type)                            \
 158   hgt = mlib_ImageGetHeight(src);                               \
 159   wid = mlib_ImageGetWidth(src);                                \
 160   nchannel = mlib_ImageGetChannels(src);                        \
 161   sll = mlib_ImageGetStride(src) / sizeof(type);                \
 162   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
 163   adr_src = (type *)mlib_ImageGetData(src);                     \
 164   adr_dst = (type *)mlib_ImageGetData(dst)
 165 
 166 /***************************************************************/
     /*
       CLAMP_STORE(dst, val) stores val into dst saturated to the range of the
       pixel type. In every variant val is used without parentheses and is
       evaluated more than once, and the expansion is a bare if/else statement
       (not wrapped in do { } while (0)), so it should only be used as a
       statement of its own with side-effect-free arguments.
     */
 167 #if IMG_TYPE == 1
 168 
 169 /* Test for the presence of any "1" bit in bits
 170    8 to 31 of val. If present, then val is either
 171    negative or >255. If over/underflows of 8 bits
 172    are uncommon, then this technique can be a win,
 173    since only a single test, rather than two, is
 174    necessary to determine if clamping is needed.
 175    On the other hand, if over/underflows are common,
 176    it adds an extra test.
 177 */
 178 #define CLAMP_STORE(dst, val)                                   \
 179   if (val & 0xffffff00) {                                       \
 180     if (val < MLIB_U8_MIN)                                      \
 181       dst = MLIB_U8_MIN;                                        \
 182     else                                                        \
 183       dst = MLIB_U8_MAX;                                        \
 184   } else {                                                      \
 185     dst = (mlib_u8)val;                                         \
 186   }
 187 
 188 #elif IMG_TYPE == 2
 189 
     /* Two explicit range tests against the mlib_s16 limits. */
 190 #define CLAMP_STORE(dst, val)                                   \
 191   if (val >= MLIB_S16_MAX)                                      \
 192     dst = MLIB_S16_MAX;                                         \
 193   else if (val <= MLIB_S16_MIN)                                 \
 194     dst = MLIB_S16_MIN;                                         \
 195   else                                                          \
 196     dst = (mlib_s16)val
 197 
 198 #elif IMG_TYPE == 3
 199 
     /* Two explicit range tests against the mlib_u16 limits. */
 200 #define CLAMP_STORE(dst, val)                                   \
 201   if (val >= MLIB_U16_MAX)                                      \
 202     dst = MLIB_U16_MAX;                                         \
 203   else if (val <= MLIB_U16_MIN)                                 \
 204     dst = MLIB_U16_MIN;                                         \
 205   else                                                          \
 206     dst = (mlib_u16)val
 207 
 208 #endif /* IMG_TYPE == 1 */
 209 
 210 /***************************************************************/
     /*
      * MAX_KER - largest number of kernel columns handled by one unrolled pass
      *           of the MxN functions; wider kernel rows are split into chunks.
      * MAX_N   - sizes the on-stack row-pointer table (2*(MAX_N + 1) entries)
      *           and the on-stack integer kernel copy (MAX_N*MAX_N entries)
      *           of the MxN functions.
      */
 211 #define MAX_KER   7
 212 #define MAX_N    15
 213 
     /*
      * Convolution with a kernel that is one column wide and n rows tall
      * (the m == 1 case of CONV_FUNC(MxN)).
      *
      *   dst, src - destination and source images; geometry is read from src
      *   k        - the n kernel coefficients, already multiplied by the
      *              fixed-point scale by the caller
      *   n        - kernel height
      *   dn       - row offset of the first written destination row
      *   cmask    - channel mask; channel c is processed when bit
      *              (nchannel - 1 - c) is set
      *
      * Returns MLIB_SUCCESS, or MLIB_FAILURE when the accumulator buffer
      * cannot be allocated.
      *
      * The image is processed in strips of at most max_hsize output rows
      * (derived from CACHE_SIZE). For every selected channel and every column
      * the kernel is consumed four taps at a time into pbuff[]; the final
      * one to four taps are combined with the accumulated sums, converted with
      * D2I / FROM_S32 and stored.
      */
 214 static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
 215                                      const mlib_image *src,
 216                                      const mlib_d64   *k,
 217                                      mlib_s32         n,
 218                                      mlib_s32         dn,
 219                                      mlib_s32         cmask)
 220 {
 221   FTYPE    buff[BUFF_SIZE];
 222   mlib_s32 off, kh;
 223   mlib_s32 d0, d1;
 224   const FTYPE    *pk;
 225   FTYPE    k0, k1, k2, k3;
 226   FTYPE    p0, p1, p2, p3, p4;
 227   DEF_VARS(DTYPE);
 228   DTYPE    *sl_c, *dl_c, *sl0;
 229   mlib_s32 l, hsize, max_hsize;
 230   GET_SRC_DST_PARAMETERS(DTYPE);
 231 
       /* Number of output rows; the first one is written dn rows into dst. */
 232   hgt -= (n - 1);
 233   adr_dst += dn*dll;
 234 
       /* Rows per strip, so that one strip of source data fits CACHE_SIZE. */
 235   max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
 236 
 237   if (!max_hsize) max_hsize = 1;
 238 
 239   if (max_hsize > BUFF_SIZE) {
         /*
          * One extra element: the accumulation loop below works on row pairs
          * and writes pbuff[j + 1], which is pbuff[hsize] when hsize is odd.
          * With hsize == max_hsize that index must still be inside the buffer.
          * (The on-stack buffer needs no slack: it is only used when
          * max_hsize <= BUFF_SIZE, and BUFF_SIZE is even.)
          */
 240     pbuff = mlib_malloc(sizeof(FTYPE)*(max_hsize + 1));

         /* Nothing else has been allocated yet, so a plain return is enough. */
         if (pbuff == NULL) return MLIB_FAILURE;
 241   }
 242 
 243   chan1 = nchannel;
 244 
 245   sl_c = adr_src;
 246   dl_c = adr_dst;
 247 
       /* One iteration per strip of up to max_hsize output rows. */
 248   for (l = 0; l < hgt; l += hsize) {
 249     hsize = hgt - l;
 250 
 251     if (hsize > max_hsize) hsize = max_hsize;
 252 
 253     for (c = 0; c < nchannel; c++) {
 254       if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 255 
 256       sl = sl_c + c;
 257       dl = dl_c + c;
 258 
 259       for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
 260 
           /* One iteration per image column of this channel. */
 261       for (i = 0; i < wid; i++) {
 262         sl0 = sl;
 263 
             /* Leading groups of four taps: accumulate only. For an odd hsize
                the last pass also touches pbuff[hsize]; that value is never
                used for output. */
 264         for (off = 0; off < (n - 4); off += 4) {
 265           pk = k + off;
 266           sp = sl0;
 267 
 268           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 269           p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
 270           sp += 3*sll;
 271 
 272           for (j = 0; j < hsize; j += 2) {
 273             p0 = p2; p1 = p3; p2 = p4;
 274             p3 = sp[0];
 275             p4 = sp[sll];
 276 
 277             pbuff[j    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 278             pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 279 
 280             sp += 2*sll;
 281           }
 282 
 283           sl0 += 4*sll;
 284         }
 285 
             /* Remaining kh = n - off taps (1..4): finish the sums, store the
                pixels and clear the accumulators for the next column. */
 286         pk = k + off;
 287         sp = sl0;
 288 
 289         k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 290         p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
 291 
 292         dp = dl;
 293         kh = n - off;
 294 
 295         if (kh == 4) {
 296           sp += 3*sll;
 297 
 298           for (j = 0; j <= (hsize - 2); j += 2) {
 299             p0 = p2; p1 = p3; p2 = p4;
 300             p3 = sp[0];
 301             p4 = sp[sll];
 302 
 303             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
 304             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);
 305 
 306             dp[0  ] = FROM_S32(d0);
 307             dp[dll] = FROM_S32(d1);
 308 
 309             pbuff[j] = 0;
 310             pbuff[j + 1] = 0;
 311 
 312             sp += 2*sll;
 313             dp += 2*dll;
 314           }
 315 
               /* Odd row count: one row is left over. */
 316           if (j < hsize) {
 317             p0 = p2; p1 = p3; p2 = p4;
 318             p3 = sp[0];
 319 
 320             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
 321 
 322             pbuff[j] = 0;
 323 
 324             dp[0] = FROM_S32(d0);
 325           }
 326 
 327         } else if (kh == 3) {
 328           sp += 2*sll;
 329 
 330           for (j = 0; j <= (hsize - 2); j += 2) {
 331             p0 = p2; p1 = p3;
 332             p2 = sp[0];
 333             p3 = sp[sll];
 334 
 335             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
 336             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);
 337 
 338             dp[0  ] = FROM_S32(d0);
 339             dp[dll] = FROM_S32(d1);
 340 
 341             pbuff[j] = 0;
 342             pbuff[j + 1] = 0;
 343 
 344             sp += 2*sll;
 345             dp += 2*dll;
 346           }
 347 
 348           if (j < hsize) {
 349             p0 = p2; p1 = p3;
 350             p2 = sp[0];
 351 
 352             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
 353 
 354             pbuff[j] = 0;
 355 
 356             dp[0] = FROM_S32(d0);
 357           }
 358 
 359         } else if (kh == 2) {
 360           sp += sll;
 361 
 362           for (j = 0; j <= (hsize - 2); j += 2) {
 363             p0 = p2;
 364             p1 = sp[0];
 365             p2 = sp[sll];
 366 
 367             d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
 368             d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);
 369 
 370             dp[0  ] = FROM_S32(d0);
 371             dp[dll] = FROM_S32(d1);
 372 
 373             pbuff[j] = 0;
 374             pbuff[j + 1] = 0;
 375 
 376             sp += 2*sll;
 377             dp += 2*dll;
 378           }
 379 
 380           if (j < hsize) {
 381             p0 = p2;
 382             p1 = sp[0];
 383 
 384             d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
 385 
 386             pbuff[j] = 0;
 387 
 388             dp[0] = FROM_S32(d0);
 389           }
 390 
 391         } else /* if (kh == 1) */ {
 392           for (j = 0; j < hsize; j++) {
 393             p0 = sp[0];
 394 
 395             d0 = D2I(p0*k0 + pbuff[j]);
 396 
 397             dp[0] = FROM_S32(d0);
 398 
 399             pbuff[j] = 0;
 400 
 401             sp += sll;
 402             dp += dll;
 403           }
 404         }
 405 
             /* Next pixel of the same channel. */
 406         sl += chan1;
 407         dl += chan1;
 408       }
 409     }
 410 
         /* Next strip. */
 411     sl_c += max_hsize*sll;
 412     dl_c += max_hsize*dll;
 413   }
 414 
 415   if (pbuff != buff) mlib_free(pbuff);
 416 
 417   return MLIB_SUCCESS;
 418 }
 419 
 420 /***************************************************************/
     /*
      * M x N convolution (floating-point kernel path) for DTYPE images. Only
      * wid - (m - 1) columns and hgt - (n - 1) rows are written, starting at
      * destination offset (dm, dn).
      *
      *   dst, src - destination and source images; geometry is read from src
      *   kernel   - m*n integer coefficients, row by row
      *   m, n     - kernel width and height
      *   dm, dn   - column / row offset of the first written destination pixel
      *   scale    - the coefficients are divided by 2^scale
      *   cmask    - channel mask; channel c is processed when bit
      *              (nchannel - 1 - c) is set
      *
      * For m == 1 the work is delegated to mlib_ImageConv1xN and its status is
      * returned. Otherwise status is MLIB_SUCCESS, or MLIB_FAILURE when an
      * allocation fails.
      *
      * FREE_AND_RETURN_STATUS is defined outside this part of the file;
      * presumably it frees pbuff and k when they were heap-allocated and
      * returns status - TODO confirm.
      */
 421 mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
 422                            const mlib_image *src,
 423                            const mlib_s32   *kernel,
 424                            mlib_s32         m,
 425                            mlib_s32         n,
 426                            mlib_s32         dm,
 427                            mlib_s32         dn,
 428                            mlib_s32         scale,
 429                            mlib_s32         cmask)
 430 {
 431   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
 432   FTYPE    **buffs = buffs_arr, *buffd;
 433   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
 434   mlib_s32 mn, l, off, kw, bsize, buff_ind;
 435   mlib_s32 d0, d1;
 436   FTYPE    k0, k1, k2, k3, k4, k5, k6;
 437   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
 438   d64_2x32 dd;
 439   DEF_VARS(DTYPE);
 440   mlib_s32 chan2;
 441   mlib_s32 *buffo, *buffi;
 442   mlib_status status = MLIB_SUCCESS;
 443 
 444   GET_SRC_DST_PARAMETERS(DTYPE);
 445 
       /* fscale = DSCALE / 2^scale; a scale above 30 is applied in two steps
          so that the shift count below is reduced by 30. */
 446   if (scale > 30) {
 447     fscale *= 1.0/(1 << 30);
 448     scale -= 30;
 449   }
 450 
 451   fscale /= (1 << scale);
 452 
 453   mn = m*n;
 454 
       /* Kernels that do not fit akernel[256] go to the heap. */
 455   if (mn > 256) {
 456     k = mlib_malloc(mn*sizeof(mlib_d64));
 457 
 458     if (k == NULL) return MLIB_FAILURE;
 459   }
 460 
       /* Floating-point copy of the kernel with the scale folded in. */
 461   for (i = 0; i < mn; i++) {
 462     k[i] = kernel[i]*fscale;
 463   }
 464 
 465   if (m == 1) {
 466     status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
 467     FREE_AND_RETURN_STATUS;
 468   }
 469 
       /* Work area in FTYPE elements: n + 1 row buffers, one accumulator row
          (buffd) and one more row's worth for the mlib_s32 staging area.
          NOTE(review): the product is computed in mlib_s32 with no overflow
          check - confirm callers bound n and wid. */
 470   bsize = (n + 3)*wid;
 471 
 472   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
 473     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
 474 
 475     if (pbuff == NULL) {
 476       status = MLIB_FAILURE;
 477       FREE_AND_RETURN_STATUS;
 478     }
         /* The row-pointer table lives right after the bsize elements. */
 479     buffs = (FTYPE   **)(pbuff + bsize);
 480   }
 481 
       /* buffs holds the n + 1 row-buffer pointers twice, so that
          buffs + buff_ind can be indexed 0..n without wrapping. */
 482   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
 483   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
 484   buffd = buffs[n] + wid;
       /* buffo only serves as the base for buffi here; buffi is the staging
          area written by LOAD_BUFF in the kw == 7 branch. */
 485   buffo = (mlib_s32*)(buffd + wid);
 486   buffi = buffo + (wid &~ 1);
 487 
 488   chan1 = nchannel;
 489   chan2 = chan1 + chan1;
 490 
       /* From here on wid / hgt are the numbers of output columns / rows. */
 491   wid -= (m - 1);
 492   hgt -= (n - 1);
 493   adr_dst += dn*dll + dm*nchannel;
 494 
 495   for (c = 0; c < nchannel; c++) {
 496     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
 497 
 498     sl = adr_src + c;
 499     dl = adr_dst + c;
 500 
         /* Prime the ring with the first n source rows of this channel,
            converted to FTYPE; sl ends up on source row n. */
 501     for (l = 0; l < n; l++) {
 502       FTYPE    *buff = buffs[l];
 503 
 504       for (i = 0; i < wid + (m - 1); i++) {
 505         buff[i] = (FTYPE)sl[i*chan1];
 506       }
 507 
 508       sl += sll;
 509     }
 510 
 511     buff_ind = 0;
 512 
 513     for (i = 0; i < wid; i++) buffd[i] = 0.0;
 514 
         /* One iteration per output row. buffc[0..n-1] are the n source rows
            under the kernel; buffn = buffc[n] receives the next source row
            (the one sl points at) while this row is being finished.
            NOTE(review): on the last output row sl has advanced n + hgt - 1
            rows from the first one, i.e. to the row just after those covered
            by the original height, and that row is still read into buffn -
            confirm the source allocation makes this read safe. */
 515     for (j = 0; j < hgt; j++) {
 516       FTYPE    **buffc = buffs + buff_ind;
 517       FTYPE    *buffn = buffc[n];
 518       FTYPE    *pk = k;
 519 
 520       for (l = 0; l < n; l++) {
 521         FTYPE    *buff_l = buffc[l];
 522 
 523         for (off = 0; off < m;) {
 524           FTYPE    *buff = buff_l + off;
 525 
 526           kw = m - off;
 527 
               /* Split the kernel row into chunks of at most MAX_KER taps. */
 528           if (kw > 2*MAX_KER) kw = MAX_KER; else
 529             if (kw > MAX_KER) kw = kw/2;
 530           off += kw;
 531 
 532           sp = sl;
 533           dp = dl;
 534 
               /* NOTE(review): six samples and seven coefficients are always
                  loaded, whatever kw is. Those beyond kw do not enter the
                  arithmetic, but the loads themselves may reach past the end
                  of k or of the row buffer - confirm both have slack. */
 535           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
 536           p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
 537 
 538           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 539           k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
 540           pk += kw;
 541 
               /* Each kw case has two loops over pixel pairs. Every chunk but
                  the very last one of the kernel (last row, off == m) only
                  accumulates into buffd. The last chunk adds the accumulated
                  sum, converts and stores the two pixels, clears buffd and
                  copies two samples of the next source row into buffn. */
 542           if (kw == 7) {
 543 
 544             if (l < (n - 1) || off < m) {
 545               for (i = 0; i <= (wid - 2); i += 2) {
 546                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 547 
 548                 p6 = buff[i + 6]; p7 = buff[i + 7];
 549 
 550                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
 551                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
 552               }
 553 
 554             } else {
 555               for (i = 0; i <= (wid - 2); i += 2) {
 556                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 557 
 558                 p6 = buff[i + 6]; p7 = buff[i + 7];
 559 
                     /* Only this branch stages the next-row samples through
                        buffi and reads them back via the d64_2x32 overlay;
                        the other kw branches convert sp[] directly. */
 560                 LOAD_BUFF(buffi);
 561 
 562                 dd.d64 = *(FTYPE   *)(buffi + i);
 563                 buffn[i    ] = (FTYPE)dd.i32s.i0;
 564                 buffn[i + 1] = (FTYPE)dd.i32s.i1;
 565 
 566                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
 567                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
 568 
 569                 dp[0    ] = FROM_S32(d0);
 570                 dp[chan1] = FROM_S32(d1);
 571 
 572                 buffd[i    ] = 0.0;
 573                 buffd[i + 1] = 0.0;
 574 
 575                 sp += chan2;
 576                 dp += chan2;
 577               }
 578             }
 579 
 580           } else if (kw == 6) {
 581 
 582             if (l < (n - 1) || off < m) {
 583               for (i = 0; i <= (wid - 2); i += 2) {
 584                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 585 
 586                 p5 = buff[i + 5]; p6 = buff[i + 6];
 587 
 588                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
 589                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
 590               }
 591 
 592             } else {
 593               for (i = 0; i <= (wid - 2); i += 2) {
 594                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 595 
 596                 p5 = buff[i + 5]; p6 = buff[i + 6];
 597 
 598                 buffn[i    ] = (FTYPE)sp[0];
 599                 buffn[i + 1] = (FTYPE)sp[chan1];
 600 
 601                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
 602                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
 603 
 604                 dp[0    ] = FROM_S32(d0);
 605                 dp[chan1] = FROM_S32(d1);
 606 
 607                 buffd[i    ] = 0.0;
 608                 buffd[i + 1] = 0.0;
 609 
 610                 sp += chan2;
 611                 dp += chan2;
 612               }
 613             }
 614 
 615           } else if (kw == 5) {
 616 
 617             if (l < (n - 1) || off < m) {
 618               for (i = 0; i <= (wid - 2); i += 2) {
 619                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 620 
 621                 p4 = buff[i + 4]; p5 = buff[i + 5];
 622 
 623                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
 624                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
 625               }
 626 
 627             } else {
 628               for (i = 0; i <= (wid - 2); i += 2) {
 629                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 630 
 631                 p4 = buff[i + 4]; p5 = buff[i + 5];
 632 
 633                 buffn[i    ] = (FTYPE)sp[0];
 634                 buffn[i + 1] = (FTYPE)sp[chan1];
 635 
 636                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
 637                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
 638 
 639                 dp[0    ] = FROM_S32(d0);
 640                 dp[chan1] = FROM_S32(d1);
 641 
 642                 buffd[i    ] = 0.0;
 643                 buffd[i + 1] = 0.0;
 644 
 645                 sp += chan2;
 646                 dp += chan2;
 647               }
 648             }
 649 
 650           } else if (kw == 4) {
 651 
 652             if (l < (n - 1) || off < m) {
 653               for (i = 0; i <= (wid - 2); i += 2) {
 654                 p0 = p2; p1 = p3; p2 = p4;
 655 
 656                 p3 = buff[i + 3]; p4 = buff[i + 4];
 657 
 658                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
 659                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
 660               }
 661 
 662             } else {
 663               for (i = 0; i <= (wid - 2); i += 2) {
 664                 p0 = p2; p1 = p3; p2 = p4;
 665 
 666                 p3 = buff[i + 3]; p4 = buff[i + 4];
 667 
 668                 buffn[i    ] = (FTYPE)sp[0];
 669                 buffn[i + 1] = (FTYPE)sp[chan1];
 670 
 671                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
 672                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
 673 
 674                 dp[0    ] = FROM_S32(d0);
 675                 dp[chan1] = FROM_S32(d1);
 676 
 677                 buffd[i    ] = 0.0;
 678                 buffd[i + 1] = 0.0;
 679 
 680                 sp += chan2;
 681                 dp += chan2;
 682               }
 683             }
 684 
 685           } else if (kw == 3) {
 686 
 687             if (l < (n - 1) || off < m) {
 688               for (i = 0; i <= (wid - 2); i += 2) {
 689                 p0 = p2; p1 = p3;
 690 
 691                 p2 = buff[i + 2]; p3 = buff[i + 3];
 692 
 693                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
 694                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
 695               }
 696 
 697             } else {
 698               for (i = 0; i <= (wid - 2); i += 2) {
 699                 p0 = p2; p1 = p3;
 700 
 701                 p2 = buff[i + 2]; p3 = buff[i + 3];
 702 
 703                 buffn[i    ] = (FTYPE)sp[0];
 704                 buffn[i + 1] = (FTYPE)sp[chan1];
 705 
 706                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
 707                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
 708 
 709                 dp[0    ] = FROM_S32(d0);
 710                 dp[chan1] = FROM_S32(d1);
 711 
 712                 buffd[i    ] = 0.0;
 713                 buffd[i + 1] = 0.0;
 714 
 715                 sp += chan2;
 716                 dp += chan2;
 717               }
 718             }
 719 
 720           } else /*if (kw == 2)*/ {
 721 
 722             if (l < (n - 1) || off < m) {
 723               for (i = 0; i <= (wid - 2); i += 2) {
 724                 p0 = p2;
 725 
 726                 p1 = buff[i + 1]; p2 = buff[i + 2];
 727 
 728                 buffd[i    ] += p0*k0 + p1*k1;
 729                 buffd[i + 1] += p1*k0 + p2*k1;
 730               }
 731 
 732             } else {
 733               for (i = 0; i <= (wid - 2); i += 2) {
 734                 p0 = p2;
 735 
 736                 p1 = buff[i + 1]; p2 = buff[i + 2];
 737 
 738                 buffn[i    ] = (FTYPE)sp[0];
 739                 buffn[i + 1] = (FTYPE)sp[chan1];
 740 
 741                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
 742                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
 743 
 744                 dp[0    ] = FROM_S32(d0);
 745                 dp[chan1] = FROM_S32(d1);
 746 
 747                 buffd[i    ] = 0.0;
 748                 buffd[i + 1] = 0.0;
 749 
 750                 sp += chan2;
 751                 dp += chan2;
 752               }
 753             }
 754           }
 755         }
 756       }
 757 
           /* i, sp and dp carry over from the final pair loop above. When wid
              is odd one column is left; its sum is computed from scratch with
              a plain double loop over the whole kernel. */
 758       /* last pixels */
 759       for (; i < wid; i++) {
 760         FTYPE    *pk = k, s = 0;
 761         mlib_s32 x, d0;
 762 
 763         for (l = 0; l < n; l++) {
 764           FTYPE    *buff = buffc[l] + i;
 765 
 766           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
 767         }
 768 
 769         d0 = D2I(s);
 770         dp[0] = FROM_S32(d0);
 771 
 772         buffn[i] = (FTYPE)sp[0];
 773 
 774         sp += chan1;
 775         dp += chan1;
 776       }
 777 
           /* The remaining m - 1 samples of the next source row. */
 778       for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];
 779 
 780       /* next line */
 781       sl += sll;
 782       dl += dll;
 783 
           /* Rotate the ring: the oldest row buffer becomes the next buffn. */
 784       buff_ind++;
 785 
 786       if (buff_ind >= n + 1) buff_ind = 0;
 787     }
 788   }
 789 
 790   FREE_AND_RETURN_STATUS;
 791 }
 792 
 793 /***************************************************************/
 794 /* for x86, using integer multiplies is faster */
 795 
     /*
       STORE_RES(res, x) shifts x right by shift2 in place and then stores it
       into res through CLAMP_STORE. x must be a modifiable lvalue and a
       variable named shift2 must be in the caller's scope. The expansion is
       two statements, so it must not be used as the unbraced body of an
       if / else / loop.
     */
 796 #define STORE_RES(res, x)                                       \
 797   x >>= shift2;                                                 \
 798   CLAMP_STORE(res, x)
 799 
 800 mlib_status CONV_FUNC_I(MxN)(mlib_image       *dst,
 801                              const mlib_image *src,
 802                              const mlib_s32   *kernel,
 803                              mlib_s32         m,
 804                              mlib_s32         n,
 805                              mlib_s32         dm,
 806                              mlib_s32         dn,
 807                              mlib_s32         scale,
 808                              mlib_s32         cmask)
 809 {
 810   mlib_s32 buff[BUFF_SIZE], *buffd = buff;
 811   mlib_s32 l, off, kw;
 812   mlib_s32 d0, d1, shift1, shift2;
 813   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
 814   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
 815   DTYPE    *adr_src, *sl, *sp = NULL;
 816   DTYPE    *adr_dst, *dl, *dp = NULL;
 817   mlib_s32 wid, hgt, sll, dll;
 818   mlib_s32 nchannel, chan1;
 819   mlib_s32 i, j, c;
 820   mlib_s32 chan2;
 821   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
 822   GET_SRC_DST_PARAMETERS(DTYPE);
 823 
 824 #if IMG_TYPE != 1
 825   shift1 = 16;
 826 #else
 827   shift1 = 8;
 828 #endif /* IMG_TYPE != 1 */
 829   shift2 = scale - shift1;
 830 
 831   chan1 = nchannel;
 832   chan2 = chan1 + chan1;
 833 
 834   wid -= (m - 1);
 835   hgt -= (n - 1);
 836   adr_dst += dn*dll + dm*nchannel;
 837 
 838   if (wid > BUFF_SIZE) {
 839     buffd = mlib_malloc(sizeof(mlib_s32)*wid);
 840 
 841     if (buffd == NULL) return MLIB_FAILURE;
 842   }
 843 
 844   if (m*n > MAX_N*MAX_N) {
 845     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
 846 
 847     if (k == NULL) {
 848       if (buffd != buff) mlib_free(buffd);
 849       return MLIB_FAILURE;
 850     }
 851   }
 852 
 853   for (i = 0; i < m*n; i++) {
 854     k[i] = kernel[i] >> shift1;
 855   }
 856 
 857   for (c = 0; c < nchannel; c++) {
 858     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 859 
 860     sl = adr_src + c;
 861     dl = adr_dst + c;
 862 
 863     for (i = 0; i < wid; i++) buffd[i] = 0;
 864 
 865     for (j = 0; j < hgt; j++) {
 866       mlib_s32 *pk = k;
 867 
 868       for (l = 0; l < n; l++) {
 869         DTYPE *sp0 = sl + l*sll;
 870 
 871         for (off = 0; off < m;) {
 872           sp = sp0 + off*chan1;
 873           dp = dl;
 874 
 875           kw = m - off;
 876 
 877           if (kw > 2*MAX_KER) kw = MAX_KER; else
 878             if (kw > MAX_KER) kw = kw/2;
 879           off += kw;
 880 
 881           p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
 882           p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];
 883 
 884           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
 885           k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
 886           pk += kw;
 887 
 888           sp += (kw - 1)*chan1;
 889 
 890           if (kw == 7) {
 891 
 892             if (l < (n - 1) || off < m) {
 893               for (i = 0; i <= (wid - 2); i += 2) {
 894                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 895                 p6 = sp[0];
 896                 p7 = sp[chan1];
 897 
 898                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
 899                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
 900 
 901                 sp += chan2;
 902               }
 903 
 904             } else {
 905               for (i = 0; i <= (wid - 2); i += 2) {
 906                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
 907                 p6 = sp[0];
 908                 p7 = sp[chan1];
 909 
 910                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
 911                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
 912 
 913                 STORE_RES(dp[0    ], d0);
 914                 STORE_RES(dp[chan1], d1);
 915 
 916                 buffd[i    ] = 0;
 917                 buffd[i + 1] = 0;
 918 
 919                 sp += chan2;
 920                 dp += chan2;
 921               }
 922             }
 923 
 924           } else if (kw == 6) {
 925 
 926             if (l < (n - 1) || off < m) {
 927               for (i = 0; i <= (wid - 2); i += 2) {
 928                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 929                 p5 = sp[0];
 930                 p6 = sp[chan1];
 931 
 932                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
 933                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
 934 
 935                 sp += chan2;
 936               }
 937 
 938             } else {
 939               for (i = 0; i <= (wid - 2); i += 2) {
 940                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
 941                 p5 = sp[0];
 942                 p6 = sp[chan1];
 943 
 944                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
 945                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
 946 
 947                 STORE_RES(dp[0    ], d0);
 948                 STORE_RES(dp[chan1], d1);
 949 
 950                 buffd[i    ] = 0;
 951                 buffd[i + 1] = 0;
 952 
 953                 sp += chan2;
 954                 dp += chan2;
 955               }
 956             }
 957 
 958           } else if (kw == 5) {
 959 
 960             if (l < (n - 1) || off < m) {
 961               for (i = 0; i <= (wid - 2); i += 2) {
 962                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 963                 p4 = sp[0];
 964                 p5 = sp[chan1];
 965 
 966                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
 967                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
 968 
 969                 sp += chan2;
 970               }
 971 
 972             } else {
 973               for (i = 0; i <= (wid - 2); i += 2) {
 974                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
 975                 p4 = sp[0];
 976                 p5 = sp[chan1];
 977 
 978                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
 979                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
 980 
 981                 STORE_RES(dp[0    ], d0);
 982                 STORE_RES(dp[chan1], d1);
 983 
 984                 buffd[i    ] = 0;
 985                 buffd[i + 1] = 0;
 986 
 987                 sp += chan2;
 988                 dp += chan2;
 989               }
 990             }
 991 
 992           } else if (kw == 4) {
 993 
 994             if (l < (n - 1) || off < m) {
 995               for (i = 0; i <= (wid - 2); i += 2) {
 996                 p0 = p2; p1 = p3; p2 = p4;
 997                 p3 = sp[0];
 998                 p4 = sp[chan1];
 999 
1000                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1001                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1002 
1003                 sp += chan2;
1004               }
1005 
1006             } else {
1007               for (i = 0; i <= (wid - 2); i += 2) {
1008                 p0 = p2; p1 = p3; p2 = p4;
1009                 p3 = sp[0];
1010                 p4 = sp[chan1];
1011 
1012                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1013                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1014 
1015                 STORE_RES(dp[0    ], d0);
1016                 STORE_RES(dp[chan1], d1);
1017 
1018                 buffd[i    ] = 0;
1019                 buffd[i + 1] = 0;
1020 
1021                 sp += chan2;
1022                 dp += chan2;
1023               }
1024             }
1025 
1026           } else if (kw == 3) {
1027 
1028             if (l < (n - 1) || off < m) {
1029               for (i = 0; i <= (wid - 2); i += 2) {
1030                 p0 = p2; p1 = p3;
1031                 p2 = sp[0];
1032                 p3 = sp[chan1];
1033 
1034                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1035                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1036 
1037                 sp += chan2;
1038               }
1039 
1040             } else {
1041               for (i = 0; i <= (wid - 2); i += 2) {
1042                 p0 = p2; p1 = p3;
1043                 p2 = sp[0];
1044                 p3 = sp[chan1];
1045 
1046                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1047                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1048 
1049                 STORE_RES(dp[0    ], d0);
1050                 STORE_RES(dp[chan1], d1);
1051 
1052                 buffd[i    ] = 0;
1053                 buffd[i + 1] = 0;
1054 
1055                 sp += chan2;
1056                 dp += chan2;
1057               }
1058             }
1059 
1060           } else if (kw == 2) {
1061 
1062             if (l < (n - 1) || off < m) {
1063               for (i = 0; i <= (wid - 2); i += 2) {
1064                 p0 = p2;
1065                 p1 = sp[0];
1066                 p2 = sp[chan1];
1067 
1068                 buffd[i    ] += p0*k0 + p1*k1;
1069                 buffd[i + 1] += p1*k0 + p2*k1;
1070 
1071                 sp += chan2;
1072               }
1073 
1074             } else {
1075               for (i = 0; i <= (wid - 2); i += 2) {
1076                 p0 = p2;
1077                 p1 = sp[0];
1078                 p2 = sp[chan1];
1079 
1080                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1081                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1082 
1083                 STORE_RES(dp[0    ], d0);
1084                 STORE_RES(dp[chan1], d1);
1085 
1086                 buffd[i    ] = 0;
1087                 buffd[i + 1] = 0;
1088 
1089                 sp += chan2;
1090                 dp += chan2;
1091               }
1092             }
1093 
1094           } else /*if (kw == 1)*/ {
1095 
1096             if (l < (n - 1) || off < m) {
1097               for (i = 0; i <= (wid - 2); i += 2) {
1098                 p0 = sp[0];
1099                 p1 = sp[chan1];
1100 
1101                 buffd[i    ] += p0*k0;
1102                 buffd[i + 1] += p1*k0;
1103 
1104                 sp += chan2;
1105               }
1106 
1107             } else {
1108               for (i = 0; i <= (wid - 2); i += 2) {
1109                 p0 = sp[0];
1110                 p1 = sp[chan1];
1111 
1112                 d0 = (p0*k0 + buffd[i    ]);
1113                 d1 = (p1*k0 + buffd[i + 1]);
1114 
1115                 STORE_RES(dp[0    ], d0);
1116                 STORE_RES(dp[chan1], d1);
1117 
1118                 buffd[i    ] = 0;
1119                 buffd[i + 1] = 0;
1120 
1121                 sp += chan2;
1122                 dp += chan2;
1123               }
1124             }
1125           }
1126         }
1127       }
1128 
1129       /* last pixels */
1130       for (; i < wid; i++) {
1131         mlib_s32 *pk = k, s = 0;
1132         mlib_s32 x;
1133 
1134         for (l = 0; l < n; l++) {
1135           sp = sl + l*sll + i*chan1;
1136 
1137           for (x = 0; x < m; x++) {
1138             s += sp[0] * pk[0];
1139             sp += chan1;
1140             pk ++;
1141           }
1142         }
1143 
1144         STORE_RES(dp[0], s);
1145 
1146         sp += chan1;
1147         dp += chan1;
1148       }
1149 
1150       sl += sll;
1151       dl += dll;
1152     }
1153   }
1154 
1155   if (buffd != buff) mlib_free(buffd);
1156   if (k != k_locl) mlib_free(k);
1157 
1158   return MLIB_SUCCESS;
1159 }
1160 
1161 /***************************************************************/