1 /*
   2  * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 /*
  28  * FUNCTION
  29  *      Internal functions for mlib_ImageConv2x2 on U8/S16/U16 types
  30  *      and MLIB_EDGE_DST_NO_WRITE mask.
  31  */
  32 
  33 #include "mlib_image.h"
  34 #include "mlib_ImageConv.h"
  35 #include "mlib_c_ImageConv.h"
  36 
  37 /***************************************************************/
/*
 * TYPE_64BIT: the type used to move two adjacent mlib_s32 line-buffer
 * entries with one 8-byte load or store.  On i386 a struct of two ints is
 * used instead of mlib_d64.
 */
#ifdef i386 /* do not copy by mlib_d64 data type for x86 */

typedef struct {
  mlib_s32 int0, int1;
} two_int;

#define TYPE_64BIT two_int

#else /* i386 */

#define TYPE_64BIT mlib_d64

#endif /* i386 ( do not copy by mlib_d64 data type for x86 ) */

/***************************************************************/
/*
 * Folds the scale into the kernel.  Divides the local 'scalef' by
 * 2^scalef_expon (in steps of at most 2^30 so every shift fits in an int)
 * and stores scalef * kern[0..3] in the locals k0..k3.
 * Expands to several statements and expects 'scalef', 'scalef_expon',
 * 'kern' and k0..k3 to exist in the calling scope.
 * NOTE(review): a negative scalef_expon reaches the final shift unchanged;
 * presumably callers validate the range -- confirm.
 */
#define LOAD_KERNEL_INTO_DOUBLE()                                        \
  while (scalef_expon > 30) {                                            \
    scalef /= (1 << 30);                                                 \
    scalef_expon -= 30;                                                  \
  }                                                                      \
                                                                         \
  scalef /= (1 << scalef_expon);                                         \
                                                                         \
  /* keep kernel in regs */                                              \
  k0 = scalef * kern[0];  k1 = scalef * kern[1];  k2 = scalef * kern[2]; \
  k3 = scalef * kern[3]

/***************************************************************/
/*
 * Fills the locals hgt, wid, nchannel (all taken from 'src'), sll and dll
 * (strides of src and dst divided by sizeof(type)) and adr_src / adr_dst
 * (data pointers cast to 'type *').  Expects 'src' and 'dst' in scope.
 */
#define GET_SRC_DST_PARAMETERS(type)                            \
  hgt = mlib_ImageGetHeight(src);                               \
  wid = mlib_ImageGetWidth(src);                                \
  nchannel = mlib_ImageGetChannels(src);                        \
  sll = mlib_ImageGetStride(src) / sizeof(type);                \
  dll = mlib_ImageGetStride(dst) / sizeof(type);                \
  adr_src = (type *)mlib_ImageGetData(src);                     \
  adr_dst = (type *)mlib_ImageGetData(dst)

/***************************************************************/
/*
 * CLAMP_S32(x): converts x to mlib_s32, saturating at MLIB_S32_MIN and
 * MLIB_S32_MAX.  The argument is evaluated more than once.
 * With MLIB_USE_FTOI_CLAMPING defined it is a plain cast (presumably the
 * platform's conversion is then expected to saturate -- confirm).
 */
#ifndef MLIB_USE_FTOI_CLAMPING

#define CLAMP_S32(x)                                            \
  (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN :                       \
  (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))

#else

#define CLAMP_S32(x) ((mlib_s32)(x))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/*
 * STORE2(res0, res1): writes two results, masked with DTYPE_MASK and cast to
 * DTYPE, to dp[0] and dp[chan1].  Expects 'dp' and 'chan1' in scope.
 * In little-endian builds with long long support the arguments are swapped
 * (res1 goes to dp[0]); the callers in this file pass the upper half of a
 * 64-bit word as res0 and the lower half as res1 in that configuration.
 */
#if defined(_LITTLE_ENDIAN) && !defined(_NO_LONGLONG)

/* NB: Explicit cast to DTYPE is necessary to avoid warning from Microsoft VC compiler.
      And we need to explicitly define cast behavior if source exceeds destination range.
      (it is undefined according to C99 spec). We use mask here because this macro is typically
      used to extract bit regions. */

#define STORE2(res0, res1)                                      \
  dp[0    ] = (DTYPE) ((res1) & DTYPE_MASK);                      \
  dp[chan1] = (DTYPE) ((res0) & DTYPE_MASK)

#else

#define STORE2(res0, res1)                                      \
  dp[0    ] = (DTYPE) ((res0) & DTYPE_MASK);                      \
  dp[chan1] = (DTYPE) ((res1) & DTYPE_MASK)

#endif /* defined(_LITTLE_ENDIAN) && !defined(_NO_LONGLONG) */

/***************************************************************/
/*
 * LOAD_BUFF(buff): copies the two source samples sp[0] and sp[chan1] into
 * buff[i] and buff[i + 1].  Expects 'sp', 'chan1' and 'i' in scope.
 * With long long support both samples are packed into one mlib_s64 store;
 * the packing order depends on endianness so that buff[i] receives sp[0].
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff)                                         \
  buff[i    ] = sp[0];                                          \
  buff[i + 1] = sp[chan1]

#else /* _NO_LONGLONG */

#ifdef _LITTLE_ENDIAN

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | ((mlib_s64)sp[0] & 0xffffffff)

#else /* _LITTLE_ENDIAN */

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | ((mlib_s64)sp[chan1] & 0xffffffff)

#endif /* _LITTLE_ENDIAN */

#endif /* _NO_LONGLONG */

/***************************************************************/
/* Views one TYPE_64BIT value as two consecutive mlib_s32 values (i0, i1). */
typedef union {
  TYPE_64BIT d64;
  struct {
    mlib_s32 i0, i1;
  } i32s;
} d64_2x32;

/***************************************************************/
/* Amount subtracted from width/height by the "nw" functions and added to
   the source width by the "ext" functions. */
#define D_KER     1

/* Widest (even-rounded) line that still uses the on-stack line buffers;
   wider lines are served from mlib_malloc. */
#define BUFF_LINE 256

/***************************************************************/
/* Flips bit 7 of the lvalue x.  The argument is not parenthesised, so pass
   a simple lvalue such as dp[0]. */
#define XOR_80(x) x ^= 0x80
 146 
 147 void mlib_ImageXor80_aa(mlib_u8  *dl,
 148                         mlib_s32 wid,
 149                         mlib_s32 hgt,
 150                         mlib_s32 str)
 151 {
 152   mlib_u8  *dp, *dend;
 153 #ifdef _NO_LONGLONG
 154   mlib_u32 cadd = 0x80808080;
 155 #else /* _NO_LONGLONG */
 156   mlib_u64 cadd = MLIB_U64_CONST(0x8080808080808080);
 157 #endif /* _NO_LONGLONG */
 158   mlib_s32 j;
 159 
 160   if (wid == str) {
 161     wid *= hgt;
 162     hgt = 1;
 163   }
 164 
 165   for (j = 0; j < hgt; j++) {
 166     dend = dl + wid;
 167 
 168     for (dp = dl; ((mlib_addr)dp & 7) && (dp < dend); dp++) XOR_80(dp[0]);
 169 
 170 #ifdef __SUNPRO_C
 171 #pragma pipeloop(0)
 172 #endif /* __SUNPRO_C */
 173     for (; dp <= (dend - 8); dp += 8) {
 174 #ifdef _NO_LONGLONG
 175       *((mlib_s32*)dp) ^= cadd;
 176       *((mlib_s32*)dp+1) ^= cadd;
 177 #else /* _NO_LONGLONG */
 178       *((mlib_u64*)dp) ^= cadd;
 179 #endif /* _NO_LONGLONG */
 180     }
 181 
 182     for (; (dp < dend); dp++) XOR_80(dp[0]);
 183 
 184     dl += str;
 185   }
 186 }
 187 
 188 /***************************************************************/
 189 void mlib_ImageXor80(mlib_u8  *dl,
 190                      mlib_s32 wid,
 191                      mlib_s32 hgt,
 192                      mlib_s32 str,
 193                      mlib_s32 nchan,
 194                      mlib_s32 cmask)
 195 {
 196   mlib_s32 i, j, c;
 197 
 198   for (j = 0; j < hgt; j++) {
 199     for (c = 0; c < nchan; c++) {
 200       if (cmask & (1 << (nchan - 1 - c))) {
 201         mlib_u8 *dp = dl + c;
 202 
 203 #ifdef __SUNPRO_C
 204 #pragma pipeloop(0)
 205 #endif /* __SUNPRO_C */
 206         for (i = 0; i < wid; i++) XOR_80(dp[i*nchan]);
 207       }
 208     }
 209 
 210     dl += str;
 211   }
 212 }
 213 
 214 /***************************************************************/
 215 #define DTYPE mlib_s16
 216 #define DTYPE_MASK 0xffff
 217 
 218 mlib_status mlib_c_conv2x2nw_s16(mlib_image       *dst,
 219                                  const mlib_image *src,
 220                                  const mlib_s32   *kern,
 221                                  mlib_s32         scalef_expon,
 222                                  mlib_s32         cmask)
 223 {
 224   mlib_d64 buff_arr[2*BUFF_LINE];
 225   mlib_s32 *pbuff = (mlib_s32*)buff_arr, *buffo, *buff0, *buff1, *buff2, *buffT;
 226   DTYPE    *adr_src, *sl, *sp, *sl1;
 227   DTYPE    *adr_dst, *dl, *dp;
 228   mlib_d64 k0, k1, k2, k3, scalef = 65536.0;
 229   mlib_d64 p00, p01, p02,
 230            p10, p11, p12;
 231   mlib_s32 wid, hgt, sll, dll, wid1;
 232   mlib_s32 nchannel, chan1, chan2;
 233   mlib_s32 i, j, c;
 234   LOAD_KERNEL_INTO_DOUBLE();
 235   GET_SRC_DST_PARAMETERS(DTYPE);
 236 
 237   wid1 = (wid + 1) &~ 1;
 238 
 239   if (wid1 > BUFF_LINE) {
 240     pbuff = mlib_malloc(4*sizeof(mlib_s32)*wid1);
 241 
 242     if (pbuff == NULL) return MLIB_FAILURE;
 243   }
 244 
 245   buffo = pbuff;
 246   buff0 = buffo + wid1;
 247   buff1 = buff0 + wid1;
 248   buff2 = buff1 + wid1;
 249 
 250   chan1 = nchannel;
 251   chan2 = chan1 + chan1;
 252 
 253   wid -= D_KER;
 254   hgt -= D_KER;
 255 
 256   for (c = 0; c < nchannel; c++) {
 257     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 258 
 259     sl = adr_src + c;
 260     dl = adr_dst + c;
 261 
 262     sl1 = sl + sll;
 263 #ifdef __SUNPRO_C
 264 #pragma pipeloop(0)
 265 #endif /* __SUNPRO_C */
 266     for (i = 0; i < wid + D_KER; i++) {
 267       buff0[i - 1] = (mlib_s32)sl[i*chan1];
 268       buff1[i - 1] = (mlib_s32)sl1[i*chan1];
 269     }
 270 
 271     sl += (D_KER + 1)*sll;
 272 
 273     for (j = 0; j < hgt; j++) {
 274       sp = sl;
 275       dp = dl;
 276 
 277       buff2[-1] = (mlib_s32)sp[0];
 278       sp += chan1;
 279 
 280       p02 = buff0[-1];
 281       p12 = buff1[-1];
 282 
 283 #ifdef __SUNPRO_C
 284 #pragma pipeloop(0)
 285 #endif /* __SUNPRO_C */
 286       for (i = 0; i <= (wid - 2); i += 2) {
 287 #ifdef _NO_LONGLONG
 288         mlib_s32 o64_1, o64_2;
 289 #else /* _NO_LONGLONG */
 290         mlib_s64 o64;
 291 #endif /* _NO_LONGLONG */
 292         d64_2x32 sd0, sd1, dd;
 293 
 294         p00 = p02; p10 = p12;
 295 
 296         sd0.d64 = *(TYPE_64BIT*)(buff0 + i);
 297         sd1.d64 = *(TYPE_64BIT*)(buff1 + i);
 298         p01 = (mlib_d64)sd0.i32s.i0;
 299         p02 = (mlib_d64)sd0.i32s.i1;
 300         p11 = (mlib_d64)sd1.i32s.i0;
 301         p12 = (mlib_d64)sd1.i32s.i1;
 302 
 303         LOAD_BUFF(buff2);
 304 
 305         dd.i32s.i0 = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3);
 306         dd.i32s.i1 = CLAMP_S32(p01 * k0 + p02 * k1 + p11 * k2 + p12 * k3);
 307         *(TYPE_64BIT*)(buffo + i) = dd.d64;
 308 
 309 #ifdef _NO_LONGLONG
 310 
 311         o64_1 = buffo[i];
 312         o64_2 = buffo[i+1];
 313         STORE2(o64_1 >> 16, o64_2 >> 16);
 314 
 315 #else /* _NO_LONGLONG */
 316 
 317         o64 = *(mlib_s64*)(buffo + i);
 318         STORE2(o64 >> 48, o64 >> 16);
 319 
 320 #endif /* _NO_LONGLONG */
 321 
 322         sp += chan2;
 323         dp += chan2;
 324       }
 325 
 326       for (; i < wid; i++) {
 327         p00 = buff0[i - 1]; p10 = buff1[i - 1];
 328         p01 = buff0[i];     p11 = buff1[i];
 329 
 330         buff2[i] = (mlib_s32)sp[0];
 331 
 332         buffo[i] = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3);
 333         dp[0] = buffo[i] >> 16;
 334 
 335         sp += chan1;
 336         dp += chan1;
 337       }
 338 
 339       sl += sll;
 340       dl += dll;
 341 
 342       buffT = buff0;
 343       buff0 = buff1;
 344       buff1 = buff2;
 345       buff2 = buffT;
 346     }
 347   }
 348 
 349   if (pbuff != (mlib_s32*)buff_arr) mlib_free(pbuff);
 350 
 351   return MLIB_SUCCESS;
 352 }
 353 
 354 /***************************************************************/
/*
 * 2x2 convolution of an mlib_s16 image that writes all width x height
 * destination pixels, replicating source samples at the right/bottom edge
 * as directed by dx_r and dy_b.
 *
 * dst           destination image
 * src           source image; width, height and channel count come from it
 * dx_l, dy_t    not referenced by this function
 * dx_r          subtracted from the number of source columns preloaded;
 *               when non-zero the last loaded sample of each line is
 *               copied one position to the right
 * dy_b          limits how far the source row pointer advances: rows past
 *               (height - dy_b) are not read, the last row is reused
 * kern          the four kernel coefficients
 * scalef_expon  binary exponent of the kernel scale
 * cmask         channel mask; bit (nchannel - 1 - c) selects channel c
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE if the line buffers cannot be
 * allocated.
 */
mlib_status mlib_c_conv2x2ext_s16(mlib_image       *dst,
                                  const mlib_image *src,
                                  mlib_s32         dx_l,
                                  mlib_s32         dx_r,
                                  mlib_s32         dy_t,
                                  mlib_s32         dy_b,
                                  const mlib_s32   *kern,
                                  mlib_s32         scalef_expon,
                                  mlib_s32         cmask)
{
  mlib_d64 buff_arr[2*BUFF_LINE];
  mlib_s32 *pbuff = (mlib_s32*)buff_arr, *buffo, *buff0, *buff1, *buff2, *buffT;
  DTYPE    *adr_src, *sl, *sp, *sl1;
  DTYPE    *adr_dst, *dl, *dp;
  /* kernel is pre-scaled by 2^16 so the result sits in the upper 16 bits
     of a 32-bit word */
  mlib_d64 k0, k1, k2, k3, scalef = 65536.0;
  mlib_d64 p00, p01, p02,
           p10, p11, p12;
  mlib_s32 wid, hgt, sll, dll, wid1;
  mlib_s32 nchannel, chan1, chan2;
  mlib_s32 i, j, c, swid;
  LOAD_KERNEL_INTO_DOUBLE();
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* a source line holds one sample more than an output line */
  swid = wid + D_KER;

  /* line length rounded up to even so every line buffer stays 8-byte aligned */
  wid1 = (swid + 1) &~ 1;

  if (wid1 > BUFF_LINE) {
    pbuff = mlib_malloc(4*sizeof(mlib_s32)*wid1);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  /* output line followed by three rotating source lines; source lines are
     stored shifted by one, so index -1 holds column 0 */
  buffo = pbuff;
  buff0 = buffo + wid1;
  buff1 = buff0 + wid1;
  buff2 = buff1 + wid1;

  /* number of columns actually preloaded from the source */
  swid -= dx_r;

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* second source row, or the first one again if there is no second */
    if ((hgt - dy_b) > 0) sl1 = sl + sll;
    else sl1 = sl;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < swid; i++) {
      buff0[i - 1] = (mlib_s32)sl[i*chan1];
      buff1[i - 1] = (mlib_s32)sl1[i*chan1];
    }

    /* replicate the last loaded sample one position to the right */
    if (dx_r != 0) {
      buff0[swid - 1] = buff0[swid - 2];
      buff1[swid - 1] = buff1[swid - 2];
    }

    /* row prefetched during the first iteration */
    if ((hgt - dy_b) > 1) sl = sl1 + sll;
    else sl = sl1;

    for (j = 0; j < hgt; j++) {
      sp = sl;
      dp = dl;

      buff2[-1] = (mlib_s32)sp[0];
      sp += chan1;

      p02 = buff0[-1];
      p12 = buff1[-1];

      /*
       * NOTE(review): the two loops below read source column 'wid' of the
       * prefetched row regardless of dx_r; when dx_r is 1 that sample is
       * overwritten by the replication after the loops.  Whether the
       * column is readable depends on the caller -- confirm.
       */

      /* two output pixels per step: 8-byte loads from the line buffers,
         prefetch of two samples of the next row, one 8-byte store */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
#ifdef _NO_LONGLONG
        mlib_s32 o64_1, o64_2;
#else /* _NO_LONGLONG */
        mlib_s64 o64;
#endif /* _NO_LONGLONG */
        d64_2x32 sd0, sd1, dd;

        p00 = p02; p10 = p12;

        sd0.d64 = *(TYPE_64BIT*)(buff0 + i);
        sd1.d64 = *(TYPE_64BIT*)(buff1 + i);
        p01 = (mlib_d64)sd0.i32s.i0;
        p02 = (mlib_d64)sd0.i32s.i1;
        p11 = (mlib_d64)sd1.i32s.i0;
        p12 = (mlib_d64)sd1.i32s.i1;

        LOAD_BUFF(buff2);

        dd.i32s.i0 = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3);
        dd.i32s.i1 = CLAMP_S32(p01 * k0 + p02 * k1 + p11 * k2 + p12 * k3);
        *(TYPE_64BIT*)(buffo + i) = dd.d64;

#ifdef _NO_LONGLONG

        o64_1 = buffo[i];
        o64_2 = buffo[i+1];
        STORE2(o64_1 >> 16, o64_2 >> 16);

#else /* _NO_LONGLONG */

        o64 = *(mlib_s64*)(buffo + i);
        STORE2(o64 >> 48, o64 >> 16);

#endif /* _NO_LONGLONG */

        sp += chan2;
        dp += chan2;
      }

      /* last pixel of an odd-width row */
      for (; i < wid; i++) {
        p00 = buff0[i - 1]; p10 = buff1[i - 1];
        p01 = buff0[i];     p11 = buff1[i];

        buff2[i] = (mlib_s32)sp[0];

        buffo[i] = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3);
        dp[0] = buffo[i] >> 16;

        sp += chan1;
        dp += chan1;
      }

      /* replicate the last loaded sample of the prefetched line */
      if (dx_r != 0) buff2[swid - 1] = buff2[swid - 2];

      /* stop advancing at source row (hgt - dy_b); it is reused afterwards */
      if (j < hgt - dy_b - 2) sl += sll;
      dl += dll;

      /* rotate: the prefetched line becomes the lower line of the next row */
      buffT = buff0;
      buff0 = buff1;
      buff1 = buff2;
      buff2 = buffT;
    }
  }

  if (pbuff != (mlib_s32*)buff_arr) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
 505 
 506 /***************************************************************/
 507 #undef  DTYPE
 508 #define DTYPE mlib_u16
 509 
 510 mlib_status mlib_c_conv2x2nw_u16(mlib_image       *dst,
 511                                  const mlib_image *src,
 512                                  const mlib_s32   *kern,
 513                                  mlib_s32         scalef_expon,
 514                                  mlib_s32         cmask)
 515 {
 516   mlib_d64 buff_arr[2*BUFF_LINE];
 517   mlib_s32 *pbuff = (mlib_s32*)buff_arr, *buffo, *buff0, *buff1, *buff2, *buffT;
 518   DTYPE    *adr_src, *sl, *sp, *sl1;
 519   DTYPE    *adr_dst, *dl, *dp;
 520   mlib_d64 k0, k1, k2, k3, scalef = 65536.0;
 521   mlib_d64 p00, p01, p02,
 522            p10, p11, p12;
 523   mlib_s32 wid, hgt, sll, dll, wid1;
 524   mlib_s32 nchannel, chan1, chan2;
 525   mlib_s32 i, j, c;
 526   mlib_d64 doff = 0x7FFF8000;
 527   LOAD_KERNEL_INTO_DOUBLE();
 528   GET_SRC_DST_PARAMETERS(DTYPE);
 529 
 530   wid1 = (wid + 1) &~ 1;
 531 
 532   if (wid1 > BUFF_LINE) {
 533     pbuff = mlib_malloc(4*sizeof(mlib_s32)*wid1);
 534 
 535     if (pbuff == NULL) return MLIB_FAILURE;
 536   }
 537 
 538   buffo = pbuff;
 539   buff0 = buffo + wid1;
 540   buff1 = buff0 + wid1;
 541   buff2 = buff1 + wid1;
 542 
 543   chan1 = nchannel;
 544   chan2 = chan1 + chan1;
 545 
 546   wid -= D_KER;
 547   hgt -= D_KER;
 548 
 549   for (c = 0; c < nchannel; c++) {
 550     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 551 
 552     sl = adr_src + c;
 553     dl = adr_dst + c;
 554 
 555     sl1 = sl + sll;
 556 #ifdef __SUNPRO_C
 557 #pragma pipeloop(0)
 558 #endif /* __SUNPRO_C */
 559     for (i = 0; i < wid + D_KER; i++) {
 560       buff0[i - 1] = (mlib_s32)sl[i*chan1];
 561       buff1[i - 1] = (mlib_s32)sl1[i*chan1];
 562     }
 563 
 564     sl += (D_KER + 1)*sll;
 565 
 566     for (j = 0; j < hgt; j++) {
 567       sp = sl;
 568       dp = dl;
 569 
 570       buff2[-1] = (mlib_s32)sp[0];
 571       sp += chan1;
 572 
 573       p02 = buff0[-1];
 574       p12 = buff1[-1];
 575 
 576 #ifdef __SUNPRO_C
 577 #pragma pipeloop(0)
 578 #endif /* __SUNPRO_C */
 579       for (i = 0; i <= (wid - 2); i += 2) {
 580 #ifdef _NO_LONGLONG
 581         mlib_s32 o64_1, o64_2;
 582 #else /* _NO_LONGLONG */
 583         mlib_s64 o64;
 584 #endif /* _NO_LONGLONG */
 585         d64_2x32 sd0, sd1, dd;
 586 
 587         p00 = p02; p10 = p12;
 588 
 589         sd0.d64 = *(TYPE_64BIT*)(buff0 + i);
 590         sd1.d64 = *(TYPE_64BIT*)(buff1 + i);
 591         p01 = (mlib_d64)sd0.i32s.i0;
 592         p02 = (mlib_d64)sd0.i32s.i1;
 593         p11 = (mlib_d64)sd1.i32s.i0;
 594         p12 = (mlib_d64)sd1.i32s.i1;
 595 
 596         LOAD_BUFF(buff2);
 597 
 598         dd.i32s.i0 = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3 - doff);
 599         dd.i32s.i1 = CLAMP_S32(p01 * k0 + p02 * k1 + p11 * k2 + p12 * k3 - doff);
 600         *(TYPE_64BIT*)(buffo + i) = dd.d64;
 601 
 602 #ifdef _NO_LONGLONG
 603 
 604         o64_1 = buffo[i];
 605         o64_2 = buffo[i+1];
 606         o64_1 = o64_1 ^ 0x80000000U;
 607         o64_2 = o64_2 ^ 0x80000000U;
 608         STORE2(o64_1 >> 16, o64_2 >> 16);
 609 
 610 #else /* _NO_LONGLONG */
 611 
 612         o64 = *(mlib_s64*)(buffo + i);
 613         o64 = o64 ^ MLIB_U64_CONST(0x8000000080000000);
 614         STORE2(o64 >> 48, o64 >> 16);
 615 
 616 #endif /* _NO_LONGLONG */
 617 
 618         sp += chan2;
 619         dp += chan2;
 620       }
 621 
 622       for (; i < wid; i++) {
 623         p00 = buff0[i - 1]; p10 = buff1[i - 1];
 624         p01 = buff0[i];     p11 = buff1[i];
 625 
 626         buff2[i] = (mlib_s32)sp[0];
 627 
 628         buffo[i] = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3 - doff);
 629         dp[0] = (buffo[i] >> 16) ^ 0x8000;
 630 
 631         sp += chan1;
 632         dp += chan1;
 633       }
 634 
 635       sl += sll;
 636       dl += dll;
 637 
 638       buffT = buff0;
 639       buff0 = buff1;
 640       buff1 = buff2;
 641       buff2 = buffT;
 642     }
 643   }
 644 
 645   if (pbuff != (mlib_s32*)buff_arr) mlib_free(pbuff);
 646 
 647   return MLIB_SUCCESS;
 648 }
 649 
 650 /***************************************************************/
/*
 * 2x2 convolution of an mlib_u16 image that writes all width x height
 * destination pixels, replicating source samples at the right/bottom edge
 * as directed by dx_r and dy_b.
 *
 * The sum is biased by -0x7FFF8000 before the signed 32-bit clamp and the
 * top bit of the clamped value is flipped afterwards, so the signed clamp
 * acts on the unsigned 16-bit output range.
 *
 * dst           destination image
 * src           source image; width, height and channel count come from it
 * dx_l, dy_t    not referenced by this function
 * dx_r          subtracted from the number of source columns preloaded;
 *               when non-zero the last loaded sample of each line is
 *               copied one position to the right
 * dy_b          limits how far the source row pointer advances: rows past
 *               (height - dy_b) are not read, the last row is reused
 * kern          the four kernel coefficients
 * scalef_expon  binary exponent of the kernel scale
 * cmask         channel mask; bit (nchannel - 1 - c) selects channel c
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE if the line buffers cannot be
 * allocated.
 */
mlib_status mlib_c_conv2x2ext_u16(mlib_image       *dst,
                                  const mlib_image *src,
                                  mlib_s32         dx_l,
                                  mlib_s32         dx_r,
                                  mlib_s32         dy_t,
                                  mlib_s32         dy_b,
                                  const mlib_s32   *kern,
                                  mlib_s32         scalef_expon,
                                  mlib_s32         cmask)
{
  mlib_d64 buff_arr[2*BUFF_LINE];
  mlib_s32 *pbuff = (mlib_s32*)buff_arr, *buffo, *buff0, *buff1, *buff2, *buffT;
  DTYPE    *adr_src, *sl, *sp, *sl1;
  DTYPE    *adr_dst, *dl, *dp;
  /* kernel is pre-scaled by 2^16 so the result sits in the upper 16 bits
     of a 32-bit word */
  mlib_d64 k0, k1, k2, k3, scalef = 65536.0;
  mlib_d64 p00, p01, p02,
           p10, p11, p12;
  mlib_s32 wid, hgt, sll, dll, wid1;
  mlib_s32 nchannel, chan1, chan2;
  mlib_s32 i, j, c, swid;
  /* bias applied before CLAMP_S32; undone by the XOR of the top bit */
  mlib_d64 doff = 0x7FFF8000;
  LOAD_KERNEL_INTO_DOUBLE();
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* a source line holds one sample more than an output line */
  swid = wid + D_KER;

  /* line length rounded up to even so every line buffer stays 8-byte aligned */
  wid1 = (swid + 1) &~ 1;

  if (wid1 > BUFF_LINE) {
    pbuff = mlib_malloc(4*sizeof(mlib_s32)*wid1);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  /* output line followed by three rotating source lines; source lines are
     stored shifted by one, so index -1 holds column 0 */
  buffo = pbuff;
  buff0 = buffo + wid1;
  buff1 = buff0 + wid1;
  buff2 = buff1 + wid1;

  /* number of columns actually preloaded from the source */
  swid -= dx_r;

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* second source row, or the first one again if there is no second */
    if ((hgt - dy_b) > 0) sl1 = sl + sll;
    else sl1 = sl;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < swid; i++) {
      buff0[i - 1] = (mlib_s32)sl[i*chan1];
      buff1[i - 1] = (mlib_s32)sl1[i*chan1];
    }

    /* replicate the last loaded sample one position to the right */
    if (dx_r != 0) {
      buff0[swid - 1] = buff0[swid - 2];
      buff1[swid - 1] = buff1[swid - 2];
    }

    /* row prefetched during the first iteration */
    if ((hgt - dy_b) > 1) sl = sl1 + sll;
    else sl = sl1;

    for (j = 0; j < hgt; j++) {
      sp = sl;
      dp = dl;

      buff2[-1] = (mlib_s32)sp[0];
      sp += chan1;

      p02 = buff0[-1];
      p12 = buff1[-1];

      /*
       * NOTE(review): the two loops below read source column 'wid' of the
       * prefetched row regardless of dx_r; when dx_r is 1 that sample is
       * overwritten by the replication after the loops.  Whether the
       * column is readable depends on the caller -- confirm.
       */

      /* two output pixels per step: 8-byte loads from the line buffers,
         prefetch of two samples of the next row, one 8-byte store */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
#ifdef _NO_LONGLONG
        mlib_s32 o64_1, o64_2;
#else /* _NO_LONGLONG */
        mlib_s64 o64;
#endif /* _NO_LONGLONG */
        d64_2x32 sd0, sd1, dd;

        p00 = p02; p10 = p12;

        sd0.d64 = *(TYPE_64BIT*)(buff0 + i);
        sd1.d64 = *(TYPE_64BIT*)(buff1 + i);
        p01 = (mlib_d64)sd0.i32s.i0;
        p02 = (mlib_d64)sd0.i32s.i1;
        p11 = (mlib_d64)sd1.i32s.i0;
        p12 = (mlib_d64)sd1.i32s.i1;

        LOAD_BUFF(buff2);

        dd.i32s.i0 = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3 - doff);
        dd.i32s.i1 = CLAMP_S32(p01 * k0 + p02 * k1 + p11 * k2 + p12 * k3 - doff);
        *(TYPE_64BIT*)(buffo + i) = dd.d64;

#ifdef _NO_LONGLONG

        o64_1 = buffo[i];
        o64_2 = buffo[i+1];
        /* flip the top bit of each 32-bit result to undo the bias */
        o64_1 = o64_1 ^ 0x80000000U;
        o64_2 = o64_2 ^ 0x80000000U;
        STORE2(o64_1 >> 16, o64_2 >> 16);

#else /* _NO_LONGLONG */

        o64 = *(mlib_s64*)(buffo + i);
        /* flip the top bit of both 32-bit halves to undo the bias */
        o64 = o64 ^ MLIB_U64_CONST(0x8000000080000000);
        STORE2(o64 >> 48, o64 >> 16);

#endif /* _NO_LONGLONG */

        sp += chan2;
        dp += chan2;
      }

      /* last pixel of an odd-width row */
      for (; i < wid; i++) {
        p00 = buff0[i - 1]; p10 = buff1[i - 1];
        p01 = buff0[i];     p11 = buff1[i];

        buff2[i] = (mlib_s32)sp[0];

        buffo[i] = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3 - doff);
        dp[0] = (buffo[i] >> 16) ^ 0x8000;

        sp += chan1;
        dp += chan1;
      }

      /* replicate the last loaded sample of the prefetched line */
      if (dx_r != 0) buff2[swid - 1] = buff2[swid - 2];

      /* stop advancing at source row (hgt - dy_b); it is reused afterwards */
      if (j < hgt - dy_b - 2) sl += sll;
      dl += dll;

      /* rotate: the prefetched line becomes the lower line of the next row */
      buffT = buff0;
      buff0 = buff1;
      buff1 = buff2;
      buff2 = buffT;
    }
  }

  if (pbuff != (mlib_s32*)buff_arr) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
 805 
 806 /***************************************************************/
 807 #undef  DTYPE
 808 #define DTYPE mlib_u8
 809 
 810 mlib_status mlib_c_conv2x2nw_u8(mlib_image       *dst,
 811                                 const mlib_image *src,
 812                                 const mlib_s32   *kern,
 813                                 mlib_s32         scalef_expon,
 814                                 mlib_s32         cmask)
 815 {
 816   mlib_d64 buff_arr[2*BUFF_LINE];
 817   mlib_s32 *pbuff = (mlib_s32*)buff_arr, *buffo, *buff0, *buff1, *buff2, *buffT;
 818   DTYPE    *adr_src, *sl, *sp, *sl1;
 819   DTYPE    *adr_dst, *dl, *dp;
 820   mlib_d64 k0, k1, k2, k3, scalef = (1 << 24);
 821   mlib_d64 p00, p01, p02,
 822            p10, p11, p12;
 823   mlib_s32 wid, hgt, sll, dll, wid1;
 824   mlib_s32 nchannel, chan1, chan2;
 825   mlib_s32 i, j, c;
 826   LOAD_KERNEL_INTO_DOUBLE();
 827   GET_SRC_DST_PARAMETERS(DTYPE);
 828 
 829   wid1 = (wid + 1) &~ 1;
 830 
 831   if (wid1 > BUFF_LINE) {
 832     pbuff = mlib_malloc(4*sizeof(mlib_s32)*wid1);
 833 
 834     if (pbuff == NULL) return MLIB_FAILURE;
 835   }
 836 
 837   buffo = pbuff;
 838   buff0 = buffo + wid1;
 839   buff1 = buff0 + wid1;
 840   buff2 = buff1 + wid1;
 841 
 842   chan1 = nchannel;
 843   chan2 = chan1 + chan1;
 844 
 845   wid -= D_KER;
 846   hgt -= D_KER;
 847 
 848   for (c = 0; c < nchannel; c++) {
 849     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
 850 
 851     sl = adr_src + c;
 852     dl = adr_dst + c;
 853 
 854     sl1 = sl + sll;
 855 #ifdef __SUNPRO_C
 856 #pragma pipeloop(0)
 857 #endif /* __SUNPRO_C */
 858     for (i = 0; i < wid + D_KER; i++) {
 859       buff0[i - 1] = (mlib_s32)sl[i*chan1];
 860       buff1[i - 1] = (mlib_s32)sl1[i*chan1];
 861     }
 862 
 863     sl += (D_KER + 1)*sll;
 864 
 865     for (j = 0; j < hgt; j++) {
 866       sp = sl;
 867       dp = dl;
 868 
 869       buff2[-1] = (mlib_s32)sp[0];
 870       sp += chan1;
 871 
 872       p02 = buff0[-1];
 873       p12 = buff1[-1];
 874 
 875 #ifdef __SUNPRO_C
 876 #pragma pipeloop(0)
 877 #endif /* __SUNPRO_C */
 878       for (i = 0; i <= (wid - 2); i += 2) {
 879 #ifdef _NO_LONGLONG
 880         mlib_s32 o64_1, o64_2;
 881 #else /* _NO_LONGLONG */
 882         mlib_s64 o64;
 883 #endif /* _NO_LONGLONG */
 884         d64_2x32 sd0, sd1, dd;
 885 
 886         p00 = p02; p10 = p12;
 887 
 888         sd0.d64 = *(TYPE_64BIT*)(buff0 + i);
 889         sd1.d64 = *(TYPE_64BIT*)(buff1 + i);
 890         p01 = (mlib_d64)sd0.i32s.i0;
 891         p02 = (mlib_d64)sd0.i32s.i1;
 892         p11 = (mlib_d64)sd1.i32s.i0;
 893         p12 = (mlib_d64)sd1.i32s.i1;
 894 
 895         LOAD_BUFF(buff2);
 896 
 897         dd.i32s.i0 = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3 - (1u << 31));
 898         dd.i32s.i1 = CLAMP_S32(p01 * k0 + p02 * k1 + p11 * k2 + p12 * k3 - (1u << 31));
 899         *(TYPE_64BIT*)(buffo + i) = dd.d64;
 900 
 901 #ifdef _NO_LONGLONG
 902 
 903         o64_1 = buffo[i];
 904         o64_2 = buffo[i+1];
 905         STORE2(o64_1 >> 24, o64_2 >> 24);
 906 
 907 #else /* _NO_LONGLONG */
 908 
 909         o64 = *(mlib_s64*)(buffo + i);
 910         STORE2(o64 >> 56, o64 >> 24);
 911 
 912 #endif /* _NO_LONGLONG */
 913 
 914         sp += chan2;
 915         dp += chan2;
 916       }
 917 
 918       for (; i < wid; i++) {
 919         p00 = buff0[i - 1]; p10 = buff1[i - 1];
 920         p01 = buff0[i];     p11 = buff1[i];
 921 
 922         buff2[i] = (mlib_s32)sp[0];
 923 
 924         buffo[i] = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3 - (1u << 31));
 925         dp[0] = (buffo[i] >> 24);
 926 
 927         sp += chan1;
 928         dp += chan1;
 929       }
 930 
 931       sl += sll;
 932       dl += dll;
 933 
 934       buffT = buff0;
 935       buff0 = buff1;
 936       buff1 = buff2;
 937       buff2 = buffT;
 938     }
 939   }
 940 
 941   {
 942     mlib_s32 amask = (1 << nchannel) - 1;
 943 
 944     if ((cmask & amask) != amask) {
 945       mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
 946     } else {
 947       mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
 948     }
 949   }
 950 
 951   if (pbuff != (mlib_s32*)buff_arr) mlib_free(pbuff);
 952 
 953   return MLIB_SUCCESS;
 954 }
 955 
 956 /***************************************************************/
/*
 * 2x2 convolution of an mlib_u8 image that writes all width x height
 * destination pixels, replicating source samples at the right/bottom edge
 * as directed by dx_r and dy_b.
 *
 * The sum is biased by -2^31 before the signed 32-bit clamp and its top byte
 * is stored; a final pass XORs every written byte with 0x80 to undo the bias.
 *
 * dst           destination image
 * src           source image; width, height and channel count come from it
 * dx_l, dy_t    not referenced by this function
 * dx_r          subtracted from the number of source columns preloaded;
 *               when non-zero the last loaded sample of each line is
 *               copied one position to the right
 * dy_b          limits how far the source row pointer advances: rows past
 *               (height - dy_b) are not read, the last row is reused
 * kern          the four kernel coefficients
 * scalef_expon  binary exponent of the kernel scale
 * cmask         channel mask; bit (nchannel - 1 - c) selects channel c
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE if the line buffers cannot be
 * allocated.
 */
mlib_status mlib_c_conv2x2ext_u8(mlib_image       *dst,
                                 const mlib_image *src,
                                 mlib_s32         dx_l,
                                 mlib_s32         dx_r,
                                 mlib_s32         dy_t,
                                 mlib_s32         dy_b,
                                 const mlib_s32   *kern,
                                 mlib_s32         scalef_expon,
                                 mlib_s32         cmask)
{
  /* NOTE(review): the four line buffers use at most 4*BUFF_LINE mlib_s32
     entries on the stack path, i.e. 2*BUFF_LINE mlib_d64 as in the s16/u16
     variants; this array is twice that size. */
  mlib_d64 buff_arr[4*BUFF_LINE];
  mlib_s32 *pbuff = (mlib_s32*)buff_arr, *buffo, *buff0, *buff1, *buff2, *buffT;
  DTYPE    *adr_src, *sl, *sp, *sl1;
  DTYPE    *adr_dst, *dl, *dp;
  /* kernel is pre-scaled by 2^24 so the result sits in the top byte of a
     32-bit word */
  mlib_d64 k0, k1, k2, k3, scalef = (1 << 24);
  mlib_d64 p00, p01, p02,
           p10, p11, p12;
  mlib_s32 wid, hgt, sll, dll, wid1;
  mlib_s32 nchannel, chan1, chan2;
  mlib_s32 i, j, c, swid;
  LOAD_KERNEL_INTO_DOUBLE();
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* a source line holds one sample more than an output line */
  swid = wid + D_KER;

  /* line length rounded up to even so every line buffer stays 8-byte aligned */
  wid1 = (swid + 1) &~ 1;

  if (wid1 > BUFF_LINE) {
    pbuff = mlib_malloc(4*sizeof(mlib_s32)*wid1);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  /* output line followed by three rotating source lines; source lines are
     stored shifted by one, so index -1 holds column 0 */
  buffo = pbuff;
  buff0 = buffo + wid1;
  buff1 = buff0 + wid1;
  buff2 = buff1 + wid1;

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  /* number of columns actually preloaded from the source */
  swid -= dx_r;

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* second source row, or the first one again if there is no second */
    if ((hgt - dy_b) > 0) sl1 = sl + sll;
    else sl1 = sl;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < swid; i++) {
      buff0[i - 1] = (mlib_s32)sl[i*chan1];
      buff1[i - 1] = (mlib_s32)sl1[i*chan1];
    }

    /* replicate the last loaded sample one position to the right */
    if (dx_r != 0) {
      buff0[swid - 1] = buff0[swid - 2];
      buff1[swid - 1] = buff1[swid - 2];
    }

    /* row prefetched during the first iteration */
    if ((hgt - dy_b) > 1) sl = sl1 + sll;
    else sl = sl1;

    for (j = 0; j < hgt; j++) {
      sp = sl;
      dp = dl;

      buff2[-1] = (mlib_s32)sp[0];
      sp += chan1;

      p02 = buff0[-1];
      p12 = buff1[-1];

      /*
       * NOTE(review): the two loops below read source column 'wid' of the
       * prefetched row regardless of dx_r; when dx_r is 1 that sample is
       * overwritten by the replication after the loops.  Whether the
       * column is readable depends on the caller -- confirm.
       */

      /* two output pixels per step: 8-byte loads from the line buffers,
         prefetch of two samples of the next row, one 8-byte store */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
#ifdef _NO_LONGLONG
        mlib_s32 o64_1, o64_2;
#else /* _NO_LONGLONG */
        mlib_s64 o64;
#endif /* _NO_LONGLONG */
        d64_2x32 sd0, sd1, dd;

        p00 = p02; p10 = p12;

        sd0.d64 = *(TYPE_64BIT*)(buff0 + i);
        sd1.d64 = *(TYPE_64BIT*)(buff1 + i);
        p01 = (mlib_d64)sd0.i32s.i0;
        p02 = (mlib_d64)sd0.i32s.i1;
        p11 = (mlib_d64)sd1.i32s.i0;
        p12 = (mlib_d64)sd1.i32s.i1;

        LOAD_BUFF(buff2);

        dd.i32s.i0 = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3 - (1u << 31));
        dd.i32s.i1 = CLAMP_S32(p01 * k0 + p02 * k1 + p11 * k2 + p12 * k3 - (1u << 31));
        *(TYPE_64BIT*)(buffo + i) = dd.d64;

#ifdef _NO_LONGLONG

        o64_1 = buffo[i];
        o64_2 = buffo[i+1];
        STORE2(o64_1 >> 24, o64_2 >> 24);

#else /* _NO_LONGLONG */

        o64 = *(mlib_s64*)(buffo + i);
        STORE2(o64 >> 56, o64 >> 24);

#endif /* _NO_LONGLONG */

        sp += chan2;
        dp += chan2;
      }

      /* last pixel of an odd-width row */
      for (; i < wid; i++) {
        p00 = buff0[i - 1]; p10 = buff1[i - 1];
        p01 = buff0[i];     p11 = buff1[i];

        buff2[i] = (mlib_s32)sp[0];

        buffo[i] = CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3 - (1u << 31));
        dp[0] = (buffo[i] >> 24);

        sp += chan1;
        dp += chan1;
      }

      /* replicate the last loaded sample of the prefetched line */
      if (dx_r != 0) buff2[swid - 1] = buff2[swid - 2];

      /* stop advancing at source row (hgt - dy_b); it is reused afterwards */
      if (j < hgt - dy_b - 2) sl += sll;
      dl += dll;

      /* rotate: the prefetched line becomes the lower line of the next row */
      buffT = buff0;
      buff0 = buff1;
      buff1 = buff2;
      buff2 = buffT;
    }
  }

  /* undo the -2^31 bias: flip bit 7 of every byte written above */
  {
    mlib_s32 amask = (1 << nchannel) - 1;

    if ((cmask & amask) != amask) {
      /* only some channels were processed: touch just those */
      mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
    } else {
      /* all channels processed: whole rows can be handled as byte runs */
      mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
    }
  }

  if (pbuff != (mlib_s32*)buff_arr) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
1117 
1118 /***************************************************************/