1 /*
   2  * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 
  28 /*
  29  *      The functions step along the lines from xLeft to xRight and apply
  30  *      the bilinear filtering.
  31  *
  32  */
  33 
  34 #include "vis_proto.h"
  35 #include "mlib_image.h"
  36 #include "mlib_ImageColormap.h"
  37 #include "mlib_ImageCopy.h"
  38 #include "mlib_ImageAffine.h"
  39 #include "mlib_v_ImageFilters.h"
  40 #include "mlib_v_ImageChannelExtract.h"
  41 
  42 /*#define MLIB_VIS2*/
  43 
  44 /***************************************************************/
  45 #define DTYPE mlib_s16
  46 
  47 #define FUN_NAME(CHAN) mlib_ImageAffine_s16_##CHAN##_bl
  48 
  49 /***************************************************************/
  50 static mlib_status FUN_NAME(2ch_na)(mlib_affine_param *param);
  51 static mlib_status FUN_NAME(4ch_na)(mlib_affine_param *param);
  52 
  53 /***************************************************************/
  54 const mlib_u64 mlib_dmask_arr[] = {
  55   0x0000000000000000, 0x000000000000FFFF, 0x00000000FFFF0000, 0x00000000FFFFFFFF,
  56   0x0000FFFF00000000, 0x0000FFFF0000FFFF, 0x0000FFFFFFFF0000, 0x0000FFFFFFFFFFFF,
  57   0xFFFF000000000000, 0xFFFF00000000FFFF, 0xFFFF0000FFFF0000, 0xFFFF0000FFFFFFFF,
  58   0xFFFFFFFF00000000, 0xFFFFFFFF0000FFFF, 0xFFFFFFFFFFFF0000, 0xFFFFFFFFFFFFFFFF
  59 };
  60 
  61 /***************************************************************/
  62 #define XOR_8000(x)
  63 
  64 /***************************************************************/
  65 #ifdef MLIB_VIS2
  66 #define MLIB_WRITE_BMASK(bmask) vis_write_bmask(bmask, 0)
  67 #else
  68 #define MLIB_WRITE_BMASK(bmask)
  69 #endif
  70 
  71 /***************************************************************/
  72 #undef  DECLAREVAR
  73 #define DECLAREVAR()                                            \
  74   DECLAREVAR0();                                                \
  75   mlib_s32  *warp_tbl   = param -> warp_tbl;                    \
  76   mlib_s32  srcYStride = param -> srcYStride;                   \
  77   mlib_u8   *dl;                                                \
  78   mlib_s32  i, size;                                            \
  79   /*mlib_d64  mask_8000 = vis_to_double_dup(0x80008000);*/      \
  80   mlib_d64  mask_7fff = vis_to_double_dup(0x7FFF7FFF);          \
  81   mlib_d64  dx64, dy64, deltax, deltay, delta1_x, delta1_y;     \
  82   mlib_d64  s0, s1, s2, s3;                                     \
  83   mlib_d64  d0, d1, d2, d3, dd
  84 
  85 /***************************************************************/
  86 
  87 /* arguments (x, y) are swapped to prevent overflow */
  88 #define FMUL_16x16(x, y)                        \
  89   vis_fpadd16(vis_fmul8sux16(y, x),             \
  90               vis_fmul8ulx16(y, x))
  91 
  92 /***************************************************************/
  93 #define BUF_SIZE  512
  94 
  95 /***************************************************************/
  96 #define DOUBLE_4U16(x0, x1, x2, x3)                                 \
  97   vis_to_double(((((x0) & 0xFFFE) << 15) | (((x1) & 0xFFFE) >> 1)), \
  98                 ((((x2) & 0xFFFE) << 15) | (((x3) & 0xFFFE) >> 1)))
  99 
 100 /***************************************************************/
 101 #define BL_SUM()                                                \
 102   XOR_8000(s0);                                                 \
 103   XOR_8000(s1);                                                 \
 104   XOR_8000(s2);                                                 \
 105   XOR_8000(s3);                                                 \
 106                                                                 \
 107   delta1_x = vis_fpsub16(mask_7fff, deltax);                    \
 108   delta1_y = vis_fpsub16(mask_7fff, deltay);                    \
 109                                                                 \
 110   d0 = FMUL_16x16(s0, delta1_x);                                \
 111   d1 = FMUL_16x16(s1, deltax);                                  \
 112   d0 = vis_fpadd16(d0, d1);                                     \
 113   d0 = vis_fpadd16(d0, d0);                                     \
 114   d0 = FMUL_16x16(d0, delta1_y);                                \
 115                                                                 \
 116   d2 = FMUL_16x16(s2, delta1_x);                                \
 117   d3 = FMUL_16x16(s3, deltax);                                  \
 118   d2 = vis_fpadd16(d2, d3);                                     \
 119   d2 = vis_fpadd16(d2, d2);                                     \
 120   d2 = FMUL_16x16(d2, deltay);                                  \
 121                                                                 \
 122   dd = vis_fpadd16(d0, d2);                                     \
 123   dd = vis_fpadd16(dd, dd);                                     \
 124   XOR_8000(dd);                                                 \
 125                                                                 \
 126   deltax = vis_fpadd16(deltax, dx64);                           \
 127   deltay = vis_fpadd16(deltay, dy64);                           \
 128   deltax = vis_fand(deltax, mask_7fff);                         \
 129   deltay = vis_fand(deltay, mask_7fff)
 130 
 131 /***************************************************************/
 132 #define BL_SUM_3CH()                                            \
 133   XOR_8000(s0);                                                 \
 134   XOR_8000(s1);                                                 \
 135   XOR_8000(s2);                                                 \
 136   XOR_8000(s3);                                                 \
 137                                                                 \
 138   delta1_x = vis_fpsub16(mask_7fff, deltax);                    \
 139   delta1_y = vis_fpsub16(mask_7fff, deltay);                    \
 140                                                                 \
 141   d0 = FMUL_16x16(s0, delta1_y);                                \
 142   d2 = FMUL_16x16(s2, deltay);                                  \
 143   d0 = vis_fpadd16(d0, d2);                                     \
 144   d0 = vis_fpadd16(d0, d0);                                     \
 145   d0 = FMUL_16x16(d0, delta1_x);                                \
 146                                                                 \
 147   d1 = FMUL_16x16(s1, delta1_y);                                \
 148   d3 = FMUL_16x16(s3, deltay);                                  \
 149   d1 = vis_fpadd16(d1, d3);                                     \
 150   d1 = vis_fpadd16(d1, d1);                                     \
 151   d1 = FMUL_16x16(d1, deltax);                                  \
 152                                                                 \
 153   vis_alignaddr((void*)0, 2);                                   \
 154   d0 = vis_faligndata(d0, d0);                                  \
 155   dd = vis_fpadd16(d0, d1);                                     \
 156   dd = vis_fpadd16(dd, dd);                                     \
 157   XOR_8000(dd);                                                 \
 158                                                                 \
 159   deltax = vis_fpadd16(deltax, dx64);                           \
 160   deltay = vis_fpadd16(deltay, dy64);                           \
 161   deltax = vis_fand(deltax, mask_7fff);                         \
 162   deltay = vis_fand(deltay, mask_7fff)
 163 
 164 /***************************************************************/
 165 #define LD_U16(sp, ind) vis_ld_u16(sp + ind)
 166 
 167 /***************************************************************/
 168 #ifndef MLIB_VIS2
 169 
 170 #define LOAD_1CH()                                              \
 171   s0 = vis_faligndata(LD_U16(sp3, 0), mask_7fff);               \
 172   s1 = vis_faligndata(LD_U16(sp3, 2), mask_7fff);               \
 173   s2 = vis_faligndata(LD_U16(sp3, srcYStride), mask_7fff);      \
 174   s3 = vis_faligndata(LD_U16(sp3, srcYStride + 2), mask_7fff);  \
 175                                                                 \
 176   s0 = vis_faligndata(LD_U16(sp2, 0), s0);                      \
 177   s1 = vis_faligndata(LD_U16(sp2, 2), s1);                      \
 178   s2 = vis_faligndata(LD_U16(sp2, srcYStride), s2);             \
 179   s3 = vis_faligndata(LD_U16(sp2, srcYStride + 2), s3);         \
 180                                                                 \
 181   s0 = vis_faligndata(LD_U16(sp1, 0), s0);                      \
 182   s1 = vis_faligndata(LD_U16(sp1, 2), s1);                      \
 183   s2 = vis_faligndata(LD_U16(sp1, srcYStride), s2);             \
 184   s3 = vis_faligndata(LD_U16(sp1, srcYStride + 2), s3);         \
 185                                                                 \
 186   s0 = vis_faligndata(LD_U16(sp0, 0), s0);                      \
 187   s1 = vis_faligndata(LD_U16(sp0, 2), s1);                      \
 188   s2 = vis_faligndata(LD_U16(sp0, srcYStride), s2);             \
 189   s3 = vis_faligndata(LD_U16(sp0, srcYStride + 2), s3)
 190 
 191 #else
 192 
 193 #define LOAD_1CH()                                                             \
 194   s0 = vis_bshuffle(LD_U16(sp0, 0), LD_U16(sp2, 0));                           \
 195   s1 = vis_bshuffle(LD_U16(sp0, 2), LD_U16(sp2, 2));                           \
 196   s2 = vis_bshuffle(LD_U16(sp0, srcYStride), LD_U16(sp2, srcYStride));         \
 197   s3 = vis_bshuffle(LD_U16(sp0, srcYStride + 2), LD_U16(sp2, srcYStride + 2)); \
 198                                                                                \
 199   t0 = vis_bshuffle(LD_U16(sp1, 0), LD_U16(sp3, 0));                           \
 200   t1 = vis_bshuffle(LD_U16(sp1, 2), LD_U16(sp3, 2));                           \
 201   t2 = vis_bshuffle(LD_U16(sp1, srcYStride), LD_U16(sp3, srcYStride));         \
 202   t3 = vis_bshuffle(LD_U16(sp1, srcYStride + 2), LD_U16(sp3, srcYStride + 2)); \
 203                                                                                \
 204   s0 = vis_bshuffle(s0, t0);                                                   \
 205   s1 = vis_bshuffle(s1, t1);                                                   \
 206   s2 = vis_bshuffle(s2, t2);                                                   \
 207   s3 = vis_bshuffle(s3, t3)
 208 
 209 #endif
 210 
 211 /***************************************************************/
 212 #define GET_POINTER(sp)                                                       \
 213   sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 2*(X >> MLIB_SHIFT); \
 214   X += dX;                                                                    \
 215   Y += dY
 216 
 217 /***************************************************************/
 218 #undef  PREPARE_DELTAS
 219 #define PREPARE_DELTAS                                                             \
 220   if (warp_tbl != NULL) {                                                          \
 221     dX = warp_tbl[2*j    ];                                                        \
 222     dY = warp_tbl[2*j + 1];                                                        \
 223     dx64 = vis_to_double_dup((((dX << 1) & 0xFFFF) << 16) | ((dX << 1) & 0xFFFF)); \
 224     dy64 = vis_to_double_dup((((dY << 1) & 0xFFFF) << 16) | ((dY << 1) & 0xFFFF)); \
 225   }
 226 
 227 /***************************************************************/
 228 mlib_status FUN_NAME(1ch)(mlib_affine_param *param)
 229 {
 230   DECLAREVAR();
 231   mlib_s32 off;
 232   mlib_s32 x0, x1, x2, x3, y0, y1, y2, y3;
 233 #ifdef MLIB_VIS2
 234   mlib_d64 t0, t1, t2, t3;
 235   vis_write_bmask(0x45CD67EF, 0);
 236 #else
 237   vis_alignaddr((void*)0, 6);
 238 #endif
 239 
 240   dx64 = vis_to_double_dup((((dX << 1) & 0xFFFF) << 16) | ((dX << 1) & 0xFFFF));
 241   dy64 = vis_to_double_dup((((dY << 1) & 0xFFFF) << 16) | ((dY << 1) & 0xFFFF));
 242 
 243   for (j = yStart; j <= yFinish; j++) {
 244     mlib_u8  *sp0, *sp1, *sp2, *sp3;
 245     mlib_d64 *dp, dmask;
 246 
 247     NEW_LINE(1);
 248 
 249     off = (mlib_s32)dl & 7;
 250     dp = (mlib_d64*)(dl - off);
 251     off >>= 1;
 252 
 253     x0 = X - off*dX; y0 = Y - off*dY;
 254     x1 = x0 + dX;    y1 = y0 + dY;
 255     x2 = x1 + dX;    y2 = y1 + dY;
 256     x3 = x2 + dX;    y3 = y2 + dY;
 257 
 258     deltax = DOUBLE_4U16(x0, x1, x2, x3);
 259     deltay = DOUBLE_4U16(y0, y1, y2, y3);
 260 
 261     if (off) {
 262       mlib_s32 emask = vis_edge16((void*)(2*off), (void*)(2*(off + size - 1)));
 263 
 264       off = 4 - off;
 265       GET_POINTER(sp3);
 266       sp0 = sp1 = sp2 = sp3;
 267 
 268       if (off > 1 && size > 1) {
 269         GET_POINTER(sp3);
 270       }
 271 
 272       if (off > 2) {
 273         sp2 = sp3;
 274 
 275         if (size > 2) {
 276           GET_POINTER(sp3);
 277         }
 278       }
 279 
 280       LOAD_1CH();
 281       BL_SUM();
 282 
 283       dmask = ((mlib_d64*)mlib_dmask_arr)[emask];
 284       *dp++ = vis_for (vis_fand(dmask, dd), vis_fandnot(dmask, dp[0]));
 285 
 286       size -= off;
 287 
 288       if (size < 0) size = 0;
 289     }
 290 
 291 #pragma pipeloop(0)
 292     for (i = 0; i < size/4; i++) {
 293       GET_POINTER(sp0);
 294       GET_POINTER(sp1);
 295       GET_POINTER(sp2);
 296       GET_POINTER(sp3);
 297 
 298       LOAD_1CH();
 299       BL_SUM();
 300 
 301       dp[i] = dd;
 302     }
 303 
 304     off = size & 3;
 305 
 306     if (off) {
 307       GET_POINTER(sp0);
 308       sp1 = sp2 = sp3 = sp0;
 309 
 310       if (off > 1) {
 311         GET_POINTER(sp1);
 312       }
 313 
 314       if (off > 2) {
 315         GET_POINTER(sp2);
 316       }
 317 
 318       LOAD_1CH();
 319       BL_SUM();
 320 
 321       dmask = ((mlib_d64*)mlib_dmask_arr)[(0xF0 >> off) & 0x0F];
 322       dp[i] = vis_for (vis_fand(dmask, dd), vis_fandnot(dmask, dp[i]));
 323     }
 324   }
 325 
 326   return MLIB_SUCCESS;
 327 }
 328 
 329 /***************************************************************/
 330 #undef  GET_POINTER
 331 #define GET_POINTER(sp)                                                      \
 332   sp = *(mlib_f32**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + (X >> MLIB_SHIFT); \
 333   X += dX;                                                                   \
 334   Y += dY
 335 
 336 /***************************************************************/
 337 #define LOAD_2CH()                                              \
 338   s0 = vis_freg_pair(sp0[0], sp1[0]);                           \
 339   s1 = vis_freg_pair(sp0[1], sp1[1]);                           \
 340   s2 = vis_freg_pair(sp0[srcYStride], sp1[srcYStride]);         \
 341   s3 = vis_freg_pair(sp0[srcYStride + 1], sp1[srcYStride + 1])
 342 
 343 /***************************************************************/
 344 #undef  PREPARE_DELTAS
 345 #define PREPARE_DELTAS                                               \
 346   if (warp_tbl != NULL) {                                            \
 347     dX = warp_tbl[2*j    ];                                          \
 348     dY = warp_tbl[2*j + 1];                                          \
 349     dx64 = vis_to_double_dup(((dX & 0xFFFF) << 16) | (dX & 0xFFFF)); \
 350     dy64 = vis_to_double_dup(((dY & 0xFFFF) << 16) | (dY & 0xFFFF)); \
 351   }
 352 
 353 /***************************************************************/
 354 mlib_status FUN_NAME(2ch)(mlib_affine_param *param)
 355 {
 356   DECLAREVAR();
 357   mlib_s32 off;
 358   mlib_s32 x0, x1, y0, y1;
 359 
 360   if (((mlib_s32)lineAddr[0] | (mlib_s32)dstData | srcYStride | dstYStride) & 3) {
 361     return FUN_NAME(2ch_na)(param);
 362   }
 363 
 364   srcYStride >>= 2;
 365 
 366   dx64 = vis_to_double_dup(((dX & 0xFFFF) << 16) | (dX & 0xFFFF));
 367   dy64 = vis_to_double_dup(((dY & 0xFFFF) << 16) | (dY & 0xFFFF));
 368 
 369   for (j = yStart; j <= yFinish; j++) {
 370     mlib_f32 *sp0, *sp1;
 371     mlib_d64 *dp;
 372 
 373     NEW_LINE(2);
 374 
 375     off = (mlib_s32)dl & 7;
 376     dp = (mlib_d64*)(dl - off);
 377 
 378     if (off) {
 379       x0 = X - dX; y0 = Y - dY;
 380       x1 = X;      y1 = Y;
 381     } else {
 382       x0 = X;      y0 = Y;
 383       x1 = X + dX; y1 = Y + dY;
 384     }
 385 
 386     deltax = DOUBLE_4U16(x0, x0, x1, x1);
 387     deltay = DOUBLE_4U16(y0, y0, y1, y1);
 388 
 389     if (off) {
 390       GET_POINTER(sp1);
 391       sp0 = sp1;
 392       LOAD_2CH();
 393 
 394       BL_SUM();
 395 
 396       ((mlib_f32*)dp)[1] = vis_read_lo(dd);
 397       dp++;
 398       size--;
 399     }
 400 
 401 #pragma pipeloop(0)
 402     for (i = 0; i < size/2; i++) {
 403       GET_POINTER(sp0);
 404       GET_POINTER(sp1);
 405       LOAD_2CH();
 406 
 407       BL_SUM();
 408 
 409       *dp++ = dd;
 410     }
 411 
 412     if (size & 1) {
 413       GET_POINTER(sp0);
 414       sp1 = sp0;
 415       LOAD_2CH();
 416 
 417       BL_SUM();
 418 
 419       ((mlib_f32*)dp)[0] = vis_read_hi(dd);
 420     }
 421   }
 422 
 423   return MLIB_SUCCESS;
 424 }
 425 
 426 /***************************************************************/
 427 #undef  GET_POINTER
 428 #define GET_POINTER(sp)                                                       \
 429   sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 4*(X >> MLIB_SHIFT); \
 430   X += dX;                                                                    \
 431   Y += dY
 432 
 433 /***************************************************************/
 434 #ifndef MLIB_VIS2
 435 
 436 #define LOAD_2CH_NA()                                           \
 437   s0 = vis_faligndata(LD_U16(sp1, 2), mask_7fff);               \
 438   s1 = vis_faligndata(LD_U16(sp1, 6), mask_7fff);               \
 439   s2 = vis_faligndata(LD_U16(sp1, srcYStride + 2), mask_7fff);  \
 440   s3 = vis_faligndata(LD_U16(sp1, srcYStride + 6), mask_7fff);  \
 441                                                                 \
 442   s0 = vis_faligndata(LD_U16(sp1, 0), s0);                      \
 443   s1 = vis_faligndata(LD_U16(sp1, 4), s1);                      \
 444   s2 = vis_faligndata(LD_U16(sp1, srcYStride), s2);             \
 445   s3 = vis_faligndata(LD_U16(sp1, srcYStride + 4), s3);         \
 446                                                                 \
 447   s0 = vis_faligndata(LD_U16(sp0, 2), s0);                      \
 448   s1 = vis_faligndata(LD_U16(sp0, 6), s1);                      \
 449   s2 = vis_faligndata(LD_U16(sp0, srcYStride + 2), s2);         \
 450   s3 = vis_faligndata(LD_U16(sp0, srcYStride + 6), s3);         \
 451                                                                 \
 452   s0 = vis_faligndata(LD_U16(sp0, 0), s0);                      \
 453   s1 = vis_faligndata(LD_U16(sp0, 4), s1);                      \
 454   s2 = vis_faligndata(LD_U16(sp0, srcYStride), s2);             \
 455   s3 = vis_faligndata(LD_U16(sp0, srcYStride + 4), s3)
 456 
 457 #else
 458 
 459 #define LOAD_2CH_NA()                                                          \
 460   s0 = vis_bshuffle(LD_U16(sp0, 0), LD_U16(sp1, 0));                           \
 461   s1 = vis_bshuffle(LD_U16(sp0, 4), LD_U16(sp1, 4));                           \
 462   s2 = vis_bshuffle(LD_U16(sp0, srcYStride), LD_U16(sp1, srcYStride));         \
 463   s3 = vis_bshuffle(LD_U16(sp0, srcYStride + 4), LD_U16(sp1, srcYStride + 4)); \
 464                                                                                \
 465   t0 = vis_bshuffle(LD_U16(sp0, 2), LD_U16(sp1, 2));                           \
 466   t1 = vis_bshuffle(LD_U16(sp0, 6), LD_U16(sp1, 6));                           \
 467   t2 = vis_bshuffle(LD_U16(sp0, srcYStride + 2), LD_U16(sp1, srcYStride + 2)); \
 468   t3 = vis_bshuffle(LD_U16(sp0, srcYStride + 6), LD_U16(sp1, srcYStride + 6)); \
 469                                                                                \
 470   s0 = vis_bshuffle(s0, t0);                                                   \
 471   s1 = vis_bshuffle(s1, t1);                                                   \
 472   s2 = vis_bshuffle(s2, t2);                                                   \
 473   s3 = vis_bshuffle(s3, t3)
 474 
 475 #endif
 476 
 477 /***************************************************************/
 478 mlib_status FUN_NAME(2ch_na)(mlib_affine_param *param)
 479 {
 480   DECLAREVAR();
 481   mlib_s32 max_xsize = param -> max_xsize, bsize;
 482   mlib_s32 x0, x1, y0, y1;
 483   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
 484 #ifdef MLIB_VIS2
 485   mlib_d64 t0, t1, t2, t3;
 486 #endif
 487 
 488   bsize = (max_xsize + 1)/2;
 489 
 490   if (bsize > BUF_SIZE) {
 491     pbuff = mlib_malloc(bsize*sizeof(mlib_d64));
 492 
 493     if (pbuff == NULL) return MLIB_FAILURE;
 494   }
 495 
 496   MLIB_WRITE_BMASK(0x45CD67EF);
 497 
 498   dx64 = vis_to_double_dup(((dX & 0xFFFF) << 16) | (dX & 0xFFFF));
 499   dy64 = vis_to_double_dup(((dY & 0xFFFF) << 16) | (dY & 0xFFFF));
 500 
 501   for (j = yStart; j <= yFinish; j++) {
 502     mlib_u8 *sp0, *sp1;
 503 
 504 #ifndef MLIB_VIS2
 505     vis_alignaddr((void*)0, 6);
 506 #endif
 507 
 508     NEW_LINE(2);
 509 
 510     x0 = X;      y0 = Y;
 511     x1 = X + dX; y1 = Y + dY;
 512 
 513     deltax = DOUBLE_4U16(x0, x0, x1, x1);
 514     deltay = DOUBLE_4U16(y0, y0, y1, y1);
 515 
 516 #pragma pipeloop(0)
 517     for (i = 0; i < size/2; i++) {
 518       GET_POINTER(sp0);
 519       GET_POINTER(sp1);
 520       LOAD_2CH_NA();
 521 
 522       BL_SUM();
 523 
 524       pbuff[i] = dd;
 525     }
 526 
 527     if (size & 1) {
 528       GET_POINTER(sp0);
 529       sp1 = sp0;
 530       LOAD_2CH_NA();
 531 
 532       BL_SUM();
 533 
 534       pbuff[i] = dd;
 535     }
 536 
 537     mlib_ImageCopy_na((mlib_u8*)pbuff, dl, 4*size);
 538   }
 539 
 540   if (pbuff != buff) {
 541     mlib_free(pbuff);
 542   }
 543 
 544   return MLIB_SUCCESS;
 545 }
 546 
 547 /***************************************************************/
 548 #undef  PREPARE_DELTAS
 549 #define PREPARE_DELTAS                                                             \
 550   if (warp_tbl != NULL) {                                                          \
 551     dX = warp_tbl[2*j    ];                                                        \
 552     dY = warp_tbl[2*j + 1];                                                        \
 553     dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */                       \
 554     dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */                       \
 555     dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF)); \
 556     dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF)); \
 557   }
 558 
 559 /***************************************************************/
 560 mlib_status FUN_NAME(3ch)(mlib_affine_param *param)
 561 {
 562   DECLAREVAR();
 563   mlib_s32 max_xsize = param -> max_xsize;
 564   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
 565 
 566   if (max_xsize > BUF_SIZE) {
 567     pbuff = mlib_malloc(max_xsize*sizeof(mlib_d64));
 568 
 569     if (pbuff == NULL) return MLIB_FAILURE;
 570   }
 571 
 572   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
 573   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
 574   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
 575   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
 576 
 577   for (j = yStart; j <= yFinish; j++) {
 578     mlib_u8  *sp;
 579     mlib_d64 *sp0, *sp1;
 580 
 581     NEW_LINE(3);
 582 
 583     deltax = DOUBLE_4U16(X, X, X, X);
 584     deltay = DOUBLE_4U16(Y, Y, Y, Y);
 585 
 586 #pragma pipeloop(0)
 587     for (i = 0; i < size; i++) {
 588       sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 6*(X >> MLIB_SHIFT) - 2;
 589 
 590       vis_alignaddr(sp, 0);
 591       sp0 = AL_ADDR(sp, 0);
 592       s0 = vis_faligndata(sp0[0], sp0[1]);
 593       s1 = vis_faligndata(sp0[1], sp0[2]);
 594 
 595       vis_alignaddr(sp, srcYStride);
 596       sp1 = AL_ADDR(sp, srcYStride);
 597       s2 = vis_faligndata(sp1[0], sp1[1]);
 598       s3 = vis_faligndata(sp1[1], sp1[2]);
 599 
 600       BL_SUM_3CH();
 601 
 602       pbuff[i] = dd;
 603       X += dX;
 604       Y += dY;
 605     }
 606 
 607     mlib_v_ImageChannelExtract_S16_43L_D1((void *)pbuff, (void *)dl, size);
 608   }
 609 
 610   if (pbuff != buff) {
 611     mlib_free(pbuff);
 612   }
 613 
 614   return MLIB_SUCCESS;
 615 }
 616 
 617 /***************************************************************/
 618 mlib_status FUN_NAME(4ch)(mlib_affine_param *param)
 619 {
 620   DECLAREVAR();
 621 
 622   if (((mlib_s32)lineAddr[0] | (mlib_s32)dstData | srcYStride | dstYStride) & 7) {
 623     return FUN_NAME(4ch_na)(param);
 624   }
 625 
 626   srcYStride >>= 3;
 627 
 628   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
 629   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
 630   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
 631   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
 632 
 633   for (j = yStart; j <= yFinish; j++) {
 634     mlib_d64 *sp;
 635 
 636     NEW_LINE(4);
 637 
 638     deltax = DOUBLE_4U16(X, X, X, X);
 639     deltay = DOUBLE_4U16(Y, Y, Y, Y);
 640 
 641 #pragma pipeloop(0)
 642     for (i = 0; i < size; i++) {
 643       sp = *(mlib_d64**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + (X >> MLIB_SHIFT);
 644       s0 = sp[0];
 645       s1 = sp[1];
 646       s2 = sp[srcYStride];
 647       s3 = sp[srcYStride + 1];
 648 
 649       BL_SUM();
 650 
 651       ((mlib_d64*)dl)[i] = dd;
 652       X += dX;
 653       Y += dY;
 654     }
 655   }
 656 
 657   return MLIB_SUCCESS;
 658 }
 659 
 660 /***************************************************************/
 661 mlib_status FUN_NAME(4ch_na)(mlib_affine_param *param)
 662 {
 663   DECLAREVAR();
 664   mlib_s32 max_xsize = param -> max_xsize;
 665   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
 666 
 667   if (max_xsize > BUF_SIZE) {
 668     pbuff = mlib_malloc(max_xsize*sizeof(mlib_d64));
 669 
 670     if (pbuff == NULL) return MLIB_FAILURE;
 671   }
 672 
 673   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
 674   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
 675   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
 676   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
 677 
 678   for (j = yStart; j <= yFinish; j++) {
 679     mlib_u8  *sp;
 680     mlib_d64 *sp0, *sp1;
 681 
 682     NEW_LINE(4);
 683 
 684     deltax = DOUBLE_4U16(X, X, X, X);
 685     deltay = DOUBLE_4U16(Y, Y, Y, Y);
 686 
 687 #pragma pipeloop(0)
 688     for (i = 0; i < size; i++) {
 689       sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 8*(X >> MLIB_SHIFT);
 690 
 691       vis_alignaddr(sp, 0);
 692       sp0 = AL_ADDR(sp, 0);
 693       s0 = vis_faligndata(sp0[0], sp0[1]);
 694       s1 = vis_faligndata(sp0[1], sp0[2]);
 695 
 696       vis_alignaddr(sp, srcYStride);
 697       sp1 = AL_ADDR(sp, srcYStride);
 698       s2 = vis_faligndata(sp1[0], sp1[1]);
 699       s3 = vis_faligndata(sp1[1], sp1[2]);
 700 
 701       BL_SUM();
 702 
 703       pbuff[i] = dd;
 704       X += dX;
 705       Y += dY;
 706     }
 707 
 708     mlib_ImageCopy_na((mlib_u8*)pbuff, dl, 8*size);
 709   }
 710 
 711   if (pbuff != buff) {
 712     mlib_free(pbuff);
 713   }
 714 
 715   return MLIB_SUCCESS;
 716 }
 717 
 718 /***************************************************************/
 719 #define LUT(x)  plut[x]
 720 
 721 mlib_status FUN_NAME(s16_i)(mlib_affine_param *param,
 722                             const void        *colormap)
 723 {
 724   DECLAREVAR();
 725   mlib_s32 nchan   = mlib_ImageGetLutChannels(colormap);
 726   mlib_s32 lut_off = mlib_ImageGetLutOffset(colormap);
 727   mlib_d64 *plut = (mlib_d64*)mlib_ImageGetLutNormalTable(colormap) - lut_off;
 728   mlib_s32 max_xsize = param -> max_xsize;
 729   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
 730 
 731   srcYStride /= sizeof(DTYPE);
 732 
 733   if (max_xsize > BUF_SIZE) {
 734     pbuff = mlib_malloc(max_xsize*sizeof(mlib_d64));
 735 
 736     if (pbuff == NULL) return MLIB_FAILURE;
 737   }
 738 
 739   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
 740   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
 741   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
 742   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
 743 
 744   for (j = yStart; j <= yFinish; j++) {
 745     DTYPE *sp;
 746 
 747     NEW_LINE(1);
 748 
 749     deltax = DOUBLE_4U16(X, X, X, X);
 750     deltay = DOUBLE_4U16(Y, Y, Y, Y);
 751 
 752 #pragma pipeloop(0)
 753     for (i = 0; i < size; i++) {
 754       sp = *(DTYPE**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + (X >> MLIB_SHIFT);
 755       s0 = LUT(sp[0]);
 756       s1 = LUT(sp[1]);
 757       s2 = LUT(sp[srcYStride]);
 758       s3 = LUT(sp[srcYStride + 1]);
 759 
 760       BL_SUM();
 761 
 762       pbuff[i] = dd;
 763       X += dX;
 764       Y += dY;
 765     }
 766 
 767     if (nchan == 3) {
 768       mlib_ImageColorTrue2IndexLine_S16_S16_3_in_4((void*)pbuff, (void*)dl, size, colormap);
 769     } else {
 770       mlib_ImageColorTrue2IndexLine_S16_S16_4((void*)pbuff, (void*)dl, size, colormap);
 771     }
 772   }
 773 
 774   if (pbuff != buff) {
 775     mlib_free(pbuff);
 776   }
 777 
 778   return MLIB_SUCCESS;
 779 }
 780 
 781 /***************************************************************/
 782 #undef  DTYPE
 783 #define DTYPE mlib_u8
 784 
 785 mlib_status FUN_NAME(u8_i)(mlib_affine_param *param,
 786                            const void        *colormap)
 787 {
 788   DECLAREVAR();
 789   mlib_s32 nchan   = mlib_ImageGetLutChannels(colormap);
 790   mlib_s32 lut_off = mlib_ImageGetLutOffset(colormap);
 791   mlib_d64 *plut = (mlib_d64*)mlib_ImageGetLutNormalTable(colormap) - lut_off;
 792   mlib_s32 max_xsize = param -> max_xsize;
 793   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
 794 
 795   if (max_xsize > BUF_SIZE) {
 796     pbuff = mlib_malloc(max_xsize*sizeof(mlib_d64));
 797 
 798     if (pbuff == NULL) return MLIB_FAILURE;
 799   }
 800 
 801   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
 802   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
 803   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
 804   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
 805 
 806   for (j = yStart; j <= yFinish; j++) {
 807     DTYPE *sp;
 808 
 809     NEW_LINE(1);
 810 
 811     deltax = DOUBLE_4U16(X, X, X, X);
 812     deltay = DOUBLE_4U16(Y, Y, Y, Y);
 813 
 814 #pragma pipeloop(0)
 815     for (i = 0; i < size; i++) {
 816       sp = *(DTYPE**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + (X >> MLIB_SHIFT);
 817       s0 = LUT(sp[0]);
 818       s1 = LUT(sp[1]);
 819       s2 = LUT(sp[srcYStride]);
 820       s3 = LUT(sp[srcYStride + 1]);
 821 
 822       BL_SUM();
 823 
 824       pbuff[i] = dd;
 825       X += dX;
 826       Y += dY;
 827     }
 828 
 829     if (nchan == 3) {
 830       mlib_ImageColorTrue2IndexLine_S16_U8_3_in_4((void*)pbuff, (void*)dl, size, colormap);
 831     } else {
 832       mlib_ImageColorTrue2IndexLine_S16_U8_4((void*)pbuff, (void*)dl, size, colormap);
 833     }
 834   }
 835 
 836   if (pbuff != buff) {
 837     mlib_free(pbuff);
 838   }
 839 
 840   return MLIB_SUCCESS;
 841 }
 842 
 843 /***************************************************************/