1 /*
   2  * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 
  28 /*
  29  *      The functions step along the lines from xLeft to xRight and apply
  30  *      the bilinear filtering.
  31  *
  32  */
  33 
  34 #include "vis_proto.h"
  35 #include "mlib_image.h"
  36 #include "mlib_ImageCopy.h"
  37 #include "mlib_ImageAffine.h"
  38 #include "mlib_v_ImageFilters.h"
  39 #include "mlib_v_ImageChannelExtract.h"
  40 
  41 /*#define MLIB_VIS2*/
  42 
  43 /***************************************************************/
/*
 * DTYPE selects mlib_s16 as the sample type for this file.
 * FUN_NAME(CHAN) pastes a channel tag into the function name, e.g.
 * FUN_NAME(1ch) expands to mlib_ImageAffine_s16_1ch_bl.
 */
#define DTYPE mlib_s16

#define FUN_NAME(CHAN) mlib_ImageAffine_s16_##CHAN##_bl
  47 
  48 /***************************************************************/
/*
 * Forward declarations of the buffered fallback variants defined later in
 * this file.  FUN_NAME(2ch) and FUN_NAME(4ch) return their result when the
 * alignment test at the top of those functions fails.
 */
static mlib_status FUN_NAME(2ch_na)(mlib_affine_param *param);
static mlib_status FUN_NAME(4ch_na)(mlib_affine_param *param);
  51 
  52 /***************************************************************/
  53 const mlib_u64 mlib_dmask_arr[] = {
  54   0x0000000000000000, 0x000000000000FFFF, 0x00000000FFFF0000, 0x00000000FFFFFFFF,
  55   0x0000FFFF00000000, 0x0000FFFF0000FFFF, 0x0000FFFFFFFF0000, 0x0000FFFFFFFFFFFF,
  56   0xFFFF000000000000, 0xFFFF00000000FFFF, 0xFFFF0000FFFF0000, 0xFFFF0000FFFFFFFF,
  57   0xFFFFFFFF00000000, 0xFFFFFFFF0000FFFF, 0xFFFFFFFFFFFF0000, 0xFFFFFFFFFFFFFFFF
  58 };
  59 
  60 /***************************************************************/
/*
 * XOR_8000(x) expands to nothing in this file, so every XOR_8000(...) line
 * inside BL_SUM() and BL_SUM_3CH() is a no-op here.
 * NOTE(review): presumably a sibling source for unsigned 16-bit data defines
 * it to flip the sign bit (cf. the commented-out mask_8000 in DECLAREVAR) -
 * confirm before relying on that.
 */
#define XOR_8000(x)
  62 
  63 /***************************************************************/
/*
 * MLIB_WRITE_BMASK(bmask): when MLIB_VIS2 is defined this calls
 * vis_write_bmask(bmask, 0); otherwise it expands to nothing.
 */
#ifdef MLIB_VIS2
#define MLIB_WRITE_BMASK(bmask) vis_write_bmask(bmask, 0)
#else
#define MLIB_WRITE_BMASK(bmask)
#endif
  69 
  70 /***************************************************************/
/*
 * Redefines DECLAREVAR() for this file.  It expands DECLAREVAR0() (defined
 * outside this file) and then declares the locals shared by the kernels
 * below:
 *   warp_tbl, srcYStride  - read from param
 *   dl                    - byte pointer to the current destination line
 *   i, size               - pixel loop counter and line length
 *   mask_7fff             - built from vis_to_double_dup(0x7FFF7FFF)
 *   dx64, dy64            - packed per-iteration coordinate steps
 *   deltax, deltay        - packed fractional coordinates
 *   delta1_x, delta1_y    - mask_7fff minus deltax / deltay (see BL_SUM)
 *   s0..s3                - the four source sample vectors
 *   d0..d3, dd            - intermediates and the blended result
 * The last declaration carries no semicolon; the call site supplies it.
 */
#undef  DECLAREVAR
#define DECLAREVAR()                                            \
  DECLAREVAR0();                                                \
  mlib_s32  *warp_tbl   = param -> warp_tbl;                    \
  mlib_s32  srcYStride = param -> srcYStride;                   \
  mlib_u8   *dl;                                                \
  mlib_s32  i, size;                                            \
  /*mlib_d64  mask_8000 = vis_to_double_dup(0x80008000);*/      \
  mlib_d64  mask_7fff = vis_to_double_dup(0x7FFF7FFF);          \
  mlib_d64  dx64, dy64, deltax, deltay, delta1_x, delta1_y;     \
  mlib_d64  s0, s1, s2, s3;                                     \
  mlib_d64  d0, d1, d2, d3, dd
  83 
  84 /***************************************************************/
  85 
/*
 * Per-lane 16x16 multiply assembled from the two VIS partial products
 * vis_fmul8sux16 and vis_fmul8ulx16, summed with vis_fpadd16.
 * The macro parameters (x, y) are deliberately handed to the intrinsics in
 * swapped order (y, x); according to the original author this prevents
 * overflow.
 */
#define FMUL_16x16(x, y)                        \
  vis_fpadd16(vis_fmul8sux16(y, x),             \
              vis_fmul8ulx16(y, x))
  90 
  91 /***************************************************************/
/*
 * Number of mlib_d64 elements in the on-stack line buffer used by the
 * 2ch_na, 3ch and 4ch_na variants; longer lines fall back to mlib_malloc.
 */
#define BUF_SIZE  512
  93 
  94 /***************************************************************/
/*
 * Packs four coordinates into one mlib_d64 via vis_to_double: (x0, x1) form
 * the first 32-bit argument and (x2, x3) the second.  For every coordinate
 * only bits 1..15 are kept and moved down by one bit, so each 16-bit field
 * holds a 15-bit value with its top bit clear.
 */
#define DOUBLE_4U16(x0, x1, x2, x3)                                 \
  vis_to_double(((((x0) & 0xFFFE) << 15) | (((x1) & 0xFFFE) >> 1)), \
                ((((x2) & 0xFFFE) << 15) | (((x3) & 0xFFFE) >> 1)))
  98 
  99 /***************************************************************/
/*
 * Bilinear blend of the four sample vectors s0..s3 into dd, followed by the
 * advance of the packed fractions.  Uses the locals from DECLAREVAR().
 *
 *   delta1_x/y = mask_7fff - deltax/y
 *   d0 = 2 * (s0 * delta1_x + s1 * deltax), then multiplied by delta1_y
 *   d2 = 2 * (s2 * delta1_x + s3 * deltax), then multiplied by deltay
 *   dd = 2 * (d0 + d2)
 *
 * All products go through FMUL_16x16.  The doublings (x + x) presumably
 * compensate for the 15-bit fraction format - TODO confirm.
 * Afterwards deltax/deltay are stepped by dx64/dy64 and masked with
 * mask_7fff.  XOR_8000 is empty in this file.
 */
#define BL_SUM()                                                \
  XOR_8000(s0);                                                 \
  XOR_8000(s1);                                                 \
  XOR_8000(s2);                                                 \
  XOR_8000(s3);                                                 \
                                                                \
  delta1_x = vis_fpsub16(mask_7fff, deltax);                    \
  delta1_y = vis_fpsub16(mask_7fff, deltay);                    \
                                                                \
  d0 = FMUL_16x16(s0, delta1_x);                                \
  d1 = FMUL_16x16(s1, deltax);                                  \
  d0 = vis_fpadd16(d0, d1);                                     \
  d0 = vis_fpadd16(d0, d0);                                     \
  d0 = FMUL_16x16(d0, delta1_y);                                \
                                                                \
  d2 = FMUL_16x16(s2, delta1_x);                                \
  d3 = FMUL_16x16(s3, deltax);                                  \
  d2 = vis_fpadd16(d2, d3);                                     \
  d2 = vis_fpadd16(d2, d2);                                     \
  d2 = FMUL_16x16(d2, deltay);                                  \
                                                                \
  dd = vis_fpadd16(d0, d2);                                     \
  dd = vis_fpadd16(dd, dd);                                     \
  XOR_8000(dd);                                                 \
                                                                \
  deltax = vis_fpadd16(deltax, dx64);                           \
  deltay = vis_fpadd16(deltay, dy64);                           \
  deltax = vis_fand(deltax, mask_7fff);                         \
  deltay = vis_fand(deltay, mask_7fff)
 129 
 130 /***************************************************************/
/*
 * Bilinear blend used by the 3-channel kernel.  Unlike BL_SUM() it blends
 * vertically first:
 *
 *   d0 = 2 * (s0 * delta1_y + s2 * deltay), then multiplied by delta1_x
 *   d1 = 2 * (s1 * delta1_y + s3 * deltay), then multiplied by deltax
 *
 * d0 is then passed through vis_faligndata(d0, d0) after
 * vis_alignaddr((void*)0, 2) before being added to d1; dd = 2 * (d0 + d1).
 * NOTE(review): the realignment presumably shifts d0 by one 16-bit sample
 * to match the caller's source pointer, which starts 2 bytes before the
 * pixel - confirm against the VIS manual.
 *
 * Side effect: the vis_alignaddr call changes the alignment state used by
 * later vis_faligndata calls, so callers must set it again before loading.
 * The fractions are advanced exactly as in BL_SUM().
 */
#define BL_SUM_3CH()                                            \
  XOR_8000(s0);                                                 \
  XOR_8000(s1);                                                 \
  XOR_8000(s2);                                                 \
  XOR_8000(s3);                                                 \
                                                                \
  delta1_x = vis_fpsub16(mask_7fff, deltax);                    \
  delta1_y = vis_fpsub16(mask_7fff, deltay);                    \
                                                                \
  d0 = FMUL_16x16(s0, delta1_y);                                \
  d2 = FMUL_16x16(s2, deltay);                                  \
  d0 = vis_fpadd16(d0, d2);                                     \
  d0 = vis_fpadd16(d0, d0);                                     \
  d0 = FMUL_16x16(d0, delta1_x);                                \
                                                                \
  d1 = FMUL_16x16(s1, delta1_y);                                \
  d3 = FMUL_16x16(s3, deltay);                                  \
  d1 = vis_fpadd16(d1, d3);                                     \
  d1 = vis_fpadd16(d1, d1);                                     \
  d1 = FMUL_16x16(d1, deltax);                                  \
                                                                \
  vis_alignaddr((void*)0, 2);                                   \
  d0 = vis_faligndata(d0, d0);                                  \
  dd = vis_fpadd16(d0, d1);                                     \
  dd = vis_fpadd16(dd, dd);                                     \
  XOR_8000(dd);                                                 \
                                                                \
  deltax = vis_fpadd16(deltax, dx64);                           \
  deltay = vis_fpadd16(deltay, dy64);                           \
  deltax = vis_fand(deltax, mask_7fff);                         \
  deltay = vis_fand(deltay, mask_7fff)
 162 
 163 /***************************************************************/
/*
 * Loads one 16-bit value with vis_ld_u16 from sp + ind.  The callers in this
 * file pass mlib_u8 pointers, so ind is a byte offset there.
 */
#define LD_U16(sp, ind) vis_ld_u16(sp + ind)
 165 
 166 /***************************************************************/
/*
 * LOAD_1CH(): gathers the 2x2 neighbourhoods of four 1-channel pixels
 * (byte pointers sp0..sp3) into s0..s3:
 *   s0 <- samples at offset 0               (top-left)
 *   s1 <- samples at offset 2               (top-right, next sample)
 *   s2 <- samples at offset srcYStride      (bottom-left, next row)
 *   s3 <- samples at offset srcYStride + 2  (bottom-right)
 *
 * Without MLIB_VIS2 the values are shifted in one at a time with
 * vis_faligndata, in the order sp3, sp2, sp1, sp0; this depends on the
 * alignment state the caller sets with vis_alignaddr((void*)0, 6).
 * With MLIB_VIS2 the values are combined with vis_bshuffle; the caller must
 * declare the temporaries t0..t3 and write the byte mask beforehand.
 */
#ifndef MLIB_VIS2

#define LOAD_1CH()                                              \
  s0 = vis_faligndata(LD_U16(sp3, 0), mask_7fff);               \
  s1 = vis_faligndata(LD_U16(sp3, 2), mask_7fff);               \
  s2 = vis_faligndata(LD_U16(sp3, srcYStride), mask_7fff);      \
  s3 = vis_faligndata(LD_U16(sp3, srcYStride + 2), mask_7fff);  \
                                                                \
  s0 = vis_faligndata(LD_U16(sp2, 0), s0);                      \
  s1 = vis_faligndata(LD_U16(sp2, 2), s1);                      \
  s2 = vis_faligndata(LD_U16(sp2, srcYStride), s2);             \
  s3 = vis_faligndata(LD_U16(sp2, srcYStride + 2), s3);         \
                                                                \
  s0 = vis_faligndata(LD_U16(sp1, 0), s0);                      \
  s1 = vis_faligndata(LD_U16(sp1, 2), s1);                      \
  s2 = vis_faligndata(LD_U16(sp1, srcYStride), s2);             \
  s3 = vis_faligndata(LD_U16(sp1, srcYStride + 2), s3);         \
                                                                \
  s0 = vis_faligndata(LD_U16(sp0, 0), s0);                      \
  s1 = vis_faligndata(LD_U16(sp0, 2), s1);                      \
  s2 = vis_faligndata(LD_U16(sp0, srcYStride), s2);             \
  s3 = vis_faligndata(LD_U16(sp0, srcYStride + 2), s3)

#else

#define LOAD_1CH()                                                             \
  s0 = vis_bshuffle(LD_U16(sp0, 0), LD_U16(sp2, 0));                           \
  s1 = vis_bshuffle(LD_U16(sp0, 2), LD_U16(sp2, 2));                           \
  s2 = vis_bshuffle(LD_U16(sp0, srcYStride), LD_U16(sp2, srcYStride));         \
  s3 = vis_bshuffle(LD_U16(sp0, srcYStride + 2), LD_U16(sp2, srcYStride + 2)); \
                                                                               \
  t0 = vis_bshuffle(LD_U16(sp1, 0), LD_U16(sp3, 0));                           \
  t1 = vis_bshuffle(LD_U16(sp1, 2), LD_U16(sp3, 2));                           \
  t2 = vis_bshuffle(LD_U16(sp1, srcYStride), LD_U16(sp3, srcYStride));         \
  t3 = vis_bshuffle(LD_U16(sp1, srcYStride + 2), LD_U16(sp3, srcYStride + 2)); \
                                                                               \
  s0 = vis_bshuffle(s0, t0);                                                   \
  s1 = vis_bshuffle(s1, t1);                                                   \
  s2 = vis_bshuffle(s2, t2);                                                   \
  s3 = vis_bshuffle(s3, t3)

#endif
 209 
 210 /***************************************************************/
/*
 * GET_POINTER(sp), 1-channel form: sets sp to the byte address of the
 * source sample at the current (X, Y) - the row pointer fetched from
 * lineAddr at byte offset PTR_SHIFT(Y), plus 2 bytes per integer X - and
 * then advances X and Y by dX and dY.
 * Expands to three statements, so it must not be used as the unbraced body
 * of an if/for.
 */
#define GET_POINTER(sp)                                                       \
  sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 2*(X >> MLIB_SHIFT); \
  X += dX;                                                                    \
  Y += dY
 215 
 216 /***************************************************************/
/*
 * PREPARE_DELTAS, 1-channel form: when a warp table is present, reloads the
 * per-line steps dX/dY from warp_tbl[2*j] and warp_tbl[2*j + 1] and rebuilds
 * dx64/dy64 with the same (dX << 1) packing FUN_NAME(1ch) uses at start-up.
 * NOTE(review): presumably expanded inside NEW_LINE(), which is defined
 * outside this file - confirm.
 */
#undef  PREPARE_DELTAS
#define PREPARE_DELTAS                                                             \
  if (warp_tbl != NULL) {                                                          \
    dX = warp_tbl[2*j    ];                                                        \
    dY = warp_tbl[2*j + 1];                                                        \
    dx64 = vis_to_double_dup((((dX << 1) & 0xFFFF) << 16) | ((dX << 1) & 0xFFFF)); \
    dy64 = vis_to_double_dup((((dY << 1) & 0xFFFF) << 16) | ((dY << 1) & 0xFFFF)); \
  }
 225 
 226 /***************************************************************/
 227 mlib_status FUN_NAME(1ch)(mlib_affine_param *param)
 228 {
 229   DECLAREVAR();
 230   mlib_s32 off;
 231   mlib_s32 x0, x1, x2, x3, y0, y1, y2, y3;
 232 #ifdef MLIB_VIS2
 233   mlib_d64 t0, t1, t2, t3;
 234   vis_write_bmask(0x45CD67EF, 0);
 235 #else
 236   vis_alignaddr((void*)0, 6);
 237 #endif
 238 
 239   dx64 = vis_to_double_dup((((dX << 1) & 0xFFFF) << 16) | ((dX << 1) & 0xFFFF));
 240   dy64 = vis_to_double_dup((((dY << 1) & 0xFFFF) << 16) | ((dY << 1) & 0xFFFF));
 241 
 242   for (j = yStart; j <= yFinish; j++) {
 243     mlib_u8  *sp0, *sp1, *sp2, *sp3;
 244     mlib_d64 *dp, dmask;
 245 
 246     NEW_LINE(1);
 247 
 248     off = (mlib_s32)dl & 7;
 249     dp = (mlib_d64*)(dl - off);
 250     off >>= 1;
 251 
 252     x0 = X - off*dX; y0 = Y - off*dY;
 253     x1 = x0 + dX;    y1 = y0 + dY;
 254     x2 = x1 + dX;    y2 = y1 + dY;
 255     x3 = x2 + dX;    y3 = y2 + dY;
 256 
 257     deltax = DOUBLE_4U16(x0, x1, x2, x3);
 258     deltay = DOUBLE_4U16(y0, y1, y2, y3);
 259 
 260     if (off) {
 261       mlib_s32 emask = vis_edge16((void*)(2*off), (void*)(2*(off + size - 1)));
 262 
 263       off = 4 - off;
 264       GET_POINTER(sp3);
 265       sp0 = sp1 = sp2 = sp3;
 266 
 267       if (off > 1 && size > 1) {
 268         GET_POINTER(sp3);
 269       }
 270 
 271       if (off > 2) {
 272         sp2 = sp3;
 273 
 274         if (size > 2) {
 275           GET_POINTER(sp3);
 276         }
 277       }
 278 
 279       LOAD_1CH();
 280       BL_SUM();
 281 
 282       dmask = ((mlib_d64*)mlib_dmask_arr)[emask];
 283       *dp++ = vis_for (vis_fand(dmask, dd), vis_fandnot(dmask, dp[0]));
 284 
 285       size -= off;
 286 
 287       if (size < 0) size = 0;
 288     }
 289 
 290 #pragma pipeloop(0)
 291     for (i = 0; i < size/4; i++) {
 292       GET_POINTER(sp0);
 293       GET_POINTER(sp1);
 294       GET_POINTER(sp2);
 295       GET_POINTER(sp3);
 296 
 297       LOAD_1CH();
 298       BL_SUM();
 299 
 300       dp[i] = dd;
 301     }
 302 
 303     off = size & 3;
 304 
 305     if (off) {
 306       GET_POINTER(sp0);
 307       sp1 = sp2 = sp3 = sp0;
 308 
 309       if (off > 1) {
 310         GET_POINTER(sp1);
 311       }
 312 
 313       if (off > 2) {
 314         GET_POINTER(sp2);
 315       }
 316 
 317       LOAD_1CH();
 318       BL_SUM();
 319 
 320       dmask = ((mlib_d64*)mlib_dmask_arr)[(0xF0 >> off) & 0x0F];
 321       dp[i] = vis_for (vis_fand(dmask, dd), vis_fandnot(dmask, dp[i]));
 322     }
 323   }
 324 
 325   return MLIB_SUCCESS;
 326 }
 327 
 328 /***************************************************************/
/*
 * GET_POINTER(sp), aligned 2-channel form: the row pointer is treated as
 * mlib_f32*, so adding the integer X moves by one 4-byte element per pixel.
 * X and Y are then advanced by dX and dY.
 * Expands to three statements; use only inside a braced block.
 */
#undef  GET_POINTER
#define GET_POINTER(sp)                                                      \
  sp = *(mlib_f32**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + (X >> MLIB_SHIFT); \
  X += dX;                                                                   \
  Y += dY
 334 
 335 /***************************************************************/
/*
 * LOAD_2CH(): builds s0..s3 for two 2-channel pixels addressed by the
 * mlib_f32 pointers sp0 and sp1.  Each vector pairs the element from sp0
 * with the matching element from sp1 via vis_freg_pair:
 *   s0: element 0                  s1: element 1 (next pixel)
 *   s2: element srcYStride         s3: element srcYStride + 1
 * srcYStride is used as an element index here; FUN_NAME(2ch) shifts it
 * right by 2 before the macro is used.
 */
#define LOAD_2CH()                                              \
  s0 = vis_freg_pair(sp0[0], sp1[0]);                           \
  s1 = vis_freg_pair(sp0[1], sp1[1]);                           \
  s2 = vis_freg_pair(sp0[srcYStride], sp1[srcYStride]);         \
  s3 = vis_freg_pair(sp0[srcYStride + 1], sp1[srcYStride + 1])
 341 
 342 /***************************************************************/
/*
 * PREPARE_DELTAS, 2-channel form: when a warp table is present, reloads
 * dX/dY for line j from warp_tbl and rebuilds dx64/dy64 from the low 16 bits
 * of the unshifted steps, matching the start-up code of FUN_NAME(2ch) and
 * FUN_NAME(2ch_na).
 * NOTE(review): presumably expanded inside NEW_LINE(), which is defined
 * outside this file - confirm.
 */
#undef  PREPARE_DELTAS
#define PREPARE_DELTAS                                               \
  if (warp_tbl != NULL) {                                            \
    dX = warp_tbl[2*j    ];                                          \
    dY = warp_tbl[2*j + 1];                                          \
    dx64 = vis_to_double_dup(((dX & 0xFFFF) << 16) | (dX & 0xFFFF)); \
    dy64 = vis_to_double_dup(((dY & 0xFFFF) << 16) | (dY & 0xFFFF)); \
  }
 351 
 352 /***************************************************************/
 353 mlib_status FUN_NAME(2ch)(mlib_affine_param *param)
 354 {
 355   DECLAREVAR();
 356   mlib_s32 off;
 357   mlib_s32 x0, x1, y0, y1;
 358 
 359   if (((mlib_s32)lineAddr[0] | (mlib_s32)dstData | srcYStride | dstYStride) & 3) {
 360     return FUN_NAME(2ch_na)(param);
 361   }
 362 
 363   srcYStride >>= 2;
 364 
 365   dx64 = vis_to_double_dup(((dX & 0xFFFF) << 16) | (dX & 0xFFFF));
 366   dy64 = vis_to_double_dup(((dY & 0xFFFF) << 16) | (dY & 0xFFFF));
 367 
 368   for (j = yStart; j <= yFinish; j++) {
 369     mlib_f32 *sp0, *sp1;
 370     mlib_d64 *dp;
 371 
 372     NEW_LINE(2);
 373 
 374     off = (mlib_s32)dl & 7;
 375     dp = (mlib_d64*)(dl - off);
 376 
 377     if (off) {
 378       x0 = X - dX; y0 = Y - dY;
 379       x1 = X;      y1 = Y;
 380     } else {
 381       x0 = X;      y0 = Y;
 382       x1 = X + dX; y1 = Y + dY;
 383     }
 384 
 385     deltax = DOUBLE_4U16(x0, x0, x1, x1);
 386     deltay = DOUBLE_4U16(y0, y0, y1, y1);
 387 
 388     if (off) {
 389       GET_POINTER(sp1);
 390       sp0 = sp1;
 391       LOAD_2CH();
 392 
 393       BL_SUM();
 394 
 395       ((mlib_f32*)dp)[1] = vis_read_lo(dd);
 396       dp++;
 397       size--;
 398     }
 399 
 400 #pragma pipeloop(0)
 401     for (i = 0; i < size/2; i++) {
 402       GET_POINTER(sp0);
 403       GET_POINTER(sp1);
 404       LOAD_2CH();
 405 
 406       BL_SUM();
 407 
 408       *dp++ = dd;
 409     }
 410 
 411     if (size & 1) {
 412       GET_POINTER(sp0);
 413       sp1 = sp0;
 414       LOAD_2CH();
 415 
 416       BL_SUM();
 417 
 418       ((mlib_f32*)dp)[0] = vis_read_hi(dd);
 419     }
 420   }
 421 
 422   return MLIB_SUCCESS;
 423 }
 424 
 425 /***************************************************************/
/*
 * GET_POINTER(sp), unaligned 2-channel form: sp becomes the byte address of
 * the pixel at (X, Y), using 4 bytes per integer X; X and Y are then
 * advanced by dX and dY.
 * Expands to three statements; use only inside a braced block.
 */
#undef  GET_POINTER
#define GET_POINTER(sp)                                                       \
  sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 4*(X >> MLIB_SHIFT); \
  X += dX;                                                                    \
  Y += dY
 431 
 432 /***************************************************************/
/*
 * LOAD_2CH_NA(): gathers s0..s3 for two 2-channel pixels addressed by the
 * byte pointers sp0 and sp1 using 16-bit loads only, so no source alignment
 * is required.  Byte offsets used per pixel:
 *   s0: 0 and 2                      (both channels of the pixel)
 *   s1: 4 and 6                      (both channels of the next pixel)
 *   s2: srcYStride, srcYStride + 2   (same pixel, next row)
 *   s3: srcYStride + 4, + 6          (next pixel, next row)
 *
 * Without MLIB_VIS2 the values are shifted in with vis_faligndata in the
 * order sp1 second channel, sp1 first channel, sp0 second channel, sp0
 * first channel; this depends on the caller's vis_alignaddr((void*)0, 6).
 * With MLIB_VIS2 vis_bshuffle is used; the caller must declare t0..t3 and
 * write the byte mask beforehand.
 */
#ifndef MLIB_VIS2

#define LOAD_2CH_NA()                                           \
  s0 = vis_faligndata(LD_U16(sp1, 2), mask_7fff);               \
  s1 = vis_faligndata(LD_U16(sp1, 6), mask_7fff);               \
  s2 = vis_faligndata(LD_U16(sp1, srcYStride + 2), mask_7fff);  \
  s3 = vis_faligndata(LD_U16(sp1, srcYStride + 6), mask_7fff);  \
                                                                \
  s0 = vis_faligndata(LD_U16(sp1, 0), s0);                      \
  s1 = vis_faligndata(LD_U16(sp1, 4), s1);                      \
  s2 = vis_faligndata(LD_U16(sp1, srcYStride), s2);             \
  s3 = vis_faligndata(LD_U16(sp1, srcYStride + 4), s3);         \
                                                                \
  s0 = vis_faligndata(LD_U16(sp0, 2), s0);                      \
  s1 = vis_faligndata(LD_U16(sp0, 6), s1);                      \
  s2 = vis_faligndata(LD_U16(sp0, srcYStride + 2), s2);         \
  s3 = vis_faligndata(LD_U16(sp0, srcYStride + 6), s3);         \
                                                                \
  s0 = vis_faligndata(LD_U16(sp0, 0), s0);                      \
  s1 = vis_faligndata(LD_U16(sp0, 4), s1);                      \
  s2 = vis_faligndata(LD_U16(sp0, srcYStride), s2);             \
  s3 = vis_faligndata(LD_U16(sp0, srcYStride + 4), s3)

#else

#define LOAD_2CH_NA()                                                          \
  s0 = vis_bshuffle(LD_U16(sp0, 0), LD_U16(sp1, 0));                           \
  s1 = vis_bshuffle(LD_U16(sp0, 4), LD_U16(sp1, 4));                           \
  s2 = vis_bshuffle(LD_U16(sp0, srcYStride), LD_U16(sp1, srcYStride));         \
  s3 = vis_bshuffle(LD_U16(sp0, srcYStride + 4), LD_U16(sp1, srcYStride + 4)); \
                                                                               \
  t0 = vis_bshuffle(LD_U16(sp0, 2), LD_U16(sp1, 2));                           \
  t1 = vis_bshuffle(LD_U16(sp0, 6), LD_U16(sp1, 6));                           \
  t2 = vis_bshuffle(LD_U16(sp0, srcYStride + 2), LD_U16(sp1, srcYStride + 2)); \
  t3 = vis_bshuffle(LD_U16(sp0, srcYStride + 6), LD_U16(sp1, srcYStride + 6)); \
                                                                               \
  s0 = vis_bshuffle(s0, t0);                                                   \
  s1 = vis_bshuffle(s1, t1);                                                   \
  s2 = vis_bshuffle(s2, t2);                                                   \
  s3 = vis_bshuffle(s3, t3)

#endif
 475 
 476 /***************************************************************/
 477 mlib_status FUN_NAME(2ch_na)(mlib_affine_param *param)
 478 {
 479   DECLAREVAR();
 480   mlib_s32 max_xsize = param -> max_xsize, bsize;
 481   mlib_s32 x0, x1, y0, y1;
 482   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
 483 #ifdef MLIB_VIS2
 484   mlib_d64 t0, t1, t2, t3;
 485 #endif
 486 
 487   bsize = (max_xsize + 1)/2;
 488 
 489   if (bsize > BUF_SIZE) {
 490     pbuff = mlib_malloc(bsize*sizeof(mlib_d64));
 491 
 492     if (pbuff == NULL) return MLIB_FAILURE;
 493   }
 494 
 495   MLIB_WRITE_BMASK(0x45CD67EF);
 496 
 497   dx64 = vis_to_double_dup(((dX & 0xFFFF) << 16) | (dX & 0xFFFF));
 498   dy64 = vis_to_double_dup(((dY & 0xFFFF) << 16) | (dY & 0xFFFF));
 499 
 500   for (j = yStart; j <= yFinish; j++) {
 501     mlib_u8 *sp0, *sp1;
 502 
 503 #ifndef MLIB_VIS2
 504     vis_alignaddr((void*)0, 6);
 505 #endif
 506 
 507     NEW_LINE(2);
 508 
 509     x0 = X;      y0 = Y;
 510     x1 = X + dX; y1 = Y + dY;
 511 
 512     deltax = DOUBLE_4U16(x0, x0, x1, x1);
 513     deltay = DOUBLE_4U16(y0, y0, y1, y1);
 514 
 515 #pragma pipeloop(0)
 516     for (i = 0; i < size/2; i++) {
 517       GET_POINTER(sp0);
 518       GET_POINTER(sp1);
 519       LOAD_2CH_NA();
 520 
 521       BL_SUM();
 522 
 523       pbuff[i] = dd;
 524     }
 525 
 526     if (size & 1) {
 527       GET_POINTER(sp0);
 528       sp1 = sp0;
 529       LOAD_2CH_NA();
 530 
 531       BL_SUM();
 532 
 533       pbuff[i] = dd;
 534     }
 535 
 536     mlib_ImageCopy_na((mlib_u8*)pbuff, dl, 4*size);
 537   }
 538 
 539   if (pbuff != buff) {
 540     mlib_free(pbuff);
 541   }
 542 
 543   return MLIB_SUCCESS;
 544 }
 545 
 546 /***************************************************************/
/*
 * PREPARE_DELTAS, 3- and 4-channel form: when a warp table is present,
 * reloads dX/dY for line j from warp_tbl, clears their lowest bit (rounding
 * toward zero) and rebuilds dx64/dy64 from (dX >> 1) and (dY >> 1), matching
 * the start-up code of the 3ch, 4ch and 4ch_na functions.
 * NOTE(review): presumably expanded inside NEW_LINE(), which is defined
 * outside this file - confirm.
 */
#undef  PREPARE_DELTAS
#define PREPARE_DELTAS                                                             \
  if (warp_tbl != NULL) {                                                          \
    dX = warp_tbl[2*j    ];                                                        \
    dY = warp_tbl[2*j + 1];                                                        \
    dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */                       \
    dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */                       \
    dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF)); \
    dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF)); \
  }
 557 
 558 /***************************************************************/
 559 mlib_status FUN_NAME(3ch)(mlib_affine_param *param)
 560 {
 561   DECLAREVAR();
 562   mlib_s32 max_xsize = param -> max_xsize;
 563   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
 564 
 565   if (max_xsize > BUF_SIZE) {
 566     pbuff = mlib_malloc(max_xsize*sizeof(mlib_d64));
 567 
 568     if (pbuff == NULL) return MLIB_FAILURE;
 569   }
 570 
 571   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
 572   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
 573   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
 574   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
 575 
 576   for (j = yStart; j <= yFinish; j++) {
 577     mlib_u8  *sp;
 578     mlib_d64 *sp0, *sp1;
 579 
 580     NEW_LINE(3);
 581 
 582     deltax = DOUBLE_4U16(X, X, X, X);
 583     deltay = DOUBLE_4U16(Y, Y, Y, Y);
 584 
 585 #pragma pipeloop(0)
 586     for (i = 0; i < size; i++) {
 587       sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 6*(X >> MLIB_SHIFT) - 2;
 588 
 589       vis_alignaddr(sp, 0);
 590       sp0 = AL_ADDR(sp, 0);
 591       s0 = vis_faligndata(sp0[0], sp0[1]);
 592       s1 = vis_faligndata(sp0[1], sp0[2]);
 593 
 594       vis_alignaddr(sp, srcYStride);
 595       sp1 = AL_ADDR(sp, srcYStride);
 596       s2 = vis_faligndata(sp1[0], sp1[1]);
 597       s3 = vis_faligndata(sp1[1], sp1[2]);
 598 
 599       BL_SUM_3CH();
 600 
 601       pbuff[i] = dd;
 602       X += dX;
 603       Y += dY;
 604     }
 605 
 606     mlib_v_ImageChannelExtract_S16_43L_D1((void *)pbuff, (void *)dl, size);
 607   }
 608 
 609   if (pbuff != buff) {
 610     mlib_free(pbuff);
 611   }
 612 
 613   return MLIB_SUCCESS;
 614 }
 615 
 616 /***************************************************************/
 617 mlib_status FUN_NAME(4ch)(mlib_affine_param *param)
 618 {
 619   DECLAREVAR();
 620 
 621   if (((mlib_s32)lineAddr[0] | (mlib_s32)dstData | srcYStride | dstYStride) & 7) {
 622     return FUN_NAME(4ch_na)(param);
 623   }
 624 
 625   srcYStride >>= 3;
 626 
 627   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
 628   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
 629   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
 630   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
 631 
 632   for (j = yStart; j <= yFinish; j++) {
 633     mlib_d64 *sp;
 634 
 635     NEW_LINE(4);
 636 
 637     deltax = DOUBLE_4U16(X, X, X, X);
 638     deltay = DOUBLE_4U16(Y, Y, Y, Y);
 639 
 640 #pragma pipeloop(0)
 641     for (i = 0; i < size; i++) {
 642       sp = *(mlib_d64**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + (X >> MLIB_SHIFT);
 643       s0 = sp[0];
 644       s1 = sp[1];
 645       s2 = sp[srcYStride];
 646       s3 = sp[srcYStride + 1];
 647 
 648       BL_SUM();
 649 
 650       ((mlib_d64*)dl)[i] = dd;
 651       X += dX;
 652       Y += dY;
 653     }
 654   }
 655 
 656   return MLIB_SUCCESS;
 657 }
 658 
 659 /***************************************************************/
 660 mlib_status FUN_NAME(4ch_na)(mlib_affine_param *param)
 661 {
 662   DECLAREVAR();
 663   mlib_s32 max_xsize = param -> max_xsize;
 664   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
 665 
 666   if (max_xsize > BUF_SIZE) {
 667     pbuff = mlib_malloc(max_xsize*sizeof(mlib_d64));
 668 
 669     if (pbuff == NULL) return MLIB_FAILURE;
 670   }
 671 
 672   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
 673   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
 674   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
 675   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
 676 
 677   for (j = yStart; j <= yFinish; j++) {
 678     mlib_u8  *sp;
 679     mlib_d64 *sp0, *sp1;
 680 
 681     NEW_LINE(4);
 682 
 683     deltax = DOUBLE_4U16(X, X, X, X);
 684     deltay = DOUBLE_4U16(Y, Y, Y, Y);
 685 
 686 #pragma pipeloop(0)
 687     for (i = 0; i < size; i++) {
 688       sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 8*(X >> MLIB_SHIFT);
 689 
 690       vis_alignaddr(sp, 0);
 691       sp0 = AL_ADDR(sp, 0);
 692       s0 = vis_faligndata(sp0[0], sp0[1]);
 693       s1 = vis_faligndata(sp0[1], sp0[2]);
 694 
 695       vis_alignaddr(sp, srcYStride);
 696       sp1 = AL_ADDR(sp, srcYStride);
 697       s2 = vis_faligndata(sp1[0], sp1[1]);
 698       s3 = vis_faligndata(sp1[1], sp1[2]);
 699 
 700       BL_SUM();
 701 
 702       pbuff[i] = dd;
 703       X += dX;
 704       Y += dY;
 705     }
 706 
 707     mlib_ImageCopy_na((mlib_u8*)pbuff, dl, 8*size);
 708   }
 709 
 710   if (pbuff != buff) {
 711     mlib_free(pbuff);
 712   }
 713 
 714   return MLIB_SUCCESS;
 715 }
 716 
 717 /***************************************************************/