1 /*
   2  * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 
  28 /*
  29  * FILENAME: mlib_v_ImageChannelExtract_43.c
  30  *
  31  * FUNCTIONS
  32  *      mlib_v_ImageChannelExtract_U8_43R_A8D1X8
  33  *      mlib_v_ImageChannelExtract_U8_43R_A8D2X8
  34  *      mlib_v_ImageChannelExtract_U8_43R_D1
  35  *      mlib_v_ImageChannelExtract_U8_43R
  36  *      mlib_v_ImageChannelExtract_S16_43R_A8D1X4
  37  *      mlib_v_ImageChannelExtract_S16_43R_A8D2X4
  38  *      mlib_v_ImageChannelExtract_S16_43R_D1
  39  *      mlib_v_ImageChannelExtract_S16_43R
  40  *      mlib_v_ImageChannelExtract_U8_43L_A8D1X8
  41  *      mlib_v_ImageChannelExtract_U8_43L_A8D2X8
  42  *      mlib_v_ImageChannelExtract_U8_43L_D1
  43  *      mlib_v_ImageChannelExtract_U8_43L
  44  *      mlib_v_ImageChannelExtract_S16_43L_A8D1X4
  45  *      mlib_v_ImageChannelExtract_S16_43L_A8D2X4
  46  *      mlib_v_ImageChannelExtract_S16_43L_D1
  47  *      mlib_v_ImageChannelExtract_S16_43L
  48  *
  49  * SYNOPSIS
  50  *
  51  * ARGUMENT
  52  *      src    pointer to source image data
  53  *      dst    pointer to destination image data
  54  *      slb    source image line stride in bytes
  55  *      dlb    destination image line stride in bytes
  56  *      dsize image data size in pixels
  57  *      xsize  image width in pixels
  58  *      ysize  image height in lines
  59  *      cmask channel mask
  60  *
  61  * DESCRIPTION
  62  *      extract the right or left 3 channels of a 4-channel image to
  63  *      a 3-channel image -- VIS version low level functions.
  64  *
  65  *      ABGR => BGR   (43R), or  RGBA => RGB  (43L)
  66  *
  67  * NOTE
  68  *      These functions are separated from mlib_v_ImageChannelExtract.c
  69  *      for loop unrolling and structure clarity.
  70  */
  71 
  72 #include "vis_proto.h"
  73 #include "mlib_image.h"
  74 #include "mlib_v_ImageChannelExtract.h"
  75 
  76 /***************************************************************/
/*
 * EXTRACT_U8_43R_old -- byte-at-a-time variant of EXTRACT_U8_43R, kept for
 * reference.
 *
 * Expects the caller to have selected a 7-byte GSR offset beforehand (see
 * the commented-out vis_alignaddr((void *)0, 7) lines marked "only for
 * _old" in the aligned functions below), so that every vis_faligndata()
 * moves exactly one byte: "sdN = faligndata(sdN, sdN)" rotates a source
 * register, "ddN = faligndata(sdN, ddN)" pushes one byte into the left
 * edge of a destination register.
 *
 * Works on the caller's locals sd0..sd3 (clobbered) and dd0..dd2; the
 * trailing comments show how each destination register fills up.
 * NOTE(review): no use of this macro is visible in this part of the file.
 */
#define EXTRACT_U8_43R_old          /* shift right */           \
  dd2 = vis_faligndata(sd3, dd2);    /* r7-------------- */     \
  sd3 = vis_faligndata(sd3, sd3);                               \
  dd2 = vis_faligndata(sd3, dd2);    /* g7r7------------ */     \
  sd3 = vis_faligndata(sd3, sd3);                               \
  dd2 = vis_faligndata(sd3, dd2);    /* b7g7r7---------- */     \
  sd3 = vis_faligndata(sd3, sd3);                               \
  sd3 = vis_faligndata(sd3, sd3);                               \
  dd2 = vis_faligndata(sd3, dd2);    /* r6b7g7r7-------- */     \
  sd3 = vis_faligndata(sd3, sd3);                               \
  dd2 = vis_faligndata(sd3, dd2);    /* g6r6b7g7r7------ */     \
  sd3 = vis_faligndata(sd3, sd3);                               \
  dd2 = vis_faligndata(sd3, dd2);    /* b6g6r6b7g7r7---- */     \
                                                                \
  dd2 = vis_faligndata(sd2, dd2);    /* r5b6g6r6b7g7r7-- */     \
  sd2 = vis_faligndata(sd2, sd2);                               \
  dd2 = vis_faligndata(sd2, dd2);    /* g5r5b6g6r6b7g7r7 */     \
                                                                \
  sd2 = vis_faligndata(sd2, sd2);                               \
  dd1 = vis_faligndata(sd2, dd1);    /* b5-------------- */     \
  sd2 = vis_faligndata(sd2, sd2);                               \
  sd2 = vis_faligndata(sd2, sd2);                               \
  dd1 = vis_faligndata(sd2, dd1);    /* r4b5------------ */     \
  sd2 = vis_faligndata(sd2, sd2);                               \
  dd1 = vis_faligndata(sd2, dd1);    /* g4r4b5---------- */     \
  sd2 = vis_faligndata(sd2, sd2);                               \
  dd1 = vis_faligndata(sd2, dd1);    /* b4g4r4b5-------- */     \
                                                                \
  dd1 = vis_faligndata(sd1, dd1);    /* r3b4g4r4b5------ */     \
  sd1 = vis_faligndata(sd1, sd1);                               \
  dd1 = vis_faligndata(sd1, dd1);    /* g3r3b4g4r4b5---- */     \
  sd1 = vis_faligndata(sd1, sd1);                               \
  dd1 = vis_faligndata(sd1, dd1);    /* b3g3r3b4g4r4b5-- */     \
  sd1 = vis_faligndata(sd1, sd1);                               \
  sd1 = vis_faligndata(sd1, sd1);                               \
  dd1 = vis_faligndata(sd1, dd1);    /* r2b3g3r3b4g4r4b5 */     \
                                                                \
  sd1 = vis_faligndata(sd1, sd1);                               \
  dd0 = vis_faligndata(sd1, dd0);    /* g2-------------- */     \
  sd1 = vis_faligndata(sd1, sd1);                               \
  dd0 = vis_faligndata(sd1, dd0);    /* b2g2------------ */     \
                                                                \
  dd0 = vis_faligndata(sd0, dd0);    /* r1b2g2---------- */     \
  sd0 = vis_faligndata(sd0, sd0);                               \
  dd0 = vis_faligndata(sd0, dd0);    /* g1r1b2g2-------- */     \
  sd0 = vis_faligndata(sd0, sd0);                               \
  dd0 = vis_faligndata(sd0, dd0);    /* b1g1r1b2g2------ */     \
  sd0 = vis_faligndata(sd0, sd0);                               \
  sd0 = vis_faligndata(sd0, sd0);                               \
  dd0 = vis_faligndata(sd0, dd0);    /* r0b1g1r1b2g2---- */     \
  sd0 = vis_faligndata(sd0, sd0);                               \
  dd0 = vis_faligndata(sd0, dd0);    /* g0r0b1g1r1b2g2-- */     \
  sd0 = vis_faligndata(sd0, sd0);                               \
  dd0 = vis_faligndata(sd0, dd0);           /* b0g0r0b1g1r1b2g2 */
 131 
 132 /***************************************************************/
/*
 * EXTRACT_U8_43R -- pack the last three bytes of each of the eight 4-byte
 * pixels held in sd0..sd3 (two pixels per register) into the 24 bytes of
 * dd0..dd2.
 *
 * Each "dd = vis_faligndata(x, dd)" shifts the accumulator dd to the right
 * and inserts the trailing bytes of x at its left edge. The offset passed
 * to vis_alignaddr((void *)0, k) selects how many bytes are inserted:
 * three for k = 5, two for k = 6, one for k = 7.
 * sda holds the high half of a source register duplicated into both
 * halves, so the first pixel of that register can be picked up from the
 * trailing position as well.
 *
 * Works on the caller's locals sd0..sd3, sda and dd0..dd2. Statement order
 * matters: every vis_faligndata() depends on the most recent
 * vis_alignaddr(). The last offset selected is 5, so any caller that needs
 * a different offset afterwards must set it again.
 */
#define EXTRACT_U8_43R              /* shift right */           \
  vis_alignaddr((void *)0, 5);                                  \
  dd2 = vis_faligndata(sd3, dd2);    /* b7g7r7---------- */     \
  sda = vis_freg_pair(vis_read_hi(sd3), vis_read_hi(sd3));      \
  dd2 = vis_faligndata(sda, dd2);    /* b6g6r6b7g7r7---- */     \
                                                                \
  vis_alignaddr((void *)0, 6);                                  \
  dd2 = vis_faligndata(sd2, dd2);    /* g5r5b6g6r6b7g7r7 */     \
                                                                \
  vis_alignaddr((void *)0, 5);                                  \
  dd1 = vis_faligndata(sd2, dd1);    /* b5g5r5---------- */     \
  sda = vis_freg_pair(vis_read_hi(sd2), vis_read_hi(sd2));      \
  dd1 = vis_faligndata(sda, dd1);    /* b4g4r4b5g5r5---- */     \
  dd1 = vis_faligndata(sd1, dd1);    /* b3g3r3b4g4r4b5g5 */     \
  sda = vis_freg_pair(vis_read_hi(sd1), vis_read_hi(sd1));      \
  vis_alignaddr((void *)0, 7);                                  \
  dd1 = vis_faligndata(sda, dd1);    /* r2b3g3r3b4g4r4b5 */     \
                                                                \
  vis_alignaddr((void *)0, 5);                                  \
  dd0 = vis_faligndata(sda, dd0);    /* b2g2r2---------- */     \
  dd0 = vis_faligndata(sd0, dd0);    /* b1g1r1b2g2r2---- */     \
  sda = vis_freg_pair(vis_read_hi(sd0), vis_read_hi(sd0));      \
  dd0 = vis_faligndata(sda, dd0);           /* b0g0r0b1g1r1b2g2 */
 156 
 157 /***************************************************************/
 158 #define LOAD_EXTRACT_U8_43R_STORE                               \
 159   sd0 = *sp++;          /* --b0g0r0--b1g1r1 */                  \
 160   sd1 = *sp++;          /* --b2g2r2--b3g3r3 */                  \
 161   sd2 = *sp++;          /* --b4g4r4--b5g5r5 */                  \
 162   sd3 = *sp++;          /* --b6g6r6--b7g7r7 */                  \
 163   EXTRACT_U8_43R;                                               \
 164   *dp++ = dd0;          /* b0g0r0b1g1r1b2g2 */                  \
 165   *dp++ = dd1;          /* r2b3g3r3b4g4r4b5 */                  \
 166   *dp++ = dd2;                              /* g5r5b6g6r6b7g7r7 */
 167 
 168 /***************************************************************/
/*
 * LOAD_EXTRACT_U8_43R -- load step for a possibly unaligned source,
 * followed by EXTRACT_U8_43R.
 *
 * Re-selects the source offset soff for vis_faligndata(), reuses the
 * previous s4 as s0, reads sp[1]..sp[4] and realigns neighbouring words
 * into sd0..sd3. sp then advances by 4 words. The dd2 produced by the
 * previous invocation is saved in dd2old before EXTRACT_U8_43R overwrites
 * dd0..dd2; callers use it when the destination is unaligned too.
 *
 * The caller must preload s4 = sp[0] before the first use. Note that
 * sp[4] is read on every invocation, i.e. one word beyond the 32 bytes
 * that are consumed.
 */
#define LOAD_EXTRACT_U8_43R                                     \
  vis_alignaddr((void *)soff, 0);                               \
  s0 = s4;                                                      \
  s1 = sp[1];                                                   \
  s2 = sp[2];                                                   \
  s3 = sp[3];                                                   \
  s4 = sp[4];                                                   \
  sd0 = vis_faligndata(s0, s1);                                 \
  sd1 = vis_faligndata(s1, s2);                                 \
  sd2 = vis_faligndata(s2, s3);                                 \
  sd3 = vis_faligndata(s3, s4);                                 \
  sp += 4;                                                      \
  dd2old = dd2;                                                 \
  EXTRACT_U8_43R
 183 
 184 /***************************************************************/
 185 /*
 186  * Both source and destination image data are 1-d vectors and
 187  * 8-byte aligned. And dsize is multiple of 8.
 188  */
 189 
 190 void mlib_v_ImageChannelExtract_U8_43R_A8D1X8(const mlib_u8 *src,
 191                                               mlib_u8       *dst,
 192                                               mlib_s32      dsize)
 193 {
 194   mlib_d64 *sp, *dp;
 195   mlib_d64 sd0, sd1, sd2, sd3;                        /* source data */
 196   mlib_d64 dd0, dd1, dd2;                             /* dst data */
 197   mlib_d64 sda;
 198   mlib_s32 i;
 199 
 200   sp = (mlib_d64 *) src;
 201   dp = (mlib_d64 *) dst;
 202 
 203   /* set GSR.offset for vis_faligndata()  */
 204 /* vis_alignaddr((void *)0, 7); *//* only for _old */
 205 
 206 #pragma pipeloop(0)
 207   for (i = 0; i < dsize / 8; i++) {
 208     LOAD_EXTRACT_U8_43R_STORE;
 209   }
 210 }
 211 
 212 /***************************************************************/
 213 /*
 214  * Either source or destination image data are not 1-d vectors, but
 215  * they are 8-byte aligned. And slb and dlb are multiple of 8.
 216  * The xsize is multiple of 8.
 217  */
 218 
 219 void mlib_v_ImageChannelExtract_U8_43R_A8D2X8(const mlib_u8 *src,
 220                                               mlib_s32      slb,
 221                                               mlib_u8       *dst,
 222                                               mlib_s32      dlb,
 223                                               mlib_s32      xsize,
 224                                               mlib_s32      ysize)
 225 {
 226   mlib_d64 *sp, *dp;                                  /* 8-byte aligned pointer for pixel */
 227   mlib_d64 *sl, *dl;                                  /* 8-byte aligned pointer for line */
 228   mlib_d64 sd0, sd1, sd2, sd3;                        /* source data */
 229   mlib_d64 dd0, dd1, dd2;                             /* dst data */
 230   mlib_d64 sda;
 231   mlib_s32 i, j;                                      /* indices for x, y */
 232 
 233   /* set GSR.offset for vis_faligndata()  */
 234 /* vis_alignaddr((void *)0, 7); *//* only for _old */
 235 
 236   sp = sl = (mlib_d64 *) src;
 237   dp = dl = (mlib_d64 *) dst;
 238 
 239   /* row loop */
 240   for (j = 0; j < ysize; j++) {
 241     /* 8-byte column loop */
 242 #pragma pipeloop(0)
 243     for (i = 0; i < xsize / 8; i++) {
 244       LOAD_EXTRACT_U8_43R_STORE;
 245     }
 246 
 247     sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
 248     dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
 249   }
 250 }
 251 
 252 /***************************************************************/
/*
 * General 1-D path: src and/or dst may be unaligned and dsize (in pixels)
 * is arbitrary.
 *
 * The source is read with 8-byte aligned loads and realigned using soff.
 * The destination is written with 8-byte aligned stores; the partial first
 * and last words are written with vis_pst_8() under an edge mask, and when
 * dst itself is not 8-byte aligned the packed data is re-shifted by doff
 * before being stored.
 *
 * NOTE(review): every LOAD_EXTRACT_U8_43R reads sp[1]..sp[4], so 40 bytes
 * starting at the aligned source address are read even for a tiny dsize,
 * and one extra group is loaded for the tail -- confirm that callers
 * tolerate reads beyond the last source pixel.
 */

void mlib_v_ImageChannelExtract_U8_43R_D1(const mlib_u8 *src,
                                          mlib_u8       *dst,
                                          mlib_s32      dsize)
{
  mlib_u8 *sa, *da;
  mlib_u8 *dend, *dend2;                              /* end points in dst */
  mlib_d64 *dp;                                       /* 8-byte aligned start points in dst */
  mlib_d64 *sp;                                       /* 8-byte aligned start point in src */
  mlib_d64 s0, s1, s2, s3, s4;                        /* 8-byte source row data */
  mlib_d64 sd0, sd1, sd2, sd3;                        /* 8-byte source data */
  mlib_d64 dd0, dd1, dd2;                             /* dst data */
  mlib_d64 dd2old;                                    /* the last datum of the last step */
  mlib_d64 sda;                                       /* scratch used by EXTRACT_U8_43R */
  mlib_s32 soff;                                      /* offset of address in src */
  mlib_s32 doff;                                      /* offset of address in dst */
  mlib_s32 emask;                                     /* edge mask */
  mlib_s32 i, n;

  sa = (void *)src;
  da = dst;

  /* prepare the source address */
  sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
  soff = ((mlib_addr) sa & 7);

  /* prepare the destination addresses */
  dp = (mlib_d64 *) ((mlib_addr) da & (~7));
  /* dend is the last destination byte (3 bytes per pixel) */
  dend = da + dsize * 3 - 1;
  /* dp <= dend2 means a full 24-byte group still fits at dp */
  dend2 = dend - 23;
  /* doff == 8 means dst is already 8-byte aligned */
  doff = 8 - ((mlib_addr) da & 7);

  /* generate edge mask for the start point */
  emask = vis_edge8(da, dend);

  /* load 32 byte, convert, store 24 bytes */
  s4 = sp[0];                               /* initial value */
  LOAD_EXTRACT_U8_43R;

  /*
   * First group. With at least 8 pixels only the first word needs the
   * edge mask; the next two words are stored whole.
   */
  if (dsize >= 8) {
    if (doff == 8) {
      vis_pst_8(dd0, dp++, emask);
      *dp++ = dd1;
      *dp++ = dd2;
    }
    else {
      /* shift the packed data so that dd0's first byte lands on da */
      vis_alignaddr((void *)doff, 0);
      vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask);
      *dp++ = vis_faligndata(dd0, dd1);
      *dp++ = vis_faligndata(dd1, dd2);
    }
  }
  else {                                    /* for very small size */
    /* every store is masked and stops as soon as dp passes dend */
    if (doff == 8) {
      vis_pst_8(dd0, dp++, emask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd1, dp++, emask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(dd2, dp++, emask);
        }
      }
    }
    else {
      /* the shifted data can spill into a fourth destination word */
      vis_alignaddr((void *)doff, 0);
      vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask);
          if ((mlib_addr) dp <= (mlib_addr) dend) {
            emask = vis_edge8(dp, dend);
            vis_pst_8(vis_faligndata(dd2, dd2), dp++, emask);
          }
        }
      }
    }
  }

  /* no edge handling is needed in the loop */
  if (doff == 8) {
    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      /* number of complete 24-byte groups that still fit */
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 24 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_EXTRACT_U8_43R;
        *dp++ = dd0;
        *dp++ = dd1;
        *dp++ = dd2;
      }
    }
  }
  else {
    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      /* number of complete 24-byte groups that still fit */
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 24 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_EXTRACT_U8_43R;
        /* the macro changed the offset; select the dst shift again */
        vis_alignaddr((void *)doff, 0);
        /* dd2old carries the bytes of the previous group not yet stored */
        *dp++ = vis_faligndata(dd2old, dd0);
        *dp++ = vis_faligndata(dd0, dd1);
        *dp++ = vis_faligndata(dd1, dd2);
      }
    }
  }

  /* tail: fewer than 24 bytes remain, store them word by word under a mask */
  if ((mlib_addr) dp <= (mlib_addr) dend) {
    LOAD_EXTRACT_U8_43R;
    emask = vis_edge8(dp, dend);
    if (doff == 8) {
      vis_pst_8(dd0, dp++, emask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd1, dp++, emask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(dd2, dp++, emask);
        }
      }
    }
    else {
      vis_alignaddr((void *)doff, 0);
      vis_pst_8(vis_faligndata(dd2old, dd0), dp++, emask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask);
        }
      }
    }
  }
}
 394 
 395 /***************************************************************/
 396 void mlib_v_ImageChannelExtract_U8_43R(const mlib_u8 *src,
 397                                        mlib_s32      slb,
 398                                        mlib_u8       *dst,
 399                                        mlib_s32      dlb,
 400                                        mlib_s32      xsize,
 401                                        mlib_s32      ysize)
 402 {
 403   mlib_u8 *sa, *da;
 404   mlib_u8 *sl, *dl;
 405   mlib_s32 j;
 406 
 407   sa = sl = (void *)src;
 408   da = dl = dst;
 409 
 410   for (j = 0; j < ysize; j++) {
 411     mlib_v_ImageChannelExtract_U8_43R_D1(sa, da, xsize);
 412     sa = sl += slb;
 413     da = dl += dlb;
 414   }
 415 }
 416 
 417 /***************************************************************/
/*
 * EXTRACT_S16_43R_old -- one-element-at-a-time variant of EXTRACT_S16_43R,
 * kept for reference.
 *
 * Expects the caller to have selected a 6-byte GSR offset beforehand (see
 * the commented-out vis_alignaddr((void *)0, 6) lines marked "only for
 * _old" in the aligned functions below), so that every vis_faligndata()
 * moves exactly one 16-bit element: "sdN = faligndata(sdN, sdN)" rotates a
 * source register, "ddN = faligndata(sdN, ddN)" pushes one element into
 * the left edge of a destination register.
 *
 * Works on the caller's locals sd0..sd3 (clobbered) and dd0..dd2; the
 * trailing comments show how each destination register fills up.
 * NOTE(review): no use of this macro is visible in this part of the file.
 */
#define EXTRACT_S16_43R_old      /* shift right */              \
                                                                \
  dd2 = vis_faligndata(sd3, dd2);    /* r3------ */             \
  sd3 = vis_faligndata(sd3, sd3);                               \
  dd2 = vis_faligndata(sd3, dd2);    /* g3r3---- */             \
  sd3 = vis_faligndata(sd3, sd3);                               \
  dd2 = vis_faligndata(sd3, dd2);    /* b3g3r3-- */             \
                                                                \
  dd2 = vis_faligndata(sd2, dd2);    /* r2b3g3r3 */             \
  sd2 = vis_faligndata(sd2, sd2);                               \
  dd1 = vis_faligndata(sd2, dd1);    /* g2------ */             \
  sd2 = vis_faligndata(sd2, sd2);                               \
  dd1 = vis_faligndata(sd2, dd1);    /* b2g2---- */             \
                                                                \
  dd1 = vis_faligndata(sd1, dd1);    /* r1b2g2-- */             \
  sd1 = vis_faligndata(sd1, sd1);                               \
  dd1 = vis_faligndata(sd1, dd1);    /* g1r1b2g2 */             \
  sd1 = vis_faligndata(sd1, sd1);                               \
  dd0 = vis_faligndata(sd1, dd0);    /* b1------ */             \
                                                                \
  dd0 = vis_faligndata(sd0, dd0);    /* r0b1---- */             \
  sd0 = vis_faligndata(sd0, sd0);                               \
  dd0 = vis_faligndata(sd0, dd0);    /* g0r0b1-- */             \
  sd0 = vis_faligndata(sd0, sd0);                               \
  dd0 = vis_faligndata(sd0, dd0);           /* b0g0r0b1 */
 443 
 444 /***************************************************************/
/*
 * EXTRACT_S16_43R -- pack the last three 16-bit elements of each of the
 * four pixels held in sd0..sd3 (one 4-element pixel per register) into the
 * twelve elements of dd0..dd2.
 *
 * Each "dd = vis_faligndata(x, dd)" shifts the accumulator dd to the right
 * and inserts the trailing elements of x at its left edge. The offset
 * passed to vis_alignaddr((void *)0, k) selects how many are inserted:
 * three elements for k = 2, two for k = 4, one for k = 6.
 *
 * Works on the caller's locals sd0..sd3 and dd0..dd2. Statement order
 * matters: every vis_faligndata() depends on the most recent
 * vis_alignaddr(). The last offset selected is 2, so any caller that needs
 * a different offset afterwards must set it again.
 */
#define EXTRACT_S16_43R        /* shift right */                \
                                                                \
  vis_alignaddr((void *)0, 2);                                  \
  dd2 = vis_faligndata(sd3, dd2);    /* b3g3r3-- */             \
                                                                \
  vis_alignaddr((void *)0, 6);                                  \
  dd2 = vis_faligndata(sd2, dd2);    /* r2b3g3r3 */             \
  vis_alignaddr((void *)0, 2);                                  \
  dd1 = vis_faligndata(sd2, dd1);    /* b2g2r2-- */             \
                                                                \
  vis_alignaddr((void *)0, 4);                                  \
  dd1 = vis_faligndata(sd1, dd1);    /* g1r1b2g2 */             \
  vis_alignaddr((void *)0, 2);                                  \
  dd0 = vis_faligndata(sd1, dd0);    /* b1g1r1-- */             \
  dd0 = vis_faligndata(sd0, dd0);           /* b0g0r0b1 */
 460 
 461 /***************************************************************/
 462 #define LOAD_EXTRACT_S16_43R_STORE                              \
 463                                                                 \
 464   sd0 = *sp++;          /* --b0g0r0 */                          \
 465   sd1 = *sp++;          /* --b1g1r1 */                          \
 466   sd2 = *sp++;          /* --b2g2r2 */                          \
 467   sd3 = *sp++;          /* --b3g3r3 */                          \
 468                                                                 \
 469   EXTRACT_S16_43R;                                              \
 470                                                                 \
 471   *dp++ = dd0;          /* b0g0r0b1 */                          \
 472   *dp++ = dd1;          /* g1r1b2g2 */                          \
 473   *dp++ = dd2;                              /* r2b3g3r3 */
 474 
 475 /***************************************************************/
/*
 * LOAD_EXTRACT_S16_43R -- load step for a possibly unaligned source,
 * followed by EXTRACT_S16_43R.
 *
 * Re-selects the source offset soff for vis_faligndata(), reuses the
 * previous s4 as s0, reads sp[1]..sp[4] and realigns neighbouring words
 * into sd0..sd3. sp then advances by 4 words. The dd2 produced by the
 * previous invocation is saved in dd2old before EXTRACT_S16_43R overwrites
 * dd0..dd2; callers use it when the destination is unaligned too.
 *
 * The caller must preload s4 = sp[0] before the first use. Note that
 * sp[4] is read on every invocation, i.e. one word beyond the 32 bytes
 * that are consumed.
 */
#define LOAD_EXTRACT_S16_43R                                    \
                                                                \
  vis_alignaddr((void *)soff, 0);                               \
  s0 = s4;                                                      \
  s1 = sp[1];                                                   \
  s2 = sp[2];                                                   \
  s3 = sp[3];                                                   \
  s4 = sp[4];                                                   \
  sd0 = vis_faligndata(s0, s1);                                 \
  sd1 = vis_faligndata(s1, s2);                                 \
  sd2 = vis_faligndata(s2, s3);                                 \
  sd3 = vis_faligndata(s3, s4);                                 \
  sp += 4;                                                      \
  dd2old = dd2;                                                 \
  EXTRACT_S16_43R
 491 
 492 /***************************************************************/
 493 /*
 494  * Both source and destination image data are 1-d vectors and
 495  * 8-byte aligned. And size is in 4-pixels.
 496  */
 497 
 498 void mlib_v_ImageChannelExtract_S16_43R_A8D1X4(const mlib_s16 *src,
 499                                                mlib_s16       *dst,
 500                                                mlib_s32       dsize)
 501 {
 502   mlib_d64 *sp, *dp;                                  /* 8-byte aligned pointer for pixel */
 503   mlib_d64 sd0, sd1, sd2, sd3;                        /* source data */
 504   mlib_d64 dd0, dd1, dd2;                             /* dst data */
 505   mlib_s32 i;
 506 
 507   sp = (mlib_d64 *) src;
 508   dp = (mlib_d64 *) dst;
 509 
 510   /* set GSR.offset for vis_faligndata()  */
 511 /* vis_alignaddr((void *)0, 6); *//* only for _old */
 512 
 513 #pragma pipeloop(0)
 514   for (i = 0; i < dsize / 4; i++) {
 515     LOAD_EXTRACT_S16_43R_STORE;
 516   }
 517 }
 518 
 519 /***************************************************************/
 520 /*
 521  * Either source or destination image data are not 1-d vectors, but
 522  * they are 8-byte aligned. The xsize is multiple of 8.
 523  * slb and dlb are multiple of 8.
 524  */
 525 
 526 void mlib_v_ImageChannelExtract_S16_43R_A8D2X4(const mlib_s16 *src,
 527                                                mlib_s32       slb,
 528                                                mlib_s16       *dst,
 529                                                mlib_s32       dlb,
 530                                                mlib_s32       xsize,
 531                                                mlib_s32       ysize)
 532 {
 533   mlib_d64 *sp, *dp;                                  /* 8-byte aligned pointer for pixel */
 534   mlib_d64 *sl, *dl;                                  /* 8-byte aligned pointer for line */
 535   mlib_d64 sd0, sd1, sd2, sd3;                        /* source data */
 536   mlib_d64 dd0, dd1, dd2;                             /* dst data */
 537   mlib_s32 i, j;                                      /* indices for x, y */
 538 
 539   /* set GSR.offset for vis_faligndata()  */
 540 /* vis_alignaddr((void *)0, 6); *//* only for _old */
 541 
 542   sp = sl = (mlib_d64 *) src;
 543   dp = dl = (mlib_d64 *) dst;
 544 
 545   /* row loop */
 546   for (j = 0; j < ysize; j++) {
 547     /* 4-pixel column loop */
 548 #pragma pipeloop(0)
 549     for (i = 0; i < xsize / 4; i++) {
 550       LOAD_EXTRACT_S16_43R_STORE;
 551     }
 552 
 553     sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
 554     dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
 555   }
 556 }
 557 
 558 /***************************************************************/
/*
 * General 1-D path: src and/or dst may not be 8-byte aligned and dsize (in
 * pixels) is arbitrary.
 *
 * The source is read with 8-byte aligned loads and realigned using soff.
 * The destination is written with 8-byte aligned stores; the partial first
 * and last words are written with vis_pst_16() under an edge mask, and
 * when dst itself is not 8-byte aligned the packed data is re-shifted by
 * doff before being stored.
 *
 * NOTE(review): every LOAD_EXTRACT_S16_43R reads sp[1]..sp[4], so 40 bytes
 * starting at the aligned source address are read even for a tiny dsize,
 * and one extra group is loaded for the tail -- confirm that callers
 * tolerate reads beyond the last source pixel.
 */

void mlib_v_ImageChannelExtract_S16_43R_D1(const mlib_s16 *src,
                                           mlib_s16       *dst,
                                           mlib_s32       dsize)
{
  mlib_s16 *sa, *da;                                  /* pointer for pixel */
  mlib_s16 *dend, *dend2;                             /* end points in dst */
  mlib_d64 *dp;                                       /* 8-byte aligned start points in dst */
  mlib_d64 *sp;                                       /* 8-byte aligned start point in src */
  mlib_d64 s0, s1, s2, s3, s4;                        /* 8-byte source row data */
  mlib_d64 sd0, sd1, sd2, sd3;                        /* 8-byte source data */
  mlib_d64 dd0, dd1, dd2;                             /* dst data */
  mlib_d64 dd2old;                                    /* the last datum of the last step */
  mlib_s32 soff;                                      /* offset of address in src */
  mlib_s32 doff;                                      /* offset of address in dst */
  mlib_s32 emask;                                     /* edge mask */
  mlib_s32 i, n;

  sa = (void *)src;
  da = dst;

  /* prepare the source address */
  sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
  soff = ((mlib_addr) sa & 7);

  /* prepare the destination addresses */
  dp = (mlib_d64 *) ((mlib_addr) da & (~7));
  /* dend is the last destination element (3 elements per pixel) */
  dend = da + dsize * 3 - 1;
  /* dp <= dend2 means a full 24-byte group (12 elements) still fits at dp */
  dend2 = dend - 11;
  /* doff == 8 means dst is already 8-byte aligned */
  doff = 8 - ((mlib_addr) da & 7);

  /* generate edge mask for the start point */
  emask = vis_edge16(da, dend);

  /* load 32 byte, convert, store 24 bytes */
  s4 = sp[0];                               /* initial value */
  LOAD_EXTRACT_S16_43R;

  /*
   * First group. With at least 4 pixels only the first word needs the
   * edge mask; the next two words are stored whole.
   */
  if (dsize >= 4) {
    if (doff == 8) {
      vis_pst_16(dd0, dp++, emask);
      *dp++ = dd1;
      *dp++ = dd2;
    }
    else {
      /* shift the packed data so that dd0's first element lands on da */
      vis_alignaddr((void *)doff, 0);
      vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask);
      *dp++ = vis_faligndata(dd0, dd1);
      *dp++ = vis_faligndata(dd1, dd2);
    }
  }
  else {                                    /* for very small size */
    /* every store is masked and stops as soon as dp passes dend */
    if (doff == 8) {
      vis_pst_16(dd0, dp++, emask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge16(dp, dend);
        vis_pst_16(dd1, dp++, emask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge16(dp, dend);
          vis_pst_16(dd2, dp++, emask);
        }
      }
    }
    else {
      vis_alignaddr((void *)doff, 0);
      vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge16(dp, dend);
        vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge16(dp, dend);
          vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask);
        }
      }
    }
  }

  /* no edge handling is needed in the loop */
  if (doff == 8) {
    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      /* number of complete 24-byte groups that still fit */
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 24 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_EXTRACT_S16_43R;
        *dp++ = dd0;
        *dp++ = dd1;
        *dp++ = dd2;
      }
    }
  }
  else {
    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      /* number of complete 24-byte groups that still fit */
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 24 + 1;
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        LOAD_EXTRACT_S16_43R;
        /* the macro changed the offset; select the dst shift again */
        vis_alignaddr((void *)doff, 0);
        /* dd2old carries the elements of the previous group not yet stored */
        *dp++ = vis_faligndata(dd2old, dd0);
        *dp++ = vis_faligndata(dd0, dd1);
        *dp++ = vis_faligndata(dd1, dd2);
      }
    }
  }

  /* tail: fewer than 24 bytes remain, store them word by word under a mask */
  if ((mlib_addr) dp <= (mlib_addr) dend) {
    LOAD_EXTRACT_S16_43R;
    emask = vis_edge16(dp, dend);
    if (doff == 8) {
      vis_pst_16(dd0, dp++, emask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge16(dp, dend);
        vis_pst_16(dd1, dp++, emask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge16(dp, dend);
          vis_pst_16(dd2, dp++, emask);
        }
      }
    }
    else {
      vis_alignaddr((void *)doff, 0);
      vis_pst_16(vis_faligndata(dd2old, dd0), dp++, emask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge16(dp, dend);
        vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge16(dp, dend);
          vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask);
        }
      }
    }
  }
}
 695 
 696 /***************************************************************/
     /*
      * Row-by-row driver for the S16 "43R" extraction: calls
      * mlib_v_ImageChannelExtract_S16_43R_D1 once for each of the ysize rows.
      *
      *   src, dst   first row of the source / destination image
      *   slb, dlb   distance between consecutive rows; both are added through
      *              mlib_u8 pointers below, so they are byte counts
      *   xsize      passed unchanged to the _D1 routine as its size argument
      *   ysize      number of rows to process
      */
 697 void mlib_v_ImageChannelExtract_S16_43R(const mlib_s16 *src,
 698                                         mlib_s32       slb,
 699                                         mlib_s16       *dst,
 700                                         mlib_s32       dlb,
 701                                         mlib_s32       xsize,
 702                                         mlib_s32       ysize)
 703 {
 704   mlib_s16 *sa, *da;
 705   mlib_s16 *sl, *dl;
 706   mlib_s32 j;
 707 
 708   sa = sl = (void *)src;
 709   da = dl = dst;
 710 
 711   for (j = 0; j < ysize; j++) {
 712     mlib_v_ImageChannelExtract_S16_43R_D1(sa, da, xsize);
       /* step both row pointers by their byte strides */
 713     sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
 714     da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
 715   }
 716 }
 717 
 718 /***************************************************************/
     /*
      * EXTRACT_U8_43L_old - byte-at-a-time form of EXTRACT_U8_43L; no use of
      * it appears in this part of the file.
      *
      * Reads sd0..sd3 and builds dd0..dd2 in the enclosing scope; sd0..sd3 are
      * modified along the way.  Going by the layout comments, every
      * vis_faligndata(ddN, sdM) step moves ddN up by one byte and appends the
      * leading byte of sdM, and every vis_faligndata(sdM, sdM) rotates sdM by
      * one byte to expose the next one; two rotations in a row step over the
      * unused fourth byte of a pixel.
      *
      * The macro does not set the GSR offset itself.  The callers keep a
      * commented-out vis_alignaddr((void *)0, 1) marked "for _old only", so it
      * presumably expects an offset of 1 to be set before it is expanded.
      */
 719 #define EXTRACT_U8_43L_old      /* shift left */                \
 720                                                                 \
 721   dd0 = vis_faligndata(dd0, sd0);    /* --------------r0 */     \
 722   sd0 = vis_faligndata(sd0, sd0);                               \
 723   dd0 = vis_faligndata(dd0, sd0);    /* ------------r0g0 */     \
 724   sd0 = vis_faligndata(sd0, sd0);                               \
 725   dd0 = vis_faligndata(dd0, sd0);    /* ----------r0g0b0 */     \
 726   sd0 = vis_faligndata(sd0, sd0);                               \
 727   sd0 = vis_faligndata(sd0, sd0);                               \
 728   dd0 = vis_faligndata(dd0, sd0);    /* --------r0g0b0r1 */     \
 729   sd0 = vis_faligndata(sd0, sd0);                               \
 730   dd0 = vis_faligndata(dd0, sd0);    /* ------r0g0b0r1g1 */     \
 731   sd0 = vis_faligndata(sd0, sd0);                               \
 732   dd0 = vis_faligndata(dd0, sd0);    /* ----r0g0b0r1g1b1 */     \
 733                                                                 \
 734   dd0 = vis_faligndata(dd0, sd1);    /* --r0g0b0r1g1b1r2 */     \
 735   sd1 = vis_faligndata(sd1, sd1);                               \
 736   dd0 = vis_faligndata(dd0, sd1);    /* r0g0b0r1g1b1r2g2 */     \
 737                                                                 \
 738   sd1 = vis_faligndata(sd1, sd1);                               \
 739   dd1 = vis_faligndata(dd1, sd1);    /* --------------b2 */     \
 740   sd1 = vis_faligndata(sd1, sd1);                               \
 741   sd1 = vis_faligndata(sd1, sd1);                               \
 742   dd1 = vis_faligndata(dd1, sd1);    /* ------------b2r3 */     \
 743   sd1 = vis_faligndata(sd1, sd1);                               \
 744   dd1 = vis_faligndata(dd1, sd1);    /* ----------b2r3g3 */     \
 745   sd1 = vis_faligndata(sd1, sd1);                               \
 746   dd1 = vis_faligndata(dd1, sd1);    /* --------b2r3g3b3 */     \
 747                                                                 \
 748   dd1 = vis_faligndata(dd1, sd2);    /* ------b2r3g3b3r4 */     \
 749   sd2 = vis_faligndata(sd2, sd2);                               \
 750   dd1 = vis_faligndata(dd1, sd2);    /* ----b2r3g3b3r4g4 */     \
 751   sd2 = vis_faligndata(sd2, sd2);                               \
 752   dd1 = vis_faligndata(dd1, sd2);    /* --b2r3g3b3r4g4b4 */     \
 753   sd2 = vis_faligndata(sd2, sd2);                               \
 754   sd2 = vis_faligndata(sd2, sd2);                               \
 755   dd1 = vis_faligndata(dd1, sd2);    /* b2r3g3b3r4g4b4r5 */     \
 756                                                                 \
 757   sd2 = vis_faligndata(sd2, sd2);                               \
 758   dd2 = vis_faligndata(dd2, sd2);    /* --------------g5 */     \
 759   sd2 = vis_faligndata(sd2, sd2);                               \
 760   dd2 = vis_faligndata(dd2, sd2);    /* ------------g5b5 */     \
 761                                                                 \
 762   dd2 = vis_faligndata(dd2, sd3);    /* ----------g5b5r6 */     \
 763   sd3 = vis_faligndata(sd3, sd3);                               \
 764   dd2 = vis_faligndata(dd2, sd3);    /* --------g5b5r6g6 */     \
 765   sd3 = vis_faligndata(sd3, sd3);                               \
 766   dd2 = vis_faligndata(dd2, sd3);    /* ------g5b5r6g6b6 */     \
 767   sd3 = vis_faligndata(sd3, sd3);                               \
 768   sd3 = vis_faligndata(sd3, sd3);                               \
 769   dd2 = vis_faligndata(dd2, sd3);    /* ----g5b5r6g6b6r7 */     \
 770   sd3 = vis_faligndata(sd3, sd3);                               \
 771   dd2 = vis_faligndata(dd2, sd3);    /* --g5b5r6g6b6r7g7 */     \
 772   sd3 = vis_faligndata(sd3, sd3);                               \
 773   dd2 = vis_faligndata(dd2, sd3);           /* g5b5r6g6b6r7g7b7 */
 774 
 775 /***************************************************************/
     /*
      * EXTRACT_U8_43L - packs the first three bytes of each 4-byte pixel held
      * in sd0..sd3 (8 pixels, layout "r g b -" as shown in
      * LOAD_EXTRACT_U8_43L_STORE) into 24 contiguous bytes in dd0, dd1, dd2.
      *
      * Expects sd0..sd3, sda and dd0..dd2 to be declared where it expands.
      * vis_alignaddr((void *)0, k) selects how many bytes the following
      * vis_faligndata() calls shift in: k = 3 for a whole pixel, 2 or 1 where
      * a pixel is split across two output doubles.  sda is a source double
      * with its high and low halves swapped, which puts the second pixel of
      * the pair in front.  Each ddN has at least 8 bytes shifted into it
      * (3+3+2, 3+3+3+1, 3+3+3), so its previous contents do not reach the
      * result.
      *
      * Side effect: the GSR offset is left at 3, so code that needs a
      * different alignment afterwards must call vis_alignaddr() again.
      * The expansion is several statements, not one; do not use it as the
      * unbraced body of an if or a loop.
      */
 776 #define EXTRACT_U8_43L        /* shift left */                  \
 777                                                                 \
 778   vis_alignaddr((void *)0, 3);                                  \
 779   dd0 = vis_faligndata(dd0, sd0);    /* ----------r0g0b0 */     \
 780   sda = vis_freg_pair(vis_read_lo(sd0), vis_read_hi(sd0));      \
 781   dd0 = vis_faligndata(dd0, sda);    /* ----r0g0b0r1g1b1 */     \
 782                                                                 \
 783   vis_alignaddr((void *)0, 2);                                  \
 784   dd0 = vis_faligndata(dd0, sd1);    /* r0g0b0r1g1b1r2g2 */     \
 785                                                                 \
 786   vis_alignaddr((void *)0, 3);                                  \
 787   dd1 = vis_faligndata(dd1, sd1);    /* ----------r2g2b2 */     \
 788   sda = vis_freg_pair(vis_read_lo(sd1), vis_read_hi(sd1));      \
 789   dd1 = vis_faligndata(dd1, sda);    /* ----r2g2b2r3g3b3 */     \
 790   dd1 = vis_faligndata(dd1, sd2);    /* g2b2r3g3b3r4g4b4 */     \
 791                                                                 \
 792   sda = vis_freg_pair(vis_read_lo(sd2), vis_read_hi(sd2));      \
 793   vis_alignaddr((void *)0, 1);                                  \
 794   dd1 = vis_faligndata(dd1, sda);    /* b2r3g3b3r4g4b4r5 */     \
 795                                                                 \
 796   vis_alignaddr((void *)0, 3);                                  \
 797   dd2 = vis_faligndata(dd2, sda);    /* ----------r5g5b5 */     \
 798                                                                 \
 799   dd2 = vis_faligndata(dd2, sd3);    /* ----r5g5b5r6g6b6 */     \
 800   sda = vis_freg_pair(vis_read_lo(sd3), vis_read_hi(sd3));      \
 801   dd2 = vis_faligndata(dd2, sda);           /* g5b5r6g6b6r7g7b7 */
 802 
 803 /***************************************************************/
     /*
      * LOAD_EXTRACT_U8_43L_STORE - one pass of the fully aligned path.
      *
      * Reads four doubles (8 four-byte pixels) through sp, packs them with
      * EXTRACT_U8_43L and writes the three resulting doubles through dp.
      * sp advances by 4 doubles and dp by 3.  Needs sp, dp, sd0..sd3, sda and
      * dd0..dd2 in the enclosing scope.  All loads and stores are plain
      * mlib_d64 accesses, with no edge masking.
      */
 804 #define LOAD_EXTRACT_U8_43L_STORE                               \
 805                                                                 \
 806   sd0 = *sp++;          /* r0g0b0--r1g1b1-- */                  \
 807   sd1 = *sp++;          /* r2g2b2--r3g3b3-- */                  \
 808   sd2 = *sp++;          /* r4g4b4--r5g5b5-- */                  \
 809   sd3 = *sp++;          /* r6g6b6--r7g7b7-- */                  \
 810                                                                 \
 811   EXTRACT_U8_43L;                                               \
 812                                                                 \
 813   *dp++ = dd0;          /* r0g0b0r1g1b1r2g2 */                  \
 814   *dp++ = dd1;          /* b2r3g3b3r4g4b4r5 */                  \
 815   *dp++ = dd2;                              /* g5b5r6g6b6r7g7b7 */
 816 
 817 /***************************************************************/
     /*
      * LOAD_EXTRACT_U8_43L - load step for a source that may not be 8-byte
      * aligned, followed by EXTRACT_U8_43L.
      *
      * Uses soff, sp, s0..s4, sd0..sd3, dd2 and dd2old (plus everything
      * EXTRACT_U8_43L needs) from the enclosing scope:
      *   - sets the GSR offset from soff;
      *   - reuses the previous s4 as s0, so s4 must be primed with sp[0]
      *     before the first expansion, and then reads sp[1]..sp[4];
      *   - realigns neighbouring doubles into sd0..sd3 and advances sp by 4;
      *   - saves the previous dd2 in dd2old, which callers storing to an
      *     unaligned destination combine with the new dd0.
      * EXTRACT_U8_43L changes the GSR offset again, so the offset set from
      * soff does not survive the expansion.
      *
      * NOTE(review): the "ABGR aligned" remark below looks carried over from
      * another variant; the layout comments in LOAD_EXTRACT_U8_43L_STORE show
      * "r g b -" pixels - confirm.
      */
 818 #define LOAD_EXTRACT_U8_43L                                             \
 819                                                                         \
 820   vis_alignaddr((void *)soff, 0);                                       \
 821   s0 = s4;                                                              \
 822   s1 = sp[1];                                                           \
 823   s2 = sp[2];                                                           \
 824   s3 = sp[3];                                                           \
 825   s4 = sp[4];                                                           \
 826   sd0 = vis_faligndata(s0, s1);  /* the intermediate is ABGR aligned */ \
 827   sd1 = vis_faligndata(s1, s2);                                         \
 828   sd2 = vis_faligndata(s2, s3);                                         \
 829   sd3 = vis_faligndata(s3, s4);                                         \
 830   sp += 4;                                                              \
 831                                                                         \
 832 /*  vis_alignaddr((void *)0, 1); */    /* for _old only */              \
 833   dd2old = dd2;                                                         \
 834   EXTRACT_U8_43L
 835 
 836 /***************************************************************/
 837 /*
 838  * Both source and destination image data are 1-d vectors and
 839  * 8-byte aligned, and dsize is a multiple of 8.
      *
      * Runs LOAD_EXTRACT_U8_43L_STORE dsize / 8 times.  Each pass reads four
      * doubles through src and writes three doubles through dst, with no
      * edge masking.
      *
      *   src    source data, accessed as mlib_d64
      *   dst    destination data, accessed as mlib_d64
      *   dsize  size; only dsize / 8 is used, as the number of passes
 840  */
 841 
 842 void mlib_v_ImageChannelExtract_U8_43L_A8D1X8(const mlib_u8 *src,
 843                                               mlib_u8       *dst,
 844                                               mlib_s32      dsize)
 845 {
 846   mlib_d64 *sp, *dp;
 847   mlib_d64 sd0, sd1, sd2, sd3;                        /* source data */
 848   mlib_d64 dd0, dd1, dd2;                             /* dst data */
 849   mlib_d64 sda;
 850   mlib_s32 i;
 851 
 852   sp = (mlib_d64 *) src;
 853   dp = (mlib_d64 *) dst;
 854 
 855   /* set GSR.offset for vis_faligndata()  */
 856 /* vis_alignaddr((void *)0, 1); *//* for _old only */
       /* the current macro sets the offset itself on every pass */
 857 
 858 #pragma pipeloop(0)
 859   for (i = 0; i < dsize / 8; i++) {
 860     LOAD_EXTRACT_U8_43L_STORE;
 861   }
 862 }
 863 
 864 /***************************************************************/
 865 /*
 866  * Either source or destination image data are not 1-d vectors, but
 867  * they are 8-byte aligned. And slb and dlb are multiples of 8.
 868  * The xsize is a multiple of 8.
      *
      * For each of the ysize rows, runs LOAD_EXTRACT_U8_43L_STORE xsize / 8
      * times, then moves both row pointers on by their strides.
      *
      *   src, dst   first row of the source / destination, accessed as mlib_d64
      *   slb, dlb   row strides; added through mlib_u8 pointers, so they are
      *              byte counts
      *   xsize      row size; only xsize / 8 is used, as the number of passes
      *   ysize      number of rows
 869  */
 870 
 871 void mlib_v_ImageChannelExtract_U8_43L_A8D2X8(const mlib_u8 *src,
 872                                               mlib_s32      slb,
 873                                               mlib_u8       *dst,
 874                                               mlib_s32      dlb,
 875                                               mlib_s32      xsize,
 876                                               mlib_s32      ysize)
 877 {
 878   mlib_d64 *sp, *dp;                                  /* 8-byte aligned pointer for pixel */
 879   mlib_d64 *sl, *dl;                                  /* 8-byte aligned pointer for line */
 880   mlib_d64 sd0, sd1, sd2, sd3;                        /* source data */
 881   mlib_d64 dd0, dd1, dd2;                             /* dst data */
 882   mlib_d64 sda;
 883   mlib_s32 i, j;                                      /* indices for x, y */
 884 
 885   /* set GSR.offset for vis_faligndata()  */
 886 /* vis_alignaddr((void *)0, 1); *//* for _old only */
 887 
 888   sp = sl = (mlib_d64 *) src;
 889   dp = dl = (mlib_d64 *) dst;
 890 
 891   /* row loop */
 892   for (j = 0; j < ysize; j++) {
 893     /* 8-byte column loop */
 894 #pragma pipeloop(0)
 895     for (i = 0; i < xsize / 8; i++) {
 896       LOAD_EXTRACT_U8_43L_STORE;
 897     }
 898 
         /* restart from the saved line pointers, advanced by the byte strides */
 899     sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
 900     dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
 901   }
 902 }
 903 
 904 /***************************************************************/
 905 /*
 906  * Either source or destination data are not 8-byte aligned.
 907  * dsize is the number of pixels (3 destination bytes each).
      *
      * Processes the data in groups of 32 source bytes -> 24 destination
      * bytes using LOAD_EXTRACT_U8_43L:
      *   1. a first group whose leading store is masked with vis_edge8 so
      *      that bytes before dst are not touched;
      *   2. a loop of whole groups stored without masks;
      *   3. a last, partial group in which every store is masked.
      * When dst is not 8-byte aligned the packed doubles are realigned with
      * vis_faligndata before being stored.
      *
      *   src    source pixels, 4 bytes each
      *   dst    destination, receives dsize * 3 bytes
      *   dsize  number of pixels
      *
      * NOTE(review): this comment used to require "ssize" to be a multiple
      * of 8; there is no such parameter, and the dsize < 8 branch plus the
      * masked tail handle other sizes - confirm what callers guarantee.
      * NOTE(review): LOAD_EXTRACT_U8_43L always reads sp[1]..sp[4], also for
      * the last partial group - confirm the source may be read that far.
 908  */
 909 
 910 void mlib_v_ImageChannelExtract_U8_43L_D1(const mlib_u8 *src,
 911                                           mlib_u8       *dst,
 912                                           mlib_s32      dsize)
 913 {
 914   mlib_u8 *sa, *da;
 915   mlib_u8 *dend, *dend2;                              /* end points in dst */
 916   mlib_d64 *dp;                                       /* 8-byte aligned start points in dst */
 917   mlib_d64 *sp;                                       /* 8-byte aligned start point in src */
 918   mlib_d64 s0, s1, s2, s3, s4;                        /* 8-byte source row data */
 919   mlib_d64 sd0, sd1, sd2, sd3;                        /* 8-byte source data */
 920   mlib_d64 dd0, dd1, dd2;                             /* dst data */
 921   mlib_d64 dd2old;                                    /* the last datum of the last step */
 922   mlib_d64 sda;
 923   mlib_s32 soff;                                      /* offset of address in src */
 924   mlib_s32 doff;                                      /* offset of address in dst */
 925   mlib_s32 emask;                                     /* edge mask */
 926   mlib_s32 i, n;
 927 
 928   sa = (void *)src;
 929   da = dst;
 930 
 931   /* prepare the source address */
 932   sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
 933   soff = ((mlib_addr) sa & 7);
 934 
 935   /* prepare the destination addresses */
       /* dend  : last byte to be written                                     */
       /* dend2 : highest dp from which a whole 24-byte group still fits      */
       /* doff  : bytes of the first destination double that belong to the    */
       /*         output; 8 means dst is already 8-byte aligned               */
 936   dp = (mlib_d64 *) ((mlib_addr) da & (~7));
 937   dend = da + dsize * 3 - 1;
 938   dend2 = dend - 23;
 939   doff = 8 - ((mlib_addr) da & 7);
 940 
 941   /* generate edge mask for the start point */
 942   emask = vis_edge8(da, dend);
 943 
 944   /* load 32 byte, convert, store 24 bytes */
       /* the macro moves s4 into s0, so s4 is primed with the first double */
 945   s4 = sp[0];                               /* initial value */
 946   LOAD_EXTRACT_U8_43L;
 947 
       /* with at least 8 pixels (24 bytes) only the first store can cross
          the start of dst, so the other two are written unmasked */
 948   if (dsize >= 8) {
 949     if (doff == 8) {
 950       vis_pst_8(dd0, dp++, emask);
 951       *dp++ = dd1;
 952       *dp++ = dd2;
 953     }
 954     else {
           /* the macro changed the GSR offset; set it for the dst shift */
 955       vis_alignaddr((void *)doff, 0);
 956       vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask);
 957       *dp++ = vis_faligndata(dd0, dd1);
 958       *dp++ = vis_faligndata(dd1, dd2);
 959     }
 960   }
 961   else {                                    /* for very small size */
         /* fewer than 24 bytes in total: every store is masked and the
            chain stops as soon as dp passes dend */
 962     if (doff == 8) {
 963       vis_pst_8(dd0, dp++, emask);
 964       if ((mlib_addr) dp <= (mlib_addr) dend) {
 965         emask = vis_edge8(dp, dend);
 966         vis_pst_8(dd1, dp++, emask);
 967         if ((mlib_addr) dp <= (mlib_addr) dend) {
 968           emask = vis_edge8(dp, dend);
 969           vis_pst_8(dd2, dp++, emask);
 970         }
 971       }
 972     }
 973     else {
 974       vis_alignaddr((void *)doff, 0);
 975       vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask);
 976       if ((mlib_addr) dp <= (mlib_addr) dend) {
 977         emask = vis_edge8(dp, dend);
 978         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 979         if ((mlib_addr) dp <= (mlib_addr) dend) {
 980           emask = vis_edge8(dp, dend);
 981           vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask);
               /* up to 21 output bytes shifted by up to 7 can reach into a
                  fourth destination double */
 982           if ((mlib_addr) dp <= (mlib_addr) dend) {
 983             emask = vis_edge8(dp, dend);
 984             vis_pst_8(vis_faligndata(dd2, dd2), dp++, emask);
 985           }
 986         }
 987       }
 988     }
 989   }
 990 
 991   /* no edge handling is needed in the loop */
 992   if (doff == 8) {
 993     if ((mlib_addr) dp <= (mlib_addr) dend2) {
           /* number of whole 24-byte groups left */
 994       n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 24 + 1;
 995 #pragma pipeloop(0)
 996       for (i = 0; i < n; i++) {
 997         LOAD_EXTRACT_U8_43L;
 998         *dp++ = dd0;
 999         *dp++ = dd1;
1000         *dp++ = dd2;
1001       }
1002     }
1003   }
1004   else {
1005     if ((mlib_addr) dp <= (mlib_addr) dend2) {
1006       n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 24 + 1;
1007 #pragma pipeloop(0)
1008       for (i = 0; i < n; i++) {
1009         LOAD_EXTRACT_U8_43L;
             /* restore the dst shift, then join each double with its
                predecessor; dd2old is the dd2 of the previous group */
1010         vis_alignaddr((void *)doff, 0);
1011         *dp++ = vis_faligndata(dd2old, dd0);
1012         *dp++ = vis_faligndata(dd0, dd1);
1013         *dp++ = vis_faligndata(dd1, dd2);
1014       }
1015     }
1016   }
1017 
       /* last, partial group: every store is limited to dend by its mask */
1018   if ((mlib_addr) dp <= (mlib_addr) dend) {
1019     LOAD_EXTRACT_U8_43L;
1020     emask = vis_edge8(dp, dend);
1021     if (doff == 8) {
1022       vis_pst_8(dd0, dp++, emask);
1023       if ((mlib_addr) dp <= (mlib_addr) dend) {
1024         emask = vis_edge8(dp, dend);
1025         vis_pst_8(dd1, dp++, emask);
1026         if ((mlib_addr) dp <= (mlib_addr) dend) {
1027           emask = vis_edge8(dp, dend);
1028           vis_pst_8(dd2, dp++, emask);
1029         }
1030       }
1031     }
1032     else {
1033       vis_alignaddr((void *)doff, 0);
1034       vis_pst_8(vis_faligndata(dd2old, dd0), dp++, emask);
1035       if ((mlib_addr) dp <= (mlib_addr) dend) {
1036         emask = vis_edge8(dp, dend);
1037         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
1038         if ((mlib_addr) dp <= (mlib_addr) dend) {
1039           emask = vis_edge8(dp, dend);
1040           vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask);
1041         }
1042       }
1043     }
1044   }
1045 }
1046 
1047 /***************************************************************/
     /*
      * Row-by-row driver for the U8 "43L" extraction: calls
      * mlib_v_ImageChannelExtract_U8_43L_D1 once for each of the ysize rows.
      *
      *   src, dst   first row of the source / destination image
      *   slb, dlb   distance between consecutive rows, added to mlib_u8
      *              pointers, i.e. in bytes
      *   xsize      passed unchanged to the _D1 routine as its size argument
      *   ysize      number of rows to process
      */
1048 void mlib_v_ImageChannelExtract_U8_43L(const mlib_u8 *src,
1049                                        mlib_s32      slb,
1050                                        mlib_u8       *dst,
1051                                        mlib_s32      dlb,
1052                                        mlib_s32      xsize,
1053                                        mlib_s32      ysize)
1054 {
1055   mlib_u8 *sa, *da;
1056   mlib_u8 *sl, *dl;
1057   mlib_s32 j;
1058 
1059   sa = sl = (void *)src;
1060   da = dl = dst;
1061 
1062   for (j = 0; j < ysize; j++) {
1063     mlib_v_ImageChannelExtract_U8_43L_D1(sa, da, xsize);
         /* advance the line pointers and restart the pixel pointers there */
1064     sa = sl += slb;
1065     da = dl += dlb;
1066   }
1067 }
1068 
1069 /***************************************************************/
     /*
      * EXTRACT_S16_43L - packs the first three 16-bit channels of each
      * four-channel pixel held in sd0..sd3 (one pixel per double, layout
      * "r g b -" as shown in LOAD_EXTRACT_S16_43L_STORE) into dd0, dd1, dd2.
      *
      * Expects sd0..sd3 and dd0..dd2 to be declared where it expands.
      * vis_alignaddr((void *)0, k) selects how many bytes the following
      * vis_faligndata() calls shift in: k = 6 for a whole pixel, 2 or 4 where
      * a pixel is split across two output doubles.  Each ddN has at least
      * 8 bytes shifted into it (6+2, 6+4, 6+6), so its previous contents do
      * not reach the result.
      *
      * Side effect: the GSR offset is left at 6.  The expansion is several
      * statements, not one.
      */
1070 #define EXTRACT_S16_43L              /* shift left */           \
1071   vis_alignaddr((void *)0, 6);                                  \
1072   dd0 = vis_faligndata(dd0, sd0);    /* --r0g0b0 */             \
1073   vis_alignaddr((void *)0, 2);                                  \
1074   dd0 = vis_faligndata(dd0, sd1);    /* r0g0b0r1 */             \
1075                                                                 \
1076   vis_alignaddr((void *)0, 6);                                  \
1077   dd1 = vis_faligndata(dd1, sd1);    /* --r1g1b1 */             \
1078   vis_alignaddr((void *)0, 4);                                  \
1079   dd1 = vis_faligndata(dd1, sd2);    /* g1b1r2g2 */             \
1080                                                                 \
1081   vis_alignaddr((void *)0, 6);                                  \
1082   dd2 = vis_faligndata(dd2, sd2);    /* --r2g2b2 */             \
1083   dd2 = vis_faligndata(dd2, sd3);           /* b2r3g3b3 */
1084 
1085 /***************************************************************/
     /*
      * LOAD_EXTRACT_S16_43L_STORE - one pass of the fully aligned S16 path.
      *
      * Reads four doubles (4 four-channel pixels) through sp, packs them with
      * EXTRACT_S16_43L and writes the three resulting doubles through dp.
      * sp advances by 4 doubles and dp by 3.  Needs sp, dp, sd0..sd3 and
      * dd0..dd2 in the enclosing scope.  All loads and stores are plain
      * mlib_d64 accesses, with no edge masking.
      */
1086 #define LOAD_EXTRACT_S16_43L_STORE                              \
1087                                                                 \
1088   sd0 = *sp++;          /* r0g0b0-- */                          \
1089   sd1 = *sp++;          /* r1g1b1-- */                          \
1090   sd2 = *sp++;          /* r2g2b2-- */                          \
1091   sd3 = *sp++;          /* r3g3b3-- */                          \
1092                                                                 \
1093   EXTRACT_S16_43L;                                              \
1094                                                                 \
1095   *dp++ = dd0;          /* r0g0b0r1 */                          \
1096   *dp++ = dd1;          /* g1b1r2g2 */                          \
1097   *dp++ = dd2;                              /* b2r3g3b3 */
1098 
1099 /***************************************************************/
     /*
      * LOAD_EXTRACT_S16_43L - load step for a source that may not be 8-byte
      * aligned, followed by EXTRACT_S16_43L.
      *
      * Uses soff, sp, s0..s4, sd0..sd3, dd2 and dd2old (plus everything
      * EXTRACT_S16_43L needs) from the enclosing scope:
      *   - sets the GSR offset from soff;
      *   - reuses the previous s4 as s0, so s4 must be primed with sp[0]
      *     before the first expansion, and then reads sp[1]..sp[4];
      *   - realigns neighbouring doubles into sd0..sd3 and advances sp by 4;
      *   - saves the previous dd2 in dd2old, which callers storing to an
      *     unaligned destination combine with the new dd0.
      * EXTRACT_S16_43L changes the GSR offset again, so the offset set from
      * soff does not survive the expansion.
      */
1100 #define LOAD_EXTRACT_S16_43L                                    \
1101                                                                 \
1102   vis_alignaddr((void *)soff, 0);                               \
1103   s0 = s4;                                                      \
1104   s1 = sp[1];                                                   \
1105   s2 = sp[2];                                                   \
1106   s3 = sp[3];                                                   \
1107   s4 = sp[4];                                                   \
1108   sd0 = vis_faligndata(s0, s1);                                 \
1109   sd1 = vis_faligndata(s1, s2);                                 \
1110   sd2 = vis_faligndata(s2, s3);                                 \
1111   sd3 = vis_faligndata(s3, s4);                                 \
1112   sp += 4;                                                      \
1113   dd2old = dd2;                                                 \
1114   EXTRACT_S16_43L
1115 
1116 /***************************************************************/
1117 /*
1118  * Both source and destination image data are 1-d vectors and
1119  * 8-byte aligned, and dsize is a multiple of 4.
      *
      * Runs LOAD_EXTRACT_S16_43L_STORE dsize / 4 times.  Each pass reads four
      * doubles through src and writes three doubles through dst, with no
      * edge masking.
      *
      *   src    source data, accessed as mlib_d64
      *   dst    destination data, accessed as mlib_d64
      *   dsize  size; only dsize / 4 is used, as the number of passes
1120  */
1121 
1122 void mlib_v_ImageChannelExtract_S16_43L_A8D1X4(const mlib_s16 *src,
1123                                                mlib_s16       *dst,
1124                                                mlib_s32       dsize)
1125 {
1126   mlib_d64 *sp, *dp;                                  /* 8-byte aligned pointer for pixel */
1127   mlib_d64 sd0, sd1, sd2, sd3;                        /* source data */
1128   mlib_d64 dd0, dd1, dd2;                             /* dst data */
1129   mlib_s32 i;
1130 
1131   sp = (mlib_d64 *) src;
1132   dp = (mlib_d64 *) dst;
1133 
1134   /* set GSR.offset for vis_faligndata()  */
1135 /* vis_alignaddr((void *)0, 2); *//* only for _old */
       /* the current macro sets the offset itself on every pass */
1136 
1137 #pragma pipeloop(0)
1138   for (i = 0; i < dsize / 4; i++) {
1139     LOAD_EXTRACT_S16_43L_STORE;
1140   }
1141 }
1142 
1143 /***************************************************************/
1144 /*
1145  * Either source or destination image data are not 1-d vectors, but
1146  * they are 8-byte aligned. The xsize is a multiple of 4.
1147  * And slb and dlb are multiples of 8.
      *
      * For each of the ysize rows, runs LOAD_EXTRACT_S16_43L_STORE xsize / 4
      * times, then moves both row pointers on by their strides.
      *
      *   src, dst   first row of the source / destination, accessed as mlib_d64
      *   slb, dlb   row strides; added through mlib_u8 pointers, so they are
      *              byte counts
      *   xsize      row size; only xsize / 4 is used, as the number of passes
      *   ysize      number of rows
1148  */
1149 
1150 void mlib_v_ImageChannelExtract_S16_43L_A8D2X4(const mlib_s16 *src,
1151                                                mlib_s32       slb,
1152                                                mlib_s16       *dst,
1153                                                mlib_s32       dlb,
1154                                                mlib_s32       xsize,
1155                                                mlib_s32       ysize)
1156 {
1157   mlib_d64 *sp, *dp;                                  /* 8-byte aligned pointer for pixel */
1158   mlib_d64 *sl, *dl;                                  /* 8-byte aligned pointer for line */
1159   mlib_d64 sd0, sd1, sd2, sd3;                        /* source data */
1160   mlib_d64 dd0, dd1, dd2;                             /* dst data */
1161   mlib_s32 i, j;                                      /* indices for x, y */
1162 
1163   /* set GSR.offset for vis_faligndata()  */
1164 /* vis_alignaddr((void *)0, 2); *//* only for _old */
1165 
1166   sp = sl = (mlib_d64 *) src;
1167   dp = dl = (mlib_d64 *) dst;
1168 
1169   /* row loop */
1170   for (j = 0; j < ysize; j++) {
1171     /* 4-pixel column loop */
1172 #pragma pipeloop(0)
1173     for (i = 0; i < xsize / 4; i++) {
1174       LOAD_EXTRACT_S16_43L_STORE;
1175     }
1176 
         /* restart from the saved line pointers, advanced by the byte strides */
1177     sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1178     dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1179   }
1180 }
1181 
1182 /***************************************************************/
1183 /*
1184  * Either source or destination data are not 8-byte aligned.
1185  * dsize is in pixels (3 destination mlib_s16 elements each).
      *
      * Processes the data in groups of 32 source bytes -> 24 destination
      * bytes using LOAD_EXTRACT_S16_43L:
      *   1. a first group whose leading store is masked with vis_edge16 so
      *      that elements before dst are not touched;
      *   2. a loop of whole groups stored without masks;
      *   3. a last, partial group in which every store is masked.
      * When dst is not 8-byte aligned the packed doubles are realigned with
      * vis_faligndata before being stored.
      *
      *   src    source pixels, 4 mlib_s16 each
      *   dst    destination, receives dsize * 3 mlib_s16 elements
      *   dsize  number of pixels
      *
      * NOTE(review): LOAD_EXTRACT_S16_43L always reads sp[1]..sp[4], also for
      * the last partial group - confirm the source may be read that far.
1186  */
1187 
1188 void mlib_v_ImageChannelExtract_S16_43L_D1(const mlib_s16 *src,
1189                                            mlib_s16       *dst,
1190                                            mlib_s32       dsize)
1191 {
1192   mlib_s16 *sa, *da;                                  /* pointer for pixel */
1193   mlib_s16 *dend, *dend2;                             /* end points in dst */
1194   mlib_d64 *dp;                                       /* 8-byte aligned start points in dst */
1195   mlib_d64 *sp;                                       /* 8-byte aligned start point in src */
1196   mlib_d64 s0, s1, s2, s3, s4;                        /* 8-byte source row data */
1197   mlib_d64 sd0, sd1, sd2, sd3;                        /* 8-byte source data */
1198   mlib_d64 dd0, dd1, dd2;                             /* dst data */
1199   mlib_d64 dd2old;                                    /* the last datum of the last step */
1200   mlib_s32 soff;                                      /* offset of address in src */
1201   mlib_s32 doff;                                      /* offset of address in dst */
1202   mlib_s32 emask;                                     /* edge mask */
1203   mlib_s32 i, n;
1204 
1205   sa = (void *)src;
1206   da = dst;
1207 
1208   /* prepare the source address */
1209   sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1210   soff = ((mlib_addr) sa & 7);
1211 
1212   /* prepare the destination addresses */
       /* dend  : last mlib_s16 element to be written                         */
       /* dend2 : 11 elements (22 bytes) before dend; a whole 24-byte group   */
       /*         still fits when dp <= dend2                                 */
       /* doff  : bytes of the first destination double that belong to the    */
       /*         output; 8 means dst is already 8-byte aligned               */
1213   dp = (mlib_d64 *) ((mlib_addr) da & (~7));
1214   dend = da + dsize * 3 - 1;
1215   dend2 = dend - 11;
1216   doff = 8 - ((mlib_addr) da & 7);
1217 
1218   /* generate edge mask for the start point */
1219   emask = vis_edge16(da, dend);
1220 
1221   /* load 32 byte, convert, store 24 bytes */
       /* the macro moves s4 into s0, so s4 is primed with the first double */
1222   s4 = sp[0];                               /* initial value */
1223   LOAD_EXTRACT_S16_43L;
1224 
       /* with at least 4 pixels (24 bytes) only the first store can cross
          the start of dst, so the other two are written unmasked */
1225   if (dsize >= 4) {
1226     if (doff == 8) {
1227       vis_pst_16(dd0, dp++, emask);
1228       *dp++ = dd1;
1229       *dp++ = dd2;
1230     }
1231     else {
           /* the macro changed the GSR offset; set it for the dst shift */
1232       vis_alignaddr((void *)doff, 0);
1233       vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask);
1234       *dp++ = vis_faligndata(dd0, dd1);
1235       *dp++ = vis_faligndata(dd1, dd2);
1236     }
1237   }
1238   else {                                    /* for very small size */
         /* fewer than 24 bytes in total: every store is masked and the
            chain stops as soon as dp passes dend */
1239     if (doff == 8) {
1240       vis_pst_16(dd0, dp++, emask);
1241       if ((mlib_addr) dp <= (mlib_addr) dend) {
1242         emask = vis_edge16(dp, dend);
1243         vis_pst_16(dd1, dp++, emask);
1244         if ((mlib_addr) dp <= (mlib_addr) dend) {
1245           emask = vis_edge16(dp, dend);
1246           vis_pst_16(dd2, dp++, emask);
1247         }
1248       }
1249     }
1250     else {
1251       vis_alignaddr((void *)doff, 0);
1252       vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask);
1253       if ((mlib_addr) dp <= (mlib_addr) dend) {
1254         emask = vis_edge16(dp, dend);
1255         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
1256         if ((mlib_addr) dp <= (mlib_addr) dend) {
1257           emask = vis_edge16(dp, dend);
1258           vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask);
1259         }
1260       }
1261     }
1262   }
1263 
1264   /* no edge handling is needed in the loop */
1265   if (doff == 8) {
1266     if ((mlib_addr) dp <= (mlib_addr) dend2) {
           /* number of whole 24-byte groups left */
1267       n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 24 + 1;
1268 #pragma pipeloop(0)
1269       for (i = 0; i < n; i++) {
1270         LOAD_EXTRACT_S16_43L;
1271         *dp++ = dd0;
1272         *dp++ = dd1;
1273         *dp++ = dd2;
1274       }
1275     }
1276   }
1277   else {
1278     if ((mlib_addr) dp <= (mlib_addr) dend2) {
1279       n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 24 + 1;
1280 #pragma pipeloop(0)
1281       for (i = 0; i < n; i++) {
1282         LOAD_EXTRACT_S16_43L;
             /* restore the dst shift, then join each double with its
                predecessor; dd2old is the dd2 of the previous group */
1283         vis_alignaddr((void *)doff, 0);
1284         *dp++ = vis_faligndata(dd2old, dd0);
1285         *dp++ = vis_faligndata(dd0, dd1);
1286         *dp++ = vis_faligndata(dd1, dd2);
1287       }
1288     }
1289   }
1290 
       /* last, partial group: every store is limited to dend by its mask */
1291   if ((mlib_addr) dp <= (mlib_addr) dend) {
1292     LOAD_EXTRACT_S16_43L;
1293     emask = vis_edge16(dp, dend);
1294     if (doff == 8) {
1295       vis_pst_16(dd0, dp++, emask);
1296       if ((mlib_addr) dp <= (mlib_addr) dend) {
1297         emask = vis_edge16(dp, dend);
1298         vis_pst_16(dd1, dp++, emask);
1299         if ((mlib_addr) dp <= (mlib_addr) dend) {
1300           emask = vis_edge16(dp, dend);
1301           vis_pst_16(dd2, dp++, emask);
1302         }
1303       }
1304     }
1305     else {
1306       vis_alignaddr((void *)doff, 0);
1307       vis_pst_16(vis_faligndata(dd2old, dd0), dp++, emask);
1308       if ((mlib_addr) dp <= (mlib_addr) dend) {
1309         emask = vis_edge16(dp, dend);
1310         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
1311         if ((mlib_addr) dp <= (mlib_addr) dend) {
1312           emask = vis_edge16(dp, dend);
1313           vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask);
1314         }
1315       }
1316     }
1317   }
1318 }
1319 
1320 /***************************************************************/
     /*
      * Row-by-row driver for the S16 "43L" extraction: calls
      * mlib_v_ImageChannelExtract_S16_43L_D1 once for each of the ysize rows.
      *
      *   src, dst   first row of the source / destination image
      *   slb, dlb   distance between consecutive rows; both are added through
      *              mlib_u8 pointers below, so they are byte counts
      *   xsize      passed unchanged to the _D1 routine as its size argument
      *   ysize      number of rows to process
      */
1321 void mlib_v_ImageChannelExtract_S16_43L(const mlib_s16 *src,
1322                                         mlib_s32       slb,
1323                                         mlib_s16       *dst,
1324                                         mlib_s32       dlb,
1325                                         mlib_s32       xsize,
1326                                         mlib_s32       ysize)
1327 {
1328   mlib_s16 *sa, *da;
1329   mlib_s16 *sl, *dl;
1330   mlib_s32 j;
1331 
1332   sa = sl = (void *)src;
1333   da = dl = dst;
1334 
1335   for (j = 0; j < ysize; j++) {
1336     mlib_v_ImageChannelExtract_S16_43L_D1(sa, da, xsize);
         /* step both row pointers by their byte strides */
1337     sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
1338     da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1339   }
1340 }
1341 
1342 /***************************************************************/