1 /*
   2  * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 
  28 /*
  29  * FILENAME: mlib_ImageChannelExtract_1.c
  30  *
  31  * FUNCTIONS
  32  *      mlib_v_ImageChannelExtract_U8_21_A8D1X8
  33  *      mlib_v_ImageChannelExtract_U8_21_A8D2X8
  34  *      mlib_v_ImageChannelExtract_U8_21_D1
  35  *      mlib_v_ImageChannelExtract_U8_21
  36  *      mlib_v_ImageChannelExtract_U8_31_A8D1X8
  37  *      mlib_v_ImageChannelExtract_U8_31_A8D2X8
  38  *      mlib_v_ImageChannelExtract_U8_31_D1
  39  *      mlib_v_ImageChannelExtract_U8_31
  40  *      mlib_v_ImageChannelExtract_U8_41_A8D1X8
  41  *      mlib_v_ImageChannelExtract_U8_41_A8D2X8
  42  *      mlib_v_ImageChannelExtract_U8_41_D1
  43  *      mlib_v_ImageChannelExtract_U8_41
  44  *      mlib_v_ImageChannelExtract_S16_21_A8D1X4
  45  *      mlib_v_ImageChannelExtract_S16_21_A8D2X4
  46  *      mlib_v_ImageChannelExtract_S16_21_D1
  47  *      mlib_v_ImageChannelExtract_S16_21
  48  *      mlib_v_ImageChannelExtract_S16_31_A8D1X4
  49  *      mlib_v_ImageChannelExtract_S16_31_A8D2X4
  50  *      mlib_v_ImageChannelExtract_S16_31_D1
  51  *      mlib_v_ImageChannelExtract_S16_31
  52  *      mlib_v_ImageChannelExtract_S16_41_A8D1X4
  53  *      mlib_v_ImageChannelExtract_S16_41_A8D2X4
  54  *      mlib_v_ImageChannelExtract_S16_41_D1
  55  *      mlib_v_ImageChannelExtract_S16_41
  56  *
  57  * ARGUMENT
  58  *      src    pointer to source image data
  59  *      dst    pointer to destination image data
  60  *      slb    source image line stride in bytes
  61  *      dlb   destination image line stride in bytes
  62  *      dsize  image data size in pixels
  63  *      xsize  image width in pixels
  64  *      ysize  image height in lines
  65  *      cmask channel mask
  66  *
  67  * DESCRIPTION
  68  *      Extract the one selected channel of the source image into the
  69  *      1-channel destination image.
  70  *
  71  * NOTE
  72  *      These functions are separated from mlib_ImageChannelExtract.c
  73  *      for loop unrolling and structure clarity.
  74  */
  75 
  76 #include "vis_proto.h"
  77 #include "mlib_image.h"
  78 #include "mlib_v_ImageChannelExtract.h"
  79 
  80 /***************************************************************/
  81 #define CHANNELEXTRACT_U8_21L(sd0, sd1, dd)                     \
  82   sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1));        \
  83   sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1));        \
  84   sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb));        \
  85   sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb));        \
  86   dd  = vis_fpmerge(vis_read_hi(sdc), vis_read_hi(sdd))
  87 
  88 /***************************************************************/
  89 #define CHANNELEXTRACT_U8_21R(sd0, sd1, dd)                     \
  90   sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1));        \
  91   sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1));        \
  92   sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb));        \
  93   sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb));        \
  94   dd  = vis_fpmerge(vis_read_lo(sdc), vis_read_lo(sdd))
  95 
  96 /***************************************************************/
  97 /* extract one channel from a 2-channel image.
  98  * both source and destination image data are 8-byte aligned.
 * dsize is a multiple of 8.
 100  */
 101 
 102 void mlib_v_ImageChannelExtract_U8_21_A8D1X8(const mlib_u8 *src,
 103                                              mlib_u8       *dst,
 104                                              mlib_s32      dsize,
 105                                              mlib_s32      cmask)
 106 {
 107   mlib_d64 *sp, *dp;
 108   mlib_d64 sd0, sd1;
 109   mlib_d64 sda, sdb, sdc, sdd;
 110   mlib_d64 dd;
 111   mlib_s32 i;
 112 
 113   sp = (mlib_d64 *) src;
 114   dp = (mlib_d64 *) dst;
 115 
 116   if (cmask == 2) {
 117 #pragma pipeloop(0)
 118     for (i = 0; i < dsize / 8; i++) {
 119       sd0 = *sp++;
 120       sd1 = *sp++;
 121       CHANNELEXTRACT_U8_21L(sd0, sd1, dd);
 122       *dp++ = dd;
 123     }
 124   }
 125   else {
 126 #pragma pipeloop(0)
 127     for (i = 0; i < dsize / 8; i++) {
 128       sd0 = *sp++;
 129       sd1 = *sp++;
 130       CHANNELEXTRACT_U8_21R(sd0, sd1, dd);
 131       *dp++ = dd;
 132     }
 133   }
 134 }
 135 
 136 /***************************************************************/
 137 /* extract one channel from a 2-channel image.
 138  * both source and destination image data are 8-byte aligned.
 139  * xsize is multiple of 8.
 140  */
 141 
 142 void mlib_v_ImageChannelExtract_U8_21_A8D2X8(const mlib_u8 *src,
 143                                              mlib_s32      slb,
 144                                              mlib_u8       *dst,
 145                                              mlib_s32      dlb,
 146                                              mlib_s32      xsize,
 147                                              mlib_s32      ysize,
 148                                              mlib_s32      cmask)
 149 {
 150   mlib_d64 *sp, *dp;
 151   mlib_d64 *sl, *dl;
 152   mlib_d64 sd0, sd1;
 153   mlib_d64 sda, sdb, sdc, sdd;
 154   mlib_d64 dd;
 155   mlib_s32 i, j;
 156 
 157   sp = sl = (mlib_d64 *) src;
 158   dp = dl = (mlib_d64 *) dst;
 159 
 160   if (cmask == 2) {
 161     for (j = 0; j < ysize; j++) {
 162 #pragma pipeloop(0)
 163       for (i = 0; i < xsize / 8; i++) {
 164         sd0 = *sp++;
 165         sd1 = *sp++;
 166         CHANNELEXTRACT_U8_21L(sd0, sd1, dd);
 167         *dp++ = dd;
 168       }
 169 
 170       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
 171       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
 172     }
 173   }
 174   else {
 175     for (j = 0; j < ysize; j++) {
 176 #pragma pipeloop(0)
 177       for (i = 0; i < xsize / 8; i++) {
 178         sd0 = *sp++;
 179         sd1 = *sp++;
 180         CHANNELEXTRACT_U8_21R(sd0, sd1, dd);
 181         *dp++ = dd;
 182       }
 183 
 184       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
 185       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
 186     }
 187   }
 188 }
 189 
 190 /***************************************************************/
 191 /* extract one channel from a 2-channel image.
 192  */
 193 
 194 void mlib_v_ImageChannelExtract_U8_21_D1(const mlib_u8 *src,
 195                                          mlib_u8       *dst,
 196                                          mlib_s32      dsize,
 197                                          mlib_s32      cmask)
 198 {
 199   mlib_u8 *sa, *da;
 200   mlib_u8 *dend, *dend2;                              /* end points in dst */
 201   mlib_d64 *dp;                                       /* 8-byte aligned start points in dst */
 202   mlib_d64 *sp;                                       /* 8-byte aligned start point in src */
 203   mlib_d64 sd0, sd1, sd2, sd3;                        /* 8-byte source data */
 204   mlib_d64 sda, sdb, sdc, sdd;
 205   mlib_d64 dd0, dd1;
 206   mlib_s32 soff;                                      /* offset of address in src */
 207   mlib_s32 doff;                                      /* offset of address in dst */
 208   mlib_s32 off;                                       /* offset of src over dst */
 209   mlib_s32 emask;                                     /* edge mask */
 210   mlib_s32 i, n;
 211 
 212   sa = (void *)src;
 213   da = dst;
 214 
 215   /* prepare the source address */
 216   sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
 217   soff = ((mlib_addr) sa & 7);
 218 
 219   /* prepare the destination addresses */
 220   dp = (mlib_d64 *) ((mlib_addr) da & (~7));
 221   doff = ((mlib_addr) da & 7);
 222   dend = da + dsize - 1;
 223   dend2 = dend - 7;
 224 
 225   /* calculate the src's offset over dst */
 226   if (cmask == 2) {
 227     off = soff / 2 - doff;
 228   }
 229   else {
 230     off = (soff + 1) / 2 - doff;
 231   }
 232 
 233   if (((cmask == 2) && (soff % 2 == 0)) || ((cmask == 1) && (soff % 2 != 0))) { /* extract even bytes */
 234 
 235     if (off == 0) {                         /* src and dst have same alignment */
 236 
 237       /* generate edge mask for the start point */
 238       emask = vis_edge8(da, dend);
 239 
 240       /* load 16 bytes */
 241       sd0 = *sp++;
 242       sd1 = *sp++;
 243 
 244       /* extract, including some garbage at the start point */
 245       CHANNELEXTRACT_U8_21L(sd0, sd1, dd0);
 246 
 247       /* store 8 bytes result */
 248       vis_pst_8(dd0, dp++, emask);
 249 
 250       if ((mlib_addr) dp <= (mlib_addr) dend2) {
 251         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
 252 
 253         /* 8-pixel column loop, emask not needed */
 254 #pragma pipeloop(0)
 255         for (i = 0; i < n; i++) {
 256           sd0 = *sp++;
 257           sd1 = *sp++;
 258           CHANNELEXTRACT_U8_21L(sd0, sd1, dd0);
 259           *dp++ = dd0;
 260         }
 261       }
 262 
 263       /* end point handling */
 264       if ((mlib_addr) dp <= (mlib_addr) dend) {
 265         emask = vis_edge8(dp, dend);
 266         sd0 = *sp++;
 267         sd1 = *sp++;
 268         CHANNELEXTRACT_U8_21L(sd0, sd1, dd0);
 269         vis_pst_8(dd0, dp++, emask);
 270       }
 271     }
 272     else {
 273       vis_alignaddr((void *)0, off);
 274 
 275       /* generate edge mask for the start point */
 276       emask = vis_edge8(da, dend);
 277 
 278       if (off < 0) {
 279         /* load 16 bytes */
 280         sd2 = *sp++;
 281         sd3 = *sp++;
 282 
 283         /* extract and store 8 bytes */
 284         CHANNELEXTRACT_U8_21L(sd2, sd3, dd1);
 285         vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
 286       }
 287       else {
 288         /* load 32 bytes */
 289         sd0 = *sp++;
 290         sd1 = *sp++;
 291         sd2 = *sp++;
 292         sd3 = *sp++;
 293 
 294         /* extract and store 8 bytes */
 295         CHANNELEXTRACT_U8_21L(sd0, sd1, dd0);
 296         CHANNELEXTRACT_U8_21L(sd2, sd3, dd1);
 297         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 298       }
 299 
 300       if ((mlib_addr) dp <= (mlib_addr) dend2) {
 301         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
 302 
 303         /* 8-pixel column loop, emask not needed */
 304 #pragma pipeloop(0)
 305         for (i = 0; i < n; i++) {
 306           dd0 = dd1;
 307           sd2 = *sp++;
 308           sd3 = *sp++;
 309           CHANNELEXTRACT_U8_21L(sd2, sd3, dd1);
 310           *dp++ = vis_faligndata(dd0, dd1);
 311         }
 312       }
 313 
 314       /* end point handling */
 315       if ((mlib_addr) dp <= (mlib_addr) dend) {
 316         emask = vis_edge8(dp, dend);
 317         dd0 = dd1;
 318         sd2 = *sp++;
 319         sd3 = *sp++;
 320         CHANNELEXTRACT_U8_21L(sd2, sd3, dd1);
 321         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 322       }
 323     }
 324   }
 325   else {                                    /* extract odd bytes */
 326 
 327     if (off == 0) {                         /* src and dst have same alignment */
 328 
 329       /* generate edge mask for the start point */
 330       emask = vis_edge8(da, dend);
 331 
 332       /* load 16 bytes, don't care the garbage at the start point */
 333       sd0 = *sp++;
 334       sd1 = *sp++;
 335 
 336       /* extract and store 8 bytes */
 337       CHANNELEXTRACT_U8_21R(sd0, sd1, dd0);
 338       vis_pst_8(dd0, dp++, emask);
 339 
 340       if ((mlib_addr) dp <= (mlib_addr) dend2) {
 341         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
 342 
 343         /* 8-pixel column loop, emask not needed */
 344 #pragma pipeloop(0)
 345         for (i = 0; i < n; i++) {
 346           sd0 = *sp++;
 347           sd1 = *sp++;
 348           CHANNELEXTRACT_U8_21R(sd0, sd1, dd0);
 349           *dp++ = dd0;
 350         }
 351       }
 352 
 353       /* end point handling */
 354       if ((mlib_addr) dp <= (mlib_addr) dend) {
 355         emask = vis_edge8(dp, dend);
 356         sd0 = *sp++;
 357         sd1 = *sp++;
 358         CHANNELEXTRACT_U8_21R(sd0, sd1, dd0);
 359         vis_pst_8(dd0, dp++, emask);
 360       }
 361     }
 362     else {
 363       vis_alignaddr((void *)0, off);
 364 
 365       /* generate edge mask for the start point */
 366       emask = vis_edge8(da, dend);
 367 
 368       if (off < 0) {
 369         /* load 16 bytes */
 370         sd2 = *sp++;
 371         sd3 = *sp++;
 372 
 373         /* extract and store 8 bytes */
 374         CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
 375         vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
 376       }
 377       else {
 378         /* load 32 bytes */
 379         sd0 = *sp++;
 380         sd1 = *sp++;
 381         sd2 = *sp++;
 382         sd3 = *sp++;
 383 
 384         /* extract and store 8 bytes */
 385         CHANNELEXTRACT_U8_21R(sd0, sd1, dd0);
 386         CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
 387         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 388       }
 389 
 390       if ((mlib_addr) dp <= (mlib_addr) dend2) {
 391         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
 392 
 393         /* 8-pixel column loop, emask not needed */
 394 #pragma pipeloop(0)
 395         for (i = 0; i < n; i++) {
 396           dd0 = dd1;
 397           sd2 = *sp++;
 398           sd3 = *sp++;
 399           CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
 400           *dp++ = vis_faligndata(dd0, dd1);
 401         }
 402       }
 403 
 404       /* end point handling */
 405       if ((mlib_addr) dp <= (mlib_addr) dend) {
 406         emask = vis_edge8(dp, dend);
 407         dd0 = dd1;
 408         sd2 = *sp++;
 409         sd3 = *sp++;
 410         CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
 411         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 412       }
 413     }
 414   }
 415 }
 416 
 417 /***************************************************************/
 418 /* extract one channel from a 2-channel image.
 419  */
 420 
 421 void mlib_v_ImageChannelExtract_U8_21(const mlib_u8 *src,
 422                                       mlib_s32      slb,
 423                                       mlib_u8       *dst,
 424                                       mlib_s32      dlb,
 425                                       mlib_s32      xsize,
 426                                       mlib_s32      ysize,
 427                                       mlib_s32      cmask)
 428 {
 429   mlib_u8 *sa, *da;
 430   mlib_u8 *sl, *dl;
 431   mlib_s32 j;
 432 
 433   sa = sl = (void *)src;
 434   da = dl = dst;
 435 
 436   for (j = 0; j < ysize; j++) {
 437     mlib_v_ImageChannelExtract_U8_21_D1(sa, da, xsize, cmask);
 438     sa = sl += slb;
 439     da = dl += dlb;
 440   }
 441 }
 442 
 443 /***************************************************************/
 444 #define CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd)                \
 445   sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));        \
 446   sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));        \
 447   sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));        \
 448   sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb));        \
 449   sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc));        \
 450   dd  = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde))
 451 
 452 /***************************************************************/
 453 #define CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd)                \
 454   sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));        \
 455   sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));        \
 456   sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));        \
 457   sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb));        \
 458   sde = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc));        \
 459   dd  = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sde))
 460 
 461 /***************************************************************/
 462 #define CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd)                \
 463   sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));        \
 464   sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));        \
 465   sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));        \
 466   sdd = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc));        \
 467   sde = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc));        \
 468   dd  = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde))
 469 
 470 /***************************************************************/
 471 void mlib_v_ImageChannelExtract_U8_31_A8D1X8(const mlib_u8 *src,
 472                                              mlib_u8       *dst,
 473                                              mlib_s32      dsize,
 474                                              mlib_s32      cmask)
 475 {
 476   mlib_d64 *sp, *dp;
 477   mlib_d64 sd0, sd1, sd2;
 478   mlib_d64 sda, sdb, sdc, sdd, sde;
 479   mlib_d64 dd;
 480   mlib_s32 i;
 481 
 482   sp = (mlib_d64 *) src;
 483   dp = (mlib_d64 *) dst;
 484 
 485   if (cmask == 4) {
 486 #pragma pipeloop(0)
 487     for (i = 0; i < dsize / 8; i++) {
 488       sd0 = *sp++;
 489       sd1 = *sp++;
 490       sd2 = *sp++;
 491       CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd);
 492       *dp++ = dd;
 493     }
 494   }
 495   else if (cmask == 2) {
 496 #pragma pipeloop(0)
 497     for (i = 0; i < dsize / 8; i++) {
 498       sd0 = *sp++;
 499       sd1 = *sp++;
 500       sd2 = *sp++;
 501       CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd);
 502       *dp++ = dd;
 503     }
 504   }
 505   else {
 506 #pragma pipeloop(0)
 507     for (i = 0; i < dsize / 8; i++) {
 508       sd0 = *sp++;
 509       sd1 = *sp++;
 510       sd2 = *sp++;
 511       CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd);
 512       *dp++ = dd;
 513     }
 514   }
 515 }
 516 
 517 /***************************************************************/
 518 void mlib_v_ImageChannelExtract_U8_31_A8D2X8(const mlib_u8 *src,
 519                                              mlib_s32      slb,
 520                                              mlib_u8       *dst,
 521                                              mlib_s32      dlb,
 522                                              mlib_s32      xsize,
 523                                              mlib_s32      ysize,
 524                                              mlib_s32      cmask)
 525 {
 526   mlib_d64 *sp, *dp;
 527   mlib_d64 *sl, *dl;
 528   mlib_d64 sd0, sd1, sd2;
 529   mlib_d64 sda, sdb, sdc, sdd, sde;
 530   mlib_d64 dd;
 531   mlib_s32 i, j;
 532 
 533   sp = sl = (mlib_d64 *) src;
 534   dp = dl = (mlib_d64 *) dst;
 535 
 536   if (cmask == 4) {
 537     for (j = 0; j < ysize; j++) {
 538 #pragma pipeloop(0)
 539       for (i = 0; i < xsize / 8; i++) {
 540         sd0 = *sp++;
 541         sd1 = *sp++;
 542         sd2 = *sp++;
 543         CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd);
 544         *dp++ = dd;
 545       }
 546 
 547       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
 548       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
 549     }
 550   }
 551   else if (cmask == 2) {
 552     for (j = 0; j < ysize; j++) {
 553 #pragma pipeloop(0)
 554       for (i = 0; i < xsize / 8; i++) {
 555         sd0 = *sp++;
 556         sd1 = *sp++;
 557         sd2 = *sp++;
 558         CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd);
 559         *dp++ = dd;
 560       }
 561 
 562       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
 563       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
 564     }
 565   }
 566   else {
 567     for (j = 0; j < ysize; j++) {
 568 #pragma pipeloop(0)
 569       for (i = 0; i < xsize / 8; i++) {
 570         sd0 = *sp++;
 571         sd1 = *sp++;
 572         sd2 = *sp++;
 573         CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd);
 574         *dp++ = dd;
 575       }
 576 
 577       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
 578       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
 579     }
 580   }
 581 }
 582 
 583 /***************************************************************/
 584 void mlib_v_ImageChannelExtract_U8_31_D1(const mlib_u8 *src,
 585                                          mlib_u8       *dst,
 586                                          mlib_s32      dsize,
 587                                          mlib_s32      cmask)
 588 {
 589   mlib_u8 *sa, *da;
 590   mlib_u8 *dend, *dend2;                              /* end points in dst */
 591   mlib_d64 *dp;                                       /* 8-byte aligned start points in dst */
 592   mlib_d64 *sp;                                       /* 8-byte aligned start point in src */
 593   mlib_d64 sd0, sd1, sd2;                             /* 8-byte source data */
 594   mlib_d64 sd3, sd4, sd5;
 595   mlib_d64 sda, sdb, sdc, sdd, sde;
 596   mlib_d64 dd0, dd1;
 597   mlib_s32 soff;                                      /* offset of address in src */
 598   mlib_s32 doff;                                      /* offset of address in dst */
 599   mlib_s32 off;                                       /* offset of src over dst */
 600   mlib_s32 emask;                                     /* edge mask */
 601   mlib_s32 i, n;
 602 
 603   sa = (void *)src;
 604   da = dst;
 605 
 606   /* prepare the source address */
 607   sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
 608   soff = ((mlib_addr) sa & 7);
 609 
 610   /* prepare the destination addresses */
 611   dp = (mlib_d64 *) ((mlib_addr) da & (~7));
 612   doff = ((mlib_addr) da & 7);
 613   dend = da + dsize - 1;
 614   dend2 = dend - 7;
 615 
 616   /* calculate the src's offset over dst */
 617   if (cmask == 4) {
 618     off = soff / 3 - doff;
 619   }
 620   else if (cmask == 2) {
 621     off = (soff + 1) / 3 - doff;
 622   }
 623   else {
 624     off = (soff + 2) / 3 - doff;
 625   }
 626 
 627   if (((cmask == 4) && (soff % 3 == 0)) ||
 628       ((cmask == 2) && (soff % 3 == 2)) ||
 629       ((cmask == 1) && (soff % 3 == 1))) { /* extract left channel */
 630 
 631     if (off == 0) {                         /* src and dst have same alignment */
 632 
 633       /* generate edge mask for the start point */
 634       emask = vis_edge8(da, dend);
 635 
 636       /* load 16 bytes */
 637       sd0 = *sp++;
 638       sd1 = *sp++;
 639       sd2 = *sp++;
 640 
 641       /* extract, including some garbage at the start point */
 642       CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0);
 643 
 644       /* store 8 bytes result */
 645       vis_pst_8(dd0, dp++, emask);
 646 
 647       if ((mlib_addr) dp <= (mlib_addr) dend2) {
 648         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
 649 
 650         /* 8-pixel column loop, emask not needed */
 651 #pragma pipeloop(0)
 652         for (i = 0; i < n; i++) {
 653           sd0 = *sp++;
 654           sd1 = *sp++;
 655           sd2 = *sp++;
 656           CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0);
 657           *dp++ = dd0;
 658         }
 659       }
 660 
 661       /* end point handling */
 662       if ((mlib_addr) dp <= (mlib_addr) dend) {
 663         emask = vis_edge8(dp, dend);
 664         sd0 = *sp++;
 665         sd1 = *sp++;
 666         sd2 = *sp++;
 667         CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0);
 668         vis_pst_8(dd0, dp++, emask);
 669       }
 670     }
 671     else {
 672       vis_alignaddr((void *)0, off);
 673 
 674       /* generate edge mask for the start point */
 675       emask = vis_edge8(da, dend);
 676 
 677       if (off < 0) {
 678         /* load 24 bytes */
 679         sd3 = *sp++;
 680         sd4 = *sp++;
 681         sd5 = *sp++;
 682 
 683         /* extract and store 8 bytes */
 684         CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1);
 685         vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
 686       }
 687       else {
 688         /* load 48 bytes */
 689         sd0 = *sp++;
 690         sd1 = *sp++;
 691         sd2 = *sp++;
 692         sd3 = *sp++;
 693         sd4 = *sp++;
 694         sd5 = *sp++;
 695 
 696         /* extract and store 8 bytes */
 697         CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0);
 698         CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1);
 699         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 700       }
 701 
 702       if ((mlib_addr) dp <= (mlib_addr) dend2) {
 703         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
 704 
 705         /* 8-pixel column loop, emask not needed */
 706 #pragma pipeloop(0)
 707         for (i = 0; i < n; i++) {
 708           dd0 = dd1;
 709           sd3 = *sp++;
 710           sd4 = *sp++;
 711           sd5 = *sp++;
 712           CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1);
 713           *dp++ = vis_faligndata(dd0, dd1);
 714         }
 715       }
 716 
 717       /* end point handling */
 718       if ((mlib_addr) dp <= (mlib_addr) dend) {
 719         emask = vis_edge8(dp, dend);
 720         dd0 = dd1;
 721         sd3 = *sp++;
 722         sd4 = *sp++;
 723         sd5 = *sp++;
 724         CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1);
 725         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 726       }
 727     }
 728   }
 729   else if (((cmask == 4) && (soff % 3 == 1)) ||
 730            ((cmask == 2) && (soff % 3 == 0)) ||
 731            ((cmask == 1) && (soff % 3 == 2))) {
 732     /* extract middle channel */
 733 
 734     if (off == 0) {                         /* src and dst have same alignment */
 735 
 736       /* generate edge mask for the start point */
 737       emask = vis_edge8(da, dend);
 738 
 739       /* load 16 bytes */
 740       sd0 = *sp++;
 741       sd1 = *sp++;
 742       sd2 = *sp++;
 743 
 744       /* extract, including some garbage at the start point */
 745       CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0);
 746 
 747       /* store 8 bytes result */
 748       vis_pst_8(dd0, dp++, emask);
 749 
 750       if ((mlib_addr) dp <= (mlib_addr) dend2) {
 751         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
 752 
 753         /* 8-pixel column loop, emask not needed */
 754 #pragma pipeloop(0)
 755         for (i = 0; i < n; i++) {
 756           sd0 = *sp++;
 757           sd1 = *sp++;
 758           sd2 = *sp++;
 759           CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0);
 760           *dp++ = dd0;
 761         }
 762       }
 763 
 764       /* end point handling */
 765       if ((mlib_addr) dp <= (mlib_addr) dend) {
 766         emask = vis_edge8(dp, dend);
 767         sd0 = *sp++;
 768         sd1 = *sp++;
 769         sd2 = *sp++;
 770         CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0);
 771         vis_pst_8(dd0, dp++, emask);
 772       }
 773     }
 774     else {
 775       vis_alignaddr((void *)0, off);
 776 
 777       /* generate edge mask for the start point */
 778       emask = vis_edge8(da, dend);
 779 
 780       if (off < 0) {
 781         /* load 24 bytes */
 782         sd3 = *sp++;
 783         sd4 = *sp++;
 784         sd5 = *sp++;
 785 
 786         /* extract and store 8 bytes */
 787         CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1);
 788         vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
 789       }
 790       else {
 791         /* load 48 bytes */
 792         sd0 = *sp++;
 793         sd1 = *sp++;
 794         sd2 = *sp++;
 795         sd3 = *sp++;
 796         sd4 = *sp++;
 797         sd5 = *sp++;
 798 
 799         /* extract and store 8 bytes */
 800         CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0);
 801         CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1);
 802         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 803       }
 804 
 805       if ((mlib_addr) dp <= (mlib_addr) dend2) {
 806         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
 807 
 808         /* 8-pixel column loop, emask not needed */
 809 #pragma pipeloop(0)
 810         for (i = 0; i < n; i++) {
 811           dd0 = dd1;
 812           sd3 = *sp++;
 813           sd4 = *sp++;
 814           sd5 = *sp++;
 815           CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1);
 816           *dp++ = vis_faligndata(dd0, dd1);
 817         }
 818       }
 819 
 820       /* end point handling */
 821       if ((mlib_addr) dp <= (mlib_addr) dend) {
 822         emask = vis_edge8(dp, dend);
 823         dd0 = dd1;
 824         sd3 = *sp++;
 825         sd4 = *sp++;
 826         sd5 = *sp++;
 827         CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1);
 828         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 829       }
 830     }
 831   }
 832   else {                                    /* extract right channel */
 833 
 834     if (off == 0) {                         /* src and dst have same alignment */
 835 
 836       /* generate edge mask for the start point */
 837       emask = vis_edge8(da, dend);
 838 
 839       /* load 16 bytes */
 840       sd0 = *sp++;
 841       sd1 = *sp++;
 842       sd2 = *sp++;
 843 
 844       /* extract, including some garbage at the start point */
 845       CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0);
 846 
 847       /* store 8 bytes result */
 848       vis_pst_8(dd0, dp++, emask);
 849 
 850       if ((mlib_addr) dp <= (mlib_addr) dend2) {
 851         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
 852 
 853         /* 8-pixel column loop, emask not needed */
 854 #pragma pipeloop(0)
 855         for (i = 0; i < n; i++) {
 856           sd0 = *sp++;
 857           sd1 = *sp++;
 858           sd2 = *sp++;
 859           CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0);
 860           *dp++ = dd0;
 861         }
 862       }
 863 
 864       /* end point handling */
 865       if ((mlib_addr) dp <= (mlib_addr) dend) {
 866         emask = vis_edge8(dp, dend);
 867         sd0 = *sp++;
 868         sd1 = *sp++;
 869         sd2 = *sp++;
 870         CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0);
 871         vis_pst_8(dd0, dp++, emask);
 872       }
 873     }
 874     else {
 875       vis_alignaddr((void *)0, off);
 876 
 877       /* generate edge mask for the start point */
 878       emask = vis_edge8(da, dend);
 879 
 880       if (off < 0) {
 881         /* load 24 bytes */
 882         sd3 = *sp++;
 883         sd4 = *sp++;
 884         sd5 = *sp++;
 885 
 886         /* extract and store 8 bytes */
 887         CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
 888         vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
 889       }
 890       else {
 891         /* load 48 bytes */
 892         sd0 = *sp++;
 893         sd1 = *sp++;
 894         sd2 = *sp++;
 895         sd3 = *sp++;
 896         sd4 = *sp++;
 897         sd5 = *sp++;
 898 
 899         /* extract and store 8 bytes */
 900         CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0);
 901         CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
 902         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 903       }
 904 
 905       if ((mlib_addr) dp <= (mlib_addr) dend2) {
 906         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
 907 
 908         /* 8-pixel column loop, emask not needed */
 909 #pragma pipeloop(0)
 910         for (i = 0; i < n; i++) {
 911           dd0 = dd1;
 912           sd3 = *sp++;
 913           sd4 = *sp++;
 914           sd5 = *sp++;
 915           CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
 916           *dp++ = vis_faligndata(dd0, dd1);
 917         }
 918       }
 919 
 920       /* end point handling */
 921       if ((mlib_addr) dp <= (mlib_addr) dend) {
 922         emask = vis_edge8(dp, dend);
 923         dd0 = dd1;
 924         sd3 = *sp++;
 925         sd4 = *sp++;
 926         sd5 = *sp++;
 927         CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
 928         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
 929       }
 930     }
 931   }
 932 }
 933 
 934 /***************************************************************/
 935 void mlib_v_ImageChannelExtract_U8_31(const mlib_u8 *src,
 936                                       mlib_s32      slb,
 937                                       mlib_u8       *dst,
 938                                       mlib_s32      dlb,
 939                                       mlib_s32      xsize,
 940                                       mlib_s32      ysize,
 941                                       mlib_s32      cmask)
 942 {
 943   mlib_u8 *sa, *da;
 944   mlib_u8 *sl, *dl;
 945   mlib_s32 j;
 946 
 947   sa = sl = (void *)src;
 948   da = dl = dst;
 949 
 950   for (j = 0; j < ysize; j++) {
 951     mlib_v_ImageChannelExtract_U8_31_D1(sa, da, xsize, cmask);
 952     sa = sl += slb;
 953     da = dl += dlb;
 954   }
 955 }
 956 
 957 /***************************************************************/
 958 #define CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd)           \
 959   sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2));        \
 960   sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2));        \
 961   sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3));        \
 962   sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3));        \
 963   sde = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdc));        \
 964   sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdd));        \
 965   dd  = vis_fpmerge(vis_read_hi(sde), vis_read_hi(sdf))
 966 
 967 /***************************************************************/
 968 #define CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd)          \
 969   sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2));        \
 970   sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2));        \
 971   sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3));        \
 972   sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3));        \
 973   sde = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdc));        \
 974   sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdd));        \
 975   dd  = vis_fpmerge(vis_read_lo(sde), vis_read_lo(sdf))
 976 
 977 /***************************************************************/
 978 #define CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd)          \
 979   sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2));        \
 980   sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2));        \
 981   sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3));        \
 982   sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3));        \
 983   sde = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdc));        \
 984   sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdd));        \
 985   dd  = vis_fpmerge(vis_read_hi(sde), vis_read_hi(sdf))
 986 
 987 /***************************************************************/
 988 #define CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd)           \
 989   sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2));        \
 990   sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2));        \
 991   sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3));        \
 992   sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3));        \
 993   sde = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdc));        \
 994   sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdd));        \
 995   dd  = vis_fpmerge(vis_read_lo(sde), vis_read_lo(sdf))
 996 
 997 /***************************************************************/
 998 void mlib_v_ImageChannelExtract_U8_41_A8D1X8(const mlib_u8 *src,
 999                                              mlib_u8       *dst,
1000                                              mlib_s32      dsize,
1001                                              mlib_s32      cmask)
1002 {
1003   mlib_d64 *sp, *dp;
1004   mlib_d64 sd0, sd1, sd2, sd3;
1005   mlib_d64 sda, sdb, sdc, sdd, sde, sdf;
1006   mlib_d64 dd;
1007   mlib_s32 i;
1008 
1009   sp = (mlib_d64 *) src;
1010   dp = (mlib_d64 *) dst;
1011 
1012   if (cmask == 8) {
1013 #pragma pipeloop(0)
1014     for (i = 0; i < dsize / 8; i++) {
1015       sd0 = *sp++;
1016       sd1 = *sp++;
1017       sd2 = *sp++;
1018       sd3 = *sp++;
1019       CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd);
1020       *dp++ = dd;
1021     }
1022   }
1023   else if (cmask == 4) {
1024 #pragma pipeloop(0)
1025     for (i = 0; i < dsize / 8; i++) {
1026       sd0 = *sp++;
1027       sd1 = *sp++;
1028       sd2 = *sp++;
1029       sd3 = *sp++;
1030       CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd);
1031       *dp++ = dd;
1032     }
1033   }
1034   else if (cmask == 2) {
1035 #pragma pipeloop(0)
1036     for (i = 0; i < dsize / 8; i++) {
1037       sd0 = *sp++;
1038       sd1 = *sp++;
1039       sd2 = *sp++;
1040       sd3 = *sp++;
1041       CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd);
1042       *dp++ = dd;
1043     }
1044   }
1045   else {
1046 #pragma pipeloop(0)
1047     for (i = 0; i < dsize / 8; i++) {
1048       sd0 = *sp++;
1049       sd1 = *sp++;
1050       sd2 = *sp++;
1051       sd3 = *sp++;
1052       CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd);
1053       *dp++ = dd;
1054     }
1055   }
1056 }
1057 
1058 /***************************************************************/
1059 void mlib_v_ImageChannelExtract_U8_41_A8D2X8(const mlib_u8 *src,
1060                                              mlib_s32      slb,
1061                                              mlib_u8       *dst,
1062                                              mlib_s32      dlb,
1063                                              mlib_s32      xsize,
1064                                              mlib_s32      ysize,
1065                                              mlib_s32      cmask)
1066 {
1067   mlib_d64 *sp, *dp;
1068   mlib_d64 *sl, *dl;
1069   mlib_d64 sd0, sd1, sd2, sd3;
1070   mlib_d64 sda, sdb, sdc, sdd, sde, sdf;
1071   mlib_d64 dd;
1072   mlib_s32 i, j;
1073 
1074   sp = sl = (mlib_d64 *) src;
1075   dp = dl = (mlib_d64 *) dst;
1076 
1077   if (cmask == 8) {
1078     for (j = 0; j < ysize; j++) {
1079 #pragma pipeloop(0)
1080       for (i = 0; i < xsize / 8; i++) {
1081         sd0 = *sp++;
1082         sd1 = *sp++;
1083         sd2 = *sp++;
1084         sd3 = *sp++;
1085         CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd);
1086         *dp++ = dd;
1087       }
1088 
1089       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1090       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1091     }
1092   }
1093   else if (cmask == 4) {
1094     for (j = 0; j < ysize; j++) {
1095 #pragma pipeloop(0)
1096       for (i = 0; i < xsize / 8; i++) {
1097         sd0 = *sp++;
1098         sd1 = *sp++;
1099         sd2 = *sp++;
1100         sd3 = *sp++;
1101         CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd);
1102         *dp++ = dd;
1103       }
1104 
1105       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1106       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1107     }
1108   }
1109   else if (cmask == 2) {
1110     for (j = 0; j < ysize; j++) {
1111 #pragma pipeloop(0)
1112       for (i = 0; i < xsize / 8; i++) {
1113         sd0 = *sp++;
1114         sd1 = *sp++;
1115         sd2 = *sp++;
1116         sd3 = *sp++;
1117         CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd);
1118         *dp++ = dd;
1119       }
1120 
1121       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1122       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1123     }
1124   }
1125   else {
1126     for (j = 0; j < ysize; j++) {
1127 #pragma pipeloop(0)
1128       for (i = 0; i < xsize / 8; i++) {
1129         sd0 = *sp++;
1130         sd1 = *sp++;
1131         sd2 = *sp++;
1132         sd3 = *sp++;
1133         CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd);
1134         *dp++ = dd;
1135       }
1136 
1137       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1138       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1139     }
1140   }
1141 }
1142 
1143 /***************************************************************/
1144 void mlib_v_ImageChannelExtract_U8_41_D1(const mlib_u8 *src,
1145                                          mlib_u8       *dst,
1146                                          mlib_s32      dsize,
1147                                          mlib_s32      cmask)
1148 {
1149   mlib_u8 *sa, *da;
1150   mlib_u8 *dend, *dend2;                              /* end points in dst */
1151   mlib_d64 *dp;                                       /* 8-byte aligned start points in dst */
1152   mlib_d64 *sp;                                       /* 8-byte aligned start point in src */
1153   mlib_d64 sd0, sd1, sd2, sd3;                        /* 8-byte source data */
1154   mlib_d64 sd4, sd5, sd6, sd7;
1155   mlib_d64 sda, sdb, sdc, sdd;
1156   mlib_d64 sde, sdf;
1157   mlib_d64 dd0, dd1;
1158   mlib_s32 soff;                                      /* offset of address in src */
1159   mlib_s32 doff;                                      /* offset of address in dst */
1160   mlib_s32 off;                                       /* offset of src over dst */
1161   mlib_s32 emask;                                     /* edge mask */
1162   mlib_s32 i, n;
1163 
1164   sa = (void *)src;
1165   da = dst;
1166 
1167   /* prepare the source address */
1168   sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1169   soff = ((mlib_addr) sa & 7);
1170 
1171   /* prepare the destination addresses */
1172   dp = (mlib_d64 *) ((mlib_addr) da & (~7));
1173   doff = ((mlib_addr) da & 7);
1174   dend = da + dsize - 1;
1175   dend2 = dend - 7;
1176 
1177   /* calculate the src's offset over dst */
1178   if (cmask == 8) {
1179     off = soff / 4 - doff;
1180   }
1181   else if (cmask == 4) {
1182     off = (soff + 1) / 4 - doff;
1183   }
1184   else if (cmask == 2) {
1185     off = (soff + 2) / 4 - doff;
1186   }
1187   else {
1188     off = (soff + 3) / 4 - doff;
1189   }
1190 
1191   if (((cmask == 8) && (soff % 4 == 0)) ||
1192       ((cmask == 4) && (soff % 4 == 3)) ||
1193       ((cmask == 2) && (soff % 4 == 2)) ||
1194       ((cmask == 1) && (soff % 4 == 1))) { /* extract left channel */
1195 
1196     if (off == 0) {                         /* src and dst have same alignment */
1197 
1198       /* generate edge mask for the start point */
1199       emask = vis_edge8(da, dend);
1200 
1201       /* load 16 bytes */
1202       sd0 = *sp++;
1203       sd1 = *sp++;
1204       sd2 = *sp++;
1205       sd3 = *sp++;
1206 
1207       /* extract, including some garbage at the start point */
1208       CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0);
1209 
1210       /* store 8 bytes result */
1211       vis_pst_8(dd0, dp++, emask);
1212 
1213       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1214         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1215 
1216         /* 8-pixel column loop, emask not needed */
1217 #pragma pipeloop(0)
1218         for (i = 0; i < n; i++) {
1219           sd0 = *sp++;
1220           sd1 = *sp++;
1221           sd2 = *sp++;
1222           sd3 = *sp++;
1223           CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0);
1224           *dp++ = dd0;
1225         }
1226       }
1227 
1228       /* end point handling */
1229       if ((mlib_addr) dp <= (mlib_addr) dend) {
1230         emask = vis_edge8(dp, dend);
1231         sd0 = *sp++;
1232         sd1 = *sp++;
1233         sd2 = *sp++;
1234         sd3 = *sp++;
1235         CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0);
1236         vis_pst_8(dd0, dp++, emask);
1237       }
1238     }
1239     else {
1240       vis_alignaddr((void *)0, off);
1241 
1242       /* generate edge mask for the start point */
1243       emask = vis_edge8(da, dend);
1244 
1245       if (off < 0) {
1246         /* load 24 bytes */
1247         sd4 = *sp++;
1248         sd5 = *sp++;
1249         sd6 = *sp++;
1250         sd7 = *sp++;
1251 
1252         /* extract and store 8 bytes */
1253         CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1);
1254         vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
1255       }
1256       else {
1257         /* load 48 bytes */
1258         sd0 = *sp++;
1259         sd1 = *sp++;
1260         sd2 = *sp++;
1261         sd3 = *sp++;
1262         sd4 = *sp++;
1263         sd5 = *sp++;
1264         sd6 = *sp++;
1265         sd7 = *sp++;
1266 
1267         /* extract and store 8 bytes */
1268         CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0);
1269         CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1);
1270         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
1271       }
1272 
1273       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1274         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1275 
1276         /* 8-pixel column loop, emask not needed */
1277 #pragma pipeloop(0)
1278         for (i = 0; i < n; i++) {
1279           dd0 = dd1;
1280           sd4 = *sp++;
1281           sd5 = *sp++;
1282           sd6 = *sp++;
1283           sd7 = *sp++;
1284           CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1);
1285           *dp++ = vis_faligndata(dd0, dd1);
1286         }
1287       }
1288 
1289       /* end point handling */
1290       if ((mlib_addr) dp <= (mlib_addr) dend) {
1291         emask = vis_edge8(dp, dend);
1292         dd0 = dd1;
1293         sd4 = *sp++;
1294         sd5 = *sp++;
1295         sd6 = *sp++;
1296         sd7 = *sp++;
1297         CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1);
1298         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
1299       }
1300     }
1301   }
1302   else if (((cmask == 8) && (soff % 4 == 1)) ||
1303            ((cmask == 4) && (soff % 4 == 0)) ||
1304            ((cmask == 2) && (soff % 4 == 3)) ||
1305            ((cmask == 1) && (soff % 4 == 2))) {
1306     /* extract middle left channel */
1307 
1308     if (off == 0) {                         /* src and dst have same alignment */
1309 
1310       /* generate edge mask for the start point */
1311       emask = vis_edge8(da, dend);
1312 
1313       /* load 16 bytes */
1314       sd0 = *sp++;
1315       sd1 = *sp++;
1316       sd2 = *sp++;
1317       sd3 = *sp++;
1318 
1319       /* extract, including some garbage at the start point */
1320       CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0);
1321 
1322       /* store 8 bytes result */
1323       vis_pst_8(dd0, dp++, emask);
1324 
1325       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1326         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1327 
1328         /* 8-pixel column loop, emask not needed */
1329 #pragma pipeloop(0)
1330         for (i = 0; i < n; i++) {
1331           sd0 = *sp++;
1332           sd1 = *sp++;
1333           sd2 = *sp++;
1334           sd3 = *sp++;
1335           CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0);
1336           *dp++ = dd0;
1337         }
1338       }
1339 
1340       /* end point handling */
1341       if ((mlib_addr) dp <= (mlib_addr) dend) {
1342         emask = vis_edge8(dp, dend);
1343         sd0 = *sp++;
1344         sd1 = *sp++;
1345         sd2 = *sp++;
1346         sd3 = *sp++;
1347         CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0);
1348         vis_pst_8(dd0, dp++, emask);
1349       }
1350     }
1351     else {
1352       vis_alignaddr((void *)0, off);
1353 
1354       /* generate edge mask for the start point */
1355       emask = vis_edge8(da, dend);
1356 
1357       if (off < 0) {
1358         /* load 24 bytes */
1359         sd4 = *sp++;
1360         sd5 = *sp++;
1361         sd6 = *sp++;
1362         sd7 = *sp++;
1363 
1364         /* extract and store 8 bytes */
1365         CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1);
1366         vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
1367       }
1368       else {
1369         /* load 48 bytes */
1370         sd0 = *sp++;
1371         sd1 = *sp++;
1372         sd2 = *sp++;
1373         sd3 = *sp++;
1374         sd4 = *sp++;
1375         sd5 = *sp++;
1376         sd6 = *sp++;
1377         sd7 = *sp++;
1378 
1379         /* extract and store 8 bytes */
1380         CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0);
1381         CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1);
1382         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
1383       }
1384 
1385       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1386         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1387 
1388         /* 8-pixel column loop, emask not needed */
1389 #pragma pipeloop(0)
1390         for (i = 0; i < n; i++) {
1391           dd0 = dd1;
1392           sd4 = *sp++;
1393           sd5 = *sp++;
1394           sd6 = *sp++;
1395           sd7 = *sp++;
1396           CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1);
1397           *dp++ = vis_faligndata(dd0, dd1);
1398         }
1399       }
1400 
1401       /* end point handling */
1402       if ((mlib_addr) dp <= (mlib_addr) dend) {
1403         emask = vis_edge8(dp, dend);
1404         dd0 = dd1;
1405         sd4 = *sp++;
1406         sd5 = *sp++;
1407         sd6 = *sp++;
1408         sd7 = *sp++;
1409         CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1);
1410         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
1411       }
1412     }
1413   }
1414   else if (((cmask == 8) && (soff % 4 == 2)) ||
1415            ((cmask == 4) && (soff % 4 == 1)) ||
1416            ((cmask == 2) && (soff % 4 == 0)) ||
1417            ((cmask == 1) && (soff % 4 == 3))) { /* extract middle right channel */
1418 
1419     if (off == 0) {                         /* src and dst have same alignment */
1420 
1421       /* generate edge mask for the start point */
1422       emask = vis_edge8(da, dend);
1423 
1424       /* load 16 bytes */
1425       sd0 = *sp++;
1426       sd1 = *sp++;
1427       sd2 = *sp++;
1428       sd3 = *sp++;
1429 
1430       /* extract, including some garbage at the start point */
1431       CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0);
1432 
1433       /* store 8 bytes result */
1434       vis_pst_8(dd0, dp++, emask);
1435 
1436       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1437         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1438 
1439         /* 8-pixel column loop, emask not needed */
1440 #pragma pipeloop(0)
1441         for (i = 0; i < n; i++) {
1442           sd0 = *sp++;
1443           sd1 = *sp++;
1444           sd2 = *sp++;
1445           sd3 = *sp++;
1446           CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0);
1447           *dp++ = dd0;
1448         }
1449       }
1450 
1451       /* end point handling */
1452       if ((mlib_addr) dp <= (mlib_addr) dend) {
1453         emask = vis_edge8(dp, dend);
1454         sd0 = *sp++;
1455         sd1 = *sp++;
1456         sd2 = *sp++;
1457         sd3 = *sp++;
1458         CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0);
1459         vis_pst_8(dd0, dp++, emask);
1460       }
1461     }
1462     else {
1463       vis_alignaddr((void *)0, off);
1464 
1465       /* generate edge mask for the start point */
1466       emask = vis_edge8(da, dend);
1467 
1468       if (off < 0) {
1469         /* load 24 bytes */
1470         sd4 = *sp++;
1471         sd5 = *sp++;
1472         sd6 = *sp++;
1473         sd7 = *sp++;
1474 
1475         /* extract and store 8 bytes */
1476         CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1);
1477         vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
1478       }
1479       else {
1480         /* load 48 bytes */
1481         sd0 = *sp++;
1482         sd1 = *sp++;
1483         sd2 = *sp++;
1484         sd3 = *sp++;
1485         sd4 = *sp++;
1486         sd5 = *sp++;
1487         sd6 = *sp++;
1488         sd7 = *sp++;
1489 
1490         /* extract and store 8 bytes */
1491         CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0);
1492         CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1);
1493         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
1494       }
1495 
1496       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1497         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1498 
1499         /* 8-pixel column loop, emask not needed */
1500 #pragma pipeloop(0)
1501         for (i = 0; i < n; i++) {
1502           dd0 = dd1;
1503           sd4 = *sp++;
1504           sd5 = *sp++;
1505           sd6 = *sp++;
1506           sd7 = *sp++;
1507           CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1);
1508           *dp++ = vis_faligndata(dd0, dd1);
1509         }
1510       }
1511 
1512       /* end point handling */
1513       if ((mlib_addr) dp <= (mlib_addr) dend) {
1514         emask = vis_edge8(dp, dend);
1515         dd0 = dd1;
1516         sd4 = *sp++;
1517         sd5 = *sp++;
1518         sd6 = *sp++;
1519         sd7 = *sp++;
1520         CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1);
1521         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
1522       }
1523     }
1524   }
1525   else {                                    /* extract right channel */
1526     if (off == 0) {                         /* src and dst have same alignment */
1527 
1528       /* generate edge mask for the start point */
1529       emask = vis_edge8(da, dend);
1530 
1531       /* load 16 bytes */
1532       sd0 = *sp++;
1533       sd1 = *sp++;
1534       sd2 = *sp++;
1535       sd3 = *sp++;
1536 
1537       /* extract, including some garbage at the start point */
1538       CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0);
1539 
1540       /* store 8 bytes result */
1541       vis_pst_8(dd0, dp++, emask);
1542 
1543       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1544         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1545 
1546         /* 8-pixel column loop, emask not needed */
1547 #pragma pipeloop(0)
1548         for (i = 0; i < n; i++) {
1549           sd0 = *sp++;
1550           sd1 = *sp++;
1551           sd2 = *sp++;
1552           sd3 = *sp++;
1553           CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0);
1554           *dp++ = dd0;
1555         }
1556       }
1557 
1558       /* end point handling */
1559       if ((mlib_addr) dp <= (mlib_addr) dend) {
1560         emask = vis_edge8(dp, dend);
1561         sd0 = *sp++;
1562         sd1 = *sp++;
1563         sd2 = *sp++;
1564         sd3 = *sp++;
1565         CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0);
1566         vis_pst_8(dd0, dp++, emask);
1567       }
1568     }
1569     else {
1570       vis_alignaddr((void *)0, off);
1571 
1572       /* generate edge mask for the start point */
1573       emask = vis_edge8(da, dend);
1574 
1575       if (off < 0) {
1576         /* load 24 bytes */
1577         sd4 = *sp++;
1578         sd5 = *sp++;
1579         sd6 = *sp++;
1580         sd7 = *sp++;
1581 
1582         /* extract and store 8 bytes */
1583         CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
1584         vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
1585       }
1586       else {
1587         /* load 48 bytes */
1588         sd0 = *sp++;
1589         sd1 = *sp++;
1590         sd2 = *sp++;
1591         sd3 = *sp++;
1592         sd4 = *sp++;
1593         sd5 = *sp++;
1594         sd6 = *sp++;
1595         sd7 = *sp++;
1596 
1597         /* extract and store 8 bytes */
1598         CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0);
1599         CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
1600         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
1601       }
1602 
1603       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1604         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1605 
1606         /* 8-pixel column loop, emask not needed */
1607 #pragma pipeloop(0)
1608         for (i = 0; i < n; i++) {
1609           dd0 = dd1;
1610           sd4 = *sp++;
1611           sd5 = *sp++;
1612           sd6 = *sp++;
1613           sd7 = *sp++;
1614           CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
1615           *dp++ = vis_faligndata(dd0, dd1);
1616         }
1617       }
1618 
1619       /* end point handling */
1620       if ((mlib_addr) dp <= (mlib_addr) dend) {
1621         emask = vis_edge8(dp, dend);
1622         dd0 = dd1;
1623         sd4 = *sp++;
1624         sd5 = *sp++;
1625         sd6 = *sp++;
1626         sd7 = *sp++;
1627         CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
1628         vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
1629       }
1630     }
1631   }
1632 }
1633 
1634 /***************************************************************/
1635 void mlib_v_ImageChannelExtract_U8_41(const mlib_u8 *src,
1636                                       mlib_s32      slb,
1637                                       mlib_u8       *dst,
1638                                       mlib_s32      dlb,
1639                                       mlib_s32      xsize,
1640                                       mlib_s32      ysize,
1641                                       mlib_s32      cmask)
1642 {
1643   mlib_u8 *sa, *da;
1644   mlib_u8 *sl, *dl;
1645   mlib_s32 j;
1646 
1647   sa = sl = (void *)src;
1648   da = dl = dst;
1649 
1650   for (j = 0; j < ysize; j++) {
1651     mlib_v_ImageChannelExtract_U8_41_D1(sa, da, xsize, cmask);
1652     sa = sl += slb;
1653     da = dl += dlb;
1654   }
1655 }
1656 
1657 /***************************************************************/
/*
 * Collect the even-numbered 16-bit words (0, 2, 4, 6) of the 16 bytes
 * sd0:sd1 into the 8 bytes of dd (three rounds of vis_fpmerge; this
 * assumes the VIS fpmerge semantics of interleaving the bytes of its
 * two 32-bit operands).
 *
 * The expansion writes the mlib_d64 temporaries sda, sdb and sdc,
 * which must be declared in the calling scope.
 * It is wrapped in do { } while (0) so that it behaves as one statement
 * even under an unbraced if/else/for.
 */
#define CHANNELEXTRACT_S16_21L(sd0, sd1, dd)                    \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1));      \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb));      \
    dd  = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));      \
  } while (0)
1663 
1664 /***************************************************************/
/*
 * Collect the odd-numbered 16-bit words (1, 3, 5, 7) of the 16 bytes
 * sd0:sd1 into the 8 bytes of dd (three rounds of vis_fpmerge; this
 * assumes the VIS fpmerge semantics of interleaving the bytes of its
 * two 32-bit operands).
 *
 * The expansion writes the mlib_d64 temporaries sda, sdb and sdc,
 * which must be declared in the calling scope.
 * It is wrapped in do { } while (0) so that it behaves as one statement
 * even under an unbraced if/else/for.
 */
#define CHANNELEXTRACT_S16_21R(sd0, sd1, dd)                    \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1));      \
    sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb));      \
    dd  = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));      \
  } while (0)
1670 
1671 /***************************************************************/
1672 /* extract one channel from a 2-channel image.
1673  * both source and destination image data are 8-byte aligned.
1674  * dsize is multiple of 4.
1675  */
1676 
1677 void mlib_v_ImageChannelExtract_S16_21_A8D1X4(const mlib_s16 *src,
1678                                               mlib_s16       *dst,
1679                                               mlib_s32       dsize,
1680                                               mlib_s32       cmask)
1681 {
1682   mlib_d64 *sp, *dp;
1683   mlib_d64 sd0, sd1;
1684   mlib_d64 sda, sdb, sdc;
1685   mlib_d64 dd;
1686   mlib_s32 i;
1687 
1688   sp = (mlib_d64 *) src;
1689   dp = (mlib_d64 *) dst;
1690 
1691   if (cmask == 2) {
1692 #pragma pipeloop(0)
1693     for (i = 0; i < dsize / 4; i++) {
1694       sd0 = *sp++;
1695       sd1 = *sp++;
1696       CHANNELEXTRACT_S16_21L(sd0, sd1, dd);
1697       *dp++ = dd;
1698     }
1699   }
1700   else {
1701 #pragma pipeloop(0)
1702     for (i = 0; i < dsize / 4; i++) {
1703       sd0 = *sp++;
1704       sd1 = *sp++;
1705       CHANNELEXTRACT_S16_21R(sd0, sd1, dd);
1706       *dp++ = dd;
1707     }
1708   }
1709 }
1710 
1711 /***************************************************************/
1712 void mlib_v_ImageChannelExtract_S16_21_A8D2X4(const mlib_s16 *src,
1713                                               mlib_s32       slb,
1714                                               mlib_s16       *dst,
1715                                               mlib_s32       dlb,
1716                                               mlib_s32       xsize,
1717                                               mlib_s32       ysize,
1718                                               mlib_s32       cmask)
1719 {
1720   mlib_d64 *sp, *dp;
1721   mlib_d64 *sl, *dl;
1722   mlib_d64 sd0, sd1;
1723   mlib_d64 sda, sdb, sdc;
1724   mlib_d64 dd;
1725   mlib_s32 i, j;
1726 
1727   sp = sl = (mlib_d64 *) src;
1728   dp = dl = (mlib_d64 *) dst;
1729 
1730   if (cmask == 2) {
1731     for (j = 0; j < ysize; j++) {
1732 #pragma pipeloop(0)
1733       for (i = 0; i < xsize / 4; i++) {
1734         sd0 = *sp++;
1735         sd1 = *sp++;
1736         CHANNELEXTRACT_S16_21L(sd0, sd1, dd);
1737         *dp++ = dd;
1738       }
1739 
1740       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1741       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1742     }
1743   }
1744   else {
1745     for (j = 0; j < ysize; j++) {
1746 #pragma pipeloop(0)
1747       for (i = 0; i < xsize / 4; i++) {
1748         sd0 = *sp++;
1749         sd1 = *sp++;
1750         CHANNELEXTRACT_S16_21R(sd0, sd1, dd);
1751         *dp++ = dd;
1752       }
1753 
1754       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1755       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1756     }
1757   }
1758 }
1759 
1760 /***************************************************************/
1761 void mlib_v_ImageChannelExtract_S16_21_D1(const mlib_s16 *src,
1762                                           mlib_s16       *dst,
1763                                           mlib_s32       dsize,
1764                                           mlib_s32       cmask)
1765 {
1766   mlib_s16 *sa, *da;
1767   mlib_s16 *dend, *dend2;                             /* end points in dst */
1768   mlib_d64 *dp;                                       /* 8-byte aligned start points in dst */
1769   mlib_d64 *sp;                                       /* 8-byte aligned start point in src */
1770   mlib_d64 sd0, sd1, sd2, sd3;                        /* 8-byte source data */
1771   mlib_d64 sda, sdb, sdc;
1772   mlib_d64 dd0, dd1;
1773   mlib_s32 soff;                                      /* offset of address in src */
1774   mlib_s32 doff;                                      /* offset of address in dst */
1775   mlib_s32 off;                                       /* offset of dst over src */
1776   mlib_s32 emask;                                     /* edge mask */
1777   mlib_s32 i, n;
1778 
1779   sa = (void *)src;
1780   da = dst;
1781 
1782   /* prepare the source address */
1783   sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1784   soff = ((mlib_addr) sa & 7);
1785 
1786   /* prepare the destination addresses */
1787   dp = (mlib_d64 *) ((mlib_addr) da & (~7));
1788   doff = ((mlib_addr) da & 7);
1789   dend = da + dsize - 1;
1790   dend2 = dend - 3;
1791 
1792   /* calculate the src's offset over dst */
1793   if (cmask == 2) {
1794     off = (soff / 4) * 2 - doff;
1795   }
1796   else {
1797     off = ((soff + 3) / 4) * 2 - doff;
1798   }
1799 
1800   if (((cmask == 2) && (soff % 4 == 0)) || ((cmask == 1) && (soff % 4 != 0))) { /* extract even words */
1801 
1802     if (off == 0) {                         /* src and dst have same alignment */
1803 
1804       /* generate edge mask for the start point */
1805       emask = vis_edge16(da, dend);
1806 
1807       /* load 16 bytes */
1808       sd0 = *sp++;
1809       sd1 = *sp++;
1810 
1811       /* extract, including some garbage at the start point */
1812       CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
1813 
1814       /* store 8 bytes result */
1815       vis_pst_16(dd0, dp++, emask);
1816 
1817       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1818         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1819 
1820         /* 8-pixel column loop, emask not needed */
1821 #pragma pipeloop(0)
1822         for (i = 0; i < n; i++) {
1823           sd0 = *sp++;
1824           sd1 = *sp++;
1825           CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
1826           *dp++ = dd0;
1827         }
1828       }
1829 
1830       /* end point handling */
1831       if ((mlib_addr) dp <= (mlib_addr) dend) {
1832         emask = vis_edge16(dp, dend);
1833         sd0 = *sp++;
1834         sd1 = *sp++;
1835         CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
1836         vis_pst_16(dd0, dp++, emask);
1837       }
1838     }
1839     else {
1840       vis_alignaddr((void *)0, off);
1841 
1842       /* generate edge mask for the start point */
1843       emask = vis_edge16(da, dend);
1844 
1845       if (off < 0) {
1846         /* load 16 bytes */
1847         sd2 = *sp++;
1848         sd3 = *sp++;
1849 
1850         /* extract and store 8 bytes */
1851         CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
1852         vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
1853       }
1854       else {
1855         /* load 32 bytes */
1856         sd0 = *sp++;
1857         sd1 = *sp++;
1858         sd2 = *sp++;
1859         sd3 = *sp++;
1860 
1861         /* extract and store 8 bytes */
1862         CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
1863         CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
1864         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
1865       }
1866 
1867       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1868         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1869 
1870         /* 8-pixel column loop, emask not needed */
1871 #pragma pipeloop(0)
1872         for (i = 0; i < n; i++) {
1873           dd0 = dd1;
1874           sd2 = *sp++;
1875           sd3 = *sp++;
1876           CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
1877           *dp++ = vis_faligndata(dd0, dd1);
1878         }
1879       }
1880 
1881       /* end point handling */
1882       if ((mlib_addr) dp <= (mlib_addr) dend) {
1883         emask = vis_edge16(dp, dend);
1884         dd0 = dd1;
1885         sd2 = *sp++;
1886         sd3 = *sp++;
1887         CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
1888         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
1889       }
1890     }
1891   }
1892   else {                                    /* extract odd words */
1893 
1894     if (off == 0) {                         /* src and dst have same alignment */
1895 
1896       /* generate edge mask for the start point */
1897       emask = vis_edge16(da, dend);
1898 
1899       /* load 16 bytes, don't care the garbage at the start point */
1900       sd0 = *sp++;
1901       sd1 = *sp++;
1902 
1903       /* extract and store 8 bytes */
1904       CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
1905       vis_pst_16(dd0, dp++, emask);
1906 
1907       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1908         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1909 
1910         /* 8-pixel column loop, emask not needed */
1911 #pragma pipeloop(0)
1912         for (i = 0; i < n; i++) {
1913           sd0 = *sp++;
1914           sd1 = *sp++;
1915           CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
1916           *dp++ = dd0;
1917         }
1918       }
1919 
1920       /* end point handling */
1921       if ((mlib_addr) dp <= (mlib_addr) dend) {
1922         emask = vis_edge16(dp, dend);
1923         sd0 = *sp++;
1924         sd1 = *sp++;
1925         CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
1926         vis_pst_16(dd0, dp++, emask);
1927       }
1928     }
1929     else {
1930       vis_alignaddr((void *)0, off);
1931 
1932       /* generate edge mask for the start point */
1933       emask = vis_edge16(da, dend);
1934 
1935       if (off < 0) {
1936         /* load 16 bytes */
1937         sd2 = *sp++;
1938         sd3 = *sp++;
1939 
1940         /* extract and store 8 bytes */
1941         CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
1942         vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
1943       }
1944       else {
1945         /* load 32 bytes */
1946         sd0 = *sp++;
1947         sd1 = *sp++;
1948         sd2 = *sp++;
1949         sd3 = *sp++;
1950 
1951         /* extract and store 8 bytes */
1952         CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
1953         CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
1954         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
1955       }
1956 
1957       if ((mlib_addr) dp <= (mlib_addr) dend2) {
1958         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1959 
1960         /* 8-pixel column loop, emask not needed */
1961 #pragma pipeloop(0)
1962         for (i = 0; i < n; i++) {
1963           dd0 = dd1;
1964           sd2 = *sp++;
1965           sd3 = *sp++;
1966           CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
1967           *dp++ = vis_faligndata(dd0, dd1);
1968         }
1969       }
1970 
1971       /* end point handling */
1972       if ((mlib_addr) dp <= (mlib_addr) dend) {
1973         emask = vis_edge16(dp, dend);
1974         dd0 = dd1;
1975         sd2 = *sp++;
1976         sd3 = *sp++;
1977         CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
1978         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
1979       }
1980     }
1981   }
1982 }
1983 
1984 /***************************************************************/
1985 void mlib_v_ImageChannelExtract_S16_21(const mlib_s16 *src,
1986                                        mlib_s32       slb,
1987                                        mlib_s16       *dst,
1988                                        mlib_s32       dlb,
1989                                        mlib_s32       xsize,
1990                                        mlib_s32       ysize,
1991                                        mlib_s32       cmask)
1992 {
1993   mlib_s16 *sa, *da;
1994   mlib_s16 *sl, *dl;
1995   mlib_s32 j;
1996 
1997   sa = sl = (void *)src;
1998   da = dl = dst;
1999 
2000   for (j = 0; j < ysize; j++) {
2001     mlib_v_ImageChannelExtract_S16_21_D1(sa, da, xsize, cmask);
2002     sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
2003     da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
2004   }
2005 }
2006 
2007 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_31L: extract the left channel.
 *
 * sd0, sd1, sd2 are three consecutive 8-byte words (12 16-bit elements).
 * With vis_fpmerge interleaving the bytes of its two 32-bit operands
 * (VIS FPMERGE), dd receives elements 0, 3, 6 and 9 of that group.
 *
 * Scratch variables sda, sdb and sdc must be in scope at the expansion
 * site (the functions in this file declare them as mlib_d64).
 *
 * The four steps form one parenthesised comma expression, so the macro
 * is a single statement even under an unbraced if/for.
 */
#define CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd)                 \
  ((sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1))),       \
   (sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2))),       \
   (sdc = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb))),       \
   ((dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))))
2014 
2015 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_31M: extract the middle channel.
 *
 * sd0, sd1, sd2 are three consecutive 8-byte words (12 16-bit elements).
 * With vis_fpmerge interleaving the bytes of its two 32-bit operands
 * (VIS FPMERGE), dd receives elements 1, 4, 7 and 10 of that group.
 *
 * Scratch variables sda, sdb and sdc must be in scope at the expansion
 * site (the functions in this file declare them as mlib_d64).
 *
 * The four steps form one parenthesised comma expression, so the macro
 * is a single statement even under an unbraced if/for.
 */
#define CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd)                 \
  ((sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1))),       \
   (sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2))),       \
   (sdc = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdb))),       \
   ((dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))))
2022 
2023 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_31R: extract the right channel.
 *
 * sd0, sd1, sd2 are three consecutive 8-byte words (12 16-bit elements).
 * With vis_fpmerge interleaving the bytes of its two 32-bit operands
 * (VIS FPMERGE), dd receives elements 2, 5, 8 and 11 of that group.
 *
 * Scratch variables sda, sdb and sdc must be in scope at the expansion
 * site (the functions in this file declare them as mlib_d64).
 *
 * The four steps form one parenthesised comma expression, so the macro
 * is a single statement even under an unbraced if/for.
 */
#define CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd)                 \
  ((sda = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2))),       \
   (sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2))),       \
   (sdc = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb))),       \
   ((dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))))
2030 
2031 /***************************************************************/
2032 void mlib_v_ImageChannelExtract_S16_31_A8D1X4(const mlib_s16 *src,
2033                                               mlib_s16       *dst,
2034                                               mlib_s32       dsize,
2035                                               mlib_s32       cmask)
2036 {
2037   mlib_d64 *sp, *dp;
2038   mlib_d64 sd0, sd1, sd2;
2039   mlib_d64 sda, sdb, sdc;
2040   mlib_d64 dd;
2041   mlib_s32 i;
2042 
2043   sp = (mlib_d64 *) src;
2044   dp = (mlib_d64 *) dst;
2045 
2046   if (cmask == 4) {
2047 #pragma pipeloop(0)
2048     for (i = 0; i < dsize / 4; i++) {
2049       sd0 = *sp++;
2050       sd1 = *sp++;
2051       sd2 = *sp++;
2052       CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd);
2053       *dp++ = dd;
2054     }
2055   }
2056   else if (cmask == 2) {
2057 #pragma pipeloop(0)
2058     for (i = 0; i < dsize / 4; i++) {
2059       sd0 = *sp++;
2060       sd1 = *sp++;
2061       sd2 = *sp++;
2062       CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd);
2063       *dp++ = dd;
2064     }
2065   }
2066   else {
2067 #pragma pipeloop(0)
2068     for (i = 0; i < dsize / 4; i++) {
2069       sd0 = *sp++;
2070       sd1 = *sp++;
2071       sd2 = *sp++;
2072       CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd);
2073       *dp++ = dd;
2074     }
2075   }
2076 }
2077 
2078 /***************************************************************/
2079 void mlib_v_ImageChannelExtract_S16_31_A8D2X4(const mlib_s16 *src,
2080                                               mlib_s32       slb,
2081                                               mlib_s16       *dst,
2082                                               mlib_s32       dlb,
2083                                               mlib_s32       xsize,
2084                                               mlib_s32       ysize,
2085                                               mlib_s32       cmask)
2086 {
2087   mlib_d64 *sp, *dp;
2088   mlib_d64 *sl, *dl;
2089   mlib_d64 sd0, sd1, sd2;
2090   mlib_d64 sda, sdb, sdc;
2091   mlib_d64 dd;
2092   mlib_s32 i, j;
2093 
2094   sp = sl = (mlib_d64 *) src;
2095   dp = dl = (mlib_d64 *) dst;
2096 
2097   if (cmask == 4) {
2098     for (j = 0; j < ysize; j++) {
2099 #pragma pipeloop(0)
2100       for (i = 0; i < xsize / 4; i++) {
2101         sd0 = *sp++;
2102         sd1 = *sp++;
2103         sd2 = *sp++;
2104         CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd);
2105         *dp++ = dd;
2106       }
2107 
2108       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2109       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2110     }
2111   }
2112   else if (cmask == 2) {
2113     for (j = 0; j < ysize; j++) {
2114 #pragma pipeloop(0)
2115       for (i = 0; i < xsize / 4; i++) {
2116         sd0 = *sp++;
2117         sd1 = *sp++;
2118         sd2 = *sp++;
2119         CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd);
2120         *dp++ = dd;
2121       }
2122 
2123       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2124       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2125     }
2126   }
2127   else {
2128     for (j = 0; j < ysize; j++) {
2129 #pragma pipeloop(0)
2130       for (i = 0; i < xsize / 4; i++) {
2131         sd0 = *sp++;
2132         sd1 = *sp++;
2133         sd2 = *sp++;
2134         CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd);
2135         *dp++ = dd;
2136       }
2137 
2138       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2139       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2140     }
2141   }
2142 }
2143 
2144 /***************************************************************/
2145 void mlib_v_ImageChannelExtract_S16_31_D1(const mlib_s16 *src,
2146                                           mlib_s16       *dst,
2147                                           mlib_s32       dsize,
2148                                           mlib_s32       cmask)
2149 {
2150   mlib_s16 *sa, *da;
2151   mlib_s16 *dend, *dend2;                             /* end points in dst */
2152   mlib_d64 *dp;                                       /* 8-byte aligned start points in dst */
2153   mlib_d64 *sp;                                       /* 8-byte aligned start point in src */
2154   mlib_d64 sd0, sd1, sd2;                             /* 8-byte source data */
2155   mlib_d64 sd3, sd4, sd5;
2156   mlib_d64 sda, sdb, sdc;
2157   mlib_d64 dd0, dd1;
2158   mlib_s32 soff;                                      /* offset of address in src */
2159   mlib_s32 doff;                                      /* offset of address in dst */
2160   mlib_s32 off;                                       /* offset of src over dst */
2161   mlib_s32 emask;                                     /* edge mask */
2162   mlib_s32 i, n;
2163 
2164   sa = (void *)src;
2165   da = dst;
2166 
2167   /* prepare the source address */
2168   sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
2169   soff = ((mlib_addr) sa & 7);
2170 
2171   /* prepare the destination addresses */
2172   dp = (mlib_d64 *) ((mlib_addr) da & (~7));
2173   doff = ((mlib_addr) da & 7);
2174   dend = da + dsize - 1;
2175   dend2 = dend - 3;
2176 
2177   /* calculate the src's offset over dst */
2178   if (cmask == 4) {
2179     off = (soff / 6) * 2 - doff;
2180   }
2181   else if (cmask == 2) {
2182     off = ((soff + 2) / 6) * 2 - doff;
2183   }
2184   else {
2185     off = ((soff + 4) / 6) * 2 - doff;
2186   }
2187 
2188   if (((cmask == 4) && (soff % 6 == 0)) ||
2189       ((cmask == 2) && (soff % 6 == 4)) ||
2190       ((cmask == 1) && (soff % 6 == 2))) { /* extract left channel */
2191 
2192     if (off == 0) {                         /* src and dst have same alignment */
2193 
2194       /* generate edge mask for the start point */
2195       emask = vis_edge16(da, dend);
2196 
2197       /* load 16 bytes */
2198       sd0 = *sp++;
2199       sd1 = *sp++;
2200       sd2 = *sp++;
2201 
2202       /* extract, including some garbage at the start point */
2203       CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
2204 
2205       /* store 8 bytes result */
2206       vis_pst_16(dd0, dp++, emask);
2207 
2208       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2209         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2210 
2211         /* 8-pixel column loop, emask not needed */
2212 #pragma pipeloop(0)
2213         for (i = 0; i < n; i++) {
2214           sd0 = *sp++;
2215           sd1 = *sp++;
2216           sd2 = *sp++;
2217           CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
2218           *dp++ = dd0;
2219         }
2220       }
2221 
2222       /* end point handling */
2223       if ((mlib_addr) dp <= (mlib_addr) dend) {
2224         emask = vis_edge16(dp, dend);
2225         sd0 = *sp++;
2226         sd1 = *sp++;
2227         sd2 = *sp++;
2228         CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
2229         vis_pst_16(dd0, dp++, emask);
2230       }
2231     }
2232     else {
2233       vis_alignaddr((void *)0, off);
2234 
2235       /* generate edge mask for the start point */
2236       emask = vis_edge16(da, dend);
2237 
2238       if (off < 0) {
2239         /* load 24 bytes */
2240         sd3 = *sp++;
2241         sd4 = *sp++;
2242         sd5 = *sp++;
2243 
2244         /* extract and store 8 bytes */
2245         CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
2246         vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
2247       }
2248       else {
2249         /* load 48 bytes */
2250         sd0 = *sp++;
2251         sd1 = *sp++;
2252         sd2 = *sp++;
2253         sd3 = *sp++;
2254         sd4 = *sp++;
2255         sd5 = *sp++;
2256 
2257         /* extract and store 8 bytes */
2258         CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
2259         CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
2260         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2261       }
2262 
2263       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2264         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2265 
2266         /* 8-pixel column loop, emask not needed */
2267 #pragma pipeloop(0)
2268         for (i = 0; i < n; i++) {
2269           dd0 = dd1;
2270           sd3 = *sp++;
2271           sd4 = *sp++;
2272           sd5 = *sp++;
2273           CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
2274           *dp++ = vis_faligndata(dd0, dd1);
2275         }
2276       }
2277 
2278       /* end point handling */
2279       if ((mlib_addr) dp <= (mlib_addr) dend) {
2280         emask = vis_edge16(dp, dend);
2281         dd0 = dd1;
2282         sd3 = *sp++;
2283         sd4 = *sp++;
2284         sd5 = *sp++;
2285         CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
2286         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2287       }
2288     }
2289   }
2290   else if (((cmask == 4) && (soff % 6 == 2)) ||
2291            ((cmask == 2) && (soff % 6 == 0)) ||
2292            ((cmask == 1) && (soff % 6 == 4))) {
2293     /* extract middle channel */
2294 
2295     if (off == 0) {                         /* src and dst have same alignment */
2296 
2297       /* generate edge mask for the start point */
2298       emask = vis_edge16(da, dend);
2299 
2300       /* load 16 bytes */
2301       sd0 = *sp++;
2302       sd1 = *sp++;
2303       sd2 = *sp++;
2304 
2305       /* extract, including some garbage at the start point */
2306       CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
2307 
2308       /* store 8 bytes result */
2309       vis_pst_16(dd0, dp++, emask);
2310 
2311       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2312         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2313 
2314         /* 8-pixel column loop, emask not needed */
2315 #pragma pipeloop(0)
2316         for (i = 0; i < n; i++) {
2317           sd0 = *sp++;
2318           sd1 = *sp++;
2319           sd2 = *sp++;
2320           CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
2321           *dp++ = dd0;
2322         }
2323       }
2324 
2325       /* end point handling */
2326       if ((mlib_addr) dp <= (mlib_addr) dend) {
2327         emask = vis_edge16(dp, dend);
2328         sd0 = *sp++;
2329         sd1 = *sp++;
2330         sd2 = *sp++;
2331         CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
2332         vis_pst_16(dd0, dp++, emask);
2333       }
2334     }
2335     else {
2336       vis_alignaddr((void *)0, off);
2337 
2338       /* generate edge mask for the start point */
2339       emask = vis_edge16(da, dend);
2340 
2341       if (off < 0) {
2342         /* load 24 bytes */
2343         sd3 = *sp++;
2344         sd4 = *sp++;
2345         sd5 = *sp++;
2346 
2347         /* extract and store 8 bytes */
2348         CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
2349         vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
2350       }
2351       else {
2352         /* load 48 bytes */
2353         sd0 = *sp++;
2354         sd1 = *sp++;
2355         sd2 = *sp++;
2356         sd3 = *sp++;
2357         sd4 = *sp++;
2358         sd5 = *sp++;
2359 
2360         /* extract and store 8 bytes */
2361         CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
2362         CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
2363         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2364       }
2365 
2366       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2367         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2368 
2369         /* 8-pixel column loop, emask not needed */
2370 #pragma pipeloop(0)
2371         for (i = 0; i < n; i++) {
2372           dd0 = dd1;
2373           sd3 = *sp++;
2374           sd4 = *sp++;
2375           sd5 = *sp++;
2376           CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
2377           *dp++ = vis_faligndata(dd0, dd1);
2378         }
2379       }
2380 
2381       /* end point handling */
2382       if ((mlib_addr) dp <= (mlib_addr) dend) {
2383         emask = vis_edge16(dp, dend);
2384         dd0 = dd1;
2385         sd3 = *sp++;
2386         sd4 = *sp++;
2387         sd5 = *sp++;
2388         CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
2389         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2390       }
2391     }
2392   }
2393   else {                                    /* extract right channel */
2394 
2395     if (off == 0) {                         /* src and dst have same alignment */
2396 
2397       /* generate edge mask for the start point */
2398       emask = vis_edge16(da, dend);
2399 
2400       /* load 16 bytes */
2401       sd0 = *sp++;
2402       sd1 = *sp++;
2403       sd2 = *sp++;
2404 
2405       /* extract, including some garbage at the start point */
2406       CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
2407 
2408       /* store 8 bytes result */
2409       vis_pst_16(dd0, dp++, emask);
2410 
2411       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2412         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2413 
2414         /* 8-pixel column loop, emask not needed */
2415 #pragma pipeloop(0)
2416         for (i = 0; i < n; i++) {
2417           sd0 = *sp++;
2418           sd1 = *sp++;
2419           sd2 = *sp++;
2420           CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
2421           *dp++ = dd0;
2422         }
2423       }
2424 
2425       /* end point handling */
2426       if ((mlib_addr) dp <= (mlib_addr) dend) {
2427         emask = vis_edge16(dp, dend);
2428         sd0 = *sp++;
2429         sd1 = *sp++;
2430         sd2 = *sp++;
2431         CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
2432         vis_pst_16(dd0, dp++, emask);
2433       }
2434     }
2435     else {
2436       vis_alignaddr((void *)0, off);
2437 
2438       /* generate edge mask for the start point */
2439       emask = vis_edge16(da, dend);
2440 
2441       if (off < 0) {
2442         /* load 24 bytes */
2443         sd3 = *sp++;
2444         sd4 = *sp++;
2445         sd5 = *sp++;
2446 
2447         /* extract and store 8 bytes */
2448         CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
2449         vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
2450       }
2451       else {
2452         /* load 48 bytes */
2453         sd0 = *sp++;
2454         sd1 = *sp++;
2455         sd2 = *sp++;
2456         sd3 = *sp++;
2457         sd4 = *sp++;
2458         sd5 = *sp++;
2459 
2460         /* extract and store 8 bytes */
2461         CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
2462         CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
2463         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2464       }
2465 
2466       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2467         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2468 
2469         /* 8-pixel column loop, emask not needed */
2470 #pragma pipeloop(0)
2471         for (i = 0; i < n; i++) {
2472           dd0 = dd1;
2473           sd3 = *sp++;
2474           sd4 = *sp++;
2475           sd5 = *sp++;
2476           CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
2477           *dp++ = vis_faligndata(dd0, dd1);
2478         }
2479       }
2480 
2481       /* end point handling */
2482       if ((mlib_addr) dp <= (mlib_addr) dend) {
2483         emask = vis_edge16(dp, dend);
2484         dd0 = dd1;
2485         sd3 = *sp++;
2486         sd4 = *sp++;
2487         sd5 = *sp++;
2488         CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
2489         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2490       }
2491     }
2492   }
2493 }
2494 
2495 /***************************************************************/
2496 void mlib_v_ImageChannelExtract_S16_31(const mlib_s16 *src,
2497                                        mlib_s32       slb,
2498                                        mlib_s16       *dst,
2499                                        mlib_s32       dlb,
2500                                        mlib_s32       xsize,
2501                                        mlib_s32       ysize,
2502                                        mlib_s32       cmask)
2503 {
2504   mlib_s16 *sa, *da;
2505   mlib_s16 *sl, *dl;
2506   mlib_s32 j;
2507 
2508   sa = sl = (void *)src;
2509   da = dl = dst;
2510 
2511   for (j = 0; j < ysize; j++) {
2512     mlib_v_ImageChannelExtract_S16_31_D1(sa, da, xsize, cmask);
2513     sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
2514     da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
2515   }
2516 }
2517 
2518 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_41L: extract the left channel.
 *
 * sd0..sd3 are four consecutive 8-byte words (16 16-bit elements).
 * With vis_fpmerge interleaving the bytes of its two 32-bit operands
 * (VIS FPMERGE), dd receives elements 0, 4, 8 and 12 of that group.
 *
 * Scratch variables sda, sdb and sdc must be in scope at the expansion
 * site (the functions in this file declare them as mlib_d64).
 *
 * The four steps form one parenthesised comma expression, so the macro
 * is a single statement even under an unbraced if/for.
 */
#define CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd)            \
  ((sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2))),       \
   (sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3))),       \
   (sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb))),       \
   ((dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))))
2525 
2526 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_41ML: extract the middle left channel.
 *
 * sd0..sd3 are four consecutive 8-byte words (16 16-bit elements).
 * With vis_fpmerge interleaving the bytes of its two 32-bit operands
 * (VIS FPMERGE), dd receives elements 1, 5, 9 and 13 of that group.
 *
 * Scratch variables sda, sdb and sdc must be in scope at the expansion
 * site (the functions in this file declare them as mlib_d64).
 *
 * The four steps form one parenthesised comma expression, so the macro
 * is a single statement even under an unbraced if/for.
 */
#define CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd)           \
  ((sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2))),       \
   (sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3))),       \
   (sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb))),       \
   ((dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))))
2533 
2534 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_41MR: extract the middle right channel.
 *
 * sd0..sd3 are four consecutive 8-byte words (16 16-bit elements).
 * With vis_fpmerge interleaving the bytes of its two 32-bit operands
 * (VIS FPMERGE), dd receives elements 2, 6, 10 and 14 of that group.
 *
 * Scratch variables sda, sdb and sdc must be in scope at the expansion
 * site (the functions in this file declare them as mlib_d64).
 *
 * The four steps form one parenthesised comma expression, so the macro
 * is a single statement even under an unbraced if/for.
 */
#define CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd)           \
  ((sda = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2))),       \
   (sdb = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3))),       \
   (sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb))),       \
   ((dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))))
2541 
2542 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_41R: extract the right channel.
 *
 * sd0..sd3 are four consecutive 8-byte words (16 16-bit elements).
 * With vis_fpmerge interleaving the bytes of its two 32-bit operands
 * (VIS FPMERGE), dd receives elements 3, 7, 11 and 15 of that group.
 *
 * Scratch variables sda, sdb and sdc must be in scope at the expansion
 * site (the functions in this file declare them as mlib_d64).
 *
 * The four steps form one parenthesised comma expression, so the macro
 * is a single statement even under an unbraced if/for.
 */
#define CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd)            \
  ((sda = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2))),       \
   (sdb = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3))),       \
   (sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb))),       \
   ((dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))))
2549 
2550 /***************************************************************/
2551 void mlib_v_ImageChannelExtract_S16_41_A8D1X4(const mlib_s16 *src,
2552                                               mlib_s16       *dst,
2553                                               mlib_s32       dsize,
2554                                               mlib_s32       cmask)
2555 {
2556   mlib_d64 *sp, *dp;
2557   mlib_d64 sd0, sd1, sd2, sd3;
2558   mlib_d64 sda, sdb, sdc;
2559   mlib_d64 dd;
2560   mlib_s32 i;
2561 
2562   sp = (mlib_d64 *) src;
2563   dp = (mlib_d64 *) dst;
2564 
2565   if (cmask == 8) {
2566 #pragma pipeloop(0)
2567     for (i = 0; i < dsize / 4; i++) {
2568       sd0 = *sp++;
2569       sd1 = *sp++;
2570       sd2 = *sp++;
2571       sd3 = *sp++;
2572       CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd);
2573       *dp++ = dd;
2574     }
2575   }
2576   else if (cmask == 4) {
2577 #pragma pipeloop(0)
2578     for (i = 0; i < dsize / 4; i++) {
2579       sd0 = *sp++;
2580       sd1 = *sp++;
2581       sd2 = *sp++;
2582       sd3 = *sp++;
2583       CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd);
2584       *dp++ = dd;
2585     }
2586   }
2587   else if (cmask == 2) {
2588 #pragma pipeloop(0)
2589     for (i = 0; i < dsize / 4; i++) {
2590       sd0 = *sp++;
2591       sd1 = *sp++;
2592       sd2 = *sp++;
2593       sd3 = *sp++;
2594       CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd);
2595       *dp++ = dd;
2596     }
2597   }
2598   else {
2599 #pragma pipeloop(0)
2600     for (i = 0; i < dsize / 4; i++) {
2601       sd0 = *sp++;
2602       sd1 = *sp++;
2603       sd2 = *sp++;
2604       sd3 = *sp++;
2605       CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd);
2606       *dp++ = dd;
2607     }
2608   }
2609 }
2610 
2611 /***************************************************************/
2612 void mlib_v_ImageChannelExtract_S16_41_A8D2X4(const mlib_s16 *src,
2613                                               mlib_s32       slb,
2614                                               mlib_s16       *dst,
2615                                               mlib_s32       dlb,
2616                                               mlib_s32       xsize,
2617                                               mlib_s32       ysize,
2618                                               mlib_s32       cmask)
2619 {
2620   mlib_d64 *sp, *dp;
2621   mlib_d64 *sl, *dl;
2622   mlib_d64 sd0, sd1, sd2, sd3;
2623   mlib_d64 sda, sdb, sdc;
2624   mlib_d64 dd;
2625   mlib_s32 i, j;
2626 
2627   sp = sl = (mlib_d64 *) src;
2628   dp = dl = (mlib_d64 *) dst;
2629 
2630   if (cmask == 8) {
2631     for (j = 0; j < ysize; j++) {
2632 #pragma pipeloop(0)
2633       for (i = 0; i < xsize / 4; i++) {
2634         sd0 = *sp++;
2635         sd1 = *sp++;
2636         sd2 = *sp++;
2637         sd3 = *sp++;
2638         CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd);
2639         *dp++ = dd;
2640       }
2641 
2642       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2643       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2644     }
2645   }
2646   else if (cmask == 4) {
2647     for (j = 0; j < ysize; j++) {
2648 #pragma pipeloop(0)
2649       for (i = 0; i < xsize / 4; i++) {
2650         sd0 = *sp++;
2651         sd1 = *sp++;
2652         sd2 = *sp++;
2653         sd3 = *sp++;
2654         CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd);
2655         *dp++ = dd;
2656       }
2657 
2658       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2659       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2660     }
2661   }
2662   else if (cmask == 2) {
2663     for (j = 0; j < ysize; j++) {
2664 #pragma pipeloop(0)
2665       for (i = 0; i < xsize / 4; i++) {
2666         sd0 = *sp++;
2667         sd1 = *sp++;
2668         sd2 = *sp++;
2669         sd3 = *sp++;
2670         CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd);
2671         *dp++ = dd;
2672       }
2673 
2674       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2675       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2676     }
2677   }
2678   else {
2679     for (j = 0; j < ysize; j++) {
2680 #pragma pipeloop(0)
2681       for (i = 0; i < xsize / 4; i++) {
2682         sd0 = *sp++;
2683         sd1 = *sp++;
2684         sd2 = *sp++;
2685         sd3 = *sp++;
2686         CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd);
2687         *dp++ = dd;
2688       }
2689 
2690       sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2691       dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2692     }
2693   }
2694 }
2695 
2696 /***************************************************************/
/*
 * mlib_v_ImageChannelExtract_S16_41_D1
 *
 * Extracts one 16-bit channel from a contiguous run of source data into a
 * contiguous run of single-channel destination data.  Every 8-byte dst
 * word is produced from four consecutive 8-byte src words.  Neither
 * pointer has to be 8-byte aligned: the src pointer is rounded down to an
 * 8-byte boundary, one of four extraction macros is chosen from
 * (cmask, src offset), and the result is realigned to dst with
 * vis_faligndata when required.  The first and last dst words are written
 * with edge-masked partial stores.
 *
 *   src    first source element of the run
 *   dst    first destination element of the run
 *   dsize  number of mlib_s16 elements to produce in dst
 *   cmask  channel selector; 8, 4 and 2 are tested explicitly, any other
 *          value is handled like 1
 *
 * NOTE(review): src is always read in whole groups of four 8-byte words
 * starting at the rounded-down address, including for the masked head and
 * tail stores, so reads can extend beyond the bytes that actually
 * contribute to dst -- confirm the callers' buffers allow that.
 * NOTE(review): the CHANNELEXTRACT_S16_41L/ML/MR/R macros are defined
 * outside this part of the file; their lane selection is described here
 * only as far as the branch comments state it.
 */
2697 void mlib_v_ImageChannelExtract_S16_41_D1(const mlib_s16 *src,
2698                                           mlib_s16       *dst,
2699                                           mlib_s32       dsize,
2700                                           mlib_s32       cmask)
2701 {
2702   mlib_s16 *sa, *da;
2703   mlib_s16 *dend, *dend2;                             /* dend: last dst element; dend2: dend - 3 */
2704   mlib_d64 *dp;                                       /* 8-byte aligned start points in dst */
2705   mlib_d64 *sp;                                       /* 8-byte aligned start point in src */
2706   mlib_d64 sd0, sd1, sd2, sd3;                        /* 8-byte source data */
2707   mlib_d64 sd4, sd5, sd6, sd7;
2708   mlib_d64 sda, sdb, sdc;                             /* never named below; presumably scratch for the CHANNELEXTRACT macros -- confirm */
2709   mlib_d64 dd0, dd1;                                  /* extracted 8-byte results (previous / current when realigning) */
2710   mlib_s32 soff;                                      /* offset of address in src */
2711   mlib_s32 doff;                                      /* offset of address in dst */
2712   mlib_s32 off;                                       /* offset of src over dst */
2713   mlib_s32 emask;                                     /* edge mask */
2714   mlib_s32 i, n;                                      /* loop index, count of full 8-byte dst words */
2715
2716   sa = (void *)src;                                   /* drops const; src is only read below */
2717   da = dst;
2718
2719   /* prepare the source address */
2720   sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
2721   soff = ((mlib_addr) sa & 7);
2722
2723   /* prepare the destination addresses */
2724   dp = (mlib_d64 *) ((mlib_addr) da & (~7));
2725   doff = ((mlib_addr) da & 7);
2726   dend = da + dsize - 1;
2727   dend2 = dend - 3;
2728
2729   /* calculate the src's offset over dst, in bytes.
          (soff + 0/2/4/6) / 8 is the index (0 or 1) of the aligned 8-byte src
          word that holds the first selected sample; each src word contributes
          one 2-byte sample to the extracted word, hence the "* 2".  doff is
          the byte position inside the aligned dst word where that sample has
          to be stored. */
2730   if (cmask == 8) {
2731     off = (soff / 8) * 2 - doff;
2732   }
2733   else if (cmask == 4) {
2734     off = ((soff + 2) / 8) * 2 - doff;
2735   }
2736   else if (cmask == 2) {
2737     off = ((soff + 4) / 8) * 2 - doff;
2738   }
2739   else {
2740     off = ((soff + 6) / 8) * 2 - doff;
2741   }
2742
2743   if (((cmask == 8) && (soff == 0)) ||
2744       ((cmask == 4) && (soff == 6)) ||
2745       ((cmask == 2) && (soff == 4)) ||
2746       ((cmask == 1) && (soff == 2))) { /* extract left channel */
2747
2748     if (off == 0) {                         /* src and dst have same alignment */
2749
2750       /* generate edge mask for the start point */
2751       emask = vis_edge16(da, dend);
2752
2753       /* load 32 bytes: four 8-byte src words feed one 8-byte dst word */
2754       sd0 = *sp++;
2755       sd1 = *sp++;
2756       sd2 = *sp++;
2757       sd3 = *sp++;
2758
2759       /* extract, including some garbage at the start point */
2760       CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
2761
2762       /* store 8 bytes result (partial store, only the lanes enabled in emask) */
2763       vis_pst_16(dd0, dp++, emask);
2764
2765       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2766         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2767
2768         /* full-word column loop (4 dst elements per iteration), emask not needed */
2769 #pragma pipeloop(0)
2770         for (i = 0; i < n; i++) {
2771           sd0 = *sp++;
2772           sd1 = *sp++;
2773           sd2 = *sp++;
2774           sd3 = *sp++;
2775           CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
2776           *dp++ = dd0;
2777         }
2778       }
2779
2780       /* end point handling: last, partially filled dst word */
2781       if ((mlib_addr) dp <= (mlib_addr) dend) {
2782         emask = vis_edge16(dp, dend);
2783         sd0 = *sp++;
2784         sd1 = *sp++;
2785         sd2 = *sp++;
2786         sd3 = *sp++;
2787         CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
2788         vis_pst_16(dd0, dp++, emask);
2789       }
2790     }
2791     else {
2792       vis_alignaddr((void *)0, off);           /* set the byte offset used by the vis_faligndata calls below */
2793
2794       /* generate edge mask for the start point */
2795       emask = vis_edge16(da, dend);
2796
2797       if (off < 0) {
2798         /* load 32 bytes (one group of four words) */
2799         sd4 = *sp++;
2800         sd5 = *sp++;
2801         sd6 = *sp++;
2802         sd7 = *sp++;
2803
2804         /* extract and store 8 bytes; dd1 is passed as both faligndata
                operands because no earlier result exists yet */
2805         CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
2806         vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
2807       }
2808       else {
2809         /* load 64 bytes (two groups of four words) */
2810         sd0 = *sp++;
2811         sd1 = *sp++;
2812         sd2 = *sp++;
2813         sd3 = *sp++;
2814         sd4 = *sp++;
2815         sd5 = *sp++;
2816         sd6 = *sp++;
2817         sd7 = *sp++;
2818
2819         /* extract both groups, realign across them and store 8 bytes */
2820         CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
2821         CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
2822         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2823       }
2824
2825       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2826         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2827
2828         /* full-word column loop (4 dst elements per iteration), emask not needed */
2829 #pragma pipeloop(0)
2830         for (i = 0; i < n; i++) {
2831           dd0 = dd1;                            /* previous result becomes the first faligndata operand */
2832           sd4 = *sp++;
2833           sd5 = *sp++;
2834           sd6 = *sp++;
2835           sd7 = *sp++;
2836           CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
2837           *dp++ = vis_faligndata(dd0, dd1);
2838         }
2839       }
2840
2841       /* end point handling: last, partially filled dst word */
2842       if ((mlib_addr) dp <= (mlib_addr) dend) {
2843         emask = vis_edge16(dp, dend);
2844         dd0 = dd1;
2845         sd4 = *sp++;
2846         sd5 = *sp++;
2847         sd6 = *sp++;
2848         sd7 = *sp++;
2849         CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
2850         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2851       }
2852     }
2853   }
2854   else if (((cmask == 8) && (soff == 2)) ||
2855            ((cmask == 4) && (soff == 0)) ||
2856            ((cmask == 2) && (soff == 6)) ||
2857            ((cmask == 1) && (soff == 4))) { /* extract middle left channel */
2858
2859     if (off == 0) {                         /* src and dst have same alignment */
2860
2861       /* generate edge mask for the start point */
2862       emask = vis_edge16(da, dend);
2863
2864       /* load 32 bytes: four 8-byte src words feed one 8-byte dst word */
2865       sd0 = *sp++;
2866       sd1 = *sp++;
2867       sd2 = *sp++;
2868       sd3 = *sp++;
2869
2870       /* extract, including some garbage at the start point */
2871       CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
2872
2873       /* store 8 bytes result (partial store, only the lanes enabled in emask) */
2874       vis_pst_16(dd0, dp++, emask);
2875
2876       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2877         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2878
2879         /* full-word column loop (4 dst elements per iteration), emask not needed */
2880 #pragma pipeloop(0)
2881         for (i = 0; i < n; i++) {
2882           sd0 = *sp++;
2883           sd1 = *sp++;
2884           sd2 = *sp++;
2885           sd3 = *sp++;
2886           CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
2887           *dp++ = dd0;
2888         }
2889       }
2890
2891       /* end point handling: last, partially filled dst word */
2892       if ((mlib_addr) dp <= (mlib_addr) dend) {
2893         emask = vis_edge16(dp, dend);
2894         sd0 = *sp++;
2895         sd1 = *sp++;
2896         sd2 = *sp++;
2897         sd3 = *sp++;
2898         CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
2899         vis_pst_16(dd0, dp++, emask);
2900       }
2901     }
2902     else {
2903       vis_alignaddr((void *)0, off);           /* set the byte offset used by the vis_faligndata calls below */
2904
2905       /* generate edge mask for the start point */
2906       emask = vis_edge16(da, dend);
2907
2908       if (off < 0) {
2909         /* load 32 bytes (one group of four words) */
2910         sd4 = *sp++;
2911         sd5 = *sp++;
2912         sd6 = *sp++;
2913         sd7 = *sp++;
2914
2915         /* extract and store 8 bytes; dd1 is passed as both faligndata
                operands because no earlier result exists yet */
2916         CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
2917         vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
2918       }
2919       else {
2920         /* load 64 bytes (two groups of four words) */
2921         sd0 = *sp++;
2922         sd1 = *sp++;
2923         sd2 = *sp++;
2924         sd3 = *sp++;
2925         sd4 = *sp++;
2926         sd5 = *sp++;
2927         sd6 = *sp++;
2928         sd7 = *sp++;
2929
2930         /* extract both groups, realign across them and store 8 bytes */
2931         CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
2932         CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
2933         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2934       }
2935
2936       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2937         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2938
2939         /* full-word column loop (4 dst elements per iteration), emask not needed */
2940 #pragma pipeloop(0)
2941         for (i = 0; i < n; i++) {
2942           dd0 = dd1;                            /* previous result becomes the first faligndata operand */
2943           sd4 = *sp++;
2944           sd5 = *sp++;
2945           sd6 = *sp++;
2946           sd7 = *sp++;
2947           CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
2948           *dp++ = vis_faligndata(dd0, dd1);
2949         }
2950       }
2951
2952       /* end point handling: last, partially filled dst word */
2953       if ((mlib_addr) dp <= (mlib_addr) dend) {
2954         emask = vis_edge16(dp, dend);
2955         dd0 = dd1;
2956         sd4 = *sp++;
2957         sd5 = *sp++;
2958         sd6 = *sp++;
2959         sd7 = *sp++;
2960         CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
2961         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2962       }
2963     }
2964   }
2965   else if (((cmask == 8) && (soff == 4)) ||
2966            ((cmask == 4) && (soff == 2)) ||
2967            ((cmask == 2) && (soff == 0)) ||
2968            ((cmask == 1) && (soff == 6))) { /* extract middle right channel */
2969
2970     if (off == 0) {                         /* src and dst have same alignment */
2971
2972       /* generate edge mask for the start point */
2973       emask = vis_edge16(da, dend);
2974
2975       /* load 32 bytes: four 8-byte src words feed one 8-byte dst word */
2976       sd0 = *sp++;
2977       sd1 = *sp++;
2978       sd2 = *sp++;
2979       sd3 = *sp++;
2980
2981       /* extract, including some garbage at the start point */
2982       CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
2983
2984       /* store 8 bytes result (partial store, only the lanes enabled in emask) */
2985       vis_pst_16(dd0, dp++, emask);
2986
2987       if ((mlib_addr) dp <= (mlib_addr) dend2) {
2988         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2989
2990         /* full-word column loop (4 dst elements per iteration), emask not needed */
2991 #pragma pipeloop(0)
2992         for (i = 0; i < n; i++) {
2993           sd0 = *sp++;
2994           sd1 = *sp++;
2995           sd2 = *sp++;
2996           sd3 = *sp++;
2997           CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
2998           *dp++ = dd0;
2999         }
3000       }
3001
3002       /* end point handling: last, partially filled dst word */
3003       if ((mlib_addr) dp <= (mlib_addr) dend) {
3004         emask = vis_edge16(dp, dend);
3005         sd0 = *sp++;
3006         sd1 = *sp++;
3007         sd2 = *sp++;
3008         sd3 = *sp++;
3009         CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
3010         vis_pst_16(dd0, dp++, emask);
3011       }
3012     }
3013     else {
3014       vis_alignaddr((void *)0, off);           /* set the byte offset used by the vis_faligndata calls below */
3015
3016       /* generate edge mask for the start point */
3017       emask = vis_edge16(da, dend);
3018
3019       if (off < 0) {
3020         /* load 32 bytes (one group of four words) */
3021         sd4 = *sp++;
3022         sd5 = *sp++;
3023         sd6 = *sp++;
3024         sd7 = *sp++;
3025
3026         /* extract and store 8 bytes; dd1 is passed as both faligndata
                operands because no earlier result exists yet */
3027         CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
3028         vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
3029       }
3030       else {
3031         /* load 64 bytes (two groups of four words) */
3032         sd0 = *sp++;
3033         sd1 = *sp++;
3034         sd2 = *sp++;
3035         sd3 = *sp++;
3036         sd4 = *sp++;
3037         sd5 = *sp++;
3038         sd6 = *sp++;
3039         sd7 = *sp++;
3040
3041         /* extract both groups, realign across them and store 8 bytes */
3042         CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
3043         CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
3044         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
3045       }
3046
3047       if ((mlib_addr) dp <= (mlib_addr) dend2) {
3048         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
3049
3050         /* full-word column loop (4 dst elements per iteration), emask not needed */
3051 #pragma pipeloop(0)
3052         for (i = 0; i < n; i++) {
3053           dd0 = dd1;                            /* previous result becomes the first faligndata operand */
3054           sd4 = *sp++;
3055           sd5 = *sp++;
3056           sd6 = *sp++;
3057           sd7 = *sp++;
3058           CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
3059           *dp++ = vis_faligndata(dd0, dd1);
3060         }
3061       }
3062
3063       /* end point handling: last, partially filled dst word */
3064       if ((mlib_addr) dp <= (mlib_addr) dend) {
3065         emask = vis_edge16(dp, dend);
3066         dd0 = dd1;
3067         sd4 = *sp++;
3068         sd5 = *sp++;
3069         sd6 = *sp++;
3070         sd7 = *sp++;
3071         CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
3072         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
3073       }
3074     }
3075   }
3076   else {                                    /* extract right channel */
3077     if (off == 0) {                         /* src and dst have same alignment */
3078
3079       /* generate edge mask for the start point */
3080       emask = vis_edge16(da, dend);
3081
3082       /* load 32 bytes: four 8-byte src words feed one 8-byte dst word */
3083       sd0 = *sp++;
3084       sd1 = *sp++;
3085       sd2 = *sp++;
3086       sd3 = *sp++;
3087
3088       /* extract, including some garbage at the start point */
3089       CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0);
3090
3091       /* store 8 bytes result (partial store, only the lanes enabled in emask) */
3092       vis_pst_16(dd0, dp++, emask);
3093
3094       if ((mlib_addr) dp <= (mlib_addr) dend2) {
3095         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
3096
3097         /* full-word column loop (4 dst elements per iteration), emask not needed */
3098 #pragma pipeloop(0)
3099         for (i = 0; i < n; i++) {
3100           sd0 = *sp++;
3101           sd1 = *sp++;
3102           sd2 = *sp++;
3103           sd3 = *sp++;
3104           CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0);
3105           *dp++ = dd0;
3106         }
3107       }
3108
3109       /* end point handling: last, partially filled dst word */
3110       if ((mlib_addr) dp <= (mlib_addr) dend) {
3111         emask = vis_edge16(dp, dend);
3112         sd0 = *sp++;
3113         sd1 = *sp++;
3114         sd2 = *sp++;
3115         sd3 = *sp++;
3116         CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0);
3117         vis_pst_16(dd0, dp++, emask);
3118       }
3119     }
3120     else {
3121       vis_alignaddr((void *)0, off);           /* set the byte offset used by the vis_faligndata calls below */
3122
3123       /* generate edge mask for the start point */
3124       emask = vis_edge16(da, dend);
3125
       /* Unlike the other three branches there is no off > 0 case here.
          For the pairs that reach this branch with cmask in {8, 4, 2, 1}
          and an even soff -- (8,6), (4,4), (2,2), (1,0) -- the first term
          of off evaluates to 0, so off == -doff and, being non-zero in
          this else-arm, is negative.  dd1 is therefore always set before
          the loops below read it.  NOTE(review): this relies on cmask
          being one of those four values and src being 2-byte aligned. */
3126       if (off < 0) {
3127         /* load 32 bytes (one group of four words) */
3128         sd4 = *sp++;
3129         sd5 = *sp++;
3130         sd6 = *sp++;
3131         sd7 = *sp++;
3132
3133         /* extract and store 8 bytes; dd1 is passed as both faligndata
                operands because no earlier result exists yet */
3134         CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1);
3135         vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
3136       }
3137
3138       if ((mlib_addr) dp <= (mlib_addr) dend2) {
3139         n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
3140
3141         /* full-word column loop (4 dst elements per iteration), emask not needed */
3142 #pragma pipeloop(0)
3143         for (i = 0; i < n; i++) {
3144           dd0 = dd1;                            /* previous result becomes the first faligndata operand */
3145           sd4 = *sp++;
3146           sd5 = *sp++;
3147           sd6 = *sp++;
3148           sd7 = *sp++;
3149           CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1);
3150           *dp++ = vis_faligndata(dd0, dd1);
3151         }
3152       }
3153
3154       /* end point handling: last, partially filled dst word */
3155       if ((mlib_addr) dp <= (mlib_addr) dend) {
3156         emask = vis_edge16(dp, dend);
3157         dd0 = dd1;
3158         sd4 = *sp++;
3159         sd5 = *sp++;
3160         sd6 = *sp++;
3161         sd7 = *sp++;
3162         CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1);
3163         vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
3164       }
3165     }
3166   }
3167 }
3168 
3169 /***************************************************************/
3170 void mlib_v_ImageChannelExtract_S16_41(const mlib_s16 *src,
3171                                        mlib_s32       slb,
3172                                        mlib_s16       *dst,
3173                                        mlib_s32       dlb,
3174                                        mlib_s32       xsize,
3175                                        mlib_s32       ysize,
3176                                        mlib_s32       cmask)
3177 {
3178   mlib_s16 *sa, *da;
3179   mlib_s16 *sl, *dl;
3180   mlib_s32 j;
3181 
3182   sa = sl = (void *)src;
3183   da = dl = dst;
3184   for (j = 0; j < ysize; j++) {
3185     mlib_v_ImageChannelExtract_S16_41_D1(sa, da, xsize, cmask);
3186     sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
3187     da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
3188   }
3189 }
3190 
3191 /***************************************************************/