12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
28 /*
29 * FILENAME: mlib_ImageChannelExtract_1.c
30 *
31 * FUNCTIONS
32 * mlib_v_ImageChannelExtract_U8_21_A8D1X8
33 * mlib_v_ImageChannelExtract_U8_21_A8D2X8
34 * mlib_v_ImageChannelExtract_U8_21_D1
35 * mlib_v_ImageChannelExtract_U8_21
36 * mlib_v_ImageChannelExtract_U8_31_A8D1X8
37 * mlib_v_ImageChannelExtract_U8_31_A8D2X8
38 * mlib_v_ImageChannelExtract_U8_31_D1
39 * mlib_v_ImageChannelExtract_U8_31
40 * mlib_v_ImageChannelExtract_U8_41_A8D1X8
41 * mlib_v_ImageChannelExtract_U8_41_A8D2X8
42 * mlib_v_ImageChannelExtract_U8_41_D1
43 * mlib_v_ImageChannelExtract_U8_41
44 * mlib_v_ImageChannelExtract_S16_21_A8D1X4
45 * mlib_v_ImageChannelExtract_S16_21_A8D2X4
46 * mlib_v_ImageChannelExtract_S16_21_D1
47 * mlib_v_ImageChannelExtract_S16_21
48 * mlib_v_ImageChannelExtract_S16_31_A8D1X4
49 * mlib_v_ImageChannelExtract_S16_31_A8D2X4
50 * mlib_v_ImageChannelExtract_S16_31_D1
51 * mlib_v_ImageChannelExtract_S16_31
52 * mlib_v_ImageChannelExtract_S16_41_A8D1X4
53 * mlib_v_ImageChannelExtract_S16_41_A8D2X4
54 * mlib_v_ImageChannelExtract_S16_41_D1
55 * mlib_v_ImageChannelExtract_S16_41
56 *
57 * ARGUMENT
58 * src pointer to source image data
59 * dst pointer to destination image data
60 * slb source image line stride in bytes
61 * dlb destination image line stride in bytes
62 * dsize image data size in pixels
63 * xsize image width in pixels
64 * ysize image height in lines
65 * cmask channel mask
66 *
67 * DESCRIPTION
68 * Extract the one selected channel of the source image into the
69 * 1-channel destination image.
70 *
71 * NOTE
72 * These functions are separated from mlib_ImageChannelExtract.c
73 * for loop unrolling and structure clarity.
74 */
75
78 #include "mlib_v_ImageChannelExtract.h"
79
80 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_21L(sd0, sd1, dd)
 *
 * Gathers the first (left) channel of eight 2-channel U8 pixels.
 * sd0 and sd1 are two consecutive 8-byte source words; with the usual
 * VIS vis_fpmerge byte interleave, three merge stages leave source
 * bytes 0, 2, 4, ..., 14 of the 16-byte sequence sd0:sd1 in dd.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb, sdc and
 * sdd in scope.  The body is wrapped in do { } while (0) so that the
 * expansion is a single statement and stays correct under an unbraced
 * if/else or loop.
 */
#define CHANNELEXTRACT_U8_21L(sd0, sd1, dd)                     \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1));      \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb));      \
    sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb));      \
    dd = vis_fpmerge(vis_read_hi(sdc), vis_read_hi(sdd));       \
  } while (0)
87
88 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_21R(sd0, sd1, dd)
 *
 * Gathers the second (right) channel of eight 2-channel U8 pixels.
 * sd0 and sd1 are two consecutive 8-byte source words; with the usual
 * VIS vis_fpmerge byte interleave, dd receives source bytes
 * 1, 3, 5, ..., 15 of the 16-byte sequence sd0:sd1.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb, sdc and
 * sdd in scope.  The body is wrapped in do { } while (0) so that the
 * expansion is a single statement and stays correct under an unbraced
 * if/else or loop.
 */
#define CHANNELEXTRACT_U8_21R(sd0, sd1, dd)                     \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1));      \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb));      \
    sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb));      \
    dd = vis_fpmerge(vis_read_lo(sdc), vis_read_lo(sdd));       \
  } while (0)
95
96 /***************************************************************/
97 /* extract one channel from a 2-channel image.
98 * both source and destination image data are 8-byte aligned.
99 * xsize is multiple of 8.
100 */
101
102 void mlib_v_ImageChannelExtract_U8_21_A8D1X8(const mlib_u8 *src,
103 mlib_u8 *dst,
104 mlib_s32 dsize,
105 mlib_s32 cmask)
106 {
107 mlib_d64 *sp, *dp;
108 mlib_d64 sd0, sd1;
109 mlib_d64 sda, sdb, sdc, sdd;
110 mlib_d64 dd;
111 mlib_s32 i;
112
113 sp = (mlib_d64 *) src;
114 dp = (mlib_d64 *) dst;
115
116 if (cmask == 2) {
117 #pragma pipeloop(0)
118 for (i = 0; i < dsize / 8; i++) {
119 sd0 = *sp++;
120 sd1 = *sp++;
121 CHANNELEXTRACT_U8_21L(sd0, sd1, dd);
122 *dp++ = dd;
123 }
124 }
125 else {
126 #pragma pipeloop(0)
127 for (i = 0; i < dsize / 8; i++) {
128 sd0 = *sp++;
129 sd1 = *sp++;
130 CHANNELEXTRACT_U8_21R(sd0, sd1, dd);
131 *dp++ = dd;
132 }
133 }
134 }
135
136 /***************************************************************/
137 /* extract one channel from a 2-channel image.
138 * both source and destination image data are 8-byte aligned.
139 * xsize is multiple of 8.
140 */
141
142 void mlib_v_ImageChannelExtract_U8_21_A8D2X8(const mlib_u8 *src,
143 mlib_s32 slb,
144 mlib_u8 *dst,
145 mlib_s32 dlb,
146 mlib_s32 xsize,
147 mlib_s32 ysize,
148 mlib_s32 cmask)
149 {
150 mlib_d64 *sp, *dp;
151 mlib_d64 *sl, *dl;
152 mlib_d64 sd0, sd1;
153 mlib_d64 sda, sdb, sdc, sdd;
154 mlib_d64 dd;
155 mlib_s32 i, j;
156
157 sp = sl = (mlib_d64 *) src;
158 dp = dl = (mlib_d64 *) dst;
159
160 if (cmask == 2) {
161 for (j = 0; j < ysize; j++) {
162 #pragma pipeloop(0)
163 for (i = 0; i < xsize / 8; i++) {
164 sd0 = *sp++;
165 sd1 = *sp++;
166 CHANNELEXTRACT_U8_21L(sd0, sd1, dd);
167 *dp++ = dd;
168 }
169
170 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
171 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
172 }
173 }
174 else {
175 for (j = 0; j < ysize; j++) {
176 #pragma pipeloop(0)
177 for (i = 0; i < xsize / 8; i++) {
178 sd0 = *sp++;
179 sd1 = *sp++;
180 CHANNELEXTRACT_U8_21R(sd0, sd1, dd);
181 *dp++ = dd;
182 }
183
184 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
185 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
186 }
187 }
188 }
189
190 /***************************************************************/
191 /* extract one channel from a 2-channel image.
192 */
193
194 void mlib_v_ImageChannelExtract_U8_21_D1(const mlib_u8 *src,
195 mlib_u8 *dst,
196 mlib_s32 dsize,
197 mlib_s32 cmask)
198 {
199 mlib_u8 *sa, *da;
200 mlib_u8 *dend, *dend2; /* end points in dst */
201 mlib_d64 *dp; /* 8-byte aligned start points in dst */
202 mlib_d64 *sp; /* 8-byte aligned start point in src */
203 mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */
204 mlib_d64 sda, sdb, sdc, sdd;
205 mlib_d64 dd0, dd1;
206 mlib_s32 soff; /* offset of address in src */
207 mlib_s32 doff; /* offset of address in dst */
208 mlib_s32 off; /* offset of src over dst */
209 mlib_s32 emask; /* edge mask */
210 mlib_s32 i, n;
211
398 sd3 = *sp++;
399 CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
400 *dp++ = vis_faligndata(dd0, dd1);
401 }
402 }
403
404 /* end point handling */
405 if ((mlib_addr) dp <= (mlib_addr) dend) {
406 emask = vis_edge8(dp, dend);
407 dd0 = dd1;
408 sd2 = *sp++;
409 sd3 = *sp++;
410 CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
411 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
412 }
413 }
414 }
415 }
416
417 /***************************************************************/
418 /* extract one channel from a 2-channel image.
419 */
420
421 void mlib_v_ImageChannelExtract_U8_21(const mlib_u8 *src,
422 mlib_s32 slb,
423 mlib_u8 *dst,
424 mlib_s32 dlb,
425 mlib_s32 xsize,
426 mlib_s32 ysize,
427 mlib_s32 cmask)
428 {
429 mlib_u8 *sa, *da;
430 mlib_u8 *sl, *dl;
431 mlib_s32 j;
432
433 sa = sl = (void *)src;
434 da = dl = dst;
435
436 for (j = 0; j < ysize; j++) {
437 mlib_v_ImageChannelExtract_U8_21_D1(sa, da, xsize, cmask);
438 sa = sl += slb;
439 da = dl += dlb;
440 }
441 }
442
443 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd)
 *
 * Gathers the first (left) channel of eight 3-channel U8 pixels.
 * sd0..sd2 are three consecutive 8-byte source words; with the usual
 * VIS vis_fpmerge byte interleave, dd receives source bytes
 * 0, 3, 6, ..., 21 of the 24-byte sequence sd0:sd1:sd2.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb, sdc, sdd
 * and sde in scope.  The body is wrapped in do { } while (0) so that
 * the expansion is a single statement and stays correct under an
 * unbraced if/else or loop.
 */
#define CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd)                \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));      \
    sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));      \
    sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb));      \
    sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc));      \
    dd = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde));       \
  } while (0)
451
452 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd)
 *
 * Gathers the second (middle) channel of eight 3-channel U8 pixels.
 * sd0..sd2 are three consecutive 8-byte source words; with the usual
 * VIS vis_fpmerge byte interleave, dd receives source bytes
 * 1, 4, 7, ..., 22 of the 24-byte sequence sd0:sd1:sd2.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb, sdc, sdd
 * and sde in scope.  The body is wrapped in do { } while (0) so that
 * the expansion is a single statement and stays correct under an
 * unbraced if/else or loop.
 */
#define CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd)                \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));      \
    sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));      \
    sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb));      \
    sde = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc));      \
    dd = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sde));       \
  } while (0)
460
461 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd)
 *
 * Gathers the third (right) channel of eight 3-channel U8 pixels.
 * sd0..sd2 are three consecutive 8-byte source words; with the usual
 * VIS vis_fpmerge byte interleave, dd receives source bytes
 * 2, 5, 8, ..., 23 of the 24-byte sequence sd0:sd1:sd2.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb, sdc, sdd
 * and sde in scope.  The body is wrapped in do { } while (0) so that
 * the expansion is a single statement and stays correct under an
 * unbraced if/else or loop.
 */
#define CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd)                \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));      \
    sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));      \
    sdd = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc));      \
    sde = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc));      \
    dd = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde));       \
  } while (0)
469
470 /***************************************************************/
471 void mlib_v_ImageChannelExtract_U8_31_A8D1X8(const mlib_u8 *src,
472 mlib_u8 *dst,
473 mlib_s32 dsize,
474 mlib_s32 cmask)
475 {
476 mlib_d64 *sp, *dp;
477 mlib_d64 sd0, sd1, sd2;
478 mlib_d64 sda, sdb, sdc, sdd, sde;
479 mlib_d64 dd;
480 mlib_s32 i;
481
482 sp = (mlib_d64 *) src;
483 dp = (mlib_d64 *) dst;
484
485 if (cmask == 4) {
486 #pragma pipeloop(0)
487 for (i = 0; i < dsize / 8; i++) {
488 sd0 = *sp++;
489 sd1 = *sp++;
490 sd2 = *sp++;
491 CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd);
492 *dp++ = dd;
493 }
494 }
495 else if (cmask == 2) {
496 #pragma pipeloop(0)
497 for (i = 0; i < dsize / 8; i++) {
498 sd0 = *sp++;
499 sd1 = *sp++;
500 sd2 = *sp++;
501 CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd);
502 *dp++ = dd;
503 }
504 }
505 else {
506 #pragma pipeloop(0)
507 for (i = 0; i < dsize / 8; i++) {
508 sd0 = *sp++;
509 sd1 = *sp++;
510 sd2 = *sp++;
511 CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd);
512 *dp++ = dd;
513 }
514 }
515 }
516
517 /***************************************************************/
518 void mlib_v_ImageChannelExtract_U8_31_A8D2X8(const mlib_u8 *src,
519 mlib_s32 slb,
520 mlib_u8 *dst,
521 mlib_s32 dlb,
522 mlib_s32 xsize,
523 mlib_s32 ysize,
524 mlib_s32 cmask)
525 {
526 mlib_d64 *sp, *dp;
527 mlib_d64 *sl, *dl;
528 mlib_d64 sd0, sd1, sd2;
529 mlib_d64 sda, sdb, sdc, sdd, sde;
530 mlib_d64 dd;
531 mlib_s32 i, j;
532
533 sp = sl = (mlib_d64 *) src;
534 dp = dl = (mlib_d64 *) dst;
535
536 if (cmask == 4) {
537 for (j = 0; j < ysize; j++) {
538 #pragma pipeloop(0)
539 for (i = 0; i < xsize / 8; i++) {
540 sd0 = *sp++;
541 sd1 = *sp++;
542 sd2 = *sp++;
543 CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd);
544 *dp++ = dd;
545 }
546
547 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
548 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
549 }
550 }
551 else if (cmask == 2) {
552 for (j = 0; j < ysize; j++) {
553 #pragma pipeloop(0)
554 for (i = 0; i < xsize / 8; i++) {
555 sd0 = *sp++;
556 sd1 = *sp++;
557 sd2 = *sp++;
558 CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd);
559 *dp++ = dd;
560 }
561
562 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
563 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
564 }
565 }
566 else {
567 for (j = 0; j < ysize; j++) {
568 #pragma pipeloop(0)
569 for (i = 0; i < xsize / 8; i++) {
570 sd0 = *sp++;
571 sd1 = *sp++;
572 sd2 = *sp++;
573 CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd);
574 *dp++ = dd;
575 }
576
577 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
578 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
579 }
580 }
581 }
582
583 /***************************************************************/
584 void mlib_v_ImageChannelExtract_U8_31_D1(const mlib_u8 *src,
585 mlib_u8 *dst,
586 mlib_s32 dsize,
587 mlib_s32 cmask)
588 {
589 mlib_u8 *sa, *da;
590 mlib_u8 *dend, *dend2; /* end points in dst */
591 mlib_d64 *dp; /* 8-byte aligned start points in dst */
592 mlib_d64 *sp; /* 8-byte aligned start point in src */
593 mlib_d64 sd0, sd1, sd2; /* 8-byte source data */
594 mlib_d64 sd3, sd4, sd5;
595 mlib_d64 sda, sdb, sdc, sdd, sde;
596 mlib_d64 dd0, dd1;
597 mlib_s32 soff; /* offset of address in src */
598 mlib_s32 doff; /* offset of address in dst */
599 mlib_s32 off; /* offset of src over dst */
600 mlib_s32 emask; /* edge mask */
601 mlib_s32 i, n;
602
603 sa = (void *)src;
915 CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
916 *dp++ = vis_faligndata(dd0, dd1);
917 }
918 }
919
920 /* end point handling */
921 if ((mlib_addr) dp <= (mlib_addr) dend) {
922 emask = vis_edge8(dp, dend);
923 dd0 = dd1;
924 sd3 = *sp++;
925 sd4 = *sp++;
926 sd5 = *sp++;
927 CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
928 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
929 }
930 }
931 }
932 }
933
934 /***************************************************************/
935 void mlib_v_ImageChannelExtract_U8_31(const mlib_u8 *src,
936 mlib_s32 slb,
937 mlib_u8 *dst,
938 mlib_s32 dlb,
939 mlib_s32 xsize,
940 mlib_s32 ysize,
941 mlib_s32 cmask)
942 {
943 mlib_u8 *sa, *da;
944 mlib_u8 *sl, *dl;
945 mlib_s32 j;
946
947 sa = sl = (void *)src;
948 da = dl = dst;
949
950 for (j = 0; j < ysize; j++) {
951 mlib_v_ImageChannelExtract_U8_31_D1(sa, da, xsize, cmask);
952 sa = sl += slb;
953 da = dl += dlb;
954 }
955 }
956
957 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd)
 *
 * Gathers the first (leftmost) channel of eight 4-channel U8 pixels.
 * sd0..sd3 are four consecutive 8-byte source words; with the usual
 * VIS vis_fpmerge byte interleave, dd receives source bytes
 * 0, 4, 8, ..., 28 of the 32-byte sequence sd0:sd1:sd2:sd3.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb, sdc, sdd,
 * sde and sdf in scope.  The body is wrapped in do { } while (0) so
 * that the expansion is a single statement and stays correct under an
 * unbraced if/else or loop.
 */
#define CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd)           \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2));      \
    sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3));      \
    sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3));      \
    sde = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdc));      \
    sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdd));      \
    dd = vis_fpmerge(vis_read_hi(sde), vis_read_hi(sdf));       \
  } while (0)
966
967 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd)
 *
 * Gathers the second (middle-left) channel of eight 4-channel U8
 * pixels.  sd0..sd3 are four consecutive 8-byte source words; with the
 * usual VIS vis_fpmerge byte interleave, dd receives source bytes
 * 1, 5, 9, ..., 29 of the 32-byte sequence sd0:sd1:sd2:sd3.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb, sdc, sdd,
 * sde and sdf in scope.  The body is wrapped in do { } while (0) so
 * that the expansion is a single statement and stays correct under an
 * unbraced if/else or loop.
 */
#define CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd)          \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2));      \
    sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3));      \
    sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3));      \
    sde = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdc));      \
    sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdd));      \
    dd = vis_fpmerge(vis_read_lo(sde), vis_read_lo(sdf));       \
  } while (0)
976
977 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd)
 *
 * Gathers the third (middle-right) channel of eight 4-channel U8
 * pixels.  sd0..sd3 are four consecutive 8-byte source words; with the
 * usual VIS vis_fpmerge byte interleave, dd receives source bytes
 * 2, 6, 10, ..., 30 of the 32-byte sequence sd0:sd1:sd2:sd3.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb, sdc, sdd,
 * sde and sdf in scope.  The body is wrapped in do { } while (0) so
 * that the expansion is a single statement and stays correct under an
 * unbraced if/else or loop.
 */
#define CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd)          \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2));      \
    sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3));      \
    sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3));      \
    sde = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdc));      \
    sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdd));      \
    dd = vis_fpmerge(vis_read_hi(sde), vis_read_hi(sdf));       \
  } while (0)
986
987 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd)
 *
 * Gathers the fourth (rightmost) channel of eight 4-channel U8
 * pixels.  sd0..sd3 are four consecutive 8-byte source words; with the
 * usual VIS vis_fpmerge byte interleave, dd receives source bytes
 * 3, 7, 11, ..., 31 of the 32-byte sequence sd0:sd1:sd2:sd3.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb, sdc, sdd,
 * sde and sdf in scope.  The body is wrapped in do { } while (0) so
 * that the expansion is a single statement and stays correct under an
 * unbraced if/else or loop.
 */
#define CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd)           \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2));      \
    sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3));      \
    sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3));      \
    sde = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdc));      \
    sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdd));      \
    dd = vis_fpmerge(vis_read_lo(sde), vis_read_lo(sdf));       \
  } while (0)
996
997 /***************************************************************/
998 void mlib_v_ImageChannelExtract_U8_41_A8D1X8(const mlib_u8 *src,
999 mlib_u8 *dst,
1000 mlib_s32 dsize,
1001 mlib_s32 cmask)
1002 {
1003 mlib_d64 *sp, *dp;
1004 mlib_d64 sd0, sd1, sd2, sd3;
1005 mlib_d64 sda, sdb, sdc, sdd, sde, sdf;
1006 mlib_d64 dd;
1007 mlib_s32 i;
1008
1009 sp = (mlib_d64 *) src;
1010 dp = (mlib_d64 *) dst;
1011
1012 if (cmask == 8) {
1013 #pragma pipeloop(0)
1014 for (i = 0; i < dsize / 8; i++) {
1015 sd0 = *sp++;
1016 sd1 = *sp++;
1017 sd2 = *sp++;
1018 sd3 = *sp++;
1019 CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd);
1020 *dp++ = dd;
1021 }
1022 }
1023 else if (cmask == 4) {
1024 #pragma pipeloop(0)
1025 for (i = 0; i < dsize / 8; i++) {
1026 sd0 = *sp++;
1027 sd1 = *sp++;
1028 sd2 = *sp++;
1029 sd3 = *sp++;
1030 CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd);
1031 *dp++ = dd;
1032 }
1033 }
1034 else if (cmask == 2) {
1035 #pragma pipeloop(0)
1036 for (i = 0; i < dsize / 8; i++) {
1037 sd0 = *sp++;
1038 sd1 = *sp++;
1039 sd2 = *sp++;
1040 sd3 = *sp++;
1041 CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd);
1042 *dp++ = dd;
1043 }
1044 }
1045 else {
1046 #pragma pipeloop(0)
1047 for (i = 0; i < dsize / 8; i++) {
1048 sd0 = *sp++;
1049 sd1 = *sp++;
1050 sd2 = *sp++;
1051 sd3 = *sp++;
1052 CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd);
1053 *dp++ = dd;
1054 }
1055 }
1056 }
1057
1058 /***************************************************************/
1059 void mlib_v_ImageChannelExtract_U8_41_A8D2X8(const mlib_u8 *src,
1060 mlib_s32 slb,
1061 mlib_u8 *dst,
1062 mlib_s32 dlb,
1063 mlib_s32 xsize,
1064 mlib_s32 ysize,
1065 mlib_s32 cmask)
1066 {
1067 mlib_d64 *sp, *dp;
1068 mlib_d64 *sl, *dl;
1069 mlib_d64 sd0, sd1, sd2, sd3;
1070 mlib_d64 sda, sdb, sdc, sdd, sde, sdf;
1071 mlib_d64 dd;
1072 mlib_s32 i, j;
1073
1074 sp = sl = (mlib_d64 *) src;
1075 dp = dl = (mlib_d64 *) dst;
1076
1077 if (cmask == 8) {
1078 for (j = 0; j < ysize; j++) {
1079 #pragma pipeloop(0)
1080 for (i = 0; i < xsize / 8; i++) {
1081 sd0 = *sp++;
1082 sd1 = *sp++;
1083 sd2 = *sp++;
1084 sd3 = *sp++;
1085 CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd);
1086 *dp++ = dd;
1087 }
1088
1089 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1090 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1091 }
1092 }
1093 else if (cmask == 4) {
1094 for (j = 0; j < ysize; j++) {
1095 #pragma pipeloop(0)
1096 for (i = 0; i < xsize / 8; i++) {
1097 sd0 = *sp++;
1098 sd1 = *sp++;
1099 sd2 = *sp++;
1100 sd3 = *sp++;
1101 CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd);
1102 *dp++ = dd;
1103 }
1104
1105 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1106 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1107 }
1108 }
1109 else if (cmask == 2) {
1110 for (j = 0; j < ysize; j++) {
1111 #pragma pipeloop(0)
1112 for (i = 0; i < xsize / 8; i++) {
1113 sd0 = *sp++;
1114 sd1 = *sp++;
1115 sd2 = *sp++;
1116 sd3 = *sp++;
1117 CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd);
1118 *dp++ = dd;
1119 }
1120
1121 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1122 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1123 }
1124 }
1125 else {
1126 for (j = 0; j < ysize; j++) {
1127 #pragma pipeloop(0)
1128 for (i = 0; i < xsize / 8; i++) {
1129 sd0 = *sp++;
1130 sd1 = *sp++;
1131 sd2 = *sp++;
1132 sd3 = *sp++;
1133 CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd);
1134 *dp++ = dd;
1135 }
1136
1137 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1138 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1139 }
1140 }
1141 }
1142
1143 /***************************************************************/
1144 void mlib_v_ImageChannelExtract_U8_41_D1(const mlib_u8 *src,
1145 mlib_u8 *dst,
1146 mlib_s32 dsize,
1147 mlib_s32 cmask)
1148 {
1149 mlib_u8 *sa, *da;
1150 mlib_u8 *dend, *dend2; /* end points in dst */
1151 mlib_d64 *dp; /* 8-byte aligned start points in dst */
1152 mlib_d64 *sp; /* 8-byte aligned start point in src */
1153 mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */
1154 mlib_d64 sd4, sd5, sd6, sd7;
1155 mlib_d64 sda, sdb, sdc, sdd;
1156 mlib_d64 sde, sdf;
1157 mlib_d64 dd0, dd1;
1158 mlib_s32 soff; /* offset of address in src */
1159 mlib_s32 doff; /* offset of address in dst */
1160 mlib_s32 off; /* offset of src over dst */
1161 mlib_s32 emask; /* edge mask */
1162 mlib_s32 i, n;
1163
1611 sd5 = *sp++;
1612 sd6 = *sp++;
1613 sd7 = *sp++;
1614 CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
1615 *dp++ = vis_faligndata(dd0, dd1);
1616 }
1617 }
1618
1619 /* end point handling */
1620 if ((mlib_addr) dp <= (mlib_addr) dend) {
1621 emask = vis_edge8(dp, dend);
1622 dd0 = dd1;
1623 sd4 = *sp++;
1624 sd5 = *sp++;
1625 sd6 = *sp++;
1626 sd7 = *sp++;
1627 CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
1628 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
1629 }
1630 }
1631 }
1632 }
1633
1634 /***************************************************************/
1635 void mlib_v_ImageChannelExtract_U8_41(const mlib_u8 *src,
1636 mlib_s32 slb,
1637 mlib_u8 *dst,
1638 mlib_s32 dlb,
1639 mlib_s32 xsize,
1640 mlib_s32 ysize,
1641 mlib_s32 cmask)
1642 {
1643 mlib_u8 *sa, *da;
1644 mlib_u8 *sl, *dl;
1645 mlib_s32 j;
1646
1647 sa = sl = (void *)src;
1648 da = dl = dst;
1649
1650 for (j = 0; j < ysize; j++) {
1651 mlib_v_ImageChannelExtract_U8_41_D1(sa, da, xsize, cmask);
1652 sa = sl += slb;
1653 da = dl += dlb;
1654 }
1655 }
1656
1657 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_21L(sd0, sd1, dd)
 *
 * Gathers the first (left) channel of four 2-channel S16 pixels.
 * sd0 and sd1 are two consecutive 8-byte source words; with the usual
 * VIS vis_fpmerge byte interleave, dd receives the 16-bit elements at
 * byte offsets 0, 4, 8 and 12 of the 16-byte sequence sd0:sd1.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb and sdc in
 * scope.  The body is wrapped in do { } while (0) so that the
 * expansion is a single statement and stays correct under an unbraced
 * if/else or loop.
 */
#define CHANNELEXTRACT_S16_21L(sd0, sd1, dd)                    \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1));      \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb));      \
    dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));       \
  } while (0)
1663
1664 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_21R(sd0, sd1, dd)
 *
 * Gathers the second (right) channel of four 2-channel S16 pixels.
 * sd0 and sd1 are two consecutive 8-byte source words; with the usual
 * VIS vis_fpmerge byte interleave, dd receives the 16-bit elements at
 * byte offsets 2, 6, 10 and 14 of the 16-byte sequence sd0:sd1.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb and sdc in
 * scope.  The body is wrapped in do { } while (0) so that the
 * expansion is a single statement and stays correct under an unbraced
 * if/else or loop.
 */
#define CHANNELEXTRACT_S16_21R(sd0, sd1, dd)                    \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1));      \
    sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb));      \
    dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));       \
  } while (0)
1670
1671 /***************************************************************/
1672 /* extract one channel from a 2-channel image.
1673 * both source and destination image data are 8-byte aligned.
1674 * dsize is multiple of 4.
1675 */
1676
1677 void mlib_v_ImageChannelExtract_S16_21_A8D1X4(const mlib_s16 *src,
1678 mlib_s16 *dst,
1679 mlib_s32 dsize,
1680 mlib_s32 cmask)
1681 {
1682 mlib_d64 *sp, *dp;
1683 mlib_d64 sd0, sd1;
1684 mlib_d64 sda, sdb, sdc;
1685 mlib_d64 dd;
1686 mlib_s32 i;
1687
1688 sp = (mlib_d64 *) src;
1689 dp = (mlib_d64 *) dst;
1690
1691 if (cmask == 2) {
1692 #pragma pipeloop(0)
1693 for (i = 0; i < dsize / 4; i++) {
1694 sd0 = *sp++;
1695 sd1 = *sp++;
1696 CHANNELEXTRACT_S16_21L(sd0, sd1, dd);
1697 *dp++ = dd;
1698 }
1699 }
1700 else {
1701 #pragma pipeloop(0)
1702 for (i = 0; i < dsize / 4; i++) {
1703 sd0 = *sp++;
1704 sd1 = *sp++;
1705 CHANNELEXTRACT_S16_21R(sd0, sd1, dd);
1706 *dp++ = dd;
1707 }
1708 }
1709 }
1710
1711 /***************************************************************/
1712 void mlib_v_ImageChannelExtract_S16_21_A8D2X4(const mlib_s16 *src,
1713 mlib_s32 slb,
1714 mlib_s16 *dst,
1715 mlib_s32 dlb,
1716 mlib_s32 xsize,
1717 mlib_s32 ysize,
1718 mlib_s32 cmask)
1719 {
1720 mlib_d64 *sp, *dp;
1721 mlib_d64 *sl, *dl;
1722 mlib_d64 sd0, sd1;
1723 mlib_d64 sda, sdb, sdc;
1724 mlib_d64 dd;
1725 mlib_s32 i, j;
1726
1727 sp = sl = (mlib_d64 *) src;
1728 dp = dl = (mlib_d64 *) dst;
1729
1730 if (cmask == 2) {
1731 for (j = 0; j < ysize; j++) {
1732 #pragma pipeloop(0)
1733 for (i = 0; i < xsize / 4; i++) {
1734 sd0 = *sp++;
1735 sd1 = *sp++;
1736 CHANNELEXTRACT_S16_21L(sd0, sd1, dd);
1737 *dp++ = dd;
1738 }
1739
1740 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1741 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1742 }
1743 }
1744 else {
1745 for (j = 0; j < ysize; j++) {
1746 #pragma pipeloop(0)
1747 for (i = 0; i < xsize / 4; i++) {
1748 sd0 = *sp++;
1749 sd1 = *sp++;
1750 CHANNELEXTRACT_S16_21R(sd0, sd1, dd);
1751 *dp++ = dd;
1752 }
1753
1754 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1755 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1756 }
1757 }
1758 }
1759
1760 /***************************************************************/
1761 void mlib_v_ImageChannelExtract_S16_21_D1(const mlib_s16 *src,
1762 mlib_s16 *dst,
1763 mlib_s32 dsize,
1764 mlib_s32 cmask)
1765 {
1766 mlib_s16 *sa, *da;
1767 mlib_s16 *dend, *dend2; /* end points in dst */
1768 mlib_d64 *dp; /* 8-byte aligned start points in dst */
1769 mlib_d64 *sp; /* 8-byte aligned start point in src */
1770 mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */
1771 mlib_d64 sda, sdb, sdc;
1772 mlib_d64 dd0, dd1;
1773 mlib_s32 soff; /* offset of address in src */
1774 mlib_s32 doff; /* offset of address in dst */
1775 mlib_s32 off; /* offset of dst over src */
1776 mlib_s32 emask; /* edge mask */
1777 mlib_s32 i, n;
1778
1779 sa = (void *)src;
1780 da = dst;
1781
1782 /* prepare the source address */
1783 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1784 soff = ((mlib_addr) sa & 7);
1785
1786 /* prepare the destination addresses */
1787 dp = (mlib_d64 *) ((mlib_addr) da & (~7));
1788 doff = ((mlib_addr) da & 7);
1789 dend = da + dsize - 1;
1790 dend2 = dend - 3;
1791
1792 /* calculate the src's offset over dst */
1793 if (cmask == 2) {
1794 off = (soff / 4) * 2 - doff;
1795 }
1796 else {
1797 off = ((soff + 3) / 4) * 2 - doff;
1798 }
1799
1800 if (((cmask == 2) && (soff % 4 == 0)) || ((cmask == 1) && (soff % 4 != 0))) { /* extract even words */
1801
1802 if (off == 0) { /* src and dst have same alignment */
1803
1804 /* generate edge mask for the start point */
1805 emask = vis_edge16(da, dend);
1806
1807 /* load 16 bytes */
1808 sd0 = *sp++;
1809 sd1 = *sp++;
1810
1811 /* extract, including some garbage at the start point */
1812 CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
1813
1814 /* store 8 bytes result */
1815 vis_pst_16(dd0, dp++, emask);
1816
1817 if ((mlib_addr) dp <= (mlib_addr) dend2) {
1818 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1819
1820 /* 8-pixel column loop, emask not needed */
1821 #pragma pipeloop(0)
1822 for (i = 0; i < n; i++) {
1823 sd0 = *sp++;
1824 sd1 = *sp++;
1825 CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
1826 *dp++ = dd0;
1827 }
1828 }
1829
1830 /* end point handling */
1831 if ((mlib_addr) dp <= (mlib_addr) dend) {
1832 emask = vis_edge16(dp, dend);
1833 sd0 = *sp++;
1834 sd1 = *sp++;
1835 CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
1836 vis_pst_16(dd0, dp++, emask);
1837 }
1838 }
1839 else {
1840 vis_alignaddr((void *)0, off);
1841
1842 /* generate edge mask for the start point */
1843 emask = vis_edge16(da, dend);
1844
1845 if (off < 0) {
1846 /* load 16 bytes */
1847 sd2 = *sp++;
1848 sd3 = *sp++;
1849
1850 /* extract and store 8 bytes */
1851 CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
1852 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
1853 }
1854 else {
1855 /* load 32 bytes */
1856 sd0 = *sp++;
1857 sd1 = *sp++;
1858 sd2 = *sp++;
1859 sd3 = *sp++;
1860
1861 /* extract and store 8 bytes */
1862 CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
1863 CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
1864 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
1865 }
1866
1867 if ((mlib_addr) dp <= (mlib_addr) dend2) {
1868 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1869
1870 /* 8-pixel column loop, emask not needed */
1871 #pragma pipeloop(0)
1872 for (i = 0; i < n; i++) {
1873 dd0 = dd1;
1874 sd2 = *sp++;
1875 sd3 = *sp++;
1876 CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
1877 *dp++ = vis_faligndata(dd0, dd1);
1878 }
1879 }
1880
1881 /* end point handling */
1882 if ((mlib_addr) dp <= (mlib_addr) dend) {
1883 emask = vis_edge16(dp, dend);
1884 dd0 = dd1;
1885 sd2 = *sp++;
1886 sd3 = *sp++;
1887 CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
1888 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
1889 }
1890 }
1891 }
1892 else { /* extract odd words */
1893
1894 if (off == 0) { /* src and dst have same alignment */
1895
1896 /* generate edge mask for the start point */
1897 emask = vis_edge16(da, dend);
1898
1899 /* load 16 bytes, don't care the garbage at the start point */
1900 sd0 = *sp++;
1901 sd1 = *sp++;
1902
1903 /* extract and store 8 bytes */
1904 CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
1905 vis_pst_16(dd0, dp++, emask);
1906
1907 if ((mlib_addr) dp <= (mlib_addr) dend2) {
1908 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1909
1910 /* 8-pixel column loop, emask not needed */
1911 #pragma pipeloop(0)
1912 for (i = 0; i < n; i++) {
1913 sd0 = *sp++;
1914 sd1 = *sp++;
1915 CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
1916 *dp++ = dd0;
1917 }
1918 }
1919
1920 /* end point handling */
1921 if ((mlib_addr) dp <= (mlib_addr) dend) {
1922 emask = vis_edge16(dp, dend);
1923 sd0 = *sp++;
1924 sd1 = *sp++;
1925 CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
1926 vis_pst_16(dd0, dp++, emask);
1927 }
1928 }
1929 else {
1930 vis_alignaddr((void *)0, off);
1931
1932 /* generate edge mask for the start point */
1933 emask = vis_edge16(da, dend);
1934
1935 if (off < 0) {
1936 /* load 16 bytes */
1937 sd2 = *sp++;
1938 sd3 = *sp++;
1939
1940 /* extract and store 8 bytes */
1941 CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
1942 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
1943 }
1944 else {
1945 /* load 32 bytes */
1946 sd0 = *sp++;
1947 sd1 = *sp++;
1948 sd2 = *sp++;
1949 sd3 = *sp++;
1950
1951 /* extract and store 8 bytes */
1952 CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
1953 CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
1954 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
1955 }
1956
1957 if ((mlib_addr) dp <= (mlib_addr) dend2) {
1958 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
1959
1960 /* 8-pixel column loop, emask not needed */
1961 #pragma pipeloop(0)
1962 for (i = 0; i < n; i++) {
1963 dd0 = dd1;
1964 sd2 = *sp++;
1965 sd3 = *sp++;
1966 CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
1967 *dp++ = vis_faligndata(dd0, dd1);
1968 }
1969 }
1970
1971 /* end point handling */
1972 if ((mlib_addr) dp <= (mlib_addr) dend) {
1973 emask = vis_edge16(dp, dend);
1974 dd0 = dd1;
1975 sd2 = *sp++;
1976 sd3 = *sp++;
1977 CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
1978 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
1979 }
1980 }
1981 }
1982 }
1983
1984 /***************************************************************/
1985 void mlib_v_ImageChannelExtract_S16_21(const mlib_s16 *src,
1986 mlib_s32 slb,
1987 mlib_s16 *dst,
1988 mlib_s32 dlb,
1989 mlib_s32 xsize,
1990 mlib_s32 ysize,
1991 mlib_s32 cmask)
1992 {
1993 mlib_s16 *sa, *da;
1994 mlib_s16 *sl, *dl;
1995 mlib_s32 j;
1996
1997 sa = sl = (void *)src;
1998 da = dl = dst;
1999
2000 for (j = 0; j < ysize; j++) {
2001 mlib_v_ImageChannelExtract_S16_21_D1(sa, da, xsize, cmask);
2002 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
2003 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
2004 }
2005 }
2006
2007 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd)
 *
 * Extracts the left channel of four 3-channel S16 pixels.  sd0..sd2
 * are three consecutive 8-byte source words; with the usual VIS
 * vis_fpmerge byte interleave, dd receives the 16-bit elements at byte
 * offsets 0, 6, 12 and 18 of the 24-byte sequence sd0:sd1:sd2.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb and sdc in
 * scope.  The body is wrapped in do { } while (0) so that the
 * expansion is a single statement and stays correct under an unbraced
 * if/else or loop.
 */
#define CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd)               \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));      \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb));      \
    dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));       \
  } while (0)
2014
2015 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd)
 *
 * Extracts the middle channel of four 3-channel S16 pixels.  sd0..sd2
 * are three consecutive 8-byte source words; with the usual VIS
 * vis_fpmerge byte interleave, dd receives the 16-bit elements at byte
 * offsets 2, 8, 14 and 20 of the 24-byte sequence sd0:sd1:sd2.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb and sdc in
 * scope.  The body is wrapped in do { } while (0) so that the
 * expansion is a single statement and stays correct under an unbraced
 * if/else or loop.
 */
#define CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd)               \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));      \
    sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));      \
    sdc = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdb));      \
    dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));       \
  } while (0)
2022
2023 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd)
 *
 * Extracts the right channel of four 3-channel S16 pixels.  sd0..sd2
 * are three consecutive 8-byte source words; with the usual VIS
 * vis_fpmerge byte interleave, dd receives the 16-bit elements at byte
 * offsets 4, 10, 16 and 22 of the 24-byte sequence sd0:sd1:sd2.
 *
 * The caller must have mlib_d64 temporaries named sda, sdb and sdc in
 * scope.  The body is wrapped in do { } while (0) so that the
 * expansion is a single statement and stays correct under an unbraced
 * if/else or loop.
 */
#define CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd)               \
  do {                                                          \
    sda = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));      \
    sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));      \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb));      \
    dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));       \
  } while (0)
2030
2031 /***************************************************************/
2032 void mlib_v_ImageChannelExtract_S16_31_A8D1X4(const mlib_s16 *src,
2033 mlib_s16 *dst,
2034 mlib_s32 dsize,
2035 mlib_s32 cmask)
2036 {
2037 mlib_d64 *sp, *dp;
2038 mlib_d64 sd0, sd1, sd2;
2039 mlib_d64 sda, sdb, sdc;
2040 mlib_d64 dd;
2041 mlib_s32 i;
2042
2043 sp = (mlib_d64 *) src;
2044 dp = (mlib_d64 *) dst;
2045
2046 if (cmask == 4) {
2047 #pragma pipeloop(0)
2048 for (i = 0; i < dsize / 4; i++) {
2049 sd0 = *sp++;
2050 sd1 = *sp++;
2051 sd2 = *sp++;
2052 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd);
2053 *dp++ = dd;
2054 }
2055 }
2056 else if (cmask == 2) {
2057 #pragma pipeloop(0)
2058 for (i = 0; i < dsize / 4; i++) {
2059 sd0 = *sp++;
2060 sd1 = *sp++;
2061 sd2 = *sp++;
2062 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd);
2063 *dp++ = dd;
2064 }
2065 }
2066 else {
2067 #pragma pipeloop(0)
2068 for (i = 0; i < dsize / 4; i++) {
2069 sd0 = *sp++;
2070 sd1 = *sp++;
2071 sd2 = *sp++;
2072 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd);
2073 *dp++ = dd;
2074 }
2075 }
2076 }
2077
2078 /***************************************************************/
2079 void mlib_v_ImageChannelExtract_S16_31_A8D2X4(const mlib_s16 *src,
2080 mlib_s32 slb,
2081 mlib_s16 *dst,
2082 mlib_s32 dlb,
2083 mlib_s32 xsize,
2084 mlib_s32 ysize,
2085 mlib_s32 cmask)
2086 {
2087 mlib_d64 *sp, *dp;
2088 mlib_d64 *sl, *dl;
2089 mlib_d64 sd0, sd1, sd2;
2090 mlib_d64 sda, sdb, sdc;
2091 mlib_d64 dd;
2092 mlib_s32 i, j;
2093
2094 sp = sl = (mlib_d64 *) src;
2095 dp = dl = (mlib_d64 *) dst;
2096
2097 if (cmask == 4) {
2098 for (j = 0; j < ysize; j++) {
2099 #pragma pipeloop(0)
2100 for (i = 0; i < xsize / 4; i++) {
2101 sd0 = *sp++;
2102 sd1 = *sp++;
2103 sd2 = *sp++;
2104 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd);
2105 *dp++ = dd;
2106 }
2107
2108 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2109 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2110 }
2111 }
2112 else if (cmask == 2) {
2113 for (j = 0; j < ysize; j++) {
2114 #pragma pipeloop(0)
2115 for (i = 0; i < xsize / 4; i++) {
2116 sd0 = *sp++;
2117 sd1 = *sp++;
2118 sd2 = *sp++;
2119 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd);
2120 *dp++ = dd;
2121 }
2122
2123 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2124 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2125 }
2126 }
2127 else {
2128 for (j = 0; j < ysize; j++) {
2129 #pragma pipeloop(0)
2130 for (i = 0; i < xsize / 4; i++) {
2131 sd0 = *sp++;
2132 sd1 = *sp++;
2133 sd2 = *sp++;
2134 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd);
2135 *dp++ = dd;
2136 }
2137
2138 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2139 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2140 }
2141 }
2142 }
2143
2144 /***************************************************************/
/*
 * mlib_v_ImageChannelExtract_S16_31_D1
 *
 * Copies one channel of a run of 3-channel S16 pixels starting at src
 * into dsize consecutive S16 samples starting at dst.  Neither pointer
 * needs to be 8-byte aligned: both are rounded down to an 8-byte
 * boundary, data is processed in whole mlib_d64 words, and the first and
 * last destination words are written with partial stores under an edge
 * mask.
 *
 *   src    source pixels (three interleaved S16 channels)
 *   dst    destination samples
 *   dsize  number of samples to write
 *   cmask  channel selector; 4, 2 and 1 are the values tested here
 *          (with src at a pixel start they pick the 1st, 2nd and 3rd
 *          sample of each pixel respectively)
 *
 * The L/M/R extraction macro is chosen from cmask together with soff,
 * the byte offset of src inside its aligned word, because the macros
 * operate on the aligned words rather than on src itself.
 *
 * NOTE(review): source words are always loaded three at a time from the
 * aligned-down address, so bytes just outside the source run may be
 * read (they are not stored) - confirm callers tolerate this.
 * NOTE(review): soff and doff are assumed to be even (S16-aligned
 * pointers) - confirm.
 */
2145 void mlib_v_ImageChannelExtract_S16_31_D1(const mlib_s16 *src,
2146 mlib_s16 *dst,
2147 mlib_s32 dsize,
2148 mlib_s32 cmask)
2149 {
2150 mlib_s16 *sa, *da;
2151 mlib_s16 *dend, *dend2; /* dend: last sample to write; dend2: dend - 3 */
2152 mlib_d64 *dp; /* 8-byte aligned start points in dst */
2153 mlib_d64 *sp; /* 8-byte aligned start point in src */
2154 mlib_d64 sd0, sd1, sd2; /* 8-byte source data */
2155 mlib_d64 sd3, sd4, sd5;
2156 mlib_d64 sda, sdb, sdc; /* scratch used inside the CHANNELEXTRACT macros */
2157 mlib_d64 dd0, dd1; /* previous / current word of extracted samples */
2158 mlib_s32 soff; /* offset of address in src */
2159 mlib_s32 doff; /* offset of address in dst */
2160 mlib_s32 off; /* offset of src over dst */
2161 mlib_s32 emask; /* edge mask */
2162 mlib_s32 i, n;
2163
2164 sa = (void *)src;
2165 da = dst;
2166
2167 /* prepare the source address */
2168 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
2169 soff = ((mlib_addr) sa & 7);
2170
2171 /* prepare the destination addresses */
2172 dp = (mlib_d64 *) ((mlib_addr) da & (~7));
2173 doff = ((mlib_addr) da & 7);
2174 dend = da + dsize - 1;
2175 dend2 = dend - 3;
2176
2177 /* calculate the src's offset over dst: byte position of the first */
/* wanted sample inside the first extracted word, minus doff */
2178 if (cmask == 4) {
2179 off = (soff / 6) * 2 - doff;
2180 }
2181 else if (cmask == 2) {
2182 off = ((soff + 2) / 6) * 2 - doff;
2183 }
2184 else {
2185 off = ((soff + 4) / 6) * 2 - doff;
2186 }
2187
/* wanted samples sit at positions 0, 3, 6, ... of the aligned words */
2188 if (((cmask == 4) && (soff % 6 == 0)) ||
2189 ((cmask == 2) && (soff % 6 == 4)) ||
2190 ((cmask == 1) && (soff % 6 == 2))) { /* extract left channel */
2191
2192 if (off == 0) { /* src and dst have same alignment */
2193
2194 /* generate edge mask for the start point */
2195 emask = vis_edge16(da, dend);
2196
2197 /* load 24 bytes (three 8-byte words) */
2198 sd0 = *sp++;
2199 sd1 = *sp++;
2200 sd2 = *sp++;
2201
2202 /* extract, including some garbage at the start point */
2203 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
2204
2205 /* store 8 bytes result */
2206 vis_pst_16(dd0, dp++, emask);
2207
2208 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2209 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2210
2211 /* whole-word (4-sample) stores, emask not needed */
2212 #pragma pipeloop(0)
2213 for (i = 0; i < n; i++) {
2214 sd0 = *sp++;
2215 sd1 = *sp++;
2216 sd2 = *sp++;
2217 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
2218 *dp++ = dd0;
2219 }
2220 }
2221
2222 /* end point handling */
2223 if ((mlib_addr) dp <= (mlib_addr) dend) {
2224 emask = vis_edge16(dp, dend);
2225 sd0 = *sp++;
2226 sd1 = *sp++;
2227 sd2 = *sp++;
2228 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
2229 vis_pst_16(dd0, dp++, emask);
2230 }
2231 }
2232 else {
/* set the byte shift applied by the vis_faligndata calls below */
2233 vis_alignaddr((void *)0, off);
2234
2235 /* generate edge mask for the start point */
2236 emask = vis_edge16(da, dend);
2237
2238 if (off < 0) {
2239 /* load 24 bytes */
2240 sd3 = *sp++;
2241 sd4 = *sp++;
2242 sd5 = *sp++;
2243
2244 /* extract and store 8 bytes; the leading lanes taken from the */
/* duplicated dd1 are presumably hidden by the start emask */
2245 CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
2246 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
2247 }
2248 else {
2249 /* load 48 bytes */
2250 sd0 = *sp++;
2251 sd1 = *sp++;
2252 sd2 = *sp++;
2253 sd3 = *sp++;
2254 sd4 = *sp++;
2255 sd5 = *sp++;
2256
2257 /* extract and store 8 bytes */
2258 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
2259 CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
2260 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2261 }
2262
2263 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2264 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2265
2266 /* whole-word (4-sample) stores, emask not needed */
2267 #pragma pipeloop(0)
2268 for (i = 0; i < n; i++) {
2269 dd0 = dd1;
2270 sd3 = *sp++;
2271 sd4 = *sp++;
2272 sd5 = *sp++;
2273 CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
2274 *dp++ = vis_faligndata(dd0, dd1);
2275 }
2276 }
2277
2278 /* end point handling */
2279 if ((mlib_addr) dp <= (mlib_addr) dend) {
2280 emask = vis_edge16(dp, dend);
2281 dd0 = dd1;
2282 sd3 = *sp++;
2283 sd4 = *sp++;
2284 sd5 = *sp++;
2285 CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
2286 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2287 }
2288 }
2289 }
/* wanted samples sit at positions 1, 4, 7, ... of the aligned words */
2290 else if (((cmask == 4) && (soff % 6 == 2)) ||
2291 ((cmask == 2) && (soff % 6 == 0)) ||
2292 ((cmask == 1) && (soff % 6 == 4))) {
2293 /* extract middle channel */
2294
2295 if (off == 0) { /* src and dst have same alignment */
2296
2297 /* generate edge mask for the start point */
2298 emask = vis_edge16(da, dend);
2299
2300 /* load 24 bytes (three 8-byte words) */
2301 sd0 = *sp++;
2302 sd1 = *sp++;
2303 sd2 = *sp++;
2304
2305 /* extract, including some garbage at the start point */
2306 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
2307
2308 /* store 8 bytes result */
2309 vis_pst_16(dd0, dp++, emask);
2310
2311 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2312 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2313
2314 /* whole-word (4-sample) stores, emask not needed */
2315 #pragma pipeloop(0)
2316 for (i = 0; i < n; i++) {
2317 sd0 = *sp++;
2318 sd1 = *sp++;
2319 sd2 = *sp++;
2320 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
2321 *dp++ = dd0;
2322 }
2323 }
2324
2325 /* end point handling */
2326 if ((mlib_addr) dp <= (mlib_addr) dend) {
2327 emask = vis_edge16(dp, dend);
2328 sd0 = *sp++;
2329 sd1 = *sp++;
2330 sd2 = *sp++;
2331 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
2332 vis_pst_16(dd0, dp++, emask);
2333 }
2334 }
2335 else {
/* set the byte shift applied by the vis_faligndata calls below */
2336 vis_alignaddr((void *)0, off);
2337
2338 /* generate edge mask for the start point */
2339 emask = vis_edge16(da, dend);
2340
2341 if (off < 0) {
2342 /* load 24 bytes */
2343 sd3 = *sp++;
2344 sd4 = *sp++;
2345 sd5 = *sp++;
2346
2347 /* extract and store 8 bytes */
2348 CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
2349 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
2350 }
2351 else {
2352 /* load 48 bytes */
2353 sd0 = *sp++;
2354 sd1 = *sp++;
2355 sd2 = *sp++;
2356 sd3 = *sp++;
2357 sd4 = *sp++;
2358 sd5 = *sp++;
2359
2360 /* extract and store 8 bytes */
2361 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
2362 CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
2363 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2364 }
2365
2366 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2367 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2368
2369 /* whole-word (4-sample) stores, emask not needed */
2370 #pragma pipeloop(0)
2371 for (i = 0; i < n; i++) {
2372 dd0 = dd1;
2373 sd3 = *sp++;
2374 sd4 = *sp++;
2375 sd5 = *sp++;
2376 CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
2377 *dp++ = vis_faligndata(dd0, dd1);
2378 }
2379 }
2380
2381 /* end point handling */
2382 if ((mlib_addr) dp <= (mlib_addr) dend) {
2383 emask = vis_edge16(dp, dend);
2384 dd0 = dd1;
2385 sd3 = *sp++;
2386 sd4 = *sp++;
2387 sd5 = *sp++;
2388 CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
2389 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2390 }
2391 }
2392 }
/* remaining case: positions 2, 5, 8, ... of the aligned words */
2393 else { /* extract right channel */
2394
2395 if (off == 0) { /* src and dst have same alignment */
2396
2397 /* generate edge mask for the start point */
2398 emask = vis_edge16(da, dend);
2399
2400 /* load 24 bytes (three 8-byte words) */
2401 sd0 = *sp++;
2402 sd1 = *sp++;
2403 sd2 = *sp++;
2404
2405 /* extract, including some garbage at the start point */
2406 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
2407
2408 /* store 8 bytes result */
2409 vis_pst_16(dd0, dp++, emask);
2410
2411 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2412 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2413
2414 /* whole-word (4-sample) stores, emask not needed */
2415 #pragma pipeloop(0)
2416 for (i = 0; i < n; i++) {
2417 sd0 = *sp++;
2418 sd1 = *sp++;
2419 sd2 = *sp++;
2420 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
2421 *dp++ = dd0;
2422 }
2423 }
2424
2425 /* end point handling */
2426 if ((mlib_addr) dp <= (mlib_addr) dend) {
2427 emask = vis_edge16(dp, dend);
2428 sd0 = *sp++;
2429 sd1 = *sp++;
2430 sd2 = *sp++;
2431 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
2432 vis_pst_16(dd0, dp++, emask);
2433 }
2434 }
2435 else {
/* set the byte shift applied by the vis_faligndata calls below */
2436 vis_alignaddr((void *)0, off);
2437
2438 /* generate edge mask for the start point */
2439 emask = vis_edge16(da, dend);
2440
2441 if (off < 0) {
2442 /* load 24 bytes */
2443 sd3 = *sp++;
2444 sd4 = *sp++;
2445 sd5 = *sp++;
2446
2447 /* extract and store 8 bytes */
2448 CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
2449 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
2450 }
2451 else {
2452 /* load 48 bytes */
2453 sd0 = *sp++;
2454 sd1 = *sp++;
2455 sd2 = *sp++;
2456 sd3 = *sp++;
2457 sd4 = *sp++;
2458 sd5 = *sp++;
2459
2460 /* extract and store 8 bytes */
2461 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
2462 CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
2463 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2464 }
2465
2466 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2467 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2468
2469 /* whole-word (4-sample) stores, emask not needed */
2470 #pragma pipeloop(0)
2471 for (i = 0; i < n; i++) {
2472 dd0 = dd1;
2473 sd3 = *sp++;
2474 sd4 = *sp++;
2475 sd5 = *sp++;
2476 CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
2477 *dp++ = vis_faligndata(dd0, dd1);
2478 }
2479 }
2480
2481 /* end point handling */
2482 if ((mlib_addr) dp <= (mlib_addr) dend) {
2483 emask = vis_edge16(dp, dend);
2484 dd0 = dd1;
2485 sd3 = *sp++;
2486 sd4 = *sp++;
2487 sd5 = *sp++;
2488 CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
2489 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2490 }
2491 }
2492 }
2493 }
2494
2495 /***************************************************************/
2496 void mlib_v_ImageChannelExtract_S16_31(const mlib_s16 *src,
2497 mlib_s32 slb,
2498 mlib_s16 *dst,
2499 mlib_s32 dlb,
2500 mlib_s32 xsize,
2501 mlib_s32 ysize,
2502 mlib_s32 cmask)
2503 {
2504 mlib_s16 *sa, *da;
2505 mlib_s16 *sl, *dl;
2506 mlib_s32 j;
2507
2508 sa = sl = (void *)src;
2509 da = dl = dst;
2510
2511 for (j = 0; j < ysize; j++) {
2512 mlib_v_ImageChannelExtract_S16_31_D1(sa, da, xsize, cmask);
2513 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
2514 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
2515 }
2516 }
2517
2518 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd)
 *
 * Takes 32 bytes of 4-channel S16 data in the four 8-byte words
 * sd0..sd3 and stores into dd the first 16-bit sample of each word
 * (the "left" channel of four pixels), assuming vis_fpmerge
 * byte-interleaves its two 32-bit operands.
 *
 * The caller must have mlib_d64 variables sda, sdb and sdc in scope;
 * they are used as scratch.  The body is wrapped in do { } while (0)
 * so the macro expands to exactly one statement.
 */
#define CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd)                  \
  do {                                                                  \
    /* extract the left channel */                                      \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2));              \
    sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3));              \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb));              \
    (dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));             \
  } while (0)
2525
2526 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd)
 *
 * Takes 32 bytes of 4-channel S16 data in the four 8-byte words
 * sd0..sd3 and stores into dd the second 16-bit sample of each word
 * (the "middle left" channel of four pixels), assuming vis_fpmerge
 * byte-interleaves its two 32-bit operands.
 *
 * The caller must have mlib_d64 variables sda, sdb and sdc in scope;
 * they are used as scratch.  The body is wrapped in do { } while (0)
 * so the macro expands to exactly one statement.
 */
#define CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd)                 \
  do {                                                                  \
    /* extract the middle left channel */                               \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2));              \
    sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3));              \
    sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb));              \
    (dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));             \
  } while (0)
2533
2534 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd)
 *
 * Takes 32 bytes of 4-channel S16 data in the four 8-byte words
 * sd0..sd3 and stores into dd the third 16-bit sample of each word
 * (the "middle right" channel of four pixels), assuming vis_fpmerge
 * byte-interleaves its two 32-bit operands.
 *
 * The caller must have mlib_d64 variables sda, sdb and sdc in scope;
 * they are used as scratch.  The body is wrapped in do { } while (0)
 * so the macro expands to exactly one statement.
 */
#define CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd)                 \
  do {                                                                  \
    /* extract the middle right channel */                              \
    sda = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2));              \
    sdb = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3));              \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb));              \
    (dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));             \
  } while (0)
2541
2542 /***************************************************************/
/*
 * CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd)
 *
 * Takes 32 bytes of 4-channel S16 data in the four 8-byte words
 * sd0..sd3 and stores into dd the fourth 16-bit sample of each word
 * (the "right" channel of four pixels), assuming vis_fpmerge
 * byte-interleaves its two 32-bit operands.
 *
 * The caller must have mlib_d64 variables sda, sdb and sdc in scope;
 * they are used as scratch.  The body is wrapped in do { } while (0)
 * so the macro expands to exactly one statement.
 */
#define CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd)                  \
  do {                                                                  \
    /* extract the right channel */                                     \
    sda = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2));              \
    sdb = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3));              \
    sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb));              \
    (dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));             \
  } while (0)
2549
2550 /***************************************************************/
2551 void mlib_v_ImageChannelExtract_S16_41_A8D1X4(const mlib_s16 *src,
2552 mlib_s16 *dst,
2553 mlib_s32 dsize,
2554 mlib_s32 cmask)
2555 {
2556 mlib_d64 *sp, *dp;
2557 mlib_d64 sd0, sd1, sd2, sd3;
2558 mlib_d64 sda, sdb, sdc;
2559 mlib_d64 dd;
2560 mlib_s32 i;
2561
2562 sp = (mlib_d64 *) src;
2563 dp = (mlib_d64 *) dst;
2564
2565 if (cmask == 8) {
2566 #pragma pipeloop(0)
2567 for (i = 0; i < dsize / 4; i++) {
2568 sd0 = *sp++;
2569 sd1 = *sp++;
2570 sd2 = *sp++;
2571 sd3 = *sp++;
2572 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd);
2573 *dp++ = dd;
2574 }
2575 }
2576 else if (cmask == 4) {
2577 #pragma pipeloop(0)
2578 for (i = 0; i < dsize / 4; i++) {
2579 sd0 = *sp++;
2580 sd1 = *sp++;
2581 sd2 = *sp++;
2582 sd3 = *sp++;
2583 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd);
2584 *dp++ = dd;
2585 }
2586 }
2587 else if (cmask == 2) {
2588 #pragma pipeloop(0)
2589 for (i = 0; i < dsize / 4; i++) {
2590 sd0 = *sp++;
2591 sd1 = *sp++;
2592 sd2 = *sp++;
2593 sd3 = *sp++;
2594 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd);
2595 *dp++ = dd;
2596 }
2597 }
2598 else {
2599 #pragma pipeloop(0)
2600 for (i = 0; i < dsize / 4; i++) {
2601 sd0 = *sp++;
2602 sd1 = *sp++;
2603 sd2 = *sp++;
2604 sd3 = *sp++;
2605 CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd);
2606 *dp++ = dd;
2607 }
2608 }
2609 }
2610
2611 /***************************************************************/
2612 void mlib_v_ImageChannelExtract_S16_41_A8D2X4(const mlib_s16 *src,
2613 mlib_s32 slb,
2614 mlib_s16 *dst,
2615 mlib_s32 dlb,
2616 mlib_s32 xsize,
2617 mlib_s32 ysize,
2618 mlib_s32 cmask)
2619 {
2620 mlib_d64 *sp, *dp;
2621 mlib_d64 *sl, *dl;
2622 mlib_d64 sd0, sd1, sd2, sd3;
2623 mlib_d64 sda, sdb, sdc;
2624 mlib_d64 dd;
2625 mlib_s32 i, j;
2626
2627 sp = sl = (mlib_d64 *) src;
2628 dp = dl = (mlib_d64 *) dst;
2629
2630 if (cmask == 8) {
2631 for (j = 0; j < ysize; j++) {
2632 #pragma pipeloop(0)
2633 for (i = 0; i < xsize / 4; i++) {
2634 sd0 = *sp++;
2635 sd1 = *sp++;
2636 sd2 = *sp++;
2637 sd3 = *sp++;
2638 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd);
2639 *dp++ = dd;
2640 }
2641
2642 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2643 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2644 }
2645 }
2646 else if (cmask == 4) {
2647 for (j = 0; j < ysize; j++) {
2648 #pragma pipeloop(0)
2649 for (i = 0; i < xsize / 4; i++) {
2650 sd0 = *sp++;
2651 sd1 = *sp++;
2652 sd2 = *sp++;
2653 sd3 = *sp++;
2654 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd);
2655 *dp++ = dd;
2656 }
2657
2658 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2659 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2660 }
2661 }
2662 else if (cmask == 2) {
2663 for (j = 0; j < ysize; j++) {
2664 #pragma pipeloop(0)
2665 for (i = 0; i < xsize / 4; i++) {
2666 sd0 = *sp++;
2667 sd1 = *sp++;
2668 sd2 = *sp++;
2669 sd3 = *sp++;
2670 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd);
2671 *dp++ = dd;
2672 }
2673
2674 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2675 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2676 }
2677 }
2678 else {
2679 for (j = 0; j < ysize; j++) {
2680 #pragma pipeloop(0)
2681 for (i = 0; i < xsize / 4; i++) {
2682 sd0 = *sp++;
2683 sd1 = *sp++;
2684 sd2 = *sp++;
2685 sd3 = *sp++;
2686 CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd);
2687 *dp++ = dd;
2688 }
2689
2690 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
2691 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
2692 }
2693 }
2694 }
2695
2696 /***************************************************************/
/*
 * mlib_v_ImageChannelExtract_S16_41_D1
 *
 * Copies one channel of a run of 4-channel S16 pixels starting at src
 * into dsize consecutive S16 samples starting at dst.  Neither pointer
 * needs to be 8-byte aligned: both are rounded down to an 8-byte
 * boundary, data is processed in whole mlib_d64 words, and the first and
 * last destination words are written with partial stores under an edge
 * mask.
 *
 *   src    source pixels (four interleaved S16 channels)
 *   dst    destination samples
 *   dsize  number of samples to write
 *   cmask  channel selector; 8, 4, 2 and 1 are the values tested here
 *          (with src at a pixel start they pick the 1st, 2nd, 3rd and
 *          4th sample of each pixel respectively)
 *
 * The L/ML/MR/R extraction macro is chosen from cmask together with
 * soff, the byte offset of src inside its aligned word, because the
 * macros operate on the aligned words rather than on src itself.
 *
 * NOTE(review): source words are always loaded four at a time from the
 * aligned-down address, so bytes just outside the source run may be
 * read (they are not stored) - confirm callers tolerate this.
 * NOTE(review): soff and doff are assumed to be even (S16-aligned
 * pointers) - confirm.
 */
2697 void mlib_v_ImageChannelExtract_S16_41_D1(const mlib_s16 *src,
2698 mlib_s16 *dst,
2699 mlib_s32 dsize,
2700 mlib_s32 cmask)
2701 {
2702 mlib_s16 *sa, *da;
2703 mlib_s16 *dend, *dend2; /* dend: last sample to write; dend2: dend - 3 */
2704 mlib_d64 *dp; /* 8-byte aligned start points in dst */
2705 mlib_d64 *sp; /* 8-byte aligned start point in src */
2706 mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */
2707 mlib_d64 sd4, sd5, sd6, sd7;
2708 mlib_d64 sda, sdb, sdc; /* scratch used inside the CHANNELEXTRACT macros */
2709 mlib_d64 dd0, dd1; /* previous / current word of extracted samples */
2710 mlib_s32 soff; /* offset of address in src */
2711 mlib_s32 doff; /* offset of address in dst */
2712 mlib_s32 off; /* offset of src over dst */
2713 mlib_s32 emask; /* edge mask */
2714 mlib_s32 i, n;
2715
2716 sa = (void *)src;
2717 da = dst;
2718
2719 /* prepare the source address */
2720 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
2721 soff = ((mlib_addr) sa & 7);
2722
2723 /* prepare the destination addresses */
2724 dp = (mlib_d64 *) ((mlib_addr) da & (~7));
2725 doff = ((mlib_addr) da & 7);
2726 dend = da + dsize - 1;
2727 dend2 = dend - 3;
2728
2729 /* calculate the src's offset over dst: byte position of the first */
/* wanted sample inside the first extracted word, minus doff */
2730 if (cmask == 8) {
2731 off = (soff / 8) * 2 - doff;
2732 }
2733 else if (cmask == 4) {
2734 off = ((soff + 2) / 8) * 2 - doff;
2735 }
2736 else if (cmask == 2) {
2737 off = ((soff + 4) / 8) * 2 - doff;
2738 }
2739 else {
2740 off = ((soff + 6) / 8) * 2 - doff;
2741 }
2742
/* wanted samples are the 1st sample of each aligned word */
2743 if (((cmask == 8) && (soff == 0)) ||
2744 ((cmask == 4) && (soff == 6)) ||
2745 ((cmask == 2) && (soff == 4)) ||
2746 ((cmask == 1) && (soff == 2))) { /* extract left channel */
2747
2748 if (off == 0) { /* src and dst have same alignment */
2749
2750 /* generate edge mask for the start point */
2751 emask = vis_edge16(da, dend);
2752
2753 /* load 32 bytes (four 8-byte words) */
2754 sd0 = *sp++;
2755 sd1 = *sp++;
2756 sd2 = *sp++;
2757 sd3 = *sp++;
2758
2759 /* extract, including some garbage at the start point */
2760 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
2761
2762 /* store 8 bytes result */
2763 vis_pst_16(dd0, dp++, emask);
2764
2765 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2766 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2767
2768 /* whole-word (4-sample) stores, emask not needed */
2769 #pragma pipeloop(0)
2770 for (i = 0; i < n; i++) {
2771 sd0 = *sp++;
2772 sd1 = *sp++;
2773 sd2 = *sp++;
2774 sd3 = *sp++;
2775 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
2776 *dp++ = dd0;
2777 }
2778 }
2779
2780 /* end point handling */
2781 if ((mlib_addr) dp <= (mlib_addr) dend) {
2782 emask = vis_edge16(dp, dend);
2783 sd0 = *sp++;
2784 sd1 = *sp++;
2785 sd2 = *sp++;
2786 sd3 = *sp++;
2787 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
2788 vis_pst_16(dd0, dp++, emask);
2789 }
2790 }
2791 else {
/* set the byte shift applied by the vis_faligndata calls below */
2792 vis_alignaddr((void *)0, off);
2793
2794 /* generate edge mask for the start point */
2795 emask = vis_edge16(da, dend);
2796
2797 if (off < 0) {
2798 /* load 32 bytes */
2799 sd4 = *sp++;
2800 sd5 = *sp++;
2801 sd6 = *sp++;
2802 sd7 = *sp++;
2803
2804 /* extract and store 8 bytes; the leading lanes taken from the */
/* duplicated dd1 are presumably hidden by the start emask */
2805 CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
2806 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
2807 }
2808 else {
2809 /* load 64 bytes (two groups of four words) */
2810 sd0 = *sp++;
2811 sd1 = *sp++;
2812 sd2 = *sp++;
2813 sd3 = *sp++;
2814 sd4 = *sp++;
2815 sd5 = *sp++;
2816 sd6 = *sp++;
2817 sd7 = *sp++;
2818
2819 /* extract and store 8 bytes */
2820 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
2821 CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
2822 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2823 }
2824
2825 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2826 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2827
2828 /* whole-word (4-sample) stores, emask not needed */
2829 #pragma pipeloop(0)
2830 for (i = 0; i < n; i++) {
2831 dd0 = dd1;
2832 sd4 = *sp++;
2833 sd5 = *sp++;
2834 sd6 = *sp++;
2835 sd7 = *sp++;
2836 CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
2837 *dp++ = vis_faligndata(dd0, dd1);
2838 }
2839 }
2840
2841 /* end point handling */
2842 if ((mlib_addr) dp <= (mlib_addr) dend) {
2843 emask = vis_edge16(dp, dend);
2844 dd0 = dd1;
2845 sd4 = *sp++;
2846 sd5 = *sp++;
2847 sd6 = *sp++;
2848 sd7 = *sp++;
2849 CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
2850 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2851 }
2852 }
2853 }
/* wanted samples are the 2nd sample of each aligned word */
2854 else if (((cmask == 8) && (soff == 2)) ||
2855 ((cmask == 4) && (soff == 0)) ||
2856 ((cmask == 2) && (soff == 6)) ||
2857 ((cmask == 1) && (soff == 4))) { /* extract middle left channel */
2858
2859 if (off == 0) { /* src and dst have same alignment */
2860
2861 /* generate edge mask for the start point */
2862 emask = vis_edge16(da, dend);
2863
2864 /* load 32 bytes (four 8-byte words) */
2865 sd0 = *sp++;
2866 sd1 = *sp++;
2867 sd2 = *sp++;
2868 sd3 = *sp++;
2869
2870 /* extract, including some garbage at the start point */
2871 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
2872
2873 /* store 8 bytes result */
2874 vis_pst_16(dd0, dp++, emask);
2875
2876 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2877 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2878
2879 /* whole-word (4-sample) stores, emask not needed */
2880 #pragma pipeloop(0)
2881 for (i = 0; i < n; i++) {
2882 sd0 = *sp++;
2883 sd1 = *sp++;
2884 sd2 = *sp++;
2885 sd3 = *sp++;
2886 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
2887 *dp++ = dd0;
2888 }
2889 }
2890
2891 /* end point handling */
2892 if ((mlib_addr) dp <= (mlib_addr) dend) {
2893 emask = vis_edge16(dp, dend);
2894 sd0 = *sp++;
2895 sd1 = *sp++;
2896 sd2 = *sp++;
2897 sd3 = *sp++;
2898 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
2899 vis_pst_16(dd0, dp++, emask);
2900 }
2901 }
2902 else {
/* set the byte shift applied by the vis_faligndata calls below */
2903 vis_alignaddr((void *)0, off);
2904
2905 /* generate edge mask for the start point */
2906 emask = vis_edge16(da, dend);
2907
2908 if (off < 0) {
2909 /* load 32 bytes */
2910 sd4 = *sp++;
2911 sd5 = *sp++;
2912 sd6 = *sp++;
2913 sd7 = *sp++;
2914
2915 /* extract and store 8 bytes */
2916 CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
2917 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
2918 }
2919 else {
2920 /* load 64 bytes (two groups of four words) */
2921 sd0 = *sp++;
2922 sd1 = *sp++;
2923 sd2 = *sp++;
2924 sd3 = *sp++;
2925 sd4 = *sp++;
2926 sd5 = *sp++;
2927 sd6 = *sp++;
2928 sd7 = *sp++;
2929
2930 /* extract and store 8 bytes */
2931 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
2932 CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
2933 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2934 }
2935
2936 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2937 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2938
2939 /* whole-word (4-sample) stores, emask not needed */
2940 #pragma pipeloop(0)
2941 for (i = 0; i < n; i++) {
2942 dd0 = dd1;
2943 sd4 = *sp++;
2944 sd5 = *sp++;
2945 sd6 = *sp++;
2946 sd7 = *sp++;
2947 CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
2948 *dp++ = vis_faligndata(dd0, dd1);
2949 }
2950 }
2951
2952 /* end point handling */
2953 if ((mlib_addr) dp <= (mlib_addr) dend) {
2954 emask = vis_edge16(dp, dend);
2955 dd0 = dd1;
2956 sd4 = *sp++;
2957 sd5 = *sp++;
2958 sd6 = *sp++;
2959 sd7 = *sp++;
2960 CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
2961 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
2962 }
2963 }
2964 }
/* wanted samples are the 3rd sample of each aligned word */
2965 else if (((cmask == 8) && (soff == 4)) ||
2966 ((cmask == 4) && (soff == 2)) ||
2967 ((cmask == 2) && (soff == 0)) ||
2968 ((cmask == 1) && (soff == 6))) { /* extract middle right channel */
2969
2970 if (off == 0) { /* src and dst have same alignment */
2971
2972 /* generate edge mask for the start point */
2973 emask = vis_edge16(da, dend);
2974
2975 /* load 32 bytes (four 8-byte words) */
2976 sd0 = *sp++;
2977 sd1 = *sp++;
2978 sd2 = *sp++;
2979 sd3 = *sp++;
2980
2981 /* extract, including some garbage at the start point */
2982 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
2983
2984 /* store 8 bytes result */
2985 vis_pst_16(dd0, dp++, emask);
2986
2987 if ((mlib_addr) dp <= (mlib_addr) dend2) {
2988 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
2989
2990 /* whole-word (4-sample) stores, emask not needed */
2991 #pragma pipeloop(0)
2992 for (i = 0; i < n; i++) {
2993 sd0 = *sp++;
2994 sd1 = *sp++;
2995 sd2 = *sp++;
2996 sd3 = *sp++;
2997 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
2998 *dp++ = dd0;
2999 }
3000 }
3001
3002 /* end point handling */
3003 if ((mlib_addr) dp <= (mlib_addr) dend) {
3004 emask = vis_edge16(dp, dend);
3005 sd0 = *sp++;
3006 sd1 = *sp++;
3007 sd2 = *sp++;
3008 sd3 = *sp++;
3009 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
3010 vis_pst_16(dd0, dp++, emask);
3011 }
3012 }
3013 else {
/* set the byte shift applied by the vis_faligndata calls below */
3014 vis_alignaddr((void *)0, off);
3015
3016 /* generate edge mask for the start point */
3017 emask = vis_edge16(da, dend);
3018
3019 if (off < 0) {
3020 /* load 32 bytes */
3021 sd4 = *sp++;
3022 sd5 = *sp++;
3023 sd6 = *sp++;
3024 sd7 = *sp++;
3025
3026 /* extract and store 8 bytes */
3027 CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
3028 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
3029 }
3030 else {
3031 /* load 64 bytes (two groups of four words) */
3032 sd0 = *sp++;
3033 sd1 = *sp++;
3034 sd2 = *sp++;
3035 sd3 = *sp++;
3036 sd4 = *sp++;
3037 sd5 = *sp++;
3038 sd6 = *sp++;
3039 sd7 = *sp++;
3040
3041 /* extract and store 8 bytes */
3042 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
3043 CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
3044 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
3045 }
3046
3047 if ((mlib_addr) dp <= (mlib_addr) dend2) {
3048 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
3049
3050 /* whole-word (4-sample) stores, emask not needed */
3051 #pragma pipeloop(0)
3052 for (i = 0; i < n; i++) {
3053 dd0 = dd1;
3054 sd4 = *sp++;
3055 sd5 = *sp++;
3056 sd6 = *sp++;
3057 sd7 = *sp++;
3058 CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
3059 *dp++ = vis_faligndata(dd0, dd1);
3060 }
3061 }
3062
3063 /* end point handling */
3064 if ((mlib_addr) dp <= (mlib_addr) dend) {
3065 emask = vis_edge16(dp, dend);
3066 dd0 = dd1;
3067 sd4 = *sp++;
3068 sd5 = *sp++;
3069 sd6 = *sp++;
3070 sd7 = *sp++;
3071 CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
3072 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
3073 }
3074 }
3075 }
/* remaining case: the 4th sample of each aligned word */
3076 else { /* extract right channel */
3077 if (off == 0) { /* src and dst have same alignment */
3078
3079 /* generate edge mask for the start point */
3080 emask = vis_edge16(da, dend);
3081
3082 /* load 32 bytes (four 8-byte words) */
3083 sd0 = *sp++;
3084 sd1 = *sp++;
3085 sd2 = *sp++;
3086 sd3 = *sp++;
3087
3088 /* extract, including some garbage at the start point */
3089 CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0);
3090
3091 /* store 8 bytes result */
3092 vis_pst_16(dd0, dp++, emask);
3093
3094 if ((mlib_addr) dp <= (mlib_addr) dend2) {
3095 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
3096
3097 /* whole-word (4-sample) stores, emask not needed */
3098 #pragma pipeloop(0)
3099 for (i = 0; i < n; i++) {
3100 sd0 = *sp++;
3101 sd1 = *sp++;
3102 sd2 = *sp++;
3103 sd3 = *sp++;
3104 CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0);
3105 *dp++ = dd0;
3106 }
3107 }
3108
3109 /* end point handling */
3110 if ((mlib_addr) dp <= (mlib_addr) dend) {
3111 emask = vis_edge16(dp, dend);
3112 sd0 = *sp++;
3113 sd1 = *sp++;
3114 sd2 = *sp++;
3115 sd3 = *sp++;
3116 CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0);
3117 vis_pst_16(dd0, dp++, emask);
3118 }
3119 }
3120 else {
/* set the byte shift applied by the vis_faligndata calls below */
3121 vis_alignaddr((void *)0, off);
3122
3123 /* generate edge mask for the start point */
3124 emask = vis_edge16(da, dend);
3125
/* Unlike the other branches there is no off > 0 path: for the */
/* (cmask, soff) pairs 8/6, 4/4, 2/2 and 1/0 that select this */
/* branch, the formulas above give off = -doff, and off == 0 was */
/* handled already.  NOTE(review): a cmask outside {8,4,2,1} could */
/* arrive here with off > 0 and dd1 unset - confirm callers */
/* validate cmask. */
3126 if (off < 0) {
3127 /* load 32 bytes */
3128 sd4 = *sp++;
3129 sd5 = *sp++;
3130 sd6 = *sp++;
3131 sd7 = *sp++;
3132
3133 /* extract and store 8 bytes */
3134 CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1);
3135 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
3136 }
3137
3138 if ((mlib_addr) dp <= (mlib_addr) dend2) {
3139 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
3140
3141 /* whole-word (4-sample) stores, emask not needed */
3142 #pragma pipeloop(0)
3143 for (i = 0; i < n; i++) {
3144 dd0 = dd1;
3145 sd4 = *sp++;
3146 sd5 = *sp++;
3147 sd6 = *sp++;
3148 sd7 = *sp++;
3149 CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1);
3150 *dp++ = vis_faligndata(dd0, dd1);
3151 }
3152 }
3153
3154 /* end point handling */
3155 if ((mlib_addr) dp <= (mlib_addr) dend) {
3156 emask = vis_edge16(dp, dend);
3157 dd0 = dd1;
3158 sd4 = *sp++;
3159 sd5 = *sp++;
3160 sd6 = *sp++;
3161 sd7 = *sp++;
3162 CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1);
3163 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
3164 }
3165 }
3166 }
3167 }
3168
3169 /***************************************************************/
3170 void mlib_v_ImageChannelExtract_S16_41(const mlib_s16 *src,
3171 mlib_s32 slb,
3172 mlib_s16 *dst,
3173 mlib_s32 dlb,
3174 mlib_s32 xsize,
3175 mlib_s32 ysize,
3176 mlib_s32 cmask)
3177 {
3178 mlib_s16 *sa, *da;
3179 mlib_s16 *sl, *dl;
3180 mlib_s32 j;
3181
3182 sa = sl = (void *)src;
3183 da = dl = dst;
3184 for (j = 0; j < ysize; j++) {
3185 mlib_v_ImageChannelExtract_S16_41_D1(sa, da, xsize, cmask);
3186 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
3187 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
3188 }
3189 }
3190
3191 /***************************************************************/
|
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
28 /*
29 * FILENAME: mlib_ImageChannelExtract_1.c
30 *
31 * FUNCTIONS
32 * mlib_v_ImageChannelExtract_U8_21_D1
33 * mlib_v_ImageChannelExtract_U8_31_D1
34 * mlib_v_ImageChannelExtract_U8_41_D1
35 *
36 * ARGUMENT
37 * src pointer to source image data
38 * dst pointer to destination image data
39 * slb source image line stride in bytes
40 * dlb destination image line stride in bytes
41 * dsize image data size in pixels
42 * xsize image width in pixels
43 * ysize image height in lines
44 * cmask channel mask
45 *
46 * DESCRIPTION
47 * Extract the one selected channel of the source image into the
48 * 1-channel destination image.
49 *
50 * NOTE
51 * These functions are separated from mlib_ImageChannelExtract.c
52 * for loop unrolling and structure clarity.
53 */
54
57 #include "mlib_v_ImageChannelExtract.h"
58
59 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_21L(sd0, sd1, dd)
 *
 * Takes 16 bytes of 2-channel U8 data in the two 8-byte words sd0 and
 * sd1 and stores into dd the bytes at even positions 0, 2, ..., 14
 * (the first channel of eight pixels), assuming vis_fpmerge
 * byte-interleaves its two 32-bit operands.
 *
 * The caller must have mlib_d64 variables sda, sdb, sdc and sdd in
 * scope; they are used as scratch.  The body is wrapped in
 * do { } while (0) so the macro expands to exactly one statement.
 */
#define CHANNELEXTRACT_U8_21L(sd0, sd1, dd)                             \
  do {                                                                  \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1));              \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1));              \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb));              \
    sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb));              \
    (dd) = vis_fpmerge(vis_read_hi(sdc), vis_read_hi(sdd));             \
  } while (0)
66
67 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_21R(sd0, sd1, dd)
 *
 * Takes 16 bytes of 2-channel U8 data in the two 8-byte words sd0 and
 * sd1 and stores into dd the bytes at odd positions 1, 3, ..., 15
 * (the second channel of eight pixels), assuming vis_fpmerge
 * byte-interleaves its two 32-bit operands.
 *
 * The caller must have mlib_d64 variables sda, sdb, sdc and sdd in
 * scope; they are used as scratch.  The body is wrapped in
 * do { } while (0) so the macro expands to exactly one statement.
 */
#define CHANNELEXTRACT_U8_21R(sd0, sd1, dd)                             \
  do {                                                                  \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1));              \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1));              \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb));              \
    sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb));              \
    (dd) = vis_fpmerge(vis_read_lo(sdc), vis_read_lo(sdd));             \
  } while (0)
74
75 /***************************************************************/
76 /* extract one channel from a 2-channel image.
77 */
78
/*
 * mlib_v_ImageChannelExtract_U8_21_D1
 *
 * Extract one channel of a 2-channel U8 image into a 1-channel
 * destination. The image is described by dsize alone; no line strides
 * are passed.
 *
 *   src    pointer to source image data
 *   dst    pointer to destination image data
 *   dsize  image data size in pixels
 *   cmask  channel mask
 *
 * NOTE(review): this listing jumps from original line 96 to 283, so only
 * the local declarations and the tail of the body are visible. How cmask
 * selects the channel and how the alignment cases are split is not shown
 * here; confirm against the full file.
 */
79 void mlib_v_ImageChannelExtract_U8_21_D1(const mlib_u8 *src,
80 mlib_u8 *dst,
81 mlib_s32 dsize,
82 mlib_s32 cmask)
83 {
84 mlib_u8 *sa, *da; /* presumably byte cursors into src and dst; assignments not visible here */
85 mlib_u8 *dend, *dend2; /* end points in dst */
86 mlib_d64 *dp; /* 8-byte aligned start points in dst */
87 mlib_d64 *sp; /* 8-byte aligned start point in src */
88 mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */
89 mlib_d64 sda, sdb, sdc, sdd; /* scratch written by the CHANNELEXTRACT_U8_21L/R macros */
90 mlib_d64 dd0, dd1; /* previous and current extracted 8-byte words */
91 mlib_s32 soff; /* offset of address in src */
92 mlib_s32 doff; /* offset of address in dst */
93 mlib_s32 off; /* offset of src over dst */
94 mlib_s32 emask; /* edge mask */
95 mlib_s32 i, n;
96
/* NOTE(review): original lines 97-282 are elided in this listing. */
283 sd3 = *sp++;
284 CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
/*
 * Store one full 8-byte word taken from the dd0:dd1 pair. The byte offset
 * used by vis_faligndata is presumably set by a vis_alignaddr call in the
 * elided part; confirm.
 */
285 *dp++ = vis_faligndata(dd0, dd1);
286 }
287 }
288
289 /* end point handling */
/* dp has not moved past dend, so one last partial word remains to be written */
290 if ((mlib_addr) dp <= (mlib_addr) dend) {
291 emask = vis_edge8(dp, dend); /* byte mask for the span from dp through dend */
292 dd0 = dd1; /* the last extracted word becomes the left half of the pair */
293 sd2 = *sp++; /* fetch the next 16 source bytes (two 8-byte words) */
294 sd3 = *sp++;
295 CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
296 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); /* partial store: only bytes enabled in emask */
297 }
298 }
299 }
300 }
301
302 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_31L: extract the first (left) channel from 8
 * three-channel U8 pixels.
 *   sd0, sd1, sd2  three consecutive 8-byte source words (24 bytes)
 *   dd             receives the 8 extracted bytes
 * Assuming the usual VIS meaning of vis_fpmerge (byte-wise interleave of
 * two 4-byte operands), the result holds source bytes
 * 0,3,6,9,12,15,18,21.
 * Expands to several plain statements (no do/while wrapper) and writes
 * the temporaries sda..sde, which must exist in the calling scope.
 */
303 #define CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd) \
304 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \
305 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \
306 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \
307 sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \
308 sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc)); \
309 dd = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde))
310
311 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_31M: extract the second (middle) channel from 8
 * three-channel U8 pixels.
 *   sd0, sd1, sd2  three consecutive 8-byte source words (24 bytes)
 *   dd             receives the 8 extracted bytes
 * The first three merges match the 31L variant; the later merges pick
 * different halves. Assuming the usual VIS byte interleave of
 * vis_fpmerge, the result holds source bytes 1,4,7,10,13,16,19,22.
 * Expands to several plain statements (no do/while wrapper) and writes
 * the temporaries sda..sde, which must exist in the calling scope.
 */
312 #define CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd) \
313 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \
314 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \
315 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \
316 sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \
317 sde = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc)); \
318 dd = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sde))
319
320 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_31R: extract the third (right) channel from 8
 * three-channel U8 pixels.
 *   sd0, sd1, sd2  three consecutive 8-byte source words (24 bytes)
 *   dd             receives the 8 extracted bytes
 * The first three merges match the 31L and 31M variants; the later
 * merges pick different halves. Assuming the usual VIS byte interleave
 * of vis_fpmerge, the result holds source bytes 2,5,8,11,14,17,20,23.
 * Expands to several plain statements (no do/while wrapper) and writes
 * the temporaries sda..sde, which must exist in the calling scope.
 */
321 #define CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd) \
322 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \
323 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \
324 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \
325 sdd = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc)); \
326 sde = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc)); \
327 dd = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde))
328
329 /***************************************************************/
/*
 * mlib_v_ImageChannelExtract_U8_31_D1
 *
 * Extract one channel of a 3-channel U8 image into a 1-channel
 * destination. The image is described by dsize alone; no line strides
 * are passed.
 *
 *   src    pointer to source image data
 *   dst    pointer to destination image data
 *   dsize  image data size in pixels
 *   cmask  channel mask
 *
 * NOTE(review): this listing jumps from original line 349 to 661, so
 * only the local declarations, the first statement and the tail of the
 * body are visible. How cmask selects the channel and how the alignment
 * cases are split is not shown here; confirm against the full file.
 */
330 void mlib_v_ImageChannelExtract_U8_31_D1(const mlib_u8 *src,
331 mlib_u8 *dst,
332 mlib_s32 dsize,
333 mlib_s32 cmask)
334 {
335 mlib_u8 *sa, *da; /* sa is set from src below; da is presumably the dst counterpart */
336 mlib_u8 *dend, *dend2; /* end points in dst */
337 mlib_d64 *dp; /* 8-byte aligned start points in dst */
338 mlib_d64 *sp; /* 8-byte aligned start point in src */
339 mlib_d64 sd0, sd1, sd2; /* 8-byte source data */
340 mlib_d64 sd3, sd4, sd5; /* second group of three source words, used in the visible tail */
341 mlib_d64 sda, sdb, sdc, sdd, sde; /* scratch written by the CHANNELEXTRACT_U8_31L/M/R macros */
342 mlib_d64 dd0, dd1; /* previous and current extracted 8-byte words */
343 mlib_s32 soff; /* offset of address in src */
344 mlib_s32 doff; /* offset of address in dst */
345 mlib_s32 off; /* offset of src over dst */
346 mlib_s32 emask; /* edge mask */
347 mlib_s32 i, n;
348
349 sa = (void *)src; /* the cast drops the const qualifier of src */
/* NOTE(review): original lines 350-660 are elided in this listing. */
661 CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
/*
 * Store one full 8-byte word taken from the dd0:dd1 pair. The byte offset
 * used by vis_faligndata is presumably set by a vis_alignaddr call in the
 * elided part; confirm.
 */
662 *dp++ = vis_faligndata(dd0, dd1);
663 }
664 }
665
666 /* end point handling */
/* dp has not moved past dend, so one last partial word remains to be written */
667 if ((mlib_addr) dp <= (mlib_addr) dend) {
668 emask = vis_edge8(dp, dend); /* byte mask for the span from dp through dend */
669 dd0 = dd1; /* the last extracted word becomes the left half of the pair */
670 sd3 = *sp++; /* fetch the next 24 source bytes (three 8-byte words) */
671 sd4 = *sp++;
672 sd5 = *sp++;
673 CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
674 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); /* partial store: only bytes enabled in emask */
675 }
676 }
677 }
678 }
679
680 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_41L: extract the first (leftmost) channel from 8
 * four-channel U8 pixels.
 *   sd0..sd3  four consecutive 8-byte source words (32 bytes)
 *   dd        receives the 8 extracted bytes
 * Assuming the usual VIS meaning of vis_fpmerge (byte-wise interleave of
 * two 4-byte operands), the result holds source bytes 0,4,8,...,28.
 * Expands to several plain statements (no do/while wrapper) and writes
 * the temporaries sda..sdf, which must exist in the calling scope.
 */
681 #define CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd) \
682 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \
683 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \
684 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \
685 sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \
686 sde = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdc)); \
687 sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdd)); \
688 dd = vis_fpmerge(vis_read_hi(sde), vis_read_hi(sdf))
689
690 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_41ML: extract the second (middle-left) channel from
 * 8 four-channel U8 pixels.
 *   sd0..sd3  four consecutive 8-byte source words (32 bytes)
 *   dd        receives the 8 extracted bytes
 * Identical to the 41L variant except that the last merge takes the low
 * halves of sde and sdf. Assuming the usual VIS byte interleave of
 * vis_fpmerge, the result holds source bytes 1,5,9,...,29.
 * Expands to several plain statements (no do/while wrapper) and writes
 * the temporaries sda..sdf, which must exist in the calling scope.
 */
691 #define CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd) \
692 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \
693 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \
694 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \
695 sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \
696 sde = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdc)); \
697 sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdd)); \
698 dd = vis_fpmerge(vis_read_lo(sde), vis_read_lo(sdf))
699
700 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_41MR: extract the third (middle-right) channel from
 * 8 four-channel U8 pixels.
 *   sd0..sd3  four consecutive 8-byte source words (32 bytes)
 *   dd        receives the 8 extracted bytes
 * The first four merges match the 41L variant; sde and sdf are then
 * built from the low halves of sda..sdd. Assuming the usual VIS byte
 * interleave of vis_fpmerge, the result holds source bytes 2,6,10,...,30.
 * Expands to several plain statements (no do/while wrapper) and writes
 * the temporaries sda..sdf, which must exist in the calling scope.
 */
701 #define CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd) \
702 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \
703 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \
704 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \
705 sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \
706 sde = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdc)); \
707 sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdd)); \
708 dd = vis_fpmerge(vis_read_hi(sde), vis_read_hi(sdf))
709
710 /***************************************************************/
/*
 * CHANNELEXTRACT_U8_41R: extract the fourth (rightmost) channel from 8
 * four-channel U8 pixels.
 *   sd0..sd3  four consecutive 8-byte source words (32 bytes)
 *   dd        receives the 8 extracted bytes
 * Identical to the 41MR variant except that the last merge takes the low
 * halves of sde and sdf. Assuming the usual VIS byte interleave of
 * vis_fpmerge, the result holds source bytes 3,7,11,...,31.
 * Expands to several plain statements (no do/while wrapper) and writes
 * the temporaries sda..sdf, which must exist in the calling scope.
 */
711 #define CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd) \
712 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \
713 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \
714 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \
715 sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \
716 sde = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdc)); \
717 sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdd)); \
718 dd = vis_fpmerge(vis_read_lo(sde), vis_read_lo(sdf))
719
720 /***************************************************************/
/*
 * mlib_v_ImageChannelExtract_U8_41_D1
 *
 * Extract one channel of a 4-channel U8 image into a 1-channel
 * destination. The image is described by dsize alone; no line strides
 * are passed.
 *
 *   src    pointer to source image data
 *   dst    pointer to destination image data
 *   dsize  image data size in pixels
 *   cmask  channel mask
 *
 * NOTE(review): this listing jumps from original line 740 to 1188, so
 * only the local declarations and the tail of the body are visible. How
 * cmask selects the channel and how the alignment cases are split is not
 * shown here; confirm against the full file.
 */
721 void mlib_v_ImageChannelExtract_U8_41_D1(const mlib_u8 *src,
722 mlib_u8 *dst,
723 mlib_s32 dsize,
724 mlib_s32 cmask)
725 {
726 mlib_u8 *sa, *da; /* presumably byte cursors into src and dst; assignments not visible here */
727 mlib_u8 *dend, *dend2; /* end points in dst */
728 mlib_d64 *dp; /* 8-byte aligned start points in dst */
729 mlib_d64 *sp; /* 8-byte aligned start point in src */
730 mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */
731 mlib_d64 sd4, sd5, sd6, sd7; /* second group of four source words, used in the visible tail */
732 mlib_d64 sda, sdb, sdc, sdd; /* scratch written by the CHANNELEXTRACT_U8_41x macros */
733 mlib_d64 sde, sdf;
734 mlib_d64 dd0, dd1; /* previous and current extracted 8-byte words */
735 mlib_s32 soff; /* offset of address in src */
736 mlib_s32 doff; /* offset of address in dst */
737 mlib_s32 off; /* offset of src over dst */
738 mlib_s32 emask; /* edge mask */
739 mlib_s32 i, n;
740
/* NOTE(review): original lines 741-1187 are elided in this listing. */
1188 sd5 = *sp++;
1189 sd6 = *sp++;
1190 sd7 = *sp++;
1191 CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
/*
 * Store one full 8-byte word taken from the dd0:dd1 pair. The byte offset
 * used by vis_faligndata is presumably set by a vis_alignaddr call in the
 * elided part; confirm.
 */
1192 *dp++ = vis_faligndata(dd0, dd1);
1193 }
1194 }
1195
1196 /* end point handling */
/* dp has not moved past dend, so one last partial word remains to be written */
1197 if ((mlib_addr) dp <= (mlib_addr) dend) {
1198 emask = vis_edge8(dp, dend); /* byte mask for the span from dp through dend */
1199 dd0 = dd1; /* the last extracted word becomes the left half of the pair */
1200 sd4 = *sp++; /* fetch the next 32 source bytes (four 8-byte words) */
1201 sd5 = *sp++;
1202 sd6 = *sp++;
1203 sd7 = *sp++;
1204 CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
1205 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); /* partial store: only bytes enabled in emask */
1206 }
1207 }
1208 }
1209 }
1210
1211 /***************************************************************/
|