10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
28 /*
29 * FUNCTIONS
30 * mlib_v_ImageChannelInsert_U8
31 * mlib_v_ImageChannelInsert_U8_12_A8D1X8
32 * mlib_v_ImageChannelInsert_U8_12_A8D2X8
33 * mlib_v_ImageChannelInsert_U8_12_D1
34 * mlib_v_ImageChannelInsert_U8_12
35 * mlib_v_ImageChannelInsert_U8_13_A8D1X8
36 * mlib_v_ImageChannelInsert_U8_13_A8D2X8
37 * mlib_v_ImageChannelInsert_U8_13_D1
38 * mlib_v_ImageChannelInsert_U8_13
39 * mlib_v_ImageChannelInsert_U8_14_A8D1X8
40 * mlib_v_ImageChannelInsert_U8_14_A8D2X8
41 * mlib_v_ImageChannelInsert_U8_14_D1
42 * mlib_v_ImageChannelInsert_U8_14
43 * mlib_v_ImageChannelInsert_S16
44 * mlib_v_ImageChannelInsert_S16_12_A8D1X4
45 * mlib_v_ImageChannelInsert_S16_12_A8D2X4
46 * mlib_v_ImageChannelInsert_S16_12_D1
47 * mlib_v_ImageChannelInsert_S16_12
48 * mlib_v_ImageChannelInsert_S16_13_A8D1X4
49 * mlib_v_ImageChannelInsert_S16_13_A8D2X4
50 * mlib_v_ImageChannelInsert_S16_13_D1
51 * mlib_v_ImageChannelInsert_S16_13
52 * mlib_v_ImageChannelInsert_S16_14_A8D1X4
53 * mlib_v_ImageChannelInsert_S16_14_A8D2X4
54 * mlib_v_ImageChannelInsert_S16_14_D1
55 * mlib_v_ImageChannelInsert_S16_14
56 * mlib_v_ImageChannelInsert_S32
57 * mlib_v_ImageChannelInsert_D64
58 *
59 * ARGUMENT
60 * src pointer to source image data
61 * dst pointer to destination image data
62 * slb source image line stride in bytes
63 * dlb destination image line stride in bytes
64 * dsize image data size in pixels
65 * xsize image width in pixels
66 * ysize image height in lines
67 * cmask channel mask
68 *
69 * DESCRIPTION
70 * Copy the 1-channel source image into the selected channel
71 * of the destination image -- VIS version low level functions.
72 *
73 * NOTE
74 * These functions are separated from mlib_v_ImageChannelInsert.c
75 * for loop unrolling and structure clarity.
76 */
77
78 #include "vis_proto.h"
79 #include "mlib_image.h"
80 #include "mlib_v_ImageChannelInsert.h"
81
82 /***************************************************************/
83 /* general channel insertion: slower due to the inner loop */
84 void mlib_v_ImageChannelInsert_U8(const mlib_u8 *src,
85 mlib_s32 slb,
86 mlib_u8 *dst,
87 mlib_s32 dlb,
88 mlib_s32 channels,
89 mlib_s32 channeld,
90 mlib_s32 width,
91 mlib_s32 height,
92 mlib_s32 cmask)
93 {
94 mlib_u8 *sp; /* pointer for pixel in src */
95 mlib_u8 *sl; /* pointer for line in src */
96 mlib_u8 *dp; /* pointer for pixel in dst */
97 mlib_u8 *dl; /* pointer for line in dst */
98 mlib_s32 i, j, k; /* indices for x, y, channel */
99 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 };
100 mlib_s32 inc0, inc1, inc2;
101 mlib_u8 s0, s1, s2;
102
103 deltac[channels] = 1;
104 for (i = (channeld - 1), k = 0; i >= 0; i--) {
105 if ((cmask & (1 << i)) == 0)
106 deltac[k]++;
107 else
108 k++;
109 }
110
111 deltac[channels] = channeld;
112 for (i = 1; i < channels; i++) {
113 deltac[channels] -= deltac[i];
114 }
115
116 sp = sl = (void *)src;
117 dp = dl = dst + deltac[0];
118
119 if (channels == 2) {
120 inc0 = deltac[1];
121 inc1 = deltac[2] + inc0;
122 for (j = 0; j < height; j++) {
123 #pragma pipeloop(0)
124 for (i = 0; i < width; i++) {
125 s0 = sp[0];
126 s1 = sp[1];
127 dp[0] = s0;
128 dp[inc0] = s1;
129 dp += inc1;
130 sp += 2;
131 }
132
133 sp = sl += slb;
134 dp = dl += dlb;
135 }
136 }
137 else if (channels == 3) {
138 inc0 = deltac[1];
139 inc1 = deltac[2] + inc0;
140 inc2 = deltac[3] + inc1;
141 for (j = 0; j < height; j++) {
142 #pragma pipeloop(0)
143 for (i = 0; i < width; i++) {
144 s0 = sp[0];
145 s1 = sp[1];
146 s2 = sp[2];
147 dp[0] = s0;
148 dp[inc0] = s1;
149 dp[inc1] = s2;
150 dp += inc2;
151 sp += 3;
152 }
153
154 sp = sl += slb;
155 dp = dl += dlb;
156 }
157 }
158 }
159
160 /***************************************************************/
161 /* general channel insertion: slower due to the inner loop */
162 void mlib_v_ImageChannelInsert_D64(const mlib_d64 *src,
163 mlib_s32 slb,
164 mlib_d64 *dst,
165 mlib_s32 dlb,
166 mlib_s32 channels,
167 mlib_s32 channeld,
168 mlib_s32 width,
169 mlib_s32 height,
170 mlib_s32 cmask)
171 {
172 mlib_d64 *sp; /* pointer for pixel in src */
173 mlib_d64 *sl; /* pointer for line in src */
174 mlib_d64 *dp; /* pointer for pixel in dst */
175 mlib_d64 *dl; /* pointer for line in dst */
176 mlib_s32 i, j, k; /* indices for x, y, channel */
177 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 };
178 mlib_s32 inc0, inc1, inc2;
179 mlib_d64 s0, s1, s2;
180
181 deltac[channels] = 1;
182 for (i = (channeld - 1), k = 0; i >= 0; i--) {
183 if ((cmask & (1 << i)) == 0)
184 deltac[k]++;
185 else
186 k++;
187 }
188
189 deltac[channels] = channeld;
190 for (i = 1; i < channels; i++) {
191 deltac[channels] -= deltac[i];
192 }
193
194 sp = sl = (void *)src;
195 dp = dl = dst + deltac[0];
196
197 if (channels == 1) {
198 for (j = 0; j < height; j++) {
199 #pragma pipeloop(0)
200 for (i = 0; i < width; i++) {
201 s0 = sp[0];
202 dp[0] = s0;
203 dp += channeld;
204 sp++;
205 }
206
207 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
208 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
209 }
210 }
211 else if (channels == 2) {
212 inc0 = deltac[1];
213 inc1 = deltac[2] + inc0;
214 for (j = 0; j < height; j++) {
215 #pragma pipeloop(0)
216 for (i = 0; i < width; i++) {
217 s0 = sp[0];
218 s1 = sp[1];
219 dp[0] = s0;
220 dp[inc0] = s1;
221 dp += inc1;
222 sp += 2;
223 }
224
225 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
226 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
227 }
228 }
229 else if (channels == 3) {
230 inc0 = deltac[1];
231 inc1 = deltac[2] + inc0;
232 inc2 = deltac[3] + inc1;
233 for (j = 0; j < height; j++) {
234 #pragma pipeloop(0)
235 for (i = 0; i < width; i++) {
236 s0 = sp[0];
237 s1 = sp[1];
238 s2 = sp[2];
239 dp[0] = s0;
240 dp[inc0] = s1;
241 dp[inc1] = s2;
242 dp += inc2;
243 sp += 3;
244 }
245
246 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
247 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
248 }
249 }
250 }
251
252 /***************************************************************/
253 /* general channel insertion: slower due to the inner loop */
254 void mlib_v_ImageChannelInsert_S16(const mlib_s16 *src,
255 mlib_s32 slb,
256 mlib_s16 *dst,
257 mlib_s32 dlb,
258 mlib_s32 channels,
259 mlib_s32 channeld,
260 mlib_s32 width,
261 mlib_s32 height,
262 mlib_s32 cmask)
263 {
264 mlib_s16 *sp; /* pointer for pixel in src */
265 mlib_s16 *sl; /* pointer for line in src */
266 mlib_s16 *dp; /* pointer for pixel in dst */
267 mlib_s16 *dl; /* pointer for line in dst */
268 mlib_s32 i, j, k; /* indices for x, y, channel */
269 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 };
270 mlib_s32 inc0, inc1, inc2;
271 mlib_s16 s0, s1, s2;
272
273 deltac[channels] = 1;
274 for (i = (channeld - 1), k = 0; i >= 0; i--) {
275 if ((cmask & (1 << i)) == 0)
276 deltac[k]++;
277 else
278 k++;
279 }
280
281 deltac[channels] = channeld;
282 for (i = 1; i < channels; i++) {
283 deltac[channels] -= deltac[i];
284 }
285
286 sp = sl = (void *)src;
287 dp = dl = dst + deltac[0];
288
289 if (channels == 2) {
290 inc0 = deltac[1];
291 inc1 = deltac[2] + inc0;
292 for (j = 0; j < height; j++) {
293 #pragma pipeloop(0)
294 for (i = 0; i < width; i++) {
295 s0 = sp[0];
296 s1 = sp[1];
297 dp[0] = s0;
298 dp[inc0] = s1;
299 dp += inc1;
300 sp += 2;
301 }
302
303 sp = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
304 dp = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
305 }
306 }
307 else if (channels == 3) {
308 inc0 = deltac[1];
309 inc1 = deltac[2] + inc0;
310 inc2 = deltac[3] + inc1;
311 for (j = 0; j < height; j++) {
312 #pragma pipeloop(0)
313 for (i = 0; i < width; i++) {
314 s0 = sp[0];
315 s1 = sp[1];
316 s2 = sp[2];
317 dp[0] = s0;
318 dp[inc0] = s1;
319 dp[inc1] = s2;
320 dp += inc2;
321 sp += 3;
322 }
323
324 sp = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
325 dp = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
326 }
327 }
328 }
329
330 /***************************************************************/
331 /* general channel insertion: slower due to the inner loop */
332
333 void mlib_v_ImageChannelInsert_S32(const mlib_s32 *src,
334 mlib_s32 slb,
335 mlib_s32 *dst,
336 mlib_s32 dlb,
337 mlib_s32 channels,
338 mlib_s32 channeld,
339 mlib_s32 width,
340 mlib_s32 height,
341 mlib_s32 cmask)
342 {
343 mlib_s32 *sp; /* pointer for pixel in src */
344 mlib_s32 *sl; /* pointer for line in src */
345 mlib_s32 *dp; /* pointer for pixel in dst */
346 mlib_s32 *dl; /* pointer for line in dst */
347 mlib_s32 i, j, k; /* indices for x, y, channel */
348 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 };
349 mlib_s32 inc0, inc1, inc2;
350 mlib_s32 s0, s1, s2;
351
352 deltac[channels] = 1;
353 for (i = (channeld - 1), k = 0; i >= 0; i--) {
354 if ((cmask & (1 << i)) == 0)
355 deltac[k]++;
356 else
357 k++;
358 }
359
360 deltac[channels] = channeld;
361 for (i = 1; i < channels; i++) {
362 deltac[channels] -= deltac[i];
363 }
364
365 sp = sl = (void *)src;
366 dp = dl = dst + deltac[0];
367
368 if (channels == 1) {
369 for (j = 0; j < height; j++) {
370 #pragma pipeloop(0)
371 for (i = 0; i < width; i++) {
372 s0 = sp[0];
373 dp[0] = s0;
374 dp += channeld;
375 sp++;
376 }
377
378 sp = sl = (mlib_s32 *) ((mlib_u8 *) sl + slb);
379 dp = dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
380 }
381 }
382 else if (channels == 2) {
383 inc0 = deltac[1];
384 inc1 = deltac[2] + inc0;
385 for (j = 0; j < height; j++) {
386 #pragma pipeloop(0)
387 for (i = 0; i < width; i++) {
388 s0 = sp[0];
389 s1 = sp[1];
390 dp[0] = s0;
391 dp[inc0] = s1;
392 dp += inc1;
393 sp += 2;
394 }
395
396 sp = sl = (mlib_s32 *) ((mlib_u8 *) sl + slb);
397 dp = dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
398 }
399 }
400 else if (channels == 3) {
401 inc0 = deltac[1];
402 inc1 = deltac[2] + inc0;
403 inc2 = deltac[3] + inc1;
404 for (j = 0; j < height; j++) {
405 #pragma pipeloop(0)
406 for (i = 0; i < width; i++) {
407 s0 = sp[0];
408 s1 = sp[1];
409 s2 = sp[2];
410 dp[0] = s0;
411 dp[inc0] = s1;
412 dp[inc1] = s2;
413 dp += inc2;
414 sp += 3;
415 }
416
417 sp = sl = (mlib_s32 *) ((mlib_u8 *) sl + slb);
418 dp = dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
419 }
420 }
421 }
422
423 /***************************************************************/
/*
 * INSERT_U8_12(sd0, dd0, dd1) -- channel duplicate for a 2-channel target.
 *
 *   sd0       8-byte source value (read only)
 *   dd0, dd1  lvalues receiving the two 8-byte results: dd0 is built by
 *             merging the high half of sd0 with itself, dd1 likewise from
 *             the low half (presumably so that every source byte appears
 *             twice in a row -- see the VIS fpmerge definition).
 *
 * The body is wrapped in do { } while (0) so that both assignments behave
 * as one statement, e.g. under an unbraced if/else.  Use it as a statement
 * followed by ';'.
 */
#define INSERT_U8_12(sd0, dd0, dd1)     /* channel duplicate */ \
  do {                                                          \
    (dd0) = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0));    \
    (dd1) = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0));    \
  } while (0)
427
428 /***************************************************************/
429 /* insert one channel to a 2-channel image.
430 * both source and destination image data are 8-byte aligned.
431 * dsize is multiple of 8.
432 */
433
434 void mlib_v_ImageChannelInsert_U8_12_A8D1X8(const mlib_u8 *src,
435 mlib_u8 *dst,
436 mlib_s32 dsize,
437 mlib_s32 cmask)
438 {
439 mlib_d64 *sp, *dp;
440 mlib_d64 sd0;
441 mlib_d64 dd0, dd1;
442 mlib_s32 bmask;
443 mlib_s32 i;
444
445 bmask = cmask | (cmask << 2) | (cmask << 4) | (cmask << 6);
446
447 sp = (mlib_d64 *) src;
448 dp = (mlib_d64 *) dst;
449
450 #pragma pipeloop(0)
451 for (i = 0; i < dsize / 8; i++) {
452 sd0 = *sp++;
453 INSERT_U8_12(sd0, dd0, dd1);
454 vis_pst_8(dd0, dp++, bmask);
455 vis_pst_8(dd1, dp++, bmask);
456 }
457 }
458
459 /***************************************************************/
460 /* insert one channel to a 2-channel image.
461 * both source and destination image data are 8-byte aligned.
462 * xsize is multiple of 8.
463 */
464
465 void mlib_v_ImageChannelInsert_U8_12_A8D2X8(const mlib_u8 *src,
466 mlib_s32 slb,
467 mlib_u8 *dst,
468 mlib_s32 dlb,
469 mlib_s32 xsize,
470 mlib_s32 ysize,
471 mlib_s32 cmask)
472 {
473 mlib_d64 *sp, *dp;
474 mlib_d64 *sl, *dl;
475 mlib_d64 sd0;
476 mlib_d64 dd0, dd1;
477 mlib_s32 bmask;
478 mlib_s32 i, j;
479
480 bmask = cmask | (cmask << 2) | (cmask << 4) | (cmask << 6);
481
482 sp = sl = (mlib_d64 *) src;
483 dp = dl = (mlib_d64 *) dst;
484
485 for (j = 0; j < ysize; j++) {
486 #pragma pipeloop(0)
487 for (i = 0; i < xsize / 8; i++) {
488 sd0 = *sp++;
489 INSERT_U8_12(sd0, dd0, dd1);
490 vis_pst_8(dd0, dp++, bmask);
491 vis_pst_8(dd1, dp++, bmask);
492 }
493
494 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
495 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
496 }
497 }
498
499 /***************************************************************/
500 /* insert one channel to a 2-channel image.
501 */
502
503 void mlib_v_ImageChannelInsert_U8_12_D1(const mlib_u8 *src,
504 mlib_u8 *dst,
505 mlib_s32 dsize,
506 mlib_s32 cmask)
507 {
508 mlib_u8 *sa, *da;
509 mlib_u8 *dend, *dend2; /* end points in dst */
510 mlib_d64 *dp; /* 8-byte aligned start points in dst */
511 mlib_d64 *sp; /* 8-byte aligned start point in src */
512 mlib_d64 sd0, sd1; /* 8-byte source data */
513 mlib_d64 dd0, dd1, dd2, dd3; /* 8-byte destination data */
514 mlib_s32 soff; /* offset of address in src */
515 mlib_s32 doff; /* offset of address in dst */
516 mlib_s32 off; /* offset of src over dst */
517 mlib_s32 emask; /* edge mask */
518 mlib_s32 bmask; /* channel mask */
519 mlib_s32 i, n;
520
707 }
708 }
709
710 /* end point handling */
711 if ((mlib_addr) dp <= (mlib_addr) dend) {
712 emask = vis_edge8(dp, dend);
713 dd0 = dd2;
714 dd1 = dd3;
715 sd1 = *sp++;
716 INSERT_U8_12(sd1, dd2, dd3);
717 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
718 if ((mlib_addr) dp <= (mlib_addr) dend) {
719 emask = vis_edge8(dp, dend);
720 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
721 }
722 }
723 }
724 }
725
726 /***************************************************************/
727 /* insert one channel to a 2-channel image.
728 */
729
730 void mlib_v_ImageChannelInsert_U8_12(const mlib_u8 *src,
731 mlib_s32 slb,
732 mlib_u8 *dst,
733 mlib_s32 dlb,
734 mlib_s32 xsize,
735 mlib_s32 ysize,
736 mlib_s32 cmask)
737 {
738 mlib_u8 *sa, *da;
739 mlib_u8 *sl, *dl;
740 mlib_s32 j;
741
742 sa = sl = (void *)src;
743 da = dl = dst;
744
745 #pragma pipeloop(0)
746 for (j = 0; j < ysize; j++) {
747 mlib_v_ImageChannelInsert_U8_12_D1(sa, da, xsize, cmask);
748 sa = sl += slb;
749 da = dl += dlb;
750 }
751 }
752
753 /***************************************************************/
/*
 * INSERT_U8_13(sd0, dd0, dd1, dd2)
 *
 *   sd0            8-byte source value (read only)
 *   dd0, dd1, dd2  lvalues receiving three 8-byte results produced by a
 *                  chain of vis_fpmerge / vis_freg_pair shuffles of sd0
 *
 * The expansion also assigns to sda, sdb, sdc, sdd and sde, which must be
 * mlib_d64 variables declared by the caller.
 * NOTE(review): no function visible in this part of the file uses this
 * macro -- confirm the intended byte layout against its callers.
 *
 * The body is wrapped in do { } while (0) so that the whole sequence
 * behaves as one statement.  Use it as a statement followed by ';'.
 */
#define INSERT_U8_13(sd0, dd0, dd1, dd2)                        \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd0));      \
    sdb = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sda));      \
    sdc = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb));      \
    sdd = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb));      \
    (dd0) = vis_fpmerge(vis_read_hi(sdc), vis_read_hi(sdd));    \
    sde = vis_fpmerge(vis_read_lo(sdc), vis_read_lo(sdd));      \
    (dd1) = vis_freg_pair(vis_read_lo(dd0), vis_read_hi(sde));  \
    (dd2) = vis_freg_pair(vis_read_lo(sde), vis_read_lo(sde));  \
  } while (0)
763
764 /***************************************************************/
/*
 * LOAD_INSERT_STORE_U8_A8(channeld)
 *
 * Loads one 8-byte word through sp (post-incrementing it) and issues eight
 * single-byte stores to da, advancing da by channeld after each store.
 * Before every store the word is passed through vis_faligndata(sd, sd);
 * the callers in this file set the alignment offset to 1 beforehand
 * ("1-byte left shift"), so each store sees the next source byte.
 *
 * Relies on caller-declared variables: sp (mlib_d64 pointer, must be
 * 8-byte aligned since it is dereferenced directly), sd (mlib_d64) and
 * da (byte pointer into the destination).
 *
 * The body is wrapped in do { } while (0) so that the whole sequence
 * behaves as one statement.  Use it as a statement followed by ';'.
 */
#define LOAD_INSERT_STORE_U8_A8(channeld)                               \
  do {                                                                  \
    sd = *sp++;                                                         \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
  } while (0)
775
776 /***************************************************************/
/*
 * LOAD_INSERT_STORE_U8(channeld)
 *
 * Variant of LOAD_INSERT_STORE_U8_A8 for a source that is not 8-byte
 * aligned.  It first sets the alignment offset to off, shifts sd1 into
 * sd0, loads the next aligned word into sd1 (post-incrementing sp) and
 * combines the pair with vis_faligndata.  It then switches the alignment
 * offset to 1 and issues eight single-byte stores to da, advancing da by
 * channeld after each.
 *
 * Relies on caller-declared variables: sp, sd0, sd1, sd, off and da.
 * sd1 must already hold the previously loaded aligned word.
 * The alignment offset is left at 1 on exit.
 *
 * The body is wrapped in do { } while (0) so that the whole sequence
 * behaves as one statement.  Use it as a statement followed by ';'.
 */
#define LOAD_INSERT_STORE_U8(channeld)                                  \
  do {                                                                  \
    vis_alignaddr((void *)0, off);                                      \
    sd0 = sd1;                                                          \
    sd1 = *sp++;                                                        \
    sd  = vis_faligndata(sd0, sd1);                                     \
    vis_alignaddr((void *)0, 1);                                        \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
  } while (0)
791
792 /***************************************************************/
793 void mlib_v_ImageChannelInsert_U8_13_A8D1X8(const mlib_u8 *src,
794 mlib_u8 *dst,
795 mlib_s32 dsize,
796 mlib_s32 cmask)
797 {
798 mlib_u8 *da;
799 mlib_d64 *sp;
800 mlib_d64 sd;
801 mlib_s32 i;
802
803 vis_alignaddr((void *)0, 1); /* for 1-byte left shift */
804
805 sp = (mlib_d64 *) src;
806 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
807
808 #pragma pipeloop(0)
809 for (i = 0; i < dsize / 8; i++) {
810 LOAD_INSERT_STORE_U8_A8(3);
811 }
812 }
813
814 /***************************************************************/
815 void mlib_v_ImageChannelInsert_U8_13_A8D2X8(const mlib_u8 *src,
816 mlib_s32 slb,
817 mlib_u8 *dst,
818 mlib_s32 dlb,
819 mlib_s32 xsize,
820 mlib_s32 ysize,
821 mlib_s32 cmask)
822 {
823 mlib_u8 *da, *dl;
824 mlib_d64 *sp, *sl;
825 mlib_d64 sd;
826 mlib_s32 i, j;
827
828 vis_alignaddr((void *)0, 1);
829
830 sp = sl = (mlib_d64 *) src;
831 da = dl = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
832
833 for (j = 0; j < ysize; j++) {
834 #pragma pipeloop(0)
835 for (i = 0; i < xsize / 8; i++) {
836 LOAD_INSERT_STORE_U8_A8(3);
837 }
838
839 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
840 da = dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
841 }
842 }
843
844 /***************************************************************/
845 void mlib_v_ImageChannelInsert_U8_13_D1(const mlib_u8 *src,
846 mlib_u8 *dst,
847 mlib_s32 dsize,
848 mlib_s32 cmask)
849 {
850 mlib_u8 *sa, *da;
851 mlib_u8 *dend; /* end point in destination */
852 mlib_d64 *sp; /* 8-byte aligned start points in src */
853 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */
854 mlib_s32 off; /* offset of address alignment in src */
855 mlib_s32 i;
856
857 /* prepare the src address */
858 sa = (void *)src;
859 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
860 off = (mlib_addr) sa & 7;
861
862 /* prepare the dst address */
863 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
864 dend = da + dsize * 3 - 1;
891 vis_st_u8(sd = vis_faligndata(sd, sd), da);
892 da += 3;
893 if ((mlib_addr) da <= (mlib_addr) dend) {
894 vis_st_u8(sd = vis_faligndata(sd, sd), da);
895 da += 3;
896 if ((mlib_addr) da <= (mlib_addr) dend) {
897 vis_st_u8(sd = vis_faligndata(sd, sd), da);
898 da += 3;
899 if ((mlib_addr) da <= (mlib_addr) dend) {
900 vis_st_u8(sd = vis_faligndata(sd, sd), da);
901 }
902 }
903 }
904 }
905 }
906 }
907 }
908 }
909
910 /***************************************************************/
911 void mlib_v_ImageChannelInsert_U8_13(const mlib_u8 *src,
912 mlib_s32 slb,
913 mlib_u8 *dst,
914 mlib_s32 dlb,
915 mlib_s32 xsize,
916 mlib_s32 ysize,
917 mlib_s32 cmask)
918 {
919 mlib_u8 *sa, *da;
920 mlib_u8 *sl, *dl;
921 mlib_s32 j;
922
923 sa = sl = (void *)src;
924 da = dl = dst;
925
926 #pragma pipeloop(0)
927 for (j = 0; j < ysize; j++) {
928 mlib_v_ImageChannelInsert_U8_13_D1(sa, da, xsize, cmask);
929 sa = sl += slb;
930 da = dl += dlb;
931 }
932 }
933
934 /***************************************************************/
/*
 * INSERT_U8_14(sd0, dd0, dd1, dd2, dd3)
 *
 *   sd0        8-byte source value (read only)
 *   dd0..dd3   lvalues receiving four 8-byte results
 *
 * Two rounds of merging each half with itself: sda / sdb come from the
 * high / low half of sd0, then dd0, dd1 from the halves of sda and
 * dd2, dd3 from the halves of sdb (presumably leaving every source byte
 * repeated four times in a row -- see the VIS fpmerge definition).
 *
 * The expansion also assigns to sda and sdb, which must be mlib_d64
 * variables declared by the caller.
 *
 * The body is wrapped in do { } while (0) so that the whole sequence
 * behaves as one statement.  Use it as a statement followed by ';'.
 */
#define INSERT_U8_14(sd0, dd0, dd1, dd2, dd3)                   \
  do {                                                          \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0));      \
    (dd0) = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sda));    \
    (dd1) = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sda));    \
    (dd2) = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb));    \
    (dd3) = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb));    \
  } while (0)
942
943 /***************************************************************/
944 void mlib_v_ImageChannelInsert_U8_14_A8D1X8(const mlib_u8 *src,
945 mlib_u8 *dst,
946 mlib_s32 dsize,
947 mlib_s32 cmask)
948 {
949 mlib_d64 *sp, *dp;
950 mlib_d64 sd0;
951 mlib_d64 sda, sdb;
952 mlib_d64 dd0, dd1, dd2, dd3;
953 mlib_s32 bmask;
954 mlib_s32 i;
955
956 bmask = cmask | (cmask << 4);
957
958 sp = (mlib_d64 *) src;
959 dp = (mlib_d64 *) dst;
960
961 #pragma pipeloop(0)
962 for (i = 0; i < dsize / 8; i++) {
963 sd0 = *sp++;
964 INSERT_U8_14(sd0, dd0, dd1, dd2, dd3);
965 vis_pst_8(dd0, dp++, bmask);
966 vis_pst_8(dd1, dp++, bmask);
967 vis_pst_8(dd2, dp++, bmask);
968 vis_pst_8(dd3, dp++, bmask);
969 }
970 }
971
972 /***************************************************************/
973 void mlib_v_ImageChannelInsert_U8_14_A8D2X8(const mlib_u8 *src,
974 mlib_s32 slb,
975 mlib_u8 *dst,
976 mlib_s32 dlb,
977 mlib_s32 xsize,
978 mlib_s32 ysize,
979 mlib_s32 cmask)
980 {
981 mlib_d64 *sp, *dp;
982 mlib_d64 *sl, *dl;
983 mlib_d64 sd0;
984 mlib_d64 sda, sdb;
985 mlib_d64 dd0, dd1, dd2, dd3;
986 mlib_s32 bmask;
987 mlib_s32 i, j;
988
989 bmask = cmask | (cmask << 4);
990
991 sp = sl = (mlib_d64 *) src;
992 dp = dl = (mlib_d64 *) dst;
993
994 for (j = 0; j < ysize; j++) {
995 #pragma pipeloop(0)
996 for (i = 0; i < xsize / 8; i++) {
997 sd0 = *sp++;
998 INSERT_U8_14(sd0, dd0, dd1, dd2, dd3);
999 vis_pst_8(dd0, dp++, bmask);
1000 vis_pst_8(dd1, dp++, bmask);
1001 vis_pst_8(dd2, dp++, bmask);
1002 vis_pst_8(dd3, dp++, bmask);
1003 }
1004
1005 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1006 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1007 }
1008 }
1009
1010 /***************************************************************/
1011 void mlib_v_ImageChannelInsert_U8_14_D1(const mlib_u8 *src,
1012 mlib_u8 *dst,
1013 mlib_s32 dsize,
1014 mlib_s32 cmask)
1015 {
1016 mlib_u8 *sa, *da;
1017 mlib_u8 *dend, *dend2; /* end points in dst */
1018 mlib_d64 *dp; /* 8-byte aligned start points in dst */
1019 mlib_d64 *sp; /* 8-byte aligned start point in src */
1020 mlib_d64 sd0, sd1, sd; /* 8-byte source data */
1021 mlib_d64 sda, sdb;
1022 mlib_d64 dd0, dd1, dd2, dd3, dd4;
1023 mlib_s32 soff; /* offset of address in src */
1024 mlib_s32 doff; /* offset of address in dst */
1025 mlib_s32 emask; /* edge mask */
1026 mlib_s32 bmask; /* channel mask */
1027 mlib_s32 i, n;
1028
1029 sa = (void *)src;
1030 da = dst;
1171
1172 vis_alignaddr((void *)0, -doff);
1173 emask = vis_edge8(dp, dend);
1174 vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask);
1175 if ((mlib_addr) dp <= (mlib_addr) dend) {
1176 emask = vis_edge8(dp, dend);
1177 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
1178 if ((mlib_addr) dp <= (mlib_addr) dend) {
1179 emask = vis_edge8(dp, dend);
1180 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
1181 if ((mlib_addr) dp <= (mlib_addr) dend) {
1182 emask = vis_edge8(dp, dend);
1183 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
1184 }
1185 }
1186 }
1187 }
1188 }
1189 }
1190
1191 /***************************************************************/
1192 void mlib_v_ImageChannelInsert_U8_14(const mlib_u8 *src,
1193 mlib_s32 slb,
1194 mlib_u8 *dst,
1195 mlib_s32 dlb,
1196 mlib_s32 xsize,
1197 mlib_s32 ysize,
1198 mlib_s32 cmask)
1199 {
1200 mlib_u8 *sa, *da;
1201 mlib_u8 *sl, *dl;
1202 mlib_s32 j;
1203
1204 sa = sl = (void *)src;
1205 da = dl = dst;
1206
1207 #pragma pipeloop(0)
1208 for (j = 0; j < ysize; j++) {
1209 mlib_v_ImageChannelInsert_U8_14_D1(sa, da, xsize, cmask);
1210 sa = sl += slb;
1211 da = dl += dlb;
1212 }
1213 }
1214
1215 /***************************************************************/
/*
 * LOAD_INSERT_STORE_S16_1X_A8(channeld)
 *
 * Loads one 8-byte word through sp (post-incrementing it) and issues four
 * 16-bit stores to da, advancing da by channeld elements after each store.
 * Before every store the word is passed through vis_faligndata(sd, sd);
 * the callers in this file set the alignment offset to 2 beforehand, so
 * each store sees the next 16-bit source value.
 *
 * Relies on caller-declared variables: sp (mlib_d64 pointer, must be
 * 8-byte aligned since it is dereferenced directly), sd (mlib_d64) and
 * da (mlib_s16 pointer into the destination).
 *
 * The body is wrapped in do { } while (0) so that the whole sequence
 * behaves as one statement.  Use it as a statement followed by ';'.
 */
#define LOAD_INSERT_STORE_S16_1X_A8(channeld)                           \
  do {                                                                  \
    sd = *sp++;                                                         \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
  } while (0)
1222
1223 /***************************************************************/
/*
 * LOAD_INSERT_STORE_S16_1X(channeld)
 *
 * Variant of LOAD_INSERT_STORE_S16_1X_A8 for a source that is not 8-byte
 * aligned.  It first sets the alignment offset to off, shifts sd1 into
 * sd0, loads the next aligned word into sd1 (post-incrementing sp) and
 * combines the pair with vis_faligndata.  It then switches the alignment
 * offset to 2 and issues four 16-bit stores to da, advancing da by
 * channeld elements after each.
 *
 * Relies on caller-declared variables: sp, sd0, sd1, sd, off and da.
 * sd1 must already hold the previously loaded aligned word.
 * The alignment offset is left at 2 on exit.
 *
 * The body is wrapped in do { } while (0) so that the whole sequence
 * behaves as one statement.  Use it as a statement followed by ';'.
 */
#define LOAD_INSERT_STORE_S16_1X(channeld)                              \
  do {                                                                  \
    vis_alignaddr((void *)0, off);                                      \
    sd0 = sd1;                                                          \
    sd1 = *sp++;                                                        \
    sd  = vis_faligndata(sd0, sd1);                                     \
    vis_alignaddr((void *)0, 2);                                        \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
  } while (0)
1234
1235 /***************************************************************/
1236 void mlib_v_ImageChannelInsert_S16_12_A8D1X4(const mlib_s16 *src,
1237 mlib_s16 *dst,
1238 mlib_s32 dsize,
1239 mlib_s32 cmask)
1240 {
1241 mlib_s16 *da;
1242 mlib_d64 *sp;
1243 mlib_d64 sd;
1244 mlib_s32 i;
1245
1246 sp = (mlib_d64 *) src;
1247 da = dst + (2 - cmask); /* 2,1 -> 0,1 */
1248
1249 vis_alignaddr((void *)0, 2);
1250
1251 #pragma pipeloop(0)
1252 for (i = 0; i < dsize / 4; i++) {
1253 LOAD_INSERT_STORE_S16_1X_A8(2);
1254 }
1255 }
1256
1257 /***************************************************************/
1258 void mlib_v_ImageChannelInsert_S16_12_A8D2X4(const mlib_s16 *src,
1259 mlib_s32 slb,
1260 mlib_s16 *dst,
1261 mlib_s32 dlb,
1262 mlib_s32 xsize,
1263 mlib_s32 ysize,
1264 mlib_s32 cmask)
1265 {
1266 mlib_s16 *da, *dl;
1267 mlib_d64 *sp, *sl;
1268 mlib_d64 sd;
1269 mlib_s32 i, j;
1270
1271 sp = sl = (mlib_d64 *) src;
1272 da = dl = dst + (2 - cmask); /* 2,1 -> 0,1 */
1273
1274 vis_alignaddr((void *)0, 2);
1275
1276 for (j = 0; j < ysize; j++) {
1277 #pragma pipeloop(0)
1278 for (i = 0; i < xsize / 4; i++) {
1279 LOAD_INSERT_STORE_S16_1X_A8(2);
1280 }
1281
1282 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1283 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1284 }
1285 }
1286
1287 /***************************************************************/
1288 void mlib_v_ImageChannelInsert_S16_12_D1(const mlib_s16 *src,
1289 mlib_s16 *dst,
1290 mlib_s32 dsize,
1291 mlib_s32 cmask)
1292 {
1293 mlib_s16 *sa, *da;
1294 mlib_s16 *dend; /* end point in destination */
1295 mlib_d64 *sp; /* 8-byte aligned start points in src */
1296 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */
1297 mlib_s32 off; /* offset of address alignment in src */
1298 mlib_s32 i;
1299
1300 sa = (void *)src;
1301 da = dst + (2 - cmask); /* 2,1 -> 0,1 */
1302
1303 /* prepare the src address */
1304 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1305 off = (mlib_addr) sa & 7;
1306
1307 dend = da + dsize * 2 - 1;
1308
1309 sd1 = *sp++;
1310
1311 #pragma pipeloop(0)
1312 for (i = 0; i < dsize / 4; i++) {
1313 LOAD_INSERT_STORE_S16_1X(2);
1314 }
1315
1316 /* right end handling */
1317 if ((mlib_addr) da <= (mlib_addr) dend) {
1318
1319 vis_alignaddr((void *)0, off);
1320 sd0 = sd1;
1321 sd1 = *sp++;
1322 sd = vis_faligndata(sd0, sd1);
1323
1324 vis_alignaddr((void *)0, 2);
1325 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1326 da += 2;
1327 if ((mlib_addr) da <= (mlib_addr) dend) {
1328 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1329 da += 2;
1330 if ((mlib_addr) da <= (mlib_addr) dend) {
1331 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1332 }
1333 }
1334 }
1335 }
1336
1337 /***************************************************************/
1338 void mlib_v_ImageChannelInsert_S16_12(const mlib_s16 *src,
1339 mlib_s32 slb,
1340 mlib_s16 *dst,
1341 mlib_s32 dlb,
1342 mlib_s32 xsize,
1343 mlib_s32 ysize,
1344 mlib_s32 cmask)
1345 {
1346 mlib_s16 *sa, *da;
1347 mlib_s16 *sl, *dl;
1348 mlib_s32 j;
1349
1350 sa = sl = (void *)src;
1351 da = dl = dst;
1352
1353 #pragma pipeloop(0)
1354 for (j = 0; j < ysize; j++) {
1355 mlib_v_ImageChannelInsert_S16_12_D1(sa, da, xsize, cmask);
1356 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
1357 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1358 }
1359 }
1360
1361 /***************************************************************/
1362 void mlib_v_ImageChannelInsert_S16_13_A8D1X4(const mlib_s16 *src,
1363 mlib_s16 *dst,
1364 mlib_s32 dsize,
1365 mlib_s32 cmask)
1366 {
1367 mlib_s16 *da;
1368 mlib_d64 *sp;
1369 mlib_d64 sd;
1370 mlib_s32 i;
1371
1372 sp = (mlib_d64 *) src;
1373 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
1374
1375 vis_alignaddr((void *)0, 2);
1376
1377 #pragma pipeloop(0)
1378 for (i = 0; i < dsize / 4; i++) {
1379 LOAD_INSERT_STORE_S16_1X_A8(3);
1380 }
1381 }
1382
1383 /***************************************************************/
1384 void mlib_v_ImageChannelInsert_S16_13_A8D2X4(const mlib_s16 *src,
1385 mlib_s32 slb,
1386 mlib_s16 *dst,
1387 mlib_s32 dlb,
1388 mlib_s32 xsize,
1389 mlib_s32 ysize,
1390 mlib_s32 cmask)
1391 {
1392 mlib_s16 *da, *dl;
1393 mlib_d64 *sp, *sl;
1394 mlib_d64 sd;
1395 mlib_s32 i, j;
1396
1397 sp = sl = (mlib_d64 *) src;
1398 da = dl = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
1399
1400 vis_alignaddr((void *)0, 2);
1401
1402 for (j = 0; j < ysize; j++) {
1403 #pragma pipeloop(0)
1404 for (i = 0; i < xsize / 4; i++) {
1405 LOAD_INSERT_STORE_S16_1X_A8(3);
1406 }
1407
1408 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1409 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1410 }
1411 }
1412
1413 /***************************************************************/
1414 void mlib_v_ImageChannelInsert_S16_13_D1(const mlib_s16 *src,
1415 mlib_s16 *dst,
1416 mlib_s32 dsize,
1417 mlib_s32 cmask)
1418 {
1419 mlib_s16 *sa, *da;
1420 mlib_s16 *dend; /* end point in destination */
1421 mlib_d64 *sp; /* 8-byte aligned start points in src */
1422 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */
1423 mlib_s32 off; /* offset of address alignment in src */
1424 mlib_s32 i;
1425
1426 sa = (void *)src;
1427 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
1428
1429 /* prepare the src address */
1430 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1431 off = (mlib_addr) sa & 7;
1432
1433 dend = da + dsize * 3 - 1;
1434
1435 sd1 = *sp++;
1436
1437 #pragma pipeloop(0)
1438 for (i = 0; i < dsize / 4; i++) {
1439 LOAD_INSERT_STORE_S16_1X(3);
1440 }
1441
1442 /* right end handling */
1443 if ((mlib_addr) da <= (mlib_addr) dend) {
1444
1445 vis_alignaddr((void *)0, off);
1446 sd0 = sd1;
1447 sd1 = *sp++;
1448 sd = vis_faligndata(sd0, sd1);
1449
1450 vis_alignaddr((void *)0, 2);
1451 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1452 da += 3;
1453 if ((mlib_addr) da <= (mlib_addr) dend) {
1454 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1455 da += 3;
1456 if ((mlib_addr) da <= (mlib_addr) dend) {
1457 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1458 }
1459 }
1460 }
1461 }
1462
1463 /***************************************************************/
1464 void mlib_v_ImageChannelInsert_S16_13(const mlib_s16 *src,
1465 mlib_s32 slb,
1466 mlib_s16 *dst,
1467 mlib_s32 dlb,
1468 mlib_s32 xsize,
1469 mlib_s32 ysize,
1470 mlib_s32 cmask)
1471 {
1472 mlib_s16 *sa, *da;
1473 mlib_s16 *sl, *dl;
1474 mlib_s32 j;
1475
1476 sa = sl = (void *)src;
1477 da = dl = dst;
1478
1479 #pragma pipeloop(0)
1480 for (j = 0; j < ysize; j++) {
1481 mlib_v_ImageChannelInsert_S16_13_D1(sa, da, xsize, cmask);
1482 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
1483 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1484 }
1485 }
1486
1487 /***************************************************************/
/*
 * INSERT_S16_14(sp, dp, bmask) -- channel duplicate for a 4-channel target.
 * Obsolete: it is slower than the vis_st_u16() version.
 *
 *   sp     mlib_d64 source pointer; one word is loaded and sp is
 *          post-incremented
 *   dp     mlib_d64 destination pointer; four partial stores are issued
 *          and dp is post-incremented after each
 *   bmask  mask passed to every vis_pst_16 partial store
 *
 * The expansion also assigns to sd0, sda..sdf and dd0..dd3, which must be
 * mlib_d64 variables declared by the caller.  Because sp and dp are
 * modified, pass plain pointer variables.
 *
 * The body is wrapped in do { } while (0) so that the whole sequence
 * behaves as one statement.  Use it as a statement followed by ';'.
 */
#define INSERT_S16_14(sp, dp, bmask)    /* channel duplicate */ \
  do {                                                          \
    sd0 = *(sp)++;                                              \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0));      \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0));      \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sda));      \
    sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sda));      \
    sde = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb));      \
    sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb));      \
    dd0 = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));      \
    dd1 = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sdd));      \
    dd2 = vis_fpmerge(vis_read_hi(sde), vis_read_lo(sde));      \
    dd3 = vis_fpmerge(vis_read_hi(sdf), vis_read_lo(sdf));      \
    vis_pst_16(dd0, (dp)++, (bmask));                           \
    vis_pst_16(dd1, (dp)++, (bmask));                           \
    vis_pst_16(dd2, (dp)++, (bmask));                           \
    vis_pst_16(dd3, (dp)++, (bmask));                           \
  } while (0)
1505
1506 /***************************************************************/
/*
 * mlib_v_ImageChannelInsert_S16_14_A8D1X4
 * 1-D insert of an S16 source into one channel of a destination whose
 * channel offset is chosen from cmask (8,4,2,1 -> element 0,1,2,3).
 *
 * src      pointer to source image data; it is reinterpreted as
 *          mlib_d64 * without any realignment, so it is presumably
 *          required to be 8-byte aligned -- confirm against callers
 * dst      pointer to destination image data
 * dsize    image data size in pixels; only dsize / 4 loop iterations
 *          are run and there is no tail handling in this routine
 * cmask    channel mask
 *
 * NOTE(review): LOAD_INSERT_STORE_S16_1X_A8 is defined outside this
 * excerpt; it presumably consumes sp, sd and da -- confirm.
 */
1507 void mlib_v_ImageChannelInsert_S16_14_A8D1X4(const mlib_s16 *src,
1508 mlib_s16 *dst,
1509 mlib_s32 dsize,
1510 mlib_s32 cmask)
1511 {
1512 mlib_s16 *da;
1513 mlib_d64 *sp;
1514 mlib_d64 sd;
1515 mlib_s32 i;
1516
1517 sp = (mlib_d64 *) src;
1518 da = dst + (6 / cmask + 1) / 2; /* 8,4,2,1 -> 0,1,2,3 */
1519
/* set the alignment offset to 2 bytes once, ahead of the loop */
1520 vis_alignaddr((void *)0, 2);
1521
1522 #pragma pipeloop(0)
1523 for (i = 0; i < dsize / 4; i++) {
1524 LOAD_INSERT_STORE_S16_1X_A8(4);
1525 }
1526 }
1527
1528 /***************************************************************/
/*
 * mlib_v_ImageChannelInsert_S16_14_A8D2X4
 * 2-D counterpart of the A8D1X4 routine: for each of ysize lines it
 * runs xsize / 4 iterations of LOAD_INSERT_STORE_S16_1X_A8(4) and then
 * restarts sp and da from the next line start.
 *
 * src, dst    first source / destination line; src is reinterpreted as
 *             mlib_d64 * without realignment (presumably 8-byte
 *             aligned -- confirm against callers)
 * slb, dlb    source / destination line stride in bytes
 * xsize       image width in pixels; no tail handling for xsize % 4
 * ysize       image height in lines
 * cmask       channel mask (8,4,2,1 -> destination element 0,1,2,3)
 *
 * NOTE(review): LOAD_INSERT_STORE_S16_1X_A8 is defined outside this
 * excerpt.
 */
1529 void mlib_v_ImageChannelInsert_S16_14_A8D2X4(const mlib_s16 *src,
1530 mlib_s32 slb,
1531 mlib_s16 *dst,
1532 mlib_s32 dlb,
1533 mlib_s32 xsize,
1534 mlib_s32 ysize,
1535 mlib_s32 cmask)
1536 {
1537 mlib_s16 *da, *dl;
1538 mlib_d64 *sp, *sl;
1539 mlib_d64 sd;
1540 mlib_s32 i, j;
1541
1542 sp = sl = (mlib_d64 *) src;
1543 da = dl = dst + (6 / cmask + 1) / 2; /* 8,4,2,1 -> 0,1,2,3 */
1544
/* the 2-byte alignment offset is set once for all lines */
1545 vis_alignaddr((void *)0, 2);
1546
1547 for (j = 0; j < ysize; j++) {
1548 #pragma pipeloop(0)
1549 for (i = 0; i < xsize / 4; i++) {
1550 LOAD_INSERT_STORE_S16_1X_A8(4);
1551 }
1552
/* next line: sl/dl keep the line starts, strides are in bytes */
1553 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1554 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1555 }
1556 }
1557
1558 /***************************************************************/
/*
 * mlib_v_ImageChannelInsert_S16_14_D1
 * 1-D insert of an S16 source into one channel of a destination whose
 * elements are written 4 apart; src may have any alignment.
 *
 * src      pointer to source image data
 * dst      pointer to destination image data
 * dsize    image data size in pixels
 * cmask    channel mask (8,4,2,1 -> destination element 0,1,2,3)
 *
 * The source is read as aligned 8-byte words starting at src rounded
 * down to a multiple of 8; off is the byte offset of src inside the
 * first word. The main loop runs dsize / 4 times, and the tail stores
 * at most three further elements one by one.
 *
 * NOTE(review): LOAD_INSERT_STORE_S16_1X is defined outside this
 * excerpt; it presumably mirrors the tail code below for four
 * elements -- confirm.
 */
1559 void mlib_v_ImageChannelInsert_S16_14_D1(const mlib_s16 *src,
1560 mlib_s16 *dst,
1561 mlib_s32 dsize,
1562 mlib_s32 cmask)
1563 {
1564 mlib_s16 *sa, *da;
1565 mlib_s16 *dend; /* end point in destination */
1566 mlib_d64 *sp; /* 8-byte aligned start points in src */
1567 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */
1568 mlib_s32 off; /* offset of address alignment in src */
1569 mlib_s32 i;
1570
1571 sa = (void *)src;
1572 da = dst + (6 / cmask + 1) / 2; /* 8,4,2,1 -> 0,1,2,3 */
1573
1574 /* prepare the src address */
1575 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1576 off = (mlib_addr) sa & 7;
1577
/* upper bound compared against da by the tail code */
1578 dend = da + dsize * 4 - 1;
1579
/* prime sd1 with the first aligned source word */
1580 sd1 = *sp++;
1581
1582 #pragma pipeloop(0)
1583 for (i = 0; i < dsize / 4; i++) {
1584 LOAD_INSERT_STORE_S16_1X(4);
1585 }
1586
1587 /* right end handling */
1588 if ((mlib_addr) da <= (mlib_addr) dend) {
1589
/* shift in one more source word and align it at byte offset off */
1590 vis_alignaddr((void *)0, off);
1591 sd0 = sd1;
1592 sd1 = *sp++;
1593 sd = vis_faligndata(sd0, sd1);
1594
/*
 * With the offset now 2 bytes, each vis_faligndata(sd, sd) rotates sd
 * by one 16-bit element before vis_st_u16() stores one element at da.
 */
1595 vis_alignaddr((void *)0, 2);
1596 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1597 da += 4;
1598 if ((mlib_addr) da <= (mlib_addr) dend) {
1599 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1600 da += 4;
1601 if ((mlib_addr) da <= (mlib_addr) dend) {
1602 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1603 }
1604 }
1605 }
1606 }
1607
1608 /***************************************************************/
/*
 * mlib_v_ImageChannelInsert_S16_14
 * 2-D driver: calls mlib_v_ImageChannelInsert_S16_14_D1() once per
 * image line and then steps both line pointers by their byte strides.
 *
 * src, dst    first source / destination line
 * slb, dlb    source / destination line stride in bytes
 * xsize       image width in pixels (passed to the _D1 routine as dsize)
 * ysize       image height in lines
 * cmask       channel mask, forwarded unchanged to the _D1 routine
 */
1609 void mlib_v_ImageChannelInsert_S16_14(const mlib_s16 *src,
1610 mlib_s32 slb,
1611 mlib_s16 *dst,
1612 mlib_s32 dlb,
1613 mlib_s32 xsize,
1614 mlib_s32 ysize,
1615 mlib_s32 cmask)
1616 {
1617 mlib_s16 *sa, *da;
1618 mlib_s16 *sl, *dl;
1619 mlib_s32 j;
1620
/* sa/sl and da/dl are always assigned together; the cast drops const */
1621 sa = sl = (void *)src;
1622 da = dl = dst;
1623
1624 #pragma pipeloop(0)
1625 for (j = 0; j < ysize; j++) {
1626 mlib_v_ImageChannelInsert_S16_14_D1(sa, da, xsize, cmask);
/* strides are in bytes, hence the detour through mlib_u8 pointers */
1627 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
1628 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1629 }
1630 }
1631
1632 /***************************************************************/
|
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
28 /*
29 * FUNCTIONS
30 * mlib_v_ImageChannelInsert_U8_12_D1
31 * mlib_v_ImageChannelInsert_U8_13_D1
32 * mlib_v_ImageChannelInsert_U8_14_D1
33 *
34 * ARGUMENT
35 * src pointer to source image data
36 * dst pointer to destination image data
37 * slb source image line stride in bytes
38 * dlb destination image line stride in bytes
39 * dsize image data size in pixels
40 * xsize image width in pixels
41 * ysize image height in lines
42 * cmask channel mask
43 *
44 * DESCRIPTION
45 * Copy the 1-channel source image into the selected channel
46 * of the destination image -- VIS version low level functions.
47 *
48 * NOTE
49 * These functions are separated from mlib_v_ImageChannelInsert.c
50 * for loop unrolling and structure clarity.
51 */
52
53 #include "vis_proto.h"
54 #include "mlib_image.h"
55 #include "mlib_v_ImageChannelInsert.h"
56
57 /***************************************************************/
/*
 * INSERT_U8_12(sd0, dd0, dd1)
 * Merges each 4-byte half of sd0 with itself via vis_fpmerge(), so
 * every source byte appears twice in a row: dd0 is built from the
 * upper half of sd0, dd1 from the lower half.
 * sd0 is read four times; dd0 and dd1 must be assignable.
 */
58 #define INSERT_U8_12(sd0, dd0, dd1) /* channel duplicate */ \
59 dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0)); \
60 dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0))
61
62 /***************************************************************/
63 /* Insert one channel into a 2-channel image.
64 */
65
/*
 * mlib_v_ImageChannelInsert_U8_12_D1
 *
 * src      pointer to source image data
 * dst      pointer to destination image data
 * dsize    image data size in pixels
 * cmask    channel mask
 *
 * NOTE(review): this excerpt jumps from listing line 83 to listing
 * line 270, so the address set-up and the main loops are not visible.
 * Only the declarations and the end-point handling are commented.
 */
66 void mlib_v_ImageChannelInsert_U8_12_D1(const mlib_u8 *src,
67 mlib_u8 *dst,
68 mlib_s32 dsize,
69 mlib_s32 cmask)
70 {
71 mlib_u8 *sa, *da;
72 mlib_u8 *dend, *dend2; /* end points in dst */
73 mlib_d64 *dp; /* 8-byte aligned start points in dst */
74 mlib_d64 *sp; /* 8-byte aligned start point in src */
75 mlib_d64 sd0, sd1; /* 8-byte source data */
76 mlib_d64 dd0, dd1, dd2, dd3; /* 8-byte destination data */
77 mlib_s32 soff; /* offset of address in src */
78 mlib_s32 doff; /* offset of address in dst */
79 mlib_s32 off; /* offset of src over dst */
80 mlib_s32 emask; /* edge mask */
81 mlib_s32 bmask; /* channel mask */
82 mlib_s32 i, n;
83
/* NOTE(review): listing lines 84-269 are missing from this excerpt */
270 }
271 }
272
273 /* end point handling */
274 if ((mlib_addr) dp <= (mlib_addr) dend) {
/* edge mask from vis_edge8(), ANDed with bmask for the partial store */
275 emask = vis_edge8(dp, dend);
/* carry the last expanded pair forward, then expand one more word */
276 dd0 = dd2;
277 dd1 = dd3;
278 sd1 = *sp++;
279 INSERT_U8_12(sd1, dd2, dd3);
/* alignment offset used here is set in the part not shown */
280 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
/* at most one further partial store */
281 if ((mlib_addr) dp <= (mlib_addr) dend) {
282 emask = vis_edge8(dp, dend);
283 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
284 }
285 }
286 }
287 }
288
289 /***************************************************************/
/*
 * LOAD_INSERT_STORE_U8(channeld)
 * Handles one 8-byte source word:
 *   1. sets the alignment offset to off, moves sd1 into sd0, loads the
 *      next word from *sp (sp is post-incremented) and extracts the
 *      aligned 8 bytes into sd;
 *   2. sets the alignment offset to 1 and, eight times, rotates sd by
 *      one byte with vis_faligndata(sd, sd), stores one byte at da with
 *      vis_st_u8() and advances da by channeld.
 * Uses off, sp, sd0, sd1, sd and da from the expanding scope. The
 * alignment offset is left at 1 when the macro ends.
 */
290 #define LOAD_INSERT_STORE_U8(channeld) \
291 vis_alignaddr((void *)0, off); \
292 sd0 = sd1; \
293 sd1 = *sp++; \
294 sd = vis_faligndata(sd0, sd1); \
295 vis_alignaddr((void *)0, 1); \
296 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \
297 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \
298 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \
299 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \
300 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \
301 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \
302 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \
303 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld
304
305 /***************************************************************/
/*
 * mlib_v_ImageChannelInsert_U8_13_D1
 * 1-D insert of a U8 source into one channel of a destination whose
 * bytes are written 3 apart.
 *
 * src      pointer to source image data
 * dst      pointer to destination image data
 * dsize    image data size in pixels
 * cmask    channel mask (4,2,1 -> destination byte 0,1,2)
 *
 * NOTE(review): this excerpt jumps from listing line 325 to listing
 * line 352, so the main loop and the start of the tail code are not
 * visible; only the set-up and the last tail stores are commented.
 */
306 void mlib_v_ImageChannelInsert_U8_13_D1(const mlib_u8 *src,
307 mlib_u8 *dst,
308 mlib_s32 dsize,
309 mlib_s32 cmask)
310 {
311 mlib_u8 *sa, *da;
312 mlib_u8 *dend; /* end point in destination */
313 mlib_d64 *sp; /* 8-byte aligned start points in src */
314 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */
315 mlib_s32 off; /* offset of address alignment in src */
316 mlib_s32 i;
317
318 /* prepare the src address */
319 sa = (void *)src;
/* sp: src rounded down to a multiple of 8; off: byte offset within it */
320 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
321 off = (mlib_addr) sa & 7;
322
323 /* prepare the dst address */
324 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
/* upper bound compared against da by the tail code */
325 dend = da + dsize * 3 - 1;
/* NOTE(review): listing lines 326-351 are missing from this excerpt */
352 vis_st_u8(sd = vis_faligndata(sd, sd), da);
353 da += 3;
/* each further byte is stored only while da has not passed dend */
354 if ((mlib_addr) da <= (mlib_addr) dend) {
355 vis_st_u8(sd = vis_faligndata(sd, sd), da);
356 da += 3;
357 if ((mlib_addr) da <= (mlib_addr) dend) {
358 vis_st_u8(sd = vis_faligndata(sd, sd), da);
359 da += 3;
360 if ((mlib_addr) da <= (mlib_addr) dend) {
361 vis_st_u8(sd = vis_faligndata(sd, sd), da);
362 }
363 }
364 }
365 }
366 }
367 }
368 }
369 }
370
371 /***************************************************************/
/*
 * INSERT_U8_14(sd0, dd0, dd1, dd2, dd3)
 * Two levels of vis_fpmerge() of a half with itself: the first level
 * doubles every byte of sd0 (into sda, sdb), the second doubles them
 * again, so each source byte appears four times in a row. dd0 and dd1
 * come from the upper half of sd0, dd2 and dd3 from the lower half.
 * sda and sdb are temporaries that must exist in the expanding scope.
 */
372 #define INSERT_U8_14(sd0, dd0, dd1, dd2, dd3) \
373 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0)); \
374 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0)); \
375 dd0 = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sda)); \
376 dd1 = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sda)); \
377 dd2 = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb)); \
378 dd3 = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb))
379
380 /***************************************************************/
/*
 * mlib_v_ImageChannelInsert_U8_14_D1
 *
 * src      pointer to source image data
 * dst      pointer to destination image data
 * dsize    image data size in pixels
 * cmask    channel mask
 *
 * NOTE(review): this excerpt jumps from listing line 400 to listing
 * line 541, so the address set-up and the main loops are not visible.
 * Only the declarations and the final partial stores are commented.
 */
381 void mlib_v_ImageChannelInsert_U8_14_D1(const mlib_u8 *src,
382 mlib_u8 *dst,
383 mlib_s32 dsize,
384 mlib_s32 cmask)
385 {
386 mlib_u8 *sa, *da;
387 mlib_u8 *dend, *dend2; /* end points in dst */
388 mlib_d64 *dp; /* 8-byte aligned start points in dst */
389 mlib_d64 *sp; /* 8-byte aligned start point in src */
390 mlib_d64 sd0, sd1, sd; /* 8-byte source data */
391 mlib_d64 sda, sdb;
392 mlib_d64 dd0, dd1, dd2, dd3, dd4;
393 mlib_s32 soff; /* offset of address in src */
394 mlib_s32 doff; /* offset of address in dst */
395 mlib_s32 emask; /* edge mask */
396 mlib_s32 bmask; /* channel mask */
397 mlib_s32 i, n;
398
399 sa = (void *)src;
400 da = dst;
/* NOTE(review): listing lines 401-540 are missing from this excerpt */
541
/* alignment offset becomes -doff for the destination-side shifts */
542 vis_alignaddr((void *)0, -doff);
/* up to four partial stores; each mask is vis_edge8() ANDed with bmask */
543 emask = vis_edge8(dp, dend);
544 vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask);
545 if ((mlib_addr) dp <= (mlib_addr) dend) {
546 emask = vis_edge8(dp, dend);
547 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
548 if ((mlib_addr) dp <= (mlib_addr) dend) {
549 emask = vis_edge8(dp, dend);
550 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
551 if ((mlib_addr) dp <= (mlib_addr) dend) {
552 emask = vis_edge8(dp, dend);
553 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
554 }
555 }
556 }
557 }
558 }
559 }
560
561
562 /***************************************************************/
|