/*
 * Fixed-point helpers for the image type selected by the preceding
 * IMG_TYPE branch (its opening #if/#elif line is outside this excerpt).
 *   DSCALE   - base scale (2^16) that LOAD_KERNEL/LOAD_KERNEL3 start from.
 *   FROM_S32 - keeps the upper 16 bits of a 32-bit result.
 *   S64TOS32 - masks a value down to its low 32 bits.
 *   SAT_OFF  - intentionally empty for this type.
 * NOTE(review): SAT_OFF looks like it is pasted textually after an
 * expression by a macro defined elsewhere (D2I?) - confirm before
 * changing its form.
 */
63 #define DSCALE 65536.0
64 #define FROM_S32(x) ((x) >> 16)
65 #define S64TOS32(x) ((x) & 0xffffffff)
66 #define SAT_OFF
67
68 #elif IMG_TYPE == 3
69
/*
 * Configuration for IMG_TYPE == 3: pixels are mlib_u16 and the generated
 * function names carry the "ext_u16" suffix.
 *   CONV_FUNC(KERN)/CONV_FUNC_I(KERN) - build the function name and
 *                parameter list for a fixed-size kernel (e.g. 3x3).
 *   CONV_FUNC_MxN/CONV_FUNC_MxN_I     - same for the MxN kernels.
 *   DSCALE   - base scale (2^16) that LOAD_KERNEL/LOAD_KERNEL3 start from.
 *   FROM_S32 - takes the upper 16 bits of a 32-bit result and flips
 *              bit 15 (XOR 0x8000).
 *   S64TOS32 - identity for this type.
 *   SAT_OFF  - the text "-(1u << 31)".
 * NOTE(review): SAT_OFF is not parenthesized and the sibling branch defines
 * it empty, so it is presumably appended textually to an expression by a
 * macro defined elsewhere (D2I?); the XOR in FROM_S32 presumably undoes that
 * offset - confirm against the D2I definition.
 */
70 #define DTYPE mlib_u16
71 #define CONV_FUNC(KERN) mlib_conv##KERN##ext_u16(PARAM)
72 #define CONV_FUNC_MxN mlib_convMxNext_u16(PARAM_MxN)
73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
74 #define CONV_FUNC_MxN_I mlib_i_convMxNext_u16(PARAM_MxN)
75 #define DSCALE 65536.0
76 #define FROM_S32(x) (((x) >> 16) ^ 0x8000)
77 #define S64TOS32(x) (x)
78 #define SAT_OFF -(1u << 31)
79
80 #endif /* IMG_TYPE == 1 */
81
82 /***************************************************************/
/* KSIZE - 1; re-evaluated at each use because KSIZE is redefined per kernel size. */
83 #define KSIZE1 (KSIZE - 1)
84
85 /***************************************************************/
/*
 * Parameter list shared by every fixed-size kernel function generated with
 * CONV_FUNC()/CONV_FUNC_I():
 *   dst, src          - destination and source images
 *   dx_l, dx_r        - number of columns replicated at the left/right edge
 *   dy_t, dy_b        - number of rows replicated at the top/bottom edge
 *   kern              - integer kernel coefficients, row-major
 *   scalef_expon      - binary exponent the kernel is scaled down by
 *   cmask             - channel mask; bit (nchannel - 1 - c) selects channel c
 */
86 #define PARAM \
87 mlib_image *dst, \
88 const mlib_image *src, \
89 mlib_s32 dx_l, \
90 mlib_s32 dx_r, \
91 mlib_s32 dy_t, \
92 mlib_s32 dy_b, \
93 const mlib_s32 *kern, \
94 mlib_s32 scalef_expon, \
95 mlib_s32 cmask
96
97 /***************************************************************/
98 #define PARAM_MxN \
99 mlib_image *dst, \
100 const mlib_image *src, \
101 const mlib_s32 *kernel, \
102 mlib_s32 m, \
103 mlib_s32 n, \
104 mlib_s32 dx_l, \
105 mlib_s32 dx_r, \
/*
 * LOAD_BUFF(buff): copy two consecutive pixels of the current channel,
 * sp[0] and sp[chan1], into buff[i] and buff[i + 1].  Uses the caller's
 * locals i, sp and chan1.
 *
 * This first variant stores the two values one at a time.
 */
146 #define LOAD_BUFF(buff) \
147 buff[i ] = sp[0]; \
148 buff[i + 1] = sp[chan1]
149
150 #else /* _NO_LONGLONG */
151
152 #ifdef _LITTLE_ENDIAN
153
/*
 * 64-bit variant, little-endian: both pixels are packed into one mlib_s64
 * store, with sp[0] in the low 32 bits so that it lands in buff[i].
 * NOTE(review): if DTYPE is signed, a negative sp[chan1] is left-shifted as
 * a signed 64-bit value - confirm this is acceptable for the target
 * compilers.
 */
154 #define LOAD_BUFF(buff) \
155 *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
156
157 #else /* _LITTLE_ENDIAN */
158
/*
 * 64-bit variant, big-endian: sp[0] goes into the high 32 bits so that it
 * lands in buff[i].
 */
159 #define LOAD_BUFF(buff) \
160 *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
161
162 #endif /* _LITTLE_ENDIAN */
163 #endif /* _NO_LONGLONG */
164
165 /***************************************************************/
/* 2^24 as a float constant (not referenced in this excerpt). */
166 #define MLIB_D2_24 16777216.0f
167
168 /***************************************************************/
/*
 * Overlay of one mlib_d64 with two mlib_s32 values (i0, i1).  The kernels
 * below use it to move a pair of 32-bit integers with a single 64-bit
 * load or store.
 */
169 typedef union {
170 mlib_d64 d64;
171 struct {
172 mlib_s32 i0;
173 mlib_s32 i1;
174 } i32s;
175 } d64_2x32;
176
177 /***************************************************************/
/*
 * Length, in elements, of one on-stack line buffer.  Rows longer than this
 * make the kernels below allocate their line buffers with mlib_malloc().
 */
178 #define BUFF_LINE 256
179
180 /***************************************************************/
/*
 * DEF_VARS(type): declares the locals shared by the floating-point kernels:
 * source/destination row and pixel pointers of the given pixel type, the
 * integer scratch pointers buffi/buffo, image geometry, channel strides and
 * loop counters.  pbuff is initialized to the caller's local array `buff`,
 * which must therefore be declared before this macro is used.
 */
181 #define DEF_VARS(type) \
182 type *adr_src, *sl, *sp, *sl1; \
183 type *adr_dst, *dl, *dp; \
184 FTYPE *pbuff = buff; \
185 mlib_s32 *buffi, *buffo; \
186 mlib_s32 wid, hgt, sll, dll; \
187 mlib_s32 nchannel, chan1, chan2; \
188 mlib_s32 i, j, c, swid
189
190 /***************************************************************/
/*
 * LOAD_KERNEL3(): declares the 3x3 coefficient variables k0..k8 and the
 * pixel temporaries p00..p23 in the caller's scope, then fills k0..k8 with
 * kern[0..8] * DSCALE / 2^scalef_expon.
 *
 * The exponent is consumed in steps of 30 so that the final (1 << n) shift
 * count never exceeds 30.  Note that the caller's scalef_expon is modified.
 */
191 #define LOAD_KERNEL3() \
192 FTYPE scalef = DSCALE; \
193 FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8; \
194 FTYPE p00, p01, p02, p03, \
195 p10, p11, p12, p13, \
196 p20, p21, p22, p23; \
197 \
198 while (scalef_expon > 30) { \
199 scalef /= (1 << 30); \
200 scalef_expon -= 30; \
201 } \
202 \
203 scalef /= (1 << scalef_expon); \
204 \
205 /* keep kernel in regs */ \
206 k0 = scalef * kern[0]; k1 = scalef * kern[1]; k2 = scalef * kern[2]; \
207 k3 = scalef * kern[3]; k4 = scalef * kern[4]; k5 = scalef * kern[5]; \
208 k6 = scalef * kern[6]; k7 = scalef * kern[7]; k8 = scalef * kern[8]
209
210 /***************************************************************/
/*
 * LOAD_KERNEL(SIZE): fills the caller's array k[0..SIZE-1] with
 * kern[j] * DSCALE / 2^scalef_expon.  Declares scalef in the caller's scope
 * and uses the caller's j as the loop counter.
 *
 * The exponent is consumed in steps of 30 so that the final (1 << n) shift
 * count never exceeds 30.  Note that the caller's scalef_expon is modified.
 */
211 #define LOAD_KERNEL(SIZE) \
212 FTYPE scalef = DSCALE; \
213 \
214 while (scalef_expon > 30) { \
215 scalef /= (1 << 30); \
216 scalef_expon -= 30; \
217 } \
218 \
219 scalef /= (1 << scalef_expon); \
220 \
221 for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j]
222
223 /***************************************************************/
/*
 * GET_SRC_DST_PARAMETERS(type): loads image geometry into the caller's
 * locals.  Height, width and channel count are taken from src only; the
 * strides of src and dst are divided by sizeof(type), i.e. converted to
 * element units (presumably from bytes - confirm against
 * mlib_ImageGetStride), and the data pointers are cast to `type *`.
 */
224 #define GET_SRC_DST_PARAMETERS(type) \
225 hgt = mlib_ImageGetHeight(src); \
226 wid = mlib_ImageGetWidth(src); \
227 nchannel = mlib_ImageGetChannels(src); \
228 sll = mlib_ImageGetStride(src) / sizeof(type); \
229 dll = mlib_ImageGetStride(dst) / sizeof(type); \
230 adr_src = (type *)mlib_ImageGetData(src); \
231 adr_dst = (type *)mlib_ImageGetData(dst)
232
233 /***************************************************************/
234 #ifndef __sparc
235 #if IMG_TYPE == 1
236
237 /*
238 * Test for the presence of any "1" bit in bits
239 8 to 31 of val. If present, then val is either
240 negative or >255. If over/underflows of 8 bits
241 are uncommon, then this technique can be a win,
242 since only a single test, rather than two, is
243 necessary to determine if clamping is needed.
/*
 * CLAMP_STORE(dst, val): store val into dst, saturated to the mlib_s16
 * range [MLIB_S16_MIN, MLIB_S16_MAX].
 *
 * The expansion is wrapped in do { } while (0) so that it always behaves as
 * one statement, and both arguments are parenthesized so that an expression
 * argument (for example a conditional expression) is compared and cast as a
 * whole.  val is evaluated more than once and must not have side effects.
 */
#define CLAMP_STORE(dst, val)                                   \
  do {                                                          \
    if ((val) >= MLIB_S16_MAX)                                  \
      (dst) = MLIB_S16_MAX;                                     \
    else if ((val) <= MLIB_S16_MIN)                             \
      (dst) = MLIB_S16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_s16)(val);                                  \
  } while (0)
266
267 #elif IMG_TYPE == 3
268
/*
 * CLAMP_STORE(dst, val): store val into dst, saturated to the mlib_u16
 * range [MLIB_U16_MIN, MLIB_U16_MAX].
 *
 * The expansion is wrapped in do { } while (0) so that it always behaves as
 * one statement, and both arguments are parenthesized so that an expression
 * argument (for example a conditional expression) is compared and cast as a
 * whole.  val is evaluated more than once and must not have side effects.
 */
#define CLAMP_STORE(dst, val)                                   \
  do {                                                          \
    if ((val) >= MLIB_U16_MAX)                                  \
      (dst) = MLIB_U16_MAX;                                     \
    else if ((val) <= MLIB_U16_MIN)                             \
      (dst) = MLIB_U16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_u16)(val);                                  \
  } while (0)
276
277 #endif /* IMG_TYPE == 1 */
278 #endif /* __sparc */
279
280 /***************************************************************/
/*
 * 3x3 convolution with edge extension, computed in FTYPE arithmetic.
 *
 * Parameters are those of PARAM.  Three source rows are kept as FTYPE line
 * buffers (buff0..buff2), each padded by replicating dx_l pixels on the left
 * and dx_r pixels on the right; a fourth buffer (buff3) is filled with the
 * next source row while the current output row is computed, and the buffers
 * are rotated after every row.  Only channels selected by cmask are written.
 *
 * Returns MLIB_FAILURE if the line buffers cannot be allocated, otherwise
 * MLIB_SUCCESS.
 *
 * D2I, STORE2 and FTYPE are defined outside this excerpt.
 */
281 #define KSIZE 3
282
283 mlib_status CONV_FUNC(3x3)
284 {
285 FTYPE buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
286 DEF_VARS(DTYPE);
287 DTYPE *sl2;
288 #ifndef __sparc
289 mlib_s32 d0, d1;
290 #endif /* __sparc */
291 LOAD_KERNEL3();
292 GET_SRC_DST_PARAMETERS(DTYPE);
293
/* length of one padded line: image width plus KSIZE - 1 border columns */
294 swid = wid + KSIZE1;
295
/* the on-stack array only holds lines of up to BUFF_LINE elements */
296 if (swid > BUFF_LINE) {
297 pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE )*swid);
298
299 if (pbuff == NULL) return MLIB_FAILURE;
300 }
301
/* four FTYPE lines, followed by two 32-bit integer scratch lines */
302 buff0 = pbuff;
303 buff1 = buff0 + swid;
304 buff2 = buff1 + swid;
305 buff3 = buff2 + swid;
306 buffo = (mlib_s32*)(buff3 + swid);
307 buffi = buffo + (swid &~ 1);
308
/* from here on swid is the number of pixels actually read from a source row */
309 swid -= (dx_l + dx_r);
310
311 chan1 = nchannel;
312 chan2 = chan1 + chan1;
313
314 for (c = 0; c < nchannel; c++) {
/* skip channels whose bit is not set in cmask */
315 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
316
317 sl = adr_src + c;
318 dl = adr_dst + c;
319
/* pick the first three source rows; a row pointer that is not advanced
   reuses the previous row (top/bottom edge replication) */
320 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
321 else sl1 = sl;
322
323 if ((hgt - dy_b) > 0) sl2 = sl1 + sll;
324 else sl2 = sl1;
325
/* left border: replicate the first pixel of each row dx_l times */
326 for (i = 0; i < dx_l; i++) {
327 buff0[i] = (FTYPE)sl[0];
328 buff1[i] = (FTYPE)sl1[0];
329 buff2[i] = (FTYPE)sl2[0];
330 }
331
332 #ifdef __SUNPRO_C
333 #pragma pipeloop(0)
334 #endif /* __SUNPRO_C */
/* copy the swid real pixels of this channel from each row */
335 for (i = 0; i < swid; i++) {
336 buff0[i + dx_l] = (FTYPE)sl[i*chan1];
337 buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
338 buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
339 }
340
/* right border: replicate the last copied pixel dx_r times */
341 for (i = 0; i < dx_r; i++) {
342 buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
343 buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
344 buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
345 }
346
/* sl now addresses the row that will be loaded during the first output row */
347 if ((hgt - dy_b) > 1) sl = sl2 + sll;
348 else sl = sl2;
349
350 for (j = 0; j < hgt; j++) {
351 FTYPE s0, s1;
352
353 p02 = buff0[0];
354 p12 = buff1[0];
355 p22 = buff2[0];
356
357 p03 = buff0[1];
358 p13 = buff1[1];
359 p23 = buff2[1];
360
/* partial sums for the first output pair, built from columns 0 and 1 */
361 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
362 s1 = p03 * k0 + p13 * k3 + p23 * k6;
363
364 sp = sl;
365 dp = dl;
366
367 #ifdef __SUNPRO_C
368 #pragma pipeloop(0)
369 #endif /* __SUNPRO_C */
/* main loop: two output pixels per iteration, while two pixels of the
   next source row are loaded into buffi and converted into buff3 */
370 for (i = 0; i <= (wid - 2); i += 2) {
371 #ifdef __sparc
372 #ifdef _NO_LONGLONG
373 mlib_s32 o64_1, o64_2;
374 #else /* _NO_LONGLONG */
375 mlib_s64 o64;
376 #endif /* _NO_LONGLONG */
377 #endif /* __sparc */
378 d64_2x32 dd;
379
380 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
381 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
382
383 LOAD_BUFF(buffi);
384
/* fetch both freshly loaded integers with one 64-bit read */
385 dd.d64 = *(FTYPE *)(buffi + i);
386 buff3[i + dx_l ] = (FTYPE)dd.i32s.i0;
387 buff3[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
388
389 #ifndef __sparc
390
/* complete both sums with the new columns and convert to integer */
391 d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
392 d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
393
/* start the partial sums for the next output pair */
394 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
395 s1 = p03 * k0 + p13 * k3 + p23 * k6;
396
397 dp[0 ] = FROM_S32(d0);
398 dp[chan1] = FROM_S32(d1);
399
400 #else /* __sparc */
401
/* sparc path: results go through the integer scratch line buffo */
402 dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
403 dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
404 *(FTYPE *)(buffo + i) = dd.d64;
405
406 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
407 s1 = p03 * k0 + p13 * k3 + p23 * k6;
408
409 #ifdef _NO_LONGLONG
410
411 o64_1 = buffo[i];
412 o64_2 = buffo[i+1];
413 #if IMG_TYPE != 1
414 STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
415 #else
416 STORE2(o64_1 >> 24, o64_2 >> 24);
417 #endif /* IMG_TYPE != 1 */
418
419 #else /* _NO_LONGLONG */
420
421 o64 = *(mlib_s64*)(buffo + i);
422 #if IMG_TYPE != 1
423 STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
424 #else
425 STORE2(o64 >> 56, o64 >> 24);
426 #endif /* IMG_TYPE != 1 */
427 #endif /* _NO_LONGLONG */
428 #endif /* __sparc */
429
430 sp += chan2;
431 dp += chan2;
432 }
433
/* remaining output pixel (odd width): full nine-tap sum */
434 for (; i < wid; i++) {
435 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i];
436 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
437 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
438
439 buffi[i] = (mlib_s32)sp[0];
440 buff3[i + dx_l] = (FTYPE)buffi[i];
441
442 #ifndef __sparc
443
444 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
445 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
446
447 dp[0] = FROM_S32(d0);
448
449 #else /* __sparc */
450
451 buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
452 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
453 #if IMG_TYPE != 1
454 dp[0] = FROM_S32(buffo[i]);
455 #else
456 dp[0] = buffo[i] >> 24;
457 #endif /* IMG_TYPE != 1 */
458 #endif /* __sparc */
459
460 sp += chan1;
461 dp += chan1;
462 }
463
/* load whatever is left of the next source row */
464 for (; i < swid; i++) {
465 buffi[i] = (mlib_s32)sp[0];
466 buff3[i + dx_l] = (FTYPE)buffi[i];
467 sp += chan1;
468 }
469
/* replicate the left and right borders of the newly loaded row */
470 for (i = 0; i < dx_l; i++) buff3[i] = buff3[dx_l];
471 for (i = 0; i < dx_r; i++) buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
472
/* the source row pointer stops advancing once the bottom rows are replicated */
473 if (j < hgt - dy_b - 2) sl += sll;
474 dl += dll;
475
/* rotate the line buffers: the oldest row becomes the next load target */
476 buffT = buff0;
477 buff0 = buff1;
478 buff1 = buff2;
479 buff2 = buff3;
480 buff3 = buffT;
481 }
482 }
483
484 #ifdef __sparc
485 #if IMG_TYPE == 1
486 {
/* all-ones mask covering every channel of the image */
487 mlib_s32 amask = (1 << nchannel) - 1;
488
489 if ((cmask & amask) != amask) {
490 mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
491 } else {
492 mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
493 }
494 }
495
496 #endif /* IMG_TYPE == 1 */
497 #endif /* __sparc */
498
/* release the heap buffer if one replaced the on-stack array */
499 if (pbuff != buff) mlib_free(pbuff);
500
501 return MLIB_SUCCESS;
502 }
503
504 /***************************************************************/
505 #ifndef __sparc /* for x86, using integer multiplies is faster */
506
/*
 * 3x3 convolution with edge extension, computed in 32-bit integer
 * arithmetic directly on the source rows (no line buffers).
 *
 * Parameters are those of PARAM.  Each kernel coefficient is pre-shifted
 * right by shift1 (16, or 8 when IMG_TYPE == 1) and each result is shifted
 * right by shift2 = scalef_expon - shift1 before being saturated with
 * CLAMP_STORE.  Only channels selected by cmask are written.
 * NOTE(review): shift2 is used as a shift count without a range check;
 * presumably the caller only selects this version when
 * scalef_expon >= shift1 - confirm.
 *
 * Always returns MLIB_SUCCESS.
 */
507 mlib_status CONV_FUNC_I(3x3)
508 {
509 DTYPE *adr_src, *sl, *sp0, *sp1, *sp2, *sp_1, *sp_2;
510 DTYPE *adr_dst, *dl, *dp;
511 mlib_s32 wid, hgt, sll, dll;
512 mlib_s32 nchannel, chan1, chan2, delta_chan;
513 mlib_s32 i, j, c;
514 mlib_s32 shift1, shift2;
515 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
516 mlib_s32 p02, p03,
517 p12, p13,
518 p22, p23;
519
520 #if IMG_TYPE != 1
521 shift1 = 16;
522 #else
523 shift1 = 8;
524 #endif /* IMG_TYPE != 1 */
525
526 shift2 = scalef_expon - shift1;
527
528 /* keep kernel in regs */
529 k0 = kern[0] >> shift1; k1 = kern[1] >> shift1; k2 = kern[2] >> shift1;
530 k3 = kern[3] >> shift1; k4 = kern[4] >> shift1; k5 = kern[5] >> shift1;
531 k6 = kern[6] >> shift1; k7 = kern[7] >> shift1; k8 = kern[8] >> shift1;
532
533 GET_SRC_DST_PARAMETERS(DTYPE);
534
535 chan1 = nchannel;
536 chan2 = chan1 + chan1;
/* offset from the first to the second kernel column; stays 0 when the
   second column is a replica of the first (left edge) */
537 delta_chan = 0;
538
539 if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan = chan1;
540
541 for (c = 0; c < chan1; c++) {
/* skip channels whose bit is not set in cmask */
542 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
543
544 sl = adr_src + c;
545 dl = adr_dst + c;
546
/* sp_1, sp_2 and sl address the three rows under the kernel; a pointer
   that is not advanced reuses the previous row (top/bottom replication) */
547 sp_1 = sl;
548
549 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll;
550 sp_2 = sl;
551
552 if ((hgt - dy_b) > 0) sl += sll;
553
554 for (j = 0; j < hgt; j++) {
555 mlib_s32 s0, s1;
556 mlib_s32 pix0, pix1;
557
/* slide the three-row window down by one row */
558 dp = dl;
559 sp0 = sp_1;
560 sp_1 = sp_2;
561 sp_2 = sl;
562
563 sp1 = sp_1;
564 sp2 = sp_2;
565
566 p02 = sp0[0];
567 p12 = sp1[0];
568 p22 = sp2[0];
569
570 p03 = sp0[delta_chan];
571 p13 = sp1[delta_chan];
572 p23 = sp2[delta_chan];
573
/* partial sums for the first output pair, built from columns 0 and 1 */
574 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
575 s1 = p03 * k0 + p13 * k3 + p23 * k6;
576
577 sp0 += (chan1 + delta_chan);
578 sp1 += (chan1 + delta_chan);
579 sp2 += (chan1 + delta_chan);
580
581 #ifdef __SUNPRO_C
582 #pragma pipeloop(0)
583 #endif /* __SUNPRO_C */
/* main loop: two output pixels per iteration, reading two new columns */
584 for (i = 0; i <= (wid - dx_r - 2); i += 2) {
585 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
586 p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
587
588 pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
589 pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
590 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
591
592 CLAMP_STORE(dp[0], pix0);
593 CLAMP_STORE(dp[chan1], pix1);
594
595 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
596 s1 = p03 * k0 + p13 * k3 + p23 * k6;
597
598 sp0 += chan2;
599 sp1 += chan2;
600 sp2 += chan2;
601 dp += chan2;
602 }
603
604 p02 = p03; p12 = p13; p22 = p23;
605
/* remaining columns that still have a real source pixel on the right */
606 for (; i < wid - dx_r; i++) {
607 p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0];
608 pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2;
609 CLAMP_STORE(dp[0], pix0);
610 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
611 p02 = p03; p12 = p13; p22 = p23;
612 sp0 += chan1;
613 sp1 += chan1;
614 sp2 += chan1;
615 dp += chan1;
616 }
617
/* step back to the last pixel read; the loop below re-reads it without
   advancing, which replicates it for the right edge */
618 sp0 -= chan1;
619 sp1 -= chan1;
620 sp2 -= chan1;
621
622 for (; i < wid; i++) {
623 p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0];
624 pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2;
625 CLAMP_STORE(dp[0], pix0);
626 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
627 p02 = p03; p12 = p13; p22 = p23;
628 dp += chan1;
629 }
630
/* the source row pointer stops advancing once the bottom rows are replicated */
631 if (j < hgt - dy_b - 1) sl += sll;
632 dl += dll;
633 }
634 }
635
636 return MLIB_SUCCESS;
637 }
638
639 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
640
641 /***************************************************************/
/*
 * 4x4 convolution with edge extension, computed in FTYPE arithmetic.
 *
 * Parameters are those of PARAM.  Four source rows are kept as padded FTYPE
 * line buffers (buff0..buff3); buff4 receives the next source row.  Each
 * output row is produced in two passes: the first accumulates kernel rows
 * 0-1 into buffd, the second adds kernel rows 2-3 and stores the result.
 * Only channels selected by cmask are written.
 *
 * Returns MLIB_FAILURE if the line buffers cannot be allocated, otherwise
 * MLIB_SUCCESS.
 *
 * D2I and FTYPE are defined outside this excerpt.
 */
642 #undef KSIZE
643 #define KSIZE 4
644
645 mlib_status CONV_FUNC(4x4)
646 {
647 FTYPE buff[(KSIZE + 3)*BUFF_LINE];
648 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
649 FTYPE k[KSIZE*KSIZE];
650 mlib_s32 d0, d1;
651 FTYPE k0, k1, k2, k3, k4, k5, k6, k7;
652 FTYPE p00, p01, p02, p03, p04,
653 p10, p11, p12, p13, p14,
654 p20, p21, p22, p23,
655 p30, p31, p32, p33;
656 DEF_VARS(DTYPE);
657 DTYPE *sl2, *sl3;
658 LOAD_KERNEL(KSIZE*KSIZE);
659 GET_SRC_DST_PARAMETERS(DTYPE);
660
/* length of one padded line: image width plus KSIZE - 1 border columns */
661 swid = wid + KSIZE1;
662
/* the on-stack array only holds lines of up to BUFF_LINE elements */
663 if (swid > BUFF_LINE) {
664 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE )*swid);
665
666 if (pbuff == NULL) return MLIB_FAILURE;
667 }
668
/* five row buffers, the partial-sum line buffd, then two 32-bit
   integer scratch lines */
669 buff0 = pbuff;
670 buff1 = buff0 + swid;
671 buff2 = buff1 + swid;
672 buff3 = buff2 + swid;
673 buff4 = buff3 + swid;
674 buffd = buff4 + swid;
675 buffo = (mlib_s32*)(buffd + swid);
676 buffi = buffo + (swid &~ 1);
677
/* from here on swid is the number of pixels actually read from a source row */
678 swid -= (dx_l + dx_r);
679
680 chan1 = nchannel;
681 chan2 = chan1 + chan1;
682
683 for (c = 0; c < nchannel; c++) {
/* skip channels whose bit is not set in cmask */
684 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
685
686 sl = adr_src + c;
687 dl = adr_dst + c;
688
/* pick the first four source rows; a row pointer that is not advanced
   reuses the previous row (top/bottom edge replication) */
689 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
690 else sl1 = sl;
691
692 if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
693 else sl2 = sl1;
694
695 if ((hgt - dy_b) > 0) sl3 = sl2 + sll;
696 else sl3 = sl2;
697
/* left border: replicate the first pixel of each row dx_l times */
698 for (i = 0; i < dx_l; i++) {
699 buff0[i] = (FTYPE)sl[0];
700 buff1[i] = (FTYPE)sl1[0];
701 buff2[i] = (FTYPE)sl2[0];
702 buff3[i] = (FTYPE)sl3[0];
703 }
704
705 #ifdef __SUNPRO_C
706 #pragma pipeloop(0)
707 #endif /* __SUNPRO_C */
/* copy the swid real pixels of this channel from each row */
708 for (i = 0; i < swid; i++) {
709 buff0[i + dx_l] = (FTYPE)sl[i*chan1];
710 buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
711 buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
712 buff3[i + dx_l] = (FTYPE)sl3[i*chan1];
713 }
714
/* right border: replicate the last copied pixel dx_r times */
715 for (i = 0; i < dx_r; i++) {
716 buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
717 buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
718 buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
719 buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
720 }
721
/* sl now addresses the row that will be loaded during the first output row */
722 if ((hgt - dy_b) > 1) sl = sl3 + sll;
723 else sl = sl3;
724
725 for (j = 0; j < hgt; j++) {
726 d64_2x32 dd;
727
728 /*
729 * First loop on two first lines of kernel
730 */
731 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
732 k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
733
734 sp = sl;
735 dp = dl;
736
737 p02 = buff0[0];
738 p12 = buff1[0];
739 p03 = buff0[1];
740 p13 = buff1[1];
741 p04 = buff0[2];
742
743 #ifdef __SUNPRO_C
744 #pragma pipeloop(0)
745 #endif /* __SUNPRO_C */
/* two partial sums per iteration go to buffd, while two pixels of the
   next source row are loaded into buffi and converted into buff4 */
746 for (i = 0; i <= (wid - 2); i += 2) {
747 p00 = p02; p10 = p12;
748 p01 = p03; p11 = p13;
749 p02 = p04; p12 = buff1[i + 2];
750 p03 = buff0[i + 3]; p13 = buff1[i + 3];
751 p04 = buff0[i + 4]; p14 = buff1[i + 4];
752
753 LOAD_BUFF(buffi);
754
755 dd.d64 = *(FTYPE *)(buffi + i);
756 buff4[i + dx_l ] = (FTYPE)dd.i32s.i0;
757 buff4[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
758
759 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
760 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
761 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
762 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
763
764 sp += chan2;
765 }
766
767 /*
768 * Second loop on two last lines of kernel
769 */
770 k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
771 k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
772
773 p02 = buff2[0];
774 p12 = buff3[0];
775 p03 = buff2[1];
776 p13 = buff3[1];
777 p04 = buff2[2];
778
779 #ifdef __SUNPRO_C
780 #pragma pipeloop(0)
781 #endif /* __SUNPRO_C */
/* add kernel rows 2-3 to the stored partial sums and write two pixels */
782 for (i = 0; i <= (wid - 2); i += 2) {
783 p00 = p02; p10 = p12;
784 p01 = p03; p11 = p13;
785 p02 = p04; p12 = buff3[i + 2];
786 p03 = buff2[i + 3]; p13 = buff3[i + 3];
787 p04 = buff2[i + 4]; p14 = buff3[i + 4];
788
789 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
790 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
791 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
792 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
793
794 dp[0 ] = FROM_S32(d0);
795 dp[chan1] = FROM_S32(d1);
796
797 dp += chan2;
798 }
799
800 /* last pixels */
/* odd width: one full sixteen-tap sum, and one more pixel of the next row */
801 for (; i < wid; i++) {
802 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
803 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
804 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
805 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
806
807 buff4[i + dx_l] = (FTYPE)sp[0];
808
809 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
810 p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
811 p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
812 p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
813
814 dp[0] = FROM_S32(buffo[i]);
815
816 sp += chan1;
817 dp += chan1;
818 }
819
/* load whatever is left of the next source row */
820 for (; i < swid; i++) {
821 buff4[i + dx_l] = (FTYPE)sp[0];
822 sp += chan1;
823 }
824
/* replicate the left and right borders of the newly loaded row */
825 for (i = 0; i < dx_l; i++) buff4[i] = buff4[dx_l];
826 for (i = 0; i < dx_r; i++) buff4[swid + dx_l + i] = buff4[swid + dx_l - 1];
827
828 /* next line */
829
/* the source row pointer stops advancing once the bottom rows are replicated */
830 if (j < hgt - dy_b - 2) sl += sll;
831 dl += dll;
832
/* rotate the line buffers: the oldest row becomes the next load target */
833 buffT = buff0;
834 buff0 = buff1;
835 buff1 = buff2;
836 buff2 = buff3;
837 buff3 = buff4;
838 buff4 = buffT;
839 }
840 }
841
/* release the heap buffer if one replaced the on-stack array */
842 if (pbuff != buff) mlib_free(pbuff);
843
844 return MLIB_SUCCESS;
845 }
846
847 /***************************************************************/
/*
 * 5x5 convolution with edge extension, computed in FTYPE arithmetic.
 *
 * Parameters are those of PARAM.  Five source rows are kept as padded FTYPE
 * line buffers (buff0..buff4); buff5 receives the next source row.  Each
 * output row is produced in three passes over buffd: kernel rows 0-1, then
 * rows 2-3, then row 4 together with the final conversion and store.
 * Only channels selected by cmask are written.
 *
 * Returns MLIB_FAILURE if the line buffers cannot be allocated, otherwise
 * MLIB_SUCCESS.
 *
 * D2I and FTYPE are defined outside this excerpt.
 */
848 #undef KSIZE
849 #define KSIZE 5
850
851 mlib_status CONV_FUNC(5x5)
852 {
853 FTYPE buff[(KSIZE + 3)*BUFF_LINE];
854 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
855 FTYPE k[KSIZE*KSIZE];
856 mlib_s32 d0, d1;
857 FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
858 FTYPE p00, p01, p02, p03, p04, p05,
859 p10, p11, p12, p13, p14, p15,
860 p20, p21, p22, p23, p24,
861 p30, p31, p32, p33, p34,
862 p40, p41, p42, p43, p44;
863 DEF_VARS(DTYPE);
864 DTYPE *sl2, *sl3, *sl4;
865 LOAD_KERNEL(KSIZE*KSIZE);
866 GET_SRC_DST_PARAMETERS(DTYPE);
867
/* length of one padded line: image width plus KSIZE - 1 border columns */
868 swid = wid + KSIZE1;
869
/* the on-stack array only holds lines of up to BUFF_LINE elements */
870 if (swid > BUFF_LINE) {
871 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE )*swid);
872
873 if (pbuff == NULL) return MLIB_FAILURE;
874 }
875
/* six row buffers, the partial-sum line buffd, then two 32-bit
   integer scratch lines */
876 buff0 = pbuff;
877 buff1 = buff0 + swid;
878 buff2 = buff1 + swid;
879 buff3 = buff2 + swid;
880 buff4 = buff3 + swid;
881 buff5 = buff4 + swid;
882 buffd = buff5 + swid;
883 buffo = (mlib_s32*)(buffd + swid);
884 buffi = buffo + (swid &~ 1);
885
/* from here on swid is the number of pixels actually read from a source row */
886 swid -= (dx_l + dx_r);
887
888 chan1 = nchannel;
889 chan2 = chan1 + chan1;
890
891 for (c = 0; c < nchannel; c++) {
/* skip channels whose bit is not set in cmask */
892 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
893
894 sl = adr_src + c;
895 dl = adr_dst + c;
896
/* pick the first five source rows; a row pointer that is not advanced
   reuses the previous row (top/bottom edge replication) */
897 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
898 else sl1 = sl;
899
900 if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
901 else sl2 = sl1;
902
903 if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll;
904 else sl3 = sl2;
905
906 if ((hgt - dy_b) > 0) sl4 = sl3 + sll;
907 else sl4 = sl3;
908
/* left border: replicate the first pixel of each row dx_l times */
909 for (i = 0; i < dx_l; i++) {
910 buff0[i] = (FTYPE)sl[0];
911 buff1[i] = (FTYPE)sl1[0];
912 buff2[i] = (FTYPE)sl2[0];
913 buff3[i] = (FTYPE)sl3[0];
914 buff4[i] = (FTYPE)sl4[0];
915 }
916
917 #ifdef __SUNPRO_C
918 #pragma pipeloop(0)
919 #endif /* __SUNPRO_C */
/* copy the swid real pixels of this channel from each row */
920 for (i = 0; i < swid; i++) {
921 buff0[i + dx_l] = (FTYPE)sl[i*chan1];
922 buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
923 buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
924 buff3[i + dx_l] = (FTYPE)sl3[i*chan1];
925 buff4[i + dx_l] = (FTYPE)sl4[i*chan1];
926 }
927
/* right border: replicate the last copied pixel dx_r times */
928 for (i = 0; i < dx_r; i++) {
929 buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
930 buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
931 buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
932 buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
933 buff4[swid + dx_l + i] = buff4[swid + dx_l - 1];
934 }
935
/* sl now addresses the row that will be loaded during the first output row */
936 if ((hgt - dy_b) > 1) sl = sl4 + sll;
937 else sl = sl4;
938
939 for (j = 0; j < hgt; j++) {
940 d64_2x32 dd;
941
942 /*
943 * First loop: kernel rows 0-1 over buff0/buff1
944 */
945 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
946 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
947
948 sp = sl;
949 dp = dl;
950
951 p02 = buff0[0];
952 p12 = buff1[0];
953 p03 = buff0[1];
954 p13 = buff1[1];
955 p04 = buff0[2];
956 p14 = buff1[2];
957
958 #ifdef __SUNPRO_C
959 #pragma pipeloop(0)
960 #endif /* __SUNPRO_C */
/* two partial sums per iteration go to buffd; two pixels of the next
   source row are loaded into buffi (converted in the second loop) */
961 for (i = 0; i <= (wid - 2); i += 2) {
962 p00 = p02; p10 = p12;
963 p01 = p03; p11 = p13;
964 p02 = p04; p12 = p14;
965
966 LOAD_BUFF(buffi);
967
968 p03 = buff0[i + 3]; p13 = buff1[i + 3];
969 p04 = buff0[i + 4]; p14 = buff1[i + 4];
970 p05 = buff0[i + 5]; p15 = buff1[i + 5];
971
972 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
973 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
974 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
975 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
976
977 sp += chan2;
978 }
979
980 /*
981 * Second loop: kernel rows 2-3 over buff2/buff3
982 */
983 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
984 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
985
986 p02 = buff2[0];
987 p12 = buff3[0];
988 p03 = buff2[1];
989 p13 = buff3[1];
990
991 #ifdef __SUNPRO_C
992 #pragma pipeloop(0)
993 #endif /* __SUNPRO_C */
/* accumulate into buffd and convert the integers loaded by the first
   loop into the FTYPE row buffer buff5 */
994 for (i = 0; i <= (wid - 2); i += 2) {
995 p00 = p02; p10 = p12;
996 p01 = p03; p11 = p13;
997
998 p02 = buff2[i + 2]; p12 = buff3[i + 2];
999 p03 = buff2[i + 3]; p13 = buff3[i + 3];
1000 p04 = buff2[i + 4]; p14 = buff3[i + 4];
1001 p05 = buff2[i + 5]; p15 = buff3[i + 5];
1002
1003 dd.d64 = *(FTYPE *)(buffi + i);
1004 buff5[i + dx_l ] = (FTYPE)dd.i32s.i0;
1005 buff5[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
1006
1007 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1008 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1009 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1010 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1011 }
1012
1013 /*
1014 * Third loop: kernel row 4 over buff4, then convert and store
1015 */
1016 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1017
1018 p02 = buff4[0];
1019 p03 = buff4[1];
1020 p04 = buff4[2];
1021 p05 = buff4[3];
1022
1023 #ifdef __SUNPRO_C
1024 #pragma pipeloop(0)
1025 #endif /* __SUNPRO_C */
1026 for (i = 0; i <= (wid - 2); i += 2) {
1027 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1028
1029 p04 = buff4[i + 4]; p05 = buff4[i + 5];
1030
1031 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
1032 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
1033
1034 dp[0 ] = FROM_S32(d0);
1035 dp[chan1] = FROM_S32(d1);
1036
1037 dp += chan2;
1038 }
1039
1040 /* last pixels */
/* odd width: one full 25-tap sum, and one more pixel of the next row */
1041 for (; i < wid; i++) {
1042 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
1043 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
1044 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
1045 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
1046 p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
1047
1048 p40 = buff4[i]; p41 = buff4[i + 1]; p42 = buff4[i + 2];
1049 p43 = buff4[i + 3]; p44 = buff4[i + 4];
1050
1051 buff5[i + dx_l] = (FTYPE)sp[0];
1052
1053 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
1054 p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
1055 p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
1056 p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
1057 p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
1058
1059 dp[0] = FROM_S32(buffo[i]);
1060
1061 sp += chan1;
1062 dp += chan1;
1063 }
1064
/* load whatever is left of the next source row */
1065 for (; i < swid; i++) {
1066 buff5[i + dx_l] = (FTYPE)sp[0];
1067 sp += chan1;
1068 }
1069
/* replicate the left and right borders of the newly loaded row */
1070 for (i = 0; i < dx_l; i++) buff5[i] = buff5[dx_l];
1071 for (i = 0; i < dx_r; i++) buff5[swid + dx_l + i] = buff5[swid + dx_l - 1];
1072
1073 /* next line */
1074
/* the source row pointer stops advancing once the bottom rows are replicated */
1075 if (j < hgt - dy_b - 2) sl += sll;
1076 dl += dll;
1077
/* rotate the line buffers: the oldest row becomes the next load target */
1078 buffT = buff0;
1079 buff0 = buff1;
1080 buff1 = buff2;
1081 buff2 = buff3;
1082 buff3 = buff4;
1083 buff4 = buff5;
1084 buff5 = buffT;
1085 }
1086 }
1087
/* release the heap buffer if one replaced the on-stack array */
1088 if (pbuff != buff) mlib_free(pbuff);
1089
1090 return MLIB_SUCCESS;
1091 }
1092
1093 /***************************************************************/
1094 #ifndef __sparc /* for x86, using integer multiplies is faster */
1095
/*
 * 5x5 convolution with edge extension, computed in 32-bit integer
 * arithmetic directly on the source rows.
 *
 * Parameters are those of PARAM.  Each kernel coefficient is pre-shifted
 * right by shift1 (16, or 8 when IMG_TYPE == 1) and each result is shifted
 * right by shift2 = scalef_expon - shift1 before being saturated with
 * CLAMP_STORE.  Each output row is produced in three passes over the
 * per-row accumulator buffd (wid entries): kernel rows 0-1, rows 2-3, then
 * row 4 together with the final shift and store.  Only channels selected by
 * cmask are written.
 * NOTE(review): shift2 is used as a shift count without a range check;
 * presumably the caller only selects this version when
 * scalef_expon >= shift1 - confirm.
 *
 * Returns MLIB_FAILURE if the accumulator cannot be allocated, otherwise
 * MLIB_SUCCESS.
 */
1096 mlib_status CONV_FUNC_I(5x5)
1097 {
1098 mlib_s32 buff[BUFF_LINE];
1099 mlib_s32 *buffd;
1100 mlib_s32 k[KSIZE*KSIZE];
1101 mlib_s32 shift1, shift2;
1102 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
1103 mlib_s32 p00, p01, p02, p03, p04, p05,
1104 p10, p11, p12, p13, p14, p15;
1105 DTYPE *adr_src, *sl, *sp0, *sp1, *sp2, *sp3, *sp4;
1106 DTYPE *sp_1, *sp_2, *sp_3, *sp_4;
1107 DTYPE *adr_dst, *dl, *dp;
1108 mlib_s32 *pbuff = buff;
1109 mlib_s32 wid, hgt, sll, dll;
1110 mlib_s32 nchannel, chan1, chan2, chan4;
1111 mlib_s32 delta_chan1, delta_chan2, delta_chan3;
1112 mlib_s32 i, j, c;
1113
1114 #if IMG_TYPE != 1
1115 shift1 = 16;
1116 #else
1117 shift1 = 8;
1118 #endif /* IMG_TYPE != 1 */
1119
1120 shift2 = scalef_expon - shift1;
1121
1122 for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;
1123
1124 GET_SRC_DST_PARAMETERS(DTYPE);
1125
/* the on-stack accumulator only holds rows of up to BUFF_LINE pixels */
1126 if (wid > BUFF_LINE) {
1127 pbuff = mlib_malloc(sizeof(mlib_s32)*wid);
1128
1129 if (pbuff == NULL) return MLIB_FAILURE;
1130 }
1131
1132 buffd = pbuff;
1133
1134 chan1 = nchannel;
1135 chan2 = chan1 + chan1;
1136
/* element offsets of kernel columns 1..3 relative to column 0; an offset
   does not grow when its column is a replica of the previous one */
1137 if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan1 = chan1;
1138 else delta_chan1 = 0;
1139
1140 if ((2 > dx_l) && (2 < wid + KSIZE1 - dx_r)) delta_chan2 = delta_chan1 + chan1;
1141 else delta_chan2 = delta_chan1;
1142
1143 if ((3 > dx_l) && (3 < wid + KSIZE1 - dx_r)) delta_chan3 = delta_chan2 + chan1;
1144 else delta_chan3 = delta_chan2;
1145
/* offset of the first pixel read inside the main loops */
1146 chan4 = chan1 + delta_chan3;
1147
1148 for (c = 0; c < chan1; c++) {
/* skip channels whose bit is not set in cmask */
1149 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1150
1151 sl = adr_src + c;
1152 dl = adr_dst + c;
1153
/* sp_1..sp_4 and sl address the five rows under the kernel; a pointer
   that is not advanced reuses the previous row (top/bottom replication) */
1154 sp_1 = sl;
1155
1156 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll;
1157 sp_2 = sl;
1158
1159 if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl += sll;
1160 sp_3 = sl;
1161
1162 if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl += sll;
1163 sp_4 = sl;
1164
1165 if ((hgt - dy_b) > 0) sl += sll;
1166
1167 for (j = 0; j < hgt; j++) {
1168 mlib_s32 pix0, pix1;
1169
/* slide the five-row window down by one row */
1170 dp = dl;
1171 sp0 = sp_1;
1172 sp_1 = sp_2;
1173 sp_2 = sp_3;
1174 sp_3 = sp_4;
1175 sp_4 = sl;
1176
1177 sp1 = sp_1;
1178 sp2 = sp_2;
1179 sp3 = sp_3;
1180 sp4 = sp_4;
1181
1182 /*
1183 * First loop: kernel rows 0-1 over sp0/sp1, results stored to buffd
1184 */
1185
1186 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
1187 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
1188
1189 p02 = sp0[0]; p12 = sp1[0];
1190 p03 = sp0[delta_chan1]; p13 = sp1[delta_chan1];
1191 p04 = sp0[delta_chan2]; p14 = sp1[delta_chan2];
1192 p05 = sp0[delta_chan3]; p15 = sp1[delta_chan3];
1193
1194 sp0 += chan4;
1195 sp1 += chan4;
1196
1197 #ifdef __SUNPRO_C
1198 #pragma pipeloop(0)
1199 #endif /* __SUNPRO_C */
/* two partial sums per iteration, reading two new columns */
1200 for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1201 p00 = p02; p10 = p12;
1202 p01 = p03; p11 = p13;
1203 p02 = p04; p12 = p14;
1204 p03 = p05; p13 = p15;
1205
1206 p04 = sp0[0]; p14 = sp1[0];
1207 p05 = sp0[chan1]; p15 = sp1[chan1];
1208
1209 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1210 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1211 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1212 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1213
1214 sp0 += chan2;
1215 sp1 += chan2;
1216 }
1217
/* realign the sliding window for the one-pixel-per-iteration loops */
1218 p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1219 p11 = p12; p12 = p13; p13 = p14; p14 = p15;
1220
/* remaining columns that still have a real source pixel on the right */
1221 for (; i < wid - dx_r; i++) {
1222 p00 = p01; p10 = p11;
1223 p01 = p02; p11 = p12;
1224 p02 = p03; p12 = p13;
1225 p03 = p04; p13 = p14;
1226
1227 p04 = sp0[0]; p14 = sp1[0];
1228
1229 buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1230 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1231
1232 sp0 += chan1;
1233 sp1 += chan1;
1234 }
1235
/* step back to the last pixel read; the loop below re-reads it without
   advancing, which replicates it for the right edge */
1236 sp0 -= chan1;
1237 sp1 -= chan1;
1238
1239 for (; i < wid; i++) {
1240 p00 = p01; p10 = p11;
1241 p01 = p02; p11 = p12;
1242 p02 = p03; p12 = p13;
1243 p03 = p04; p13 = p14;
1244
1245 p04 = sp0[0]; p14 = sp1[0];
1246
1247 buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1248 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1249 }
1250
1251 /*
1252 * Second loop: kernel rows 2-3 over sp2/sp3, accumulated into buffd
1253 */
1254
1255 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
1256 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
1257
1258 p02 = sp2[0]; p12 = sp3[0];
1259 p03 = sp2[delta_chan1]; p13 = sp3[delta_chan1];
1260 p04 = sp2[delta_chan2]; p14 = sp3[delta_chan2];
1261 p05 = sp2[delta_chan3]; p15 = sp3[delta_chan3];
1262
1263 sp2 += chan4;
1264 sp3 += chan4;
1265
1266 #ifdef __SUNPRO_C
1267 #pragma pipeloop(0)
1268 #endif /* __SUNPRO_C */
1269 for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1270 p00 = p02; p10 = p12;
1271 p01 = p03; p11 = p13;
1272 p02 = p04; p12 = p14;
1273 p03 = p05; p13 = p15;
1274
1275 p04 = sp2[0]; p14 = sp3[0];
1276 p05 = sp2[chan1]; p15 = sp3[chan1];
1277
1278 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1279 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1280 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1281 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1282
1283 sp2 += chan2;
1284 sp3 += chan2;
1285 }
1286
1287 p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1288 p11 = p12; p12 = p13; p13 = p14; p14 = p15;
1289
1290 for (; i < wid - dx_r; i++) {
1291 p00 = p01; p10 = p11;
1292 p01 = p02; p11 = p12;
1293 p02 = p03; p12 = p13;
1294 p03 = p04; p13 = p14;
1295
1296 p04 = sp2[0]; p14 = sp3[0];
1297
1298 buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1299 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1300
1301 sp2 += chan1;
1302 sp3 += chan1;
1303 }
1304
/* right-edge replication, as in the first loop */
1305 sp2 -= chan1;
1306 sp3 -= chan1;
1307
1308 for (; i < wid; i++) {
1309 p00 = p01; p10 = p11;
1310 p01 = p02; p11 = p12;
1311 p02 = p03; p12 = p13;
1312 p03 = p04; p13 = p14;
1313
1314 p04 = sp2[0]; p14 = sp3[0];
1315
1316 buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1317 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1318 }
1319
1320 /*
1321 * Third loop: kernel row 4 over sp4, then shift, clamp and store
1322 */
1323
1324 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1325
1326 p02 = sp4[0];
1327 p03 = sp4[delta_chan1];
1328 p04 = sp4[delta_chan2];
1329 p05 = sp4[delta_chan3];
1330
1331 sp4 += chan4;
1332
1333 #ifdef __SUNPRO_C
1334 #pragma pipeloop(0)
1335 #endif /* __SUNPRO_C */
1336 for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1337 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1338
1339 p04 = sp4[0]; p05 = sp4[chan1];
1340
1341 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1342 p03 * k3 + p04 * k4) >> shift2;
1343 pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
1344 p04 * k3 + p05 * k4) >> shift2;
1345
1346 CLAMP_STORE(dp[0], pix0);
1347 CLAMP_STORE(dp[chan1], pix1);
1348
1349 dp += chan2;
1350 sp4 += chan2;
1351 }
1352
1353 p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1354
1355 for (; i < wid - dx_r; i++) {
1356 p00 = p01; p01 = p02; p02 = p03; p03 = p04;
1357
1358 p04 = sp4[0];
1359
1360 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1361 p03 * k3 + p04 * k4) >> shift2;
1362 CLAMP_STORE(dp[0], pix0);
1363
1364 dp += chan1;
1365 sp4 += chan1;
1366 }
1367
/* right-edge replication, as in the first loop */
1368 sp4 -= chan1;
1369
1370 for (; i < wid; i++) {
1371 p00 = p01; p01 = p02; p02 = p03; p03 = p04;
1372
1373 p04 = sp4[0];
1374
1375 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1376 p03 * k3 + p04 * k4) >> shift2;
1377 CLAMP_STORE(dp[0], pix0);
1378
1379 dp += chan1;
1380 }
1381
1382 /* next line */
1383
/* the source row pointer stops advancing once the bottom rows are replicated */
1384 if (j < hgt - dy_b - 1) sl += sll;
1385 dl += dll;
1386 }
1387 }
1388
/* release the heap buffer if one replaced the on-stack array */
1389 if (pbuff != buff) mlib_free(pbuff);
1390
1391 return MLIB_SUCCESS;
1392 }
1393
1394 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
1395
1396 /***************************************************************/
1397 #if IMG_TYPE == 1
1398
1399 #undef KSIZE
1400 #define KSIZE 7
1401
1402 mlib_status CONV_FUNC(7x7)
1403 {
1404 FTYPE buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
1405 FTYPE k[KSIZE*KSIZE];
1406 mlib_s32 l, m, buff_ind;
1407 mlib_s32 d0, d1;
1408 FTYPE k0, k1, k2, k3, k4, k5, k6;
1409 FTYPE p0, p1, p2, p3, p4, p5, p6, p7;
1410 DTYPE *sl2, *sl3, *sl4, *sl5, *sl6;
1411 DEF_VARS(DTYPE);
1412 LOAD_KERNEL(KSIZE*KSIZE);
1413 GET_SRC_DST_PARAMETERS(DTYPE);
1414
1415 swid = wid + KSIZE1;
1416
1417 if (wid > BUFF_LINE) {
1418 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE )*wid);
1419
1420 if (pbuff == NULL) return MLIB_FAILURE;
1421 }
1422
1423 for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*swid;
1424 for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
1425 buffd = buffs[KSIZE] + swid;
1426 buffo = (mlib_s32*)(buffd + swid);
1427 buffi = buffo + (swid &~ 1);
1428
1429 swid -= (dx_l + dx_r);
1430
1431 chan1 = nchannel;
1432 chan2 = chan1 + chan1;
1433
1434 for (c = 0; c < nchannel; c++) {
1435 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1436
1437 sl = adr_src + c;
1438 dl = adr_dst + c;
1439
1440 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
1441 else sl1 = sl;
1442
1443 if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
1444 else sl2 = sl1;
1445
1446 if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll;
1447 else sl3 = sl2;
1448
1449 if ((4 > dy_t) && (4 < hgt + KSIZE1 - dy_b)) sl4 = sl3 + sll;
1450 else sl4 = sl3;
1451
1452 if ((5 > dy_t) && (5 < hgt + KSIZE1 - dy_b)) sl5 = sl4 + sll;
1453 else sl5 = sl4;
1454
1455 if ((hgt - dy_b) > 0) sl6 = sl5 + sll;
1456 else sl6 = sl5;
1457
1458 for (i = 0; i < dx_l; i++) {
1459 buffs[0][i] = (FTYPE)sl[0];
1460 buffs[1][i] = (FTYPE)sl1[0];
1461 buffs[2][i] = (FTYPE)sl2[0];
1462 buffs[3][i] = (FTYPE)sl3[0];
1463 buffs[4][i] = (FTYPE)sl4[0];
1464 buffs[5][i] = (FTYPE)sl5[0];
1465 buffs[6][i] = (FTYPE)sl6[0];
1466 }
1467
1468 #ifdef __SUNPRO_C
1469 #pragma pipeloop(0)
1470 #endif /* __SUNPRO_C */
1471 for (i = 0; i < swid; i++) {
1472 buffs[0][i + dx_l] = (FTYPE)sl[i*chan1];
1473 buffs[1][i + dx_l] = (FTYPE)sl1[i*chan1];
1474 buffs[2][i + dx_l] = (FTYPE)sl2[i*chan1];
1475 buffs[3][i + dx_l] = (FTYPE)sl3[i*chan1];
1476 buffs[4][i + dx_l] = (FTYPE)sl4[i*chan1];
1477 buffs[5][i + dx_l] = (FTYPE)sl5[i*chan1];
1478 buffs[6][i + dx_l] = (FTYPE)sl6[i*chan1];
1479 }
1480
1481 for (i = 0; i < dx_r; i++) {
1482 buffs[0][swid + dx_l + i] = buffs[0][swid + dx_l - 1];
1483 buffs[1][swid + dx_l + i] = buffs[1][swid + dx_l - 1];
1484 buffs[2][swid + dx_l + i] = buffs[2][swid + dx_l - 1];
1485 buffs[3][swid + dx_l + i] = buffs[3][swid + dx_l - 1];
1486 buffs[4][swid + dx_l + i] = buffs[4][swid + dx_l - 1];
1487 buffs[5][swid + dx_l + i] = buffs[5][swid + dx_l - 1];
1488 buffs[6][swid + dx_l + i] = buffs[6][swid + dx_l - 1];
1489 }
1490
1491 buff_ind = 0;
1492
1493 #ifdef __SUNPRO_C
1494 #pragma pipeloop(0)
1495 #endif /* __SUNPRO_C */
1496 for (i = 0; i < wid; i++) buffd[i] = 0.0;
1497
1498 if ((hgt - dy_b) > 1) sl = sl6 + sll;
1499 else sl = sl6;
1500
1501 for (j = 0; j < hgt; j++) {
1502 FTYPE **buffc = buffs + buff_ind;
1503 FTYPE *buffn = buffc[KSIZE];
1504 FTYPE *pk = k;
1505
1506 for (l = 0; l < KSIZE; l++) {
1507 FTYPE *buff = buffc[l];
1508 d64_2x32 dd;
1509
1510 sp = sl;
1511 dp = dl;
1512
1513 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1514 p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1515
1516 k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
1517 k4 = *pk++; k5 = *pk++; k6 = *pk++;
1518
1519 if (l < (KSIZE - 1)) {
1520 #ifdef __SUNPRO_C
1521 #pragma pipeloop(0)
1522 #endif /* __SUNPRO_C */
1523 for (i = 0; i <= (wid - 2); i += 2) {
1524 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1525
1526 p6 = buff[i + 6]; p7 = buff[i + 7];
1527
1528 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1529 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1530 }
1531
1532 } else {
1533 #ifdef __SUNPRO_C
1534 #pragma pipeloop(0)
1535 #endif /* __SUNPRO_C */
1536 for (i = 0; i <= (wid - 2); i += 2) {
1537 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1538
1539 p6 = buff[i + 6]; p7 = buff[i + 7];
1540
1541 LOAD_BUFF(buffi);
1542
1543 dd.d64 = *(FTYPE *)(buffi + i);
1544 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0;
1545 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
1546
1547 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
1548 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1549
1550 dp[0 ] = FROM_S32(d0);
1551 dp[chan1] = FROM_S32(d1);
1552
1553 buffd[i ] = 0.0;
1554 buffd[i + 1] = 0.0;
1555
1556 sp += chan2;
1557 dp += chan2;
1558 }
1559 }
1560 }
1561
1562 /* last pixels */
1563 for (; i < wid; i++) {
1564 FTYPE *pk = k, s = 0;
1565 mlib_s32 d0;
1566
1567 for (l = 0; l < KSIZE; l++) {
1568 FTYPE *buff = buffc[l] + i;
1569
1570 for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
1571 }
1572
1573 d0 = D2I(s);
1574 dp[0] = FROM_S32(d0);
1575
1576 buffn[i + dx_l] = (FTYPE)sp[0];
1577
1578 sp += chan1;
1579 dp += chan1;
1580 }
1581
1582 for (; i < swid; i++) {
1583 buffn[i + dx_l] = (FTYPE)sp[0];
1584 sp += chan1;
1585 }
1586
1587 for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
1588 for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
1589
1590 /* next line */
1591
1592 if (j < hgt - dy_b - 2) sl += sll;
1593 dl += dll;
1594
1595 buff_ind++;
1596
1597 if (buff_ind >= KSIZE + 1) buff_ind = 0;
1598 }
1599 }
1600
1601 if (pbuff != buff) mlib_free(pbuff);
1602
1603 return MLIB_SUCCESS;
1604 }
1605
1606 #endif /* IMG_TYPE == 1 */
1607
1608 /***************************************************************/
1609 #define MAX_KER 7
1610 #define MAX_N 15
/* Length, in FTYPE elements, of the on-stack scratch array `buff` declared in mlib_ImageConv1xN_ext. */
1611 #define BUFF_SIZE 1600
1612 #define CACHE_SIZE (64*1024)
1613
1614 static mlib_status mlib_ImageConv1xN_ext(mlib_image *dst,
1615 const mlib_image *src,
1616 const mlib_d64 *k,
1617 mlib_s32 n,
1618 mlib_s32 dy_t,
1619 mlib_s32 dy_b,
1620 mlib_s32 cmask)
1621 {
1622 DTYPE *adr_src, *sl;
1623 DTYPE *adr_dst, *dl, *dp;
1624 FTYPE buff[BUFF_SIZE];
1625 FTYPE *buffd;
1626 FTYPE *pbuff = buff;
|
63 #define DSCALE 65536.0
64 #define FROM_S32(x) ((x) >> 16)
65 #define S64TOS32(x) ((x) & 0xffffffff)
66 #define SAT_OFF
67
68 #elif IMG_TYPE == 3
69
70 #define DTYPE mlib_u16
71 #define CONV_FUNC(KERN) mlib_conv##KERN##ext_u16(PARAM)
72 #define CONV_FUNC_MxN mlib_convMxNext_u16(PARAM_MxN)
73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
74 #define CONV_FUNC_MxN_I mlib_i_convMxNext_u16(PARAM_MxN)
75 #define DSCALE 65536.0
76 #define FROM_S32(x) (((x) >> 16) ^ 0x8000)
77 #define S64TOS32(x) (x)
78 #define SAT_OFF -(1u << 31)
79
80 #endif /* IMG_TYPE == 1 */
81
82 /***************************************************************/
/*
 * Formal parameter list that CONV_FUNC(KERN) and CONV_FUNC_I(KERN) paste
 * between the parentheses of the generated function header.
 */
83 #define PARAM \
84 mlib_image *dst, \
85 const mlib_image *src, \
86 mlib_s32 dx_l, \
87 mlib_s32 dx_r, \
88 mlib_s32 dy_t, \
89 mlib_s32 dy_b, \
90 const mlib_s32 *kern, \
91 mlib_s32 scalef_expon, \
92 mlib_s32 cmask
93
94 /***************************************************************/
95 #define PARAM_MxN \
96 mlib_image *dst, \
97 const mlib_image *src, \
98 const mlib_s32 *kernel, \
99 mlib_s32 m, \
100 mlib_s32 n, \
101 mlib_s32 dx_l, \
102 mlib_s32 dx_r, \
143 #define LOAD_BUFF(buff) \
144 buff[i ] = sp[0]; \
145 buff[i + 1] = sp[chan1]
146
147 #else /* _NO_LONGLONG */
148
149 #ifdef _LITTLE_ENDIAN
150
151 #define LOAD_BUFF(buff) \
152 *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
153
154 #else /* _LITTLE_ENDIAN */
155
156 #define LOAD_BUFF(buff) \
157 *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
158
159 #endif /* _LITTLE_ENDIAN */
160 #endif /* _NO_LONGLONG */
161
162 /***************************************************************/
/*
 * Overlays one mlib_d64 with a pair of mlib_s32 fields (i32s.i0, i32s.i1),
 * so a value stored through d64 can be read back as two 32-bit integers.
 */
163 typedef union {
164 mlib_d64 d64;
165 struct {
166 mlib_s32 i0;
167 mlib_s32 i1;
168 } i32s;
169 } d64_2x32;
170
171 /***************************************************************/
/*
 * Loads image geometry into locals that the enclosing function must already
 * have declared: hgt, wid and nchannel are read from src; sll and dll are
 * the src/dst strides divided by sizeof(type), i.e. strides counted in
 * elements of `type` (assumes mlib_ImageGetStride reports bytes -- TODO
 * confirm); adr_src and adr_dst are the data pointers cast to `type *`.
 * Expands to several assignment statements; the caller supplies the final
 * semicolon.
 */
172 #define GET_SRC_DST_PARAMETERS(type) \
173 hgt = mlib_ImageGetHeight(src); \
174 wid = mlib_ImageGetWidth(src); \
175 nchannel = mlib_ImageGetChannels(src); \
176 sll = mlib_ImageGetStride(src) / sizeof(type); \
177 dll = mlib_ImageGetStride(dst) / sizeof(type); \
178 adr_src = (type *)mlib_ImageGetData(src); \
179 adr_dst = (type *)mlib_ImageGetData(dst)
180
181 /***************************************************************/
182 #ifndef __sparc
183 #if IMG_TYPE == 1
184
185 /*
186 * Test for the presence of any "1" bit in bits
187 8 to 31 of val. If present, then val is either
188 negative or >255. If over/underflows of 8 bits
189 are uncommon, then this technique can be a win,
190 since only a single test, rather than two, is
191 necessary to determine if clamping is needed.
/*
 * Stores val into dst, saturated to [MLIB_S16_MIN, MLIB_S16_MAX] and cast
 * to mlib_s16.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement, and both arguments are parenthesized so that expression
 * arguments (e.g. a conditional or a sum) are compared and cast as a whole.
 * Note that val may be evaluated up to three times and dst once; pass
 * side-effect-free expressions.
 */
#define CLAMP_STORE(dst, val)                                   \
  do {                                                          \
    if ((val) >= MLIB_S16_MAX)                                  \
      (dst) = MLIB_S16_MAX;                                     \
    else if ((val) <= MLIB_S16_MIN)                             \
      (dst) = MLIB_S16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_s16)(val);                                  \
  } while (0)
214
215 #elif IMG_TYPE == 3
216
/*
 * Stores val into dst, saturated to [MLIB_U16_MIN, MLIB_U16_MAX] and cast
 * to mlib_u16.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement, and both arguments are parenthesized so that expression
 * arguments (e.g. a conditional or a sum) are compared and cast as a whole.
 * Note that val may be evaluated up to three times and dst once; pass
 * side-effect-free expressions.
 */
#define CLAMP_STORE(dst, val)                                   \
  do {                                                          \
    if ((val) >= MLIB_U16_MAX)                                  \
      (dst) = MLIB_U16_MAX;                                     \
    else if ((val) <= MLIB_U16_MIN)                             \
      (dst) = MLIB_U16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_u16)(val);                                  \
  } while (0)
224
225 #endif /* IMG_TYPE == 1 */
226 #endif /* __sparc */
227
228 /***************************************************************/
229 #define MAX_KER 7
230 #define MAX_N 15
/* Length, in FTYPE elements, of the on-stack scratch array `buff` declared in mlib_ImageConv1xN_ext. */
231 #define BUFF_SIZE 1600
232 #define CACHE_SIZE (64*1024)
233
234 static mlib_status mlib_ImageConv1xN_ext(mlib_image *dst,
235 const mlib_image *src,
236 const mlib_d64 *k,
237 mlib_s32 n,
238 mlib_s32 dy_t,
239 mlib_s32 dy_b,
240 mlib_s32 cmask)
241 {
242 DTYPE *adr_src, *sl;
243 DTYPE *adr_dst, *dl, *dp;
244 FTYPE buff[BUFF_SIZE];
245 FTYPE *buffd;
246 FTYPE *pbuff = buff;
|