1 /* 2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 27 /* 28 * FUNCTION 29 * Internal functions for mlib_ImageConv* on U8/S16/U16 type and 30 * MLIB_EDGE_SRC_EXTEND mask 31 */ 32 33 #include "mlib_image.h" 34 #include "mlib_ImageConv.h" 35 #include "mlib_c_ImageConv.h" 36 37 /* 38 * This define switches between functions of different data types 39 */ 40 41 #define IMG_TYPE 3 42 43 /***************************************************************/ 44 #if IMG_TYPE == 1 45 46 #define DTYPE mlib_u8 47 #define CONV_FUNC(KERN) mlib_c_conv##KERN##ext_u8(PARAM) 48 #define CONV_FUNC_MxN mlib_c_convMxNext_u8(PARAM_MxN) 49 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM) 50 #define CONV_FUNC_MxN_I mlib_i_convMxNext_u8(PARAM_MxN) 51 #define DSCALE (1 << 24) 52 #define FROM_S32(x) (((x) >> 24) ^ 128) 53 #define S64TOS32(x) (x) 54 #define SAT_OFF -(1u << 31) 55 56 #elif IMG_TYPE == 2 57 58 #define DTYPE mlib_s16 59 #define CONV_FUNC(KERN) mlib_conv##KERN##ext_s16(PARAM) 60 #define CONV_FUNC_MxN mlib_convMxNext_s16(PARAM_MxN) 61 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM) 62 #define CONV_FUNC_MxN_I mlib_i_convMxNext_s16(PARAM_MxN) 63 #define DSCALE 65536.0 64 #define FROM_S32(x) ((x) >> 16) 65 #define S64TOS32(x) ((x) & 0xffffffff) 66 #define SAT_OFF 67 68 #elif IMG_TYPE == 3 69 70 #define DTYPE mlib_u16 71 #define CONV_FUNC(KERN) mlib_conv##KERN##ext_u16(PARAM) 72 #define CONV_FUNC_MxN mlib_convMxNext_u16(PARAM_MxN) 73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM) 74 #define CONV_FUNC_MxN_I mlib_i_convMxNext_u16(PARAM_MxN) 75 #define DSCALE 65536.0 76 #define FROM_S32(x) (((x) >> 16) ^ 0x8000) 77 #define S64TOS32(x) (x) 78 #define SAT_OFF -(1u << 31) 79 80 #endif /* IMG_TYPE == 1 */ 81 82 /***************************************************************/ 83 #define KSIZE1 (KSIZE - 1) 84 85 /***************************************************************/ 86 #define PARAM \ 87 mlib_image *dst, \ 88 const mlib_image *src, \ 89 mlib_s32 dx_l, \ 90 mlib_s32 dx_r, \ 91 mlib_s32 dy_t, \ 92 mlib_s32 dy_b, \ 93 const mlib_s32 *kern, \ 94 mlib_s32 scalef_expon, \ 95 mlib_s32 cmask 96 97 /***************************************************************/ 98 #define PARAM_MxN \ 99 mlib_image *dst, \ 100 const mlib_image *src, \ 101 const mlib_s32 *kernel, \ 102 mlib_s32 m, \ 103 mlib_s32 n, \ 104 mlib_s32 dx_l, \ 105 mlib_s32 dx_r, \ 106 mlib_s32 dy_t, \ 107 mlib_s32 dy_b, \ 108 mlib_s32 scale, \ 109 mlib_s32 cmask 110 111 /***************************************************************/ 112 #define FTYPE mlib_d64 113 114 #ifndef MLIB_USE_FTOI_CLAMPING 115 116 #define CLAMP_S32(x) \ 117 (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x))) 118 119 #else 120 121 #define CLAMP_S32(x) ((mlib_s32)(x)) 122 123 #endif /* MLIB_USE_FTOI_CLAMPING */ 124 125 /***************************************************************/ 126 #define D2I(x) CLAMP_S32((x) SAT_OFF) 127 128 /***************************************************************/ 129 #ifdef _LITTLE_ENDIAN 130 131 #define STORE2(res0, res1) \ 132 dp[0 ] = res1; \ 133 dp[chan1] = res0 134 135 #else 136 137 #define STORE2(res0, res1) \ 138 dp[0 ] = res0; \ 139 dp[chan1] = res1 140 141 #endif /* _LITTLE_ENDIAN */ 142 143 /***************************************************************/ 144 #ifdef _NO_LONGLONG 145 146 #define LOAD_BUFF(buff) \ 147 buff[i ] = sp[0]; \ 148 buff[i + 1] = sp[chan1] 149 150 #else /* _NO_LONGLONG */ 151 152 #ifdef _LITTLE_ENDIAN 153 154 #define LOAD_BUFF(buff) \ 155 *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0]) 156 157 #else /* _LITTLE_ENDIAN */ 158 159 #define LOAD_BUFF(buff) \ 160 *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1]) 161 162 #endif /* _LITTLE_ENDIAN */ 163 #endif /* _NO_LONGLONG */ 164 165 /***************************************************************/ 166 #define MLIB_D2_24 16777216.0f 167 168 /***************************************************************/ 169 typedef union { 170 mlib_d64 d64; 171 struct { 172 mlib_s32 i0; 173 mlib_s32 i1; 174 } i32s; 175 } d64_2x32; 176 177 /***************************************************************/ 178 #define BUFF_LINE 256 179 180 /***************************************************************/ 181 #define DEF_VARS(type) \ 182 type *adr_src, *sl, *sp, *sl1; \ 183 type *adr_dst, *dl, *dp; \ 184 FTYPE *pbuff = buff; \ 185 mlib_s32 *buffi, *buffo; \ 186 mlib_s32 wid, hgt, sll, dll; \ 187 mlib_s32 nchannel, chan1, chan2; \ 188 mlib_s32 i, j, c, swid 189 190 /***************************************************************/ 191 #define LOAD_KERNEL3() \ 192 FTYPE scalef = DSCALE; \ 193 FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8; \ 194 FTYPE p00, p01, p02, p03, \ 195 p10, p11, p12, p13, \ 196 p20, p21, p22, p23; \ 197 \ 198 while (scalef_expon > 30) { \ 199 scalef /= (1 << 30); \ 200 scalef_expon -= 30; \ 201 } \ 202 \ 203 scalef /= (1 << scalef_expon); \ 204 \ 205 /* keep kernel in regs */ \ 206 k0 = scalef * kern[0]; k1 = scalef * kern[1]; k2 = scalef * kern[2]; \ 207 k3 = scalef * kern[3]; k4 = scalef * kern[4]; k5 = scalef * kern[5]; \ 208 k6 = scalef * kern[6]; k7 = scalef * kern[7]; k8 = scalef * kern[8] 209 210 /***************************************************************/ 211 #define LOAD_KERNEL(SIZE) \ 212 FTYPE scalef = DSCALE; \ 213 \ 214 while (scalef_expon > 30) { \ 215 scalef /= (1 << 30); \ 216 scalef_expon -= 30; \ 217 } \ 218 \ 219 scalef /= (1 << scalef_expon); \ 220 \ 221 for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j] 222 223 /***************************************************************/ 224 #define GET_SRC_DST_PARAMETERS(type) \ 225 hgt = mlib_ImageGetHeight(src); \ 226 wid = mlib_ImageGetWidth(src); \ 227 nchannel = mlib_ImageGetChannels(src); \ 228 sll = mlib_ImageGetStride(src) / sizeof(type); \ 229 dll = mlib_ImageGetStride(dst) / sizeof(type); \ 230 adr_src = (type *)mlib_ImageGetData(src); \ 231 adr_dst = (type *)mlib_ImageGetData(dst) 232 233 /***************************************************************/ 234 #ifndef __sparc 235 #if IMG_TYPE == 1 236 237 /* 238 * Test for the presence of any "1" bit in bits 239 8 to 31 of val. If present, then val is either 240 negative or >255. If over/underflows of 8 bits 241 are uncommon, then this technique can be a win, 242 since only a single test, rather than two, is 243 necessary to determine if clamping is needed. 244 On the other hand, if over/underflows are common, 245 it adds an extra test. 246 */ 247 #define CLAMP_STORE(dst, val) \ 248 if (val & 0xffffff00) { \ 249 if (val < MLIB_U8_MIN) \ 250 dst = MLIB_U8_MIN; \ 251 else \ 252 dst = MLIB_U8_MAX; \ 253 } else { \ 254 dst = (mlib_u8)val; \ 255 } 256 257 #elif IMG_TYPE == 2 258 259 #define CLAMP_STORE(dst, val) \ 260 if (val >= MLIB_S16_MAX) \ 261 dst = MLIB_S16_MAX; \ 262 else if (val <= MLIB_S16_MIN) \ 263 dst = MLIB_S16_MIN; \ 264 else \ 265 dst = (mlib_s16)val 266 267 #elif IMG_TYPE == 3 268 269 #define CLAMP_STORE(dst, val) \ 270 if (val >= MLIB_U16_MAX) \ 271 dst = MLIB_U16_MAX; \ 272 else if (val <= MLIB_U16_MIN) \ 273 dst = MLIB_U16_MIN; \ 274 else \ 275 dst = (mlib_u16)val 276 277 #endif /* IMG_TYPE == 1 */ 278 #endif /* __sparc */ 279 280 /***************************************************************/ 281 #define KSIZE 3 282 283 mlib_status CONV_FUNC(3x3) 284 { 285 FTYPE buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT; 286 DEF_VARS(DTYPE); 287 DTYPE *sl2; 288 #ifndef __sparc 289 mlib_s32 d0, d1; 290 #endif /* __sparc */ 291 LOAD_KERNEL3(); 292 GET_SRC_DST_PARAMETERS(DTYPE); 293 294 swid = wid + KSIZE1; 295 296 if (swid > BUFF_LINE) { 297 pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE )*swid); 298 299 if (pbuff == NULL) return MLIB_FAILURE; 300 } 301 302 buff0 = pbuff; 303 buff1 = buff0 + swid; 304 buff2 = buff1 + swid; 305 buff3 = buff2 + swid; 306 buffo = (mlib_s32*)(buff3 + swid); 307 buffi = buffo + (swid &~ 1); 308 309 swid -= (dx_l + dx_r); 310 311 chan1 = nchannel; 312 chan2 = chan1 + chan1; 313 314 for (c = 0; c < nchannel; c++) { 315 if (!(cmask & (1 << (nchannel - 1 - c)))) continue; 316 317 sl = adr_src + c; 318 dl = adr_dst + c; 319 320 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll; 321 else sl1 = sl; 322 323 if ((hgt - dy_b) > 0) sl2 = sl1 + sll; 324 else sl2 = sl1; 325 326 for (i = 0; i < dx_l; i++) { 327 buff0[i] = (FTYPE)sl[0]; 328 buff1[i] = (FTYPE)sl1[0]; 329 buff2[i] = (FTYPE)sl2[0]; 330 } 331 332 #ifdef __SUNPRO_C 333 #pragma pipeloop(0) 334 #endif /* __SUNPRO_C */ 335 for (i = 0; i < swid; i++) { 336 buff0[i + dx_l] = (FTYPE)sl[i*chan1]; 337 buff1[i + dx_l] = (FTYPE)sl1[i*chan1]; 338 buff2[i + dx_l] = (FTYPE)sl2[i*chan1]; 339 } 340 341 for (i = 0; i < dx_r; i++) { 342 buff0[swid + dx_l + i] = buff0[swid + dx_l - 1]; 343 buff1[swid + dx_l + i] = buff1[swid + dx_l - 1]; 344 buff2[swid + dx_l + i] = buff2[swid + dx_l - 1]; 345 } 346 347 if ((hgt - dy_b) > 1) sl = sl2 + sll; 348 else sl = sl2; 349 350 for (j = 0; j < hgt; j++) { 351 FTYPE s0, s1; 352 353 p02 = buff0[0]; 354 p12 = buff1[0]; 355 p22 = buff2[0]; 356 357 p03 = buff0[1]; 358 p13 = buff1[1]; 359 p23 = buff2[1]; 360 361 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7; 362 s1 = p03 * k0 + p13 * k3 + p23 * k6; 363 364 sp = sl; 365 dp = dl; 366 367 #ifdef __SUNPRO_C 368 #pragma pipeloop(0) 369 #endif /* __SUNPRO_C */ 370 for (i = 0; i <= (wid - 2); i += 2) { 371 #ifdef __sparc 372 #ifdef _NO_LONGLONG 373 mlib_s32 o64_1, o64_2; 374 #else /* _NO_LONGLONG */ 375 mlib_s64 o64; 376 #endif /* _NO_LONGLONG */ 377 #endif /* __sparc */ 378 d64_2x32 dd; 379 380 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; 381 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; 382 383 LOAD_BUFF(buffi); 384 385 dd.d64 = *(FTYPE *)(buffi + i); 386 buff3[i + dx_l ] = (FTYPE)dd.i32s.i0; 387 buff3[i + dx_l + 1] = (FTYPE)dd.i32s.i1; 388 389 #ifndef __sparc 390 391 d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8); 392 d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8); 393 394 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7; 395 s1 = p03 * k0 + p13 * k3 + p23 * k6; 396 397 dp[0 ] = FROM_S32(d0); 398 dp[chan1] = FROM_S32(d1); 399 400 #else /* __sparc */ 401 402 dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8); 403 dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8); 404 *(FTYPE *)(buffo + i) = dd.d64; 405 406 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7; 407 s1 = p03 * k0 + p13 * k3 + p23 * k6; 408 409 #ifdef _NO_LONGLONG 410 411 o64_1 = buffo[i]; 412 o64_2 = buffo[i+1]; 413 #if IMG_TYPE != 1 414 STORE2(FROM_S32(o64_1), FROM_S32(o64_2)); 415 #else 416 STORE2(o64_1 >> 24, o64_2 >> 24); 417 #endif /* IMG_TYPE != 1 */ 418 419 #else /* _NO_LONGLONG */ 420 421 o64 = *(mlib_s64*)(buffo + i); 422 #if IMG_TYPE != 1 423 STORE2(FROM_S32(o64 >> 32), FROM_S32(o64)); 424 #else 425 STORE2(o64 >> 56, o64 >> 24); 426 #endif /* IMG_TYPE != 1 */ 427 #endif /* _NO_LONGLONG */ 428 #endif /* __sparc */ 429 430 sp += chan2; 431 dp += chan2; 432 } 433 434 for (; i < wid; i++) { 435 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; 436 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; 437 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; 438 439 buffi[i] = (mlib_s32)sp[0]; 440 buff3[i + dx_l] = (FTYPE)buffi[i]; 441 442 #ifndef __sparc 443 444 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 + 445 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8); 446 447 dp[0] = FROM_S32(d0); 448 449 #else /* __sparc */ 450 451 buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 + 452 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8); 453 #if IMG_TYPE != 1 454 dp[0] = FROM_S32(buffo[i]); 455 #else 456 dp[0] = buffo[i] >> 24; 457 #endif /* IMG_TYPE != 1 */ 458 #endif /* __sparc */ 459 460 sp += chan1; 461 dp += chan1; 462 } 463 464 for (; i < swid; i++) { 465 buffi[i] = (mlib_s32)sp[0]; 466 buff3[i + dx_l] = (FTYPE)buffi[i]; 467 sp += chan1; 468 } 469 470 for (i = 0; i < dx_l; i++) buff3[i] = buff3[dx_l]; 471 for (i = 0; i < dx_r; i++) buff3[swid + dx_l + i] = buff3[swid + dx_l - 1]; 472 473 if (j < hgt - dy_b - 2) sl += sll; 474 dl += dll; 475 476 buffT = buff0; 477 buff0 = buff1; 478 buff1 = buff2; 479 buff2 = buff3; 480 buff3 = buffT; 481 } 482 } 483 484 #ifdef __sparc 485 #if IMG_TYPE == 1 486 { 487 mlib_s32 amask = (1 << nchannel) - 1; 488 489 if ((cmask & amask) != amask) { 490 mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask); 491 } else { 492 mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll); 493 } 494 } 495 496 #endif /* IMG_TYPE == 1 */ 497 #endif /* __sparc */ 498 499 if (pbuff != buff) mlib_free(pbuff); 500 501 return MLIB_SUCCESS; 502 } 503 504 /***************************************************************/ 505 #ifndef __sparc /* for x86, using integer multiplies is faster */ 506 507 mlib_status CONV_FUNC_I(3x3) 508 { 509 DTYPE *adr_src, *sl, *sp0, *sp1, *sp2, *sp_1, *sp_2; 510 DTYPE *adr_dst, *dl, *dp; 511 mlib_s32 wid, hgt, sll, dll; 512 mlib_s32 nchannel, chan1, chan2, delta_chan; 513 mlib_s32 i, j, c; 514 mlib_s32 shift1, shift2; 515 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8; 516 mlib_s32 p02, p03, 517 p12, p13, 518 p22, p23; 519 520 #if IMG_TYPE != 1 521 shift1 = 16; 522 #else 523 shift1 = 8; 524 #endif /* IMG_TYPE != 1 */ 525 526 shift2 = scalef_expon - shift1; 527 528 /* keep kernel in regs */ 529 k0 = kern[0] >> shift1; k1 = kern[1] >> shift1; k2 = kern[2] >> shift1; 530 k3 = kern[3] >> shift1; k4 = kern[4] >> shift1; k5 = kern[5] >> shift1; 531 k6 = kern[6] >> shift1; k7 = kern[7] >> shift1; k8 = kern[8] >> shift1; 532 533 GET_SRC_DST_PARAMETERS(DTYPE); 534 535 chan1 = nchannel; 536 chan2 = chan1 + chan1; 537 delta_chan = 0; 538 539 if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan = chan1; 540 541 for (c = 0; c < chan1; c++) { 542 if (!(cmask & (1 << (chan1 - 1 - c)))) continue; 543 544 sl = adr_src + c; 545 dl = adr_dst + c; 546 547 sp_1 = sl; 548 549 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll; 550 sp_2 = sl; 551 552 if ((hgt - dy_b) > 0) sl += sll; 553 554 for (j = 0; j < hgt; j++) { 555 mlib_s32 s0, s1; 556 mlib_s32 pix0, pix1; 557 558 dp = dl; 559 sp0 = sp_1; 560 sp_1 = sp_2; 561 sp_2 = sl; 562 563 sp1 = sp_1; 564 sp2 = sp_2; 565 566 p02 = sp0[0]; 567 p12 = sp1[0]; 568 p22 = sp2[0]; 569 570 p03 = sp0[delta_chan]; 571 p13 = sp1[delta_chan]; 572 p23 = sp2[delta_chan]; 573 574 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7; 575 s1 = p03 * k0 + p13 * k3 + p23 * k6; 576 577 sp0 += (chan1 + delta_chan); 578 sp1 += (chan1 + delta_chan); 579 sp2 += (chan1 + delta_chan); 580 581 #ifdef __SUNPRO_C 582 #pragma pipeloop(0) 583 #endif /* __SUNPRO_C */ 584 for (i = 0; i <= (wid - dx_r - 2); i += 2) { 585 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0]; 586 p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1]; 587 588 pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2; 589 pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 + 590 p13 * k5 + p22 * k7 + p23 * k8) >> shift2; 591 592 CLAMP_STORE(dp[0], pix0); 593 CLAMP_STORE(dp[chan1], pix1); 594 595 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7; 596 s1 = p03 * k0 + p13 * k3 + p23 * k6; 597 598 sp0 += chan2; 599 sp1 += chan2; 600 sp2 += chan2; 601 dp += chan2; 602 } 603 604 p02 = p03; p12 = p13; p22 = p23; 605 606 for (; i < wid - dx_r; i++) { 607 p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0]; 608 pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2; 609 CLAMP_STORE(dp[0], pix0); 610 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7; 611 p02 = p03; p12 = p13; p22 = p23; 612 sp0 += chan1; 613 sp1 += chan1; 614 sp2 += chan1; 615 dp += chan1; 616 } 617 618 sp0 -= chan1; 619 sp1 -= chan1; 620 sp2 -= chan1; 621 622 for (; i < wid; i++) { 623 p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0]; 624 pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2; 625 CLAMP_STORE(dp[0], pix0); 626 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7; 627 p02 = p03; p12 = p13; p22 = p23; 628 dp += chan1; 629 } 630 631 if (j < hgt - dy_b - 1) sl += sll; 632 dl += dll; 633 } 634 } 635 636 return MLIB_SUCCESS; 637 } 638 639 #endif /* __sparc ( for x86, using integer multiplies is faster ) */ 640 641 /***************************************************************/ 642 #undef KSIZE 643 #define KSIZE 4 644 645 mlib_status CONV_FUNC(4x4) 646 { 647 FTYPE buff[(KSIZE + 3)*BUFF_LINE]; 648 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT; 649 FTYPE k[KSIZE*KSIZE]; 650 mlib_s32 d0, d1; 651 FTYPE k0, k1, k2, k3, k4, k5, k6, k7; 652 FTYPE p00, p01, p02, p03, p04, 653 p10, p11, p12, p13, p14, 654 p20, p21, p22, p23, 655 p30, p31, p32, p33; 656 DEF_VARS(DTYPE); 657 DTYPE *sl2, *sl3; 658 LOAD_KERNEL(KSIZE*KSIZE); 659 GET_SRC_DST_PARAMETERS(DTYPE); 660 661 swid = wid + KSIZE1; 662 663 if (swid > BUFF_LINE) { 664 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE )*swid); 665 666 if (pbuff == NULL) return MLIB_FAILURE; 667 } 668 669 buff0 = pbuff; 670 buff1 = buff0 + swid; 671 buff2 = buff1 + swid; 672 buff3 = buff2 + swid; 673 buff4 = buff3 + swid; 674 buffd = buff4 + swid; 675 buffo = (mlib_s32*)(buffd + swid); 676 buffi = buffo + (swid &~ 1); 677 678 swid -= (dx_l + dx_r); 679 680 chan1 = nchannel; 681 chan2 = chan1 + chan1; 682 683 for (c = 0; c < nchannel; c++) { 684 if (!(cmask & (1 << (nchannel - 1 - c)))) continue; 685 686 sl = adr_src + c; 687 dl = adr_dst + c; 688 689 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll; 690 else sl1 = sl; 691 692 if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll; 693 else sl2 = sl1; 694 695 if ((hgt - dy_b) > 0) sl3 = sl2 + sll; 696 else sl3 = sl2; 697 698 for (i = 0; i < dx_l; i++) { 699 buff0[i] = (FTYPE)sl[0]; 700 buff1[i] = (FTYPE)sl1[0]; 701 buff2[i] = (FTYPE)sl2[0]; 702 buff3[i] = (FTYPE)sl3[0]; 703 } 704 705 #ifdef __SUNPRO_C 706 #pragma pipeloop(0) 707 #endif /* __SUNPRO_C */ 708 for (i = 0; i < swid; i++) { 709 buff0[i + dx_l] = (FTYPE)sl[i*chan1]; 710 buff1[i + dx_l] = (FTYPE)sl1[i*chan1]; 711 buff2[i + dx_l] = (FTYPE)sl2[i*chan1]; 712 buff3[i + dx_l] = (FTYPE)sl3[i*chan1]; 713 } 714 715 for (i = 0; i < dx_r; i++) { 716 buff0[swid + dx_l + i] = buff0[swid + dx_l - 1]; 717 buff1[swid + dx_l + i] = buff1[swid + dx_l - 1]; 718 buff2[swid + dx_l + i] = buff2[swid + dx_l - 1]; 719 buff3[swid + dx_l + i] = buff3[swid + dx_l - 1]; 720 } 721 722 if ((hgt - dy_b) > 1) sl = sl3 + sll; 723 else sl = sl3; 724 725 for (j = 0; j < hgt; j++) { 726 d64_2x32 dd; 727 728 /* 729 * First loop on two first lines of kernel 730 */ 731 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; 732 k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7]; 733 734 sp = sl; 735 dp = dl; 736 737 p02 = buff0[0]; 738 p12 = buff1[0]; 739 p03 = buff0[1]; 740 p13 = buff1[1]; 741 p04 = buff0[2]; 742 743 #ifdef __SUNPRO_C 744 #pragma pipeloop(0) 745 #endif /* __SUNPRO_C */ 746 for (i = 0; i <= (wid - 2); i += 2) { 747 p00 = p02; p10 = p12; 748 p01 = p03; p11 = p13; 749 p02 = p04; p12 = buff1[i + 2]; 750 p03 = buff0[i + 3]; p13 = buff1[i + 3]; 751 p04 = buff0[i + 4]; p14 = buff1[i + 4]; 752 753 LOAD_BUFF(buffi); 754 755 dd.d64 = *(FTYPE *)(buffi + i); 756 buff4[i + dx_l ] = (FTYPE)dd.i32s.i0; 757 buff4[i + dx_l + 1] = (FTYPE)dd.i32s.i1; 758 759 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + 760 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7); 761 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + 762 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7); 763 764 sp += chan2; 765 } 766 767 /* 768 * Second loop on two last lines of kernel 769 */ 770 k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11]; 771 k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15]; 772 773 p02 = buff2[0]; 774 p12 = buff3[0]; 775 p03 = buff2[1]; 776 p13 = buff3[1]; 777 p04 = buff2[2]; 778 779 #ifdef __SUNPRO_C 780 #pragma pipeloop(0) 781 #endif /* __SUNPRO_C */ 782 for (i = 0; i <= (wid - 2); i += 2) { 783 p00 = p02; p10 = p12; 784 p01 = p03; p11 = p13; 785 p02 = p04; p12 = buff3[i + 2]; 786 p03 = buff2[i + 3]; p13 = buff3[i + 3]; 787 p04 = buff2[i + 4]; p14 = buff3[i + 4]; 788 789 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + 790 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]); 791 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + 792 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]); 793 794 dp[0 ] = FROM_S32(d0); 795 dp[chan1] = FROM_S32(d1); 796 797 dp += chan2; 798 } 799 800 /* last pixels */ 801 for (; i < wid; i++) { 802 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i]; 803 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1]; 804 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2]; 805 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3]; 806 807 buff4[i + dx_l] = (FTYPE)sp[0]; 808 809 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + 810 p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] + 811 p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] + 812 p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]); 813 814 dp[0] = FROM_S32(buffo[i]); 815 816 sp += chan1; 817 dp += chan1; 818 } 819 820 for (; i < swid; i++) { 821 buff4[i + dx_l] = (FTYPE)sp[0]; 822 sp += chan1; 823 } 824 825 for (i = 0; i < dx_l; i++) buff4[i] = buff4[dx_l]; 826 for (i = 0; i < dx_r; i++) buff4[swid + dx_l + i] = buff4[swid + dx_l - 1]; 827 828 /* next line */ 829 830 if (j < hgt - dy_b - 2) sl += sll; 831 dl += dll; 832 833 buffT = buff0; 834 buff0 = buff1; 835 buff1 = buff2; 836 buff2 = buff3; 837 buff3 = buff4; 838 buff4 = buffT; 839 } 840 } 841 842 if (pbuff != buff) mlib_free(pbuff); 843 844 return MLIB_SUCCESS; 845 } 846 847 /***************************************************************/ 848 #undef KSIZE 849 #define KSIZE 5 850 851 mlib_status CONV_FUNC(5x5) 852 { 853 FTYPE buff[(KSIZE + 3)*BUFF_LINE]; 854 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT; 855 FTYPE k[KSIZE*KSIZE]; 856 mlib_s32 d0, d1; 857 FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; 858 FTYPE p00, p01, p02, p03, p04, p05, 859 p10, p11, p12, p13, p14, p15, 860 p20, p21, p22, p23, p24, 861 p30, p31, p32, p33, p34, 862 p40, p41, p42, p43, p44; 863 DEF_VARS(DTYPE); 864 DTYPE *sl2, *sl3, *sl4; 865 LOAD_KERNEL(KSIZE*KSIZE); 866 GET_SRC_DST_PARAMETERS(DTYPE); 867 868 swid = wid + KSIZE1; 869 870 if (swid > BUFF_LINE) { 871 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE )*swid); 872 873 if (pbuff == NULL) return MLIB_FAILURE; 874 } 875 876 buff0 = pbuff; 877 buff1 = buff0 + swid; 878 buff2 = buff1 + swid; 879 buff3 = buff2 + swid; 880 buff4 = buff3 + swid; 881 buff5 = buff4 + swid; 882 buffd = buff5 + swid; 883 buffo = (mlib_s32*)(buffd + swid); 884 buffi = buffo + (swid &~ 1); 885 886 swid -= (dx_l + dx_r); 887 888 chan1 = nchannel; 889 chan2 = chan1 + chan1; 890 891 for (c = 0; c < nchannel; c++) { 892 if (!(cmask & (1 << (nchannel - 1 - c)))) continue; 893 894 sl = adr_src + c; 895 dl = adr_dst + c; 896 897 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll; 898 else sl1 = sl; 899 900 if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll; 901 else sl2 = sl1; 902 903 if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll; 904 else sl3 = sl2; 905 906 if ((hgt - dy_b) > 0) sl4 = sl3 + sll; 907 else sl4 = sl3; 908 909 for (i = 0; i < dx_l; i++) { 910 buff0[i] = (FTYPE)sl[0]; 911 buff1[i] = (FTYPE)sl1[0]; 912 buff2[i] = (FTYPE)sl2[0]; 913 buff3[i] = (FTYPE)sl3[0]; 914 buff4[i] = (FTYPE)sl4[0]; 915 } 916 917 #ifdef __SUNPRO_C 918 #pragma pipeloop(0) 919 #endif /* __SUNPRO_C */ 920 for (i = 0; i < swid; i++) { 921 buff0[i + dx_l] = (FTYPE)sl[i*chan1]; 922 buff1[i + dx_l] = (FTYPE)sl1[i*chan1]; 923 buff2[i + dx_l] = (FTYPE)sl2[i*chan1]; 924 buff3[i + dx_l] = (FTYPE)sl3[i*chan1]; 925 buff4[i + dx_l] = (FTYPE)sl4[i*chan1]; 926 } 927 928 for (i = 0; i < dx_r; i++) { 929 buff0[swid + dx_l + i] = buff0[swid + dx_l - 1]; 930 buff1[swid + dx_l + i] = buff1[swid + dx_l - 1]; 931 buff2[swid + dx_l + i] = buff2[swid + dx_l - 1]; 932 buff3[swid + dx_l + i] = buff3[swid + dx_l - 1]; 933 buff4[swid + dx_l + i] = buff4[swid + dx_l - 1]; 934 } 935 936 if ((hgt - dy_b) > 1) sl = sl4 + sll; 937 else sl = sl4; 938 939 for (j = 0; j < hgt; j++) { 940 d64_2x32 dd; 941 942 /* 943 * First loop 944 */ 945 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4]; 946 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9]; 947 948 sp = sl; 949 dp = dl; 950 951 p02 = buff0[0]; 952 p12 = buff1[0]; 953 p03 = buff0[1]; 954 p13 = buff1[1]; 955 p04 = buff0[2]; 956 p14 = buff1[2]; 957 958 #ifdef __SUNPRO_C 959 #pragma pipeloop(0) 960 #endif /* __SUNPRO_C */ 961 for (i = 0; i <= (wid - 2); i += 2) { 962 p00 = p02; p10 = p12; 963 p01 = p03; p11 = p13; 964 p02 = p04; p12 = p14; 965 966 LOAD_BUFF(buffi); 967 968 p03 = buff0[i + 3]; p13 = buff1[i + 3]; 969 p04 = buff0[i + 4]; p14 = buff1[i + 4]; 970 p05 = buff0[i + 5]; p15 = buff1[i + 5]; 971 972 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + 973 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9); 974 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + 975 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9); 976 977 sp += chan2; 978 } 979 980 /* 981 * Second loop 982 */ 983 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14]; 984 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19]; 985 986 p02 = buff2[0]; 987 p12 = buff3[0]; 988 p03 = buff2[1]; 989 p13 = buff3[1]; 990 991 #ifdef __SUNPRO_C 992 #pragma pipeloop(0) 993 #endif /* __SUNPRO_C */ 994 for (i = 0; i <= (wid - 2); i += 2) { 995 p00 = p02; p10 = p12; 996 p01 = p03; p11 = p13; 997 998 p02 = buff2[i + 2]; p12 = buff3[i + 2]; 999 p03 = buff2[i + 3]; p13 = buff3[i + 3]; 1000 p04 = buff2[i + 4]; p14 = buff3[i + 4]; 1001 p05 = buff2[i + 5]; p15 = buff3[i + 5]; 1002 1003 dd.d64 = *(FTYPE *)(buffi + i); 1004 buff5[i + dx_l ] = (FTYPE)dd.i32s.i0; 1005 buff5[i + dx_l + 1] = (FTYPE)dd.i32s.i1; 1006 1007 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + 1008 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9); 1009 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + 1010 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9); 1011 } 1012 1013 /* 1014 * 3 loop 1015 */ 1016 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24]; 1017 1018 p02 = buff4[0]; 1019 p03 = buff4[1]; 1020 p04 = buff4[2]; 1021 p05 = buff4[3]; 1022 1023 #ifdef __SUNPRO_C 1024 #pragma pipeloop(0) 1025 #endif /* __SUNPRO_C */ 1026 for (i = 0; i <= (wid - 2); i += 2) { 1027 p00 = p02; p01 = p03; p02 = p04; p03 = p05; 1028 1029 p04 = buff4[i + 4]; p05 = buff4[i + 5]; 1030 1031 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]); 1032 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]); 1033 1034 dp[0 ] = FROM_S32(d0); 1035 dp[chan1] = FROM_S32(d1); 1036 1037 dp += chan2; 1038 } 1039 1040 /* last pixels */ 1041 for (; i < wid; i++) { 1042 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i]; 1043 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1]; 1044 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2]; 1045 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3]; 1046 p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4]; 1047 1048 p40 = buff4[i]; p41 = buff4[i + 1]; p42 = buff4[i + 2]; 1049 p43 = buff4[i + 3]; p44 = buff4[i + 4]; 1050 1051 buff5[i + dx_l] = (FTYPE)sp[0]; 1052 1053 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] + 1054 p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] + 1055 p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] + 1056 p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] + 1057 p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]); 1058 1059 dp[0] = FROM_S32(buffo[i]); 1060 1061 sp += chan1; 1062 dp += chan1; 1063 } 1064 1065 for (; i < swid; i++) { 1066 buff5[i + dx_l] = (FTYPE)sp[0]; 1067 sp += chan1; 1068 } 1069 1070 for (i = 0; i < dx_l; i++) buff5[i] = buff5[dx_l]; 1071 for (i = 0; i < dx_r; i++) buff5[swid + dx_l + i] = buff5[swid + dx_l - 1]; 1072 1073 /* next line */ 1074 1075 if (j < hgt - dy_b - 2) sl += sll; 1076 dl += dll; 1077 1078 buffT = buff0; 1079 buff0 = buff1; 1080 buff1 = buff2; 1081 buff2 = buff3; 1082 buff3 = buff4; 1083 buff4 = buff5; 1084 buff5 = buffT; 1085 } 1086 } 1087 1088 if (pbuff != buff) mlib_free(pbuff); 1089 1090 return MLIB_SUCCESS; 1091 } 1092 1093 /***************************************************************/ 1094 #ifndef __sparc /* for x86, using integer multiplies is faster */ 1095 1096 mlib_status CONV_FUNC_I(5x5) 1097 { 1098 mlib_s32 buff[BUFF_LINE]; 1099 mlib_s32 *buffd; 1100 mlib_s32 k[KSIZE*KSIZE]; 1101 mlib_s32 shift1, shift2; 1102 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; 1103 mlib_s32 p00, p01, p02, p03, p04, p05, 1104 p10, p11, p12, p13, p14, p15; 1105 DTYPE *adr_src, *sl, *sp0, *sp1, *sp2, *sp3, *sp4; 1106 DTYPE *sp_1, *sp_2, *sp_3, *sp_4; 1107 DTYPE *adr_dst, *dl, *dp; 1108 mlib_s32 *pbuff = buff; 1109 mlib_s32 wid, hgt, sll, dll; 1110 mlib_s32 nchannel, chan1, chan2, chan4; 1111 mlib_s32 delta_chan1, delta_chan2, delta_chan3; 1112 mlib_s32 i, j, c; 1113 1114 #if IMG_TYPE != 1 1115 shift1 = 16; 1116 #else 1117 shift1 = 8; 1118 #endif /* IMG_TYPE != 1 */ 1119 1120 shift2 = scalef_expon - shift1; 1121 1122 for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1; 1123 1124 GET_SRC_DST_PARAMETERS(DTYPE); 1125 1126 if (wid > BUFF_LINE) { 1127 pbuff = mlib_malloc(sizeof(mlib_s32)*wid); 1128 1129 if (pbuff == NULL) return MLIB_FAILURE; 1130 } 1131 1132 buffd = pbuff; 1133 1134 chan1 = nchannel; 1135 chan2 = chan1 + chan1; 1136 1137 if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan1 = chan1; 1138 else delta_chan1 = 0; 1139 1140 if ((2 > dx_l) && (2 < wid + KSIZE1 - dx_r)) delta_chan2 = delta_chan1 + chan1; 1141 else delta_chan2 = delta_chan1; 1142 1143 if ((3 > dx_l) && (3 < wid + KSIZE1 - dx_r)) delta_chan3 = delta_chan2 + chan1; 1144 else delta_chan3 = delta_chan2; 1145 1146 chan4 = chan1 + delta_chan3; 1147 1148 for (c = 0; c < chan1; c++) { 1149 if (!(cmask & (1 << (chan1 - 1 - c)))) continue; 1150 1151 sl = adr_src + c; 1152 dl = adr_dst + c; 1153 1154 sp_1 = sl; 1155 1156 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll; 1157 sp_2 = sl; 1158 1159 if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl += sll; 1160 sp_3 = sl; 1161 1162 if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl += sll; 1163 sp_4 = sl; 1164 1165 if ((hgt - dy_b) > 0) sl += sll; 1166 1167 for (j = 0; j < hgt; j++) { 1168 mlib_s32 pix0, pix1; 1169 1170 dp = dl; 1171 sp0 = sp_1; 1172 sp_1 = sp_2; 1173 sp_2 = sp_3; 1174 sp_3 = sp_4; 1175 sp_4 = sl; 1176 1177 sp1 = sp_1; 1178 sp2 = sp_2; 1179 sp3 = sp_3; 1180 sp4 = sp_4; 1181 1182 /* 1183 * First loop 1184 */ 1185 1186 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4]; 1187 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9]; 1188 1189 p02 = sp0[0]; p12 = sp1[0]; 1190 p03 = sp0[delta_chan1]; p13 = sp1[delta_chan1]; 1191 p04 = sp0[delta_chan2]; p14 = sp1[delta_chan2]; 1192 p05 = sp0[delta_chan3]; p15 = sp1[delta_chan3]; 1193 1194 sp0 += chan4; 1195 sp1 += chan4; 1196 1197 #ifdef __SUNPRO_C 1198 #pragma pipeloop(0) 1199 #endif /* __SUNPRO_C */ 1200 for (i = 0; i <= (wid - dx_r - 2); i += 2) { 1201 p00 = p02; p10 = p12; 1202 p01 = p03; p11 = p13; 1203 p02 = p04; p12 = p14; 1204 p03 = p05; p13 = p15; 1205 1206 p04 = sp0[0]; p14 = sp1[0]; 1207 p05 = sp0[chan1]; p15 = sp1[chan1]; 1208 1209 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + 1210 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9); 1211 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + 1212 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9); 1213 1214 sp0 += chan2; 1215 sp1 += chan2; 1216 } 1217 1218 p01 = p02; p02 = p03; p03 = p04; p04 = p05; 1219 p11 = p12; p12 = p13; p13 = p14; p14 = p15; 1220 1221 for (; i < wid - dx_r; i++) { 1222 p00 = p01; p10 = p11; 1223 p01 = p02; p11 = p12; 1224 p02 = p03; p12 = p13; 1225 p03 = p04; p13 = p14; 1226 1227 p04 = sp0[0]; p14 = sp1[0]; 1228 1229 buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + 1230 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9); 1231 1232 sp0 += chan1; 1233 sp1 += chan1; 1234 } 1235 1236 sp0 -= chan1; 1237 sp1 -= chan1; 1238 1239 for (; i < wid; i++) { 1240 p00 = p01; p10 = p11; 1241 p01 = p02; p11 = p12; 1242 p02 = p03; p12 = p13; 1243 p03 = p04; p13 = p14; 1244 1245 p04 = sp0[0]; p14 = sp1[0]; 1246 1247 buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + 1248 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9); 1249 } 1250 1251 /* 1252 * Second loop 1253 */ 1254 1255 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14]; 1256 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19]; 1257 1258 p02 = sp2[0]; p12 = sp3[0]; 1259 p03 = sp2[delta_chan1]; p13 = sp3[delta_chan1]; 1260 p04 = sp2[delta_chan2]; p14 = sp3[delta_chan2]; 1261 p05 = sp2[delta_chan3]; p15 = sp3[delta_chan3]; 1262 1263 sp2 += chan4; 1264 sp3 += chan4; 1265 1266 #ifdef __SUNPRO_C 1267 #pragma pipeloop(0) 1268 #endif /* __SUNPRO_C */ 1269 for (i = 0; i <= (wid - dx_r - 2); i += 2) { 1270 p00 = p02; p10 = p12; 1271 p01 = p03; p11 = p13; 1272 p02 = p04; p12 = p14; 1273 p03 = p05; p13 = p15; 1274 1275 p04 = sp2[0]; p14 = sp3[0]; 1276 p05 = sp2[chan1]; p15 = sp3[chan1]; 1277 1278 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + 1279 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9); 1280 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + 1281 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9); 1282 1283 sp2 += chan2; 1284 sp3 += chan2; 1285 } 1286 1287 p01 = p02; p02 = p03; p03 = p04; p04 = p05; 1288 p11 = p12; p12 = p13; p13 = p14; p14 = p15; 1289 1290 for (; i < wid - dx_r; i++) { 1291 p00 = p01; p10 = p11; 1292 p01 = p02; p11 = p12; 1293 p02 = p03; p12 = p13; 1294 p03 = p04; p13 = p14; 1295 1296 p04 = sp2[0]; p14 = sp3[0]; 1297 1298 buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + 1299 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9); 1300 1301 sp2 += chan1; 1302 sp3 += chan1; 1303 } 1304 1305 sp2 -= chan1; 1306 sp3 -= chan1; 1307 1308 for (; i < wid; i++) { 1309 p00 = p01; p10 = p11; 1310 p01 = p02; p11 = p12; 1311 p02 = p03; p12 = p13; 1312 p03 = p04; p13 = p14; 1313 1314 p04 = sp2[0]; p14 = sp3[0]; 1315 1316 buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + 1317 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9); 1318 } 1319 1320 /* 1321 * 3 loop 1322 */ 1323 1324 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24]; 1325 1326 p02 = sp4[0]; 1327 p03 = sp4[delta_chan1]; 1328 p04 = sp4[delta_chan2]; 1329 p05 = sp4[delta_chan3]; 1330 1331 sp4 += chan4; 1332 1333 #ifdef __SUNPRO_C 1334 #pragma pipeloop(0) 1335 #endif /* __SUNPRO_C */ 1336 for (i = 0; i <= (wid - dx_r - 2); i += 2) { 1337 p00 = p02; p01 = p03; p02 = p04; p03 = p05; 1338 1339 p04 = sp4[0]; p05 = sp4[chan1]; 1340 1341 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 + 1342 p03 * k3 + p04 * k4) >> shift2; 1343 pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 + 1344 p04 * k3 + p05 * k4) >> shift2; 1345 1346 CLAMP_STORE(dp[0], pix0); 1347 CLAMP_STORE(dp[chan1], pix1); 1348 1349 dp += chan2; 1350 sp4 += chan2; 1351 } 1352 1353 p01 = p02; p02 = p03; p03 = p04; p04 = p05; 1354 1355 for (; i < wid - dx_r; i++) { 1356 p00 = p01; p01 = p02; p02 = p03; p03 = p04; 1357 1358 p04 = sp4[0]; 1359 1360 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 + 1361 p03 * k3 + p04 * k4) >> shift2; 1362 CLAMP_STORE(dp[0], pix0); 1363 1364 dp += chan1; 1365 sp4 += chan1; 1366 } 1367 1368 sp4 -= chan1; 1369 1370 for (; i < wid; i++) { 1371 p00 = p01; p01 = p02; p02 = p03; p03 = p04; 1372 1373 p04 = sp4[0]; 1374 1375 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 + 1376 p03 * k3 + p04 * k4) >> shift2; 1377 CLAMP_STORE(dp[0], pix0); 1378 1379 dp += chan1; 1380 } 1381 1382 /* next line */ 1383 1384 if (j < hgt - dy_b - 1) sl += sll; 1385 dl += dll; 1386 } 1387 } 1388 1389 if (pbuff != buff) mlib_free(pbuff); 1390 1391 return MLIB_SUCCESS; 1392 } 1393 1394 #endif /* __sparc ( for x86, using integer multiplies is faster ) */ 1395 1396 /***************************************************************/ 1397 #if IMG_TYPE == 1 1398 1399 #undef KSIZE 1400 #define KSIZE 7 1401 1402 mlib_status CONV_FUNC(7x7) 1403 { 1404 FTYPE buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd; 1405 FTYPE k[KSIZE*KSIZE]; 1406 mlib_s32 l, m, buff_ind; 1407 mlib_s32 d0, d1; 1408 FTYPE k0, k1, k2, k3, k4, k5, k6; 1409 FTYPE p0, p1, p2, p3, p4, p5, p6, p7; 1410 DTYPE *sl2, *sl3, *sl4, *sl5, *sl6; 1411 DEF_VARS(DTYPE); 1412 LOAD_KERNEL(KSIZE*KSIZE); 1413 GET_SRC_DST_PARAMETERS(DTYPE); 1414 1415 swid = wid + KSIZE1; 1416 1417 if (wid > BUFF_LINE) { 1418 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE )*wid); 1419 1420 if (pbuff == NULL) return MLIB_FAILURE; 1421 } 1422 1423 for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*swid; 1424 for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l]; 1425 buffd = buffs[KSIZE] + swid; 1426 buffo = (mlib_s32*)(buffd + swid); 1427 buffi = buffo + (swid &~ 1); 1428 1429 swid -= (dx_l + dx_r); 1430 1431 chan1 = nchannel; 1432 chan2 = chan1 + chan1; 1433 1434 for (c = 0; c < nchannel; c++) { 1435 if (!(cmask & (1 << (nchannel - 1 - c)))) continue; 1436 1437 sl = adr_src + c; 1438 dl = adr_dst + c; 1439 1440 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll; 1441 else sl1 = sl; 1442 1443 if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll; 1444 else sl2 = sl1; 1445 1446 if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll; 1447 else sl3 = sl2; 1448 1449 if ((4 > dy_t) && (4 < hgt + KSIZE1 - dy_b)) sl4 = sl3 + sll; 1450 else sl4 = sl3; 1451 1452 if ((5 > dy_t) && (5 < hgt + KSIZE1 - dy_b)) sl5 = sl4 + sll; 1453 else sl5 = sl4; 1454 1455 if ((hgt - dy_b) > 0) sl6 = sl5 + sll; 1456 else sl6 = sl5; 1457 1458 for (i = 0; i < dx_l; i++) { 1459 buffs[0][i] = (FTYPE)sl[0]; 1460 buffs[1][i] = (FTYPE)sl1[0]; 1461 buffs[2][i] = (FTYPE)sl2[0]; 1462 buffs[3][i] = (FTYPE)sl3[0]; 1463 buffs[4][i] = (FTYPE)sl4[0]; 1464 buffs[5][i] = (FTYPE)sl5[0]; 1465 buffs[6][i] = (FTYPE)sl6[0]; 1466 } 1467 1468 #ifdef __SUNPRO_C 1469 #pragma pipeloop(0) 1470 #endif /* __SUNPRO_C */ 1471 for (i = 0; i < swid; i++) { 1472 buffs[0][i + dx_l] = (FTYPE)sl[i*chan1]; 1473 buffs[1][i + dx_l] = (FTYPE)sl1[i*chan1]; 1474 buffs[2][i + dx_l] = (FTYPE)sl2[i*chan1]; 1475 buffs[3][i + dx_l] = (FTYPE)sl3[i*chan1]; 1476 buffs[4][i + dx_l] = (FTYPE)sl4[i*chan1]; 1477 buffs[5][i + dx_l] = (FTYPE)sl5[i*chan1]; 1478 buffs[6][i + dx_l] = (FTYPE)sl6[i*chan1]; 1479 } 1480 1481 for (i = 0; i < dx_r; i++) { 1482 buffs[0][swid + dx_l + i] = buffs[0][swid + dx_l - 1]; 1483 buffs[1][swid + dx_l + i] = buffs[1][swid + dx_l - 1]; 1484 buffs[2][swid + dx_l + i] = buffs[2][swid + dx_l - 1]; 1485 buffs[3][swid + dx_l + i] = buffs[3][swid + dx_l - 1]; 1486 buffs[4][swid + dx_l + i] = buffs[4][swid + dx_l - 1]; 1487 buffs[5][swid + dx_l + i] = buffs[5][swid + dx_l - 1]; 1488 buffs[6][swid + dx_l + i] = buffs[6][swid + dx_l - 1]; 1489 } 1490 1491 buff_ind = 0; 1492 1493 #ifdef __SUNPRO_C 1494 #pragma pipeloop(0) 1495 #endif /* __SUNPRO_C */ 1496 for (i = 0; i < wid; i++) buffd[i] = 0.0; 1497 1498 if ((hgt - dy_b) > 1) sl = sl6 + sll; 1499 else sl = sl6; 1500 1501 for (j = 0; j < hgt; j++) { 1502 FTYPE **buffc = buffs + buff_ind; 1503 FTYPE *buffn = buffc[KSIZE]; 1504 FTYPE *pk = k; 1505 1506 for (l = 0; l < KSIZE; l++) { 1507 FTYPE *buff = buffc[l]; 1508 d64_2x32 dd; 1509 1510 sp = sl; 1511 dp = dl; 1512 1513 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 1514 p5 = buff[3]; p6 = buff[4]; p7 = buff[5]; 1515 1516 k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++; 1517 k4 = *pk++; k5 = *pk++; k6 = *pk++; 1518 1519 if (l < (KSIZE - 1)) { 1520 #ifdef __SUNPRO_C 1521 #pragma pipeloop(0) 1522 #endif /* __SUNPRO_C */ 1523 for (i = 0; i <= (wid - 2); i += 2) { 1524 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 1525 1526 p6 = buff[i + 6]; p7 = buff[i + 7]; 1527 1528 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; 1529 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; 1530 } 1531 1532 } else { 1533 #ifdef __SUNPRO_C 1534 #pragma pipeloop(0) 1535 #endif /* __SUNPRO_C */ 1536 for (i = 0; i <= (wid - 2); i += 2) { 1537 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 1538 1539 p6 = buff[i + 6]; p7 = buff[i + 7]; 1540 1541 LOAD_BUFF(buffi); 1542 1543 dd.d64 = *(FTYPE *)(buffi + i); 1544 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0; 1545 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1; 1546 1547 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]); 1548 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]); 1549 1550 dp[0 ] = FROM_S32(d0); 1551 dp[chan1] = FROM_S32(d1); 1552 1553 buffd[i ] = 0.0; 1554 buffd[i + 1] = 0.0; 1555 1556 sp += chan2; 1557 dp += chan2; 1558 } 1559 } 1560 } 1561 1562 /* last pixels */ 1563 for (; i < wid; i++) { 1564 FTYPE *pk = k, s = 0; 1565 mlib_s32 d0; 1566 1567 for (l = 0; l < KSIZE; l++) { 1568 FTYPE *buff = buffc[l] + i; 1569 1570 for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++); 1571 } 1572 1573 d0 = D2I(s); 1574 dp[0] = FROM_S32(d0); 1575 1576 buffn[i + dx_l] = (FTYPE)sp[0]; 1577 1578 sp += chan1; 1579 dp += chan1; 1580 } 1581 1582 for (; i < swid; i++) { 1583 buffn[i + dx_l] = (FTYPE)sp[0]; 1584 sp += chan1; 1585 } 1586 1587 for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l]; 1588 for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1]; 1589 1590 /* next line */ 1591 1592 if (j < hgt - dy_b - 2) sl += sll; 1593 dl += dll; 1594 1595 buff_ind++; 1596 1597 if (buff_ind >= KSIZE + 1) buff_ind = 0; 1598 } 1599 } 1600 1601 if (pbuff != buff) mlib_free(pbuff); 1602 1603 return MLIB_SUCCESS; 1604 } 1605 1606 #endif /* IMG_TYPE == 1 */ 1607 1608 /***************************************************************/ 1609 #define MAX_KER 7 1610 #define MAX_N 15 1611 #define BUFF_SIZE 1600 1612 #define CACHE_SIZE (64*1024) 1613 1614 static mlib_status mlib_ImageConv1xN_ext(mlib_image *dst, 1615 const mlib_image *src, 1616 const mlib_d64 *k, 1617 mlib_s32 n, 1618 mlib_s32 dy_t, 1619 mlib_s32 dy_b, 1620 mlib_s32 cmask) 1621 { 1622 DTYPE *adr_src, *sl; 1623 DTYPE *adr_dst, *dl, *dp; 1624 FTYPE buff[BUFF_SIZE]; 1625 FTYPE *buffd; 1626 FTYPE *pbuff = buff; 1627 const FTYPE *pk; 1628 FTYPE k0, k1, k2, k3; 1629 FTYPE p0, p1, p2, p3, p4; 1630 FTYPE *sbuff; 1631 mlib_s32 l, k_off, off, bsize; 1632 mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh; 1633 mlib_s32 d0, d1, ii; 1634 mlib_s32 wid, hgt, sll, dll; 1635 mlib_s32 nchannel; 1636 mlib_s32 i, j, c; 1637 GET_SRC_DST_PARAMETERS(DTYPE); 1638 1639 max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1); 1640 1641 if (max_hsize < 1) max_hsize = 1; 1642 if (max_hsize > hgt) max_hsize = hgt; 1643 1644 shgt = hgt + (n - 1); 1645 smax_hsize = max_hsize + (n - 1); 1646 1647 bsize = 2 * (smax_hsize + 1); 1648 1649 if (bsize > BUFF_SIZE) { 1650 pbuff = mlib_malloc(sizeof(FTYPE)*bsize); 1651 1652 if (pbuff == NULL) return MLIB_FAILURE; 1653 } 1654 1655 sbuff = pbuff; 1656 buffd = sbuff + smax_hsize; 1657 1658 shgt -= (dy_t + dy_b); 1659 k_off = 0; 1660 1661 for (l = 0; l < hgt; l += hsize) { 1662 hsize = hgt - l; 1663 1664 if (hsize > max_hsize) hsize = max_hsize; 1665 1666 smax_hsize = hsize + (n - 1); 1667 1668 for (c = 0; c < nchannel; c++) { 1669 if (!(cmask & (1 << (nchannel - 1 - c)))) continue; 1670 1671 sl = adr_src + c; 1672 dl = adr_dst + c; 1673 1674 #ifdef __SUNPRO_C 1675 #pragma pipeloop(0) 1676 #endif /* __SUNPRO_C */ 1677 for (i = 0; i < hsize; i++) buffd[i] = 0.0; 1678 1679 for (j = 0; j < wid; j++) { 1680 FTYPE *buff = sbuff; 1681 1682 for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) { 1683 sbuff[i - k_off] = (FTYPE)sl[0]; 1684 } 1685 1686 #ifdef __SUNPRO_C 1687 #pragma pipeloop(0) 1688 #endif /* __SUNPRO_C */ 1689 for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) { 1690 sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll]; 1691 } 1692 1693 for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) { 1694 sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll]; 1695 } 1696 1697 pk = k; 1698 1699 for (off = 0; off < (n - 4); off += 4) { 1700 1701 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 1702 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 1703 1704 #ifdef __SUNPRO_C 1705 #pragma pipeloop(0) 1706 #endif /* __SUNPRO_C */ 1707 for (i = 0; i < hsize; i += 2) { 1708 p0 = p2; p1 = p3; p2 = p4; 1709 1710 p3 = buff[i + 3]; p4 = buff[i + 4]; 1711 1712 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; 1713 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; 1714 } 1715 1716 pk += 4; 1717 buff += 4; 1718 } 1719 1720 dp = dl; 1721 kh = n - off; 1722 1723 if (kh == 4) { 1724 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 1725 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 1726 1727 #ifdef __SUNPRO_C 1728 #pragma pipeloop(0) 1729 #endif /* __SUNPRO_C */ 1730 for (i = 0; i <= (hsize - 2); i += 2) { 1731 p0 = p2; p1 = p3; p2 = p4; 1732 1733 p3 = buff[i + 3]; p4 = buff[i + 4]; 1734 1735 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]); 1736 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]); 1737 1738 dp[0 ] = FROM_S32(d0); 1739 dp[dll] = FROM_S32(d1); 1740 1741 buffd[i ] = 0.0; 1742 buffd[i + 1] = 0.0; 1743 1744 dp += 2*dll; 1745 } 1746 1747 if (i < hsize) { 1748 p0 = p2; p1 = p3; p2 = p4; 1749 p3 = buff[i + 3]; 1750 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]); 1751 dp[0] = FROM_S32(d0); 1752 buffd[i] = 0.0; 1753 } 1754 1755 } else if (kh == 3) { 1756 1757 p2 = buff[0]; p3 = buff[1]; 1758 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; 1759 1760 #ifdef __SUNPRO_C 1761 #pragma pipeloop(0) 1762 #endif /* __SUNPRO_C */ 1763 for (i = 0; i <= (hsize - 2); i += 2) { 1764 p0 = p2; p1 = p3; 1765 1766 p2 = buff[i + 2]; p3 = buff[i + 3]; 1767 1768 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]); 1769 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]); 1770 1771 dp[0 ] = FROM_S32(d0); 1772 dp[dll] = FROM_S32(d1); 1773 1774 buffd[i ] = 0.0; 1775 buffd[i + 1] = 0.0; 1776 1777 dp += 2*dll; 1778 } 1779 1780 if (i < hsize) { 1781 p0 = p2; p1 = p3; 1782 p2 = buff[i + 2]; 1783 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]); 1784 dp[0] = FROM_S32(d0); 1785 1786 buffd[i] = 0.0; 1787 } 1788 1789 } else if (kh == 2) { 1790 1791 p2 = buff[0]; 1792 k0 = pk[0]; k1 = pk[1]; 1793 1794 #ifdef __SUNPRO_C 1795 #pragma pipeloop(0) 1796 #endif /* __SUNPRO_C */ 1797 for (i = 0; i <= (hsize - 2); i += 2) { 1798 p0 = p2; 1799 1800 p1 = buff[i + 1]; p2 = buff[i + 2]; 1801 1802 d0 = D2I(p0*k0 + p1*k1 + buffd[i ]); 1803 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]); 1804 1805 dp[0 ] = FROM_S32(d0); 1806 dp[dll] = FROM_S32(d1); 1807 1808 buffd[i ] = 0.0; 1809 buffd[i + 1] = 0.0; 1810 1811 dp += 2*dll; 1812 } 1813 1814 if (i < hsize) { 1815 p0 = p2; 1816 p1 = buff[i + 1]; 1817 d0 = D2I(p0*k0 + p1*k1 + buffd[i]); 1818 dp[0] = FROM_S32(d0); 1819 1820 buffd[i] = 0.0; 1821 } 1822 1823 } else /* kh == 1 */{ 1824 1825 k0 = pk[0]; 1826 1827 #ifdef __SUNPRO_C 1828 #pragma pipeloop(0) 1829 #endif /* __SUNPRO_C */ 1830 for (i = 0; i <= (hsize - 2); i += 2) { 1831 p0 = buff[i]; p1 = buff[i + 1]; 1832 1833 d0 = D2I(p0*k0 + buffd[i ]); 1834 d1 = D2I(p1*k0 + buffd[i + 1]); 1835 1836 dp[0 ] = FROM_S32(d0); 1837 dp[dll] = FROM_S32(d1); 1838 1839 buffd[i ] = 0.0; 1840 buffd[i + 1] = 0.0; 1841 1842 dp += 2*dll; 1843 } 1844 1845 if (i < hsize) { 1846 p0 = buff[i]; 1847 d0 = D2I(p0*k0 + buffd[i]); 1848 dp[0] = FROM_S32(d0); 1849 1850 buffd[i] = 0.0; 1851 } 1852 } 1853 1854 /* next line */ 1855 sl += nchannel; 1856 dl += nchannel; 1857 } 1858 } 1859 1860 k_off += max_hsize; 1861 adr_dst += max_hsize*dll; 1862 } 1863 1864 if (pbuff != buff) mlib_free(pbuff); 1865 1866 return MLIB_SUCCESS; 1867 } 1868 1869 /***************************************************************/ 1870 mlib_status CONV_FUNC_MxN 1871 { 1872 DTYPE *adr_src, *sl, *sp = NULL; 1873 DTYPE *adr_dst, *dl, *dp = NULL; 1874 FTYPE buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)]; 1875 FTYPE **buffs = buffs_arr, *buffd; 1876 FTYPE akernel[256], *k = akernel, fscale = DSCALE; 1877 FTYPE *pbuff = buff; 1878 FTYPE k0, k1, k2, k3, k4, k5, k6; 1879 FTYPE p0, p1, p2, p3, p4, p5, p6, p7; 1880 mlib_s32 *buffi; 1881 mlib_s32 mn, l, off, kw, bsize, buff_ind; 1882 mlib_s32 d0, d1; 1883 mlib_s32 wid, hgt, sll, dll; 1884 mlib_s32 nchannel, chan1, chan2; 1885 mlib_s32 i, j, c, swid; 1886 d64_2x32 dd; 1887 mlib_status status = MLIB_SUCCESS; 1888 1889 GET_SRC_DST_PARAMETERS(DTYPE); 1890 1891 if (scale > 30) { 1892 fscale *= 1.0/(1 << 30); 1893 scale -= 30; 1894 } 1895 1896 fscale /= (1 << scale); 1897 1898 mn = m*n; 1899 1900 if (mn > 256) { 1901 k = mlib_malloc(mn*sizeof(mlib_d64)); 1902 1903 if (k == NULL) return MLIB_FAILURE; 1904 } 1905 1906 for (i = 0; i < mn; i++) { 1907 k[i] = kernel[i]*fscale; 1908 } 1909 1910 if (m == 1) { 1911 status = mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask); 1912 FREE_AND_RETURN_STATUS; 1913 } 1914 1915 swid = wid + (m - 1); 1916 1917 bsize = (n + 3)*swid; 1918 1919 if ((bsize > BUFF_SIZE) || (n > MAX_N)) { 1920 pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1)); 1921 1922 if (pbuff == NULL) { 1923 status = MLIB_FAILURE; 1924 FREE_AND_RETURN_STATUS; 1925 } 1926 buffs = (FTYPE **)(pbuff + bsize); 1927 } 1928 1929 for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid; 1930 for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l]; 1931 buffd = buffs[n] + swid; 1932 buffi = (mlib_s32*)(buffd + swid); 1933 1934 chan1 = nchannel; 1935 chan2 = chan1 + chan1; 1936 1937 swid -= (dx_l + dx_r); 1938 1939 for (c = 0; c < nchannel; c++) { 1940 if (!(cmask & (1 << (chan1 - 1 - c)))) continue; 1941 1942 sl = adr_src + c; 1943 dl = adr_dst + c; 1944 1945 for (l = 0; l < n; l++) { 1946 FTYPE *buff = buffs[l]; 1947 1948 for (i = 0; i < dx_l; i++) { 1949 buff[i] = (FTYPE)sl[0]; 1950 } 1951 1952 #ifdef __SUNPRO_C 1953 #pragma pipeloop(0) 1954 #endif /* __SUNPRO_C */ 1955 for (i = 0; i < swid; i++) { 1956 buff[i + dx_l] = (FTYPE)sl[i*chan1]; 1957 } 1958 1959 for (i = 0; i < dx_r; i++) { 1960 buff[swid + dx_l + i] = buff[swid + dx_l - 1]; 1961 } 1962 1963 if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll; 1964 } 1965 1966 buff_ind = 0; 1967 1968 #ifdef __SUNPRO_C 1969 #pragma pipeloop(0) 1970 #endif /* __SUNPRO_C */ 1971 for (i = 0; i < wid; i++) buffd[i] = 0.0; 1972 1973 for (j = 0; j < hgt; j++) { 1974 FTYPE **buffc = buffs + buff_ind; 1975 FTYPE *buffn = buffc[n]; 1976 FTYPE *pk = k; 1977 1978 for (l = 0; l < n; l++) { 1979 FTYPE *buff_l = buffc[l]; 1980 1981 for (off = 0; off < m;) { 1982 FTYPE *buff = buff_l + off; 1983 1984 kw = m - off; 1985 1986 if (kw > 2*MAX_KER) kw = MAX_KER; else 1987 if (kw > MAX_KER) kw = kw/2; 1988 off += kw; 1989 1990 sp = sl; 1991 dp = dl; 1992 1993 if (kw == 7) { 1994 1995 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 1996 p5 = buff[3]; p6 = buff[4]; p7 = buff[5]; 1997 1998 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 1999 k4 = pk[4]; k5 = pk[5]; k6 = pk[6]; 2000 2001 if (l < (n - 1) || off < m) { 2002 #ifdef __SUNPRO_C 2003 #pragma pipeloop(0) 2004 #endif /* __SUNPRO_C */ 2005 for (i = 0; i <= (wid - 2); i += 2) { 2006 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 2007 2008 p6 = buff[i + 6]; p7 = buff[i + 7]; 2009 2010 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; 2011 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; 2012 } 2013 2014 } else { 2015 #ifdef __SUNPRO_C 2016 #pragma pipeloop(0) 2017 #endif /* __SUNPRO_C */ 2018 for (i = 0; i <= (wid - 2); i += 2) { 2019 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 2020 2021 p6 = buff[i + 6]; p7 = buff[i + 7]; 2022 2023 LOAD_BUFF(buffi); 2024 2025 dd.d64 = *(FTYPE *)(buffi + i); 2026 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0; 2027 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1; 2028 2029 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]); 2030 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]); 2031 2032 dp[0 ] = FROM_S32(d0); 2033 dp[chan1] = FROM_S32(d1); 2034 2035 buffd[i ] = 0.0; 2036 buffd[i + 1] = 0.0; 2037 2038 sp += chan2; 2039 dp += chan2; 2040 } 2041 } 2042 2043 } else if (kw == 6) { 2044 2045 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 2046 p5 = buff[3]; p6 = buff[4]; 2047 2048 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 2049 k4 = pk[4]; k5 = pk[5]; 2050 2051 if (l < (n - 1) || off < m) { 2052 #ifdef __SUNPRO_C 2053 #pragma pipeloop(0) 2054 #endif /* __SUNPRO_C */ 2055 for (i = 0; i <= (wid - 2); i += 2) { 2056 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 2057 2058 p5 = buff[i + 5]; p6 = buff[i + 6]; 2059 2060 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; 2061 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; 2062 } 2063 2064 } else { 2065 #ifdef __SUNPRO_C 2066 #pragma pipeloop(0) 2067 #endif /* __SUNPRO_C */ 2068 for (i = 0; i <= (wid - 2); i += 2) { 2069 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 2070 2071 p5 = buff[i + 5]; p6 = buff[i + 6]; 2072 2073 LOAD_BUFF(buffi); 2074 2075 dd.d64 = *(FTYPE *)(buffi + i); 2076 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0; 2077 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1; 2078 2079 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]); 2080 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]); 2081 2082 dp[0 ] = FROM_S32(d0); 2083 dp[chan1] = FROM_S32(d1); 2084 2085 buffd[i ] = 0.0; 2086 buffd[i + 1] = 0.0; 2087 2088 sp += chan2; 2089 dp += chan2; 2090 } 2091 } 2092 2093 } else if (kw == 5) { 2094 2095 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 2096 p5 = buff[3]; 2097 2098 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 2099 k4 = pk[4]; 2100 2101 if (l < (n - 1) || off < m) { 2102 #ifdef __SUNPRO_C 2103 #pragma pipeloop(0) 2104 #endif /* __SUNPRO_C */ 2105 for (i = 0; i <= (wid - 2); i += 2) { 2106 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 2107 2108 p4 = buff[i + 4]; p5 = buff[i + 5]; 2109 2110 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; 2111 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; 2112 } 2113 2114 } else { 2115 #ifdef __SUNPRO_C 2116 #pragma pipeloop(0) 2117 #endif /* __SUNPRO_C */ 2118 for (i = 0; i <= (wid - 2); i += 2) { 2119 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 2120 2121 p4 = buff[i + 4]; p5 = buff[i + 5]; 2122 2123 LOAD_BUFF(buffi); 2124 2125 dd.d64 = *(FTYPE *)(buffi + i); 2126 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0; 2127 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1; 2128 2129 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]); 2130 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]); 2131 2132 dp[0 ] = FROM_S32(d0); 2133 dp[chan1] = FROM_S32(d1); 2134 2135 buffd[i ] = 0.0; 2136 buffd[i + 1] = 0.0; 2137 2138 sp += chan2; 2139 dp += chan2; 2140 } 2141 } 2142 2143 } else if (kw == 4) { 2144 2145 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 2146 2147 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 2148 2149 if (l < (n - 1) || off < m) { 2150 #ifdef __SUNPRO_C 2151 #pragma pipeloop(0) 2152 #endif /* __SUNPRO_C */ 2153 for (i = 0; i <= (wid - 2); i += 2) { 2154 p0 = p2; p1 = p3; p2 = p4; 2155 2156 p3 = buff[i + 3]; p4 = buff[i + 4]; 2157 2158 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; 2159 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; 2160 } 2161 2162 } else { 2163 #ifdef __SUNPRO_C 2164 #pragma pipeloop(0) 2165 #endif /* __SUNPRO_C */ 2166 for (i = 0; i <= (wid - 2); i += 2) { 2167 p0 = p2; p1 = p3; p2 = p4; 2168 2169 p3 = buff[i + 3]; p4 = buff[i + 4]; 2170 2171 LOAD_BUFF(buffi); 2172 2173 dd.d64 = *(FTYPE *)(buffi + i); 2174 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0; 2175 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1; 2176 2177 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]); 2178 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]); 2179 2180 dp[0 ] = FROM_S32(d0); 2181 dp[chan1] = FROM_S32(d1); 2182 2183 buffd[i ] = 0.0; 2184 buffd[i + 1] = 0.0; 2185 2186 sp += chan2; 2187 dp += chan2; 2188 } 2189 } 2190 2191 } else if (kw == 3) { 2192 2193 p2 = buff[0]; p3 = buff[1]; 2194 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; 2195 2196 if (l < (n - 1) || off < m) { 2197 #ifdef __SUNPRO_C 2198 #pragma pipeloop(0) 2199 #endif /* __SUNPRO_C */ 2200 for (i = 0; i <= (wid - 2); i += 2) { 2201 p0 = p2; p1 = p3; 2202 2203 p2 = buff[i + 2]; p3 = buff[i + 3]; 2204 2205 buffd[i ] += p0*k0 + p1*k1 + p2*k2; 2206 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2; 2207 } 2208 2209 } else { 2210 #ifdef __SUNPRO_C 2211 #pragma pipeloop(0) 2212 #endif /* __SUNPRO_C */ 2213 for (i = 0; i <= (wid - 2); i += 2) { 2214 p0 = p2; p1 = p3; 2215 2216 p2 = buff[i + 2]; p3 = buff[i + 3]; 2217 2218 LOAD_BUFF(buffi); 2219 2220 dd.d64 = *(FTYPE *)(buffi + i); 2221 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0; 2222 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1; 2223 2224 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]); 2225 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]); 2226 2227 dp[0 ] = FROM_S32(d0); 2228 dp[chan1] = FROM_S32(d1); 2229 2230 buffd[i ] = 0.0; 2231 buffd[i + 1] = 0.0; 2232 2233 sp += chan2; 2234 dp += chan2; 2235 } 2236 } 2237 2238 } else /* if (kw == 2) */ { 2239 2240 p2 = buff[0]; 2241 k0 = pk[0]; k1 = pk[1]; 2242 2243 if (l < (n - 1) || off < m) { 2244 #ifdef __SUNPRO_C 2245 #pragma pipeloop(0) 2246 #endif /* __SUNPRO_C */ 2247 for (i = 0; i <= (wid - 2); i += 2) { 2248 p0 = p2; 2249 2250 p1 = buff[i + 1]; p2 = buff[i + 2]; 2251 2252 buffd[i ] += p0*k0 + p1*k1; 2253 buffd[i + 1] += p1*k0 + p2*k1; 2254 } 2255 2256 } else { 2257 #ifdef __SUNPRO_C 2258 #pragma pipeloop(0) 2259 #endif /* __SUNPRO_C */ 2260 for (i = 0; i <= (wid - 2); i += 2) { 2261 p0 = p2; 2262 2263 p1 = buff[i + 1]; p2 = buff[i + 2]; 2264 2265 LOAD_BUFF(buffi); 2266 2267 dd.d64 = *(FTYPE *)(buffi + i); 2268 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0; 2269 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1; 2270 2271 d0 = D2I(p0*k0 + p1*k1 + buffd[i ]); 2272 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]); 2273 2274 dp[0 ] = FROM_S32(d0); 2275 dp[chan1] = FROM_S32(d1); 2276 2277 buffd[i ] = 0.0; 2278 buffd[i + 1] = 0.0; 2279 2280 sp += chan2; 2281 dp += chan2; 2282 } 2283 } 2284 } 2285 2286 pk += kw; 2287 } 2288 } 2289 2290 /* last pixels */ 2291 for (; i < wid; i++) { 2292 FTYPE *pk = k, s = 0; 2293 mlib_s32 x, d0; 2294 2295 for (l = 0; l < n; l++) { 2296 FTYPE *buff = buffc[l] + i; 2297 2298 for (x = 0; x < m; x++) s += buff[x] * (*pk++); 2299 } 2300 2301 d0 = D2I(s); 2302 dp[0] = FROM_S32(d0); 2303 2304 buffn[i + dx_l] = (FTYPE)sp[0]; 2305 2306 sp += chan1; 2307 dp += chan1; 2308 } 2309 2310 for (; i < swid; i++) { 2311 buffn[i + dx_l] = (FTYPE)sp[0]; 2312 sp += chan1; 2313 } 2314 2315 for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l]; 2316 for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1]; 2317 2318 /* next line */ 2319 2320 if (j < hgt - dy_b - 2) sl += sll; 2321 dl += dll; 2322 2323 buff_ind++; 2324 2325 if (buff_ind >= n + 1) buff_ind = 0; 2326 } 2327 } 2328 2329 FREE_AND_RETURN_STATUS; 2330 } 2331 2332 /***************************************************************/ 2333 #ifndef __sparc /* for x86, using integer multiplies is faster */ 2334 2335 #define STORE_RES(res, x) \ 2336 x >>= shift2; \ 2337 CLAMP_STORE(res, x) 2338 2339 mlib_status CONV_FUNC_MxN_I 2340 { 2341 DTYPE *adr_src, *sl, *sp = NULL; 2342 DTYPE *adr_dst, *dl, *dp = NULL; 2343 mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)]; 2344 mlib_s32 *pbuff = buff; 2345 mlib_s32 **buffs = buffs_arr, *buffd; 2346 mlib_s32 l, off, kw, bsize, buff_ind; 2347 mlib_s32 d0, d1, shift1, shift2; 2348 mlib_s32 k0, k1, k2, k3, k4, k5, k6; 2349 mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7; 2350 mlib_s32 wid, hgt, sll, dll; 2351 mlib_s32 nchannel, chan1; 2352 mlib_s32 i, j, c, swid; 2353 mlib_s32 chan2; 2354 mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl; 2355 GET_SRC_DST_PARAMETERS(DTYPE); 2356 2357 #if IMG_TYPE != 1 2358 shift1 = 16; 2359 #else 2360 shift1 = 8; 2361 #endif /* IMG_TYPE != 1 */ 2362 shift2 = scale - shift1; 2363 2364 chan1 = nchannel; 2365 chan2 = chan1 + chan1; 2366 2367 swid = wid + (m - 1); 2368 2369 bsize = (n + 2)*swid; 2370 2371 if ((bsize > BUFF_SIZE) || (n > MAX_N)) { 2372 pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1)); 2373 2374 if (pbuff == NULL) return MLIB_FAILURE; 2375 buffs = (mlib_s32 **)(pbuff + bsize); 2376 } 2377 2378 for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid; 2379 for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l]; 2380 buffd = buffs[n] + swid; 2381 2382 if (m*n > MAX_N*MAX_N) { 2383 k = mlib_malloc(sizeof(mlib_s32)*(m*n)); 2384 2385 if (k == NULL) { 2386 if (pbuff != buff) mlib_free(pbuff); 2387 return MLIB_FAILURE; 2388 } 2389 } 2390 2391 for (i = 0; i < m*n; i++) { 2392 k[i] = kernel[i] >> shift1; 2393 } 2394 2395 swid -= (dx_l + dx_r); 2396 2397 for (c = 0; c < nchannel; c++) { 2398 if (!(cmask & (1 << (nchannel - 1 - c)))) continue; 2399 2400 sl = adr_src + c; 2401 dl = adr_dst + c; 2402 2403 for (l = 0; l < n; l++) { 2404 mlib_s32 *buff = buffs[l]; 2405 2406 for (i = 0; i < dx_l; i++) { 2407 buff[i] = (mlib_s32)sl[0]; 2408 } 2409 2410 #ifdef __SUNPRO_C 2411 #pragma pipeloop(0) 2412 #endif /* __SUNPRO_C */ 2413 for (i = 0; i < swid; i++) { 2414 buff[i + dx_l] = (mlib_s32)sl[i*chan1]; 2415 } 2416 2417 for (i = 0; i < dx_r; i++) { 2418 buff[swid + dx_l + i] = buff[swid + dx_l - 1]; 2419 } 2420 2421 if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll; 2422 } 2423 2424 buff_ind = 0; 2425 2426 #ifdef __SUNPRO_C 2427 #pragma pipeloop(0) 2428 #endif /* __SUNPRO_C */ 2429 for (i = 0; i < wid; i++) buffd[i] = 0; 2430 2431 for (j = 0; j < hgt; j++) { 2432 mlib_s32 **buffc = buffs + buff_ind; 2433 mlib_s32 *buffn = buffc[n]; 2434 mlib_s32 *pk = k; 2435 2436 for (l = 0; l < n; l++) { 2437 mlib_s32 *buff_l = buffc[l]; 2438 2439 for (off = 0; off < m;) { 2440 mlib_s32 *buff = buff_l + off; 2441 2442 sp = sl; 2443 dp = dl; 2444 2445 kw = m - off; 2446 2447 if (kw > 2*MAX_KER) kw = MAX_KER; else 2448 if (kw > MAX_KER) kw = kw/2; 2449 off += kw; 2450 2451 if (kw == 7) { 2452 2453 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 2454 p5 = buff[3]; p6 = buff[4]; p7 = buff[5]; 2455 2456 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 2457 k4 = pk[4]; k5 = pk[5]; k6 = pk[6]; 2458 2459 if (l < (n - 1) || off < m) { 2460 #ifdef __SUNPRO_C 2461 #pragma pipeloop(0) 2462 #endif /* __SUNPRO_C */ 2463 for (i = 0; i <= (wid - 2); i += 2) { 2464 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 2465 2466 p6 = buff[i + 6]; p7 = buff[i + 7]; 2467 2468 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; 2469 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; 2470 } 2471 2472 } else { 2473 #ifdef __SUNPRO_C 2474 #pragma pipeloop(0) 2475 #endif /* __SUNPRO_C */ 2476 for (i = 0; i <= (wid - 2); i += 2) { 2477 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 2478 2479 p6 = buff[i + 6]; p7 = buff[i + 7]; 2480 2481 buffn[i + dx_l ] = (mlib_s32)sp[0]; 2482 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1]; 2483 2484 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]); 2485 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]); 2486 2487 STORE_RES(dp[0 ], d0); 2488 STORE_RES(dp[chan1], d1); 2489 2490 buffd[i ] = 0; 2491 buffd[i + 1] = 0; 2492 2493 sp += chan2; 2494 dp += chan2; 2495 } 2496 } 2497 2498 } else if (kw == 6) { 2499 2500 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 2501 p5 = buff[3]; p6 = buff[4]; 2502 2503 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 2504 k4 = pk[4]; k5 = pk[5]; 2505 2506 if (l < (n - 1) || off < m) { 2507 #ifdef __SUNPRO_C 2508 #pragma pipeloop(0) 2509 #endif /* __SUNPRO_C */ 2510 for (i = 0; i <= (wid - 2); i += 2) { 2511 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 2512 2513 p5 = buff[i + 5]; p6 = buff[i + 6]; 2514 2515 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; 2516 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; 2517 } 2518 2519 } else { 2520 #ifdef __SUNPRO_C 2521 #pragma pipeloop(0) 2522 #endif /* __SUNPRO_C */ 2523 for (i = 0; i <= (wid - 2); i += 2) { 2524 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 2525 2526 p5 = buff[i + 5]; p6 = buff[i + 6]; 2527 2528 buffn[i + dx_l ] = (mlib_s32)sp[0]; 2529 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1]; 2530 2531 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]); 2532 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]); 2533 2534 STORE_RES(dp[0 ], d0); 2535 STORE_RES(dp[chan1], d1); 2536 2537 buffd[i ] = 0; 2538 buffd[i + 1] = 0; 2539 2540 sp += chan2; 2541 dp += chan2; 2542 } 2543 } 2544 2545 } else if (kw == 5) { 2546 2547 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 2548 p5 = buff[3]; 2549 2550 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 2551 k4 = pk[4]; 2552 2553 if (l < (n - 1) || off < m) { 2554 #ifdef __SUNPRO_C 2555 #pragma pipeloop(0) 2556 #endif /* __SUNPRO_C */ 2557 for (i = 0; i <= (wid - 2); i += 2) { 2558 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 2559 2560 p4 = buff[i + 4]; p5 = buff[i + 5]; 2561 2562 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; 2563 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; 2564 } 2565 2566 } else { 2567 #ifdef __SUNPRO_C 2568 #pragma pipeloop(0) 2569 #endif /* __SUNPRO_C */ 2570 for (i = 0; i <= (wid - 2); i += 2) { 2571 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 2572 2573 p4 = buff[i + 4]; p5 = buff[i + 5]; 2574 2575 buffn[i + dx_l ] = (mlib_s32)sp[0]; 2576 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1]; 2577 2578 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]); 2579 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]); 2580 2581 STORE_RES(dp[0 ], d0); 2582 STORE_RES(dp[chan1], d1); 2583 2584 buffd[i ] = 0; 2585 buffd[i + 1] = 0; 2586 2587 sp += chan2; 2588 dp += chan2; 2589 } 2590 } 2591 2592 } else if (kw == 4) { 2593 2594 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 2595 2596 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 2597 2598 if (l < (n - 1) || off < m) { 2599 #ifdef __SUNPRO_C 2600 #pragma pipeloop(0) 2601 #endif /* __SUNPRO_C */ 2602 for (i = 0; i <= (wid - 2); i += 2) { 2603 p0 = p2; p1 = p3; p2 = p4; 2604 2605 p3 = buff[i + 3]; p4 = buff[i + 4]; 2606 2607 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; 2608 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; 2609 } 2610 2611 } else { 2612 #ifdef __SUNPRO_C 2613 #pragma pipeloop(0) 2614 #endif /* __SUNPRO_C */ 2615 for (i = 0; i <= (wid - 2); i += 2) { 2616 p0 = p2; p1 = p3; p2 = p4; 2617 2618 p3 = buff[i + 3]; p4 = buff[i + 4]; 2619 2620 buffn[i + dx_l ] = (mlib_s32)sp[0]; 2621 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1]; 2622 2623 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]); 2624 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]); 2625 2626 STORE_RES(dp[0 ], d0); 2627 STORE_RES(dp[chan1], d1); 2628 2629 buffd[i ] = 0; 2630 buffd[i + 1] = 0; 2631 2632 sp += chan2; 2633 dp += chan2; 2634 } 2635 } 2636 2637 } else if (kw == 3) { 2638 2639 p2 = buff[0]; p3 = buff[1]; 2640 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; 2641 2642 if (l < (n - 1) || off < m) { 2643 #ifdef __SUNPRO_C 2644 #pragma pipeloop(0) 2645 #endif /* __SUNPRO_C */ 2646 for (i = 0; i <= (wid - 2); i += 2) { 2647 p0 = p2; p1 = p3; 2648 2649 p2 = buff[i + 2]; p3 = buff[i + 3]; 2650 2651 buffd[i ] += p0*k0 + p1*k1 + p2*k2; 2652 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2; 2653 } 2654 2655 } else { 2656 #ifdef __SUNPRO_C 2657 #pragma pipeloop(0) 2658 #endif /* __SUNPRO_C */ 2659 for (i = 0; i <= (wid - 2); i += 2) { 2660 p0 = p2; p1 = p3; 2661 2662 p2 = buff[i + 2]; p3 = buff[i + 3]; 2663 2664 buffn[i + dx_l ] = (mlib_s32)sp[0]; 2665 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1]; 2666 2667 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i ]); 2668 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]); 2669 2670 STORE_RES(dp[0 ], d0); 2671 STORE_RES(dp[chan1], d1); 2672 2673 buffd[i ] = 0; 2674 buffd[i + 1] = 0; 2675 2676 sp += chan2; 2677 dp += chan2; 2678 } 2679 } 2680 2681 } else if (kw == 2) { 2682 2683 p2 = buff[0]; 2684 k0 = pk[0]; k1 = pk[1]; 2685 2686 if (l < (n - 1) || off < m) { 2687 #ifdef __SUNPRO_C 2688 #pragma pipeloop(0) 2689 #endif /* __SUNPRO_C */ 2690 for (i = 0; i <= (wid - 2); i += 2) { 2691 p0 = p2; 2692 2693 p1 = buff[i + 1]; p2 = buff[i + 2]; 2694 2695 buffd[i ] += p0*k0 + p1*k1; 2696 buffd[i + 1] += p1*k0 + p2*k1; 2697 } 2698 2699 } else { 2700 #ifdef __SUNPRO_C 2701 #pragma pipeloop(0) 2702 #endif /* __SUNPRO_C */ 2703 for (i = 0; i <= (wid - 2); i += 2) { 2704 p0 = p2; 2705 2706 p1 = buff[i + 1]; p2 = buff[i + 2]; 2707 2708 buffn[i + dx_l ] = (mlib_s32)sp[0]; 2709 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1]; 2710 2711 d0 = (p0*k0 + p1*k1 + buffd[i ]); 2712 d1 = (p1*k0 + p2*k1 + buffd[i + 1]); 2713 2714 STORE_RES(dp[0 ], d0); 2715 STORE_RES(dp[chan1], d1); 2716 2717 buffd[i ] = 0; 2718 buffd[i + 1] = 0; 2719 2720 sp += chan2; 2721 dp += chan2; 2722 } 2723 } 2724 2725 } else /* kw == 1 */{ 2726 2727 k0 = pk[0]; 2728 2729 if (l < (n - 1) || off < m) { 2730 #ifdef __SUNPRO_C 2731 #pragma pipeloop(0) 2732 #endif /* __SUNPRO_C */ 2733 for (i = 0; i <= (wid - 2); i += 2) { 2734 p0 = buff[i]; p1 = buff[i + 1]; 2735 2736 buffd[i ] += p0*k0; 2737 buffd[i + 1] += p1*k0; 2738 } 2739 2740 } else { 2741 #ifdef __SUNPRO_C 2742 #pragma pipeloop(0) 2743 #endif /* __SUNPRO_C */ 2744 for (i = 0; i <= (wid - 2); i += 2) { 2745 p0 = buff[i]; p1 = buff[i + 1]; 2746 2747 buffn[i + dx_l ] = (mlib_s32)sp[0]; 2748 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1]; 2749 2750 d0 = (p0*k0 + buffd[i ]); 2751 d1 = (p1*k0 + buffd[i + 1]); 2752 2753 STORE_RES(dp[0 ], d0); 2754 STORE_RES(dp[chan1], d1); 2755 2756 buffd[i ] = 0; 2757 buffd[i + 1] = 0; 2758 2759 sp += chan2; 2760 dp += chan2; 2761 } 2762 } 2763 } 2764 2765 pk += kw; 2766 } 2767 } 2768 2769 /* last pixels */ 2770 for (; i < wid; i++) { 2771 mlib_s32 *pk = k, x, s = 0; 2772 2773 for (l = 0; l < n; l++) { 2774 mlib_s32 *buff = buffc[l] + i; 2775 2776 for (x = 0; x < m; x++) s += buff[x] * (*pk++); 2777 } 2778 2779 STORE_RES(dp[0], s); 2780 2781 buffn[i + dx_l] = (mlib_s32)sp[0]; 2782 2783 sp += chan1; 2784 dp += chan1; 2785 } 2786 2787 for (; i < swid; i++) { 2788 buffn[i + dx_l] = (mlib_s32)sp[0]; 2789 sp += chan1; 2790 } 2791 2792 for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l]; 2793 for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1]; 2794 2795 /* next line */ 2796 2797 if (j < hgt - dy_b - 2) sl += sll; 2798 dl += dll; 2799 2800 buff_ind++; 2801 2802 if (buff_ind >= n + 1) buff_ind = 0; 2803 } 2804 } 2805 2806 if (pbuff != buff) mlib_free(pbuff); 2807 if (k != k_locl) mlib_free(k); 2808 2809 return MLIB_SUCCESS; 2810 } 2811 2812 #endif /* __sparc ( for x86, using integer multiplies is faster ) */ 2813 2814 /***************************************************************/