1 /* 2 * Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 27 /* 28 * FUNCTION 29 * Internal functions for mlib_ImageConv* on U8/S16/U16 types and 30 * MLIB_EDGE_DST_NO_WRITE mask 31 */ 32 33 #include "mlib_image.h" 34 #include "mlib_c_ImageConv.h" 35 36 /* 37 This define switches between functions of different data types 38 */ 39 #define IMG_TYPE 2 40 41 /***************************************************************/ 42 #if IMG_TYPE == 1 43 44 #define DTYPE mlib_u8 45 #define CONV_FUNC(KERN) mlib_c_conv##KERN##nw_u8 46 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u8 47 #define DSCALE (1 << 24) 48 #define FROM_S32(x) (((x) >> 24) ^ 128) 49 #define S64TOS32(x) (x) 50 #define SAT_OFF -(1u << 31) 51 52 #elif IMG_TYPE == 2 53 54 #define DTYPE mlib_s16 55 #define CONV_FUNC(KERN) mlib_conv##KERN##nw_s16 56 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s16 57 #define DSCALE 65536.0 58 #define FROM_S32(x) ((x) >> 16) 59 #define S64TOS32(x) ((x) & 0xffffffff) 60 #define SAT_OFF 61 62 #elif IMG_TYPE == 3 63 64 #define DTYPE mlib_u16 65 #define CONV_FUNC(KERN) mlib_conv##KERN##nw_u16 66 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u16 67 #define DSCALE 65536.0 68 #define FROM_S32(x) (((x) >> 16) ^ 0x8000) 69 #define S64TOS32(x) (x) 70 #define SAT_OFF -(1u << 31) 71 72 #endif /* IMG_TYPE == 1 */ 73 74 /***************************************************************/ 75 #define BUFF_SIZE 1600 76 77 #define CACHE_SIZE (64*1024) 78 79 /***************************************************************/ 80 #define FTYPE mlib_d64 81 82 #ifndef MLIB_USE_FTOI_CLAMPING 83 84 #define CLAMP_S32(x) \ 85 (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x))) 86 87 #else 88 89 #define CLAMP_S32(x) ((mlib_s32)(x)) 90 91 #endif /* MLIB_USE_FTOI_CLAMPING */ 92 93 /***************************************************************/ 94 #define D2I(x) CLAMP_S32((x) SAT_OFF) 95 96 /***************************************************************/ 97 #ifdef _LITTLE_ENDIAN 98 99 #define STORE2(res0, res1) \ 100 dp[0 ] = res1; \ 101 dp[chan1] = res0 102 103 #else 104 105 #define STORE2(res0, res1) \ 106 dp[0 ] = res0; \ 107 dp[chan1] = res1 108 109 #endif /* _LITTLE_ENDIAN */ 110 111 /***************************************************************/ 112 #ifdef _NO_LONGLONG 113 114 #define LOAD_BUFF(buff) \ 115 buff[i ] = sp[0]; \ 116 buff[i + 1] = sp[chan1] 117 118 #else /* _NO_LONGLONG */ 119 120 #ifdef _LITTLE_ENDIAN 121 122 #define LOAD_BUFF(buff) \ 123 *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0]) 124 125 #else /* _LITTLE_ENDIAN */ 126 127 #define LOAD_BUFF(buff) \ 128 *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1]) 129 130 #endif /* _LITTLE_ENDIAN */ 131 #endif /* _NO_LONGLONG */ 132 133 /***************************************************************/ 134 typedef union { 135 mlib_d64 d64; 136 struct { 137 mlib_s32 i0; 138 mlib_s32 i1; 139 } i32s; 140 struct { 141 mlib_s32 f0; 142 mlib_s32 f1; 143 } f32s; 144 } d64_2x32; 145 146 /***************************************************************/ 147 #define DEF_VARS(type) \ 148 type *adr_src, *sl, *sp = NULL; \ 149 type *adr_dst, *dl, *dp = NULL; \ 150 FTYPE *pbuff = buff; \ 151 mlib_s32 wid, hgt, sll, dll; \ 152 mlib_s32 nchannel, chan1; \ 153 mlib_s32 i, j, c 154 155 /***************************************************************/ 156 #define GET_SRC_DST_PARAMETERS(type) \ 157 hgt = mlib_ImageGetHeight(src); \ 158 wid = mlib_ImageGetWidth(src); \ 159 nchannel = mlib_ImageGetChannels(src); \ 160 sll = mlib_ImageGetStride(src) / sizeof(type); \ 161 dll = mlib_ImageGetStride(dst) / sizeof(type); \ 162 adr_src = (type *)mlib_ImageGetData(src); \ 163 adr_dst = (type *)mlib_ImageGetData(dst) 164 165 /***************************************************************/ 166 #if IMG_TYPE == 1 167 168 /* Test for the presence of any "1" bit in bits 169 8 to 31 of val. If present, then val is either 170 negative or >255. If over/underflows of 8 bits 171 are uncommon, then this technique can be a win, 172 since only a single test, rather than two, is 173 necessary to determine if clamping is needed. 174 On the other hand, if over/underflows are common, 175 it adds an extra test. 176 */ 177 #define CLAMP_STORE(dst, val) \ 178 if (val & 0xffffff00) { \ 179 if (val < MLIB_U8_MIN) \ 180 dst = MLIB_U8_MIN; \ 181 else \ 182 dst = MLIB_U8_MAX; \ 183 } else { \ 184 dst = (mlib_u8)val; \ 185 } 186 187 #elif IMG_TYPE == 2 188 189 #define CLAMP_STORE(dst, val) \ 190 if (val >= MLIB_S16_MAX) \ 191 dst = MLIB_S16_MAX; \ 192 else if (val <= MLIB_S16_MIN) \ 193 dst = MLIB_S16_MIN; \ 194 else \ 195 dst = (mlib_s16)val 196 197 #elif IMG_TYPE == 3 198 199 #define CLAMP_STORE(dst, val) \ 200 if (val >= MLIB_U16_MAX) \ 201 dst = MLIB_U16_MAX; \ 202 else if (val <= MLIB_U16_MIN) \ 203 dst = MLIB_U16_MIN; \ 204 else \ 205 dst = (mlib_u16)val 206 207 #endif /* IMG_TYPE == 1 */ 208 209 /***************************************************************/ 210 #define MAX_KER 7 211 #define MAX_N 15 212 213 static mlib_status mlib_ImageConv1xN(mlib_image *dst, 214 const mlib_image *src, 215 const mlib_d64 *k, 216 mlib_s32 n, 217 mlib_s32 dn, 218 mlib_s32 cmask) 219 { 220 FTYPE buff[BUFF_SIZE]; 221 mlib_s32 off, kh; 222 mlib_s32 d0, d1; 223 const FTYPE *pk; 224 FTYPE k0, k1, k2, k3; 225 FTYPE p0, p1, p2, p3, p4; 226 DEF_VARS(DTYPE); 227 DTYPE *sl_c, *dl_c, *sl0; 228 mlib_s32 l, hsize, max_hsize; 229 GET_SRC_DST_PARAMETERS(DTYPE); 230 231 hgt -= (n - 1); 232 adr_dst += dn*dll; 233 234 max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll; 235 236 if (!max_hsize) max_hsize = 1; 237 238 if (max_hsize > BUFF_SIZE) { 239 pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize); 240 } 241 242 chan1 = nchannel; 243 244 sl_c = adr_src; 245 dl_c = adr_dst; 246 247 for (l = 0; l < hgt; l += hsize) { 248 hsize = hgt - l; 249 250 if (hsize > max_hsize) hsize = max_hsize; 251 252 for (c = 0; c < nchannel; c++) { 253 if (!(cmask & (1 << (chan1 - 1 - c)))) continue; 254 255 sl = sl_c + c; 256 dl = dl_c + c; 257 258 for (j = 0; j < hsize; j++) pbuff[j] = 0.0; 259 260 for (i = 0; i < wid; i++) { 261 sl0 = sl; 262 263 for (off = 0; off < (n - 4); off += 4) { 264 pk = k + off; 265 sp = sl0; 266 267 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 268 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll]; 269 sp += 3*sll; 270 271 for (j = 0; j < hsize; j += 2) { 272 p0 = p2; p1 = p3; p2 = p4; 273 p3 = sp[0]; 274 p4 = sp[sll]; 275 276 pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; 277 pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; 278 279 sp += 2*sll; 280 } 281 282 sl0 += 4*sll; 283 } 284 285 pk = k + off; 286 sp = sl0; 287 288 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 289 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll]; 290 291 dp = dl; 292 kh = n - off; 293 294 if (kh == 4) { 295 sp += 3*sll; 296 297 for (j = 0; j <= (hsize - 2); j += 2) { 298 p0 = p2; p1 = p3; p2 = p4; 299 p3 = sp[0]; 300 p4 = sp[sll]; 301 302 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]); 303 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]); 304 305 dp[0 ] = FROM_S32(d0); 306 dp[dll] = FROM_S32(d1); 307 308 pbuff[j] = 0; 309 pbuff[j + 1] = 0; 310 311 sp += 2*sll; 312 dp += 2*dll; 313 } 314 315 if (j < hsize) { 316 p0 = p2; p1 = p3; p2 = p4; 317 p3 = sp[0]; 318 319 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]); 320 321 pbuff[j] = 0; 322 323 dp[0] = FROM_S32(d0); 324 } 325 326 } else if (kh == 3) { 327 sp += 2*sll; 328 329 for (j = 0; j <= (hsize - 2); j += 2) { 330 p0 = p2; p1 = p3; 331 p2 = sp[0]; 332 p3 = sp[sll]; 333 334 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]); 335 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]); 336 337 dp[0 ] = FROM_S32(d0); 338 dp[dll] = FROM_S32(d1); 339 340 pbuff[j] = 0; 341 pbuff[j + 1] = 0; 342 343 sp += 2*sll; 344 dp += 2*dll; 345 } 346 347 if (j < hsize) { 348 p0 = p2; p1 = p3; 349 p2 = sp[0]; 350 351 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]); 352 353 pbuff[j] = 0; 354 355 dp[0] = FROM_S32(d0); 356 } 357 358 } else if (kh == 2) { 359 sp += sll; 360 361 for (j = 0; j <= (hsize - 2); j += 2) { 362 p0 = p2; 363 p1 = sp[0]; 364 p2 = sp[sll]; 365 366 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]); 367 d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]); 368 369 dp[0 ] = FROM_S32(d0); 370 dp[dll] = FROM_S32(d1); 371 372 pbuff[j] = 0; 373 pbuff[j + 1] = 0; 374 375 sp += 2*sll; 376 dp += 2*dll; 377 } 378 379 if (j < hsize) { 380 p0 = p2; 381 p1 = sp[0]; 382 383 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]); 384 385 pbuff[j] = 0; 386 387 dp[0] = FROM_S32(d0); 388 } 389 390 } else /* if (kh == 1) */ { 391 for (j = 0; j < hsize; j++) { 392 p0 = sp[0]; 393 394 d0 = D2I(p0*k0 + pbuff[j]); 395 396 dp[0] = FROM_S32(d0); 397 398 pbuff[j] = 0; 399 400 sp += sll; 401 dp += dll; 402 } 403 } 404 405 sl += chan1; 406 dl += chan1; 407 } 408 } 409 410 sl_c += max_hsize*sll; 411 dl_c += max_hsize*dll; 412 } 413 414 if (pbuff != buff) mlib_free(pbuff); 415 416 return MLIB_SUCCESS; 417 } 418 419 /***************************************************************/ 420 mlib_status CONV_FUNC(MxN)(mlib_image *dst, 421 const mlib_image *src, 422 const mlib_s32 *kernel, 423 mlib_s32 m, 424 mlib_s32 n, 425 mlib_s32 dm, 426 mlib_s32 dn, 427 mlib_s32 scale, 428 mlib_s32 cmask) 429 { 430 FTYPE buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)]; 431 FTYPE **buffs = buffs_arr, *buffd; 432 FTYPE akernel[256], *k = akernel, fscale = DSCALE; 433 mlib_s32 mn, l, off, kw, bsize, buff_ind; 434 mlib_s32 d0, d1; 435 FTYPE k0, k1, k2, k3, k4, k5, k6; 436 FTYPE p0, p1, p2, p3, p4, p5, p6, p7; 437 d64_2x32 dd; 438 DEF_VARS(DTYPE); 439 mlib_s32 chan2; 440 mlib_s32 *buffo, *buffi; 441 mlib_status status = MLIB_SUCCESS; 442 443 GET_SRC_DST_PARAMETERS(DTYPE); 444 445 if (scale > 30) { 446 fscale *= 1.0/(1 << 30); 447 scale -= 30; 448 } 449 450 fscale /= (1 << scale); 451 452 mn = m*n; 453 454 if (mn > 256) { 455 k = mlib_malloc(mn*sizeof(mlib_d64)); 456 457 if (k == NULL) return MLIB_FAILURE; 458 } 459 460 for (i = 0; i < mn; i++) { 461 k[i] = kernel[i]*fscale; 462 } 463 464 if (m == 1) { 465 status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask); 466 FREE_AND_RETURN_STATUS; 467 } 468 469 bsize = (n + 3)*wid; 470 471 if ((bsize > BUFF_SIZE) || (n > MAX_N)) { 472 pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1)); 473 474 if (pbuff == NULL) { 475 status = MLIB_FAILURE; 476 FREE_AND_RETURN_STATUS; 477 } 478 buffs = (FTYPE **)(pbuff + bsize); 479 } 480 481 for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid; 482 for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l]; 483 buffd = buffs[n] + wid; 484 buffo = (mlib_s32*)(buffd + wid); 485 buffi = buffo + (wid &~ 1); 486 487 chan1 = nchannel; 488 chan2 = chan1 + chan1; 489 490 wid -= (m - 1); 491 hgt -= (n - 1); 492 adr_dst += dn*dll + dm*nchannel; 493 494 for (c = 0; c < nchannel; c++) { 495 if (!(cmask & (1 << (chan1 - 1 - c)))) continue; 496 497 sl = adr_src + c; 498 dl = adr_dst + c; 499 500 for (l = 0; l < n; l++) { 501 FTYPE *buff = buffs[l]; 502 503 for (i = 0; i < wid + (m - 1); i++) { 504 buff[i] = (FTYPE)sl[i*chan1]; 505 } 506 507 sl += sll; 508 } 509 510 buff_ind = 0; 511 512 for (i = 0; i < wid; i++) buffd[i] = 0.0; 513 514 for (j = 0; j < hgt; j++) { 515 FTYPE **buffc = buffs + buff_ind; 516 FTYPE *buffn = buffc[n]; 517 FTYPE *pk = k; 518 519 for (l = 0; l < n; l++) { 520 FTYPE *buff_l = buffc[l]; 521 522 for (off = 0; off < m;) { 523 FTYPE *buff = buff_l + off; 524 525 kw = m - off; 526 527 if (kw > 2*MAX_KER) kw = MAX_KER; else 528 if (kw > MAX_KER) kw = kw/2; 529 off += kw; 530 531 sp = sl; 532 dp = dl; 533 534 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 535 p5 = buff[3]; p6 = buff[4]; p7 = buff[5]; 536 537 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 538 k4 = pk[4]; k5 = pk[5]; k6 = pk[6]; 539 pk += kw; 540 541 if (kw == 7) { 542 543 if (l < (n - 1) || off < m) { 544 for (i = 0; i <= (wid - 2); i += 2) { 545 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 546 547 p6 = buff[i + 6]; p7 = buff[i + 7]; 548 549 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; 550 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; 551 } 552 553 } else { 554 for (i = 0; i <= (wid - 2); i += 2) { 555 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 556 557 p6 = buff[i + 6]; p7 = buff[i + 7]; 558 559 LOAD_BUFF(buffi); 560 561 dd.d64 = *(FTYPE *)(buffi + i); 562 buffn[i ] = (FTYPE)dd.i32s.i0; 563 buffn[i + 1] = (FTYPE)dd.i32s.i1; 564 565 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]); 566 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]); 567 568 dp[0 ] = FROM_S32(d0); 569 dp[chan1] = FROM_S32(d1); 570 571 buffd[i ] = 0.0; 572 buffd[i + 1] = 0.0; 573 574 sp += chan2; 575 dp += chan2; 576 } 577 } 578 579 } else if (kw == 6) { 580 581 if (l < (n - 1) || off < m) { 582 for (i = 0; i <= (wid - 2); i += 2) { 583 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 584 585 p5 = buff[i + 5]; p6 = buff[i + 6]; 586 587 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; 588 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; 589 } 590 591 } else { 592 for (i = 0; i <= (wid - 2); i += 2) { 593 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 594 595 p5 = buff[i + 5]; p6 = buff[i + 6]; 596 597 buffn[i ] = (FTYPE)sp[0]; 598 buffn[i + 1] = (FTYPE)sp[chan1]; 599 600 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]); 601 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]); 602 603 dp[0 ] = FROM_S32(d0); 604 dp[chan1] = FROM_S32(d1); 605 606 buffd[i ] = 0.0; 607 buffd[i + 1] = 0.0; 608 609 sp += chan2; 610 dp += chan2; 611 } 612 } 613 614 } else if (kw == 5) { 615 616 if (l < (n - 1) || off < m) { 617 for (i = 0; i <= (wid - 2); i += 2) { 618 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 619 620 p4 = buff[i + 4]; p5 = buff[i + 5]; 621 622 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; 623 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; 624 } 625 626 } else { 627 for (i = 0; i <= (wid - 2); i += 2) { 628 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 629 630 p4 = buff[i + 4]; p5 = buff[i + 5]; 631 632 buffn[i ] = (FTYPE)sp[0]; 633 buffn[i + 1] = (FTYPE)sp[chan1]; 634 635 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]); 636 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]); 637 638 dp[0 ] = FROM_S32(d0); 639 dp[chan1] = FROM_S32(d1); 640 641 buffd[i ] = 0.0; 642 buffd[i + 1] = 0.0; 643 644 sp += chan2; 645 dp += chan2; 646 } 647 } 648 649 } else if (kw == 4) { 650 651 if (l < (n - 1) || off < m) { 652 for (i = 0; i <= (wid - 2); i += 2) { 653 p0 = p2; p1 = p3; p2 = p4; 654 655 p3 = buff[i + 3]; p4 = buff[i + 4]; 656 657 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; 658 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; 659 } 660 661 } else { 662 for (i = 0; i <= (wid - 2); i += 2) { 663 p0 = p2; p1 = p3; p2 = p4; 664 665 p3 = buff[i + 3]; p4 = buff[i + 4]; 666 667 buffn[i ] = (FTYPE)sp[0]; 668 buffn[i + 1] = (FTYPE)sp[chan1]; 669 670 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]); 671 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]); 672 673 dp[0 ] = FROM_S32(d0); 674 dp[chan1] = FROM_S32(d1); 675 676 buffd[i ] = 0.0; 677 buffd[i + 1] = 0.0; 678 679 sp += chan2; 680 dp += chan2; 681 } 682 } 683 684 } else if (kw == 3) { 685 686 if (l < (n - 1) || off < m) { 687 for (i = 0; i <= (wid - 2); i += 2) { 688 p0 = p2; p1 = p3; 689 690 p2 = buff[i + 2]; p3 = buff[i + 3]; 691 692 buffd[i ] += p0*k0 + p1*k1 + p2*k2; 693 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2; 694 } 695 696 } else { 697 for (i = 0; i <= (wid - 2); i += 2) { 698 p0 = p2; p1 = p3; 699 700 p2 = buff[i + 2]; p3 = buff[i + 3]; 701 702 buffn[i ] = (FTYPE)sp[0]; 703 buffn[i + 1] = (FTYPE)sp[chan1]; 704 705 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]); 706 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]); 707 708 dp[0 ] = FROM_S32(d0); 709 dp[chan1] = FROM_S32(d1); 710 711 buffd[i ] = 0.0; 712 buffd[i + 1] = 0.0; 713 714 sp += chan2; 715 dp += chan2; 716 } 717 } 718 719 } else /*if (kw == 2)*/ { 720 721 if (l < (n - 1) || off < m) { 722 for (i = 0; i <= (wid - 2); i += 2) { 723 p0 = p2; 724 725 p1 = buff[i + 1]; p2 = buff[i + 2]; 726 727 buffd[i ] += p0*k0 + p1*k1; 728 buffd[i + 1] += p1*k0 + p2*k1; 729 } 730 731 } else { 732 for (i = 0; i <= (wid - 2); i += 2) { 733 p0 = p2; 734 735 p1 = buff[i + 1]; p2 = buff[i + 2]; 736 737 buffn[i ] = (FTYPE)sp[0]; 738 buffn[i + 1] = (FTYPE)sp[chan1]; 739 740 d0 = D2I(p0*k0 + p1*k1 + buffd[i ]); 741 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]); 742 743 dp[0 ] = FROM_S32(d0); 744 dp[chan1] = FROM_S32(d1); 745 746 buffd[i ] = 0.0; 747 buffd[i + 1] = 0.0; 748 749 sp += chan2; 750 dp += chan2; 751 } 752 } 753 } 754 } 755 } 756 757 /* last pixels */ 758 for (; i < wid; i++) { 759 FTYPE *pk = k, s = 0; 760 mlib_s32 x, d0; 761 762 for (l = 0; l < n; l++) { 763 FTYPE *buff = buffc[l] + i; 764 765 for (x = 0; x < m; x++) s += buff[x] * (*pk++); 766 } 767 768 d0 = D2I(s); 769 dp[0] = FROM_S32(d0); 770 771 buffn[i] = (FTYPE)sp[0]; 772 773 sp += chan1; 774 dp += chan1; 775 } 776 777 for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1]; 778 779 /* next line */ 780 sl += sll; 781 dl += dll; 782 783 buff_ind++; 784 785 if (buff_ind >= n + 1) buff_ind = 0; 786 } 787 } 788 789 FREE_AND_RETURN_STATUS; 790 } 791 792 /***************************************************************/ 793 /* for x86, using integer multiplies is faster */ 794 795 #define STORE_RES(res, x) \ 796 x >>= shift2; \ 797 CLAMP_STORE(res, x) 798 799 mlib_status CONV_FUNC_I(MxN)(mlib_image *dst, 800 const mlib_image *src, 801 const mlib_s32 *kernel, 802 mlib_s32 m, 803 mlib_s32 n, 804 mlib_s32 dm, 805 mlib_s32 dn, 806 mlib_s32 scale, 807 mlib_s32 cmask) 808 { 809 mlib_s32 buff[BUFF_SIZE], *buffd = buff; 810 mlib_s32 l, off, kw; 811 mlib_s32 d0, d1, shift1, shift2; 812 mlib_s32 k0, k1, k2, k3, k4, k5, k6; 813 mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7; 814 DTYPE *adr_src, *sl, *sp = NULL; 815 DTYPE *adr_dst, *dl, *dp = NULL; 816 mlib_s32 wid, hgt, sll, dll; 817 mlib_s32 nchannel, chan1; 818 mlib_s32 i, j, c; 819 mlib_s32 chan2; 820 mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl; 821 GET_SRC_DST_PARAMETERS(DTYPE); 822 823 #if IMG_TYPE != 1 824 shift1 = 16; 825 #else 826 shift1 = 8; 827 #endif /* IMG_TYPE != 1 */ 828 shift2 = scale - shift1; 829 830 chan1 = nchannel; 831 chan2 = chan1 + chan1; 832 833 wid -= (m - 1); 834 hgt -= (n - 1); 835 adr_dst += dn*dll + dm*nchannel; 836 837 if (wid > BUFF_SIZE) { 838 buffd = mlib_malloc(sizeof(mlib_s32)*wid); 839 840 if (buffd == NULL) return MLIB_FAILURE; 841 } 842 843 if (m*n > MAX_N*MAX_N) { 844 k = mlib_malloc(sizeof(mlib_s32)*(m*n)); 845 846 if (k == NULL) { 847 if (buffd != buff) mlib_free(buffd); 848 return MLIB_FAILURE; 849 } 850 } 851 852 for (i = 0; i < m*n; i++) { 853 k[i] = kernel[i] >> shift1; 854 } 855 856 for (c = 0; c < nchannel; c++) { 857 if (!(cmask & (1 << (nchannel - 1 - c)))) continue; 858 859 sl = adr_src + c; 860 dl = adr_dst + c; 861 862 for (i = 0; i < wid; i++) buffd[i] = 0; 863 864 for (j = 0; j < hgt; j++) { 865 mlib_s32 *pk = k; 866 867 for (l = 0; l < n; l++) { 868 DTYPE *sp0 = sl + l*sll; 869 870 for (off = 0; off < m;) { 871 sp = sp0 + off*chan1; 872 dp = dl; 873 874 kw = m - off; 875 876 if (kw > 2*MAX_KER) kw = MAX_KER; else 877 if (kw > MAX_KER) kw = kw/2; 878 off += kw; 879 880 p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2]; 881 p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1]; 882 883 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 884 k4 = pk[4]; k5 = pk[5]; k6 = pk[6]; 885 pk += kw; 886 887 sp += (kw - 1)*chan1; 888 889 if (kw == 7) { 890 891 if (l < (n - 1) || off < m) { 892 for (i = 0; i <= (wid - 2); i += 2) { 893 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 894 p6 = sp[0]; 895 p7 = sp[chan1]; 896 897 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; 898 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; 899 900 sp += chan2; 901 } 902 903 } else { 904 for (i = 0; i <= (wid - 2); i += 2) { 905 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 906 p6 = sp[0]; 907 p7 = sp[chan1]; 908 909 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]); 910 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]); 911 912 STORE_RES(dp[0 ], d0); 913 STORE_RES(dp[chan1], d1); 914 915 buffd[i ] = 0; 916 buffd[i + 1] = 0; 917 918 sp += chan2; 919 dp += chan2; 920 } 921 } 922 923 } else if (kw == 6) { 924 925 if (l < (n - 1) || off < m) { 926 for (i = 0; i <= (wid - 2); i += 2) { 927 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 928 p5 = sp[0]; 929 p6 = sp[chan1]; 930 931 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; 932 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; 933 934 sp += chan2; 935 } 936 937 } else { 938 for (i = 0; i <= (wid - 2); i += 2) { 939 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 940 p5 = sp[0]; 941 p6 = sp[chan1]; 942 943 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]); 944 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]); 945 946 STORE_RES(dp[0 ], d0); 947 STORE_RES(dp[chan1], d1); 948 949 buffd[i ] = 0; 950 buffd[i + 1] = 0; 951 952 sp += chan2; 953 dp += chan2; 954 } 955 } 956 957 } else if (kw == 5) { 958 959 if (l < (n - 1) || off < m) { 960 for (i = 0; i <= (wid - 2); i += 2) { 961 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 962 p4 = sp[0]; 963 p5 = sp[chan1]; 964 965 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; 966 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; 967 968 sp += chan2; 969 } 970 971 } else { 972 for (i = 0; i <= (wid - 2); i += 2) { 973 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 974 p4 = sp[0]; 975 p5 = sp[chan1]; 976 977 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]); 978 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]); 979 980 STORE_RES(dp[0 ], d0); 981 STORE_RES(dp[chan1], d1); 982 983 buffd[i ] = 0; 984 buffd[i + 1] = 0; 985 986 sp += chan2; 987 dp += chan2; 988 } 989 } 990 991 } else if (kw == 4) { 992 993 if (l < (n - 1) || off < m) { 994 for (i = 0; i <= (wid - 2); i += 2) { 995 p0 = p2; p1 = p3; p2 = p4; 996 p3 = sp[0]; 997 p4 = sp[chan1]; 998 999 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; 1000 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; 1001 1002 sp += chan2; 1003 } 1004 1005 } else { 1006 for (i = 0; i <= (wid - 2); i += 2) { 1007 p0 = p2; p1 = p3; p2 = p4; 1008 p3 = sp[0]; 1009 p4 = sp[chan1]; 1010 1011 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]); 1012 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]); 1013 1014 STORE_RES(dp[0 ], d0); 1015 STORE_RES(dp[chan1], d1); 1016 1017 buffd[i ] = 0; 1018 buffd[i + 1] = 0; 1019 1020 sp += chan2; 1021 dp += chan2; 1022 } 1023 } 1024 1025 } else if (kw == 3) { 1026 1027 if (l < (n - 1) || off < m) { 1028 for (i = 0; i <= (wid - 2); i += 2) { 1029 p0 = p2; p1 = p3; 1030 p2 = sp[0]; 1031 p3 = sp[chan1]; 1032 1033 buffd[i ] += p0*k0 + p1*k1 + p2*k2; 1034 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2; 1035 1036 sp += chan2; 1037 } 1038 1039 } else { 1040 for (i = 0; i <= (wid - 2); i += 2) { 1041 p0 = p2; p1 = p3; 1042 p2 = sp[0]; 1043 p3 = sp[chan1]; 1044 1045 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i ]); 1046 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]); 1047 1048 STORE_RES(dp[0 ], d0); 1049 STORE_RES(dp[chan1], d1); 1050 1051 buffd[i ] = 0; 1052 buffd[i + 1] = 0; 1053 1054 sp += chan2; 1055 dp += chan2; 1056 } 1057 } 1058 1059 } else if (kw == 2) { 1060 1061 if (l < (n - 1) || off < m) { 1062 for (i = 0; i <= (wid - 2); i += 2) { 1063 p0 = p2; 1064 p1 = sp[0]; 1065 p2 = sp[chan1]; 1066 1067 buffd[i ] += p0*k0 + p1*k1; 1068 buffd[i + 1] += p1*k0 + p2*k1; 1069 1070 sp += chan2; 1071 } 1072 1073 } else { 1074 for (i = 0; i <= (wid - 2); i += 2) { 1075 p0 = p2; 1076 p1 = sp[0]; 1077 p2 = sp[chan1]; 1078 1079 d0 = (p0*k0 + p1*k1 + buffd[i ]); 1080 d1 = (p1*k0 + p2*k1 + buffd[i + 1]); 1081 1082 STORE_RES(dp[0 ], d0); 1083 STORE_RES(dp[chan1], d1); 1084 1085 buffd[i ] = 0; 1086 buffd[i + 1] = 0; 1087 1088 sp += chan2; 1089 dp += chan2; 1090 } 1091 } 1092 1093 } else /*if (kw == 1)*/ { 1094 1095 if (l < (n - 1) || off < m) { 1096 for (i = 0; i <= (wid - 2); i += 2) { 1097 p0 = sp[0]; 1098 p1 = sp[chan1]; 1099 1100 buffd[i ] += p0*k0; 1101 buffd[i + 1] += p1*k0; 1102 1103 sp += chan2; 1104 } 1105 1106 } else { 1107 for (i = 0; i <= (wid - 2); i += 2) { 1108 p0 = sp[0]; 1109 p1 = sp[chan1]; 1110 1111 d0 = (p0*k0 + buffd[i ]); 1112 d1 = (p1*k0 + buffd[i + 1]); 1113 1114 STORE_RES(dp[0 ], d0); 1115 STORE_RES(dp[chan1], d1); 1116 1117 buffd[i ] = 0; 1118 buffd[i + 1] = 0; 1119 1120 sp += chan2; 1121 dp += chan2; 1122 } 1123 } 1124 } 1125 } 1126 } 1127 1128 /* last pixels */ 1129 for (; i < wid; i++) { 1130 mlib_s32 *pk = k, s = 0; 1131 mlib_s32 x; 1132 1133 for (l = 0; l < n; l++) { 1134 sp = sl + l*sll + i*chan1; 1135 1136 for (x = 0; x < m; x++) { 1137 s += sp[0] * pk[0]; 1138 sp += chan1; 1139 pk ++; 1140 } 1141 } 1142 1143 STORE_RES(dp[0], s); 1144 1145 sp += chan1; 1146 dp += chan1; 1147 } 1148 1149 sl += sll; 1150 dl += dll; 1151 } 1152 } 1153 1154 if (buffd != buff) mlib_free(buffd); 1155 if (k != k_locl) mlib_free(k); 1156 1157 return MLIB_SUCCESS; 1158 } 1159 1160 /***************************************************************/