1 /* 2 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 27 28 /* 29 * FILENAME: mlib_ImageChannelExtract_1.c 30 * 31 * FUNCTIONS 32 * mlib_v_ImageChannelExtract_U8_21_A8D1X8 33 * mlib_v_ImageChannelExtract_U8_21_A8D2X8 34 * mlib_v_ImageChannelExtract_U8_21_D1 35 * mlib_v_ImageChannelExtract_U8_21 36 * mlib_v_ImageChannelExtract_U8_31_A8D1X8 37 * mlib_v_ImageChannelExtract_U8_31_A8D2X8 38 * mlib_v_ImageChannelExtract_U8_31_D1 39 * mlib_v_ImageChannelExtract_U8_31 40 * mlib_v_ImageChannelExtract_U8_41_A8D1X8 41 * mlib_v_ImageChannelExtract_U8_41_A8D2X8 42 * mlib_v_ImageChannelExtract_U8_41_D1 43 * mlib_v_ImageChannelExtract_U8_41 44 * mlib_v_ImageChannelExtract_S16_21_A8D1X4 45 * mlib_v_ImageChannelExtract_S16_21_A8D2X4 46 * mlib_v_ImageChannelExtract_S16_21_D1 47 * mlib_v_ImageChannelExtract_S16_21 48 * mlib_v_ImageChannelExtract_S16_31_A8D1X4 49 * mlib_v_ImageChannelExtract_S16_31_A8D2X4 50 * mlib_v_ImageChannelExtract_S16_31_D1 51 * mlib_v_ImageChannelExtract_S16_31 52 * mlib_v_ImageChannelExtract_S16_41_A8D1X4 53 * mlib_v_ImageChannelExtract_S16_41_A8D2X4 54 * mlib_v_ImageChannelExtract_S16_41_D1 55 * mlib_v_ImageChannelExtract_S16_41 56 * 57 * ARGUMENT 58 * src pointer to source image data 59 * dst pointer to destination image data 60 * slb source image line stride in bytes 61 * dlb destination image line stride in bytes 62 * dsize image data size in pixels 63 * xsize image width in pixels 64 * ysize image height in lines 65 * cmask channel mask 66 * 67 * DESCRIPTION 68 * Extract the one selected channel of the source image into the 69 * 1-channel destination image. 70 * 71 * NOTE 72 * These functions are separated from mlib_ImageChannelExtract.c 73 * for loop unrolling and structure clarity. 74 */ 75 76 #include "vis_proto.h" 77 #include "mlib_image.h" 78 #include "mlib_v_ImageChannelExtract.h" 79 80 /***************************************************************/ 81 #define CHANNELEXTRACT_U8_21L(sd0, sd1, dd) \ 82 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1)); \ 83 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1)); \ 84 sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb)); \ 85 sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb)); \ 86 dd = vis_fpmerge(vis_read_hi(sdc), vis_read_hi(sdd)) 87 88 /***************************************************************/ 89 #define CHANNELEXTRACT_U8_21R(sd0, sd1, dd) \ 90 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1)); \ 91 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1)); \ 92 sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb)); \ 93 sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb)); \ 94 dd = vis_fpmerge(vis_read_lo(sdc), vis_read_lo(sdd)) 95 96 /***************************************************************/ 97 /* extract one channel from a 2-channel image. 98 * both source and destination image data are 8-byte aligned. 99 * xsize is multiple of 8. 100 */ 101 102 void mlib_v_ImageChannelExtract_U8_21_A8D1X8(const mlib_u8 *src, 103 mlib_u8 *dst, 104 mlib_s32 dsize, 105 mlib_s32 cmask) 106 { 107 mlib_d64 *sp, *dp; 108 mlib_d64 sd0, sd1; 109 mlib_d64 sda, sdb, sdc, sdd; 110 mlib_d64 dd; 111 mlib_s32 i; 112 113 sp = (mlib_d64 *) src; 114 dp = (mlib_d64 *) dst; 115 116 if (cmask == 2) { 117 #pragma pipeloop(0) 118 for (i = 0; i < dsize / 8; i++) { 119 sd0 = *sp++; 120 sd1 = *sp++; 121 CHANNELEXTRACT_U8_21L(sd0, sd1, dd); 122 *dp++ = dd; 123 } 124 } 125 else { 126 #pragma pipeloop(0) 127 for (i = 0; i < dsize / 8; i++) { 128 sd0 = *sp++; 129 sd1 = *sp++; 130 CHANNELEXTRACT_U8_21R(sd0, sd1, dd); 131 *dp++ = dd; 132 } 133 } 134 } 135 136 /***************************************************************/ 137 /* extract one channel from a 2-channel image. 138 * both source and destination image data are 8-byte aligned. 139 * xsize is multiple of 8. 140 */ 141 142 void mlib_v_ImageChannelExtract_U8_21_A8D2X8(const mlib_u8 *src, 143 mlib_s32 slb, 144 mlib_u8 *dst, 145 mlib_s32 dlb, 146 mlib_s32 xsize, 147 mlib_s32 ysize, 148 mlib_s32 cmask) 149 { 150 mlib_d64 *sp, *dp; 151 mlib_d64 *sl, *dl; 152 mlib_d64 sd0, sd1; 153 mlib_d64 sda, sdb, sdc, sdd; 154 mlib_d64 dd; 155 mlib_s32 i, j; 156 157 sp = sl = (mlib_d64 *) src; 158 dp = dl = (mlib_d64 *) dst; 159 160 if (cmask == 2) { 161 for (j = 0; j < ysize; j++) { 162 #pragma pipeloop(0) 163 for (i = 0; i < xsize / 8; i++) { 164 sd0 = *sp++; 165 sd1 = *sp++; 166 CHANNELEXTRACT_U8_21L(sd0, sd1, dd); 167 *dp++ = dd; 168 } 169 170 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 171 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 172 } 173 } 174 else { 175 for (j = 0; j < ysize; j++) { 176 #pragma pipeloop(0) 177 for (i = 0; i < xsize / 8; i++) { 178 sd0 = *sp++; 179 sd1 = *sp++; 180 CHANNELEXTRACT_U8_21R(sd0, sd1, dd); 181 *dp++ = dd; 182 } 183 184 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 185 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 186 } 187 } 188 } 189 190 /***************************************************************/ 191 /* extract one channel from a 2-channel image. 192 */ 193 194 void mlib_v_ImageChannelExtract_U8_21_D1(const mlib_u8 *src, 195 mlib_u8 *dst, 196 mlib_s32 dsize, 197 mlib_s32 cmask) 198 { 199 mlib_u8 *sa, *da; 200 mlib_u8 *dend, *dend2; /* end points in dst */ 201 mlib_d64 *dp; /* 8-byte aligned start points in dst */ 202 mlib_d64 *sp; /* 8-byte aligned start point in src */ 203 mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */ 204 mlib_d64 sda, sdb, sdc, sdd; 205 mlib_d64 dd0, dd1; 206 mlib_s32 soff; /* offset of address in src */ 207 mlib_s32 doff; /* offset of address in dst */ 208 mlib_s32 off; /* offset of src over dst */ 209 mlib_s32 emask; /* edge mask */ 210 mlib_s32 i, n; 211 212 sa = (void *)src; 213 da = dst; 214 215 /* prepare the source address */ 216 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 217 soff = ((mlib_addr) sa & 7); 218 219 /* prepare the destination addresses */ 220 dp = (mlib_d64 *) ((mlib_addr) da & (~7)); 221 doff = ((mlib_addr) da & 7); 222 dend = da + dsize - 1; 223 dend2 = dend - 7; 224 225 /* calculate the src's offset over dst */ 226 if (cmask == 2) { 227 off = soff / 2 - doff; 228 } 229 else { 230 off = (soff + 1) / 2 - doff; 231 } 232 233 if (((cmask == 2) && (soff % 2 == 0)) || ((cmask == 1) && (soff % 2 != 0))) { /* extract even bytes */ 234 235 if (off == 0) { /* src and dst have same alignment */ 236 237 /* generate edge mask for the start point */ 238 emask = vis_edge8(da, dend); 239 240 /* load 16 bytes */ 241 sd0 = *sp++; 242 sd1 = *sp++; 243 244 /* extract, including some garbage at the start point */ 245 CHANNELEXTRACT_U8_21L(sd0, sd1, dd0); 246 247 /* store 8 bytes result */ 248 vis_pst_8(dd0, dp++, emask); 249 250 if ((mlib_addr) dp <= (mlib_addr) dend2) { 251 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 252 253 /* 8-pixel column loop, emask not needed */ 254 #pragma pipeloop(0) 255 for (i = 0; i < n; i++) { 256 sd0 = *sp++; 257 sd1 = *sp++; 258 CHANNELEXTRACT_U8_21L(sd0, sd1, dd0); 259 *dp++ = dd0; 260 } 261 } 262 263 /* end point handling */ 264 if ((mlib_addr) dp <= (mlib_addr) dend) { 265 emask = vis_edge8(dp, dend); 266 sd0 = *sp++; 267 sd1 = *sp++; 268 CHANNELEXTRACT_U8_21L(sd0, sd1, dd0); 269 vis_pst_8(dd0, dp++, emask); 270 } 271 } 272 else { 273 vis_alignaddr((void *)0, off); 274 275 /* generate edge mask for the start point */ 276 emask = vis_edge8(da, dend); 277 278 if (off < 0) { 279 /* load 16 bytes */ 280 sd2 = *sp++; 281 sd3 = *sp++; 282 283 /* extract and store 8 bytes */ 284 CHANNELEXTRACT_U8_21L(sd2, sd3, dd1); 285 vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask); 286 } 287 else { 288 /* load 32 bytes */ 289 sd0 = *sp++; 290 sd1 = *sp++; 291 sd2 = *sp++; 292 sd3 = *sp++; 293 294 /* extract and store 8 bytes */ 295 CHANNELEXTRACT_U8_21L(sd0, sd1, dd0); 296 CHANNELEXTRACT_U8_21L(sd2, sd3, dd1); 297 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 298 } 299 300 if ((mlib_addr) dp <= (mlib_addr) dend2) { 301 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 302 303 /* 8-pixel column loop, emask not needed */ 304 #pragma pipeloop(0) 305 for (i = 0; i < n; i++) { 306 dd0 = dd1; 307 sd2 = *sp++; 308 sd3 = *sp++; 309 CHANNELEXTRACT_U8_21L(sd2, sd3, dd1); 310 *dp++ = vis_faligndata(dd0, dd1); 311 } 312 } 313 314 /* end point handling */ 315 if ((mlib_addr) dp <= (mlib_addr) dend) { 316 emask = vis_edge8(dp, dend); 317 dd0 = dd1; 318 sd2 = *sp++; 319 sd3 = *sp++; 320 CHANNELEXTRACT_U8_21L(sd2, sd3, dd1); 321 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 322 } 323 } 324 } 325 else { /* extract odd bytes */ 326 327 if (off == 0) { /* src and dst have same alignment */ 328 329 /* generate edge mask for the start point */ 330 emask = vis_edge8(da, dend); 331 332 /* load 16 bytes, don't care the garbage at the start point */ 333 sd0 = *sp++; 334 sd1 = *sp++; 335 336 /* extract and store 8 bytes */ 337 CHANNELEXTRACT_U8_21R(sd0, sd1, dd0); 338 vis_pst_8(dd0, dp++, emask); 339 340 if ((mlib_addr) dp <= (mlib_addr) dend2) { 341 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 342 343 /* 8-pixel column loop, emask not needed */ 344 #pragma pipeloop(0) 345 for (i = 0; i < n; i++) { 346 sd0 = *sp++; 347 sd1 = *sp++; 348 CHANNELEXTRACT_U8_21R(sd0, sd1, dd0); 349 *dp++ = dd0; 350 } 351 } 352 353 /* end point handling */ 354 if ((mlib_addr) dp <= (mlib_addr) dend) { 355 emask = vis_edge8(dp, dend); 356 sd0 = *sp++; 357 sd1 = *sp++; 358 CHANNELEXTRACT_U8_21R(sd0, sd1, dd0); 359 vis_pst_8(dd0, dp++, emask); 360 } 361 } 362 else { 363 vis_alignaddr((void *)0, off); 364 365 /* generate edge mask for the start point */ 366 emask = vis_edge8(da, dend); 367 368 if (off < 0) { 369 /* load 16 bytes */ 370 sd2 = *sp++; 371 sd3 = *sp++; 372 373 /* extract and store 8 bytes */ 374 CHANNELEXTRACT_U8_21R(sd2, sd3, dd1); 375 vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask); 376 } 377 else { 378 /* load 32 bytes */ 379 sd0 = *sp++; 380 sd1 = *sp++; 381 sd2 = *sp++; 382 sd3 = *sp++; 383 384 /* extract and store 8 bytes */ 385 CHANNELEXTRACT_U8_21R(sd0, sd1, dd0); 386 CHANNELEXTRACT_U8_21R(sd2, sd3, dd1); 387 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 388 } 389 390 if ((mlib_addr) dp <= (mlib_addr) dend2) { 391 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 392 393 /* 8-pixel column loop, emask not needed */ 394 #pragma pipeloop(0) 395 for (i = 0; i < n; i++) { 396 dd0 = dd1; 397 sd2 = *sp++; 398 sd3 = *sp++; 399 CHANNELEXTRACT_U8_21R(sd2, sd3, dd1); 400 *dp++ = vis_faligndata(dd0, dd1); 401 } 402 } 403 404 /* end point handling */ 405 if ((mlib_addr) dp <= (mlib_addr) dend) { 406 emask = vis_edge8(dp, dend); 407 dd0 = dd1; 408 sd2 = *sp++; 409 sd3 = *sp++; 410 CHANNELEXTRACT_U8_21R(sd2, sd3, dd1); 411 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 412 } 413 } 414 } 415 } 416 417 /***************************************************************/ 418 /* extract one channel from a 2-channel image. 419 */ 420 421 void mlib_v_ImageChannelExtract_U8_21(const mlib_u8 *src, 422 mlib_s32 slb, 423 mlib_u8 *dst, 424 mlib_s32 dlb, 425 mlib_s32 xsize, 426 mlib_s32 ysize, 427 mlib_s32 cmask) 428 { 429 mlib_u8 *sa, *da; 430 mlib_u8 *sl, *dl; 431 mlib_s32 j; 432 433 sa = sl = (void *)src; 434 da = dl = dst; 435 436 for (j = 0; j < ysize; j++) { 437 mlib_v_ImageChannelExtract_U8_21_D1(sa, da, xsize, cmask); 438 sa = sl += slb; 439 da = dl += dlb; 440 } 441 } 442 443 /***************************************************************/ 444 #define CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd) \ 445 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \ 446 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \ 447 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \ 448 sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \ 449 sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc)); \ 450 dd = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde)) 451 452 /***************************************************************/ 453 #define CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd) \ 454 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \ 455 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \ 456 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \ 457 sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \ 458 sde = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc)); \ 459 dd = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sde)) 460 461 /***************************************************************/ 462 #define CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd) \ 463 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \ 464 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \ 465 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \ 466 sdd = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc)); \ 467 sde = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc)); \ 468 dd = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde)) 469 470 /***************************************************************/ 471 void mlib_v_ImageChannelExtract_U8_31_A8D1X8(const mlib_u8 *src, 472 mlib_u8 *dst, 473 mlib_s32 dsize, 474 mlib_s32 cmask) 475 { 476 mlib_d64 *sp, *dp; 477 mlib_d64 sd0, sd1, sd2; 478 mlib_d64 sda, sdb, sdc, sdd, sde; 479 mlib_d64 dd; 480 mlib_s32 i; 481 482 sp = (mlib_d64 *) src; 483 dp = (mlib_d64 *) dst; 484 485 if (cmask == 4) { 486 #pragma pipeloop(0) 487 for (i = 0; i < dsize / 8; i++) { 488 sd0 = *sp++; 489 sd1 = *sp++; 490 sd2 = *sp++; 491 CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd); 492 *dp++ = dd; 493 } 494 } 495 else if (cmask == 2) { 496 #pragma pipeloop(0) 497 for (i = 0; i < dsize / 8; i++) { 498 sd0 = *sp++; 499 sd1 = *sp++; 500 sd2 = *sp++; 501 CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd); 502 *dp++ = dd; 503 } 504 } 505 else { 506 #pragma pipeloop(0) 507 for (i = 0; i < dsize / 8; i++) { 508 sd0 = *sp++; 509 sd1 = *sp++; 510 sd2 = *sp++; 511 CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd); 512 *dp++ = dd; 513 } 514 } 515 } 516 517 /***************************************************************/ 518 void mlib_v_ImageChannelExtract_U8_31_A8D2X8(const mlib_u8 *src, 519 mlib_s32 slb, 520 mlib_u8 *dst, 521 mlib_s32 dlb, 522 mlib_s32 xsize, 523 mlib_s32 ysize, 524 mlib_s32 cmask) 525 { 526 mlib_d64 *sp, *dp; 527 mlib_d64 *sl, *dl; 528 mlib_d64 sd0, sd1, sd2; 529 mlib_d64 sda, sdb, sdc, sdd, sde; 530 mlib_d64 dd; 531 mlib_s32 i, j; 532 533 sp = sl = (mlib_d64 *) src; 534 dp = dl = (mlib_d64 *) dst; 535 536 if (cmask == 4) { 537 for (j = 0; j < ysize; j++) { 538 #pragma pipeloop(0) 539 for (i = 0; i < xsize / 8; i++) { 540 sd0 = *sp++; 541 sd1 = *sp++; 542 sd2 = *sp++; 543 CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd); 544 *dp++ = dd; 545 } 546 547 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 548 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 549 } 550 } 551 else if (cmask == 2) { 552 for (j = 0; j < ysize; j++) { 553 #pragma pipeloop(0) 554 for (i = 0; i < xsize / 8; i++) { 555 sd0 = *sp++; 556 sd1 = *sp++; 557 sd2 = *sp++; 558 CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd); 559 *dp++ = dd; 560 } 561 562 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 563 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 564 } 565 } 566 else { 567 for (j = 0; j < ysize; j++) { 568 #pragma pipeloop(0) 569 for (i = 0; i < xsize / 8; i++) { 570 sd0 = *sp++; 571 sd1 = *sp++; 572 sd2 = *sp++; 573 CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd); 574 *dp++ = dd; 575 } 576 577 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 578 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 579 } 580 } 581 } 582 583 /***************************************************************/ 584 void mlib_v_ImageChannelExtract_U8_31_D1(const mlib_u8 *src, 585 mlib_u8 *dst, 586 mlib_s32 dsize, 587 mlib_s32 cmask) 588 { 589 mlib_u8 *sa, *da; 590 mlib_u8 *dend, *dend2; /* end points in dst */ 591 mlib_d64 *dp; /* 8-byte aligned start points in dst */ 592 mlib_d64 *sp; /* 8-byte aligned start point in src */ 593 mlib_d64 sd0, sd1, sd2; /* 8-byte source data */ 594 mlib_d64 sd3, sd4, sd5; 595 mlib_d64 sda, sdb, sdc, sdd, sde; 596 mlib_d64 dd0, dd1; 597 mlib_s32 soff; /* offset of address in src */ 598 mlib_s32 doff; /* offset of address in dst */ 599 mlib_s32 off; /* offset of src over dst */ 600 mlib_s32 emask; /* edge mask */ 601 mlib_s32 i, n; 602 603 sa = (void *)src; 604 da = dst; 605 606 /* prepare the source address */ 607 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 608 soff = ((mlib_addr) sa & 7); 609 610 /* prepare the destination addresses */ 611 dp = (mlib_d64 *) ((mlib_addr) da & (~7)); 612 doff = ((mlib_addr) da & 7); 613 dend = da + dsize - 1; 614 dend2 = dend - 7; 615 616 /* calculate the src's offset over dst */ 617 if (cmask == 4) { 618 off = soff / 3 - doff; 619 } 620 else if (cmask == 2) { 621 off = (soff + 1) / 3 - doff; 622 } 623 else { 624 off = (soff + 2) / 3 - doff; 625 } 626 627 if (((cmask == 4) && (soff % 3 == 0)) || 628 ((cmask == 2) && (soff % 3 == 2)) || 629 ((cmask == 1) && (soff % 3 == 1))) { /* extract left channel */ 630 631 if (off == 0) { /* src and dst have same alignment */ 632 633 /* generate edge mask for the start point */ 634 emask = vis_edge8(da, dend); 635 636 /* load 16 bytes */ 637 sd0 = *sp++; 638 sd1 = *sp++; 639 sd2 = *sp++; 640 641 /* extract, including some garbage at the start point */ 642 CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0); 643 644 /* store 8 bytes result */ 645 vis_pst_8(dd0, dp++, emask); 646 647 if ((mlib_addr) dp <= (mlib_addr) dend2) { 648 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 649 650 /* 8-pixel column loop, emask not needed */ 651 #pragma pipeloop(0) 652 for (i = 0; i < n; i++) { 653 sd0 = *sp++; 654 sd1 = *sp++; 655 sd2 = *sp++; 656 CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0); 657 *dp++ = dd0; 658 } 659 } 660 661 /* end point handling */ 662 if ((mlib_addr) dp <= (mlib_addr) dend) { 663 emask = vis_edge8(dp, dend); 664 sd0 = *sp++; 665 sd1 = *sp++; 666 sd2 = *sp++; 667 CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0); 668 vis_pst_8(dd0, dp++, emask); 669 } 670 } 671 else { 672 vis_alignaddr((void *)0, off); 673 674 /* generate edge mask for the start point */ 675 emask = vis_edge8(da, dend); 676 677 if (off < 0) { 678 /* load 24 bytes */ 679 sd3 = *sp++; 680 sd4 = *sp++; 681 sd5 = *sp++; 682 683 /* extract and store 8 bytes */ 684 CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1); 685 vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask); 686 } 687 else { 688 /* load 48 bytes */ 689 sd0 = *sp++; 690 sd1 = *sp++; 691 sd2 = *sp++; 692 sd3 = *sp++; 693 sd4 = *sp++; 694 sd5 = *sp++; 695 696 /* extract and store 8 bytes */ 697 CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0); 698 CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1); 699 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 700 } 701 702 if ((mlib_addr) dp <= (mlib_addr) dend2) { 703 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 704 705 /* 8-pixel column loop, emask not needed */ 706 #pragma pipeloop(0) 707 for (i = 0; i < n; i++) { 708 dd0 = dd1; 709 sd3 = *sp++; 710 sd4 = *sp++; 711 sd5 = *sp++; 712 CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1); 713 *dp++ = vis_faligndata(dd0, dd1); 714 } 715 } 716 717 /* end point handling */ 718 if ((mlib_addr) dp <= (mlib_addr) dend) { 719 emask = vis_edge8(dp, dend); 720 dd0 = dd1; 721 sd3 = *sp++; 722 sd4 = *sp++; 723 sd5 = *sp++; 724 CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1); 725 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 726 } 727 } 728 } 729 else if (((cmask == 4) && (soff % 3 == 1)) || 730 ((cmask == 2) && (soff % 3 == 0)) || 731 ((cmask == 1) && (soff % 3 == 2))) { 732 /* extract middle channel */ 733 734 if (off == 0) { /* src and dst have same alignment */ 735 736 /* generate edge mask for the start point */ 737 emask = vis_edge8(da, dend); 738 739 /* load 16 bytes */ 740 sd0 = *sp++; 741 sd1 = *sp++; 742 sd2 = *sp++; 743 744 /* extract, including some garbage at the start point */ 745 CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0); 746 747 /* store 8 bytes result */ 748 vis_pst_8(dd0, dp++, emask); 749 750 if ((mlib_addr) dp <= (mlib_addr) dend2) { 751 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 752 753 /* 8-pixel column loop, emask not needed */ 754 #pragma pipeloop(0) 755 for (i = 0; i < n; i++) { 756 sd0 = *sp++; 757 sd1 = *sp++; 758 sd2 = *sp++; 759 CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0); 760 *dp++ = dd0; 761 } 762 } 763 764 /* end point handling */ 765 if ((mlib_addr) dp <= (mlib_addr) dend) { 766 emask = vis_edge8(dp, dend); 767 sd0 = *sp++; 768 sd1 = *sp++; 769 sd2 = *sp++; 770 CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0); 771 vis_pst_8(dd0, dp++, emask); 772 } 773 } 774 else { 775 vis_alignaddr((void *)0, off); 776 777 /* generate edge mask for the start point */ 778 emask = vis_edge8(da, dend); 779 780 if (off < 0) { 781 /* load 24 bytes */ 782 sd3 = *sp++; 783 sd4 = *sp++; 784 sd5 = *sp++; 785 786 /* extract and store 8 bytes */ 787 CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1); 788 vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask); 789 } 790 else { 791 /* load 48 bytes */ 792 sd0 = *sp++; 793 sd1 = *sp++; 794 sd2 = *sp++; 795 sd3 = *sp++; 796 sd4 = *sp++; 797 sd5 = *sp++; 798 799 /* extract and store 8 bytes */ 800 CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0); 801 CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1); 802 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 803 } 804 805 if ((mlib_addr) dp <= (mlib_addr) dend2) { 806 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 807 808 /* 8-pixel column loop, emask not needed */ 809 #pragma pipeloop(0) 810 for (i = 0; i < n; i++) { 811 dd0 = dd1; 812 sd3 = *sp++; 813 sd4 = *sp++; 814 sd5 = *sp++; 815 CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1); 816 *dp++ = vis_faligndata(dd0, dd1); 817 } 818 } 819 820 /* end point handling */ 821 if ((mlib_addr) dp <= (mlib_addr) dend) { 822 emask = vis_edge8(dp, dend); 823 dd0 = dd1; 824 sd3 = *sp++; 825 sd4 = *sp++; 826 sd5 = *sp++; 827 CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1); 828 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 829 } 830 } 831 } 832 else { /* extract right channel */ 833 834 if (off == 0) { /* src and dst have same alignment */ 835 836 /* generate edge mask for the start point */ 837 emask = vis_edge8(da, dend); 838 839 /* load 16 bytes */ 840 sd0 = *sp++; 841 sd1 = *sp++; 842 sd2 = *sp++; 843 844 /* extract, including some garbage at the start point */ 845 CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0); 846 847 /* store 8 bytes result */ 848 vis_pst_8(dd0, dp++, emask); 849 850 if ((mlib_addr) dp <= (mlib_addr) dend2) { 851 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 852 853 /* 8-pixel column loop, emask not needed */ 854 #pragma pipeloop(0) 855 for (i = 0; i < n; i++) { 856 sd0 = *sp++; 857 sd1 = *sp++; 858 sd2 = *sp++; 859 CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0); 860 *dp++ = dd0; 861 } 862 } 863 864 /* end point handling */ 865 if ((mlib_addr) dp <= (mlib_addr) dend) { 866 emask = vis_edge8(dp, dend); 867 sd0 = *sp++; 868 sd1 = *sp++; 869 sd2 = *sp++; 870 CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0); 871 vis_pst_8(dd0, dp++, emask); 872 } 873 } 874 else { 875 vis_alignaddr((void *)0, off); 876 877 /* generate edge mask for the start point */ 878 emask = vis_edge8(da, dend); 879 880 if (off < 0) { 881 /* load 24 bytes */ 882 sd3 = *sp++; 883 sd4 = *sp++; 884 sd5 = *sp++; 885 886 /* extract and store 8 bytes */ 887 CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1); 888 vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask); 889 } 890 else { 891 /* load 48 bytes */ 892 sd0 = *sp++; 893 sd1 = *sp++; 894 sd2 = *sp++; 895 sd3 = *sp++; 896 sd4 = *sp++; 897 sd5 = *sp++; 898 899 /* extract and store 8 bytes */ 900 CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0); 901 CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1); 902 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 903 } 904 905 if ((mlib_addr) dp <= (mlib_addr) dend2) { 906 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 907 908 /* 8-pixel column loop, emask not needed */ 909 #pragma pipeloop(0) 910 for (i = 0; i < n; i++) { 911 dd0 = dd1; 912 sd3 = *sp++; 913 sd4 = *sp++; 914 sd5 = *sp++; 915 CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1); 916 *dp++ = vis_faligndata(dd0, dd1); 917 } 918 } 919 920 /* end point handling */ 921 if ((mlib_addr) dp <= (mlib_addr) dend) { 922 emask = vis_edge8(dp, dend); 923 dd0 = dd1; 924 sd3 = *sp++; 925 sd4 = *sp++; 926 sd5 = *sp++; 927 CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1); 928 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 929 } 930 } 931 } 932 } 933 934 /***************************************************************/ 935 void mlib_v_ImageChannelExtract_U8_31(const mlib_u8 *src, 936 mlib_s32 slb, 937 mlib_u8 *dst, 938 mlib_s32 dlb, 939 mlib_s32 xsize, 940 mlib_s32 ysize, 941 mlib_s32 cmask) 942 { 943 mlib_u8 *sa, *da; 944 mlib_u8 *sl, *dl; 945 mlib_s32 j; 946 947 sa = sl = (void *)src; 948 da = dl = dst; 949 950 for (j = 0; j < ysize; j++) { 951 mlib_v_ImageChannelExtract_U8_31_D1(sa, da, xsize, cmask); 952 sa = sl += slb; 953 da = dl += dlb; 954 } 955 } 956 957 /***************************************************************/ 958 #define CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd) \ 959 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \ 960 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \ 961 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \ 962 sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \ 963 sde = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdc)); \ 964 sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdd)); \ 965 dd = vis_fpmerge(vis_read_hi(sde), vis_read_hi(sdf)) 966 967 /***************************************************************/ 968 #define CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd) \ 969 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \ 970 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \ 971 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \ 972 sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \ 973 sde = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdc)); \ 974 sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdd)); \ 975 dd = vis_fpmerge(vis_read_lo(sde), vis_read_lo(sdf)) 976 977 /***************************************************************/ 978 #define CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd) \ 979 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \ 980 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \ 981 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \ 982 sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \ 983 sde = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdc)); \ 984 sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdd)); \ 985 dd = vis_fpmerge(vis_read_hi(sde), vis_read_hi(sdf)) 986 987 /***************************************************************/ 988 #define CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd) \ 989 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \ 990 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \ 991 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \ 992 sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \ 993 sde = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdc)); \ 994 sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdd)); \ 995 dd = vis_fpmerge(vis_read_lo(sde), vis_read_lo(sdf)) 996 997 /***************************************************************/ 998 void mlib_v_ImageChannelExtract_U8_41_A8D1X8(const mlib_u8 *src, 999 mlib_u8 *dst, 1000 mlib_s32 dsize, 1001 mlib_s32 cmask) 1002 { 1003 mlib_d64 *sp, *dp; 1004 mlib_d64 sd0, sd1, sd2, sd3; 1005 mlib_d64 sda, sdb, sdc, sdd, sde, sdf; 1006 mlib_d64 dd; 1007 mlib_s32 i; 1008 1009 sp = (mlib_d64 *) src; 1010 dp = (mlib_d64 *) dst; 1011 1012 if (cmask == 8) { 1013 #pragma pipeloop(0) 1014 for (i = 0; i < dsize / 8; i++) { 1015 sd0 = *sp++; 1016 sd1 = *sp++; 1017 sd2 = *sp++; 1018 sd3 = *sp++; 1019 CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd); 1020 *dp++ = dd; 1021 } 1022 } 1023 else if (cmask == 4) { 1024 #pragma pipeloop(0) 1025 for (i = 0; i < dsize / 8; i++) { 1026 sd0 = *sp++; 1027 sd1 = *sp++; 1028 sd2 = *sp++; 1029 sd3 = *sp++; 1030 CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd); 1031 *dp++ = dd; 1032 } 1033 } 1034 else if (cmask == 2) { 1035 #pragma pipeloop(0) 1036 for (i = 0; i < dsize / 8; i++) { 1037 sd0 = *sp++; 1038 sd1 = *sp++; 1039 sd2 = *sp++; 1040 sd3 = *sp++; 1041 CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd); 1042 *dp++ = dd; 1043 } 1044 } 1045 else { 1046 #pragma pipeloop(0) 1047 for (i = 0; i < dsize / 8; i++) { 1048 sd0 = *sp++; 1049 sd1 = *sp++; 1050 sd2 = *sp++; 1051 sd3 = *sp++; 1052 CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd); 1053 *dp++ = dd; 1054 } 1055 } 1056 } 1057 1058 /***************************************************************/ 1059 void mlib_v_ImageChannelExtract_U8_41_A8D2X8(const mlib_u8 *src, 1060 mlib_s32 slb, 1061 mlib_u8 *dst, 1062 mlib_s32 dlb, 1063 mlib_s32 xsize, 1064 mlib_s32 ysize, 1065 mlib_s32 cmask) 1066 { 1067 mlib_d64 *sp, *dp; 1068 mlib_d64 *sl, *dl; 1069 mlib_d64 sd0, sd1, sd2, sd3; 1070 mlib_d64 sda, sdb, sdc, sdd, sde, sdf; 1071 mlib_d64 dd; 1072 mlib_s32 i, j; 1073 1074 sp = sl = (mlib_d64 *) src; 1075 dp = dl = (mlib_d64 *) dst; 1076 1077 if (cmask == 8) { 1078 for (j = 0; j < ysize; j++) { 1079 #pragma pipeloop(0) 1080 for (i = 0; i < xsize / 8; i++) { 1081 sd0 = *sp++; 1082 sd1 = *sp++; 1083 sd2 = *sp++; 1084 sd3 = *sp++; 1085 CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd); 1086 *dp++ = dd; 1087 } 1088 1089 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 1090 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 1091 } 1092 } 1093 else if (cmask == 4) { 1094 for (j = 0; j < ysize; j++) { 1095 #pragma pipeloop(0) 1096 for (i = 0; i < xsize / 8; i++) { 1097 sd0 = *sp++; 1098 sd1 = *sp++; 1099 sd2 = *sp++; 1100 sd3 = *sp++; 1101 CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd); 1102 *dp++ = dd; 1103 } 1104 1105 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 1106 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 1107 } 1108 } 1109 else if (cmask == 2) { 1110 for (j = 0; j < ysize; j++) { 1111 #pragma pipeloop(0) 1112 for (i = 0; i < xsize / 8; i++) { 1113 sd0 = *sp++; 1114 sd1 = *sp++; 1115 sd2 = *sp++; 1116 sd3 = *sp++; 1117 CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd); 1118 *dp++ = dd; 1119 } 1120 1121 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 1122 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 1123 } 1124 } 1125 else { 1126 for (j = 0; j < ysize; j++) { 1127 #pragma pipeloop(0) 1128 for (i = 0; i < xsize / 8; i++) { 1129 sd0 = *sp++; 1130 sd1 = *sp++; 1131 sd2 = *sp++; 1132 sd3 = *sp++; 1133 CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd); 1134 *dp++ = dd; 1135 } 1136 1137 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 1138 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 1139 } 1140 } 1141 } 1142 1143 /***************************************************************/ 1144 void mlib_v_ImageChannelExtract_U8_41_D1(const mlib_u8 *src, 1145 mlib_u8 *dst, 1146 mlib_s32 dsize, 1147 mlib_s32 cmask) 1148 { 1149 mlib_u8 *sa, *da; 1150 mlib_u8 *dend, *dend2; /* end points in dst */ 1151 mlib_d64 *dp; /* 8-byte aligned start points in dst */ 1152 mlib_d64 *sp; /* 8-byte aligned start point in src */ 1153 mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */ 1154 mlib_d64 sd4, sd5, sd6, sd7; 1155 mlib_d64 sda, sdb, sdc, sdd; 1156 mlib_d64 sde, sdf; 1157 mlib_d64 dd0, dd1; 1158 mlib_s32 soff; /* offset of address in src */ 1159 mlib_s32 doff; /* offset of address in dst */ 1160 mlib_s32 off; /* offset of src over dst */ 1161 mlib_s32 emask; /* edge mask */ 1162 mlib_s32 i, n; 1163 1164 sa = (void *)src; 1165 da = dst; 1166 1167 /* prepare the source address */ 1168 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 1169 soff = ((mlib_addr) sa & 7); 1170 1171 /* prepare the destination addresses */ 1172 dp = (mlib_d64 *) ((mlib_addr) da & (~7)); 1173 doff = ((mlib_addr) da & 7); 1174 dend = da + dsize - 1; 1175 dend2 = dend - 7; 1176 1177 /* calculate the src's offset over dst */ 1178 if (cmask == 8) { 1179 off = soff / 4 - doff; 1180 } 1181 else if (cmask == 4) { 1182 off = (soff + 1) / 4 - doff; 1183 } 1184 else if (cmask == 2) { 1185 off = (soff + 2) / 4 - doff; 1186 } 1187 else { 1188 off = (soff + 3) / 4 - doff; 1189 } 1190 1191 if (((cmask == 8) && (soff % 4 == 0)) || 1192 ((cmask == 4) && (soff % 4 == 3)) || 1193 ((cmask == 2) && (soff % 4 == 2)) || 1194 ((cmask == 1) && (soff % 4 == 1))) { /* extract left channel */ 1195 1196 if (off == 0) { /* src and dst have same alignment */ 1197 1198 /* generate edge mask for the start point */ 1199 emask = vis_edge8(da, dend); 1200 1201 /* load 16 bytes */ 1202 sd0 = *sp++; 1203 sd1 = *sp++; 1204 sd2 = *sp++; 1205 sd3 = *sp++; 1206 1207 /* extract, including some garbage at the start point */ 1208 CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0); 1209 1210 /* store 8 bytes result */ 1211 vis_pst_8(dd0, dp++, emask); 1212 1213 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1214 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1215 1216 /* 8-pixel column loop, emask not needed */ 1217 #pragma pipeloop(0) 1218 for (i = 0; i < n; i++) { 1219 sd0 = *sp++; 1220 sd1 = *sp++; 1221 sd2 = *sp++; 1222 sd3 = *sp++; 1223 CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0); 1224 *dp++ = dd0; 1225 } 1226 } 1227 1228 /* end point handling */ 1229 if ((mlib_addr) dp <= (mlib_addr) dend) { 1230 emask = vis_edge8(dp, dend); 1231 sd0 = *sp++; 1232 sd1 = *sp++; 1233 sd2 = *sp++; 1234 sd3 = *sp++; 1235 CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0); 1236 vis_pst_8(dd0, dp++, emask); 1237 } 1238 } 1239 else { 1240 vis_alignaddr((void *)0, off); 1241 1242 /* generate edge mask for the start point */ 1243 emask = vis_edge8(da, dend); 1244 1245 if (off < 0) { 1246 /* load 24 bytes */ 1247 sd4 = *sp++; 1248 sd5 = *sp++; 1249 sd6 = *sp++; 1250 sd7 = *sp++; 1251 1252 /* extract and store 8 bytes */ 1253 CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1); 1254 vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask); 1255 } 1256 else { 1257 /* load 48 bytes */ 1258 sd0 = *sp++; 1259 sd1 = *sp++; 1260 sd2 = *sp++; 1261 sd3 = *sp++; 1262 sd4 = *sp++; 1263 sd5 = *sp++; 1264 sd6 = *sp++; 1265 sd7 = *sp++; 1266 1267 /* extract and store 8 bytes */ 1268 CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0); 1269 CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1); 1270 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 1271 } 1272 1273 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1274 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1275 1276 /* 8-pixel column loop, emask not needed */ 1277 #pragma pipeloop(0) 1278 for (i = 0; i < n; i++) { 1279 dd0 = dd1; 1280 sd4 = *sp++; 1281 sd5 = *sp++; 1282 sd6 = *sp++; 1283 sd7 = *sp++; 1284 CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1); 1285 *dp++ = vis_faligndata(dd0, dd1); 1286 } 1287 } 1288 1289 /* end point handling */ 1290 if ((mlib_addr) dp <= (mlib_addr) dend) { 1291 emask = vis_edge8(dp, dend); 1292 dd0 = dd1; 1293 sd4 = *sp++; 1294 sd5 = *sp++; 1295 sd6 = *sp++; 1296 sd7 = *sp++; 1297 CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1); 1298 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 1299 } 1300 } 1301 } 1302 else if (((cmask == 8) && (soff % 4 == 1)) || 1303 ((cmask == 4) && (soff % 4 == 0)) || 1304 ((cmask == 2) && (soff % 4 == 3)) || 1305 ((cmask == 1) && (soff % 4 == 2))) { 1306 /* extract middle left channel */ 1307 1308 if (off == 0) { /* src and dst have same alignment */ 1309 1310 /* generate edge mask for the start point */ 1311 emask = vis_edge8(da, dend); 1312 1313 /* load 16 bytes */ 1314 sd0 = *sp++; 1315 sd1 = *sp++; 1316 sd2 = *sp++; 1317 sd3 = *sp++; 1318 1319 /* extract, including some garbage at the start point */ 1320 CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0); 1321 1322 /* store 8 bytes result */ 1323 vis_pst_8(dd0, dp++, emask); 1324 1325 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1326 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1327 1328 /* 8-pixel column loop, emask not needed */ 1329 #pragma pipeloop(0) 1330 for (i = 0; i < n; i++) { 1331 sd0 = *sp++; 1332 sd1 = *sp++; 1333 sd2 = *sp++; 1334 sd3 = *sp++; 1335 CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0); 1336 *dp++ = dd0; 1337 } 1338 } 1339 1340 /* end point handling */ 1341 if ((mlib_addr) dp <= (mlib_addr) dend) { 1342 emask = vis_edge8(dp, dend); 1343 sd0 = *sp++; 1344 sd1 = *sp++; 1345 sd2 = *sp++; 1346 sd3 = *sp++; 1347 CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0); 1348 vis_pst_8(dd0, dp++, emask); 1349 } 1350 } 1351 else { 1352 vis_alignaddr((void *)0, off); 1353 1354 /* generate edge mask for the start point */ 1355 emask = vis_edge8(da, dend); 1356 1357 if (off < 0) { 1358 /* load 24 bytes */ 1359 sd4 = *sp++; 1360 sd5 = *sp++; 1361 sd6 = *sp++; 1362 sd7 = *sp++; 1363 1364 /* extract and store 8 bytes */ 1365 CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1); 1366 vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask); 1367 } 1368 else { 1369 /* load 48 bytes */ 1370 sd0 = *sp++; 1371 sd1 = *sp++; 1372 sd2 = *sp++; 1373 sd3 = *sp++; 1374 sd4 = *sp++; 1375 sd5 = *sp++; 1376 sd6 = *sp++; 1377 sd7 = *sp++; 1378 1379 /* extract and store 8 bytes */ 1380 CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0); 1381 CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1); 1382 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 1383 } 1384 1385 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1386 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1387 1388 /* 8-pixel column loop, emask not needed */ 1389 #pragma pipeloop(0) 1390 for (i = 0; i < n; i++) { 1391 dd0 = dd1; 1392 sd4 = *sp++; 1393 sd5 = *sp++; 1394 sd6 = *sp++; 1395 sd7 = *sp++; 1396 CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1); 1397 *dp++ = vis_faligndata(dd0, dd1); 1398 } 1399 } 1400 1401 /* end point handling */ 1402 if ((mlib_addr) dp <= (mlib_addr) dend) { 1403 emask = vis_edge8(dp, dend); 1404 dd0 = dd1; 1405 sd4 = *sp++; 1406 sd5 = *sp++; 1407 sd6 = *sp++; 1408 sd7 = *sp++; 1409 CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1); 1410 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 1411 } 1412 } 1413 } 1414 else if (((cmask == 8) && (soff % 4 == 2)) || 1415 ((cmask == 4) && (soff % 4 == 1)) || 1416 ((cmask == 2) && (soff % 4 == 0)) || 1417 ((cmask == 1) && (soff % 4 == 3))) { /* extract middle right channel */ 1418 1419 if (off == 0) { /* src and dst have same alignment */ 1420 1421 /* generate edge mask for the start point */ 1422 emask = vis_edge8(da, dend); 1423 1424 /* load 16 bytes */ 1425 sd0 = *sp++; 1426 sd1 = *sp++; 1427 sd2 = *sp++; 1428 sd3 = *sp++; 1429 1430 /* extract, including some garbage at the start point */ 1431 CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0); 1432 1433 /* store 8 bytes result */ 1434 vis_pst_8(dd0, dp++, emask); 1435 1436 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1437 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1438 1439 /* 8-pixel column loop, emask not needed */ 1440 #pragma pipeloop(0) 1441 for (i = 0; i < n; i++) { 1442 sd0 = *sp++; 1443 sd1 = *sp++; 1444 sd2 = *sp++; 1445 sd3 = *sp++; 1446 CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0); 1447 *dp++ = dd0; 1448 } 1449 } 1450 1451 /* end point handling */ 1452 if ((mlib_addr) dp <= (mlib_addr) dend) { 1453 emask = vis_edge8(dp, dend); 1454 sd0 = *sp++; 1455 sd1 = *sp++; 1456 sd2 = *sp++; 1457 sd3 = *sp++; 1458 CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0); 1459 vis_pst_8(dd0, dp++, emask); 1460 } 1461 } 1462 else { 1463 vis_alignaddr((void *)0, off); 1464 1465 /* generate edge mask for the start point */ 1466 emask = vis_edge8(da, dend); 1467 1468 if (off < 0) { 1469 /* load 24 bytes */ 1470 sd4 = *sp++; 1471 sd5 = *sp++; 1472 sd6 = *sp++; 1473 sd7 = *sp++; 1474 1475 /* extract and store 8 bytes */ 1476 CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1); 1477 vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask); 1478 } 1479 else { 1480 /* load 48 bytes */ 1481 sd0 = *sp++; 1482 sd1 = *sp++; 1483 sd2 = *sp++; 1484 sd3 = *sp++; 1485 sd4 = *sp++; 1486 sd5 = *sp++; 1487 sd6 = *sp++; 1488 sd7 = *sp++; 1489 1490 /* extract and store 8 bytes */ 1491 CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0); 1492 CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1); 1493 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 1494 } 1495 1496 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1497 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1498 1499 /* 8-pixel column loop, emask not needed */ 1500 #pragma pipeloop(0) 1501 for (i = 0; i < n; i++) { 1502 dd0 = dd1; 1503 sd4 = *sp++; 1504 sd5 = *sp++; 1505 sd6 = *sp++; 1506 sd7 = *sp++; 1507 CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1); 1508 *dp++ = vis_faligndata(dd0, dd1); 1509 } 1510 } 1511 1512 /* end point handling */ 1513 if ((mlib_addr) dp <= (mlib_addr) dend) { 1514 emask = vis_edge8(dp, dend); 1515 dd0 = dd1; 1516 sd4 = *sp++; 1517 sd5 = *sp++; 1518 sd6 = *sp++; 1519 sd7 = *sp++; 1520 CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1); 1521 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 1522 } 1523 } 1524 } 1525 else { /* extract right channel */ 1526 if (off == 0) { /* src and dst have same alignment */ 1527 1528 /* generate edge mask for the start point */ 1529 emask = vis_edge8(da, dend); 1530 1531 /* load 16 bytes */ 1532 sd0 = *sp++; 1533 sd1 = *sp++; 1534 sd2 = *sp++; 1535 sd3 = *sp++; 1536 1537 /* extract, including some garbage at the start point */ 1538 CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0); 1539 1540 /* store 8 bytes result */ 1541 vis_pst_8(dd0, dp++, emask); 1542 1543 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1544 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1545 1546 /* 8-pixel column loop, emask not needed */ 1547 #pragma pipeloop(0) 1548 for (i = 0; i < n; i++) { 1549 sd0 = *sp++; 1550 sd1 = *sp++; 1551 sd2 = *sp++; 1552 sd3 = *sp++; 1553 CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0); 1554 *dp++ = dd0; 1555 } 1556 } 1557 1558 /* end point handling */ 1559 if ((mlib_addr) dp <= (mlib_addr) dend) { 1560 emask = vis_edge8(dp, dend); 1561 sd0 = *sp++; 1562 sd1 = *sp++; 1563 sd2 = *sp++; 1564 sd3 = *sp++; 1565 CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0); 1566 vis_pst_8(dd0, dp++, emask); 1567 } 1568 } 1569 else { 1570 vis_alignaddr((void *)0, off); 1571 1572 /* generate edge mask for the start point */ 1573 emask = vis_edge8(da, dend); 1574 1575 if (off < 0) { 1576 /* load 24 bytes */ 1577 sd4 = *sp++; 1578 sd5 = *sp++; 1579 sd6 = *sp++; 1580 sd7 = *sp++; 1581 1582 /* extract and store 8 bytes */ 1583 CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1); 1584 vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask); 1585 } 1586 else { 1587 /* load 48 bytes */ 1588 sd0 = *sp++; 1589 sd1 = *sp++; 1590 sd2 = *sp++; 1591 sd3 = *sp++; 1592 sd4 = *sp++; 1593 sd5 = *sp++; 1594 sd6 = *sp++; 1595 sd7 = *sp++; 1596 1597 /* extract and store 8 bytes */ 1598 CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0); 1599 CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1); 1600 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 1601 } 1602 1603 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1604 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1605 1606 /* 8-pixel column loop, emask not needed */ 1607 #pragma pipeloop(0) 1608 for (i = 0; i < n; i++) { 1609 dd0 = dd1; 1610 sd4 = *sp++; 1611 sd5 = *sp++; 1612 sd6 = *sp++; 1613 sd7 = *sp++; 1614 CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1); 1615 *dp++ = vis_faligndata(dd0, dd1); 1616 } 1617 } 1618 1619 /* end point handling */ 1620 if ((mlib_addr) dp <= (mlib_addr) dend) { 1621 emask = vis_edge8(dp, dend); 1622 dd0 = dd1; 1623 sd4 = *sp++; 1624 sd5 = *sp++; 1625 sd6 = *sp++; 1626 sd7 = *sp++; 1627 CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1); 1628 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask); 1629 } 1630 } 1631 } 1632 } 1633 1634 /***************************************************************/ 1635 void mlib_v_ImageChannelExtract_U8_41(const mlib_u8 *src, 1636 mlib_s32 slb, 1637 mlib_u8 *dst, 1638 mlib_s32 dlb, 1639 mlib_s32 xsize, 1640 mlib_s32 ysize, 1641 mlib_s32 cmask) 1642 { 1643 mlib_u8 *sa, *da; 1644 mlib_u8 *sl, *dl; 1645 mlib_s32 j; 1646 1647 sa = sl = (void *)src; 1648 da = dl = dst; 1649 1650 for (j = 0; j < ysize; j++) { 1651 mlib_v_ImageChannelExtract_U8_41_D1(sa, da, xsize, cmask); 1652 sa = sl += slb; 1653 da = dl += dlb; 1654 } 1655 } 1656 1657 /***************************************************************/ 1658 #define CHANNELEXTRACT_S16_21L(sd0, sd1, dd) \ 1659 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1)); \ 1660 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1)); \ 1661 sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb)); \ 1662 dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc)) 1663 1664 /***************************************************************/ 1665 #define CHANNELEXTRACT_S16_21R(sd0, sd1, dd) \ 1666 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1)); \ 1667 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1)); \ 1668 sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb)); \ 1669 dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc)) 1670 1671 /***************************************************************/ 1672 /* extract one channel from a 2-channel image. 1673 * both source and destination image data are 8-byte aligned. 1674 * dsize is multiple of 4. 1675 */ 1676 1677 void mlib_v_ImageChannelExtract_S16_21_A8D1X4(const mlib_s16 *src, 1678 mlib_s16 *dst, 1679 mlib_s32 dsize, 1680 mlib_s32 cmask) 1681 { 1682 mlib_d64 *sp, *dp; 1683 mlib_d64 sd0, sd1; 1684 mlib_d64 sda, sdb, sdc; 1685 mlib_d64 dd; 1686 mlib_s32 i; 1687 1688 sp = (mlib_d64 *) src; 1689 dp = (mlib_d64 *) dst; 1690 1691 if (cmask == 2) { 1692 #pragma pipeloop(0) 1693 for (i = 0; i < dsize / 4; i++) { 1694 sd0 = *sp++; 1695 sd1 = *sp++; 1696 CHANNELEXTRACT_S16_21L(sd0, sd1, dd); 1697 *dp++ = dd; 1698 } 1699 } 1700 else { 1701 #pragma pipeloop(0) 1702 for (i = 0; i < dsize / 4; i++) { 1703 sd0 = *sp++; 1704 sd1 = *sp++; 1705 CHANNELEXTRACT_S16_21R(sd0, sd1, dd); 1706 *dp++ = dd; 1707 } 1708 } 1709 } 1710 1711 /***************************************************************/ 1712 void mlib_v_ImageChannelExtract_S16_21_A8D2X4(const mlib_s16 *src, 1713 mlib_s32 slb, 1714 mlib_s16 *dst, 1715 mlib_s32 dlb, 1716 mlib_s32 xsize, 1717 mlib_s32 ysize, 1718 mlib_s32 cmask) 1719 { 1720 mlib_d64 *sp, *dp; 1721 mlib_d64 *sl, *dl; 1722 mlib_d64 sd0, sd1; 1723 mlib_d64 sda, sdb, sdc; 1724 mlib_d64 dd; 1725 mlib_s32 i, j; 1726 1727 sp = sl = (mlib_d64 *) src; 1728 dp = dl = (mlib_d64 *) dst; 1729 1730 if (cmask == 2) { 1731 for (j = 0; j < ysize; j++) { 1732 #pragma pipeloop(0) 1733 for (i = 0; i < xsize / 4; i++) { 1734 sd0 = *sp++; 1735 sd1 = *sp++; 1736 CHANNELEXTRACT_S16_21L(sd0, sd1, dd); 1737 *dp++ = dd; 1738 } 1739 1740 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 1741 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 1742 } 1743 } 1744 else { 1745 for (j = 0; j < ysize; j++) { 1746 #pragma pipeloop(0) 1747 for (i = 0; i < xsize / 4; i++) { 1748 sd0 = *sp++; 1749 sd1 = *sp++; 1750 CHANNELEXTRACT_S16_21R(sd0, sd1, dd); 1751 *dp++ = dd; 1752 } 1753 1754 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 1755 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 1756 } 1757 } 1758 } 1759 1760 /***************************************************************/ 1761 void mlib_v_ImageChannelExtract_S16_21_D1(const mlib_s16 *src, 1762 mlib_s16 *dst, 1763 mlib_s32 dsize, 1764 mlib_s32 cmask) 1765 { 1766 mlib_s16 *sa, *da; 1767 mlib_s16 *dend, *dend2; /* end points in dst */ 1768 mlib_d64 *dp; /* 8-byte aligned start points in dst */ 1769 mlib_d64 *sp; /* 8-byte aligned start point in src */ 1770 mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */ 1771 mlib_d64 sda, sdb, sdc; 1772 mlib_d64 dd0, dd1; 1773 mlib_s32 soff; /* offset of address in src */ 1774 mlib_s32 doff; /* offset of address in dst */ 1775 mlib_s32 off; /* offset of dst over src */ 1776 mlib_s32 emask; /* edge mask */ 1777 mlib_s32 i, n; 1778 1779 sa = (void *)src; 1780 da = dst; 1781 1782 /* prepare the source address */ 1783 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 1784 soff = ((mlib_addr) sa & 7); 1785 1786 /* prepare the destination addresses */ 1787 dp = (mlib_d64 *) ((mlib_addr) da & (~7)); 1788 doff = ((mlib_addr) da & 7); 1789 dend = da + dsize - 1; 1790 dend2 = dend - 3; 1791 1792 /* calculate the src's offset over dst */ 1793 if (cmask == 2) { 1794 off = (soff / 4) * 2 - doff; 1795 } 1796 else { 1797 off = ((soff + 3) / 4) * 2 - doff; 1798 } 1799 1800 if (((cmask == 2) && (soff % 4 == 0)) || ((cmask == 1) && (soff % 4 != 0))) { /* extract even words */ 1801 1802 if (off == 0) { /* src and dst have same alignment */ 1803 1804 /* generate edge mask for the start point */ 1805 emask = vis_edge16(da, dend); 1806 1807 /* load 16 bytes */ 1808 sd0 = *sp++; 1809 sd1 = *sp++; 1810 1811 /* extract, including some garbage at the start point */ 1812 CHANNELEXTRACT_S16_21L(sd0, sd1, dd0); 1813 1814 /* store 8 bytes result */ 1815 vis_pst_16(dd0, dp++, emask); 1816 1817 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1818 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1819 1820 /* 8-pixel column loop, emask not needed */ 1821 #pragma pipeloop(0) 1822 for (i = 0; i < n; i++) { 1823 sd0 = *sp++; 1824 sd1 = *sp++; 1825 CHANNELEXTRACT_S16_21L(sd0, sd1, dd0); 1826 *dp++ = dd0; 1827 } 1828 } 1829 1830 /* end point handling */ 1831 if ((mlib_addr) dp <= (mlib_addr) dend) { 1832 emask = vis_edge16(dp, dend); 1833 sd0 = *sp++; 1834 sd1 = *sp++; 1835 CHANNELEXTRACT_S16_21L(sd0, sd1, dd0); 1836 vis_pst_16(dd0, dp++, emask); 1837 } 1838 } 1839 else { 1840 vis_alignaddr((void *)0, off); 1841 1842 /* generate edge mask for the start point */ 1843 emask = vis_edge16(da, dend); 1844 1845 if (off < 0) { 1846 /* load 16 bytes */ 1847 sd2 = *sp++; 1848 sd3 = *sp++; 1849 1850 /* extract and store 8 bytes */ 1851 CHANNELEXTRACT_S16_21L(sd2, sd3, dd1); 1852 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask); 1853 } 1854 else { 1855 /* load 32 bytes */ 1856 sd0 = *sp++; 1857 sd1 = *sp++; 1858 sd2 = *sp++; 1859 sd3 = *sp++; 1860 1861 /* extract and store 8 bytes */ 1862 CHANNELEXTRACT_S16_21L(sd0, sd1, dd0); 1863 CHANNELEXTRACT_S16_21L(sd2, sd3, dd1); 1864 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 1865 } 1866 1867 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1868 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1869 1870 /* 8-pixel column loop, emask not needed */ 1871 #pragma pipeloop(0) 1872 for (i = 0; i < n; i++) { 1873 dd0 = dd1; 1874 sd2 = *sp++; 1875 sd3 = *sp++; 1876 CHANNELEXTRACT_S16_21L(sd2, sd3, dd1); 1877 *dp++ = vis_faligndata(dd0, dd1); 1878 } 1879 } 1880 1881 /* end point handling */ 1882 if ((mlib_addr) dp <= (mlib_addr) dend) { 1883 emask = vis_edge16(dp, dend); 1884 dd0 = dd1; 1885 sd2 = *sp++; 1886 sd3 = *sp++; 1887 CHANNELEXTRACT_S16_21L(sd2, sd3, dd1); 1888 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 1889 } 1890 } 1891 } 1892 else { /* extract odd words */ 1893 1894 if (off == 0) { /* src and dst have same alignment */ 1895 1896 /* generate edge mask for the start point */ 1897 emask = vis_edge16(da, dend); 1898 1899 /* load 16 bytes, don't care the garbage at the start point */ 1900 sd0 = *sp++; 1901 sd1 = *sp++; 1902 1903 /* extract and store 8 bytes */ 1904 CHANNELEXTRACT_S16_21R(sd0, sd1, dd0); 1905 vis_pst_16(dd0, dp++, emask); 1906 1907 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1908 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1909 1910 /* 8-pixel column loop, emask not needed */ 1911 #pragma pipeloop(0) 1912 for (i = 0; i < n; i++) { 1913 sd0 = *sp++; 1914 sd1 = *sp++; 1915 CHANNELEXTRACT_S16_21R(sd0, sd1, dd0); 1916 *dp++ = dd0; 1917 } 1918 } 1919 1920 /* end point handling */ 1921 if ((mlib_addr) dp <= (mlib_addr) dend) { 1922 emask = vis_edge16(dp, dend); 1923 sd0 = *sp++; 1924 sd1 = *sp++; 1925 CHANNELEXTRACT_S16_21R(sd0, sd1, dd0); 1926 vis_pst_16(dd0, dp++, emask); 1927 } 1928 } 1929 else { 1930 vis_alignaddr((void *)0, off); 1931 1932 /* generate edge mask for the start point */ 1933 emask = vis_edge16(da, dend); 1934 1935 if (off < 0) { 1936 /* load 16 bytes */ 1937 sd2 = *sp++; 1938 sd3 = *sp++; 1939 1940 /* extract and store 8 bytes */ 1941 CHANNELEXTRACT_S16_21R(sd2, sd3, dd1); 1942 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask); 1943 } 1944 else { 1945 /* load 32 bytes */ 1946 sd0 = *sp++; 1947 sd1 = *sp++; 1948 sd2 = *sp++; 1949 sd3 = *sp++; 1950 1951 /* extract and store 8 bytes */ 1952 CHANNELEXTRACT_S16_21R(sd0, sd1, dd0); 1953 CHANNELEXTRACT_S16_21R(sd2, sd3, dd1); 1954 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 1955 } 1956 1957 if ((mlib_addr) dp <= (mlib_addr) dend2) { 1958 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 1959 1960 /* 8-pixel column loop, emask not needed */ 1961 #pragma pipeloop(0) 1962 for (i = 0; i < n; i++) { 1963 dd0 = dd1; 1964 sd2 = *sp++; 1965 sd3 = *sp++; 1966 CHANNELEXTRACT_S16_21R(sd2, sd3, dd1); 1967 *dp++ = vis_faligndata(dd0, dd1); 1968 } 1969 } 1970 1971 /* end point handling */ 1972 if ((mlib_addr) dp <= (mlib_addr) dend) { 1973 emask = vis_edge16(dp, dend); 1974 dd0 = dd1; 1975 sd2 = *sp++; 1976 sd3 = *sp++; 1977 CHANNELEXTRACT_S16_21R(sd2, sd3, dd1); 1978 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 1979 } 1980 } 1981 } 1982 } 1983 1984 /***************************************************************/ 1985 void mlib_v_ImageChannelExtract_S16_21(const mlib_s16 *src, 1986 mlib_s32 slb, 1987 mlib_s16 *dst, 1988 mlib_s32 dlb, 1989 mlib_s32 xsize, 1990 mlib_s32 ysize, 1991 mlib_s32 cmask) 1992 { 1993 mlib_s16 *sa, *da; 1994 mlib_s16 *sl, *dl; 1995 mlib_s32 j; 1996 1997 sa = sl = (void *)src; 1998 da = dl = dst; 1999 2000 for (j = 0; j < ysize; j++) { 2001 mlib_v_ImageChannelExtract_S16_21_D1(sa, da, xsize, cmask); 2002 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb); 2003 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb); 2004 } 2005 } 2006 2007 /***************************************************************/ 2008 #define CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd) \ 2009 /* extract the left channel */ \ 2010 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \ 2011 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \ 2012 sdc = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \ 2013 dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc)) 2014 2015 /***************************************************************/ 2016 #define CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd) \ 2017 /* extract the middle channel */ \ 2018 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \ 2019 sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \ 2020 sdc = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdb)); \ 2021 dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc)) 2022 2023 /***************************************************************/ 2024 #define CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd) \ 2025 /* extract the right channel */ \ 2026 sda = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \ 2027 sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \ 2028 sdc = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \ 2029 dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc)) 2030 2031 /***************************************************************/ 2032 void mlib_v_ImageChannelExtract_S16_31_A8D1X4(const mlib_s16 *src, 2033 mlib_s16 *dst, 2034 mlib_s32 dsize, 2035 mlib_s32 cmask) 2036 { 2037 mlib_d64 *sp, *dp; 2038 mlib_d64 sd0, sd1, sd2; 2039 mlib_d64 sda, sdb, sdc; 2040 mlib_d64 dd; 2041 mlib_s32 i; 2042 2043 sp = (mlib_d64 *) src; 2044 dp = (mlib_d64 *) dst; 2045 2046 if (cmask == 4) { 2047 #pragma pipeloop(0) 2048 for (i = 0; i < dsize / 4; i++) { 2049 sd0 = *sp++; 2050 sd1 = *sp++; 2051 sd2 = *sp++; 2052 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd); 2053 *dp++ = dd; 2054 } 2055 } 2056 else if (cmask == 2) { 2057 #pragma pipeloop(0) 2058 for (i = 0; i < dsize / 4; i++) { 2059 sd0 = *sp++; 2060 sd1 = *sp++; 2061 sd2 = *sp++; 2062 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd); 2063 *dp++ = dd; 2064 } 2065 } 2066 else { 2067 #pragma pipeloop(0) 2068 for (i = 0; i < dsize / 4; i++) { 2069 sd0 = *sp++; 2070 sd1 = *sp++; 2071 sd2 = *sp++; 2072 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd); 2073 *dp++ = dd; 2074 } 2075 } 2076 } 2077 2078 /***************************************************************/ 2079 void mlib_v_ImageChannelExtract_S16_31_A8D2X4(const mlib_s16 *src, 2080 mlib_s32 slb, 2081 mlib_s16 *dst, 2082 mlib_s32 dlb, 2083 mlib_s32 xsize, 2084 mlib_s32 ysize, 2085 mlib_s32 cmask) 2086 { 2087 mlib_d64 *sp, *dp; 2088 mlib_d64 *sl, *dl; 2089 mlib_d64 sd0, sd1, sd2; 2090 mlib_d64 sda, sdb, sdc; 2091 mlib_d64 dd; 2092 mlib_s32 i, j; 2093 2094 sp = sl = (mlib_d64 *) src; 2095 dp = dl = (mlib_d64 *) dst; 2096 2097 if (cmask == 4) { 2098 for (j = 0; j < ysize; j++) { 2099 #pragma pipeloop(0) 2100 for (i = 0; i < xsize / 4; i++) { 2101 sd0 = *sp++; 2102 sd1 = *sp++; 2103 sd2 = *sp++; 2104 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd); 2105 *dp++ = dd; 2106 } 2107 2108 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 2109 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 2110 } 2111 } 2112 else if (cmask == 2) { 2113 for (j = 0; j < ysize; j++) { 2114 #pragma pipeloop(0) 2115 for (i = 0; i < xsize / 4; i++) { 2116 sd0 = *sp++; 2117 sd1 = *sp++; 2118 sd2 = *sp++; 2119 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd); 2120 *dp++ = dd; 2121 } 2122 2123 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 2124 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 2125 } 2126 } 2127 else { 2128 for (j = 0; j < ysize; j++) { 2129 #pragma pipeloop(0) 2130 for (i = 0; i < xsize / 4; i++) { 2131 sd0 = *sp++; 2132 sd1 = *sp++; 2133 sd2 = *sp++; 2134 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd); 2135 *dp++ = dd; 2136 } 2137 2138 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 2139 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 2140 } 2141 } 2142 } 2143 2144 /***************************************************************/ 2145 void mlib_v_ImageChannelExtract_S16_31_D1(const mlib_s16 *src, 2146 mlib_s16 *dst, 2147 mlib_s32 dsize, 2148 mlib_s32 cmask) 2149 { 2150 mlib_s16 *sa, *da; 2151 mlib_s16 *dend, *dend2; /* end points in dst */ 2152 mlib_d64 *dp; /* 8-byte aligned start points in dst */ 2153 mlib_d64 *sp; /* 8-byte aligned start point in src */ 2154 mlib_d64 sd0, sd1, sd2; /* 8-byte source data */ 2155 mlib_d64 sd3, sd4, sd5; 2156 mlib_d64 sda, sdb, sdc; 2157 mlib_d64 dd0, dd1; 2158 mlib_s32 soff; /* offset of address in src */ 2159 mlib_s32 doff; /* offset of address in dst */ 2160 mlib_s32 off; /* offset of src over dst */ 2161 mlib_s32 emask; /* edge mask */ 2162 mlib_s32 i, n; 2163 2164 sa = (void *)src; 2165 da = dst; 2166 2167 /* prepare the source address */ 2168 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 2169 soff = ((mlib_addr) sa & 7); 2170 2171 /* prepare the destination addresses */ 2172 dp = (mlib_d64 *) ((mlib_addr) da & (~7)); 2173 doff = ((mlib_addr) da & 7); 2174 dend = da + dsize - 1; 2175 dend2 = dend - 3; 2176 2177 /* calculate the src's offset over dst */ 2178 if (cmask == 4) { 2179 off = (soff / 6) * 2 - doff; 2180 } 2181 else if (cmask == 2) { 2182 off = ((soff + 2) / 6) * 2 - doff; 2183 } 2184 else { 2185 off = ((soff + 4) / 6) * 2 - doff; 2186 } 2187 2188 if (((cmask == 4) && (soff % 6 == 0)) || 2189 ((cmask == 2) && (soff % 6 == 4)) || 2190 ((cmask == 1) && (soff % 6 == 2))) { /* extract left channel */ 2191 2192 if (off == 0) { /* src and dst have same alignment */ 2193 2194 /* generate edge mask for the start point */ 2195 emask = vis_edge16(da, dend); 2196 2197 /* load 16 bytes */ 2198 sd0 = *sp++; 2199 sd1 = *sp++; 2200 sd2 = *sp++; 2201 2202 /* extract, including some garbage at the start point */ 2203 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0); 2204 2205 /* store 8 bytes result */ 2206 vis_pst_16(dd0, dp++, emask); 2207 2208 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2209 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2210 2211 /* 8-pixel column loop, emask not needed */ 2212 #pragma pipeloop(0) 2213 for (i = 0; i < n; i++) { 2214 sd0 = *sp++; 2215 sd1 = *sp++; 2216 sd2 = *sp++; 2217 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0); 2218 *dp++ = dd0; 2219 } 2220 } 2221 2222 /* end point handling */ 2223 if ((mlib_addr) dp <= (mlib_addr) dend) { 2224 emask = vis_edge16(dp, dend); 2225 sd0 = *sp++; 2226 sd1 = *sp++; 2227 sd2 = *sp++; 2228 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0); 2229 vis_pst_16(dd0, dp++, emask); 2230 } 2231 } 2232 else { 2233 vis_alignaddr((void *)0, off); 2234 2235 /* generate edge mask for the start point */ 2236 emask = vis_edge16(da, dend); 2237 2238 if (off < 0) { 2239 /* load 24 bytes */ 2240 sd3 = *sp++; 2241 sd4 = *sp++; 2242 sd5 = *sp++; 2243 2244 /* extract and store 8 bytes */ 2245 CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1); 2246 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask); 2247 } 2248 else { 2249 /* load 48 bytes */ 2250 sd0 = *sp++; 2251 sd1 = *sp++; 2252 sd2 = *sp++; 2253 sd3 = *sp++; 2254 sd4 = *sp++; 2255 sd5 = *sp++; 2256 2257 /* extract and store 8 bytes */ 2258 CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0); 2259 CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1); 2260 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 2261 } 2262 2263 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2264 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2265 2266 /* 8-pixel column loop, emask not needed */ 2267 #pragma pipeloop(0) 2268 for (i = 0; i < n; i++) { 2269 dd0 = dd1; 2270 sd3 = *sp++; 2271 sd4 = *sp++; 2272 sd5 = *sp++; 2273 CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1); 2274 *dp++ = vis_faligndata(dd0, dd1); 2275 } 2276 } 2277 2278 /* end point handling */ 2279 if ((mlib_addr) dp <= (mlib_addr) dend) { 2280 emask = vis_edge16(dp, dend); 2281 dd0 = dd1; 2282 sd3 = *sp++; 2283 sd4 = *sp++; 2284 sd5 = *sp++; 2285 CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1); 2286 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 2287 } 2288 } 2289 } 2290 else if (((cmask == 4) && (soff % 6 == 2)) || 2291 ((cmask == 2) && (soff % 6 == 0)) || 2292 ((cmask == 1) && (soff % 6 == 4))) { 2293 /* extract middle channel */ 2294 2295 if (off == 0) { /* src and dst have same alignment */ 2296 2297 /* generate edge mask for the start point */ 2298 emask = vis_edge16(da, dend); 2299 2300 /* load 16 bytes */ 2301 sd0 = *sp++; 2302 sd1 = *sp++; 2303 sd2 = *sp++; 2304 2305 /* extract, including some garbage at the start point */ 2306 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0); 2307 2308 /* store 8 bytes result */ 2309 vis_pst_16(dd0, dp++, emask); 2310 2311 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2312 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2313 2314 /* 8-pixel column loop, emask not needed */ 2315 #pragma pipeloop(0) 2316 for (i = 0; i < n; i++) { 2317 sd0 = *sp++; 2318 sd1 = *sp++; 2319 sd2 = *sp++; 2320 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0); 2321 *dp++ = dd0; 2322 } 2323 } 2324 2325 /* end point handling */ 2326 if ((mlib_addr) dp <= (mlib_addr) dend) { 2327 emask = vis_edge16(dp, dend); 2328 sd0 = *sp++; 2329 sd1 = *sp++; 2330 sd2 = *sp++; 2331 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0); 2332 vis_pst_16(dd0, dp++, emask); 2333 } 2334 } 2335 else { 2336 vis_alignaddr((void *)0, off); 2337 2338 /* generate edge mask for the start point */ 2339 emask = vis_edge16(da, dend); 2340 2341 if (off < 0) { 2342 /* load 24 bytes */ 2343 sd3 = *sp++; 2344 sd4 = *sp++; 2345 sd5 = *sp++; 2346 2347 /* extract and store 8 bytes */ 2348 CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1); 2349 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask); 2350 } 2351 else { 2352 /* load 48 bytes */ 2353 sd0 = *sp++; 2354 sd1 = *sp++; 2355 sd2 = *sp++; 2356 sd3 = *sp++; 2357 sd4 = *sp++; 2358 sd5 = *sp++; 2359 2360 /* extract and store 8 bytes */ 2361 CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0); 2362 CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1); 2363 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 2364 } 2365 2366 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2367 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2368 2369 /* 8-pixel column loop, emask not needed */ 2370 #pragma pipeloop(0) 2371 for (i = 0; i < n; i++) { 2372 dd0 = dd1; 2373 sd3 = *sp++; 2374 sd4 = *sp++; 2375 sd5 = *sp++; 2376 CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1); 2377 *dp++ = vis_faligndata(dd0, dd1); 2378 } 2379 } 2380 2381 /* end point handling */ 2382 if ((mlib_addr) dp <= (mlib_addr) dend) { 2383 emask = vis_edge16(dp, dend); 2384 dd0 = dd1; 2385 sd3 = *sp++; 2386 sd4 = *sp++; 2387 sd5 = *sp++; 2388 CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1); 2389 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 2390 } 2391 } 2392 } 2393 else { /* extract right channel */ 2394 2395 if (off == 0) { /* src and dst have same alignment */ 2396 2397 /* generate edge mask for the start point */ 2398 emask = vis_edge16(da, dend); 2399 2400 /* load 16 bytes */ 2401 sd0 = *sp++; 2402 sd1 = *sp++; 2403 sd2 = *sp++; 2404 2405 /* extract, including some garbage at the start point */ 2406 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0); 2407 2408 /* store 8 bytes result */ 2409 vis_pst_16(dd0, dp++, emask); 2410 2411 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2412 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2413 2414 /* 8-pixel column loop, emask not needed */ 2415 #pragma pipeloop(0) 2416 for (i = 0; i < n; i++) { 2417 sd0 = *sp++; 2418 sd1 = *sp++; 2419 sd2 = *sp++; 2420 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0); 2421 *dp++ = dd0; 2422 } 2423 } 2424 2425 /* end point handling */ 2426 if ((mlib_addr) dp <= (mlib_addr) dend) { 2427 emask = vis_edge16(dp, dend); 2428 sd0 = *sp++; 2429 sd1 = *sp++; 2430 sd2 = *sp++; 2431 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0); 2432 vis_pst_16(dd0, dp++, emask); 2433 } 2434 } 2435 else { 2436 vis_alignaddr((void *)0, off); 2437 2438 /* generate edge mask for the start point */ 2439 emask = vis_edge16(da, dend); 2440 2441 if (off < 0) { 2442 /* load 24 bytes */ 2443 sd3 = *sp++; 2444 sd4 = *sp++; 2445 sd5 = *sp++; 2446 2447 /* extract and store 8 bytes */ 2448 CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1); 2449 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask); 2450 } 2451 else { 2452 /* load 48 bytes */ 2453 sd0 = *sp++; 2454 sd1 = *sp++; 2455 sd2 = *sp++; 2456 sd3 = *sp++; 2457 sd4 = *sp++; 2458 sd5 = *sp++; 2459 2460 /* extract and store 8 bytes */ 2461 CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0); 2462 CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1); 2463 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 2464 } 2465 2466 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2467 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2468 2469 /* 8-pixel column loop, emask not needed */ 2470 #pragma pipeloop(0) 2471 for (i = 0; i < n; i++) { 2472 dd0 = dd1; 2473 sd3 = *sp++; 2474 sd4 = *sp++; 2475 sd5 = *sp++; 2476 CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1); 2477 *dp++ = vis_faligndata(dd0, dd1); 2478 } 2479 } 2480 2481 /* end point handling */ 2482 if ((mlib_addr) dp <= (mlib_addr) dend) { 2483 emask = vis_edge16(dp, dend); 2484 dd0 = dd1; 2485 sd3 = *sp++; 2486 sd4 = *sp++; 2487 sd5 = *sp++; 2488 CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1); 2489 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 2490 } 2491 } 2492 } 2493 } 2494 2495 /***************************************************************/ 2496 void mlib_v_ImageChannelExtract_S16_31(const mlib_s16 *src, 2497 mlib_s32 slb, 2498 mlib_s16 *dst, 2499 mlib_s32 dlb, 2500 mlib_s32 xsize, 2501 mlib_s32 ysize, 2502 mlib_s32 cmask) 2503 { 2504 mlib_s16 *sa, *da; 2505 mlib_s16 *sl, *dl; 2506 mlib_s32 j; 2507 2508 sa = sl = (void *)src; 2509 da = dl = dst; 2510 2511 for (j = 0; j < ysize; j++) { 2512 mlib_v_ImageChannelExtract_S16_31_D1(sa, da, xsize, cmask); 2513 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb); 2514 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb); 2515 } 2516 } 2517 2518 /***************************************************************/ 2519 #define CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd) \ 2520 /* extract the left channel */ \ 2521 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \ 2522 sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \ 2523 sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb)); \ 2524 dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc)) 2525 2526 /***************************************************************/ 2527 #define CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd) \ 2528 /* extract the middle left channel */ \ 2529 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \ 2530 sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \ 2531 sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb)); \ 2532 dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc)) 2533 2534 /***************************************************************/ 2535 #define CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd) \ 2536 /* extract the middle right channel */ \ 2537 sda = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \ 2538 sdb = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \ 2539 sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb)); \ 2540 dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc)) 2541 2542 /***************************************************************/ 2543 #define CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd) \ 2544 /* extract the right channel */ \ 2545 sda = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \ 2546 sdb = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \ 2547 sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb)); \ 2548 dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc)) 2549 2550 /***************************************************************/ 2551 void mlib_v_ImageChannelExtract_S16_41_A8D1X4(const mlib_s16 *src, 2552 mlib_s16 *dst, 2553 mlib_s32 dsize, 2554 mlib_s32 cmask) 2555 { 2556 mlib_d64 *sp, *dp; 2557 mlib_d64 sd0, sd1, sd2, sd3; 2558 mlib_d64 sda, sdb, sdc; 2559 mlib_d64 dd; 2560 mlib_s32 i; 2561 2562 sp = (mlib_d64 *) src; 2563 dp = (mlib_d64 *) dst; 2564 2565 if (cmask == 8) { 2566 #pragma pipeloop(0) 2567 for (i = 0; i < dsize / 4; i++) { 2568 sd0 = *sp++; 2569 sd1 = *sp++; 2570 sd2 = *sp++; 2571 sd3 = *sp++; 2572 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd); 2573 *dp++ = dd; 2574 } 2575 } 2576 else if (cmask == 4) { 2577 #pragma pipeloop(0) 2578 for (i = 0; i < dsize / 4; i++) { 2579 sd0 = *sp++; 2580 sd1 = *sp++; 2581 sd2 = *sp++; 2582 sd3 = *sp++; 2583 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd); 2584 *dp++ = dd; 2585 } 2586 } 2587 else if (cmask == 2) { 2588 #pragma pipeloop(0) 2589 for (i = 0; i < dsize / 4; i++) { 2590 sd0 = *sp++; 2591 sd1 = *sp++; 2592 sd2 = *sp++; 2593 sd3 = *sp++; 2594 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd); 2595 *dp++ = dd; 2596 } 2597 } 2598 else { 2599 #pragma pipeloop(0) 2600 for (i = 0; i < dsize / 4; i++) { 2601 sd0 = *sp++; 2602 sd1 = *sp++; 2603 sd2 = *sp++; 2604 sd3 = *sp++; 2605 CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd); 2606 *dp++ = dd; 2607 } 2608 } 2609 } 2610 2611 /***************************************************************/ 2612 void mlib_v_ImageChannelExtract_S16_41_A8D2X4(const mlib_s16 *src, 2613 mlib_s32 slb, 2614 mlib_s16 *dst, 2615 mlib_s32 dlb, 2616 mlib_s32 xsize, 2617 mlib_s32 ysize, 2618 mlib_s32 cmask) 2619 { 2620 mlib_d64 *sp, *dp; 2621 mlib_d64 *sl, *dl; 2622 mlib_d64 sd0, sd1, sd2, sd3; 2623 mlib_d64 sda, sdb, sdc; 2624 mlib_d64 dd; 2625 mlib_s32 i, j; 2626 2627 sp = sl = (mlib_d64 *) src; 2628 dp = dl = (mlib_d64 *) dst; 2629 2630 if (cmask == 8) { 2631 for (j = 0; j < ysize; j++) { 2632 #pragma pipeloop(0) 2633 for (i = 0; i < xsize / 4; i++) { 2634 sd0 = *sp++; 2635 sd1 = *sp++; 2636 sd2 = *sp++; 2637 sd3 = *sp++; 2638 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd); 2639 *dp++ = dd; 2640 } 2641 2642 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 2643 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 2644 } 2645 } 2646 else if (cmask == 4) { 2647 for (j = 0; j < ysize; j++) { 2648 #pragma pipeloop(0) 2649 for (i = 0; i < xsize / 4; i++) { 2650 sd0 = *sp++; 2651 sd1 = *sp++; 2652 sd2 = *sp++; 2653 sd3 = *sp++; 2654 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd); 2655 *dp++ = dd; 2656 } 2657 2658 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 2659 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 2660 } 2661 } 2662 else if (cmask == 2) { 2663 for (j = 0; j < ysize; j++) { 2664 #pragma pipeloop(0) 2665 for (i = 0; i < xsize / 4; i++) { 2666 sd0 = *sp++; 2667 sd1 = *sp++; 2668 sd2 = *sp++; 2669 sd3 = *sp++; 2670 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd); 2671 *dp++ = dd; 2672 } 2673 2674 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 2675 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 2676 } 2677 } 2678 else { 2679 for (j = 0; j < ysize; j++) { 2680 #pragma pipeloop(0) 2681 for (i = 0; i < xsize / 4; i++) { 2682 sd0 = *sp++; 2683 sd1 = *sp++; 2684 sd2 = *sp++; 2685 sd3 = *sp++; 2686 CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd); 2687 *dp++ = dd; 2688 } 2689 2690 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 2691 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 2692 } 2693 } 2694 } 2695 2696 /***************************************************************/ 2697 void mlib_v_ImageChannelExtract_S16_41_D1(const mlib_s16 *src, 2698 mlib_s16 *dst, 2699 mlib_s32 dsize, 2700 mlib_s32 cmask) 2701 { 2702 mlib_s16 *sa, *da; 2703 mlib_s16 *dend, *dend2; /* end points in dst */ 2704 mlib_d64 *dp; /* 8-byte aligned start points in dst */ 2705 mlib_d64 *sp; /* 8-byte aligned start point in src */ 2706 mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */ 2707 mlib_d64 sd4, sd5, sd6, sd7; 2708 mlib_d64 sda, sdb, sdc; 2709 mlib_d64 dd0, dd1; 2710 mlib_s32 soff; /* offset of address in src */ 2711 mlib_s32 doff; /* offset of address in dst */ 2712 mlib_s32 off; /* offset of src over dst */ 2713 mlib_s32 emask; /* edge mask */ 2714 mlib_s32 i, n; 2715 2716 sa = (void *)src; 2717 da = dst; 2718 2719 /* prepare the source address */ 2720 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 2721 soff = ((mlib_addr) sa & 7); 2722 2723 /* prepare the destination addresses */ 2724 dp = (mlib_d64 *) ((mlib_addr) da & (~7)); 2725 doff = ((mlib_addr) da & 7); 2726 dend = da + dsize - 1; 2727 dend2 = dend - 3; 2728 2729 /* calculate the src's offset over dst */ 2730 if (cmask == 8) { 2731 off = (soff / 8) * 2 - doff; 2732 } 2733 else if (cmask == 4) { 2734 off = ((soff + 2) / 8) * 2 - doff; 2735 } 2736 else if (cmask == 2) { 2737 off = ((soff + 4) / 8) * 2 - doff; 2738 } 2739 else { 2740 off = ((soff + 6) / 8) * 2 - doff; 2741 } 2742 2743 if (((cmask == 8) && (soff == 0)) || 2744 ((cmask == 4) && (soff == 6)) || 2745 ((cmask == 2) && (soff == 4)) || 2746 ((cmask == 1) && (soff == 2))) { /* extract left channel */ 2747 2748 if (off == 0) { /* src and dst have same alignment */ 2749 2750 /* generate edge mask for the start point */ 2751 emask = vis_edge16(da, dend); 2752 2753 /* load 16 bytes */ 2754 sd0 = *sp++; 2755 sd1 = *sp++; 2756 sd2 = *sp++; 2757 sd3 = *sp++; 2758 2759 /* extract, including some garbage at the start point */ 2760 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0); 2761 2762 /* store 8 bytes result */ 2763 vis_pst_16(dd0, dp++, emask); 2764 2765 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2766 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2767 2768 /* 8-pixel column loop, emask not needed */ 2769 #pragma pipeloop(0) 2770 for (i = 0; i < n; i++) { 2771 sd0 = *sp++; 2772 sd1 = *sp++; 2773 sd2 = *sp++; 2774 sd3 = *sp++; 2775 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0); 2776 *dp++ = dd0; 2777 } 2778 } 2779 2780 /* end point handling */ 2781 if ((mlib_addr) dp <= (mlib_addr) dend) { 2782 emask = vis_edge16(dp, dend); 2783 sd0 = *sp++; 2784 sd1 = *sp++; 2785 sd2 = *sp++; 2786 sd3 = *sp++; 2787 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0); 2788 vis_pst_16(dd0, dp++, emask); 2789 } 2790 } 2791 else { 2792 vis_alignaddr((void *)0, off); 2793 2794 /* generate edge mask for the start point */ 2795 emask = vis_edge16(da, dend); 2796 2797 if (off < 0) { 2798 /* load 24 bytes */ 2799 sd4 = *sp++; 2800 sd5 = *sp++; 2801 sd6 = *sp++; 2802 sd7 = *sp++; 2803 2804 /* extract and store 8 bytes */ 2805 CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1); 2806 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask); 2807 } 2808 else { 2809 /* load 48 bytes */ 2810 sd0 = *sp++; 2811 sd1 = *sp++; 2812 sd2 = *sp++; 2813 sd3 = *sp++; 2814 sd4 = *sp++; 2815 sd5 = *sp++; 2816 sd6 = *sp++; 2817 sd7 = *sp++; 2818 2819 /* extract and store 8 bytes */ 2820 CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0); 2821 CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1); 2822 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 2823 } 2824 2825 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2826 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2827 2828 /* 8-pixel column loop, emask not needed */ 2829 #pragma pipeloop(0) 2830 for (i = 0; i < n; i++) { 2831 dd0 = dd1; 2832 sd4 = *sp++; 2833 sd5 = *sp++; 2834 sd6 = *sp++; 2835 sd7 = *sp++; 2836 CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1); 2837 *dp++ = vis_faligndata(dd0, dd1); 2838 } 2839 } 2840 2841 /* end point handling */ 2842 if ((mlib_addr) dp <= (mlib_addr) dend) { 2843 emask = vis_edge16(dp, dend); 2844 dd0 = dd1; 2845 sd4 = *sp++; 2846 sd5 = *sp++; 2847 sd6 = *sp++; 2848 sd7 = *sp++; 2849 CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1); 2850 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 2851 } 2852 } 2853 } 2854 else if (((cmask == 8) && (soff == 2)) || 2855 ((cmask == 4) && (soff == 0)) || 2856 ((cmask == 2) && (soff == 6)) || 2857 ((cmask == 1) && (soff == 4))) { /* extract middle left channel */ 2858 2859 if (off == 0) { /* src and dst have same alignment */ 2860 2861 /* generate edge mask for the start point */ 2862 emask = vis_edge16(da, dend); 2863 2864 /* load 16 bytes */ 2865 sd0 = *sp++; 2866 sd1 = *sp++; 2867 sd2 = *sp++; 2868 sd3 = *sp++; 2869 2870 /* extract, including some garbage at the start point */ 2871 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0); 2872 2873 /* store 8 bytes result */ 2874 vis_pst_16(dd0, dp++, emask); 2875 2876 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2877 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2878 2879 /* 8-pixel column loop, emask not needed */ 2880 #pragma pipeloop(0) 2881 for (i = 0; i < n; i++) { 2882 sd0 = *sp++; 2883 sd1 = *sp++; 2884 sd2 = *sp++; 2885 sd3 = *sp++; 2886 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0); 2887 *dp++ = dd0; 2888 } 2889 } 2890 2891 /* end point handling */ 2892 if ((mlib_addr) dp <= (mlib_addr) dend) { 2893 emask = vis_edge16(dp, dend); 2894 sd0 = *sp++; 2895 sd1 = *sp++; 2896 sd2 = *sp++; 2897 sd3 = *sp++; 2898 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0); 2899 vis_pst_16(dd0, dp++, emask); 2900 } 2901 } 2902 else { 2903 vis_alignaddr((void *)0, off); 2904 2905 /* generate edge mask for the start point */ 2906 emask = vis_edge16(da, dend); 2907 2908 if (off < 0) { 2909 /* load 24 bytes */ 2910 sd4 = *sp++; 2911 sd5 = *sp++; 2912 sd6 = *sp++; 2913 sd7 = *sp++; 2914 2915 /* extract and store 8 bytes */ 2916 CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1); 2917 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask); 2918 } 2919 else { 2920 /* load 48 bytes */ 2921 sd0 = *sp++; 2922 sd1 = *sp++; 2923 sd2 = *sp++; 2924 sd3 = *sp++; 2925 sd4 = *sp++; 2926 sd5 = *sp++; 2927 sd6 = *sp++; 2928 sd7 = *sp++; 2929 2930 /* extract and store 8 bytes */ 2931 CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0); 2932 CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1); 2933 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 2934 } 2935 2936 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2937 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2938 2939 /* 8-pixel column loop, emask not needed */ 2940 #pragma pipeloop(0) 2941 for (i = 0; i < n; i++) { 2942 dd0 = dd1; 2943 sd4 = *sp++; 2944 sd5 = *sp++; 2945 sd6 = *sp++; 2946 sd7 = *sp++; 2947 CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1); 2948 *dp++ = vis_faligndata(dd0, dd1); 2949 } 2950 } 2951 2952 /* end point handling */ 2953 if ((mlib_addr) dp <= (mlib_addr) dend) { 2954 emask = vis_edge16(dp, dend); 2955 dd0 = dd1; 2956 sd4 = *sp++; 2957 sd5 = *sp++; 2958 sd6 = *sp++; 2959 sd7 = *sp++; 2960 CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1); 2961 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 2962 } 2963 } 2964 } 2965 else if (((cmask == 8) && (soff == 4)) || 2966 ((cmask == 4) && (soff == 2)) || 2967 ((cmask == 2) && (soff == 0)) || 2968 ((cmask == 1) && (soff == 6))) { /* extract middle right channel */ 2969 2970 if (off == 0) { /* src and dst have same alignment */ 2971 2972 /* generate edge mask for the start point */ 2973 emask = vis_edge16(da, dend); 2974 2975 /* load 16 bytes */ 2976 sd0 = *sp++; 2977 sd1 = *sp++; 2978 sd2 = *sp++; 2979 sd3 = *sp++; 2980 2981 /* extract, including some garbage at the start point */ 2982 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0); 2983 2984 /* store 8 bytes result */ 2985 vis_pst_16(dd0, dp++, emask); 2986 2987 if ((mlib_addr) dp <= (mlib_addr) dend2) { 2988 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 2989 2990 /* 8-pixel column loop, emask not needed */ 2991 #pragma pipeloop(0) 2992 for (i = 0; i < n; i++) { 2993 sd0 = *sp++; 2994 sd1 = *sp++; 2995 sd2 = *sp++; 2996 sd3 = *sp++; 2997 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0); 2998 *dp++ = dd0; 2999 } 3000 } 3001 3002 /* end point handling */ 3003 if ((mlib_addr) dp <= (mlib_addr) dend) { 3004 emask = vis_edge16(dp, dend); 3005 sd0 = *sp++; 3006 sd1 = *sp++; 3007 sd2 = *sp++; 3008 sd3 = *sp++; 3009 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0); 3010 vis_pst_16(dd0, dp++, emask); 3011 } 3012 } 3013 else { 3014 vis_alignaddr((void *)0, off); 3015 3016 /* generate edge mask for the start point */ 3017 emask = vis_edge16(da, dend); 3018 3019 if (off < 0) { 3020 /* load 24 bytes */ 3021 sd4 = *sp++; 3022 sd5 = *sp++; 3023 sd6 = *sp++; 3024 sd7 = *sp++; 3025 3026 /* extract and store 8 bytes */ 3027 CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1); 3028 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask); 3029 } 3030 else { 3031 /* load 48 bytes */ 3032 sd0 = *sp++; 3033 sd1 = *sp++; 3034 sd2 = *sp++; 3035 sd3 = *sp++; 3036 sd4 = *sp++; 3037 sd5 = *sp++; 3038 sd6 = *sp++; 3039 sd7 = *sp++; 3040 3041 /* extract and store 8 bytes */ 3042 CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0); 3043 CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1); 3044 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 3045 } 3046 3047 if ((mlib_addr) dp <= (mlib_addr) dend2) { 3048 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 3049 3050 /* 8-pixel column loop, emask not needed */ 3051 #pragma pipeloop(0) 3052 for (i = 0; i < n; i++) { 3053 dd0 = dd1; 3054 sd4 = *sp++; 3055 sd5 = *sp++; 3056 sd6 = *sp++; 3057 sd7 = *sp++; 3058 CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1); 3059 *dp++ = vis_faligndata(dd0, dd1); 3060 } 3061 } 3062 3063 /* end point handling */ 3064 if ((mlib_addr) dp <= (mlib_addr) dend) { 3065 emask = vis_edge16(dp, dend); 3066 dd0 = dd1; 3067 sd4 = *sp++; 3068 sd5 = *sp++; 3069 sd6 = *sp++; 3070 sd7 = *sp++; 3071 CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1); 3072 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 3073 } 3074 } 3075 } 3076 else { /* extract right channel */ 3077 if (off == 0) { /* src and dst have same alignment */ 3078 3079 /* generate edge mask for the start point */ 3080 emask = vis_edge16(da, dend); 3081 3082 /* load 16 bytes */ 3083 sd0 = *sp++; 3084 sd1 = *sp++; 3085 sd2 = *sp++; 3086 sd3 = *sp++; 3087 3088 /* extract, including some garbage at the start point */ 3089 CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0); 3090 3091 /* store 8 bytes result */ 3092 vis_pst_16(dd0, dp++, emask); 3093 3094 if ((mlib_addr) dp <= (mlib_addr) dend2) { 3095 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 3096 3097 /* 8-pixel column loop, emask not needed */ 3098 #pragma pipeloop(0) 3099 for (i = 0; i < n; i++) { 3100 sd0 = *sp++; 3101 sd1 = *sp++; 3102 sd2 = *sp++; 3103 sd3 = *sp++; 3104 CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0); 3105 *dp++ = dd0; 3106 } 3107 } 3108 3109 /* end point handling */ 3110 if ((mlib_addr) dp <= (mlib_addr) dend) { 3111 emask = vis_edge16(dp, dend); 3112 sd0 = *sp++; 3113 sd1 = *sp++; 3114 sd2 = *sp++; 3115 sd3 = *sp++; 3116 CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0); 3117 vis_pst_16(dd0, dp++, emask); 3118 } 3119 } 3120 else { 3121 vis_alignaddr((void *)0, off); 3122 3123 /* generate edge mask for the start point */ 3124 emask = vis_edge16(da, dend); 3125 3126 if (off < 0) { 3127 /* load 24 bytes */ 3128 sd4 = *sp++; 3129 sd5 = *sp++; 3130 sd6 = *sp++; 3131 sd7 = *sp++; 3132 3133 /* extract and store 8 bytes */ 3134 CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1); 3135 vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask); 3136 } 3137 3138 if ((mlib_addr) dp <= (mlib_addr) dend2) { 3139 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1; 3140 3141 /* 8-pixel column loop, emask not needed */ 3142 #pragma pipeloop(0) 3143 for (i = 0; i < n; i++) { 3144 dd0 = dd1; 3145 sd4 = *sp++; 3146 sd5 = *sp++; 3147 sd6 = *sp++; 3148 sd7 = *sp++; 3149 CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1); 3150 *dp++ = vis_faligndata(dd0, dd1); 3151 } 3152 } 3153 3154 /* end point handling */ 3155 if ((mlib_addr) dp <= (mlib_addr) dend) { 3156 emask = vis_edge16(dp, dend); 3157 dd0 = dd1; 3158 sd4 = *sp++; 3159 sd5 = *sp++; 3160 sd6 = *sp++; 3161 sd7 = *sp++; 3162 CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1); 3163 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask); 3164 } 3165 } 3166 } 3167 } 3168 3169 /***************************************************************/ 3170 void mlib_v_ImageChannelExtract_S16_41(const mlib_s16 *src, 3171 mlib_s32 slb, 3172 mlib_s16 *dst, 3173 mlib_s32 dlb, 3174 mlib_s32 xsize, 3175 mlib_s32 ysize, 3176 mlib_s32 cmask) 3177 { 3178 mlib_s16 *sa, *da; 3179 mlib_s16 *sl, *dl; 3180 mlib_s32 j; 3181 3182 sa = sl = (void *)src; 3183 da = dl = dst; 3184 for (j = 0; j < ysize; j++) { 3185 mlib_v_ImageChannelExtract_S16_41_D1(sa, da, xsize, cmask); 3186 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb); 3187 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb); 3188 } 3189 } 3190 3191 /***************************************************************/