1 /* 2 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 */ 25 26 27 28 /* 29 * FUNCTIONS 30 * mlib_v_ImageChannelInsert_U8 31 * mlib_v_ImageChannelInsert_U8_12_A8D1X8 32 * mlib_v_ImageChannelInsert_U8_12_A8D2X8 33 * mlib_v_ImageChannelInsert_U8_12_D1 34 * mlib_v_ImageChannelInsert_U8_12 35 * mlib_v_ImageChannelInsert_U8_13_A8D1X8 36 * mlib_v_ImageChannelInsert_U8_13_A8D2X8 37 * mlib_v_ImageChannelInsert_U8_13_D1 38 * mlib_v_ImageChannelInsert_U8_13 39 * mlib_v_ImageChannelInsert_U8_14_A8D1X8 40 * mlib_v_ImageChannelInsert_U8_14_A8D2X8 41 * mlib_v_ImageChannelInsert_U8_14_D1 42 * mlib_v_ImageChannelInsert_U8_14 43 * mlib_v_ImageChannelInsert_S16 44 * mlib_v_ImageChannelInsert_S16_12_A8D1X4 45 * mlib_v_ImageChannelInsert_S16_12_A8D2X4 46 * mlib_v_ImageChannelInsert_S16_12_D1 47 * mlib_v_ImageChannelInsert_S16_12 48 * mlib_v_ImageChannelInsert_S16_13_A8D1X4 49 * mlib_v_ImageChannelInsert_S16_13_A8D2X4 50 * mlib_v_ImageChannelInsert_S16_13_D1 51 * mlib_v_ImageChannelInsert_S16_13 52 * mlib_v_ImageChannelInsert_S16_14_A8D1X4 53 * mlib_v_ImageChannelInsert_S16_14_A8D2X4 54 * mlib_v_ImageChannelInsert_S16_14_D1 55 * mlib_v_ImageChannelInsert_S16_14 56 * mlib_v_ImageChannelInsert_S32 57 * mlib_v_ImageChannelInsert_D64 58 * 59 * ARGUMENT 60 * src pointer to source image data 61 * dst pointer to destination image data 62 * slb source image line stride in bytes 63 * dlb destination image line stride in bytes 64 * dsize image data size in pixels 65 * xsize image width in pixels 66 * ysize image height in lines 67 * cmask channel mask 68 * 69 * DESCRIPTION 70 * Copy the 1-channel source image into the selected channel 71 * of the destination image -- VIS version low level functions. 72 * 73 * NOTE 74 * These functions are separated from mlib_v_ImageChannelInsert.c 75 * for loop unrolling and structure clarity. 
76 */ 77 78 #include "vis_proto.h" 79 #include "mlib_image.h" 80 #include "mlib_v_ImageChannelInsert.h" 81 82 /***************************************************************/ 83 /* general channel insertion: slower due to the inner loop */ 84 void mlib_v_ImageChannelInsert_U8(const mlib_u8 *src, 85 mlib_s32 slb, 86 mlib_u8 *dst, 87 mlib_s32 dlb, 88 mlib_s32 channels, 89 mlib_s32 channeld, 90 mlib_s32 width, 91 mlib_s32 height, 92 mlib_s32 cmask) 93 { 94 mlib_u8 *sp; /* pointer for pixel in src */ 95 mlib_u8 *sl; /* pointer for line in src */ 96 mlib_u8 *dp; /* pointer for pixel in dst */ 97 mlib_u8 *dl; /* pointer for line in dst */ 98 mlib_s32 i, j, k; /* indices for x, y, channel */ 99 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 }; 100 mlib_s32 inc0, inc1, inc2; 101 mlib_u8 s0, s1, s2; 102 103 deltac[channels] = 1; 104 for (i = (channeld - 1), k = 0; i >= 0; i--) { 105 if ((cmask & (1 << i)) == 0) 106 deltac[k]++; 107 else 108 k++; 109 } 110 111 deltac[channels] = channeld; 112 for (i = 1; i < channels; i++) { 113 deltac[channels] -= deltac[i]; 114 } 115 116 sp = sl = (void *)src; 117 dp = dl = dst + deltac[0]; 118 119 if (channels == 2) { 120 inc0 = deltac[1]; 121 inc1 = deltac[2] + inc0; 122 for (j = 0; j < height; j++) { 123 #pragma pipeloop(0) 124 for (i = 0; i < width; i++) { 125 s0 = sp[0]; 126 s1 = sp[1]; 127 dp[0] = s0; 128 dp[inc0] = s1; 129 dp += inc1; 130 sp += 2; 131 } 132 133 sp = sl += slb; 134 dp = dl += dlb; 135 } 136 } 137 else if (channels == 3) { 138 inc0 = deltac[1]; 139 inc1 = deltac[2] + inc0; 140 inc2 = deltac[3] + inc1; 141 for (j = 0; j < height; j++) { 142 #pragma pipeloop(0) 143 for (i = 0; i < width; i++) { 144 s0 = sp[0]; 145 s1 = sp[1]; 146 s2 = sp[2]; 147 dp[0] = s0; 148 dp[inc0] = s1; 149 dp[inc1] = s2; 150 dp += inc2; 151 sp += 3; 152 } 153 154 sp = sl += slb; 155 dp = dl += dlb; 156 } 157 } 158 } 159 160 /***************************************************************/ 161 /* general channel insertion: slower due to the inner loop */ 
162 void mlib_v_ImageChannelInsert_D64(const mlib_d64 *src, 163 mlib_s32 slb, 164 mlib_d64 *dst, 165 mlib_s32 dlb, 166 mlib_s32 channels, 167 mlib_s32 channeld, 168 mlib_s32 width, 169 mlib_s32 height, 170 mlib_s32 cmask) 171 { 172 mlib_d64 *sp; /* pointer for pixel in src */ 173 mlib_d64 *sl; /* pointer for line in src */ 174 mlib_d64 *dp; /* pointer for pixel in dst */ 175 mlib_d64 *dl; /* pointer for line in dst */ 176 mlib_s32 i, j, k; /* indices for x, y, channel */ 177 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 }; 178 mlib_s32 inc0, inc1, inc2; 179 mlib_d64 s0, s1, s2; 180 181 deltac[channels] = 1; 182 for (i = (channeld - 1), k = 0; i >= 0; i--) { 183 if ((cmask & (1 << i)) == 0) 184 deltac[k]++; 185 else 186 k++; 187 } 188 189 deltac[channels] = channeld; 190 for (i = 1; i < channels; i++) { 191 deltac[channels] -= deltac[i]; 192 } 193 194 sp = sl = (void *)src; 195 dp = dl = dst + deltac[0]; 196 197 if (channels == 1) { 198 for (j = 0; j < height; j++) { 199 #pragma pipeloop(0) 200 for (i = 0; i < width; i++) { 201 s0 = sp[0]; 202 dp[0] = s0; 203 dp += channeld; 204 sp++; 205 } 206 207 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 208 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 209 } 210 } 211 else if (channels == 2) { 212 inc0 = deltac[1]; 213 inc1 = deltac[2] + inc0; 214 for (j = 0; j < height; j++) { 215 #pragma pipeloop(0) 216 for (i = 0; i < width; i++) { 217 s0 = sp[0]; 218 s1 = sp[1]; 219 dp[0] = s0; 220 dp[inc0] = s1; 221 dp += inc1; 222 sp += 2; 223 } 224 225 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 226 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 227 } 228 } 229 else if (channels == 3) { 230 inc0 = deltac[1]; 231 inc1 = deltac[2] + inc0; 232 inc2 = deltac[3] + inc1; 233 for (j = 0; j < height; j++) { 234 #pragma pipeloop(0) 235 for (i = 0; i < width; i++) { 236 s0 = sp[0]; 237 s1 = sp[1]; 238 s2 = sp[2]; 239 dp[0] = s0; 240 dp[inc0] = s1; 241 dp[inc1] = s2; 242 dp += inc2; 243 sp += 3; 244 } 245 246 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + 
slb); 247 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb); 248 } 249 } 250 } 251 252 /***************************************************************/ 253 /* general channel insertion: slower due to the inner loop */ 254 void mlib_v_ImageChannelInsert_S16(const mlib_s16 *src, 255 mlib_s32 slb, 256 mlib_s16 *dst, 257 mlib_s32 dlb, 258 mlib_s32 channels, 259 mlib_s32 channeld, 260 mlib_s32 width, 261 mlib_s32 height, 262 mlib_s32 cmask) 263 { 264 mlib_s16 *sp; /* pointer for pixel in src */ 265 mlib_s16 *sl; /* pointer for line in src */ 266 mlib_s16 *dp; /* pointer for pixel in dst */ 267 mlib_s16 *dl; /* pointer for line in dst */ 268 mlib_s32 i, j, k; /* indices for x, y, channel */ 269 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 }; 270 mlib_s32 inc0, inc1, inc2; 271 mlib_s16 s0, s1, s2; 272 273 deltac[channels] = 1; 274 for (i = (channeld - 1), k = 0; i >= 0; i--) { 275 if ((cmask & (1 << i)) == 0) 276 deltac[k]++; 277 else 278 k++; 279 } 280 281 deltac[channels] = channeld; 282 for (i = 1; i < channels; i++) { 283 deltac[channels] -= deltac[i]; 284 } 285 286 sp = sl = (void *)src; 287 dp = dl = dst + deltac[0]; 288 289 if (channels == 2) { 290 inc0 = deltac[1]; 291 inc1 = deltac[2] + inc0; 292 for (j = 0; j < height; j++) { 293 #pragma pipeloop(0) 294 for (i = 0; i < width; i++) { 295 s0 = sp[0]; 296 s1 = sp[1]; 297 dp[0] = s0; 298 dp[inc0] = s1; 299 dp += inc1; 300 sp += 2; 301 } 302 303 sp = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb); 304 dp = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb); 305 } 306 } 307 else if (channels == 3) { 308 inc0 = deltac[1]; 309 inc1 = deltac[2] + inc0; 310 inc2 = deltac[3] + inc1; 311 for (j = 0; j < height; j++) { 312 #pragma pipeloop(0) 313 for (i = 0; i < width; i++) { 314 s0 = sp[0]; 315 s1 = sp[1]; 316 s2 = sp[2]; 317 dp[0] = s0; 318 dp[inc0] = s1; 319 dp[inc1] = s2; 320 dp += inc2; 321 sp += 3; 322 } 323 324 sp = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb); 325 dp = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb); 326 } 327 } 328 } 329 330 
/***************************************************************/ 331 /* general channel insertion: slower due to the inner loop */ 332 333 void mlib_v_ImageChannelInsert_S32(const mlib_s32 *src, 334 mlib_s32 slb, 335 mlib_s32 *dst, 336 mlib_s32 dlb, 337 mlib_s32 channels, 338 mlib_s32 channeld, 339 mlib_s32 width, 340 mlib_s32 height, 341 mlib_s32 cmask) 342 { 343 mlib_s32 *sp; /* pointer for pixel in src */ 344 mlib_s32 *sl; /* pointer for line in src */ 345 mlib_s32 *dp; /* pointer for pixel in dst */ 346 mlib_s32 *dl; /* pointer for line in dst */ 347 mlib_s32 i, j, k; /* indices for x, y, channel */ 348 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 }; 349 mlib_s32 inc0, inc1, inc2; 350 mlib_s32 s0, s1, s2; 351 352 deltac[channels] = 1; 353 for (i = (channeld - 1), k = 0; i >= 0; i--) { 354 if ((cmask & (1 << i)) == 0) 355 deltac[k]++; 356 else 357 k++; 358 } 359 360 deltac[channels] = channeld; 361 for (i = 1; i < channels; i++) { 362 deltac[channels] -= deltac[i]; 363 } 364 365 sp = sl = (void *)src; 366 dp = dl = dst + deltac[0]; 367 368 if (channels == 1) { 369 for (j = 0; j < height; j++) { 370 #pragma pipeloop(0) 371 for (i = 0; i < width; i++) { 372 s0 = sp[0]; 373 dp[0] = s0; 374 dp += channeld; 375 sp++; 376 } 377 378 sp = sl = (mlib_s32 *) ((mlib_u8 *) sl + slb); 379 dp = dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb); 380 } 381 } 382 else if (channels == 2) { 383 inc0 = deltac[1]; 384 inc1 = deltac[2] + inc0; 385 for (j = 0; j < height; j++) { 386 #pragma pipeloop(0) 387 for (i = 0; i < width; i++) { 388 s0 = sp[0]; 389 s1 = sp[1]; 390 dp[0] = s0; 391 dp[inc0] = s1; 392 dp += inc1; 393 sp += 2; 394 } 395 396 sp = sl = (mlib_s32 *) ((mlib_u8 *) sl + slb); 397 dp = dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb); 398 } 399 } 400 else if (channels == 3) { 401 inc0 = deltac[1]; 402 inc1 = deltac[2] + inc0; 403 inc2 = deltac[3] + inc1; 404 for (j = 0; j < height; j++) { 405 #pragma pipeloop(0) 406 for (i = 0; i < width; i++) { 407 s0 = sp[0]; 408 s1 = sp[1]; 409 s2 = sp[2]; 
410 dp[0] = s0; 411 dp[inc0] = s1; 412 dp[inc1] = s2; 413 dp += inc2; 414 sp += 3; 415 } 416 417 sp = sl = (mlib_s32 *) ((mlib_u8 *) sl + slb); 418 dp = dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb); 419 } 420 } 421 } 422 423 /***************************************************************/ 424 #define INSERT_U8_12(sd0, dd0, dd1) /* channel duplicate */ \ 425 dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0)); \ 426 dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0)) 427 428 /***************************************************************/ 429 /* insert one channel to a 2-channel image. 430 * both source and destination image data are 8-byte aligned. 431 * dsize is multiple of 8. 432 */ 433 434 void mlib_v_ImageChannelInsert_U8_12_A8D1X8(const mlib_u8 *src, 435 mlib_u8 *dst, 436 mlib_s32 dsize, 437 mlib_s32 cmask) 438 { 439 mlib_d64 *sp, *dp; 440 mlib_d64 sd0; 441 mlib_d64 dd0, dd1; 442 mlib_s32 bmask; 443 mlib_s32 i; 444 445 bmask = cmask | (cmask << 2) | (cmask << 4) | (cmask << 6); 446 447 sp = (mlib_d64 *) src; 448 dp = (mlib_d64 *) dst; 449 450 #pragma pipeloop(0) 451 for (i = 0; i < dsize / 8; i++) { 452 sd0 = *sp++; 453 INSERT_U8_12(sd0, dd0, dd1); 454 vis_pst_8(dd0, dp++, bmask); 455 vis_pst_8(dd1, dp++, bmask); 456 } 457 } 458 459 /***************************************************************/ 460 /* insert one channel to a 2-channel image. 461 * both source and destination image data are 8-byte aligned. 462 * xsize is multiple of 8. 
*/

/*
 * 2-D variant: ysize lines of xsize pixels.  slb / dlb are the source and
 * destination line strides in bytes (they are added to the line pointers
 * through mlib_u8 casts below).
 */
void mlib_v_ImageChannelInsert_U8_12_A8D2X8(const mlib_u8 *src,
                                            mlib_s32      slb,
                                            mlib_u8       *dst,
                                            mlib_s32      dlb,
                                            mlib_s32      xsize,
                                            mlib_s32      ysize,
                                            mlib_s32      cmask)
{
  mlib_d64 *sp, *dp;                    /* current 8-byte words in src / dst */
  mlib_d64 *sl, *dl;                    /* start of the current src / dst line */
  mlib_d64 sd0;                         /* 8 source bytes */
  mlib_d64 dd0, dd1;                    /* the same bytes, each duplicated */
  mlib_s32 bmask;                       /* cmask repeated for 4 dst pixels */
  mlib_s32 i, j;

  /* repeat the 2-bit channel mask for the 4 pixels of an 8-byte store */
  bmask = cmask | (cmask << 2) | (cmask << 4) | (cmask << 6);

  sp = sl = (mlib_d64 *) src;
  dp = dl = (mlib_d64 *) dst;

  for (j = 0; j < ysize; j++) {
#pragma pipeloop(0)
    for (i = 0; i < xsize / 8; i++) {
      /* 8 source pixels -> 16 destination bytes, written under bmask */
      sd0 = *sp++;
      INSERT_U8_12(sd0, dd0, dd1);
      vis_pst_8(dd0, dp++, bmask);
      vis_pst_8(dd1, dp++, bmask);
    }

    /* advance both line pointers by their byte strides */
    sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
    dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
  }
}

/***************************************************************/
/* insert one channel to a 2-channel image.
 *
 * 1-D variant without alignment or size restrictions: src and dst may
 * start at any byte address and dsize (pixels) may be any count.
 * Both pointers are rounded down to 8-byte boundaries; the leading and
 * trailing partial words are written with vis_edge8() masks combined
 * with the channel mask, so bytes outside [dst, dst + 2 * dsize - 1]
 * and bytes of the unselected channel are not stored.
 * NOTE(review): whole aligned 8-byte words are loaded from src, so
 * bytes before src / after the last source pixel inside those words
 * are read (but not stored).
 */

void mlib_v_ImageChannelInsert_U8_12_D1(const mlib_u8 *src,
                                        mlib_u8       *dst,
                                        mlib_s32      dsize,
                                        mlib_s32      cmask)
{
  mlib_u8 *sa, *da;
  mlib_u8 *dend, *dend2;                /* end points in dst */
  mlib_d64 *dp;                         /* 8-byte aligned start points in dst */
  mlib_d64 *sp;                         /* 8-byte aligned start point in src */
  mlib_d64 sd0, sd1;                    /* 8-byte source data */
  mlib_d64 dd0, dd1, dd2, dd3;          /* 8-byte destination data */
  mlib_s32 soff;                        /* offset of address in src */
  mlib_s32 doff;                        /* offset of address in dst */
  mlib_s32 off;                         /* offset of src over dst */
  mlib_s32 emask;                       /* edge mask */
  mlib_s32 bmask;                       /* channel mask */
  mlib_s32 i, n;

  /* repeat the 2-bit channel mask for the 4 pixels of an 8-byte store */
  bmask = cmask | (cmask << 2) | (cmask << 4) | (cmask << 6);

  sa = (void *)src;
  da = dst;

  /* prepare the source address */
  sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
  soff = ((mlib_addr) sa & 7);

  /* prepare the destination addresses */
  dp = (mlib_d64 *) ((mlib_addr) da & (~7));
  doff = ((mlib_addr) da & 7);
  dend = da + dsize * 2 - 1;            /* last destination byte */
  dend2 = dend - 15;                    /* last start of a full 16-byte group */

  /* calculate the src's offset over dst */
  /* (soff is doubled because every source byte becomes two dst bytes) */
  off = soff * 2 - doff;

  /* stores go to aligned words: if dst starts on an odd byte the two
   * channels swap lanes inside each word, so the mask is complemented */
  if (doff % 2 != 0) {
    bmask = (~bmask) & 0xff;
  }

  if (off == 0) {                           /* src and dst have same alignment */

    /* load 8 bytes */
    sd0 = *sp++;

    /* insert, including some garbage at the start point */
    INSERT_U8_12(sd0, dd0, dd1);

    /* store 16 bytes result */
    emask = vis_edge8(da, dend);
    vis_pst_8(dd0, dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      vis_pst_8(dd1, dp++, emask & bmask);
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      /* number of complete 16-byte destination groups left */
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        sd0 = *sp++;
        INSERT_U8_12(sd0, dd0, dd1);
        vis_pst_8(dd0, dp++, bmask);
        vis_pst_8(dd1, dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      sd0 = *sp++;
      INSERT_U8_12(sd0, dd0, dd1);
      emask = vis_edge8(dp, dend);
      vis_pst_8(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd1, dp++, emask & bmask);
      }
    }
  }
  else if (off < 0) {
    /* expanded source data lags behind dst: every stored word is built
     * with vis_faligndata() from the previous and the current expanded
     * word, using the (negative) offset set here */
    vis_alignaddr((void *)0, off);

    /* generate edge mask for the start point */
    emask = vis_edge8(da, dend);

    /* load 8 bytes */
    sd0 = *sp++;

    /* insert and store 16 bytes */
    INSERT_U8_12(sd0, dd0, dd1);
    vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        dd2 = dd1;                      /* keep the previous high word */
        sd0 = *sp++;
        INSERT_U8_12(sd0, dd0, dd1);
        vis_pst_8(vis_faligndata(dd2, dd0), dp++, bmask);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      dd2 = dd1;
      sd0 = *sp++;
      INSERT_U8_12(sd0, dd0, dd1);
      vis_pst_8(vis_faligndata(dd2, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
      }
    }
  }
  else if (off < 8) {
    /* expanded source data is ahead of dst by less than one word:
     * two source words are kept in flight (dd0..dd3) and the stored
     * words are taken from the (dd0,dd1) and (dd1,dd2) pairs */
    vis_alignaddr((void *)0, off);

    /* generate edge mask for the start point */
    emask = vis_edge8(da, dend);

    /* load 16 bytes */
    sd0 = *sp++;
    sd1 = *sp++;

    /* insert and store 16 bytes */
    INSERT_U8_12(sd0, dd0, dd1);
    INSERT_U8_12(sd1, dd2, dd3);
    vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        /* shift the pipeline: the newer pair becomes the older pair */
        dd0 = dd2;
        dd1 = dd3;
        sd1 = *sp++;
        INSERT_U8_12(sd1, dd2, dd3);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      dd0 = dd2;
      dd1 = dd3;
      sd1 = *sp++;
      INSERT_U8_12(sd1, dd2, dd3);
      vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
      }
    }
  }
  else {                                    /* (off >= 8) */
    /* same pipeline as the (off < 8) case, but the stored words are
     * taken one expanded word later: pairs (dd1,dd2) and (dd2,dd3) */
    vis_alignaddr((void *)0, off);

    /* generate edge mask for the start point */
    emask = vis_edge8(da, dend);

    /* load 16 bytes */
    sd0 = *sp++;
    sd1 = *sp++;

    /* insert and store 16 bytes */
    INSERT_U8_12(sd0, dd0, dd1);
    INSERT_U8_12(sd1, dd2, dd3);
    vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        dd0 = dd2;
        dd1 = dd3;
        sd1 = *sp++;
        INSERT_U8_12(sd1, dd2, dd3);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
        vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      dd0 = dd2;
      dd1 = dd3;
      sd1 = *sp++;
      INSERT_U8_12(sd1, dd2, dd3);
      vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
      }
    }
  }
}

/***************************************************************/
/* insert one channel to a 2-channel image.
*/

/*
 * 2-D variant without alignment restrictions: calls the 1-D routine
 * mlib_v_ImageChannelInsert_U8_12_D1 once per line, stepping src by slb
 * bytes and dst by dlb bytes.
 */
void mlib_v_ImageChannelInsert_U8_12(const mlib_u8 *src,
                                     mlib_s32      slb,
                                     mlib_u8       *dst,
                                     mlib_s32      dlb,
                                     mlib_s32      xsize,
                                     mlib_s32      ysize,
                                     mlib_s32      cmask)
{
  mlib_u8 *sa, *da;                     /* arguments for the current line */
  mlib_u8 *sl, *dl;                     /* start of the current src / dst line */
  mlib_s32 j;

  sa = sl = (void *)src;
  da = dl = dst;

#pragma pipeloop(0)
  for (j = 0; j < ysize; j++) {
    mlib_v_ImageChannelInsert_U8_12_D1(sa, da, xsize, cmask);
    sa = sl += slb;
    da = dl += dlb;
  }
}

/***************************************************************/
/*
 * Expands 8 source bytes (sd0) into three 8-byte words dd0, dd1, dd2.
 * Uses temporaries sda, sdb, sdc, sdd, sde that must be declared by the
 * caller.  NOTE(review): no use of this macro is visible in this part
 * of the file.
 */
#define INSERT_U8_13(sd0, dd0, dd1, dd2)                        \
  sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd0));        \
  sdb = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sda));        \
  sdc = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb));        \
  sdd = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb));        \
  dd0 = vis_fpmerge(vis_read_hi(sdc), vis_read_hi(sdd));        \
  sde = vis_fpmerge(vis_read_lo(sdc), vis_read_lo(sdd));        \
  dd1 = vis_freg_pair(vis_read_lo(dd0), vis_read_hi(sde));      \
  dd2 = vis_freg_pair(vis_read_lo(sde), vis_read_lo(sde))

/***************************************************************/
/*
 * Loads one aligned 8-byte word from sp and writes its 8 bytes one at a
 * time to da, da + channeld, da + 2 * channeld, ...
 * Before each store the word is passed through vis_faligndata(sd, sd);
 * the callers set the align offset to 1 beforehand ("for 1-byte left
 * shift"), so each step brings the next source byte into the position
 * vis_st_u8() stores.
 * Expects sp, sd and da in the caller's scope; advances sp and da.
 */
#define LOAD_INSERT_STORE_U8_A8(channeld)                       \
  sd = *sp++;                                                   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld

/***************************************************************/
/*
 * Same as LOAD_INSERT_STORE_U8_A8 for an unaligned source: the 8 source
 * bytes are first assembled from two consecutive aligned words (sd0,
 * sd1) with the align offset set to 'off', then the offset is switched
 * to 1 for the byte-by-byte stores.
 * Expects sp, sd0, sd1, sd, off and da in the caller's scope; sd1 must
 * already hold the previously loaded word.
 */
#define LOAD_INSERT_STORE_U8(channeld)                          \
  vis_alignaddr((void *)0, off);                                \
  sd0 = sd1;                                                    \
  sd1 = *sp++;                                                  \
  sd  = vis_faligndata(sd0, sd1);                               \
  vis_alignaddr((void *)0, 1);                                  \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld;   \
  vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld

/***************************************************************/
/*
 * Insert one channel into a 3-channel image, 1-D.
 * src is read as aligned 8-byte words and dsize / 8 groups of 8 pixels
 * are processed (any remainder of dsize is ignored here).
 * cmask must be 4, 2 or 1; it selects destination channel 0, 1 or 2.
 */
void mlib_v_ImageChannelInsert_U8_13_A8D1X8(const mlib_u8 *src,
                                            mlib_u8       *dst,
                                            mlib_s32      dsize,
                                            mlib_s32      cmask)
{
  mlib_u8 *da;                          /* current destination byte */
  mlib_d64 *sp;                         /* current 8-byte source word */
  mlib_d64 sd;                          /* 8 source bytes being rotated */
  mlib_s32 i;

  vis_alignaddr((void *)0, 1);              /* for 1-byte left shift */

  sp = (mlib_d64 *) src;
  da = dst + (2 / cmask);                   /* 4,2,1 -> 0,1,2 */

#pragma pipeloop(0)
  for (i = 0; i < dsize / 8; i++) {
    LOAD_INSERT_STORE_U8_A8(3);
  }
}

/***************************************************************/
/*
 * Insert one channel into a 3-channel image, 2-D.
 * Each line processes xsize / 8 groups of 8 pixels from aligned 8-byte
 * source words; slb / dlb are line strides in bytes.
 * cmask must be 4, 2 or 1; it selects destination channel 0, 1 or 2.
 */
void mlib_v_ImageChannelInsert_U8_13_A8D2X8(const mlib_u8 *src,
                                            mlib_s32      slb,
                                            mlib_u8       *dst,
                                            mlib_s32      dlb,
                                            mlib_s32      xsize,
                                            mlib_s32      ysize,
                                            mlib_s32      cmask)
{
  mlib_u8 *da, *dl;                     /* current dst byte / line start */
  mlib_d64 *sp, *sl;                    /* current src word / line start */
  mlib_d64 sd;                          /* 8 source bytes being rotated */
  mlib_s32 i, j;

  /* align offset 1: each vis_faligndata(sd, sd) steps to the next byte */
  vis_alignaddr((void *)0, 1);

  sp = sl = (mlib_d64 *) src;
  da = dl = dst + (2 / cmask);              /* 4,2,1 -> 0,1,2 */

  for (j = 0; j < ysize; j++) {
#pragma pipeloop(0)
    for (i = 0; i < xsize / 8; i++) {
      LOAD_INSERT_STORE_U8_A8(3);
    }

    sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
    da = dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
  }
}

/***************************************************************/
/*
 * Insert one channel into a 3-channel image, 1-D, without alignment or
 * size restrictions.  src is rounded down to an 8-byte boundary and the
 * source bytes are re-assembled with vis_faligndata(); dsize / 8 full
 * groups are handled by the macro, the remaining (up to 7) pixels by the
 * guarded stores at the end.
 * cmask must be 4, 2 or 1; it selects destination channel 0, 1 or 2.
 */
void mlib_v_ImageChannelInsert_U8_13_D1(const mlib_u8 *src,
                                        mlib_u8       *dst,
                                        mlib_s32      dsize,
                                        mlib_s32      cmask)
{
  mlib_u8 *sa, *da;
  mlib_u8 *dend;                        /* end point in destination */
  mlib_d64 *sp;                         /* 8-byte aligned start points in src */
  mlib_d64 sd0, sd1, sd;                /* 8-byte registers for source data */
  mlib_s32 off;                         /* offset of address alignment in src */
  mlib_s32 i;

  /* prepare the src address */
  sa = (void *)src;
  sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
  off = (mlib_addr) sa & 7;

  /* prepare the dst address */
  da = dst + (2 / cmask);                   /* 4,2,1 -> 0,1,2 */
  dend = da + dsize * 3 - 1;

  /* prime the two-word window used by LOAD_INSERT_STORE_U8 */
  sd1 = *sp++;

#pragma pipeloop(0)
  for (i = 0; i < dsize / 8; i++) {
    LOAD_INSERT_STORE_U8(3);
  }

  /* right end handling */
  /* da advances 3 bytes per pixel, so (da <= dend) holds exactly while
   * pixels remain; at most 7 are left after the loop above */
  if ((mlib_addr) da <= (mlib_addr) dend) {

    vis_alignaddr((void *)0, off);
    sd0 = sd1;
    sd1 = *sp++;
    sd = vis_faligndata(sd0, sd1);

    vis_alignaddr((void *)0, 1);
    vis_st_u8(sd = vis_faligndata(sd, sd), da);
    da += 3;
    if ((mlib_addr) da <= (mlib_addr) dend) {
      vis_st_u8(sd = vis_faligndata(sd, sd), da);
      da += 3;
      if ((mlib_addr) da <= (mlib_addr) dend) {
        vis_st_u8(sd = vis_faligndata(sd, sd), da);
        da += 3;
        if ((mlib_addr) da <= (mlib_addr) dend) {
          vis_st_u8(sd = vis_faligndata(sd, sd), da);
          da += 3;
          if ((mlib_addr) da <= (mlib_addr) dend) {
            vis_st_u8(sd = vis_faligndata(sd, sd), da);
            da += 3;
            if ((mlib_addr) da <= (mlib_addr) dend) {
              vis_st_u8(sd = vis_faligndata(sd, sd), da);
              da += 3;
              if ((mlib_addr) da <= (mlib_addr) dend) {
                vis_st_u8(sd = vis_faligndata(sd, sd), da);
              }
            }
          }
        }
      }
    }
  }
}

/***************************************************************/
/*
 * Insert one channel into a 3-channel image, 2-D, without alignment
 * restrictions: calls mlib_v_ImageChannelInsert_U8_13_D1 once per line,
 * stepping src by slb bytes and dst by dlb bytes.
 */
void mlib_v_ImageChannelInsert_U8_13(const mlib_u8 *src,
                                     mlib_s32      slb,
                                     mlib_u8       *dst,
                                     mlib_s32      dlb,
                                     mlib_s32      xsize,
                                     mlib_s32      ysize,
                                     mlib_s32      cmask)
{
  mlib_u8 *sa, *da;                     /* arguments for the current line */
  mlib_u8 *sl, *dl;                     /* start of the current src / dst line */
  mlib_s32 j;

  sa = sl = (void *)src;
  da = dl = dst;

#pragma pipeloop(0)
  for (j = 0; j < ysize; j++) {
    mlib_v_ImageChannelInsert_U8_13_D1(sa, da, xsize, cmask);
    sa = sl += slb;
    da = dl += dlb;
  }
}

/***************************************************************/
/*
 * Expands 8 source bytes (sd0) into four 8-byte words dd0..dd3 by
 * merging each half with itself twice.  Uses temporaries sda and sdb
 * that must be declared by the caller.
 */
#define INSERT_U8_14(sd0, dd0, dd1, dd2, dd3)                   \
  sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0));        \
  sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0));        \
  dd0 = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sda));        \
  dd1 = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sda));        \
  dd2 = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb));        \
  dd3 = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb))

/***************************************************************/
/*
 * Insert one channel into a 4-channel image, 1-D.
 * src and dst are accessed as aligned 8-byte words; dsize / 8 groups of
 * 8 pixels are processed, each producing four masked 8-byte stores.
 */
void mlib_v_ImageChannelInsert_U8_14_A8D1X8(const mlib_u8 *src,
                                            mlib_u8       *dst,
                                            mlib_s32      dsize,
                                            mlib_s32      cmask)
{
  mlib_d64 *sp, *dp;                    /* current 8-byte words in src / dst */
  mlib_d64 sd0;                         /* 8 source bytes */
  mlib_d64 sda, sdb;                    /* temporaries of INSERT_U8_14 */
  mlib_d64 dd0, dd1, dd2, dd3;          /* expanded destination data */
  mlib_s32 bmask;                       /* cmask repeated for 2 dst pixels */
  mlib_s32 i;

  /* repeat the 4-bit channel mask for the 2 pixels of an 8-byte store */
  bmask = cmask | (cmask << 4);

  sp = (mlib_d64 *) src;
  dp = (mlib_d64 *) dst;

#pragma pipeloop(0)
  for (i = 0; i < dsize / 8; i++) {
    sd0 = *sp++;
    INSERT_U8_14(sd0, dd0, dd1, dd2, dd3);
    vis_pst_8(dd0, dp++, bmask);
    vis_pst_8(dd1, dp++, bmask);
    vis_pst_8(dd2, dp++, bmask);
    vis_pst_8(dd3, dp++, bmask);
  }
}

/***************************************************************/
/*
 * Insert one channel into a 4-channel image, 2-D.
 * Each line processes xsize / 8 groups of 8 pixels through aligned
 * 8-byte words; slb / dlb are line strides in bytes.
 */
void mlib_v_ImageChannelInsert_U8_14_A8D2X8(const mlib_u8 *src,
                                            mlib_s32      slb,
                                            mlib_u8       *dst,
                                            mlib_s32      dlb,
                                            mlib_s32      xsize,
                                            mlib_s32      ysize,
                                            mlib_s32      cmask)
{
  mlib_d64 *sp, *dp;                    /* current 8-byte words in src / dst */
  mlib_d64 *sl, *dl;                    /* start of the current src / dst line */
  mlib_d64 sd0;                         /* 8 source bytes */
  mlib_d64 sda, sdb;                    /* temporaries of INSERT_U8_14 */
  mlib_d64 dd0, dd1, dd2, dd3;          /* expanded destination data */
  mlib_s32 bmask;                       /* cmask repeated for 2 dst pixels */
  mlib_s32 i, j;

  bmask = cmask | (cmask << 4);

  sp = sl = (mlib_d64 *) src;
  dp = dl = (mlib_d64 *) dst;

  for (j = 0; j < ysize; j++) {
#pragma pipeloop(0)
    for (i = 0; i < xsize / 8; i++) {
      sd0 = *sp++;
      INSERT_U8_14(sd0, dd0, dd1, dd2, dd3);
      vis_pst_8(dd0, dp++, bmask);
      vis_pst_8(dd1, dp++, bmask);
      vis_pst_8(dd2, dp++, bmask);
      vis_pst_8(dd3, dp++, bmask);
    }

    sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
    dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
  }
}

/***************************************************************/
/*
 * Insert one channel into a 4-channel image, 1-D, without alignment or
 * size restrictions.  Both pointers are rounded down to 8-byte
 * boundaries.  Source bytes are re-assembled from two aligned words
 * with align offset soff; when dst is not 8-byte aligned the expanded
 * words are additionally shifted with align offset -doff before being
 * stored.  Partial words at both ends are written under
 * vis_edge8() & bmask.
 */
void mlib_v_ImageChannelInsert_U8_14_D1(const mlib_u8 *src,
                                        mlib_u8       *dst,
                                        mlib_s32      dsize,
                                        mlib_s32      cmask)
{
  mlib_u8 *sa, *da;
  mlib_u8 *dend, *dend2;                /* end points in dst */
  mlib_d64 *dp;                         /* 8-byte aligned start points in dst */
  mlib_d64 *sp;                         /* 8-byte aligned start point in src */
  mlib_d64 sd0, sd1, sd;                /* 8-byte source data */
  mlib_d64 sda, sdb;
  mlib_d64 dd0, dd1, dd2, dd3, dd4;
  mlib_s32 soff;                        /* offset of address in src */
  mlib_s32 doff;                        /* offset of address in dst */
  mlib_s32 emask;                       /* edge mask */
  mlib_s32 bmask;                       /* channel mask */
  mlib_s32 i, n;

  sa = (void *)src;
  da = dst;

  /* three copies of the 4-bit mask, so that the shift below still
   * leaves a full 8-bit pattern */
  bmask = cmask | (cmask << 4) | (cmask << 8);

  /* prepare the source address */
  sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
  soff = ((mlib_addr) sa & 7);

  /* prepare the destination addresses */
  dp = (mlib_d64 *) ((mlib_addr) da & (~7));
  doff = ((mlib_addr) da & 7);
  dend = da + dsize * 4 - 1;            /* last destination byte */
  dend2 = dend - 31;                    /* last start of a full 32-byte group */

  /* line the channel pattern up with the aligned words that are stored */
  bmask = (bmask >> (doff % 4)) & 0xff;

  if (doff == 0) {                          /* dst is 8-byte aligned */

    vis_alignaddr((void *)0, soff);
    sd0 = *sp++;
    sd1 = *sp++;
    sd = vis_faligndata(sd0, sd1);          /* the intermediate is aligned */

    INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

    emask = vis_edge8(da, dend);
    vis_pst_8(dd0, dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) { /* for very small size */
      emask = vis_edge8(dp, dend);
      vis_pst_8(dd1, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd2, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(dd3, dp++, emask & bmask);
        }
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      /* number of complete 32-byte destination groups left */
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 32 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        sd0 = sd1;
        sd1 = *sp++;
        sd = vis_faligndata(sd0, sd1);

        INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

        vis_pst_8(dd0, dp++, bmask);
        vis_pst_8(dd1, dp++, bmask);
        vis_pst_8(dd2, dp++, bmask);
        vis_pst_8(dd3, dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      sd0 = sd1;
      sd1 = *sp++;
      sd = vis_faligndata(sd0, sd1);

      INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

      emask = vis_edge8(dp, dend);
      vis_pst_8(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd1, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(dd2, dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend) {
            emask = vis_edge8(dp, dend);
            vis_pst_8(dd3, dp++, emask & bmask);
          }
        }
      }
    }
  }
  else {                                    /* dst is not 8-byte aligned */
    vis_alignaddr((void *)0, soff);
    sd0 = *sp++;
    sd1 = *sp++;
    sd = vis_faligndata(sd0, sd1);          /* the intermediate is aligned */

    INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

    /* switch the align offset from the source to the destination shift */
    vis_alignaddr((void *)0, -doff);

    emask = vis_edge8(da, dend);
    vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) { /* for very small size */
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
        }
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 32 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        dd4 = dd3;                      /* last expanded word of the previous group */

        /* the align offset is shared, so it is reset for the source ... */
        vis_alignaddr((void *)0, soff);
        sd0 = sd1;
        sd1 = *sp++;
        sd = vis_faligndata(sd0, sd1);

        INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

        /* ... and again for the destination shift */
        vis_alignaddr((void *)0, -doff);
        vis_pst_8(vis_faligndata(dd4, dd0), dp++, bmask);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
        vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      dd4 = dd3;

      vis_alignaddr((void *)0, soff);
      sd0 = sd1;
      sd1 = *sp++;
      sd = vis_faligndata(sd0, sd1);

      INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

      vis_alignaddr((void *)0, -doff);
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend) {
            emask = vis_edge8(dp, dend);
            vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
          }
        }
      }
    }
  }
}

/***************************************************************/
/*
 * Insert one channel into a 4-channel image, 2-D, without alignment
 * restrictions: calls mlib_v_ImageChannelInsert_U8_14_D1 once per line,
 * stepping src by slb bytes and dst by dlb bytes.
 */
void mlib_v_ImageChannelInsert_U8_14(const mlib_u8 *src,
                                     mlib_s32      slb,
                                     mlib_u8       *dst,
                                     mlib_s32      dlb,
                                     mlib_s32      xsize,
                                     mlib_s32      ysize,
                                     mlib_s32      cmask)
{
  mlib_u8 *sa, *da;                     /* arguments for the current line */
  mlib_u8 *sl, *dl;                     /* start of the current src / dst line */
  mlib_s32 j;

  sa = sl = (void *)src;
  da = dl = dst;

#pragma pipeloop(0)
  for (j = 0; j < ysize; j++) {
    mlib_v_ImageChannelInsert_U8_14_D1(sa, da, xsize, cmask);
    sa = sl += slb;
    da = dl += dlb;
  }
}

/***************************************************************/
/*
 * Loads one aligned 8-byte word from sp and writes its four 16-bit
 * elements one at a time to da, da + channeld, ...  (da is an mlib_s16
 * pointer in the callers, so channeld counts elements).  The callers
 * set the align offset to 2 beforehand, so each vis_faligndata(sd, sd)
 * steps to the next 16-bit element.
 * Expects sp, sd and da in the caller's scope; advances sp and da.
 */
#define LOAD_INSERT_STORE_S16_1X_A8(channeld)                   \
  sd = *sp++;                                                   \
  vis_st_u16(sd = vis_faligndata(sd, sd), da); da += channeld;  \
  vis_st_u16(sd = vis_faligndata(sd, sd), da); da += channeld;  \
  vis_st_u16(sd = vis_faligndata(sd, sd), da); da += channeld;  \
  vis_st_u16(sd = vis_faligndata(sd, sd), da); da += channeld

/***************************************************************/
/*
 * Same as LOAD_INSERT_STORE_S16_1X_A8 for an unaligned source: the 8
 * source bytes are first assembled from two consecutive aligned words
 * (sd0, sd1) with the align offset set to 'off', then the offset is
 * switched to 2 for the element-by-element stores.
 * Expects sp, sd0, sd1, sd, off and da in the caller's scope; sd1 must
 * already hold the previously loaded word.
 */
#define LOAD_INSERT_STORE_S16_1X(channeld)                      \
  vis_alignaddr((void *)0, off);                                \
  sd0 = sd1;                                                    \
  sd1 = *sp++;                                                  \
  sd  = vis_faligndata(sd0, sd1);                               \
  vis_alignaddr((void *)0, 2);                                  \
  vis_st_u16(sd = vis_faligndata(sd, sd), da); da += channeld;  \
  vis_st_u16(sd = vis_faligndata(sd, sd), da); da += channeld;  \
  vis_st_u16(sd = vis_faligndata(sd, sd), da); da += channeld;  \
  vis_st_u16(sd = vis_faligndata(sd, sd), da); da += channeld

/***************************************************************/
/*
 * Insert one 16-bit channel into a 2-channel image, 1-D.
 * src is read as aligned 8-byte words and dsize / 4 groups of 4 pixels
 * are processed.  cmask must be 2 or 1; it selects destination
 * channel 0 or 1.
 */
void mlib_v_ImageChannelInsert_S16_12_A8D1X4(const mlib_s16 *src,
                                             mlib_s16       *dst,
                                             mlib_s32       dsize,
                                             mlib_s32       cmask)
{
  mlib_s16 *da;                         /* current destination element */
  mlib_d64 *sp;                         /* current 8-byte source word */
  mlib_d64 sd;                          /* 4 source elements being rotated */
  mlib_s32 i;

  sp = (mlib_d64 *) src;
  da = dst + (2 - cmask);                   /* 2,1 -> 0,1 */

  /* align offset 2: each vis_faligndata(sd, sd) steps one 16-bit element */
  vis_alignaddr((void *)0, 2);

#pragma pipeloop(0)
  for (i = 0; i < dsize / 4; i++) {
    LOAD_INSERT_STORE_S16_1X_A8(2);
  }
}

/***************************************************************/
/*
 * Insert one 16-bit channel into a 2-channel image, 2-D.
 * Each line processes xsize / 4 groups of 4 pixels from aligned 8-byte
 * source words; slb / dlb are line strides in bytes.
 * cmask must be 2 or 1; it selects destination channel 0 or 1.
 */
void mlib_v_ImageChannelInsert_S16_12_A8D2X4(const mlib_s16 *src,
                                             mlib_s32       slb,
                                             mlib_s16       *dst,
                                             mlib_s32       dlb,
                                             mlib_s32       xsize,
                                             mlib_s32       ysize,
                                             mlib_s32       cmask)
{
  mlib_s16 *da, *dl;                    /* current dst element / line start */
  mlib_d64 *sp, *sl;                    /* current src word / line start */
  mlib_d64 sd;                          /* 4 source elements being rotated */
  mlib_s32 i, j;

  sp = sl = (mlib_d64 *) src;
  da = dl = dst + (2 - cmask);              /* 2,1 -> 0,1 */

  vis_alignaddr((void *)0, 2);

  for (j = 0; j < ysize; j++) {
#pragma pipeloop(0)
    for (i = 0; i < xsize / 4; i++) {
      LOAD_INSERT_STORE_S16_1X_A8(2);
    }

    sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
    da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
  }
}

/***************************************************************/
void mlib_v_ImageChannelInsert_S16_12_D1(const mlib_s16 *src,
                                         mlib_s16       *dst,
                                         mlib_s32       dsize,
                                         mlib_s32       cmask)
{
  mlib_s16 *sa, *da;
  mlib_s16 *dend;                       /* end point in destination */
  mlib_d64 *sp;                         /* 8-byte aligned start points in src */
  mlib_d64 sd0, sd1, sd;                /* 8-byte registers for source data */
  mlib_s32 off;                         /* offset of address alignment in src */
  mlib_s32 i;

  sa = (void *)src;
  da = dst + (2 - cmask);                   /* 2,1 -> 0,1 */

  /* prepare the src address */
  sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
  off = (mlib_addr) sa & 7;

  dend = da + dsize * 2 - 1;

  sd1 = *sp++;

#pragma pipeloop(0)
  for (i = 0; i < dsize / 4; i++) {
    LOAD_INSERT_STORE_S16_1X(2);
  }

  /* right end handling */
  if ((mlib_addr) da <= (mlib_addr) dend) {

    vis_alignaddr((void *)0, off);
    sd0 = sd1;
    sd1 = *sp++;
    sd = vis_faligndata(sd0, sd1);

    vis_alignaddr((void *)0, 2);
    vis_st_u16(sd = vis_faligndata(sd, sd), da);
    da += 2;
    if ((mlib_addr) da <= (mlib_addr) dend) {
      vis_st_u16(sd = vis_faligndata(sd, sd), da);
      da += 2;
      if ((mlib_addr) da <= (mlib_addr) dend) {
vis_st_u16(sd = vis_faligndata(sd, sd), da); 1332 } 1333 } 1334 } 1335 } 1336 1337 /***************************************************************/ 1338 void mlib_v_ImageChannelInsert_S16_12(const mlib_s16 *src, 1339 mlib_s32 slb, 1340 mlib_s16 *dst, 1341 mlib_s32 dlb, 1342 mlib_s32 xsize, 1343 mlib_s32 ysize, 1344 mlib_s32 cmask) 1345 { 1346 mlib_s16 *sa, *da; 1347 mlib_s16 *sl, *dl; 1348 mlib_s32 j; 1349 1350 sa = sl = (void *)src; 1351 da = dl = dst; 1352 1353 #pragma pipeloop(0) 1354 for (j = 0; j < ysize; j++) { 1355 mlib_v_ImageChannelInsert_S16_12_D1(sa, da, xsize, cmask); 1356 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb); 1357 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb); 1358 } 1359 } 1360 1361 /***************************************************************/ 1362 void mlib_v_ImageChannelInsert_S16_13_A8D1X4(const mlib_s16 *src, 1363 mlib_s16 *dst, 1364 mlib_s32 dsize, 1365 mlib_s32 cmask) 1366 { 1367 mlib_s16 *da; 1368 mlib_d64 *sp; 1369 mlib_d64 sd; 1370 mlib_s32 i; 1371 1372 sp = (mlib_d64 *) src; 1373 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */ 1374 1375 vis_alignaddr((void *)0, 2); 1376 1377 #pragma pipeloop(0) 1378 for (i = 0; i < dsize / 4; i++) { 1379 LOAD_INSERT_STORE_S16_1X_A8(3); 1380 } 1381 } 1382 1383 /***************************************************************/ 1384 void mlib_v_ImageChannelInsert_S16_13_A8D2X4(const mlib_s16 *src, 1385 mlib_s32 slb, 1386 mlib_s16 *dst, 1387 mlib_s32 dlb, 1388 mlib_s32 xsize, 1389 mlib_s32 ysize, 1390 mlib_s32 cmask) 1391 { 1392 mlib_s16 *da, *dl; 1393 mlib_d64 *sp, *sl; 1394 mlib_d64 sd; 1395 mlib_s32 i, j; 1396 1397 sp = sl = (mlib_d64 *) src; 1398 da = dl = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */ 1399 1400 vis_alignaddr((void *)0, 2); 1401 1402 for (j = 0; j < ysize; j++) { 1403 #pragma pipeloop(0) 1404 for (i = 0; i < xsize / 4; i++) { 1405 LOAD_INSERT_STORE_S16_1X_A8(3); 1406 } 1407 1408 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 1409 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb); 1410 } 1411 
} 1412 1413 /***************************************************************/ 1414 void mlib_v_ImageChannelInsert_S16_13_D1(const mlib_s16 *src, 1415 mlib_s16 *dst, 1416 mlib_s32 dsize, 1417 mlib_s32 cmask) 1418 { 1419 mlib_s16 *sa, *da; 1420 mlib_s16 *dend; /* end point in destination */ 1421 mlib_d64 *sp; /* 8-byte aligned start points in src */ 1422 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */ 1423 mlib_s32 off; /* offset of address alignment in src */ 1424 mlib_s32 i; 1425 1426 sa = (void *)src; 1427 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */ 1428 1429 /* prepare the src address */ 1430 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 1431 off = (mlib_addr) sa & 7; 1432 1433 dend = da + dsize * 3 - 1; 1434 1435 sd1 = *sp++; 1436 1437 #pragma pipeloop(0) 1438 for (i = 0; i < dsize / 4; i++) { 1439 LOAD_INSERT_STORE_S16_1X(3); 1440 } 1441 1442 /* right end handling */ 1443 if ((mlib_addr) da <= (mlib_addr) dend) { 1444 1445 vis_alignaddr((void *)0, off); 1446 sd0 = sd1; 1447 sd1 = *sp++; 1448 sd = vis_faligndata(sd0, sd1); 1449 1450 vis_alignaddr((void *)0, 2); 1451 vis_st_u16(sd = vis_faligndata(sd, sd), da); 1452 da += 3; 1453 if ((mlib_addr) da <= (mlib_addr) dend) { 1454 vis_st_u16(sd = vis_faligndata(sd, sd), da); 1455 da += 3; 1456 if ((mlib_addr) da <= (mlib_addr) dend) { 1457 vis_st_u16(sd = vis_faligndata(sd, sd), da); 1458 } 1459 } 1460 } 1461 } 1462 1463 /***************************************************************/ 1464 void mlib_v_ImageChannelInsert_S16_13(const mlib_s16 *src, 1465 mlib_s32 slb, 1466 mlib_s16 *dst, 1467 mlib_s32 dlb, 1468 mlib_s32 xsize, 1469 mlib_s32 ysize, 1470 mlib_s32 cmask) 1471 { 1472 mlib_s16 *sa, *da; 1473 mlib_s16 *sl, *dl; 1474 mlib_s32 j; 1475 1476 sa = sl = (void *)src; 1477 da = dl = dst; 1478 1479 #pragma pipeloop(0) 1480 for (j = 0; j < ysize; j++) { 1481 mlib_v_ImageChannelInsert_S16_13_D1(sa, da, xsize, cmask); 1482 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb); 1483 da = dl = (mlib_s16 *) ((mlib_u8 
*) dl + dlb); 1484 } 1485 } 1486 1487 /***************************************************************/ 1488 #define INSERT_S16_14(sp, dp, bmask) /* channel duplicate */ \ 1489 /* obsolete: it is slower than the vis_st_u16() version*/ \ 1490 sd0 = *sp++; \ 1491 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0)); \ 1492 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0)); \ 1493 sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sda)); \ 1494 sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sda)); \ 1495 sde = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb)); \ 1496 sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb)); \ 1497 dd0 = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc)); \ 1498 dd1 = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sdd)); \ 1499 dd2 = vis_fpmerge(vis_read_hi(sde), vis_read_lo(sde)); \ 1500 dd3 = vis_fpmerge(vis_read_hi(sdf), vis_read_lo(sdf)); \ 1501 vis_pst_16(dd0, dp++, bmask); \ 1502 vis_pst_16(dd1, dp++, bmask); \ 1503 vis_pst_16(dd2, dp++, bmask); \ 1504 vis_pst_16(dd3, dp++, bmask) 1505 1506 /***************************************************************/ 1507 void mlib_v_ImageChannelInsert_S16_14_A8D1X4(const mlib_s16 *src, 1508 mlib_s16 *dst, 1509 mlib_s32 dsize, 1510 mlib_s32 cmask) 1511 { 1512 mlib_s16 *da; 1513 mlib_d64 *sp; 1514 mlib_d64 sd; 1515 mlib_s32 i; 1516 1517 sp = (mlib_d64 *) src; 1518 da = dst + (6 / cmask + 1) / 2; /* 8,4,2,1 -> 0,1,2,3 */ 1519 1520 vis_alignaddr((void *)0, 2); 1521 1522 #pragma pipeloop(0) 1523 for (i = 0; i < dsize / 4; i++) { 1524 LOAD_INSERT_STORE_S16_1X_A8(4); 1525 } 1526 } 1527 1528 /***************************************************************/ 1529 void mlib_v_ImageChannelInsert_S16_14_A8D2X4(const mlib_s16 *src, 1530 mlib_s32 slb, 1531 mlib_s16 *dst, 1532 mlib_s32 dlb, 1533 mlib_s32 xsize, 1534 mlib_s32 ysize, 1535 mlib_s32 cmask) 1536 { 1537 mlib_s16 *da, *dl; 1538 mlib_d64 *sp, *sl; 1539 mlib_d64 sd; 1540 mlib_s32 i, j; 1541 1542 sp = sl = (mlib_d64 *) src; 1543 da = dl = dst + (6 / cmask + 
1) / 2; /* 8,4,2,1 -> 0,1,2,3 */ 1544 1545 vis_alignaddr((void *)0, 2); 1546 1547 for (j = 0; j < ysize; j++) { 1548 #pragma pipeloop(0) 1549 for (i = 0; i < xsize / 4; i++) { 1550 LOAD_INSERT_STORE_S16_1X_A8(4); 1551 } 1552 1553 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb); 1554 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb); 1555 } 1556 } 1557 1558 /***************************************************************/ 1559 void mlib_v_ImageChannelInsert_S16_14_D1(const mlib_s16 *src, 1560 mlib_s16 *dst, 1561 mlib_s32 dsize, 1562 mlib_s32 cmask) 1563 { 1564 mlib_s16 *sa, *da; 1565 mlib_s16 *dend; /* end point in destination */ 1566 mlib_d64 *sp; /* 8-byte aligned start points in src */ 1567 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */ 1568 mlib_s32 off; /* offset of address alignment in src */ 1569 mlib_s32 i; 1570 1571 sa = (void *)src; 1572 da = dst + (6 / cmask + 1) / 2; /* 8,4,2,1 -> 0,1,2,3 */ 1573 1574 /* prepare the src address */ 1575 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 1576 off = (mlib_addr) sa & 7; 1577 1578 dend = da + dsize * 4 - 1; 1579 1580 sd1 = *sp++; 1581 1582 #pragma pipeloop(0) 1583 for (i = 0; i < dsize / 4; i++) { 1584 LOAD_INSERT_STORE_S16_1X(4); 1585 } 1586 1587 /* right end handling */ 1588 if ((mlib_addr) da <= (mlib_addr) dend) { 1589 1590 vis_alignaddr((void *)0, off); 1591 sd0 = sd1; 1592 sd1 = *sp++; 1593 sd = vis_faligndata(sd0, sd1); 1594 1595 vis_alignaddr((void *)0, 2); 1596 vis_st_u16(sd = vis_faligndata(sd, sd), da); 1597 da += 4; 1598 if ((mlib_addr) da <= (mlib_addr) dend) { 1599 vis_st_u16(sd = vis_faligndata(sd, sd), da); 1600 da += 4; 1601 if ((mlib_addr) da <= (mlib_addr) dend) { 1602 vis_st_u16(sd = vis_faligndata(sd, sd), da); 1603 } 1604 } 1605 } 1606 } 1607 1608 /***************************************************************/ 1609 void mlib_v_ImageChannelInsert_S16_14(const mlib_s16 *src, 1610 mlib_s32 slb, 1611 mlib_s16 *dst, 1612 mlib_s32 dlb, 1613 mlib_s32 xsize, 1614 mlib_s32 ysize, 1615 mlib_s32 
cmask) 1616 { 1617 mlib_s16 *sa, *da; 1618 mlib_s16 *sl, *dl; 1619 mlib_s32 j; 1620 1621 sa = sl = (void *)src; 1622 da = dl = dst; 1623 1624 #pragma pipeloop(0) 1625 for (j = 0; j < ysize; j++) { 1626 mlib_v_ImageChannelInsert_S16_14_D1(sa, da, xsize, cmask); 1627 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb); 1628 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb); 1629 } 1630 } 1631 1632 /***************************************************************/