1 /* 2 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 27 28 /* 29 * FUNCTIONS 30 * mlib_v_ImageChannelInsert_U8_12_D1 31 * mlib_v_ImageChannelInsert_U8_13_D1 32 * mlib_v_ImageChannelInsert_U8_14_D1 33 * 34 * ARGUMENT 35 * src pointer to source image data 36 * dst pointer to destination image data 37 * slb source image line stride in bytes 38 * dlb destination image line stride in bytes 39 * dsize image data size in pixels 40 * xsize image width in pixels 41 * ysize image height in lines 42 * cmask channel mask 43 * 44 * DESCRIPTION 45 * Copy the 1-channel source image into the selected channel 46 * of the destination image -- VIS version low level functions. 47 * 48 * NOTE 49 * These functions are separated from mlib_v_ImageChannelInsert.c 50 * for loop unrolling and structure clarity. 51 */ 52 53 #include "vis_proto.h" 54 #include "mlib_image.h" 55 #include "mlib_v_ImageChannelInsert.h" 56 57 /***************************************************************/ 58 #define INSERT_U8_12(sd0, dd0, dd1) /* channel duplicate */ \ 59 dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0)); \ 60 dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0)) 61 62 /***************************************************************/ 63 /* insert one channel to a 2-channel image. 64 */ 65 66 void mlib_v_ImageChannelInsert_U8_12_D1(const mlib_u8 *src, 67 mlib_u8 *dst, 68 mlib_s32 dsize, 69 mlib_s32 cmask) 70 { 71 mlib_u8 *sa, *da; 72 mlib_u8 *dend, *dend2; /* end points in dst */ 73 mlib_d64 *dp; /* 8-byte aligned start points in dst */ 74 mlib_d64 *sp; /* 8-byte aligned start point in src */ 75 mlib_d64 sd0, sd1; /* 8-byte source data */ 76 mlib_d64 dd0, dd1, dd2, dd3; /* 8-byte destination data */ 77 mlib_s32 soff; /* offset of address in src */ 78 mlib_s32 doff; /* offset of address in dst */ 79 mlib_s32 off; /* offset of src over dst */ 80 mlib_s32 emask; /* edge mask */ 81 mlib_s32 bmask; /* channel mask */ 82 mlib_s32 i, n; 83 84 bmask = cmask | (cmask << 2) | (cmask << 4) | (cmask << 6); 85 86 sa = (void *)src; 87 da = dst; 88 89 /* prepare the source address */ 90 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 91 soff = ((mlib_addr) sa & 7); 92 93 /* prepare the destination addresses */ 94 dp = (mlib_d64 *) ((mlib_addr) da & (~7)); 95 doff = ((mlib_addr) da & 7); 96 dend = da + dsize * 2 - 1; 97 dend2 = dend - 15; 98 99 /* calculate the src's offset over dst */ 100 off = soff * 2 - doff; 101 102 if (doff % 2 != 0) { 103 bmask = (~bmask) & 0xff; 104 } 105 106 if (off == 0) { /* src and dst have same alignment */ 107 108 /* load 8 bytes */ 109 sd0 = *sp++; 110 111 /* insert, including some garbage at the start point */ 112 INSERT_U8_12(sd0, dd0, dd1); 113 114 /* store 16 bytes result */ 115 emask = vis_edge8(da, dend); 116 vis_pst_8(dd0, dp++, emask & bmask); 117 if ((mlib_addr) dp <= (mlib_addr) dend) { 118 emask = vis_edge8(dp, dend); 119 vis_pst_8(dd1, dp++, emask & bmask); 120 } 121 122 if ((mlib_addr) dp <= (mlib_addr) dend2) { 123 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1; 124 125 /* 8-pixel column loop, emask not needed */ 126 #pragma pipeloop(0) 127 for (i = 0; i < n; i++) { 128 sd0 = *sp++; 129 INSERT_U8_12(sd0, dd0, dd1); 130 vis_pst_8(dd0, dp++, bmask); 131 vis_pst_8(dd1, dp++, bmask); 132 } 133 } 134 135 /* end point handling */ 136 if ((mlib_addr) dp <= (mlib_addr) dend) { 137 sd0 = *sp++; 138 INSERT_U8_12(sd0, dd0, dd1); 139 emask = vis_edge8(dp, dend); 140 vis_pst_8(dd0, dp++, emask & bmask); 141 if ((mlib_addr) dp <= (mlib_addr) dend) { 142 emask = vis_edge8(dp, dend); 143 vis_pst_8(dd1, dp++, emask & bmask); 144 } 145 } 146 } 147 else if (off < 0) { 148 vis_alignaddr((void *)0, off); 149 150 /* generate edge mask for the start point */ 151 emask = vis_edge8(da, dend); 152 153 /* load 8 bytes */ 154 sd0 = *sp++; 155 156 /* insert and store 16 bytes */ 157 INSERT_U8_12(sd0, dd0, dd1); 158 vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask); 159 if ((mlib_addr) dp <= (mlib_addr) dend) { 160 emask = vis_edge8(dp, dend); 161 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask); 162 } 163 164 if ((mlib_addr) dp <= (mlib_addr) dend2) { 165 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1; 166 167 /* 8-pixel column loop, emask not needed */ 168 #pragma pipeloop(0) 169 for (i = 0; i < n; i++) { 170 dd2 = dd1; 171 sd0 = *sp++; 172 INSERT_U8_12(sd0, dd0, dd1); 173 vis_pst_8(vis_faligndata(dd2, dd0), dp++, bmask); 174 vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask); 175 } 176 } 177 178 /* end point handling */ 179 if ((mlib_addr) dp <= (mlib_addr) dend) { 180 emask = vis_edge8(dp, dend); 181 dd2 = dd1; 182 sd0 = *sp++; 183 INSERT_U8_12(sd0, dd0, dd1); 184 vis_pst_8(vis_faligndata(dd2, dd0), dp++, emask & bmask); 185 if ((mlib_addr) dp <= (mlib_addr) dend) { 186 emask = vis_edge8(dp, dend); 187 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask); 188 } 189 } 190 } 191 else if (off < 8) { 192 vis_alignaddr((void *)0, off); 193 194 /* generate edge mask for the start point */ 195 emask = vis_edge8(da, dend); 196 197 /* load 16 bytes */ 198 sd0 = *sp++; 199 sd1 = *sp++; 200 201 /* insert and store 16 bytes */ 202 INSERT_U8_12(sd0, dd0, dd1); 203 INSERT_U8_12(sd1, dd2, dd3); 204 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask); 205 if ((mlib_addr) dp <= (mlib_addr) dend) { 206 emask = vis_edge8(dp, dend); 207 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask); 208 } 209 210 if ((mlib_addr) dp <= (mlib_addr) dend2) { 211 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1; 212 213 /* 8-pixel column loop, emask not needed */ 214 #pragma pipeloop(0) 215 for (i = 0; i < n; i++) { 216 dd0 = dd2; 217 dd1 = dd3; 218 sd1 = *sp++; 219 INSERT_U8_12(sd1, dd2, dd3); 220 vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask); 221 vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask); 222 } 223 } 224 225 /* end point handling */ 226 if ((mlib_addr) dp <= (mlib_addr) dend) { 227 emask = vis_edge8(dp, dend); 228 dd0 = dd2; 229 dd1 = dd3; 230 sd1 = *sp++; 231 INSERT_U8_12(sd1, dd2, dd3); 232 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask); 233 if ((mlib_addr) dp <= (mlib_addr) dend) { 234 emask = vis_edge8(dp, dend); 235 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask); 236 } 237 } 238 } 239 else { /* (off >= 8) */ 240 vis_alignaddr((void *)0, off); 241 242 /* generate edge mask for the start point */ 243 emask = vis_edge8(da, dend); 244 245 /* load 16 bytes */ 246 sd0 = *sp++; 247 sd1 = *sp++; 248 249 /* insert and store 16 bytes */ 250 INSERT_U8_12(sd0, dd0, dd1); 251 INSERT_U8_12(sd1, dd2, dd3); 252 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask); 253 if ((mlib_addr) dp <= (mlib_addr) dend) { 254 emask = vis_edge8(dp, dend); 255 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask); 256 } 257 258 if ((mlib_addr) dp <= (mlib_addr) dend2) { 259 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1; 260 261 /* 8-pixel column loop, emask not needed */ 262 #pragma pipeloop(0) 263 for (i = 0; i < n; i++) { 264 dd0 = dd2; 265 dd1 = dd3; 266 sd1 = *sp++; 267 INSERT_U8_12(sd1, dd2, dd3); 268 vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask); 269 vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask); 270 } 271 } 272 273 /* end point handling */ 274 if ((mlib_addr) dp <= (mlib_addr) dend) { 275 emask = vis_edge8(dp, dend); 276 dd0 = dd2; 277 dd1 = dd3; 278 sd1 = *sp++; 279 INSERT_U8_12(sd1, dd2, dd3); 280 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask); 281 if ((mlib_addr) dp <= (mlib_addr) dend) { 282 emask = vis_edge8(dp, dend); 283 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask); 284 } 285 } 286 } 287 } 288 289 /***************************************************************/ 290 #define LOAD_INSERT_STORE_U8(channeld) \ 291 vis_alignaddr((void *)0, off); \ 292 sd0 = sd1; \ 293 sd1 = *sp++; \ 294 sd = vis_faligndata(sd0, sd1); \ 295 vis_alignaddr((void *)0, 1); \ 296 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \ 297 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \ 298 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \ 299 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \ 300 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \ 301 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \ 302 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld; \ 303 vis_st_u8(sd = vis_faligndata(sd, sd), da); da += channeld 304 305 /***************************************************************/ 306 void mlib_v_ImageChannelInsert_U8_13_D1(const mlib_u8 *src, 307 mlib_u8 *dst, 308 mlib_s32 dsize, 309 mlib_s32 cmask) 310 { 311 mlib_u8 *sa, *da; 312 mlib_u8 *dend; /* end point in destination */ 313 mlib_d64 *sp; /* 8-byte aligned start points in src */ 314 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */ 315 mlib_s32 off; /* offset of address alignment in src */ 316 mlib_s32 i; 317 318 /* prepare the src address */ 319 sa = (void *)src; 320 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 321 off = (mlib_addr) sa & 7; 322 323 /* prepare the dst address */ 324 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */ 325 dend = da + dsize * 3 - 1; 326 327 sd1 = *sp++; 328 329 #pragma pipeloop(0) 330 for (i = 0; i < dsize / 8; i++) { 331 LOAD_INSERT_STORE_U8(3); 332 } 333 334 /* right end handling */ 335 if ((mlib_addr) da <= (mlib_addr) dend) { 336 337 vis_alignaddr((void *)0, off); 338 sd0 = sd1; 339 sd1 = *sp++; 340 sd = vis_faligndata(sd0, sd1); 341 342 vis_alignaddr((void *)0, 1); 343 vis_st_u8(sd = vis_faligndata(sd, sd), da); 344 da += 3; 345 if ((mlib_addr) da <= (mlib_addr) dend) { 346 vis_st_u8(sd = vis_faligndata(sd, sd), da); 347 da += 3; 348 if ((mlib_addr) da <= (mlib_addr) dend) { 349 vis_st_u8(sd = vis_faligndata(sd, sd), da); 350 da += 3; 351 if ((mlib_addr) da <= (mlib_addr) dend) { 352 vis_st_u8(sd = vis_faligndata(sd, sd), da); 353 da += 3; 354 if ((mlib_addr) da <= (mlib_addr) dend) { 355 vis_st_u8(sd = vis_faligndata(sd, sd), da); 356 da += 3; 357 if ((mlib_addr) da <= (mlib_addr) dend) { 358 vis_st_u8(sd = vis_faligndata(sd, sd), da); 359 da += 3; 360 if ((mlib_addr) da <= (mlib_addr) dend) { 361 vis_st_u8(sd = vis_faligndata(sd, sd), da); 362 } 363 } 364 } 365 } 366 } 367 } 368 } 369 } 370 371 /***************************************************************/ 372 #define INSERT_U8_14(sd0, dd0, dd1, dd2, dd3) \ 373 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0)); \ 374 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0)); \ 375 dd0 = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sda)); \ 376 dd1 = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sda)); \ 377 dd2 = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb)); \ 378 dd3 = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb)) 379 380 /***************************************************************/ 381 void mlib_v_ImageChannelInsert_U8_14_D1(const mlib_u8 *src, 382 mlib_u8 *dst, 383 mlib_s32 dsize, 384 mlib_s32 cmask) 385 { 386 mlib_u8 *sa, *da; 387 mlib_u8 *dend, *dend2; /* end points in dst */ 388 mlib_d64 *dp; /* 8-byte aligned start points in dst */ 389 mlib_d64 *sp; /* 8-byte aligned start point in src */ 390 mlib_d64 sd0, sd1, sd; /* 8-byte source data */ 391 mlib_d64 sda, sdb; 392 mlib_d64 dd0, dd1, dd2, dd3, dd4; 393 mlib_s32 soff; /* offset of address in src */ 394 mlib_s32 doff; /* offset of address in dst */ 395 mlib_s32 emask; /* edge mask */ 396 mlib_s32 bmask; /* channel mask */ 397 mlib_s32 i, n; 398 399 sa = (void *)src; 400 da = dst; 401 402 bmask = cmask | (cmask << 4) | (cmask << 8); 403 404 /* prepare the source address */ 405 sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); 406 soff = ((mlib_addr) sa & 7); 407 408 /* prepare the destination addresses */ 409 dp = (mlib_d64 *) ((mlib_addr) da & (~7)); 410 doff = ((mlib_addr) da & 7); 411 dend = da + dsize * 4 - 1; 412 dend2 = dend - 31; 413 414 bmask = (bmask >> (doff % 4)) & 0xff; 415 416 if (doff == 0) { /* dst is 8-byte aligned */ 417 418 vis_alignaddr((void *)0, soff); 419 sd0 = *sp++; 420 sd1 = *sp++; 421 sd = vis_faligndata(sd0, sd1); /* the intermediate is aligned */ 422 423 INSERT_U8_14(sd, dd0, dd1, dd2, dd3); 424 425 emask = vis_edge8(da, dend); 426 vis_pst_8(dd0, dp++, emask & bmask); 427 if ((mlib_addr) dp <= (mlib_addr) dend) { /* for very small size */ 428 emask = vis_edge8(dp, dend); 429 vis_pst_8(dd1, dp++, emask & bmask); 430 if ((mlib_addr) dp <= (mlib_addr) dend) { 431 emask = vis_edge8(dp, dend); 432 vis_pst_8(dd2, dp++, emask & bmask); 433 if ((mlib_addr) dp <= (mlib_addr) dend) { 434 emask = vis_edge8(dp, dend); 435 vis_pst_8(dd3, dp++, emask & bmask); 436 } 437 } 438 } 439 440 if ((mlib_addr) dp <= (mlib_addr) dend2) { 441 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 32 + 1; 442 443 /* 8-pixel column loop, emask not needed */ 444 #pragma pipeloop(0) 445 for (i = 0; i < n; i++) { 446 sd0 = sd1; 447 sd1 = *sp++; 448 sd = vis_faligndata(sd0, sd1); 449 450 INSERT_U8_14(sd, dd0, dd1, dd2, dd3); 451 452 vis_pst_8(dd0, dp++, bmask); 453 vis_pst_8(dd1, dp++, bmask); 454 vis_pst_8(dd2, dp++, bmask); 455 vis_pst_8(dd3, dp++, bmask); 456 } 457 } 458 459 /* end point handling */ 460 if ((mlib_addr) dp <= (mlib_addr) dend) { 461 sd0 = sd1; 462 sd1 = *sp++; 463 sd = vis_faligndata(sd0, sd1); 464 465 INSERT_U8_14(sd, dd0, dd1, dd2, dd3); 466 467 emask = vis_edge8(dp, dend); 468 vis_pst_8(dd0, dp++, emask & bmask); 469 if ((mlib_addr) dp <= (mlib_addr) dend) { 470 emask = vis_edge8(dp, dend); 471 vis_pst_8(dd1, dp++, emask & bmask); 472 if ((mlib_addr) dp <= (mlib_addr) dend) { 473 emask = vis_edge8(dp, dend); 474 vis_pst_8(dd2, dp++, emask & bmask); 475 if ((mlib_addr) dp <= (mlib_addr) dend) { 476 emask = vis_edge8(dp, dend); 477 vis_pst_8(dd3, dp++, emask & bmask); 478 } 479 } 480 } 481 } 482 } 483 else { /* dst is not 8-byte aligned */ 484 vis_alignaddr((void *)0, soff); 485 sd0 = *sp++; 486 sd1 = *sp++; 487 sd = vis_faligndata(sd0, sd1); /* the intermediate is aligned */ 488 489 INSERT_U8_14(sd, dd0, dd1, dd2, dd3); 490 491 vis_alignaddr((void *)0, -doff); 492 493 emask = vis_edge8(da, dend); 494 vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask); 495 if ((mlib_addr) dp <= (mlib_addr) dend) { /* for very small size */ 496 emask = vis_edge8(dp, dend); 497 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask); 498 if ((mlib_addr) dp <= (mlib_addr) dend) { 499 emask = vis_edge8(dp, dend); 500 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask); 501 if ((mlib_addr) dp <= (mlib_addr) dend) { 502 emask = vis_edge8(dp, dend); 503 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask); 504 } 505 } 506 } 507 508 if ((mlib_addr) dp <= (mlib_addr) dend2) { 509 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 32 + 1; 510 511 /* 8-pixel column loop, emask not needed */ 512 #pragma pipeloop(0) 513 for (i = 0; i < n; i++) { 514 dd4 = dd3; 515 516 vis_alignaddr((void *)0, soff); 517 sd0 = sd1; 518 sd1 = *sp++; 519 sd = vis_faligndata(sd0, sd1); 520 521 INSERT_U8_14(sd, dd0, dd1, dd2, dd3); 522 523 vis_alignaddr((void *)0, -doff); 524 vis_pst_8(vis_faligndata(dd4, dd0), dp++, bmask); 525 vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask); 526 vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask); 527 vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask); 528 } 529 } 530 531 /* end point handling */ 532 if ((mlib_addr) dp <= (mlib_addr) dend) { 533 dd4 = dd3; 534 535 vis_alignaddr((void *)0, soff); 536 sd0 = sd1; 537 sd1 = *sp++; 538 sd = vis_faligndata(sd0, sd1); 539 540 INSERT_U8_14(sd, dd0, dd1, dd2, dd3); 541 542 vis_alignaddr((void *)0, -doff); 543 emask = vis_edge8(dp, dend); 544 vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask); 545 if ((mlib_addr) dp <= (mlib_addr) dend) { 546 emask = vis_edge8(dp, dend); 547 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask); 548 if ((mlib_addr) dp <= (mlib_addr) dend) { 549 emask = vis_edge8(dp, dend); 550 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask); 551 if ((mlib_addr) dp <= (mlib_addr) dend) { 552 emask = vis_edge8(dp, dend); 553 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask); 554 } 555 } 556 } 557 } 558 } 559 } 560 561 562 /***************************************************************/