< prev index next >

modules/javafx.graphics/src/main/native-iio/libjpeg7/jfdctint.c

Print this page


   1 /*
   2  * jfdctint.c
   3  *
   4  * Copyright (C) 1991-1996, Thomas G. Lane.
   5  * Modification developed 2003-2009 by Guido Vollbeding.
   6  * This file is part of the Independent JPEG Group's software.
   7  * For conditions of distribution and use, see the accompanying README file.
   8  *
   9  * This file contains a slow-but-accurate integer implementation of the
  10  * forward DCT (Discrete Cosine Transform).
  11  *
  12  * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  13  * on each column.  Direct algorithms are also available, but they are
  14  * much more complex and seem not to be any faster when reduced to code.
  15  *
  16  * This implementation is based on an algorithm described in
  17  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  18  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  19  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  20  * The primary algorithm described there uses 11 multiplies and 29 adds.
  21  * We use their alternate method with 12 multiplies and 32 adds.
  22  * The advantage of this method is that no data path contains more than one
  23  * multiplication; this allows a very simple and accurate implementation in
  24  * scaled fixed-point arithmetic, with a minimal number of shifts.
  25  *


 148 #else
 149 #define MULTIPLY(var,const)  ((var) * (const))
 150 #endif
 151 
 152 
 153 /*
 154  * Perform the forward DCT on one block of samples.
 155  */
 156 
 157 GLOBAL(void)
 158 jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 159 {
 160   INT32 tmp0, tmp1, tmp2, tmp3;
 161   INT32 tmp10, tmp11, tmp12, tmp13;
 162   INT32 z1;
 163   DCTELEM *dataptr;
 164   JSAMPROW elemptr;
 165   int ctr;
 166   SHIFT_TEMPS
 167 
 168   /* Pass 1: process rows. */
 169   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
 170   /* furthermore, we scale the results by 2**PASS1_BITS. */


 171 
 172   dataptr = data;
 173   for (ctr = 0; ctr < DCTSIZE; ctr++) {
 174     elemptr = sample_data[ctr] + start_col;
 175 
 176     /* Even part per LL&M figure 1 --- note that published figure is faulty;
 177      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
 178      */
 179 
 180     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
 181     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
 182     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
 183     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
 184 
 185     tmp10 = tmp0 + tmp3;
 186     tmp12 = tmp0 - tmp3;
 187     tmp11 = tmp1 + tmp2;
 188     tmp13 = tmp1 - tmp2;
 189 
 190     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
 191     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
 192     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
 193     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
 194 
 195     /* Apply unsigned->signed conversion */
 196     dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
 197     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
 198 
 199     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
 200     /* Add fudge factor here for final descale. */
 201     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
 202     dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),


 203                                        CONST_BITS-PASS1_BITS);
 204     dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),

 205                                        CONST_BITS-PASS1_BITS);
 206 
 207     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
 208      * cK represents sqrt(2) * cos(K*pi/16).
 209      * i0..i3 in the paper are tmp0..tmp3 here.
 210      */
 211 
 212     tmp10 = tmp0 + tmp3;
 213     tmp11 = tmp1 + tmp2;
 214     tmp12 = tmp0 + tmp2;
 215     tmp13 = tmp1 + tmp3;

 216     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
 217     /* Add fudge factor here for final descale. */
 218     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
 219 
 220     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
 221     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
 222     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
 223     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
 224     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
 225     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
 226     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
 227     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
 228 
 229     tmp12 += z1;
 230     tmp13 += z1;
 231 
 232     dataptr[1] = (DCTELEM)
 233       RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
 234     dataptr[3] = (DCTELEM)
 235       RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
 236     dataptr[5] = (DCTELEM)
 237       RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
 238     dataptr[7] = (DCTELEM)
 239       RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);








 240 
 241     dataptr += DCTSIZE;         /* advance pointer to next row */
 242   }
 243 
 244   /* Pass 2: process columns.
 245    * We remove the PASS1_BITS scaling, but leave the results scaled up
 246    * by an overall factor of 8.

 247    */
 248 
 249   dataptr = data;
 250   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
 251     /* Even part per LL&M figure 1 --- note that published figure is faulty;
 252      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
 253      */
 254 
 255     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
 256     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
 257     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
 258     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
 259 
 260     /* Add fudge factor here for final descale. */
 261     tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
 262     tmp12 = tmp0 - tmp3;
 263     tmp11 = tmp1 + tmp2;
 264     tmp13 = tmp1 - tmp2;
 265 
 266     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
 267     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
 268     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
 269     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
 270 
 271     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
 272     dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
 273 
 274     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
 275     /* Add fudge factor here for final descale. */
 276     z1 += ONE << (CONST_BITS+PASS1_BITS-1);

 277     dataptr[DCTSIZE*2] = (DCTELEM)
 278       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);

 279     dataptr[DCTSIZE*6] = (DCTELEM)
 280       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);

 281 
 282     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
 283      * cK represents sqrt(2) * cos(K*pi/16).
 284      * i0..i3 in the paper are tmp0..tmp3 here.
 285      */
 286 
 287     tmp10 = tmp0 + tmp3;
 288     tmp11 = tmp1 + tmp2;
 289     tmp12 = tmp0 + tmp2;
 290     tmp13 = tmp1 + tmp3;

 291     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
 292     /* Add fudge factor here for final descale. */
 293     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
 294 
 295     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
 296     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
 297     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
 298     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
 299     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
 300     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
 301     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
 302     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
 303 
 304     tmp12 += z1;
 305     tmp13 += z1;
 306 
 307     dataptr[DCTSIZE*1] = (DCTELEM)
 308       RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
 309     dataptr[DCTSIZE*3] = (DCTELEM)
 310       RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
 311     dataptr[DCTSIZE*5] = (DCTELEM)
 312       RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
 313     dataptr[DCTSIZE*7] = (DCTELEM)
 314       RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);








 315 
 316     dataptr++;                  /* advance pointer to next column */
 317   }
 318 }
 319 
 320 #ifdef DCT_SCALING_SUPPORTED
 321 
 322 
 323 /*
 324  * Perform the forward DCT on a 7x7 sample block.
 325  */
 326 
 327 GLOBAL(void)
 328 jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 329 {
 330   INT32 tmp0, tmp1, tmp2, tmp3;
 331   INT32 tmp10, tmp11, tmp12;
 332   INT32 z1, z2, z3;
 333   DCTELEM *dataptr;
 334   JSAMPROW elemptr;
 335   int ctr;
 336   SHIFT_TEMPS
 337 
 338   /* Pre-zero output coefficient block. */
 339   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 340 
 341   /* Pass 1: process rows. */
 342   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
 343   /* furthermore, we scale the results by 2**PASS1_BITS. */
 344   /* cK represents sqrt(2) * cos(K*pi/14). */

 345 
 346   dataptr = data;
 347   for (ctr = 0; ctr < 7; ctr++) {
 348     elemptr = sample_data[ctr] + start_col;
 349 
 350     /* Even part */
 351 
 352     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
 353     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
 354     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
 355     tmp3 = GETJSAMPLE(elemptr[3]);
 356 
 357     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
 358     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
 359     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
 360 
 361     z1 = tmp0 + tmp2;
 362     /* Apply unsigned->signed conversion */
 363     dataptr[0] = (DCTELEM)
 364       ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
 365     tmp3 += tmp3;
 366     z1 -= tmp3;
 367     z1 -= tmp3;
 368     z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
 369     z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
 370     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
 371     dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
 372     z1 -= z2;
 373     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
 374     dataptr[4] = (DCTELEM)
 375       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
 376               CONST_BITS-PASS1_BITS);
 377     dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
 378 
 379     /* Odd part */
 380 
 381     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
 382     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */


 455 }
 456 
 457 
 458 /*
 459  * Perform the forward DCT on a 6x6 sample block.
 460  */
 461 
 462 GLOBAL(void)
 463 jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 464 {
 465   INT32 tmp0, tmp1, tmp2;
 466   INT32 tmp10, tmp11, tmp12;
 467   DCTELEM *dataptr;
 468   JSAMPROW elemptr;
 469   int ctr;
 470   SHIFT_TEMPS
 471 
 472   /* Pre-zero output coefficient block. */
 473   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 474 
 475   /* Pass 1: process rows. */
 476   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
 477   /* furthermore, we scale the results by 2**PASS1_BITS. */
 478   /* cK represents sqrt(2) * cos(K*pi/12). */

 479 
 480   dataptr = data;
 481   for (ctr = 0; ctr < 6; ctr++) {
 482     elemptr = sample_data[ctr] + start_col;
 483 
 484     /* Even part */
 485 
 486     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
 487     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
 488     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
 489 
 490     tmp10 = tmp0 + tmp2;
 491     tmp12 = tmp0 - tmp2;
 492 
 493     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
 494     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
 495     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
 496 
 497     /* Apply unsigned->signed conversion */
 498     dataptr[0] = (DCTELEM)
 499       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
 500     dataptr[2] = (DCTELEM)
 501       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
 502               CONST_BITS-PASS1_BITS);
 503     dataptr[4] = (DCTELEM)
 504       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
 505               CONST_BITS-PASS1_BITS);
 506 
 507     /* Odd part */
 508 
 509     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
 510                     CONST_BITS-PASS1_BITS);
 511 
 512     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
 513     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
 514     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
 515 
 516     dataptr += DCTSIZE;         /* advance pointer to next row */
 517   }


 568 }
 569 
 570 
 571 /*
 572  * Perform the forward DCT on a 5x5 sample block.
 573  */
 574 
 575 GLOBAL(void)
 576 jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 577 {
 578   INT32 tmp0, tmp1, tmp2;
 579   INT32 tmp10, tmp11;
 580   DCTELEM *dataptr;
 581   JSAMPROW elemptr;
 582   int ctr;
 583   SHIFT_TEMPS
 584 
 585   /* Pre-zero output coefficient block. */
 586   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 587 
 588   /* Pass 1: process rows. */
 589   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
 590   /* furthermore, we scale the results by 2**PASS1_BITS. */
 591   /* We scale the results further by 2 as part of output adaption */
 592   /* scaling for different DCT size. */
 593   /* cK represents sqrt(2) * cos(K*pi/10). */

 594 
 595   dataptr = data;
 596   for (ctr = 0; ctr < 5; ctr++) {
 597     elemptr = sample_data[ctr] + start_col;
 598 
 599     /* Even part */
 600 
 601     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
 602     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
 603     tmp2 = GETJSAMPLE(elemptr[2]);
 604 
 605     tmp10 = tmp0 + tmp1;
 606     tmp11 = tmp0 - tmp1;
 607 
 608     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
 609     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
 610 
 611     /* Apply unsigned->signed conversion */
 612     dataptr[0] = (DCTELEM)
 613       ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
 614     tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
 615     tmp10 -= tmp2 << 2;
 616     tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
 617     dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
 618     dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
 619 
 620     /* Odd part */
 621 
 622     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
 623 
 624     dataptr[1] = (DCTELEM)
 625       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
 626               CONST_BITS-PASS1_BITS-1);
 627     dataptr[3] = (DCTELEM)
 628       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
 629               CONST_BITS-PASS1_BITS-1);
 630 
 631     dataptr += DCTSIZE;         /* advance pointer to next row */


 678 }
 679 
 680 
 681 /*
 682  * Perform the forward DCT on a 4x4 sample block.
 683  */
 684 
 685 GLOBAL(void)
 686 jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 687 {
 688   INT32 tmp0, tmp1;
 689   INT32 tmp10, tmp11;
 690   DCTELEM *dataptr;
 691   JSAMPROW elemptr;
 692   int ctr;
 693   SHIFT_TEMPS
 694 
 695   /* Pre-zero output coefficient block. */
 696   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 697 
 698   /* Pass 1: process rows. */
 699   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
 700   /* furthermore, we scale the results by 2**PASS1_BITS. */
 701   /* We must also scale the output by (8/4)**2 = 2**2, which we add here. */
 702   /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */

 703 
 704   dataptr = data;
 705   for (ctr = 0; ctr < 4; ctr++) {
 706     elemptr = sample_data[ctr] + start_col;
 707 
 708     /* Even part */
 709 
 710     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
 711     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
 712 
 713     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
 714     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
 715 
 716     /* Apply unsigned->signed conversion */
 717     dataptr[0] = (DCTELEM)
 718       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
 719     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
 720 
 721     /* Odd part */
 722 
 723     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
 724     /* Add fudge factor here for final descale. */
 725     tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
 726 
 727     dataptr[1] = (DCTELEM)
 728       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
 729                   CONST_BITS-PASS1_BITS-2);
 730     dataptr[3] = (DCTELEM)
 731       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
 732                   CONST_BITS-PASS1_BITS-2);
 733 
 734     dataptr += DCTSIZE;         /* advance pointer to next row */
 735   }
 736 
 737   /* Pass 2: process columns.
 738    * We remove the PASS1_BITS scaling, but leave the results scaled up
 739    * by an overall factor of 8.

 740    */
 741 
 742   dataptr = data;
 743   for (ctr = 0; ctr < 4; ctr++) {
 744     /* Even part */
 745 
 746     /* Add fudge factor here for final descale. */
 747     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
 748     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
 749 
 750     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
 751     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
 752 
 753     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
 754     dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
 755 
 756     /* Odd part */
 757 
 758     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
 759     /* Add fudge factor here for final descale. */


 770   }
 771 }
 772 
 773 
 774 /*
 775  * Perform the forward DCT on a 3x3 sample block.
 776  */
 777 
 778 GLOBAL(void)
 779 jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 780 {
 781   INT32 tmp0, tmp1, tmp2;
 782   DCTELEM *dataptr;
 783   JSAMPROW elemptr;
 784   int ctr;
 785   SHIFT_TEMPS
 786 
 787   /* Pre-zero output coefficient block. */
 788   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 789 
 790   /* Pass 1: process rows. */
 791   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
 792   /* furthermore, we scale the results by 2**PASS1_BITS. */
 793   /* We scale the results further by 2**2 as part of output adaption */
 794   /* scaling for different DCT size. */
 795   /* cK represents sqrt(2) * cos(K*pi/6). */

 796 
 797   dataptr = data;
 798   for (ctr = 0; ctr < 3; ctr++) {
 799     elemptr = sample_data[ctr] + start_col;
 800 
 801     /* Even part */
 802 
 803     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
 804     tmp1 = GETJSAMPLE(elemptr[1]);
 805 
 806     tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
 807 
 808     /* Apply unsigned->signed conversion */
 809     dataptr[0] = (DCTELEM)
 810       ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
 811     dataptr[2] = (DCTELEM)
 812       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
 813               CONST_BITS-PASS1_BITS-2);
 814 
 815     /* Odd part */
 816 
 817     dataptr[1] = (DCTELEM)
 818       DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
 819               CONST_BITS-PASS1_BITS-2);
 820 
 821     dataptr += DCTSIZE;         /* advance pointer to next row */
 822   }
 823 
 824   /* Pass 2: process columns.
 825    * We remove the PASS1_BITS scaling, but leave the results scaled up
 826    * by an overall factor of 8.
 827    * We must also scale the output by (8/3)**2 = 64/9, which we partially
 828    * fold into the constant multipliers (other part was done in pass 1):


 846               CONST_BITS+PASS1_BITS);
 847 
 848     /* Odd part */
 849 
 850     dataptr[DCTSIZE*1] = (DCTELEM)
 851       DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
 852               CONST_BITS+PASS1_BITS);
 853 
 854     dataptr++;                  /* advance pointer to next column */
 855   }
 856 }
 857 
 858 
 859 /*
 860  * Perform the forward DCT on a 2x2 sample block.
 861  */
 862 
 863 GLOBAL(void)
 864 jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 865 {
 866   INT32 tmp0, tmp1, tmp2, tmp3;
 867   JSAMPROW elemptr;
 868 
 869   /* Pre-zero output coefficient block. */
 870   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 871 
 872   /* Pass 1: process rows. */
 873   /* Note results are scaled up by sqrt(8) compared to a true DCT. */

 874 
 875   /* Row 0 */
 876   elemptr = sample_data[0] + start_col;
 877 
 878   tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
 879   tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
 880 
 881   /* Row 1 */
 882   elemptr = sample_data[1] + start_col;
 883 
 884   tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
 885   tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
 886 
 887   /* Pass 2: process columns.
 888    * We leave the results scaled up by an overall factor of 8.
 889    * We must also scale the output by (8/2)**2 = 2**4.
 890    */
 891 
 892   /* Column 0 */
 893   /* Apply unsigned->signed conversion */
 894   data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4);
 895   data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
 896 
 897   /* Column 1 */
 898   data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
 899   data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
 900 }
 901 
 902 
 903 /*
 904  * Perform the forward DCT on a 1x1 sample block.
 905  */
 906 
 907 GLOBAL(void)
 908 jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 909 {


 910   /* Pre-zero output coefficient block. */
 911   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 912 


 913   /* We leave the result scaled up by an overall factor of 8. */
 914   /* We must also scale the output by (8/1)**2 = 2**6. */
 915   /* Apply unsigned->signed conversion */
 916   data[0] = (DCTELEM)
 917     ((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
 918 }
 919 
 920 
 921 /*
 922  * Perform the forward DCT on a 9x9 sample block.
 923  */
 924 
 925 GLOBAL(void)
 926 jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 927 {
 928   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
 929   INT32 tmp10, tmp11, tmp12, tmp13;
 930   INT32 z1, z2;
 931   DCTELEM workspace[8];
 932   DCTELEM *dataptr;
 933   DCTELEM *wsptr;
 934   JSAMPROW elemptr;
 935   int ctr;
 936   SHIFT_TEMPS
 937 
 938   /* Pass 1: process rows. */
 939   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
 940   /* we scale the results further by 2 as part of output adaption */
 941   /* scaling for different DCT size. */
 942   /* cK represents sqrt(2) * cos(K*pi/18). */

 943 
 944   dataptr = data;
 945   ctr = 0;
 946   for (;;) {
 947     elemptr = sample_data[ctr] + start_col;
 948 
 949     /* Even part */
 950 
 951     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
 952     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
 953     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
 954     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
 955     tmp4 = GETJSAMPLE(elemptr[4]);
 956 
 957     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
 958     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
 959     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
 960     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
 961 
 962     z1 = tmp0 + tmp2 + tmp3;
 963     z2 = tmp1 + tmp4;
 964     /* Apply unsigned->signed conversion */
 965     dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
 966     dataptr[6] = (DCTELEM)
 967       DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)),  /* c6 */
 968               CONST_BITS-1);
 969     z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049));        /* c2 */
 970     z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
 971     dataptr[2] = (DCTELEM)
 972       DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441))    /* c4 */
 973               + z1 + z2, CONST_BITS-1);
 974     dataptr[4] = (DCTELEM)
 975       DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608))    /* c8 */
 976               + z1 - z2, CONST_BITS-1);
 977 
 978     /* Odd part */
 979 
 980     dataptr[3] = (DCTELEM)
 981       DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
 982               CONST_BITS-1);
 983 
 984     tmp11 = MULTIPLY(tmp11, FIX(1.224744871));        /* c3 */


1067   }
1068 }
1069 
1070 
1071 /*
1072  * Perform the forward DCT on a 10x10 sample block.
1073  */
1074 
1075 GLOBAL(void)
1076 jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1077 {
1078   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1079   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1080   DCTELEM workspace[8*2];
1081   DCTELEM *dataptr;
1082   DCTELEM *wsptr;
1083   JSAMPROW elemptr;
1084   int ctr;
1085   SHIFT_TEMPS
1086 
1087   /* Pass 1: process rows. */
1088   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
1089   /* we scale the results further by 2 as part of output adaption */
1090   /* scaling for different DCT size. */
1091   /* cK represents sqrt(2) * cos(K*pi/20). */

1092 
1093   dataptr = data;
1094   ctr = 0;
1095   for (;;) {
1096     elemptr = sample_data[ctr] + start_col;
1097 
1098     /* Even part */
1099 
1100     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
1101     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
1102     tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
1103     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
1104     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
1105 
1106     tmp10 = tmp0 + tmp4;
1107     tmp13 = tmp0 - tmp4;
1108     tmp11 = tmp1 + tmp3;
1109     tmp14 = tmp1 - tmp3;
1110 
1111     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
1112     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
1113     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
1114     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
1115     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
1116 
1117     /* Apply unsigned->signed conversion */
1118     dataptr[0] = (DCTELEM)
1119       ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
1120     tmp12 += tmp12;
1121     dataptr[4] = (DCTELEM)
1122       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
1123               MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
1124               CONST_BITS-1);
1125     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
1126     dataptr[2] = (DCTELEM)
1127       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
1128               CONST_BITS-1);
1129     dataptr[6] = (DCTELEM)
1130       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
1131               CONST_BITS-1);
1132 
1133     /* Odd part */
1134 
1135     tmp10 = tmp0 + tmp4;
1136     tmp11 = tmp1 - tmp3;
1137     dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);


1231 }
1232 
1233 
1234 /*
1235  * Perform the forward DCT on an 11x11 sample block.
1236  */
1237 
1238 GLOBAL(void)
1239 jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1240 {
1241   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1242   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1243   INT32 z1, z2, z3;
1244   DCTELEM workspace[8*3];
1245   DCTELEM *dataptr;
1246   DCTELEM *wsptr;
1247   JSAMPROW elemptr;
1248   int ctr;
1249   SHIFT_TEMPS
1250 
1251   /* Pass 1: process rows. */
1252   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
1253   /* we scale the results further by 2 as part of output adaption */
1254   /* scaling for different DCT size. */
1255   /* cK represents sqrt(2) * cos(K*pi/22). */

1256 
1257   dataptr = data;
1258   ctr = 0;
1259   for (;;) {
1260     elemptr = sample_data[ctr] + start_col;
1261 
1262     /* Even part */
1263 
1264     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
1265     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
1266     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
1267     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
1268     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
1269     tmp5 = GETJSAMPLE(elemptr[5]);
1270 
1271     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
1272     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
1273     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
1274     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
1275     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
1276 
1277     /* Apply unsigned->signed conversion */
1278     dataptr[0] = (DCTELEM)
1279       ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
1280     tmp5 += tmp5;
1281     tmp0 -= tmp5;
1282     tmp1 -= tmp5;
1283     tmp2 -= tmp5;
1284     tmp3 -= tmp5;
1285     tmp4 -= tmp5;
1286     z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) +       /* c2 */
1287          MULTIPLY(tmp2 + tmp4, FIX(0.201263574));        /* c10 */
1288     z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931));        /* c6 */
1289     z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156));        /* c4 */
1290     dataptr[2] = (DCTELEM)
1291       DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
1292               - MULTIPLY(tmp4, FIX(1.390975730)),        /* c4+c10 */
1293               CONST_BITS-1);
1294     dataptr[4] = (DCTELEM)
1295       DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
1296               - MULTIPLY(tmp2, FIX(1.356927976))         /* c2 */
1297               + MULTIPLY(tmp4, FIX(0.587485545)),        /* c8 */


1413   }
1414 }
1415 
1416 
1417 /*
1418  * Perform the forward DCT on a 12x12 sample block.
      *
      * data:        coefficient buffer; pass 1 starts writing rows at data[0].
      * sample_data: array of sample rows, indexed by the row counter ctr.
      * start_col:   offset added to each row pointer; 12 consecutive samples
      *              are read from that position.
      *
      * NOTE(review): presumably pass-1 rows beyond the eighth are written to
      * the local workspace[] buffer (compare jpeg_fdct_16x16) -- confirm.
1419  */
1420 
1421 GLOBAL(void)
1422 jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1423 {
1424   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1425   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1426   DCTELEM workspace[8*4];       /* 32 elements; NOTE(review): presumably pass-1 rows 8..11 -- confirm */
1427   DCTELEM *dataptr;
1428   DCTELEM *wsptr;
1429   JSAMPROW elemptr;
1430   int ctr;
1431   SHIFT_TEMPS
1432 
1433   /* Pass 1: process rows. */
1434   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
1435   /* cK represents sqrt(2) * cos(K*pi/24). */

1436 
1437   dataptr = data;
1438   ctr = 0;
1439   for (;;) {                    /* ctr is the index of the input row being transformed */
1440     elemptr = sample_data[ctr] + start_col;
1441 
1442     /* Even part */
1443 
1444     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);   /* sums of mirrored pairs (k, 11-k) */
1445     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
1446     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
1447     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
1448     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
1449     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
1450 
1451     tmp10 = tmp0 + tmp5;        /* second stage: sums in tmp10..12, differences in tmp13..15 */
1452     tmp13 = tmp0 - tmp5;
1453     tmp11 = tmp1 + tmp4;
1454     tmp14 = tmp1 - tmp4;
1455     tmp12 = tmp2 + tmp3;
1456     tmp15 = tmp2 - tmp3;
1457 
1458     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);   /* differences of the same pairs, for the odd part */
1459     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
1460     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
1461     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
1462     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
1463     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
1464 
1465     /* Apply unsigned->signed conversion */
1466     dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);   /* sum of all 12 samples, re-centred */
1467     dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
1468     dataptr[4] = (DCTELEM)
1469       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
1470               CONST_BITS);
1471     dataptr[2] = (DCTELEM)
1472       DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
1473               CONST_BITS);
1474 
1475     /* Odd part */
1476 
1477     tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
1478     tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
1479     tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
1480     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
1481     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
1482     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
1483             + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
1484     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
1485     tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */


1579 }
1580 
1581 
1582 /*
1583  * Perform the forward DCT on a 13x13 sample block.
      *
      * data:        coefficient buffer; pass 1 starts writing rows at data[0].
      * sample_data: array of sample rows, indexed by the row counter ctr.
      * start_col:   offset added to each row pointer; 13 consecutive samples
      *              are read from that position.
      *
      * NOTE(review): presumably pass-1 rows beyond the eighth are written to
      * the local workspace[] buffer (compare jpeg_fdct_16x16) -- confirm.
1584  */
1585 
1586 GLOBAL(void)
1587 jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1588 {
1589   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1590   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1591   INT32 z1, z2;
1592   DCTELEM workspace[8*5];       /* 40 elements; NOTE(review): presumably pass-1 rows 8..12 -- confirm */
1593   DCTELEM *dataptr;
1594   DCTELEM *wsptr;
1595   JSAMPROW elemptr;
1596   int ctr;
1597   SHIFT_TEMPS
1598 
1599   /* Pass 1: process rows. */
1600   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
1601   /* cK represents sqrt(2) * cos(K*pi/26). */

1602 
1603   dataptr = data;
1604   ctr = 0;
1605   for (;;) {                    /* ctr is the index of the input row being transformed */
1606     elemptr = sample_data[ctr] + start_col;
1607 
1608     /* Even part */
1609 
1610     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);   /* sums of mirrored pairs (k, 12-k) */
1611     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
1612     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
1613     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
1614     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
1615     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
1616     tmp6 = GETJSAMPLE(elemptr[6]);                             /* unpaired centre sample */
1617 
1618     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);  /* differences of the same pairs */
1619     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
1620     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
1621     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
1622     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
1623     tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
1624 
1625     /* Apply unsigned->signed conversion */
1626     dataptr[0] = (DCTELEM)
1627       (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);   /* sum of all 13 samples, re-centred */
1628     tmp6 += tmp6;               /* tmp6 = 2 * centre sample, removed from every pair sum below */
1629     tmp0 -= tmp6;
1630     tmp1 -= tmp6;
1631     tmp2 -= tmp6;
1632     tmp3 -= tmp6;
1633     tmp4 -= tmp6;
1634     tmp5 -= tmp6;
1635     dataptr[2] = (DCTELEM)
1636       DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) +   /* c2 */
1637               MULTIPLY(tmp1, FIX(1.058554052)) +   /* c6 */
1638               MULTIPLY(tmp2, FIX(0.501487041)) -   /* c10 */
1639               MULTIPLY(tmp3, FIX(0.170464608)) -   /* c12 */
1640               MULTIPLY(tmp4, FIX(0.803364869)) -   /* c8 */
1641               MULTIPLY(tmp5, FIX(1.252223920)),    /* c4 */
1642               CONST_BITS);
1643     z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
1644          MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
1645          MULTIPLY(tmp1 - tmp5, FIX(0.316450131));  /* (c8-c12)/2 */


1777   }
1778 }
1779 
1780 
1781 /*
1782  * Perform the forward DCT on a 14x14 sample block.
      *
      * data:        coefficient buffer; pass 1 starts writing rows at data[0].
      * sample_data: array of sample rows, indexed by the row counter ctr.
      * start_col:   offset added to each row pointer; 14 consecutive samples
      *              are read from that position.
      *
      * NOTE(review): presumably pass-1 rows beyond the eighth are written to
      * the local workspace[] buffer (compare jpeg_fdct_16x16) -- confirm.
1783  */
1784 
1785 GLOBAL(void)
1786 jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1787 {
1788   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1789   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1790   DCTELEM workspace[8*6];       /* 48 elements; NOTE(review): presumably pass-1 rows 8..13 -- confirm */
1791   DCTELEM *dataptr;
1792   DCTELEM *wsptr;
1793   JSAMPROW elemptr;
1794   int ctr;
1795   SHIFT_TEMPS
1796 
1797   /* Pass 1: process rows. */
1798   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
1799   /* cK represents sqrt(2) * cos(K*pi/28). */

1800 
1801   dataptr = data;
1802   ctr = 0;
1803   for (;;) {                    /* ctr is the index of the input row being transformed */
1804     elemptr = sample_data[ctr] + start_col;
1805 
1806     /* Even part */
1807 
1808     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);   /* sums of mirrored pairs (k, 13-k) */
1809     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
1810     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
1811     tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);  /* middle pair sum is stored directly in tmp13 */
1812     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
1813     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
1814     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
1815 
1816     tmp10 = tmp0 + tmp6;        /* second stage: sums in tmp10..12, differences in tmp14..16 */
1817     tmp14 = tmp0 - tmp6;
1818     tmp11 = tmp1 + tmp5;
1819     tmp15 = tmp1 - tmp5;
1820     tmp12 = tmp2 + tmp4;
1821     tmp16 = tmp2 - tmp4;
1822 
1823     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);   /* differences of the same pairs */
1824     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
1825     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
1826     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
1827     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
1828     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
1829     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
1830 
1831     /* Apply unsigned->signed conversion */
1832     dataptr[0] = (DCTELEM)
1833       (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);    /* sum of all 14 samples, re-centred */
1834     tmp13 += tmp13;             /* tmp13 = 2 * middle pair sum for the terms below */
1835     dataptr[4] = (DCTELEM)
1836       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
1837               MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
1838               MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
1839               CONST_BITS);
1840 
1841     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
1842 
1843     dataptr[2] = (DCTELEM)
1844       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
1845               + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
1846               CONST_BITS);
1847     dataptr[6] = (DCTELEM)
1848       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
1849               - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
1850               CONST_BITS);
1851 


1978 }
1979 
1980 
1981 /*
1982  * Perform the forward DCT on a 15x15 sample block.
      *
      * data:        coefficient buffer; pass 1 starts writing rows at data[0].
      * sample_data: array of sample rows, indexed by the row counter ctr.
      * start_col:   offset added to each row pointer; 15 consecutive samples
      *              are read from that position.
      *
      * NOTE(review): presumably pass-1 rows beyond the eighth are written to
      * the local workspace[] buffer (compare jpeg_fdct_16x16) -- confirm.
1983  */
1984 
1985 GLOBAL(void)
1986 jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1987 {
1988   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1989   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1990   INT32 z1, z2, z3;
1991   DCTELEM workspace[8*7];       /* 56 elements; NOTE(review): presumably pass-1 rows 8..14 -- confirm */
1992   DCTELEM *dataptr;
1993   DCTELEM *wsptr;
1994   JSAMPROW elemptr;
1995   int ctr;
1996   SHIFT_TEMPS
1997 
1998   /* Pass 1: process rows. */
1999   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
2000   /* cK represents sqrt(2) * cos(K*pi/30). */

2001 
2002   dataptr = data;
2003   ctr = 0;
2004   for (;;) {                    /* ctr is the index of the input row being transformed */
2005     elemptr = sample_data[ctr] + start_col;
2006 
2007     /* Even part */
2008 
2009     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);   /* sums of mirrored pairs (k, 14-k) */
2010     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
2011     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
2012     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
2013     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
2014     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
2015     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
2016     tmp7 = GETJSAMPLE(elemptr[7]);                             /* unpaired centre sample */
2017 
2018     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);  /* differences of the same pairs */
2019     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
2020     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
2021     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
2022     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
2023     tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
2024     tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
2025 
2026     z1 = tmp0 + tmp4 + tmp5;    /* z1 + z2 + z3 together cover all 15 samples */
2027     z2 = tmp1 + tmp3 + tmp6;
2028     z3 = tmp2 + tmp7;
2029     /* Apply unsigned->signed conversion */
2030     dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
2031     z3 += z3;                   /* z3 is doubled before being subtracted from z1 and z2 */
2032     dataptr[6] = (DCTELEM)
2033       DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
2034               MULTIPLY(z2 - z3, FIX(0.437016024)),  /* c12 */
2035               CONST_BITS);
2036     tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;     /* tmp2 now also carries (tmp1+tmp4)/2 - 2*tmp7 */
2037     z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) -  /* c2+c14 */
2038          MULTIPLY(tmp6 - tmp2, FIX(2.238241955));   /* c4+c8 */
2039     z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) -  /* c8-c14 */
2040          MULTIPLY(tmp0 - tmp2, FIX(0.091361227));   /* c2-c4 */
2041     z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) +  /* c2 */
2042          MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) +  /* c8 */
2043          MULTIPLY(tmp1 - tmp4, FIX(0.790569415));   /* (c6+c12)/2 */
2044 
2045     dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
2046     dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
2047 
2048     /* Odd part */
2049 


2156   }
2157 }
2158 
2159 
2160 /*
2161  * Perform the forward DCT on a 16x16 sample block.
      *
      * data:        coefficient buffer; pass 1 writes input rows 0..7 here.
      * sample_data: array of sample rows, indexed by the row counter ctr.
      * start_col:   offset added to each row pointer; 16 consecutive samples
      *              are read from that position.
      *
      * Pass 1 transforms 16 input rows: the first DCTSIZE rows go to data,
      * the remaining DCTSIZE rows go to the local workspace[] buffer.
      * Pass 2 then reads each column from both buffers.
2162  */
2163 
2164 GLOBAL(void)
2165 jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2166 {
2167   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2168   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2169   DCTELEM workspace[DCTSIZE2];  /* receives the pass-1 output once ctr reaches DCTSIZE */
2170   DCTELEM *dataptr;
2171   DCTELEM *wsptr;
2172   JSAMPROW elemptr;
2173   int ctr;
2174   SHIFT_TEMPS
2175 
2176   /* Pass 1: process rows. */
2177   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
2178   /* furthermore, we scale the results by 2**PASS1_BITS. */
2179   /* cK represents sqrt(2) * cos(K*pi/32). */

2180 
2181   dataptr = data;
2182   ctr = 0;
2183   for (;;) {                    /* exits via the break below when ctr == DCTSIZE * 2 */
2184     elemptr = sample_data[ctr] + start_col;
2185 
2186     /* Even part */
2187 
2188     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);   /* sums of mirrored pairs (k, 15-k) */
2189     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2190     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2191     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2192     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2193     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2194     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2195     tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2196 
2197     tmp10 = tmp0 + tmp7;        /* second stage: sums in tmp10..13, differences in tmp14..17 */
2198     tmp14 = tmp0 - tmp7;
2199     tmp11 = tmp1 + tmp6;
2200     tmp15 = tmp1 - tmp6;
2201     tmp12 = tmp2 + tmp5;
2202     tmp16 = tmp2 - tmp5;
2203     tmp13 = tmp3 + tmp4;
2204     tmp17 = tmp3 - tmp4;
2205 
2206     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);   /* differences of the same pairs */
2207     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2208     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2209     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2210     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2211     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2212     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2213     tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2214 
2215     /* Apply unsigned->signed conversion */
2216     dataptr[0] = (DCTELEM)
2217       ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);   /* sum of all 16 samples, re-centred */
2218     dataptr[4] = (DCTELEM)
2219       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2220               MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2221               CONST_BITS-PASS1_BITS);
2222 
2223     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2224             MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2225 
2226     dataptr[2] = (DCTELEM)
2227       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2228               + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
2229               CONST_BITS-PASS1_BITS);
2230     dataptr[6] = (DCTELEM)
2231       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2232               - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2233               CONST_BITS-PASS1_BITS);
2234 
2235     /* Odd part */


2258 
2259     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2260     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2261     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2262     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2263 
2264     ctr++;
2265 
      /* Rows 0..DCTSIZE-1 land in data; at row DCTSIZE the output pointer
       * moves to workspace, and the loop stops after row 2*DCTSIZE-1.
       */
2266     if (ctr != DCTSIZE) {
2267       if (ctr == DCTSIZE * 2)
2268         break;                  /* Done. */
2269       dataptr += DCTSIZE;       /* advance pointer to next row */
2270     } else
2271       dataptr = workspace;      /* switch pointer to extended workspace */
2272   }
2273 
2274   /* Pass 2: process columns.
2275    * We remove the PASS1_BITS scaling, but leave the results scaled up
2276    * by an overall factor of 8.
2277    * We must also scale the output by (8/16)**2 = 1/2**2.

2278    */
2279 
2280   dataptr = data;
2281   wsptr = workspace;
2282   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2283     /* Even part */
2284 
2285     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];   /* row k of data is paired with row 7-k of workspace */
2286     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
2287     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
2288     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
2289     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
2290     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
2291     tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
2292     tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
2293 
2294     tmp10 = tmp0 + tmp7;
2295     tmp14 = tmp0 - tmp7;
2296     tmp11 = tmp1 + tmp6;
2297     tmp15 = tmp1 - tmp6;


2363 }
2364 
2365 
2366 /*
2367  * Perform the forward DCT on a 16x8 sample block.
2368  *
2369  * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
      *
      * data:        coefficient buffer; pass 1 writes DCTSIZE rows into it and
      *              pass 2 transforms its columns in place.
      * sample_data: array of sample rows; rows 0..DCTSIZE-1 are read.
      * start_col:   offset added to each row pointer; 16 consecutive samples
      *              are read from that position.
2370  */
2371 
2372 GLOBAL(void)
2373 jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2374 {
2375   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2376   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2377   INT32 z1;
2378   DCTELEM *dataptr;
2379   JSAMPROW elemptr;
2380   int ctr;
2381   SHIFT_TEMPS
2382 
2383   /* Pass 1: process rows. */
2384   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
2385   /* furthermore, we scale the results by 2**PASS1_BITS. */
2386   /* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */

2387 
2388   dataptr = data;
2389   ctr = 0;                      /* NOTE(review): redundant, the for-initialiser below assigns ctr again */
2390   for (ctr = 0; ctr < DCTSIZE; ctr++) {
2391     elemptr = sample_data[ctr] + start_col;
2392 
2393     /* Even part */
2394 
2395     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);   /* sums of mirrored pairs (k, 15-k) */
2396     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2397     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2398     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2399     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2400     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2401     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2402     tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2403 
2404     tmp10 = tmp0 + tmp7;        /* second stage: sums in tmp10..13, differences in tmp14..17 */
2405     tmp14 = tmp0 - tmp7;
2406     tmp11 = tmp1 + tmp6;
2407     tmp15 = tmp1 - tmp6;
2408     tmp12 = tmp2 + tmp5;
2409     tmp16 = tmp2 - tmp5;
2410     tmp13 = tmp3 + tmp4;
2411     tmp17 = tmp3 - tmp4;
2412 
2413     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);   /* differences of the same pairs */
2414     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2415     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2416     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2417     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2418     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2419     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2420     tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2421 
2422     /* Apply unsigned->signed conversion */
2423     dataptr[0] = (DCTELEM)
2424       ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);   /* sum of all 16 samples, re-centred */
2425     dataptr[4] = (DCTELEM)
2426       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2427               MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2428               CONST_BITS-PASS1_BITS);
2429 
2430     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2431             MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2432 
2433     dataptr[2] = (DCTELEM)
2434       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2435               + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
2436               CONST_BITS-PASS1_BITS);
2437     dataptr[6] = (DCTELEM)
2438       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2439               - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2440               CONST_BITS-PASS1_BITS);
2441 
2442     /* Odd part */


2458             MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2459     tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2460              - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2461     tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2462              + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2463     tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2464              + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2465 
2466     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2467     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2468     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2469     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2470 
2471     dataptr += DCTSIZE;         /* advance pointer to next row */
2472   }
2473 
2474   /* Pass 2: process columns.
2475    * We remove the PASS1_BITS scaling, but leave the results scaled up
2476    * by an overall factor of 8.
2477    * We must also scale the output by 8/16 = 1/2.

2478    */
2479 
2480   dataptr = data;
2481   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2482     /* Even part per LL&M figure 1 --- note that published figure is faulty;
2483      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
2484      */
2485 
2486     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];   /* sums of mirrored rows (k, 7-k) within this column */
2487     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
2488     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
2489     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
2490 
2491     tmp10 = tmp0 + tmp3;
2492     tmp12 = tmp0 - tmp3;
2493     tmp11 = tmp1 + tmp2;
2494     tmp13 = tmp1 - tmp2;
2495 
2496     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];   /* differences of the same rows, for the odd part */
2497     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
2498     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
2499     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
2500 
2501     dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);   /* the extra +1 is the 1/2 output scaling noted above */
2502     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
2503 
2504     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
2505     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),

2506                                            CONST_BITS+PASS1_BITS+1);
2507     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),

2508                                            CONST_BITS+PASS1_BITS+1);
2509 
2510     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
2511      * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2512      * i0..i3 in the paper are tmp0..tmp3 here.
2513      */
2514 
2515     tmp10 = tmp0 + tmp3;
2516     tmp11 = tmp1 + tmp2;
2517     tmp12 = tmp0 + tmp2;
2518     tmp13 = tmp1 + tmp3;

2519     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */




2520 

2521     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
2522     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
2523     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
2524     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
2525     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
2526     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
2527     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
2528     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
2529 
2530     tmp12 += z1;
2531     tmp13 += z1;



2532 
2533     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
2534                                            CONST_BITS+PASS1_BITS+1);
2535     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
2536                                            CONST_BITS+PASS1_BITS+1);
2537     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
2538                                            CONST_BITS+PASS1_BITS+1);
2539     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
2540                                            CONST_BITS+PASS1_BITS+1);
2541 
2542     dataptr++;                  /* advance pointer to next column */
2543   }
2544 }
2545 
2546 
2547 /*
2548  * Perform the forward DCT on a 14x7 sample block.
2549  *
2550  * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
2551  */
2552 
2553 GLOBAL(void)
2554 jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2555 {
2556   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
2557   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2558   INT32 z1, z2, z3;
2559   DCTELEM *dataptr;
2560   JSAMPROW elemptr;
2561   int ctr;
2562   SHIFT_TEMPS
2563 
2564   /* Zero bottom row of output coefficient block. */
2565   MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
2566 
2567   /* Pass 1: process rows. */
2568   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
2569   /* furthermore, we scale the results by 2**PASS1_BITS. */
2570   /* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). */

2571 
2572   dataptr = data;
2573   for (ctr = 0; ctr < 7; ctr++) {
2574     elemptr = sample_data[ctr] + start_col;
2575 
2576     /* Even part */
2577 
2578     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
2579     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
2580     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
2581     tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
2582     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
2583     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
2584     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
2585 
2586     tmp10 = tmp0 + tmp6;
2587     tmp14 = tmp0 - tmp6;
2588     tmp11 = tmp1 + tmp5;
2589     tmp15 = tmp1 - tmp5;
2590     tmp12 = tmp2 + tmp4;
2591     tmp16 = tmp2 - tmp4;
2592 
2593     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
2594     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
2595     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
2596     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
2597     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
2598     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
2599     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
2600 
2601     /* Apply unsigned->signed conversion */
2602     dataptr[0] = (DCTELEM)
2603       ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
2604     tmp13 += tmp13;
2605     dataptr[4] = (DCTELEM)
2606       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
2607               MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
2608               MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
2609               CONST_BITS-PASS1_BITS);
2610 
2611     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
2612 
2613     dataptr[2] = (DCTELEM)
2614       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
2615               + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
2616               CONST_BITS-PASS1_BITS);
2617     dataptr[6] = (DCTELEM)
2618       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
2619               - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
2620               CONST_BITS-PASS1_BITS);
2621 


2710 
2711 /*
2712  * Perform the forward DCT on a 12x6 sample block.
2713  *
2714  * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
2715  */
2716 
2717 GLOBAL(void)
2718 jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2719 {
2720   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2721   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2722   DCTELEM *dataptr;
2723   JSAMPROW elemptr;
2724   int ctr;
2725   SHIFT_TEMPS
2726 
2727   /* Zero 2 bottom rows of output coefficient block. */
2728   MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
2729 
2730   /* Pass 1: process rows. */
2731   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
2732   /* furthermore, we scale the results by 2**PASS1_BITS. */
2733   /* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). */

2734 
2735   dataptr = data;
2736   for (ctr = 0; ctr < 6; ctr++) {
2737     elemptr = sample_data[ctr] + start_col;
2738 
2739     /* Even part */
2740 
2741     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
2742     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
2743     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
2744     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
2745     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
2746     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
2747 
2748     tmp10 = tmp0 + tmp5;
2749     tmp13 = tmp0 - tmp5;
2750     tmp11 = tmp1 + tmp4;
2751     tmp14 = tmp1 - tmp4;
2752     tmp12 = tmp2 + tmp3;
2753     tmp15 = tmp2 - tmp3;
2754 
2755     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
2756     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
2757     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
2758     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
2759     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
2760     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
2761 
2762     /* Apply unsigned->signed conversion */
2763     dataptr[0] = (DCTELEM)
2764       ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
2765     dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
2766     dataptr[4] = (DCTELEM)
2767       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
2768               CONST_BITS-PASS1_BITS);
2769     dataptr[2] = (DCTELEM)
2770       DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
2771               CONST_BITS-PASS1_BITS);
2772 
2773     /* Odd part */
2774 
2775     tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
2776     tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
2777     tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
2778     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
2779     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
2780     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
2781             + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
2782     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */


2849 
2850 /*
2851  * Perform the forward DCT on a 10x5 sample block.
2852  *
2853  * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
2854  */
2855 
2856 GLOBAL(void)
2857 jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2858 {
2859   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
2860   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
2861   DCTELEM *dataptr;
2862   JSAMPROW elemptr;
2863   int ctr;
2864   SHIFT_TEMPS
2865 
2866   /* Zero 3 bottom rows of output coefficient block. */
2867   MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
2868 
2869   /* Pass 1: process rows. */
2870   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
2871   /* furthermore, we scale the results by 2**PASS1_BITS. */
2872   /* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). */

2873 
2874   dataptr = data;
2875   for (ctr = 0; ctr < 5; ctr++) {
2876     elemptr = sample_data[ctr] + start_col;
2877 
2878     /* Even part */
2879 
2880     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
2881     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
2882     tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
2883     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
2884     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
2885 
2886     tmp10 = tmp0 + tmp4;
2887     tmp13 = tmp0 - tmp4;
2888     tmp11 = tmp1 + tmp3;
2889     tmp14 = tmp1 - tmp3;
2890 
2891     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
2892     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
2893     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
2894     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
2895     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
2896 
2897     /* Apply unsigned->signed conversion */
2898     dataptr[0] = (DCTELEM)
2899       ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
2900     tmp12 += tmp12;
2901     dataptr[4] = (DCTELEM)
2902       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
2903               MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
2904               CONST_BITS-PASS1_BITS);
2905     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
2906     dataptr[2] = (DCTELEM)
2907       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
2908               CONST_BITS-PASS1_BITS);
2909     dataptr[6] = (DCTELEM)
2910       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
2911               CONST_BITS-PASS1_BITS);
2912 
2913     /* Odd part */
2914 
2915     tmp10 = tmp0 + tmp4;
2916     tmp11 = tmp1 - tmp3;
2917     dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);


2982 /*
2983  * Perform the forward DCT on an 8x4 sample block.
2984  *
2985  * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
2986  */
2987 
2988 GLOBAL(void)
2989 jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2990 {
2991   INT32 tmp0, tmp1, tmp2, tmp3;
2992   INT32 tmp10, tmp11, tmp12, tmp13;
2993   INT32 z1;
2994   DCTELEM *dataptr;
2995   JSAMPROW elemptr;
2996   int ctr;
2997   SHIFT_TEMPS
2998 
2999   /* Zero 4 bottom rows of output coefficient block. */
3000   MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
3001 
3002   /* Pass 1: process rows. */
3003   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
3004   /* furthermore, we scale the results by 2**PASS1_BITS. */
3005   /* We must also scale the output by 8/4 = 2, which we add here. */


3006 
3007   dataptr = data;
3008   for (ctr = 0; ctr < 4; ctr++) {
3009     elemptr = sample_data[ctr] + start_col;
3010 
3011     /* Even part per LL&M figure 1 --- note that published figure is faulty;
3012      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
3013      */
3014 
3015     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3016     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3017     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3018     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3019 
3020     tmp10 = tmp0 + tmp3;
3021     tmp12 = tmp0 - tmp3;
3022     tmp11 = tmp1 + tmp2;
3023     tmp13 = tmp1 - tmp2;
3024 
3025     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3026     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3027     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3028     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3029 
3030     /* Apply unsigned->signed conversion */
3031     dataptr[0] = (DCTELEM)
3032       ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
3033     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
3034 
3035     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
3036     /* Add fudge factor here for final descale. */
3037     z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3038     dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),


3039                                        CONST_BITS-PASS1_BITS-1);
3040     dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),

3041                                        CONST_BITS-PASS1_BITS-1);
3042 
3043     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3044      * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3045      * i0..i3 in the paper are tmp0..tmp3 here.
3046      */
3047 
3048     tmp10 = tmp0 + tmp3;
3049     tmp11 = tmp1 + tmp2;
3050     tmp12 = tmp0 + tmp2;
3051     tmp13 = tmp1 + tmp3;

3052     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
3053     /* Add fudge factor here for final descale. */
3054     z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3055 
3056     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
3057     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
3058     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
3059     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
3060     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
3061     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
3062     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
3063     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
3064 
3065     tmp12 += z1;
3066     tmp13 += z1;
3067 
3068     dataptr[1] = (DCTELEM)
3069       RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
3070     dataptr[3] = (DCTELEM)
3071       RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
3072     dataptr[5] = (DCTELEM)
3073       RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
3074     dataptr[7] = (DCTELEM)
3075       RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);








3076 
3077     dataptr += DCTSIZE;         /* advance pointer to next row */
3078   }
3079 
3080   /* Pass 2: process columns.
3081    * We remove the PASS1_BITS scaling, but leave the results scaled up
3082    * by an overall factor of 8.
3083    * 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).

3084    */
3085 
3086   dataptr = data;
3087   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3088     /* Even part */
3089 
3090     /* Add fudge factor here for final descale. */
3091     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
3092     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
3093 
3094     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
3095     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
3096 
3097     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3098     dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3099 
3100     /* Odd part */
3101 
3102     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);   /* c6 */
3103     /* Add fudge factor here for final descale. */


3117 
3118 /*
3119  * Perform the forward DCT on a 6x3 sample block.
3120  *
3121  * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
3122  */
3123 
3124 GLOBAL(void)
3125 jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3126 {
3127   INT32 tmp0, tmp1, tmp2;
3128   INT32 tmp10, tmp11, tmp12;
3129   DCTELEM *dataptr;
3130   JSAMPROW elemptr;
3131   int ctr;
3132   SHIFT_TEMPS
3133 
3134   /* Pre-zero output coefficient block. */
3135   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3136 
3137   /* Pass 1: process rows. */
3138   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
3139   /* furthermore, we scale the results by 2**PASS1_BITS. */
3140   /* We scale the results further by 2 as part of output adaption */
3141   /* scaling for different DCT size. */
3142   /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */

3143 
3144   dataptr = data;
3145   for (ctr = 0; ctr < 3; ctr++) {
3146     elemptr = sample_data[ctr] + start_col;
3147 
3148     /* Even part */
3149 
3150     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3151     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3152     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3153 
3154     tmp10 = tmp0 + tmp2;
3155     tmp12 = tmp0 - tmp2;
3156 
3157     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3158     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3159     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3160 
3161     /* Apply unsigned->signed conversion */
3162     dataptr[0] = (DCTELEM)
3163       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
3164     dataptr[2] = (DCTELEM)
3165       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
3166               CONST_BITS-PASS1_BITS-1);
3167     dataptr[4] = (DCTELEM)
3168       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3169               CONST_BITS-PASS1_BITS-1);
3170 
3171     /* Odd part */
3172 
3173     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
3174                     CONST_BITS-PASS1_BITS-1);
3175 
3176     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
3177     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
3178     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
3179 
3180     dataptr += DCTSIZE;         /* advance pointer to next row */
3181   }


3217 
3218 /*
3219  * Perform the forward DCT on a 4x2 sample block.
3220  *
3221  * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
3222  */
3223 
3224 GLOBAL(void)
3225 jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3226 {
3227   INT32 tmp0, tmp1;
3228   INT32 tmp10, tmp11;
3229   DCTELEM *dataptr;
3230   JSAMPROW elemptr;
3231   int ctr;
3232   SHIFT_TEMPS
3233 
3234   /* Pre-zero output coefficient block. */
3235   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3236 
3237   /* Pass 1: process rows. */
3238   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
3239   /* furthermore, we scale the results by 2**PASS1_BITS. */
3240   /* We must also scale the output by (8/4)*(8/2) = 2**3, which we add here. */
3241   /* 4-point FDCT kernel, */
3242   /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */

3243 
3244   dataptr = data;
3245   for (ctr = 0; ctr < 2; ctr++) {
3246     elemptr = sample_data[ctr] + start_col;
3247 
3248     /* Even part */
3249 
3250     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3251     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3252 
3253     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3254     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3255 
3256     /* Apply unsigned->signed conversion */
3257     dataptr[0] = (DCTELEM)
3258       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
3259     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
3260 
3261     /* Odd part */
3262 
3263     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
3264     /* Add fudge factor here for final descale. */
3265     tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
3266 
3267     dataptr[1] = (DCTELEM)
3268       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3269                   CONST_BITS-PASS1_BITS-3);
3270     dataptr[3] = (DCTELEM)
3271       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3272                   CONST_BITS-PASS1_BITS-3);
3273 
3274     dataptr += DCTSIZE;         /* advance pointer to next row */
3275   }
3276 


3290     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3291 
3292     /* Odd part */
3293 
3294     dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3295 
3296     dataptr++;                  /* advance pointer to next column */
3297   }
3298 }
3299 
3300 
3301 /*
3302  * Perform the forward DCT on a 2x1 sample block.
3303  *
3304  * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
      *
      * data:        output coefficient block.  All DCTSIZE2 entries are
      *              cleared first; only data[0] and data[1] are then written,
      *              so every other coefficient is left at zero.
      * sample_data: array of sample rows; only row 0 is read.
      * start_col:   index within that row of the first of the two samples.
      *
      * No separate column pass is coded: with a single row the only work
      * left is the scaling, which is folded into the shifts below.
3305  */
3306 
3307 GLOBAL(void)
3308 jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3309 {
3310   INT32 tmp0, tmp1;
3311   JSAMPROW elemptr;
3312 
3313   /* Pre-zero output coefficient block. */
3314   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3315 
       /* Point at the two input samples: row 0, columns start_col and
        * start_col+1.
        */
3316   elemptr = sample_data[0] + start_col;
3317 
3318   tmp0 = GETJSAMPLE(elemptr[0]);
3319   tmp1 = GETJSAMPLE(elemptr[1]);
3320 
3321   /* We leave the results scaled up by an overall factor of 8.
3322    * We must also scale the output by (8/2)*(8/1) = 2**5.
3323    */
3324 
3325   /* Even part */
3326   /* Apply unsigned->signed conversion */
       /* The conversion is the subtraction of CENTERJSAMPLE once per sample
        * (two samples here) from the sum.
        * NOTE(review): the shifted operand is negative whenever the sum is
        * below 2 * CENTERJSAMPLE (and, for the odd part, whenever
        * tmp1 > tmp0).  ISO C does not define left shifts of negative values,
        * so this presumably relies on the compiler treating "<< 5" as
        * multiplication by 32 -- confirm for the supported toolchains.
        */
3327   data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);

3328 
3329   /* Odd part */
       /* The CENTERJSAMPLE offsets cancel in the difference, so no level
        * shift is needed here.
        */
3330   data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);

3331 }
3332 
3333 
3334 /*
3335  * Perform the forward DCT on an 8x16 sample block.
3336  *
3337  * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
3338  */
3339 
3340 GLOBAL(void)
3341 jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3342 {
3343   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3344   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
3345   INT32 z1;
3346   DCTELEM workspace[DCTSIZE2];
3347   DCTELEM *dataptr;
3348   DCTELEM *wsptr;
3349   JSAMPROW elemptr;
3350   int ctr;
3351   SHIFT_TEMPS
3352 
3353   /* Pass 1: process rows. */
3354   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
3355   /* furthermore, we scale the results by 2**PASS1_BITS. */


3356 
3357   dataptr = data;
3358   ctr = 0;
3359   for (;;) {
3360     elemptr = sample_data[ctr] + start_col;
3361 
3362     /* Even part per LL&M figure 1 --- note that published figure is faulty;
3363      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
3364      */
3365 
3366     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3367     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3368     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3369     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3370 
3371     tmp10 = tmp0 + tmp3;
3372     tmp12 = tmp0 - tmp3;
3373     tmp11 = tmp1 + tmp2;
3374     tmp13 = tmp1 - tmp2;
3375 
3376     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3377     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3378     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3379     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3380 
3381     /* Apply unsigned->signed conversion */
3382     dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
3383     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
3384 
3385     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
3386     dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),

3387                                    CONST_BITS-PASS1_BITS);
3388     dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),

3389                                    CONST_BITS-PASS1_BITS);
3390 
3391     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3392      * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3393      * i0..i3 in the paper are tmp0..tmp3 here.
3394      */
3395 
3396     tmp10 = tmp0 + tmp3;
3397     tmp11 = tmp1 + tmp2;
3398     tmp12 = tmp0 + tmp2;
3399     tmp13 = tmp1 + tmp3;

3400     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */




3401 

3402     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
3403     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
3404     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
3405     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
3406     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
3407     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
3408     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
3409     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
3410 
3411     tmp12 += z1;
3412     tmp13 += z1;



3413 
3414     dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
3415     dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
3416     dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
3417     dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
3418 
3419     ctr++;
3420 
3421     if (ctr != DCTSIZE) {
3422       if (ctr == DCTSIZE * 2)
3423         break;                  /* Done. */
3424       dataptr += DCTSIZE;       /* advance pointer to next row */
3425     } else
3426       dataptr = workspace;      /* switch pointer to extended workspace */
3427   }
3428 
3429   /* Pass 2: process columns.
3430    * We remove the PASS1_BITS scaling, but leave the results scaled up
3431    * by an overall factor of 8.
3432    * We must also scale the output by 8/16 = 1/2.
3433    * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3434    */
3435 
3436   dataptr = data;
3437   wsptr = workspace;


3524  *
3525  * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
3526  */
3527 
3528 GLOBAL(void)
3529 jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3530 {
3531   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
3532   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3533   INT32 z1, z2, z3;
3534   DCTELEM workspace[8*6];
3535   DCTELEM *dataptr;
3536   DCTELEM *wsptr;
3537   JSAMPROW elemptr;
3538   int ctr;
3539   SHIFT_TEMPS
3540 
3541   /* Pre-zero output coefficient block. */
3542   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3543 
3544   /* Pass 1: process rows. */
3545   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
3546   /* furthermore, we scale the results by 2**PASS1_BITS. */
3547   /* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */

3548 
3549   dataptr = data;
3550   ctr = 0;
3551   for (;;) {
3552     elemptr = sample_data[ctr] + start_col;
3553 
3554     /* Even part */
3555 
3556     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
3557     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
3558     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
3559     tmp3 = GETJSAMPLE(elemptr[3]);
3560 
3561     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
3562     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
3563     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
3564 
3565     z1 = tmp0 + tmp2;
3566     /* Apply unsigned->signed conversion */
3567     dataptr[0] = (DCTELEM)
3568       ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
3569     tmp3 += tmp3;
3570     z1 -= tmp3;
3571     z1 -= tmp3;
3572     z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
3573     z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
3574     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
3575     dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
3576     z1 -= z2;
3577     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
3578     dataptr[4] = (DCTELEM)
3579       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
3580               CONST_BITS-PASS1_BITS);
3581     dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
3582 
3583     /* Odd part */
3584 
3585     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
3586     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */


3704  * Perform the forward DCT on a 6x12 sample block.
3705  *
3706  * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
3707  */
3708 
3709 GLOBAL(void)
3710 jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3711 {
3712   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3713   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3714   DCTELEM workspace[8*4];
3715   DCTELEM *dataptr;
3716   DCTELEM *wsptr;
3717   JSAMPROW elemptr;
3718   int ctr;
3719   SHIFT_TEMPS
3720 
3721   /* Pre-zero output coefficient block. */
3722   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3723 
3724   /* Pass 1: process rows. */
3725   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
3726   /* furthermore, we scale the results by 2**PASS1_BITS. */
3727   /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */

3728 
3729   dataptr = data;
3730   ctr = 0;
3731   for (;;) {
3732     elemptr = sample_data[ctr] + start_col;
3733 
3734     /* Even part */
3735 
3736     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3737     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3738     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3739 
3740     tmp10 = tmp0 + tmp2;
3741     tmp12 = tmp0 - tmp2;
3742 
3743     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3744     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3745     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3746 
3747     /* Apply unsigned->signed conversion */
3748     dataptr[0] = (DCTELEM)
3749       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
3750     dataptr[2] = (DCTELEM)
3751       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
3752               CONST_BITS-PASS1_BITS);
3753     dataptr[4] = (DCTELEM)
3754       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3755               CONST_BITS-PASS1_BITS);
3756 
3757     /* Odd part */
3758 
3759     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
3760                     CONST_BITS-PASS1_BITS);
3761 
3762     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
3763     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
3764     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
3765 
3766     ctr++;
3767 


3853  * Perform the forward DCT on a 5x10 sample block.
3854  *
3855  * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
3856  */
3857 
3858 GLOBAL(void)
3859 jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3860 {
3861   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
3862   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3863   DCTELEM workspace[8*2];
3864   DCTELEM *dataptr;
3865   DCTELEM *wsptr;
3866   JSAMPROW elemptr;
3867   int ctr;
3868   SHIFT_TEMPS
3869 
3870   /* Pre-zero output coefficient block. */
3871   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3872 
3873   /* Pass 1: process rows. */
3874   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
3875   /* furthermore, we scale the results by 2**PASS1_BITS. */
3876   /* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). */

3877 
3878   dataptr = data;
3879   ctr = 0;
3880   for (;;) {
3881     elemptr = sample_data[ctr] + start_col;
3882 
3883     /* Even part */
3884 
3885     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
3886     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
3887     tmp2 = GETJSAMPLE(elemptr[2]);
3888 
3889     tmp10 = tmp0 + tmp1;
3890     tmp11 = tmp0 - tmp1;
3891 
3892     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
3893     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
3894 
3895     /* Apply unsigned->signed conversion */
3896     dataptr[0] = (DCTELEM)
3897       ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
3898     tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
3899     tmp10 -= tmp2 << 2;
3900     tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
3901     dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
3902     dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
3903 
3904     /* Odd part */
3905 
3906     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
3907 
3908     dataptr[1] = (DCTELEM)
3909       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
3910               CONST_BITS-PASS1_BITS);
3911     dataptr[3] = (DCTELEM)
3912       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
3913               CONST_BITS-PASS1_BITS);
3914 
3915     ctr++;


3998 /*
3999  * Perform the forward DCT on a 4x8 sample block.
4000  *
4001  * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
      *
      * data:        output coefficient block of DCTSIZE2 entries, addressed
      *              here with a row stride of DCTSIZE.  It is cleared first;
      *              pass 1 writes columns 0..3 of DCTSIZE rows and pass 2
      *              transforms those same four columns in place, so columns
      *              4..7 of every row are left at zero.
      * sample_data: array of sample rows; rows 0..DCTSIZE-1 are read.
      * start_col:   index within each row of the first of the four samples.
4002  */
4003 
4004 GLOBAL(void)
4005 jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4006 {
4007   INT32 tmp0, tmp1, tmp2, tmp3;
4008   INT32 tmp10, tmp11, tmp12, tmp13;
4009   INT32 z1;
4010   DCTELEM *dataptr;
4011   JSAMPROW elemptr;
4012   int ctr;
4013   SHIFT_TEMPS
4014 
4015   /* Pre-zero output coefficient block. */
4016   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4017 
4018   /* Pass 1: process rows. */
4019   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
4020   /* furthermore, we scale the results by 2**PASS1_BITS. */
4021   /* We must also scale the output by 8/4 = 2, which we add here. */
4022   /* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */


4023 
4024   dataptr = data;
4025   for (ctr = 0; ctr < DCTSIZE; ctr++) {
4026     elemptr = sample_data[ctr] + start_col;
4027 
4028     /* Even part */
4029 
         /* Butterfly of the four samples: sums feed the even outputs,
          * differences feed the odd outputs.
          */
4030     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
4031     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
4032 
4033     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
4034     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
4035 
4036     /* Apply unsigned->signed conversion */
         /* CENTERJSAMPLE is subtracted once per sample (four samples), and
          * only in the DC term; it cancels in every difference.  The extra
          * "+1" in the shift count is the factor of 2 announced above.
          * NOTE(review): the shifted operands here can be negative; ISO C
          * does not define left shifts of negative values, so this presumably
          * relies on the compiler treating the shift as a multiplication --
          * confirm for the supported toolchains.
          */
4037     dataptr[0] = (DCTELEM)
4038       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
4039     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
4040 
4041     /* Odd part */
4042 
4043     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
4044     /* Add fudge factor here for final descale. */
         /* The term added is 2**(CONST_BITS-PASS1_BITS-2), i.e. half of the
          * 2**(CONST_BITS-PASS1_BITS-1) removed by the RIGHT_SHIFT calls
          * below.  Adding it once to the shared product rounds both outputs
          * (assuming RIGHT_SHIFT(x, n) is an arithmetic shift by n -- its
          * definition is outside this listing).
          */
4045     tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
4046 
4047     dataptr[1] = (DCTELEM)
4048       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4049                   CONST_BITS-PASS1_BITS-1);
4050     dataptr[3] = (DCTELEM)
4051       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4052                   CONST_BITS-PASS1_BITS-1);
4053 
4054     dataptr += DCTSIZE;         /* advance pointer to next row */
4055   }
4056 
4057   /* Pass 2: process columns.
4058    * We remove the PASS1_BITS scaling, but leave the results scaled up
4059    * by an overall factor of 8.

4060    */
4061 
       /* Only the four columns written by pass 1 are transformed; each
        * column is read and overwritten in place across all eight rows.
        */
4062   dataptr = data;
4063   for (ctr = 0; ctr < 4; ctr++) {
4064     /* Even part per LL&M figure 1 --- note that published figure is faulty;
4065      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
4066      */
4067 
4068     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
4069     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
4070     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
4071     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
4072 
4073     /* Add fudge factor here for final descale. */
         /* tmp10 appears in both "tmp10 + tmp11" and "tmp10 - tmp11" below,
          * so the rounding term 2**(PASS1_BITS-1) added here reaches both the
          * row-0 and row-4 outputs.
          */
4074     tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
4075     tmp12 = tmp0 - tmp3;
4076     tmp11 = tmp1 + tmp2;
4077     tmp13 = tmp1 - tmp2;
4078 
         /* The differences must be taken before rows 0 and 4 are overwritten
          * just below, because they read the same column entries.
          */
4079     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
4080     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
4081     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
4082     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
4083 
4084     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
4085     dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
4086 
4087     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
4088     /* Add fudge factor here for final descale. */
         /* Half of the 2**(CONST_BITS+PASS1_BITS) removed by the shifts
          * below; z1 is shared by the row-2 and row-6 outputs.
          */
4089     z1 += ONE << (CONST_BITS+PASS1_BITS-1);

4090     dataptr[DCTSIZE*2] = (DCTELEM)
4091       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);

4092     dataptr[DCTSIZE*6] = (DCTELEM)
4093       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);

4094 
4095     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
4096      * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4097      * i0..i3 in the paper are tmp0..tmp3 here.
4098      */
4099 
4100     tmp10 = tmp0 + tmp3;
4101     tmp11 = tmp1 + tmp2;
4102     tmp12 = tmp0 + tmp2;
4103     tmp13 = tmp1 + tmp3;

4104     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
4105     /* Add fudge factor here for final descale. */
4106     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4107 
4108     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
4109     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
4110     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
4111     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
4112     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
4113     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
4114     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
4115     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
4116 
         /* Fold z1 (including its rounding term) into tmp12 and tmp13.  Each
          * of the four odd outputs below uses exactly one of those two, so
          * each output is rounded exactly once.
          */
4117     tmp12 += z1;
4118     tmp13 += z1;
4119 
4120     dataptr[DCTSIZE*1] = (DCTELEM)
4121       RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
4122     dataptr[DCTSIZE*3] = (DCTELEM)
4123       RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
4124     dataptr[DCTSIZE*5] = (DCTELEM)
4125       RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
4126     dataptr[DCTSIZE*7] = (DCTELEM)
4127       RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);








4128 
4129     dataptr++;                  /* advance pointer to next column */
4130   }
4131 }
4132 
4133 
4134 /*
4135  * Perform the forward DCT on a 3x6 sample block.
4136  *
4137  * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
4138  */
4139 
4140 GLOBAL(void)
4141 jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4142 {
4143   INT32 tmp0, tmp1, tmp2;
4144   INT32 tmp10, tmp11, tmp12;
4145   DCTELEM *dataptr;
4146   JSAMPROW elemptr;
4147   int ctr;
4148   SHIFT_TEMPS
4149 
4150   /* Pre-zero output coefficient block. */
4151   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4152 
4153   /* Pass 1: process rows. */
4154   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
4155   /* furthermore, we scale the results by 2**PASS1_BITS. */
4156   /* We scale the results further by 2 as part of output adaption */
4157   /* scaling for different DCT size. */
4158   /* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */

4159 
4160   dataptr = data;
4161   for (ctr = 0; ctr < 6; ctr++) {
4162     elemptr = sample_data[ctr] + start_col;
4163 
4164     /* Even part */
4165 
4166     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
4167     tmp1 = GETJSAMPLE(elemptr[1]);
4168 
4169     tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
4170 
4171     /* Apply unsigned->signed conversion */
4172     dataptr[0] = (DCTELEM)
4173       ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
4174     dataptr[2] = (DCTELEM)
4175       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
4176               CONST_BITS-PASS1_BITS-1);
4177 
4178     /* Odd part */
4179 
4180     dataptr[1] = (DCTELEM)
4181       DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
4182               CONST_BITS-PASS1_BITS-1);
4183 
4184     dataptr += DCTSIZE;         /* advance pointer to next row */
4185   }
4186 
4187   /* Pass 2: process columns.
4188    * We remove the PASS1_BITS scaling, but leave the results scaled up
4189    * by an overall factor of 8.
4190    * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
4191    * fold into the constant multipliers (other part was done in pass 1):


4238 
4239 /*
4240  * Perform the forward DCT on a 2x4 sample block.
4241  *
4242  * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
4243  */
4244 
4245 GLOBAL(void)
4246 jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4247 {
4248   INT32 tmp0, tmp1;
4249   INT32 tmp10, tmp11;
4250   DCTELEM *dataptr;
4251   JSAMPROW elemptr;
4252   int ctr;
4253   SHIFT_TEMPS
4254 
4255   /* Pre-zero output coefficient block. */
4256   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4257 
4258   /* Pass 1: process rows. */
4259   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
4260   /* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here. */

4261 
4262   dataptr = data;
4263   for (ctr = 0; ctr < 4; ctr++) {
4264     elemptr = sample_data[ctr] + start_col;
4265 
4266     /* Even part */
4267 
4268     tmp0 = GETJSAMPLE(elemptr[0]);
4269     tmp1 = GETJSAMPLE(elemptr[1]);
4270 
4271     /* Apply unsigned->signed conversion */
4272     dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
4273 
4274     /* Odd part */
4275 
4276     dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
4277 
4278     dataptr += DCTSIZE;         /* advance pointer to next row */
4279   }
4280 
4281   /* Pass 2: process columns.
4282    * We leave the results scaled up by an overall factor of 8.
4283    * 4-point FDCT kernel,
4284    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4285    */
4286 
4287   dataptr = data;
4288   for (ctr = 0; ctr < 2; ctr++) {
4289     /* Even part */
4290 
4291     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];


4307       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4308                   CONST_BITS);
4309     dataptr[DCTSIZE*3] = (DCTELEM)
4310       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4311                   CONST_BITS);
4312 
4313     dataptr++;                  /* advance pointer to next column */
4314   }
4315 }
4316 
4317 
4318 /*
4319  * Perform the forward DCT on a 1x2 sample block.
4320  *
4321  * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
      *
      * data:        output coefficient block.  All DCTSIZE2 entries are
      *              cleared first; only data[DCTSIZE*0] and data[DCTSIZE*1]
      *              (the first entry of rows 0 and 1) are then written, so
      *              every other coefficient is left at zero.
      * sample_data: array of sample rows; rows 0 and 1 are read.
      * start_col:   index within each row of the single sample used.
      *
      * No separate row pass is coded: with one sample per row the only work
      * left is the scaling, which is folded into the shifts below.
4322  */
4323 
4324 GLOBAL(void)
4325 jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4326 {
4327   INT32 tmp0, tmp1;
4328 
4329   /* Pre-zero output coefficient block. */
4330   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4331 
       /* The two inputs are vertically adjacent: the same column in rows 0
        * and 1.
        */
4332   tmp0 = GETJSAMPLE(sample_data[0][start_col]);
4333   tmp1 = GETJSAMPLE(sample_data[1][start_col]);
4334 
4335   /* We leave the results scaled up by an overall factor of 8.

4336    * We must also scale the output by (8/1)*(8/2) = 2**5.
4337    */
4338 
4339   /* Even part */
4340   /* Apply unsigned->signed conversion */
       /* The conversion is the subtraction of CENTERJSAMPLE once per sample
        * (two samples here) from the sum.
        * NOTE(review): the shifted operand is negative whenever the sum is
        * below 2 * CENTERJSAMPLE (and, for the odd part, whenever
        * tmp1 > tmp0).  ISO C does not define left shifts of negative values,
        * so this presumably relies on the compiler treating "<< 5" as
        * multiplication by 32 -- confirm for the supported toolchains.
        */
4341   data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);




4342 
4343   /* Odd part */
       /* The CENTERJSAMPLE offsets cancel in the difference, so no level
        * shift is needed here.
        */
4344   data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);

4345 }
4346 
4347 #endif /* DCT_SCALING_SUPPORTED */
4348 #endif /* DCT_ISLOW_SUPPORTED */
   1 /*
   2  * jfdctint.c
   3  *
   4  * Copyright (C) 1991-1996, Thomas G. Lane.
   5  * Modification developed 2003-2015 by Guido Vollbeding.
   6  * This file is part of the Independent JPEG Group's software.
   7  * For conditions of distribution and use, see the accompanying README file.
   8  *
   9  * This file contains a slow-but-accurate integer implementation of the
  10  * forward DCT (Discrete Cosine Transform).
  11  *
  12  * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  13  * on each column.  Direct algorithms are also available, but they are
  14  * much more complex and seem not to be any faster when reduced to code.
  15  *
  16  * This implementation is based on an algorithm described in
  17  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  18  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  19  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  20  * The primary algorithm described there uses 11 multiplies and 29 adds.
  21  * We use their alternate method with 12 multiplies and 32 adds.
  22  * The advantage of this method is that no data path contains more than one
  23  * multiplication; this allows a very simple and accurate implementation in
  24  * scaled fixed-point arithmetic, with a minimal number of shifts.
  25  *


 148 #else
 149 #define MULTIPLY(var,const)  ((var) * (const))
 150 #endif
 151 
 152 
 153 /*
 154  * Perform the forward DCT on one block of samples.
 155  */
 156 
 157 GLOBAL(void)
 158 jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 159 {
 160   INT32 tmp0, tmp1, tmp2, tmp3;
 161   INT32 tmp10, tmp11, tmp12, tmp13;
 162   INT32 z1;
 163   DCTELEM *dataptr;
 164   JSAMPROW elemptr;
 165   int ctr;
 166   SHIFT_TEMPS
 167 
 168   /* Pass 1: process rows.
 169    * Note results are scaled up by sqrt(8) compared to a true DCT;
 170    * furthermore, we scale the results by 2**PASS1_BITS.
 171    * cK represents sqrt(2) * cos(K*pi/16).
 172    */
 173 
 174   dataptr = data;
 175   for (ctr = 0; ctr < DCTSIZE; ctr++) {
 176     elemptr = sample_data[ctr] + start_col;
 177 
 178     /* Even part per LL&M figure 1 --- note that published figure is faulty;
 179      * rotator "c1" should be "c6".
 180      */
 181 
 182     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
 183     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
 184     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
 185     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
 186 
 187     tmp10 = tmp0 + tmp3;
 188     tmp12 = tmp0 - tmp3;
 189     tmp11 = tmp1 + tmp2;
 190     tmp13 = tmp1 - tmp2;
 191 
 192     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
 193     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
 194     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
 195     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
 196 
 197     /* Apply unsigned->signed conversion. */
 198     dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
 199     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
 200 
 201     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
 202     /* Add fudge factor here for final descale. */
 203     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
 204 
 205     dataptr[2] = (DCTELEM)
 206       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
 207                   CONST_BITS-PASS1_BITS);
 208     dataptr[6] = (DCTELEM)
 209       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
 210                   CONST_BITS-PASS1_BITS);
 211 
 212     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).

 213      * i0..i3 in the paper are tmp0..tmp3 here.
 214      */
 215 


 216     tmp12 = tmp0 + tmp2;
 217     tmp13 = tmp1 + tmp3;
 218 
 219     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
 220     /* Add fudge factor here for final descale. */
 221     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
 222 
 223     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */






 224     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */

 225     tmp12 += z1;
 226     tmp13 += z1;
 227 
 228     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
 229     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
 230     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
 231     tmp0 += z1 + tmp12;
 232     tmp3 += z1 + tmp13;
 233 
 234     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
 235     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
 236     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
 237     tmp1 += z1 + tmp13;
 238     tmp2 += z1 + tmp12;
 239 
 240     dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS);
 241     dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS);
 242     dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
 243     dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS);
 244 
 245     dataptr += DCTSIZE;         /* advance pointer to next row */
 246   }
 247 
 248   /* Pass 2: process columns.
 249    * We remove the PASS1_BITS scaling, but leave the results scaled up
 250    * by an overall factor of 8.
 251    * cK represents sqrt(2) * cos(K*pi/16).
 252    */
 253 
 254   dataptr = data;
 255   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
 256     /* Even part per LL&M figure 1 --- note that published figure is faulty;
 257      * rotator "c1" should be "c6".
 258      */
 259 
 260     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
 261     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
 262     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
 263     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
 264 
 265     /* Add fudge factor here for final descale. */
 266     tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
 267     tmp12 = tmp0 - tmp3;
 268     tmp11 = tmp1 + tmp2;
 269     tmp13 = tmp1 - tmp2;
 270 
 271     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
 272     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
 273     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
 274     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
 275 
 276     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
 277     dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
 278 
 279     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
 280     /* Add fudge factor here for final descale. */
 281     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
 282 
 283     dataptr[DCTSIZE*2] = (DCTELEM)
 284       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
 285                   CONST_BITS+PASS1_BITS);
 286     dataptr[DCTSIZE*6] = (DCTELEM)
 287       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
 288                   CONST_BITS+PASS1_BITS);
 289 
 290     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).

 291      * i0..i3 in the paper are tmp0..tmp3 here.
 292      */
 293 


 294     tmp12 = tmp0 + tmp2;
 295     tmp13 = tmp1 + tmp3;
 296 
 297     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
 298     /* Add fudge factor here for final descale. */
 299     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
 300 
 301     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */






 302     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */

 303     tmp12 += z1;
 304     tmp13 += z1;
 305 
 306     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
 307     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
 308     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
 309     tmp0 += z1 + tmp12;
 310     tmp3 += z1 + tmp13;
 311 
 312     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
 313     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
 314     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
 315     tmp1 += z1 + tmp13;
 316     tmp2 += z1 + tmp12;
 317 
 318     dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS1_BITS);
 319     dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS1_BITS);
 320     dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS1_BITS);
 321     dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS1_BITS);
 322 
 323     dataptr++;                  /* advance pointer to next column */
 324   }
 325 }
 326 
 327 #ifdef DCT_SCALING_SUPPORTED
 328 
 329 
 330 /*
 331  * Perform the forward DCT on a 7x7 sample block.
 332  */
 333 
 334 GLOBAL(void)
 335 jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 336 {
 337   INT32 tmp0, tmp1, tmp2, tmp3;
 338   INT32 tmp10, tmp11, tmp12;
 339   INT32 z1, z2, z3;
 340   DCTELEM *dataptr;
 341   JSAMPROW elemptr;
 342   int ctr;
 343   SHIFT_TEMPS
 344 
 345   /* Pre-zero output coefficient block. */
 346   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 347 
 348   /* Pass 1: process rows.
 349    * Note results are scaled up by sqrt(8) compared to a true DCT;
 350    * furthermore, we scale the results by 2**PASS1_BITS.
 351    * cK represents sqrt(2) * cos(K*pi/14).
 352    */
 353 
 354   dataptr = data;
 355   for (ctr = 0; ctr < 7; ctr++) {
 356     elemptr = sample_data[ctr] + start_col;
 357 
 358     /* Even part */
 359 
 360     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
 361     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
 362     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
 363     tmp3 = GETJSAMPLE(elemptr[3]);
 364 
 365     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
 366     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
 367     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
 368 
 369     z1 = tmp0 + tmp2;
 370     /* Apply unsigned->signed conversion. */
 371     dataptr[0] = (DCTELEM)
 372       ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
 373     tmp3 += tmp3;
 374     z1 -= tmp3;
 375     z1 -= tmp3;
 376     z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
 377     z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
 378     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
 379     dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
 380     z1 -= z2;
 381     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
 382     dataptr[4] = (DCTELEM)
 383       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
 384               CONST_BITS-PASS1_BITS);
 385     dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
 386 
 387     /* Odd part */
 388 
 389     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
 390     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */


 463 }
 464 
 465 
 466 /*
 467  * Perform the forward DCT on a 6x6 sample block.
 468  */
 469 
 470 GLOBAL(void)
 471 jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 472 {
 473   INT32 tmp0, tmp1, tmp2;
 474   INT32 tmp10, tmp11, tmp12;
 475   DCTELEM *dataptr;
 476   JSAMPROW elemptr;
 477   int ctr;
 478   SHIFT_TEMPS
 479 
 480   /* Pre-zero output coefficient block. */
 481   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 482 
 483   /* Pass 1: process rows.
 484    * Note results are scaled up by sqrt(8) compared to a true DCT;
 485    * furthermore, we scale the results by 2**PASS1_BITS.
 486    * cK represents sqrt(2) * cos(K*pi/12).
 487    */
 488 
 489   dataptr = data;
 490   for (ctr = 0; ctr < 6; ctr++) {
 491     elemptr = sample_data[ctr] + start_col;
 492 
 493     /* Even part */
 494 
 495     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
 496     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
 497     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
 498 
 499     tmp10 = tmp0 + tmp2;
 500     tmp12 = tmp0 - tmp2;
 501 
 502     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
 503     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
 504     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
 505 
 506     /* Apply unsigned->signed conversion. */
 507     dataptr[0] = (DCTELEM)
 508       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
 509     dataptr[2] = (DCTELEM)
 510       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
 511               CONST_BITS-PASS1_BITS);
 512     dataptr[4] = (DCTELEM)
 513       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
 514               CONST_BITS-PASS1_BITS);
 515 
 516     /* Odd part */
 517 
 518     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
 519                     CONST_BITS-PASS1_BITS);
 520 
 521     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
 522     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
 523     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
 524 
 525     dataptr += DCTSIZE;         /* advance pointer to next row */
 526   }


 577 }
 578 
 579 
 580 /*
 581  * Perform the forward DCT on a 5x5 sample block.
 582  */
 583 
 584 GLOBAL(void)
 585 jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 586 {
 587   INT32 tmp0, tmp1, tmp2;
 588   INT32 tmp10, tmp11;
 589   DCTELEM *dataptr;
 590   JSAMPROW elemptr;
 591   int ctr;
 592   SHIFT_TEMPS
 593 
 594   /* Pre-zero output coefficient block. */
 595   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 596 
 597   /* Pass 1: process rows.
 598    * Note results are scaled up by sqrt(8) compared to a true DCT;
 599    * furthermore, we scale the results by 2**PASS1_BITS.
 600    * We scale the results further by 2 as part of output adaption
 601    * scaling for different DCT size.
 602    * cK represents sqrt(2) * cos(K*pi/10).
 603    */
 604 
 605   dataptr = data;
 606   for (ctr = 0; ctr < 5; ctr++) {
 607     elemptr = sample_data[ctr] + start_col;
 608 
 609     /* Even part */
 610 
 611     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
 612     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
 613     tmp2 = GETJSAMPLE(elemptr[2]);
 614 
 615     tmp10 = tmp0 + tmp1;
 616     tmp11 = tmp0 - tmp1;
 617 
 618     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
 619     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
 620 
 621     /* Apply unsigned->signed conversion. */
 622     dataptr[0] = (DCTELEM)
 623       ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
 624     tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
 625     tmp10 -= tmp2 << 2;
 626     tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
 627     dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
 628     dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
 629 
 630     /* Odd part */
 631 
 632     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
 633 
 634     dataptr[1] = (DCTELEM)
 635       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
 636               CONST_BITS-PASS1_BITS-1);
 637     dataptr[3] = (DCTELEM)
 638       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
 639               CONST_BITS-PASS1_BITS-1);
 640 
 641     dataptr += DCTSIZE;         /* advance pointer to next row */


 688 }
 689 
 690 
 691 /*
 692  * Perform the forward DCT on a 4x4 sample block.
 693  */
 694 
 695 GLOBAL(void)
 696 jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 697 {
 698   INT32 tmp0, tmp1;
 699   INT32 tmp10, tmp11;
 700   DCTELEM *dataptr;
 701   JSAMPROW elemptr;
 702   int ctr;
 703   SHIFT_TEMPS
 704 
 705   /* Pre-zero output coefficient block. */
 706   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 707 
 708   /* Pass 1: process rows.
 709    * Note results are scaled up by sqrt(8) compared to a true DCT;
 710    * furthermore, we scale the results by 2**PASS1_BITS.
 711    * We must also scale the output by (8/4)**2 = 2**2, which we add here.
 712    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
 713    */
 714 
 715   dataptr = data;
 716   for (ctr = 0; ctr < 4; ctr++) {
 717     elemptr = sample_data[ctr] + start_col;
 718 
 719     /* Even part */
 720 
 721     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
 722     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
 723 
 724     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
 725     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
 726 
 727     /* Apply unsigned->signed conversion. */
 728     dataptr[0] = (DCTELEM)
 729       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
 730     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
 731 
 732     /* Odd part */
 733 
 734     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
 735     /* Add fudge factor here for final descale. */
 736     tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
 737 
 738     dataptr[1] = (DCTELEM)
 739       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
 740                   CONST_BITS-PASS1_BITS-2);
 741     dataptr[3] = (DCTELEM)
 742       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
 743                   CONST_BITS-PASS1_BITS-2);
 744 
 745     dataptr += DCTSIZE;         /* advance pointer to next row */
 746   }
 747 
 748   /* Pass 2: process columns.
 749    * We remove the PASS1_BITS scaling, but leave the results scaled up
 750    * by an overall factor of 8.
 751    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
 752    */
 753 
 754   dataptr = data;
 755   for (ctr = 0; ctr < 4; ctr++) {
 756     /* Even part */
 757 
 758     /* Add fudge factor here for final descale. */
 759     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
 760     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
 761 
 762     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
 763     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
 764 
 765     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
 766     dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
 767 
 768     /* Odd part */
 769 
 770     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
 771     /* Add fudge factor here for final descale. */


 782   }
 783 }
 784 
 785 
 786 /*
 787  * Perform the forward DCT on a 3x3 sample block.
 788  */
 789 
 790 GLOBAL(void)
 791 jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 792 {
 793   INT32 tmp0, tmp1, tmp2;
 794   DCTELEM *dataptr;
 795   JSAMPROW elemptr;
 796   int ctr;
 797   SHIFT_TEMPS
 798 
 799   /* Pre-zero output coefficient block. */
 800   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 801 
 802   /* Pass 1: process rows.
 803    * Note results are scaled up by sqrt(8) compared to a true DCT;
 804    * furthermore, we scale the results by 2**PASS1_BITS.
 805    * We scale the results further by 2**2 as part of output adaption
 806    * scaling for different DCT size.
 807    * cK represents sqrt(2) * cos(K*pi/6).
 808    */
 809 
 810   dataptr = data;
 811   for (ctr = 0; ctr < 3; ctr++) {
 812     elemptr = sample_data[ctr] + start_col;
 813 
 814     /* Even part */
 815 
 816     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
 817     tmp1 = GETJSAMPLE(elemptr[1]);
 818 
 819     tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
 820 
 821     /* Apply unsigned->signed conversion. */
 822     dataptr[0] = (DCTELEM)
 823       ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
 824     dataptr[2] = (DCTELEM)
 825       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
 826               CONST_BITS-PASS1_BITS-2);
 827 
 828     /* Odd part */
 829 
 830     dataptr[1] = (DCTELEM)
 831       DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
 832               CONST_BITS-PASS1_BITS-2);
 833 
 834     dataptr += DCTSIZE;         /* advance pointer to next row */
 835   }
 836 
 837   /* Pass 2: process columns.
 838    * We remove the PASS1_BITS scaling, but leave the results scaled up
 839    * by an overall factor of 8.
 840    * We must also scale the output by (8/3)**2 = 64/9, which we partially
 841    * fold into the constant multipliers (other part was done in pass 1):


 859               CONST_BITS+PASS1_BITS);
 860 
 861     /* Odd part */
 862 
 863     dataptr[DCTSIZE*1] = (DCTELEM)
 864       DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
 865               CONST_BITS+PASS1_BITS);
 866 
 867     dataptr++;                  /* advance pointer to next column */
 868   }
 869 }
 870 
 871 
 872 /*
 873  * Perform the forward DCT on a 2x2 sample block.
 874  */
 875 
 876 GLOBAL(void)
 877 jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 878 {
 879   DCTELEM tmp0, tmp1, tmp2, tmp3;
 880   JSAMPROW elemptr;
 881 
 882   /* Pre-zero output coefficient block. */
 883   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 884 
 885   /* Pass 1: process rows.
 886    * Note results are scaled up by sqrt(8) compared to a true DCT.
 887    */
 888 
 889   /* Row 0 */
 890   elemptr = sample_data[0] + start_col;
 891 
 892   tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
 893   tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
 894 
 895   /* Row 1 */
 896   elemptr = sample_data[1] + start_col;
 897 
 898   tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
 899   tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
 900 
 901   /* Pass 2: process columns.
 902    * We leave the results scaled up by an overall factor of 8.
 903    * We must also scale the output by (8/2)**2 = 2**4.
 904    */
 905 
 906   /* Column 0 */
 907   /* Apply unsigned->signed conversion. */
 908   data[DCTSIZE*0] = (tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4;
 909   data[DCTSIZE*1] = (tmp0 - tmp2) << 4;
 910 
 911   /* Column 1 */
 912   data[DCTSIZE*0+1] = (tmp1 + tmp3) << 4;
 913   data[DCTSIZE*1+1] = (tmp1 - tmp3) << 4;
 914 }
 915 
 916 
 917 /*
 918  * Perform the forward DCT on a 1x1 sample block.
 919  */
 920 
 921 GLOBAL(void)
 922 jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 923 {
 924   DCTELEM dcval;
 925 
 926   /* Pre-zero output coefficient block. */
 927   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 928 
 929   dcval = GETJSAMPLE(sample_data[0][start_col]);
 930 
 931   /* We leave the result scaled up by an overall factor of 8. */
 932   /* We must also scale the output by (8/1)**2 = 2**6. */
 933   /* Apply unsigned->signed conversion. */
 934   data[0] = (dcval - CENTERJSAMPLE) << 6;

 935 }
 936 
 937 
 938 /*
 939  * Perform the forward DCT on a 9x9 sample block.
 940  */
 941 
 942 GLOBAL(void)
 943 jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 944 {
 945   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
 946   INT32 tmp10, tmp11, tmp12, tmp13;
 947   INT32 z1, z2;
 948   DCTELEM workspace[8];
 949   DCTELEM *dataptr;
 950   DCTELEM *wsptr;
 951   JSAMPROW elemptr;
 952   int ctr;
 953   SHIFT_TEMPS
 954 
 955   /* Pass 1: process rows.
 956    * Note results are scaled up by sqrt(8) compared to a true DCT;
 957    * we scale the results further by 2 as part of output adaption
 958    * scaling for different DCT size.
 959    * cK represents sqrt(2) * cos(K*pi/18).
 960    */
 961 
 962   dataptr = data;
 963   ctr = 0;
 964   for (;;) {
 965     elemptr = sample_data[ctr] + start_col;
 966 
 967     /* Even part */
 968 
 969     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
 970     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
 971     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
 972     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
 973     tmp4 = GETJSAMPLE(elemptr[4]);
 974 
 975     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
 976     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
 977     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
 978     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
 979 
 980     z1 = tmp0 + tmp2 + tmp3;
 981     z2 = tmp1 + tmp4;
 982     /* Apply unsigned->signed conversion. */
 983     dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
 984     dataptr[6] = (DCTELEM)
 985       DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)),  /* c6 */
 986               CONST_BITS-1);
 987     z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049));        /* c2 */
 988     z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
 989     dataptr[2] = (DCTELEM)
 990       DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441))    /* c4 */
 991               + z1 + z2, CONST_BITS-1);
 992     dataptr[4] = (DCTELEM)
 993       DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608))    /* c8 */
 994               + z1 - z2, CONST_BITS-1);
 995 
 996     /* Odd part */
 997 
 998     dataptr[3] = (DCTELEM)
 999       DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
1000               CONST_BITS-1);
1001 
1002     tmp11 = MULTIPLY(tmp11, FIX(1.224744871));        /* c3 */


1085   }
1086 }
1087 
1088 
1089 /*
1090  * Perform the forward DCT on a 10x10 sample block.
1091  */
1092 
1093 GLOBAL(void)
1094 jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1095 {
1096   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1097   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1098   DCTELEM workspace[8*2];
1099   DCTELEM *dataptr;
1100   DCTELEM *wsptr;
1101   JSAMPROW elemptr;
1102   int ctr;
1103   SHIFT_TEMPS
1104 
1105   /* Pass 1: process rows.
1106    * Note results are scaled up by sqrt(8) compared to a true DCT;
1107    * we scale the results further by 2 as part of output adaption
1108    * scaling for different DCT size.
1109    * cK represents sqrt(2) * cos(K*pi/20).
1110    */
1111 
1112   dataptr = data;
1113   ctr = 0;
1114   for (;;) {
1115     elemptr = sample_data[ctr] + start_col;
1116 
1117     /* Even part */
1118 
1119     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
1120     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
1121     tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
1122     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
1123     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
1124 
1125     tmp10 = tmp0 + tmp4;
1126     tmp13 = tmp0 - tmp4;
1127     tmp11 = tmp1 + tmp3;
1128     tmp14 = tmp1 - tmp3;
1129 
1130     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
1131     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
1132     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
1133     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
1134     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
1135 
1136     /* Apply unsigned->signed conversion. */
1137     dataptr[0] = (DCTELEM)
1138       ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
1139     tmp12 += tmp12;
1140     dataptr[4] = (DCTELEM)
1141       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
1142               MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
1143               CONST_BITS-1);
1144     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
1145     dataptr[2] = (DCTELEM)
1146       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
1147               CONST_BITS-1);
1148     dataptr[6] = (DCTELEM)
1149       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
1150               CONST_BITS-1);
1151 
1152     /* Odd part */
1153 
1154     tmp10 = tmp0 + tmp4;
1155     tmp11 = tmp1 - tmp3;
1156     dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);


1250 }
1251 
1252 
1253 /*
1254  * Perform the forward DCT on an 11x11 sample block.
1255  */
1256 
1257 GLOBAL(void)
1258 jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1259 {
1260   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1261   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1262   INT32 z1, z2, z3;
1263   DCTELEM workspace[8*3];
1264   DCTELEM *dataptr;
1265   DCTELEM *wsptr;
1266   JSAMPROW elemptr;
1267   int ctr;
1268   SHIFT_TEMPS
1269 
1270   /* Pass 1: process rows.
1271    * Note results are scaled up by sqrt(8) compared to a true DCT;
1272    * we scale the results further by 2 as part of output adaption
1273    * scaling for different DCT size.
1274    * cK represents sqrt(2) * cos(K*pi/22).
1275    */
1276 
1277   dataptr = data;
1278   ctr = 0;
1279   for (;;) {
1280     elemptr = sample_data[ctr] + start_col;
1281 
1282     /* Even part */
1283 
1284     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
1285     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
1286     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
1287     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
1288     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
1289     tmp5 = GETJSAMPLE(elemptr[5]);
1290 
1291     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
1292     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
1293     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
1294     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
1295     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
1296 
1297     /* Apply unsigned->signed conversion. */
1298     dataptr[0] = (DCTELEM)
1299       ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
1300     tmp5 += tmp5;
1301     tmp0 -= tmp5;
1302     tmp1 -= tmp5;
1303     tmp2 -= tmp5;
1304     tmp3 -= tmp5;
1305     tmp4 -= tmp5;
1306     z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) +       /* c2 */
1307          MULTIPLY(tmp2 + tmp4, FIX(0.201263574));        /* c10 */
1308     z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931));        /* c6 */
1309     z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156));        /* c4 */
1310     dataptr[2] = (DCTELEM)
1311       DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
1312               - MULTIPLY(tmp4, FIX(1.390975730)),        /* c4+c10 */
1313               CONST_BITS-1);
1314     dataptr[4] = (DCTELEM)
1315       DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
1316               - MULTIPLY(tmp2, FIX(1.356927976))         /* c2 */
1317               + MULTIPLY(tmp4, FIX(0.587485545)),        /* c8 */


1433   }
1434 }
1435 
1436 
1437 /*
1438  * Perform the forward DCT on a 12x12 sample block.
1439  */
1440 
1441 GLOBAL(void)
1442 jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1443 {
1444   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1445   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1446   DCTELEM workspace[8*4];
1447   DCTELEM *dataptr;
1448   DCTELEM *wsptr;
1449   JSAMPROW elemptr;
1450   int ctr;
1451   SHIFT_TEMPS
1452 
1453   /* Pass 1: process rows.
1454    * Note results are scaled up by sqrt(8) compared to a true DCT.
1455    * cK represents sqrt(2) * cos(K*pi/24).
1456    */
1457 
1458   dataptr = data;
1459   ctr = 0;
1460   for (;;) {
1461     elemptr = sample_data[ctr] + start_col;
1462 
1463     /* Even part */
1464 
1465     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
1466     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
1467     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
1468     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
1469     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
1470     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
1471 
1472     tmp10 = tmp0 + tmp5;
1473     tmp13 = tmp0 - tmp5;
1474     tmp11 = tmp1 + tmp4;
1475     tmp14 = tmp1 - tmp4;
1476     tmp12 = tmp2 + tmp3;
1477     tmp15 = tmp2 - tmp3;
1478 
1479     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
1480     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
1481     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
1482     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
1483     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
1484     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
1485 
1486     /* Apply unsigned->signed conversion. */
1487     dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
1488     dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
1489     dataptr[4] = (DCTELEM)
1490       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
1491               CONST_BITS);
1492     dataptr[2] = (DCTELEM)
1493       DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
1494               CONST_BITS);
1495 
1496     /* Odd part */
1497 
1498     tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
1499     tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
1500     tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
1501     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
1502     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
1503     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
1504             + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
1505     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
1506     tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */


1600 }
1601 
1602 
1603 /*
1604  * Perform the forward DCT on a 13x13 sample block.
1605  */
1606 
1607 GLOBAL(void)
1608 jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1609 {
1610   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1611   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1612   INT32 z1, z2;
1613   DCTELEM workspace[8*5];
1614   DCTELEM *dataptr;
1615   DCTELEM *wsptr;
1616   JSAMPROW elemptr;
1617   int ctr;
1618   SHIFT_TEMPS
1619 
1620   /* Pass 1: process rows.
1621    * Note results are scaled up by sqrt(8) compared to a true DCT.
1622    * cK represents sqrt(2) * cos(K*pi/26).
1623    */
1624 
1625   dataptr = data;
1626   ctr = 0;
1627   for (;;) {
1628     elemptr = sample_data[ctr] + start_col;
1629 
1630     /* Even part */
1631 
1632     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
1633     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
1634     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
1635     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
1636     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
1637     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
1638     tmp6 = GETJSAMPLE(elemptr[6]);
1639 
1640     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
1641     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
1642     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
1643     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
1644     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
1645     tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
1646 
1647     /* Apply unsigned->signed conversion. */
1648     dataptr[0] = (DCTELEM)
1649       (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
1650     tmp6 += tmp6;
1651     tmp0 -= tmp6;
1652     tmp1 -= tmp6;
1653     tmp2 -= tmp6;
1654     tmp3 -= tmp6;
1655     tmp4 -= tmp6;
1656     tmp5 -= tmp6;
1657     dataptr[2] = (DCTELEM)
1658       DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) +   /* c2 */
1659               MULTIPLY(tmp1, FIX(1.058554052)) +   /* c6 */
1660               MULTIPLY(tmp2, FIX(0.501487041)) -   /* c10 */
1661               MULTIPLY(tmp3, FIX(0.170464608)) -   /* c12 */
1662               MULTIPLY(tmp4, FIX(0.803364869)) -   /* c8 */
1663               MULTIPLY(tmp5, FIX(1.252223920)),    /* c4 */
1664               CONST_BITS);
1665     z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
1666          MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
1667          MULTIPLY(tmp1 - tmp5, FIX(0.316450131));  /* (c8-c12)/2 */


1799   }
1800 }
1801 
1802 
1803 /*
1804  * Perform the forward DCT on a 14x14 sample block.
1805  */
1806 
1807 GLOBAL(void)
1808 jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1809 {
1810   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1811   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1812   DCTELEM workspace[8*6];
1813   DCTELEM *dataptr;
1814   DCTELEM *wsptr;
1815   JSAMPROW elemptr;
1816   int ctr;
1817   SHIFT_TEMPS
1818 
1819   /* Pass 1: process rows.
1820    * Note results are scaled up by sqrt(8) compared to a true DCT.
1821    * cK represents sqrt(2) * cos(K*pi/28).
1822    */
1823 
1824   dataptr = data;
1825   ctr = 0;
1826   for (;;) {
1827     elemptr = sample_data[ctr] + start_col;
1828 
1829     /* Even part */
1830 
1831     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
1832     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
1833     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
1834     tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
1835     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
1836     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
1837     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
1838 
1839     tmp10 = tmp0 + tmp6;
1840     tmp14 = tmp0 - tmp6;
1841     tmp11 = tmp1 + tmp5;
1842     tmp15 = tmp1 - tmp5;
1843     tmp12 = tmp2 + tmp4;
1844     tmp16 = tmp2 - tmp4;
1845 
1846     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
1847     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
1848     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
1849     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
1850     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
1851     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
1852     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
1853 
1854     /* Apply unsigned->signed conversion. */
1855     dataptr[0] = (DCTELEM)
1856       (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
1857     tmp13 += tmp13;
1858     dataptr[4] = (DCTELEM)
1859       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
1860               MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
1861               MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
1862               CONST_BITS);
1863 
1864     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
1865 
1866     dataptr[2] = (DCTELEM)
1867       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
1868               + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
1869               CONST_BITS);
1870     dataptr[6] = (DCTELEM)
1871       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
1872               - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
1873               CONST_BITS);
1874 


2001 }
2002 
2003 
2004 /*
2005  * Perform the forward DCT on a 15x15 sample block.
2006  */
2007 
2008 GLOBAL(void)
2009 jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2010 {
2011   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2012   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2013   INT32 z1, z2, z3;
2014   DCTELEM workspace[8*7];
2015   DCTELEM *dataptr;
2016   DCTELEM *wsptr;
2017   JSAMPROW elemptr;
2018   int ctr;
2019   SHIFT_TEMPS
2020 
2021   /* Pass 1: process rows.
2022    * Note results are scaled up by sqrt(8) compared to a true DCT.
2023    * cK represents sqrt(2) * cos(K*pi/30).
2024    */
2025 
2026   dataptr = data;
2027   ctr = 0;
2028   for (;;) {
2029     elemptr = sample_data[ctr] + start_col;
2030 
2031     /* Even part */
2032 
2033     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
2034     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
2035     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
2036     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
2037     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
2038     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
2039     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
2040     tmp7 = GETJSAMPLE(elemptr[7]);
2041 
2042     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
2043     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
2044     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
2045     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
2046     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
2047     tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
2048     tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
2049 
2050     z1 = tmp0 + tmp4 + tmp5;
2051     z2 = tmp1 + tmp3 + tmp6;
2052     z3 = tmp2 + tmp7;
2053     /* Apply unsigned->signed conversion. */
2054     dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
2055     z3 += z3;
2056     dataptr[6] = (DCTELEM)
2057       DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
2058               MULTIPLY(z2 - z3, FIX(0.437016024)),  /* c12 */
2059               CONST_BITS);
2060     tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2061     z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) -  /* c2+c14 */
2062          MULTIPLY(tmp6 - tmp2, FIX(2.238241955));   /* c4+c8 */
2063     z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) -  /* c8-c14 */
2064          MULTIPLY(tmp0 - tmp2, FIX(0.091361227));   /* c2-c4 */
2065     z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) +  /* c2 */
2066          MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) +  /* c8 */
2067          MULTIPLY(tmp1 - tmp4, FIX(0.790569415));   /* (c6+c12)/2 */
2068 
2069     dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
2070     dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
2071 
2072     /* Odd part */
2073 


2180   }
2181 }
2182 
2183 
2184 /*
2185  * Perform the forward DCT on a 16x16 sample block.
2186  */
2187 
2188 GLOBAL(void)
2189 jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2190 {
2191   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2192   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2193   DCTELEM workspace[DCTSIZE2];
2194   DCTELEM *dataptr;
2195   DCTELEM *wsptr;
2196   JSAMPROW elemptr;
2197   int ctr;
2198   SHIFT_TEMPS
2199 
2200   /* Pass 1: process rows.
2201    * Note results are scaled up by sqrt(8) compared to a true DCT;
2202    * furthermore, we scale the results by 2**PASS1_BITS.
2203    * cK represents sqrt(2) * cos(K*pi/32).
2204    */
2205 
2206   dataptr = data;
2207   ctr = 0;
2208   for (;;) {
2209     elemptr = sample_data[ctr] + start_col;
2210 
2211     /* Even part */
2212 
2213     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2214     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2215     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2216     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2217     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2218     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2219     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2220     tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2221 
2222     tmp10 = tmp0 + tmp7;
2223     tmp14 = tmp0 - tmp7;
2224     tmp11 = tmp1 + tmp6;
2225     tmp15 = tmp1 - tmp6;
2226     tmp12 = tmp2 + tmp5;
2227     tmp16 = tmp2 - tmp5;
2228     tmp13 = tmp3 + tmp4;
2229     tmp17 = tmp3 - tmp4;
2230 
2231     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2232     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2233     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2234     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2235     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2236     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2237     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2238     tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2239 
2240     /* Apply unsigned->signed conversion. */
2241     dataptr[0] = (DCTELEM)
2242       ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2243     dataptr[4] = (DCTELEM)
2244       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2245               MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2246               CONST_BITS-PASS1_BITS);
2247 
2248     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2249             MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2250 
2251     dataptr[2] = (DCTELEM)
2252       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2253               + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
2254               CONST_BITS-PASS1_BITS);
2255     dataptr[6] = (DCTELEM)
2256       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2257               - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2258               CONST_BITS-PASS1_BITS);
2259 
2260     /* Odd part */


2283 
2284     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2285     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2286     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2287     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2288 
2289     ctr++;
2290 
2291     if (ctr != DCTSIZE) {
2292       if (ctr == DCTSIZE * 2)
2293         break;                  /* Done. */
2294       dataptr += DCTSIZE;       /* advance pointer to next row */
2295     } else
2296       dataptr = workspace;      /* switch pointer to extended workspace */
2297   }
2298 
2299   /* Pass 2: process columns.
2300    * We remove the PASS1_BITS scaling, but leave the results scaled up
2301    * by an overall factor of 8.
2302    * We must also scale the output by (8/16)**2 = 1/2**2.
2303    * cK represents sqrt(2) * cos(K*pi/32).
2304    */
2305 
2306   dataptr = data;
2307   wsptr = workspace;
2308   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2309     /* Even part */
2310 
2311     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
2312     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
2313     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
2314     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
2315     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
2316     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
2317     tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
2318     tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
2319 
2320     tmp10 = tmp0 + tmp7;
2321     tmp14 = tmp0 - tmp7;
2322     tmp11 = tmp1 + tmp6;
2323     tmp15 = tmp1 - tmp6;


2389 }
2390 
2391 
2392 /*
2393  * Perform the forward DCT on a 16x8 sample block.
2394  *
2395  * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
 *
 * data        - output coefficient block.  Pass 1 stores eight values per
 *               row (dataptr[0..7]) for DCTSIZE rows; pass 2 then
 *               transforms the columns of that block in place.
 * sample_data - input sample rows; rows 0 .. DCTSIZE-1 are read.
 * start_col   - column offset added to each row pointer; the sixteen
 *               samples elemptr[0..15] starting at that offset are read.
2396  */
2397 
2398 GLOBAL(void)
2399 jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2400 {
2401   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2402   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2403   INT32 z1;
2404   DCTELEM *dataptr;
2405   JSAMPROW elemptr;
2406   int ctr;
2407   SHIFT_TEMPS
2408 
2409   /* Pass 1: process rows.
2410    * Note results are scaled up by sqrt(8) compared to a true DCT;
2411    * furthermore, we scale the results by 2**PASS1_BITS.
2412    * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
2413    */
2414 
2415   dataptr = data;
       /* NOTE(review): the assignment below is redundant; the for statement
        * that follows re-initializes ctr to 0.
        */
2416   ctr = 0;
2417   for (ctr = 0; ctr < DCTSIZE; ctr++) {
2418     elemptr = sample_data[ctr] + start_col;
2419 
2420     /* Even part */
2421 
         /* Sums of samples mirrored about the centre of the 16-sample row. */
2422     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2423     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2424     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2425     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2426     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2427     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2428     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2429     tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2430 
         /* Second butterfly stage: tmp10..tmp13 hold the sums and
          * tmp14..tmp17 the differences of the mirrored pairs above.
          */
2431     tmp10 = tmp0 + tmp7;
2432     tmp14 = tmp0 - tmp7;
2433     tmp11 = tmp1 + tmp6;
2434     tmp15 = tmp1 - tmp6;
2435     tmp12 = tmp2 + tmp5;
2436     tmp16 = tmp2 - tmp5;
2437     tmp13 = tmp3 + tmp4;
2438     tmp17 = tmp3 - tmp4;
2439 
         /* tmp0..tmp7 are reused for the mirrored differences, which are
          * consumed by the odd part further down.
          */
2440     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2441     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2442     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2443     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2444     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2445     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2446     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2447     tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2448 
2449     /* Apply unsigned->signed conversion. */
         /* The 16 * CENTERJSAMPLE term is subtracted only from output 0,
          * the one output formed from the plain sum of all sixteen samples.
          */
2450     dataptr[0] = (DCTELEM)
2451       ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2452     dataptr[4] = (DCTELEM)
2453       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2454               MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2455               CONST_BITS-PASS1_BITS);
2456 
         /* tmp10 is reused as the term shared by outputs 2 and 6. */
2457     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2458             MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2459 
2460     dataptr[2] = (DCTELEM)
2461       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2462               + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
2463               CONST_BITS-PASS1_BITS);
2464     dataptr[6] = (DCTELEM)
2465       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2466               - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2467               CONST_BITS-PASS1_BITS);
2468 
2469     /* Odd part */


2485             MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2486     tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2487              - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2488     tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2489              + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2490     tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2491              + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2492 
         /* Odd-numbered outputs use the same CONST_BITS-PASS1_BITS descale
          * as the even-numbered ones above.
          */
2493     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2494     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2495     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2496     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2497 
2498     dataptr += DCTSIZE;         /* advance pointer to next row */
2499   }
2500 
2501   /* Pass 2: process columns.
2502    * We remove the PASS1_BITS scaling, but leave the results scaled up
2503    * by an overall factor of 8.
2504    * We must also scale the output by 8/16 = 1/2.
2505    * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2506    */
2507 
2508   dataptr = data;
2509   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2510     /* Even part per LL&M figure 1 --- note that published figure is faulty;
2511      * rotator "c1" should be "c6".
2512      */
2513 
         /* Column entries are DCTSIZE elements apart, i.e. one row of the
          * block written by pass 1.
          */
2514     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
2515     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
2516     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
2517     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
2518 
2519     tmp10 = tmp0 + tmp3;
2520     tmp12 = tmp0 - tmp3;
2521     tmp11 = tmp1 + tmp2;
2522     tmp13 = tmp1 - tmp2;
2523 
         /* tmp0..tmp3 are reused for the differences needed by the odd part;
          * the column has not been overwritten yet at this point.
          */
2524     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
2525     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
2526     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
2527     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
2528 
         /* The descale count PASS1_BITS+1 accounts for the PASS1_BITS scaling
          * from pass 1 plus the additional 1/2 factor stated in the pass 2
          * header comment.  DESCALE itself is defined outside this view.
          */
2529     dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
2530     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
2531 
         /* Outputs that went through MULTIPLY carry CONST_BITS more scaling,
          * hence the larger descale count below.
          */
2532     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);   /* c6 */
2533     dataptr[DCTSIZE*2] = (DCTELEM)
2534       DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
2535               CONST_BITS+PASS1_BITS+1);
2536     dataptr[DCTSIZE*6] = (DCTELEM)
2537       DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
2538               CONST_BITS+PASS1_BITS+1);
2539 
2540     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).

2541      * i0..i3 in the paper are tmp0..tmp3 here.
2542      */
2543 


2544     tmp12 = tmp0 + tmp2;
2545     tmp13 = tmp1 + tmp3;
2546 
2547     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);   /*  c3 */
2548     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);      /* -c3+c5 */
2549     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);      /* -c3-c5 */
2550     tmp12 += z1;
2551     tmp13 += z1;
2552 
2553     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);   /* -c3+c7 */
2554     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);          /*  c1+c3-c5-c7 */


2555     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);          /* -c1+c3+c5-c7 */
2556     tmp0 += z1 + tmp12;
2557     tmp3 += z1 + tmp13;


2558 
2559     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);   /* -c1-c3 */
2560     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);          /*  c1+c3+c5-c7 */
2561     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);          /*  c1+c3-c5+c7 */
2562     tmp1 += z1 + tmp13;
2563     tmp2 += z1 + tmp12;
2564 
2565     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
2566     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
2567     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
2568     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+PASS1_BITS+1);




2569 
2570     dataptr++;                  /* advance pointer to next column */
2571   }
2572 }
2573 
2574 
2575 /*
2576  * Perform the forward DCT on a 14x7 sample block.
2577  *
2578  * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
2579  */
2580 
2581 GLOBAL(void)
2582 jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2583 {
2584   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
2585   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2586   INT32 z1, z2, z3;
2587   DCTELEM *dataptr;
2588   JSAMPROW elemptr;
2589   int ctr;
2590   SHIFT_TEMPS
2591 
2592   /* Zero bottom row of output coefficient block. */
2593   MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
2594 
2595   /* Pass 1: process rows.
2596    * Note results are scaled up by sqrt(8) compared to a true DCT;
2597    * furthermore, we scale the results by 2**PASS1_BITS.
2598    * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
2599    */
2600 
2601   dataptr = data;
2602   for (ctr = 0; ctr < 7; ctr++) {
2603     elemptr = sample_data[ctr] + start_col;
2604 
2605     /* Even part */
2606 
2607     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
2608     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
2609     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
2610     tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
2611     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
2612     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
2613     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
2614 
2615     tmp10 = tmp0 + tmp6;
2616     tmp14 = tmp0 - tmp6;
2617     tmp11 = tmp1 + tmp5;
2618     tmp15 = tmp1 - tmp5;
2619     tmp12 = tmp2 + tmp4;
2620     tmp16 = tmp2 - tmp4;
2621 
2622     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
2623     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
2624     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
2625     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
2626     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
2627     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
2628     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
2629 
2630     /* Apply unsigned->signed conversion. */
2631     dataptr[0] = (DCTELEM)
2632       ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
2633     tmp13 += tmp13;
2634     dataptr[4] = (DCTELEM)
2635       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
2636               MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
2637               MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
2638               CONST_BITS-PASS1_BITS);
2639 
2640     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
2641 
2642     dataptr[2] = (DCTELEM)
2643       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
2644               + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
2645               CONST_BITS-PASS1_BITS);
2646     dataptr[6] = (DCTELEM)
2647       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
2648               - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
2649               CONST_BITS-PASS1_BITS);
2650 


2739 
2740 /*
2741  * Perform the forward DCT on a 12x6 sample block.
2742  *
2743  * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
2744  */
2745 
2746 GLOBAL(void)
2747 jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2748 {
2749   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2750   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2751   DCTELEM *dataptr;
2752   JSAMPROW elemptr;
2753   int ctr;
2754   SHIFT_TEMPS
2755 
2756   /* Zero 2 bottom rows of output coefficient block. */
2757   MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
2758 
2759   /* Pass 1: process rows.
2760    * Note results are scaled up by sqrt(8) compared to a true DCT;
2761    * furthermore, we scale the results by 2**PASS1_BITS.
2762    * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
2763    */
2764 
2765   dataptr = data;
2766   for (ctr = 0; ctr < 6; ctr++) {
2767     elemptr = sample_data[ctr] + start_col;
2768 
2769     /* Even part */
2770 
2771     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
2772     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
2773     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
2774     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
2775     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
2776     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
2777 
2778     tmp10 = tmp0 + tmp5;
2779     tmp13 = tmp0 - tmp5;
2780     tmp11 = tmp1 + tmp4;
2781     tmp14 = tmp1 - tmp4;
2782     tmp12 = tmp2 + tmp3;
2783     tmp15 = tmp2 - tmp3;
2784 
2785     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
2786     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
2787     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
2788     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
2789     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
2790     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
2791 
2792     /* Apply unsigned->signed conversion. */
2793     dataptr[0] = (DCTELEM)
2794       ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
2795     dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
2796     dataptr[4] = (DCTELEM)
2797       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
2798               CONST_BITS-PASS1_BITS);
2799     dataptr[2] = (DCTELEM)
2800       DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
2801               CONST_BITS-PASS1_BITS);
2802 
2803     /* Odd part */
2804 
2805     tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
2806     tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
2807     tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
2808     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
2809     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
2810     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
2811             + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
2812     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */


2879 
2880 /*
2881  * Perform the forward DCT on a 10x5 sample block.
2882  *
2883  * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
2884  */
2885 
2886 GLOBAL(void)
2887 jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2888 {
2889   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
2890   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
2891   DCTELEM *dataptr;
2892   JSAMPROW elemptr;
2893   int ctr;
2894   SHIFT_TEMPS
2895 
2896   /* Zero 3 bottom rows of output coefficient block. */
2897   MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
2898 
2899   /* Pass 1: process rows.
2900    * Note results are scaled up by sqrt(8) compared to a true DCT;
2901    * furthermore, we scale the results by 2**PASS1_BITS.
2902    * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
2903    */
2904 
2905   dataptr = data;
2906   for (ctr = 0; ctr < 5; ctr++) {
2907     elemptr = sample_data[ctr] + start_col;
2908 
2909     /* Even part */
2910 
2911     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
2912     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
2913     tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
2914     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
2915     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
2916 
2917     tmp10 = tmp0 + tmp4;
2918     tmp13 = tmp0 - tmp4;
2919     tmp11 = tmp1 + tmp3;
2920     tmp14 = tmp1 - tmp3;
2921 
2922     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
2923     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
2924     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
2925     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
2926     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
2927 
2928     /* Apply unsigned->signed conversion. */
2929     dataptr[0] = (DCTELEM)
2930       ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
2931     tmp12 += tmp12;
2932     dataptr[4] = (DCTELEM)
2933       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
2934               MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
2935               CONST_BITS-PASS1_BITS);
2936     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
2937     dataptr[2] = (DCTELEM)
2938       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
2939               CONST_BITS-PASS1_BITS);
2940     dataptr[6] = (DCTELEM)
2941       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
2942               CONST_BITS-PASS1_BITS);
2943 
2944     /* Odd part */
2945 
2946     tmp10 = tmp0 + tmp4;
2947     tmp11 = tmp1 - tmp3;
2948     dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);


3013 /*
3014  * Perform the forward DCT on an 8x4 sample block.
3015  *
3016  * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
3017  */
3018 
3019 GLOBAL(void)
3020 jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3021 {
3022   INT32 tmp0, tmp1, tmp2, tmp3;
3023   INT32 tmp10, tmp11, tmp12, tmp13;
3024   INT32 z1;
3025   DCTELEM *dataptr;
3026   JSAMPROW elemptr;
3027   int ctr;
3028   SHIFT_TEMPS
3029 
3030   /* Zero 4 bottom rows of output coefficient block. */
3031   MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
3032 
3033   /* Pass 1: process rows.
3034    * Note results are scaled up by sqrt(8) compared to a true DCT;
3035    * furthermore, we scale the results by 2**PASS1_BITS.
3036    * We must also scale the output by 8/4 = 2, which we add here.
3037    * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3038    */
3039 
3040   dataptr = data;
3041   for (ctr = 0; ctr < 4; ctr++) {
3042     elemptr = sample_data[ctr] + start_col;
3043 
3044     /* Even part per LL&M figure 1 --- note that published figure is faulty;
3045      * rotator "c1" should be "c6".
3046      */
3047 
3048     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3049     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3050     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3051     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3052 
3053     tmp10 = tmp0 + tmp3;
3054     tmp12 = tmp0 - tmp3;
3055     tmp11 = tmp1 + tmp2;
3056     tmp13 = tmp1 - tmp2;
3057 
3058     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3059     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3060     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3061     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3062 
3063     /* Apply unsigned->signed conversion. */
3064     dataptr[0] = (DCTELEM)
3065       ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
3066     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
3067 
3068     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
3069     /* Add fudge factor here for final descale. */
3070     z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3071 
3072     dataptr[2] = (DCTELEM)
3073       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3074                   CONST_BITS-PASS1_BITS-1);
3075     dataptr[6] = (DCTELEM)
3076       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3077                   CONST_BITS-PASS1_BITS-1);
3078 
3079     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).

3080      * i0..i3 in the paper are tmp0..tmp3 here.
3081      */
3082 


3083     tmp12 = tmp0 + tmp2;
3084     tmp13 = tmp1 + tmp3;
3085 
3086     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
3087     /* Add fudge factor here for final descale. */
3088     z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3089 
3090     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */






3091     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */

3092     tmp12 += z1;
3093     tmp13 += z1;
3094 
3095     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
3096     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
3097     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
3098     tmp0 += z1 + tmp12;
3099     tmp3 += z1 + tmp13;
3100 
3101     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
3102     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
3103     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
3104     tmp1 += z1 + tmp13;
3105     tmp2 += z1 + tmp12;
3106 
3107     dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS-1);
3108     dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS-1);
3109     dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS-1);
3110     dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS-1);
3111 
3112     dataptr += DCTSIZE;         /* advance pointer to next row */
3113   }
3114 
3115   /* Pass 2: process columns.
3116    * We remove the PASS1_BITS scaling, but leave the results scaled up
3117    * by an overall factor of 8.
3118    * 4-point FDCT kernel,
3119    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3120    */
3121 
3122   dataptr = data;
3123   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3124     /* Even part */
3125 
3126     /* Add fudge factor here for final descale. */
3127     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
3128     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
3129 
3130     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
3131     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
3132 
3133     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3134     dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3135 
3136     /* Odd part */
3137 
3138     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
3139     /* Add fudge factor here for final descale. */


3153 
3154 /*
3155  * Perform the forward DCT on a 6x3 sample block.
3156  *
3157  * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
3158  */
3159 
3160 GLOBAL(void)
3161 jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3162 {
3163   INT32 tmp0, tmp1, tmp2;
3164   INT32 tmp10, tmp11, tmp12;
3165   DCTELEM *dataptr;
3166   JSAMPROW elemptr;
3167   int ctr;
3168   SHIFT_TEMPS
3169 
3170   /* Pre-zero output coefficient block. */
3171   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3172 
3173   /* Pass 1: process rows.
3174    * Note results are scaled up by sqrt(8) compared to a true DCT;
3175    * furthermore, we scale the results by 2**PASS1_BITS.
3176    * We scale the results further by 2 as part of the output adaptation
3177    * scaling for the different DCT size.
3178    * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3179    */
3180 
3181   dataptr = data;
3182   for (ctr = 0; ctr < 3; ctr++) {
3183     elemptr = sample_data[ctr] + start_col;
3184 
3185     /* Even part */
3186 
3187     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3188     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3189     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3190 
3191     tmp10 = tmp0 + tmp2;
3192     tmp12 = tmp0 - tmp2;
3193 
3194     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3195     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3196     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3197 
3198     /* Apply unsigned->signed conversion. */
3199     dataptr[0] = (DCTELEM)
3200       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
3201     dataptr[2] = (DCTELEM)
3202       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
3203               CONST_BITS-PASS1_BITS-1);
3204     dataptr[4] = (DCTELEM)
3205       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3206               CONST_BITS-PASS1_BITS-1);
3207 
3208     /* Odd part */
3209 
3210     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
3211                     CONST_BITS-PASS1_BITS-1);
3212 
3213     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
3214     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
3215     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
3216 
3217     dataptr += DCTSIZE;         /* advance pointer to next row */
3218   }


3254 
3255 /*
3256  * Perform the forward DCT on a 4x2 sample block.
3257  *
3258  * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
3259  */
3260 
3261 GLOBAL(void)
3262 jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3263 {
3264   INT32 tmp0, tmp1;
3265   INT32 tmp10, tmp11;
3266   DCTELEM *dataptr;
3267   JSAMPROW elemptr;
3268   int ctr;
3269   SHIFT_TEMPS
3270 
3271   /* Pre-zero output coefficient block. */
3272   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3273 
3274   /* Pass 1: process rows.
3275    * Note results are scaled up by sqrt(8) compared to a true DCT;
3276    * furthermore, we scale the results by 2**PASS1_BITS.
3277    * We must also scale the output by (8/4)*(8/2) = 2**3, which we add here.
3278    * 4-point FDCT kernel,
3279    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3280    */
3281 
3282   dataptr = data;
3283   for (ctr = 0; ctr < 2; ctr++) {
3284     elemptr = sample_data[ctr] + start_col;
3285 
3286     /* Even part */
3287 
3288     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3289     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3290 
3291     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3292     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3293 
3294     /* Apply unsigned->signed conversion. */
3295     dataptr[0] = (DCTELEM)
3296       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
3297     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
3298 
3299     /* Odd part */
3300 
3301     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
3302     /* Add fudge factor here for final descale. */
3303     tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
3304 
3305     dataptr[1] = (DCTELEM)
3306       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3307                   CONST_BITS-PASS1_BITS-3);
3308     dataptr[3] = (DCTELEM)
3309       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3310                   CONST_BITS-PASS1_BITS-3);
3311 
3312     dataptr += DCTSIZE;         /* advance pointer to next row */
3313   }
3314 


3328     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3329 
3330     /* Odd part */
3331 
3332     dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3333 
3334     dataptr++;                  /* advance pointer to next column */
3335   }
3336 }
3337 
3338 
3339 /*
3340  * Perform the forward DCT on a 2x1 sample block.
3341  * Input is the sample pair sample_data[0][start_col] and [start_col+1].
3342  * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
3343  * Output is data[0] and data[1]; the rest of the block is pre-zeroed. */
3344 
3345 GLOBAL(void)
3346 jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3347 {
3348   DCTELEM tmp0, tmp1;           /* the two input samples */
3349   JSAMPROW elemptr;             /* input row 0, offset to start_col */
3350 
3351   /* Pre-zero output coefficient block (all DCTSIZE2 entries). */
3352   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3353 
3354   elemptr = sample_data[0] + start_col;         /* only row 0 is read */
3355 
3356   tmp0 = GETJSAMPLE(elemptr[0]);
3357   tmp1 = GETJSAMPLE(elemptr[1]);
3358 
3359   /* We leave the results scaled up by an overall factor of 8.
3360    * We must also scale the output by (8/2)*(8/1) = 2**5.
3361    */
3362 
3363   /* Even part: sum of the two samples */
3364 
3365   /* Apply unsigned->signed conversion: one CENTERJSAMPLE per sample. */
3366   data[0] = (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5;
3367 
3368   /* Odd part: difference, in which the centering offset cancels */
3369 
3370   data[1] = (tmp0 - tmp1) << 5;
3371 }
3372 
3373 
3374 /*
3375  * Perform the forward DCT on an 8x16 sample block.
3376  *
3377  * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
3378  */
3379 
3380 GLOBAL(void)
3381 jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3382 {
3383   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3384   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
3385   INT32 z1;
3386   DCTELEM workspace[DCTSIZE2];
3387   DCTELEM *dataptr;
3388   DCTELEM *wsptr;
3389   JSAMPROW elemptr;
3390   int ctr;
3391   SHIFT_TEMPS
3392 
3393   /* Pass 1: process rows.
3394    * Note results are scaled up by sqrt(8) compared to a true DCT;
3395    * furthermore, we scale the results by 2**PASS1_BITS.
3396    * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3397    */
3398 
3399   dataptr = data;
3400   ctr = 0;
3401   for (;;) {
3402     elemptr = sample_data[ctr] + start_col;
3403 
3404     /* Even part per LL&M figure 1 --- note that published figure is faulty;
3405      * rotator "c1" should be "c6".
3406      */
3407 
3408     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3409     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3410     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3411     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3412 
3413     tmp10 = tmp0 + tmp3;
3414     tmp12 = tmp0 - tmp3;
3415     tmp11 = tmp1 + tmp2;
3416     tmp13 = tmp1 - tmp2;
3417 
3418     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3419     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3420     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3421     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3422 
3423     /* Apply unsigned->signed conversion. */
3424     dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
3425     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
3426 
3427     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);   /* c6 */
3428     dataptr[2] = (DCTELEM)
3429       DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3430               CONST_BITS-PASS1_BITS);
3431     dataptr[6] = (DCTELEM)
3432       DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3433               CONST_BITS-PASS1_BITS);
3434 
3435     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).

3436      * i0..i3 in the paper are tmp0..tmp3 here.
3437      */
3438 


3439     tmp12 = tmp0 + tmp2;
3440     tmp13 = tmp1 + tmp3;
3441 
3442     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);   /*  c3 */
3443     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);      /* -c3+c5 */
3444     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);      /* -c3-c5 */
3445     tmp12 += z1;
3446     tmp13 += z1;
3447 
3448     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);   /* -c3+c7 */
3449     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);          /*  c1+c3-c5-c7 */


3450     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);          /* -c1+c3+c5-c7 */
3451     tmp0 += z1 + tmp12;
3452     tmp3 += z1 + tmp13;


3453 
3454     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);   /* -c1-c3 */
3455     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);          /*  c1+c3+c5-c7 */
3456     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);          /*  c1+c3-c5+c7 */
3457     tmp1 += z1 + tmp13;
3458     tmp2 += z1 + tmp12;
3459 
3460     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3461     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3462     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3463     dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-PASS1_BITS);
3464 
3465     ctr++;
3466 
3467     if (ctr != DCTSIZE) {
3468       if (ctr == DCTSIZE * 2)
3469         break;                  /* Done. */
3470       dataptr += DCTSIZE;       /* advance pointer to next row */
3471     } else
3472       dataptr = workspace;      /* switch pointer to extended workspace */
3473   }
3474 
3475   /* Pass 2: process columns.
3476    * We remove the PASS1_BITS scaling, but leave the results scaled up
3477    * by an overall factor of 8.
3478    * We must also scale the output by 8/16 = 1/2.
3479    * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3480    */
3481 
3482   dataptr = data;
3483   wsptr = workspace;


3570  *
3571  * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
3572  */
3573 
3574 GLOBAL(void)
3575 jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3576 {
3577   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
3578   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3579   INT32 z1, z2, z3;
3580   DCTELEM workspace[8*6];
3581   DCTELEM *dataptr;
3582   DCTELEM *wsptr;
3583   JSAMPROW elemptr;
3584   int ctr;
3585   SHIFT_TEMPS
3586 
3587   /* Pre-zero output coefficient block. */
3588   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3589 
3590   /* Pass 1: process rows.
3591    * Note results are scaled up by sqrt(8) compared to a true DCT;
3592    * furthermore, we scale the results by 2**PASS1_BITS.
3593    * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3594    */
3595 
3596   dataptr = data;
3597   ctr = 0;
3598   for (;;) {
3599     elemptr = sample_data[ctr] + start_col;
3600 
3601     /* Even part */
3602 
3603     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
3604     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
3605     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
3606     tmp3 = GETJSAMPLE(elemptr[3]);
3607 
3608     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
3609     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
3610     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
3611 
3612     z1 = tmp0 + tmp2;
3613     /* Apply unsigned->signed conversion. */
3614     dataptr[0] = (DCTELEM)
3615       ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
3616     tmp3 += tmp3;
3617     z1 -= tmp3;
3618     z1 -= tmp3;
3619     z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
3620     z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
3621     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
3622     dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
3623     z1 -= z2;
3624     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
3625     dataptr[4] = (DCTELEM)
3626       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
3627               CONST_BITS-PASS1_BITS);
3628     dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
3629 
3630     /* Odd part */
3631 
3632     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
3633     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */


3751  * Perform the forward DCT on a 6x12 sample block.
3752  *
3753  * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
3754  */
3755 
3756 GLOBAL(void)
3757 jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3758 {
3759   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3760   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3761   DCTELEM workspace[8*4];
3762   DCTELEM *dataptr;
3763   DCTELEM *wsptr;
3764   JSAMPROW elemptr;
3765   int ctr;
3766   SHIFT_TEMPS
3767 
3768   /* Pre-zero output coefficient block. */
3769   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3770 
3771   /* Pass 1: process rows.
3772    * Note results are scaled up by sqrt(8) compared to a true DCT;
3773    * furthermore, we scale the results by 2**PASS1_BITS.
3774    * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3775    */
3776 
3777   dataptr = data;
3778   ctr = 0;
3779   for (;;) {
3780     elemptr = sample_data[ctr] + start_col;
3781 
3782     /* Even part */
3783 
3784     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3785     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3786     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3787 
3788     tmp10 = tmp0 + tmp2;
3789     tmp12 = tmp0 - tmp2;
3790 
3791     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3792     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3793     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3794 
3795     /* Apply unsigned->signed conversion. */
3796     dataptr[0] = (DCTELEM)
3797       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
3798     dataptr[2] = (DCTELEM)
3799       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
3800               CONST_BITS-PASS1_BITS);
3801     dataptr[4] = (DCTELEM)
3802       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3803               CONST_BITS-PASS1_BITS);
3804 
3805     /* Odd part */
3806 
3807     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
3808                     CONST_BITS-PASS1_BITS);
3809 
3810     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
3811     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
3812     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
3813 
3814     ctr++;
3815 


3901  * Perform the forward DCT on a 5x10 sample block.
3902  *
3903  * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
3904  */
3905 
3906 GLOBAL(void)
3907 jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3908 {
3909   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
3910   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3911   DCTELEM workspace[8*2];
3912   DCTELEM *dataptr;
3913   DCTELEM *wsptr;
3914   JSAMPROW elemptr;
3915   int ctr;
3916   SHIFT_TEMPS
3917 
3918   /* Pre-zero output coefficient block. */
3919   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3920 
3921   /* Pass 1: process rows.
3922    * Note results are scaled up by sqrt(8) compared to a true DCT;
3923    * furthermore, we scale the results by 2**PASS1_BITS.
3924    * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
3925    */
3926 
3927   dataptr = data;
3928   ctr = 0;
3929   for (;;) {
3930     elemptr = sample_data[ctr] + start_col;
3931 
3932     /* Even part */
3933 
3934     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
3935     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
3936     tmp2 = GETJSAMPLE(elemptr[2]);
3937 
3938     tmp10 = tmp0 + tmp1;
3939     tmp11 = tmp0 - tmp1;
3940 
3941     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
3942     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
3943 
3944     /* Apply unsigned->signed conversion. */
3945     dataptr[0] = (DCTELEM)
3946       ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
3947     tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
3948     tmp10 -= tmp2 << 2;
3949     tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
3950     dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
3951     dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
3952 
3953     /* Odd part */
3954 
3955     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
3956 
3957     dataptr[1] = (DCTELEM)
3958       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
3959               CONST_BITS-PASS1_BITS);
3960     dataptr[3] = (DCTELEM)
3961       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
3962               CONST_BITS-PASS1_BITS);
3963 
3964     ctr++;


4047 /*
4048  * Perform the forward DCT on a 4x8 sample block.
4049  * Reads 4 samples from each of DCTSIZE rows; writes output columns 0..3.
4050  * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
4051  */
4052 
4053 GLOBAL(void)
4054 jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4055 {
4056   INT32 tmp0, tmp1, tmp2, tmp3;         /* butterfly terms, reused by both passes */
4057   INT32 tmp10, tmp11, tmp12, tmp13;     /* second-stage butterfly terms */
4058   INT32 z1;                             /* shared rotator product (pass 2 only) */
4059   DCTELEM *dataptr;                     /* steps by row in pass 1, by column in pass 2 */
4060   JSAMPROW elemptr;                     /* current input row, offset to start_col */
4061   int ctr;
4062   SHIFT_TEMPS
4063 
4064   /* Pre-zero output coefficient block. */
4065   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4066 
4067   /* Pass 1: process rows.
4068    * Note results are scaled up by sqrt(8) compared to a true DCT;
4069    * furthermore, we scale the results by 2**PASS1_BITS.
4070    * We must also scale the output by 8/4 = 2, which we add here.
4071    * 4-point FDCT kernel,
4072    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4073    */
4074 
4075   dataptr = data;
4076   for (ctr = 0; ctr < DCTSIZE; ctr++) {
4077     elemptr = sample_data[ctr] + start_col;
4078 
4079     /* Even part: row outputs 0 and 2 */
4080 
4081     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
4082     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
4083 
4084     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
4085     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
4086 
4087     /* Apply unsigned->signed conversion: one CENTERJSAMPLE per sample. */
4088     dataptr[0] = (DCTELEM)
4089       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
4090     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
4091 
4092     /* Odd part: row outputs 1 and 3; tmp0 is reused as the shared product */
4093 
4094     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
4095     /* Add fudge factor here for final descale (carried by tmp0 into both outputs). */
4096     tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
4097 
4098     dataptr[1] = (DCTELEM)
4099       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4100                   CONST_BITS-PASS1_BITS-1);
4101     dataptr[3] = (DCTELEM)
4102       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4103                   CONST_BITS-PASS1_BITS-1);
4104 
4105     dataptr += DCTSIZE;         /* advance pointer to next row */
4106   }
4107 
4108   /* Pass 2: process columns.
4109    * We remove the PASS1_BITS scaling, but leave the results scaled up
4110    * by an overall factor of 8.
4111    * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4112    */
4113 
4114   dataptr = data;
4115   for (ctr = 0; ctr < 4; ctr++) {       /* only the 4 columns written by pass 1 */
4116     /* Even part per LL&M figure 1 --- note that published figure is faulty;
4117      * rotator "c1" should be "c6".
4118      */
4119 
4120     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
4121     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
4122     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
4123     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
4124 
4125     /* Add fudge factor here for final descale (carried by tmp10 into outputs 0 and 4). */
4126     tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
4127     tmp12 = tmp0 - tmp3;
4128     tmp11 = tmp1 + tmp2;
4129     tmp13 = tmp1 - tmp2;
4130 
4131     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
4132     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
4133     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
4134     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
4135 
4136     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
4137     dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
4138 
4139     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
4140     /* Add fudge factor here for final descale (carried by z1 into outputs 2 and 6). */
4141     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4142 
4143     dataptr[DCTSIZE*2] = (DCTELEM)
4144       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
4145                   CONST_BITS+PASS1_BITS);
4146     dataptr[DCTSIZE*6] = (DCTELEM)
4147       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
4148                   CONST_BITS+PASS1_BITS);
4149 
4150     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).

4151      * i0..i3 in the paper are tmp0..tmp3 here.
4152      */
4153 


4154     tmp12 = tmp0 + tmp2;
4155     tmp13 = tmp1 + tmp3;
4156 
4157     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
4158     /* Add fudge factor here for final descale (reaches each odd output once, via tmp12 or tmp13). */
4159     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4160 
4161     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */






4162     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */

4163     tmp12 += z1;
4164     tmp13 += z1;
4165 
4166     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
4167     tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
4168     tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
4169     tmp0 += z1 + tmp12;
4170     tmp3 += z1 + tmp13;
4171 
4172     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
4173     tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
4174     tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
4175     tmp1 += z1 + tmp13;
4176     tmp2 += z1 + tmp12;
4177 
4178     dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS1_BITS);
4179     dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS1_BITS);
4180     dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS1_BITS);
4181     dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS1_BITS);
4182 
4183     dataptr++;                  /* advance pointer to next column */
4184   }
4185 }
4186 
4187 
4188 /*
4189  * Perform the forward DCT on a 3x6 sample block.
4190  *
4191  * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
4192  */
4193 
4194 GLOBAL(void)
4195 jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4196 {
4197   INT32 tmp0, tmp1, tmp2;
4198   INT32 tmp10, tmp11, tmp12;
4199   DCTELEM *dataptr;
4200   JSAMPROW elemptr;
4201   int ctr;
4202   SHIFT_TEMPS
4203 
4204   /* Pre-zero output coefficient block. */
4205   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4206 
4207   /* Pass 1: process rows.
4208    * Note results are scaled up by sqrt(8) compared to a true DCT;
4209    * furthermore, we scale the results by 2**PASS1_BITS.
4210    * We scale the results further by 2 as part of the output adaptation
4211    * scaling for the different DCT size.
4212    * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
4213    */
4214 
4215   dataptr = data;
4216   for (ctr = 0; ctr < 6; ctr++) {
4217     elemptr = sample_data[ctr] + start_col;
4218 
4219     /* Even part */
4220 
4221     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
4222     tmp1 = GETJSAMPLE(elemptr[1]);
4223 
4224     tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
4225 
4226     /* Apply unsigned->signed conversion. */
4227     dataptr[0] = (DCTELEM)
4228       ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
4229     dataptr[2] = (DCTELEM)
4230       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
4231               CONST_BITS-PASS1_BITS-1);
4232 
4233     /* Odd part */
4234 
4235     dataptr[1] = (DCTELEM)
4236       DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
4237               CONST_BITS-PASS1_BITS-1);
4238 
4239     dataptr += DCTSIZE;         /* advance pointer to next row */
4240   }
4241 
4242   /* Pass 2: process columns.
4243    * We remove the PASS1_BITS scaling, but leave the results scaled up
4244    * by an overall factor of 8.
4245    * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
4246    * fold into the constant multipliers (other part was done in pass 1):


4293 
4294 /*
4295  * Perform the forward DCT on a 2x4 sample block.
4296  *
4297  * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
4298  */
4299 
4300 GLOBAL(void)
4301 jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4302 {
4303   INT32 tmp0, tmp1;
4304   INT32 tmp10, tmp11;
4305   DCTELEM *dataptr;
4306   JSAMPROW elemptr;
4307   int ctr;
4308   SHIFT_TEMPS
4309 
4310   /* Pre-zero output coefficient block. */
4311   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4312 
4313   /* Pass 1: process rows.
4314    * Note results are scaled up by sqrt(8) compared to a true DCT.
4315    * We must also scale the output by (8/2)*(8/4) = 2**3, which we add here.
4316    */
4317 
4318   dataptr = data;
4319   for (ctr = 0; ctr < 4; ctr++) {
4320     elemptr = sample_data[ctr] + start_col;
4321 
4322     /* Even part */
4323 
4324     tmp0 = GETJSAMPLE(elemptr[0]);
4325     tmp1 = GETJSAMPLE(elemptr[1]);
4326 
4327     /* Apply unsigned->signed conversion. */
4328     dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
4329 
4330     /* Odd part */
4331 
4332     dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
4333 
4334     dataptr += DCTSIZE;         /* advance pointer to next row */
4335   }
4336 
4337   /* Pass 2: process columns.
4338    * We leave the results scaled up by an overall factor of 8.
4339    * 4-point FDCT kernel,
4340    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4341    */
4342 
4343   dataptr = data;
4344   for (ctr = 0; ctr < 2; ctr++) {
4345     /* Even part */
4346 
4347     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];


4363       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4364                   CONST_BITS);
4365     dataptr[DCTSIZE*3] = (DCTELEM)
4366       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4367                   CONST_BITS);
4368 
4369     dataptr++;                  /* advance pointer to next column */
4370   }
4371 }
4372 
4373 
4374 /*
4375  * Perform the forward DCT on a 1x2 sample block.
4376  * Input is sample_data[0][start_col] and sample_data[1][start_col].
4377  * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
4378  * Output is data[DCTSIZE*0] and data[DCTSIZE*1]; the rest is pre-zeroed. */
4379 
4380 GLOBAL(void)
4381 jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4382 {
4383   DCTELEM tmp0, tmp1;           /* the two input samples (rows 0 and 1) */
4384 
4385   /* Pre-zero output coefficient block (all DCTSIZE2 entries). */
4386   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4387 
4388   /* Pass 1: empty (each row holds a single sample). */

4389 
4390   /* Pass 2: process columns.
4391    * We leave the results scaled up by an overall factor of 8.
4392    * We must also scale the output by (8/1)*(8/2) = 2**5.
4393    */
4394 
4395   /* Even part: sum of the two samples */
4396 
4397   tmp0 = GETJSAMPLE(sample_data[0][start_col]);
4398   tmp1 = GETJSAMPLE(sample_data[1][start_col]);
4399 
4400   /* Apply unsigned->signed conversion: one CENTERJSAMPLE per sample. */
4401   data[DCTSIZE*0] = (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5;
4402 
4403   /* Odd part: difference, in which the centering offset cancels */
4404 
4405   data[DCTSIZE*1] = (tmp0 - tmp1) << 5;
4406 }
4407 
4408 #endif /* DCT_SCALING_SUPPORTED */
4409 #endif /* DCT_ISLOW_SUPPORTED */
< prev index next >