1 /*
   2  * jidctint.c
   3  *
   4  * Copyright (C) 1991-1998, Thomas G. Lane.
   5  * Modification developed 2002-2009 by Guido Vollbeding.
   6  * This file is part of the Independent JPEG Group's software.
   7  * For conditions of distribution and use, see the accompanying README file.
   8  *
   9  * This file contains a slow-but-accurate integer implementation of the
  10  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
  11  * must also perform dequantization of the input coefficients.
  12  *
  13  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
  14  * on each row (or vice versa, but it's more convenient to emit a row at
  15  * a time).  Direct algorithms are also available, but they are much more
  16  * complex and seem not to be any faster when reduced to code.
  17  *
  18  * This implementation is based on an algorithm described in
  19  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  20  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  21  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  22  * The primary algorithm described there uses 11 multiplies and 29 adds.
  23  * We use their alternate method with 12 multiplies and 32 adds.
  24  * The advantage of this method is that no data path contains more than one
  25  * multiplication; this allows a very simple and accurate implementation in
  26  * scaled fixed-point arithmetic, with a minimal number of shifts.
  27  *
  28  * We also provide IDCT routines with various output sample block sizes for
  29  * direct resolution reduction or enlargement and for direct resolving the
  30  * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
  31  * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
  32  *
  33  * For N<8 we simply take the corresponding low-frequency coefficients of
  34  * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
  35  * to yield the downscaled outputs.
  36  * This can be seen as direct low-pass downsampling from the DCT domain
  37  * point of view rather than the usual spatial domain point of view,
  38  * yielding significant computational savings and results at least
  39  * as good as common bilinear (averaging) spatial downsampling.
  40  *
 * For N>8 we apply a partial NxN IDCT, treating the 8 input coefficients as
 * the lower frequencies and assuming the higher frequencies to be zero.
  43  * It turns out that the computational effort is similar to the 8x8 IDCT
  44  * regarding the output size.
  45  * Furthermore, the scaling and descaling is the same for all IDCT sizes.
  46  *
  47  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
  48  * since there would be too many additional constants to pre-calculate.
  49  */
  50 
  51 #define JPEG_INTERNALS
  52 #include "jinclude.h"
  53 #include "jpeglib.h"
  54 #include "jdct.h"               /* Private declarations for DCT subsystem */
  55 
  56 #ifdef DCT_ISLOW_SUPPORTED
  57 
  58 
  59 /*
  60  * This module is specialized to the case DCTSIZE = 8.
  61  */
  62 
  63 #if DCTSIZE != 8
  64   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
  65 #endif
  66 
  67 
  68 /*
  69  * The poop on this scaling stuff is as follows:
  70  *
  71  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
  72  * larger than the true IDCT outputs.  The final outputs are therefore
  73  * a factor of N larger than desired; since N=8 this can be cured by
  74  * a simple right shift at the end of the algorithm.  The advantage of
  75  * this arrangement is that we save two multiplications per 1-D IDCT,
  76  * because the y0 and y4 inputs need not be divided by sqrt(N).
  77  *
  78  * We have to do addition and subtraction of the integer inputs, which
  79  * is no problem, and multiplication by fractional constants, which is
  80  * a problem to do in integer arithmetic.  We multiply all the constants
  81  * by CONST_SCALE and convert them to integer constants (thus retaining
  82  * CONST_BITS bits of precision in the constants).  After doing a
  83  * multiplication we have to divide the product by CONST_SCALE, with proper
  84  * rounding, to produce the correct output.  This division can be done
  85  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
  86  * as long as possible so that partial sums can be added together with
  87  * full fractional precision.
  88  *
  89  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
  90  * they are represented to better-than-integral precision.  These outputs
  91  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
  92  * with the recommended scaling.  (To scale up 12-bit sample data further, an
  93  * intermediate INT32 array would be needed.)
  94  *
  95  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  96  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
  97  * shows that the values given below are the most effective.
  98  */
  99 
 100 #if BITS_IN_JSAMPLE == 8
 101 #define CONST_BITS  13
 102 #define PASS1_BITS  2
 103 #else
 104 #define CONST_BITS  13
 105 #define PASS1_BITS  1           /* lose a little precision to avoid overflow */
 106 #endif
 107 
 108 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 109  * causing a lot of useless floating-point operations at run time.
 110  * To get around this we use the following pre-calculated constants.
 111  * If you change CONST_BITS you may want to add appropriate values.
 112  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 113  */
 114 
 115 #if CONST_BITS == 13
 116 #define FIX_0_298631336  ((INT32)  2446)        /* FIX(0.298631336) */
 117 #define FIX_0_390180644  ((INT32)  3196)        /* FIX(0.390180644) */
 118 #define FIX_0_541196100  ((INT32)  4433)        /* FIX(0.541196100) */
 119 #define FIX_0_765366865  ((INT32)  6270)        /* FIX(0.765366865) */
 120 #define FIX_0_899976223  ((INT32)  7373)        /* FIX(0.899976223) */
 121 #define FIX_1_175875602  ((INT32)  9633)        /* FIX(1.175875602) */
 122 #define FIX_1_501321110  ((INT32)  12299)       /* FIX(1.501321110) */
 123 #define FIX_1_847759065  ((INT32)  15137)       /* FIX(1.847759065) */
 124 #define FIX_1_961570560  ((INT32)  16069)       /* FIX(1.961570560) */
 125 #define FIX_2_053119869  ((INT32)  16819)       /* FIX(2.053119869) */
 126 #define FIX_2_562915447  ((INT32)  20995)       /* FIX(2.562915447) */
 127 #define FIX_3_072711026  ((INT32)  25172)       /* FIX(3.072711026) */
 128 #else
 129 #define FIX_0_298631336  FIX(0.298631336)
 130 #define FIX_0_390180644  FIX(0.390180644)
 131 #define FIX_0_541196100  FIX(0.541196100)
 132 #define FIX_0_765366865  FIX(0.765366865)
 133 #define FIX_0_899976223  FIX(0.899976223)
 134 #define FIX_1_175875602  FIX(1.175875602)
 135 #define FIX_1_501321110  FIX(1.501321110)
 136 #define FIX_1_847759065  FIX(1.847759065)
 137 #define FIX_1_961570560  FIX(1.961570560)
 138 #define FIX_2_053119869  FIX(2.053119869)
 139 #define FIX_2_562915447  FIX(2.562915447)
 140 #define FIX_3_072711026  FIX(3.072711026)
 141 #endif
 142 
 143 
 144 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
 145  * For 8-bit samples with the recommended scaling, all the variable
 146  * and constant values involved are no more than 16 bits wide, so a
 147  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 148  * For 12-bit samples, a full 32-bit multiplication will be needed.
 149  */
 150 
 151 #if BITS_IN_JSAMPLE == 8
 152 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
 153 #else
 154 #define MULTIPLY(var,const)  ((var) * (const))
 155 #endif
 156 
 157 
 158 /* Dequantize a coefficient by multiplying it by the multiplier-table
 159  * entry; produce an int result.  In this module, both inputs and result
 160  * are 16 bits or less, so either int or short multiply will work.
 161  */
 162 
 163 #define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
 164 
 165 
 166 /*
 167  * Perform dequantization and inverse DCT on one block of coefficients.
 168  */
 169 
 170 GLOBAL(void)
 171 jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 172                  JCOEFPTR coef_block,
 173                  JSAMPARRAY output_buf, JDIMENSION output_col)
 174 {
 175   INT32 tmp0, tmp1, tmp2, tmp3;
 176   INT32 tmp10, tmp11, tmp12, tmp13;
 177   INT32 z1, z2, z3;
 178   JCOEFPTR inptr;
 179   ISLOW_MULT_TYPE * quantptr;
 180   int * wsptr;
 181   JSAMPROW outptr;
 182   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 183   int ctr;
 184   int workspace[DCTSIZE2];      /* buffers data between passes */
 185   SHIFT_TEMPS
 186 
 187   /* Pass 1: process columns from input, store into work array. */
 188   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
 189   /* furthermore, we scale the results by 2**PASS1_BITS. */
 190 
 191   inptr = coef_block;
 192   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 193   wsptr = workspace;
 194   for (ctr = DCTSIZE; ctr > 0; ctr--) {
 195     /* Due to quantization, we will usually find that many of the input
 196      * coefficients are zero, especially the AC terms.  We can exploit this
 197      * by short-circuiting the IDCT calculation for any column in which all
 198      * the AC terms are zero.  In that case each output is equal to the
 199      * DC coefficient (with scale factor as needed).
 200      * With typical images and quantization tables, half or more of the
 201      * column DCT calculations can be simplified this way.
 202      */
 203 
 204     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
 205         inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
 206         inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
 207         inptr[DCTSIZE*7] == 0) {
 208       /* AC terms all zero */
 209       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
 210 
 211       wsptr[DCTSIZE*0] = dcval;
 212       wsptr[DCTSIZE*1] = dcval;
 213       wsptr[DCTSIZE*2] = dcval;
 214       wsptr[DCTSIZE*3] = dcval;
 215       wsptr[DCTSIZE*4] = dcval;
 216       wsptr[DCTSIZE*5] = dcval;
 217       wsptr[DCTSIZE*6] = dcval;
 218       wsptr[DCTSIZE*7] = dcval;
 219 
 220       inptr++;                  /* advance pointers to next column */
 221       quantptr++;
 222       wsptr++;
 223       continue;
 224     }
 225 
 226     /* Even part: reverse the even part of the forward DCT. */
 227     /* The rotator is sqrt(2)*c(-6). */
 228 
 229     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 230     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
 231 
 232     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
 233     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
 234     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
 235 
 236     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 237     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 238     z2 <<= CONST_BITS;
 239     z3 <<= CONST_BITS;
 240     /* Add fudge factor here for final descale. */
 241     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
 242 
 243     tmp0 = z2 + z3;
 244     tmp1 = z2 - z3;
 245 
 246     tmp10 = tmp0 + tmp2;
 247     tmp13 = tmp0 - tmp2;
 248     tmp11 = tmp1 + tmp3;
 249     tmp12 = tmp1 - tmp3;
 250 
 251     /* Odd part per figure 8; the matrix is unitary and hence its
 252      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
 253      */
 254 
 255     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
 256     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
 257     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 258     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 259 
 260     z2 = tmp0 + tmp2;
 261     z3 = tmp1 + tmp3;
 262 
 263     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
 264     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
 265     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
 266     z2 += z1;
 267     z3 += z1;
 268 
 269     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
 270     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
 271     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
 272     tmp0 += z1 + z2;
 273     tmp3 += z1 + z3;
 274 
 275     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
 276     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
 277     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
 278     tmp1 += z1 + z3;
 279     tmp2 += z1 + z2;
 280 
 281     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 282 
 283     wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
 284     wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
 285     wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
 286     wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
 287     wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
 288     wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
 289     wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
 290     wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
 291 
 292     inptr++;                    /* advance pointers to next column */
 293     quantptr++;
 294     wsptr++;
 295   }
 296 
 297   /* Pass 2: process rows from work array, store into output array. */
 298   /* Note that we must descale the results by a factor of 8 == 2**3, */
 299   /* and also undo the PASS1_BITS scaling. */
 300 
 301   wsptr = workspace;
 302   for (ctr = 0; ctr < DCTSIZE; ctr++) {
 303     outptr = output_buf[ctr] + output_col;
 304     /* Rows of zeroes can be exploited in the same way as we did with columns.
 305      * However, the column calculation has created many nonzero AC terms, so
 306      * the simplification applies less often (typically 5% to 10% of the time).
 307      * On machines with very fast multiplication, it's possible that the
 308      * test takes more time than it's worth.  In that case this section
 309      * may be commented out.
 310      */
 311 
 312 #ifndef NO_ZERO_ROW_TEST
 313     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
 314         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
 315       /* AC terms all zero */
 316       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
 317                                   & RANGE_MASK];
 318 
 319       outptr[0] = dcval;
 320       outptr[1] = dcval;
 321       outptr[2] = dcval;
 322       outptr[3] = dcval;
 323       outptr[4] = dcval;
 324       outptr[5] = dcval;
 325       outptr[6] = dcval;
 326       outptr[7] = dcval;
 327 
 328       wsptr += DCTSIZE;         /* advance pointer to next row */
 329       continue;
 330     }
 331 #endif
 332 
 333     /* Even part: reverse the even part of the forward DCT. */
 334     /* The rotator is sqrt(2)*c(-6). */
 335 
 336     z2 = (INT32) wsptr[2];
 337     z3 = (INT32) wsptr[6];
 338 
 339     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
 340     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
 341     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
 342 
 343     /* Add fudge factor here for final descale. */
 344     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 345     z3 = (INT32) wsptr[4];
 346 
 347     tmp0 = (z2 + z3) << CONST_BITS;
 348     tmp1 = (z2 - z3) << CONST_BITS;
 349 
 350     tmp10 = tmp0 + tmp2;
 351     tmp13 = tmp0 - tmp2;
 352     tmp11 = tmp1 + tmp3;
 353     tmp12 = tmp1 - tmp3;
 354 
 355     /* Odd part per figure 8; the matrix is unitary and hence its
 356      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
 357      */
 358 
 359     tmp0 = (INT32) wsptr[7];
 360     tmp1 = (INT32) wsptr[5];
 361     tmp2 = (INT32) wsptr[3];
 362     tmp3 = (INT32) wsptr[1];
 363 
 364     z2 = tmp0 + tmp2;
 365     z3 = tmp1 + tmp3;
 366 
 367     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
 368     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
 369     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
 370     z2 += z1;
 371     z3 += z1;
 372 
 373     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
 374     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
 375     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
 376     tmp0 += z1 + z2;
 377     tmp3 += z1 + z3;
 378 
 379     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
 380     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
 381     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
 382     tmp1 += z1 + z3;
 383     tmp2 += z1 + z2;
 384 
 385     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 386 
 387     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
 388                                               CONST_BITS+PASS1_BITS+3)
 389                             & RANGE_MASK];
 390     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
 391                                               CONST_BITS+PASS1_BITS+3)
 392                             & RANGE_MASK];
 393     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
 394                                               CONST_BITS+PASS1_BITS+3)
 395                             & RANGE_MASK];
 396     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
 397                                               CONST_BITS+PASS1_BITS+3)
 398                             & RANGE_MASK];
 399     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
 400                                               CONST_BITS+PASS1_BITS+3)
 401                             & RANGE_MASK];
 402     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
 403                                               CONST_BITS+PASS1_BITS+3)
 404                             & RANGE_MASK];
 405     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
 406                                               CONST_BITS+PASS1_BITS+3)
 407                             & RANGE_MASK];
 408     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
 409                                               CONST_BITS+PASS1_BITS+3)
 410                             & RANGE_MASK];
 411 
 412     wsptr += DCTSIZE;           /* advance pointer to next row */
 413   }
 414 }
 415 
 416 #ifdef IDCT_SCALING_SUPPORTED
 417 
 418 
 419 /*
 420  * Perform dequantization and inverse DCT on one block of coefficients,
 421  * producing a 7x7 output block.
 422  *
 423  * Optimized algorithm with 12 multiplications in the 1-D kernel.
 424  * cK represents sqrt(2) * cos(K*pi/14).
 425  */
 426 
 427 GLOBAL(void)
 428 jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 429                JCOEFPTR coef_block,
 430                JSAMPARRAY output_buf, JDIMENSION output_col)
 431 {
 432   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
 433   INT32 z1, z2, z3;
 434   JCOEFPTR inptr;
 435   ISLOW_MULT_TYPE * quantptr;
 436   int * wsptr;
 437   JSAMPROW outptr;
 438   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 439   int ctr;
 440   int workspace[7*7];   /* buffers data between passes */
 441   SHIFT_TEMPS
 442 
 443   /* Pass 1: process columns from input, store into work array. */
 444 
 445   inptr = coef_block;
 446   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 447   wsptr = workspace;
 448   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
 449     /* Even part */
 450 
 451     tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 452     tmp13 <<= CONST_BITS;
 453     /* Add fudge factor here for final descale. */
 454     tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
 455 
 456     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 457     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 458     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
 459 
 460     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
 461     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
 462     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
 463     tmp0 = z1 + z3;
 464     z2 -= tmp0;
 465     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
 466     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
 467     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
 468     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
 469 
 470     /* Odd part */
 471 
 472     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 473     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 474     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
 475 
 476     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
 477     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
 478     tmp0 = tmp1 - tmp2;
 479     tmp1 += tmp2;
 480     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
 481     tmp1 += tmp2;
 482     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
 483     tmp0 += z2;
 484     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
 485 
 486     /* Final output stage */
 487 
 488     wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
 489     wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
 490     wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
 491     wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
 492     wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
 493     wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
 494     wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
 495   }
 496 
 497   /* Pass 2: process 7 rows from work array, store into output array. */
 498 
 499   wsptr = workspace;
 500   for (ctr = 0; ctr < 7; ctr++) {
 501     outptr = output_buf[ctr] + output_col;
 502 
 503     /* Even part */
 504 
 505     /* Add fudge factor here for final descale. */
 506     tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 507     tmp13 <<= CONST_BITS;
 508 
 509     z1 = (INT32) wsptr[2];
 510     z2 = (INT32) wsptr[4];
 511     z3 = (INT32) wsptr[6];
 512 
 513     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
 514     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
 515     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
 516     tmp0 = z1 + z3;
 517     z2 -= tmp0;
 518     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
 519     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
 520     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
 521     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
 522 
 523     /* Odd part */
 524 
 525     z1 = (INT32) wsptr[1];
 526     z2 = (INT32) wsptr[3];
 527     z3 = (INT32) wsptr[5];
 528 
 529     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
 530     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
 531     tmp0 = tmp1 - tmp2;
 532     tmp1 += tmp2;
 533     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
 534     tmp1 += tmp2;
 535     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
 536     tmp0 += z2;
 537     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
 538 
 539     /* Final output stage */
 540 
 541     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
 542                                               CONST_BITS+PASS1_BITS+3)
 543                             & RANGE_MASK];
 544     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
 545                                               CONST_BITS+PASS1_BITS+3)
 546                             & RANGE_MASK];
 547     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
 548                                               CONST_BITS+PASS1_BITS+3)
 549                             & RANGE_MASK];
 550     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
 551                                               CONST_BITS+PASS1_BITS+3)
 552                             & RANGE_MASK];
 553     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
 554                                               CONST_BITS+PASS1_BITS+3)
 555                             & RANGE_MASK];
 556     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
 557                                               CONST_BITS+PASS1_BITS+3)
 558                             & RANGE_MASK];
 559     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
 560                                               CONST_BITS+PASS1_BITS+3)
 561                             & RANGE_MASK];
 562 
 563     wsptr += 7;         /* advance pointer to next row */
 564   }
 565 }
 566 
 567 
 568 /*
 569  * Perform dequantization and inverse DCT on one block of coefficients,
 570  * producing a reduced-size 6x6 output block.
 571  *
 572  * Optimized algorithm with 3 multiplications in the 1-D kernel.
 573  * cK represents sqrt(2) * cos(K*pi/12).
 574  */
 575 
 576 GLOBAL(void)
 577 jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 578                JCOEFPTR coef_block,
 579                JSAMPARRAY output_buf, JDIMENSION output_col)
 580 {
 581   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
 582   INT32 z1, z2, z3;
 583   JCOEFPTR inptr;
 584   ISLOW_MULT_TYPE * quantptr;
 585   int * wsptr;
 586   JSAMPROW outptr;
 587   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 588   int ctr;
 589   int workspace[6*6];   /* buffers data between passes */
 590   SHIFT_TEMPS
 591 
 592   /* Pass 1: process columns from input, store into work array. */
 593 
 594   inptr = coef_block;
 595   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 596   wsptr = workspace;
 597   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
 598     /* Even part */
 599 
 600     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 601     tmp0 <<= CONST_BITS;
 602     /* Add fudge factor here for final descale. */
 603     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
 604     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 605     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
 606     tmp1 = tmp0 + tmp10;
 607     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
 608     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 609     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
 610     tmp10 = tmp1 + tmp0;
 611     tmp12 = tmp1 - tmp0;
 612 
 613     /* Odd part */
 614 
 615     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 616     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 617     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
 618     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
 619     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
 620     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
 621     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
 622 
 623     /* Final output stage */
 624 
 625     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
 626     wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
 627     wsptr[6*1] = (int) (tmp11 + tmp1);
 628     wsptr[6*4] = (int) (tmp11 - tmp1);
 629     wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
 630     wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
 631   }
 632 
 633   /* Pass 2: process 6 rows from work array, store into output array. */
 634 
 635   wsptr = workspace;
 636   for (ctr = 0; ctr < 6; ctr++) {
 637     outptr = output_buf[ctr] + output_col;
 638 
 639     /* Even part */
 640 
 641     /* Add fudge factor here for final descale. */
 642     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 643     tmp0 <<= CONST_BITS;
 644     tmp2 = (INT32) wsptr[4];
 645     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
 646     tmp1 = tmp0 + tmp10;
 647     tmp11 = tmp0 - tmp10 - tmp10;
 648     tmp10 = (INT32) wsptr[2];
 649     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
 650     tmp10 = tmp1 + tmp0;
 651     tmp12 = tmp1 - tmp0;
 652 
 653     /* Odd part */
 654 
 655     z1 = (INT32) wsptr[1];
 656     z2 = (INT32) wsptr[3];
 657     z3 = (INT32) wsptr[5];
 658     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
 659     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
 660     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
 661     tmp1 = (z1 - z2 - z3) << CONST_BITS;
 662 
 663     /* Final output stage */
 664 
 665     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
 666                                               CONST_BITS+PASS1_BITS+3)
 667                             & RANGE_MASK];
 668     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
 669                                               CONST_BITS+PASS1_BITS+3)
 670                             & RANGE_MASK];
 671     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
 672                                               CONST_BITS+PASS1_BITS+3)
 673                             & RANGE_MASK];
 674     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
 675                                               CONST_BITS+PASS1_BITS+3)
 676                             & RANGE_MASK];
 677     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
 678                                               CONST_BITS+PASS1_BITS+3)
 679                             & RANGE_MASK];
 680     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
 681                                               CONST_BITS+PASS1_BITS+3)
 682                             & RANGE_MASK];
 683 
 684     wsptr += 6;         /* advance pointer to next row */
 685   }
 686 }
 687 
 688 
 689 /*
 690  * Perform dequantization and inverse DCT on one block of coefficients,
 691  * producing a reduced-size 5x5 output block.
 692  *
 693  * Optimized algorithm with 5 multiplications in the 1-D kernel.
 694  * cK represents sqrt(2) * cos(K*pi/10).
 695  */
 696 
 697 GLOBAL(void)
 698 jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 699                JCOEFPTR coef_block,
 700                JSAMPARRAY output_buf, JDIMENSION output_col)
 701 {
 702   INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
 703   INT32 z1, z2, z3;
 704   JCOEFPTR inptr;
 705   ISLOW_MULT_TYPE * quantptr;
 706   int * wsptr;
 707   JSAMPROW outptr;
 708   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 709   int ctr;
 710   int workspace[5*5];   /* buffers data between passes */
 711   SHIFT_TEMPS
 712 
 713   /* Pass 1: process columns from input, store into work array. */
 714 
 715   inptr = coef_block;
 716   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 717   wsptr = workspace;
 718   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
 719     /* Even part */
 720 
 721     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 722     tmp12 <<= CONST_BITS;
 723     /* Add fudge factor here for final descale. */
 724     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
 725     tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 726     tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 727     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
 728     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
 729     z3 = tmp12 + z2;
 730     tmp10 = z3 + z1;
 731     tmp11 = z3 - z1;
 732     tmp12 -= z2 << 2;
 733 
 734     /* Odd part */
 735 
 736     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 737     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 738 
 739     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
 740     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
 741     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
 742 
 743     /* Final output stage */
 744 
 745     wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
 746     wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
 747     wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
 748     wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
 749     wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
 750   }
 751 
 752   /* Pass 2: process 5 rows from work array, store into output array. */
 753 
 754   wsptr = workspace;
 755   for (ctr = 0; ctr < 5; ctr++) {
 756     outptr = output_buf[ctr] + output_col;
 757 
 758     /* Even part */
 759 
 760     /* Add fudge factor here for final descale. */
 761     tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 762     tmp12 <<= CONST_BITS;
 763     tmp0 = (INT32) wsptr[2];
 764     tmp1 = (INT32) wsptr[4];
 765     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
 766     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
 767     z3 = tmp12 + z2;
 768     tmp10 = z3 + z1;
 769     tmp11 = z3 - z1;
 770     tmp12 -= z2 << 2;
 771 
 772     /* Odd part */
 773 
 774     z2 = (INT32) wsptr[1];
 775     z3 = (INT32) wsptr[3];
 776 
 777     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
 778     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
 779     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
 780 
 781     /* Final output stage */
 782 
 783     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
 784                                               CONST_BITS+PASS1_BITS+3)
 785                             & RANGE_MASK];
 786     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
 787                                               CONST_BITS+PASS1_BITS+3)
 788                             & RANGE_MASK];
 789     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
 790                                               CONST_BITS+PASS1_BITS+3)
 791                             & RANGE_MASK];
 792     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
 793                                               CONST_BITS+PASS1_BITS+3)
 794                             & RANGE_MASK];
 795     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
 796                                               CONST_BITS+PASS1_BITS+3)
 797                             & RANGE_MASK];
 798 
 799     wsptr += 5;         /* advance pointer to next row */
 800   }
 801 }
 802 
 803 
 804 /*
 805  * Perform dequantization and inverse DCT on one block of coefficients,
 806  * producing a reduced-size 4x4 output block.
 807  *
 808  * Optimized algorithm with 3 multiplications in the 1-D kernel.
 809  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
 810  */
 811 
 812 GLOBAL(void)
 813 jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 814                JCOEFPTR coef_block,
 815                JSAMPARRAY output_buf, JDIMENSION output_col)
 816 {
 817   INT32 tmp0, tmp2, tmp10, tmp12;
 818   INT32 z1, z2, z3;
 819   JCOEFPTR inptr;
 820   ISLOW_MULT_TYPE * quantptr;
 821   int * wsptr;
 822   JSAMPROW outptr;
 823   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 824   int ctr;
 825   int workspace[4*4];   /* buffers data between passes */
 826   SHIFT_TEMPS
 827 
 828   /* Pass 1: process columns from input, store into work array. */
 829 
 830   inptr = coef_block;
 831   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 832   wsptr = workspace;
 833   for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
 834     /* Even part */
 835 
 836     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 837     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 838 
 839     tmp10 = (tmp0 + tmp2) << PASS1_BITS;
 840     tmp12 = (tmp0 - tmp2) << PASS1_BITS;
 841 
 842     /* Odd part */
 843     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
 844 
 845     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 846     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 847 
 848     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
 849     /* Add fudge factor here for final descale. */
 850     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
 851     tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
 852                        CONST_BITS-PASS1_BITS);
 853     tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
 854                        CONST_BITS-PASS1_BITS);
 855 
 856     /* Final output stage */
 857 
 858     wsptr[4*0] = (int) (tmp10 + tmp0);
 859     wsptr[4*3] = (int) (tmp10 - tmp0);
 860     wsptr[4*1] = (int) (tmp12 + tmp2);
 861     wsptr[4*2] = (int) (tmp12 - tmp2);
 862   }
 863 
 864   /* Pass 2: process 4 rows from work array, store into output array. */
 865 
 866   wsptr = workspace;
 867   for (ctr = 0; ctr < 4; ctr++) {
 868     outptr = output_buf[ctr] + output_col;
 869 
 870     /* Even part */
 871 
 872     /* Add fudge factor here for final descale. */
 873     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 874     tmp2 = (INT32) wsptr[2];
 875 
 876     tmp10 = (tmp0 + tmp2) << CONST_BITS;
 877     tmp12 = (tmp0 - tmp2) << CONST_BITS;
 878 
 879     /* Odd part */
 880     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
 881 
 882     z2 = (INT32) wsptr[1];
 883     z3 = (INT32) wsptr[3];
 884 
 885     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
 886     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
 887     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
 888 
 889     /* Final output stage */
 890 
 891     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
 892                                               CONST_BITS+PASS1_BITS+3)
 893                             & RANGE_MASK];
 894     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
 895                                               CONST_BITS+PASS1_BITS+3)
 896                             & RANGE_MASK];
 897     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
 898                                               CONST_BITS+PASS1_BITS+3)
 899                             & RANGE_MASK];
 900     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
 901                                               CONST_BITS+PASS1_BITS+3)
 902                             & RANGE_MASK];
 903 
 904     wsptr += 4;         /* advance pointer to next row */
 905   }
 906 }
 907 
 908 
 909 /*
 910  * Perform dequantization and inverse DCT on one block of coefficients,
 911  * producing a reduced-size 3x3 output block.
 912  *
 913  * Optimized algorithm with 2 multiplications in the 1-D kernel.
 914  * cK represents sqrt(2) * cos(K*pi/6).
 915  */
 916 
 917 GLOBAL(void)
 918 jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 919                JCOEFPTR coef_block,
 920                JSAMPARRAY output_buf, JDIMENSION output_col)
 921 {
 922   INT32 tmp0, tmp2, tmp10, tmp12;
 923   JCOEFPTR inptr;
 924   ISLOW_MULT_TYPE * quantptr;
 925   int * wsptr;
 926   JSAMPROW outptr;
 927   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 928   int ctr;
 929   int workspace[3*3];   /* buffers data between passes */
 930   SHIFT_TEMPS
 931 
 932   /* Pass 1: process columns from input, store into work array. */
 933 
 934   inptr = coef_block;
 935   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 936   wsptr = workspace;
 937   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
 938     /* Even part */
 939 
 940     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 941     tmp0 <<= CONST_BITS;
 942     /* Add fudge factor here for final descale. */
 943     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
 944     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 945     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
 946     tmp10 = tmp0 + tmp12;
 947     tmp2 = tmp0 - tmp12 - tmp12;
 948 
 949     /* Odd part */
 950 
 951     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 952     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
 953 
 954     /* Final output stage */
 955 
 956     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
 957     wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
 958     wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
 959   }
 960 
 961   /* Pass 2: process 3 rows from work array, store into output array. */
 962 
 963   wsptr = workspace;
 964   for (ctr = 0; ctr < 3; ctr++) {
 965     outptr = output_buf[ctr] + output_col;
 966 
 967     /* Even part */
 968 
 969     /* Add fudge factor here for final descale. */
 970     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 971     tmp0 <<= CONST_BITS;
 972     tmp2 = (INT32) wsptr[2];
 973     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
 974     tmp10 = tmp0 + tmp12;
 975     tmp2 = tmp0 - tmp12 - tmp12;
 976 
 977     /* Odd part */
 978 
 979     tmp12 = (INT32) wsptr[1];
 980     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
 981 
 982     /* Final output stage */
 983 
 984     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
 985                                               CONST_BITS+PASS1_BITS+3)
 986                             & RANGE_MASK];
 987     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
 988                                               CONST_BITS+PASS1_BITS+3)
 989                             & RANGE_MASK];
 990     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
 991                                               CONST_BITS+PASS1_BITS+3)
 992                             & RANGE_MASK];
 993 
 994     wsptr += 3;         /* advance pointer to next row */
 995   }
 996 }
 997 
 998 
 999 /*
1000  * Perform dequantization and inverse DCT on one block of coefficients,
1001  * producing a reduced-size 2x2 output block.
1002  *
1003  * Multiplication-less algorithm.
1004  */
1005 
1006 GLOBAL(void)
1007 jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1008                JCOEFPTR coef_block,
1009                JSAMPARRAY output_buf, JDIMENSION output_col)
1010 {
1011   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1012   ISLOW_MULT_TYPE * quantptr;
1013   JSAMPROW outptr;
1014   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1015   SHIFT_TEMPS
1016 
1017   /* Pass 1: process columns from input. */
1018 
1019   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1020 
1021   /* Column 0 */
1022   tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
1023   tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
1024   /* Add fudge factor here for final descale. */
1025   tmp4 += ONE << 2;
1026 
1027   tmp0 = tmp4 + tmp5;
1028   tmp2 = tmp4 - tmp5;
1029 
1030   /* Column 1 */
1031   tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
1032   tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
1033 
1034   tmp1 = tmp4 + tmp5;
1035   tmp3 = tmp4 - tmp5;
1036 
1037   /* Pass 2: process 2 rows, store into output array. */
1038 
1039   /* Row 0 */
1040   outptr = output_buf[0] + output_col;
1041 
1042   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
1043   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
1044 
1045   /* Row 1 */
1046   outptr = output_buf[1] + output_col;
1047 
1048   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
1049   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
1050 }
1051 
1052 
1053 /*
1054  * Perform dequantization and inverse DCT on one block of coefficients,
1055  * producing a reduced-size 1x1 output block.
1056  *
1057  * We hardly need an inverse DCT routine for this: just take the
1058  * average pixel value, which is one-eighth of the DC coefficient.
1059  */
1060 
1061 GLOBAL(void)
1062 jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1063                JCOEFPTR coef_block,
1064                JSAMPARRAY output_buf, JDIMENSION output_col)
1065 {
1066   int dcval;
1067   ISLOW_MULT_TYPE * quantptr;
1068   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1069   SHIFT_TEMPS
1070 
1071   /* 1x1 is trivial: just take the DC coefficient divided by 8. */
1072   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1073   dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
1074   dcval = (int) DESCALE((INT32) dcval, 3);
1075 
1076   output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
1077 }
1078 
1079 
1080 /*
1081  * Perform dequantization and inverse DCT on one block of coefficients,
1082  * producing a 9x9 output block.
1083  *
1084  * Optimized algorithm with 10 multiplications in the 1-D kernel.
1085  * cK represents sqrt(2) * cos(K*pi/18).
1086  */
1087 
1088 GLOBAL(void)
1089 jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1090                JCOEFPTR coef_block,
1091                JSAMPARRAY output_buf, JDIMENSION output_col)
1092 {
1093   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
1094   INT32 z1, z2, z3, z4;
1095   JCOEFPTR inptr;
1096   ISLOW_MULT_TYPE * quantptr;
1097   int * wsptr;
1098   JSAMPROW outptr;
1099   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1100   int ctr;
1101   int workspace[8*9];   /* buffers data between passes */
1102   SHIFT_TEMPS
1103 
1104   /* Pass 1: process columns from input, store into work array. */
1105 
1106   inptr = coef_block;
1107   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1108   wsptr = workspace;
1109   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1110     /* Even part */
1111 
1112     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1113     tmp0 <<= CONST_BITS;
1114     /* Add fudge factor here for final descale. */
1115     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
1116 
1117     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1118     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1119     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1120 
1121     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1122     tmp1 = tmp0 + tmp3;
1123     tmp2 = tmp0 - tmp3 - tmp3;
1124 
1125     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1126     tmp11 = tmp2 + tmp0;
1127     tmp14 = tmp2 - tmp0 - tmp0;
1128 
1129     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1130     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1131     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1132 
1133     tmp10 = tmp1 + tmp0 - tmp3;
1134     tmp12 = tmp1 - tmp0 + tmp2;
1135     tmp13 = tmp1 - tmp2 + tmp3;
1136 
1137     /* Odd part */
1138 
1139     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1140     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1141     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1142     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1143 
1144     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1145 
1146     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1147     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1148     tmp0 = tmp2 + tmp3 - z2;
1149     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1150     tmp2 += z2 - tmp1;
1151     tmp3 += z2 + tmp1;
1152     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1153 
1154     /* Final output stage */
1155 
1156     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
1157     wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
1158     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
1159     wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
1160     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
1161     wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
1162     wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
1163     wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
1164     wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
1165   }
1166 
1167   /* Pass 2: process 9 rows from work array, store into output array. */
1168 
1169   wsptr = workspace;
1170   for (ctr = 0; ctr < 9; ctr++) {
1171     outptr = output_buf[ctr] + output_col;
1172 
1173     /* Even part */
1174 
1175     /* Add fudge factor here for final descale. */
1176     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1177     tmp0 <<= CONST_BITS;
1178 
1179     z1 = (INT32) wsptr[2];
1180     z2 = (INT32) wsptr[4];
1181     z3 = (INT32) wsptr[6];
1182 
1183     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1184     tmp1 = tmp0 + tmp3;
1185     tmp2 = tmp0 - tmp3 - tmp3;
1186 
1187     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1188     tmp11 = tmp2 + tmp0;
1189     tmp14 = tmp2 - tmp0 - tmp0;
1190 
1191     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1192     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1193     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1194 
1195     tmp10 = tmp1 + tmp0 - tmp3;
1196     tmp12 = tmp1 - tmp0 + tmp2;
1197     tmp13 = tmp1 - tmp2 + tmp3;
1198 
1199     /* Odd part */
1200 
1201     z1 = (INT32) wsptr[1];
1202     z2 = (INT32) wsptr[3];
1203     z3 = (INT32) wsptr[5];
1204     z4 = (INT32) wsptr[7];
1205 
1206     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1207 
1208     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1209     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1210     tmp0 = tmp2 + tmp3 - z2;
1211     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1212     tmp2 += z2 - tmp1;
1213     tmp3 += z2 + tmp1;
1214     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1215 
1216     /* Final output stage */
1217 
1218     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1219                                               CONST_BITS+PASS1_BITS+3)
1220                             & RANGE_MASK];
1221     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1222                                               CONST_BITS+PASS1_BITS+3)
1223                             & RANGE_MASK];
1224     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
1225                                               CONST_BITS+PASS1_BITS+3)
1226                             & RANGE_MASK];
1227     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
1228                                               CONST_BITS+PASS1_BITS+3)
1229                             & RANGE_MASK];
1230     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
1231                                               CONST_BITS+PASS1_BITS+3)
1232                             & RANGE_MASK];
1233     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
1234                                               CONST_BITS+PASS1_BITS+3)
1235                             & RANGE_MASK];
1236     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
1237                                               CONST_BITS+PASS1_BITS+3)
1238                             & RANGE_MASK];
1239     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
1240                                               CONST_BITS+PASS1_BITS+3)
1241                             & RANGE_MASK];
1242     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
1243                                               CONST_BITS+PASS1_BITS+3)
1244                             & RANGE_MASK];
1245 
1246     wsptr += 8;         /* advance pointer to next row */
1247   }
1248 }
1249 
1250 
1251 /*
1252  * Perform dequantization and inverse DCT on one block of coefficients,
1253  * producing a 10x10 output block.
1254  *
1255  * Optimized algorithm with 12 multiplications in the 1-D kernel.
1256  * cK represents sqrt(2) * cos(K*pi/20).
1257  */
1258 
1259 GLOBAL(void)
1260 jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1261                  JCOEFPTR coef_block,
1262                  JSAMPARRAY output_buf, JDIMENSION output_col)
1263 {
1264   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1265   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
1266   INT32 z1, z2, z3, z4, z5;
1267   JCOEFPTR inptr;
1268   ISLOW_MULT_TYPE * quantptr;
1269   int * wsptr;
1270   JSAMPROW outptr;
1271   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1272   int ctr;
1273   int workspace[8*10];  /* buffers data between passes */
1274   SHIFT_TEMPS
1275 
1276   /* Pass 1: process columns from input, store into work array. */
1277 
1278   inptr = coef_block;
1279   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1280   wsptr = workspace;
1281   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1282     /* Even part */
1283 
1284     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1285     z3 <<= CONST_BITS;
1286     /* Add fudge factor here for final descale. */
1287     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1288     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1289     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1290     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1291     tmp10 = z3 + z1;
1292     tmp11 = z3 - z2;
1293 
1294     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
1295                         CONST_BITS-PASS1_BITS);
1296 
1297     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1298     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1299 
1300     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1301     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1302     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1303 
1304     tmp20 = tmp10 + tmp12;
1305     tmp24 = tmp10 - tmp12;
1306     tmp21 = tmp11 + tmp13;
1307     tmp23 = tmp11 - tmp13;
1308 
1309     /* Odd part */
1310 
1311     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1312     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1313     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1314     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1315 
1316     tmp11 = z2 + z4;
1317     tmp13 = z2 - z4;
1318 
1319     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1320     z5 = z3 << CONST_BITS;
1321 
1322     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1323     z4 = z5 + tmp12;
1324 
1325     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1326     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1327 
1328     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1329     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
1330 
1331     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
1332 
1333     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1334     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1335 
1336     /* Final output stage */
1337 
1338     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1339     wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1340     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1341     wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1342     wsptr[8*2] = (int) (tmp22 + tmp12);
1343     wsptr[8*7] = (int) (tmp22 - tmp12);
1344     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1345     wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1346     wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1347     wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1348   }
1349 
1350   /* Pass 2: process 10 rows from work array, store into output array. */
1351 
1352   wsptr = workspace;
1353   for (ctr = 0; ctr < 10; ctr++) {
1354     outptr = output_buf[ctr] + output_col;
1355 
1356     /* Even part */
1357 
1358     /* Add fudge factor here for final descale. */
1359     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1360     z3 <<= CONST_BITS;
1361     z4 = (INT32) wsptr[4];
1362     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1363     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1364     tmp10 = z3 + z1;
1365     tmp11 = z3 - z2;
1366 
1367     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
1368 
1369     z2 = (INT32) wsptr[2];
1370     z3 = (INT32) wsptr[6];
1371 
1372     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1373     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1374     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1375 
1376     tmp20 = tmp10 + tmp12;
1377     tmp24 = tmp10 - tmp12;
1378     tmp21 = tmp11 + tmp13;
1379     tmp23 = tmp11 - tmp13;
1380 
1381     /* Odd part */
1382 
1383     z1 = (INT32) wsptr[1];
1384     z2 = (INT32) wsptr[3];
1385     z3 = (INT32) wsptr[5];
1386     z3 <<= CONST_BITS;
1387     z4 = (INT32) wsptr[7];
1388 
1389     tmp11 = z2 + z4;
1390     tmp13 = z2 - z4;
1391 
1392     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1393 
1394     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1395     z4 = z3 + tmp12;
1396 
1397     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1398     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1399 
1400     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1401     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
1402 
1403     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
1404 
1405     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1406     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1407 
1408     /* Final output stage */
1409 
1410     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1411                                               CONST_BITS+PASS1_BITS+3)
1412                             & RANGE_MASK];
1413     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1414                                               CONST_BITS+PASS1_BITS+3)
1415                             & RANGE_MASK];
1416     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1417                                               CONST_BITS+PASS1_BITS+3)
1418                             & RANGE_MASK];
1419     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1420                                               CONST_BITS+PASS1_BITS+3)
1421                             & RANGE_MASK];
1422     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1423                                               CONST_BITS+PASS1_BITS+3)
1424                             & RANGE_MASK];
1425     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1426                                               CONST_BITS+PASS1_BITS+3)
1427                             & RANGE_MASK];
1428     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1429                                               CONST_BITS+PASS1_BITS+3)
1430                             & RANGE_MASK];
1431     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1432                                               CONST_BITS+PASS1_BITS+3)
1433                             & RANGE_MASK];
1434     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1435                                               CONST_BITS+PASS1_BITS+3)
1436                             & RANGE_MASK];
1437     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1438                                               CONST_BITS+PASS1_BITS+3)
1439                             & RANGE_MASK];
1440 
1441     wsptr += 8;         /* advance pointer to next row */
1442   }
1443 }
1444 
1445 
1446 /*
1447  * Perform dequantization and inverse DCT on one block of coefficients,
1448  * producing a 11x11 output block.
1449  *
1450  * Optimized algorithm with 24 multiplications in the 1-D kernel.
1451  * cK represents sqrt(2) * cos(K*pi/22).
1452  */
1453 
1454 GLOBAL(void)
1455 jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1456                  JCOEFPTR coef_block,
1457                  JSAMPARRAY output_buf, JDIMENSION output_col)
1458 {
1459   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1460   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1461   INT32 z1, z2, z3, z4;
1462   JCOEFPTR inptr;
1463   ISLOW_MULT_TYPE * quantptr;
1464   int * wsptr;
1465   JSAMPROW outptr;
1466   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1467   int ctr;
1468   int workspace[8*11];  /* buffers data between passes */
1469   SHIFT_TEMPS
1470 
1471   /* Pass 1: process columns from input, store into work array. */
1472 
1473   inptr = coef_block;
1474   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1475   wsptr = workspace;
1476   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1477     /* Even part */
1478 
1479     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1480     tmp10 <<= CONST_BITS;
1481     /* Add fudge factor here for final descale. */
1482     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
1483 
1484     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1485     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1486     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1487 
1488     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1489     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1490     z4 = z1 + z3;
1491     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
1492     z4 -= z2;
1493     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1494     tmp21 = tmp20 + tmp23 + tmp25 -
1495             MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1496     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1497     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1498     tmp24 += tmp25;
1499     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1500     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1501              MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1502     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1503 
1504     /* Odd part */
1505 
1506     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1507     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1508     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1509     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1510 
1511     tmp11 = z1 + z2;
1512     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1513     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1514     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1515     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1516     tmp10 = tmp11 + tmp12 + tmp13 -
1517             MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1518     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1519     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1520     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1521     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
1522     tmp11 += z1;
1523     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1524     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
1525              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1526              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1527 
1528     /* Final output stage */
1529 
1530     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1531     wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1532     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1533     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1534     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1535     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1536     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1537     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1538     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1539     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1540     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
1541   }
1542 
1543   /* Pass 2: process 11 rows from work array, store into output array. */
1544 
1545   wsptr = workspace;
1546   for (ctr = 0; ctr < 11; ctr++) {
1547     outptr = output_buf[ctr] + output_col;
1548 
1549     /* Even part */
1550 
1551     /* Add fudge factor here for final descale. */
1552     tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1553     tmp10 <<= CONST_BITS;
1554 
1555     z1 = (INT32) wsptr[2];
1556     z2 = (INT32) wsptr[4];
1557     z3 = (INT32) wsptr[6];
1558 
1559     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1560     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1561     z4 = z1 + z3;
1562     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
1563     z4 -= z2;
1564     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1565     tmp21 = tmp20 + tmp23 + tmp25 -
1566             MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1567     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1568     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1569     tmp24 += tmp25;
1570     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1571     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1572              MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1573     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1574 
1575     /* Odd part */
1576 
1577     z1 = (INT32) wsptr[1];
1578     z2 = (INT32) wsptr[3];
1579     z3 = (INT32) wsptr[5];
1580     z4 = (INT32) wsptr[7];
1581 
1582     tmp11 = z1 + z2;
1583     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1584     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1585     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1586     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1587     tmp10 = tmp11 + tmp12 + tmp13 -
1588             MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1589     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1590     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1591     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1592     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
1593     tmp11 += z1;
1594     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1595     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
1596              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1597              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1598 
1599     /* Final output stage */
1600 
1601     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1602                                                CONST_BITS+PASS1_BITS+3)
1603                              & RANGE_MASK];
1604     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1605                                                CONST_BITS+PASS1_BITS+3)
1606                              & RANGE_MASK];
1607     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1608                                                CONST_BITS+PASS1_BITS+3)
1609                              & RANGE_MASK];
1610     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1611                                                CONST_BITS+PASS1_BITS+3)
1612                              & RANGE_MASK];
1613     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1614                                                CONST_BITS+PASS1_BITS+3)
1615                              & RANGE_MASK];
1616     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1617                                                CONST_BITS+PASS1_BITS+3)
1618                              & RANGE_MASK];
1619     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1620                                                CONST_BITS+PASS1_BITS+3)
1621                              & RANGE_MASK];
1622     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1623                                                CONST_BITS+PASS1_BITS+3)
1624                              & RANGE_MASK];
1625     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1626                                                CONST_BITS+PASS1_BITS+3)
1627                              & RANGE_MASK];
1628     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1629                                                CONST_BITS+PASS1_BITS+3)
1630                              & RANGE_MASK];
1631     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
1632                                                CONST_BITS+PASS1_BITS+3)
1633                              & RANGE_MASK];
1634 
1635     wsptr += 8;         /* advance pointer to next row */
1636   }
1637 }
1638 
1639 
1640 /*
1641  * Perform dequantization and inverse DCT on one block of coefficients,
1642  * producing a 12x12 output block.
1643  *
1644  * Optimized algorithm with 15 multiplications in the 1-D kernel.
1645  * cK represents sqrt(2) * cos(K*pi/24).
1646  */
1647 
1648 GLOBAL(void)
1649 jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1650                  JCOEFPTR coef_block,
1651                  JSAMPARRAY output_buf, JDIMENSION output_col)
1652 {
1653   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1654   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1655   INT32 z1, z2, z3, z4;
1656   JCOEFPTR inptr;
1657   ISLOW_MULT_TYPE * quantptr;
1658   int * wsptr;
1659   JSAMPROW outptr;
1660   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1661   int ctr;
1662   int workspace[8*12];  /* buffers data between passes */
1663   SHIFT_TEMPS
1664 
1665   /* Pass 1: process columns from input, store into work array. */
1666 
1667   inptr = coef_block;
1668   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1669   wsptr = workspace;
1670   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1671     /* Even part */
1672 
1673     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1674     z3 <<= CONST_BITS;
1675     /* Add fudge factor here for final descale. */
1676     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1677 
1678     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1679     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1680 
1681     tmp10 = z3 + z4;
1682     tmp11 = z3 - z4;
1683 
1684     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1685     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1686     z1 <<= CONST_BITS;
1687     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1688     z2 <<= CONST_BITS;
1689 
1690     tmp12 = z1 - z2;
1691 
1692     tmp21 = z3 + tmp12;
1693     tmp24 = z3 - tmp12;
1694 
1695     tmp12 = z4 + z2;
1696 
1697     tmp20 = tmp10 + tmp12;
1698     tmp25 = tmp10 - tmp12;
1699 
1700     tmp12 = z4 - z1 - z2;
1701 
1702     tmp22 = tmp11 + tmp12;
1703     tmp23 = tmp11 - tmp12;
1704 
1705     /* Odd part */
1706 
1707     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1708     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1709     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1710     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1711 
1712     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1713     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
1714 
1715     tmp10 = z1 + z3;
1716     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1717     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1718     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1719     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
1720     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1721     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1722     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1723              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1724 
1725     z1 -= z4;
1726     z2 -= z3;
1727     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1728     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1729     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1730 
1731     /* Final output stage */
1732 
1733     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1734     wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1735     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1736     wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1737     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1738     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1739     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1740     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1741     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1742     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1743     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1744     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
1745   }
1746 
1747   /* Pass 2: process 12 rows from work array, store into output array. */
1748 
1749   wsptr = workspace;
1750   for (ctr = 0; ctr < 12; ctr++) {
1751     outptr = output_buf[ctr] + output_col;
1752 
1753     /* Even part */
1754 
1755     /* Add fudge factor here for final descale. */
1756     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1757     z3 <<= CONST_BITS;
1758 
1759     z4 = (INT32) wsptr[4];
1760     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1761 
1762     tmp10 = z3 + z4;
1763     tmp11 = z3 - z4;
1764 
1765     z1 = (INT32) wsptr[2];
1766     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1767     z1 <<= CONST_BITS;
1768     z2 = (INT32) wsptr[6];
1769     z2 <<= CONST_BITS;
1770 
1771     tmp12 = z1 - z2;
1772 
1773     tmp21 = z3 + tmp12;
1774     tmp24 = z3 - tmp12;
1775 
1776     tmp12 = z4 + z2;
1777 
1778     tmp20 = tmp10 + tmp12;
1779     tmp25 = tmp10 - tmp12;
1780 
1781     tmp12 = z4 - z1 - z2;
1782 
1783     tmp22 = tmp11 + tmp12;
1784     tmp23 = tmp11 - tmp12;
1785 
1786     /* Odd part */
1787 
1788     z1 = (INT32) wsptr[1];
1789     z2 = (INT32) wsptr[3];
1790     z3 = (INT32) wsptr[5];
1791     z4 = (INT32) wsptr[7];
1792 
1793     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1794     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
1795 
1796     tmp10 = z1 + z3;
1797     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1798     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1799     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1800     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
1801     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1802     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1803     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1804              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1805 
1806     z1 -= z4;
1807     z2 -= z3;
1808     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1809     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1810     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1811 
1812     /* Final output stage */
1813 
1814     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1815                                                CONST_BITS+PASS1_BITS+3)
1816                              & RANGE_MASK];
1817     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1818                                                CONST_BITS+PASS1_BITS+3)
1819                              & RANGE_MASK];
1820     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1821                                                CONST_BITS+PASS1_BITS+3)
1822                              & RANGE_MASK];
1823     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1824                                                CONST_BITS+PASS1_BITS+3)
1825                              & RANGE_MASK];
1826     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1827                                                CONST_BITS+PASS1_BITS+3)
1828                              & RANGE_MASK];
1829     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1830                                                CONST_BITS+PASS1_BITS+3)
1831                              & RANGE_MASK];
1832     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1833                                                CONST_BITS+PASS1_BITS+3)
1834                              & RANGE_MASK];
1835     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1836                                                CONST_BITS+PASS1_BITS+3)
1837                              & RANGE_MASK];
1838     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1839                                                CONST_BITS+PASS1_BITS+3)
1840                              & RANGE_MASK];
1841     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1842                                                CONST_BITS+PASS1_BITS+3)
1843                              & RANGE_MASK];
1844     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
1845                                                CONST_BITS+PASS1_BITS+3)
1846                              & RANGE_MASK];
1847     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
1848                                                CONST_BITS+PASS1_BITS+3)
1849                              & RANGE_MASK];
1850 
1851     wsptr += 8;         /* advance pointer to next row */
1852   }
1853 }
1854 
1855 
1856 /*
1857  * Perform dequantization and inverse DCT on one block of coefficients,
1858  * producing a 13x13 output block.
1859  *
1860  * Optimized algorithm with 29 multiplications in the 1-D kernel.
1861  * cK represents sqrt(2) * cos(K*pi/26).
1862  */
1863 
1864 GLOBAL(void)
1865 jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1866                  JCOEFPTR coef_block,
1867                  JSAMPARRAY output_buf, JDIMENSION output_col)
1868 {
1869   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1870   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
1871   INT32 z1, z2, z3, z4;
1872   JCOEFPTR inptr;
1873   ISLOW_MULT_TYPE * quantptr;
1874   int * wsptr;
1875   JSAMPROW outptr;
1876   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1877   int ctr;
1878   int workspace[8*13];  /* buffers data between passes */
1879   SHIFT_TEMPS
1880 
1881   /* Pass 1: process columns from input, store into work array. */
1882 
1883   inptr = coef_block;
1884   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1885   wsptr = workspace;
1886   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1887     /* Even part */
1888 
1889     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1890     z1 <<= CONST_BITS;
1891     /* Add fudge factor here for final descale. */
1892     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
1893 
1894     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1895     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1896     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1897 
1898     tmp10 = z3 + z4;
1899     tmp11 = z3 - z4;
1900 
1901     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
1902     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
1903 
1904     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
1905     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
1906 
1907     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
1908     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
1909 
1910     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
1911     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
1912 
1913     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
1914     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
1915 
1916     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
1917     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
1918 
1919     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
1920 
1921     /* Odd part */
1922 
1923     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1924     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1925     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1926     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1927 
1928     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
1929     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
1930     tmp15 = z1 + z4;
1931     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
1932     tmp10 = tmp11 + tmp12 + tmp13 -
1933             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
1934     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
1935     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
1936     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
1937     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
1938     tmp11 += tmp14;
1939     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
1940     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
1941     tmp12 += tmp14;
1942     tmp13 += tmp14;
1943     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
1944     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
1945             MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
1946     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
1947     tmp14 += z1;
1948     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
1949              MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
1950 
1951     /* Final output stage */
1952 
1953     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1954     wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1955     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1956     wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1957     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1958     wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1959     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1960     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1961     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1962     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1963     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1964     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
1965     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
1966   }
1967 
1968   /* Pass 2: process 13 rows from work array, store into output array. */
1969 
1970   wsptr = workspace;
1971   for (ctr = 0; ctr < 13; ctr++) {
1972     outptr = output_buf[ctr] + output_col;
1973 
1974     /* Even part */
1975 
1976     /* Add fudge factor here for final descale. */
1977     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1978     z1 <<= CONST_BITS;
1979 
1980     z2 = (INT32) wsptr[2];
1981     z3 = (INT32) wsptr[4];
1982     z4 = (INT32) wsptr[6];
1983 
1984     tmp10 = z3 + z4;
1985     tmp11 = z3 - z4;
1986 
1987     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
1988     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
1989 
1990     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
1991     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
1992 
1993     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
1994     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
1995 
1996     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
1997     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
1998 
1999     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
2000     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
2001 
2002     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
2003     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
2004 
2005     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
2006 
2007     /* Odd part */
2008 
2009     z1 = (INT32) wsptr[1];
2010     z2 = (INT32) wsptr[3];
2011     z3 = (INT32) wsptr[5];
2012     z4 = (INT32) wsptr[7];
2013 
2014     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
2015     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
2016     tmp15 = z1 + z4;
2017     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
2018     tmp10 = tmp11 + tmp12 + tmp13 -
2019             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
2020     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
2021     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
2022     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
2023     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
2024     tmp11 += tmp14;
2025     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
2026     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
2027     tmp12 += tmp14;
2028     tmp13 += tmp14;
2029     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
2030     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
2031             MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
2032     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
2033     tmp14 += z1;
2034     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
2035              MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
2036 
2037     /* Final output stage */
2038 
2039     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2040                                                CONST_BITS+PASS1_BITS+3)
2041                              & RANGE_MASK];
2042     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2043                                                CONST_BITS+PASS1_BITS+3)
2044                              & RANGE_MASK];
2045     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2046                                                CONST_BITS+PASS1_BITS+3)
2047                              & RANGE_MASK];
2048     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2049                                                CONST_BITS+PASS1_BITS+3)
2050                              & RANGE_MASK];
2051     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2052                                                CONST_BITS+PASS1_BITS+3)
2053                              & RANGE_MASK];
2054     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2055                                                CONST_BITS+PASS1_BITS+3)
2056                              & RANGE_MASK];
2057     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2058                                                CONST_BITS+PASS1_BITS+3)
2059                              & RANGE_MASK];
2060     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2061                                                CONST_BITS+PASS1_BITS+3)
2062                              & RANGE_MASK];
2063     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2064                                                CONST_BITS+PASS1_BITS+3)
2065                              & RANGE_MASK];
2066     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2067                                                CONST_BITS+PASS1_BITS+3)
2068                              & RANGE_MASK];
2069     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2070                                                CONST_BITS+PASS1_BITS+3)
2071                              & RANGE_MASK];
2072     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2073                                                CONST_BITS+PASS1_BITS+3)
2074                              & RANGE_MASK];
2075     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
2076                                                CONST_BITS+PASS1_BITS+3)
2077                              & RANGE_MASK];
2078 
2079     wsptr += 8;         /* advance pointer to next row */
2080   }
2081 }
2082 
2083 
2084 /*
2085  * Perform dequantization and inverse DCT on one block of coefficients,
2086  * producing a 14x14 output block.
2087  *
2088  * Optimized algorithm with 20 multiplications in the 1-D kernel.
2089  * cK represents sqrt(2) * cos(K*pi/28).
2090  */
2091 
2092 GLOBAL(void)
2093 jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2094                  JCOEFPTR coef_block,
2095                  JSAMPARRAY output_buf, JDIMENSION output_col)
2096 {
2097   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2098   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
2099   INT32 z1, z2, z3, z4;
2100   JCOEFPTR inptr;
2101   ISLOW_MULT_TYPE * quantptr;
2102   int * wsptr;
2103   JSAMPROW outptr;
2104   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2105   int ctr;
2106   int workspace[8*14];  /* buffers data between passes */
2107   SHIFT_TEMPS
2108 
2109   /* Pass 1: process columns from input, store into work array. */
2110 
2111   inptr = coef_block;
2112   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2113   wsptr = workspace;
2114   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2115     /* Even part */
2116 
2117     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2118     z1 <<= CONST_BITS;
2119     /* Add fudge factor here for final descale. */
2120     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
2121     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2122     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2123     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2124     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2125 
2126     tmp10 = z1 + z2;
2127     tmp11 = z1 + z3;
2128     tmp12 = z1 - z4;
2129 
2130     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
2131                         CONST_BITS-PASS1_BITS);
2132 
2133     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2134     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2135 
2136     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2137 
2138     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2139     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2140     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2141             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2142 
2143     tmp20 = tmp10 + tmp13;
2144     tmp26 = tmp10 - tmp13;
2145     tmp21 = tmp11 + tmp14;
2146     tmp25 = tmp11 - tmp14;
2147     tmp22 = tmp12 + tmp15;
2148     tmp24 = tmp12 - tmp15;
2149 
2150     /* Odd part */
2151 
2152     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2153     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2154     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2155     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2156     tmp13 = z4 << CONST_BITS;
2157 
2158     tmp14 = z1 + z3;
2159     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2160     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2161     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2162     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2163     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2164     z1    -= z2;
2165     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
2166     tmp16 += tmp15;
2167     z1    += z4;
2168     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
2169     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
2170     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
2171     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2172     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2173     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
2174 
2175     tmp13 = (z1 - z3) << PASS1_BITS;
2176 
2177     /* Final output stage */
2178 
2179     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2180     wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2181     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2182     wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2183     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2184     wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2185     wsptr[8*3]  = (int) (tmp23 + tmp13);
2186     wsptr[8*10] = (int) (tmp23 - tmp13);
2187     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2188     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2189     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2190     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2191     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2192     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2193   }
2194 
2195   /* Pass 2: process 14 rows from work array, store into output array. */
2196 
2197   wsptr = workspace;
2198   for (ctr = 0; ctr < 14; ctr++) {
2199     outptr = output_buf[ctr] + output_col;
2200 
2201     /* Even part */
2202 
2203     /* Add fudge factor here for final descale. */
2204     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2205     z1 <<= CONST_BITS;
2206     z4 = (INT32) wsptr[4];
2207     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2208     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2209     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2210 
2211     tmp10 = z1 + z2;
2212     tmp11 = z1 + z3;
2213     tmp12 = z1 - z4;
2214 
2215     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
2216 
2217     z1 = (INT32) wsptr[2];
2218     z2 = (INT32) wsptr[6];
2219 
2220     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2221 
2222     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2223     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2224     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2225             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2226 
2227     tmp20 = tmp10 + tmp13;
2228     tmp26 = tmp10 - tmp13;
2229     tmp21 = tmp11 + tmp14;
2230     tmp25 = tmp11 - tmp14;
2231     tmp22 = tmp12 + tmp15;
2232     tmp24 = tmp12 - tmp15;
2233 
2234     /* Odd part */
2235 
2236     z1 = (INT32) wsptr[1];
2237     z2 = (INT32) wsptr[3];
2238     z3 = (INT32) wsptr[5];
2239     z4 = (INT32) wsptr[7];
2240     z4 <<= CONST_BITS;
2241 
2242     tmp14 = z1 + z3;
2243     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2244     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2245     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2246     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2247     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2248     z1    -= z2;
2249     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
2250     tmp16 += tmp15;
2251     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
2252     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
2253     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
2254     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2255     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2256     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
2257 
2258     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
2259 
2260     /* Final output stage */
2261 
2262     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2263                                                CONST_BITS+PASS1_BITS+3)
2264                              & RANGE_MASK];
2265     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2266                                                CONST_BITS+PASS1_BITS+3)
2267                              & RANGE_MASK];
2268     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2269                                                CONST_BITS+PASS1_BITS+3)
2270                              & RANGE_MASK];
2271     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2272                                                CONST_BITS+PASS1_BITS+3)
2273                              & RANGE_MASK];
2274     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2275                                                CONST_BITS+PASS1_BITS+3)
2276                              & RANGE_MASK];
2277     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2278                                                CONST_BITS+PASS1_BITS+3)
2279                              & RANGE_MASK];
2280     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2281                                                CONST_BITS+PASS1_BITS+3)
2282                              & RANGE_MASK];
2283     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2284                                                CONST_BITS+PASS1_BITS+3)
2285                              & RANGE_MASK];
2286     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2287                                                CONST_BITS+PASS1_BITS+3)
2288                              & RANGE_MASK];
2289     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2290                                                CONST_BITS+PASS1_BITS+3)
2291                              & RANGE_MASK];
2292     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2293                                                CONST_BITS+PASS1_BITS+3)
2294                              & RANGE_MASK];
2295     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2296                                                CONST_BITS+PASS1_BITS+3)
2297                              & RANGE_MASK];
2298     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2299                                                CONST_BITS+PASS1_BITS+3)
2300                              & RANGE_MASK];
2301     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2302                                                CONST_BITS+PASS1_BITS+3)
2303                              & RANGE_MASK];
2304 
2305     wsptr += 8;         /* advance pointer to next row */
2306   }
2307 }
2308 
2309 
2310 /*
2311  * Perform dequantization and inverse DCT on one block of coefficients,
2312  * producing a 15x15 output block.
2313  *
2314  * Optimized algorithm with 22 multiplications in the 1-D kernel.
2315  * cK represents sqrt(2) * cos(K*pi/30).
2316  */
2317 
2318 GLOBAL(void)
2319 jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2320                  JCOEFPTR coef_block,
2321                  JSAMPARRAY output_buf, JDIMENSION output_col)
2322 {
2323   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2324   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2325   INT32 z1, z2, z3, z4;
2326   JCOEFPTR inptr;
2327   ISLOW_MULT_TYPE * quantptr;
2328   int * wsptr;
2329   JSAMPROW outptr;
2330   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2331   int ctr;
2332   int workspace[8*15];  /* buffers data between passes */
2333   SHIFT_TEMPS
2334 
2335   /* Pass 1: process columns from input, store into work array. */
2336 
2337   inptr = coef_block;
2338   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2339   wsptr = workspace;
2340   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2341     /* Even part */
2342 
2343     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2344     z1 <<= CONST_BITS;
2345     /* Add fudge factor here for final descale. */
2346     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
2347 
2348     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2349     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2350     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2351 
2352     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2353     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2354 
2355     tmp12 = z1 - tmp10;
2356     tmp13 = z1 + tmp11;
2357     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
2358 
2359     z4 = z2 - z3;
2360     z3 += z2;
2361     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2362     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2363     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2364 
2365     tmp20 = tmp13 + tmp10 + tmp11;
2366     tmp23 = tmp12 - tmp10 + tmp11 + z2;
2367 
2368     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2369     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2370 
2371     tmp25 = tmp13 - tmp10 - tmp11;
2372     tmp26 = tmp12 + tmp10 - tmp11 - z2;
2373 
2374     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2375     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2376 
2377     tmp21 = tmp12 + tmp10 + tmp11;
2378     tmp24 = tmp13 - tmp10 + tmp11;
2379     tmp11 += tmp11;
2380     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2381     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2382 
2383     /* Odd part */
2384 
2385     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2386     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2387     z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2388     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2389     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2390 
2391     tmp13 = z2 - z4;
2392     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2393     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2394     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2395 
2396     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
2397     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
2398     z2 = z1 - z4;
2399     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2400 
2401     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2402     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2403     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2404     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2405     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2406     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2407 
2408     /* Final output stage */
2409 
2410     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2411     wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2412     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2413     wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2414     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2415     wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2416     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
2417     wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
2418     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2419     wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2420     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2421     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2422     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2423     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2424     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
2425   }
2426 
2427   /* Pass 2: process 15 rows from work array, store into output array. */
2428 
2429   wsptr = workspace;
2430   for (ctr = 0; ctr < 15; ctr++) {
2431     outptr = output_buf[ctr] + output_col;
2432 
2433     /* Even part */
2434 
2435     /* Add fudge factor here for final descale. */
2436     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2437     z1 <<= CONST_BITS;
2438 
2439     z2 = (INT32) wsptr[2];
2440     z3 = (INT32) wsptr[4];
2441     z4 = (INT32) wsptr[6];
2442 
2443     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2444     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2445 
2446     tmp12 = z1 - tmp10;
2447     tmp13 = z1 + tmp11;
2448     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
2449 
2450     z4 = z2 - z3;
2451     z3 += z2;
2452     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2453     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2454     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2455 
2456     tmp20 = tmp13 + tmp10 + tmp11;
2457     tmp23 = tmp12 - tmp10 + tmp11 + z2;
2458 
2459     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2460     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2461 
2462     tmp25 = tmp13 - tmp10 - tmp11;
2463     tmp26 = tmp12 + tmp10 - tmp11 - z2;
2464 
2465     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2466     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2467 
2468     tmp21 = tmp12 + tmp10 + tmp11;
2469     tmp24 = tmp13 - tmp10 + tmp11;
2470     tmp11 += tmp11;
2471     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2472     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2473 
2474     /* Odd part */
2475 
2476     z1 = (INT32) wsptr[1];
2477     z2 = (INT32) wsptr[3];
2478     z4 = (INT32) wsptr[5];
2479     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2480     z4 = (INT32) wsptr[7];
2481 
2482     tmp13 = z2 - z4;
2483     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2484     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2485     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2486 
2487     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
2488     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
2489     z2 = z1 - z4;
2490     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2491 
2492     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2493     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2494     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2495     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2496     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2497     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2498 
2499     /* Final output stage */
2500 
2501     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2502                                                CONST_BITS+PASS1_BITS+3)
2503                              & RANGE_MASK];
2504     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2505                                                CONST_BITS+PASS1_BITS+3)
2506                              & RANGE_MASK];
2507     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2508                                                CONST_BITS+PASS1_BITS+3)
2509                              & RANGE_MASK];
2510     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2511                                                CONST_BITS+PASS1_BITS+3)
2512                              & RANGE_MASK];
2513     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2514                                                CONST_BITS+PASS1_BITS+3)
2515                              & RANGE_MASK];
2516     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2517                                                CONST_BITS+PASS1_BITS+3)
2518                              & RANGE_MASK];
2519     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2520                                                CONST_BITS+PASS1_BITS+3)
2521                              & RANGE_MASK];
2522     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2523                                                CONST_BITS+PASS1_BITS+3)
2524                              & RANGE_MASK];
2525     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2526                                                CONST_BITS+PASS1_BITS+3)
2527                              & RANGE_MASK];
2528     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2529                                                CONST_BITS+PASS1_BITS+3)
2530                              & RANGE_MASK];
2531     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2532                                                CONST_BITS+PASS1_BITS+3)
2533                              & RANGE_MASK];
2534     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2535                                                CONST_BITS+PASS1_BITS+3)
2536                              & RANGE_MASK];
2537     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2538                                                CONST_BITS+PASS1_BITS+3)
2539                              & RANGE_MASK];
2540     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2541                                                CONST_BITS+PASS1_BITS+3)
2542                              & RANGE_MASK];
2543     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
2544                                                CONST_BITS+PASS1_BITS+3)
2545                              & RANGE_MASK];
2546 
2547     wsptr += 8;         /* advance pointer to next row */
2548   }
2549 }
2550 
2551 
2552 /*
2553  * Perform dequantization and inverse DCT on one block of coefficients,
2554  * producing a 16x16 output block.
2555  *
2556  * Optimized algorithm with 28 multiplications in the 1-D kernel.
2557  * cK represents sqrt(2) * cos(K*pi/32).
2558  */
2559 
2560 GLOBAL(void)
2561 jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2562                  JCOEFPTR coef_block,
2563                  JSAMPARRAY output_buf, JDIMENSION output_col)
2564 {
2565   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2566   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2567   INT32 z1, z2, z3, z4;
2568   JCOEFPTR inptr;
2569   ISLOW_MULT_TYPE * quantptr;
2570   int * wsptr;
2571   JSAMPROW outptr;
2572   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2573   int ctr;
2574   int workspace[8*16];  /* buffers data between passes */
2575   SHIFT_TEMPS
2576 
2577   /* Pass 1: process columns from input, store into work array. */
2578 
2579   inptr = coef_block;
2580   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2581   wsptr = workspace;
2582   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2583     /* Even part */
2584 
2585     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2586     tmp0 <<= CONST_BITS;
2587     /* Add fudge factor here for final descale. */
2588     tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
2589 
2590     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2591     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2592     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2593 
2594     tmp10 = tmp0 + tmp1;
2595     tmp11 = tmp0 - tmp1;
2596     tmp12 = tmp0 + tmp2;
2597     tmp13 = tmp0 - tmp2;
2598 
2599     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2600     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2601     z3 = z1 - z2;
2602     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2603     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2604 
2605     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2606     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2607     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2608     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2609 
2610     tmp20 = tmp10 + tmp0;
2611     tmp27 = tmp10 - tmp0;
2612     tmp21 = tmp12 + tmp1;
2613     tmp26 = tmp12 - tmp1;
2614     tmp22 = tmp13 + tmp2;
2615     tmp25 = tmp13 - tmp2;
2616     tmp23 = tmp11 + tmp3;
2617     tmp24 = tmp11 - tmp3;
2618 
2619     /* Odd part */
2620 
2621     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2622     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2623     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2624     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2625 
2626     tmp11 = z1 + z3;
2627 
2628     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2629     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2630     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2631     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2632     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2633     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2634     tmp0  = tmp1 + tmp2 + tmp3 -
2635             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2636     tmp13 = tmp10 + tmp11 + tmp12 -
2637             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
2638     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2639     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2640     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2641     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2642     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2643     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2644     z2    += z4;
2645     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
2646     tmp1  += z1;
2647     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2648     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
2649     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2650     tmp12 += z2;
2651     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2652     tmp2  += z2;
2653     tmp3  += z2;
2654     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2655     tmp10 += z2;
2656     tmp11 += z2;
2657 
2658     /* Final output stage */
2659 
2660     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
2661     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
2662     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
2663     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
2664     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
2665     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
2666     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
2667     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
2668     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
2669     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
2670     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
2671     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
2672     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
2673     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
2674     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
2675     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
2676   }
2677 
2678   /* Pass 2: process 16 rows from work array, store into output array. */
2679 
2680   wsptr = workspace;
2681   for (ctr = 0; ctr < 16; ctr++) {
2682     outptr = output_buf[ctr] + output_col;
2683 
2684     /* Even part */
2685 
2686     /* Add fudge factor here for final descale. */
2687     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2688     tmp0 <<= CONST_BITS;
2689 
2690     z1 = (INT32) wsptr[4];
2691     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2692     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2693 
2694     tmp10 = tmp0 + tmp1;
2695     tmp11 = tmp0 - tmp1;
2696     tmp12 = tmp0 + tmp2;
2697     tmp13 = tmp0 - tmp2;
2698 
2699     z1 = (INT32) wsptr[2];
2700     z2 = (INT32) wsptr[6];
2701     z3 = z1 - z2;
2702     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2703     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2704 
2705     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2706     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2707     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2708     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2709 
2710     tmp20 = tmp10 + tmp0;
2711     tmp27 = tmp10 - tmp0;
2712     tmp21 = tmp12 + tmp1;
2713     tmp26 = tmp12 - tmp1;
2714     tmp22 = tmp13 + tmp2;
2715     tmp25 = tmp13 - tmp2;
2716     tmp23 = tmp11 + tmp3;
2717     tmp24 = tmp11 - tmp3;
2718 
2719     /* Odd part */
2720 
2721     z1 = (INT32) wsptr[1];
2722     z2 = (INT32) wsptr[3];
2723     z3 = (INT32) wsptr[5];
2724     z4 = (INT32) wsptr[7];
2725 
2726     tmp11 = z1 + z3;
2727 
2728     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2729     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2730     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2731     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2732     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2733     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2734     tmp0  = tmp1 + tmp2 + tmp3 -
2735             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2736     tmp13 = tmp10 + tmp11 + tmp12 -
2737             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
2738     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2739     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2740     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2741     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2742     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2743     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2744     z2    += z4;
2745     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
2746     tmp1  += z1;
2747     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2748     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
2749     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2750     tmp12 += z2;
2751     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2752     tmp2  += z2;
2753     tmp3  += z2;
2754     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2755     tmp10 += z2;
2756     tmp11 += z2;
2757 
2758     /* Final output stage */
2759 
2760     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
2761                                                CONST_BITS+PASS1_BITS+3)
2762                              & RANGE_MASK];
2763     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
2764                                                CONST_BITS+PASS1_BITS+3)
2765                              & RANGE_MASK];
2766     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
2767                                                CONST_BITS+PASS1_BITS+3)
2768                              & RANGE_MASK];
2769     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
2770                                                CONST_BITS+PASS1_BITS+3)
2771                              & RANGE_MASK];
2772     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
2773                                                CONST_BITS+PASS1_BITS+3)
2774                              & RANGE_MASK];
2775     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
2776                                                CONST_BITS+PASS1_BITS+3)
2777                              & RANGE_MASK];
2778     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
2779                                                CONST_BITS+PASS1_BITS+3)
2780                              & RANGE_MASK];
2781     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
2782                                                CONST_BITS+PASS1_BITS+3)
2783                              & RANGE_MASK];
2784     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
2785                                                CONST_BITS+PASS1_BITS+3)
2786                              & RANGE_MASK];
2787     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
2788                                                CONST_BITS+PASS1_BITS+3)
2789                              & RANGE_MASK];
2790     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
2791                                                CONST_BITS+PASS1_BITS+3)
2792                              & RANGE_MASK];
2793     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
2794                                                CONST_BITS+PASS1_BITS+3)
2795                              & RANGE_MASK];
2796     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
2797                                                CONST_BITS+PASS1_BITS+3)
2798                              & RANGE_MASK];
2799     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
2800                                                CONST_BITS+PASS1_BITS+3)
2801                              & RANGE_MASK];
2802     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
2803                                                CONST_BITS+PASS1_BITS+3)
2804                              & RANGE_MASK];
2805     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
2806                                                CONST_BITS+PASS1_BITS+3)
2807                              & RANGE_MASK];
2808 
2809     wsptr += 8;         /* advance pointer to next row */
2810   }
2811 }
2812 
2813 
2814 /*
2815  * Perform dequantization and inverse DCT on one block of coefficients,
2816  * producing a 16x8 output block.
2817  *
2818  * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows).
2819  */
2820 
2821 GLOBAL(void)
2822 jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2823                 JCOEFPTR coef_block,
2824                 JSAMPARRAY output_buf, JDIMENSION output_col)
2825 {
2826   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2827   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2828   INT32 z1, z2, z3, z4;
2829   JCOEFPTR inptr;
2830   ISLOW_MULT_TYPE * quantptr;
2831   int * wsptr;
2832   JSAMPROW outptr;
2833   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2834   int ctr;
2835   int workspace[8*8];   /* buffers data between passes */
2836   SHIFT_TEMPS
2837 
2838   /* Pass 1: process columns from input, store into work array. */
2839   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
2840   /* furthermore, we scale the results by 2**PASS1_BITS. */
2841 
2842   inptr = coef_block;
2843   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2844   wsptr = workspace;
2845   for (ctr = DCTSIZE; ctr > 0; ctr--) {
2846     /* Due to quantization, we will usually find that many of the input
2847      * coefficients are zero, especially the AC terms.  We can exploit this
2848      * by short-circuiting the IDCT calculation for any column in which all
2849      * the AC terms are zero.  In that case each output is equal to the
2850      * DC coefficient (with scale factor as needed).
2851      * With typical images and quantization tables, half or more of the
2852      * column DCT calculations can be simplified this way.
2853      */
2854 
2855     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
2856         inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
2857         inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
2858         inptr[DCTSIZE*7] == 0) {
2859       /* AC terms all zero */
2860       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
2861 
2862       wsptr[DCTSIZE*0] = dcval;
2863       wsptr[DCTSIZE*1] = dcval;
2864       wsptr[DCTSIZE*2] = dcval;
2865       wsptr[DCTSIZE*3] = dcval;
2866       wsptr[DCTSIZE*4] = dcval;
2867       wsptr[DCTSIZE*5] = dcval;
2868       wsptr[DCTSIZE*6] = dcval;
2869       wsptr[DCTSIZE*7] = dcval;
2870 
2871       inptr++;                  /* advance pointers to next column */
2872       quantptr++;
2873       wsptr++;
2874       continue;
2875     }
2876 
2877     /* Even part: reverse the even part of the forward DCT. */
2878     /* The rotator is sqrt(2)*c(-6). */
2879 
2880     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2881     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2882 
2883     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
2884     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
2885     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
2886 
2887     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2888     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2889     z2 <<= CONST_BITS;
2890     z3 <<= CONST_BITS;
2891     /* Add fudge factor here for final descale. */
2892     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
2893 
2894     tmp0 = z2 + z3;
2895     tmp1 = z2 - z3;
2896 
2897     tmp10 = tmp0 + tmp2;
2898     tmp13 = tmp0 - tmp2;
2899     tmp11 = tmp1 + tmp3;
2900     tmp12 = tmp1 - tmp3;
2901 
2902     /* Odd part per figure 8; the matrix is unitary and hence its
2903      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
2904      */
2905 
2906     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2907     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2908     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2909     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2910 
2911     z2 = tmp0 + tmp2;
2912     z3 = tmp1 + tmp3;
2913 
2914     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
2915     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
2916     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
2917     z2 += z1;
2918     z3 += z1;
2919 
2920     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
2921     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
2922     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
2923     tmp0 += z1 + z2;
2924     tmp3 += z1 + z3;
2925 
2926     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
2927     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
2928     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
2929     tmp1 += z1 + z3;
2930     tmp2 += z1 + z2;
2931 
2932     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
2933 
2934     wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
2935     wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
2936     wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
2937     wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
2938     wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
2939     wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
2940     wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
2941     wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
2942 
2943     inptr++;                    /* advance pointers to next column */
2944     quantptr++;
2945     wsptr++;
2946   }
2947 
2948   /* Pass 2: process 8 rows from work array, store into output array.
2949    * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
2950    */
2951   wsptr = workspace;
2952   for (ctr = 0; ctr < 8; ctr++) {
2953     outptr = output_buf[ctr] + output_col;
2954 
2955     /* Even part */
2956 
2957     /* Add fudge factor here for final descale. */
2958     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2959     tmp0 <<= CONST_BITS;
2960 
2961     z1 = (INT32) wsptr[4];
2962     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2963     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2964 
2965     tmp10 = tmp0 + tmp1;
2966     tmp11 = tmp0 - tmp1;
2967     tmp12 = tmp0 + tmp2;
2968     tmp13 = tmp0 - tmp2;
2969 
2970     z1 = (INT32) wsptr[2];
2971     z2 = (INT32) wsptr[6];
2972     z3 = z1 - z2;
2973     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2974     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2975 
2976     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2977     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2978     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2979     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2980 
2981     tmp20 = tmp10 + tmp0;
2982     tmp27 = tmp10 - tmp0;
2983     tmp21 = tmp12 + tmp1;
2984     tmp26 = tmp12 - tmp1;
2985     tmp22 = tmp13 + tmp2;
2986     tmp25 = tmp13 - tmp2;
2987     tmp23 = tmp11 + tmp3;
2988     tmp24 = tmp11 - tmp3;
2989 
2990     /* Odd part */
2991 
2992     z1 = (INT32) wsptr[1];
2993     z2 = (INT32) wsptr[3];
2994     z3 = (INT32) wsptr[5];
2995     z4 = (INT32) wsptr[7];
2996 
2997     tmp11 = z1 + z3;
2998 
2999     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
3000     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
3001     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
3002     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
3003     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
3004     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
3005     tmp0  = tmp1 + tmp2 + tmp3 -
3006             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
3007     tmp13 = tmp10 + tmp11 + tmp12 -
3008             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
3009     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
3010     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
3011     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
3012     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
3013     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
3014     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
3015     z2    += z4;
3016     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
3017     tmp1  += z1;
3018     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
3019     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
3020     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
3021     tmp12 += z2;
3022     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
3023     tmp2  += z2;
3024     tmp3  += z2;
3025     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
3026     tmp10 += z2;
3027     tmp11 += z2;
3028 
3029     /* Final output stage */
3030 
3031     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
3032                                                CONST_BITS+PASS1_BITS+3)
3033                              & RANGE_MASK];
3034     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
3035                                                CONST_BITS+PASS1_BITS+3)
3036                              & RANGE_MASK];
3037     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
3038                                                CONST_BITS+PASS1_BITS+3)
3039                              & RANGE_MASK];
3040     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
3041                                                CONST_BITS+PASS1_BITS+3)
3042                              & RANGE_MASK];
3043     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
3044                                                CONST_BITS+PASS1_BITS+3)
3045                              & RANGE_MASK];
3046     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
3047                                                CONST_BITS+PASS1_BITS+3)
3048                              & RANGE_MASK];
3049     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
3050                                                CONST_BITS+PASS1_BITS+3)
3051                              & RANGE_MASK];
3052     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
3053                                                CONST_BITS+PASS1_BITS+3)
3054                              & RANGE_MASK];
3055     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
3056                                                CONST_BITS+PASS1_BITS+3)
3057                              & RANGE_MASK];
3058     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
3059                                                CONST_BITS+PASS1_BITS+3)
3060                              & RANGE_MASK];
3061     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
3062                                                CONST_BITS+PASS1_BITS+3)
3063                              & RANGE_MASK];
3064     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
3065                                                CONST_BITS+PASS1_BITS+3)
3066                              & RANGE_MASK];
3067     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
3068                                                CONST_BITS+PASS1_BITS+3)
3069                              & RANGE_MASK];
3070     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
3071                                                CONST_BITS+PASS1_BITS+3)
3072                              & RANGE_MASK];
3073     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
3074                                                CONST_BITS+PASS1_BITS+3)
3075                              & RANGE_MASK];
3076     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
3077                                                CONST_BITS+PASS1_BITS+3)
3078                              & RANGE_MASK];
3079 
3080     wsptr += 8;         /* advance pointer to next row */
3081   }
3082 }
3083 
3084 
3085 /*
3086  * Perform dequantization and inverse DCT on one block of coefficients,
3087  * producing a 14x7 output block.
3088  *
3089  * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows).
3090  */
3091 
3092 GLOBAL(void)
3093 jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3094                 JCOEFPTR coef_block,
3095                 JSAMPARRAY output_buf, JDIMENSION output_col)
3096 {
3097   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3098   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
3099   INT32 z1, z2, z3, z4;
3100   JCOEFPTR inptr;
3101   ISLOW_MULT_TYPE * quantptr;
3102   int * wsptr;
3103   JSAMPROW outptr;
3104   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3105   int ctr;
3106   int workspace[8*7];   /* buffers data between passes */
3107   SHIFT_TEMPS
3108 
3109   /* Pass 1: process columns from input, store into work array.
3110    * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3111    */
3112   inptr = coef_block;
3113   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3114   wsptr = workspace;
3115   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3116     /* Even part */
3117 
3118     tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3119     tmp23 <<= CONST_BITS;
3120     /* Add fudge factor here for final descale. */
3121     tmp23 += ONE << (CONST_BITS-PASS1_BITS-1);
3122 
3123     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3124     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3125     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
3126 
3127     tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
3128     tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
3129     tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
3130     tmp10 = z1 + z3;
3131     z2 -= tmp10;
3132     tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
3133     tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
3134     tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
3135     tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
3136 
3137     /* Odd part */
3138 
3139     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3140     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3141     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
3142 
3143     tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
3144     tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
3145     tmp10 = tmp11 - tmp12;
3146     tmp11 += tmp12;
3147     tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
3148     tmp11 += tmp12;
3149     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
3150     tmp10 += z2;
3151     tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
3152 
3153     /* Final output stage */
3154 
3155     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
3156     wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
3157     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
3158     wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
3159     wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
3160     wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
3161     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS);
3162   }
3163 
3164   /* Pass 2: process 7 rows from work array, store into output array.
3165    * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
3166    */
3167   wsptr = workspace;
3168   for (ctr = 0; ctr < 7; ctr++) {
3169     outptr = output_buf[ctr] + output_col;
3170 
3171     /* Even part */
3172 
3173     /* Add fudge factor here for final descale. */
3174     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3175     z1 <<= CONST_BITS;
3176     z4 = (INT32) wsptr[4];
3177     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
3178     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
3179     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
3180 
3181     tmp10 = z1 + z2;
3182     tmp11 = z1 + z3;
3183     tmp12 = z1 - z4;
3184 
3185     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
3186 
3187     z1 = (INT32) wsptr[2];
3188     z2 = (INT32) wsptr[6];
3189 
3190     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
3191 
3192     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
3193     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
3194     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
3195             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
3196 
3197     tmp20 = tmp10 + tmp13;
3198     tmp26 = tmp10 - tmp13;
3199     tmp21 = tmp11 + tmp14;
3200     tmp25 = tmp11 - tmp14;
3201     tmp22 = tmp12 + tmp15;
3202     tmp24 = tmp12 - tmp15;
3203 
3204     /* Odd part */
3205 
3206     z1 = (INT32) wsptr[1];
3207     z2 = (INT32) wsptr[3];
3208     z3 = (INT32) wsptr[5];
3209     z4 = (INT32) wsptr[7];
3210     z4 <<= CONST_BITS;
3211 
3212     tmp14 = z1 + z3;
3213     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
3214     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
3215     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
3216     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
3217     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
3218     z1    -= z2;
3219     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
3220     tmp16 += tmp15;
3221     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
3222     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
3223     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
3224     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
3225     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
3226     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
3227 
3228     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
3229 
3230     /* Final output stage */
3231 
3232     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3233                                                CONST_BITS+PASS1_BITS+3)
3234                              & RANGE_MASK];
3235     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3236                                                CONST_BITS+PASS1_BITS+3)
3237                              & RANGE_MASK];
3238     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3239                                                CONST_BITS+PASS1_BITS+3)
3240                              & RANGE_MASK];
3241     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3242                                                CONST_BITS+PASS1_BITS+3)
3243                              & RANGE_MASK];
3244     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3245                                                CONST_BITS+PASS1_BITS+3)
3246                              & RANGE_MASK];
3247     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3248                                                CONST_BITS+PASS1_BITS+3)
3249                              & RANGE_MASK];
3250     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3251                                                CONST_BITS+PASS1_BITS+3)
3252                              & RANGE_MASK];
3253     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3254                                                CONST_BITS+PASS1_BITS+3)
3255                              & RANGE_MASK];
3256     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3257                                                CONST_BITS+PASS1_BITS+3)
3258                              & RANGE_MASK];
3259     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3260                                                CONST_BITS+PASS1_BITS+3)
3261                              & RANGE_MASK];
3262     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
3263                                                CONST_BITS+PASS1_BITS+3)
3264                              & RANGE_MASK];
3265     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
3266                                                CONST_BITS+PASS1_BITS+3)
3267                              & RANGE_MASK];
3268     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
3269                                                CONST_BITS+PASS1_BITS+3)
3270                              & RANGE_MASK];
3271     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
3272                                                CONST_BITS+PASS1_BITS+3)
3273                              & RANGE_MASK];
3274 
3275     wsptr += 8;         /* advance pointer to next row */
3276   }
3277 }
3278 
3279 
3280 /*
3281  * Perform dequantization and inverse DCT on one block of coefficients,
3282  * producing a 12x6 output block.
3283  *
3284  * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows).
3285  */
3286 
3287 GLOBAL(void)
3288 jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3289                 JCOEFPTR coef_block,
3290                 JSAMPARRAY output_buf, JDIMENSION output_col)
3291 {
3292   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3293   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
3294   INT32 z1, z2, z3, z4;
3295   JCOEFPTR inptr;
3296   ISLOW_MULT_TYPE * quantptr;
3297   int * wsptr;
3298   JSAMPROW outptr;
3299   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3300   int ctr;
3301   int workspace[8*6];   /* buffers data between passes */
3302   SHIFT_TEMPS
3303 
3304   /* Pass 1: process columns from input, store into work array.
3305    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3306    */
3307   inptr = coef_block;
3308   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3309   wsptr = workspace;
3310   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3311     /* Even part */
3312 
3313     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3314     tmp10 <<= CONST_BITS;
3315     /* Add fudge factor here for final descale. */
3316     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
3317     tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3318     tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
3319     tmp11 = tmp10 + tmp20;
3320     tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS);
3321     tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3322     tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
3323     tmp20 = tmp11 + tmp10;
3324     tmp22 = tmp11 - tmp10;
3325 
3326     /* Odd part */
3327 
3328     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3329     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3330     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
3331     tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
3332     tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
3333     tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
3334     tmp11 = (z1 - z2 - z3) << PASS1_BITS;
3335 
3336     /* Final output stage */
3337 
3338     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
3339     wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
3340     wsptr[8*1] = (int) (tmp21 + tmp11);
3341     wsptr[8*4] = (int) (tmp21 - tmp11);
3342     wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
3343     wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
3344   }
3345 
3346   /* Pass 2: process 6 rows from work array, store into output array.
3347    * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
3348    */
3349   wsptr = workspace;
3350   for (ctr = 0; ctr < 6; ctr++) {
3351     outptr = output_buf[ctr] + output_col;
3352 
3353     /* Even part */
3354 
3355     /* Add fudge factor here for final descale. */
3356     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3357     z3 <<= CONST_BITS;
3358 
3359     z4 = (INT32) wsptr[4];
3360     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
3361 
3362     tmp10 = z3 + z4;
3363     tmp11 = z3 - z4;
3364 
3365     z1 = (INT32) wsptr[2];
3366     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
3367     z1 <<= CONST_BITS;
3368     z2 = (INT32) wsptr[6];
3369     z2 <<= CONST_BITS;
3370 
3371     tmp12 = z1 - z2;
3372 
3373     tmp21 = z3 + tmp12;
3374     tmp24 = z3 - tmp12;
3375 
3376     tmp12 = z4 + z2;
3377 
3378     tmp20 = tmp10 + tmp12;
3379     tmp25 = tmp10 - tmp12;
3380 
3381     tmp12 = z4 - z1 - z2;
3382 
3383     tmp22 = tmp11 + tmp12;
3384     tmp23 = tmp11 - tmp12;
3385 
3386     /* Odd part */
3387 
3388     z1 = (INT32) wsptr[1];
3389     z2 = (INT32) wsptr[3];
3390     z3 = (INT32) wsptr[5];
3391     z4 = (INT32) wsptr[7];
3392 
3393     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
3394     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
3395 
3396     tmp10 = z1 + z3;
3397     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
3398     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
3399     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
3400     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
3401     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
3402     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
3403     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
3404              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
3405 
3406     z1 -= z4;
3407     z2 -= z3;
3408     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
3409     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
3410     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
3411 
3412     /* Final output stage */
3413 
3414     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3415                                                CONST_BITS+PASS1_BITS+3)
3416                              & RANGE_MASK];
3417     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3418                                                CONST_BITS+PASS1_BITS+3)
3419                              & RANGE_MASK];
3420     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3421                                                CONST_BITS+PASS1_BITS+3)
3422                              & RANGE_MASK];
3423     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3424                                                CONST_BITS+PASS1_BITS+3)
3425                              & RANGE_MASK];
3426     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3427                                                CONST_BITS+PASS1_BITS+3)
3428                              & RANGE_MASK];
3429     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3430                                                CONST_BITS+PASS1_BITS+3)
3431                              & RANGE_MASK];
3432     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3433                                                CONST_BITS+PASS1_BITS+3)
3434                              & RANGE_MASK];
3435     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3436                                                CONST_BITS+PASS1_BITS+3)
3437                              & RANGE_MASK];
3438     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3439                                                CONST_BITS+PASS1_BITS+3)
3440                              & RANGE_MASK];
3441     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3442                                                CONST_BITS+PASS1_BITS+3)
3443                              & RANGE_MASK];
3444     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
3445                                                CONST_BITS+PASS1_BITS+3)
3446                              & RANGE_MASK];
3447     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
3448                                                CONST_BITS+PASS1_BITS+3)
3449                              & RANGE_MASK];
3450 
3451     wsptr += 8;         /* advance pointer to next row */
3452   }
3453 }
3454 
3455 
3456 /*
3457  * Perform dequantization and inverse DCT on one block of coefficients,
3458  * producing a 10x5 output block.
3459  *
3460  * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows).
3461  */
3462 
3463 GLOBAL(void)
3464 jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3465                 JCOEFPTR coef_block,
3466                 JSAMPARRAY output_buf, JDIMENSION output_col)
3467 {
3468   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3469   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
3470   INT32 z1, z2, z3, z4;
3471   JCOEFPTR inptr;
3472   ISLOW_MULT_TYPE * quantptr;
3473   int * wsptr;
3474   JSAMPROW outptr;
3475   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3476   int ctr;
3477   int workspace[8*5];   /* buffers data between passes */
3478   SHIFT_TEMPS
3479 
3480   /* Pass 1: process columns from input, store into work array.
3481    * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
3482    */
3483   inptr = coef_block;
3484   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3485   wsptr = workspace;
3486   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3487     /* Even part */
3488 
3489     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3490     tmp12 <<= CONST_BITS;
3491     /* Add fudge factor here for final descale. */
3492     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
3493     tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3494     tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3495     z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
3496     z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
3497     z3 = tmp12 + z2;
3498     tmp10 = z3 + z1;
3499     tmp11 = z3 - z1;
3500     tmp12 -= z2 << 2;
3501 
3502     /* Odd part */
3503 
3504     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3505     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3506 
3507     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
3508     tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
3509     tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
3510 
3511     /* Final output stage */
3512 
3513     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS);
3514     wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS);
3515     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS);
3516     wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS);
3517     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
3518   }
3519 
3520   /* Pass 2: process 5 rows from work array, store into output array.
3521    * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
3522    */
3523   wsptr = workspace;
3524   for (ctr = 0; ctr < 5; ctr++) {
3525     outptr = output_buf[ctr] + output_col;
3526 
3527     /* Even part */
3528 
3529     /* Add fudge factor here for final descale. */
3530     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3531     z3 <<= CONST_BITS;
3532     z4 = (INT32) wsptr[4];
3533     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
3534     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
3535     tmp10 = z3 + z1;
3536     tmp11 = z3 - z2;
3537 
3538     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
3539 
3540     z2 = (INT32) wsptr[2];
3541     z3 = (INT32) wsptr[6];
3542 
3543     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
3544     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
3545     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
3546 
3547     tmp20 = tmp10 + tmp12;
3548     tmp24 = tmp10 - tmp12;
3549     tmp21 = tmp11 + tmp13;
3550     tmp23 = tmp11 - tmp13;
3551 
3552     /* Odd part */
3553 
3554     z1 = (INT32) wsptr[1];
3555     z2 = (INT32) wsptr[3];
3556     z3 = (INT32) wsptr[5];
3557     z3 <<= CONST_BITS;
3558     z4 = (INT32) wsptr[7];
3559 
3560     tmp11 = z2 + z4;
3561     tmp13 = z2 - z4;
3562 
3563     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
3564 
3565     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
3566     z4 = z3 + tmp12;
3567 
3568     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
3569     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
3570 
3571     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
3572     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
3573 
3574     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
3575 
3576     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
3577     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
3578 
3579     /* Final output stage */
3580 
3581     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3582                                               CONST_BITS+PASS1_BITS+3)
3583                             & RANGE_MASK];
3584     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3585                                               CONST_BITS+PASS1_BITS+3)
3586                             & RANGE_MASK];
3587     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3588                                               CONST_BITS+PASS1_BITS+3)
3589                             & RANGE_MASK];
3590     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3591                                               CONST_BITS+PASS1_BITS+3)
3592                             & RANGE_MASK];
3593     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3594                                               CONST_BITS+PASS1_BITS+3)
3595                             & RANGE_MASK];
3596     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3597                                               CONST_BITS+PASS1_BITS+3)
3598                             & RANGE_MASK];
3599     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3600                                               CONST_BITS+PASS1_BITS+3)
3601                             & RANGE_MASK];
3602     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3603                                               CONST_BITS+PASS1_BITS+3)
3604                             & RANGE_MASK];
3605     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3606                                               CONST_BITS+PASS1_BITS+3)
3607                             & RANGE_MASK];
3608     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3609                                               CONST_BITS+PASS1_BITS+3)
3610                             & RANGE_MASK];
3611 
3612     wsptr += 8;         /* advance pointer to next row */
3613   }
3614 }
3615 
3616 
3617 /*
3618  * Perform dequantization and inverse DCT on one block of coefficients,
3619  * producing a 8x4 output block: 8 samples wide by 4 rows, stored into
3620  * output_buf[0..3] starting at column output_col.
3621  * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
3622  */
3623 
3624 GLOBAL(void)
3625 jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3626                JCOEFPTR coef_block,
3627                JSAMPARRAY output_buf, JDIMENSION output_col)
3628 {
3629   INT32 tmp0, tmp1, tmp2, tmp3;
3630   INT32 tmp10, tmp11, tmp12, tmp13;
3631   INT32 z1, z2, z3;
3632   JCOEFPTR inptr;
3633   ISLOW_MULT_TYPE * quantptr;
3634   int * wsptr;
3635   JSAMPROW outptr;
3636   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3637   int ctr;
3638   int workspace[8*4];   /* pass-1 results: 4 rows of 8 values each */
3639   SHIFT_TEMPS
3640 
3641   /* Pass 1: process the 8 columns of the input, reading only coefficient
3642    * rows 0..3 of each column, and store the results into the work array.
3643    * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */
3644   inptr = coef_block;
3645   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3646   wsptr = workspace;
3647   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3648     /* Even part: inputs 0 and 2 take no multiplication, so their sum and */
3649     /* difference are only scaled up by PASS1_BITS. */
3650     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3651     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3652 
3653     tmp10 = (tmp0 + tmp2) << PASS1_BITS;
3654     tmp12 = (tmp0 - tmp2) << PASS1_BITS;
3655 
3656     /* Odd part */
3657     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
3658 
3659     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3660     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3661 
3662     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
3663     /* Add fudge factor (half the divisor of the descale below) here. */
3664     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
3665     tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
3666                        CONST_BITS-PASS1_BITS);
3667     tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
3668                        CONST_BITS-PASS1_BITS);
3669 
3670     /* Final output stage: the four column results go to workspace rows */
3671     /* 0..3, which are 8 entries apart. */
3672     wsptr[8*0] = (int) (tmp10 + tmp0);
3673     wsptr[8*3] = (int) (tmp10 - tmp0);
3674     wsptr[8*1] = (int) (tmp12 + tmp2);
3675     wsptr[8*2] = (int) (tmp12 - tmp2);
3676   }
3677 
3678   /* Pass 2: process the 4 rows of the work array, store into output array. */
3679   /* Note that we must descale the results by a factor of 8 == 2**3, */
3680   /* and also undo the PASS1_BITS scaling. */
3681 
3682   wsptr = workspace;
3683   for (ctr = 0; ctr < 4; ctr++) {
3684     outptr = output_buf[ctr] + output_col;
3685 
3686     /* Even part: reverse the even part of the forward DCT. */
3687     /* The rotator is sqrt(2)*c(-6). */
3688 
3689     z2 = (INT32) wsptr[2];
3690     z3 = (INT32) wsptr[6];
3691 
3692     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
3693     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
3694     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
3695     /* Add fudge factor here for final descale: after the left shift by */
3696     /* CONST_BITS below it is half of 2**(CONST_BITS+PASS1_BITS+3). */
3697     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3698     z3 = (INT32) wsptr[4];
3699 
3700     tmp0 = (z2 + z3) << CONST_BITS;
3701     tmp1 = (z2 - z3) << CONST_BITS;
3702 
3703     tmp10 = tmp0 + tmp2;
3704     tmp13 = tmp0 - tmp2;
3705     tmp11 = tmp1 + tmp3;
3706     tmp12 = tmp1 - tmp3;
3707 
3708     /* Odd part per figure 8; the matrix is unitary and hence its
3709      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
3710      */
3711 
3712     tmp0 = (INT32) wsptr[7];
3713     tmp1 = (INT32) wsptr[5];
3714     tmp2 = (INT32) wsptr[3];
3715     tmp3 = (INT32) wsptr[1];
3716 
3717     z2 = tmp0 + tmp2;
3718     z3 = tmp1 + tmp3;
3719 
3720     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
3721     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
3722     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
3723     z2 += z1;
3724     z3 += z1;
3725 
3726     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
3727     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
3728     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
3729     tmp0 += z1 + z2;
3730     tmp3 += z1 + z3;
3731 
3732     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
3733     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
3734     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
3735     tmp1 += z1 + z3;
3736     tmp2 += z1 + z2;
3737 
3738     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3. */
3739     /* Samples k and 7-k are the sum and difference of one even/odd pair. */
3740     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
3741                                               CONST_BITS+PASS1_BITS+3)
3742                             & RANGE_MASK];
3743     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
3744                                               CONST_BITS+PASS1_BITS+3)
3745                             & RANGE_MASK];
3746     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
3747                                               CONST_BITS+PASS1_BITS+3)
3748                             & RANGE_MASK];
3749     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
3750                                               CONST_BITS+PASS1_BITS+3)
3751                             & RANGE_MASK];
3752     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
3753                                               CONST_BITS+PASS1_BITS+3)
3754                             & RANGE_MASK];
3755     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
3756                                               CONST_BITS+PASS1_BITS+3)
3757                             & RANGE_MASK];
3758     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
3759                                               CONST_BITS+PASS1_BITS+3)
3760                             & RANGE_MASK];
3761     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
3762                                               CONST_BITS+PASS1_BITS+3)
3763                             & RANGE_MASK];
3764 
3765     wsptr += DCTSIZE;           /* next row; pass 1 used stride 8, so needs DCTSIZE == 8 */
3766   }
3767 }
3768 
3769 
3770 /*
3771  * Dequantize one block of coefficients and apply the inverse DCT,
3772  * producing a reduced-size 6x3 output block (6 samples wide, 3 rows).
3773  *
3774  * Pass 1 runs a 3-point IDCT on columns, pass 2 a 6-point IDCT on rows.
3775  */
3776 
3777 GLOBAL(void)
3778 jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3779                JCOEFPTR coef_block,
3780                JSAMPARRAY output_buf, JDIMENSION output_col)
3781 {
3782   INT32 dc, base, term, rot;            /* shared scratch values */
3783   INT32 in1, in2, in3, in4, in5;        /* inputs of the current 1-D pass */
3784   INT32 even0, even1, even2;            /* even-part results */
3785   INT32 odd0, odd1, odd2;               /* odd-part results */
3786   ISLOW_MULT_TYPE * qtable = (ISLOW_MULT_TYPE *) compptr->dct_table;
3787   JSAMPLE *limit_tbl = IDCT_range_limit(cinfo);
3788   JSAMPROW dst;
3789   int * row;
3790   int col, r;
3791   int work[6*3];        /* pass-1 results: 3 rows of 6 values */
3792   SHIFT_TEMPS
3793 
3794   /* Pass 1: 3-point IDCT on each of the first 6 input columns, results
3795    * stored in the work array.  cK represents sqrt(2) * cos(K*pi/6).
3796    */
3797   for (col = 0; col < 6; col++) {
3798     /* Even part: the DC term is raised to CONST_BITS scale and carries
3799      * the fudge factor for the descale at the end of this pass.
3800      */
3801     dc = DEQUANTIZE(coef_block[DCTSIZE*0 + col], qtable[DCTSIZE*0 + col]);
3802     dc <<= CONST_BITS;
3803     dc += ONE << (CONST_BITS-PASS1_BITS-1);
3804 
3805     in2 = DEQUANTIZE(coef_block[DCTSIZE*2 + col], qtable[DCTSIZE*2 + col]);
3806     term = MULTIPLY(in2, FIX(0.707106781));     /* c2 */
3807     even0 = dc + term;
3808     even1 = dc - term - term;
3809 
3810     /* Odd part */
3811 
3812     in1 = DEQUANTIZE(coef_block[DCTSIZE*1 + col], qtable[DCTSIZE*1 + col]);
3813     odd0 = MULTIPLY(in1, FIX(1.224744871));     /* c1 */
3814 
3815     /* Output stage: rows 0 and 2 mirror each other, row 1 is even-only */
3816 
3817     work[6*0 + col] = (int) RIGHT_SHIFT(even0 + odd0, CONST_BITS-PASS1_BITS);
3818     work[6*2 + col] = (int) RIGHT_SHIFT(even0 - odd0, CONST_BITS-PASS1_BITS);
3819     work[6*1 + col] = (int) RIGHT_SHIFT(even1, CONST_BITS-PASS1_BITS);
3820   }
3821 
3822   /* Pass 2: 6-point IDCT on each of the 3 work-array rows, results stored
3823    * in the output array.  cK represents sqrt(2) * cos(K*pi/12).
3824    */
3825   for (r = 0; r < 3; r++) {
3826     row = work + 6*r;
3827     dst = output_buf[r] + output_col;
3828 
3829     /* Even part: the DC term again carries the fudge factor, this time
3830      * for the final descale by CONST_BITS+PASS1_BITS+3 bits.
3831      */
3832     dc = (INT32) row[0] + (ONE << (PASS1_BITS+2));
3833     dc <<= CONST_BITS;
3834     in4 = (INT32) row[4];
3835     term = MULTIPLY(in4, FIX(0.707106781));     /* c4 */
3836     base = dc + term;
3837     even1 = dc - term - term;
3838 
3839     in2 = (INT32) row[2];
3840     term = MULTIPLY(in2, FIX(1.224744871));     /* c2 */
3841     even0 = base + term;
3842     even2 = base - term;
3843 
3844     /* Odd part */
3845 
3846     in1 = (INT32) row[1];
3847     in3 = (INT32) row[3];
3848     in5 = (INT32) row[5];
3849     rot = MULTIPLY(in1 + in5, FIX(0.366025404)); /* c5 */
3850     odd0 = rot + ((in1 + in3) << CONST_BITS);
3851     odd2 = rot + ((in5 - in3) << CONST_BITS);
3852     odd1 = (in1 - in3 - in5) << CONST_BITS;
3853 
3854     /* Output stage: samples k and 5-k are sum and difference of the same
3855      * even/odd pair; each is descaled, masked with RANGE_MASK and looked
3856      * up in the IDCT_range_limit table. */
3857 
3858     dst[0] = limit_tbl[(int) RIGHT_SHIFT(even0 + odd0,
3859                                          CONST_BITS+PASS1_BITS+3)
3860                        & RANGE_MASK];
3861     dst[5] = limit_tbl[(int) RIGHT_SHIFT(even0 - odd0,
3862                                          CONST_BITS+PASS1_BITS+3)
3863                        & RANGE_MASK];
3864     dst[1] = limit_tbl[(int) RIGHT_SHIFT(even1 + odd1,
3865                                          CONST_BITS+PASS1_BITS+3)
3866                        & RANGE_MASK];
3867     dst[4] = limit_tbl[(int) RIGHT_SHIFT(even1 - odd1,
3868                                          CONST_BITS+PASS1_BITS+3)
3869                        & RANGE_MASK];
3870     dst[2] = limit_tbl[(int) RIGHT_SHIFT(even2 + odd2,
3871                                          CONST_BITS+PASS1_BITS+3)
3872                        & RANGE_MASK];
3873     dst[3] = limit_tbl[(int) RIGHT_SHIFT(even2 - odd2,
3874                                          CONST_BITS+PASS1_BITS+3)
3875                        & RANGE_MASK];
3876   }
3877 }
3878 
3879 
3880 /*
3881  * Dequantize one block of coefficients and apply the inverse DCT,
3882  * producing a 4x2 output block (4 samples wide, 2 rows).
3883  *
3884  * Pass 1 runs a 2-point IDCT on columns, pass 2 a 4-point IDCT on rows.
3885  */
3886 
3887 GLOBAL(void)
3888 jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3889                JCOEFPTR coef_block,
3890                JSAMPARRAY output_buf, JDIMENSION output_col)
3891 {
3892   INT32 in0, in1, rot;                  /* inputs and shared rotation term */
3893   INT32 even_sum, even_diff;            /* even-part results */
3894   INT32 odd_outer, odd_inner;           /* odd-part results */
3895   ISLOW_MULT_TYPE * qtable = (ISLOW_MULT_TYPE *) compptr->dct_table;
3896   JSAMPLE *limit_tbl = IDCT_range_limit(cinfo);
3897   JSAMPROW dst;
3898   INT32 * row;
3899   int col;
3900   int r;
3901   INT32 work[4*2];      /* pass-1 results: 2 rows of 4 values */
3902   SHIFT_TEMPS
3903 
3904   /* Pass 1: 2-point IDCT on each of the first 4 input columns, i.e. the
3905    * sum and difference of two dequantized coefficients, stored unscaled.
3906    */
3907   for (col = 0; col < 4; col++) {
3908     /* Even part */
3909 
3910     in0 = DEQUANTIZE(coef_block[DCTSIZE*0 + col], qtable[DCTSIZE*0 + col]);
3911 
3912     /* Odd part */
3913 
3914     in1 = DEQUANTIZE(coef_block[DCTSIZE*1 + col], qtable[DCTSIZE*1 + col]);
3915 
3916     /* Output stage */
3917 
3918     work[4*0 + col] = in0 + in1;
3919     work[4*1 + col] = in0 - in1;
3920   }
3921 
3922   /* Pass 2: 4-point IDCT on each of the 2 work-array rows, results stored
3923    * in the output array.
3924    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
3925    */
3926   for (r = 0; r < 2; r++) {
3927     /* Locate this row in the work array and in the output. */
3928     row = work + 4*r;
3929     dst = output_buf[r] + output_col;
3930 
3931     /* Even part: the fudge factor for the final descale is folded into
3932      * the DC term before it is raised to CONST_BITS scale.
3933      */
3934     in0 = row[0] + (ONE << 2);
3935     in1 = row[2];
3936 
3937     even_sum = (in0 + in1) << CONST_BITS;
3938     even_diff = (in0 - in1) << CONST_BITS;
3939 
3940     /* Odd part */
3941     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
3942 
3943     in0 = row[1];
3944     in1 = row[3];
3945 
3946     rot = MULTIPLY(in0 + in1, FIX_0_541196100);         /* c6 */
3947     odd_outer = rot + MULTIPLY(in0, FIX_0_765366865);   /* c2-c6 */
3948     odd_inner = rot - MULTIPLY(in1, FIX_1_847759065);   /* c2+c6 */
3949 
3950     /* Output stage: samples k and 3-k are sum and difference of the same
3951      * even/odd pair; each is descaled, masked with RANGE_MASK and looked
3952      * up in the IDCT_range_limit table. */
3953 
3954     dst[0] = limit_tbl[(int) RIGHT_SHIFT(even_sum + odd_outer,
3955                                          CONST_BITS+3)
3956                        & RANGE_MASK];
3957     dst[3] = limit_tbl[(int) RIGHT_SHIFT(even_sum - odd_outer,
3958                                          CONST_BITS+3)
3959                        & RANGE_MASK];
3960     dst[1] = limit_tbl[(int) RIGHT_SHIFT(even_diff + odd_inner,
3961                                          CONST_BITS+3)
3962                        & RANGE_MASK];
3963     dst[2] = limit_tbl[(int) RIGHT_SHIFT(even_diff - odd_inner,
3964                                          CONST_BITS+3)
3965                        & RANGE_MASK];
3966   }
3967 }
3968 
3969 
3970 /*
3971  * Dequantize one block of coefficients and apply the inverse DCT,
3972  * producing a 2x1 output block (2 samples wide, 1 row).
3973  *
3974  * Pass 1 (columns) is a 1-point IDCT and so empty; pass 2 is a 2-point IDCT.
3975  */
3976 
3977 GLOBAL(void)
3978 jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3979                JCOEFPTR coef_block,
3980                JSAMPARRAY output_buf, JDIMENSION output_col)
3981 {
3982   ISLOW_MULT_TYPE * qtable = (ISLOW_MULT_TYPE *) compptr->dct_table;
3983   JSAMPLE *limit_tbl = IDCT_range_limit(cinfo);
3984   JSAMPROW dst = output_buf[0] + output_col;
3985   INT32 even, odd;
3986   SHIFT_TEMPS
3987 
3988   /* No column pass is needed: only coefficients 0 and 1 are used. */
3989 
3990   /* Row pass: 2-point IDCT on the first two coefficients of the block. */
3991 
3992   /* Even part: the DC term carries the fudge factor (half of 2**3) for
3993    * the descale by 3 bits in the output stage.
3994    */
3995 
3996   even = DEQUANTIZE(coef_block[0], qtable[0]);
3997   even += ONE << 2;
3998 
3999   /* Odd part */
4000 
4001   odd = DEQUANTIZE(coef_block[1], qtable[1]);
4002 
4003   /* Output stage: sum and difference of the two terms, each descaled,
4004    * masked with RANGE_MASK and looked up in the IDCT_range_limit table.
4005    */
4006 
4007   dst[0] = limit_tbl[(int) RIGHT_SHIFT(even + odd, 3) & RANGE_MASK];
4008   dst[1] = limit_tbl[(int) RIGHT_SHIFT(even - odd, 3) & RANGE_MASK];
4009 }
4010 
4011 
4012 /*
4013  * Perform dequantization and inverse DCT on one block of coefficients,
4014  * producing a 8x16 output block: 8 samples wide by 16 rows, stored into
4015  * output_buf[0..15] starting at column output_col.
4016  * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
4017  */
4018 
4019 GLOBAL(void)
4020 jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4021                 JCOEFPTR coef_block,
4022                 JSAMPARRAY output_buf, JDIMENSION output_col)
4023 {
4024   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
4025   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
4026   INT32 z1, z2, z3, z4;
4027   JCOEFPTR inptr;
4028   ISLOW_MULT_TYPE * quantptr;
4029   int * wsptr;
4030   JSAMPROW outptr;
4031   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4032   int ctr;
4033   int workspace[8*16];  /* pass-1 results: 16 rows of 8 values each */
4034   SHIFT_TEMPS
4035 
4036   /* Pass 1: process the 8 columns of the input, store into work array.
4037    * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
4038    */
4039   inptr = coef_block;
4040   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4041   wsptr = workspace;
4042   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
4043     /* Even part: uses input rows 0, 2, 4, 6 and yields tmp20..tmp27. */
4044 
4045     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4046     tmp0 <<= CONST_BITS;
4047     /* Add fudge factor (half the divisor of the final descale) here. */
4048     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
4049 
4050     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4051     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
4052     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
4053 
4054     tmp10 = tmp0 + tmp1;
4055     tmp11 = tmp0 - tmp1;
4056     tmp12 = tmp0 + tmp2;
4057     tmp13 = tmp0 - tmp2;
4058 
4059     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4060     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4061     z3 = z1 - z2;
4062     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
4063     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
4064 
4065     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
4066     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
4067     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
4068     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
4069 
4070     tmp20 = tmp10 + tmp0;
4071     tmp27 = tmp10 - tmp0;
4072     tmp21 = tmp12 + tmp1;
4073     tmp26 = tmp12 - tmp1;
4074     tmp22 = tmp13 + tmp2;
4075     tmp25 = tmp13 - tmp2;
4076     tmp23 = tmp11 + tmp3;
4077     tmp24 = tmp11 - tmp3;
4078 
4079     /* Odd part: uses input rows 1, 3, 5, 7; results end up in */
4080     /* tmp0..tmp3 and tmp10..tmp13, which are reused from the even part. */
4081     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4082     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4083     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4084     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4085 
4086     tmp11 = z1 + z3;
4087 
4088     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
4089     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
4090     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
4091     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
4092     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
4093     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
4094     tmp0  = tmp1 + tmp2 + tmp3 -
4095             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
4096     tmp13 = tmp10 + tmp11 + tmp12 -
4097             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
4098     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
4099     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
4100     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
4101     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
4102     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
4103     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
4104     z2    += z4;
4105     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
4106     tmp1  += z1;
4107     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
4108     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
4109     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
4110     tmp12 += z2;
4111     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
4112     tmp2  += z2;
4113     tmp3  += z2;
4114     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
4115     tmp10 += z2;
4116     tmp11 += z2;
4117 
4118     /* Final output stage: workspace rows k and 15-k receive the sum and */
4119     /* difference of one even/odd pair, descaled by CONST_BITS-PASS1_BITS. */
4120     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
4121     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
4122     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
4123     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
4124     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
4125     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
4126     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
4127     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
4128     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
4129     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
4130     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
4131     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
4132     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
4133     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
4134     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
4135     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
4136   }
4137 
4138   /* Pass 2: process the 16 rows of the work array, store into output array. */
4139   /* Note that we must descale the results by a factor of 8 == 2**3, */
4140   /* and also undo the PASS1_BITS scaling. */
4141 
4142   wsptr = workspace;
4143   for (ctr = 0; ctr < 16; ctr++) {
4144     outptr = output_buf[ctr] + output_col;
4145 
4146     /* Even part: reverse the even part of the forward DCT. */
4147     /* The rotator is sqrt(2)*c(-6). */
4148 
4149     z2 = (INT32) wsptr[2];
4150     z3 = (INT32) wsptr[6];
4151 
4152     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
4153     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
4154     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
4155     /* Add fudge factor here for final descale: after the left shift by */
4156     /* CONST_BITS below it is half of 2**(CONST_BITS+PASS1_BITS+3). */
4157     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4158     z3 = (INT32) wsptr[4];
4159 
4160     tmp0 = (z2 + z3) << CONST_BITS;
4161     tmp1 = (z2 - z3) << CONST_BITS;
4162 
4163     tmp10 = tmp0 + tmp2;
4164     tmp13 = tmp0 - tmp2;
4165     tmp11 = tmp1 + tmp3;
4166     tmp12 = tmp1 - tmp3;
4167 
4168     /* Odd part per figure 8; the matrix is unitary and hence its
4169      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
4170      */
4171 
4172     tmp0 = (INT32) wsptr[7];
4173     tmp1 = (INT32) wsptr[5];
4174     tmp2 = (INT32) wsptr[3];
4175     tmp3 = (INT32) wsptr[1];
4176 
4177     z2 = tmp0 + tmp2;
4178     z3 = tmp1 + tmp3;
4179 
4180     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
4181     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
4182     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
4183     z2 += z1;
4184     z3 += z1;
4185 
4186     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
4187     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
4188     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
4189     tmp0 += z1 + z2;
4190     tmp3 += z1 + z3;
4191 
4192     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
4193     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
4194     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
4195     tmp1 += z1 + z3;
4196     tmp2 += z1 + z2;
4197 
4198     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3. */
4199     /* Samples k and 7-k are the sum and difference of one even/odd pair. */
4200     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
4201                                               CONST_BITS+PASS1_BITS+3)
4202                             & RANGE_MASK];
4203     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
4204                                               CONST_BITS+PASS1_BITS+3)
4205                             & RANGE_MASK];
4206     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
4207                                               CONST_BITS+PASS1_BITS+3)
4208                             & RANGE_MASK];
4209     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
4210                                               CONST_BITS+PASS1_BITS+3)
4211                             & RANGE_MASK];
4212     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
4213                                               CONST_BITS+PASS1_BITS+3)
4214                             & RANGE_MASK];
4215     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
4216                                               CONST_BITS+PASS1_BITS+3)
4217                             & RANGE_MASK];
4218     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
4219                                               CONST_BITS+PASS1_BITS+3)
4220                             & RANGE_MASK];
4221     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
4222                                               CONST_BITS+PASS1_BITS+3)
4223                             & RANGE_MASK];
4224 
4225     wsptr += DCTSIZE;           /* next row; pass 1 used stride 8, so needs DCTSIZE == 8 */
4226   }
4227 }
4228 
4229 
4230 /*
4231  * Perform dequantization and inverse DCT on one block of coefficients,
4232  * producing a 7x14 output block: 7 samples wide by 14 rows, stored into
4233  * output_buf[0..13] starting at column output_col.
4234  * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows).
4235  */
4236 
4237 GLOBAL(void)
4238 jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4239                 JCOEFPTR coef_block,
4240                 JSAMPARRAY output_buf, JDIMENSION output_col)
4241 {
4242   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
4243   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
4244   INT32 z1, z2, z3, z4;
4245   JCOEFPTR inptr;
4246   ISLOW_MULT_TYPE * quantptr;
4247   int * wsptr;
4248   JSAMPROW outptr;
4249   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4250   int ctr;
4251   int workspace[7*14];  /* pass-1 results: 14 rows of 7 values each */
4252   SHIFT_TEMPS
4253 
4254   /* Pass 1: process the first 7 columns of the input, store into work array.
4255    * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
4256    */
4257   inptr = coef_block;
4258   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4259   wsptr = workspace;
4260   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
4261     /* Even part: uses input rows 0, 2, 4, 6 and yields tmp20..tmp26. */
4262 
4263     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4264     z1 <<= CONST_BITS;
4265     /* Add fudge factor (half the divisor of the final descale) here. */
4266     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
4267     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4268     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
4269     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
4270     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
4271 
4272     tmp10 = z1 + z2;
4273     tmp11 = z1 + z3;
4274     tmp12 = z1 - z4;
4275     /* tmp23 (rows 3 and 10) is descaled right away, unlike the others. */
4276     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
4277                         CONST_BITS-PASS1_BITS);
4278 
4279     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4280     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4281 
4282     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
4283 
4284     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
4285     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
4286     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
4287             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
4288 
4289     tmp20 = tmp10 + tmp13;
4290     tmp26 = tmp10 - tmp13;
4291     tmp21 = tmp11 + tmp14;
4292     tmp25 = tmp11 - tmp14;
4293     tmp22 = tmp12 + tmp15;
4294     tmp24 = tmp12 - tmp15;
4295 
4296     /* Odd part: uses input rows 1, 3, 5, 7 and yields tmp10..tmp16. */
4297 
4298     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4299     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4300     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4301     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4302     tmp13 = z4 << CONST_BITS;
4303 
4304     tmp14 = z1 + z3;
4305     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
4306     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
4307     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
4308     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
4309     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
4310     z1    -= z2;
4311     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
4312     tmp16 += tmp15;
4313     z1    += z4;
4314     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
4315     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
4316     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
4317     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
4318     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
4319     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
4320     /* Odd term for rows 3 and 10, built directly at PASS1_BITS scale. */
4321     tmp13 = (z1 - z3) << PASS1_BITS;
4322 
4323     /* Final output stage: rows k and 13-k are sum and difference of one */
4324     /* even/odd pair; tmp23 and tmp13 (rows 3, 10) need no further shift. */
4325     wsptr[7*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4326     wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4327     wsptr[7*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4328     wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4329     wsptr[7*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
4330     wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
4331     wsptr[7*3]  = (int) (tmp23 + tmp13);
4332     wsptr[7*10] = (int) (tmp23 - tmp13);
4333     wsptr[7*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4334     wsptr[7*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4335     wsptr[7*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
4336     wsptr[7*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
4337     wsptr[7*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
4338     wsptr[7*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
4339   }
4340 
4341   /* Pass 2: process 14 rows from work array, store into output array.
4342    * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
4343    */
4344   wsptr = workspace;
4345   for (ctr = 0; ctr < 14; ctr++) {
4346     outptr = output_buf[ctr] + output_col;
4347 
4348     /* Even part: uses workspace columns 0, 2, 4, 6; yields tmp20..tmp23. */
4349 
4350     /* Add fudge factor (half the final divisor once shifted) here. */
4351     tmp23 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4352     tmp23 <<= CONST_BITS;
4353 
4354     z1 = (INT32) wsptr[2];
4355     z2 = (INT32) wsptr[4];
4356     z3 = (INT32) wsptr[6];
4357 
4358     tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
4359     tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
4360     tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
4361     tmp10 = z1 + z3;
4362     z2 -= tmp10;
4363     tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
4364     tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
4365     tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
4366     tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
4367 
4368     /* Odd part: uses workspace columns 1, 3, 5; yields tmp10..tmp12. */
4369 
4370     z1 = (INT32) wsptr[1];
4371     z2 = (INT32) wsptr[3];
4372     z3 = (INT32) wsptr[5];
4373 
4374     tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
4375     tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
4376     tmp10 = tmp11 - tmp12;
4377     tmp11 += tmp12;
4378     tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
4379     tmp11 += tmp12;
4380     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
4381     tmp10 += z2;
4382     tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
4383 
4384     /* Final output stage: samples k and 6-k are sum and difference of one */
4385     /* even/odd pair; the centre sample 3 takes the even term tmp23 alone. */
4386     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
4387                                               CONST_BITS+PASS1_BITS+3)
4388                             & RANGE_MASK];
4389     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
4390                                               CONST_BITS+PASS1_BITS+3)
4391                             & RANGE_MASK];
4392     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
4393                                               CONST_BITS+PASS1_BITS+3)
4394                             & RANGE_MASK];
4395     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
4396                                               CONST_BITS+PASS1_BITS+3)
4397                             & RANGE_MASK];
4398     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
4399                                               CONST_BITS+PASS1_BITS+3)
4400                             & RANGE_MASK];
4401     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
4402                                               CONST_BITS+PASS1_BITS+3)
4403                             & RANGE_MASK];
4404     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23,
4405                                               CONST_BITS+PASS1_BITS+3)
4406                             & RANGE_MASK];
4407 
4408     wsptr += 7;         /* advance pointer to next row */
4409   }
4410 }
4411 
4412 
4413 /*
4414  * Perform dequantization and inverse DCT on one block of coefficients,
4415  * producing a 6x12 output block.
4416  *
4417  * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
4418  */
4419 
4420 GLOBAL(void)
4421 jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4422                 JCOEFPTR coef_block,
4423                 JSAMPARRAY output_buf, JDIMENSION output_col)
4424 {
4425   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
4426   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
4427   INT32 z1, z2, z3, z4;
4428   JCOEFPTR inptr;
4429   ISLOW_MULT_TYPE * quantptr;
4430   int * wsptr;
4431   JSAMPROW outptr;
4432   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4433   int ctr;
4434   int workspace[6*12];  /* buffers data between passes */
4435   SHIFT_TEMPS
4436 
4437   /* Pass 1: process columns from input, store into work array.
4438    * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
4439    */
4440   inptr = coef_block;
4441   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4442   wsptr = workspace;
4443   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
4444     /* Even part */
4445 
4446     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4447     z3 <<= CONST_BITS;
4448     /* Add fudge factor here for final descale. */
4449     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
4450 
4451     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4452     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
4453 
4454     tmp10 = z3 + z4;
4455     tmp11 = z3 - z4;
4456 
4457     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4458     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
4459     z1 <<= CONST_BITS;
4460     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4461     z2 <<= CONST_BITS;
4462 
4463     tmp12 = z1 - z2;
4464 
4465     tmp21 = z3 + tmp12;
4466     tmp24 = z3 - tmp12;
4467 
4468     tmp12 = z4 + z2;
4469 
4470     tmp20 = tmp10 + tmp12;
4471     tmp25 = tmp10 - tmp12;
4472 
4473     tmp12 = z4 - z1 - z2;
4474 
4475     tmp22 = tmp11 + tmp12;
4476     tmp23 = tmp11 - tmp12;
4477 
4478     /* Odd part */
4479 
4480     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4481     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4482     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4483     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4484 
4485     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
4486     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
4487 
4488     tmp10 = z1 + z3;
4489     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
4490     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
4491     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
4492     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
4493     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
4494     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
4495     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
4496              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
4497 
4498     z1 -= z4;
4499     z2 -= z3;
4500     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
4501     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
4502     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
4503 
4504     /* Final output stage */
4505 
4506     wsptr[6*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4507     wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4508     wsptr[6*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4509     wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4510     wsptr[6*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
4511     wsptr[6*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
4512     wsptr[6*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
4513     wsptr[6*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
4514     wsptr[6*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4515     wsptr[6*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4516     wsptr[6*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
4517     wsptr[6*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
4518   }
4519 
4520   /* Pass 2: process 12 rows from work array, store into output array.
4521    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
4522    */
4523   wsptr = workspace;
4524   for (ctr = 0; ctr < 12; ctr++) {
4525     outptr = output_buf[ctr] + output_col;
4526 
4527     /* Even part */
4528 
4529     /* Add fudge factor here for final descale. */
4530     tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4531     tmp10 <<= CONST_BITS;
4532     tmp12 = (INT32) wsptr[4];
4533     tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
4534     tmp11 = tmp10 + tmp20;
4535     tmp21 = tmp10 - tmp20 - tmp20;
4536     tmp20 = (INT32) wsptr[2];
4537     tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
4538     tmp20 = tmp11 + tmp10;
4539     tmp22 = tmp11 - tmp10;
4540 
4541     /* Odd part */
4542 
4543     z1 = (INT32) wsptr[1];
4544     z2 = (INT32) wsptr[3];
4545     z3 = (INT32) wsptr[5];
4546     tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
4547     tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
4548     tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
4549     tmp11 = (z1 - z2 - z3) << CONST_BITS;
4550 
4551     /* Final output stage */
4552 
4553     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
4554                                               CONST_BITS+PASS1_BITS+3)
4555                             & RANGE_MASK];
4556     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
4557                                               CONST_BITS+PASS1_BITS+3)
4558                             & RANGE_MASK];
4559     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
4560                                               CONST_BITS+PASS1_BITS+3)
4561                             & RANGE_MASK];
4562     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
4563                                               CONST_BITS+PASS1_BITS+3)
4564                             & RANGE_MASK];
4565     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
4566                                               CONST_BITS+PASS1_BITS+3)
4567                             & RANGE_MASK];
4568     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
4569                                               CONST_BITS+PASS1_BITS+3)
4570                             & RANGE_MASK];
4571 
4572     wsptr += 6;         /* advance pointer to next row */
4573   }
4574 }
4575 
4576 
4577 /*
4578  * Perform dequantization and inverse DCT on one block of coefficients,
4579  * producing a 5x10 output block.
4580  *
4581  * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows).
4582  */
4583 
4584 GLOBAL(void)
4585 jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4586                 JCOEFPTR coef_block,
4587                 JSAMPARRAY output_buf, JDIMENSION output_col)
4588 {
4589   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
4590   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
4591   INT32 z1, z2, z3, z4, z5;
4592   JCOEFPTR inptr;
4593   ISLOW_MULT_TYPE * quantptr;
4594   int * wsptr;
4595   JSAMPROW outptr;
4596   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4597   int ctr;
4598   int workspace[5*10];  /* buffers data between passes */
4599   SHIFT_TEMPS
4600 
4601   /* Pass 1: process columns from input, store into work array.
4602    * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
4603    */
4604   inptr = coef_block;
4605   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4606   wsptr = workspace;
4607   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
4608     /* Even part */
4609 
4610     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4611     z3 <<= CONST_BITS;
4612     /* Add fudge factor here for final descale. */
4613     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
4614     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4615     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
4616     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
4617     tmp10 = z3 + z1;
4618     tmp11 = z3 - z2;
4619 
4620     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
4621                         CONST_BITS-PASS1_BITS);
4622 
4623     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4624     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4625 
4626     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
4627     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
4628     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
4629 
4630     tmp20 = tmp10 + tmp12;
4631     tmp24 = tmp10 - tmp12;
4632     tmp21 = tmp11 + tmp13;
4633     tmp23 = tmp11 - tmp13;
4634 
4635     /* Odd part */
4636 
4637     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4638     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4639     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4640     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4641 
4642     tmp11 = z2 + z4;
4643     tmp13 = z2 - z4;
4644 
4645     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
4646     z5 = z3 << CONST_BITS;
4647 
4648     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
4649     z4 = z5 + tmp12;
4650 
4651     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
4652     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
4653 
4654     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
4655     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
4656 
4657     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
4658 
4659     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
4660     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
4661 
4662     /* Final output stage */
4663 
4664     wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4665     wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4666     wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4667     wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4668     wsptr[5*2] = (int) (tmp22 + tmp12);
4669     wsptr[5*7] = (int) (tmp22 - tmp12);
4670     wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
4671     wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
4672     wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4673     wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4674   }
4675 
4676   /* Pass 2: process 10 rows from work array, store into output array.
4677    * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
4678    */
4679   wsptr = workspace;
4680   for (ctr = 0; ctr < 10; ctr++) {
4681     outptr = output_buf[ctr] + output_col;
4682 
4683     /* Even part */
4684 
4685     /* Add fudge factor here for final descale. */
4686     tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4687     tmp12 <<= CONST_BITS;
4688     tmp13 = (INT32) wsptr[2];
4689     tmp14 = (INT32) wsptr[4];
4690     z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
4691     z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
4692     z3 = tmp12 + z2;
4693     tmp10 = z3 + z1;
4694     tmp11 = z3 - z1;
4695     tmp12 -= z2 << 2;
4696 
4697     /* Odd part */
4698 
4699     z2 = (INT32) wsptr[1];
4700     z3 = (INT32) wsptr[3];
4701 
4702     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
4703     tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
4704     tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
4705 
4706     /* Final output stage */
4707 
4708     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13,
4709                                               CONST_BITS+PASS1_BITS+3)
4710                             & RANGE_MASK];
4711     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13,
4712                                               CONST_BITS+PASS1_BITS+3)
4713                             & RANGE_MASK];
4714     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14,
4715                                               CONST_BITS+PASS1_BITS+3)
4716                             & RANGE_MASK];
4717     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14,
4718                                               CONST_BITS+PASS1_BITS+3)
4719                             & RANGE_MASK];
4720     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
4721                                               CONST_BITS+PASS1_BITS+3)
4722                             & RANGE_MASK];
4723 
4724     wsptr += 5;         /* advance pointer to next row */
4725   }
4726 }
4727 
4728 
4729 /*
4730  * Perform dequantization and inverse DCT on one block of coefficients,
4731  * producing a 4x8 output block.
4732  *
4733  * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
4734  */
4735 
4736 GLOBAL(void)
4737 jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4738                JCOEFPTR coef_block,
4739                JSAMPARRAY output_buf, JDIMENSION output_col)
4740 {
4741   INT32 tmp0, tmp1, tmp2, tmp3;
4742   INT32 tmp10, tmp11, tmp12, tmp13;
4743   INT32 z1, z2, z3;
4744   JCOEFPTR inptr;
4745   ISLOW_MULT_TYPE * quantptr;
4746   int * wsptr;
4747   JSAMPROW outptr;
4748   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4749   int ctr;
4750   int workspace[4*8];   /* buffers data between passes */
4751   SHIFT_TEMPS
4752 
4753   /* Pass 1: process columns from input, store into work array. */
4754   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
4755   /* furthermore, we scale the results by 2**PASS1_BITS. */
4756 
4757   inptr = coef_block;
4758   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4759   wsptr = workspace;
4760   for (ctr = 4; ctr > 0; ctr--) {
4761     /* Due to quantization, we will usually find that many of the input
4762      * coefficients are zero, especially the AC terms.  We can exploit this
4763      * by short-circuiting the IDCT calculation for any column in which all
4764      * the AC terms are zero.  In that case each output is equal to the
4765      * DC coefficient (with scale factor as needed).
4766      * With typical images and quantization tables, half or more of the
4767      * column DCT calculations can be simplified this way.
4768      */
4769 
4770     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
4771         inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
4772         inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
4773         inptr[DCTSIZE*7] == 0) {
4774       /* AC terms all zero */
4775       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
4776 
4777       wsptr[4*0] = dcval;
4778       wsptr[4*1] = dcval;
4779       wsptr[4*2] = dcval;
4780       wsptr[4*3] = dcval;
4781       wsptr[4*4] = dcval;
4782       wsptr[4*5] = dcval;
4783       wsptr[4*6] = dcval;
4784       wsptr[4*7] = dcval;
4785 
4786       inptr++;                  /* advance pointers to next column */
4787       quantptr++;
4788       wsptr++;
4789       continue;
4790     }
4791 
4792     /* Even part: reverse the even part of the forward DCT. */
4793     /* The rotator is sqrt(2)*c(-6). */
4794 
4795     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4796     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4797 
4798     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
4799     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
4800     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
4801 
4802     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4803     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4804     z2 <<= CONST_BITS;
4805     z3 <<= CONST_BITS;
4806     /* Add fudge factor here for final descale. */
4807     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
4808 
4809     tmp0 = z2 + z3;
4810     tmp1 = z2 - z3;
4811 
4812     tmp10 = tmp0 + tmp2;
4813     tmp13 = tmp0 - tmp2;
4814     tmp11 = tmp1 + tmp3;
4815     tmp12 = tmp1 - tmp3;
4816 
4817     /* Odd part per figure 8; the matrix is unitary and hence its
4818      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
4819      */
4820 
4821     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4822     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4823     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4824     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4825 
4826     z2 = tmp0 + tmp2;
4827     z3 = tmp1 + tmp3;
4828 
4829     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
4830     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
4831     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
4832     z2 += z1;
4833     z3 += z1;
4834 
4835     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
4836     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
4837     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
4838     tmp0 += z1 + z2;
4839     tmp3 += z1 + z3;
4840 
4841     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
4842     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
4843     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
4844     tmp1 += z1 + z3;
4845     tmp2 += z1 + z2;
4846 
4847     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
4848 
4849     wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
4850     wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
4851     wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
4852     wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
4853     wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
4854     wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
4855     wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
4856     wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
4857 
4858     inptr++;                    /* advance pointers to next column */
4859     quantptr++;
4860     wsptr++;
4861   }
4862 
4863   /* Pass 2: process 8 rows from work array, store into output array.
4864    * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4865    */
4866   wsptr = workspace;
4867   for (ctr = 0; ctr < 8; ctr++) {
4868     outptr = output_buf[ctr] + output_col;
4869 
4870     /* Even part */
4871 
4872     /* Add fudge factor here for final descale. */
4873     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4874     tmp2 = (INT32) wsptr[2];
4875 
4876     tmp10 = (tmp0 + tmp2) << CONST_BITS;
4877     tmp12 = (tmp0 - tmp2) << CONST_BITS;
4878 
4879     /* Odd part */
4880     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
4881 
4882     z2 = (INT32) wsptr[1];
4883     z3 = (INT32) wsptr[3];
4884 
4885     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
4886     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
4887     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
4888 
4889     /* Final output stage */
4890 
4891     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
4892                                               CONST_BITS+PASS1_BITS+3)
4893                             & RANGE_MASK];
4894     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
4895                                               CONST_BITS+PASS1_BITS+3)
4896                             & RANGE_MASK];
4897     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
4898                                               CONST_BITS+PASS1_BITS+3)
4899                             & RANGE_MASK];
4900     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
4901                                               CONST_BITS+PASS1_BITS+3)
4902                             & RANGE_MASK];
4903 
4904     wsptr += 4;         /* advance pointer to next row */
4905   }
4906 }
4907 
4908 
4909 /*
4910  * Perform dequantization and inverse DCT on one block of coefficients,
4911  * producing a reduced-size 3x6 output block.
4912  *
4913  * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows).
4914  */
4915 
4916 GLOBAL(void)
4917 jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4918                JCOEFPTR coef_block,
4919                JSAMPARRAY output_buf, JDIMENSION output_col)
4920 {
4921   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
4922   INT32 z1, z2, z3;
4923   JCOEFPTR inptr;
4924   ISLOW_MULT_TYPE * quantptr;
4925   int * wsptr;
4926   JSAMPROW outptr;
4927   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4928   int ctr;
4929   int workspace[3*6];   /* buffers data between passes */
4930   SHIFT_TEMPS
4931 
4932   /* Pass 1: process columns from input, store into work array.
4933    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
4934    */
4935   inptr = coef_block;
4936   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4937   wsptr = workspace;
4938   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
4939     /* Even part */
4940 
4941     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4942     tmp0 <<= CONST_BITS;
4943     /* Add fudge factor here for final descale. */
4944     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
4945     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4946     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
4947     tmp1 = tmp0 + tmp10;
4948     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
4949     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4950     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
4951     tmp10 = tmp1 + tmp0;
4952     tmp12 = tmp1 - tmp0;
4953 
4954     /* Odd part */
4955 
4956     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4957     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4958     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4959     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
4960     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
4961     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
4962     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
4963 
4964     /* Final output stage */
4965 
4966     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
4967     wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
4968     wsptr[3*1] = (int) (tmp11 + tmp1);
4969     wsptr[3*4] = (int) (tmp11 - tmp1);
4970     wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
4971     wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
4972   }
4973 
4974   /* Pass 2: process 6 rows from work array, store into output array.
4975    * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
4976    */
4977   wsptr = workspace;
4978   for (ctr = 0; ctr < 6; ctr++) {
4979     outptr = output_buf[ctr] + output_col;
4980 
4981     /* Even part */
4982 
4983     /* Add fudge factor here for final descale. */
4984     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4985     tmp0 <<= CONST_BITS;
4986     tmp2 = (INT32) wsptr[2];
4987     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
4988     tmp10 = tmp0 + tmp12;
4989     tmp2 = tmp0 - tmp12 - tmp12;
4990 
4991     /* Odd part */
4992 
4993     tmp12 = (INT32) wsptr[1];
4994     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
4995 
4996     /* Final output stage */
4997 
4998     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
4999                                               CONST_BITS+PASS1_BITS+3)
5000                             & RANGE_MASK];
5001     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
5002                                               CONST_BITS+PASS1_BITS+3)
5003                             & RANGE_MASK];
5004     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
5005                                               CONST_BITS+PASS1_BITS+3)
5006                             & RANGE_MASK];
5007 
5008     wsptr += 3;         /* advance pointer to next row */
5009   }
5010 }
5011 
5012 
5013 /*
5014  * Perform dequantization and inverse DCT on one block of coefficients,
5015  * producing a 2x4 output block.
5016  *
5017  * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
5018  */
5019 
5020 GLOBAL(void)
5021 jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5022                JCOEFPTR coef_block,
5023                JSAMPARRAY output_buf, JDIMENSION output_col)
5024 {
5025   INT32 tmp0, tmp2, tmp10, tmp12;
5026   INT32 z1, z2, z3;
5027   JCOEFPTR inptr;
5028   ISLOW_MULT_TYPE * quantptr;
5029   INT32 * wsptr;
5030   JSAMPROW outptr;
5031   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5032   int ctr;
5033   INT32 workspace[2*4]; /* buffers data between passes */
5034   SHIFT_TEMPS
5035 
5036   /* Pass 1: process columns from input, store into work array.
5037    * 4-point IDCT kernel,
5038    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
5039    */
5040   inptr = coef_block;
5041   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5042   wsptr = workspace;
5043   for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) {
5044     /* Even part */
5045 
5046     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
5047     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
5048 
5049     tmp10 = (tmp0 + tmp2) << CONST_BITS;
5050     tmp12 = (tmp0 - tmp2) << CONST_BITS;
5051 
5052     /* Odd part */
5053     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
5054 
5055     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
5056     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
5057 
5058     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
5059     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
5060     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
5061 
5062     /* Final output stage */
5063 
5064     wsptr[2*0] = tmp10 + tmp0;
5065     wsptr[2*3] = tmp10 - tmp0;
5066     wsptr[2*1] = tmp12 + tmp2;
5067     wsptr[2*2] = tmp12 - tmp2;
5068   }
5069 
5070   /* Pass 2: process 4 rows from work array, store into output array. */
5071 
5072   wsptr = workspace;
5073   for (ctr = 0; ctr < 4; ctr++) {
5074     outptr = output_buf[ctr] + output_col;
5075 
5076     /* Even part */
5077 
5078     /* Add fudge factor here for final descale. */
5079     tmp10 = wsptr[0] + (ONE << (CONST_BITS+2));
5080 
5081     /* Odd part */
5082 
5083     tmp0 = wsptr[1];
5084 
5085     /* Final output stage */
5086 
5087     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3)
5088                             & RANGE_MASK];
5089     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3)
5090                             & RANGE_MASK];
5091 
5092     wsptr += 2;         /* advance pointer to next row */
5093   }
5094 }
5095 
5096 
5097 /*
5098  * Perform dequantization and inverse DCT on one block of coefficients,
5099  * producing a 1x2 output block.
5100  *
5101  * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows).
5102  */
5103 
5104 GLOBAL(void)
5105 jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5106                JCOEFPTR coef_block,
5107                JSAMPARRAY output_buf, JDIMENSION output_col)
5108 {
5109   INT32 tmp0, tmp10;
5110   ISLOW_MULT_TYPE * quantptr;
5111   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5112   SHIFT_TEMPS
5113 
5114   /* Process 1 column from input, store into output array. */
5115 
5116   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5117 
5118   /* Even part */
5119 
5120   tmp10 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
5121   /* Add fudge factor here for final descale. */
5122   tmp10 += ONE << 2;
5123 
5124   /* Odd part */
5125 
5126   tmp0 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
5127 
5128   /* Final output stage */
5129 
5130   output_buf[0][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3)
5131                                           & RANGE_MASK];
5132   output_buf[1][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3)
5133                                           & RANGE_MASK];
5134 }
5135 
5136 #endif /* IDCT_SCALING_SUPPORTED */
5137 #endif /* DCT_ISLOW_SUPPORTED */