rt Udiff modules/javafx.graphics/src/main/native-iio/libjpeg7/jfdctint.c

modules/javafx.graphics/src/main/native-iio/libjpeg7/jfdctint.c

@@ -1,10 +1,10 @@
 /*
  * jfdctint.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * Modification developed 2003-2009 by Guido Vollbeding.
+ * Modification developed 2003-2015 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains a slow-but-accurate integer implementation of the
  * forward DCT (Discrete Cosine Transform).

@@ -163,20 +163,22 @@
   DCTELEM *dataptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * cK represents sqrt(2) * cos(K*pi/16).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < DCTSIZE; ctr++) {
     elemptr = sample_data[ctr] + start_col;
 
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
-     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     * rotator "c1" should be "c6".
      */
 
     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);

@@ -190,68 +192,71 @@
     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
 
-    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
-    dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+
+    dataptr[2] = (DCTELEM)
+      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
                                        CONST_BITS-PASS1_BITS);
-    dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+    dataptr[6] = (DCTELEM)
+      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
                                        CONST_BITS-PASS1_BITS);
 
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-     * cK represents sqrt(2) * cos(K*pi/16).
      * i0..i3 in the paper are tmp0..tmp3 here.
      */
 
-    tmp10 = tmp0 + tmp3;
-    tmp11 = tmp1 + tmp2;
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp1 + tmp3;
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
 
-    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
-    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
-    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
-    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
-    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
-    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
-    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
-
     tmp12 += z1;
     tmp13 += z1;
 
-    dataptr[1] = (DCTELEM)
-      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
-    dataptr[3] = (DCTELEM)
-      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
-    dataptr[5] = (DCTELEM)
-      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
-    dataptr[7] = (DCTELEM)
-      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
+    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
+    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
+    tmp0 += z1 + tmp12;
+    tmp3 += z1 + tmp13;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
+    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
+    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
+    tmp1 += z1 + tmp13;
+    tmp2 += z1 + tmp12;
+
+    dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS);
 
     dataptr += DCTSIZE;         /* advance pointer to next row */
   }
 
   /* Pass 2: process columns.
    * We remove the PASS1_BITS scaling, but leave the results scaled up
    * by an overall factor of 8.
+   * cK represents sqrt(2) * cos(K*pi/16).
    */
 
   dataptr = data;
   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
-     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     * rotator "c1" should be "c6".
      */
 
     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];

@@ -269,51 +274,53 @@
     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
 
     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
     dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
 
-    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+
     dataptr[DCTSIZE*2] = (DCTELEM)
-      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
+      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
+                  CONST_BITS+PASS1_BITS);
     dataptr[DCTSIZE*6] = (DCTELEM)
-      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
+      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
+                  CONST_BITS+PASS1_BITS);
 
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-     * cK represents sqrt(2) * cos(K*pi/16).
      * i0..i3 in the paper are tmp0..tmp3 here.
      */
 
-    tmp10 = tmp0 + tmp3;
-    tmp11 = tmp1 + tmp2;
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp1 + tmp3;
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
 
-    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
-    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
-    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
-    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
-    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
-    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
-    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
-
     tmp12 += z1;
     tmp13 += z1;
 
-    dataptr[DCTSIZE*1] = (DCTELEM)
-      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*3] = (DCTELEM)
-      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*5] = (DCTELEM)
-      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*7] = (DCTELEM)
-      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
+    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
+    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
+    tmp0 += z1 + tmp12;
+    tmp3 += z1 + tmp13;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
+    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
+    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
+    tmp1 += z1 + tmp13;
+    tmp2 += z1 + tmp12;
+
+    dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS1_BITS);
 
     dataptr++;                  /* advance pointer to next column */
   }
 }

@@ -336,14 +343,15 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* cK represents sqrt(2) * cos(K*pi/14). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * cK represents sqrt(2) * cos(K*pi/14).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 7; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -357,11 +365,11 @@
     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
 
     z1 = tmp0 + tmp2;
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
     tmp3 += tmp3;
     z1 -= tmp3;
     z1 -= tmp3;

@@ -470,14 +478,15 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* cK represents sqrt(2) * cos(K*pi/12). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * cK represents sqrt(2) * cos(K*pi/12).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 6; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -492,11 +501,11 @@
 
     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
     dataptr[2] = (DCTELEM)
       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
               CONST_BITS-PASS1_BITS);

@@ -583,16 +592,17 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* We scale the results further by 2 as part of output adaption */
-  /* scaling for different DCT size. */
-  /* cK represents sqrt(2) * cos(K*pi/10). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * We scale the results further by 2 as part of output adaption
+   * scaling for different DCT size.
+   * cK represents sqrt(2) * cos(K*pi/10).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 5; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -606,11 +616,11 @@
     tmp11 = tmp0 - tmp1;
 
     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
     tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
     tmp10 -= tmp2 << 2;
     tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */

@@ -693,15 +703,16 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* We must also scale the output by (8/4)**2 = 2**2, which we add here. */
-  /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * We must also scale the output by (8/4)**2 = 2**2, which we add here.
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 4; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -711,11 +722,11 @@
     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
 
     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
 
     /* Odd part */

@@ -735,10 +746,11 @@
   }
 
   /* Pass 2: process columns.
    * We remove the PASS1_BITS scaling, but leave the results scaled up
    * by an overall factor of 8.
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
    */
 
   dataptr = data;
   for (ctr = 0; ctr < 4; ctr++) {
     /* Even part */

@@ -785,16 +797,17 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* We scale the results further by 2**2 as part of output adaption */
-  /* scaling for different DCT size. */
-  /* cK represents sqrt(2) * cos(K*pi/6). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * We scale the results further by 2**2 as part of output adaption
+   * scaling for different DCT size.
+   * cK represents sqrt(2) * cos(K*pi/6).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 3; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -803,11 +816,11 @@
     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
     tmp1 = GETJSAMPLE(elemptr[1]);
 
     tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
     dataptr[2] = (DCTELEM)
       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
               CONST_BITS-PASS1_BITS-2);

@@ -861,18 +874,19 @@
  */
 
 GLOBAL(void)
 jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3;
+  DCTELEM tmp0, tmp1, tmp2, tmp3;
   JSAMPROW elemptr;
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT.
+   */
 
   /* Row 0 */
   elemptr = sample_data[0] + start_col;
 
   tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);

@@ -888,35 +902,38 @@
    * We leave the results scaled up by an overall factor of 8.
    * We must also scale the output by (8/2)**2 = 2**4.
    */
 
   /* Column 0 */
-  /* Apply unsigned->signed conversion */
-  data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4);
-  data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
+  /* Apply unsigned->signed conversion. */
+  data[DCTSIZE*0] = (tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4;
+  data[DCTSIZE*1] = (tmp0 - tmp2) << 4;
 
   /* Column 1 */
-  data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
-  data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
+  data[DCTSIZE*0+1] = (tmp1 + tmp3) << 4;
+  data[DCTSIZE*1+1] = (tmp1 - tmp3) << 4;
 }
 
 
 /*
  * Perform the forward DCT on a 1x1 sample block.
  */
 
 GLOBAL(void)
 jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
+  DCTELEM dcval;
+
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
+  dcval = GETJSAMPLE(sample_data[0][start_col]);
+
   /* We leave the result scaled up by an overall factor of 8. */
   /* We must also scale the output by (8/1)**2 = 2**6. */
-  /* Apply unsigned->signed conversion */
-  data[0] = (DCTELEM)
-    ((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
+  /* Apply unsigned->signed conversion. */
+  data[0] = (dcval - CENTERJSAMPLE) << 6;
 }
 
 
 /*
  * Perform the forward DCT on a 9x9 sample block.

@@ -933,15 +950,16 @@
   DCTELEM *wsptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* we scale the results further by 2 as part of output adaption */
-  /* scaling for different DCT size. */
-  /* cK represents sqrt(2) * cos(K*pi/18). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * we scale the results further by 2 as part of output adaption
+   * scaling for different DCT size.
+   * cK represents sqrt(2) * cos(K*pi/18).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -959,11 +977,11 @@
     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
 
     z1 = tmp0 + tmp2 + tmp3;
     z2 = tmp1 + tmp4;
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
     dataptr[6] = (DCTELEM)
       DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)),  /* c6 */
               CONST_BITS-1);
     z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049));        /* c2 */

@@ -1082,15 +1100,16 @@
   DCTELEM *wsptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* we scale the results further by 2 as part of output adaption */
-  /* scaling for different DCT size. */
-  /* cK represents sqrt(2) * cos(K*pi/20). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * we scale the results further by 2 as part of output adaption
+   * scaling for different DCT size.
+   * cK represents sqrt(2) * cos(K*pi/20).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -1112,11 +1131,11 @@
     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
     tmp12 += tmp12;
     dataptr[4] = (DCTELEM)
       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */

@@ -1246,15 +1265,16 @@
   DCTELEM *wsptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* we scale the results further by 2 as part of output adaption */
-  /* scaling for different DCT size. */
-  /* cK represents sqrt(2) * cos(K*pi/22). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * we scale the results further by 2 as part of output adaption
+   * scaling for different DCT size.
+   * cK represents sqrt(2) * cos(K*pi/22).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -1272,11 +1292,11 @@
     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
     tmp5 += tmp5;
     tmp0 -= tmp5;
     tmp1 -= tmp5;

@@ -1428,13 +1448,14 @@
   DCTELEM *wsptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
-  /* cK represents sqrt(2) * cos(K*pi/24). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT.
+   * cK represents sqrt(2) * cos(K*pi/24).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -1460,11 +1481,11 @@
     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
     dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
     dataptr[4] = (DCTELEM)
       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
               CONST_BITS);

@@ -1594,13 +1615,14 @@
   DCTELEM *wsptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
-  /* cK represents sqrt(2) * cos(K*pi/26). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT.
+   * cK represents sqrt(2) * cos(K*pi/26).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -1620,11 +1642,11 @@
     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
     tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
     tmp6 += tmp6;
     tmp0 -= tmp6;
     tmp1 -= tmp6;

@@ -1792,13 +1814,14 @@
   DCTELEM *wsptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
-  /* cK represents sqrt(2) * cos(K*pi/28). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT.
+   * cK represents sqrt(2) * cos(K*pi/28).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -1826,11 +1849,11 @@
     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
     tmp13 += tmp13;
     dataptr[4] = (DCTELEM)
       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */

@@ -1993,13 +2016,14 @@
   DCTELEM *wsptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
-  /* cK represents sqrt(2) * cos(K*pi/30). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT.
+   * cK represents sqrt(2) * cos(K*pi/30).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -2024,11 +2048,11 @@
     tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
 
     z1 = tmp0 + tmp4 + tmp5;
     z2 = tmp1 + tmp3 + tmp6;
     z3 = tmp2 + tmp7;
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
     z3 += z3;
     dataptr[6] = (DCTELEM)
       DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
               MULTIPLY(z2 - z3, FIX(0.437016024)),  /* c12 */

@@ -2171,14 +2195,15 @@
   DCTELEM *wsptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* cK represents sqrt(2) * cos(K*pi/32). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * cK represents sqrt(2) * cos(K*pi/32).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -2210,11 +2235,11 @@
     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
     tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
     dataptr[4] = (DCTELEM)
       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
               MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */

@@ -2273,10 +2298,11 @@
 
   /* Pass 2: process columns.
    * We remove the PASS1_BITS scaling, but leave the results scaled up
    * by an overall factor of 8.
    * We must also scale the output by (8/16)**2 = 1/2**2.
+   * cK represents sqrt(2) * cos(K*pi/32).
    */
 
   dataptr = data;
   wsptr = workspace;
   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {

@@ -2378,14 +2404,15 @@
   DCTELEM *dataptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
+   */
 
   dataptr = data;
   ctr = 0;
   for (ctr = 0; ctr < DCTSIZE; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -2417,11 +2444,11 @@
     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
     tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
     dataptr[4] = (DCTELEM)
       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
               MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */

@@ -2473,16 +2500,17 @@
 
   /* Pass 2: process columns.
    * We remove the PASS1_BITS scaling, but leave the results scaled up
    * by an overall factor of 8.
    * We must also scale the output by 8/16 = 1/2.
+   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
    */
 
   dataptr = data;
   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
-     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     * rotator "c1" should be "c6".
      */
 
     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];

@@ -2499,47 +2527,47 @@
     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
 
     dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
 
-    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
-    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);   /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
                                            CONST_BITS+PASS1_BITS+1);
-    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
                                            CONST_BITS+PASS1_BITS+1);
 
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
      * i0..i3 in the paper are tmp0..tmp3 here.
      */
 
-    tmp10 = tmp0 + tmp3;
-    tmp11 = tmp1 + tmp2;
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp1 + tmp3;
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);      /* -c3+c5 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);      /* -c3-c5 */
+    tmp12 += z1;
+    tmp13 += z1;
 
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);   /* -c3+c7 */
     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
-    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
-    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
-    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
-    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
-    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
-    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+    tmp0 += z1 + tmp12;
+    tmp3 += z1 + tmp13;
 
-    tmp12 += z1;
-    tmp13 += z1;
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);   /* -c1-c3 */
+    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);          /*  c1+c3+c5-c7 */
+    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);          /*  c1+c3-c5+c7 */
+    tmp1 += z1 + tmp13;
+    tmp2 += z1 + tmp12;
 
-    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
-                                           CONST_BITS+PASS1_BITS+1);
-    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
-                                           CONST_BITS+PASS1_BITS+1);
-    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
-                                           CONST_BITS+PASS1_BITS+1);
-    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
-                                           CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+PASS1_BITS+1);
 
     dataptr++;                  /* advance pointer to next column */
   }
 }

@@ -2562,14 +2590,15 @@
   SHIFT_TEMPS
 
   /* Zero bottom row of output coefficient block. */
   MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 7; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -2596,11 +2625,11 @@
     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
     tmp13 += tmp13;
     dataptr[4] = (DCTELEM)
       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */

@@ -2725,14 +2754,15 @@
   SHIFT_TEMPS
 
   /* Zero 2 bottom rows of output coefficient block. */
   MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 6; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -2757,11 +2787,11 @@
     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
     dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
     dataptr[4] = (DCTELEM)
       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */

@@ -2864,14 +2894,15 @@
   SHIFT_TEMPS
 
   /* Zero 3 bottom rows of output coefficient block. */
   MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 5; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -2892,11 +2923,11 @@
     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
     tmp12 += tmp12;
     dataptr[4] = (DCTELEM)
       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */

@@ -2997,21 +3028,23 @@
   SHIFT_TEMPS
 
   /* Zero 4 bottom rows of output coefficient block. */
   MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* We must also scale the output by 8/4 = 2, which we add here. */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * We must also scale the output by 8/4 = 2, which we add here.
+   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 4; ctr++) {
     elemptr = sample_data[ctr] + start_col;
 
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
-     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     * rotator "c1" should be "c6".
      */
 
     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);

@@ -3025,64 +3058,67 @@
     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
 
-    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS-PASS1_BITS-2);
-    dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+
+    dataptr[2] = (DCTELEM)
+      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
                                        CONST_BITS-PASS1_BITS-1);
-    dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+    dataptr[6] = (DCTELEM)
+      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
                                        CONST_BITS-PASS1_BITS-1);
 
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
      * i0..i3 in the paper are tmp0..tmp3 here.
      */
 
-    tmp10 = tmp0 + tmp3;
-    tmp11 = tmp1 + tmp2;
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp1 + tmp3;
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS-PASS1_BITS-2);
 
-    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
-    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
-    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
-    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
-    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
-    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
-    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
-
     tmp12 += z1;
     tmp13 += z1;
 
-    dataptr[1] = (DCTELEM)
-      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
-    dataptr[3] = (DCTELEM)
-      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
-    dataptr[5] = (DCTELEM)
-      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
-    dataptr[7] = (DCTELEM)
-      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
+    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
+    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
+    tmp0 += z1 + tmp12;
+    tmp3 += z1 + tmp13;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
+    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
+    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
+    tmp1 += z1 + tmp13;
+    tmp2 += z1 + tmp12;
+
+    dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS-1);
+    dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS-1);
+    dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS-1);
+    dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS-1);
 
     dataptr += DCTSIZE;         /* advance pointer to next row */
   }
 
   /* Pass 2: process columns.
    * We remove the PASS1_BITS scaling, but leave the results scaled up
    * by an overall factor of 8.
-   * 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+   * 4-point FDCT kernel,
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
    */
 
   dataptr = data;
   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
     /* Even part */

@@ -3132,16 +3168,17 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* We scale the results further by 2 as part of output adaption */
-  /* scaling for different DCT size. */
-  /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * We scale the results further by 2 as part of output adaption
+   * scaling for different DCT size.
+   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 3; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -3156,11 +3193,11 @@
 
     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
     dataptr[2] = (DCTELEM)
       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
               CONST_BITS-PASS1_BITS-1);

@@ -3232,16 +3269,17 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* We must also scale the output by (8/4)*(8/2) = 2**3, which we add here. */
-  /* 4-point FDCT kernel, */
-  /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * We must also scale the output by (8/4)*(8/2) = 2**3, which we add here.
+   * 4-point FDCT kernel,
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 2; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -3251,11 +3289,11 @@
     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
 
     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
 
     /* Odd part */

@@ -3305,11 +3343,11 @@
  */
 
 GLOBAL(void)
 jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
-  INT32 tmp0, tmp1;
+  DCTELEM tmp0, tmp1;
   JSAMPROW elemptr;
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);

@@ -3321,15 +3359,17 @@
   /* We leave the results scaled up by an overall factor of 8.
    * We must also scale the output by (8/2)*(8/1) = 2**5.
    */
 
   /* Even part */
-  /* Apply unsigned->signed conversion */
-  data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
+
+  /* Apply unsigned->signed conversion. */
+  data[0] = (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5;
 
   /* Odd part */
-  data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);
+
+  data[1] = (tmp0 - tmp1) << 5;
 }
 
 
 /*
  * Perform the forward DCT on an 8x16 sample block.

@@ -3348,21 +3388,23 @@
   DCTELEM *wsptr;
   JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;
 
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
-     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     * rotator "c1" should be "c6".
      */
 
     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);

@@ -3376,47 +3418,51 @@
     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
 
-    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
-    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);   /* c6 */
+    dataptr[2] = (DCTELEM)
+      DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
                                    CONST_BITS-PASS1_BITS);
-    dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+    dataptr[6] = (DCTELEM)
+      DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
                                    CONST_BITS-PASS1_BITS);
 
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
      * i0..i3 in the paper are tmp0..tmp3 here.
      */
 
-    tmp10 = tmp0 + tmp3;
-    tmp11 = tmp1 + tmp2;
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp1 + tmp3;
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);      /* -c3+c5 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);      /* -c3-c5 */
+    tmp12 += z1;
+    tmp13 += z1;
 
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);   /* -c3+c7 */
     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
-    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
-    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
-    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
-    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
-    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
-    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+    tmp0 += z1 + tmp12;
+    tmp3 += z1 + tmp13;
 
-    tmp12 += z1;
-    tmp13 += z1;
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);   /* -c1-c3 */
+    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);          /*  c1+c3+c5-c7 */
+    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);          /*  c1+c3-c5+c7 */
+    tmp1 += z1 + tmp13;
+    tmp2 += z1 + tmp12;
 
-    dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
-    dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
-    dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
-    dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-PASS1_BITS);
 
     ctr++;
 
     if (ctr != DCTSIZE) {
       if (ctr == DCTSIZE * 2)

@@ -3539,14 +3585,15 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -3561,11 +3608,11 @@
     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
 
     z1 = tmp0 + tmp2;
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
     tmp3 += tmp3;
     z1 -= tmp3;
     z1 -= tmp3;

@@ -3719,14 +3766,15 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -3742,11 +3790,11 @@
 
     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
     dataptr[2] = (DCTELEM)
       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
               CONST_BITS-PASS1_BITS);

@@ -3868,14 +3916,15 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
+   */
 
   dataptr = data;
   ctr = 0;
   for (;;) {
     elemptr = sample_data[ctr] + start_col;

@@ -3890,11 +3939,11 @@
     tmp11 = tmp0 - tmp1;
 
     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
     tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
     tmp10 -= tmp2 << 2;
     tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */

@@ -4013,15 +4062,17 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* We must also scale the output by 8/4 = 2, which we add here. */
-  /* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * We must also scale the output by 8/4 = 2, which we add here.
+   * 4-point FDCT kernel,
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < DCTSIZE; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -4031,11 +4082,11 @@
     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
 
     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
 
     /* Odd part */

@@ -4055,16 +4106,17 @@
   }
 
   /* Pass 2: process columns.
    * We remove the PASS1_BITS scaling, but leave the results scaled up
    * by an overall factor of 8.
+   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
    */
 
   dataptr = data;
   for (ctr = 0; ctr < 4; ctr++) {
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
-     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     * rotator "c1" should be "c6".
      */
 
     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];

@@ -4082,51 +4134,53 @@
     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
 
     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
     dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
 
-    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+
     dataptr[DCTSIZE*2] = (DCTELEM)
-      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
+      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
+                  CONST_BITS+PASS1_BITS);
     dataptr[DCTSIZE*6] = (DCTELEM)
-      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
+      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
+                  CONST_BITS+PASS1_BITS);
 
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
      * i0..i3 in the paper are tmp0..tmp3 here.
      */
 
-    tmp10 = tmp0 + tmp3;
-    tmp11 = tmp1 + tmp2;
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp1 + tmp3;
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
 
-    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
-    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
-    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
-    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
-    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
-    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
-    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
-
     tmp12 += z1;
     tmp13 += z1;
 
-    dataptr[DCTSIZE*1] = (DCTELEM)
-      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*3] = (DCTELEM)
-      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*5] = (DCTELEM)
-      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*7] = (DCTELEM)
-      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
+    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
+    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
+    tmp0 += z1 + tmp12;
+    tmp3 += z1 + tmp13;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
+    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
+    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
+    tmp1 += z1 + tmp13;
+    tmp2 += z1 + tmp12;
+
+    dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS1_BITS);
 
     dataptr++;                  /* advance pointer to next column */
   }
 }

@@ -4148,16 +4202,17 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
-  /* We scale the results further by 2 as part of output adaption */
-  /* scaling for different DCT size. */
-  /* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT;
+   * furthermore, we scale the results by 2**PASS1_BITS.
+   * We scale the results further by 2 as part of output adaption
+   * scaling for different DCT size.
+   * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 6; ctr++) {
     elemptr = sample_data[ctr] + start_col;

@@ -4166,11 +4221,11 @@
     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
     tmp1 = GETJSAMPLE(elemptr[1]);
 
     tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM)
       ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
     dataptr[2] = (DCTELEM)
       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
               CONST_BITS-PASS1_BITS-1);

@@ -4253,24 +4308,25 @@
   SHIFT_TEMPS
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
-  /* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here. */
+  /* Pass 1: process rows.
+   * Note results are scaled up by sqrt(8) compared to a true DCT.
+   * We must also scale the output by (8/2)*(8/4) = 2**3, which we add here.
+   */
 
   dataptr = data;
   for (ctr = 0; ctr < 4; ctr++) {
     elemptr = sample_data[ctr] + start_col;
 
     /* Even part */
 
     tmp0 = GETJSAMPLE(elemptr[0]);
     tmp1 = GETJSAMPLE(elemptr[1]);
 
-    /* Apply unsigned->signed conversion */
+    /* Apply unsigned->signed conversion. */
     dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
 
     /* Odd part */
 
     dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);

@@ -4322,27 +4378,32 @@
  */
 
 GLOBAL(void)
 jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
-  INT32 tmp0, tmp1;
+  DCTELEM tmp0, tmp1;
 
   /* Pre-zero output coefficient block. */
   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
 
-  tmp0 = GETJSAMPLE(sample_data[0][start_col]);
-  tmp1 = GETJSAMPLE(sample_data[1][start_col]);
+  /* Pass 1: empty. */
 
-  /* We leave the results scaled up by an overall factor of 8.
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
    * We must also scale the output by (8/1)*(8/2) = 2**5.
    */
 
   /* Even part */
-  /* Apply unsigned->signed conversion */
-  data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
+
+  tmp0 = GETJSAMPLE(sample_data[0][start_col]);
+  tmp1 = GETJSAMPLE(sample_data[1][start_col]);
+
+  /* Apply unsigned->signed conversion. */
+  data[DCTSIZE*0] = (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5;
 
   /* Odd part */
-  data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
+
+  data[DCTSIZE*1] = (tmp0 - tmp1) << 5;
 }
 
 #endif /* DCT_SCALING_SUPPORTED */
 #endif /* DCT_ISLOW_SUPPORTED */

< prev index next >