libs/jpeg6/jidctflt.cpp

   1 /*\r
   2 \r
   3  * jidctflt.c\r
   4 \r
   5  *\r
   6 \r
   7  * Copyright (C) 1994, Thomas G. Lane.\r
   8 \r
   9  * This file is part of the Independent JPEG Group's software.\r
  10 \r
  11  * For conditions of distribution and use, see the accompanying README file.\r
  12 \r
  13  *\r
  14 \r
  15  * This file contains a floating-point implementation of the\r
  16 \r
  17  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine\r
  18 \r
  19  * must also perform dequantization of the input coefficients.\r
  20 \r
  21  *\r
  22 \r
  23  * This implementation should be more accurate than either of the integer\r
  24 \r
  25  * IDCT implementations.  However, it may not give the same results on all\r
  26 \r
  27  * machines because of differences in roundoff behavior.  Speed will depend\r
  28 \r
  29  * on the hardware's floating point capacity.\r
  30 \r
  31  *\r
  32 \r
  33  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT\r
  34 \r
  35  * on each row (or vice versa, but it's more convenient to emit a row at\r
  36 \r
  37  * a time).  Direct algorithms are also available, but they are much more\r
  38 \r
  39  * complex and seem not to be any faster when reduced to code.\r
  40 \r
  41  *\r
  42 \r
  43  * This implementation is based on Arai, Agui, and Nakajima's algorithm for\r
  44 \r
  45  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in\r
  46 \r
  47  * Japanese, but the algorithm is described in the Pennebaker & Mitchell\r
  48 \r
  49  * JPEG textbook (see REFERENCES section in file README).  The following code\r
  50 \r
  51  * is based directly on figure 4-8 in P&M.\r
  52 \r
  53  * While an 8-point DCT cannot be done in less than 11 multiplies, it is\r
  54 \r
  55  * possible to arrange the computation so that many of the multiplies are\r
  56 \r
  57  * simple scalings of the final outputs.  These multiplies can then be\r
  58 \r
  59  * folded into the multiplications or divisions by the JPEG quantization\r
  60 \r
  61  * table entries.  The AA&N method leaves only 5 multiplies and 29 adds\r
  62 \r
  63  * to be done in the DCT itself.\r
  64 \r
  65  * The primary disadvantage of this method is that with a fixed-point\r
  66 \r
  67  * implementation, accuracy is lost due to imprecise representation of the\r
  68 \r
  69  * scaled quantization values.  However, that problem does not arise if\r
  70 \r
  71  * we use floating point arithmetic.\r
  72 \r
  73  */\r
  74 \r
  75 \r
  76 \r
  77 #define JPEG_INTERNALS\r
  78 \r
  79 #include "jinclude.h"\r
  80 \r
  81 #include "radiant_jpeglib.h"\r
  82 \r
  83 #include "jdct.h"               /* Private declarations for DCT subsystem */\r
  84 \r
  85 \r
  86 \r
  87 #ifdef DCT_FLOAT_SUPPORTED\r
  88 \r
  89 \r
  90 \r
  91 \r
  92 \r
  93 /*\r
  94 \r
  95  * This module is specialized to the case DCTSIZE = 8.\r
  96 \r
  97  */\r
  98 \r
  99 \r
 100 \r
 101 #if DCTSIZE != 8\r
 102 \r
 103   Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */\r
 104 \r
 105 #endif\r
 106 \r
 107 \r
 108 \r
 109 \r
 110 \r
 111 /* Dequantize a coefficient by multiplying it by the multiplier-table\r
 112 \r
 113  * entry; produce a float result.\r
 114 \r
 115  */\r
 116 \r
 117 \r
 118 \r
 119 #define DEQUANTIZE(coef,quantval)  (((FAST_FLOAT) (coef)) * (quantval))\r
 120 \r
 121 \r
 122 \r
 123 \r
 124 \r
 125 /*\r
 126 \r
 127  * Perform dequantization and inverse DCT on one block of coefficients.\r
 128 \r
 129  */\r
 130 \r
 131 \r
 132 \r
 133 GLOBAL void\r
 134 \r
 135 jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,\r
 136 \r
 137                  JCOEFPTR coef_block,\r
 138 \r
 139                  JSAMPARRAY output_buf, JDIMENSION output_col)\r
 140 \r
 141 {\r
 142 \r
 143   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;\r
 144 \r
 145   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;\r
 146 \r
 147   FAST_FLOAT z5, z10, z11, z12, z13;\r
 148 \r
 149   JCOEFPTR inptr;\r
 150 \r
 151   FLOAT_MULT_TYPE * quantptr;\r
 152 \r
 153   FAST_FLOAT * wsptr;\r
 154 \r
 155   JSAMPROW outptr;\r
 156 \r
 157   JSAMPLE *range_limit = IDCT_range_limit(cinfo);\r
 158 \r
 159   int ctr;\r
 160 \r
 161   FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */\r
 162 \r
 163   SHIFT_TEMPS\r
 164 \r
 165 \r
 166 \r
 167   /* Pass 1: process columns from input, store into work array. */\r
 168 \r
 169 \r
 170 \r
 171   inptr = coef_block;\r
 172 \r
 173   quantptr = (FLOAT_MULT_TYPE *) compptr->dct_table;\r
 174 \r
 175   wsptr = workspace;\r
 176 \r
 177   for (ctr = DCTSIZE; ctr > 0; ctr--) {\r
 178 \r
 179     /* Due to quantization, we will usually find that many of the input\r
 180 \r
 181      * coefficients are zero, especially the AC terms.  We can exploit this\r
 182 \r
 183      * by short-circuiting the IDCT calculation for any column in which all\r
 184 \r
 185      * the AC terms are zero.  In that case each output is equal to the\r
 186 \r
 187      * DC coefficient (with scale factor as needed).\r
 188 \r
 189      * With typical images and quantization tables, half or more of the\r
 190 \r
 191      * column DCT calculations can be simplified this way.\r
 192 \r
 193      */\r
 194 \r
 195     \r
 196 \r
 197     if ((inptr[DCTSIZE*1] | inptr[DCTSIZE*2] | inptr[DCTSIZE*3] |\r
 198 \r
 199          inptr[DCTSIZE*4] | inptr[DCTSIZE*5] | inptr[DCTSIZE*6] |\r
 200 \r
 201          inptr[DCTSIZE*7]) == 0) {\r
 202 \r
 203       /* AC terms all zero */\r
 204 \r
 205       FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);\r
 206 \r
 207       \r
 208 \r
 209       wsptr[DCTSIZE*0] = dcval;\r
 210 \r
 211       wsptr[DCTSIZE*1] = dcval;\r
 212 \r
 213       wsptr[DCTSIZE*2] = dcval;\r
 214 \r
 215       wsptr[DCTSIZE*3] = dcval;\r
 216 \r
 217       wsptr[DCTSIZE*4] = dcval;\r
 218 \r
 219       wsptr[DCTSIZE*5] = dcval;\r
 220 \r
 221       wsptr[DCTSIZE*6] = dcval;\r
 222 \r
 223       wsptr[DCTSIZE*7] = dcval;\r
 224 \r
 225       \r
 226 \r
 227       inptr++;                  /* advance pointers to next column */\r
 228 \r
 229       quantptr++;\r
 230 \r
 231       wsptr++;\r
 232 \r
 233       continue;\r
 234 \r
 235     }\r
 236 \r
 237     \r
 238 \r
 239     /* Even part */\r
 240 \r
 241 \r
 242 \r
 243     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);\r
 244 \r
 245     tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);\r
 246 \r
 247     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);\r
 248 \r
 249     tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);\r
 250 \r
 251 \r
 252 \r
 253     tmp10 = tmp0 + tmp2;        /* phase 3 */\r
 254 \r
 255     tmp11 = tmp0 - tmp2;\r
 256 \r
 257 \r
 258 \r
 259     tmp13 = tmp1 + tmp3;        /* phases 5-3 */\r
 260 \r
 261     tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */\r
 262 \r
 263 \r
 264 \r
 265     tmp0 = tmp10 + tmp13;       /* phase 2 */\r
 266 \r
 267     tmp3 = tmp10 - tmp13;\r
 268 \r
 269     tmp1 = tmp11 + tmp12;\r
 270 \r
 271     tmp2 = tmp11 - tmp12;\r
 272 \r
 273     \r
 274 \r
 275     /* Odd part */\r
 276 \r
 277 \r
 278 \r
 279     tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);\r
 280 \r
 281     tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);\r
 282 \r
 283     tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);\r
 284 \r
 285     tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);\r
 286 \r
 287 \r
 288 \r
 289     z13 = tmp6 + tmp5;          /* phase 6 */\r
 290 \r
 291     z10 = tmp6 - tmp5;\r
 292 \r
 293     z11 = tmp4 + tmp7;\r
 294 \r
 295     z12 = tmp4 - tmp7;\r
 296 \r
 297 \r
 298 \r
 299     tmp7 = z11 + z13;           /* phase 5 */\r
 300 \r
 301     tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */\r
 302 \r
 303 \r
 304 \r
 305     z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */\r
 306 \r
 307     tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */\r
 308 \r
 309     tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */\r
 310 \r
 311 \r
 312 \r
 313     tmp6 = tmp12 - tmp7;        /* phase 2 */\r
 314 \r
 315     tmp5 = tmp11 - tmp6;\r
 316 \r
 317     tmp4 = tmp10 + tmp5;\r
 318 \r
 319 \r
 320 \r
 321     wsptr[DCTSIZE*0] = tmp0 + tmp7;\r
 322 \r
 323     wsptr[DCTSIZE*7] = tmp0 - tmp7;\r
 324 \r
 325     wsptr[DCTSIZE*1] = tmp1 + tmp6;\r
 326 \r
 327     wsptr[DCTSIZE*6] = tmp1 - tmp6;\r
 328 \r
 329     wsptr[DCTSIZE*2] = tmp2 + tmp5;\r
 330 \r
 331     wsptr[DCTSIZE*5] = tmp2 - tmp5;\r
 332 \r
 333     wsptr[DCTSIZE*4] = tmp3 + tmp4;\r
 334 \r
 335     wsptr[DCTSIZE*3] = tmp3 - tmp4;\r
 336 \r
 337 \r
 338 \r
 339     inptr++;                    /* advance pointers to next column */\r
 340 \r
 341     quantptr++;\r
 342 \r
 343     wsptr++;\r
 344 \r
 345   }\r
 346 \r
 347   \r
 348 \r
 349   /* Pass 2: process rows from work array, store into output array. */\r
 350 \r
 351   /* Note that we must descale the results by a factor of 8 == 2**3. */\r
 352 \r
 353 \r
 354 \r
 355   wsptr = workspace;\r
 356 \r
 357   for (ctr = 0; ctr < DCTSIZE; ctr++) {\r
 358 \r
 359     outptr = output_buf[ctr] + output_col;\r
 360 \r
 361     /* Rows of zeroes can be exploited in the same way as we did with columns.\r
 362 \r
 363      * However, the column calculation has created many nonzero AC terms, so\r
 364 \r
 365      * the simplification applies less often (typically 5% to 10% of the time).\r
 366 \r
 367      * And testing floats for zero is relatively expensive, so we don't bother.\r
 368 \r
 369      */\r
 370 \r
 371     \r
 372 \r
 373     /* Even part */\r
 374 \r
 375 \r
 376 \r
 377     tmp10 = wsptr[0] + wsptr[4];\r
 378 \r
 379     tmp11 = wsptr[0] - wsptr[4];\r
 380 \r
 381 \r
 382 \r
 383     tmp13 = wsptr[2] + wsptr[6];\r
 384 \r
 385     tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;\r
 386 \r
 387 \r
 388 \r
 389     tmp0 = tmp10 + tmp13;\r
 390 \r
 391     tmp3 = tmp10 - tmp13;\r
 392 \r
 393     tmp1 = tmp11 + tmp12;\r
 394 \r
 395     tmp2 = tmp11 - tmp12;\r
 396 \r
 397 \r
 398 \r
 399     /* Odd part */\r
 400 \r
 401 \r
 402 \r
 403     z13 = wsptr[5] + wsptr[3];\r
 404 \r
 405     z10 = wsptr[5] - wsptr[3];\r
 406 \r
 407     z11 = wsptr[1] + wsptr[7];\r
 408 \r
 409     z12 = wsptr[1] - wsptr[7];\r
 410 \r
 411 \r
 412 \r
 413     tmp7 = z11 + z13;\r
 414 \r
 415     tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);\r
 416 \r
 417 \r
 418 \r
 419     z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */\r
 420 \r
 421     tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */\r
 422 \r
 423     tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */\r
 424 \r
 425 \r
 426 \r
 427     tmp6 = tmp12 - tmp7;\r
 428 \r
 429     tmp5 = tmp11 - tmp6;\r
 430 \r
 431     tmp4 = tmp10 + tmp5;\r
 432 \r
 433 \r
 434 \r
 435     /* Final output stage: scale down by a factor of 8 and range-limit */\r
 436 \r
 437 \r
 438 \r
 439     outptr[0] = range_limit[(int) DESCALE((INT32) (tmp0 + tmp7), 3)\r
 440 \r
 441                             & RANGE_MASK];\r
 442 \r
 443     outptr[7] = range_limit[(int) DESCALE((INT32) (tmp0 - tmp7), 3)\r
 444 \r
 445                             & RANGE_MASK];\r
 446 \r
 447     outptr[1] = range_limit[(int) DESCALE((INT32) (tmp1 + tmp6), 3)\r
 448 \r
 449                             & RANGE_MASK];\r
 450 \r
 451     outptr[6] = range_limit[(int) DESCALE((INT32) (tmp1 - tmp6), 3)\r
 452 \r
 453                             & RANGE_MASK];\r
 454 \r
 455     outptr[2] = range_limit[(int) DESCALE((INT32) (tmp2 + tmp5), 3)\r
 456 \r
 457                             & RANGE_MASK];\r
 458 \r
 459     outptr[5] = range_limit[(int) DESCALE((INT32) (tmp2 - tmp5), 3)\r
 460 \r
 461                             & RANGE_MASK];\r
 462 \r
 463     outptr[4] = range_limit[(int) DESCALE((INT32) (tmp3 + tmp4), 3)\r
 464 \r
 465                             & RANGE_MASK];\r
 466 \r
 467     outptr[3] = range_limit[(int) DESCALE((INT32) (tmp3 - tmp4), 3)\r
 468 \r
 469                             & RANGE_MASK];\r
 470 \r
 471     \r
 472 \r
 473     wsptr += DCTSIZE;           /* advance pointer to next row */\r
 474 \r
 475   }\r
 476 \r
 477 }\r
 478 \r
 479 \r
 480 \r
 481 #endif /* DCT_FLOAT_SUPPORTED */\r
 482 \r