1 |
/***************************************************************************** |
2 |
* |
3 |
* XVID MPEG-4 VIDEO CODEC |
4 |
* - Forward DCT - |
5 |
* |
6 |
* These routines are from Independent JPEG Group's free JPEG software |
7 |
* Copyright (C) 1991-1998, Thomas G. Lane (see the file README.IJG) |
8 |
* |
9 |
* This program is free software ; you can redistribute it and/or modify |
10 |
* it under the terms of the GNU General Public License as published by |
11 |
* the Free Software Foundation ; either version 2 of the License, or |
12 |
* (at your option) any later version. |
13 |
* |
14 |
* This program is distributed in the hope that it will be useful, |
15 |
* but WITHOUT ANY WARRANTY ; without even the implied warranty of |
16 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 |
* GNU General Public License for more details. |
18 |
* |
19 |
* You should have received a copy of the GNU General Public License |
20 |
* along with this program ; if not, write to the Free Software |
21 |
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
22 |
* |
23 |
* $Id$ |
24 |
* |
25 |
****************************************************************************/ |
26 |
|
27 |
/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */ |
28 |
|
29 |
/* |
30 |
* Disclaimer of Warranty |
31 |
* |
32 |
* These software programs are available to the user without any license fee or |
33 |
* royalty on an "as is" basis. The MPEG Software Simulation Group disclaims |
34 |
* any and all warranties, whether express, implied, or statuary, including any |
35 |
* implied warranties or merchantability or of fitness for a particular |
36 |
* purpose. In no event shall the copyright-holder be liable for any |
37 |
* incidental, punitive, or consequential damages of any kind whatsoever |
38 |
* arising from the use of these programs. |
39 |
* |
40 |
* This disclaimer of warranty extends to the user of these programs and user's |
41 |
* customers, employees, agents, transferees, successors, and assigns. |
42 |
* |
43 |
* The MPEG Software Simulation Group does not represent or warrant that the |
44 |
* programs furnished hereunder are free of infringement of any third-party |
45 |
* patents. |
46 |
* |
47 |
* Commercial implementations of MPEG-1 and MPEG-2 video, including shareware, |
48 |
* are subject to royalty fees to patent holders. Many of these patents are |
49 |
* general enough such that they are unavoidable regardless of implementation |
50 |
* design. |
51 |
* |
52 |
*/ |
53 |
|
54 |
/* This routine is a slow-but-accurate integer implementation of the |
55 |
* forward DCT (Discrete Cosine Transform). Taken from the IJG software |
56 |
* |
57 |
* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT |
58 |
* on each column. Direct algorithms are also available, but they are |
59 |
* much more complex and seem not to be any faster when reduced to code. |
60 |
* |
61 |
* This implementation is based on an algorithm described in |
62 |
* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT |
63 |
* Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, |
64 |
* Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. |
65 |
* The primary algorithm described there uses 11 multiplies and 29 adds. |
66 |
* We use their alternate method with 12 multiplies and 32 adds. |
67 |
* The advantage of this method is that no data path contains more than one |
68 |
* multiplication; this allows a very simple and accurate implementation in |
69 |
* scaled fixed-point arithmetic, with a minimal number of shifts. |
70 |
* |
71 |
* The poop on this scaling stuff is as follows: |
72 |
* |
73 |
* Each 1-D DCT step produces outputs which are a factor of sqrt(N) |
74 |
* larger than the true DCT outputs. The final outputs are therefore |
75 |
* a factor of N larger than desired; since N=8 this can be cured by |
76 |
* a simple right shift at the end of the algorithm. The advantage of |
77 |
* this arrangement is that we save two multiplications per 1-D DCT, |
78 |
* because the y0 and y4 outputs need not be divided by sqrt(N). |
79 |
* In the IJG code, this factor of 8 is removed by the quantization step |
80 |
* (in jcdctmgr.c), here it is removed. |
81 |
* |
82 |
* We have to do addition and subtraction of the integer inputs, which |
83 |
* is no problem, and multiplication by fractional constants, which is |
84 |
* a problem to do in integer arithmetic. We multiply all the constants |
85 |
* by CONST_SCALE and convert them to integer constants (thus retaining |
86 |
* CONST_BITS bits of precision in the constants). After doing a |
87 |
* multiplication we have to divide the product by CONST_SCALE, with proper |
88 |
* rounding, to produce the correct output. This division can be done |
89 |
* cheaply as a right shift of CONST_BITS bits. We postpone shifting |
90 |
* as long as possible so that partial sums can be added together with |
91 |
* full fractional precision. |
92 |
* |
93 |
* The outputs of the first pass are scaled up by PASS1_BITS bits so that |
94 |
* they are represented to better-than-integral precision. These outputs |
95 |
* require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word |
96 |
* with the recommended scaling. (For 12-bit sample data, the intermediate |
97 |
* array is INT32 anyway.) |
98 |
* |
99 |
* To avoid overflow of the 32-bit intermediate results in pass 2, we must |
100 |
* have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis |
101 |
* shows that the values given below are the most effective. |
102 |
* |
103 |
* We can gain a little more speed, with a further compromise in accuracy, |
104 |
* by omitting the addition in a descaling shift. This yields an incorrectly |
105 |
* rounded result half the time... |
106 |
*/ |
107 |
|
108 |
#include "fdct.h" |
109 |
|
110 |
#define USE_ACCURATE_ROUNDING |
111 |
|
112 |
#define RIGHT_SHIFT(x, shft) ((x) >> (shft)) |
113 |
|
114 |
#ifdef USE_ACCURATE_ROUNDING |
115 |
#define ONE ((int) 1) |
116 |
#define DESCALE(x, n) RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n) |
117 |
#else |
118 |
#define DESCALE(x, n) RIGHT_SHIFT(x, n) |
119 |
#endif |
120 |
|
121 |
#define CONST_BITS 13 |
122 |
#define PASS1_BITS 2 |
123 |
|
124 |
#define FIX_0_298631336 ((int) 2446) /* FIX(0.298631336) */ |
125 |
#define FIX_0_390180644 ((int) 3196) /* FIX(0.390180644) */ |
126 |
#define FIX_0_541196100 ((int) 4433) /* FIX(0.541196100) */ |
127 |
#define FIX_0_765366865 ((int) 6270) /* FIX(0.765366865) */ |
128 |
#define FIX_0_899976223 ((int) 7373) /* FIX(0.899976223) */ |
129 |
#define FIX_1_175875602 ((int) 9633) /* FIX(1.175875602) */ |
130 |
#define FIX_1_501321110 ((int) 12299) /* FIX(1.501321110) */ |
131 |
#define FIX_1_847759065 ((int) 15137) /* FIX(1.847759065) */ |
132 |
#define FIX_1_961570560 ((int) 16069) /* FIX(1.961570560) */ |
133 |
#define FIX_2_053119869 ((int) 16819) /* FIX(2.053119869) */ |
134 |
#define FIX_2_562915447 ((int) 20995) /* FIX(2.562915447) */ |
135 |
#define FIX_3_072711026 ((int) 25172) /* FIX(3.072711026) */ |
136 |
|
137 |
/* function pointer */ |
138 |
fdctFuncPtr fdct; |
139 |
|
140 |
/* |
141 |
* Perform an integer forward DCT on one block of samples. |
142 |
*/ |
143 |
|
144 |
void |
145 |
fdct_int32(short *const block) |
146 |
{ |
147 |
int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
148 |
int tmp10, tmp11, tmp12, tmp13; |
149 |
int z1, z2, z3, z4, z5; |
150 |
short *blkptr; |
151 |
int *dataptr; |
152 |
int data[64]; |
153 |
int i; |
154 |
|
155 |
/* Pass 1: process rows. */ |
156 |
/* Note results are scaled up by sqrt(8) compared to a true DCT; */ |
157 |
/* furthermore, we scale the results by 2**PASS1_BITS. */ |
158 |
|
159 |
dataptr = data; |
160 |
blkptr = block; |
161 |
for (i = 0; i < 8; i++) { |
162 |
tmp0 = blkptr[0] + blkptr[7]; |
163 |
tmp7 = blkptr[0] - blkptr[7]; |
164 |
tmp1 = blkptr[1] + blkptr[6]; |
165 |
tmp6 = blkptr[1] - blkptr[6]; |
166 |
tmp2 = blkptr[2] + blkptr[5]; |
167 |
tmp5 = blkptr[2] - blkptr[5]; |
168 |
tmp3 = blkptr[3] + blkptr[4]; |
169 |
tmp4 = blkptr[3] - blkptr[4]; |
170 |
|
171 |
/* Even part per LL&M figure 1 --- note that published figure is faulty; |
172 |
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6". |
173 |
*/ |
174 |
|
175 |
tmp10 = tmp0 + tmp3; |
176 |
tmp13 = tmp0 - tmp3; |
177 |
tmp11 = tmp1 + tmp2; |
178 |
tmp12 = tmp1 - tmp2; |
179 |
|
180 |
dataptr[0] = (tmp10 + tmp11) << PASS1_BITS; |
181 |
dataptr[4] = (tmp10 - tmp11) << PASS1_BITS; |
182 |
|
183 |
z1 = (tmp12 + tmp13) * FIX_0_541196100; |
184 |
dataptr[2] = |
185 |
DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS); |
186 |
dataptr[6] = |
187 |
DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS); |
188 |
|
189 |
/* Odd part per figure 8 --- note paper omits factor of sqrt(2). |
190 |
* cK represents cos(K*pi/16). |
191 |
* i0..i3 in the paper are tmp4..tmp7 here. |
192 |
*/ |
193 |
|
194 |
z1 = tmp4 + tmp7; |
195 |
z2 = tmp5 + tmp6; |
196 |
z3 = tmp4 + tmp6; |
197 |
z4 = tmp5 + tmp7; |
198 |
z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ |
199 |
|
200 |
tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ |
201 |
tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ |
202 |
tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ |
203 |
tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ |
204 |
z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ |
205 |
z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ |
206 |
z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ |
207 |
z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ |
208 |
|
209 |
z3 += z5; |
210 |
z4 += z5; |
211 |
|
212 |
dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS); |
213 |
dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS); |
214 |
dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS); |
215 |
dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS); |
216 |
|
217 |
dataptr += 8; /* advance pointer to next row */ |
218 |
blkptr += 8; |
219 |
} |
220 |
|
221 |
/* Pass 2: process columns. |
222 |
* We remove the PASS1_BITS scaling, but leave the results scaled up |
223 |
* by an overall factor of 8. |
224 |
*/ |
225 |
|
226 |
dataptr = data; |
227 |
for (i = 0; i < 8; i++) { |
228 |
tmp0 = dataptr[0] + dataptr[56]; |
229 |
tmp7 = dataptr[0] - dataptr[56]; |
230 |
tmp1 = dataptr[8] + dataptr[48]; |
231 |
tmp6 = dataptr[8] - dataptr[48]; |
232 |
tmp2 = dataptr[16] + dataptr[40]; |
233 |
tmp5 = dataptr[16] - dataptr[40]; |
234 |
tmp3 = dataptr[24] + dataptr[32]; |
235 |
tmp4 = dataptr[24] - dataptr[32]; |
236 |
|
237 |
/* Even part per LL&M figure 1 --- note that published figure is faulty; |
238 |
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6". |
239 |
*/ |
240 |
|
241 |
tmp10 = tmp0 + tmp3; |
242 |
tmp13 = tmp0 - tmp3; |
243 |
tmp11 = tmp1 + tmp2; |
244 |
tmp12 = tmp1 - tmp2; |
245 |
|
246 |
dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS); |
247 |
dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS); |
248 |
|
249 |
z1 = (tmp12 + tmp13) * FIX_0_541196100; |
250 |
dataptr[16] = |
251 |
DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS); |
252 |
dataptr[48] = |
253 |
DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS); |
254 |
|
255 |
/* Odd part per figure 8 --- note paper omits factor of sqrt(2). |
256 |
* cK represents cos(K*pi/16). |
257 |
* i0..i3 in the paper are tmp4..tmp7 here. |
258 |
*/ |
259 |
|
260 |
z1 = tmp4 + tmp7; |
261 |
z2 = tmp5 + tmp6; |
262 |
z3 = tmp4 + tmp6; |
263 |
z4 = tmp5 + tmp7; |
264 |
z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ |
265 |
|
266 |
tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ |
267 |
tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ |
268 |
tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ |
269 |
tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ |
270 |
z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ |
271 |
z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ |
272 |
z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ |
273 |
z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ |
274 |
|
275 |
z3 += z5; |
276 |
z4 += z5; |
277 |
|
278 |
dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS); |
279 |
dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS); |
280 |
dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS); |
281 |
dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS); |
282 |
|
283 |
dataptr++; /* advance pointer to next column */ |
284 |
} |
285 |
/* descale */ |
286 |
for (i = 0; i < 64; i++) |
287 |
block[i] = (short int) DESCALE(data[i], 3); |
288 |
} |