1 |
chl |
1.3 |
/***************************************************************************** |
2 |
Isibaar |
1.1 |
* |
3 |
chl |
1.3 |
* XVID MPEG-4 VIDEO CODEC |
4 |
|
|
* - fast disrete cosine transformation - integer C version |
5 |
|
|
* |
6 |
|
|
* These routines are from Independent JPEG Group's free JPEG software |
7 |
|
|
* Copyright (C) 1991-1998, Thomas G. Lane (see the file README.IJG) |
8 |
|
|
* |
9 |
edgomez |
1.4 |
* This file is part of XviD, a free MPEG-4 video encoder/decoder |
10 |
chl |
1.3 |
* |
11 |
edgomez |
1.4 |
* XviD is free software; you can redistribute it and/or modify it |
12 |
|
|
* under the terms of the GNU General Public License as published by |
13 |
chl |
1.3 |
* the Free Software Foundation; either version 2 of the License, or |
14 |
|
|
* (at your option) any later version. |
15 |
|
|
* |
16 |
|
|
* This program is distributed in the hope that it will be useful, |
17 |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
19 |
|
|
* GNU General Public License for more details. |
20 |
|
|
* |
21 |
|
|
* You should have received a copy of the GNU General Public License |
22 |
|
|
* along with this program; if not, write to the Free Software |
23 |
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
24 |
edgomez |
1.4 |
* |
25 |
|
|
* Under section 8 of the GNU General Public License, the copyright |
26 |
|
|
* holders of XVID explicitly forbid distribution in the following |
27 |
|
|
* countries: |
28 |
|
|
* |
29 |
|
|
* - Japan |
30 |
|
|
* - United States of America |
31 |
|
|
* |
32 |
|
|
* Linking XviD statically or dynamically with other modules is making a |
33 |
|
|
* combined work based on XviD. Thus, the terms and conditions of the |
34 |
|
|
* GNU General Public License cover the whole combination. |
35 |
|
|
* |
36 |
|
|
* As a special exception, the copyright holders of XviD give you |
37 |
|
|
* permission to link XviD with independent modules that communicate with |
38 |
|
|
* XviD solely through the VFW1.1 and DShow interfaces, regardless of the |
39 |
|
|
* license terms of these independent modules, and to copy and distribute |
40 |
|
|
* the resulting combined work under terms of your choice, provided that |
41 |
|
|
* every copy of the combined work is accompanied by a complete copy of |
42 |
|
|
* the source code of XviD (the version of XviD used to produce the |
43 |
|
|
* combined work), being distributed under the terms of the GNU General |
44 |
|
|
* Public License plus this exception. An independent module is a module |
45 |
|
|
* which is not derived from or based on XviD. |
46 |
|
|
* |
47 |
|
|
* Note that people who make modified versions of XviD are not obligated |
48 |
|
|
* to grant this special exception for their modified versions; it is |
49 |
|
|
* their choice whether to do so. The GNU General Public License gives |
50 |
|
|
* permission to release a modified version without this exception; this |
51 |
|
|
* exception also makes it possible to release a modified version which |
52 |
|
|
* carries forward this exception. |
53 |
|
|
* |
54 |
edgomez |
1.5 |
* $Id: fdct.c,v 1.4 2002/11/16 23:51:58 edgomez Exp $ |
55 |
chl |
1.3 |
* |
56 |
|
|
*************************************************************************/ |
57 |
Isibaar |
1.1 |
|
58 |
|
|
/* This routine is a slow-but-accurate integer implementation of the |
59 |
|
|
* forward DCT (Discrete Cosine Transform). Taken from the IJG software |
60 |
|
|
* |
61 |
|
|
* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT |
62 |
|
|
* on each column. Direct algorithms are also available, but they are |
63 |
|
|
* much more complex and seem not to be any faster when reduced to code. |
64 |
|
|
* |
65 |
|
|
* This implementation is based on an algorithm described in |
66 |
|
|
* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT |
67 |
|
|
* Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, |
68 |
|
|
* Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. |
69 |
|
|
* The primary algorithm described there uses 11 multiplies and 29 adds. |
70 |
|
|
* We use their alternate method with 12 multiplies and 32 adds. |
71 |
|
|
* The advantage of this method is that no data path contains more than one |
72 |
|
|
* multiplication; this allows a very simple and accurate implementation in |
73 |
|
|
* scaled fixed-point arithmetic, with a minimal number of shifts. |
74 |
|
|
* |
75 |
|
|
* The poop on this scaling stuff is as follows: |
76 |
|
|
* |
77 |
|
|
* Each 1-D DCT step produces outputs which are a factor of sqrt(N) |
78 |
|
|
* larger than the true DCT outputs. The final outputs are therefore |
79 |
|
|
* a factor of N larger than desired; since N=8 this can be cured by |
80 |
|
|
* a simple right shift at the end of the algorithm. The advantage of |
81 |
|
|
* this arrangement is that we save two multiplications per 1-D DCT, |
82 |
|
|
* because the y0 and y4 outputs need not be divided by sqrt(N). |
83 |
|
|
* In the IJG code, this factor of 8 is removed by the quantization step |
84 |
|
|
* (in jcdctmgr.c), here it is removed. |
85 |
|
|
* |
86 |
|
|
* We have to do addition and subtraction of the integer inputs, which |
87 |
|
|
* is no problem, and multiplication by fractional constants, which is |
88 |
|
|
* a problem to do in integer arithmetic. We multiply all the constants |
89 |
|
|
* by CONST_SCALE and convert them to integer constants (thus retaining |
90 |
|
|
* CONST_BITS bits of precision in the constants). After doing a |
91 |
|
|
* multiplication we have to divide the product by CONST_SCALE, with proper |
92 |
|
|
* rounding, to produce the correct output. This division can be done |
93 |
|
|
* cheaply as a right shift of CONST_BITS bits. We postpone shifting |
94 |
|
|
* as long as possible so that partial sums can be added together with |
95 |
|
|
* full fractional precision. |
96 |
|
|
* |
97 |
|
|
* The outputs of the first pass are scaled up by PASS1_BITS bits so that |
98 |
|
|
* they are represented to better-than-integral precision. These outputs |
99 |
|
|
* require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word |
100 |
|
|
* with the recommended scaling. (For 12-bit sample data, the intermediate |
101 |
|
|
* array is INT32 anyway.) |
102 |
|
|
* |
103 |
|
|
* To avoid overflow of the 32-bit intermediate results in pass 2, we must |
104 |
|
|
* have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis |
105 |
|
|
* shows that the values given below are the most effective. |
106 |
|
|
* |
107 |
|
|
* We can gain a little more speed, with a further compromise in accuracy, |
108 |
|
|
* by omitting the addition in a descaling shift. This yields an incorrectly |
109 |
|
|
* rounded result half the time... |
110 |
|
|
*/ |
111 |
|
|
|
112 |
|
|
#include "fdct.h" |
113 |
|
|
|
114 |
|
|
#define USE_ACCURATE_ROUNDING |
115 |
|
|
|
116 |
|
|
#define RIGHT_SHIFT(x, shft) ((x) >> (shft)) |
117 |
|
|
|
118 |
|
|
#ifdef USE_ACCURATE_ROUNDING |
119 |
|
|
#define ONE ((int) 1) |
120 |
|
|
#define DESCALE(x, n) RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n) |
121 |
|
|
#else |
122 |
|
|
#define DESCALE(x, n) RIGHT_SHIFT(x, n) |
123 |
|
|
#endif |
124 |
|
|
|
125 |
|
|
#define CONST_BITS 13 |
126 |
|
|
#define PASS1_BITS 2 |
127 |
|
|
|
128 |
|
|
#define FIX_0_298631336 ((int) 2446) /* FIX(0.298631336) */ |
129 |
|
|
#define FIX_0_390180644 ((int) 3196) /* FIX(0.390180644) */ |
130 |
|
|
#define FIX_0_541196100 ((int) 4433) /* FIX(0.541196100) */ |
131 |
|
|
#define FIX_0_765366865 ((int) 6270) /* FIX(0.765366865) */ |
132 |
|
|
#define FIX_0_899976223 ((int) 7373) /* FIX(0.899976223) */ |
133 |
|
|
#define FIX_1_175875602 ((int) 9633) /* FIX(1.175875602) */ |
134 |
|
|
#define FIX_1_501321110 ((int) 12299) /* FIX(1.501321110) */ |
135 |
|
|
#define FIX_1_847759065 ((int) 15137) /* FIX(1.847759065) */ |
136 |
|
|
#define FIX_1_961570560 ((int) 16069) /* FIX(1.961570560) */ |
137 |
|
|
#define FIX_2_053119869 ((int) 16819) /* FIX(2.053119869) */ |
138 |
|
|
#define FIX_2_562915447 ((int) 20995) /* FIX(2.562915447) */ |
139 |
|
|
#define FIX_3_072711026 ((int) 25172) /* FIX(3.072711026) */ |
140 |
|
|
|
141 |
edgomez |
1.5 |
/* function pointer */ |
142 |
Isibaar |
1.1 |
fdctFuncPtr fdct; |
143 |
|
|
|
144 |
|
|
/* |
145 |
|
|
* Perform an integer forward DCT on one block of samples. |
146 |
|
|
*/ |
147 |
|
|
|
148 |
edgomez |
1.2 |
void |
149 |
|
|
fdct_int32(short *const block) |
150 |
Isibaar |
1.1 |
{ |
151 |
edgomez |
1.2 |
int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
152 |
|
|
int tmp10, tmp11, tmp12, tmp13; |
153 |
|
|
int z1, z2, z3, z4, z5; |
154 |
|
|
short *blkptr; |
155 |
|
|
int *dataptr; |
156 |
|
|
int data[64]; |
157 |
|
|
int i; |
158 |
|
|
|
159 |
|
|
/* Pass 1: process rows. */ |
160 |
|
|
/* Note results are scaled up by sqrt(8) compared to a true DCT; */ |
161 |
|
|
/* furthermore, we scale the results by 2**PASS1_BITS. */ |
162 |
|
|
|
163 |
|
|
dataptr = data; |
164 |
|
|
blkptr = block; |
165 |
|
|
for (i = 0; i < 8; i++) { |
166 |
|
|
tmp0 = blkptr[0] + blkptr[7]; |
167 |
|
|
tmp7 = blkptr[0] - blkptr[7]; |
168 |
|
|
tmp1 = blkptr[1] + blkptr[6]; |
169 |
|
|
tmp6 = blkptr[1] - blkptr[6]; |
170 |
|
|
tmp2 = blkptr[2] + blkptr[5]; |
171 |
|
|
tmp5 = blkptr[2] - blkptr[5]; |
172 |
|
|
tmp3 = blkptr[3] + blkptr[4]; |
173 |
|
|
tmp4 = blkptr[3] - blkptr[4]; |
174 |
|
|
|
175 |
|
|
/* Even part per LL&M figure 1 --- note that published figure is faulty; |
176 |
|
|
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6". |
177 |
|
|
*/ |
178 |
|
|
|
179 |
|
|
tmp10 = tmp0 + tmp3; |
180 |
|
|
tmp13 = tmp0 - tmp3; |
181 |
|
|
tmp11 = tmp1 + tmp2; |
182 |
|
|
tmp12 = tmp1 - tmp2; |
183 |
|
|
|
184 |
|
|
dataptr[0] = (tmp10 + tmp11) << PASS1_BITS; |
185 |
|
|
dataptr[4] = (tmp10 - tmp11) << PASS1_BITS; |
186 |
|
|
|
187 |
|
|
z1 = (tmp12 + tmp13) * FIX_0_541196100; |
188 |
|
|
dataptr[2] = |
189 |
|
|
DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS); |
190 |
|
|
dataptr[6] = |
191 |
|
|
DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS); |
192 |
|
|
|
193 |
|
|
/* Odd part per figure 8 --- note paper omits factor of sqrt(2). |
194 |
|
|
* cK represents cos(K*pi/16). |
195 |
|
|
* i0..i3 in the paper are tmp4..tmp7 here. |
196 |
|
|
*/ |
197 |
|
|
|
198 |
|
|
z1 = tmp4 + tmp7; |
199 |
|
|
z2 = tmp5 + tmp6; |
200 |
|
|
z3 = tmp4 + tmp6; |
201 |
|
|
z4 = tmp5 + tmp7; |
202 |
|
|
z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ |
203 |
|
|
|
204 |
|
|
tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ |
205 |
|
|
tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ |
206 |
|
|
tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ |
207 |
|
|
tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ |
208 |
|
|
z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ |
209 |
|
|
z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ |
210 |
|
|
z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ |
211 |
|
|
z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ |
212 |
|
|
|
213 |
|
|
z3 += z5; |
214 |
|
|
z4 += z5; |
215 |
|
|
|
216 |
|
|
dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS); |
217 |
|
|
dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS); |
218 |
|
|
dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS); |
219 |
|
|
dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS); |
220 |
|
|
|
221 |
|
|
dataptr += 8; /* advance pointer to next row */ |
222 |
|
|
blkptr += 8; |
223 |
|
|
} |
224 |
|
|
|
225 |
|
|
/* Pass 2: process columns. |
226 |
|
|
* We remove the PASS1_BITS scaling, but leave the results scaled up |
227 |
|
|
* by an overall factor of 8. |
228 |
|
|
*/ |
229 |
|
|
|
230 |
|
|
dataptr = data; |
231 |
|
|
for (i = 0; i < 8; i++) { |
232 |
|
|
tmp0 = dataptr[0] + dataptr[56]; |
233 |
|
|
tmp7 = dataptr[0] - dataptr[56]; |
234 |
|
|
tmp1 = dataptr[8] + dataptr[48]; |
235 |
|
|
tmp6 = dataptr[8] - dataptr[48]; |
236 |
|
|
tmp2 = dataptr[16] + dataptr[40]; |
237 |
|
|
tmp5 = dataptr[16] - dataptr[40]; |
238 |
|
|
tmp3 = dataptr[24] + dataptr[32]; |
239 |
|
|
tmp4 = dataptr[24] - dataptr[32]; |
240 |
|
|
|
241 |
|
|
/* Even part per LL&M figure 1 --- note that published figure is faulty; |
242 |
|
|
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6". |
243 |
|
|
*/ |
244 |
|
|
|
245 |
|
|
tmp10 = tmp0 + tmp3; |
246 |
|
|
tmp13 = tmp0 - tmp3; |
247 |
|
|
tmp11 = tmp1 + tmp2; |
248 |
|
|
tmp12 = tmp1 - tmp2; |
249 |
|
|
|
250 |
|
|
dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS); |
251 |
|
|
dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS); |
252 |
|
|
|
253 |
|
|
z1 = (tmp12 + tmp13) * FIX_0_541196100; |
254 |
|
|
dataptr[16] = |
255 |
|
|
DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS); |
256 |
|
|
dataptr[48] = |
257 |
|
|
DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS); |
258 |
|
|
|
259 |
|
|
/* Odd part per figure 8 --- note paper omits factor of sqrt(2). |
260 |
|
|
* cK represents cos(K*pi/16). |
261 |
|
|
* i0..i3 in the paper are tmp4..tmp7 here. |
262 |
|
|
*/ |
263 |
|
|
|
264 |
|
|
z1 = tmp4 + tmp7; |
265 |
|
|
z2 = tmp5 + tmp6; |
266 |
|
|
z3 = tmp4 + tmp6; |
267 |
|
|
z4 = tmp5 + tmp7; |
268 |
|
|
z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ |
269 |
|
|
|
270 |
|
|
tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ |
271 |
|
|
tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ |
272 |
|
|
tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ |
273 |
|
|
tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ |
274 |
|
|
z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ |
275 |
|
|
z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ |
276 |
|
|
z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ |
277 |
|
|
z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ |
278 |
|
|
|
279 |
|
|
z3 += z5; |
280 |
|
|
z4 += z5; |
281 |
|
|
|
282 |
|
|
dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS); |
283 |
|
|
dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS); |
284 |
|
|
dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS); |
285 |
|
|
dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS); |
286 |
|
|
|
287 |
|
|
dataptr++; /* advance pointer to next column */ |
288 |
|
|
} |
289 |
|
|
/* descale */ |
290 |
|
|
for (i = 0; i < 64; i++) |
291 |
|
|
block[i] = (short int) DESCALE(data[i], 3); |
292 |
Isibaar |
1.1 |
} |