/*****************************************************************************
 *
 * XVID MPEG-4 VIDEO CODEC
 * - Inverse DCT (More precise version) -
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * Originally distributed under the GNU LGPL License (ffmpeg).
 * It is licensed under the GNU GPL for the Xvid tree.
 *
 * This program is free software ; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation ; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY ; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program ; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * $Id: simple_idct.c,v 1.5 2004/04/05 20:36:36 edgomez Exp $
 *
 ****************************************************************************/

/*
 * Based upon some commented-out C code from mpeg2dec (idct_mmx.c,
 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>).
 */

#include "../portab.h" |
35 |
#include "idct.h" |
36 |
|
37 |
/*
 * Fixed-point cosine factors for the 8-point inverse DCT, plus the number
 * of bits shifted out after the row pass (ROW_SHIFT, used by
 * idctRowCondDC) and after the column pass (COL_SHIFT, used by
 * idctSparseCol).
 *
 * Two parameter sets are kept; only the #else branch is compiled.
 */
#if 0
/* Disabled lower-precision set: Wk = 2048 * sqrt(2) * cos(k*pi/16) */
#define W1 2841 /* k = 1 */
#define W2 2676 /* k = 2 */
#define W3 2408 /* k = 3 */
#define W4 2048 /* k = 4 */
#define W5 1609 /* k = 5 */
#define W6 1108 /* k = 6 */
#define W7 565  /* k = 7 */
#define ROW_SHIFT 8
#define COL_SHIFT 17
#else
/* Active set: Wk = cos(k*M_PI/16) * sqrt(2) * (1<<14) + 0.5, truncated */
#define W1 22725 /* k = 1 */
#define W2 21407 /* k = 2 */
#define W3 19266 /* k = 3 */
/*
 * k = 4: the formula above evaluates to 16384, but the table uses 16383.
 * NOTE(review): presumably a deliberate accuracy tweak inherited from the
 * ffmpeg original -- do not "correct" it without checking the output.
 */
#define W4 16383
#define W5 12873 /* k = 5 */
#define W6 8867  /* k = 6 */
#define W7 4520  /* k = 7 */
#define ROW_SHIFT 11
#define COL_SHIFT 20 /* original note here read "6"; its meaning is not evident from this file */
#endif

/*
 * Multiply / multiply-accumulate helpers used by the IDCT kernels.
 *
 * A PPC inline-assembly variant exists but causes compile problems on
 * newer PPC targets, so it is permanently disabled.  The guard was
 * originally: #if defined(ARCH_IS_PPC)
 *
 * Both variants expand to a single expression/statement WITHOUT a trailing
 * semicolon; the caller supplies it (MAC16(acc, a, b);).
 */
#if 0

/* signed 16x16 -> 32 multiply add accumulate */
#define MAC16(rt, ra, rb) \
	asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb))

/* signed 16x16 -> 32 multiply */
#define MUL16(rt, ra, rb) \
	asm ("mullhw %0, %1, %2" : "=r" (rt) : "r" (ra), "r" (rb))

#else

/*
 * Portable C fallback.  The product is computed in the promoted type of
 * (ra) and (rb).  Every argument and the whole expansion are parenthesized
 * so the macros stay correct inside larger expressions.
 */

/* multiply add accumulate: rt += ra * rb */
#define MAC16(rt, ra, rb) ((rt) += (ra) * (rb))

/* multiply: rt = ra * rb */
#define MUL16(rt, ra, rb) ((rt) = (ra) * (rb))

#endif

static __inline void idctRowCondDC (int16_t * const row) |
85 |
{ |
86 |
int a0, a1, a2, a3, b0, b1, b2, b3; |
87 |
#ifdef FAST_64BIT |
88 |
uint64_t temp; |
89 |
#else |
90 |
uint32_t temp; |
91 |
#endif |
92 |
|
93 |
#ifdef FAST_64BIT |
94 |
#ifdef ARCH_IS_BIG_ENDIAN |
95 |
#define ROW0_MASK 0xffff000000000000LL |
96 |
#else |
97 |
#define ROW0_MASK 0xffffLL |
98 |
#endif |
99 |
if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | |
100 |
((uint64_t *)row)[1]) == 0) { |
101 |
temp = (row[0] << 3) & 0xffff; |
102 |
temp += temp << 16; |
103 |
temp += temp << 32; |
104 |
((uint64_t *)row)[0] = temp; |
105 |
((uint64_t *)row)[1] = temp; |
106 |
return; |
107 |
} |
108 |
#else |
109 |
if (!(((uint32_t*)row)[1] | |
110 |
((uint32_t*)row)[2] | |
111 |
((uint32_t*)row)[3] | |
112 |
row[1])) { |
113 |
temp = (row[0] << 3) & 0xffff; |
114 |
temp += temp << 16; |
115 |
((uint32_t*)row)[0]=((uint32_t*)row)[1] = |
116 |
((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp; |
117 |
return; |
118 |
} |
119 |
#endif |
120 |
|
121 |
a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); |
122 |
a1 = a0; |
123 |
a2 = a0; |
124 |
a3 = a0; |
125 |
|
126 |
/* no need to optimize : gcc does it */ |
127 |
a0 += W2 * row[2]; |
128 |
a1 += W6 * row[2]; |
129 |
a2 -= W6 * row[2]; |
130 |
a3 -= W2 * row[2]; |
131 |
|
132 |
MUL16(b0, W1, row[1]); |
133 |
MAC16(b0, W3, row[3]); |
134 |
MUL16(b1, W3, row[1]); |
135 |
MAC16(b1, -W7, row[3]); |
136 |
MUL16(b2, W5, row[1]); |
137 |
MAC16(b2, -W1, row[3]); |
138 |
MUL16(b3, W7, row[1]); |
139 |
MAC16(b3, -W5, row[3]); |
140 |
|
141 |
#ifdef FAST_64BIT |
142 |
temp = ((uint64_t*)row)[1]; |
143 |
#else |
144 |
temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; |
145 |
#endif |
146 |
if (temp != 0) { |
147 |
a0 += W4*row[4] + W6*row[6]; |
148 |
a1 += - W4*row[4] - W2*row[6]; |
149 |
a2 += - W4*row[4] + W2*row[6]; |
150 |
a3 += W4*row[4] - W6*row[6]; |
151 |
|
152 |
MAC16(b0, W5, row[5]); |
153 |
MAC16(b0, W7, row[7]); |
154 |
|
155 |
MAC16(b1, -W1, row[5]); |
156 |
MAC16(b1, -W5, row[7]); |
157 |
|
158 |
MAC16(b2, W7, row[5]); |
159 |
MAC16(b2, W3, row[7]); |
160 |
|
161 |
MAC16(b3, W3, row[5]); |
162 |
MAC16(b3, -W1, row[7]); |
163 |
} |
164 |
|
165 |
row[0] = (a0 + b0) >> ROW_SHIFT; |
166 |
row[7] = (a0 - b0) >> ROW_SHIFT; |
167 |
row[1] = (a1 + b1) >> ROW_SHIFT; |
168 |
row[6] = (a1 - b1) >> ROW_SHIFT; |
169 |
row[2] = (a2 + b2) >> ROW_SHIFT; |
170 |
row[5] = (a2 - b2) >> ROW_SHIFT; |
171 |
row[3] = (a3 + b3) >> ROW_SHIFT; |
172 |
row[4] = (a3 - b3) >> ROW_SHIFT; |
173 |
} |
174 |
|
175 |
|
176 |
static __inline void idctSparseCol (int16_t * const col) |
177 |
{ |
178 |
int a0, a1, a2, a3, b0, b1, b2, b3; |
179 |
|
180 |
/* XXX: I did that only to give same values as previous code */ |
181 |
a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); |
182 |
a1 = a0; |
183 |
a2 = a0; |
184 |
a3 = a0; |
185 |
|
186 |
a0 += + W2*col[8*2]; |
187 |
a1 += + W6*col[8*2]; |
188 |
a2 += - W6*col[8*2]; |
189 |
a3 += - W2*col[8*2]; |
190 |
|
191 |
MUL16(b0, W1, col[8*1]); |
192 |
MUL16(b1, W3, col[8*1]); |
193 |
MUL16(b2, W5, col[8*1]); |
194 |
MUL16(b3, W7, col[8*1]); |
195 |
|
196 |
MAC16(b0, + W3, col[8*3]); |
197 |
MAC16(b1, - W7, col[8*3]); |
198 |
MAC16(b2, - W1, col[8*3]); |
199 |
MAC16(b3, - W5, col[8*3]); |
200 |
|
201 |
if(col[8*4]){ |
202 |
a0 += + W4*col[8*4]; |
203 |
a1 += - W4*col[8*4]; |
204 |
a2 += - W4*col[8*4]; |
205 |
a3 += + W4*col[8*4]; |
206 |
} |
207 |
|
208 |
if (col[8*5]) { |
209 |
MAC16(b0, + W5, col[8*5]); |
210 |
MAC16(b1, - W1, col[8*5]); |
211 |
MAC16(b2, + W7, col[8*5]); |
212 |
MAC16(b3, + W3, col[8*5]); |
213 |
} |
214 |
|
215 |
if(col[8*6]){ |
216 |
a0 += + W6*col[8*6]; |
217 |
a1 += - W2*col[8*6]; |
218 |
a2 += + W2*col[8*6]; |
219 |
a3 += - W6*col[8*6]; |
220 |
} |
221 |
|
222 |
if (col[8*7]) { |
223 |
MAC16(b0, + W7, col[8*7]); |
224 |
MAC16(b1, - W5, col[8*7]); |
225 |
MAC16(b2, + W3, col[8*7]); |
226 |
MAC16(b3, - W1, col[8*7]); |
227 |
} |
228 |
|
229 |
col[0 ] = ((a0 + b0) >> COL_SHIFT); |
230 |
col[8 ] = ((a1 + b1) >> COL_SHIFT); |
231 |
col[16] = ((a2 + b2) >> COL_SHIFT); |
232 |
col[24] = ((a3 + b3) >> COL_SHIFT); |
233 |
col[32] = ((a3 - b3) >> COL_SHIFT); |
234 |
col[40] = ((a2 - b2) >> COL_SHIFT); |
235 |
col[48] = ((a1 - b1) >> COL_SHIFT); |
236 |
col[56] = ((a0 - b0) >> COL_SHIFT); |
237 |
} |
238 |
|
239 |
/*
 * In-place inverse DCT of an 8x8 block stored row by row (64 int16_t).
 *
 * All eight rows are transformed first, then all eight columns; the
 * column pass works on the output of the row pass.
 */
void simple_idct_c(int16_t * const block)
{
	int16_t *p;

	/* row pass: rows start every 8 elements */
	for (p = block; p != block + 8*8; p += 8)
		idctRowCondDC(p);

	/* column pass: columns start at the first 8 elements */
	for (p = block; p != block + 8; p++)
		idctSparseCol(p);
}