/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
/*
 * Based upon some commented-out C code from mpeg2dec (idct_mmx.c,
 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>).
 */
#include <string.h>

#include "../portab.h"
#include "idct.h"

/*
 * Fixed-point cosine coefficients and post-pass shifts.
 *
 * Wi approximates cos(i*pi/16) * sqrt(2) * 2^k. ROW_SHIFT and COL_SHIFT are
 * the right shifts applied to the accumulated sums after the row pass and
 * the column pass respectively.
 */
#if 0
/* Lower-precision set (k = 11); disabled, kept for reference. */
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
#define ROW_SHIFT 8
#define COL_SHIFT 17
#else
/* Active set (k = 14): Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated. */
#define W1 22725
#define W2 21407
#define W3 19266
/*
 * NOTE(review): the formula above yields 16384 for i = 4; the code uses
 * 16383. Presumably deliberate -- confirm before "correcting" it, since the
 * output values depend on it.
 */
#define W4 16383
#define W5 12873
#define W6 8867
#define W7 4520
#define ROW_SHIFT 11
#define COL_SHIFT 20 /* the original source carried an unexplained "6" here */
#endif

/*
 * MUL16(rt, ra, rb): rt  = ra * rb
 * MAC16(rt, ra, rb): rt += ra * rb
 *
 * Both are described by the original author as signed 16x16 -> 32 bit
 * operations. rt must be an lvalue. Each macro expands to exactly one
 * statement (without a trailing semicolon), so it is safe inside an
 * unbraced if/else.
 */
#if defined(ARCH_IS_PPC)

/* signed 16x16 -> 32 multiply add accumulate */
#define MAC16(rt, ra, rb) \
    do { \
        __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); \
    } while (0)

/* signed 16x16 -> 32 multiply */
#define MUL16(rt, ra, rb) \
    do { \
        __asm__ ("mullhw %0, %1, %2" : "=r" (rt) : "r" (ra), "r" (rb)); \
    } while (0)

#else

/* signed 16x16 -> 32 multiply add accumulate */
#define MAC16(rt, ra, rb) ((rt) += (ra) * (rb))

/* signed 16x16 -> 32 multiply */
#define MUL16(rt, ra, rb) ((rt) = (ra) * (rb))

#endif

69 |
|
static __inline void idctRowCondDC (int16_t * const row) |
70 |
|
{ |
71 |
|
int a0, a1, a2, a3, b0, b1, b2, b3; |
72 |
|
#ifdef FAST_64BIT |
73 |
|
uint64_t temp; |
74 |
|
#else |
75 |
|
uint32_t temp; |
76 |
|
#endif |
77 |
|
|
78 |
|
#ifdef FAST_64BIT |
79 |
|
#ifdef ARCH_IS_BIG_ENDIAN |
80 |
|
#define ROW0_MASK 0xffff000000000000LL |
81 |
|
#else |
82 |
|
#define ROW0_MASK 0xffffLL |
83 |
|
#endif |
84 |
|
if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | |
85 |
|
((uint64_t *)row)[1]) == 0) { |
86 |
|
temp = (row[0] << 3) & 0xffff; |
87 |
|
temp += temp << 16; |
88 |
|
temp += temp << 32; |
89 |
|
((uint64_t *)row)[0] = temp; |
90 |
|
((uint64_t *)row)[1] = temp; |
91 |
|
return; |
92 |
|
} |
93 |
|
#else |
94 |
|
if (!(((uint32_t*)row)[1] | |
95 |
|
((uint32_t*)row)[2] | |
96 |
|
((uint32_t*)row)[3] | |
97 |
|
row[1])) { |
98 |
|
temp = (row[0] << 3) & 0xffff; |
99 |
|
temp += temp << 16; |
100 |
|
((uint32_t*)row)[0]=((uint32_t*)row)[1] = |
101 |
|
((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp; |
102 |
|
return; |
103 |
|
} |
104 |
|
#endif |
105 |
|
|
106 |
|
a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); |
107 |
|
a1 = a0; |
108 |
|
a2 = a0; |
109 |
|
a3 = a0; |
110 |
|
|
111 |
|
/* no need to optimize : gcc does it */ |
112 |
|
a0 += W2 * row[2]; |
113 |
|
a1 += W6 * row[2]; |
114 |
|
a2 -= W6 * row[2]; |
115 |
|
a3 -= W2 * row[2]; |
116 |
|
|
117 |
|
MUL16(b0, W1, row[1]); |
118 |
|
MAC16(b0, W3, row[3]); |
119 |
|
MUL16(b1, W3, row[1]); |
120 |
|
MAC16(b1, -W7, row[3]); |
121 |
|
MUL16(b2, W5, row[1]); |
122 |
|
MAC16(b2, -W1, row[3]); |
123 |
|
MUL16(b3, W7, row[1]); |
124 |
|
MAC16(b3, -W5, row[3]); |
125 |
|
|
126 |
|
#ifdef FAST_64BIT |
127 |
|
temp = ((uint64_t*)row)[1]; |
128 |
|
#else |
129 |
|
temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; |
130 |
|
#endif |
131 |
|
if (temp != 0) { |
132 |
|
a0 += W4*row[4] + W6*row[6]; |
133 |
|
a1 += - W4*row[4] - W2*row[6]; |
134 |
|
a2 += - W4*row[4] + W2*row[6]; |
135 |
|
a3 += W4*row[4] - W6*row[6]; |
136 |
|
|
137 |
|
MAC16(b0, W5, row[5]); |
138 |
|
MAC16(b0, W7, row[7]); |
139 |
|
|
140 |
|
MAC16(b1, -W1, row[5]); |
141 |
|
MAC16(b1, -W5, row[7]); |
142 |
|
|
143 |
|
MAC16(b2, W7, row[5]); |
144 |
|
MAC16(b2, W3, row[7]); |
145 |
|
|
146 |
|
MAC16(b3, W3, row[5]); |
147 |
|
MAC16(b3, -W1, row[7]); |
148 |
|
} |
149 |
|
|
150 |
|
row[0] = (a0 + b0) >> ROW_SHIFT; |
151 |
|
row[7] = (a0 - b0) >> ROW_SHIFT; |
152 |
|
row[1] = (a1 + b1) >> ROW_SHIFT; |
153 |
|
row[6] = (a1 - b1) >> ROW_SHIFT; |
154 |
|
row[2] = (a2 + b2) >> ROW_SHIFT; |
155 |
|
row[5] = (a2 - b2) >> ROW_SHIFT; |
156 |
|
row[3] = (a3 + b3) >> ROW_SHIFT; |
157 |
|
row[4] = (a3 - b3) >> ROW_SHIFT; |
158 |
|
} |
159 |
|
|
160 |
|
|
161 |
|
static __inline void idctSparseCol (int16_t * const col) |
162 |
|
{ |
163 |
|
int a0, a1, a2, a3, b0, b1, b2, b3; |
164 |
|
|
165 |
|
/* XXX: I did that only to give same values as previous code */ |
166 |
|
a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); |
167 |
|
a1 = a0; |
168 |
|
a2 = a0; |
169 |
|
a3 = a0; |
170 |
|
|
171 |
|
a0 += + W2*col[8*2]; |
172 |
|
a1 += + W6*col[8*2]; |
173 |
|
a2 += - W6*col[8*2]; |
174 |
|
a3 += - W2*col[8*2]; |
175 |
|
|
176 |
|
MUL16(b0, W1, col[8*1]); |
177 |
|
MUL16(b1, W3, col[8*1]); |
178 |
|
MUL16(b2, W5, col[8*1]); |
179 |
|
MUL16(b3, W7, col[8*1]); |
180 |
|
|
181 |
|
MAC16(b0, + W3, col[8*3]); |
182 |
|
MAC16(b1, - W7, col[8*3]); |
183 |
|
MAC16(b2, - W1, col[8*3]); |
184 |
|
MAC16(b3, - W5, col[8*3]); |
185 |
|
|
186 |
|
if(col[8*4]){ |
187 |
|
a0 += + W4*col[8*4]; |
188 |
|
a1 += - W4*col[8*4]; |
189 |
|
a2 += - W4*col[8*4]; |
190 |
|
a3 += + W4*col[8*4]; |
191 |
|
} |
192 |
|
|
193 |
|
if (col[8*5]) { |
194 |
|
MAC16(b0, + W5, col[8*5]); |
195 |
|
MAC16(b1, - W1, col[8*5]); |
196 |
|
MAC16(b2, + W7, col[8*5]); |
197 |
|
MAC16(b3, + W3, col[8*5]); |
198 |
|
} |
199 |
|
|
200 |
|
if(col[8*6]){ |
201 |
|
a0 += + W6*col[8*6]; |
202 |
|
a1 += - W2*col[8*6]; |
203 |
|
a2 += + W2*col[8*6]; |
204 |
|
a3 += - W6*col[8*6]; |
205 |
|
} |
206 |
|
|
207 |
|
if (col[8*7]) { |
208 |
|
MAC16(b0, + W7, col[8*7]); |
209 |
|
MAC16(b1, - W5, col[8*7]); |
210 |
|
MAC16(b2, + W3, col[8*7]); |
211 |
|
MAC16(b3, - W1, col[8*7]); |
212 |
|
} |
213 |
|
|
214 |
|
col[0 ] = ((a0 + b0) >> COL_SHIFT); |
215 |
|
col[8 ] = ((a1 + b1) >> COL_SHIFT); |
216 |
|
col[16] = ((a2 + b2) >> COL_SHIFT); |
217 |
|
col[24] = ((a3 + b3) >> COL_SHIFT); |
218 |
|
col[32] = ((a3 - b3) >> COL_SHIFT); |
219 |
|
col[40] = ((a2 - b2) >> COL_SHIFT); |
220 |
|
col[48] = ((a1 - b1) >> COL_SHIFT); |
221 |
|
col[56] = ((a0 - b0) >> COL_SHIFT); |
222 |
|
} |
223 |
|
|
/*
 * In-place inverse DCT of an 8x8 block of 16-bit coefficients.
 *
 * Applies idctRowCondDC to each of the eight rows (block + 8*i) and then
 * idctSparseCol to each of the eight columns (block + i).
 *
 * block: 64 coefficients, overwritten with the result.
 */
void simple_idct_c(int16_t * const block)
{
    int i;

    for (i = 0; i < 8; i++) {
        idctRowCondDC(block + i*8);
    }

    for (i = 0; i < 8; i++) {
        idctSparseCol(block + i);
    }
}

/*
 * Input permutation for simple_idct_mmx.
 *
 * Entry i is the position in the permuted buffer that receives coefficient
 * i of the natural-order block. The table contains every index 0..63
 * exactly once.
 */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

#if defined(ARCH_IS_IA32)
/*
 * Wrapper around simple_idct_mmx, which expects its input to be permuted.
 *
 * Copies the block into a temporary buffer in permuted order
 * (tmp[simple_mmx_permutation[i]] = block[i]), runs simple_idct_mmx on that
 * buffer and copies the 64 results back to block unchanged.
 *
 * block: 64 coefficients in natural order, overwritten with the result.
 */
void simple_idct_mmx2(int16_t * const block)
{
    int16_t tmp[64];
    int i;

    for (i = 0; i < 64; i++) {
        tmp[simple_mmx_permutation[i]] = block[i];
    }

    simple_idct_mmx(tmp);

    /* straight copy back; the output needs no reordering */
    memcpy(block, tmp, sizeof(tmp));
}
#endif