;/****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - SSE2 inverse discrete cosine transform -
; *
; * Copyright(C) 2002 Dmitry Rozhdestvensky
; *
; * This program is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id$
; *
; ***************************************************************************/

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%macro cglobal 1
  %ifdef PREFIX
    global _%1
    %define %1 _%1
  %else
    global %1
  %endif
%endmacro
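
; cglobal declares a global symbol, prepending an underscore when PREFIX is
; defined (for targets whose C symbols are underscore-prefixed, e.g. Win32
; COFF), so the same source assembles for either naming convention.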

%define BITS_INV_ACC  5                         ; 4 or 5 for IEEE
%define SHIFT_INV_ROW 16 - BITS_INV_ACC
%define SHIFT_INV_COL 1 + BITS_INV_ACC
%define RND_INV_ROW   1024 * (6 - BITS_INV_ACC) ; 1 << (SHIFT_INV_ROW-1)
%define RND_INV_COL   16 * (BITS_INV_ACC - 3)   ; 1 << (SHIFT_INV_COL-1)
%define RND_INV_CORR  RND_INV_COL - 1           ; correction -1.0 and round
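
; Worked values for the current setting BITS_INV_ACC = 5:
;   SHIFT_INV_ROW = 11, RND_INV_ROW = 1024 = 1 << 10
;   SHIFT_INV_COL = 6,  RND_INV_COL = 32 = 1 << 5,  RND_INV_CORR = 31
; The two products above equal the intended 1 << (SHIFT-1) for both
; supported settings, BITS_INV_ACC = 4 and BITS_INV_ACC = 5.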

%define BITS_FRW_ACC  3                           ; 2 or 3 for accuracy
%define SHIFT_FRW_COL BITS_FRW_ACC
%define SHIFT_FRW_ROW BITS_FRW_ACC + 17
%define RND_FRW_ROW   262144 * (BITS_FRW_ACC - 1) ; 1 << (SHIFT_FRW_ROW-1)
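
; Note: the forward-transform constants above are not referenced in this
; file, which implements only the inverse transform. For BITS_FRW_ACC = 3:
; SHIFT_FRW_ROW = 20 and RND_FRW_ROW = 524288 = 1 << 19, matching the
; 1 << (SHIFT_FRW_ROW-1) stated in the comment.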

;=============================================================================
; Local Data (Read Only)
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif

ALIGN 16
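; Table for rows 0,4 - constants are multiplied by cos_4_16
; (each table stores 32 signed words w00..w31, ordered so that pmaddwd
; against the shuffled row produces four dword dot products per instruction;
; see DCT_8_INV_ROW_1_SSE2 below)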
tab_i_04:
  dw  16384,  21407,  16384,   8867 ; movq-> w05 w04 w01 w00
  dw  16384,  -8867,  16384, -21407 ;        w13 w12 w09 w08
  dw  16384,   8867, -16384, -21407 ;        w07 w06 w03 w02
  dw -16384,  21407,  16384,  -8867 ;        w15 w14 w11 w10
  dw  22725,  19266,  19266,  -4520 ;        w21 w20 w17 w16
  dw  12873, -22725,   4520, -12873 ;        w29 w28 w25 w24
  dw  12873,   4520, -22725, -12873 ;        w23 w22 w19 w18
  dw   4520,  19266,  19266, -22725 ;        w31 w30 w27 w26

; Table for rows 1,7 - constants are multiplied by cos_1_16
tab_i_17:
  dw  22725,  29692,  22725,  12299 ; movq-> w05 w04 w01 w00
  dw  22725, -12299,  22725, -29692 ;        w13 w12 w09 w08
  dw  22725,  12299, -22725, -29692 ;        w07 w06 w03 w02
  dw -22725,  29692,  22725, -12299 ;        w15 w14 w11 w10
  dw  31521,  26722,  26722,  -6270 ;        w21 w20 w17 w16
  dw  17855, -31521,   6270, -17855 ;        w29 w28 w25 w24
  dw  17855,   6270, -31521, -17855 ;        w23 w22 w19 w18
  dw   6270,  26722,  26722, -31521 ;        w31 w30 w27 w26

; Table for rows 2,6 - constants are multiplied by cos_2_16
tab_i_26:
  dw  21407,  27969,  21407,  11585 ; movq-> w05 w04 w01 w00
  dw  21407, -11585,  21407, -27969 ;        w13 w12 w09 w08
  dw  21407,  11585, -21407, -27969 ;        w07 w06 w03 w02
  dw -21407,  27969,  21407, -11585 ;        w15 w14 w11 w10
  dw  29692,  25172,  25172,  -5906 ;        w21 w20 w17 w16
  dw  16819, -29692,   5906, -16819 ;        w29 w28 w25 w24
  dw  16819,   5906, -29692, -16819 ;        w23 w22 w19 w18
  dw   5906,  25172,  25172, -29692 ;        w31 w30 w27 w26

; Table for rows 3,5 - constants are multiplied by cos_3_16
tab_i_35:
  dw  19266,  25172,  19266,  10426 ; movq-> w05 w04 w01 w00
  dw  19266, -10426,  19266, -25172 ;        w13 w12 w09 w08
  dw  19266,  10426, -19266, -25172 ;        w07 w06 w03 w02
  dw -19266,  25172,  19266, -10426 ;        w15 w14 w11 w10
  dw  26722,  22654,  22654,  -5315 ;        w21 w20 w17 w16
  dw  15137, -26722,   5315, -15137 ;        w29 w28 w25 w24
  dw  15137,   5315, -26722, -15137 ;        w23 w22 w19 w18
  dw   5315,  22654,  22654, -26722 ;        w31 w30 w27 w26

%if SHIFT_INV_ROW == 12   ; i.e. BITS_INV_ACC == 4
rounder_2_0: dd 65536, 65536
             dd 65536, 65536
rounder_2_4: dd     0,     0
             dd     0,     0
rounder_2_1: dd  7195,  7195
             dd  7195,  7195
rounder_2_7: dd  1024,  1024
             dd  1024,  1024
rounder_2_2: dd  4520,  4520
             dd  4520,  4520
rounder_2_6: dd  1024,  1024
             dd  1024,  1024
rounder_2_3: dd  2407,  2407
             dd  2407,  2407
rounder_2_5: dd   240,   240
             dd   240,   240

%elif SHIFT_INV_ROW == 11 ; i.e. BITS_INV_ACC == 5 (the current setting)
rounder_2_0: dd 65536, 65536
             dd 65536, 65536
rounder_2_4: dd     0,     0
             dd     0,     0
rounder_2_1: dd  3597,  3597
             dd  3597,  3597
rounder_2_7: dd   512,   512
             dd   512,   512
rounder_2_2: dd  2260,  2260
             dd  2260,  2260
rounder_2_6: dd   512,   512
             dd   512,   512
rounder_2_3: dd  1203,  1203
             dd  1203,  1203
rounder_2_5: dd   120,   120
             dd   120,   120
%else
%error Invalid SHIFT_INV_ROW specified
%endif
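
; Each rounder_2_N holds one dword value replicated four times because the
; SSE2 row macro adds the rounder as a full 16-byte vector (see the note in
; DCT_8_INV_ROW_1_SSE2); rounder_2_N is the bias added to row N before the
; SHIFT_INV_ROW right shift.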

tg_1_16:   dw  13036,  13036,  13036,  13036 ; tan(pi/16) * (1 << 16) + 0.5
           dw  13036,  13036,  13036,  13036
tg_2_16:   dw  27146,  27146,  27146,  27146 ; tan(2*pi/16) * (1 << 16) + 0.5
           dw  27146,  27146,  27146,  27146
tg_3_16:   dw -21746, -21746, -21746, -21746 ; (tan(3*pi/16) - 1) * (1 << 16)
           dw -21746, -21746, -21746, -21746
ocos_4_16: dw  23170,  23170,  23170,  23170 ; cos(4*pi/16) * (1 << 15) + 0.5
           dw  23170,  23170,  23170,  23170
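
; pmulhw keeps only the signed high 16 bits of each 16x16 product, i.e.
; (a * b) >> 16, so a multiply by a tg_* word yields a * tan(...) directly.
; ocos_4_16 is scaled by 2^15 rather than 2^16, which is why the column
; macro doubles the products involving it (paddsw xmm4, xmm4 / xmm5, xmm5).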

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal idct_sse2_dmitry

;-----------------------------------------------------------------------------
; Helper macro - ROW iDCT
;-----------------------------------------------------------------------------
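;
; DCT_8_INV_ROW_1_SSE2 src, dst, table, rounder
;   One 8-point inverse DCT over a row of eight 16-bit coefficients.
;   pmaddwd forms the even/odd dword dot products a[0..3] and b[0..3]
;   from the shuffled input and the 64-byte table, then
;     y[j]   = (a[j] + rounder + b[j]) >> SHIFT_INV_ROW   (j = 0..3)
;     y[7-j] = (a[j] + rounder - b[j]) >> SHIFT_INV_ROW
;   %1 = source row, %2 = destination row (callers pass the same address),
;   %3 = coefficient table (tab_i_*), %4 = rounder (four equal dwords).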

%macro DCT_8_INV_ROW_1_SSE2 4
  pshufhw  xmm1, [%1], 11011000b ; x 7 5 6 4 3 2 1 0
  pshuflw  xmm1, xmm1, 11011000b ; x 7 5 6 4 3 1 2 0
  pshufd   xmm0, xmm1, 00000000b ; x 2 0 2 0 2 0 2 0
  pmaddwd  xmm0, [%3]            ; w13 w12 w09 w08  w05 w04 w01 w00
                                 ; a 3..0 first part

  pshufd   xmm2, xmm1, 10101010b ; x 6 4 6 4 6 4 6 4
  pmaddwd  xmm2, [%3+16]         ; w15 w14 w11 w10  w07 w06 w03 w02
                                 ; a 3..0 second part

  paddd    xmm2, xmm0            ; a 3..0 ready
  paddd    xmm2, [%4]            ; rounder must be 4 dwords wide, not 2 as for SSE1
  movdqa   xmm5, xmm2

  pshufd   xmm3, xmm1, 01010101b ; x 3 1 3 1 3 1 3 1
  pmaddwd  xmm3, [%3+32]         ; w29 w28 w25 w24  w21 w20 w17 w16
                                 ; b 3..0 first part

  pshufd   xmm4, xmm1, 11111111b ; x 7 5 7 5 7 5 7 5
  pmaddwd  xmm4, [%3+48]         ; w31 w30 w27 w26  w23 w22 w19 w18
                                 ; b 3..0 second part
  paddd    xmm3, xmm4            ; b 3..0 ready

  paddd    xmm2, xmm3            ; will be y 3..0
  psubd    xmm5, xmm3            ; will be y 4..7
  psrad    xmm2, SHIFT_INV_ROW
  psrad    xmm5, SHIFT_INV_ROW
  packssdw xmm2, xmm5            ; y 4 5 6 7 3 2 1 0
  pshufhw  xmm6, xmm2, 00011011b ; y 7 6 5 4 3 2 1 0
  movdqa   [%2], xmm6
%endmacro

;-----------------------------------------------------------------------------
; Helper macro - Columns iDCT
;-----------------------------------------------------------------------------
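;
; DCT_8_INV_COL_4_SSE2 src, dst
;   One 8-point inverse DCT down all eight columns at once; each xmm
;   register holds one row, i.e. one 16-bit value from every column.
;   %1 = source block, %2 = destination block (the caller passes the same
;   buffer, so the pass runs in place). The even-part results a0..a3 are
;   parked in destination rows 0, 6, 2 and 4 to free registers for the odd
;   part, then reloaded, combined with b0..b3 and shifted by SHIFT_INV_COL.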

%macro DCT_8_INV_COL_4_SSE2 2
  movdqa xmm0, [%1+16*0]      ; x0 (all columns)
  movdqa xmm2, [%1+16*4]      ; x4
  movdqa xmm1, xmm0

  movdqa xmm4, [%1+16*2]      ; x2
  movdqa xmm5, [%1+16*6]      ; x6
  movdqa xmm6, [tg_2_16]
  movdqa xmm7, xmm6

  paddsw xmm0, xmm2           ; u04 = x0 + x4
  psubsw xmm1, xmm2           ; v04 = x0 - x4
  movdqa xmm3, xmm0
  movdqa xmm2, xmm1

  pmulhw xmm6, xmm4           ; x2 * T2
  pmulhw xmm7, xmm5           ; x6 * T2
  psubsw xmm6, xmm5           ; v26 = x2 * T2 - x6
  paddsw xmm7, xmm4           ; u26 = x6 * T2 + x2

  paddsw xmm1, xmm6           ; a1 = v04 + v26
  paddsw xmm0, xmm7           ; a0 = u04 + u26
  psubsw xmm2, xmm6           ; a2 = v04 - v26
  psubsw xmm3, xmm7           ; a3 = u04 - u26

  movdqa [%2+16*0], xmm0      ; store a0-a3 to
  movdqa [%2+16*6], xmm1      ; free registers
  movdqa [%2+16*2], xmm2
  movdqa [%2+16*4], xmm3

  movdqa xmm0, [%1+16*1]      ; x1
  movdqa xmm1, [%1+16*7]      ; x7
  movdqa xmm2, [tg_1_16]
  movdqa xmm3, xmm2

  movdqa xmm4, [%1+16*3]      ; x3
  movdqa xmm5, [%1+16*5]      ; x5
  movdqa xmm6, [tg_3_16]
  movdqa xmm7, xmm6

  pmulhw xmm2, xmm0           ; x1 * T1
  pmulhw xmm3, xmm1           ; x7 * T1
  psubsw xmm2, xmm1           ; v17 = x1 * T1 - x7
  paddsw xmm3, xmm0           ; u17 = x7 * T1 + x1
  movdqa xmm0, xmm3           ; u17
  movdqa xmm1, xmm2           ; v17

  pmulhw xmm6, xmm4           ; x3 * (T3 - 1)
  pmulhw xmm7, xmm5           ; x5 * (T3 - 1)
  paddsw xmm6, xmm4           ; x3 * T3
  paddsw xmm7, xmm5           ; x5 * T3
  psubsw xmm6, xmm5           ; v35 = x3 * T3 - x5
  paddsw xmm7, xmm4           ; u35 = x5 * T3 + x3

  movdqa xmm4, [ocos_4_16]

  paddsw xmm0, xmm7           ; b0 = u17 + u35
  psubsw xmm1, xmm6           ; b3 = v17 - v35
  psubsw xmm3, xmm7           ; u12 = u17 - u35
  paddsw xmm2, xmm6           ; v12 = v17 + v35

  movdqa xmm5, xmm3
  paddsw xmm3, xmm2           ; tb1 = u12 + v12
  psubsw xmm5, xmm2           ; tb2 = u12 - v12
  pmulhw xmm5, xmm4           ; tb2 * cos_4_16 / 2
  pmulhw xmm4, xmm3           ; tb1 * cos_4_16 / 2
  paddsw xmm5, xmm5           ; b2 = tb2 * cos_4_16
  paddsw xmm4, xmm4           ; b1 = tb1 * cos_4_16

  movdqa xmm6, [%2+16*0]      ; a0
  movdqa xmm7, xmm6
  movdqa xmm2, [%2+16*4]      ; a3
  movdqa xmm3, xmm2

  paddsw xmm6, xmm0
  psubsw xmm7, xmm0
  psraw  xmm6, SHIFT_INV_COL  ; y0 = a0 + b0
  psraw  xmm7, SHIFT_INV_COL  ; y7 = a0 - b0
  movdqa [%2+16*0], xmm6
  movdqa [%2+16*7], xmm7

  paddsw xmm2, xmm1
  psubsw xmm3, xmm1
  psraw  xmm2, SHIFT_INV_COL  ; y3 = a3 + b3
  psraw  xmm3, SHIFT_INV_COL  ; y4 = a3 - b3
  movdqa [%2+16*3], xmm2
  movdqa [%2+16*4], xmm3

  movdqa xmm0, [%2+16*6]      ; a1
  movdqa xmm1, xmm0
  movdqa xmm6, [%2+16*2]      ; a2
  movdqa xmm7, xmm6

  paddsw xmm0, xmm4
  psubsw xmm1, xmm4
  psraw  xmm0, SHIFT_INV_COL  ; y1 = a1 + b1
  psraw  xmm1, SHIFT_INV_COL  ; y6 = a1 - b1
  movdqa [%2+16*1], xmm0
  movdqa [%2+16*6], xmm1

  paddsw xmm6, xmm5
  psubsw xmm7, xmm5
  psraw  xmm6, SHIFT_INV_COL  ; y2 = a2 + b2
  psraw  xmm7, SHIFT_INV_COL  ; y5 = a2 - b2
  movdqa [%2+16*2], xmm6
  movdqa [%2+16*5], xmm7
%endmacro

;-----------------------------------------------------------------------------
; void idct_sse2_dmitry(int16_t coeff[64]);
;-----------------------------------------------------------------------------

ALIGN 16
idct_sse2_dmitry:

  mov eax, [esp + 4]             ; pointer to the coefficient block

  ; row pass: rows k and 8-k share one coefficient table,
  ; each row gets its own rounder
  DCT_8_INV_ROW_1_SSE2 eax+  0, eax+  0, tab_i_04, rounder_2_0
  DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, tab_i_17, rounder_2_1
  DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, tab_i_26, rounder_2_2
  DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, tab_i_35, rounder_2_3
  DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, tab_i_04, rounder_2_4
  DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, tab_i_35, rounder_2_5
  DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, tab_i_26, rounder_2_6
  DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, tab_i_17, rounder_2_7

  DCT_8_INV_COL_4_SSE2 eax, eax  ; column pass, in place

  ret
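
; Usage sketch (illustrative; block_ptr is a placeholder, not part of this
; file). The routine takes its single argument on the stack and leaves
; cleanup to the caller, cdecl-style, and transforms the block in place;
; the block must be 16-byte aligned since both passes use movdqa.
;
;   push block_ptr           ; int16_t[64], row-major, 16-byte aligned
;   call idct_sse2_dmitry
;   add  esp, 4              ; caller removes the argument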