1 |
|
;/**************************************************************************** |
2 |
|
; * |
3 |
|
; * XVID MPEG-4 VIDEO CODEC |
4 |
|
; * - MMX and XMM forward discrete cosine transform - |
5 |
|
; * |
6 |
|
; * Copyright(C) 2003 Edouard Gomez <ed.gomez@free.fr> |
7 |
|
; * |
8 |
|
; * This program is free software; you can redistribute it and/or modify it |
9 |
|
; * under the terms of the GNU General Public License as published by |
10 |
|
; * the Free Software Foundation; either version 2 of the License, or |
11 |
|
; * (at your option) any later version. |
12 |
|
; * |
13 |
|
; * This program is distributed in the hope that it will be useful, |
14 |
|
; * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
|
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
|
; * GNU General Public License for more details. |
17 |
|
; * |
18 |
|
; * You should have received a copy of the GNU General Public License |
19 |
|
; * along with this program; if not, write to the Free Software |
20 |
|
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
|
; * |
22 |
|
; * $Id$ |
23 |
|
; * |
24 |
|
; ***************************************************************************/ |
25 |
|
|
26 |
|
;/**************************************************************************** |
27 |
|
; * |
28 |
|
; * Initial, but incomplete version provided by Intel at AppNote AP-922 |
29 |
|
; * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm |
30 |
|
; * Copyright (C) 1999 Intel Corporation |
31 |
|
; * |
32 |
|
; * Completed and corrected in fdctmm32.c/fdctmm32.doc |
33 |
|
; * http://members.tripod.com/~liaor/ |
34 |
|
; * Copyright (C) 2000 - Royce Shih-Wea Liao <liaor@iname.com> |
35 |
|
; * |
36 |
|
; * Minimizing coefficients reordering changing the tables constants order |
37 |
|
; * http://ffmpeg.sourceforge.net/ |
38 |
|
; * Copyright (C) 2001 Fabrice Bellard. |
39 |
|
; * |
40 |
|
; * The version coded here is just a port to NASM syntax from the FFMPEG's |
41 |
|
; * version. So all credits go to the previous authors for all their |
42 |
|
; * respective work in order to have a nice/fast mmx fDCT. |
43 |
|
; ***************************************************************************/ |
44 |
|
|
45 |
|
BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

;; cglobal symbol
;;   Declares 'symbol' as a global.
;;   - PREFIX not defined: the symbol is exported under its own name.
;;   - PREFIX defined:     the exported name gets a leading underscore, and
;;                         'symbol' is turned into a preprocessor alias for
;;                         the underscored name, so later uses of 'symbol'
;;                         expand to the exported one.
%macro cglobal 1
  %ifndef PREFIX
    global %1
  %else
    global _%1
    %define %1 _%1
  %endif
%endmacro
59 |
|
|
60 |
|
;;; When UNROLLED_LOOP is defined, the eight row passes are emitted as
;;; straight-line code; otherwise a counted loop is generated.
%define UNROLLED_LOOP

;;; Number of extra fractional bits carried between the two passes.
%define BITS_FRW_ACC    3

;;; Left shift applied to the inputs of the column pass.
%define SHIFT_FRW_COL   BITS_FRW_ACC

;;; Arithmetic right shift applied to the 32-bit sums of the row pass.
%define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17)

;;; Rounding term for the row pass: half of one unit at SHIFT_FRW_ROW.
%define RND_FRW_ROW     (1 << (SHIFT_FRW_ROW-1))

;;; Column-pass counterpart of RND_FRW_ROW.
%define RND_FRW_COL     (1 << (SHIFT_FRW_COL-1))
68 |
|
|
69 |
|
;============================================================================= |
70 |
|
; Local Data (Read Only) |
71 |
|
;============================================================================= |
72 |
|
|
73 |
|
;; All constants go into the .rodata section. A 16-byte section alignment
;; is requested unless FORMAT_COFF is defined, in which case the section is
;; declared without an explicit alignment.
%ifndef FORMAT_COFF
SECTION .rodata data align=16
%else
SECTION .rodata data
%endif
78 |
|
|
79 |
|
;; Row-pass coefficient table: one group of 32 words (64 bytes) per block
;; row, in row order 0..7. The row macros read a group as eight vectors of
;; four words:
;;   word offsets  0,  4,  8, 12 are multiplied (pmaddwd) with the sums,
;;   word offsets 16, 20, 24, 28 are multiplied (pmaddwd) with the differences
;; of the mirrored input pairs.
;; Groups 0/4, 1/7, 2/6 and 3/5 hold identical values.
ALIGN 8
tab_frw_01234567:
  ;; row 0
  dw  16384,  16384,  -8867, -21407
  dw  16384,  16384,  21407,   8867
  dw  16384, -16384,  21407,  -8867
  dw -16384,  16384,   8867, -21407
  dw  22725,  19266, -22725, -12873
  dw  12873,   4520,  19266,  -4520
  dw  12873, -22725,  19266, -22725
  dw   4520,  19266,   4520, -12873

  ;; row 1
  dw  22725,  22725, -12299, -29692
  dw  22725,  22725,  29692,  12299
  dw  22725, -22725,  29692, -12299
  dw -22725,  22725,  12299, -29692
  dw  31521,  26722, -31521, -17855
  dw  17855,   6270,  26722,  -6270
  dw  17855, -31521,  26722, -31521
  dw   6270,  26722,   6270, -17855

  ;; row 2
  dw  21407,  21407, -11585, -27969
  dw  21407,  21407,  27969,  11585
  dw  21407, -21407,  27969, -11585
  dw -21407,  21407,  11585, -27969
  dw  29692,  25172, -29692, -16819
  dw  16819,   5906,  25172,  -5906
  dw  16819, -29692,  25172, -29692
  dw   5906,  25172,   5906, -16819

  ;; row 3
  dw  19266,  19266, -10426, -25172
  dw  19266,  19266,  25172,  10426
  dw  19266, -19266,  25172, -10426
  dw -19266,  19266,  10426, -25172
  dw  26722,  22654, -26722, -15137
  dw  15137,   5315,  22654,  -5315
  dw  15137, -26722,  22654, -26722
  dw   5315,  22654,   5315, -15137

  ;; row 4 (same values as row 0)
  dw  16384,  16384,  -8867, -21407
  dw  16384,  16384,  21407,   8867
  dw  16384, -16384,  21407,  -8867
  dw -16384,  16384,   8867, -21407
  dw  22725,  19266, -22725, -12873
  dw  12873,   4520,  19266,  -4520
  dw  12873, -22725,  19266, -22725
  dw   4520,  19266,   4520, -12873

  ;; row 5 (same values as row 3)
  dw  19266,  19266, -10426, -25172
  dw  19266,  19266,  25172,  10426
  dw  19266, -19266,  25172, -10426
  dw -19266,  19266,  10426, -25172
  dw  26722,  22654, -26722, -15137
  dw  15137,   5315,  22654,  -5315
  dw  15137, -26722,  22654, -26722
  dw   5315,  22654,   5315, -15137

  ;; row 6 (same values as row 2)
  dw  21407,  21407, -11585, -27969
  dw  21407,  21407,  27969,  11585
  dw  21407, -21407,  27969, -11585
  dw -21407,  21407,  11585, -27969
  dw  29692,  25172, -29692, -16819
  dw  16819,   5906,  25172,  -5906
  dw  16819, -29692,  25172, -29692
  dw   5906,  25172,   5906, -16819

  ;; row 7 (same values as row 1)
  dw  22725,  22725, -12299, -29692
  dw  22725,  22725,  29692,  12299
  dw  22725, -22725,  29692, -12299
  dw -22725,  22725,  12299, -29692
  dw  31521,  26722, -31521, -17855
  dw  17855,   6270,  26722,  -6270
  dw  17855, -31521,  26722, -31521
  dw   6270,  26722,   6270, -17855
152 |
|
|
153 |
|
;; Word vector of ones; the column pass ORs it into several of its results.
ALIGN 8
fdct_one_corr:
  times 4 dw 1

;; Three word vectors used as pmulhw multipliers by the column pass,
;; addressed at byte offsets 0, 8 and 16.
;; NOTE(review): the values look like tan(k*pi/16) * 2^16 for k = 1, 2, 3,
;; with the third wrapped into the signed 16-bit range - confirm.
ALIGN 8
fdct_tg_all_16:
  times 4 dw 13036
  times 4 dw 27146
  times 4 dw -21746

;; Not referenced by the code visible in this file.
ALIGN 8
cos_4_16:
  times 4 dw -19195

;; pmulhw multiplier used by the column pass.
ALIGN 8
ocos_4_16:
  times 4 dw 23170

;; Rounding term added to both 32-bit lanes of the row-pass sums before the
;; arithmetic right shift by SHIFT_FRW_ROW.
ALIGN 8
fdct_r_row:
  times 2 dd RND_FRW_ROW
174 |
|
|
175 |
|
;============================================================================= |
176 |
|
; Factorized parts of the code turned into macros for better understanding |
177 |
|
;============================================================================= |
178 |
|
|
179 |
|
;; Column pass of the forward DCT, four adjacent columns per invocation.
;; FDCT_COLUMN_COMMON out, in, column
;;   - out,    register name holding the out address
;;   - in,     register name holding the in address
;;   - column, index of the first of the four columns to process
;;
;; Rows are 16 bytes apart; the four words of row r are read from
;; [in + column*2 + r*16] and written to the same offset from out.
;; Uses mm0-mm7. Every output row is stored only after the last load of the
;; same input row, so out and in may be the same block (MAKE_FDCT_FUNC calls
;; it that way). The instruction order must therefore be kept.
;;
;; Notation used in the comments below:
;;   xN      words of input row N
;;   S       SHIFT_FRW_COL
;;   + and - saturating word add / subtract (paddsw / psubsw)
;;   K*v     pmulhw: high 16 bits of the signed 16x16 product
;;   T0 T1 T2  the vectors at fdct_tg_all_16 + 0, + 8, + 16 bytes
;;   C4      the vector at ocos_4_16
%macro FDCT_COLUMN_COMMON 3
  movq      mm0, [%2 + %3*2 + 1*16]         ; mm0 = x1
  movq      mm1, [%2 + %3*2 + 6*16]         ; mm1 = x6
  movq      mm2, mm0                        ; mm2 = x1
  movq      mm3, [%2 + %3*2 + 2*16]         ; mm3 = x2
  paddsw    mm0, mm1                        ; mm0 = x1 + x6
  movq      mm4, [%2 + %3*2 + 5*16]         ; mm4 = x5
  psllw     mm0, SHIFT_FRW_COL              ; mm0 = s16 = (x1 + x6) << S
  movq      mm5, [%2 + %3*2 + 0*16]         ; mm5 = x0
  paddsw    mm4, mm3                        ; mm4 = x5 + x2
  paddsw    mm5, [%2 + %3*2 + 7*16]         ; mm5 = x0 + x7
  psllw     mm4, SHIFT_FRW_COL              ; mm4 = s25 = (x2 + x5) << S
  movq      mm6, mm0                        ; mm6 = s16
  psubsw    mm2, mm1                        ; mm2 = x1 - x6
  movq      mm1, [fdct_tg_all_16 + 4*2]     ; mm1 = T1
  psubsw    mm0, mm4                        ; mm0 = s16 - s25
  movq      mm7, [%2 + %3*2 + 3*16]         ; mm7 = x3
  pmulhw    mm1, mm0                        ; mm1 = T1*(s16 - s25)
  paddsw    mm7, [%2 + %3*2 + 4*16]         ; mm7 = x3 + x4
  psllw     mm5, SHIFT_FRW_COL              ; mm5 = s07 = (x0 + x7) << S
  paddsw    mm6, mm4                        ; mm6 = s16 + s25
  psllw     mm7, SHIFT_FRW_COL              ; mm7 = s34 = (x3 + x4) << S
  movq      mm4, mm5                        ; mm4 = s07
  psubsw    mm5, mm7                        ; mm5 = s07 - s34
  paddsw    mm1, mm5                        ; mm1 = (s07 - s34) + T1*(s16 - s25)
  paddsw    mm4, mm7                        ; mm4 = s07 + s34
  por       mm1, [fdct_one_corr]            ; force bit 0 of every word
  psllw     mm2, SHIFT_FRW_COL + 1          ; mm2 = d16 = (x1 - x6) << (S+1)
  pmulhw    mm5, [fdct_tg_all_16 + 4*2]     ; mm5 = T1*(s07 - s34)
  movq      mm7, mm4                        ; mm7 = s07 + s34
  psubsw    mm3, [%2 + %3*2 + 5*16]         ; mm3 = x2 - x5
  psubsw    mm4, mm6                        ; mm4 = (s07 + s34) - (s16 + s25)
  movq      [%1 + %3*2 + 2*16], mm1         ; store output row 2
  paddsw    mm7, mm6                        ; mm7 = (s07 + s34) + (s16 + s25)
  movq      mm1, [%2 + %3*2 + 3*16]         ; mm1 = x3
  psllw     mm3, SHIFT_FRW_COL + 1          ; mm3 = d25 = (x2 - x5) << (S+1)
  psubsw    mm1, [%2 + %3*2 + 4*16]         ; mm1 = x3 - x4
  movq      mm6, mm2                        ; mm6 = d16
  movq      [%1 + %3*2 + 4*16], mm4         ; store output row 4
  paddsw    mm2, mm3                        ; mm2 = d16 + d25
  pmulhw    mm2, [ocos_4_16]                ; mm2 = p = C4*(d16 + d25)
  psubsw    mm6, mm3                        ; mm6 = d16 - d25
  pmulhw    mm6, [ocos_4_16]                ; mm6 = q = C4*(d16 - d25)
  psubsw    mm5, mm0                        ; mm5 = T1*(s07 - s34) - (s16 - s25)
  por       mm5, [fdct_one_corr]            ; force bit 0 of every word
  psllw     mm1, SHIFT_FRW_COL              ; mm1 = d34 = (x3 - x4) << S
  por       mm2, [fdct_one_corr]            ; force bit 0 of every word of p
  movq      mm4, mm1                        ; mm4 = d34
  movq      mm3, [%2 + %3*2 + 0*16]         ; mm3 = x0
  paddsw    mm1, mm6                        ; mm1 = u = d34 + q
  psubsw    mm3, [%2 + %3*2 + 7*16]         ; mm3 = x0 - x7
  psubsw    mm4, mm6                        ; mm4 = v = d34 - q
  movq      mm0, [fdct_tg_all_16 + 0*2]     ; mm0 = T0
  psllw     mm3, SHIFT_FRW_COL              ; mm3 = d07 = (x0 - x7) << S
  movq      mm6, [fdct_tg_all_16 + 8*2]     ; mm6 = T2
  pmulhw    mm0, mm1                        ; mm0 = T0*u
  movq      [%1 + %3*2 + 0*16], mm7         ; store output row 0
  pmulhw    mm6, mm4                        ; mm6 = T2*v
  movq      [%1 + %3*2 + 6*16], mm5         ; store output row 6
  movq      mm7, mm3                        ; mm7 = d07
  movq      mm5, [fdct_tg_all_16 + 8*2]     ; mm5 = T2
  psubsw    mm7, mm2                        ; mm7 = w = d07 - p
  paddsw    mm3, mm2                        ; mm3 = z = d07 + p
  pmulhw    mm5, mm7                        ; mm5 = T2*w
  paddsw    mm0, mm3                        ; mm0 = z + T0*u
  paddsw    mm6, mm4                        ; mm6 = v + T2*v
  pmulhw    mm3, [fdct_tg_all_16 + 0*2]     ; mm3 = T0*z
  por       mm0, [fdct_one_corr]            ; force bit 0 of every word
  paddsw    mm5, mm7                        ; mm5 = w + T2*w
  psubsw    mm7, mm6                        ; mm7 = w - (v + T2*v)
  movq      [%1 + %3*2 + 1*16], mm0         ; store output row 1
  paddsw    mm5, mm4                        ; mm5 = (w + T2*w) + v
  movq      [%1 + %3*2 + 3*16], mm7         ; store output row 3
  psubsw    mm3, mm1                        ; mm3 = T0*z - u
  movq      [%1 + %3*2 + 5*16], mm5         ; store output row 5
  movq      [%1 + %3*2 + 7*16], mm3         ; store output row 7
%endmacro
261 |
|
|
262 |
|
;; Row pass of the forward DCT for one row of eight words, using only plain
;; MMX instructions (the word reversal and dword swaps are built from
;; punpck* and a shift).
;; FDCT_ROW_MMX out, in, table
;;   - out,   address expression of the output row
;;   - in,    address expression of the input row
;;   - table, address (register or absolute) of the 32-word coefficient group
;; The whole input row is loaded before anything is stored, so out and in
;; may be the same address. Uses mm0-mm7.
;;
;; Comment notation: x0..x7 are the input words; vectors are listed with
;; the lowest word first.
%macro FDCT_ROW_MMX 3
  movd      mm1, [%2 + 6*2]             ; mm1 = [x6 x7 0 0]
  punpcklwd mm1, [%2 + 4*2]             ; mm1 = [x6 x4 x7 x5]
  movq      mm2, mm1                    ; keep a copy for the second interleave
  psrlq     mm1, 32                     ; mm1 = [x7 x5 0 0]
  movq      mm0, [%2 + 0*2]             ; mm0 = [x0 x1 x2 x3]
  punpcklwd mm1, mm2                    ; mm1 = [x7 x6 x5 x4]
  movq      mm5, mm0
  paddsw    mm0, mm1                    ; mm0 = sums  [x0+x7 x1+x6 x2+x5 x3+x4]
  psubsw    mm5, mm1                    ; mm5 = diffs [x0-x7 x1-x6 x2-x5 x3-x4]
  movq      mm1, mm0
  movq      mm6, mm5
  punpckldq mm3, mm5                    ; high dword of mm3 = low dword of diffs
  punpckhdq mm6, mm3                    ; mm6 = diffs with its two dwords swapped
  movq      mm3, [%3 + 0*2]
  movq      mm4, [%3 + 4*2]
  punpckldq mm2, mm0                    ; high dword of mm2 = low dword of sums
  pmaddwd   mm3, mm0                    ; table[0..3]   x sums
  punpckhdq mm1, mm2                    ; mm1 = sums with its two dwords swapped
  movq      mm2, [%3 + 16*2]
  pmaddwd   mm4, mm1                    ; table[4..7]   x swapped sums
  pmaddwd   mm0, [%3 + 8*2]             ; table[8..11]  x sums
  movq      mm7, [%3 + 20*2]
  pmaddwd   mm2, mm5                    ; table[16..19] x diffs
  paddd     mm3, [fdct_r_row]           ; add rounding term
  pmaddwd   mm7, mm6                    ; table[20..23] x swapped diffs
  pmaddwd   mm1, [%3 + 12*2]            ; table[12..15] x swapped sums
  paddd     mm3, mm4
  pmaddwd   mm5, [%3 + 24*2]            ; table[24..27] x diffs
  pmaddwd   mm6, [%3 + 28*2]            ; table[28..31] x swapped diffs
  paddd     mm2, mm7
  paddd     mm0, [fdct_r_row]           ; add rounding term
  psrad     mm3, SHIFT_FRW_ROW
  paddd     mm2, [fdct_r_row]           ; add rounding term
  paddd     mm0, mm1
  paddd     mm5, [fdct_r_row]           ; add rounding term
  psrad     mm2, SHIFT_FRW_ROW
  paddd     mm5, mm6
  psrad     mm0, SHIFT_FRW_ROW
  psrad     mm5, SHIFT_FRW_ROW
  packssdw  mm3, mm0                    ; four words computed from the sums
  packssdw  mm2, mm5                    ; four words computed from the diffs
  movq      mm6, mm3
  punpcklwd mm3, mm2                    ; interleave: sum-word, diff-word, ...
  punpckhwd mm6, mm2
  movq      [%1 + 0*2], mm3             ; output words 0..3
  movq      [%1 + 4*2], mm6             ; output words 4..7
%endmacro
315 |
|
|
316 |
|
;; Row pass of the forward DCT for one row of eight words, using the
;; pshufw instruction for the word reversal and the dword swaps.
;; FDCT_ROW_XMM out, in, table
;;   - out,   address expression of the output row
;;   - in,    address expression of the input row
;;   - table, address of the 32-word coefficient group
;; The whole input row is loaded before anything is stored, so out and in
;; may be the same address. Uses mm0-mm7.
;;
;; Comment notation: x0..x7 are the input words; vectors are listed with
;; the lowest word first.
%macro FDCT_ROW_XMM 3
  pshufw    mm5, [%2 + 4*2], 0x1B       ; mm5 = [x7 x6 x5 x4]
  movq      mm0, [%2 + 0*2]             ; mm0 = [x0 x1 x2 x3]
  movq      mm1, mm0
  paddsw    mm0, mm5                    ; mm0 = sums  [x0+x7 x1+x6 x2+x5 x3+x4]
  psubsw    mm1, mm5                    ; mm1 = diffs [x0-x7 x1-x6 x2-x5 x3-x4]
  pshufw    mm2, mm0, 0x4E              ; mm2 = sums with its two dwords swapped
  pshufw    mm3, mm1, 0x4E              ; mm3 = diffs with its two dwords swapped
  movq      mm4, [%3 + 0*2]
  movq      mm6, [%3 + 4*2]
  movq      mm5, [%3 + 16*2]
  movq      mm7, [%3 + 20*2]
  pmaddwd   mm4, mm0                    ; table[0..3]   x sums
  pmaddwd   mm5, mm1                    ; table[16..19] x diffs
  pmaddwd   mm6, mm2                    ; table[4..7]   x swapped sums
  pmaddwd   mm7, mm3                    ; table[20..23] x swapped diffs
  pmaddwd   mm0, [%3 + 8*2]             ; table[8..11]  x sums
  pmaddwd   mm2, [%3 + 12*2]            ; table[12..15] x swapped sums
  pmaddwd   mm1, [%3 + 24*2]            ; table[24..27] x diffs
  pmaddwd   mm3, [%3 + 28*2]            ; table[28..31] x swapped diffs
  paddd     mm4, mm6
  paddd     mm5, mm7
  paddd     mm0, mm2
  paddd     mm1, mm3
  movq      mm7, [fdct_r_row]           ; rounding term for all four sums
  paddd     mm4, mm7
  paddd     mm5, mm7
  paddd     mm0, mm7
  paddd     mm1, mm7
  psrad     mm4, SHIFT_FRW_ROW
  psrad     mm5, SHIFT_FRW_ROW
  psrad     mm0, SHIFT_FRW_ROW
  psrad     mm1, SHIFT_FRW_ROW
  packssdw  mm4, mm0                    ; four words computed from the sums
  packssdw  mm5, mm1                    ; four words computed from the diffs
  movq      mm2, mm4
  punpcklwd mm4, mm5                    ; interleave: sum-word, diff-word, ...
  punpckhwd mm2, mm5
  movq      [%1 + 0*2], mm4             ; output words 0..3
  movq      [%1 + 4*2], mm2             ; output words 4..7
%endmacro
363 |
|
|
364 |
|
;; MAKE_FDCT_FUNC name, row_macro
;;   Emits a global function 'name' that transforms a block in place:
;;   first the column pass (two invocations, four columns each), then
;;   'row_macro' once per row with that row's 64-byte group of
;;   tab_frw_01234567.
;;   - name,      symbol to define and export (through cglobal)
;;   - row_macro, row-pass macro taking (out, in, table), e.g. FDCT_ROW_MMX
;;
;; Generated function:
;;   In:    [esp + 4] = address of the block (rows 16 bytes apart)
;;   Clobb: eax, mm0-mm7; additionally ecx, edx and the flags when
;;          UNROLLED_LOOP is not defined
;;   Note:  no EMMS is executed before returning.
%macro MAKE_FDCT_FUNC 2
ALIGN 16
cglobal %1
%1:
  ;; eax = block address; it serves as both source and destination
  mov eax, [esp + 4]

  ;; Column pass, four columns at a time
  FDCT_COLUMN_COMMON eax, eax, 0        ; columns 0..3
  FDCT_COLUMN_COMMON eax, eax, 4        ; columns 4..7

%ifdef UNROLLED_LOOP
  ;; Row pass, unrolled: row i lives at eax + 16*i and uses the
  ;; coefficient group at tab_frw_01234567 + 64*i
  %assign i 0
  %rep 8
  %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
    %assign i i+1
  %endrep
%else
  ;; Row pass, looped: ecx = rows left, eax = current row,
  ;; edx = current coefficient group
  mov ecx, 8
  mov edx, tab_frw_01234567
ALIGN 8
.loop:
  %2 eax, eax, edx
  add eax, 2*8                          ; next row (8 words)
  add edx, 2*32                         ; next coefficient group (32 words)
  dec ecx
  jne .loop
%endif

  ret
%endmacro
397 |
|
|
398 |
|
;============================================================================= |
399 |
|
; Code |
400 |
|
;============================================================================= |
401 |
|
|
402 |
|
SECTION .text

;-----------------------------------------------------------------------------
; void fdct_mmx_ffmpeg(int16_t block[64]);
;
; Forward DCT of 'block', computed in place. Built by MAKE_FDCT_FUNC with
; FDCT_ROW_MMX as the row pass.
;-----------------------------------------------------------------------------

MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW_MMX
409 |
|
|
410 |
|
;-----------------------------------------------------------------------------
; void fdct_xmm_ffmpeg(int16_t block[64]);
;
; Forward DCT of 'block', computed in place. Built by MAKE_FDCT_FUNC with
; FDCT_ROW_XMM (the pshufw-based row pass).
;-----------------------------------------------------------------------------

MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM