;/****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - MMX and XMM forward discrete cosine transform -
; *
; * Copyright(C) 2003 Edouard Gomez <ed.gomez@free.fr>
; *
; * This program is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id$
; *
; ***************************************************************************/

;/****************************************************************************
; *
; * Initial, but incomplete version provided by Intel at AppNote AP-922
; * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
; * Copyright (C) 1999 Intel Corporation
; *
; * Completed and corrected in fdctmm32.c/fdctmm32.doc
; * http://members.tripod.com/~liaor/
; * Copyright (C) 2000 - Royce Shih-Wea Liao <liaor@iname.com>
; *
; * Coefficient reordering minimized by changing the order of the table
; * constants
; * http://ffmpeg.sourceforge.net/
; * Copyright (C) 2001 Fabrice Bellard.
; *
; * The version coded here is just a port to NASM syntax of the FFMPEG
; * version. So all credits go to the previous authors for all their
; * respective work in order to have a nice/fast mmx fDCT.
; ***************************************************************************/

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

;; cglobal <symbol>
;;   Declares <symbol> as a global (exported) label.
;;   - PREFIX not defined: the symbol is exported under its plain name.
;;   - PREFIX defined:     the symbol is exported with a leading underscore,
;;                         and <symbol> itself is redefined to expand to the
;;                         underscored name, so every later use of the plain
;;                         name in this file (including the label definition)
;;                         resolves to the exported one.
%macro cglobal 1
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
%endmacro

;;; Define this if you want an unrolled version of the code
%define UNROLLED_LOOP

;; Fixed-point scaling used by the two passes:
;;   BITS_FRW_ACC   - base number of accuracy bits (both shifts derive from it)
;;   SHIFT_FRW_COL  - left shift applied to the inputs in the column pass
;;   SHIFT_FRW_ROW  - arithmetic right shift applied to the 32-bit sums at the
;;                    end of the row pass
;;   RND_FRW_COL    - half of (1 << SHIFT_FRW_COL); not referenced by the code
;;                    visible in this file
;;   RND_FRW_ROW    - half of (1 << SHIFT_FRW_ROW); added before the row-pass
;;                    right shift (stored in fdct_r_row)
%define BITS_FRW_ACC    3
%define SHIFT_FRW_COL   BITS_FRW_ACC
%define SHIFT_FRW_ROW   (17 + BITS_FRW_ACC)
%define RND_FRW_COL     (1 << (SHIFT_FRW_COL-1))
%define RND_FRW_ROW     (1 << (SHIFT_FRW_ROW-1))

;=============================================================================
; Local Data (Read Only)
;=============================================================================

SECTION .rodata

;; Row-pass coefficient table.
;;
;; Layout: 8 sub-tables of 32 words (64 bytes) each, one per block row; the
;; row pass addresses sub-table i as tab_frw_01234567 + 2*32*i.  Inside a
;; sub-table, words 0..15 are multiplied (pmaddwd) with the row's sums and
;; words 16..31 with the row's differences.
;;
;; Only four distinct coefficient sets exist: rows 0 and 4 share one, as do
;; rows 1 and 7, rows 2 and 6, and rows 3 and 5.
ALIGN 8
tab_frw_01234567:
    ;; row 0
    dw  16384,  16384,  -8867, -21407
    dw  16384,  16384,  21407,   8867
    dw  16384, -16384,  21407,  -8867
    dw -16384,  16384,   8867, -21407
    dw  22725,  19266, -22725, -12873
    dw  12873,   4520,  19266,  -4520
    dw  12873, -22725,  19266, -22725
    dw   4520,  19266,   4520, -12873

    ;; row 1
    dw  22725,  22725, -12299, -29692
    dw  22725,  22725,  29692,  12299
    dw  22725, -22725,  29692, -12299
    dw -22725,  22725,  12299, -29692
    dw  31521,  26722, -31521, -17855
    dw  17855,   6270,  26722,  -6270
    dw  17855, -31521,  26722, -31521
    dw   6270,  26722,   6270, -17855

    ;; row 2
    dw  21407,  21407, -11585, -27969
    dw  21407,  21407,  27969,  11585
    dw  21407, -21407,  27969, -11585
    dw -21407,  21407,  11585, -27969
    dw  29692,  25172, -29692, -16819
    dw  16819,   5906,  25172,  -5906
    dw  16819, -29692,  25172, -29692
    dw   5906,  25172,   5906, -16819

    ;; row 3
    dw  19266,  19266, -10426, -25172
    dw  19266,  19266,  25172,  10426
    dw  19266, -19266,  25172, -10426
    dw -19266,  19266,  10426, -25172
    dw  26722,  22654, -26722, -15137
    dw  15137,   5315,  22654,  -5315
    dw  15137, -26722,  22654, -26722
    dw   5315,  22654,   5315, -15137

    ;; row 4 (same coefficients as row 0)
    dw  16384,  16384,  -8867, -21407
    dw  16384,  16384,  21407,   8867
    dw  16384, -16384,  21407,  -8867
    dw -16384,  16384,   8867, -21407
    dw  22725,  19266, -22725, -12873
    dw  12873,   4520,  19266,  -4520
    dw  12873, -22725,  19266, -22725
    dw   4520,  19266,   4520, -12873

    ;; row 5 (same coefficients as row 3)
    dw  19266,  19266, -10426, -25172
    dw  19266,  19266,  25172,  10426
    dw  19266, -19266,  25172, -10426
    dw -19266,  19266,  10426, -25172
    dw  26722,  22654, -26722, -15137
    dw  15137,   5315,  22654,  -5315
    dw  15137, -26722,  22654, -26722
    dw   5315,  22654,   5315, -15137

    ;; row 6 (same coefficients as row 2)
    dw  21407,  21407, -11585, -27969
    dw  21407,  21407,  27969,  11585
    dw  21407, -21407,  27969, -11585
    dw -21407,  21407,  11585, -27969
    dw  29692,  25172, -29692, -16819
    dw  16819,   5906,  25172,  -5906
    dw  16819, -29692,  25172, -29692
    dw   5906,  25172,   5906, -16819   ; stray trailing comma removed here

    ;; row 7 (same coefficients as row 1)
    dw  22725,  22725, -12299, -29692
    dw  22725,  22725,  29692,  12299
    dw  22725, -22725,  29692, -12299
    dw -22725,  22725,  12299, -29692
    dw  31521,  26722, -31521, -17855
    dw  17855,   6270,  26722,  -6270
    dw  17855, -31521,  26722, -31521
    dw   6270,  26722,   6270, -17855

;; Packed constants for the column and row passes.  Every entry is one
;; 8-byte (MMX register sized) group, 8-byte aligned.

;; Four words of 1: OR-ed into some column results to force bit 0 to 1.
ALIGN 8
fdct_one_corr:
    times 4 dw 1

;; Three groups of pmulhw multipliers used by the column pass, addressed as
;; +0*2, +4*2 and +8*2.  Numerically 13036/65536 ~= 0.1989 and
;; 27146/65536 ~= 0.4142; the third, -21746/65536 ~= -0.3318, is always
;; followed by an add-back of the multiplicand in the column code, giving an
;; effective factor of ~0.6682.
;; NOTE(review): these look like tan(pi/16), tan(2*pi/16) and tan(3*pi/16)-1
;; in 16-bit fixed point -- confirm against the reference implementation.
ALIGN 8
fdct_tg_all_16:
    times 4 dw 13036
    times 4 dw 27146
    times 4 dw -21746

;; Not referenced by any code visible in this file.
ALIGN 8
cos_4_16:
    times 4 dw -19195

;; pmulhw multiplier for the sum/difference of two pre-shifted differences in
;; the column pass (23170/32768 ~= 0.7071).
ALIGN 8
ocos_4_16:
    times 4 dw 23170

;; Two dwords of RND_FRW_ROW: rounding term added to the 32-bit row sums
;; before they are shifted right by SHIFT_FRW_ROW.
ALIGN 8
fdct_r_row:
    times 2 dd RND_FRW_ROW

;=============================================================================
; Factorized parts of the code turned into macros for better understanding
;=============================================================================

;; Column pass of the forward DCT, four adjacent columns at a time.
;;
;; FDCT_COLUMN_COMMON out, in, column
;;   %1 out    - register holding the output block address
;;   %2 in     - register holding the input block address
;;   %3 column - index of the first of the four columns to process
;;               (the callers in this file pass 0 and 4)
;;
;; Rows are 16 bytes apart (8 words), so element (row r, column c) lives at
;; base + c*2 + r*16.  Every row is loaded before the same row is stored,
;; which is what allows the callers to pass the same register for in and out.
;;
;; Notation used in the comments below (x0..x7 = the eight input rows):
;;   sAB = (xA + xB) << SHIFT_FRW_COL
;;   dAB = (xA - xB) << SHIFT_FRW_COL        for d07 and d34
;;   dAB = (xA - xB) << (SHIFT_FRW_COL + 1)  for d16 and d25
;;   c1, c2, c3 = the three word groups of fdct_tg_all_16 (+0*2, +4*2, +8*2)
;;   mulh(a, b) = signed high word of a*b (pmulhw)
;; All additions/subtractions are saturating (paddsw/psubsw).
;;
;; Clobbers: mm0-mm7.  Reads fdct_tg_all_16, ocos_4_16, fdct_one_corr.
%macro FDCT_COLUMN_COMMON 3
    movq    mm0, [%2 + %3*2 + 1*16]         ; mm0 = x1
    movq    mm1, [%2 + %3*2 + 6*16]         ; mm1 = x6
    movq    mm2, mm0                        ; mm2 = x1
    movq    mm3, [%2 + %3*2 + 2*16]         ; mm3 = x2
    paddsw  mm0, mm1                        ; mm0 = x1 + x6
    movq    mm4, [%2 + %3*2 + 5*16]         ; mm4 = x5
    psllw   mm0, SHIFT_FRW_COL              ; mm0 = s16
    movq    mm5, [%2 + %3*2 + 0*16]         ; mm5 = x0
    paddsw  mm4, mm3                        ; mm4 = x5 + x2
    paddsw  mm5, [%2 + %3*2 + 7*16]         ; mm5 = x0 + x7
    psllw   mm4, SHIFT_FRW_COL              ; mm4 = s25
    movq    mm6, mm0                        ; mm6 = s16
    psubsw  mm2, mm1                        ; mm2 = x1 - x6
    movq    mm1, [fdct_tg_all_16 + 4*2]     ; mm1 = c2
    psubsw  mm0, mm4                        ; mm0 = s16 - s25
    movq    mm7, [%2 + %3*2 + 3*16]         ; mm7 = x3
    pmulhw  mm1, mm0                        ; mm1 = mulh(c2, s16 - s25)
    paddsw  mm7, [%2 + %3*2 + 4*16]         ; mm7 = x3 + x4
    psllw   mm5, SHIFT_FRW_COL              ; mm5 = s07
    paddsw  mm6, mm4                        ; mm6 = s16 + s25
    psllw   mm7, SHIFT_FRW_COL              ; mm7 = s34
    movq    mm4, mm5                        ; mm4 = s07
    psubsw  mm5, mm7                        ; mm5 = s07 - s34
    paddsw  mm1, mm5                        ; mm1 = (s07 - s34) + mulh(c2, s16 - s25)
    paddsw  mm4, mm7                        ; mm4 = s07 + s34
    por     mm1, [fdct_one_corr]            ; force bit 0 of every word
    psllw   mm2, SHIFT_FRW_COL + 1          ; mm2 = d16
    pmulhw  mm5, [fdct_tg_all_16 + 4*2]     ; mm5 = mulh(c2, s07 - s34)
    movq    mm7, mm4                        ; mm7 = s07 + s34
    psubsw  mm3, [%2 + %3*2 + 5*16]         ; mm3 = x2 - x5
    psubsw  mm4, mm6                        ; mm4 = (s07 + s34) - (s16 + s25)
    movq    [%1 + %3*2 + 2*16], mm1         ; store output row 2
    paddsw  mm7, mm6                        ; mm7 = (s07 + s34) + (s16 + s25)
    movq    mm1, [%2 + %3*2 + 3*16]         ; mm1 = x3
    psllw   mm3, SHIFT_FRW_COL + 1          ; mm3 = d25
    psubsw  mm1, [%2 + %3*2 + 4*16]         ; mm1 = x3 - x4
    movq    mm6, mm2                        ; mm6 = d16
    movq    [%1 + %3*2 + 4*16], mm4         ; store output row 4
    paddsw  mm2, mm3                        ; mm2 = d16 + d25
    pmulhw  mm2, [ocos_4_16]                ; mm2 = p = mulh(ocos_4_16, d16 + d25)
    psubsw  mm6, mm3                        ; mm6 = d16 - d25
    pmulhw  mm6, [ocos_4_16]                ; mm6 = q = mulh(ocos_4_16, d16 - d25)
    psubsw  mm5, mm0                        ; mm5 = mulh(c2, s07 - s34) - (s16 - s25)
    por     mm5, [fdct_one_corr]            ; force bit 0 of every word
    psllw   mm1, SHIFT_FRW_COL              ; mm1 = d34
    por     mm2, [fdct_one_corr]            ; force bit 0 of every word of p
    movq    mm4, mm1                        ; mm4 = d34
    movq    mm3, [%2 + %3*2 + 0*16]         ; mm3 = x0
    paddsw  mm1, mm6                        ; mm1 = d34 + q
    psubsw  mm3, [%2 + %3*2 + 7*16]         ; mm3 = x0 - x7
    psubsw  mm4, mm6                        ; mm4 = d34 - q
    movq    mm0, [fdct_tg_all_16 + 0*2]     ; mm0 = c1
    psllw   mm3, SHIFT_FRW_COL              ; mm3 = d07
    movq    mm6, [fdct_tg_all_16 + 8*2]     ; mm6 = c3
    pmulhw  mm0, mm1                        ; mm0 = mulh(c1, d34 + q)
    movq    [%1 + %3*2 + 0*16], mm7         ; store output row 0
    pmulhw  mm6, mm4                        ; mm6 = mulh(c3, d34 - q)
    movq    [%1 + %3*2 + 6*16], mm5         ; store output row 6
    movq    mm7, mm3                        ; mm7 = d07
    movq    mm5, [fdct_tg_all_16 + 8*2]     ; mm5 = c3
    psubsw  mm7, mm2                        ; mm7 = d07 - p
    paddsw  mm3, mm2                        ; mm3 = d07 + p
    pmulhw  mm5, mm7                        ; mm5 = mulh(c3, d07 - p)
    paddsw  mm0, mm3                        ; mm0 = (d07 + p) + mulh(c1, d34 + q)
    paddsw  mm6, mm4                        ; mm6 = (d34 - q) + mulh(c3, d34 - q)
    pmulhw  mm3, [fdct_tg_all_16 + 0*2]     ; mm3 = mulh(c1, d07 + p)
    por     mm0, [fdct_one_corr]            ; force bit 0 of every word
    paddsw  mm5, mm7                        ; mm5 = (d07 - p) + mulh(c3, d07 - p)
    psubsw  mm7, mm6                        ; mm7 = (d07 - p) - mm6
    movq    [%1 + %3*2 + 1*16], mm0         ; store output row 1
    paddsw  mm5, mm4                        ; mm5 += (d34 - q)
    movq    [%1 + %3*2 + 3*16], mm7         ; store output row 3
    psubsw  mm3, mm1                        ; mm3 = mulh(c1, d07 + p) - (d34 + q)
    movq    [%1 + %3*2 + 5*16], mm5         ; store output row 5
    movq    [%1 + %3*2 + 7*16], mm3         ; store output row 7
%endmacro

;; Row pass of the forward DCT for one 8-word row, plain MMX only: the word
;; reversal and dword swaps are built from movd/psrlq/punpck* instructions.
;;
;; FDCT_ROW_MMX out, in, table
;;   %1 out   - address expression of the 8 output words
;;   %2 in    - address expression of the 8 input words (x0..x7)
;;   %3 table - address (register or absolute) of this row's 32-word
;;              coefficient sub-table
;;
;; Word layouts in the comments are written low word first.
;; Clobbers: mm0-mm7.  Reads fdct_r_row.
%macro FDCT_ROW_MMX 3
    movd      mm1, [%2 + 6*2]               ; mm1 = {x6, x7, 0, 0}
    punpcklwd mm1, [%2 + 4*2]               ; mm1 = {x6, x4, x7, x5}
    movq      mm2, mm1                      ; mm2 = {x6, x4, x7, x5}
    psrlq     mm1, 0x20                     ; mm1 = {x7, x5, 0, 0}
    movq      mm0, [%2 + 0*2]               ; mm0 = {x0, x1, x2, x3}
    punpcklwd mm1, mm2                      ; mm1 = {x7, x6, x5, x4}
    movq      mm5, mm0                      ; mm5 = {x0, x1, x2, x3}
    paddsw    mm0, mm1                      ; mm0 = sums  {x0+x7, x1+x6, x2+x5, x3+x4}
    psubsw    mm5, mm1                      ; mm5 = diffs {x0-x7, x1-x6, x2-x5, x3-x4}
    movq      mm1, mm0                      ; mm1 = sums
    movq      mm6, mm5                      ; mm6 = diffs
    punpckldq mm3, mm5                      ; mm3 high dword = low dword of diffs
                                            ;   (old low dword of mm3 is discarded below)
    punpckhdq mm6, mm3                      ; mm6 = diffs with its two dwords swapped
    movq      mm3, [%3 + 0*2]               ; table words 0..3
    movq      mm4, [%3 + 4*2]               ; table words 4..7
    punpckldq mm2, mm0                      ; mm2 high dword = low dword of sums
    pmaddwd   mm3, mm0                      ; words 0..3   * sums
    punpckhdq mm1, mm2                      ; mm1 = sums with its two dwords swapped
    movq      mm2, [%3 + 16*2]              ; table words 16..19
    pmaddwd   mm4, mm1                      ; words 4..7   * swapped sums
    pmaddwd   mm0, [%3 + 8*2]               ; words 8..11  * sums
    movq      mm7, [%3 + 20*2]              ; table words 20..23
    pmaddwd   mm2, mm5                      ; words 16..19 * diffs
    paddd     mm3, [fdct_r_row]             ; add rounding term
    pmaddwd   mm7, mm6                      ; words 20..23 * swapped diffs
    pmaddwd   mm1, [%3 + 12*2]              ; words 12..15 * swapped sums
    paddd     mm3, mm4                      ; mm3 = first  pair of sum-derived results
    pmaddwd   mm5, [%3 + 24*2]              ; words 24..27 * diffs
    pmaddwd   mm6, [%3 + 28*2]              ; words 28..31 * swapped diffs
    paddd     mm2, mm7                      ; mm2 = first  pair of diff-derived results
    paddd     mm0, [fdct_r_row]             ; add rounding term
    psrad     mm3, SHIFT_FRW_ROW            ; scale back
    paddd     mm2, [fdct_r_row]             ; add rounding term
    paddd     mm0, mm1                      ; mm0 = second pair of sum-derived results
    paddd     mm5, [fdct_r_row]             ; add rounding term
    psrad     mm2, SHIFT_FRW_ROW            ; scale back
    paddd     mm5, mm6                      ; mm5 = second pair of diff-derived results
    psrad     mm0, SHIFT_FRW_ROW            ; scale back
    psrad     mm5, SHIFT_FRW_ROW            ; scale back
    packssdw  mm3, mm0                      ; mm3 = 4 sum-derived words  (saturated)
    packssdw  mm2, mm5                      ; mm2 = 4 diff-derived words (saturated)
    movq      mm6, mm3                      ; keep a copy for the high interleave
    punpcklwd mm3, mm2                      ; interleave: sum-derived words land on even
    punpckhwd mm6, mm2                      ;   output indices, diff-derived on odd ones
    movq      [%1 + 0*2], mm3               ; store output words 0..3
    movq      [%1 + 4*2], mm6               ; store output words 4..7
%endmacro

;; Row pass of the forward DCT for one 8-word row, using the pshufw
;; instruction for the word reversal and dword swaps.
;;
;; FDCT_ROW_XMM out, in, table
;;   %1 out   - address expression of the 8 output words
;;   %2 in    - address expression of the 8 input words (x0..x7)
;;   %3 table - address of this row's 32-word coefficient sub-table
;;
;; Word layouts in the comments are written low word first.
;; Clobbers: mm0-mm7.  Reads fdct_r_row.
%macro FDCT_ROW_XMM 3
    pshufw    mm5, [%2 + 4*2], 0x1B         ; mm5 = {x7, x6, x5, x4} (word order reversed)
    movq      mm0, [%2 + 0*2]               ; mm0 = {x0, x1, x2, x3}
    movq      mm1, mm0                      ; mm1 = {x0, x1, x2, x3}
    paddsw    mm0, mm5                      ; mm0 = sums  {x0+x7, x1+x6, x2+x5, x3+x4}
    psubsw    mm1, mm5                      ; mm1 = diffs {x0-x7, x1-x6, x2-x5, x3-x4}
    pshufw    mm2, mm0, 0x4E                ; mm2 = sums  with its two dwords swapped
    pshufw    mm3, mm1, 0x4E                ; mm3 = diffs with its two dwords swapped
    movq      mm4, [%3 + 0*2]               ; table words 0..3
    movq      mm6, [%3 + 4*2]               ; table words 4..7
    movq      mm5, [%3 + 16*2]              ; table words 16..19
    movq      mm7, [%3 + 20*2]              ; table words 20..23
    pmaddwd   mm4, mm0                      ; words 0..3   * sums
    pmaddwd   mm5, mm1                      ; words 16..19 * diffs
    pmaddwd   mm6, mm2                      ; words 4..7   * swapped sums
    pmaddwd   mm7, mm3                      ; words 20..23 * swapped diffs
    pmaddwd   mm0, [%3 + 8*2]               ; words 8..11  * sums
    pmaddwd   mm2, [%3 + 12*2]              ; words 12..15 * swapped sums
    pmaddwd   mm1, [%3 + 24*2]              ; words 24..27 * diffs
    pmaddwd   mm3, [%3 + 28*2]              ; words 28..31 * swapped diffs
    paddd     mm4, mm6                      ; mm4 = first  pair of sum-derived results
    paddd     mm5, mm7                      ; mm5 = first  pair of diff-derived results
    paddd     mm0, mm2                      ; mm0 = second pair of sum-derived results
    paddd     mm1, mm3                      ; mm1 = second pair of diff-derived results
    movq      mm7, [fdct_r_row]             ; mm7 = rounding term
    paddd     mm4, mm7                      ; round ...
    paddd     mm5, mm7
    paddd     mm0, mm7
    paddd     mm1, mm7
    psrad     mm4, SHIFT_FRW_ROW            ; ... and scale back
    psrad     mm5, SHIFT_FRW_ROW
    psrad     mm0, SHIFT_FRW_ROW
    psrad     mm1, SHIFT_FRW_ROW
    packssdw  mm4, mm0                      ; mm4 = 4 sum-derived words  (saturated)
    packssdw  mm5, mm1                      ; mm5 = 4 diff-derived words (saturated)
    movq      mm2, mm4                      ; keep a copy for the high interleave
    punpcklwd mm4, mm5                      ; interleave: sum-derived words land on even
    punpckhwd mm2, mm5                      ;   output indices, diff-derived on odd ones
    movq      [%1 + 0*2], mm4               ; store output words 0..3
    movq      [%1 + 4*2], mm2               ; store output words 4..7
%endmacro

;; MAKE_FDCT_FUNC name, row_macro
;;   %1 name      - name of the function to emit (exported through cglobal)
;;   %2 row_macro - row-pass macro to use (FDCT_ROW_MMX or FDCT_ROW_XMM)
;;
;; Emits a 16-byte aligned function that transforms an 8x8 block of words in
;; place: the block address is read from the first stack argument
;; ([esp + 4]) and is used as both source and destination.  The column pass
;; runs first (two groups of four columns), then the row pass on each of the
;; eight rows, row i using coefficient sub-table tab_frw_01234567 + 2*32*i.
;;
;; With UNROLLED_LOOP defined the eight row passes are expanded inline;
;; otherwise a counted loop is emitted.
;;
;; Clobbers: eax, mm0-mm7 (plus ecx, edx and flags in the loop variant).
;; No EMMS is executed before returning.
;; NOTE(review): presumably the caller is responsible for EMMS -- confirm.
%macro MAKE_FDCT_FUNC 2
ALIGN 16
cglobal %1
%1:
    ;; Move the destination/source address to the eax register
    mov     eax, [esp + 4]

    ;; Process the columns (4 at a time)
    FDCT_COLUMN_COMMON eax, eax, 0          ; columns 0..3
    FDCT_COLUMN_COMMON eax, eax, 4          ; columns 4..7

%ifdef UNROLLED_LOOP
    ; Unrolled loop version
  %assign i 0
  %rep 8
    ;; Process the 'i'th row
    %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
    %assign i i+1
  %endrep
%else
    mov     ecx, 8                          ; ecx = rows left to process
    mov     edx, tab_frw_01234567           ; edx = coefficient sub-table of the current row
ALIGN 8
.loop:                                      ; colon added: a bare label line triggers
                                            ; NASM's orphan-labels warning
    %2 eax, eax, edx
    add     eax, 2*8                        ; next row: 8 words
    add     edx, 2*32                       ; next sub-table: 32 words
    dec     ecx
    jne     .loop
%endif

    ret
%endmacro

;=============================================================================
; Code
;=============================================================================

section .text

;-----------------------------------------------------------------------------
; void fdct_mmx_ffmpeg(int16_t block[64]);
;
; In-place forward DCT of an 8x8 block; row pass built from FDCT_ROW_MMX.
;-----------------------------------------------------------------------------

MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW_MMX

;-----------------------------------------------------------------------------
; void fdct_xmm_ffmpeg(int16_t block[64]);
;
; In-place forward DCT of an 8x8 block; row pass built from FDCT_ROW_XMM
; (uses pshufw).
;-----------------------------------------------------------------------------

MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM