6 |
; * |
; * |
7 |
; * This file is part of XviD, a free MPEG-4 video encoder/decoder |
; * This file is part of XviD, a free MPEG-4 video encoder/decoder |
8 |
; * |
; * |
9 |
; * XviD is free software; you can redistribute it and/or modify it |
; * XviD is free software; you can redistribute it and/or modify it |
10 |
; * under the terms of the GNU General Public License as published by |
; * under the terms of the GNU General Public License as published by |
11 |
; * the Free Software Foundation; either version 2 of the License, or |
; * the Free Software Foundation; either version 2 of the License, or |
12 |
; * (at your option) any later version. |
; * (at your option) any later version. |
38 |
; instead of xvid_Expand_mmx... |
; instead of xvid_Expand_mmx... |
39 |
|
|
40 |
|
|
41 |
bits 32 |
%include "nasm.inc" |
|
|
|
|
%macro cglobal 1 |
|
|
%ifdef PREFIX |
|
|
global _%1 |
|
|
%define %1 _%1 |
|
|
%else |
|
|
global %1 |
|
|
%endif |
|
|
%endmacro |
|
|
%macro cextern 1 |
|
|
%ifdef PREFIX |
|
|
extern _%1 |
|
|
%define %1 _%1 |
|
|
%else |
|
|
extern %1 |
|
|
%endif |
|
|
%endmacro |
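; NOTE (editor): the cglobal/cextern macros above only handle the |
; leading-underscore PREFIX convention of COFF/Mach-O targets; the new |
; build gets them from "nasm.inc" instead. A sketch of what this file |
; assumes that header provides (inferred from use below, not quoted): |
;   cglobal/cextern     - export/import, optional '_' prefix |
;   ENDFUNC             - ELF .size/.type metadata after a function |
;   SECTION_ALIGN       - platform alignment for data (e.g. 16) |
;   prm1..prm5 (prm5d)  - the C arguments per calling convention |
;                         (stack on ia32; rdi/rsi/rdx/rcx/r8 on SysV |
;                         x86_64; rcx/rdx/r8/r9 on Win64) |
;   TMP0/TMP1, _EAX/_EBX/_EBP/_ESP - scratch and natural-width aliases |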
|
|
|
|
42 |
|
|
43 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
44 |
;// Declarations |
;// Declarations |
73 |
cglobal xvid_V_Pass_Avrg_8_Add_mmx |
cglobal xvid_V_Pass_Avrg_8_Add_mmx |
74 |
cglobal xvid_V_Pass_Avrg_Up_8_Add_mmx |
cglobal xvid_V_Pass_Avrg_Up_8_Add_mmx |
75 |
|
|
76 |
cextern xvid_Expand_mmx |
cglobal xvid_Expand_mmx |
77 |
|
|
78 |
%ifdef USE_TABLES |
cglobal xvid_FIR_1_0_0_0 |
79 |
|
cglobal xvid_FIR_3_1_0_0 |
80 |
cextern xvid_FIR_1_0_0_0 |
cglobal xvid_FIR_6_3_1_0 |
81 |
cextern xvid_FIR_3_1_0_0 |
cglobal xvid_FIR_14_3_2_1 |
82 |
cextern xvid_FIR_6_3_1_0 |
cglobal xvid_FIR_20_6_3_1 |
83 |
cextern xvid_FIR_14_3_2_1 |
cglobal xvid_FIR_20_20_6_3 |
84 |
cextern xvid_FIR_20_6_3_1 |
cglobal xvid_FIR_23_19_6_3 |
85 |
cextern xvid_FIR_20_20_6_3 |
cglobal xvid_FIR_7_20_20_6 |
86 |
cextern xvid_FIR_23_19_6_3 |
cglobal xvid_FIR_6_20_20_6 |
87 |
cextern xvid_FIR_7_20_20_6 |
cglobal xvid_FIR_6_20_20_7 |
88 |
cextern xvid_FIR_6_20_20_6 |
cglobal xvid_FIR_3_6_20_20 |
89 |
cextern xvid_FIR_6_20_20_7 |
cglobal xvid_FIR_3_6_19_23 |
90 |
cextern xvid_FIR_3_6_20_20 |
cglobal xvid_FIR_1_3_6_20 |
91 |
cextern xvid_FIR_3_6_19_23 |
cglobal xvid_FIR_1_2_3_14 |
92 |
cextern xvid_FIR_1_3_6_20 |
cglobal xvid_FIR_0_1_3_6 |
93 |
cextern xvid_FIR_1_2_3_14 |
cglobal xvid_FIR_0_0_1_3 |
94 |
cextern xvid_FIR_0_1_3_6 |
cglobal xvid_FIR_0_0_0_1 |
95 |
cextern xvid_FIR_0_0_1_3 |
|
96 |
cextern xvid_FIR_0_0_0_1 |
SECTION .data align=SECTION_ALIGN |
97 |
|
|
98 |
%endif |
align SECTION_ALIGN |
99 |
|
xvid_Expand_mmx: |
100 |
|
times 256*4 dw 0 ; uint16_t xvid_Expand_mmx[256][4] |
101 |
|
ENDFUNC |
102 |
|
|
103 |
|
xvid_FIR_1_0_0_0: |
104 |
|
times 256*4 dw 0 |
105 |
|
ENDFUNC |
106 |
|
|
107 |
|
xvid_FIR_3_1_0_0: |
108 |
|
times 256*4 dw 0 |
109 |
|
ENDFUNC |
110 |
|
|
111 |
|
xvid_FIR_6_3_1_0: |
112 |
|
times 256*4 dw 0 |
113 |
|
ENDFUNC |
114 |
|
|
115 |
|
xvid_FIR_14_3_2_1: |
116 |
|
times 256*4 dw 0 |
117 |
|
ENDFUNC |
118 |
|
|
119 |
|
xvid_FIR_20_6_3_1: |
120 |
|
times 256*4 dw 0 |
121 |
|
ENDFUNC |
122 |
|
|
123 |
|
xvid_FIR_20_20_6_3: |
124 |
|
times 256*4 dw 0 |
125 |
|
ENDFUNC |
126 |
|
|
127 |
|
xvid_FIR_23_19_6_3: |
128 |
|
times 256*4 dw 0 |
129 |
|
ENDFUNC |
130 |
|
|
131 |
|
xvid_FIR_7_20_20_6: |
132 |
|
times 256*4 dw 0 |
133 |
|
ENDFUNC |
134 |
|
|
135 |
|
xvid_FIR_6_20_20_6: |
136 |
|
times 256*4 dw 0 |
137 |
|
ENDFUNC |
138 |
|
|
139 |
|
xvid_FIR_6_20_20_7: |
140 |
|
times 256*4 dw 0 |
141 |
|
ENDFUNC |
142 |
|
|
143 |
|
xvid_FIR_3_6_20_20: |
144 |
|
times 256*4 dw 0 |
145 |
|
ENDFUNC |
146 |
|
|
147 |
|
xvid_FIR_3_6_19_23: |
148 |
|
times 256*4 dw 0 |
149 |
|
ENDFUNC |
150 |
|
|
151 |
|
xvid_FIR_1_3_6_20: |
152 |
|
times 256*4 dw 0 |
153 |
|
ENDFUNC |
154 |
|
|
155 |
|
xvid_FIR_1_2_3_14: |
156 |
|
times 256*4 dw 0 |
157 |
|
ENDFUNC |
158 |
|
|
159 |
|
xvid_FIR_0_1_3_6: |
160 |
|
times 256*4 dw 0 |
161 |
|
ENDFUNC |
162 |
|
|
163 |
|
xvid_FIR_0_0_1_3: |
164 |
|
times 256*4 dw 0 |
165 |
|
ENDFUNC |
166 |
|
|
167 |
|
xvid_FIR_0_0_0_1: |
168 |
|
times 256*4 dw 0 |
169 |
|
ENDFUNC |
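; NOTE (editor): the zero-filled tables above are placeholders filled |
; at run time from the C side (presumably xvid_Init_QP -- an |
; assumption, the initializer is not part of this file). Inferred |
; contents: |
;   xvid_Expand_mmx[i]  = { i, i, i, i }          ; byte i broadcast |
;   xvid_FIR_a_b_c_d[i] = { a*i, b*i, c*i, d*i }  ; premultiplied taps, |
; signs following the qpel kernel: xvid_FIR_14_3_2_1[i] holds |
; { 14*i, -3*i, 2*i, -1*i } (compare FIR_R0 below). Each table is the |
; set of coefficients with which one source pixel feeds four |
; neighbouring outputs, so the USE_TABLES path trades every pmullw for |
; one lookup plus paddw. |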
170 |
|
|
171 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
172 |
|
|
173 |
%ifdef FORMAT_COFF |
DATA |
|
SECTION .rodata |
|
|
%else |
|
|
SECTION .rodata align=16 |
|
|
%endif |
|
174 |
|
|
175 |
align 16 |
align SECTION_ALIGN |
176 |
Rounder1_MMX: |
Rounder1_MMX: |
177 |
times 4 dw 1 |
times 4 dw 1 |
178 |
Rounder0_MMX: |
Rounder0_MMX: |
179 |
times 4 dw 0 |
times 4 dw 0 |
180 |
|
|
181 |
align 16 |
align SECTION_ALIGN |
182 |
Rounder_QP_MMX |
Rounder_QP_MMX: |
183 |
times 4 dw 16 |
times 4 dw 16 |
184 |
times 4 dw 15 |
times 4 dw 15 |
185 |
|
|
186 |
%ifndef USE_TABLES |
%ifndef USE_TABLES |
187 |
|
|
188 |
align 16 |
align SECTION_ALIGN |
189 |
|
|
190 |
; H-Pass table shared by 16x? and 8x? filters |
; H-Pass table shared by 16x? and 8x? filters |
191 |
|
|
192 |
FIR_R0: dw 14, -3, 2, -1 |
FIR_R0: dw 14, -3, 2, -1 |
193 |
align 16 |
align SECTION_ALIGN |
194 |
FIR_R1: dw 23, 19, -6, 3, -1, 0, 0, 0 |
FIR_R1: dw 23, 19, -6, 3, -1, 0, 0, 0 |
195 |
|
|
196 |
FIR_R2: dw -7, 20, 20, -6, 3, -1, 0, 0 |
FIR_R2: dw -7, 20, 20, -6, 3, -1, 0, 0 |
200 |
FIR_R4: dw -1, 3, -6, 20, 20, -6, 3, -1 |
FIR_R4: dw -1, 3, -6, 20, 20, -6, 3, -1 |
201 |
|
|
202 |
FIR_R5: dw 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0, 0 |
FIR_R5: dw 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0, 0 |
203 |
align 16 |
align SECTION_ALIGN |
204 |
FIR_R6: dw 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0 |
FIR_R6: dw 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0 |
205 |
align 16 |
align SECTION_ALIGN |
206 |
FIR_R7: dw 0, 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0 |
FIR_R7: dw 0, 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0 |
207 |
align 16 |
align SECTION_ALIGN |
208 |
FIR_R8: dw -1, 3, -6, 20, 20, -6, 3, -1 |
FIR_R8: dw -1, 3, -6, 20, 20, -6, 3, -1 |
209 |
|
|
210 |
FIR_R9: dw 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0, 0 |
FIR_R9: dw 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0, 0 |
211 |
align 16 |
align SECTION_ALIGN |
212 |
FIR_R10: dw 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0 |
FIR_R10: dw 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0 |
213 |
align 16 |
align SECTION_ALIGN |
214 |
FIR_R11: dw 0, 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0 |
FIR_R11: dw 0, 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0 |
215 |
align 16 |
align SECTION_ALIGN |
216 |
FIR_R12: dw -1, 3, -6, 20, 20, -6, 3, -1 |
FIR_R12: dw -1, 3, -6, 20, 20, -6, 3, -1 |
217 |
|
|
218 |
FIR_R13: dw 0, -1, 3, -6, 20, 20, -6, 3 |
FIR_R13: dw 0, -1, 3, -6, 20, 20, -6, 3 |
227 |
|
|
228 |
; V-Pass taps |
; V-Pass taps |
229 |
|
|
230 |
align 16 |
align SECTION_ALIGN |
231 |
FIR_Cm7: times 4 dw -7 |
FIR_Cm7: times 4 dw -7 |
232 |
FIR_Cm6: times 4 dw -6 |
FIR_Cm6: times 4 dw -6 |
233 |
FIR_Cm3: times 4 dw -3 |
FIR_Cm3: times 4 dw -3 |
239 |
FIR_C20: times 4 dw 20 |
FIR_C20: times 4 dw 20 |
240 |
FIR_C23: times 4 dw 23 |
FIR_C23: times 4 dw 23 |
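; NOTE (editor): together these taps implement the MPEG-4 quarter-pel |
; 8-tap half-sample filter |
;   out = clip((-1*p0 +3*p1 -6*p2 +20*p3 +20*p4 -6*p5 +3*p6 -1*p7 |
;               + 16 - Rnd) >> 5) |
; The taps sum to 32, hence the final shift by 5 (performed in code |
; elided from this diff) and the 16/15 entries of Rounder_QP_MMX. The |
; shortened FIR_R0/FIR_R1/... kernels fold the out-of-block taps back |
; onto the edge pixels by mirroring. |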
241 |
|
|
242 |
SECTION .text |
SECTION .rotext align=SECTION_ALIGN |
243 |
|
|
244 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
245 |
;// Here we go with the Q-Pel mess. |
;// Here we go with the Q-Pel mess. |
247 |
;// For vertical ones, we process 4 *input* pixels in parallel. |
;// For vertical ones, we process 4 *input* pixels in parallel. |
248 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
249 |
|
|
250 |
|
%ifdef ARCH_IS_X86_64 |
251 |
|
%macro XVID_MOVQ 3 |
252 |
|
lea r9, [%2] |
253 |
|
movq %1, [r9 + %3] |
254 |
|
%endmacro |
255 |
|
%macro XVID_PADDW 3 |
256 |
|
lea r9, [%2] |
257 |
|
paddw %1, [r9 + %3] |
258 |
|
%endmacro |
259 |
|
%define SRC_PTR prm2 |
260 |
|
%define DST_PTR prm1 |
261 |
|
%else |
262 |
|
%macro XVID_MOVQ 3 |
263 |
|
movq %1, [%2 + %3] |
264 |
|
%endmacro |
265 |
|
%macro XVID_PADDW 3 |
266 |
|
paddw %1, [%2 + %3] |
267 |
|
%endmacro |
268 |
|
%define SRC_PTR _ESI |
269 |
|
%define DST_PTR _EDI |
270 |
|
%endif |
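; NOTE (editor): on x86_64, RIP-relative addressing cannot combine a |
; symbol with an index register, so [table + rax*8] is not PIC-safe |
; there. The macros therefore split the access, e.g. |
;     lea  r9, [xvid_Expand_mmx]   ; RIP-relative address of the table |
;     movq mm0, [r9 + rax*8] |
; r9 is volatile under both SysV and Win64; the PROLOG_* macros below |
; copy the incoming arguments out of the argument registers before the |
; loops run, so clobbering it here is assumed safe. |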
271 |
|
|
272 |
%macro PROLOG_NO_AVRG 0 |
%macro PROLOG_NO_AVRG 0 |
273 |
push esi |
mov TMP0, prm3 ; Size |
274 |
push edi |
mov TMP1, prm4 ; BpS |
275 |
push ebp |
mov eax, prm5d ; Rnd |
276 |
mov edi, [esp+16 + 0*4] ; Dst |
|
277 |
mov esi, [esp+16 + 1*4] ; Src |
%ifndef ARCH_IS_X86_64 |
278 |
mov ecx, [esp+16 + 2*4] ; Size |
push SRC_PTR |
279 |
mov ebp, [esp+16 + 3*4] ; BpS |
push DST_PTR |
280 |
mov eax, [esp+16 + 4*4] ; Rnd |
%endif |
281 |
and eax, 1 |
push _EBP |
282 |
movq mm7, [Rounder_QP_MMX+eax*8] ; rounder |
mov _EBP, TMP1 |
283 |
|
|
284 |
|
%ifndef ARCH_IS_X86_64 |
285 |
|
mov DST_PTR, [_ESP+16 + 0*4] ; Dst |
286 |
|
mov SRC_PTR, [_ESP+16 + 1*4] ; Src |
287 |
|
%endif |
288 |
|
|
289 |
|
and _EAX, 1 |
290 |
|
lea TMP1, [Rounder_QP_MMX] |
291 |
|
movq mm7, [TMP1+_EAX*8] ; rounder |
292 |
%endmacro |
%endmacro |
293 |
|
|
294 |
%macro EPILOG_NO_AVRG 0 |
%macro EPILOG_NO_AVRG 0 |
295 |
pop ebp |
pop _EBP |
296 |
pop edi |
%ifndef ARCH_IS_X86_64 |
297 |
pop esi |
pop DST_PTR |
298 |
|
pop SRC_PTR |
299 |
|
%endif |
300 |
ret |
ret |
301 |
%endmacro |
%endmacro |
302 |
|
|
303 |
%macro PROLOG_AVRG 0 |
%macro PROLOG_AVRG 0 |
304 |
push ebx |
mov TMP0, prm3 ; Size |
305 |
push esi |
mov TMP1, prm4 ; BpS |
306 |
push edi |
mov eax, prm5d ; Rnd |
307 |
push ebp |
|
308 |
mov edi, [esp+20 + 0*4] ; Dst |
push _EBX |
309 |
mov esi, [esp+20 + 1*4] ; Src |
push _EBP |
310 |
mov ecx, [esp+20 + 2*4] ; Size |
%ifndef ARCH_IS_X86_64 |
311 |
mov ebp, [esp+20 + 3*4] ; BpS |
push SRC_PTR |
312 |
mov eax, [esp+20 + 4*4] ; Rnd |
push DST_PTR |
313 |
and eax, 1 |
%endif |
314 |
movq mm7, [Rounder_QP_MMX+eax*8] ; rounder |
mov _EBP, TMP1 |
315 |
lea ebx, [Rounder1_MMX+eax*8] ; *Rounder2 |
|
316 |
|
%ifndef ARCH_IS_X86_64 |
317 |
|
mov DST_PTR, [_ESP+20 + 0*4] ; Dst |
318 |
|
mov SRC_PTR, [_ESP+20 + 1*4] ; Src |
319 |
|
%endif |
320 |
|
|
321 |
|
and _EAX, 1 |
322 |
|
lea TMP1, [Rounder_QP_MMX] |
323 |
|
movq mm7, [TMP1+_EAX*8] ; rounder |
324 |
|
lea TMP1, [Rounder1_MMX] |
325 |
|
lea _EBX, [TMP1+_EAX*8] ; *Rounder2 |
326 |
%endmacro |
%endmacro |
327 |
|
|
328 |
%macro EPILOG_AVRG 0 |
%macro EPILOG_AVRG 0 |
329 |
pop ebp |
%ifndef ARCH_IS_X86_64 |
330 |
pop edi |
pop DST_PTR |
331 |
pop esi |
pop SRC_PTR |
332 |
pop ebx |
%endif |
333 |
|
pop _EBP |
334 |
|
pop _EBX |
335 |
ret |
ret |
336 |
%endmacro |
%endmacro |
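; NOTE (editor): from the argument loads in the prologs, every pass |
; function shares this C-level shape (a sketch; names are assumptions): |
;   void xvid_H_Pass_16_mmx(uint8_t *Dst, const uint8_t *Src, |
;                           int32_t Size, int32_t BpS, int32_t Rnd); |
; On ia32 all five arguments live on the stack and esi/edi/ebp (plus |
; ebx for the Avrg variants) must be saved; on x86_64 they arrive in |
; registers, SRC_PTR/DST_PTR alias prm2/prm1, and only _EBP/_EBX need |
; preserving. |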
337 |
|
|
344 |
; macros for USE_TABLES |
; macros for USE_TABLES |
345 |
|
|
346 |
%macro TLOAD 2 ; %1,%2: src pixels |
%macro TLOAD 2 ; %1,%2: src pixels |
347 |
movzx eax, byte [esi+%1] |
movzx _EAX, byte [SRC_PTR+%1] |
348 |
movzx edx, byte [esi+%2] |
movzx TMP1, byte [SRC_PTR+%2] |
349 |
movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ] |
XVID_MOVQ mm0, xvid_FIR_14_3_2_1, _EAX*8 |
350 |
movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ] |
XVID_MOVQ mm3, xvid_FIR_1_2_3_14, TMP1*8 |
351 |
paddw mm0, mm7 |
paddw mm0, mm7 |
352 |
paddw mm3, mm7 |
paddw mm3, mm7 |
353 |
%endmacro |
%endmacro |
354 |
|
|
355 |
%macro TACCUM2 5 ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs |
%macro TACCUM2 5 ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs |
356 |
movzx eax, byte [esi+%1] |
movzx _EAX, byte [SRC_PTR+%1] |
357 |
paddw %4, [%2 + eax*8] |
XVID_PADDW %4, %2, _EAX*8 |
358 |
paddw %5, [%3 + eax*8] |
XVID_PADDW %5, %3, _EAX*8 |
359 |
%endmacro |
%endmacro |
360 |
|
|
361 |
%macro TACCUM3 7 ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs |
%macro TACCUM3 7 ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs |
362 |
movzx eax, byte [esi+%1] |
movzx _EAX, byte [SRC_PTR+%1] |
363 |
paddw %5, [%2 + eax*8] |
XVID_PADDW %5, %2, _EAX*8 |
364 |
paddw %6, [%3 + eax*8] |
XVID_PADDW %6, %3, _EAX*8 |
365 |
paddw %7, [%4 + eax*8] |
XVID_PADDW %7, %4, _EAX*8 |
366 |
%endmacro |
%endmacro |
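; NOTE (editor): TLOAD seeds two accumulators from the first and last |
; source pixels of the run (their premultiplied edge kernels are |
; xvid_FIR_14_3_2_1 / xvid_FIR_1_2_3_14) plus the rounder in mm7; |
; TACCUM2/TACCUM3 then fold each remaining source pixel into the two |
; or three output groups it overlaps -- one table lookup and one paddw |
; per group, and no multiplies anywhere on this path. |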
367 |
|
|
368 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
370 |
; macros without USE_TABLES |
; macros without USE_TABLES |
371 |
|
|
372 |
%macro LOAD 2 ; %1,%2: src pixels |
%macro LOAD 2 ; %1,%2: src pixels |
373 |
movzx eax, byte [esi+%1] |
movzx _EAX, byte [SRC_PTR+%1] |
374 |
movzx edx, byte [esi+%2] |
movzx TMP1, byte [SRC_PTR+%2] |
375 |
movq mm0, [xvid_Expand_mmx + eax*8] |
XVID_MOVQ mm0, xvid_Expand_mmx, _EAX*8 |
376 |
movq mm3, [xvid_Expand_mmx + edx*8] |
XVID_MOVQ mm3, xvid_Expand_mmx, TMP1*8 |
377 |
pmullw mm0, [FIR_R0 ] |
pmullw mm0, [FIR_R0 ] |
378 |
pmullw mm3, [FIR_R16] |
pmullw mm3, [FIR_R16] |
379 |
paddw mm0, mm7 |
paddw mm0, mm7 |
381 |
%endmacro |
%endmacro |
382 |
|
|
383 |
%macro ACCUM2 4 ;src pixel/Taps/dst regs #1-#2 |
%macro ACCUM2 4 ;src pixel/Taps/dst regs #1-#2 |
384 |
movzx eax, byte [esi+%1] |
movzx _EAX, byte [SRC_PTR+%1] |
385 |
movq mm4, [xvid_Expand_mmx + eax*8] |
XVID_MOVQ mm4, xvid_Expand_mmx, _EAX*8 |
386 |
movq mm5, mm4 |
movq mm5, mm4 |
387 |
pmullw mm4, [%2] |
pmullw mm4, [%2] |
388 |
pmullw mm5, [%2+8] |
pmullw mm5, [%2+8] |
391 |
%endmacro |
%endmacro |
392 |
|
|
393 |
%macro ACCUM3 5 ;src pixel/Taps/dst regs #1-#2-#3 |
%macro ACCUM3 5 ;src pixel/Taps/dst regs #1-#2-#3 |
394 |
movzx eax, byte [esi+%1] |
movzx _EAX, byte [SRC_PTR+%1] |
395 |
movq mm4, [xvid_Expand_mmx + eax*8] |
XVID_MOVQ mm4, xvid_Expand_mmx, _EAX*8 |
396 |
movq mm5, mm4 |
movq mm5, mm4 |
397 |
movq mm6, mm5 |
movq mm6, mm5 |
398 |
pmullw mm4, [%2 ] |
pmullw mm4, [%2 ] |
434 |
PROLOG_AVRG |
PROLOG_AVRG |
435 |
%endif |
%endif |
436 |
|
|
437 |
.Loop |
.Loop: |
438 |
|
|
439 |
; mm0..mm3 serves as a 4x4 delay line |
; mm0..mm3 serves as a 4x4 delay line |
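; (editor: "4x4 delay line" appears to mean that mm0..mm3 hold the |
; running 4-word partial sums of four neighbouring output pixels; each |
; source byte, expanded through its tap table, is added to every sum |
; it overlaps, and a register whose taps are exhausted is packed out |
; and recycled for the next output group.) |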
440 |
|
|
498 |
packuswb mm2, mm3 |
packuswb mm2, mm3 |
499 |
|
|
500 |
%if (%1==1) |
%if (%1==1) |
501 |
MIX mm0, esi, ebx |
MIX mm0, SRC_PTR, _EBX |
502 |
%elif (%1==2) |
%elif (%1==2) |
503 |
MIX mm0, esi+1, ebx |
MIX mm0, SRC_PTR+1, _EBX |
504 |
%endif |
%endif |
505 |
%if (%2==1) |
%if (%2==1) |
506 |
MIX mm0, edi, Rounder1_MMX |
MIX mm0, DST_PTR, Rounder1_MMX |
507 |
%endif |
%endif |
508 |
|
|
509 |
%if (%1==1) |
%if (%1==1) |
510 |
MIX mm2, esi+8, ebx |
MIX mm2, SRC_PTR+8, _EBX |
511 |
%elif (%1==2) |
%elif (%1==2) |
512 |
MIX mm2, esi+9, ebx |
MIX mm2, SRC_PTR+9, _EBX |
513 |
%endif |
%endif |
514 |
%if (%2==1) |
%if (%2==1) |
515 |
MIX mm2, edi+8, Rounder1_MMX |
MIX mm2, DST_PTR+8, Rounder1_MMX |
516 |
%endif |
%endif |
517 |
|
|
518 |
lea esi, [esi+ebp] |
lea SRC_PTR, [SRC_PTR+_EBP] |
519 |
|
|
520 |
movq [edi+0], mm0 |
movq [DST_PTR+0], mm0 |
521 |
movq [edi+8], mm2 |
movq [DST_PTR+8], mm2 |
522 |
|
|
523 |
add edi, ebp |
add DST_PTR, _EBP |
524 |
dec ecx |
dec TMP0 |
525 |
jg .Loop |
jg .Loop |
526 |
|
|
527 |
%if (%2==0) && (%1==0) |
%if (%2==0) && (%1==0) |
543 |
PROLOG_AVRG |
PROLOG_AVRG |
544 |
%endif |
%endif |
545 |
|
|
546 |
.Loop |
.Loop: |
547 |
; mm0..mm3 serves as a 4x4 delay line |
; mm0..mm3 serves as a 4x4 delay line |
548 |
|
|
549 |
%ifndef USE_TABLES |
%ifndef USE_TABLES |
573 |
|
|
574 |
%else ; test with unrolling (a little faster, but not much) |
%else ; test with unrolling (a little faster, but not much) |
575 |
|
|
576 |
movzx eax, byte [esi] |
movzx _EAX, byte [SRC_PTR] |
577 |
movzx edx, byte [esi+8] |
movzx TMP1, byte [SRC_PTR+8] |
578 |
movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ] |
XVID_MOVQ mm0, xvid_FIR_14_3_2_1, _EAX*8 |
579 |
movzx eax, byte [esi+1] |
movzx _EAX, byte [SRC_PTR+1] |
580 |
movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ] |
XVID_MOVQ mm3, xvid_FIR_1_2_3_14, TMP1*8 |
581 |
paddw mm0, mm7 |
paddw mm0, mm7 |
582 |
paddw mm3, mm7 |
paddw mm3, mm7 |
583 |
|
|
584 |
movzx edx, byte [esi+2] |
movzx TMP1, byte [SRC_PTR+2] |
585 |
paddw mm0, [xvid_FIR_23_19_6_3 + eax*8] |
XVID_PADDW mm0, xvid_FIR_23_19_6_3, _EAX*8 |
586 |
paddw mm3, [xvid_FIR_1_0_0_0 + eax*8] |
XVID_PADDW mm3, xvid_FIR_1_0_0_0, _EAX*8 |
587 |
|
|
588 |
movzx eax, byte [esi+3] |
movzx _EAX, byte [SRC_PTR+3] |
589 |
paddw mm0, [xvid_FIR_7_20_20_6 + edx*8] |
XVID_PADDW mm0, xvid_FIR_7_20_20_6, TMP1*8 |
590 |
paddw mm3, [xvid_FIR_3_1_0_0 + edx*8] |
XVID_PADDW mm3, xvid_FIR_3_1_0_0, TMP1*8 |
591 |
|
|
592 |
movzx edx, byte [esi+4] |
movzx TMP1, byte [SRC_PTR+4] |
593 |
paddw mm0, [xvid_FIR_3_6_20_20 + eax*8] |
XVID_PADDW mm0, xvid_FIR_3_6_20_20, _EAX*8 |
594 |
paddw mm3, [xvid_FIR_6_3_1_0 + eax*8] |
XVID_PADDW mm3, xvid_FIR_6_3_1_0, _EAX*8 |
595 |
|
|
596 |
movzx eax, byte [esi+5] |
movzx _EAX, byte [SRC_PTR+5] |
597 |
paddw mm0, [xvid_FIR_1_3_6_20 + edx*8] |
XVID_PADDW mm0, xvid_FIR_1_3_6_20, TMP1*8 |
598 |
paddw mm3, [xvid_FIR_20_6_3_1 + edx*8] |
XVID_PADDW mm3, xvid_FIR_20_6_3_1, TMP1*8 |
599 |
|
|
600 |
movzx edx, byte [esi+6] |
movzx TMP1, byte [SRC_PTR+6] |
601 |
paddw mm0, [xvid_FIR_0_1_3_6 + eax*8] |
XVID_PADDW mm0, xvid_FIR_0_1_3_6, _EAX*8 |
602 |
paddw mm3, [xvid_FIR_20_20_6_3 + eax*8] |
XVID_PADDW mm3, xvid_FIR_20_20_6_3, _EAX*8 |
603 |
|
|
604 |
movzx eax, byte [esi+7] |
movzx _EAX, byte [SRC_PTR+7] |
605 |
paddw mm0, [xvid_FIR_0_0_1_3 + edx*8] |
XVID_PADDW mm0, xvid_FIR_0_0_1_3, TMP1*8 |
606 |
paddw mm3, [xvid_FIR_6_20_20_7 + edx*8] |
XVID_PADDW mm3, xvid_FIR_6_20_20_7, TMP1*8 |
607 |
|
|
608 |
paddw mm0, [xvid_FIR_0_0_0_1 + eax*8] |
XVID_PADDW mm0, xvid_FIR_0_0_0_1, _EAX*8 |
609 |
paddw mm3, [xvid_FIR_3_6_19_23 + eax*8] |
XVID_PADDW mm3, xvid_FIR_3_6_19_23, _EAX*8 |
610 |
|
|
611 |
%endif |
%endif |
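; NOTE (editor): this unrolled variant walks the nine source bytes of |
; an 8-pixel row exactly once, accumulating each byte's premultiplied |
; taps into mm0 (outputs 0..3) and mm3 (outputs 4..7) at the same |
; time; the rolled TLOAD/TACCUM loop above trades that straight-line |
; speed for code size. |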
612 |
|
|
617 |
packuswb mm0, mm3 |
packuswb mm0, mm3 |
618 |
|
|
619 |
%if (%1==1) |
%if (%1==1) |
620 |
MIX mm0, esi, ebx |
MIX mm0, SRC_PTR, _EBX |
621 |
%elif (%1==2) |
%elif (%1==2) |
622 |
MIX mm0, esi+1, ebx |
MIX mm0, SRC_PTR+1, _EBX |
623 |
%endif |
%endif |
624 |
%if (%2==1) |
%if (%2==1) |
625 |
MIX mm0, edi, Rounder1_MMX |
MIX mm0, DST_PTR, Rounder1_MMX |
626 |
%endif |
%endif |
627 |
|
|
628 |
movq [edi], mm0 |
movq [DST_PTR], mm0 |
629 |
|
|
630 |
add edi, ebp |
add DST_PTR, _EBP |
631 |
add esi, ebp |
add SRC_PTR, _EBP |
632 |
dec ecx |
dec TMP0 |
633 |
jg .Loop |
jg .Loop |
634 |
|
|
635 |
%if (%2==0) && (%1==0) |
%if (%2==0) && (%1==0) |
645 |
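; NOTE (editor): instantiation key, inferred from the macro bodies: |
;   1st arg: 0 = plain filter, 1 = also average with Src ("Avrg"), |
;            2 = average with Src offset by one pixel/row ("Avrg_Up") |
;   2nd arg: 1 = average the result into Dst ("..._Add" variants) |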
|
|
646 |
xvid_H_Pass_16_mmx: |
xvid_H_Pass_16_mmx: |
647 |
H_PASS_16 0, 0 |
H_PASS_16 0, 0 |
648 |
|
ENDFUNC |
649 |
xvid_H_Pass_Avrg_16_mmx: |
xvid_H_Pass_Avrg_16_mmx: |
650 |
H_PASS_16 1, 0 |
H_PASS_16 1, 0 |
651 |
|
ENDFUNC |
652 |
xvid_H_Pass_Avrg_Up_16_mmx: |
xvid_H_Pass_Avrg_Up_16_mmx: |
653 |
H_PASS_16 2, 0 |
H_PASS_16 2, 0 |
654 |
|
ENDFUNC |
655 |
|
|
656 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
657 |
;// 8x? copy Functions |
;// 8x? copy Functions |
658 |
|
|
659 |
xvid_H_Pass_8_mmx: |
xvid_H_Pass_8_mmx: |
660 |
H_PASS_8 0, 0 |
H_PASS_8 0, 0 |
661 |
|
ENDFUNC |
662 |
xvid_H_Pass_Avrg_8_mmx: |
xvid_H_Pass_Avrg_8_mmx: |
663 |
H_PASS_8 1, 0 |
H_PASS_8 1, 0 |
664 |
|
ENDFUNC |
665 |
xvid_H_Pass_Avrg_Up_8_mmx: |
xvid_H_Pass_Avrg_Up_8_mmx: |
666 |
H_PASS_8 2, 0 |
H_PASS_8 2, 0 |
667 |
|
ENDFUNC |
668 |
|
|
669 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
670 |
;// 16x? avrg Functions |
;// 16x? avrg Functions |
671 |
|
|
672 |
xvid_H_Pass_Add_16_mmx: |
xvid_H_Pass_Add_16_mmx: |
673 |
H_PASS_16 0, 1 |
H_PASS_16 0, 1 |
674 |
|
ENDFUNC |
675 |
xvid_H_Pass_Avrg_Add_16_mmx: |
xvid_H_Pass_Avrg_Add_16_mmx: |
676 |
H_PASS_16 1, 1 |
H_PASS_16 1, 1 |
677 |
|
ENDFUNC |
678 |
xvid_H_Pass_Avrg_Up_Add_16_mmx: |
xvid_H_Pass_Avrg_Up_Add_16_mmx: |
679 |
H_PASS_16 2, 1 |
H_PASS_16 2, 1 |
680 |
|
ENDFUNC |
681 |
|
|
682 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
683 |
;// 8x? avrg Functions |
;// 8x? avrg Functions |
684 |
|
|
685 |
xvid_H_Pass_8_Add_mmx: |
xvid_H_Pass_8_Add_mmx: |
686 |
H_PASS_8 0, 1 |
H_PASS_8 0, 1 |
687 |
|
ENDFUNC |
688 |
xvid_H_Pass_Avrg_8_Add_mmx: |
xvid_H_Pass_Avrg_8_Add_mmx: |
689 |
H_PASS_8 1, 1 |
H_PASS_8 1, 1 |
690 |
|
ENDFUNC |
691 |
xvid_H_Pass_Avrg_Up_8_Add_mmx: |
xvid_H_Pass_Avrg_Up_8_Add_mmx: |
692 |
H_PASS_8 2, 1 |
H_PASS_8 2, 1 |
693 |
|
ENDFUNC |
694 |
|
|
695 |
|
|
696 |
|
|
702 |
|
|
703 |
%macro V_LOAD 1 ; %1=Last? |
%macro V_LOAD 1 ; %1=Last? |
704 |
|
|
705 |
movd mm4, [edx] |
movd mm4, dword [TMP1] |
706 |
pxor mm6, mm6 |
pxor mm6, mm6 |
707 |
%if (%1==0) |
%if (%1==0) |
708 |
add edx, ebp |
add TMP1, _EBP |
709 |
%endif |
%endif |
710 |
punpcklbw mm4, mm6 |
punpcklbw mm4, mm6 |
711 |
|
|
756 |
packuswb %3, %3 |
packuswb %3, %3 |
757 |
|
|
758 |
%if (%1==1) |
%if (%1==1) |
759 |
V_MIX %3, esi, ebx |
V_MIX %3, SRC_PTR, _EBX |
760 |
add esi, ebp |
add SRC_PTR, _EBP |
761 |
%elif (%1==2) |
%elif (%1==2) |
762 |
add esi, ebp |
add SRC_PTR, _EBP |
763 |
V_MIX %3, esi, ebx |
V_MIX %3, SRC_PTR, _EBX |
764 |
%endif |
%endif |
765 |
%if (%2==1) |
%if (%2==1) |
766 |
V_MIX %3, edi, Rounder1_MMX |
V_MIX %3, DST_PTR, Rounder1_MMX |
767 |
%endif |
%endif |
768 |
|
|
769 |
movd eax, %3 |
movd eax, %3 |
770 |
mov [edi], eax |
mov dword [DST_PTR], eax |
771 |
|
|
772 |
%if (%4==0) |
%if (%4==0) |
773 |
add edi, ebp |
add DST_PTR, _EBP |
774 |
%endif |
%endif |
775 |
|
|
776 |
%endmacro |
%endmacro |
789 |
; the size (3rd argument) is meant to be a multiple of 4 |
; the size (3rd argument) is meant to be a multiple of 4 |
790 |
; mm0..mm3 serves as a 4x4 delay line |
; mm0..mm3 serves as a 4x4 delay line |
791 |
|
|
792 |
.Loop |
.Loop: |
793 |
|
|
794 |
push edi |
push DST_PTR |
795 |
push esi ; esi is preserved for src-mixing |
push SRC_PTR ; SRC_PTR is preserved for src-mixing |
796 |
mov edx, esi |
mov TMP1, SRC_PTR |
797 |
|
|
798 |
; output rows [0..3], from input rows [0..8] |
; output rows [0..3], from input rows [0..8] |
799 |
|
|
829 |
|
|
830 |
; output rows [4..7], from input rows [1..11] (!!) |
; output rows [4..7], from input rows [1..11] (!!) |
831 |
|
|
832 |
mov esi, [esp] |
mov SRC_PTR, [_ESP] |
833 |
lea edx, [esi+ebp] |
lea TMP1, [SRC_PTR+_EBP] |
834 |
|
|
835 |
lea esi, [esi+4*ebp] ; for src-mixing |
lea SRC_PTR, [SRC_PTR+4*_EBP] ; for src-mixing |
836 |
push esi ; this will be the new value for next round |
push SRC_PTR ; this will be the new value for next round |
837 |
|
|
838 |
movq mm0, mm7 |
movq mm0, mm7 |
839 |
movq mm1, mm7 |
movq mm1, mm7 |
877 |
|
|
878 |
; output rows [8..11], from input rows [5..15] |
; output rows [8..11], from input rows [5..15] |
879 |
|
|
880 |
pop esi |
pop SRC_PTR |
881 |
lea edx, [esi+ebp] |
lea TMP1, [SRC_PTR+_EBP] |
882 |
|
|
883 |
lea esi, [esi+4*ebp] ; for src-mixing |
lea SRC_PTR, [SRC_PTR+4*_EBP] ; for src-mixing |
884 |
push esi ; this will be the new value for next round |
push SRC_PTR ; this will be the new value for next round |
885 |
|
|
886 |
movq mm0, mm7 |
movq mm0, mm7 |
887 |
movq mm1, mm7 |
movq mm1, mm7 |
927 |
|
|
928 |
; output rows [12..15], from input rows [9..16] |
; output rows [12..15], from input rows [9..16] |
929 |
|
|
930 |
pop esi |
pop SRC_PTR |
931 |
lea edx, [esi+ebp] |
lea TMP1, [SRC_PTR+_EBP] |
932 |
|
|
933 |
%if (%1!=0) |
%if (%1!=0) |
934 |
lea esi, [esi+4*ebp] ; for src-mixing |
lea SRC_PTR, [SRC_PTR+4*_EBP] ; for src-mixing |
935 |
%endif |
%endif |
936 |
|
|
937 |
movq mm0, mm7 |
movq mm0, mm7 |
967 |
|
|
968 |
; ... next 4 columns |
; ... next 4 columns |
969 |
|
|
970 |
pop esi |
pop SRC_PTR |
971 |
pop edi |
pop DST_PTR |
972 |
add esi, 4 |
add SRC_PTR, 4 |
973 |
add edi, 4 |
add DST_PTR, 4 |
974 |
sub ecx, 4 |
sub TMP0, 4 |
975 |
jg .Loop |
jg .Loop |
976 |
|
|
977 |
%if (%2==0) && (%1==0) |
%if (%2==0) && (%1==0) |
995 |
; we process one stripe of 4x8 pixels each time |
; we process one stripe of 4x8 pixels each time |
996 |
; the size (3rd argument) is meant to be a multiple of 4 |
; the size (3rd argument) is meant to be a multiple of 4 |
997 |
; mm0..mm3 serves as a 4x4 delay line |
; mm0..mm3 serves as a 4x4 delay line |
998 |
.Loop |
.Loop: |
999 |
|
|
1000 |
push edi |
push DST_PTR |
1001 |
push esi ; esi is preserved for src-mixing |
push SRC_PTR ; SRC_PTR is preserved for src-mixing |
1002 |
mov edx, esi |
mov TMP1, SRC_PTR |
1003 |
|
|
1004 |
; output rows [0..3], from input rows [0..8] |
; output rows [0..3], from input rows [0..8] |
1005 |
|
|
1036 |
|
|
1037 |
; output rows [4..7], from input rows [1..9] |
; output rows [4..7], from input rows [1..9] |
1038 |
|
|
1039 |
mov esi, [esp] |
mov SRC_PTR, [_ESP] |
1040 |
lea edx, [esi+ebp] |
lea TMP1, [SRC_PTR+_EBP] |
1041 |
|
|
1042 |
%if (%1!=0) |
%if (%1!=0) |
1043 |
lea esi, [esi+4*ebp] ; for src-mixing |
lea SRC_PTR, [SRC_PTR+4*_EBP] ; for src-mixing |
1044 |
%endif |
%endif |
1045 |
|
|
1046 |
movq mm0, mm7 |
movq mm0, mm7 |
1076 |
|
|
1077 |
; ... next 4 columns |
; ... next 4 columns |
1078 |
|
|
1079 |
pop esi |
pop SRC_PTR |
1080 |
pop edi |
pop DST_PTR |
1081 |
add esi, 4 |
add SRC_PTR, 4 |
1082 |
add edi, 4 |
add DST_PTR, 4 |
1083 |
sub ecx, 4 |
sub TMP0, 4 |
1084 |
jg .Loop |
jg .Loop |
1085 |
|
|
1086 |
%if (%2==0) && (%1==0) |
%if (%2==0) && (%1==0) |
1097 |
|
|
1098 |
xvid_V_Pass_16_mmx: |
xvid_V_Pass_16_mmx: |
1099 |
V_PASS_16 0, 0 |
V_PASS_16 0, 0 |
1100 |
|
ENDFUNC |
1101 |
xvid_V_Pass_Avrg_16_mmx: |
xvid_V_Pass_Avrg_16_mmx: |
1102 |
V_PASS_16 1, 0 |
V_PASS_16 1, 0 |
1103 |
|
ENDFUNC |
1104 |
xvid_V_Pass_Avrg_Up_16_mmx: |
xvid_V_Pass_Avrg_Up_16_mmx: |
1105 |
V_PASS_16 2, 0 |
V_PASS_16 2, 0 |
1106 |
|
ENDFUNC |
1107 |
|
|
1108 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
1109 |
;// 8x? copy Functions |
;// 8x? copy Functions |
1110 |
|
|
1111 |
xvid_V_Pass_8_mmx: |
xvid_V_Pass_8_mmx: |
1112 |
V_PASS_8 0, 0 |
V_PASS_8 0, 0 |
1113 |
|
ENDFUNC |
1114 |
xvid_V_Pass_Avrg_8_mmx: |
xvid_V_Pass_Avrg_8_mmx: |
1115 |
V_PASS_8 1, 0 |
V_PASS_8 1, 0 |
1116 |
|
ENDFUNC |
1117 |
xvid_V_Pass_Avrg_Up_8_mmx: |
xvid_V_Pass_Avrg_Up_8_mmx: |
1118 |
V_PASS_8 2, 0 |
V_PASS_8 2, 0 |
1119 |
|
ENDFUNC |
1120 |
|
|
1121 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
1122 |
;// 16x? avrg Functions |
;// 16x? avrg Functions |
1123 |
|
|
1124 |
xvid_V_Pass_Add_16_mmx: |
xvid_V_Pass_Add_16_mmx: |
1125 |
V_PASS_16 0, 1 |
V_PASS_16 0, 1 |
1126 |
|
ENDFUNC |
1127 |
xvid_V_Pass_Avrg_Add_16_mmx: |
xvid_V_Pass_Avrg_Add_16_mmx: |
1128 |
V_PASS_16 1, 1 |
V_PASS_16 1, 1 |
1129 |
|
ENDFUNC |
1130 |
xvid_V_Pass_Avrg_Up_Add_16_mmx: |
xvid_V_Pass_Avrg_Up_Add_16_mmx: |
1131 |
V_PASS_16 2, 1 |
V_PASS_16 2, 1 |
1132 |
|
ENDFUNC |
1133 |
|
|
1134 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
1135 |
;// 8x? avrg Functions |
;// 8x? avrg Functions |
1136 |
|
|
1137 |
xvid_V_Pass_8_Add_mmx: |
xvid_V_Pass_8_Add_mmx: |
1138 |
V_PASS_8 0, 1 |
V_PASS_8 0, 1 |
1139 |
|
ENDFUNC |
1140 |
xvid_V_Pass_Avrg_8_Add_mmx: |
xvid_V_Pass_Avrg_8_Add_mmx: |
1141 |
V_PASS_8 1, 1 |
V_PASS_8 1, 1 |
1142 |
|
ENDFUNC |
1143 |
xvid_V_Pass_Avrg_Up_8_Add_mmx: |
xvid_V_Pass_Avrg_Up_8_Add_mmx: |
1144 |
V_PASS_8 2, 1 |
V_PASS_8 2, 1 |
1145 |
|
ENDFUNC |
1146 |
|
|
1147 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
1148 |
|
|
1149 |
|
%undef SRC_PTR |
1150 |
|
%undef DST_PTR |
1151 |
|
|
1152 |
|
%ifidn __OUTPUT_FORMAT__,elf |
1153 |
|
section ".note.GNU-stack" noalloc noexec nowrite progbits |
1154 |
|
%endif |
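; (editor: the empty .note.GNU-stack section marks this object as not |
; needing an executable stack; without it GNU ld conservatively |
; assumes the opposite.) |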
1155 |
|
|