;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - Quarter-pixel interpolation -
; *  Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; *  XviD is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id$
; *
; *************************************************************************/

;/**************************************************************************
; *
; *  History:
; *
; *  22.10.2002  initial coding. unoptimized 'proof of concept',
; *              just to heft the qpel filtering. - Skal -
; *
; *************************************************************************/


%define USE_TABLES      ; in order to use xvid_FIR_x_x_x_x tables
                        ; instead of xvid_Expand_mmx...


bits 32

%macro cglobal 1
%ifdef PREFIX
  global _%1
  %define %1 _%1
%else
  global %1
%endif
%endmacro
%macro cextern 1
%ifdef PREFIX
  extern _%1
  %define %1 _%1
%else
  extern %1
%endif
%endmacro


;//////////////////////////////////////////////////////////////////////
;// Declarations
;//   all signatures are:
;//     void XXX(uint8_t *dst, const uint8_t *src,
;//              int32_t length, int32_t stride, int32_t rounding)
;//////////////////////////////////////////////////////////////////////
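;
; (illustrative note, not part of the original source)
; Seen from C, the routines below could be declared along these lines:
;
;   typedef void (qpel_pass)(uint8_t *dst, const uint8_t *src,
;                            int32_t length, int32_t stride, int32_t rounding);
;   extern qpel_pass xvid_H_Pass_16_mmx, xvid_V_Pass_16_mmx;  /* etc. */
;
; 'length' is consumed one row per iteration by the horizontal passes and
; four columns per iteration by the vertical ones; 'rounding' is the MPEG-4
; rounding-control bit (only bit 0 is looked at).
;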

cglobal xvid_H_Pass_16_mmx
cglobal xvid_H_Pass_Avrg_16_mmx
cglobal xvid_H_Pass_Avrg_Up_16_mmx
cglobal xvid_V_Pass_16_mmx
cglobal xvid_V_Pass_Avrg_16_mmx
cglobal xvid_V_Pass_Avrg_Up_16_mmx
cglobal xvid_H_Pass_8_mmx
cglobal xvid_H_Pass_Avrg_8_mmx
cglobal xvid_H_Pass_Avrg_Up_8_mmx
cglobal xvid_V_Pass_8_mmx
cglobal xvid_V_Pass_Avrg_8_mmx
cglobal xvid_V_Pass_Avrg_Up_8_mmx

cglobal xvid_H_Pass_Add_16_mmx
cglobal xvid_H_Pass_Avrg_Add_16_mmx
cglobal xvid_H_Pass_Avrg_Up_Add_16_mmx
cglobal xvid_V_Pass_Add_16_mmx
cglobal xvid_V_Pass_Avrg_Add_16_mmx
cglobal xvid_V_Pass_Avrg_Up_Add_16_mmx
cglobal xvid_H_Pass_8_Add_mmx
cglobal xvid_H_Pass_Avrg_8_Add_mmx
cglobal xvid_H_Pass_Avrg_Up_8_Add_mmx
cglobal xvid_V_Pass_8_Add_mmx
cglobal xvid_V_Pass_Avrg_8_Add_mmx
cglobal xvid_V_Pass_Avrg_Up_8_Add_mmx

cextern xvid_Expand_mmx

%ifdef USE_TABLES

cextern xvid_FIR_1_0_0_0
cextern xvid_FIR_3_1_0_0
cextern xvid_FIR_6_3_1_0
cextern xvid_FIR_14_3_2_1
cextern xvid_FIR_20_6_3_1
cextern xvid_FIR_20_20_6_3
cextern xvid_FIR_23_19_6_3
cextern xvid_FIR_7_20_20_6
cextern xvid_FIR_6_20_20_6
cextern xvid_FIR_6_20_20_7
cextern xvid_FIR_3_6_20_20
cextern xvid_FIR_3_6_19_23
cextern xvid_FIR_1_3_6_20
cextern xvid_FIR_1_2_3_14
cextern xvid_FIR_0_1_3_6
cextern xvid_FIR_0_0_1_3
cextern xvid_FIR_0_0_0_1

%endif

;//////////////////////////////////////////////////////////////////////

%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif

align 16
Rounder1_MMX:
  times 4 dw 1
Rounder0_MMX:
  times 4 dw 0

align 16
Rounder_QP_MMX:
  times 4 dw 16
  times 4 dw 15
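;
; (illustrative note, not from the original source)
; The PROLOG_* macros further down pick one of these two quadwords with
; eax = rounding & 1:
;   rounding == 0  ->  add 16 before the >> 5
;   rounding == 1  ->  add 15 before the >> 5
; i.e. the usual (sum + 16 - rounding) >> 5 of MPEG-4 quarter-pel filtering.
;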

%ifndef USE_TABLES

align 16

; H-Pass table shared by 16x? and 8x? filters

FIR_R0:  dw 14, -3,  2, -1
align 16
FIR_R1:  dw 23, 19, -6,  3,  -1,  0,  0,  0

FIR_R2:  dw -7, 20, 20, -6,   3, -1,  0,  0

FIR_R3:  dw  3, -6, 20, 20,  -6,  3, -1,  0

FIR_R4:  dw -1,  3, -6, 20,  20, -6,  3, -1

FIR_R5:  dw  0, -1,  3, -6,  20, 20, -6,  3,  -1,  0,  0,  0
align 16
FIR_R6:  dw  0,  0, -1,  3,  -6, 20, 20, -6,   3, -1,  0,  0
align 16
FIR_R7:  dw  0,  0,  0, -1,   3, -6, 20, 20,  -6,  3, -1,  0
align 16
FIR_R8:  dw -1,  3, -6, 20,  20, -6,  3, -1

FIR_R9:  dw  0, -1,  3, -6,  20, 20, -6,  3,  -1,  0,  0,  0
align 16
FIR_R10: dw  0,  0, -1,  3,  -6, 20, 20, -6,   3, -1,  0,  0
align 16
FIR_R11: dw  0,  0,  0, -1,   3, -6, 20, 20,  -6,  3, -1,  0
align 16
FIR_R12: dw -1,  3, -6, 20,  20, -6,  3, -1

FIR_R13: dw  0, -1,  3, -6,  20, 20, -6,  3

FIR_R14: dw  0,  0, -1,  3,  -6, 20, 20, -7

FIR_R15: dw  0,  0,  0, -1,   3, -6, 19, 23

FIR_R16: dw -1,  2, -3, 14

%endif      ; !USE_TABLES

; V-Pass taps

align 16
FIR_Cm7: times 4 dw -7
FIR_Cm6: times 4 dw -6
FIR_Cm3: times 4 dw -3
FIR_Cm1: times 4 dw -1
FIR_C2:  times 4 dw  2
FIR_C3:  times 4 dw  3
FIR_C14: times 4 dw 14
FIR_C19: times 4 dw 19
FIR_C20: times 4 dw 20
FIR_C23: times 4 dw 23

SECTION .text

;//////////////////////////////////////////////////////////////////////
;// Here we go with the Q-Pel mess.
;//  For horizontal passes, we process 4 *output* pixels in parallel.
;//  For vertical ones, we process 4 *input* pixels in parallel.
;//////////////////////////////////////////////////////////////////////
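;
; (reference sketch, illustrative only -- the C below lives in a comment and
;  is not part of the build; it assumes <stdint.h>)
; An interior output sample of either pass is the usual MPEG-4 8-tap filter:
;
;   static int qpel_fir8(const uint8_t *s, int step, int rnd)  /* rnd: 0 or 1 */
;   {
;       int sum = -1*s[0*step] +  3*s[1*step] - 6*s[2*step] + 20*s[3*step]
;               + 20*s[4*step] -  6*s[5*step] + 3*s[6*step] -  1*s[7*step];
;       sum = (sum + 16 - rnd) >> 5;                /* Rounder_QP_MMX + psraw */
;       return sum < 0 ? 0 : sum > 255 ? 255 : sum; /* packuswb does the clip */
;   }
;
; The asymmetric taps above (14,-3,2,-1 / 23,19,-6,3 / ... / -1,2,-3,14) are
; the boundary variants of the same kernel, with out-of-range taps folded
; back onto the edge pixels for the first and last outputs of a block.
;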

%macro PROLOG_NO_AVRG 0
  push esi
  push edi
  push ebp
  mov edi, [esp+16 + 0*4] ; Dst
  mov esi, [esp+16 + 1*4] ; Src
  mov ecx, [esp+16 + 2*4] ; Size
  mov ebp, [esp+16 + 3*4] ; BpS
  mov eax, [esp+16 + 4*4] ; Rnd
  and eax, 1
  movq mm7, [Rounder_QP_MMX+eax*8]  ; rounder
%endmacro

%macro EPILOG_NO_AVRG 0
  pop ebp
  pop edi
  pop esi
  ret
%endmacro

%macro PROLOG_AVRG 0
  push ebx
  push esi
  push edi
  push ebp
  mov edi, [esp+20 + 0*4] ; Dst
  mov esi, [esp+20 + 1*4] ; Src
  mov ecx, [esp+20 + 2*4] ; Size
  mov ebp, [esp+20 + 3*4] ; BpS
  mov eax, [esp+20 + 4*4] ; Rnd
  and eax, 1
  movq mm7, [Rounder_QP_MMX+eax*8]  ; rounder
  lea ebx, [Rounder1_MMX+eax*8]     ; *Rounder2
%endmacro

%macro EPILOG_AVRG 0
  pop ebp
  pop edi
  pop esi
  pop ebx
  ret
%endmacro

;//////////////////////////////////////////////////////////////////////
;//
;// All horizontal passes
;//
;//////////////////////////////////////////////////////////////////////

; macros for USE_TABLES

%macro TLOAD 2     ; %1,%2: src pixels
  movzx eax, byte [esi+%1]
  movzx edx, byte [esi+%2]
  movq mm0, [xvid_FIR_14_3_2_1 + eax*8]
  movq mm3, [xvid_FIR_1_2_3_14 + edx*8]
  paddw mm0, mm7
  paddw mm3, mm7
%endmacro

%macro TACCUM2 5   ; %1:src pixel / %2-%3:taps tables / %4-%5:dst regs
  movzx eax, byte [esi+%1]
  paddw %4, [%2 + eax*8]
  paddw %5, [%3 + eax*8]
%endmacro

%macro TACCUM3 7   ; %1:src pixel / %2-%4:taps tables / %5-%7:dst regs
  movzx eax, byte [esi+%1]
  paddw %5, [%2 + eax*8]
  paddw %6, [%3 + eax*8]
  paddw %7, [%4 + eax*8]
%endmacro

;//////////////////////////////////////////////////////////////////////

; macros without USE_TABLES

%macro LOAD 2      ; %1,%2: src pixels
  movzx eax, byte [esi+%1]
  movzx edx, byte [esi+%2]
  movq mm0, [xvid_Expand_mmx + eax*8]
  movq mm3, [xvid_Expand_mmx + edx*8]
  pmullw mm0, [FIR_R0]
  pmullw mm3, [FIR_R16]
  paddw mm0, mm7
  paddw mm3, mm7
%endmacro

%macro ACCUM2 4    ; src pixel / taps / dst regs #1-#2
  movzx eax, byte [esi+%1]
  movq mm4, [xvid_Expand_mmx + eax*8]
  movq mm5, mm4
  pmullw mm4, [%2]
  pmullw mm5, [%2+8]
  paddw %3, mm4
  paddw %4, mm5
%endmacro

%macro ACCUM3 5    ; src pixel / taps / dst regs #1-#2-#3
  movzx eax, byte [esi+%1]
  movq mm4, [xvid_Expand_mmx + eax*8]
  movq mm5, mm4
  movq mm6, mm5
  pmullw mm4, [%2]
  pmullw mm5, [%2+ 8]
  pmullw mm6, [%2+16]
  paddw %3, mm4
  paddw %4, mm5
  paddw %5, mm6
%endmacro

;//////////////////////////////////////////////////////////////////////

%macro MIX 3       ; %1:reg, %2:src, %3:rounder
  pxor mm6, mm6
  movq mm4, [%2]
  movq mm1, %1
  movq mm5, mm4
  punpcklbw %1, mm6
  punpcklbw mm4, mm6
  punpckhbw mm1, mm6
  punpckhbw mm5, mm6
  movq mm6, [%3]   ; rounder #2
  paddusw %1, mm4
  paddusw mm1, mm5
  paddusw %1, mm6
  paddusw mm1, mm6
  psrlw %1, 1
  psrlw mm1, 1
  packuswb %1, mm1
%endmacro

;//////////////////////////////////////////////////////////////////////

%macro H_PASS_16 2 ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

.Loop

  ; mm0..mm3 serve as a 4x4 delay line

%ifndef USE_TABLES

  LOAD 0, 16   ; special case for first/last pixel
  movq mm1, mm7
  movq mm2, mm7

  ACCUM2 1,  FIR_R1, mm0, mm1
  ACCUM2 2,  FIR_R2, mm0, mm1
  ACCUM2 3,  FIR_R3, mm0, mm1
  ACCUM2 4,  FIR_R4, mm0, mm1

  ACCUM3 5,  FIR_R5,  mm0, mm1, mm2
  ACCUM3 6,  FIR_R6,  mm0, mm1, mm2
  ACCUM3 7,  FIR_R7,  mm0, mm1, mm2
  ACCUM2 8,  FIR_R8,  mm1, mm2
  ACCUM3 9,  FIR_R9,  mm1, mm2, mm3
  ACCUM3 10, FIR_R10, mm1, mm2, mm3
  ACCUM3 11, FIR_R11, mm1, mm2, mm3

  ACCUM2 12, FIR_R12, mm2, mm3
  ACCUM2 13, FIR_R13, mm2, mm3
  ACCUM2 14, FIR_R14, mm2, mm3
  ACCUM2 15, FIR_R15, mm2, mm3

%else

  TLOAD 0, 16   ; special case for first/last pixel
  movq mm1, mm7
  movq mm2, mm7

  TACCUM2 1, xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm1
  TACCUM2 2, xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1
  TACCUM2 3, xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1
  TACCUM2 4, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1, mm0, mm1

  TACCUM3 5, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0 , mm0, mm1, mm2
  TACCUM3 6, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1, mm2
  TACCUM3 7, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1, mm2

  TACCUM2 8, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm1, mm2

  TACCUM3 9,  xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0, mm1, mm2, mm3
  TACCUM3 10, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0, mm1, mm2, mm3
  TACCUM3 11, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0, mm1, mm2, mm3

  TACCUM2 12, xvid_FIR_1_3_6_20, xvid_FIR_20_6_3_1 , mm2, mm3
  TACCUM2 13, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm2, mm3
  TACCUM2 14, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm2, mm3
  TACCUM2 15, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm2, mm3

%endif

  psraw mm0, 5
  psraw mm1, 5
  psraw mm2, 5
  psraw mm3, 5
  packuswb mm0, mm1
  packuswb mm2, mm3

%if (%1==1)
  MIX mm0, esi, ebx
%elif (%1==2)
  MIX mm0, esi+1, ebx
%endif
%if (%2==1)
  MIX mm0, edi, Rounder1_MMX
%endif

%if (%1==1)
  MIX mm2, esi+8, ebx
%elif (%1==2)
  MIX mm2, esi+9, ebx
%endif
%if (%2==1)
  MIX mm2, edi+8, Rounder1_MMX
%endif

  lea esi, [esi+ebp]

  movq [edi+0], mm0
  movq [edi+8], mm2

  add edi, ebp
  dec ecx
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro


;//////////////////////////////////////////////////////////////////////

%macro H_PASS_8 2 ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

.Loop
  ; mm0..mm3 serve as a 4x4 delay line

%ifndef USE_TABLES

  LOAD 0, 8   ; special case for first/last pixel
  ACCUM2 1, FIR_R1, mm0, mm3
  ACCUM2 2, FIR_R2, mm0, mm3
  ACCUM2 3, FIR_R3, mm0, mm3
  ACCUM2 4, FIR_R4, mm0, mm3

  ACCUM2 5, FIR_R13, mm0, mm3
  ACCUM2 6, FIR_R14, mm0, mm3
  ACCUM2 7, FIR_R15, mm0, mm3

%else

%if 0   ; test with no unrolling

  TLOAD 0, 8   ; special case for first/last pixel
  TACCUM2 1, xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0  , mm0, mm3
  TACCUM2 2, xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0  , mm0, mm3
  TACCUM2 3, xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0  , mm0, mm3
  TACCUM2 4, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm0, mm3
  TACCUM2 5, xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, mm0, mm3
  TACCUM2 6, xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_7, mm0, mm3
  TACCUM2 7, xvid_FIR_0_0_0_1  , xvid_FIR_3_6_19_23, mm0, mm3

%else   ; test with unrolling (a little faster, but not much)

  movzx eax, byte [esi]
  movzx edx, byte [esi+8]
  movq mm0, [xvid_FIR_14_3_2_1 + eax*8]
  movzx eax, byte [esi+1]
  movq mm3, [xvid_FIR_1_2_3_14 + edx*8]
  paddw mm0, mm7
  paddw mm3, mm7

  movzx edx, byte [esi+2]
  paddw mm0, [xvid_FIR_23_19_6_3 + eax*8]
  paddw mm3, [xvid_FIR_1_0_0_0   + eax*8]

  movzx eax, byte [esi+3]
  paddw mm0, [xvid_FIR_7_20_20_6 + edx*8]
  paddw mm3, [xvid_FIR_3_1_0_0   + edx*8]

  movzx edx, byte [esi+4]
  paddw mm0, [xvid_FIR_3_6_20_20 + eax*8]
  paddw mm3, [xvid_FIR_6_3_1_0   + eax*8]

  movzx eax, byte [esi+5]
  paddw mm0, [xvid_FIR_1_3_6_20  + edx*8]
  paddw mm3, [xvid_FIR_20_6_3_1  + edx*8]

  movzx edx, byte [esi+6]
  paddw mm0, [xvid_FIR_0_1_3_6   + eax*8]
  paddw mm3, [xvid_FIR_20_20_6_3 + eax*8]

  movzx eax, byte [esi+7]
  paddw mm0, [xvid_FIR_0_0_1_3   + edx*8]
  paddw mm3, [xvid_FIR_6_20_20_7 + edx*8]

  paddw mm0, [xvid_FIR_0_0_0_1   + eax*8]
  paddw mm3, [xvid_FIR_3_6_19_23 + eax*8]

%endif

%endif   ; !USE_TABLES

  psraw mm0, 5
  psraw mm3, 5
  packuswb mm0, mm3

%if (%1==1)
  MIX mm0, esi, ebx
%elif (%1==2)
  MIX mm0, esi+1, ebx
%endif
%if (%2==1)
  MIX mm0, edi, Rounder1_MMX
%endif

  movq [edi], mm0

  add edi, ebp
  add esi, ebp
  dec ecx
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro

;//////////////////////////////////////////////////////////////////////
;// 16x? copy Functions

xvid_H_Pass_16_mmx:
  H_PASS_16 0, 0
xvid_H_Pass_Avrg_16_mmx:
  H_PASS_16 1, 0
xvid_H_Pass_Avrg_Up_16_mmx:
  H_PASS_16 2, 0

;//////////////////////////////////////////////////////////////////////
;// 8x? copy Functions

xvid_H_Pass_8_mmx:
  H_PASS_8 0, 0
xvid_H_Pass_Avrg_8_mmx:
  H_PASS_8 1, 0
xvid_H_Pass_Avrg_Up_8_mmx:
  H_PASS_8 2, 0

;//////////////////////////////////////////////////////////////////////
;// 16x? avrg Functions

xvid_H_Pass_Add_16_mmx:
  H_PASS_16 0, 1
xvid_H_Pass_Avrg_Add_16_mmx:
  H_PASS_16 1, 1
xvid_H_Pass_Avrg_Up_Add_16_mmx:
  H_PASS_16 2, 1

;//////////////////////////////////////////////////////////////////////
;// 8x? avrg Functions

xvid_H_Pass_8_Add_mmx:
  H_PASS_8 0, 1
xvid_H_Pass_Avrg_8_Add_mmx:
  H_PASS_8 1, 1
xvid_H_Pass_Avrg_Up_8_Add_mmx:
  H_PASS_8 2, 1



;//////////////////////////////////////////////////////////////////////
;//
;// All vertical passes
;//
;//////////////////////////////////////////////////////////////////////

%macro V_LOAD 1    ; %1=Last?

  movd mm4, [edx]
  pxor mm6, mm6
%if (%1==0)
  add edx, ebp
%endif
  punpcklbw mm4, mm6

%endmacro

%macro V_ACC1 2    ; %1:reg, %2:tap
  pmullw mm4, [%2]
  paddw %1, mm4
%endmacro

%macro V_ACC2 4    ; %1-%2: regs, %3-%4: taps
  movq mm5, mm4
  movq mm6, mm4
  pmullw mm5, [%3]
  pmullw mm6, [%4]
  paddw %1, mm5
  paddw %2, mm6
%endmacro

%macro V_ACC2l 4   ; %1-%2: regs, %3-%4: taps
  movq mm5, mm4
  pmullw mm5, [%3]
  pmullw mm4, [%4]
  paddw %1, mm5
  paddw %2, mm4
%endmacro

%macro V_ACC4 8    ; %1-%4: regs, %5-%8: taps
  V_ACC2  %1,%2, %5,%6
  V_ACC2l %3,%4, %7,%8
%endmacro


%macro V_MIX 3     ; %1:dst-reg, %2:src, %3:rounder
  pxor mm6, mm6
  movq mm4, [%2]
  punpcklbw %1, mm6
  punpcklbw mm4, mm6
  paddusw %1, mm4
  paddusw %1, [%3]
  psrlw %1, 1
  packuswb %1, %1
%endmacro

%macro V_STORE 4   ; %1-%2: mix ops, %3: reg, %4:last?

  psraw %3, 5
  packuswb %3, %3

%if (%1==1)
  V_MIX %3, esi, ebx
  add esi, ebp
%elif (%1==2)
  add esi, ebp
  V_MIX %3, esi, ebx
%endif
%if (%2==1)
  V_MIX %3, edi, Rounder1_MMX
%endif

  movd eax, %3
  mov [edi], eax

%if (%4==0)
  add edi, ebp
%endif

%endmacro

;//////////////////////////////////////////////////////////////////////

%macro V_PASS_16 2 ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

  ; we process one stripe of 4x16 pixels each time.
  ; the size (3rd argument) is meant to be a multiple of 4
  ; mm0..mm3 serve as a 4x4 delay line
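  ;
  ; (illustrative reading, not from the original source)
  ; Each trip through .Loop handles one 4-pixel-wide column stripe: edx walks
  ; down the input rows while mm0..mm3 hold the output rows that are still
  ; accumulating taps from the current input row -- roughly:
  ;
  ;   for (x = 0; x < length; x += 4)          /* ecx is decremented by 4 */
  ;       for each input row y of the stripe   /* edx steps by BpS        */
  ;           acc[rows still pending] += tap(y) * src[y][x..x+3];
  ;
  ; V_STORE then shifts, packs and writes each output row as soon as its
  ; last tap has been accumulated.
  ;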

.Loop

  push edi
  push esi      ; esi is preserved for src-mixing
  mov edx, esi

  ; output rows [0..3], from input rows [0..8]

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6
  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0

  ; output rows [4..7], from input rows [1..11] (!!)

  mov esi, [esp]
  lea edx, [esi+ebp]

  lea esi, [esi+4*ebp]  ; for src-mixing
  push esi              ; this will be the new value for the next round

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm0, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm0, mm1, FIR_C3, FIR_Cm1

  V_LOAD 0
  V_ACC2 mm0, mm1, FIR_Cm6, FIR_C3
  V_ACC1 mm2, FIR_Cm1

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6
  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0

  ; output rows [8..11], from input rows [5..15]

  pop esi
  lea edx, [esi+ebp]

  lea esi, [esi+4*ebp]  ; for src-mixing
  push esi              ; this will be the new value for the next round

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm0, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm0, mm1, FIR_C3, FIR_Cm1

  V_LOAD 0
  V_ACC2 mm0, mm1, FIR_Cm6, FIR_C3
  V_ACC1 mm2, FIR_Cm1

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20

  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6
  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0


  ; output rows [12..15], from input rows [9..16]

  pop esi
  lea edx, [esi+ebp]

%if (%1!=0)
  lea esi, [esi+4*ebp]  ; for src-mixing
%endif

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm3, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 1
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1

  V_STORE %1, %2, mm3, 0
  V_STORE %1, %2, mm2, 0
  V_STORE %1, %2, mm1, 0
  V_STORE %1, %2, mm0, 1

  ; ... next 4 columns

  pop esi
  pop edi
  add esi, 4
  add edi, 4
  sub ecx, 4
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro

;//////////////////////////////////////////////////////////////////////

%macro V_PASS_8 2 ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

  ; we process one stripe of 4x8 pixels each time
  ; the size (3rd argument) is meant to be a multiple of 4
  ; mm0..mm3 serve as a 4x4 delay line
.Loop

  push edi
  push esi      ; esi is preserved for src-mixing
  mov edx, esi

  ; output rows [0..3], from input rows [0..8]

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6

  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0

  ; output rows [4..7], from input rows [1..9]

  mov esi, [esp]
  lea edx, [esi+ebp]

%if (%1!=0)
  lea esi, [esi+4*ebp]  ; for src-mixing
%endif

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm3, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 1
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1

  V_STORE %1, %2, mm3, 0
  V_STORE %1, %2, mm2, 0
  V_STORE %1, %2, mm1, 0
  V_STORE %1, %2, mm0, 1

  ; ... next 4 columns

  pop esi
  pop edi
  add esi, 4
  add edi, 4
  sub ecx, 4
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro


;//////////////////////////////////////////////////////////////////////
;// 16x? copy Functions

xvid_V_Pass_16_mmx:
  V_PASS_16 0, 0
xvid_V_Pass_Avrg_16_mmx:
  V_PASS_16 1, 0
xvid_V_Pass_Avrg_Up_16_mmx:
  V_PASS_16 2, 0

;//////////////////////////////////////////////////////////////////////
;// 8x? copy Functions

xvid_V_Pass_8_mmx:
  V_PASS_8 0, 0
xvid_V_Pass_Avrg_8_mmx:
  V_PASS_8 1, 0
xvid_V_Pass_Avrg_Up_8_mmx:
  V_PASS_8 2, 0

;//////////////////////////////////////////////////////////////////////
;// 16x? avrg Functions

xvid_V_Pass_Add_16_mmx:
  V_PASS_16 0, 1
xvid_V_Pass_Avrg_Add_16_mmx:
  V_PASS_16 1, 1
xvid_V_Pass_Avrg_Up_Add_16_mmx:
  V_PASS_16 2, 1

;//////////////////////////////////////////////////////////////////////
;// 8x? avrg Functions

xvid_V_Pass_8_Add_mmx:
  V_PASS_8 0, 1
xvid_V_Pass_Avrg_8_Add_mmx:
  V_PASS_8 1, 1
xvid_V_Pass_Avrg_Up_8_Add_mmx:
  V_PASS_8 2, 1

;//////////////////////////////////////////////////////////////////////