;/*****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * Reduced-Resolution utilities
; *
; * Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; * This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; * XviD is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * Under section 8 of the GNU General Public License, the copyright
; * holders of XVID explicitly forbid distribution in the following
; * countries:
; *
; * - Japan
; * - United States of America
; *
; * Linking XviD statically or dynamically with other modules is making a
; * combined work based on XviD. Thus, the terms and conditions of the
; * GNU General Public License cover the whole combination.
; *
; * As a special exception, the copyright holders of XviD give you
; * permission to link XviD with independent modules that communicate with
; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; * license terms of these independent modules, and to copy and distribute
; * the resulting combined work under terms of your choice, provided that
; * every copy of the combined work is accompanied by a complete copy of
; * the source code of XviD (the version of XviD used to produce the
; * combined work), being distributed under the terms of the GNU General
; * Public License plus this exception. An independent module is a module
; * which is not derived from or based on XviD.
; *
; * Note that people who make modified versions of XviD are not obligated
; * to grant this special exception for their modified versions; it is
; * their choice whether to do so. The GNU General Public License gives
; * permission to release a modified version without this exception; this
; * exception also makes it possible to release a modified version which
; * carries forward this exception.
; *
; * $Id$
; *
; *************************************************************************/

bits 32

%macro cglobal 1
  %ifdef PREFIX
    global _%1
    %define %1 _%1
  %else
    global %1
  %endif
%endmacro
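
; (With -DPREFIX -- customary on targets whose C symbols carry a leading
;  underscore, such as Win32 -- "cglobal foo" exports the label as _foo
;  while the rest of the source keeps using the plain name.)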

;===========================================================================

section .data

align 16
Up31    dw 3, 1, 3, 1
Up13    dw 1, 3, 1, 3
Up93    dw 9, 3, 9, 3
Up39    dw 3, 9, 3, 9
Cst0    dw 0, 0, 0, 0
Cst2    dw 2, 2, 2, 2
Cst3    dw 3, 3, 3, 3
Cst32   dw 32,32,32,32
Cst2000 dw 2, 0, 0, 0
Cst0002 dw 0, 0, 0, 2

Mask_ff dw 0xff,0xff,0xff,0xff

;===========================================================================

section .text

cglobal xvid_Copy_Upsampled_8x8_16To8_mmx
cglobal xvid_Add_Upsampled_8x8_16To8_mmx
cglobal xvid_Copy_Upsampled_8x8_16To8_xmm
cglobal xvid_Add_Upsampled_8x8_16To8_xmm

cglobal xvid_HFilter_31_mmx
cglobal xvid_VFilter_31_x86
cglobal xvid_HFilter_31_x86

cglobal xvid_Filter_18x18_To_8x8_mmx
cglobal xvid_Filter_Diff_18x18_To_8x8_mmx

;//////////////////////////////////////////////////////////////////////
;// 8x8 -> 16x16 upsampling (16b)
;//////////////////////////////////////////////////////////////////////

%macro MUL_PACK 4   ; %1/%2: regs, %3/%4: Up13/Up31
  pmullw %1, %3     ; [Up13]
  pmullw mm4, %4    ; [Up31]
  pmullw %2, %3     ; [Up13]
  pmullw mm5, %4    ; [Up31]
  paddsw %1, [Cst2]
  paddsw %2, [Cst2]
  paddsw %1, mm4
  paddsw %2, mm5
%endmacro

; MMX way of reordering columns...

%macro COL03 3      ; %1/%2: regs, %3: row  -  output: mm4/mm5
  movq %1, [edx+%3*16+0*2]  ; %1  = 0|1|2|3
  movq %2, [edx+%3*16+1*2]  ; %2  = 1|2|3|4
  movq mm5, %1              ; mm5 = 0|1|2|3
  movq mm4, %1              ; mm4 = 0|1|2|3
  punpckhwd mm5, %2         ; mm5 = 2|3|3|4
  punpcklwd mm4, %2         ; mm4 = 0|1|1|2
  punpcklwd %1, %1          ; %1  = 0|0|1|1
  punpcklwd %2, mm5         ; %2  = 1|2|2|3
  punpcklwd %1, mm4         ; %1  = 0|0|0|1
%endmacro

%macro COL47 3      ; %1/%2: regs, %3: row  -  output: mm4/mm5
  movq mm5, [edx+%3*16+4*2] ; mm5 = 4|5|6|7
  movq %1, [edx+%3*16+3*2]  ; %1  = 3|4|5|6
  movq %2, mm5              ; %2  = 4|5|6|7
  movq mm4, mm5             ; mm4 = 4|5|6|7
  punpckhwd %2, %2          ; %2  = 6|6|7|7
  punpckhwd mm5, %2         ; mm5 = 6|7|7|7
  movq %2, %1               ; %2  = 3|4|5|6
  punpcklwd %1, mm4         ; %1  = 3|4|4|5
  punpckhwd %2, mm4         ; %2  = 5|6|6|7
  punpcklwd mm4, %2         ; mm4 = 4|5|5|6
%endmacro

%macro MIX_ROWS 4   ; %1/%2: prev, %3/%4: cur (preserved) - mm4/mm5: output
  ; we need to perform: (%1,%3) -> (%1 = 3*%1+%3, mm4 = 3*%3+%1), %3 preserved.
  movq mm4, [Cst3]
  movq mm5, [Cst3]
  pmullw mm4, %3
  pmullw mm5, %4
  paddsw mm4, %1
  paddsw mm5, %2
  pmullw %1, [Cst3]
  pmullw %2, [Cst3]
  paddsw %1, %3
  paddsw %2, %4
%endmacro
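
; For reference, a minimal C sketch of what COL0*/MUL_PACK/MIX_ROWS compute
; together (illustrative only, not part of XviD's API): each output pixel is
; a {3,1}x{3,1}-weighted blend of the 4 nearest source samples, with edge
; replication and a +8 rounder before the final >>4. Note the MMX code uses
; saturating adds (paddsw), so extreme out-of-range inputs may differ.
;
;   static int16_t at(const int16_t *s, int x, int y) { /* replicate edges */
;     x = x < 0 ? 0 : (x > 7 ? 7 : x);
;     y = y < 0 ? 0 : (y > 7 ? 7 : y);
;     return s[8*y + x];
;   }
;   static void upsample_ref(uint8_t *dst, const int16_t *src, int BpS) {
;     for (int y = 0; y < 16; ++y)
;       for (int x = 0; x < 16; ++x) {
;         int nx = (x & 1) ? x/2 + 1 : x/2 - 1;   /* weight-1 neighbour */
;         int ny = (y & 1) ? y/2 + 1 : y/2 - 1;
;         int v = 9*at(src, x/2, y/2) + 3*at(src, nx, y/2)
;               + 3*at(src, x/2, ny)  +   at(src, nx, ny);
;         v = (v + 8) >> 4;
;         dst[y*BpS + x] = v < 0 ? 0 : (v > 255 ? 255 : v);
;       }
;   }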

;===========================================================================
;
; void xvid_Copy_Upsampled_8x8_16To8_mmx(uint8_t *Dst,
;                                        const int16_t *Src, const int BpS);
;
;===========================================================================

; Note: we can use ">>2" instead of "/4" here, since we
; are (supposed to be) averaging positive values

%macro STORE_1 2
  psraw %1, 2
  psraw %2, 2
  packuswb %1, %2
  movq [ecx], %1
%endmacro

%macro STORE_2 2    ; pack and store (%1,%2) + (mm4,mm5)
  psraw %1, 4
  psraw %2, 4
  psraw mm4, 4
  psraw mm5, 4
  packuswb %1, %2
  packuswb mm4, mm5
  movq [ecx], %1
  movq [ecx+eax], mm4
  lea ecx, [ecx+2*eax]
%endmacro

;//////////////////////////////////////////////////////////////////////

align 16
xvid_Copy_Upsampled_8x8_16To8_mmx:  ; 344c

  mov ecx, [esp+4]  ; Dst
  mov edx, [esp+8]  ; Src
  mov eax, [esp+12] ; BpS

  movq mm6, [Up13]
  movq mm7, [Up31]

  COL03 mm0, mm1, 0
  MUL_PACK mm0,mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1 mm4, mm5
  add ecx, eax

  COL03 mm2, mm3, 1
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL03 mm0, mm1, 2
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL03 mm2, mm3, 3
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL03 mm0, mm1, 4
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL03 mm2, mm3, 5
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL03 mm0, mm1, 6
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL03 mm2, mm3, 7
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  STORE_1 mm2, mm3

  mov ecx, [esp+4]
  add ecx, 8

  COL47 mm0, mm1, 0
  MUL_PACK mm0,mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1 mm4, mm5
  add ecx, eax

  COL47 mm2, mm3, 1
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL47 mm0, mm1, 2
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL47 mm2, mm3, 3
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL47 mm0, mm1, 4
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL47 mm2, mm3, 5
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL47 mm0, mm1, 6
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL47 mm2, mm3, 7
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  STORE_1 mm2, mm3

  ret

;===========================================================================
;
; void xvid_Add_Upsampled_8x8_16To8_mmx(uint8_t *Dst,
;                                       const int16_t *Src, const int BpS);
;
;===========================================================================

; Note: grrr... the 'pcmpgtw' stuff implements the "/4" and "/16" operators,
; built on ">>2" and ">>4" through the identities:
;   x/4  = ( (x-(x<0))>>2 ) + (x<0)
;   x/16 = ( (x-(x<0))>>4 ) + (x<0)
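
; A minimal C sketch of this branchless truncating division (with (x<0)
; taken as 0/1, as in the identities above):
;
;   static int div4_trunc(int x) {
;     int neg = x < 0;               /* the MMX code instead builds a
;                                       0/-1 mask via pcmpgtw */
;     return ((x - neg) >> 2) + neg; /* == x/4, rounding toward zero */
;   }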

%macro STORE_ADD_1 2
  ; We subtract the rounder '2' for corner pixels,
  ; since when 'x' is negative, (x*4 + 2)/4 is *not*
  ; equal to 'x'. In fact, the correct relation is:
  ;   (x*4 + 2)/4 = x + (x<0)
  ; So, better revert to (x*4)/4 = x.

  psubsw %1, [Cst2000]
  psubsw %2, [Cst0002]
  pxor mm6, mm6
  pxor mm7, mm7
  pcmpgtw mm6, %1
  pcmpgtw mm7, %2
  paddsw %1, mm6
  paddsw %2, mm7
  psraw %1, 2
  psraw %2, 2
  psubsw %1, mm6
  psubsw %2, mm7

  ; mix with destination [ecx]
  movq mm6, [ecx]
  movq mm7, [ecx]
  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  paddsw %1, mm6
  paddsw %2, mm7
  packuswb %1, %2
  movq [ecx], %1
%endmacro

%macro STORE_ADD_2 2
  pxor mm6, mm6
  pxor mm7, mm7
  pcmpgtw mm6, %1
  pcmpgtw mm7, %2
  paddsw %1, mm6
  paddsw %2, mm7
  psraw %1, 4
  psraw %2, 4
  psubsw %1, mm6
  psubsw %2, mm7

  pxor mm6, mm6
  pxor mm7, mm7
  pcmpgtw mm6, mm4
  pcmpgtw mm7, mm5
  paddsw mm4, mm6
  paddsw mm5, mm7
  psraw mm4, 4
  psraw mm5, 4
  psubsw mm4, mm6
  psubsw mm5, mm7

  ; mix with destination
  movq mm6, [ecx]
  movq mm7, [ecx]
  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  paddsw %1, mm6
  paddsw %2, mm7

  movq mm6, [ecx+eax]
  movq mm7, [ecx+eax]

  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  paddsw mm4, mm6
  paddsw mm5, mm7

  packuswb %1, %2
  packuswb mm4, mm5

  movq [ecx], %1
  movq [ecx+eax], mm4

  lea ecx, [ecx+2*eax]
%endmacro

;//////////////////////////////////////////////////////////////////////

align 16
xvid_Add_Upsampled_8x8_16To8_mmx:   ; 579c

  mov ecx, [esp+4]  ; Dst
  mov edx, [esp+8]  ; Src
  mov eax, [esp+12] ; BpS

  COL03 mm0, mm1, 0
  MUL_PACK mm0,mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add ecx, eax

  COL03 mm2, mm3, 1
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03 mm0, mm1, 2
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03 mm2, mm3, 3
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03 mm0, mm1, 4
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03 mm2, mm3, 5
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03 mm0, mm1, 6
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03 mm2, mm3, 7
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  STORE_ADD_1 mm2, mm3

  mov ecx, [esp+4]
  add ecx, 8

  COL47 mm0, mm1, 0
  MUL_PACK mm0,mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add ecx, eax

  COL47 mm2, mm3, 1
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47 mm0, mm1, 2
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47 mm2, mm3, 3
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47 mm0, mm1, 4
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47 mm2, mm3, 5
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47 mm0, mm1, 6
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47 mm2, mm3, 7
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  STORE_ADD_1 mm2, mm3

  ret

;===========================================================================
;
; void xvid_Copy_Upsampled_8x8_16To8_xmm(uint8_t *Dst,
;                                        const int16_t *Src, const int BpS);
;
;===========================================================================

; the xmm version can take (a little) advantage of 'pshufw'

%macro COL03_SSE 3  ; %1/%2: regs, %3: row  -  trashes mm4/mm5
  movq %2, [edx+%3*16+0*2]                       ; <- 0|1|2|3
  pshufw %1, %2, (0+0*4+0*16+1*64)               ; %1  = 0|0|0|1
  pshufw mm4, %2, (0+1*4+1*16+2*64)              ; mm4 = 0|1|1|2
  pshufw %2, %2, (1+2*4+2*16+3*64)               ; %2  = 1|2|2|3
  pshufw mm5, [edx+%3*16+2*2], (0+1*4+1*16+2*64) ; mm5 = 2|3|3|4
%endmacro

%macro COL47_SSE 3  ; %1/%2: regs, %3: row  -  trashes mm4/mm5
  pshufw %1, [edx+%3*16+2*2], (1+2*4+2*16+3*64)  ; %1  = 3|4|4|5
  movq mm5, [edx+%3*16+4*2]                      ; <- 4|5|6|7
  pshufw mm4, mm5, (0+1*4+1*16+2*64)             ; mm4 = 4|5|5|6
  pshufw %2, mm5, (1+2*4+2*16+3*64)              ; %2  = 5|6|6|7
  pshufw mm5, mm5, (2+3*4+3*16+3*64)             ; mm5 = 6|7|7|7
%endmacro
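
; (Reminder on the immediates: in (a + b*4 + c*16 + d*64), bits [2i+1:2i]
;  select the source word copied into destination word i, so the result
;  reads src[a]|src[b]|src[c]|src[d] in the a|b|c|d notation above.)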

;//////////////////////////////////////////////////////////////////////

align 16
xvid_Copy_Upsampled_8x8_16To8_xmm:  ; 315c

  mov ecx, [esp+4]  ; Dst
  mov edx, [esp+8]  ; Src
  mov eax, [esp+12] ; BpS

  movq mm6, [Up13]
  movq mm7, [Up31]

  COL03_SSE mm0, mm1, 0
  MUL_PACK mm0,mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1 mm4, mm5
  add ecx, eax

  COL03_SSE mm2, mm3, 1
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL03_SSE mm0, mm1, 2
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL03_SSE mm2, mm3, 3
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL03_SSE mm0, mm1, 4
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL03_SSE mm2, mm3, 5
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL03_SSE mm0, mm1, 6
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL03_SSE mm2, mm3, 7
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  STORE_1 mm2, mm3

  mov ecx, [esp+4]
  add ecx, 8

  COL47_SSE mm0, mm1, 0
  MUL_PACK mm0,mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1 mm4, mm5
  add ecx, eax

  COL47_SSE mm2, mm3, 1
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL47_SSE mm0, mm1, 2
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL47_SSE mm2, mm3, 3
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL47_SSE mm0, mm1, 4
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL47_SSE mm2, mm3, 5
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL47_SSE mm0, mm1, 6
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL47_SSE mm2, mm3, 7
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  STORE_1 mm2, mm3

  ret

;===========================================================================
;
; void xvid_Add_Upsampled_8x8_16To8_xmm(uint8_t *Dst,
;                                       const int16_t *Src, const int BpS);
;
;===========================================================================

align 16
xvid_Add_Upsampled_8x8_16To8_xmm:   ; 549c

  mov ecx, [esp+4]  ; Dst
  mov edx, [esp+8]  ; Src
  mov eax, [esp+12] ; BpS

  COL03_SSE mm0, mm1, 0
  MUL_PACK mm0,mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add ecx, eax

  COL03_SSE mm2, mm3, 1
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03_SSE mm0, mm1, 2
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03_SSE mm2, mm3, 3
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03_SSE mm0, mm1, 4
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03_SSE mm2, mm3, 5
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03_SSE mm0, mm1, 6
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03_SSE mm2, mm3, 7
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  STORE_ADD_1 mm2, mm3

  mov ecx, [esp+4]
  add ecx, 8

  COL47_SSE mm0, mm1, 0
  MUL_PACK mm0,mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add ecx, eax

  COL47_SSE mm2, mm3, 1
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47_SSE mm0, mm1, 2
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47_SSE mm2, mm3, 3
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47_SSE mm0, mm1, 4
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47_SSE mm2, mm3, 5
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47_SSE mm0, mm1, 6
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47_SSE mm2, mm3, 7
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  STORE_ADD_1 mm2, mm3

  ret

;===========================================================================
;
; void xvid_HFilter_31_mmx(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);
; void xvid_VFilter_31_x86(uint8_t *Src1, uint8_t *Src2, const int BpS, int Nb_Blks);
; void xvid_HFilter_31_x86(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);
;
;===========================================================================

;//////////////////////////////////////////////////////////////////////
;// horizontal/vertical filtering: [x,y] -> [ (3x+y+2)>>2, (x+3y+2)>>2 ]
;//
;// We use the trick: tmp = (x+y+2) -> [x = (tmp+2x)>>2, y = (tmp+2y)>>2]
;//////////////////////////////////////////////////////////////////////
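
; A minimal C sketch of the same trick on one pixel pair (names are
; illustrative; the MMX version below does 4 pixels at a time):
;
;   static void filter31(uint8_t *a, uint8_t *b) {
;     int tmp = *a + *b + 2;
;     int x = (tmp + 2 * *a) >> 2;   /* (3a + b + 2) >> 2 */
;     int y = (tmp + 2 * *b) >> 2;   /* (a + 3b + 2) >> 2 */
;     *a = (uint8_t)x;               /* always fits in a byte */
;     *b = (uint8_t)y;
;   }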

align 16
xvid_HFilter_31_mmx:
  push esi
  push edi
  mov esi, [esp+4 +8]   ; Src1
  mov edi, [esp+8 +8]   ; Src2
  mov eax, [esp+12 +8]  ; Nb_Blks
  lea eax, [eax*2]
  movq mm5, [Cst2]
  pxor mm7, mm7

  lea esi, [esi+eax*4]
  lea edi, [edi+eax*4]

  neg eax

.Loop:    ; 12c
  movd mm0, [esi+eax*4]
  movd mm1, [edi+eax*4]
  movq mm2, mm5
  punpcklbw mm0, mm7
  punpcklbw mm1, mm7
  paddsw mm2, mm0
  paddsw mm0, mm0
  paddsw mm2, mm1
  paddsw mm1, mm1
  paddsw mm0, mm2
  paddsw mm1, mm2
  psraw mm0, 2
  psraw mm1, 2
  packuswb mm0, mm7
  packuswb mm1, mm7
  movd [esi+eax*4], mm0
  movd [edi+eax*4], mm1
  add eax, 1
  jl .Loop

  pop edi
  pop esi
  ret

; mmx is of no use here. Better use plain ASM. Moreover,
; this is for the fun of ASM coding, because every modern compiler can
; end up with code that looks very much like this one...

align 16
xvid_VFilter_31_x86:
  push esi
  push edi
  push ebx
  push ebp
  mov esi, [esp+4 +16]  ; Src1
  mov edi, [esp+8 +16]  ; Src2
  mov ebp, [esp+12 +16] ; BpS
  mov eax, [esp+16 +16] ; Nb_Blks
  lea eax, [eax*8]

.Loop:    ; 7c
  movzx ecx, byte [esi]
  movzx edx, byte [edi]

  lea ebx, [ecx+edx+2]
  lea ecx, [ebx+2*ecx]
  lea edx, [ebx+2*edx]

  shr ecx, 2
  shr edx, 2
  mov [esi], cl
  mov [edi], dl
  lea esi, [esi+ebp]
  lea edi, [edi+ebp]
  dec eax
  jg .Loop

  pop ebp
  pop ebx
  pop edi
  pop esi
  ret

; this one's just a little faster than gcc's code. Very little.

align 16
xvid_HFilter_31_x86:
  push esi
  push edi
  push ebx
  mov esi, [esp+4 +12]  ; Src1
  mov edi, [esp+8 +12]  ; Src2
  mov eax, [esp+12 +12] ; Nb_Blks

  lea eax, [eax*8]
  lea esi, [esi+eax]
  lea edi, [edi+eax]
  neg eax

.Loop:    ; 6c
  movzx ecx, byte [esi+eax]
  movzx edx, byte [edi+eax]

  lea ebx, [ecx+edx+2]
  lea ecx, [ebx+2*ecx]
  lea edx, [ebx+2*edx]
  shr ecx, 2
  shr edx, 2
  mov [esi+eax], cl
  mov [edi+eax], dl
  inc eax

  jl .Loop

  pop ebx
  pop edi
  pop esi
  ret

;//////////////////////////////////////////////////////////////////////
;// 16b downsampling 16x16 -> 8x8
;//////////////////////////////////////////////////////////////////////

%macro HFILTER_1331 2   ; %1: src, %2: dst reg.  -  trashes mm0/mm1/mm2
  movq mm2, [Mask_ff]
  movq %2, [%1-1]       ; bytes -1|0|1|2|3|4|5|6
  movq mm0, [%1]        ; bytes  0|1|2|3|4|5|6|7
  movq mm1, [%1+1]      ; bytes  1|2|3|4|5|6|7|8
  pand %2, mm2          ; -1|1|3|5
  pand mm0, mm2         ;  0|2|4|6
  pand mm1, mm2         ;  1|3|5|7
  pand mm2, [%1+2]      ;  2|4|6|8
  paddusw mm0, mm1
  paddusw %2, mm2
  pmullw mm0, mm7
  paddusw %2, mm0
%endmacro

%macro VFILTER_1331 4   ; %1-%4: regs, %1/%2: trashed
  paddsw %1, [Cst32]
  paddsw %2, %3
  pmullw %2, mm7
  paddsw %1, %4
  paddsw %1, %2
  psraw %1, 6
%endmacro
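
; A minimal C sketch of the separable 1-3-3-1 kernel both functions below
; implement (illustrative only): each 8x8 output sample covers a 4x4 source
; neighbourhood, whose footprint over the 16x16 block is the 18x18 support
; that gives the functions their name.
;
;   static const int W[4] = { 1, 3, 3, 1 };
;   static void filter_18x18_to_8x8_ref(int16_t *dst, const uint8_t *src,
;                                       int BpS) {
;     /* src points at the top-left of the 16x16 area; the filter also
;        reads one pixel above/left and two below/right of it */
;     for (int y = 0; y < 8; ++y)
;       for (int x = 0; x < 8; ++x) {
;         int v = 0;
;         for (int j = 0; j < 4; ++j)
;           for (int i = 0; i < 4; ++i)
;             v += W[j] * W[i] * src[(2*y + j - 1)*BpS + (2*x + i - 1)];
;         dst[8*y + x] = (int16_t)((v + 32) >> 6);  /* total weight 64 */
;       }
;   }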

;===========================================================================
;
; void xvid_Filter_18x18_To_8x8_mmx(int16_t *Dst,
;                                   const uint8_t *Src, const int BpS);
;
;===========================================================================

%macro COPY_TWO_LINES_1331 1    ; %1: dst
  HFILTER_1331 edx    , mm5
  HFILTER_1331 edx+eax, mm6
  lea edx, [edx+2*eax]
  VFILTER_1331 mm3,mm4,mm5,mm6
  movq [%1], mm3

  HFILTER_1331 edx    , mm3
  HFILTER_1331 edx+eax, mm4
  lea edx, [edx+2*eax]
  VFILTER_1331 mm5,mm6,mm3,mm4
  movq [%1+16], mm5
%endmacro

align 16
xvid_Filter_18x18_To_8x8_mmx:   ; 283c (~4.4c per output pixel)

  mov ecx, [esp+4]  ; Dst
  mov edx, [esp+8]  ; Src
  mov eax, [esp+12] ; BpS

  movq mm7, [Cst3]
  sub edx, eax

  ; mm3/mm4/mm5/mm6 are used as a 4-sample delay line.

  ; process columns 0-3

  HFILTER_1331 edx    , mm3     ; pre-load mm3/mm4
  HFILTER_1331 edx+eax, mm4
  lea edx, [edx+2*eax]

  COPY_TWO_LINES_1331 ecx + 0*16
  COPY_TWO_LINES_1331 ecx + 2*16
  COPY_TWO_LINES_1331 ecx + 4*16
  COPY_TWO_LINES_1331 ecx + 6*16

  ; process columns 4-7

  mov edx, [esp+8]
  sub edx, eax
  add edx, 8

  HFILTER_1331 edx    , mm3     ; pre-load mm3/mm4
  HFILTER_1331 edx+eax, mm4
  lea edx, [edx+2*eax]

  COPY_TWO_LINES_1331 ecx + 0*16 +8
  COPY_TWO_LINES_1331 ecx + 2*16 +8
  COPY_TWO_LINES_1331 ecx + 4*16 +8
  COPY_TWO_LINES_1331 ecx + 6*16 +8

  ret

;===========================================================================
;
; void xvid_Filter_Diff_18x18_To_8x8_mmx(int16_t *Dst,
;                                        const uint8_t *Src, const int BpS);
;
;===========================================================================

%macro DIFF_TWO_LINES_1331 1    ; %1: dst
  HFILTER_1331 edx    , mm5
  HFILTER_1331 edx+eax, mm6
  lea edx, [edx+2*eax]
  movq mm2, [%1]
  VFILTER_1331 mm3,mm4,mm5,mm6
  psubsw mm2, mm3
  movq [%1], mm2

  HFILTER_1331 edx    , mm3
  HFILTER_1331 edx+eax, mm4
  lea edx, [edx+2*eax]
  movq mm2, [%1+16]
  VFILTER_1331 mm5,mm6,mm3,mm4
  psubsw mm2, mm5
  movq [%1+16], mm2
%endmacro
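
; Same kernel as above; the filtered value is subtracted from what is
; already in Dst, i.e. (in the sketch's terms) dst[8*y+x] -= (v + 32) >> 6.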

align 16
xvid_Filter_Diff_18x18_To_8x8_mmx:  ; 302c

  mov ecx, [esp+4]  ; Dst
  mov edx, [esp+8]  ; Src
  mov eax, [esp+12] ; BpS

  movq mm7, [Cst3]
  sub edx, eax

  ; mm3/mm4/mm5/mm6 are used as a 4-sample delay line.

  ; process columns 0-3

  HFILTER_1331 edx    , mm3     ; pre-load mm3/mm4
  HFILTER_1331 edx+eax, mm4
  lea edx, [edx+2*eax]

  DIFF_TWO_LINES_1331 ecx + 0*16
  DIFF_TWO_LINES_1331 ecx + 2*16
  DIFF_TWO_LINES_1331 ecx + 4*16
  DIFF_TWO_LINES_1331 ecx + 6*16

  ; process columns 4-7

  mov edx, [esp+8]
  sub edx, eax
  add edx, 8

  HFILTER_1331 edx    , mm3     ; pre-load mm3/mm4
  HFILTER_1331 edx+eax, mm4
  lea edx, [edx+2*eax]

  DIFF_TWO_LINES_1331 ecx + 0*16 +8
  DIFF_TWO_LINES_1331 ecx + 2*16 +8
  DIFF_TWO_LINES_1331 ecx + 4*16 +8
  DIFF_TWO_LINES_1331 ecx + 6*16 +8

  ret

;//////////////////////////////////////////////////////////////////////

; pfeewwww... Never Do That On Stage Again. :)