;/**************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * xmm sum of absolute difference
; *
; * This program is an implementation of a part of one or more MPEG-4
; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
; * to use this software module in hardware or software products are
; * advised that its use may infringe existing patents or copyrights, and
; * any such use would be at such party's own risk. The original
; * developer of this software module and his/her company, and subsequent
; * editors and their companies, will have no liability for use of this
; * software or modifications or derivatives thereof.
; *
; * This program is free software; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; *************************************************************************/
;
; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines
;
;------------------------------------------------------------------------------
; 09.12.2002 Athlon optimizations contributed by Jaan Kalda
;------------------------------------------------------------------------------

bits 32

; Declare a global symbol; the enabled branch prefixes an underscore
; for C linkage.
%macro cglobal 1
    %if 1
        global _%1
        %define %1 _%1
    %else
        global %1
    %endif
%endmacro

%ifdef FORMAT_COFF
section .data data
%else
section .data data align=16
%endif

align 16
mmx_one times 4 dw 1

section .text

cglobal sad16_3dne
cglobal sad8_3dne
cglobal sad16bi_3dne
cglobal sad8bi_3dne
cglobal dev16_3dne

;===========================================================================
;
; uint32_t sad16_3dne(const uint8_t * const cur,
;                     const uint8_t * const ref,
;                     const uint32_t stride,
;                     const uint32_t best_sad);
;
;===========================================================================
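;
; For reference, a plain-C sketch of the quantity this routine computes
; (an illustrative equivalent, not the shipped implementation; the name
; sad16_c and the use of <stdint.h>/<stdlib.h> are ours):
;
;   uint32_t sad16_c(const uint8_t *cur, const uint8_t *ref,
;                    uint32_t stride, uint32_t best_sad)
;   {
;       uint32_t sad = 0;
;       for (int y = 0; y < 16; y++, cur += stride, ref += stride)
;           for (int x = 0; x < 16; x++)
;               sad += abs((int)cur[x] - (int)ref[x]);
;       return sad;   /* best_sad is not read by this 3dne version */
;   }
;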
; optimization: 21% faster
; Processes four rows of the 16x16 block per invocation. %1 is the pass
; index (0..3): pass 0 reserves 12 bytes of stack for partial sums, and
; each later pass banks the previous pass's four-row sum there.
%macro SAD_16x16_SSE 1
    movq mm7, [eax]
    movq mm6, [eax+8]
    psadbw mm7, [edx]
    psadbw mm6, [edx+8]
%if (%1)
    paddd mm1, mm5                ; complete the previous pass's sum
%endif
    movq mm5, [eax+ecx]
    movq mm4, [eax+ecx+8]
    psadbw mm5, [edx+ecx]
    psadbw mm4, [edx+ecx+8]
    movq mm3, [eax+2*ecx]
    movq mm2, [eax+2*ecx+8]
    psadbw mm3, [edx+2*ecx]
    psadbw mm2, [edx+2*ecx+8]
%if (%1)
    movd [esp+4*(%1-1)], mm1      ; spill it to the stack
%else
    sub esp, byte 12              ; room for three spilled partial sums
%endif
    movq mm1, [eax+ebx]
    movq mm0, [eax+ebx+8]
    psadbw mm1, [edx+ebx]
    psadbw mm0, [edx+ebx+8]
    lea eax, [eax+4*ecx]
    lea edx, [edx+4*ecx]
    paddd mm7, mm6
    paddd mm5, mm4
    paddd mm3, mm2
    paddd mm1, mm0
    paddd mm5, mm7
    paddd mm1, mm3
%endmacro

align 16
sad16_3dne:

    mov eax, [esp+ 4]             ; Src1
    mov edx, [esp+ 8]             ; Src2
    mov ecx, [esp+12]             ; Stride
    push ebx
    lea ebx, [2*ecx+ecx]          ; ebx = 3*stride
    SAD_16x16_SSE 0
    SAD_16x16_SSE 1
    SAD_16x16_SSE 2
    SAD_16x16_SSE 3
    mov ecx, [esp]                ; gather the three spilled partial sums
    add ecx, [esp+4]
    add ecx, [esp+8]
    paddd mm1, mm5                ; complete the sum of the last four rows
    mov ebx, [esp+12]             ; restore ebx, saved above the spill area
    add esp, byte 4+12
    movd eax, mm1
    add eax, ecx
    ret

;===========================================================================
;
; uint32_t sad8_3dne(const uint8_t * const cur,
;                    const uint8_t * const ref,
;                    const uint32_t stride);
;
;===========================================================================
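;
; The 8x8 analogue of the sad16 sketch above (illustrative only; note
; there is no best_sad argument here):
;
;   uint32_t sad8_c(const uint8_t *cur, const uint8_t *ref, uint32_t stride)
;   {
;       uint32_t sad = 0;
;       for (int y = 0; y < 8; y++, cur += stride, ref += stride)
;           for (int x = 0; x < 8; x++)
;               sad += abs((int)cur[x] - (int)ref[x]);
;       return sad;
;   }
;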
align 16
sad8_3dne:

    mov eax, [esp+ 4]             ; Src1
    mov ecx, [esp+12]             ; Stride
    mov edx, [esp+ 8]             ; Src2
    push ebx
    lea ebx, [ecx+2*ecx]          ; ebx = 3*stride

    ; the [byte ...] / [dword ...] overrides force particular instruction
    ; encoding lengths (apparently to keep the K7 decoders well fed)
    movq mm0, [byte eax]          ; row 0
    psadbw mm0, [byte edx]
    movq mm1, [eax+ecx]           ; row 1
    psadbw mm1, [edx+ecx]

    movq mm2, [eax+2*ecx]         ; row 2
    psadbw mm2, [edx+2*ecx]
    movq mm3, [eax+ebx]           ; row 3
    psadbw mm3, [edx+ebx]

    paddd mm0, mm1

    movq mm4, [byte eax+4*ecx]    ; row 4
    psadbw mm4, [edx+4*ecx]
    movq mm5, [eax+2*ebx]         ; row 6
    psadbw mm5, [edx+2*ebx]

    paddd mm2, mm3
    paddd mm0, mm2

    lea ebx, [ebx+4*ecx]          ; 3+4 = 7*stride
    lea ecx, [ecx+4*ecx]          ; 5*stride
    movq mm6, [eax+ecx]           ; row 5
    psadbw mm6, [edx+ecx]
    movq mm7, [eax+ebx]           ; row 7
    psadbw mm7, [edx+ebx]
    paddd mm4, mm5
    paddd mm6, mm7
    paddd mm0, mm4
    mov ebx, [esp]                ; restore ebx
    add esp, byte 4
    paddd mm0, mm6
    movd eax, mm0

    ret

;===========================================================================
;
; uint32_t sad16bi_3dne(const uint8_t * const cur,
;                       const uint8_t * const ref1,
;                       const uint8_t * const ref2,
;                       const uint32_t stride);
;
;===========================================================================
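;
; The reference block is the average of ref1 and ref2 rounded upward,
; which is exactly what pavgb computes per byte. A C sketch of the
; measure (illustrative; names are ours):
;
;   uint32_t sad16bi_c(const uint8_t *cur, const uint8_t *ref1,
;                      const uint8_t *ref2, uint32_t stride)
;   {
;       uint32_t sad = 0;
;       for (int y = 0; y < 16; y++,
;            cur += stride, ref1 += stride, ref2 += stride)
;           for (int x = 0; x < 16; x++) {
;               int avg = (ref1[x] + ref2[x] + 1) >> 1;  /* pavgb rounding */
;               sad += abs((int)cur[x] - avg);
;           }
;       return sad;
;   }
;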
; optimization: 14% faster
; First iteration: processes rows 0 and 1, initializing the accumulators
; mm5/mm6 with row 0 and leaving row 1's SADs in mm0/mm1 for the next
; macro to fold in.
%macro SADBI_16x16_SSE0 0
    movq mm2, [edx]
    movq mm3, [edx+8]

    movq mm5, [byte eax]
    movq mm6, [eax+8]
    pavgb mm2, [byte ebx]
    pavgb mm3, [ebx+8]

    add edx, ecx
    psadbw mm5, mm2
    psadbw mm6, mm3

    add eax, ecx
    add ebx, ecx
    movq mm2, [byte edx]
    movq mm3, [edx+8]
    movq mm0, [byte eax]
    movq mm1, [eax+8]
    pavgb mm2, [byte ebx]
    pavgb mm3, [ebx+8]

    add edx, ecx
    add eax, ecx
    add ebx, ecx
    psadbw mm0, mm2
    psadbw mm1, mm3
%endmacro

; One row per invocation: fold the previous row's SADs into mm5/mm6,
; then compute this row's SADs into mm0/mm1.
%macro SADBI_16x16_SSE 0
    movq mm2, [byte edx]
    movq mm3, [edx+8]
    paddusw mm5, mm0
    paddusw mm6, mm1
    movq mm0, [eax]
    movq mm1, [eax+8]
    pavgb mm2, [ebx]
    pavgb mm3, [ebx+8]
    add edx, ecx
    add eax, ecx
    add ebx, ecx
    psadbw mm0, mm2
    psadbw mm1, mm3
%endmacro

align 16
sad16bi_3dne:
    mov eax, [esp+ 4]             ; Src
    mov edx, [esp+ 8]             ; Ref1
    mov ecx, [esp+16]             ; Stride
    push ebx
    mov ebx, [esp+4+12]           ; Ref2

    SADBI_16x16_SSE0              ; rows 0-1
    SADBI_16x16_SSE               ; rows 2-15, one row per invocation
    SADBI_16x16_SSE
    SADBI_16x16_SSE
    SADBI_16x16_SSE
    SADBI_16x16_SSE
    SADBI_16x16_SSE

    SADBI_16x16_SSE
    SADBI_16x16_SSE
    SADBI_16x16_SSE
    SADBI_16x16_SSE
    SADBI_16x16_SSE
    SADBI_16x16_SSE
    SADBI_16x16_SSE
    SADBI_16x16_SSE
    paddusw mm5, mm0              ; fold in the last row
    paddusw mm6, mm1

    pop ebx
    paddusw mm6, mm5              ; both halves stay below 65535, so the
    movd eax, mm6                 ; saturating adds never clip
    ret

;===========================================================================
;
; uint32_t sad8bi_3dne(const uint8_t * const cur,
;                      const uint8_t * const ref1,
;                      const uint8_t * const ref2,
;                      const uint32_t stride);
;
;===========================================================================
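;
; Same measure as sad16bi above, over an 8x8 block: in the C sketch only
; the loop bounds change (16 -> 8), and the asm below needs just one
; quadword load per row instead of two.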

; Two rows per invocation: fold the previous pair's SADs into mm5/mm6,
; average ref1/ref2 with pavgb, and leave this pair's SADs in mm0/mm1.
; (The function below is fully unrolled and does not invoke this macro.)
%macro SADBI_8x8_3dne 0
    movq mm2, [edx]
    movq mm3, [edx+ecx]
    pavgb mm2, [eax]
    pavgb mm3, [eax+ecx]
    lea edx, [edx+2*ecx]
    lea eax, [eax+2*ecx]
    paddusw mm5, mm0
    paddusw mm6, mm1
    movq mm0, [ebx]
    movq mm1, [ebx+ecx]
    lea ebx, [ebx+2*ecx]
    psadbw mm0, mm2
    psadbw mm1, mm3
%endmacro

align 16
sad8bi_3dne:
    mov eax, [esp+12]             ; Ref2
    mov edx, [esp+ 8]             ; Ref1
    mov ecx, [esp+16]             ; Stride
    push ebx
    mov ebx, [esp+4+ 4]           ; Src

; rows 0-1: initialize the accumulators mm5/mm6
    movq mm2, [edx]
    movq mm3, [edx+ecx]
    pavgb mm2, [eax]
    pavgb mm3, [eax+ecx]
    lea edx, [edx+2*ecx]
    lea eax, [eax+2*ecx]
    movq mm5, [ebx]
    movq mm6, [ebx+ecx]
    lea ebx, [ebx+2*ecx]
    psadbw mm5, mm2
    psadbw mm6, mm3

; rows 2-3
    movq mm2, [edx]
    movq mm3, [edx+ecx]
    pavgb mm2, [eax]
    pavgb mm3, [eax+ecx]
    lea edx, [edx+2*ecx]
    lea eax, [eax+2*ecx]
    movq mm0, [ebx]
    movq mm1, [ebx+ecx]
    lea ebx, [ebx+2*ecx]
    psadbw mm0, mm2
    psadbw mm1, mm3

; rows 4-5
    movq mm2, [edx]
    movq mm3, [edx+ecx]
    pavgb mm2, [eax]
    pavgb mm3, [eax+ecx]
    lea edx, [edx+2*ecx]
    lea eax, [eax+2*ecx]
    paddusw mm5, mm0
    paddusw mm6, mm1
    movq mm0, [ebx]
    movq mm1, [ebx+ecx]
    lea ebx, [ebx+2*ecx]
    psadbw mm0, mm2
    psadbw mm1, mm3

; rows 6-7
    movq mm2, [edx]
    movq mm3, [edx+ecx]
    pavgb mm2, [eax]
    pavgb mm3, [eax+ecx]
    paddusw mm5, mm0
    paddusw mm6, mm1
    movq mm0, [ebx]
    movq mm1, [ebx+ecx]
    psadbw mm0, mm2
    psadbw mm1, mm3
    paddusw mm5, mm0
    paddusw mm6, mm1

    paddusw mm6, mm5
    mov ebx, [esp]                ; restore ebx
    add esp, byte 4
    movd eax, mm6
    ret

;===========================================================================
;
; uint32_t dev16_3dne(const uint8_t * const cur,
;                     const uint32_t stride);
;
;===========================================================================
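;
; dev16 measures how far a block deviates from flat: the sum of absolute
; differences against the block mean. A C sketch of the idea (illustrative;
; like the asm, it truncates the mean to sum >> 8 instead of rounding):
;
;   uint32_t dev16_c(const uint8_t *cur, uint32_t stride)
;   {
;       uint32_t sum = 0, dev = 0;
;       const uint8_t *p = cur;
;       for (int y = 0; y < 16; y++, p += stride)
;           for (int x = 0; x < 16; x++)
;               sum += p[x];
;       uint8_t mean = (uint8_t)(sum >> 8);     /* sum / 256, truncated */
;       for (int y = 0; y < 16; y++, cur += stride)
;           for (int x = 0; x < 16; x++)
;               dev += abs((int)cur[x] - (int)mean);
;       return dev;
;   }
;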
; optimization: 25% faster
; Sums per-row psadbw results of the 16x16 block against the byte value
; replicated in mm4 (zero on the first pass, the block mean on the second).
; %1 == 0: rows 0-2, initializing accumulators mm7/mm6/mm5/mm3/mm2/mm1 and
;          preloading the next row's low quadword into mm0
; %1 == 1: three more rows, folded into the accumulators, again preloading
; %1 == 2: the final row (its low quadword was preloaded by the last pass)
%macro ABS_16x16_SSE 1
%if (%1 == 0)
    movq mm7, [eax]
    psadbw mm7, mm4
    mov esi, esi                  ; 2-byte no-op (filler)
    movq mm6, [eax+8]
    movq mm5, [eax+ecx]
    movq mm3, [eax+ecx+8]
    psadbw mm6, mm4

    movq mm2, [byte eax+2*ecx]
    psadbw mm5, mm4
    movq mm1, [eax+2*ecx+8]
    psadbw mm3, mm4

    movq mm0, [dword eax+edx]
    psadbw mm2, mm4
    add eax, edx
    psadbw mm1, mm4
%endif
%if (%1 == 1)
    psadbw mm0, mm4
    paddd mm7, mm0
    movq mm0, [eax+8]
    psadbw mm0, mm4
    paddd mm6, mm0

    movq mm0, [byte eax+ecx]
    psadbw mm0, mm4
    paddd mm5, mm0
    movq mm0, [eax+ecx+8]
    psadbw mm0, mm4
    paddd mm3, mm0
    movq mm0, [eax+2*ecx]
    psadbw mm0, mm4
    paddd mm2, mm0

    movq mm0, [eax+2*ecx+8]
    add eax, edx
    psadbw mm0, mm4
    paddd mm1, mm0
    movq mm0, [eax]
%endif
%if (%1 == 2)
    psadbw mm0, mm4
    paddd mm7, mm0
    movq mm0, [eax+8]
    psadbw mm0, mm4
    paddd mm6, mm0
%endif
%endmacro

align 16
dev16_3dne:

    mov eax, [esp+ 4]             ; Src
    mov ecx, [esp+ 8]             ; Stride
    lea edx, [ecx+2*ecx]          ; edx = 3*stride

    pxor mm4, mm4                 ; pass 1: psadbw against zero sums the pixels
align 8
    ABS_16x16_SSE 0
    ABS_16x16_SSE 1
    ABS_16x16_SSE 1
    ABS_16x16_SSE 1
    ABS_16x16_SSE 1
    paddd mm1, mm2
    paddd mm3, mm5
    ABS_16x16_SSE 2
    paddd mm7, mm6
    paddd mm1, mm3
    mov eax, [esp+ 4]             ; Src, rewound for the second pass
    paddd mm7, mm1                ; mm7 = sum of all 256 pixels (< 2^16)
    punpcklbw mm7, mm7            ; xxyyaazz: word 1 = (sum>>8) in both bytes
    pshufw mm4, mm7, 055h         ; broadcast sum>>8, i.e. the mean, to all bytes
    ; mm4 contains the mean

    pxor mm1, mm1

    ABS_16x16_SSE 0               ; pass 2: sum |pixel - mean|
    ABS_16x16_SSE 1
    ABS_16x16_SSE 1
    ABS_16x16_SSE 1
    ABS_16x16_SSE 1
    paddd mm1, mm2
    paddd mm3, mm5
    ABS_16x16_SSE 2
    paddd mm7, mm6
    paddd mm1, mm3
    paddd mm7, mm1
    movd eax, mm7
    ret