;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - K7 optimized SAD operators -
; *
; *  Copyright(C) 2002 Jaan Kalda
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id$
; *
; ***************************************************************************/

; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines

%include "nasm.inc"

;=============================================================================
; Read only data
;=============================================================================

DATA

ALIGN SECTION_ALIGN
mmx_one:
  times 4 dw 1

;=============================================================================
; Helper macros
;=============================================================================

;; %1 block number (0..4)
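;; Each invocation handles four rows of the 16x16 block (offsets 0, stride,
;; 2*stride and 3*stride, the last one via _EBX), accumulating psadbw partial
;; sums in mm5/mm1. The first call (%1 == 0) reserves 12 bytes of stack; each
;; later call merges and spills the previous partial sum to [_ESP+4*(%1-1)],
;; and sad16_3dne adds the three spilled values back after the last call.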
%macro SAD_16x16_SSE 1
  movq mm7, [_EAX]
  movq mm6, [_EAX+8]
  psadbw mm7, [TMP1]
  psadbw mm6, [TMP1+8]
%if (%1)
  paddd mm1, mm5
%endif
  movq mm5, [_EAX+TMP0]
  movq mm4, [_EAX+TMP0+8]
  psadbw mm5, [TMP1+TMP0]
  psadbw mm4, [TMP1+TMP0+8]
  movq mm3, [_EAX+2*TMP0]
  movq mm2, [_EAX+2*TMP0+8]
  psadbw mm3, [TMP1+2*TMP0]
  psadbw mm2, [TMP1+2*TMP0+8]
%if (%1)
  movd [_ESP+4*(%1-1)], mm1
%else
  sub _ESP, byte 12
%endif
  movq mm1, [_EAX+_EBX]
  movq mm0, [_EAX+_EBX+8]
  psadbw mm1, [TMP1+_EBX]
  psadbw mm0, [TMP1+_EBX+8]
  lea _EAX, [_EAX+4*TMP0]
  lea TMP1, [TMP1+4*TMP0]
  paddd mm7, mm6
  paddd mm5, mm4
  paddd mm3, mm2
  paddd mm1, mm0
  paddd mm5, mm7
  paddd mm1, mm3
%endmacro

%macro SADBI_16x16_SSE0 0
  movq mm2, [TMP1]
  movq mm3, [TMP1+8]

  movq mm5, [byte _EAX]
  movq mm6, [_EAX+8]
  pavgb mm2, [byte _EBX]
  pavgb mm3, [_EBX+8]

  add TMP1, TMP0
  psadbw mm5, mm2
  psadbw mm6, mm3

  add _EAX, TMP0
  add _EBX, TMP0
  movq mm2, [byte TMP1]

  movq mm3, [TMP1+8]
  movq mm0, [byte _EAX]

  movq mm1, [_EAX+8]
  pavgb mm2, [byte _EBX]

  pavgb mm3, [_EBX+8]
  add TMP1, TMP0
  add _EAX, TMP0

  add _EBX, TMP0
  psadbw mm0, mm2
  psadbw mm1, mm3

%endmacro

%macro SADBI_16x16_SSE 0
  movq mm2, [byte TMP1]
  movq mm3, [TMP1+8]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [_EAX]
  movq mm1, [_EAX+8]
  pavgb mm2, [_EBX]
  pavgb mm3, [_EBX+8]
  add TMP1, TMP0
  add _EAX, TMP0
  add _EBX, TMP0
  psadbw mm0, mm2
  psadbw mm1, mm3
%endmacro
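
; The two SADBI_16x16 macros above are software-pipelined: SADBI_16x16_SSE0
; covers the first two rows and leaves the second row's psadbw results pending
; in mm0/mm1, and each SADBI_16x16_SSE call first folds that pending pair into
; the mm5/mm6 accumulators before computing its own row. The caller therefore
; finishes with one extra paddusw mm5,mm0 / paddusw mm6,mm1 pair after the
; last invocation.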

%macro SADBI_8x8_3dne 0
  movq mm2, [TMP1]
  movq mm3, [TMP1+TMP0]
  pavgb mm2, [_EAX]
  pavgb mm3, [_EAX+TMP0]
  lea TMP1, [TMP1+2*TMP0]
  lea _EAX, [_EAX+2*TMP0]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [_EBX]
  movq mm1, [_EBX+TMP0]
  lea _EBX, [_EBX+2*TMP0]
  psadbw mm0, mm2
  psadbw mm1, mm3
%endmacro

%macro ABS_16x16_SSE 1
%if (%1 == 0)
  movq mm7, [_EAX]
  psadbw mm7, mm4
  mov esi, esi
  movq mm6, [_EAX+8]
  movq mm5, [_EAX+TMP0]
  movq mm3, [_EAX+TMP0+8]
  psadbw mm6, mm4

  movq mm2, [byte _EAX+2*TMP0]
  psadbw mm5, mm4
  movq mm1, [_EAX+2*TMP0+8]
  psadbw mm3, mm4

  movq mm0, [_EAX+TMP1+0]
  psadbw mm2, mm4
  add _EAX, TMP1
  psadbw mm1, mm4
%endif
%if (%1 == 1)
  psadbw mm0, mm4
  paddd mm7, mm0
  movq mm0, [_EAX+8]
  psadbw mm0, mm4
  paddd mm6, mm0

  movq mm0, [byte _EAX+TMP0]
  psadbw mm0, mm4

  paddd mm5, mm0
  movq mm0, [_EAX+TMP0+8]

  psadbw mm0, mm4
  paddd mm3, mm0
  movq mm0, [_EAX+2*TMP0]
  psadbw mm0, mm4
  paddd mm2, mm0

  movq mm0, [_EAX+2*TMP0+8]
  add _EAX, TMP1
  psadbw mm0, mm4
  paddd mm1, mm0
  movq mm0, [_EAX]
%endif
%if (%1 == 2)
  psadbw mm0, mm4
  paddd mm7, mm0
  movq mm0, [_EAX+8]
  psadbw mm0, mm4
  paddd mm6, mm0
%endif
%endmacro

;=============================================================================
; Code
;=============================================================================

TEXT

cglobal sad16_3dne
cglobal sad8_3dne
cglobal sad16bi_3dne
cglobal sad8bi_3dne
cglobal dev16_3dne

;-----------------------------------------------------------------------------
;
; uint32_t sad16_3dne(const uint8_t * const cur,
;                     const uint8_t * const ref,
;                     const uint32_t stride,
;                     const uint32_t best_sad);
;
;-----------------------------------------------------------------------------

; optimization: 21% faster
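
; For reference, the arithmetic implemented below is a plain 16x16 sum of
; absolute differences; the best_sad argument is accepted but not used for
; early termination in this routine. An illustrative C sketch (uses <stdint.h>
; types; the helper name is hypothetical, not part of this file):
;
;   uint32_t sad16_ref(const uint8_t *cur, const uint8_t *ref, uint32_t stride)
;   {
;     uint32_t sad = 0;
;     for (int y = 0; y < 16; y++, cur += stride, ref += stride)
;       for (int x = 0; x < 16; x++)
;         sad += (cur[x] > ref[x]) ? cur[x] - ref[x] : ref[x] - cur[x];
;     return sad;
;   }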
|
ALIGN SECTION_ALIGN
sad16_3dne:
  mov _EAX, prm1 ; Src1
  mov TMP1, prm2 ; Src2
  mov TMP0, prm3 ; Stride

  push _EBX
  lea _EBX, [2*TMP0+TMP0]

  SAD_16x16_SSE 0
  SAD_16x16_SSE 1
  SAD_16x16_SSE 2
  SAD_16x16_SSE 3

  paddd mm1, mm5
  movd eax, mm1
  add eax, dword [_ESP]
  add eax, dword [_ESP+4]
  add eax, dword [_ESP+8]
  mov _EBX, [_ESP+12]
  add _ESP, byte PTR_SIZE+12

  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; uint32_t sad8_3dne(const uint8_t * const cur,
;                    const uint8_t * const ref,
;                    const uint32_t stride);
;
;-----------------------------------------------------------------------------
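
; Same absolute-difference sum as the sad16_ref sketch above, but over a
; single 8x8 block (loop bounds of 8); the code below unrolls all eight rows
; and needs no inner loop.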

ALIGN SECTION_ALIGN
sad8_3dne:

  mov _EAX, prm1 ; Src1
  mov TMP0, prm3 ; Stride
  mov TMP1, prm2 ; Src2
  push _EBX
  lea _EBX, [TMP0+2*TMP0]

  movq mm0, [byte _EAX]        ;0
  psadbw mm0, [byte TMP1]
  movq mm1, [_EAX+TMP0]        ;1
  psadbw mm1, [TMP1+TMP0]

  movq mm2, [_EAX+2*TMP0]      ;2
  psadbw mm2, [TMP1+2*TMP0]
  movq mm3, [_EAX+_EBX]        ;3
  psadbw mm3, [TMP1+_EBX]

  paddd mm0, mm1

  movq mm4, [byte _EAX+4*TMP0] ;4
  psadbw mm4, [TMP1+4*TMP0]
  movq mm5, [_EAX+2*_EBX]      ;6
  psadbw mm5, [TMP1+2*_EBX]

  paddd mm2, mm3
  paddd mm0, mm2

  lea _EBX, [_EBX+4*TMP0]      ;3+4=7
  lea TMP0, [TMP0+4*TMP0]      ;5
  movq mm6, [_EAX+TMP0]        ;5
  psadbw mm6, [TMP1+TMP0]
  movq mm7, [_EAX+_EBX]        ;7
  psadbw mm7, [TMP1+_EBX]
  paddd mm4, mm5
  paddd mm6, mm7
  paddd mm0, mm4
  mov _EBX, [_ESP]
  add _ESP, byte PTR_SIZE
  paddd mm0, mm6
  movd eax, mm0

  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; uint32_t sad16bi_3dne(const uint8_t * const cur,
;                       const uint8_t * const ref1,
;                       const uint8_t * const ref2,
;                       const uint32_t stride);
;
;-----------------------------------------------------------------------------
; optimization: 14% faster
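
; sad16bi compares cur against the rounded average of the two references,
; which is exactly what pavgb computes per byte: (a + b + 1) >> 1. An
; illustrative C sketch (uses <stdint.h> types; the helper name is
; hypothetical, not part of this file):
;
;   uint32_t sad16bi_ref(const uint8_t *cur, const uint8_t *ref1,
;                        const uint8_t *ref2, uint32_t stride)
;   {
;     uint32_t sad = 0;
;     for (int y = 0; y < 16; y++, cur += stride, ref1 += stride, ref2 += stride)
;       for (int x = 0; x < 16; x++) {
;         int avg = (ref1[x] + ref2[x] + 1) >> 1;
;         int d = (int)cur[x] - avg;
;         sad += (d < 0) ? -d : d;
;       }
;     return sad;
;   }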
|
ALIGN SECTION_ALIGN
sad16bi_3dne:
  mov _EAX, prm1 ; Src
  mov TMP1, prm2 ; Ref1
  mov TMP0, prm4 ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref2
%endif

  SADBI_16x16_SSE0
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE

  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  paddusw mm5, mm0
  paddusw mm6, mm1

  pop _EBX
  paddusw mm6, mm5
  movd eax, mm6

  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; uint32_t sad8bi_3dne(const uint8_t * const cur,
;                      const uint8_t * const ref1,
;                      const uint8_t * const ref2,
;                      const uint32_t stride);
;
;-----------------------------------------------------------------------------
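
; 8x8 variant of the rounded-average SAD sketched above for sad16bi_3dne; the
; body below processes two rows per step, four steps in all, using the same
; accumulation pattern as the SADBI_8x8_3dne helper macro.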
|
|
|
|
ALIGN SECTION_ALIGN
sad8bi_3dne:
  mov _EAX, prm3 ; Ref2
  mov TMP1, prm2 ; Ref1
  mov TMP0, prm4 ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm1
%else
  mov _EBX, [_ESP+4+ 4] ; Src
%endif

  movq mm2, [TMP1]
  movq mm3, [TMP1+TMP0]
  pavgb mm2, [_EAX]
  pavgb mm3, [_EAX+TMP0]
  lea TMP1, [TMP1+2*TMP0]
  lea _EAX, [_EAX+2*TMP0]
  movq mm5, [_EBX]
  movq mm6, [_EBX+TMP0]
  lea _EBX, [_EBX+2*TMP0]
  psadbw mm5, mm2
  psadbw mm6, mm3

  movq mm2, [TMP1]
  movq mm3, [TMP1+TMP0]
  pavgb mm2, [_EAX]
  pavgb mm3, [_EAX+TMP0]
  lea TMP1, [TMP1+2*TMP0]
  lea _EAX, [_EAX+2*TMP0]
  movq mm0, [_EBX]
  movq mm1, [_EBX+TMP0]
  lea _EBX, [_EBX+2*TMP0]
  psadbw mm0, mm2
  psadbw mm1, mm3

  movq mm2, [TMP1]
  movq mm3, [TMP1+TMP0]
  pavgb mm2, [_EAX]
  pavgb mm3, [_EAX+TMP0]
  lea TMP1, [TMP1+2*TMP0]
  lea _EAX, [_EAX+2*TMP0]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [_EBX]
  movq mm1, [_EBX+TMP0]
  lea _EBX, [_EBX+2*TMP0]
  psadbw mm0, mm2
  psadbw mm1, mm3

  movq mm2, [TMP1]
  movq mm3, [TMP1+TMP0]
  pavgb mm2, [_EAX]
  pavgb mm3, [_EAX+TMP0]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [_EBX]
  movq mm1, [_EBX+TMP0]
  psadbw mm0, mm2
  psadbw mm1, mm3
  paddusw mm5, mm0
  paddusw mm6, mm1

  paddusw mm6, mm5
  mov _EBX, [_ESP]
  add _ESP, byte PTR_SIZE
  movd eax, mm6

  ret
ENDFUNC


;===========================================================================
;
; uint32_t dev16_3dne(const uint8_t * const cur,
;                     const uint32_t stride);
;
;===========================================================================
; optimization: 25 % faster
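
; dev16 returns the sum of absolute deviations of a 16x16 block from its mean.
; The code runs the ABS_16x16_SSE chain twice: the first pass sums the block
; against mm4 = 0, the truncating mean (sum >> 8) is broadcast into every byte
; of mm4 via punpcklbw/pshufw, and the second pass sums |pixel - mean|. An
; illustrative C sketch (uses <stdint.h> types; the helper name is
; hypothetical, not part of this file):
;
;   uint32_t dev16_ref(const uint8_t *cur, uint32_t stride)
;   {
;     uint32_t sum = 0, dev = 0, mean;
;     const uint8_t *p = cur;
;     for (int y = 0; y < 16; y++, p += stride)
;       for (int x = 0; x < 16; x++)
;         sum += p[x];
;     mean = sum >> 8;  /* 256 pixels, truncating division as in the asm */
;     for (int y = 0; y < 16; y++, cur += stride)
;       for (int x = 0; x < 16; x++)
;         dev += ((uint32_t)cur[x] > mean) ? cur[x] - mean : mean - cur[x];
;     return dev;
;   }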
|
ALIGN SECTION_ALIGN
dev16_3dne:

  mov _EAX, prm1 ; Src
  mov TMP0, prm2 ; Stride
  lea TMP1, [TMP0+2*TMP0]

  pxor mm4, mm4

ALIGN SECTION_ALIGN
  ABS_16x16_SSE 0
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1

  paddd mm1, mm2
  paddd mm3, mm5

  ABS_16x16_SSE 2

  paddd mm7, mm6
  paddd mm1, mm3
  mov _EAX, prm1 ; Src
  paddd mm7, mm1
  punpcklbw mm7, mm7      ;xxyyaazz
  pshufw mm4, mm7, 055h   ; mm4 contains the mean


  pxor mm1, mm1

  ABS_16x16_SSE 0
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1

  paddd mm1, mm2
  paddd mm3, mm5

  ABS_16x16_SSE 2

  paddd mm7, mm6
  paddd mm1, mm3
  paddd mm7, mm1
  movd eax, mm7

  ret
ENDFUNC

NON_EXEC_STACK