Parent Directory | Revision Log
Revision 1.12 - (view) (download)
1 : | edgomez | 1.5 | ;/***************************************************************************** |
2 : | Isibaar | 1.1 | ; * |
3 : | edgomez | 1.5 | ; * XVID MPEG-4 VIDEO CODEC |
4 : | ; * - mmx 8x8 block-based halfpel interpolation - | ||
5 : | Isibaar | 1.1 | ; * |
6 : | edgomez | 1.5 | ; * Copyright(C) 2002 Michael Militzer <isibaar@xvid.org> |
7 : | ; * 2002 Pascal Massimino <skal@planet-d.net> | ||
8 : | Isibaar | 1.1 | ; * |
9 : | edgomez | 1.5 | ; * This program is free software ; you can redistribute it and/or modify |
10 : | ; * it under the terms of the GNU General Public License as published by | ||
11 : | ; * the Free Software Foundation ; either version 2 of the License, or | ||
12 : | ; * (at your option) any later version. | ||
13 : | edgomez | 1.3 | ; * |
14 : | edgomez | 1.5 | ; * This program is distributed in the hope that it will be useful, |
15 : | ; * but WITHOUT ANY WARRANTY ; without even the implied warranty of | ||
16 : | ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 : | ; * GNU General Public License for more details. | ||
18 : | Isibaar | 1.1 | ; * |
19 : | edgomez | 1.5 | ; * You should have received a copy of the GNU General Public License |
20 : | ; * along with this program ; if not, write to the Free Software | ||
21 : | ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
22 : | edgomez | 1.4 | ; * |
23 : | edgomez | 1.5 | ; ****************************************************************************/ |
24 : | edgomez | 1.4 | |
BITS 32

;-----------------------------------------------------------------------------
; cglobal - declare an exported function symbol.
; When PREFIX is defined, a leading underscore is prepended (a.out/COFF-style
; C name mangling). When MARK_FUNCS is defined, the symbol is declared as an
; ELF ":function" whose size spans up to the local label ".endfunc".
; Each case also defines ENDFUNC, placed by every function after its final
; ret: it expands to ".endfunc" when sizes are being marked, else to nothing.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
      %define ENDFUNC .endfunc
    %else
      global _%1
      %define %1 _%1
      %define ENDFUNC
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
      %define ENDFUNC .endfunc
    %else
      global %1
      %define ENDFUNC
    %endif
  %endif
%endmacro
48 : | |||
;=============================================================================
; Read only data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata                 ; COFF output does not take an align= attribute
%else
SECTION .rodata align=16
%endif

ALIGN 16
mmx_one:                        ; 8 bytes of 0x01: per-byte lsb mask used to
  times 8 db 1                  ; correct pavgb's round-up (see macros below)
62 : | Isibaar | 1.1 | |
SECTION .text

; 8x8 halfpel interpolation, horizontal / vertical / diagonal
cglobal interpolate8x8_halfpel_h_xmm
cglobal interpolate8x8_halfpel_v_xmm
cglobal interpolate8x8_halfpel_hv_xmm

; 8x4 variants of the above
cglobal interpolate8x4_halfpel_h_xmm
cglobal interpolate8x4_halfpel_v_xmm
cglobal interpolate8x4_halfpel_hv_xmm

; interpolation fused with averaging into the destination block
cglobal interpolate8x8_halfpel_add_xmm
cglobal interpolate8x8_halfpel_h_add_xmm
cglobal interpolate8x8_halfpel_v_add_xmm
cglobal interpolate8x8_halfpel_hv_add_xmm
77 : | |||
;===========================================================================
;
; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
;===========================================================================

;-----------------------------------------------------------------------------
; COPY_H_SSE_RND0 - two rows of horizontal halfpel, rounding==0:
;   dst[i] = (src[i] + src[i+1] + 1) >> 1, exactly what pavgb computes.
; In: eax=src, ecx=dst, edx=stride. Advances eax by 2 rows; clobbers mm0/mm1.
;-----------------------------------------------------------------------------
%macro COPY_H_SSE_RND0 0
  movq mm0, [eax]
  pavgb mm0, [eax+1]            ; (i+j+1)/2, row 0
  movq mm1, [eax+edx]
  pavgb mm1, [eax+edx+1]        ; (i+j+1)/2, row 1
  lea eax, [eax+2*edx]
  movq [ecx], mm0
  movq [ecx+edx], mm1
%endmacro

;-----------------------------------------------------------------------------
; COPY_H_SSE_RND1 - two rows of horizontal halfpel, rounding==1:
;   dst[i] = (src[i] + src[i+1]) >> 1 via (i+j)/2 = (i+j+1)/2 - ((i^j)&1).
; In: eax=src, ecx=dst, edx=stride, mm7 = mmx_one (per-byte 0x01 mask).
; Advances eax by 2 rows; clobbers mm0-mm5.
;-----------------------------------------------------------------------------
%macro COPY_H_SSE_RND1 0
  movq mm0, [eax]
  movq mm1, [eax+edx]
  movq mm4, mm0                 ; keep i so i^j can be formed after pavgb
  movq mm5, mm1
  movq mm2, [eax+1]
  movq mm3, [eax+edx+1]
  pavgb mm0, mm2                ; (i+j+1)/2
  pxor mm2, mm4                 ; i^j
  pavgb mm1, mm3
  lea eax, [eax+2*edx]
  pxor mm3, mm5
  pand mm2, mm7                 ; (i^j)&1 per byte
  pand mm3, mm7
  psubb mm0, mm2                ; undo the round-up where i+j was even
  movq [ecx], mm0
  psubb mm1, mm3
  movq [ecx+edx], mm1
%endmacro
116 : | |||
;-----------------------------------------------------------------------------
; interpolate8x8_halfpel_h_xmm - 8x8 horizontal halfpel copy.
; cdecl: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding.
; Only eax/ecx/edx (caller-saved) and MMX registers are used.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_halfpel_h_xmm:

  mov eax, [esp+16]             ; rounding
  mov ecx, [esp+ 4]             ; Dst
  test eax, eax                 ; ZF consumed by jnz below; the intervening
  mov eax, [esp+ 8]             ; Src     movs do not modify EFLAGS
  mov edx, [esp+12]             ; stride

  jnz near .rounding1

  ; rounding==0: 8 rows, two per macro invocation
  COPY_H_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0
  ret

.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  movq mm7, [mmx_one]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1
  ret
ENDFUNC
149 : | Isibaar | 1.1 | |
;===========================================================================
;
; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
;===========================================================================

;-----------------------------------------------------------------------------
; COPY_V_SSE_RND0 - two rows of vertical halfpel, rounding==0:
;   dst[y] = (src[y] + src[y+1] + 1) >> 1, straight pavgb.
; In: eax=src, ecx=dst, edx=stride. Advances eax by 2 rows; clobbers mm0/mm1.
;-----------------------------------------------------------------------------
%macro COPY_V_SSE_RND0 0
  movq mm0, [eax]
  movq mm1, [eax+edx]
  pavgb mm0, mm1                ; (row0+row1+1)/2
  pavgb mm1, [eax+2*edx]        ; (row1+row2+1)/2
  lea eax, [eax+2*edx]
  movq [ecx], mm0
  movq [ecx+edx], mm1
%endmacro

;-----------------------------------------------------------------------------
; COPY_V_SSE_RND1 - two rows of vertical halfpel, rounding==1.
; Carries the current top row in mm2 across invocations (loop invariant set
; up by the caller before the first use). Needs mm7 = mmx_one.
; Advances eax by 2 rows; clobbers mm0/mm1/mm4/mm5; leaves the next
; invariant row in mm2.
;-----------------------------------------------------------------------------
%macro COPY_V_SSE_RND1 0
  movq mm0, mm2                 ; mm2 = previous row (invariant)
  movq mm1, [eax]
  movq mm2, [eax+edx]           ; becomes the invariant for the next call
  lea eax, [eax+2*edx]
  movq mm4, mm0
  movq mm5, mm1
  pavgb mm0, mm1
  pxor mm4, mm1                 ; i^j
  pavgb mm1, mm2
  pxor mm5, mm2
  pand mm4, mm7                 ; lsb's of (i^j)...
  pand mm5, mm7                 ; lsb's of (i^j)...
  psubb mm0, mm4                ; ...are subtracted from result of pavgb
  movq [ecx], mm0
  psubb mm1, mm5                ; ...are subtracted from result of pavgb
  movq [ecx+edx], mm1
%endmacro
187 : | |||
;-----------------------------------------------------------------------------
; interpolate8x8_halfpel_v_xmm - 8x8 vertical halfpel copy.
; cdecl: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_halfpel_v_xmm:

  mov eax, [esp+16]             ; rounding
  mov ecx, [esp+ 4]             ; Dst
  test eax, eax                 ; ZF consumed by jnz below
  mov eax, [esp+ 8]             ; Src
  mov edx, [esp+12]             ; stride

  ; we process 2 lines at a time
  jnz near .rounding1

  COPY_V_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND0
  ret

.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  movq mm7, [mmx_one]
  movq mm2, [eax]               ; loop invariant: current top row
  add eax, edx

  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  ret
ENDFUNC
224 : | Isibaar | 1.1 | |
;===========================================================================
;
; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
;
;===========================================================================

; The trick is to correct the result of 'pavgb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
; The boolean relations are:
;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.

; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).

;-----------------------------------------------------------------------------
; COPY_HV_SSE_RND0 - two rows of diagonal halfpel, rounding==0, using the
; (ij|kl)&st correction from the table above, i.e. dst=(i+j+k+l+2)/4.
; Entry invariants (established by the caller, preserved between calls):
;   mm2 = (i+j+1)/2 and mm3 = i^j of the current top row, mm7 = mmx_one.
; Advances eax by 2 rows; advances ecx by 1 row (caller adds the second);
; leaves fresh mm2/mm3 invariants for the bottom row processed.
;-----------------------------------------------------------------------------
%macro COPY_HV_SSE_RND0 0
  lea eax, [eax+edx]

  movq mm0, [eax]
  movq mm1, [eax+1]

  movq mm6, mm0
  pavgb mm0, mm1                ; mm0=(j+k+1)/2. preserved for next step
  lea eax, [eax+edx]
  pxor mm1, mm6                 ; mm1=(j^k). preserved for next step

  por mm3, mm1                  ; ij |= jk
  movq mm6, mm2
  pxor mm6, mm0                 ; mm6 = s^t
  pand mm3, mm6                 ; (ij|jk) &= st
  pavgb mm2, mm0                ; mm2 = (s+t+1)/2
  pand mm3, mm7                 ; mask lsb
  psubb mm2, mm3                ; apply.

  movq [ecx], mm2

  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3                ; preserved for next iteration
  lea ecx, [ecx+edx]
  pxor mm3, mm6                 ; preserved for next iteration

  por mm1, mm3                  ; same correction, second row pair
  movq mm6, mm0
  pxor mm6, mm2
  pand mm1, mm6
  pavgb mm0, mm2

  pand mm1, mm7
  psubb mm0, mm1

  movq [ecx], mm0
%endmacro
285 : | |||
;-----------------------------------------------------------------------------
; COPY_HV_SSE_RND1 - two rows of diagonal halfpel, rounding==1, using the
; (ij&kl)|st correction from the table above, i.e. dst=(i+j+k+l+1)/4.
; Same entry/exit invariants as COPY_HV_SSE_RND0 (mm2/mm3 carried between
; invocations, mm7 = mmx_one, eax +2 rows, ecx +1 row internally).
;-----------------------------------------------------------------------------
%macro COPY_HV_SSE_RND1 0
  lea eax, [eax+edx]

  movq mm0, [eax]
  movq mm1, [eax+1]

  movq mm6, mm0
  pavgb mm0, mm1                ; mm0=(j+k+1)/2. preserved for next step
  lea eax, [eax+edx]
  pxor mm1, mm6                 ; mm1=(j^k). preserved for next step

  pand mm3, mm1                 ; ij &= jk
  movq mm6, mm2
  pxor mm6, mm0                 ; mm6 = s^t
  por mm3, mm6                  ; (ij&jk) |= st
  pavgb mm2, mm0                ; (s+t+1)/2
  pand mm3, mm7                 ; mask lsb
  psubb mm2, mm3                ; apply.

  movq [ecx], mm2

  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3                ; preserved for next iteration
  lea ecx, [ecx+edx]
  pxor mm3, mm6                 ; preserved for next iteration

  pand mm1, mm3                 ; same correction, second row pair
  movq mm6, mm0
  pxor mm6, mm2
  por mm1, mm6
  pavgb mm0, mm2
  pand mm1, mm7
  psubb mm0, mm1

  movq [ecx], mm0
%endmacro
324 : | |||
;-----------------------------------------------------------------------------
; interpolate8x8_halfpel_hv_xmm - 8x8 diagonal (h+v) halfpel copy.
; cdecl: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_halfpel_hv_xmm:
  mov eax, [esp+16]             ; rounding
  mov ecx, [esp+ 4]             ; Dst
  test eax, eax                 ; ZF consumed by jnz below; movq/pavgb/pxor
  mov eax, [esp+ 8]             ; Src      in between leave EFLAGS untouched
  mov edx, [esp+12]             ; stride

  movq mm7, [mmx_one]

  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6                 ; mm2/mm3 ready

  jnz near .rounding1

  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  ret

.rounding1:
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  ret
ENDFUNC
363 : | edgomez | 1.7 | |
;===========================================================================
;
; void interpolate8x4_halfpel_h_xmm(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
; 8x4 version of interpolate8x8_halfpel_h_xmm: only 4 rows are produced
; (two COPY_H_SSE_RND* invocations instead of four).
;===========================================================================

ALIGN 16
interpolate8x4_halfpel_h_xmm:

  mov eax, [esp+16]             ; rounding
  mov ecx, [esp+ 4]             ; Dst
  test eax, eax
  mov eax, [esp+ 8]             ; Src
  mov edx, [esp+12]             ; stride

  jnz near .rounding1

  COPY_H_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0
  ret

.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  movq mm7, [mmx_one]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1
  ret
ENDFUNC
397 : | suxen_drol | 1.10 | |
;===========================================================================
;
; void interpolate8x4_halfpel_v_xmm(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
; 8x4 version of interpolate8x8_halfpel_v_xmm: only 4 rows are produced.
;===========================================================================

ALIGN 16
interpolate8x4_halfpel_v_xmm:

  mov eax, [esp+16]             ; rounding
  mov ecx, [esp+ 4]             ; Dst
  test eax, eax
  mov eax, [esp+ 8]             ; Src
  mov edx, [esp+12]             ; stride

  ; we process 2 lines at a time
  jnz near .rounding1

  COPY_V_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND0
  ret

.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  movq mm7, [mmx_one]
  movq mm2, [eax]               ; loop invariant
  add eax, edx

  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  ret
ENDFUNC
435 : | suxen_drol | 1.10 | |
;===========================================================================
;
; void interpolate8x4_halfpel_hv_xmm(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
; 8x4 version of interpolate8x8_halfpel_hv_xmm: only 4 rows are produced.
;===========================================================================

; The trick is to correct the result of 'pavgb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
; The boolean relations are:
;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.

; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).

ALIGN 16
interpolate8x4_halfpel_hv_xmm:
  mov eax, [esp+16]             ; rounding
  mov ecx, [esp+ 4]             ; Dst
  test eax, eax                 ; ZF consumed by jnz below
  mov eax, [esp+ 8]             ; Src
  mov edx, [esp+12]             ; stride

  movq mm7, [mmx_one]

  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6                 ; mm2/mm3 ready

  jnz near .rounding1

  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  ret

.rounding1:
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  ret
ENDFUNC
487 : | suxen_drol | 1.10 | |
;===========================================================================
;
; The next functions combine both source halfpel interpolation step and the
; averaging (with rounding) step to avoid wasting memory bandwidth computing
; intermediate halfpel images and then averaging them.
;
;===========================================================================

; Load the three cdecl arguments shared by all *_add functions.
%macro PROLOG0 0
  mov ecx, [esp+ 4]             ; Dst
  mov eax, [esp+ 8]             ; Src
  mov edx, [esp+12]             ; BpS
%endmacro

; PROLOG0 plus a test of bit 0 of the rounding argument; sets ZF for the
; caller's jnz.
%macro PROLOG1 0
  PROLOG0
  test dword [esp+16], 1        ; Rounding?
%endmacro

%macro EPILOG 0
  ret
%endmacro
508 : | |||
;===========================================================================
;
; void interpolate8x8_halfpel_add_xmm(uint8_t * const dst,
;                                     const uint8_t * const src,
;                                     const uint32_t stride,
;                                     const uint32_t rounding);
;
;
;===========================================================================

;-----------------------------------------------------------------------------
; ADD_FF - average two full-pel source rows into dst: dst = (dst+src+1)/2.
; Offsets %1/%2 select the two rows relative to eax (src) and ecx (dst).
; The ';;' lines appear to be a disabled exact-rounding correction
; (NOTE(review): left commented out upstream, so this path always uses
; pavgb's round-up behavior).
;-----------------------------------------------------------------------------
%macro ADD_FF 2
  movq mm0, [eax+%1]
  movq mm1, [eax+%2]
  ;;---
  ;; movq mm2, mm0
  ;; movq mm3, mm1
  ;;---
  pavgb mm0, [ecx+%1]
  pavgb mm1, [ecx+%2]
  ;;--
  ;; por mm2, [ecx+%1]
  ;; por mm3, [ecx+%2]
  ;; pand mm2, [mmx_one]
  ;; pand mm3, [mmx_one]
  ;; psubsb mm0, mm2
  ;; psubsb mm1, mm3
  ;;--
  movq [ecx+%1], mm0
  movq [ecx+%2], mm1
%endmacro
539 : | |||
ALIGN 16
interpolate8x8_halfpel_add_xmm:   ; 23c
  PROLOG1                       ; NOTE(review): the rounding test's ZF is
                                ; never branched on in this function - both
                                ; rounding modes take the same pavgb path
  ADD_FF 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FF 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FF 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FF 0, edx
  EPILOG
ENDFUNC
555 : | edgomez | 1.7 | |
;===========================================================================
;
; void interpolate8x8_halfpel_h_add_xmm(uint8_t * const dst,
;                                       const uint8_t * const src,
;                                       const uint32_t stride,
;                                       const uint32_t rounding);
;
;
;===========================================================================

; Two rows: horizontal halfpel of src (round up), then average into dst.
%macro ADD_FH_RND0 2
  movq mm0, [eax+%1]
  movq mm1, [eax+%2]
  pavgb mm0, [eax+%1+1]         ; (i+j+1)/2
  pavgb mm1, [eax+%2+1]
  pavgb mm0, [ecx+%1]           ; average with dst
  pavgb mm1, [ecx+%2]
  movq [ecx+%1], mm0
  movq [ecx+%2], mm1
%endmacro

; Two rows: horizontal halfpel of src with the rounding==1 correction
; ((i+j)/2 = (i+j+1)/2 - ((i^j)&1)), then average into dst.
%macro ADD_FH_RND1 2
  movq mm0, [eax+%1]
  movq mm1, [eax+%2]
  movq mm4, mm0                 ; keep i for i^j
  movq mm5, mm1
  movq mm2, [eax+%1+1]
  movq mm3, [eax+%2+1]
  pavgb mm0, mm2
  ; lea ??
  pxor mm2, mm4                 ; i^j
  pavgb mm1, mm3
  pxor mm3, mm5
  pand mm2, [mmx_one]
  pand mm3, [mmx_one]
  psubb mm0, mm2                ; corrected halfpel value
  psubb mm1, mm3
  pavgb mm0, [ecx+%1]           ; average with dst
  pavgb mm1, [ecx+%2]
  movq [ecx+%1], mm0
  movq [ecx+%2], mm1
%endmacro
599 : | |||
ALIGN 16
interpolate8x8_halfpel_h_add_xmm:   ; 32c
  PROLOG1                       ; sets ZF from the rounding argument
  jnz near .Loop1

  ; rounding==0
  ADD_FH_RND0 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND0 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND0 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND0 0, edx
  EPILOG

.Loop1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  ; movq mm7, [mmx_one]
  ADD_FH_RND1 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND1 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND1 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND1 0, edx
  EPILOG
ENDFUNC
631 : | edgomez | 1.7 | |
632 : | |||
633 : | ;=========================================================================== | ||
634 : | ; | ||
635 : | ; void interpolate8x8_halfpel_v_add_xmm(uint8_t * const dst, | ||
636 : | ; const uint8_t * const src, | ||
637 : | ; const uint32_t stride, | ||
638 : | ; const uint32_t rounding); | ||
639 : | ; | ||
640 : | ; | ||
641 : | ;=========================================================================== | ||
642 : | |||
643 : | %macro ADD_8_HF_RND0 0 | ||
644 : | movq mm0, [eax] | ||
645 : | movq mm1, [eax+edx] | ||
646 : | pavgb mm0, mm1 | ||
647 : | pavgb mm1, [eax+2*edx] | ||
648 : | lea eax,[eax+2*edx] | ||
649 : | pavgb mm0, [ecx] | ||
650 : | pavgb mm1, [ecx+edx] | ||
651 : | movq [ecx],mm0 | ||
652 : | movq [ecx+edx],mm1 | ||
653 : | %endmacro | ||
654 : | |||
655 : | %macro ADD_8_HF_RND1 0 | ||
656 : | movq mm1, [eax+edx] | ||
657 : | movq mm2, [eax+2*edx] | ||
658 : | lea eax,[eax+2*edx] | ||
659 : | movq mm4, mm0 | ||
660 : | movq mm5, mm1 | ||
661 : | pavgb mm0, mm1 | ||
662 : | pxor mm4, mm1 | ||
663 : | pavgb mm1, mm2 | ||
664 : | pxor mm5, mm2 | ||
665 : | pand mm4, mm7 ; lsb's of (i^j)... | ||
666 : | pand mm5, mm7 ; lsb's of (i^j)... | ||
667 : | psubb mm0, mm4 ; ...are substracted from result of pavgb | ||
668 : | pavgb mm0, [ecx] | ||
669 : | movq [ecx], mm0 | ||
670 : | psubb mm1, mm5 ; ...are substracted from result of pavgb | ||
671 : | pavgb mm1, [ecx+edx] | ||
672 : | movq [ecx+edx], mm1 | ||
673 : | %endmacro | ||
674 : | |||
ALIGN 16
interpolate8x8_halfpel_v_add_xmm:
  PROLOG1                       ; sets ZF from the rounding argument

  jnz near .Loop1
  pxor mm7, mm7                 ; this is a NOP (mm7 unused on this path)

  ADD_8_HF_RND0
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND0
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND0
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND0
  EPILOG

.Loop1:
  movq mm0, [eax]               ; loop invariant: current top row
  movq mm7, [mmx_one]

  ADD_8_HF_RND1
  movq mm0, mm2                 ; carry last row into the next row pair
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND1
  movq mm0, mm2
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND1
  movq mm0, mm2
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND1
  EPILOG
ENDFUNC
707 : | edgomez | 1.7 | |
; The trick is to correct the result of 'pavgb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
; The boolean relations are:
;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.

; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).

;===========================================================================
;
; void interpolate8x8_halfpel_hv_add_xmm(uint8_t * const dst,
;                                        const uint8_t * const src,
;                                        const uint32_t stride,
;                                        const uint32_t rounding);
;
;
;===========================================================================

;-----------------------------------------------------------------------------
; ADD_HH_RND0 - two rows of diagonal halfpel (rounding==0, (ij|kl)&st
; correction, i.e. src value (i+j+k+l+2)/4), then average into dst.
; Same mm2/mm3 invariants as COPY_HV_SSE_RND0; mm7 = mmx_one.
;-----------------------------------------------------------------------------
%macro ADD_HH_RND0 0
  lea eax, [eax+edx]

  movq mm0, [eax]
  movq mm1, [eax+1]

  movq mm6, mm0
  pavgb mm0, mm1                ; mm0=(j+k+1)/2. preserved for next step
  lea eax, [eax+edx]
  pxor mm1, mm6                 ; mm1=(j^k). preserved for next step

  por mm3, mm1                  ; ij |= jk
  movq mm6, mm2
  pxor mm6, mm0                 ; mm6 = s^t
  pand mm3, mm6                 ; (ij|jk) &= st
  pavgb mm2, mm0                ; mm2 = (s+t+1)/2
  pand mm3, mm7                 ; mask lsb
  psubb mm2, mm3                ; apply.

  pavgb mm2, [ecx]              ; average with dst
  movq [ecx], mm2

  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3                ; preserved for next iteration
  lea ecx, [ecx+edx]
  pxor mm3, mm6                 ; preserved for next iteration

  por mm1, mm3                  ; same correction, second row pair
  movq mm6, mm0
  pxor mm6, mm2
  pand mm1, mm6
  pavgb mm0, mm2

  pand mm1, mm7
  psubb mm0, mm1

  pavgb mm0, [ecx]              ; average with dst
  movq [ecx], mm0
%endmacro
770 : | |||
;-----------------------------------------------------------------------------
; ADD_HH_RND1 - two rows of diagonal halfpel (rounding==1, (ij&kl)|st
; correction, i.e. src value (i+j+k+l+1)/4), then average into dst.
; Same mm2/mm3 invariants as ADD_HH_RND0; mm7 = mmx_one.
;-----------------------------------------------------------------------------
%macro ADD_HH_RND1 0
  lea eax, [eax+edx]

  movq mm0, [eax]
  movq mm1, [eax+1]

  movq mm6, mm0
  pavgb mm0, mm1                ; mm0=(j+k+1)/2. preserved for next step
  lea eax, [eax+edx]
  pxor mm1, mm6                 ; mm1=(j^k). preserved for next step

  pand mm3, mm1                 ; ij &= jk
  movq mm6, mm2
  pxor mm6, mm0                 ; mm6 = s^t
  por mm3, mm6                  ; (ij&jk) |= st
  pavgb mm2, mm0                ; (s+t+1)/2
  pand mm3, mm7                 ; mask lsb
  psubb mm2, mm3                ; apply.

  pavgb mm2, [ecx]              ; average with dst
  movq [ecx], mm2

  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3                ; preserved for next iteration
  lea ecx, [ecx+edx]
  pxor mm3, mm6                 ; preserved for next iteration

  pand mm1, mm3                 ; same correction, second row pair
  movq mm6, mm0
  pxor mm6, mm2
  por mm1, mm6
  pavgb mm0, mm2
  pand mm1, mm7
  psubb mm0, mm1

  pavgb mm0, [ecx]              ; average with dst
  movq [ecx], mm0
%endmacro
811 : | |||
ALIGN 16
interpolate8x8_halfpel_hv_add_xmm:
  PROLOG1                       ; sets ZF from the rounding argument

  movq mm7, [mmx_one]

  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6                 ; mm2/mm3 ready

  jnz near .Loop1               ; MMX ops above leave PROLOG1's EFLAGS intact

  ADD_HH_RND0
  add ecx, edx
  ADD_HH_RND0
  add ecx, edx
  ADD_HH_RND0
  add ecx, edx
  ADD_HH_RND0
  EPILOG

.Loop1:
  ADD_HH_RND1
  add ecx, edx
  ADD_HH_RND1
  add ecx, edx
  ADD_HH_RND1
  add ecx, edx
  ADD_HH_RND1

  EPILOG
ENDFUNC
847 : | edgomez | 1.9 | |
848 : | Isibaar | 1.11 | |
; Mark the stack non-executable on ELF targets (the noexec .note.GNU-stack
; section tells GNU ld this object does not need an executable stack).
%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif
852 : |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |