Annotation of /xvidcore/src/image/x86_asm/interpolate8x8_3dn.asm

Revision 1.9 - (view) (download)

1 :	edgomez	1.5	;/*****************************************************************************
2 :	Isibaar	1.1	; *
3 :	edgomez	1.5	; * XVID MPEG-4 VIDEO CODEC
4 :			; * - 3dnow 8x8 block-based halfpel interpolation -
5 :	Isibaar	1.1	; *
6 :	edgomez	1.5	; * Copyright(C) 2001 Peter Ross <pross@xvid.org>
7 :			; * 2002 Michael Militzer <isibaar@xvid.org>
8 :			; * 2002 Pascal Massimino <skal@planet-d.net>
9 :	Isibaar	1.1	; *
10 :	edgomez	1.5	; * This program is free software ; you can redistribute it and/or modify
11 :			; * it under the terms of the GNU General Public License as published by
12 :			; * the Free Software Foundation ; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :	edgomez	1.3	; *
15 :	edgomez	1.5	; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :	Isibaar	1.1	; *
20 :	edgomez	1.5	; * You should have received a copy of the GNU General Public License
21 :			; * along with this program ; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :	edgomez	1.4	; *
24 :	edgomez	1.5	; ****************************************************************************/
25 :	edgomez	1.4
26 :	edgomez	1.5	BITS 32
27 :	Isibaar	1.1
28 :	edgomez	1.5	%macro cglobal 1 ; export symbol %1, optionally underscore-prefixed and/or marked as a function
29 :	Isibaar	1.1	%ifdef PREFIX ; PREFIX: the exported name is _%1
30 :	edgomez	1.7	%ifdef MARK_FUNCS ; MARK_FUNCS: attach ':function' and a size expression to the export
31 :	edgomez	1.8	global _%1:function %1.endfunc-%1 ; size = distance from the label to its local .endfunc label
32 :			%define %1 _%1:function %1.endfunc-%1 ; NOTE(review): later uses of %1 expand to this whole text - confirm PREFIX+MARK_FUNCS is ever combined
33 :	edgomez	1.7	%else
34 :			global _%1
35 :			%define %1 _%1 ; later uses of %1 (including its label) expand to _%1
36 :			%endif
37 :	Isibaar	1.1	%else ; no PREFIX: export %1 under its own name
38 :	edgomez	1.7	%ifdef MARK_FUNCS
39 :	edgomez	1.8	global %1:function %1.endfunc-%1 ; relies on each function ending with a local '.endfunc' label
40 :	edgomez	1.7	%else
41 :			global %1
42 :			%endif
43 :	Isibaar	1.1	%endif
44 :			%endmacro
45 :
46 :	edgomez	1.5	;=============================================================================
47 :			; Read Only data
48 :			;=============================================================================
49 :
50 :			%ifdef FORMAT_COFF ; FORMAT_COFF builds declare the section without the align attribute
51 :	edgomez	1.6	SECTION .rodata
52 :	edgomez	1.5	%else
53 :	edgomez	1.6	SECTION .rodata align=16
54 :	edgomez	1.5	%endif
55 :
56 :			ALIGN 16
57 :			mmx_one: ; 8 bytes of 0x01; the rounding paths load it into mm7 as a per-byte lsb mask
58 :			times 8 db 1
59 :
60 :			;=============================================================================
61 :			; Code
62 :			;=============================================================================
63 :	Isibaar	1.1
64 :	edgomez	1.5	SECTION .text
65 :	Isibaar	1.1
66 :			cglobal interpolate8x8_halfpel_h_3dn ; 8x8 entry points: horizontal, vertical, horizontal+vertical
67 :			cglobal interpolate8x8_halfpel_v_3dn
68 :			cglobal interpolate8x8_halfpel_hv_3dn
69 :
70 :	suxen_drol	1.9	cglobal interpolate8x4_halfpel_h_3dn ; 8x4 entry points: same row macros, half as many rows
71 :			cglobal interpolate8x4_halfpel_v_3dn
72 :			cglobal interpolate8x4_halfpel_hv_3dn
73 :
74 :	edgomez	1.5	;-----------------------------------------------------------------------------
75 :	Isibaar	1.1	;
76 :			; void interpolate8x8_halfpel_h_3dn(uint8_t * const dst,
77 :	edgomez	1.5	; const uint8_t * const src,
78 :			; const uint32_t stride,
79 :			; const uint32_t rounding);
80 :	Isibaar	1.1	;
81 :	edgomez	1.5	;-----------------------------------------------------------------------------
82 :	Isibaar	1.1
83 :			%macro COPY_H_3DN_RND0 0 ; two rows of horizontal average; eax=Src, ecx=Dst, edx=stride
84 :	edgomez	1.5	movq mm0, [eax] ; row 0, pixels x..x+7
85 :	Isibaar	1.1	pavgusb mm0, [eax+1] ; averaged with the pixels one byte to the right: (i+j+1)/2
86 :	edgomez	1.5	movq mm1, [eax+edx] ; row 1
87 :	Isibaar	1.1	pavgusb mm1, [eax+edx+1]
88 :	edgomez	1.5	lea eax, [eax+2*edx] ; Src += 2 rows (Dst is advanced by the caller between invocations)
89 :			movq [ecx], mm0
90 :			movq [ecx+edx], mm1
91 :	Isibaar	1.1	%endmacro
92 :
93 :			%macro COPY_H_3DN_RND1 0 ; two rows of truncating horizontal average; expects mm7 = lsb mask
94 :			movq mm0, [eax] ; row 0, pixels x (i)
95 :			movq mm1, [eax+edx] ; row 1, pixels x
96 :			movq mm4, mm0 ; copies of i kept for the xor below
97 :			movq mm5, mm1
98 :	edgomez	1.5	movq mm2, [eax+1] ; row 0, pixels x+1 (j)
99 :	Isibaar	1.1	movq mm3, [eax+edx+1] ; row 1, pixels x+1
100 :			pavgusb mm0, mm2 ; (i+j+1)/2
101 :			pxor mm2, mm4 ; i^j
102 :			pavgusb mm1, mm3
103 :	edgomez	1.5	lea eax, [eax+2*edx] ; Src += 2 rows
104 :	Isibaar	1.1	pxor mm3, mm5
105 :			pand mm2, mm7 ; (i^j)&1
106 :			pand mm3, mm7
107 :			psubb mm0, mm2 ; (i+j+1)/2 - (i^j)&1 = (i+j)/2
108 :			movq [ecx], mm0
109 :			psubb mm1, mm3
110 :			movq [ecx+edx], mm1
111 :			%endmacro
112 :
113 :	edgomez	1.5	ALIGN 16
114 :	Isibaar	1.1	interpolate8x8_halfpel_h_3dn: ; 8 rows: Dst[x] = average of Src[x] and Src[x+1]; args are read from the stack
115 :
116 :			mov eax, [esp+16] ; rounding
117 :			mov ecx, [esp+ 4] ; Dst
118 :	edgomez	1.5	test eax, eax ; ZF set when rounding == 0
119 :	Isibaar	1.1	mov eax, [esp+ 8] ; Src (eax is reused; mov leaves the flags from 'test' intact)
120 :			mov edx, [esp+12] ; stride
121 :
122 :			jnz near .rounding1 ; rounding != 0: use the truncating average
123 :
124 :			COPY_H_3DN_RND0 ; rows 0-1 (the macro advances Src by two rows)
125 :	edgomez	1.5	lea ecx, [ecx+2*edx] ; Dst += 2 rows
126 :	Isibaar	1.1	COPY_H_3DN_RND0 ; rows 2-3
127 :	edgomez	1.5	lea ecx, [ecx+2*edx]
128 :	Isibaar	1.1	COPY_H_3DN_RND0 ; rows 4-5
129 :	edgomez	1.5	lea ecx, [ecx+2*edx]
130 :	Isibaar	1.1	COPY_H_3DN_RND0 ; rows 6-7
131 :			ret
132 :
133 :			.rounding1: ; colon added: a bare label alone on a line triggers NASM's orphan-label warning
134 :	edgomez	1.5	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
135 :	Isibaar	1.1	movq mm7, [mmx_one] ; mm7 = 0x01 per byte, the lsb mask used by COPY_H_3DN_RND1
136 :			COPY_H_3DN_RND1 ; rows 0-1
137 :			lea ecx, [ecx+2*edx] ; Dst += 2 rows
138 :			COPY_H_3DN_RND1 ; rows 2-3
139 :	edgomez	1.5	lea ecx, [ecx+2*edx]
140 :	Isibaar	1.1	COPY_H_3DN_RND1 ; rows 4-5
141 :	edgomez	1.5	lea ecx, [ecx+2*edx]
142 :	Isibaar	1.1	COPY_H_3DN_RND1 ; rows 6-7
143 :			ret ; NOTE(review): no emms/femms on either return path - presumably left to the caller; confirm
144 :	edgomez	1.8	.endfunc: ; end marker referenced by cglobal's size expression when MARK_FUNCS is defined
145 :	Isibaar	1.1
146 :
147 :	edgomez	1.5	;-----------------------------------------------------------------------------
148 :	Isibaar	1.1	;
149 :			; void interpolate8x8_halfpel_v_3dn(uint8_t * const dst,
150 :	edgomez	1.5	; const uint8_t * const src,
151 :			; const uint32_t stride,
152 :			; const uint32_t rounding);
153 :	Isibaar	1.1	;
154 :	edgomez	1.5	;-----------------------------------------------------------------------------
155 :	Isibaar	1.1
156 :			%macro COPY_V_3DN_RND0 0 ; two rows of vertical average; eax=Src, ecx=Dst, edx=stride
157 :	edgomez	1.5	movq mm0, [eax] ; row n
158 :			movq mm1, [eax+edx] ; row n+1
159 :	Isibaar	1.1	pavgusb mm0, mm1 ; (row n + row n+1 + 1)/2
160 :			pavgusb mm1, [eax+2*edx] ; (row n+1 + row n+2 + 1)/2
161 :	edgomez	1.5	lea eax, [eax+2*edx] ; Src += 2 rows (Dst is advanced by the caller between invocations)
162 :			movq [ecx], mm0
163 :			movq [ecx+edx], mm1
164 :	Isibaar	1.1	%endmacro
165 :
166 :			%macro COPY_V_3DN_RND1 0 ; two rows of truncating vertical average; expects mm2 = previous row, mm7 = lsb mask
167 :			movq mm0, mm2 ; row n, carried over from the previous invocation (or the caller's preload)
168 :			movq mm1, [eax] ; row n+1
169 :			movq mm2, [eax+edx] ; row n+2, kept in mm2 for the next invocation
170 :	edgomez	1.5	lea eax, [eax+2*edx] ; Src += 2 rows
171 :	Isibaar	1.1	movq mm4, mm0 ; copies kept for the xor terms
172 :			movq mm5, mm1
173 :			pavgusb mm0, mm1 ; (i+j+1)/2 for rows n, n+1
174 :	edgomez	1.5	pxor mm4, mm1 ; i^j for rows n, n+1
175 :	Isibaar	1.1	pavgusb mm1, mm2 ; (i+j+1)/2 for rows n+1, n+2
176 :			pxor mm5, mm2 ; i^j for rows n+1, n+2
177 :	edgomez	1.5	pand mm4, mm7 ; lsb's of (i^j)...
178 :			pand mm5, mm7 ; lsb's of (i^j)...
179 :			psubb mm0, mm4 ; ...are subtracted from result of pavgusb
180 :	Isibaar	1.1	movq [ecx], mm0
181 :	edgomez	1.5	psubb mm1, mm5 ; ...are subtracted from result of pavgusb
182 :	Isibaar	1.1	movq [ecx+edx], mm1
183 :			%endmacro
184 :
185 :	edgomez	1.5	ALIGN 16
186 :	Isibaar	1.1	interpolate8x8_halfpel_v_3dn: ; 8 rows: Dst row = average of a Src row and the row below it; args are read from the stack
187 :
188 :			mov eax, [esp+16] ; rounding
189 :			mov ecx, [esp+ 4] ; Dst
190 :			test eax,eax ; ZF set when rounding == 0
191 :			mov eax, [esp+ 8] ; Src (eax is reused; mov leaves the flags from 'test' intact)
192 :			mov edx, [esp+12] ; stride
193 :
194 :			; we process 2 lines at a time
195 :
196 :			jnz near .rounding1 ; rounding != 0: use the truncating average
197 :
198 :			COPY_V_3DN_RND0 ; rows 0-1 (the macro advances Src by two rows)
199 :			lea ecx, [ecx+2*edx] ; Dst += 2 rows
200 :			COPY_V_3DN_RND0 ; rows 2-3
201 :			lea ecx, [ecx+2*edx]
202 :			COPY_V_3DN_RND0 ; rows 4-5
203 :			lea ecx, [ecx+2*edx]
204 :			COPY_V_3DN_RND0 ; rows 6-7
205 :			ret
206 :
207 :			.rounding1: ; colon added: a bare label alone on a line triggers NASM's orphan-label warning
208 :	edgomez	1.5	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
209 :	Isibaar	1.1	movq mm7, [mmx_one] ; mm7 = 0x01 per byte, the lsb mask used by COPY_V_3DN_RND1
210 :	edgomez	1.5	movq mm2, [eax] ; loop invariant: mm2 holds the row above the next pair
211 :	Isibaar	1.1	add eax, edx ; Src now points at row 1, as COPY_V_3DN_RND1 expects
212 :
213 :			COPY_V_3DN_RND1 ; rows 0-1
214 :	edgomez	1.5	lea ecx, [ecx+2*edx] ; Dst += 2 rows
215 :	Isibaar	1.1	COPY_V_3DN_RND1 ; rows 2-3
216 :	edgomez	1.5	lea ecx, [ecx+2*edx]
217 :	Isibaar	1.1	COPY_V_3DN_RND1 ; rows 4-5
218 :	edgomez	1.5	lea ecx, [ecx+2*edx]
219 :	Isibaar	1.1	COPY_V_3DN_RND1 ; rows 6-7
220 :			ret ; NOTE(review): no emms/femms on either return path - presumably left to the caller; confirm
221 :	edgomez	1.8	.endfunc: ; end marker referenced by cglobal's size expression when MARK_FUNCS is defined
222 :	Isibaar	1.1
223 :
224 :	edgomez	1.5	;-----------------------------------------------------------------------------
225 :	Isibaar	1.1	;
226 :			; void interpolate8x8_halfpel_hv_3dn(uint8_t * const dst,
227 :	edgomez	1.5	; const uint8_t * const src,
228 :			; const uint32_t stride,
229 :			; const uint32_t rounding);
230 :	Isibaar	1.1	;
231 :			;
232 :	edgomez	1.5	;-----------------------------------------------------------------------------
233 :	Isibaar	1.1
234 :			; The trick is to correct the result of 'pavgusb' with some combination of the
235 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgusb' (s and t).
236 :			; The boolean relations are:
237 :	edgomez	1.5	; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
238 :	Isibaar	1.1	; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
239 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
240 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
241 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
242 :
243 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
244 :
245 :			%macro COPY_HV_3DN_RND0 0 ; two output rows using the (ij|kl)&st correction; expects mm2/mm3 from the previous row, mm7 = lsb mask
246 :	edgomez	1.5	lea eax, [eax+edx] ; Src += 1 row
247 :
248 :			movq mm0, [eax]
249 :			movq mm1, [eax+1]
250 :
251 :			movq mm6, mm0
252 :			pavgusb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
253 :			lea eax, [eax+edx] ; Src += 1 row (for the second half below)
254 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
255 :
256 :			por mm3, mm1 ; ij |= jk
257 :			movq mm6, mm2
258 :			pxor mm6, mm0 ; mm6 = s^t
259 :			pand mm3, mm6 ; (ij|jk) &= st
260 :			pavgusb mm2, mm0 ; mm2 = (s+t+1)/2
261 :			pand mm3, mm7 ; mask lsb
262 :			psubb mm2, mm3 ; apply.
263 :	Isibaar	1.1
264 :	edgomez	1.5	movq [ecx], mm2 ; first output row
265 :	Isibaar	1.1
266 :	edgomez	1.5	movq mm2, [eax]
267 :			movq mm3, [eax+1]
268 :			movq mm6, mm2
269 :			pavgusb mm2, mm3 ; preserved for next iteration
270 :			lea ecx, [ecx+edx] ; Dst += 1 row
271 :			pxor mm3, mm6 ; preserved for next iteration
272 :
273 :			por mm1, mm3 ; same correction as above, with mm0/mm1 as the upper row and mm2/mm3 as the lower row
274 :			movq mm6, mm0
275 :			pxor mm6, mm2
276 :			pand mm1, mm6
277 :			pavgusb mm0, mm2
278 :	Isibaar	1.1
279 :	edgomez	1.5	pand mm1, mm7
280 :			psubb mm0, mm1
281 :	Isibaar	1.1
282 :	edgomez	1.5	movq [ecx], mm0 ; second output row; Dst is left pointing at it
283 :	Isibaar	1.1	%endmacro
284 :
285 :			%macro COPY_HV_3DN_RND1 0 ; two output rows using the (ij&kl)|st correction; expects mm2/mm3 from the previous row, mm7 = lsb mask
286 :	edgomez	1.5	lea eax,[eax+edx] ; Src += 1 row
287 :
288 :			movq mm0, [eax]
289 :			movq mm1, [eax+1]
290 :
291 :			movq mm6, mm0
292 :			pavgusb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
293 :			lea eax, [eax+edx] ; Src += 1 row (for the second half below)
294 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
295 :
296 :			pand mm3, mm1 ; ij & kl
297 :			movq mm6, mm2
298 :			pxor mm6, mm0 ; mm6 = s^t
299 :			por mm3, mm6 ; (ij&kl) | st
300 :			pavgusb mm2, mm0 ; mm2 = (s+t+1)/2
301 :			pand mm3, mm7 ; mask lsb
302 :			psubb mm2, mm3 ; subtract the correction
303 :	Isibaar	1.1
304 :	edgomez	1.5	movq [ecx], mm2 ; first output row
305 :	Isibaar	1.1
306 :	edgomez	1.5	movq mm2, [eax]
307 :			movq mm3, [eax+1]
308 :			movq mm6, mm2
309 :			pavgusb mm2, mm3 ; preserved for next iteration
310 :			lea ecx, [ecx+edx] ; Dst += 1 row
311 :			pxor mm3, mm6 ; preserved for next iteration
312 :
313 :			pand mm1, mm3 ; same correction as above, with mm0/mm1 as the upper row and mm2/mm3 as the lower row
314 :			movq mm6, mm0
315 :			pxor mm6, mm2
316 :			por mm1, mm6
317 :			pavgusb mm0, mm2
318 :			pand mm1, mm7
319 :			psubb mm0, mm1
320 :	Isibaar	1.1
321 :	edgomez	1.5	movq [ecx], mm0 ; second output row; Dst is left pointing at it
322 :	Isibaar	1.1	%endmacro
323 :
324 :	edgomez	1.5	ALIGN 16
325 :	Isibaar	1.1	interpolate8x8_halfpel_hv_3dn: ; 8 rows of 4-neighbour average; colon added to match the _h/_v labels and avoid NASM's orphan-label warning
326 :			mov eax, [esp+16] ; rounding
327 :			mov ecx, [esp+ 4] ; Dst
328 :	edgomez	1.5	test eax, eax ; ZF set when rounding == 0; consumed by the jnz below
329 :	Isibaar	1.1	mov eax, [esp+ 8] ; Src (eax is reused)
330 :			mov edx, [esp+12] ; stride
331 :
332 :			movq mm7, [mmx_one] ; mm7 = 0x01 per byte, the lsb mask used by both macros
333 :
334 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
335 :			movq mm2, [eax]
336 :			movq mm3, [eax+1]
337 :			movq mm6, mm2
338 :			pavgusb mm2, mm3
339 :	edgomez	1.5	pxor mm3, mm6 ; mm2/mm3 ready
340 :	Isibaar	1.1
341 :			jnz near .rounding1 ; still the flags from 'test': only mov and MMX/3DNow! ops in between
342 :
343 :			COPY_HV_3DN_RND0 ; rows 0-1 (the macro advances Src by two rows and Dst by one)
344 :			add ecx, edx ; Dst += 1 row, onto the next pair
345 :			COPY_HV_3DN_RND0 ; rows 2-3
346 :			add ecx, edx
347 :			COPY_HV_3DN_RND0 ; rows 4-5
348 :			add ecx, edx
349 :			COPY_HV_3DN_RND0 ; rows 6-7
350 :			ret
351 :
352 :			.rounding1: ; colon added (orphan-label warning)
353 :			COPY_HV_3DN_RND1 ; rows 0-1
354 :			add ecx, edx ; Dst += 1 row, onto the next pair
355 :			COPY_HV_3DN_RND1 ; rows 2-3
356 :			add ecx, edx
357 :			COPY_HV_3DN_RND1 ; rows 4-5
358 :			add ecx, edx
359 :			COPY_HV_3DN_RND1 ; rows 6-7
360 :	edgomez	1.5	ret ; NOTE(review): no emms/femms on either return path - presumably left to the caller; confirm
361 :	edgomez	1.8	.endfunc: ; end marker referenced by cglobal's size expression when MARK_FUNCS is defined
362 :
363 :	suxen_drol	1.9	;-----------------------------------------------------------------------------
364 :			;
365 :			; void interpolate8x4_halfpel_h_3dn(uint8_t * const dst,
366 :			; const uint8_t * const src,
367 :			; const uint32_t stride,
368 :			; const uint32_t rounding);
369 :			;
370 :			;-----------------------------------------------------------------------------
371 :
372 :			ALIGN 16
373 :			interpolate8x4_halfpel_h_3dn: ; 4 rows: Dst[x] = average of Src[x] and Src[x+1]; args are read from the stack
374 :
375 :			mov eax, [esp+16] ; rounding
376 :			mov ecx, [esp+ 4] ; Dst
377 :			test eax, eax ; ZF set when rounding == 0
378 :			mov eax, [esp+ 8] ; Src (eax is reused; mov leaves the flags from 'test' intact)
379 :			mov edx, [esp+12] ; stride
380 :
381 :			jnz near .rounding1 ; rounding != 0: use the truncating average
382 :
383 :			COPY_H_3DN_RND0 ; rows 0-1 (the macro advances Src by two rows)
384 :			lea ecx, [ecx+2*edx] ; Dst += 2 rows
385 :			COPY_H_3DN_RND0 ; rows 2-3
386 :			ret
387 :
388 :			.rounding1: ; colon added: a bare label alone on a line triggers NASM's orphan-label warning
389 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
390 :			movq mm7, [mmx_one] ; mm7 = 0x01 per byte, the lsb mask used by COPY_H_3DN_RND1
391 :			COPY_H_3DN_RND1 ; rows 0-1
392 :			lea ecx, [ecx+2*edx] ; Dst += 2 rows
393 :			COPY_H_3DN_RND1 ; rows 2-3
394 :			ret ; NOTE(review): no emms/femms on either return path - presumably left to the caller; confirm
395 :			.endfunc: ; end marker referenced by cglobal's size expression when MARK_FUNCS is defined
396 :
397 :
398 :			;-----------------------------------------------------------------------------
399 :			;
400 :			; void interpolate8x4_halfpel_v_3dn(uint8_t * const dst,
401 :			; const uint8_t * const src,
402 :			; const uint32_t stride,
403 :			; const uint32_t rounding);
404 :			;
405 :			;-----------------------------------------------------------------------------
406 :
407 :			ALIGN 16
408 :			interpolate8x4_halfpel_v_3dn: ; 4 rows: Dst row = average of a Src row and the row below it; args are read from the stack
409 :
410 :			mov eax, [esp+16] ; rounding
411 :			mov ecx, [esp+ 4] ; Dst
412 :			test eax,eax ; ZF set when rounding == 0
413 :			mov eax, [esp+ 8] ; Src (eax is reused; mov leaves the flags from 'test' intact)
414 :			mov edx, [esp+12] ; stride
415 :
416 :			; we process 2 lines at a time
417 :
418 :			jnz near .rounding1 ; rounding != 0: use the truncating average
419 :
420 :			COPY_V_3DN_RND0 ; rows 0-1 (the macro advances Src by two rows)
421 :			lea ecx, [ecx+2*edx] ; Dst += 2 rows
422 :			COPY_V_3DN_RND0 ; rows 2-3
423 :			ret
424 :
425 :			.rounding1: ; colon added: a bare label alone on a line triggers NASM's orphan-label warning
426 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
427 :			movq mm7, [mmx_one] ; mm7 = 0x01 per byte, the lsb mask used by COPY_V_3DN_RND1
428 :			movq mm2, [eax] ; loop invariant: mm2 holds the row above the next pair
429 :			add eax, edx ; Src now points at row 1, as COPY_V_3DN_RND1 expects
430 :
431 :			COPY_V_3DN_RND1 ; rows 0-1
432 :			lea ecx, [ecx+2*edx] ; Dst += 2 rows
433 :			COPY_V_3DN_RND1 ; rows 2-3
434 :			ret ; NOTE(review): no emms/femms on either return path - presumably left to the caller; confirm
435 :			.endfunc: ; end marker referenced by cglobal's size expression when MARK_FUNCS is defined
436 :
437 :
438 :			;-----------------------------------------------------------------------------
439 :			;
440 :			; void interpolate8x4_halfpel_hv_3dn(uint8_t * const dst,
441 :			; const uint8_t * const src,
442 :			; const uint32_t stride,
443 :			; const uint32_t rounding);
444 :			;
445 :			;
446 :			;-----------------------------------------------------------------------------
447 :
448 :			; The trick is to correct the result of 'pavgusb' with some combination of the
449 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgusb' (s and t).
450 :			; The boolean relations are:
451 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
452 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
453 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
454 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
455 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
456 :
457 :			ALIGN 16
458 :			interpolate8x4_halfpel_hv_3dn: ; 4 rows of 4-neighbour average; colon added to match the _h/_v labels and avoid NASM's orphan-label warning
459 :			mov eax, [esp+16] ; rounding
460 :			mov ecx, [esp+ 4] ; Dst
461 :			test eax, eax ; ZF set when rounding == 0; consumed by the jnz below
462 :			mov eax, [esp+ 8] ; Src (eax is reused)
463 :			mov edx, [esp+12] ; stride
464 :
465 :			movq mm7, [mmx_one] ; mm7 = 0x01 per byte, the lsb mask used by both macros
466 :
467 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
468 :			movq mm2, [eax]
469 :			movq mm3, [eax+1]
470 :			movq mm6, mm2
471 :			pavgusb mm2, mm3
472 :			pxor mm3, mm6 ; mm2/mm3 ready
473 :
474 :			jnz near .rounding1 ; still the flags from 'test': only mov and MMX/3DNow! ops in between
475 :
476 :			COPY_HV_3DN_RND0 ; rows 0-1 (the macro advances Src by two rows and Dst by one)
477 :			add ecx, edx ; Dst += 1 row, onto the next pair
478 :			COPY_HV_3DN_RND0 ; rows 2-3
479 :			ret
480 :
481 :			.rounding1: ; colon added (orphan-label warning)
482 :			COPY_HV_3DN_RND1 ; rows 0-1
483 :			add ecx, edx ; Dst += 1 row, onto the next pair
484 :			COPY_HV_3DN_RND1 ; rows 2-3
485 :			ret ; NOTE(review): no emms/femms on either return path - presumably left to the caller; confirm
486 :			.endfunc: ; end marker referenced by cglobal's size expression when MARK_FUNCS is defined
487 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4