Annotation of /xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm

Revision 1.7 - (view) (download)

1 :	edgomez	1.5	;/*****************************************************************************
2 :	Isibaar	1.1	; *
3 :	edgomez	1.5	; * XVID MPEG-4 VIDEO CODEC
4 :			; * - mmx 8x8 block-based halfpel interpolation -
5 :	Isibaar	1.1	; *
6 :	edgomez	1.5	; * Copyright(C) 2002 Michael Militzer <isibaar@xvid.org>
7 :			; * 2002 Pascal Massimino <skal@planet-d.net>
8 :	Isibaar	1.1	; *
9 :	edgomez	1.5	; * This program is free software ; you can redistribute it and/or modify
10 :			; * it under the terms of the GNU General Public License as published by
11 :			; * the Free Software Foundation ; either version 2 of the License, or
12 :			; * (at your option) any later version.
13 :	edgomez	1.3	; *
14 :	edgomez	1.5	; * This program is distributed in the hope that it will be useful,
15 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
16 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 :			; * GNU General Public License for more details.
18 :	Isibaar	1.1	; *
19 :	edgomez	1.5	; * You should have received a copy of the GNU General Public License
20 :			; * along with this program ; if not, write to the Free Software
21 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 :	edgomez	1.4	; *
23 :	edgomez	1.5	; ****************************************************************************/
24 :	edgomez	1.4
25 :	edgomez	1.5	BITS 32
26 :	Isibaar	1.1
27 :	edgomez	1.5	%macro cglobal 1 ; cglobal <name>: declare <name> as a global (exported) symbol
28 :	Isibaar	1.1	%ifdef PREFIX
29 :	edgomez	1.5	global _%1 ; PREFIX defined: export the name with a leading underscore...
30 :	Isibaar	1.1	%define %1 _%1
31 :			%else
32 :			global %1 ; PREFIX not defined: export the name unchanged
33 :			%endif
34 :			%endmacro
35 :
36 :	edgomez	1.5	;=============================================================================
37 :			; Read only data
38 :			;=============================================================================
39 :
40 :			%ifdef FORMAT_COFF
41 :	edgomez	1.6	SECTION .rodata ; FORMAT_COFF builds: section declared without an align= qualifier
42 :	edgomez	1.5	%else
43 :	edgomez	1.6	SECTION .rodata align=16
44 :	edgomez	1.5	%endif
45 :
46 :			ALIGN 16
47 :			mmx_one: ; eight bytes of 0x01: per-byte lsb mask used by the rounding-correction code
48 :			times 8 db 1
49 :	Isibaar	1.1
50 :	edgomez	1.5	SECTION .text
51 :	Isibaar	1.1
52 :			cglobal interpolate8x8_halfpel_h_xmm ; plain halfpel interpolation: dst is overwritten
53 :			cglobal interpolate8x8_halfpel_v_xmm
54 :			cglobal interpolate8x8_halfpel_hv_xmm
55 :
56 :	edgomez	1.7	cglobal interpolate8x8_halfpel_add_xmm ; "_add" variants: result is averaged into the existing dst
57 :			cglobal interpolate8x8_halfpel_h_add_xmm
58 :			cglobal interpolate8x8_halfpel_v_add_xmm
59 :			cglobal interpolate8x8_halfpel_hv_add_xmm
60 :
61 :	Isibaar	1.1	;===========================================================================
62 :			;
63 :			; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
64 :			; const uint8_t * const src,
65 :			; const uint32_t stride,
66 :			; const uint32_t rounding);
67 :			;
68 :			;===========================================================================
69 :
70 :			%macro COPY_H_SSE_RND0 0 ; two rows of horizontal halfpel, round-up average. In: eax=src, ecx=dst, edx=stride
71 :			movq mm0, [eax] ; row 0, 8 pixels
72 :			pavgb mm0, [eax+1] ; averaged with their right-hand neighbours: (a+b+1)/2 per byte
73 :			movq mm1, [eax+edx] ; row 1
74 :			pavgb mm1, [eax+edx+1]
75 :			lea eax,[eax+2*edx] ; src += 2 rows (ecx is left for the caller to advance)
76 :			movq [ecx],mm0
77 :			movq [ecx+edx],mm1
78 :			%endmacro
79 :
80 :			%macro COPY_H_SSE_RND1 0 ; two rows of horizontal halfpel, truncating average (a+b)/2. In: eax=src, ecx=dst, edx=stride, mm7=lsb mask
81 :			movq mm0, [eax] ; row 0
82 :			movq mm1, [eax+edx] ; row 1
83 :			movq mm4, mm0 ; keep copies of the left pixels for the xor below
84 :			movq mm5, mm1
85 :	edgomez	1.5	movq mm2, [eax+1] ; right-hand neighbours, row 0
86 :	Isibaar	1.1	movq mm3, [eax+edx+1] ; right-hand neighbours, row 1
87 :			pavgb mm0, mm2 ; round-up average, row 0
88 :			pxor mm2, mm4 ; mm2 = a^b, row 0
89 :			pavgb mm1, mm3 ; round-up average, row 1
90 :	edgomez	1.5	lea eax, [eax+2*edx] ; src += 2 rows (ecx is left for the caller to advance)
91 :	Isibaar	1.1	pxor mm3, mm5 ; mm3 = a^b, row 1
92 :			pand mm2, mm7 ; (a^b)&1: set exactly where a+b is odd
93 :			pand mm3, mm7
94 :			psubb mm0, mm2 ; take back pavgb's round-up on those bytes
95 :			movq [ecx], mm0
96 :			psubb mm1, mm3
97 :	edgomez	1.5	movq [ecx+edx], mm1
98 :	Isibaar	1.1	%endmacro
99 :
100 :	edgomez	1.5	ALIGN 16
101 :	Isibaar	1.1	interpolate8x8_halfpel_h_xmm: ; 8 rows x 8 pixels: dst = average of each src pixel and its right-hand neighbour
102 :			; arguments are read from the stack ([esp+4]..[esp+16]); uses eax, ecx, edx and mm0-mm5, mm7
103 :	edgomez	1.5	mov eax, [esp+16] ; rounding
104 :			mov ecx, [esp+ 4] ; Dst
105 :	Isibaar	1.1	test eax,eax ; ZF = (rounding == 0); the mov's below do not modify flags
106 :	edgomez	1.5	mov eax, [esp+ 8] ; Src
107 :			mov edx, [esp+12] ; stride
108 :	Isibaar	1.1
109 :			jnz near .rounding1 ; rounding != 0 -> truncating variant
110 :
111 :			COPY_H_SSE_RND0 ; rows 0-1 (the macro advances eax by two rows)
112 :			lea ecx,[ecx+2*edx] ; dst += 2 rows
113 :			COPY_H_SSE_RND0 ; rows 2-3
114 :			lea ecx,[ecx+2*edx]
115 :			COPY_H_SSE_RND0 ; rows 4-5
116 :			lea ecx,[ecx+2*edx]
117 :			COPY_H_SSE_RND0 ; rows 6-7
118 :			ret ; NOTE(review): no emms is issued here - presumably left to the caller; confirm
119 :
120 :			.rounding1 ; NOTE(review): label has no trailing ':' - NASM may warn about an orphan label
121 :	edgomez	1.5	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
122 :	Isibaar	1.1	movq mm7, [mmx_one] ; mm7 = lsb mask expected by COPY_H_SSE_RND1
123 :			COPY_H_SSE_RND1 ; rows 0-1
124 :			lea ecx, [ecx+2*edx] ; dst += 2 rows
125 :			COPY_H_SSE_RND1 ; rows 2-3
126 :			lea ecx,[ecx+2*edx]
127 :			COPY_H_SSE_RND1 ; rows 4-5
128 :			lea ecx,[ecx+2*edx]
129 :			COPY_H_SSE_RND1 ; rows 6-7
130 :			ret
131 :
132 :			;===========================================================================
133 :			;
134 :			; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
135 :	edgomez	1.5	; const uint8_t * const src,
136 :			; const uint32_t stride,
137 :			; const uint32_t rounding);
138 :	Isibaar	1.1	;
139 :			;===========================================================================
140 :
141 :			%macro COPY_V_SSE_RND0 0 ; two rows of vertical halfpel, round-up average. In: eax=src, ecx=dst, edx=stride
142 :	edgomez	1.5	movq mm0, [eax] ; row n
143 :			movq mm1, [eax+edx] ; row n+1
144 :	Isibaar	1.1	pavgb mm0, mm1 ; output row 0 = avg(row n, row n+1)
145 :			pavgb mm1, [eax+2*edx] ; output row 1 = avg(row n+1, row n+2)
146 :	edgomez	1.5	lea eax, [eax+2*edx] ; src += 2 rows (ecx is left for the caller to advance)
147 :			movq [ecx], mm0
148 :	Isibaar	1.1	movq [ecx+edx],mm1
149 :			%endmacro
150 :
151 :			%macro COPY_V_SSE_RND1 0 ; two rows of vertical halfpel, truncating average. In: mm2=row n, eax->row n+1, ecx=dst, edx=stride, mm7=lsb mask
152 :			movq mm0, mm2 ; row n, carried over from the previous expansion (or the caller's preload)
153 :			movq mm1, [eax] ; row n+1
154 :			movq mm2, [eax+edx] ; row n+2, left in mm2 for the next expansion
155 :			lea eax,[eax+2*edx] ; src += 2 rows (ecx is left for the caller to advance)
156 :			movq mm4, mm0
157 :			movq mm5, mm1
158 :			pavgb mm0, mm1 ; round-up average of rows n, n+1
159 :	edgomez	1.5	pxor mm4, mm1 ; row n ^ row n+1
160 :	Isibaar	1.1	pavgb mm1, mm2 ; round-up average of rows n+1, n+2
161 :			pxor mm5, mm2 ; row n+1 ^ row n+2
162 :	edgomez	1.5	pand mm4, mm7 ; lsb's of (i^j)...
163 :			pand mm5, mm7 ; lsb's of (i^j)...
164 :			psubb mm0, mm4 ; ...are subtracted from result of pavgb
165 :	Isibaar	1.1	movq [ecx], mm0
166 :	edgomez	1.5	psubb mm1, mm5 ; ...are subtracted from result of pavgb
167 :	Isibaar	1.1	movq [ecx+edx], mm1
168 :			%endmacro
169 :
170 :	edgomez	1.5	ALIGN 16
171 :	Isibaar	1.1	interpolate8x8_halfpel_v_xmm: ; 8 rows x 8 pixels: dst = average of each src pixel and the pixel one row below
172 :			; arguments are read from the stack ([esp+4]..[esp+16]); uses eax, ecx, edx and mm0-mm2, mm4, mm5, mm7
173 :			mov eax, [esp+16]; rounding
174 :	edgomez	1.5	mov ecx, [esp+ 4] ; Dst
175 :	Isibaar	1.1	test eax,eax ; ZF = (rounding == 0); the mov's below do not modify flags
176 :	edgomez	1.5	mov eax, [esp+ 8] ; Src
177 :			mov edx, [esp+12] ; stride
178 :	Isibaar	1.1
179 :	edgomez	1.5	; we process 2 lines at a time
180 :	Isibaar	1.1	jnz near .rounding1 ; rounding != 0 -> truncating variant
181 :
182 :			COPY_V_SSE_RND0 ; rows 0-1 (the macro advances eax by two rows)
183 :			lea ecx, [ecx+2*edx] ; dst += 2 rows
184 :			COPY_V_SSE_RND0 ; rows 2-3
185 :			lea ecx, [ecx+2*edx]
186 :			COPY_V_SSE_RND0 ; rows 4-5
187 :			lea ecx, [ecx+2*edx]
188 :			COPY_V_SSE_RND0 ; rows 6-7
189 :			ret ; NOTE(review): no emms is issued here - presumably left to the caller; confirm
190 :
191 :			.rounding1
192 :	edgomez	1.5	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
193 :	Isibaar	1.1	movq mm7, [mmx_one] ; mm7 = lsb mask expected by COPY_V_SSE_RND1
194 :	edgomez	1.5	movq mm2, [eax] ; loop invariant: mm2 = first source row, consumed by COPY_V_SSE_RND1
195 :	Isibaar	1.1	add eax, edx ; eax -> second source row, as the macro expects
196 :
197 :			COPY_V_SSE_RND1 ; rows 0-1
198 :			lea ecx,[ecx+2*edx] ; dst += 2 rows
199 :			COPY_V_SSE_RND1 ; rows 2-3
200 :			lea ecx,[ecx+2*edx]
201 :			COPY_V_SSE_RND1 ; rows 4-5
202 :			lea ecx,[ecx+2*edx]
203 :			COPY_V_SSE_RND1 ; rows 6-7
204 :			ret
205 :
206 :			;===========================================================================
207 :			;
208 :			; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
209 :	edgomez	1.5	; const uint8_t * const src,
210 :			; const uint32_t stride,
211 :			; const uint32_t rounding);
212 :	Isibaar	1.1	;
213 :			;
214 :			;===========================================================================
215 :
216 :			; The trick is to correct the result of 'pavgb' with some combination of the
217 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
218 :			; The boolean relations are:
219 :	edgomez	1.5	; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
220 :	Isibaar	1.1	; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
221 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
222 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
223 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
224 :
225 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
226 :
227 :			%macro COPY_HV_SSE_RND0 0 ; two rows of 2-D halfpel, (i+j+k+l+2)/4. In: mm2/mm3 = h-average/xor of current row, mm7=lsb mask, eax=src, ecx=dst, edx=stride
228 :	edgomez	1.5	lea eax, [eax+edx] ; eax -> next source row
229 :	Isibaar	1.1
230 :	edgomez	1.5	movq mm0, [eax]
231 :			movq mm1, [eax+1]
232 :	Isibaar	1.1
233 :	edgomez	1.5	movq mm6, mm0
234 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
235 :			lea eax, [eax+edx] ; eax -> the row after that (loaded in the second half)
236 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
237 :	Isibaar	1.1
238 :	edgomez	1.5	por mm3, mm1 ; ij |= jk
239 :			movq mm6, mm2
240 :			pxor mm6, mm0 ; mm6 = s^t
241 :			pand mm3, mm6 ; (ij|jk) &= st
242 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
243 :			pand mm3, mm7 ; mask lsb
244 :			psubb mm2, mm3 ; apply.
245 :
246 :			movq [ecx], mm2 ; first output row
247 :	Isibaar	1.1
248 :	edgomez	1.5	movq mm2, [eax]
249 :			movq mm3, [eax+1]
250 :			movq mm6, mm2
251 :			pavgb mm2, mm3 ; preserved for next iteration
252 :			lea ecx,[ecx+edx] ; dst += 1 row (the caller adds one more after the macro)
253 :			pxor mm3, mm6 ; preserved for next iteration
254 :
255 :			por mm1, mm3 ; same correction as above, with mm0/mm1 (previous row) against mm2/mm3 (new row)
256 :			movq mm6, mm0
257 :			pxor mm6, mm2
258 :			pand mm1, mm6
259 :			pavgb mm0, mm2
260 :
261 :			pand mm1, mm7
262 :			psubb mm0, mm1
263 :
264 :			movq [ecx], mm0 ; second output row
265 :	Isibaar	1.1	%endmacro
266 :
267 :			%macro COPY_HV_SSE_RND1 0 ; two rows of 2-D halfpel, (i+j+k+l+1)/4. In: mm2/mm3 = h-average/xor of current row, mm7=lsb mask, eax=src, ecx=dst, edx=stride
268 :	edgomez	1.5	lea eax, [eax+edx] ; eax -> next source row
269 :
270 :			movq mm0, [eax]
271 :			movq mm1, [eax+1]
272 :	Isibaar	1.1
273 :	edgomez	1.5	movq mm6, mm0
274 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
275 :			lea eax, [eax+edx] ; eax -> the row after that (loaded in the second half)
276 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
277 :	Isibaar	1.1
278 :	edgomez	1.5	pand mm3, mm1 ; ij & kl
279 :			movq mm6, mm2
280 :			pxor mm6, mm0 ; mm6 = s^t
281 :			por mm3, mm6 ; (ij&kl) | st
282 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
283 :			pand mm3, mm7 ; mask lsb
284 :			psubb mm2, mm3 ; apply the correction
285 :
286 :			movq [ecx], mm2 ; first output row
287 :	Isibaar	1.1
288 :	edgomez	1.5	movq mm2, [eax]
289 :			movq mm3, [eax+1]
290 :			movq mm6, mm2
291 :			pavgb mm2, mm3 ; preserved for next iteration
292 :			lea ecx,[ecx+edx] ; dst += 1 row (the caller adds one more after the macro)
293 :			pxor mm3, mm6 ; preserved for next iteration
294 :
295 :			pand mm1, mm3 ; same correction as above, with mm0/mm1 (previous row) against mm2/mm3 (new row)
296 :			movq mm6, mm0
297 :			pxor mm6, mm2
298 :			por mm1, mm6
299 :			pavgb mm0, mm2
300 :			pand mm1, mm7
301 :			psubb mm0, mm1
302 :
303 :			movq [ecx], mm0 ; second output row
304 :	Isibaar	1.1	%endmacro
305 :
306 :	edgomez	1.5	ALIGN 16
307 :	Isibaar	1.1	interpolate8x8_halfpel_hv_xmm: ; 8 rows x 8 pixels: dst = average of each 2x2 src neighbourhood
308 :	edgomez	1.5	mov eax, [esp+16] ; rounding
309 :			mov ecx, [esp+ 4] ; Dst
310 :			test eax, eax ; ZF = (rounding == 0); consumed by the jnz below - mov and the MMX ops in between leave EFLAGS alone
311 :			mov eax, [esp+ 8] ; Src
312 :			mov edx, [esp+12] ; stride
313 :	Isibaar	1.1
314 :			movq mm7, [mmx_one] ; mm7 = lsb mask used by both COPY_HV_SSE_* macros
315 :
316 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
317 :			movq mm2, [eax] ; first source row
318 :			movq mm3, [eax+1] ; ...and its right-hand neighbours
319 :			movq mm6, mm2
320 :			pavgb mm2, mm3
321 :	edgomez	1.5	pxor mm3, mm6 ; mm2/mm3 ready
322 :	Isibaar	1.1
323 :			jnz near .rounding1 ; rounding != 0 -> (i+j+k+l+1)/4 variant
324 :
325 :			COPY_HV_SSE_RND0 ; rows 0-1 (the macro advances eax by two rows and ecx by one)
326 :			add ecx, edx ; second half of the 2-row dst advance
327 :			COPY_HV_SSE_RND0 ; rows 2-3
328 :			add ecx, edx
329 :			COPY_HV_SSE_RND0 ; rows 4-5
330 :			add ecx, edx
331 :			COPY_HV_SSE_RND0 ; rows 6-7
332 :			ret ; NOTE(review): no emms is issued here - presumably left to the caller; confirm
333 :
334 :			.rounding1
335 :			COPY_HV_SSE_RND1 ; rows 0-1
336 :			add ecx, edx ; second half of the 2-row dst advance
337 :			COPY_HV_SSE_RND1 ; rows 2-3
338 :			add ecx, edx
339 :			COPY_HV_SSE_RND1 ; rows 4-5
340 :			add ecx, edx
341 :			COPY_HV_SSE_RND1 ; rows 6-7
342 :	edgomez	1.5	ret
343 :	edgomez	1.7
344 :			;===========================================================================
345 :			;
346 :			; The next functions combine both source halfpel interpolation step and the
347 :			; averaging (with rounding) step to avoid wasting memory bandwidth computing
348 :			; intermediate halfpel images and then averaging them.
349 :			;
350 :			;===========================================================================
351 :
352 :			%macro PROLOG0 0 ; load the three pointer/stride stack arguments into ecx/eax/edx
353 :			mov ecx, [esp+ 4] ; Dst
354 :			mov eax, [esp+ 8] ; Src
355 :			mov edx, [esp+12] ; BpS
356 :			%endmacro
357 :			%macro PROLOG1 0 ; PROLOG0, then set ZF from bit 0 of the rounding argument (only that bit is examined)
358 :			PROLOG0
359 :			test dword [esp+16], 1; Rounding?
360 :			%endmacro
361 :			%macro EPILOG 0 ; plain return; nothing is restored
362 :			ret
363 :			%endmacro
364 :
365 :			;===========================================================================
366 :			;
367 :			; void interpolate8x8_halfpel_add_xmm(uint8_t * const dst,
368 :			; const uint8_t * const src,
369 :			; const uint32_t stride,
370 :			; const uint32_t rounding);
371 :			;
372 :			;
373 :			;===========================================================================
374 :
375 :			%macro ADD_FF 2 ; ADD_FF off0, off1: dst = round-up average of src and dst, for the two rows at offsets %1 and %2. In: eax=src, ecx=dst
376 :			movq mm0, [eax+%1] ; src row at %1
377 :			movq mm1, [eax+%2] ; src row at %2
378 :			;;--- (disabled code; NOTE(review): looks like an abandoned rounding correction - confirm before removing)
379 :			;; movq mm2, mm0
380 :			;; movq mm3, mm1
381 :			;;---
382 :			pavgb mm0, [ecx+%1] ; (src+dst+1)/2 per byte
383 :			pavgb mm1, [ecx+%2]
384 :			;;-- (disabled code, continuation of the block above)
385 :			;; por mm2, [ecx+%1]
386 :			;; por mm3, [ecx+%2]
387 :			;; pand mm2, [mmx_one]
388 :			;; pand mm3, [mmx_one]
389 :			;; psubsb mm0, mm2
390 :			;; psubsb mm1, mm3
391 :			;;--
392 :			movq [ecx+%1], mm0 ; written back over dst
393 :			movq [ecx+%2], mm1
394 :			%endmacro
395 :
396 :			ALIGN 16
397 :			interpolate8x8_halfpel_add_xmm: ; 23c (presumably a cycle count) - 8 rows x 8 pixels: dst = round-up average of src and dst
398 :			PROLOG1 ; loads ecx=Dst, eax=Src, edx=BpS; the rounding test it performs is never branched on in this function
399 :			ADD_FF 0, edx ; rows 0-1
400 :			lea eax,[eax+2*edx] ; src += 2 rows
401 :			lea ecx,[ecx+2*edx] ; dst += 2 rows
402 :			ADD_FF 0, edx ; rows 2-3
403 :			lea eax,[eax+2*edx]
404 :			lea ecx,[ecx+2*edx]
405 :			ADD_FF 0, edx ; rows 4-5
406 :			lea eax,[eax+2*edx]
407 :			lea ecx,[ecx+2*edx]
408 :			ADD_FF 0, edx ; rows 6-7
409 :			EPILOG ; NOTE(review): no emms is issued here - presumably left to the caller; confirm
410 :
411 :			;===========================================================================
412 :			;
413 :			; void interpolate8x8_halfpel_h_add_xmm(uint8_t * const dst,
414 :			; const uint8_t * const src,
415 :			; const uint32_t stride,
416 :			; const uint32_t rounding);
417 :			;
418 :			;
419 :			;===========================================================================
420 :
421 :
422 :			%macro ADD_FH_RND0 2 ; ADD_FH_RND0 off0, off1: horizontal halfpel (round-up) averaged into dst, rows at offsets %1 and %2. In: eax=src, ecx=dst
423 :			movq mm0, [eax+%1]
424 :			movq mm1, [eax+%2]
425 :			pavgb mm0, [eax+%1+1] ; (a+b+1)/2 with the right-hand neighbours
426 :			pavgb mm1, [eax+%2+1]
427 :			pavgb mm0, [ecx+%1] ; then round-up average with the existing dst
428 :			pavgb mm1, [ecx+%2]
429 :			movq [ecx+%1],mm0
430 :			movq [ecx+%2],mm1
431 :			%endmacro
432 :
433 :			%macro ADD_FH_RND1 2 ; ADD_FH_RND1 off0, off1: horizontal halfpel (truncating) averaged into dst, rows at offsets %1 and %2. In: eax=src, ecx=dst
434 :			movq mm0, [eax+%1]
435 :			movq mm1, [eax+%2]
436 :			movq mm4, mm0 ; keep copies of the left pixels for the xor below
437 :			movq mm5, mm1
438 :			movq mm2, [eax+%1+1] ; right-hand neighbours
439 :			movq mm3, [eax+%2+1]
440 :			pavgb mm0, mm2 ; round-up average
441 :			; lea ??
442 :			pxor mm2, mm4 ; a^b
443 :			pavgb mm1, mm3
444 :			pxor mm3, mm5
445 :			pand mm2, [mmx_one] ; (a^b)&1; the mask is read from memory here rather than from mm7
446 :			pand mm3, [mmx_one]
447 :			psubb mm0, mm2 ; take back pavgb's round-up where a+b is odd
448 :			psubb mm1, mm3
449 :			pavgb mm0, [ecx+%1] ; then round-up average with the existing dst
450 :			pavgb mm1, [ecx+%2]
451 :			movq [ecx+%1],mm0
452 :			movq [ecx+%2],mm1
453 :			%endmacro
454 :
455 :			ALIGN 16
456 :			interpolate8x8_halfpel_h_add_xmm: ; 32c (presumably a cycle count) - 8 rows x 8 pixels: horizontal halfpel of src averaged into dst
457 :			PROLOG1 ; ecx=Dst, eax=Src, edx=BpS; ZF clear if bit 0 of rounding is set
458 :			jnz near .Loop1 ; rounding bit set -> truncating variant
459 :			ADD_FH_RND0 0, edx ; rows 0-1
460 :			lea eax,[eax+2*edx] ; src += 2 rows
461 :			lea ecx,[ecx+2*edx] ; dst += 2 rows
462 :			ADD_FH_RND0 0, edx ; rows 2-3
463 :			lea eax,[eax+2*edx]
464 :			lea ecx,[ecx+2*edx]
465 :			ADD_FH_RND0 0, edx ; rows 4-5
466 :			lea eax,[eax+2*edx]
467 :			lea ecx,[ecx+2*edx]
468 :			ADD_FH_RND0 0, edx ; rows 6-7
469 :			EPILOG ; NOTE(review): no emms is issued here - presumably left to the caller; confirm
470 :
471 :			.Loop1
472 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
473 :			; movq mm7, [mmx_one] (disabled: ADD_FH_RND1 reads the mask from memory instead)
474 :			ADD_FH_RND1 0, edx ; rows 0-1
475 :			lea eax,[eax+2*edx] ; src += 2 rows
476 :			lea ecx,[ecx+2*edx] ; dst += 2 rows
477 :			ADD_FH_RND1 0, edx ; rows 2-3
478 :			lea eax,[eax+2*edx]
479 :			lea ecx,[ecx+2*edx]
480 :			ADD_FH_RND1 0, edx ; rows 4-5
481 :			lea eax,[eax+2*edx]
482 :			lea ecx,[ecx+2*edx]
483 :			ADD_FH_RND1 0, edx ; rows 6-7
484 :			EPILOG
485 :
486 :
487 :			;===========================================================================
488 :			;
489 :			; void interpolate8x8_halfpel_v_add_xmm(uint8_t * const dst,
490 :			; const uint8_t * const src,
491 :			; const uint32_t stride,
492 :			; const uint32_t rounding);
493 :			;
494 :			;
495 :			;===========================================================================
496 :
497 :			%macro ADD_8_HF_RND0 0 ; two rows of vertical halfpel (round-up) averaged into dst. In: eax=src, ecx=dst, edx=stride
498 :			movq mm0, [eax] ; row n
499 :			movq mm1, [eax+edx] ; row n+1
500 :			pavgb mm0, mm1 ; avg(row n, row n+1)
501 :			pavgb mm1, [eax+2*edx] ; avg(row n+1, row n+2)
502 :			lea eax,[eax+2*edx] ; src += 2 rows (ecx is left for the caller to advance)
503 :			pavgb mm0, [ecx] ; then round-up average with the existing dst
504 :			pavgb mm1, [ecx+edx]
505 :			movq [ecx],mm0
506 :			movq [ecx+edx],mm1
507 :			%endmacro
508 :
509 :			%macro ADD_8_HF_RND1 0 ; two rows of vertical halfpel (truncating) averaged into dst. In: mm0=row n, eax->row n, ecx=dst, edx=stride, mm7=lsb mask
510 :			movq mm1, [eax+edx] ; row n+1
511 :			movq mm2, [eax+2*edx] ; row n+2; left in mm2 on exit
512 :			lea eax,[eax+2*edx] ; src += 2 rows (ecx is left for the caller to advance)
513 :			movq mm4, mm0
514 :			movq mm5, mm1
515 :			pavgb mm0, mm1 ; round-up average of rows n, n+1
516 :			pxor mm4, mm1 ; row n ^ row n+1
517 :			pavgb mm1, mm2 ; round-up average of rows n+1, n+2
518 :			pxor mm5, mm2 ; row n+1 ^ row n+2
519 :			pand mm4, mm7 ; lsb's of (i^j)...
520 :			pand mm5, mm7 ; lsb's of (i^j)...
521 :			psubb mm0, mm4 ; ...are subtracted from result of pavgb
522 :			pavgb mm0, [ecx] ; then round-up average with the existing dst
523 :			movq [ecx], mm0
524 :			psubb mm1, mm5 ; ...are subtracted from result of pavgb
525 :			pavgb mm1, [ecx+edx] ; then round-up average with the existing dst
526 :			movq [ecx+edx], mm1
527 :			%endmacro
528 :
529 :			ALIGN 16
530 :			interpolate8x8_halfpel_v_add_xmm: ; 8 rows x 8 pixels: vertical halfpel of src averaged into dst
531 :			PROLOG1 ; ecx=Dst, eax=Src, edx=BpS; ZF clear if bit 0 of rounding is set
532 :
533 :			jnz near .Loop1 ; rounding bit set -> truncating variant
534 :			pxor mm7, mm7 ; this is a NOP (mm7 is not read by ADD_8_HF_RND0)
535 :
536 :			ADD_8_HF_RND0 ; rows 0-1 (the macro advances eax by two rows)
537 :			lea ecx,[ecx+2*edx] ; dst += 2 rows
538 :			ADD_8_HF_RND0 ; rows 2-3
539 :			lea ecx,[ecx+2*edx]
540 :			ADD_8_HF_RND0 ; rows 4-5
541 :			lea ecx,[ecx+2*edx]
542 :			ADD_8_HF_RND0 ; rows 6-7
543 :			EPILOG ; NOTE(review): no emms is issued here - presumably left to the caller; confirm
544 :
545 :			.Loop1
546 :			movq mm0, [eax] ; loop invariant: mm0 = first source row, consumed by ADD_8_HF_RND1
547 :			movq mm7, [mmx_one] ; mm7 = lsb mask expected by ADD_8_HF_RND1
548 :
549 :			ADD_8_HF_RND1 ; rows 0-1
550 :			movq mm0, mm2 ; carry the last loaded row over as the next "row n"
551 :			lea ecx,[ecx+2*edx] ; dst += 2 rows
552 :			ADD_8_HF_RND1 ; rows 2-3
553 :			movq mm0, mm2
554 :			lea ecx,[ecx+2*edx]
555 :			ADD_8_HF_RND1 ; rows 4-5
556 :			movq mm0, mm2
557 :			lea ecx,[ecx+2*edx]
558 :			ADD_8_HF_RND1 ; rows 6-7
559 :			EPILOG
560 :
561 :			; The trick is to correct the result of 'pavgb' with some combination of the
562 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
563 :			; The boolean relations are:
564 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
565 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
566 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
567 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
568 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
569 :
570 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
571 :
572 :			;===========================================================================
573 :			;
574 :			; void interpolate8x8_halfpel_hv_add_xmm(uint8_t * const dst,
575 :			; const uint8_t * const src,
576 :			; const uint32_t stride,
577 :			; const uint32_t rounding);
578 :			;
579 :			;
580 :			;===========================================================================
581 :
582 :			%macro ADD_HH_RND0 0 ; two rows of 2-D halfpel, (i+j+k+l+2)/4, averaged into dst. In: mm2/mm3 = h-average/xor of current row, mm7=lsb mask, eax=src, ecx=dst, edx=stride
583 :			lea eax,[eax+edx] ; eax -> next source row
584 :
585 :			movq mm0, [eax]
586 :			movq mm1, [eax+1]
587 :
588 :			movq mm6, mm0
589 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
590 :			lea eax,[eax+edx] ; eax -> the row after that (loaded in the second half)
591 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
592 :
593 :			por mm3, mm1 ; ij |= jk
594 :			movq mm6, mm2
595 :			pxor mm6, mm0 ; mm6 = s^t
596 :			pand mm3, mm6 ; (ij|jk) &= st
597 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
598 :			pand mm3, mm7 ; mask lsb
599 :			psubb mm2, mm3 ; apply.
600 :
601 :			pavgb mm2, [ecx] ; round-up average with the existing dst
602 :			movq [ecx], mm2 ; first output row
603 :
604 :			movq mm2, [eax]
605 :			movq mm3, [eax+1]
606 :			movq mm6, mm2
607 :			pavgb mm2, mm3 ; preserved for next iteration
608 :			lea ecx,[ecx+edx] ; dst += 1 row (the caller adds one more after the macro)
609 :			pxor mm3, mm6 ; preserved for next iteration
610 :
611 :			por mm1, mm3 ; same correction as above, with mm0/mm1 (previous row) against mm2/mm3 (new row)
612 :			movq mm6, mm0
613 :			pxor mm6, mm2
614 :			pand mm1, mm6
615 :			pavgb mm0, mm2
616 :
617 :			pand mm1, mm7
618 :			psubb mm0, mm1
619 :
620 :			pavgb mm0, [ecx] ; round-up average with the existing dst
621 :			movq [ecx], mm0 ; second output row
622 :			%endmacro
623 :
624 :			%macro ADD_HH_RND1 0 ; two rows of 2-D halfpel, (i+j+k+l+1)/4, averaged into dst. In: mm2/mm3 = h-average/xor of current row, mm7=lsb mask, eax=src, ecx=dst, edx=stride
625 :			lea eax,[eax+edx] ; eax -> next source row
626 :
627 :			movq mm0, [eax]
628 :			movq mm1, [eax+1]
629 :
630 :			movq mm6, mm0
631 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
632 :			lea eax,[eax+edx] ; eax -> the row after that (loaded in the second half)
633 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
634 :
635 :			pand mm3, mm1 ; ij & kl
636 :			movq mm6, mm2
637 :			pxor mm6, mm0 ; mm6 = s^t
638 :			por mm3, mm6 ; (ij&kl) | st
639 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
640 :			pand mm3, mm7 ; mask lsb
641 :			psubb mm2, mm3 ; apply the correction
642 :
643 :			pavgb mm2, [ecx] ; round-up average with the existing dst
644 :			movq [ecx], mm2 ; first output row
645 :
646 :			movq mm2, [eax]
647 :			movq mm3, [eax+1]
648 :			movq mm6, mm2
649 :			pavgb mm2, mm3 ; preserved for next iteration
650 :			lea ecx,[ecx+edx] ; dst += 1 row (the caller adds one more after the macro)
651 :			pxor mm3, mm6 ; preserved for next iteration
652 :
653 :			pand mm1, mm3 ; same correction as above, with mm0/mm1 (previous row) against mm2/mm3 (new row)
654 :			movq mm6, mm0
655 :			pxor mm6, mm2
656 :			por mm1, mm6
657 :			pavgb mm0, mm2
658 :			pand mm1, mm7
659 :			psubb mm0, mm1
660 :
661 :			pavgb mm0, [ecx] ; round-up average with the existing dst
662 :			movq [ecx], mm0 ; second output row
663 :			%endmacro
664 :
665 :			ALIGN 16
666 :			interpolate8x8_halfpel_hv_add_xmm: ; 8 rows x 8 pixels: 2-D halfpel of src averaged into dst
667 :			PROLOG1 ; ecx=Dst, eax=Src, edx=BpS; ZF clear if bit 0 of rounding is set
668 :			; the flags set by PROLOG1 are consumed by the jnz below; the MMX ops in between leave EFLAGS alone
669 :			movq mm7, [mmx_one] ; mm7 = lsb mask used by both ADD_HH_* macros
670 :
671 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
672 :			movq mm2, [eax] ; first source row
673 :			movq mm3, [eax+1] ; ...and its right-hand neighbours
674 :			movq mm6, mm2
675 :			pavgb mm2, mm3
676 :			pxor mm3, mm6 ; mm2/mm3 ready
677 :
678 :			jnz near .Loop1 ; rounding bit set -> (i+j+k+l+1)/4 variant
679 :
680 :			ADD_HH_RND0 ; rows 0-1 (the macro advances eax by two rows and ecx by one)
681 :			add ecx, edx ; second half of the 2-row dst advance
682 :			ADD_HH_RND0 ; rows 2-3
683 :			add ecx, edx
684 :			ADD_HH_RND0 ; rows 4-5
685 :			add ecx, edx
686 :			ADD_HH_RND0 ; rows 6-7
687 :			EPILOG ; NOTE(review): no emms is issued here - presumably left to the caller; confirm
688 :
689 :			.Loop1
690 :			ADD_HH_RND1 ; rows 0-1
691 :			add ecx, edx ; second half of the 2-row dst advance
692 :			ADD_HH_RND1 ; rows 2-3
693 :			add ecx, edx
694 :			ADD_HH_RND1 ; rows 4-5
695 :			add ecx, edx
696 :			ADD_HH_RND1 ; rows 6-7
697 :
698 :			EPILOG

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4