Annotation of /xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm

Revision 1.8 - (view) (download)

1 :	edgomez	1.5	;/*****************************************************************************
2 :	Isibaar	1.1	; *
3 :	edgomez	1.5	; * XVID MPEG-4 VIDEO CODEC
4 :			; * - mmx 8x8 block-based halfpel interpolation -
5 :	Isibaar	1.1	; *
6 :	edgomez	1.5	; * Copyright(C) 2002 Michael Militzer <isibaar@xvid.org>
7 :			; * 2002 Pascal Massimino <skal@planet-d.net>
8 :	Isibaar	1.1	; *
9 :	edgomez	1.5	; * This program is free software ; you can redistribute it and/or modify
10 :			; * it under the terms of the GNU General Public License as published by
11 :			; * the Free Software Foundation ; either version 2 of the License, or
12 :			; * (at your option) any later version.
13 :	edgomez	1.3	; *
14 :	edgomez	1.5	; * This program is distributed in the hope that it will be useful,
15 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
16 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 :			; * GNU General Public License for more details.
18 :	Isibaar	1.1	; *
19 :	edgomez	1.5	; * You should have received a copy of the GNU General Public License
20 :			; * along with this program ; if not, write to the Free Software
21 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 :	edgomez	1.4	; *
23 :	edgomez	1.5	; ****************************************************************************/
24 :	edgomez	1.4
BITS 32

;-----------------------------------------------------------------------------
; cglobal <name>
;
; Export <name> as a global symbol.
;   PREFIX     : the exported symbol is _<name>, and <name> is aliased to
;                _<name> for the remainder of the file.
;   MARK_FUNCS : the symbol is exported with the ':function' type qualifier.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function
    %else
      global _%1
    %endif
    ; Alias the bare name to the underscored symbol only. The ':function'
    ; qualifier belongs on the 'global' directive; if it were part of the
    ; alias, every later "<name>:" label would expand to "_<name>:function:".
    %define %1 _%1
  %else
    %ifdef MARK_FUNCS
      global %1:function
    %else
      global %1
    %endif
  %endif
%endmacro
44 :
45 :	edgomez	1.5	;=============================================================================
46 :			; Read only data
47 :			;=============================================================================
48 :
; The align=16 section qualifier is used unless FORMAT_COFF is defined.
%ifndef FORMAT_COFF
SECTION .rodata align=16
%else
SECTION .rodata
%endif

; mmx_one: eight bytes of 0x01, used to mask the lsb of every byte lane.
ALIGN 16
mmx_one:
  db 1, 1, 1, 1, 1, 1, 1, 1
58 :	Isibaar	1.1
SECTION .text

; --- exported entry points: plain half-pel interpolation --------------------
cglobal interpolate8x8_halfpel_h_xmm
cglobal interpolate8x8_halfpel_v_xmm
cglobal interpolate8x8_halfpel_hv_xmm

; --- exported entry points: interpolation averaged into dst -----------------
cglobal interpolate8x8_halfpel_add_xmm
cglobal interpolate8x8_halfpel_h_add_xmm
cglobal interpolate8x8_halfpel_v_add_xmm
cglobal interpolate8x8_halfpel_hv_add_xmm
69 :
70 :	Isibaar	1.1	;===========================================================================
71 :			;
72 :			; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
73 :			; const uint8_t * const src,
74 :			; const uint32_t stride,
75 :			; const uint32_t rounding);
76 :			;
77 :			;===========================================================================
78 :
;-----------------------------------------------------------------------------
; COPY_H_SSE_RND0
; Two rows of horizontal half-pel, rounding up: dst = (a + b + 1) >> 1.
; In : eax = src, ecx = dst, edx = stride
; Out: eax advanced by 2*stride (ecx is left unchanged)
; Clobbers: mm0, mm1
;-----------------------------------------------------------------------------
%macro COPY_H_SSE_RND0 0
  movq    mm0, [eax]              ; row 0, pixels x
  movq    mm1, [eax+edx]          ; row 1, pixels x
  pavgb   mm0, [eax+1]            ; row 0: avg with pixels x+1
  pavgb   mm1, [eax+edx+1]        ; row 1: avg with pixels x+1
  movq    [ecx], mm0
  movq    [ecx+edx], mm1
  lea     eax, [eax+2*edx]        ; next pair of source rows
%endmacro
88 :
;-----------------------------------------------------------------------------
; COPY_H_SSE_RND1
; Two rows of horizontal half-pel, rounding down: dst = (a + b) >> 1,
; obtained as pavgb(a, b) - ((a ^ b) & 1).
; In : eax = src, ecx = dst, edx = stride, mm7 = 0x01 in every byte
; Out: eax advanced by 2*stride (ecx is left unchanged)
; Clobbers: mm0-mm5
;-----------------------------------------------------------------------------
%macro COPY_H_SSE_RND1 0
  movq    mm0, [eax]              ; row 0: a
  movq    mm2, [eax+1]            ; row 0: b
  movq    mm1, [eax+edx]          ; row 1: a
  movq    mm3, [eax+edx+1]        ; row 1: b
  lea     eax, [eax+2*edx]        ; next pair of source rows

  movq    mm4, mm0                ; keep a copy of each 'a' for the xor
  movq    mm5, mm1
  pavgb   mm0, mm2                ; (a + b + 1) >> 1
  pavgb   mm1, mm3
  pxor    mm2, mm4                ; a ^ b
  pxor    mm3, mm5
  pand    mm2, mm7                ; keep only the lsb of each byte
  pand    mm3, mm7
  psubb   mm0, mm2                ; undo the round-up where a+b was odd
  psubb   mm1, mm3

  movq    [ecx], mm0
  movq    [ecx+edx], mm1
%endmacro
108 :
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
; Horizontal half-pel interpolation of an 8x8 block:
;   rounding == 0 : dst[x] = (src[x] + src[x+1] + 1) >> 1
;   rounding != 0 : dst[x] = (src[x] + src[x+1]) >> 1
;
; Stack args: [esp+4] = dst, [esp+8] = src, [esp+12] = stride,
;             [esp+16] = rounding
; Uses eax, ecx, edx, flags and up to mm0-mm5, mm7. No emms is executed here.
; Each source row is read at byte offsets 0..8.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_halfpel_h_xmm:

  mov     eax, [esp+16]           ; rounding
  mov     ecx, [esp+ 4]           ; dst
  test    eax, eax                ; ZF = (rounding == 0); mov leaves flags alone
  mov     eax, [esp+ 8]           ; src
  mov     edx, [esp+12]           ; stride

  jnz     near .rounding1

  ; rounding == 0: four passes of two rows each
%rep 3
  COPY_H_SSE_RND0
  lea     ecx, [ecx+2*edx]
%endrep
  COPY_H_SSE_RND0
  ret

.rounding1:
  ; we use: (i+j)/2 = (i+j+1)/2 - (i^j)&1
  movq    mm7, [mmx_one]
%rep 3
  COPY_H_SSE_RND1
  lea     ecx, [ecx+2*edx]
%endrep
  COPY_H_SSE_RND1
  ret
140 :
141 :			;===========================================================================
142 :			;
143 :			; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
144 :	edgomez	1.5	; const uint8_t * const src,
145 :			; const uint32_t stride,
146 :			; const uint32_t rounding);
147 :	Isibaar	1.1	;
148 :			;===========================================================================
149 :
;-----------------------------------------------------------------------------
; COPY_V_SSE_RND0
; Two rows of vertical half-pel, rounding up: dst = (a + b + 1) >> 1, where
; a and b are vertically adjacent source pixels.
; In : eax = src, ecx = dst, edx = stride
; Out: eax advanced by 2*stride (ecx is left unchanged)
; Clobbers: mm0, mm1
;-----------------------------------------------------------------------------
%macro COPY_V_SSE_RND0 0
  movq    mm0, [eax]              ; source row n
  movq    mm1, [eax+edx]          ; source row n+1
  lea     eax, [eax+2*edx]        ; eax -> source row n+2
  pavgb   mm0, mm1                ; avg(row n,   row n+1)
  pavgb   mm1, [eax]              ; avg(row n+1, row n+2)
  movq    [ecx], mm0
  movq    [ecx+edx], mm1
%endmacro
159 :
;-----------------------------------------------------------------------------
; COPY_V_SSE_RND1
; Two rows of vertical half-pel, rounding down: dst = (a + b) >> 1,
; obtained as pavgb(a, b) - ((a ^ b) & 1).
; In : eax = source row n+1, ecx = dst, edx = stride,
;      mm2 = source row n (carried over from the previous step),
;      mm7 = 0x01 in every byte
; Out: eax advanced by 2*stride, mm2 = source row n+2
; Clobbers: mm0, mm1, mm4, mm5
;-----------------------------------------------------------------------------
%macro COPY_V_SSE_RND1 0
  movq    mm1, [eax]              ; row n+1
  movq    mm0, mm2                ; row n
  movq    mm2, [eax+edx]          ; row n+2 (kept for the next step)
  lea     eax, [eax+2*edx]

  movq    mm4, mm0
  pavgb   mm0, mm1                ; (n + n+1 + 1) >> 1
  pxor    mm4, mm1                ; n ^ n+1
  movq    mm5, mm1
  pavgb   mm1, mm2                ; (n+1 + n+2 + 1) >> 1
  pxor    mm5, mm2                ; n+1 ^ n+2

  pand    mm4, mm7                ; lsb of the xor ...
  psubb   mm0, mm4                ; ... is subtracted from the pavgb result
  pand    mm5, mm7
  psubb   mm1, mm5

  movq    [ecx], mm0
  movq    [ecx+edx], mm1
%endmacro
178 :
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
; Vertical half-pel interpolation of an 8x8 block:
;   rounding == 0 : dst[y] = (src[y] + src[y+1] + 1) >> 1
;   rounding != 0 : dst[y] = (src[y] + src[y+1]) >> 1
;
; Stack args: [esp+4] = dst, [esp+8] = src, [esp+12] = stride,
;             [esp+16] = rounding
; Uses eax, ecx, edx, flags and up to mm0-mm2, mm4, mm5, mm7. No emms is
; executed here. Nine source rows are read.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_halfpel_v_xmm:

  mov     eax, [esp+16]           ; rounding
  mov     ecx, [esp+ 4]           ; dst
  test    eax, eax                ; ZF = (rounding == 0); mov leaves flags alone
  mov     eax, [esp+ 8]           ; src
  mov     edx, [esp+12]           ; stride

  ; two rows are processed per macro invocation
  jnz     near .rounding1

%rep 3
  COPY_V_SSE_RND0
  lea     ecx, [ecx+2*edx]
%endrep
  COPY_V_SSE_RND0
  ret

.rounding1:
  ; we use: (i+j)/2 = (i+j+1)/2 - (i^j)&1
  movq    mm7, [mmx_one]
  movq    mm2, [eax]              ; first source row, carried into the macro
  add     eax, edx

%rep 3
  COPY_V_SSE_RND1
  lea     ecx, [ecx+2*edx]
%endrep
  COPY_V_SSE_RND1
  ret
214 :
215 :			;===========================================================================
216 :			;
217 :			; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
218 :	edgomez	1.5	; const uint8_t * const src,
219 :			; const uint32_t stride,
220 :			; const uint32_t rounding);
221 :	Isibaar	1.1	;
222 :			;
223 :			;===========================================================================
224 :
225 :			; The trick is to correct the result of 'pavgb' with some combination of the
226 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
227 :			; The boolean relations are:
228 :	edgomez	1.5	; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
229 :	Isibaar	1.1	; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
230 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
231 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
232 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
233 :
234 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
235 :
;-----------------------------------------------------------------------------
; COPY_HV_SSE_RND0
; Two rows of diagonal (h+v) half-pel, dst = (i + j + k + l + 2) >> 2,
; computed as pavgb(s, t) - (((ij | kl) & st) & 1), with
;   s = pavgb(i, j), t = pavgb(k, l), ij = i^j, kl = k^l, st = s^t.
; In : eax = source row n, ecx = dst, edx = stride,
;      mm2 = s of row n, mm3 = ij of row n, mm7 = 0x01 in every byte
; Out: eax advanced by 2*stride, ecx advanced by 1*stride,
;      mm2 / mm3 = s / ij of source row n+2
; Clobbers: mm0, mm1, mm6
;-----------------------------------------------------------------------------
%macro COPY_HV_SSE_RND0 0
  ; ---- first output row: source rows n and n+1 ----
  movq    mm0, [eax+edx]
  movq    mm1, [eax+edx+1]
  lea     eax, [eax+2*edx]        ; eax -> source row n+2

  movq    mm6, mm0
  pavgb   mm0, mm1                ; mm0 = t (h-average of row n+1), reused below
  pxor    mm1, mm6                ; mm1 = kl (xor of row n+1), reused below

  por     mm3, mm1                ; ij | kl
  movq    mm6, mm2
  pxor    mm6, mm0                ; mm6 = s ^ t
  pavgb   mm2, mm0                ; mm2 = (s + t + 1) >> 1
  pand    mm3, mm6                ; (ij | kl) & st
  pand    mm3, mm7                ; lsb only
  psubb   mm2, mm3                ; apply the correction

  movq    [ecx], mm2

  ; ---- second output row: source rows n+1 and n+2 ----
  movq    mm2, [eax]
  movq    mm3, [eax+1]
  movq    mm6, mm2
  pavgb   mm2, mm3                ; h-average of row n+2, kept for the next call
  pxor    mm3, mm6                ; xor of row n+2, kept for the next call

  por     mm1, mm3
  movq    mm6, mm0
  pxor    mm6, mm2
  pavgb   mm0, mm2
  pand    mm1, mm6
  pand    mm1, mm7
  psubb   mm0, mm1

  movq    [ecx+edx], mm0
  lea     ecx, [ecx+edx]
%endmacro
275 :
;-----------------------------------------------------------------------------
; COPY_HV_SSE_RND1
; Two rows of diagonal (h+v) half-pel, dst = (i + j + k + l + 1) >> 2,
; computed as pavgb(s, t) - (((ij & kl) | st) & 1), with
;   s = pavgb(i, j), t = pavgb(k, l), ij = i^j, kl = k^l, st = s^t.
; In : eax = source row n, ecx = dst, edx = stride,
;      mm2 = s of row n, mm3 = ij of row n, mm7 = 0x01 in every byte
; Out: eax advanced by 2*stride, ecx advanced by 1*stride,
;      mm2 / mm3 = s / ij of source row n+2
; Clobbers: mm0, mm1, mm6
;-----------------------------------------------------------------------------
%macro COPY_HV_SSE_RND1 0
  ; ---- first output row: source rows n and n+1 ----
  movq    mm0, [eax+edx]
  movq    mm1, [eax+edx+1]
  lea     eax, [eax+2*edx]        ; eax -> source row n+2

  movq    mm6, mm0
  pavgb   mm0, mm1                ; mm0 = t (h-average of row n+1), reused below
  pxor    mm1, mm6                ; mm1 = kl (xor of row n+1), reused below

  pand    mm3, mm1                ; ij & kl
  movq    mm6, mm2
  pxor    mm6, mm0                ; mm6 = s ^ t
  pavgb   mm2, mm0                ; mm2 = (s + t + 1) >> 1
  por     mm3, mm6                ; (ij & kl) | st
  pand    mm3, mm7                ; lsb only
  psubb   mm2, mm3                ; apply the correction

  movq    [ecx], mm2

  ; ---- second output row: source rows n+1 and n+2 ----
  movq    mm2, [eax]
  movq    mm3, [eax+1]
  movq    mm6, mm2
  pavgb   mm2, mm3                ; h-average of row n+2, kept for the next call
  pxor    mm3, mm6                ; xor of row n+2, kept for the next call

  pand    mm1, mm3
  movq    mm6, mm0
  pxor    mm6, mm2
  pavgb   mm0, mm2
  por     mm1, mm6
  pand    mm1, mm7
  psubb   mm0, mm1

  movq    [ecx+edx], mm0
  lea     ecx, [ecx+edx]
%endmacro
314 :
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
; Diagonal (h+v) half-pel interpolation of an 8x8 block from the 2x2
; neighbourhood i,j / k,l of each pixel:
;   rounding == 0 : dst = (i + j + k + l + 2) >> 2
;   rounding != 0 : dst = (i + j + k + l + 1) >> 2
;
; Stack args: [esp+4] = dst, [esp+8] = src, [esp+12] = stride,
;             [esp+16] = rounding
; Uses eax, ecx, edx, flags, mm0-mm3, mm6, mm7. No emms is executed here.
; Nine source rows are read, each at byte offsets 0..8.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_halfpel_hv_xmm:
  mov     eax, [esp+16]           ; rounding
  mov     ecx, [esp+ 4]           ; dst
  test    eax, eax                ; ZF = (rounding == 0), consumed by jnz below;
                                  ; the mov/MMX instructions in between do not
                                  ; modify EFLAGS
  mov     eax, [esp+ 8]           ; src
  mov     edx, [esp+12]           ; stride

  movq    mm7, [mmx_one]

  ; state carried between macro invocations:
  ;   mm2 = (i+j+1)/2 and mm3 = i^j of the current top source row
  movq    mm2, [eax]
  movq    mm3, [eax+1]
  movq    mm6, mm2
  pavgb   mm2, mm3
  pxor    mm3, mm6                ; mm2/mm3 ready

  jnz     near .rounding1

  ; each macro advances ecx by one row; the add supplies the second one
%rep 3
  COPY_HV_SSE_RND0
  add     ecx, edx
%endrep
  COPY_HV_SSE_RND0
  ret

.rounding1:
%rep 3
  COPY_HV_SSE_RND1
  add     ecx, edx
%endrep
  COPY_HV_SSE_RND1
  ret
352 :	edgomez	1.7
353 :			;===========================================================================
354 :			;
355 :			; The next functions combine both source halfpel interpolation step and the
356 :			; averaging (with rounding) step to avoid wasting memory bandwidth computing
357 :			; intermediate halfpel images and then averaging them.
358 :			;
359 :			;===========================================================================
360 :
;-----------------------------------------------------------------------------
; Shared entry/exit helpers for the *_add_xmm functions.
; They must be expanded before esp is modified: arguments are read relative
; to the return address at [esp].
;-----------------------------------------------------------------------------

; PROLOG0: load the three pointer/stride arguments.
;   ecx = dst, eax = src, edx = stride (bytes per scanline)
%macro PROLOG0 0
  mov     ecx, [esp+ 4]           ; Dst
  mov     eax, [esp+ 8]           ; Src
  mov     edx, [esp+12]           ; BpS
%endmacro

; PROLOG1: PROLOG0, then set ZF from bit 0 of the rounding argument
; (ZF = 1 when the bit is clear).
%macro PROLOG1 0
  PROLOG0
  test    dword [esp+16], 1       ; Rounding?
%endmacro

; EPILOG: plain return; nothing was pushed by the prologs.
%macro EPILOG 0
  ret
%endmacro
373 :
374 :			;===========================================================================
375 :			;
376 :			; void interpolate8x8_halfpel_add_xmm(uint8_t * const dst,
377 :			; const uint8_t * const src,
378 :			; const uint32_t stride,
379 :			; const uint32_t rounding);
380 :			;
381 :			;
382 :			;===========================================================================
383 :
;-----------------------------------------------------------------------------
; ADD_FF off0, off1
; Average two source rows into the destination, in place:
;   dst = (dst + src + 1) >> 1
; %1 / %2 are the row offsets applied to both eax (src) and ecx (dst).
; No rounding correction is applied: the pavgb (round-up) result is stored
; as is.
; Clobbers: mm0, mm1
;-----------------------------------------------------------------------------
%macro ADD_FF 2
  movq    mm0, [ecx+%1]           ; current dst rows
  movq    mm1, [ecx+%2]
  pavgb   mm0, [eax+%1]           ; average with the matching src rows
  pavgb   mm1, [eax+%2]
  movq    [ecx+%1], mm0
  movq    [ecx+%2], mm1
%endmacro
404 :
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_add_xmm(uint8_t * const dst,
;                                     const uint8_t * const src,
;                                     const uint32_t stride,
;                                     const uint32_t rounding);
;
; Full-pel "add": dst = (dst + src + 1) >> 1 over an 8x8 block.
; The rounding argument is not consulted by this routine, so only the three
; pointer/stride arguments are loaded (PROLOG0) and no flag test is issued.
;
; Uses eax, ecx, edx, mm0, mm1. No emms is executed here.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_halfpel_add_xmm:
  PROLOG0
%rep 3
  ADD_FF 0, edx                   ; rows n and n+1
  lea     eax, [eax+2*edx]
  lea     ecx, [ecx+2*edx]
%endrep
  ADD_FF 0, edx
  EPILOG
419 :
420 :			;===========================================================================
421 :			;
422 :			; void interpolate8x8_halfpel_h_add_xmm(uint8_t * const dst,
423 :			; const uint8_t * const src,
424 :			; const uint32_t stride,
425 :			; const uint32_t rounding);
426 :			;
427 :			;
428 :			;===========================================================================
429 :
430 :
;-----------------------------------------------------------------------------
; ADD_FH_RND0 off0, off1
; Horizontal half-pel (round up) of two source rows, then averaged into dst:
;   h   = (src[x] + src[x+1] + 1) >> 1
;   dst = (dst + h + 1) >> 1
; %1 / %2 are the row offsets applied to both eax (src) and ecx (dst).
; Clobbers: mm0, mm1
;-----------------------------------------------------------------------------
%macro ADD_FH_RND0 2
  ; first row
  movq    mm0, [eax+%1]
  pavgb   mm0, [eax+%1+1]
  pavgb   mm0, [ecx+%1]
  ; second row
  movq    mm1, [eax+%2]
  pavgb   mm1, [eax+%2+1]
  pavgb   mm1, [ecx+%2]
  ; write both rows back
  movq    [ecx+%1], mm0
  movq    [ecx+%2], mm1
%endmacro
441 :
;-----------------------------------------------------------------------------
; ADD_FH_RND1 off0, off1
; Horizontal half-pel (round down) of two source rows, then averaged into dst:
;   h   = (src[x] + src[x+1]) >> 1  = pavgb(a, b) - ((a ^ b) & 1)
;   dst = (dst + h + 1) >> 1
; %1 / %2 are the row offsets applied to both eax (src) and ecx (dst).
; The lsb mask is taken straight from memory ([mmx_one]); mm7 is not used.
; Clobbers: mm0-mm5
;-----------------------------------------------------------------------------
%macro ADD_FH_RND1 2
  ; first row
  movq    mm0, [eax+%1]           ; a
  movq    mm2, [eax+%1+1]         ; b
  movq    mm4, mm0
  pavgb   mm0, mm2                ; (a + b + 1) >> 1
  pxor    mm2, mm4                ; a ^ b
  pand    mm2, [mmx_one]          ; lsb only
  psubb   mm0, mm2                ; -> (a + b) >> 1
  pavgb   mm0, [ecx+%1]           ; blend with dst

  ; second row
  movq    mm1, [eax+%2]
  movq    mm3, [eax+%2+1]
  movq    mm5, mm1
  pavgb   mm1, mm3
  pxor    mm3, mm5
  pand    mm3, [mmx_one]
  psubb   mm1, mm3
  pavgb   mm1, [ecx+%2]

  ; write both rows back
  movq    [ecx+%1], mm0
  movq    [ecx+%2], mm1
%endmacro
463 :
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_h_add_xmm(uint8_t * const dst,
;                                       const uint8_t * const src,
;                                       const uint32_t stride,
;                                       const uint32_t rounding);
;
; Horizontal half-pel interpolation of src, averaged into dst:
;   h   = (src[x] + src[x+1] + 1 - (rounding & 1)) >> 1
;   dst = (dst + h + 1) >> 1
;
; Uses eax, ecx, edx, flags and up to mm0-mm5. No emms is executed here.
; Each source row is read at byte offsets 0..8.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_halfpel_h_add_xmm:
  PROLOG1                         ; ecx = dst, eax = src, edx = stride, ZF
  jnz     near .Loop1             ; bit 0 of rounding set -> round-down path

%rep 3
  ADD_FH_RND0 0, edx
  lea     eax, [eax+2*edx]
  lea     ecx, [ecx+2*edx]
%endrep
  ADD_FH_RND0 0, edx
  EPILOG

.Loop1:
  ; we use: (i+j)/2 = (i+j+1)/2 - (i^j)&1
  ; (ADD_FH_RND1 reads the lsb mask from [mmx_one] itself)
%rep 3
  ADD_FH_RND1 0, edx
  lea     eax, [eax+2*edx]
  lea     ecx, [ecx+2*edx]
%endrep
  ADD_FH_RND1 0, edx
  EPILOG
494 :
495 :
496 :			;===========================================================================
497 :			;
498 :			; void interpolate8x8_halfpel_v_add_xmm(uint8_t * const dst,
499 :			; const uint8_t * const src,
500 :			; const uint32_t stride,
501 :			; const uint32_t rounding);
502 :			;
503 :			;
504 :			;===========================================================================
505 :
;-----------------------------------------------------------------------------
; ADD_8_HF_RND0
; Vertical half-pel (round up) of two rows, then averaged into dst:
;   v   = (src[y] + src[y+1] + 1) >> 1
;   dst = (dst + v + 1) >> 1
; In : eax = src, ecx = dst, edx = stride
; Out: eax advanced by 2*stride (ecx is left unchanged)
; Clobbers: mm0, mm1
;-----------------------------------------------------------------------------
%macro ADD_8_HF_RND0 0
  movq    mm0, [eax]              ; source row n
  movq    mm1, [eax+edx]          ; source row n+1
  lea     eax, [eax+2*edx]        ; eax -> source row n+2
  pavgb   mm0, mm1                ; avg(row n,   row n+1)
  pavgb   mm1, [eax]              ; avg(row n+1, row n+2)
  pavgb   mm0, [ecx]              ; blend with dst
  pavgb   mm1, [ecx+edx]
  movq    [ecx], mm0
  movq    [ecx+edx], mm1
%endmacro
517 :
;-----------------------------------------------------------------------------
; ADD_8_HF_RND1
; Vertical half-pel (round down) of two rows, then averaged into dst:
;   v   = (src[y] + src[y+1]) >> 1  = pavgb(a, b) - ((a ^ b) & 1)
;   dst = (dst + v + 1) >> 1
; In : eax = source row n, ecx = dst, edx = stride,
;      mm0 = contents of source row n, mm7 = 0x01 in every byte
; Out: eax advanced by 2*stride, mm2 = contents of source row n+2
;      (the caller copies mm2 into mm0 before the next invocation)
; Clobbers: mm0, mm1, mm4, mm5
;-----------------------------------------------------------------------------
%macro ADD_8_HF_RND1 0
  movq    mm1, [eax+edx]          ; row n+1
  lea     eax, [eax+2*edx]        ; eax -> row n+2
  movq    mm2, [eax]              ; row n+2

  movq    mm4, mm0
  pavgb   mm0, mm1                ; (n + n+1 + 1) >> 1
  pxor    mm4, mm1                ; n ^ n+1
  movq    mm5, mm1
  pavgb   mm1, mm2                ; (n+1 + n+2 + 1) >> 1
  pxor    mm5, mm2                ; n+1 ^ n+2

  pand    mm4, mm7                ; lsb of the xor ...
  pand    mm5, mm7
  psubb   mm0, mm4                ; ... is subtracted from the pavgb result
  pavgb   mm0, [ecx]              ; blend with dst
  movq    [ecx], mm0
  psubb   mm1, mm5
  pavgb   mm1, [ecx+edx]
  movq    [ecx+edx], mm1
%endmacro
537 :
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_v_add_xmm(uint8_t * const dst,
;                                       const uint8_t * const src,
;                                       const uint32_t stride,
;                                       const uint32_t rounding);
;
; Vertical half-pel interpolation of src, averaged into dst:
;   v   = (src[y] + src[y+1] + 1 - (rounding & 1)) >> 1
;   dst = (dst + v + 1) >> 1
;
; Uses eax, ecx, edx, flags and up to mm0-mm2, mm4, mm5, mm7. No emms is
; executed here. Nine source rows are read.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_halfpel_v_add_xmm:
  PROLOG1                         ; ecx = dst, eax = src, edx = stride, ZF

  jnz     near .Loop1             ; bit 0 of rounding set -> round-down path
  pxor    mm7, mm7                ; no effect on the result: ADD_8_HF_RND0
                                  ; never reads mm7

%rep 3
  ADD_8_HF_RND0
  lea     ecx, [ecx+2*edx]
%endrep
  ADD_8_HF_RND0
  EPILOG

.Loop1:
  movq    mm0, [eax]              ; first source row, carried into the macro
  movq    mm7, [mmx_one]

%rep 3
  ADD_8_HF_RND1
  movq    mm0, mm2                ; last row read becomes the next top row
  lea     ecx, [ecx+2*edx]
%endrep
  ADD_8_HF_RND1
  EPILOG
569 :
570 :			; The trick is to correct the result of 'pavgb' with some combination of the
571 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
572 :			; The boolean relations are:
573 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
574 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
575 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
576 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
577 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
578 :
579 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
580 :
581 :			;===========================================================================
582 :			;
583 :			; void interpolate8x8_halfpel_hv_add_xmm(uint8_t * const dst,
584 :			; const uint8_t * const src,
585 :			; const uint32_t stride,
586 :			; const uint32_t rounding);
587 :			;
588 :			;
589 :			;===========================================================================
590 :
;-----------------------------------------------------------------------------
; ADD_HH_RND0
; Two rows of diagonal (h+v) half-pel, hv = (i + j + k + l + 2) >> 2, computed
; as pavgb(s, t) - (((ij | kl) & st) & 1), then averaged into dst:
;   dst = (dst + hv + 1) >> 1
; with s = pavgb(i, j), t = pavgb(k, l), ij = i^j, kl = k^l, st = s^t.
; In : eax = source row n, ecx = dst, edx = stride,
;      mm2 = s of row n, mm3 = ij of row n, mm7 = 0x01 in every byte
; Out: eax advanced by 2*stride, ecx advanced by 1*stride,
;      mm2 / mm3 = s / ij of source row n+2
; Clobbers: mm0, mm1, mm6
;-----------------------------------------------------------------------------
%macro ADD_HH_RND0 0
  ; ---- first output row: source rows n and n+1 ----
  movq    mm0, [eax+edx]
  movq    mm1, [eax+edx+1]
  lea     eax, [eax+2*edx]        ; eax -> source row n+2

  movq    mm6, mm0
  pavgb   mm0, mm1                ; mm0 = t (h-average of row n+1), reused below
  pxor    mm1, mm6                ; mm1 = kl (xor of row n+1), reused below

  por     mm3, mm1                ; ij | kl
  movq    mm6, mm2
  pxor    mm6, mm0                ; mm6 = s ^ t
  pavgb   mm2, mm0                ; mm2 = (s + t + 1) >> 1
  pand    mm3, mm6                ; (ij | kl) & st
  pand    mm3, mm7                ; lsb only
  psubb   mm2, mm3                ; apply the correction

  pavgb   mm2, [ecx]              ; blend with dst
  movq    [ecx], mm2

  ; ---- second output row: source rows n+1 and n+2 ----
  movq    mm2, [eax]
  movq    mm3, [eax+1]
  movq    mm6, mm2
  pavgb   mm2, mm3                ; h-average of row n+2, kept for the next call
  pxor    mm3, mm6                ; xor of row n+2, kept for the next call

  por     mm1, mm3
  movq    mm6, mm0
  pxor    mm6, mm2
  pavgb   mm0, mm2
  pand    mm1, mm6
  pand    mm1, mm7
  psubb   mm0, mm1

  pavgb   mm0, [ecx+edx]          ; blend with dst
  movq    [ecx+edx], mm0
  lea     ecx, [ecx+edx]
%endmacro
632 :
;-----------------------------------------------------------------------------
; ADD_HH_RND1
; Two rows of diagonal (h+v) half-pel, hv = (i + j + k + l + 1) >> 2, computed
; as pavgb(s, t) - (((ij & kl) | st) & 1), then averaged into dst:
;   dst = (dst + hv + 1) >> 1
; with s = pavgb(i, j), t = pavgb(k, l), ij = i^j, kl = k^l, st = s^t.
; In : eax = source row n, ecx = dst, edx = stride,
;      mm2 = s of row n, mm3 = ij of row n, mm7 = 0x01 in every byte
; Out: eax advanced by 2*stride, ecx advanced by 1*stride,
;      mm2 / mm3 = s / ij of source row n+2
; Clobbers: mm0, mm1, mm6
;-----------------------------------------------------------------------------
%macro ADD_HH_RND1 0
  ; ---- first output row: source rows n and n+1 ----
  movq    mm0, [eax+edx]
  movq    mm1, [eax+edx+1]
  lea     eax, [eax+2*edx]        ; eax -> source row n+2

  movq    mm6, mm0
  pavgb   mm0, mm1                ; mm0 = t (h-average of row n+1), reused below
  pxor    mm1, mm6                ; mm1 = kl (xor of row n+1), reused below

  pand    mm3, mm1                ; ij & kl
  movq    mm6, mm2
  pxor    mm6, mm0                ; mm6 = s ^ t
  pavgb   mm2, mm0                ; mm2 = (s + t + 1) >> 1
  por     mm3, mm6                ; (ij & kl) | st
  pand    mm3, mm7                ; lsb only
  psubb   mm2, mm3                ; apply the correction

  pavgb   mm2, [ecx]              ; blend with dst
  movq    [ecx], mm2

  ; ---- second output row: source rows n+1 and n+2 ----
  movq    mm2, [eax]
  movq    mm3, [eax+1]
  movq    mm6, mm2
  pavgb   mm2, mm3                ; h-average of row n+2, kept for the next call
  pxor    mm3, mm6                ; xor of row n+2, kept for the next call

  pand    mm1, mm3
  movq    mm6, mm0
  pxor    mm6, mm2
  pavgb   mm0, mm2
  por     mm1, mm6
  pand    mm1, mm7
  psubb   mm0, mm1

  pavgb   mm0, [ecx+edx]          ; blend with dst
  movq    [ecx+edx], mm0
  lea     ecx, [ecx+edx]
%endmacro
673 :
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_hv_add_xmm(uint8_t * const dst,
;                                        const uint8_t * const src,
;                                        const uint32_t stride,
;                                        const uint32_t rounding);
;
; Diagonal (h+v) half-pel interpolation of src from the 2x2 neighbourhood
; i,j / k,l of each pixel, averaged into dst:
;   hv  = (i + j + k + l + 2 - (rounding & 1)) >> 2
;   dst = (dst + hv + 1) >> 1
;
; Uses eax, ecx, edx, flags, mm0-mm3, mm6, mm7. No emms is executed here.
; Nine source rows are read, each at byte offsets 0..8.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_halfpel_hv_add_xmm:
  PROLOG1                         ; ecx = dst, eax = src, edx = stride, ZF
                                  ; (the MMX instructions below leave EFLAGS
                                  ; untouched, so ZF is still valid at the jnz)

  movq    mm7, [mmx_one]

  ; state carried between macro invocations:
  ;   mm2 = (i+j+1)/2 and mm3 = i^j of the current top source row
  movq    mm2, [eax]
  movq    mm3, [eax+1]
  movq    mm6, mm2
  pavgb   mm2, mm3
  pxor    mm3, mm6                ; mm2/mm3 ready

  jnz     near .Loop1             ; bit 0 of rounding set

  ; each macro advances ecx by one row; the add supplies the second one
%rep 3
  ADD_HH_RND0
  add     ecx, edx
%endrep
  ADD_HH_RND0
  EPILOG

.Loop1:
%rep 3
  ADD_HH_RND1
  add     ecx, edx
%endrep
  ADD_HH_RND1

  EPILOG

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4