Parent Directory | Revision Log
Revision 1.4 - (view) (download)
1 : | edgomez | 1.1 | ;/***************************************************************************** |
2 : | ; * | ||
3 : | ; * XVID MPEG-4 VIDEO CODEC | ||
4 : | ; * - mmx 8x8 block-based halfpel interpolation - | ||
5 : | ; * | ||
6 : | ; * Copyright(C) 2002 Michael Militzer <isibaar@xvid.org> | ||
7 : | ; * 2002 Pascal Massimino <skal@planet-d.net> | ||
8 : | ; * 2004 Andre Werthmann <wertmann@aei.mpg.de> | ||
9 : | ; * | ||
10 : | ; * This program is free software ; you can redistribute it and/or modify | ||
11 : | ; * it under the terms of the GNU General Public License as published by | ||
12 : | ; * the Free Software Foundation ; either version 2 of the License, or | ||
13 : | ; * (at your option) any later version. | ||
14 : | ; * | ||
15 : | ; * This program is distributed in the hope that it will be useful, | ||
16 : | ; * but WITHOUT ANY WARRANTY ; without even the implied warranty of | ||
17 : | ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 : | ; * GNU General Public License for more details. | ||
19 : | ; * | ||
20 : | ; * You should have received a copy of the GNU General Public License | ||
21 : | ; * along with this program ; if not, write to the Free Software | ||
22 : | ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
23 : | ; * | ||
24 : | ; ****************************************************************************/ | ||
25 : | |||
BITS 64

;------------------------------------------------------------------------------
; cglobal <name>
; Declare <name> as a global symbol.
;  - PREFIX:     prepend an underscore for ABIs that mangle C symbols
;                (e.g. Mach-O); a %define redirects later uses of the name.
;  - MARK_FUNCS: mark the symbol as an ELF "function" with an explicit size
;                (<name>.endfunc - <name>) so profilers/debuggers see it.
; ENDFUNC expands to the ".endfunc" local label only when sizes are marked,
; so every function body can close with a bare "ENDFUNC".
;------------------------------------------------------------------------------
%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function %1.endfunc-%1
			%define %1 _%1:function %1.endfunc-%1
			%define ENDFUNC .endfunc
		%else
			global _%1
			%define %1 _%1
			%define ENDFUNC
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
			%define ENDFUNC .endfunc
		%else
			global %1
			%define ENDFUNC
		%endif
	%endif
%endmacro
49 : | |||
;=============================================================================
; Read only data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

ALIGN 16
; Eight bytes of 0x01: the lsb mask used to turn pavgb's round-up average
; into a round-down one via  (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
mmx_one:
	times 8 db 1
63 : | |||
SECTION .text align=16

; Exported 8x8 halfpel interpolation primitives.
; ABI: SysV AMD64.  All use MMX registers plus the SSE integer
; extension instruction pavgb.
cglobal interpolate8x8_halfpel_h_x86_64
cglobal interpolate8x8_halfpel_v_x86_64
cglobal interpolate8x8_halfpel_hv_x86_64

; "_add" variants additionally average the interpolation result into dst.
cglobal interpolate8x8_halfpel_add_x86_64
cglobal interpolate8x8_halfpel_h_add_x86_64
cglobal interpolate8x8_halfpel_v_add_x86_64
cglobal interpolate8x8_halfpel_hv_add_x86_64
74 : | |||
75 : | ;=========================================================================== | ||
76 : | ; | ||
77 : | ; void interpolate8x8_halfpel_h_x86_64(uint8_t * const dst, | ||
78 : | ; const uint8_t * const src, | ||
79 : | ; const uint32_t stride, | ||
80 : | ; const uint32_t rounding); | ||
81 : | ; | ||
82 : | ;=========================================================================== | ||
83 : | |||
;-----------------------------------------------------------------------------
; COPY_H_SSE_RND0
; Horizontal halfpel for two rows, rounding==0 path:
;   dst[x] = (src[x] + src[x+1] + 1) / 2   (pavgb's native rounding)
; In:  rax = src row pointer, rcx = dst row pointer, rdx = stride
; Out: two dst rows written; rax advanced by two rows (rcx advanced by caller)
; Clobbers: mm0, mm1
;-----------------------------------------------------------------------------
%macro COPY_H_SSE_RND0 0
	movq mm0, [rax]
	pavgb mm0, [rax+1]              ; mm0 = (i+j+1)/2, row 0
	movq mm1, [rax+rdx]
	pavgb mm1, [rax+rdx+1]          ; mm1 = (i+j+1)/2, row 1
	lea rax, [rax+2*rdx]            ; advance src by two rows
	movq [rcx], mm0
	movq [rcx+rdx], mm1
%endmacro
93 : | |||
;-----------------------------------------------------------------------------
; COPY_H_SSE_RND1
; Horizontal halfpel for two rows, rounding==1 path:
;   dst[x] = (src[x] + src[x+1]) / 2 = (i+j+1)/2 - ((i^j)&1)
; In:  rax = src, rcx = dst, rdx = stride, mm7 = 0x01 byte mask
; Out: two dst rows written; rax advanced by two rows (rcx advanced by caller)
; Clobbers: mm0-mm5
;-----------------------------------------------------------------------------
%macro COPY_H_SSE_RND1 0
	movq mm0, [rax]
	movq mm1, [rax+rdx]
	movq mm4, mm0                   ; keep i for the lsb correction
	movq mm5, mm1
	movq mm2, [rax+1]
	movq mm3, [rax+rdx+1]
	pavgb mm0, mm2                  ; (i+j+1)/2, row 0
	pxor mm2, mm4                   ; i^j, row 0
	pavgb mm1, mm3                  ; (i+j+1)/2, row 1
	lea rax, [rax+2*rdx]
	pxor mm3, mm5                   ; i^j, row 1
	pand mm2, mm7                   ; (i^j)&1
	pand mm3, mm7
	psubb mm0, mm2                  ; round down
	movq [rcx], mm0
	psubb mm1, mm3
	movq [rcx+rdx], mm1
%endmacro
113 : | |||
ALIGN 16
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_h_x86_64(uint8_t * const dst,
;                                      const uint8_t * const src,
;                                      const uint32_t stride,
;                                      const uint32_t rounding);
;
; Horizontal halfpel interpolation of an 8x8 block.
; ABI: SysV AMD64 (rdi=dst, rsi=src, rdx=stride, rcx=rounding)
; Internal register roles: rcx=dst, rax=src, rdx=stride
; Clobbers: rax, rcx, mm0-mm5, mm7, flags
;-----------------------------------------------------------------------------
interpolate8x8_halfpel_h_x86_64:

	mov rax, rcx                    ; rax = rounding
	mov rcx, rdi                    ; rcx = Dst
	test eax, eax                   ; BUGFIX: was "test rax, rax".  rounding
	                                ; is uint32_t and SysV does not guarantee
	                                ; the upper 32 bits of the register are
	                                ; zero, so test only the low dword.
	mov rax, rsi                    ; rax = Src (mov leaves flags intact)
	; rdx is stride (assumed zero-extended by the caller -- TODO confirm)

	jnz near .rounding1             ; rounding==1 -> corrected path

	; rounding==0: pavgb's (i+j+1)/2 is already the wanted result
	COPY_H_SSE_RND0
	lea rcx, [rcx+2*rdx]
	COPY_H_SSE_RND0
	lea rcx, [rcx+2*rdx]
	COPY_H_SSE_RND0
	lea rcx, [rcx+2*rdx]
	COPY_H_SSE_RND0
	ret

.rounding1:
	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
	movq mm7, [mmx_one wrt rip]     ; lsb mask for the correction term
	COPY_H_SSE_RND1
	lea rcx, [rcx+2*rdx]
	COPY_H_SSE_RND1
	lea rcx, [rcx+2*rdx]
	COPY_H_SSE_RND1
	lea rcx, [rcx+2*rdx]
	COPY_H_SSE_RND1
	ret
ENDFUNC
146 : | edgomez | 1.1 | |
147 : | ;=========================================================================== | ||
148 : | ; | ||
149 : | ; void interpolate8x8_halfpel_v_x86_64(uint8_t * const dst, | ||
150 : | ; const uint8_t * const src, | ||
151 : | ; const uint32_t stride, | ||
152 : | ; const uint32_t rounding); | ||
153 : | ; | ||
154 : | ;=========================================================================== | ||
155 : | |||
;-----------------------------------------------------------------------------
; COPY_V_SSE_RND0
; Vertical halfpel for two rows, rounding==0 path:
;   dst[y][x] = (src[y][x] + src[y+1][x] + 1) / 2
; In:  rax = src, rcx = dst, rdx = stride
; Out: two dst rows written; rax advanced by two rows (rcx advanced by caller)
; Clobbers: mm0, mm1
;-----------------------------------------------------------------------------
%macro COPY_V_SSE_RND0 0
	movq mm0, [rax]                 ; row y
	movq mm1, [rax+rdx]             ; row y+1
	pavgb mm0, mm1                  ; (row y + row y+1 + 1)/2
	pavgb mm1, [rax+2*rdx]          ; (row y+1 + row y+2 + 1)/2
	lea rax, [rax+2*rdx]
	movq [rcx], mm0
	movq [rcx+rdx], mm1
%endmacro
165 : | |||
;-----------------------------------------------------------------------------
; COPY_V_SSE_RND1
; Vertical halfpel for two rows, rounding==1 path:
;   dst = (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
; In:  rax = src (next unread row), rcx = dst, rdx = stride,
;      mm2 = previous source row (loop invariant), mm7 = 0x01 byte mask
; Out: two dst rows written; mm2 = last row read (invariant for next call)
; Clobbers: mm0, mm1, mm4, mm5
;-----------------------------------------------------------------------------
%macro COPY_V_SSE_RND1 0
	movq mm0, mm2                   ; row y   (carried from previous call)
	movq mm1, [rax]                 ; row y+1
	movq mm2, [rax+rdx]             ; row y+2 (becomes next call's invariant)
	lea rax, [rax+2*rdx]
	movq mm4, mm0
	movq mm5, mm1
	pavgb mm0, mm1
	pxor mm4, mm1                   ; i^j, row pair 0
	pavgb mm1, mm2
	pxor mm5, mm2                   ; i^j, row pair 1
	pand mm4, mm7                   ; lsb's of (i^j)...
	pand mm5, mm7                   ; lsb's of (i^j)...
	psubb mm0, mm4                  ; ...are subtracted from result of pavgb
	movq [rcx], mm0
	psubb mm1, mm5                  ; ...are subtracted from result of pavgb
	movq [rcx+rdx], mm1
%endmacro
184 : | |||
ALIGN 16
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_v_x86_64(uint8_t * const dst,
;                                      const uint8_t * const src,
;                                      const uint32_t stride,
;                                      const uint32_t rounding);
;
; Vertical halfpel interpolation of an 8x8 block.
; ABI: SysV AMD64 (rdi=dst, rsi=src, rdx=stride, rcx=rounding)
; Internal register roles: rcx=dst, rax=src, rdx=stride
; Clobbers: rax, rcx, mm0-mm2, mm4, mm5, mm7, flags
;-----------------------------------------------------------------------------
interpolate8x8_halfpel_v_x86_64:
	mov rax, rcx                    ; rax = rounding
	mov rcx, rdi                    ; rcx = Dst
	test eax, eax                   ; BUGFIX: was "test rax, rax".  rounding
	                                ; is uint32_t; the upper 32 bits of the
	                                ; arg register are not guaranteed zero.
	mov rax, rsi                    ; rax = Src (mov leaves flags intact)
	; rdx is stride

	; we process 2 lines at a time
	jnz near .rounding1

	; rounding==0: plain pavgb path
	COPY_V_SSE_RND0
	lea rcx, [rcx+2*rdx]
	COPY_V_SSE_RND0
	lea rcx, [rcx+2*rdx]
	COPY_V_SSE_RND0
	lea rcx, [rcx+2*rdx]
	COPY_V_SSE_RND0
	ret

.rounding1:
	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
	movq mm7, [mmx_one wrt rip]
	movq mm2, [rax]                 ; loop invariant: previous source row
	add rax, rdx
	COPY_V_SSE_RND1
	lea rcx, [rcx+2*rdx]
	COPY_V_SSE_RND1
	lea rcx, [rcx+2*rdx]
	COPY_V_SSE_RND1
	lea rcx, [rcx+2*rdx]
	COPY_V_SSE_RND1
	ret
ENDFUNC
220 : | edgomez | 1.1 | |
221 : | ;=========================================================================== | ||
222 : | ; | ||
223 : | ; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst, | ||
224 : | ; const uint8_t * const src, | ||
225 : | ; const uint32_t stride, | ||
226 : | ; const uint32_t rounding); | ||
227 : | ; | ||
228 : | ; | ||
229 : | ;=========================================================================== | ||
230 : | |||
231 : | ; The trick is to correct the result of 'pavgb' with some combination of the | ||
232 : | ; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). | ||
233 : | ; The boolean relations are: | ||
234 : | ; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st | ||
235 : | ; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st | ||
236 : | ; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st | ||
237 : | ; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st | ||
238 : | ; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t. | ||
239 : | |||
; Moreover, we process two lines at a time, for better overlapping (~15% faster).
241 : | |||
;-----------------------------------------------------------------------------
; COPY_HV_SSE_RND0
; One step of the h+v halfpel filter, rounding==0, producing two output rows
; using  (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st  (see comment block above).
; Loop invariants carried across calls:
;   mm2 = (i+j+1)/2 of the current row pair, mm3 = i^j of that pair,
;   mm7 = 0x01 byte mask
; In:  rax = src, rcx = dst, rdx = stride
; Out: rax advanced two rows, rcx advanced one row (caller adds the second)
; Clobbers: mm0, mm1, mm2, mm3, mm6
;-----------------------------------------------------------------------------
%macro COPY_HV_SSE_RND0 0
	lea rax, [rax+rdx]

	movq mm0, [rax]
	movq mm1, [rax+1]

	movq mm6, mm0
	pavgb mm0, mm1                  ; mm0=(j+k+1)/2. preserved for next step
	lea rax, [rax+rdx]
	pxor mm1, mm6                   ; mm1=(j^k). preserved for next step

	por mm3, mm1                    ; ij |= jk
	movq mm6, mm2
	pxor mm6, mm0                   ; mm6 = s^t
	pand mm3, mm6                   ; (ij|jk) &= st
	pavgb mm2, mm0                  ; mm2 = (s+t+1)/2
	pand mm3, mm7                   ; mask lsb
	psubb mm2, mm3                  ; apply correction

	movq [rcx], mm2                 ; first output row

	movq mm2, [rax]
	movq mm3, [rax+1]
	movq mm6, mm2
	pavgb mm2, mm3                  ; preserved for next iteration
	lea rcx, [rcx+rdx]
	pxor mm3, mm6                   ; preserved for next iteration

	por mm1, mm3                    ; same scheme for the second row...
	movq mm6, mm0
	pxor mm6, mm2
	pand mm1, mm6
	pavgb mm0, mm2

	pand mm1, mm7
	psubb mm0, mm1

	movq [rcx], mm0                 ; second output row
%endmacro
281 : | |||
;-----------------------------------------------------------------------------
; COPY_HV_SSE_RND1
; One step of the h+v halfpel filter, rounding==1, producing two output rows
; using  (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st  (see comment block above).
; Same register contract as COPY_HV_SSE_RND0 (mm2/mm3/mm7 invariants).
; Clobbers: mm0, mm1, mm2, mm3, mm6
;-----------------------------------------------------------------------------
%macro COPY_HV_SSE_RND1 0
	lea rax, [rax+rdx]

	movq mm0, [rax]
	movq mm1, [rax+1]

	movq mm6, mm0
	pavgb mm0, mm1                  ; mm0=(j+k+1)/2. preserved for next step
	lea rax, [rax+rdx]
	pxor mm1, mm6                   ; mm1=(j^k). preserved for next step

	pand mm3, mm1                   ; ij &= jk
	movq mm6, mm2
	pxor mm6, mm0                   ; mm6 = s^t
	por mm3, mm6                    ; (ij&jk) |= st
	pavgb mm2, mm0                  ; mm2 = (s+t+1)/2
	pand mm3, mm7                   ; mask lsb
	psubb mm2, mm3                  ; apply correction

	movq [rcx], mm2                 ; first output row

	movq mm2, [rax]
	movq mm3, [rax+1]
	movq mm6, mm2
	pavgb mm2, mm3                  ; preserved for next iteration
	lea rcx, [rcx+rdx]
	pxor mm3, mm6                   ; preserved for next iteration

	pand mm1, mm3                   ; same scheme for the second row...
	movq mm6, mm0
	pxor mm6, mm2
	por mm1, mm6
	pavgb mm0, mm2
	pand mm1, mm7
	psubb mm0, mm1

	movq [rcx], mm0                 ; second output row
%endmacro
320 : | |||
ALIGN 16
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_hv_x86_64(uint8_t * const dst,
;                                       const uint8_t * const src,
;                                       const uint32_t stride,
;                                       const uint32_t rounding);
;
; Combined horizontal+vertical halfpel interpolation of an 8x8 block.
; ABI: SysV AMD64 (rdi=dst, rsi=src, rdx=stride, rcx=rounding)
; Internal register roles: rcx=dst, rax=src, rdx=stride
; Clobbers: rax, rcx, mm0-mm3, mm6, mm7, flags
;-----------------------------------------------------------------------------
interpolate8x8_halfpel_hv_x86_64:
	mov rax, rcx                    ; rax = rounding
	mov rcx, rdi                    ; rcx = Dst
	test eax, eax                   ; BUGFIX: was "test rax, rax".  rounding
	                                ; is uint32_t; the upper 32 bits of the
	                                ; arg register are not guaranteed zero.
	mov rax, rsi                    ; rax = Src
	; rdx is stride

	movq mm7, [mmx_one wrt rip]     ; movq/pavgb/pxor leave flags intact,
	                                ; so ZF survives until the jnz below

	; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
	movq mm2, [rax]
	movq mm3, [rax+1]
	movq mm6, mm2
	pavgb mm2, mm3
	pxor mm3, mm6                   ; mm2/mm3 ready

	jnz near .rounding1

	COPY_HV_SSE_RND0
	add rcx, rdx
	COPY_HV_SSE_RND0
	add rcx, rdx
	COPY_HV_SSE_RND0
	add rcx, rdx
	COPY_HV_SSE_RND0
	ret

.rounding1:
	COPY_HV_SSE_RND1
	add rcx, rdx
	COPY_HV_SSE_RND1
	add rcx, rdx
	COPY_HV_SSE_RND1
	add rcx, rdx
	COPY_HV_SSE_RND1
	ret
ENDFUNC
359 : | edgomez | 1.1 | |
360 : | ;=========================================================================== | ||
361 : | ; | ||
362 : | ; The next functions combine both source halfpel interpolation step and the | ||
; averaging (with rounding) step to avoid wasting memory bandwidth computing
364 : | ; intermediate halfpel images and then averaging them. | ||
365 : | ; | ||
366 : | ;=========================================================================== | ||
367 : | |||
;-----------------------------------------------------------------------------
; PROLOG0 / PROLOG1 / EPILOG
; Shared entry/exit for the *_add functions below.
; SysV AMD64 args: rdi=dst, rsi=src, rdx=stride (3rd), rcx=rounding (4th).
; PROLOG0 maps them onto the register roles used by the body macros
; (rcx=dst, rax=src, rdx=stride) and preserves the rounding flag in r8.
; PROLOG1 additionally tests the flag (sets ZF for a following jnz).
;-----------------------------------------------------------------------------
%macro PROLOG0 0
	; rcx fourth
	; rdx third
	; r8 fifth
	mov r8, rcx                     ; saves rounding.  BUGFIX: was
	                                ; "mov r8, rdx", which saved the stride
	                                ; (3rd arg) instead of rounding (4th arg,
	                                ; rcx) -- rcx is clobbered on the next
	                                ; line, so PROLOG1's test would branch on
	                                ; the stride's parity, never on rounding.
	mov rcx, rdi                    ; rcx = Dst
	mov rax, rsi                    ; rax = Src
	; rdx is stride
%endmacro
%macro PROLOG1 0
	PROLOG0
	test r8, 1                      ; Rounding?  (bit 0 only, so any garbage
	                                ; in the upper 32 bits is harmless)
%endmacro
%macro EPILOG 0
	ret
%endmacro
384 : | |||
385 : | ;=========================================================================== | ||
386 : | ; | ||
387 : | ; void interpolate8x8_halfpel_add_xmm(uint8_t * const dst, | ||
388 : | ; const uint8_t * const src, | ||
389 : | ; const uint32_t stride, | ||
390 : | ; const uint32_t rounding); | ||
391 : | ; | ||
392 : | ; | ||
393 : | ;=========================================================================== | ||
394 : | |||
;-----------------------------------------------------------------------------
; ADD_FF %1, %2
; Full-pel "add" step for two rows:  dst = (dst + src + 1)/2.
; %1 and %2 are byte offsets applied to both rax (src) and rcx (dst).
; Clobbers: mm0, mm1
;-----------------------------------------------------------------------------
%macro ADD_FF 2
	movq mm0, [rax+%1]
	movq mm1, [rax+%2]
	pavgb mm0, [rcx+%1]             ; average src row into dst row
	pavgb mm1, [rcx+%2]
	movq [rcx+%1], mm0
	movq [rcx+%2], mm1
%endmacro
403 : | |||
ALIGN 16
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_add_x86_64(uint8_t * const dst,
;                                        const uint8_t * const src,
;                                        const uint32_t stride,
;                                        const uint32_t rounding);
; Full-pel variant: averages the 8x8 source block into dst,
; dst = (dst + src + 1)/2, two rows per ADD_FF.
; NOTE(review): PROLOG1 tests the rounding flag but no branch here consumes
; it -- this path always uses pavgb's +1 rounding.  Presumably intentional;
; confirm against the C reference implementation.
;-----------------------------------------------------------------------------
interpolate8x8_halfpel_add_x86_64:  ; 23c
	PROLOG1
	ADD_FF 0, rdx
	lea rax, [rax+2*rdx]
	lea rcx, [rcx+2*rdx]
	ADD_FF 0, rdx
	lea rax, [rax+2*rdx]
	lea rcx, [rcx+2*rdx]
	ADD_FF 0, rdx
	lea rax, [rax+2*rdx]
	lea rcx, [rcx+2*rdx]
	ADD_FF 0, rdx
	EPILOG
ENDFUNC
419 : | edgomez | 1.1 | |
420 : | ;=========================================================================== | ||
421 : | ; | ||
422 : | ; void interpolate8x8_halfpel_h_add_xmm(uint8_t * const dst, | ||
423 : | ; const uint8_t * const src, | ||
424 : | ; const uint32_t stride, | ||
425 : | ; const uint32_t rounding); | ||
426 : | ; | ||
427 : | ; | ||
428 : | ;=========================================================================== | ||
429 : | |||
430 : | |||
;-----------------------------------------------------------------------------
; ADD_FH_RND0 %1, %2
; Horizontal halfpel + add for two rows, rounding==0:
;   tmp = (src[x] + src[x+1] + 1)/2 ;  dst = (dst + tmp + 1)/2
; %1/%2 are byte offsets applied to both rax (src) and rcx (dst).
; Clobbers: mm0, mm1
;-----------------------------------------------------------------------------
%macro ADD_FH_RND0 2
	movq mm0, [rax+%1]
	movq mm1, [rax+%2]
	pavgb mm0, [rax+%1+1]           ; horizontal halfpel, row 0
	pavgb mm1, [rax+%2+1]           ; horizontal halfpel, row 1
	pavgb mm0, [rcx+%1]             ; average into dst
	pavgb mm1, [rcx+%2]
	movq [rcx+%1], mm0
	movq [rcx+%2], mm1
%endmacro
441 : | |||
;-----------------------------------------------------------------------------
; ADD_FH_RND1 %1, %2
; Horizontal halfpel + add for two rows, rounding==1:
;   tmp = (src[x] + src[x+1])/2 = (i+j+1)/2 - ((i^j)&1)
;   dst = (dst + tmp + 1)/2
; %1/%2 are byte offsets applied to both rax (src) and rcx (dst).
; Clobbers: mm0-mm5
;-----------------------------------------------------------------------------
%macro ADD_FH_RND1 2
	movq mm0, [rax+%1]
	movq mm1, [rax+%2]
	movq mm4, mm0                   ; keep i for the lsb correction
	movq mm5, mm1
	movq mm2, [rax+%1+1]
	movq mm3, [rax+%2+1]
	pavgb mm0, mm2                  ; (i+j+1)/2
	pxor mm2, mm4                   ; i^j
	pavgb mm1, mm3
	pxor mm3, mm5
	pand mm2, [mmx_one wrt rip]     ; (i^j)&1
	pand mm3, [mmx_one wrt rip]
	psubb mm0, mm2                  ; round the halfpel value down
	psubb mm1, mm3
	pavgb mm0, [rcx+%1]             ; average into dst
	pavgb mm1, [rcx+%2]
	movq [rcx+%1], mm0
	movq [rcx+%2], mm1
%endmacro
463 : | |||
ALIGN 16
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_h_add_x86_64(uint8_t * const dst,
;                                          const uint8_t * const src,
;                                          const uint32_t stride,
;                                          const uint32_t rounding);
; Horizontal halfpel interpolation of an 8x8 block, averaged into dst.
; ABI: SysV AMD64; register roles set up by PROLOG1 (rcx=dst, rax=src,
; rdx=stride, ZF = !rounding).
;-----------------------------------------------------------------------------
interpolate8x8_halfpel_h_add_x86_64:    ; 32c
	PROLOG1
	jnz near .Loop1                 ; rounding==1 -> corrected path
	; rounding==0
	ADD_FH_RND0 0, rdx
	lea rax, [rax+2*rdx]
	lea rcx, [rcx+2*rdx]
	ADD_FH_RND0 0, rdx
	lea rax, [rax+2*rdx]
	lea rcx, [rcx+2*rdx]
	ADD_FH_RND0 0, rdx
	lea rax, [rax+2*rdx]
	lea rcx, [rcx+2*rdx]
	ADD_FH_RND0 0, rdx
	EPILOG

.Loop1:
	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
	; (ADD_FH_RND1 reads the mask from memory, so mm7 is not loaded here)
	ADD_FH_RND1 0, rdx
	lea rax, [rax+2*rdx]
	lea rcx, [rcx+2*rdx]
	ADD_FH_RND1 0, rdx
	lea rax, [rax+2*rdx]
	lea rcx, [rcx+2*rdx]
	ADD_FH_RND1 0, rdx
	lea rax, [rax+2*rdx]
	lea rcx, [rcx+2*rdx]
	ADD_FH_RND1 0, rdx
	EPILOG
ENDFUNC
495 : | edgomez | 1.1 | |
496 : | |||
497 : | ;=========================================================================== | ||
498 : | ; | ||
499 : | ; void interpolate8x8_halfpel_v_add_x86_64(uint8_t * const dst, | ||
500 : | ; const uint8_t * const src, | ||
501 : | ; const uint32_t stride, | ||
502 : | ; const uint32_t rounding); | ||
503 : | ; | ||
504 : | ; | ||
505 : | ;=========================================================================== | ||
506 : | |||
;-----------------------------------------------------------------------------
; ADD_8_HF_RND0
; Vertical halfpel + add for two rows, rounding==0:
;   tmp = (src[y] + src[y+1] + 1)/2 ;  dst = (dst + tmp + 1)/2
; In:  rax = src, rcx = dst, rdx = stride
; Out: two dst rows written; rax advanced by two rows (rcx by caller)
; Clobbers: mm0, mm1
;-----------------------------------------------------------------------------
%macro ADD_8_HF_RND0 0
	movq mm0, [rax]                 ; row y
	movq mm1, [rax+rdx]             ; row y+1
	pavgb mm0, mm1                  ; vertical halfpel, row pair 0
	pavgb mm1, [rax+2*rdx]          ; vertical halfpel, row pair 1
	lea rax, [rax+2*rdx]
	pavgb mm0, [rcx]                ; average into dst
	pavgb mm1, [rcx+rdx]
	movq [rcx], mm0
	movq [rcx+rdx], mm1
%endmacro
518 : | |||
;-----------------------------------------------------------------------------
; ADD_8_HF_RND1
; Vertical halfpel + add for two rows, rounding==1:
;   tmp = (i+j)/2 = (i+j+1)/2 - ((i^j)&1) ;  dst = (dst + tmp + 1)/2
; In:  rax = src, rcx = dst, rdx = stride,
;      mm0 = previous source row (loop invariant), mm7 = 0x01 byte mask
; Out: two dst rows written; mm2 = last row read (caller moves it to mm0
;      before the next call)
; Clobbers: mm1, mm4, mm5 (and mm0/mm2 per the contract above)
;-----------------------------------------------------------------------------
%macro ADD_8_HF_RND1 0
	movq mm1, [rax+rdx]             ; row y+1
	movq mm2, [rax+2*rdx]           ; row y+2 (next call's invariant)
	lea rax, [rax+2*rdx]
	movq mm4, mm0
	movq mm5, mm1
	pavgb mm0, mm1
	pxor mm4, mm1                   ; i^j, row pair 0
	pavgb mm1, mm2
	pxor mm5, mm2                   ; i^j, row pair 1
	pand mm4, mm7                   ; lsb's of (i^j)...
	pand mm5, mm7                   ; lsb's of (i^j)...
	psubb mm0, mm4                  ; ...are subtracted from result of pavgb
	pavgb mm0, [rcx]                ; average into dst
	movq [rcx], mm0
	psubb mm1, mm5                  ; ...are subtracted from result of pavgb
	pavgb mm1, [rcx+rdx]
	movq [rcx+rdx], mm1
%endmacro
538 : | |||
ALIGN 16
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_v_add_x86_64(uint8_t * const dst,
;                                          const uint8_t * const src,
;                                          const uint32_t stride,
;                                          const uint32_t rounding);
; Vertical halfpel interpolation of an 8x8 block, averaged into dst.
; ABI: SysV AMD64; register roles set up by PROLOG1 (rcx=dst, rax=src,
; rdx=stride, ZF = !rounding).
;-----------------------------------------------------------------------------
interpolate8x8_halfpel_v_add_x86_64:
	PROLOG1

	jnz near .Loop1
	pxor mm7, mm7                   ; this is a NOP (mm7 unused on this
	                                ; path; kept as filler/padding)

	; rounding==0
	ADD_8_HF_RND0
	lea rcx, [rcx+2*rdx]
	ADD_8_HF_RND0
	lea rcx, [rcx+2*rdx]
	ADD_8_HF_RND0
	lea rcx, [rcx+2*rdx]
	ADD_8_HF_RND0
	EPILOG

.Loop1:
	; rounding==1: (i+j)/2 = (i+j+1)/2 - (i^j)&1
	movq mm0, [rax]                 ; loop invariant: previous source row
	movq mm7, [mmx_one wrt rip]     ; lsb mask

	ADD_8_HF_RND1
	movq mm0, mm2                   ; carry last row into next iteration
	lea rcx, [rcx+2*rdx]
	ADD_8_HF_RND1
	movq mm0, mm2
	lea rcx, [rcx+2*rdx]
	ADD_8_HF_RND1
	movq mm0, mm2
	lea rcx, [rcx+2*rdx]
	ADD_8_HF_RND1
	EPILOG
ENDFUNC
571 : | edgomez | 1.1 | |
572 : | ; The trick is to correct the result of 'pavgb' with some combination of the | ||
573 : | ; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). | ||
574 : | ; The boolean relations are: | ||
575 : | ; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st | ||
576 : | ; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st | ||
577 : | ; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st | ||
578 : | ; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st | ||
579 : | ; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t. | ||
580 : | |||
; Moreover, we process two lines at a time, for better overlapping (~15% faster).
582 : | |||
583 : | ;=========================================================================== | ||
584 : | ; | ||
585 : | ; void interpolate8x8_halfpel_hv_add_x86_64(uint8_t * const dst, | ||
586 : | ; const uint8_t * const src, | ||
587 : | ; const uint32_t stride, | ||
588 : | ; const uint32_t rounding); | ||
589 : | ; | ||
590 : | ; | ||
591 : | ;=========================================================================== | ||
592 : | |||
;-----------------------------------------------------------------------------
; ADD_HH_RND0
; One step of the h+v halfpel filter + add, rounding==0, two output rows.
; Same scheme as COPY_HV_SSE_RND0 --
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
; -- followed by  dst = (dst + result + 1)/2.
; Loop invariants: mm2 = (i+j+1)/2 of current row pair, mm3 = i^j,
; mm7 = 0x01 byte mask.
; In:  rax = src, rcx = dst, rdx = stride
; Out: rax advanced two rows, rcx advanced one row (caller adds the second)
; Clobbers: mm0, mm1, mm2, mm3, mm6
;-----------------------------------------------------------------------------
%macro ADD_HH_RND0 0
	lea rax, [rax+rdx]

	movq mm0, [rax]
	movq mm1, [rax+1]

	movq mm6, mm0
	pavgb mm0, mm1                  ; mm0=(j+k+1)/2. preserved for next step
	lea rax, [rax+rdx]
	pxor mm1, mm6                   ; mm1=(j^k). preserved for next step

	por mm3, mm1                    ; ij |= jk
	movq mm6, mm2
	pxor mm6, mm0                   ; mm6 = s^t
	pand mm3, mm6                   ; (ij|jk) &= st
	pavgb mm2, mm0                  ; mm2 = (s+t+1)/2
	pand mm3, mm7                   ; mask lsb
	psubb mm2, mm3                  ; apply correction

	pavgb mm2, [rcx]                ; average into dst
	movq [rcx], mm2                 ; first output row

	movq mm2, [rax]
	movq mm3, [rax+1]
	movq mm6, mm2
	pavgb mm2, mm3                  ; preserved for next iteration
	lea rcx, [rcx+rdx]
	pxor mm3, mm6                   ; preserved for next iteration

	por mm1, mm3                    ; same scheme for the second row...
	movq mm6, mm0
	pxor mm6, mm2
	pand mm1, mm6
	pavgb mm0, mm2

	pand mm1, mm7
	psubb mm0, mm1

	pavgb mm0, [rcx]                ; average into dst
	movq [rcx], mm0                 ; second output row
%endmacro
634 : | |||
;-----------------------------------------------------------------------------
; ADD_HH_RND1
; One step of the h+v halfpel filter + add, rounding==1, two output rows.
; Same scheme as COPY_HV_SSE_RND1 --
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
; -- followed by  dst = (dst + result + 1)/2.
; Same register contract as ADD_HH_RND0 (mm2/mm3/mm7 invariants).
; Clobbers: mm0, mm1, mm2, mm3, mm6
;-----------------------------------------------------------------------------
%macro ADD_HH_RND1 0
	lea rax, [rax+rdx]

	movq mm0, [rax]
	movq mm1, [rax+1]

	movq mm6, mm0
	pavgb mm0, mm1                  ; mm0=(j+k+1)/2. preserved for next step
	lea rax, [rax+rdx]
	pxor mm1, mm6                   ; mm1=(j^k). preserved for next step

	pand mm3, mm1                   ; ij &= jk
	movq mm6, mm2
	pxor mm6, mm0                   ; mm6 = s^t
	por mm3, mm6                    ; (ij&jk) |= st
	pavgb mm2, mm0                  ; mm2 = (s+t+1)/2
	pand mm3, mm7                   ; mask lsb
	psubb mm2, mm3                  ; apply correction

	pavgb mm2, [rcx]                ; average into dst
	movq [rcx], mm2                 ; first output row

	movq mm2, [rax]
	movq mm3, [rax+1]
	movq mm6, mm2
	pavgb mm2, mm3                  ; preserved for next iteration
	lea rcx, [rcx+rdx]
	pxor mm3, mm6                   ; preserved for next iteration

	pand mm1, mm3                   ; same scheme for the second row...
	movq mm6, mm0
	pxor mm6, mm2
	por mm1, mm6
	pavgb mm0, mm2
	pand mm1, mm7
	psubb mm0, mm1

	pavgb mm0, [rcx]                ; average into dst
	movq [rcx], mm0                 ; second output row
%endmacro
675 : | |||
ALIGN 16
;-----------------------------------------------------------------------------
; void interpolate8x8_halfpel_hv_add_x86_64(uint8_t * const dst,
;                                           const uint8_t * const src,
;                                           const uint32_t stride,
;                                           const uint32_t rounding);
; Combined h+v halfpel interpolation of an 8x8 block, averaged into dst.
; ABI: SysV AMD64; register roles set up by PROLOG1 (rcx=dst, rax=src,
; rdx=stride, ZF = !rounding).  Note: movq/pavgb/pxor below do not modify
; flags, so the ZF from PROLOG1 survives until the jnz.
;-----------------------------------------------------------------------------
interpolate8x8_halfpel_hv_add_x86_64:
	PROLOG1

	movq mm7, [mmx_one wrt rip]     ; lsb mask

	; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
	movq mm2, [rax]
	movq mm3, [rax+1]
	movq mm6, mm2
	pavgb mm2, mm3
	pxor mm3, mm6                   ; mm2/mm3 ready

	jnz near .Loop1

	ADD_HH_RND0
	add rcx, rdx
	ADD_HH_RND0
	add rcx, rdx
	ADD_HH_RND0
	add rcx, rdx
	ADD_HH_RND0
	EPILOG

.Loop1:
	ADD_HH_RND1
	add rcx, rdx
	ADD_HH_RND1
	add rcx, rdx
	ADD_HH_RND1
	add rcx, rdx
	ADD_HH_RND1

	EPILOG
ENDFUNC
711 : | Isibaar | 1.2 | |
; Mark the stack as non-executable on ELF targets (prevents GNU ld from
; flagging the object as requiring an executable stack).
%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif
715 : |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |