;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - mmx 8x8 block-based halfpel interpolation -
; *
; *  Copyright(C) 2002 Michael Militzer <isibaar@xvid.org>
; *               2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the license, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; ****************************************************************************/

;/****************************************************************************
; *
; *  History:
; *
; *  04.06.2002  rewrote some funcs, mostly XMM. -Skal-
; *              Heavily tuned for overlap and AGI-stalls avoidance
; *  04.02.2002  initial version (Isibaar)
; *
; ****************************************************************************/
bits 32 |
BITS 32 |
26 |
|
|
27 |
%macro cglobal 1 |
%macro cglobal 1 |
28 |
%ifdef PREFIX |
%ifdef PREFIX |
29 |
|
%ifdef MARK_FUNCS |
30 |
|
global _%1:function %1.endfunc-%1 |
31 |
|
%define %1 _%1:function %1.endfunc-%1 |
32 |
|
%else |
33 |
global _%1 |
global _%1 |
34 |
%define %1 _%1 |
%define %1 _%1 |
35 |
|
%endif |
36 |
|
%else |
37 |
|
%ifdef MARK_FUNCS |
38 |
|
global %1:function %1.endfunc-%1 |
39 |
%else |
%else |
40 |
global %1 |
global %1 |
41 |
%endif |
%endif |
42 |
|
%endif |
43 |
%endmacro |
%endmacro |
44 |
|
|
45 |
section .data |
;============================================================================= |
46 |
|
; Read only data |
47 |
|
;============================================================================= |
48 |
|
|
49 |
align 16 |
%ifdef FORMAT_COFF |
50 |
|
SECTION .rodata |
51 |
|
%else |
52 |
|
SECTION .rodata align=16 |
53 |
|
%endif |
54 |
|
|
55 |
mmx_one |
ALIGN 16 |
56 |
|
mmx_one: |
57 |
times 8 db 1 |
times 8 db 1 |
58 |
|
|
59 |
section .text |
SECTION .text |
60 |
|
|
61 |
cglobal interpolate8x8_halfpel_h_xmm |
cglobal interpolate8x8_halfpel_h_xmm |
62 |
cglobal interpolate8x8_halfpel_v_xmm |
cglobal interpolate8x8_halfpel_v_xmm |
63 |
cglobal interpolate8x8_halfpel_hv_xmm |
cglobal interpolate8x8_halfpel_hv_xmm |
64 |
|
|
65 |
|
cglobal interpolate8x4_halfpel_h_xmm |
66 |
|
cglobal interpolate8x4_halfpel_v_xmm |
67 |
|
cglobal interpolate8x4_halfpel_hv_xmm |
68 |
|
|
69 |
|
cglobal interpolate8x8_halfpel_add_xmm |
70 |
|
cglobal interpolate8x8_halfpel_h_add_xmm |
71 |
|
cglobal interpolate8x8_halfpel_v_add_xmm |
72 |
|
cglobal interpolate8x8_halfpel_hv_add_xmm |
73 |
|
|
74 |
;=========================================================================== |
;=========================================================================== |
75 |
; |
; |
76 |
; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst, |
; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst, |
110 |
movq [ecx+edx], mm1 |
movq [ecx+edx], mm1 |
111 |
%endmacro |
%endmacro |
112 |
|
|
113 |
align 16 |
ALIGN 16 |
114 |
interpolate8x8_halfpel_h_xmm: |
interpolate8x8_halfpel_h_xmm: |
115 |
|
|
116 |
mov eax, [esp+16]; rounding |
mov eax, [esp+16]; rounding |
141 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
142 |
COPY_H_SSE_RND1 |
COPY_H_SSE_RND1 |
143 |
ret |
ret |
144 |
|
.endfunc |
145 |
|
|
146 |
;=========================================================================== |
;=========================================================================== |
147 |
; |
; |
181 |
movq [ecx+edx], mm1 |
movq [ecx+edx], mm1 |
182 |
%endmacro |
%endmacro |
183 |
|
|
184 |
align 16 |
ALIGN 16 |
185 |
interpolate8x8_halfpel_v_xmm: |
interpolate8x8_halfpel_v_xmm: |
186 |
|
|
187 |
mov eax, [esp+16]; rounding |
mov eax, [esp+16]; rounding |
191 |
mov edx, [esp+12] ; stride |
mov edx, [esp+12] ; stride |
192 |
|
|
193 |
; we process 2 line at a time |
; we process 2 line at a time |
|
|
|
194 |
jnz near .rounding1 |
jnz near .rounding1 |
195 |
|
|
196 |
COPY_V_SSE_RND0 |
COPY_V_SSE_RND0 |
216 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
217 |
COPY_V_SSE_RND1 |
COPY_V_SSE_RND1 |
218 |
ret |
ret |
219 |
|
.endfunc |
220 |
|
|
221 |
;=========================================================================== |
;=========================================================================== |
222 |
; |
; |
318 |
movq [ecx], mm0 |
movq [ecx], mm0 |
319 |
%endmacro |
%endmacro |
320 |
|
|
321 |
align 16 |
ALIGN 16 |
322 |
interpolate8x8_halfpel_hv_xmm: |
interpolate8x8_halfpel_hv_xmm: |
323 |
mov eax, [esp+16] ; rounding |
mov eax, [esp+16] ; rounding |
324 |
mov ecx, [esp+ 4] ; Dst |
mov ecx, [esp+ 4] ; Dst |
355 |
add ecx, edx |
add ecx, edx |
356 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
357 |
ret |
ret |
358 |
|
.endfunc |
359 |
|
|
360 |
|
;=========================================================================== |
361 |
|
; |
362 |
|
; void interpolate8x4_halfpel_h_xmm(uint8_t * const dst, |
363 |
|
; const uint8_t * const src, |
364 |
|
; const uint32_t stride, |
365 |
|
; const uint32_t rounding); |
366 |
|
; |
367 |
|
;=========================================================================== |
368 |
|
|
369 |
|
ALIGN 16 |
370 |
|
interpolate8x4_halfpel_h_xmm: |
371 |
|
|
372 |
|
mov eax, [esp+16] ; rounding |
373 |
|
mov ecx, [esp+ 4] ; Dst |
374 |
|
test eax,eax |
375 |
|
mov eax, [esp+ 8] ; Src |
376 |
|
mov edx, [esp+12] ; stride |
377 |
|
|
378 |
|
jnz near .rounding1 |
379 |
|
|
380 |
|
COPY_H_SSE_RND0 |
381 |
|
lea ecx,[ecx+2*edx] |
382 |
|
COPY_H_SSE_RND0 |
383 |
|
ret |
384 |
|
|
385 |
|
.rounding1 |
386 |
|
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
387 |
|
movq mm7, [mmx_one] |
388 |
|
COPY_H_SSE_RND1 |
389 |
|
lea ecx, [ecx+2*edx] |
390 |
|
COPY_H_SSE_RND1 |
391 |
|
ret |
392 |
|
.endfunc |
393 |
|
|
394 |
|
;=========================================================================== |
395 |
|
; |
396 |
|
; void interpolate8x4_halfpel_v_xmm(uint8_t * const dst, |
397 |
|
; const uint8_t * const src, |
398 |
|
; const uint32_t stride, |
399 |
|
; const uint32_t rounding); |
400 |
|
; |
401 |
|
;=========================================================================== |
402 |
|
|
403 |
|
ALIGN 16 |
404 |
|
interpolate8x4_halfpel_v_xmm: |
405 |
|
|
406 |
|
mov eax, [esp+16]; rounding |
407 |
|
mov ecx, [esp+ 4] ; Dst |
408 |
|
test eax,eax |
409 |
|
mov eax, [esp+ 8] ; Src |
410 |
|
mov edx, [esp+12] ; stride |
411 |
|
|
412 |
|
; we process 2 line at a time |
413 |
|
jnz near .rounding1 |
414 |
|
|
415 |
|
COPY_V_SSE_RND0 |
416 |
|
lea ecx, [ecx+2*edx] |
417 |
|
COPY_V_SSE_RND0 |
418 |
|
ret |
419 |
|
|
420 |
|
.rounding1 |
421 |
|
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
422 |
|
movq mm7, [mmx_one] |
423 |
|
movq mm2, [eax] ; loop invariant |
424 |
|
add eax, edx |
425 |
|
|
426 |
|
COPY_V_SSE_RND1 |
427 |
|
lea ecx,[ecx+2*edx] |
428 |
|
COPY_V_SSE_RND1 |
429 |
|
ret |
430 |
|
.endfunc |
431 |
|
|
432 |
|
;=========================================================================== |
433 |
|
; |
434 |
|
; void interpolate8x4_halfpel_hv_xmm(uint8_t * const dst, |
435 |
|
; const uint8_t * const src, |
436 |
|
; const uint32_t stride, |
437 |
|
; const uint32_t rounding); |
438 |
|
; |
439 |
|
; |
440 |
|
;=========================================================================== |
441 |
|
|
442 |
|
; The trick is to correct the result of 'pavgb' with some combination of the |
443 |
|
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). |
444 |
|
; The boolean relations are: |
445 |
|
; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st |
446 |
|
; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st |
447 |
|
; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st |
448 |
|
; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st |
449 |
|
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t. |
450 |
|
|
451 |
|
; Moreover, we process 2 lines at a times, for better overlapping (~15% faster). |
452 |
|
|
453 |
|
ALIGN 16 |
454 |
|
interpolate8x4_halfpel_hv_xmm: |
455 |
|
mov eax, [esp+16] ; rounding |
456 |
|
mov ecx, [esp+ 4] ; Dst |
457 |
|
test eax, eax |
458 |
|
mov eax, [esp+ 8] ; Src |
459 |
|
mov edx, [esp+12] ; stride |
460 |
|
|
461 |
|
movq mm7, [mmx_one] |
462 |
|
|
463 |
|
; loop invariants: mm2=(i+j+1)/2 and mm3= i^j |
464 |
|
movq mm2, [eax] |
465 |
|
movq mm3, [eax+1] |
466 |
|
movq mm6, mm2 |
467 |
|
pavgb mm2, mm3 |
468 |
|
pxor mm3, mm6 ; mm2/mm3 ready |
469 |
|
|
470 |
|
jnz near .rounding1 |
471 |
|
|
472 |
|
COPY_HV_SSE_RND0 |
473 |
|
add ecx, edx |
474 |
|
COPY_HV_SSE_RND0 |
475 |
|
ret |
476 |
|
|
477 |
|
.rounding1 |
478 |
|
COPY_HV_SSE_RND1 |
479 |
|
add ecx, edx |
480 |
|
COPY_HV_SSE_RND1 |
481 |
|
ret |
482 |
|
.endfunc |
483 |
|
|
484 |
|
;=========================================================================== |
485 |
|
; |
486 |
|
; The next functions combine both source halfpel interpolation step and the |
487 |
|
; averaging (with rouding) step to avoid wasting memory bandwidth computing |
488 |
|
; intermediate halfpel images and then averaging them. |
489 |
|
; |
490 |
|
;=========================================================================== |
491 |
|
|
492 |
|
%macro PROLOG0 0 |
493 |
|
mov ecx, [esp+ 4] ; Dst |
494 |
|
mov eax, [esp+ 8] ; Src |
495 |
|
mov edx, [esp+12] ; BpS |
496 |
|
%endmacro |
497 |
|
%macro PROLOG1 0 |
498 |
|
PROLOG0 |
499 |
|
test dword [esp+16], 1; Rounding? |
500 |
|
%endmacro |
501 |
|
%macro EPILOG 0 |
502 |
|
ret |
503 |
|
%endmacro |
504 |
|
|
505 |
|
;=========================================================================== |
506 |
|
; |
507 |
|
; void interpolate8x8_halfpel_add_xmm(uint8_t * const dst, |
508 |
|
; const uint8_t * const src, |
509 |
|
; const uint32_t stride, |
510 |
|
; const uint32_t rounding); |
511 |
|
; |
512 |
|
; |
513 |
|
;=========================================================================== |
514 |
|
|
515 |
|
%macro ADD_FF 2 |
516 |
|
movq mm0, [eax+%1] |
517 |
|
movq mm1, [eax+%2] |
518 |
|
;;--- |
519 |
|
;; movq mm2, mm0 |
520 |
|
;; movq mm3, mm1 |
521 |
|
;;--- |
522 |
|
pavgb mm0, [ecx+%1] |
523 |
|
pavgb mm1, [ecx+%2] |
524 |
|
;;-- |
525 |
|
;; por mm2, [ecx+%1] |
526 |
|
;; por mm3, [ecx+%2] |
527 |
|
;; pand mm2, [mmx_one] |
528 |
|
;; pand mm3, [mmx_one] |
529 |
|
;; psubsb mm0, mm2 |
530 |
|
;; psubsb mm1, mm3 |
531 |
|
;;-- |
532 |
|
movq [ecx+%1], mm0 |
533 |
|
movq [ecx+%2], mm1 |
534 |
|
%endmacro |
535 |
|
|
536 |
|
ALIGN 16 |
537 |
|
interpolate8x8_halfpel_add_xmm: ; 23c |
538 |
|
PROLOG1 |
539 |
|
ADD_FF 0, edx |
540 |
|
lea eax,[eax+2*edx] |
541 |
|
lea ecx,[ecx+2*edx] |
542 |
|
ADD_FF 0, edx |
543 |
|
lea eax,[eax+2*edx] |
544 |
|
lea ecx,[ecx+2*edx] |
545 |
|
ADD_FF 0, edx |
546 |
|
lea eax,[eax+2*edx] |
547 |
|
lea ecx,[ecx+2*edx] |
548 |
|
ADD_FF 0, edx |
549 |
|
EPILOG |
550 |
|
.endfunc |
551 |
|
|
552 |
|
;=========================================================================== |
553 |
|
; |
554 |
|
; void interpolate8x8_halfpel_h_add_xmm(uint8_t * const dst, |
555 |
|
; const uint8_t * const src, |
556 |
|
; const uint32_t stride, |
557 |
|
; const uint32_t rounding); |
558 |
|
; |
559 |
|
; |
560 |
|
;=========================================================================== |
561 |
|
|
562 |
|
|
563 |
|
%macro ADD_FH_RND0 2 |
564 |
|
movq mm0, [eax+%1] |
565 |
|
movq mm1, [eax+%2] |
566 |
|
pavgb mm0, [eax+%1+1] |
567 |
|
pavgb mm1, [eax+%2+1] |
568 |
|
pavgb mm0, [ecx+%1] |
569 |
|
pavgb mm1, [ecx+%2] |
570 |
|
movq [ecx+%1],mm0 |
571 |
|
movq [ecx+%2],mm1 |
572 |
|
%endmacro |
573 |
|
|
574 |
|
%macro ADD_FH_RND1 2 |
575 |
|
movq mm0, [eax+%1] |
576 |
|
movq mm1, [eax+%2] |
577 |
|
movq mm4, mm0 |
578 |
|
movq mm5, mm1 |
579 |
|
movq mm2, [eax+%1+1] |
580 |
|
movq mm3, [eax+%2+1] |
581 |
|
pavgb mm0, mm2 |
582 |
|
; lea ?? |
583 |
|
pxor mm2, mm4 |
584 |
|
pavgb mm1, mm3 |
585 |
|
pxor mm3, mm5 |
586 |
|
pand mm2, [mmx_one] |
587 |
|
pand mm3, [mmx_one] |
588 |
|
psubb mm0, mm2 |
589 |
|
psubb mm1, mm3 |
590 |
|
pavgb mm0, [ecx+%1] |
591 |
|
pavgb mm1, [ecx+%2] |
592 |
|
movq [ecx+%1],mm0 |
593 |
|
movq [ecx+%2],mm1 |
594 |
|
%endmacro |
595 |
|
|
596 |
|
ALIGN 16 |
597 |
|
interpolate8x8_halfpel_h_add_xmm: ; 32c |
598 |
|
PROLOG1 |
599 |
|
jnz near .Loop1 |
600 |
|
ADD_FH_RND0 0, edx |
601 |
|
lea eax,[eax+2*edx] |
602 |
|
lea ecx,[ecx+2*edx] |
603 |
|
ADD_FH_RND0 0, edx |
604 |
|
lea eax,[eax+2*edx] |
605 |
|
lea ecx,[ecx+2*edx] |
606 |
|
ADD_FH_RND0 0, edx |
607 |
|
lea eax,[eax+2*edx] |
608 |
|
lea ecx,[ecx+2*edx] |
609 |
|
ADD_FH_RND0 0, edx |
610 |
|
EPILOG |
611 |
|
|
612 |
|
.Loop1 |
613 |
|
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
614 |
|
; movq mm7, [mmx_one] |
615 |
|
ADD_FH_RND1 0, edx |
616 |
|
lea eax,[eax+2*edx] |
617 |
|
lea ecx,[ecx+2*edx] |
618 |
|
ADD_FH_RND1 0, edx |
619 |
|
lea eax,[eax+2*edx] |
620 |
|
lea ecx,[ecx+2*edx] |
621 |
|
ADD_FH_RND1 0, edx |
622 |
|
lea eax,[eax+2*edx] |
623 |
|
lea ecx,[ecx+2*edx] |
624 |
|
ADD_FH_RND1 0, edx |
625 |
|
EPILOG |
626 |
|
.endfunc |
627 |
|
|
628 |
|
|
629 |
|
;=========================================================================== |
630 |
|
; |
631 |
|
; void interpolate8x8_halfpel_v_add_xmm(uint8_t * const dst, |
632 |
|
; const uint8_t * const src, |
633 |
|
; const uint32_t stride, |
634 |
|
; const uint32_t rounding); |
635 |
|
; |
636 |
|
; |
637 |
|
;=========================================================================== |
638 |
|
|
639 |
|
%macro ADD_8_HF_RND0 0 |
640 |
|
movq mm0, [eax] |
641 |
|
movq mm1, [eax+edx] |
642 |
|
pavgb mm0, mm1 |
643 |
|
pavgb mm1, [eax+2*edx] |
644 |
|
lea eax,[eax+2*edx] |
645 |
|
pavgb mm0, [ecx] |
646 |
|
pavgb mm1, [ecx+edx] |
647 |
|
movq [ecx],mm0 |
648 |
|
movq [ecx+edx],mm1 |
649 |
|
%endmacro |
650 |
|
|
651 |
|
%macro ADD_8_HF_RND1 0 |
652 |
|
movq mm1, [eax+edx] |
653 |
|
movq mm2, [eax+2*edx] |
654 |
|
lea eax,[eax+2*edx] |
655 |
|
movq mm4, mm0 |
656 |
|
movq mm5, mm1 |
657 |
|
pavgb mm0, mm1 |
658 |
|
pxor mm4, mm1 |
659 |
|
pavgb mm1, mm2 |
660 |
|
pxor mm5, mm2 |
661 |
|
pand mm4, mm7 ; lsb's of (i^j)... |
662 |
|
pand mm5, mm7 ; lsb's of (i^j)... |
663 |
|
psubb mm0, mm4 ; ...are substracted from result of pavgb |
664 |
|
pavgb mm0, [ecx] |
665 |
|
movq [ecx], mm0 |
666 |
|
psubb mm1, mm5 ; ...are substracted from result of pavgb |
667 |
|
pavgb mm1, [ecx+edx] |
668 |
|
movq [ecx+edx], mm1 |
669 |
|
%endmacro |
670 |
|
|
671 |
|
ALIGN 16 |
672 |
|
interpolate8x8_halfpel_v_add_xmm: |
673 |
|
PROLOG1 |
674 |
|
|
675 |
|
jnz near .Loop1 |
676 |
|
pxor mm7, mm7 ; this is a NOP |
677 |
|
|
678 |
|
ADD_8_HF_RND0 |
679 |
|
lea ecx,[ecx+2*edx] |
680 |
|
ADD_8_HF_RND0 |
681 |
|
lea ecx,[ecx+2*edx] |
682 |
|
ADD_8_HF_RND0 |
683 |
|
lea ecx,[ecx+2*edx] |
684 |
|
ADD_8_HF_RND0 |
685 |
|
EPILOG |
686 |
|
|
687 |
|
.Loop1 |
688 |
|
movq mm0, [eax] ; loop invariant |
689 |
|
movq mm7, [mmx_one] |
690 |
|
|
691 |
|
ADD_8_HF_RND1 |
692 |
|
movq mm0, mm2 |
693 |
|
lea ecx,[ecx+2*edx] |
694 |
|
ADD_8_HF_RND1 |
695 |
|
movq mm0, mm2 |
696 |
|
lea ecx,[ecx+2*edx] |
697 |
|
ADD_8_HF_RND1 |
698 |
|
movq mm0, mm2 |
699 |
|
lea ecx,[ecx+2*edx] |
700 |
|
ADD_8_HF_RND1 |
701 |
|
EPILOG |
702 |
|
.endfunc |
703 |
|
|
704 |
|
; The trick is to correct the result of 'pavgb' with some combination of the |
705 |
|
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). |
706 |
|
; The boolean relations are: |
707 |
|
; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st |
708 |
|
; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st |
709 |
|
; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st |
710 |
|
; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st |
711 |
|
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t. |
712 |
|
|
713 |
|
; Moreover, we process 2 lines at a times, for better overlapping (~15% faster). |
714 |
|
|
715 |
|
;=========================================================================== |
716 |
|
; |
717 |
|
; void interpolate8x8_halfpel_hv_add_xmm(uint8_t * const dst, |
718 |
|
; const uint8_t * const src, |
719 |
|
; const uint32_t stride, |
720 |
|
; const uint32_t rounding); |
721 |
|
; |
722 |
|
; |
723 |
|
;=========================================================================== |
724 |
|
|
725 |
|
%macro ADD_HH_RND0 0 |
726 |
|
lea eax,[eax+edx] |
727 |
|
|
728 |
|
movq mm0, [eax] |
729 |
|
movq mm1, [eax+1] |
730 |
|
|
731 |
|
movq mm6, mm0 |
732 |
|
pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step |
733 |
|
lea eax,[eax+edx] |
734 |
|
pxor mm1, mm6 ; mm1=(j^k). preserved for next step |
735 |
|
|
736 |
|
por mm3, mm1 ; ij |= jk |
737 |
|
movq mm6, mm2 |
738 |
|
pxor mm6, mm0 ; mm6 = s^t |
739 |
|
pand mm3, mm6 ; (ij|jk) &= st |
740 |
|
pavgb mm2, mm0 ; mm2 = (s+t+1)/2 |
741 |
|
pand mm3, mm7 ; mask lsb |
742 |
|
psubb mm2, mm3 ; apply. |
743 |
|
|
744 |
|
pavgb mm2, [ecx] |
745 |
|
movq [ecx], mm2 |
746 |
|
|
747 |
|
movq mm2, [eax] |
748 |
|
movq mm3, [eax+1] |
749 |
|
movq mm6, mm2 |
750 |
|
pavgb mm2, mm3 ; preserved for next iteration |
751 |
|
lea ecx,[ecx+edx] |
752 |
|
pxor mm3, mm6 ; preserved for next iteration |
753 |
|
|
754 |
|
por mm1, mm3 |
755 |
|
movq mm6, mm0 |
756 |
|
pxor mm6, mm2 |
757 |
|
pand mm1, mm6 |
758 |
|
pavgb mm0, mm2 |
759 |
|
|
760 |
|
pand mm1, mm7 |
761 |
|
psubb mm0, mm1 |
762 |
|
|
763 |
|
pavgb mm0, [ecx] |
764 |
|
movq [ecx], mm0 |
765 |
|
%endmacro |
766 |
|
|
767 |
|
%macro ADD_HH_RND1 0 |
768 |
|
lea eax,[eax+edx] |
769 |
|
|
770 |
|
movq mm0, [eax] |
771 |
|
movq mm1, [eax+1] |
772 |
|
|
773 |
|
movq mm6, mm0 |
774 |
|
pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step |
775 |
|
lea eax,[eax+edx] |
776 |
|
pxor mm1, mm6 ; mm1=(j^k). preserved for next step |
777 |
|
|
778 |
|
pand mm3, mm1 |
779 |
|
movq mm6, mm2 |
780 |
|
pxor mm6, mm0 |
781 |
|
por mm3, mm6 |
782 |
|
pavgb mm2, mm0 |
783 |
|
pand mm3, mm7 |
784 |
|
psubb mm2, mm3 |
785 |
|
|
786 |
|
pavgb mm2, [ecx] |
787 |
|
movq [ecx], mm2 |
788 |
|
|
789 |
|
movq mm2, [eax] |
790 |
|
movq mm3, [eax+1] |
791 |
|
movq mm6, mm2 |
792 |
|
pavgb mm2, mm3 ; preserved for next iteration |
793 |
|
lea ecx,[ecx+edx] |
794 |
|
pxor mm3, mm6 ; preserved for next iteration |
795 |
|
|
796 |
|
pand mm1, mm3 |
797 |
|
movq mm6, mm0 |
798 |
|
pxor mm6, mm2 |
799 |
|
por mm1, mm6 |
800 |
|
pavgb mm0, mm2 |
801 |
|
pand mm1, mm7 |
802 |
|
psubb mm0, mm1 |
803 |
|
|
804 |
|
pavgb mm0, [ecx] |
805 |
|
movq [ecx], mm0 |
806 |
|
%endmacro |
807 |
|
|
808 |
|
ALIGN 16 |
809 |
|
interpolate8x8_halfpel_hv_add_xmm: |
810 |
|
PROLOG1 |
811 |
|
|
812 |
|
movq mm7, [mmx_one] |
813 |
|
|
814 |
|
; loop invariants: mm2=(i+j+1)/2 and mm3= i^j |
815 |
|
movq mm2, [eax] |
816 |
|
movq mm3, [eax+1] |
817 |
|
movq mm6, mm2 |
818 |
|
pavgb mm2, mm3 |
819 |
|
pxor mm3, mm6 ; mm2/mm3 ready |
820 |
|
|
821 |
|
jnz near .Loop1 |
822 |
|
|
823 |
|
ADD_HH_RND0 |
824 |
|
add ecx, edx |
825 |
|
ADD_HH_RND0 |
826 |
|
add ecx, edx |
827 |
|
ADD_HH_RND0 |
828 |
|
add ecx, edx |
829 |
|
ADD_HH_RND0 |
830 |
|
EPILOG |
831 |
|
|
832 |
|
.Loop1 |
833 |
|
ADD_HH_RND1 |
834 |
|
add ecx, edx |
835 |
|
ADD_HH_RND1 |
836 |
|
add ecx, edx |
837 |
|
ADD_HH_RND1 |
838 |
|
add ecx, edx |
839 |
|
ADD_HH_RND1 |
840 |
|
|
841 |
|
EPILOG |
842 |
|
.endfunc |
843 |
|
|