270 |
cglobal sad8bi_mmx |
cglobal sad8bi_mmx |
271 |
cglobal dev16_mmx |
cglobal dev16_mmx |
272 |
cglobal sse8_16bit_mmx |
cglobal sse8_16bit_mmx |
273 |
|
cglobal sse8_8bit_mmx |
274 |
|
|
275 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
276 |
; |
; |
631 |
; |
; |
632 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
633 |
|
|
634 |
%macro ROW_SSE_MMX 2 |
%macro ROW_SSE_16bit_MMX 2 |
635 |
movq mm0, [%1] |
movq mm0, [%1] |
636 |
movq mm1, [%1+8] |
movq mm1, [%1+8] |
637 |
psubw mm0, [%2] |
psubw mm0, [%2] |
655 |
pxor mm2, mm2 |
pxor mm2, mm2 |
656 |
|
|
657 |
;; Let's go |
;; Let's go |
658 |
ROW_SSE_MMX esi, edi |
%rep 8 |
659 |
lea esi, [esi+edx] |
ROW_SSE_16bit_MMX esi, edi |
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
660 |
lea esi, [esi+edx] |
lea esi, [esi+edx] |
661 |
lea edi, [edi+edx] |
lea edi, [edi+edx] |
662 |
|
%endrep |
663 |
|
|
664 |
;; Finish adding each dword of the accumulator |
;; Finish adding each dword of the accumulator |
665 |
movq mm3, mm2 |
movq mm3, mm2 |
671 |
pop edi |
pop edi |
672 |
pop esi |
pop esi |
673 |
ret |
ret |
674 |
|
|
675 |
|
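;-----------------------------------------------------------------------------
;
; uint32_t sse8_8bit_mmx(const int8_t *b1,
;                        const int8_t *b2,
;                        const uint32_t stride);
;
; NOTE(review): the implementation zero-extends every byte before
; subtracting (punpcklbw/punpckhbw against a zeroed register), i.e. the
; samples are treated as unsigned. Confirm whether the pointers should be
; declared as uint8_t instead of int8_t.
;
;-----------------------------------------------------------------------------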
682 |
|
|
683 |
|
;; ROW_SSE_8bit_MMX ptr1, ptr2
;;
;; Processes one row of 8 bytes from [ptr1] and 8 bytes from [ptr2]:
;; every byte is widened to a 16bit word, the two rows are subtracted
;; word by word, the differences are squared and the squares are added
;; to the two dword lanes of mm6.
;;
;; Expects : mm7 == 0 (used as the high byte when widening)
;; Updates : mm6 (running sum, two dword lanes)
;; Scratch : mm0, mm1, mm2, mm3
%macro ROW_SSE_8bit_MMX 2
    ;; first operand row -> words (mm0 = bytes 0..3, mm1 = bytes 4..7)
    movq      mm0, [%1]
    movq      mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7

    ;; second operand row -> words (mm2 = bytes 0..3, mm3 = bytes 4..7)
    movq      mm2, [%2]
    movq      mm3, mm2
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7

    ;; low half: difference, then pmaddwd of a register with itself
    ;; squares each word and adds adjacent pairs into dwords
    psubw     mm0, mm2
    pmaddwd   mm0, mm0
    paddd     mm6, mm0

    ;; high half: same treatment
    psubw     mm1, mm3
    pmaddwd   mm1, mm1
    paddd     mm6, mm1
%endmacro
705 |
|
|
706 |
|
;; sse8_8bit_mmx
;;
;; Sums the squared byte differences of two blocks, 8 rows of 8 bytes each.
;;
;; Arguments are read from the stack; the "+8" in every offset skips the
;; two registers pushed in the prologue:
;;   [esp+8+4]  -> esi : pointer to the first block
;;   [esp+8+8]  -> edi : pointer to the second block
;;   [esp+8+12] -> edx : value added to both pointers after each row
;;
;; Returns  : eax = accumulated sum
;; Preserves: esi, edi (pushed/popped here)
;; Modifies : eax, edx, mm0-mm3, mm6, mm7
;; Note     : no emms is executed in this routine.
sse8_8bit_mmx:
    push esi
    push edi

    ;; Fetch the three stack arguments
    mov  edx, [esp+8+12]            ; row increment
    mov  edi, [esp+8+8]             ; second block
    mov  esi, [esp+8+4]             ; first block

    ;; mm7 = 0 : supplies the zero high bytes when rows are widened
    pxor mm7, mm7

    ;; mm6 = 0 : running sum of squares (two dword lanes)
    pxor mm6, mm6

    ;; 8 rows, fully unrolled at assembly time
%rep 8
    ROW_SSE_8bit_MMX esi, edi
    lea  edi, [edi+edx]             ; advance second block by one row
    lea  esi, [esi+edx]             ; advance first block by one row
%endrep

    ;; Fold the two dword lanes of the accumulator into one result
    movq  mm7, mm6                  ; keep a copy (low lane is what matters)
    psrlq mm6, 32                   ; bring the high lane down
    paddd mm6, mm7                  ; low dword = high lane + low lane
    movd  eax, mm6                  ; return value

    ;; Restore callee-saved registers and leave
    pop  edi
    pop  esi
    ret