--- sad_3dne.asm 2002/12/19 00:40:19 1.1 +++ sad_3dne.asm 2003/02/15 15:22:19 1.2 @@ -0,0 +1,464 @@ +;/************************************************************************** +; * +; * XVID MPEG-4 VIDEO CODEC +; * xmm sum of absolute difference +; * +; * This program is an implementation of a part of one or more MPEG-4 +; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending +; * to use this software module in hardware or software products are +; * advised that its use may infringe existing patents or copyrights, and +; * any such use would be at such party's own risk. The original +; * developer of this software module and his/her company, and subsequent +; * editors and their companies, will have no liability for use of this +; * software or modifications or derivatives thereof. +; * +; * This program is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 2 of the License, or +; * (at your option) any later version. +; * +; * This program is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. +; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +; * +; *************************************************************************/ +; +; these 3dne functions are compatible with iSSE, but are optimized specifically for +; K7 pipelines +; +;------------------------------------------------------------------------------ +; 09.12.2002 Athlon optimizations contributed by Jaan Kalda +;------------------------------------------------------------------------------ + +bits 32 + +%macro cglobal 1 + %ifdef PREFIX + global _%1 + %define %1 _%1 + %else + global %1 + %endif +%endmacro + +%ifdef FORMAT_COFF +section .data data +%else +section .data data align=16 +%endif + +align 16 +mmx_one times 4 dw 1 + +section .text + +cglobal sad16_3dne +cglobal sad8_3dne +cglobal sad16bi_3dne +cglobal sad8bi_3dne +cglobal dev16_3dne + +;=========================================================================== +; +; uint32_t sad16_3dne(const uint8_t * const cur, +; const uint8_t * const ref, +; const uint32_t stride, +; const uint32_t best_sad); +; +;=========================================================================== +; optimization: 21% faster +%macro SAD_16x16_SSE 1 + movq mm7, [eax] + movq mm6, [eax+8] + psadbw mm7, [edx] + psadbw mm6, [edx+8] +%if (%1) + paddd mm1,mm5 +%endif + movq mm5, [eax+ecx] + movq mm4, [eax+ecx+8] + psadbw mm5, [edx+ecx] + psadbw mm4, [edx+ecx+8] + movq mm3, [eax+2*ecx] + movq mm2, [eax+2*ecx+8] + psadbw mm3, [edx+2*ecx] + psadbw mm2, [edx+2*ecx+8] +%if (%1) + movd [esp+4*(%1-1)],mm1 +%else + sub esp,byte 12 +%endif + movq mm1, [eax+ebx] + movq mm0, [eax+ebx+8] + psadbw mm1, [edx+ebx] + psadbw mm0, [edx+ebx+8] + lea eax,[eax+4*ecx] + lea edx,[edx+4*ecx] + paddd mm7,mm6 + paddd mm5,mm4 + paddd mm3,mm2 + paddd mm1,mm0 + paddd mm5,mm7 + paddd mm1,mm3 +%endmacro + +align 16 +sad16_3dne: + + mov eax, [esp+ 4] ; Src1 + mov edx, [esp+ 8] ; Src2 + mov ecx, [esp+12] ; Stride + push ebx + lea ebx,[2*ecx+ecx] + SAD_16x16_SSE 0 + SAD_16x16_SSE 1 + SAD_16x16_SSE 2 + SAD_16x16_SSE 3 + mov ecx,[esp] + add ecx,[esp+4] + add 
ecx,[esp+8] + paddd mm1,mm5 + mov ebx,[esp+12] + add esp,byte 4+12 + movd eax, mm1 + add eax,ecx + ret + + +;=========================================================================== +; +; uint32_t sad8_3dne(const uint8_t * const cur, +; const uint8_t * const ref, +; const uint32_t stride); +; +;=========================================================================== +align 16 +sad8_3dne: + + mov eax, [esp+ 4] ; Src1 + mov ecx, [esp+12] ; Stride + mov edx, [esp+ 8] ; Src2 + push ebx + lea ebx, [ecx+2*ecx] + + movq mm0, [byte eax] ;0 + psadbw mm0, [byte edx] + movq mm1, [eax+ecx] ;1 + psadbw mm1, [edx+ecx] + + movq mm2, [eax+2*ecx] ;2 + psadbw mm2, [edx+2*ecx] + movq mm3, [eax+ebx] ;3 + psadbw mm3, [edx+ebx] + + paddd mm0,mm1 + + movq mm4, [byte eax+4*ecx] ;4 + psadbw mm4, [edx+4*ecx] + movq mm5, [eax+2*ebx] ;6 + psadbw mm5, [edx+2*ebx] + + paddd mm2,mm3 + paddd mm0,mm2 + + lea ebx, [ebx+4*ecx] ;3+4=7 + lea ecx,[ecx+4*ecx] ; 5 + movq mm6, [eax+ecx] ;5 + psadbw mm6, [edx+ecx] + movq mm7, [eax+ebx] ;7 + psadbw mm7, [edx+ebx] + paddd mm4,mm5 + paddd mm6,mm7 + paddd mm0,mm4 + mov ebx,[esp] + add esp,byte 4 + paddd mm0,mm6 + movd eax, mm0 + + ret + + +;=========================================================================== +; +; uint32_t sad16bi_3dne(const uint8_t * const cur, +; const uint8_t * const ref1, +; const uint8_t * const ref2, +; const uint32_t stride); +; +;=========================================================================== +;optimization: 14% faster +%macro SADBI_16x16_SSE0 0 + movq mm2, [edx] + movq mm3, [edx+8] + + movq mm5, [byte eax] + movq mm6, [eax+8] + pavgb mm2, [byte ebx] + pavgb mm3, [ebx+8] + + add edx, ecx + psadbw mm5, mm2 + psadbw mm6, mm3 + + add eax, ecx + add ebx, ecx + movq mm2, [byte edx] + + movq mm3, [edx+8] + movq mm0, [byte eax] + + movq mm1, [eax+8] + pavgb mm2, [byte ebx] + + pavgb mm3, [ebx+8] + add edx, ecx + add eax, ecx + + add ebx, ecx + psadbw mm0, mm2 + psadbw mm1, mm3 + +%endmacro +%macro SADBI_16x16_SSE 0 + movq mm2, [byte edx] + movq mm3, [edx+8] + paddusw mm5,mm0 + paddusw mm6,mm1 + movq mm0, [eax] + movq mm1, [eax+8] + pavgb mm2, [ebx] + pavgb mm3, [ebx+8] + add edx, ecx + add eax, ecx + add ebx, ecx + psadbw mm0, mm2 + psadbw mm1, mm3 +%endmacro + +align 16 +sad16bi_3dne: + mov eax, [esp+ 4] ; Src + mov edx, [esp+ 8] ; Ref1 + mov ecx, [esp+16] ; Stride + push ebx + mov ebx, [esp+4+12] ; Ref2 + + SADBI_16x16_SSE0 + SADBI_16x16_SSE + SADBI_16x16_SSE + SADBI_16x16_SSE + SADBI_16x16_SSE + SADBI_16x16_SSE + SADBI_16x16_SSE + + SADBI_16x16_SSE + SADBI_16x16_SSE + SADBI_16x16_SSE + SADBI_16x16_SSE + SADBI_16x16_SSE + SADBI_16x16_SSE + SADBI_16x16_SSE + SADBI_16x16_SSE + paddusw mm5,mm0 + paddusw mm6,mm1 + + pop ebx + paddusw mm6,mm5 + movd eax, mm6 + ret +;=========================================================================== +; +; uint32_t sad8bi_3dne(const uint8_t * const cur, +; const uint8_t * const ref1, +; const uint8_t * const ref2, +; const uint32_t stride); +; +;=========================================================================== + +%macro SADBI_8x8_3dne 0 + movq mm2, [edx] + movq mm3, [edx+ecx] + pavgb mm2, [eax] + pavgb mm3, [eax+ecx] + lea edx, [edx+2*ecx] + lea eax, [eax+2*ecx] + paddusw mm5,mm0 + paddusw mm6,mm1 + movq mm0, [ebx] + movq mm1, [ebx+ecx] + lea ebx, [ebx+2*ecx] + psadbw mm0, mm2 + psadbw mm1, mm3 +%endmacro + +align 16 +sad8bi_3dne: + mov eax, [esp+12] ; Ref2 + mov edx, [esp+ 8] ; Ref1 + mov ecx, [esp+16] ; Stride + push ebx + mov ebx, [esp+4+ 4] ; Src + + movq mm2, [edx] + movq mm3, [edx+ecx] + 
pavgb mm2, [eax] + pavgb mm3, [eax+ecx] + lea edx, [edx+2*ecx] + lea eax, [eax+2*ecx] + movq mm5, [ebx] + movq mm6, [ebx+ecx] + lea ebx, [ebx+2*ecx] + psadbw mm5, mm2 + psadbw mm6, mm3 + + movq mm2, [edx] + movq mm3, [edx+ecx] + pavgb mm2, [eax] + pavgb mm3, [eax+ecx] + lea edx, [edx+2*ecx] + lea eax, [eax+2*ecx] + movq mm0, [ebx] + movq mm1, [ebx+ecx] + lea ebx, [ebx+2*ecx] + psadbw mm0, mm2 + psadbw mm1, mm3 + + movq mm2, [edx] + movq mm3, [edx+ecx] + pavgb mm2, [eax] + pavgb mm3, [eax+ecx] + lea edx, [edx+2*ecx] + lea eax, [eax+2*ecx] + paddusw mm5,mm0 + paddusw mm6,mm1 + movq mm0, [ebx] + movq mm1, [ebx+ecx] + lea ebx, [ebx+2*ecx] + psadbw mm0, mm2 + psadbw mm1, mm3 + + movq mm2, [edx] + movq mm3, [edx+ecx] + pavgb mm2, [eax] + pavgb mm3, [eax+ecx] + paddusw mm5,mm0 + paddusw mm6,mm1 + movq mm0, [ebx] + movq mm1, [ebx+ecx] + psadbw mm0, mm2 + psadbw mm1, mm3 + paddusw mm5,mm0 + paddusw mm6,mm1 + + paddusw mm6,mm5 + mov ebx,[esp] + add esp,byte 4 + movd eax, mm6 + ret + + +;=========================================================================== +; +; uint32_t dev16_3dne(const uint8_t * const cur, +; const uint32_t stride); +; +;=========================================================================== +; optimization: 25 % faster +%macro ABS_16x16_SSE 1 +%if (%1 == 0) + movq mm7, [eax] + psadbw mm7, mm4 + mov esi,esi + movq mm6, [eax+8] + movq mm5, [eax+ecx] + movq mm3, [eax+ecx+8] + psadbw mm6, mm4 + + movq mm2, [byte eax+2*ecx] + psadbw mm5, mm4 + movq mm1, [eax+2*ecx+8] + psadbw mm3, mm4 + + movq mm0, [dword eax+edx] + psadbw mm2, mm4 + add eax,edx + psadbw mm1, mm4 +%endif +%if (%1 == 1) + psadbw mm0, mm4 + paddd mm7, mm0 + movq mm0, [eax+8] + psadbw mm0, mm4 + paddd mm6, mm0 + + movq mm0, [byte eax+ecx] + psadbw mm0, mm4 + + paddd mm5, mm0 + movq mm0, [eax+ecx+8] + + psadbw mm0, mm4 + paddd mm3, mm0 + movq mm0, [eax+2*ecx] + psadbw mm0, mm4 + paddd mm2, mm0 + + movq mm0, [eax+2*ecx+8] + add eax,edx + psadbw mm0, mm4 + paddd mm1, mm0 + movq mm0, [eax] +%endif +%if (%1 == 2) + psadbw mm0, mm4 + paddd mm7, mm0 + movq mm0, [eax+8] + psadbw mm0, mm4 + paddd mm6, mm0 +%endif +%endmacro + +align 16 +dev16_3dne: + + mov eax, [esp+ 4] ; Src + mov ecx, [esp+ 8] ; Stride + lea edx,[ecx+2*ecx] + + pxor mm4, mm4 +align 8 + ABS_16x16_SSE 0 + ABS_16x16_SSE 1 + ABS_16x16_SSE 1 + ABS_16x16_SSE 1 + ABS_16x16_SSE 1 + paddd mm1, mm2 + paddd mm3, mm5 + ABS_16x16_SSE 2 + paddd mm7, mm6 + paddd mm1, mm3 + mov eax, [esp+ 4] ; Src + paddd mm7,mm1 + punpcklbw mm7,mm7 ;xxyyaazz + pshufw mm4,mm7,055h + ; mm4 contains the mean + pxor mm1, mm1 + + ABS_16x16_SSE 0 + ABS_16x16_SSE 1 + ABS_16x16_SSE 1 + ABS_16x16_SSE 1 + ABS_16x16_SSE 1 + paddd mm1, mm2 + paddd mm3, mm5 + ABS_16x16_SSE 2 + paddd mm7, mm6 + paddd mm1, mm3 + paddd mm7,mm1 + movd eax, mm7 + ret \ No newline at end of file
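
For reference, a plain-C sketch of the quantity sad16_3dne and sad8_3dne compute: the sum of absolute
differences between a current block and a reference block. The sad_c name and the size parameter are
illustrative only, not part of the XviD sources. Note also that sad16_3dne accepts a best_sad argument
for interface compatibility, but the listing above never reads [esp+16], so it does no early termination.

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative scalar reference for sad16_3dne / sad8_3dne: sum of
     * absolute differences between two strided blocks.  size = 16 models
     * sad16_3dne, size = 8 models sad8_3dne.  Name and prototype are
     * examples only, not part of the XviD API shown above. */
    static uint32_t
    sad_c(const uint8_t *cur, const uint8_t *ref, uint32_t stride, int size)
    {
        uint32_t sad = 0;
        int x, y;

        for (y = 0; y < size; y++)
            for (x = 0; x < size; x++)
                sad += abs((int) cur[y * stride + x] - (int) ref[y * stride + x]);

        return sad;
    }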
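
The bi-directional variants sad16bi_3dne and sad8bi_3dne first interpolate the two references with
pavgb, which rounds upward ((a + b + 1) >> 1), and then take the SAD of the current block against that
rounded average. Continuing the illustrative C reference (same headers as above):

    /* Illustrative scalar reference for sad16bi_3dne / sad8bi_3dne:
     * SAD against the pavgb-style rounded average of two references. */
    static uint32_t
    sadbi_c(const uint8_t *cur, const uint8_t *ref1, const uint8_t *ref2,
            uint32_t stride, int size)
    {
        uint32_t sad = 0;
        int x, y;

        for (y = 0; y < size; y++)
            for (x = 0; x < size; x++) {
                int avg = (ref1[y * stride + x] + ref2[y * stride + x] + 1) >> 1;
                sad += abs((int) cur[y * stride + x] - avg);
            }

        return sad;
    }

The assembly accumulates with paddusw, but a 16x16 SAD can reach at most 255*256 = 65280, so the
unsigned saturation never triggers and the result matches the plain sum above.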
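
dev16_3dne works in two passes: the first pass uses psadbw against zero to obtain the plain pixel sum,
and the punpcklbw/pshufw pair then broadcasts the high byte of that sum (sum >> 8, i.e. the truncated
mean of the 256 pixels) into every byte of mm4; the second pass accumulates psadbw against that
broadcast mean. An equivalent scalar sketch (illustrative only, reusing the headers above):

    /* Illustrative scalar reference for dev16_3dne: sum of absolute
     * deviations of a 16x16 block from its truncated mean. */
    static uint32_t
    dev16_c(const uint8_t *cur, uint32_t stride)
    {
        uint32_t sum = 0, dev = 0, mean;
        int x, y;

        for (y = 0; y < 16; y++)
            for (x = 0; x < 16; x++)
                sum += cur[y * stride + x];

        mean = sum >> 8;        /* 256 pixels, truncating divide as in the asm */

        for (y = 0; y < 16; y++)
            for (x = 0; x < 16; x++)
                dev += abs((int) cur[y * stride + x] - (int) mean);

        return dev;
    }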