--- interpolate8x8_mmx.asm 2002/09/06 17:48:58 1.9 +++ interpolate8x8_mmx.asm 2005/09/13 12:12:15 1.18 @@ -1,299 +1,1369 @@ ;/***************************************************************************** ; * ; * XVID MPEG-4 VIDEO CODEC -; * mmx 8x8 block-based halfpel interpolation +; * - mmx 8x8 block-based halfpel interpolation - ; * -; * Copyright(C) 2002 Peter Ross -; * Copyright(C) 2002 Michael Militzer +; * Copyright(C) 2001 Peter Ross +; * 2002 Michael Militzer ; * -; * This program is an implementation of a part of one or more MPEG-4 -; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending -; * to use this software module in hardware or software products are -; * advised that its use may infringe existing patents or copyrights, and -; * any such use would be at such party's own risk. The original -; * developer of this software module and his/her company, and subsequent -; * editors and their companies, will have no liability for use of this -; * software or modifications or derivatives thereof. -; * -; * This program is free software; you can redistribute it and/or modify +; * This program is free software ; you can redistribute it and/or modify ; * it under the terms of the GNU General Public License as published by -; * the Free Software Foundation; either version 2 of the License, or +; * the Free Software Foundation ; either version 2 of the License, or ; * (at your option) any later version. ; * ; * This program is distributed in the hope that it will be useful, -; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * but WITHOUT ANY WARRANTY ; without even the implied warranty of ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ; * GNU General Public License for more details. ; * ; * You should have received a copy of the GNU General Public License -; * along with this program; if not, write to the Free Software +; * along with this program ; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * ; ****************************************************************************/ -bits 32 +BITS 32 -%macro cglobal 1 +%macro cglobal 1 %ifdef PREFIX - global _%1 - %define %1 _%1 + %ifdef MARK_FUNCS + global _%1:function %1.endfunc-%1 + %define %1 _%1:function %1.endfunc-%1 + %else + global _%1 + %define %1 _%1 + %endif %else - global %1 + %ifdef MARK_FUNCS + global %1:function %1.endfunc-%1 + %else + global %1 + %endif %endif %endmacro -section .data - -align 16 +;============================================================================= +; Read only data +;============================================================================= + +%ifdef FORMAT_COFF +SECTION .rodata +%else +SECTION .rodata align=16 +%endif + +;----------------------------------------------------------------------------- +; (16 - r) rounding table +;----------------------------------------------------------------------------- + +ALIGN 16 +rounding_lowpass_mmx: + times 4 dw 16 + times 4 dw 15 -;=========================================================================== +;----------------------------------------------------------------------------- ; (1 - r) rounding table -;=========================================================================== +;----------------------------------------------------------------------------- -rounding1_mmx -times 4 dw 1 -times 4 dw 0 +rounding1_mmx: + times 4 dw 1 + times 4 dw 0 -;=========================================================================== -; (2 - r) 
rounding table -;=========================================================================== +;----------------------------------------------------------------------------- +; (2 - r) rounding table +;----------------------------------------------------------------------------- + +rounding2_mmx: + times 4 dw 2 + times 4 dw 1 + +mmx_one: + times 8 db 1 + +mmx_two: + times 8 db 2 + +mmx_three: + times 8 db 3 -rounding2_mmx -times 4 dw 2 -times 4 dw 1 +mmx_five: + times 4 dw 5 -mmx_one -times 8 db 1 +mmx_mask: + times 8 db 254 -section .text +mmx_mask2: + times 8 db 252 + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +cglobal interpolate8x8_halfpel_h_mmx +cglobal interpolate8x8_halfpel_v_mmx +cglobal interpolate8x8_halfpel_hv_mmx + +cglobal interpolate8x4_halfpel_h_mmx +cglobal interpolate8x4_halfpel_v_mmx +cglobal interpolate8x4_halfpel_hv_mmx + +cglobal interpolate8x8_avg4_mmx +cglobal interpolate8x8_avg2_mmx + +cglobal interpolate8x8_6tap_lowpass_h_mmx +cglobal interpolate8x8_6tap_lowpass_v_mmx + +cglobal interpolate8x8_halfpel_add_mmx +cglobal interpolate8x8_halfpel_h_add_mmx +cglobal interpolate8x8_halfpel_v_add_mmx +cglobal interpolate8x8_halfpel_hv_add_mmx %macro CALC_AVG 6 - punpcklbw %3, %6 - punpckhbw %4, %6 + punpcklbw %3, %6 + punpckhbw %4, %6 - paddusw %1, %3 ; mm01 += mm23 - paddusw %2, %4 - paddusw %1, %5 ; mm01 += rounding - paddusw %2, %5 - - psrlw %1, 1 ; mm01 >>= 1 - psrlw %2, 1 + paddusw %1, %3 ; mm01 += mm23 + paddusw %2, %4 + paddusw %1, %5 ; mm01 += rounding + paddusw %2, %5 + psrlw %1, 1 ; mm01 >>= 1 + psrlw %2, 1 %endmacro -;=========================================================================== +;----------------------------------------------------------------------------- ; ; void interpolate8x8_halfpel_h_mmx(uint8_t * const dst, -; const uint8_t * const src, -; const uint32_t stride, -; const uint32_t rounding); +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); ; -;=========================================================================== +;----------------------------------------------------------------------------- %macro COPY_H_MMX 0 - movq mm0, [esi] - movq mm2, [esi + 1] - movq mm1, mm0 - movq mm3, mm2 + movq mm0, [esi] + movq mm2, [esi + 1] + movq mm1, mm0 + movq mm3, mm2 - punpcklbw mm0, mm6 ; mm01 = [src] - punpckhbw mm1, mm6 ; mm23 = [src + 1] + punpcklbw mm0, mm6 ; mm01 = [src] + punpckhbw mm1, mm6 ; mm23 = [src + 1] - CALC_AVG mm0, mm1, mm2, mm3, mm7, mm6 + CALC_AVG mm0, mm1, mm2, mm3, mm7, mm6 - packuswb mm0, mm1 - movq [edi], mm0 ; [dst] = mm01 + packuswb mm0, mm1 + movq [edi], mm0 ; [dst] = mm01 - add esi, edx ; src += stride - add edi, edx ; dst += stride + add esi, edx ; src += stride + add edi, edx ; dst += stride %endmacro -align 16 -cglobal interpolate8x8_halfpel_h_mmx -interpolate8x8_halfpel_h_mmx - - push esi - push edi +ALIGN 16 +interpolate8x8_halfpel_h_mmx: - mov eax, [esp + 8 + 16] ; rounding + push esi + push edi + mov eax, [esp + 8 + 16] ; rounding -interpolate8x8_halfpel_h_mmx.start - movq mm7, [rounding1_mmx + eax * 8] + movq mm7, [rounding1_mmx + eax * 8] - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src - mov edx, [esp + 8 + 12] ; stride + mov edi, [esp + 8 + 4] ; dst + mov esi, [esp + 8 + 8] ; src + mov edx, [esp + 8 + 12] ; stride - pxor mm6, mm6 ; zero + pxor mm6, mm6 ; zero - COPY_H_MMX - COPY_H_MMX - COPY_H_MMX - COPY_H_MMX - COPY_H_MMX - 
COPY_H_MMX - COPY_H_MMX - COPY_H_MMX + COPY_H_MMX + COPY_H_MMX + COPY_H_MMX + COPY_H_MMX + COPY_H_MMX + COPY_H_MMX + COPY_H_MMX + COPY_H_MMX - pop edi - pop esi + pop edi + pop esi - ret + ret +.endfunc -;=========================================================================== +;----------------------------------------------------------------------------- ; ; void interpolate8x8_halfpel_v_mmx(uint8_t * const dst, -; const uint8_t * const src, -; const uint32_t stride, -; const uint32_t rounding); +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); ; -;=========================================================================== +;----------------------------------------------------------------------------- %macro COPY_V_MMX 0 - movq mm0, [esi] - movq mm2, [esi + edx] - movq mm1, mm0 - movq mm3, mm2 + movq mm0, [esi] + movq mm2, [esi + edx] + movq mm1, mm0 + movq mm3, mm2 - punpcklbw mm0, mm6 ; mm01 = [src] - punpckhbw mm1, mm6 ; mm23 = [src + 1] + punpcklbw mm0, mm6 ; mm01 = [src] + punpckhbw mm1, mm6 ; mm23 = [src + 1] - CALC_AVG mm0, mm1, mm2, mm3, mm7, mm6 + CALC_AVG mm0, mm1, mm2, mm3, mm7, mm6 - packuswb mm0, mm1 - movq [edi], mm0 ; [dst] = mm01 + packuswb mm0, mm1 + movq [edi], mm0 ; [dst] = mm01 - add esi, edx ; src += stride - add edi, edx ; dst += stride + add esi, edx ; src += stride + add edi, edx ; dst += stride %endmacro -align 16 -cglobal interpolate8x8_halfpel_v_mmx -interpolate8x8_halfpel_v_mmx +ALIGN 16 +interpolate8x8_halfpel_v_mmx: - push esi - push edi + push esi + push edi - mov eax, [esp + 8 + 16] ; rounding + mov eax, [esp + 8 + 16] ; rounding -interpolate8x8_halfpel_v_mmx.start - movq mm7, [rounding1_mmx + eax * 8] + movq mm7, [rounding1_mmx + eax * 8] - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src - mov edx, [esp + 8 + 12] ; stride + mov edi, [esp + 8 + 4] ; dst + mov esi, [esp + 8 + 8] ; src + mov edx, [esp + 8 + 12] ; stride - pxor mm6, mm6 ; zero + pxor mm6, mm6 ; zero - - COPY_V_MMX - COPY_V_MMX - COPY_V_MMX - COPY_V_MMX - COPY_V_MMX - COPY_V_MMX - COPY_V_MMX - COPY_V_MMX - pop edi - pop esi + COPY_V_MMX + COPY_V_MMX + COPY_V_MMX + COPY_V_MMX + COPY_V_MMX + COPY_V_MMX + COPY_V_MMX + COPY_V_MMX - ret + pop edi + pop esi + ret +.endfunc -;=========================================================================== + +;----------------------------------------------------------------------------- ; ; void interpolate8x8_halfpel_hv_mmx(uint8_t * const dst, -; const uint8_t * const src, -; const uint32_t stride, -; const uint32_t rounding); +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); ; ; -;=========================================================================== +;----------------------------------------------------------------------------- %macro COPY_HV_MMX 0 - ; current row + ; current row + movq mm0, [esi] + movq mm2, [esi + 1] + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 ; mm01 = [src] + punpcklbw mm2, mm6 ; mm23 = [src + 1] + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + + paddusw mm0, mm2 ; mm01 += mm23 + paddusw mm1, mm3 + + ; next row + movq mm4, [esi + edx] + movq mm2, [esi + edx + 1] + + movq mm5, mm4 + movq mm3, mm2 + + punpcklbw mm4, mm6 ; mm45 = [src + stride] + punpcklbw mm2, mm6 ; mm23 = [src + stride + 1] + punpckhbw mm5, mm6 + punpckhbw mm3, mm6 + + paddusw mm4, mm2 ; mm45 += mm23 + paddusw mm5, mm3 + + ; add current + next row + paddusw mm0, mm4 ; mm01 += mm45 + paddusw mm1, mm5 + paddusw mm0, mm7 ; mm01 += rounding2 + paddusw mm1, mm7 + + psrlw mm0, 
2 ; mm01 >>= 2 + psrlw mm1, 2 + + packuswb mm0, mm1 + movq [edi], mm0 ; [dst] = mm01 + + add esi, edx ; src += stride + add edi, edx ; dst += stride +%endmacro + +ALIGN 16 +interpolate8x8_halfpel_hv_mmx: + + push esi + push edi + + mov eax, [esp + 8 + 16] ; rounding + + movq mm7, [rounding2_mmx + eax * 8] + + mov edi, [esp + 8 + 4] ; dst + mov esi, [esp + 8 + 8] ; src + + mov eax, 8 + + pxor mm6, mm6 ; zero + + mov edx, [esp + 8 + 12] ; stride + + COPY_HV_MMX + COPY_HV_MMX + COPY_HV_MMX + COPY_HV_MMX + COPY_HV_MMX + COPY_HV_MMX + COPY_HV_MMX + COPY_HV_MMX + + pop edi + pop esi + + ret +.endfunc + +;----------------------------------------------------------------------------- +; +; void interpolate8x4_halfpel_h_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +;----------------------------------------------------------------------------- + +ALIGN 16 +interpolate8x4_halfpel_h_mmx: + + push esi + push edi + mov eax, [esp + 8 + 16] ; rounding + + movq mm7, [rounding1_mmx + eax * 8] + + mov edi, [esp + 8 + 4] ; dst + mov esi, [esp + 8 + 8] ; src + mov edx, [esp + 8 + 12] ; stride + + pxor mm6, mm6 ; zero + + COPY_H_MMX + COPY_H_MMX + COPY_H_MMX + COPY_H_MMX + + pop edi + pop esi + + ret +.endfunc + + +;----------------------------------------------------------------------------- +; +; void interpolate8x4_halfpel_v_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +;----------------------------------------------------------------------------- + +ALIGN 16 +interpolate8x4_halfpel_v_mmx: + + push esi + push edi + + mov eax, [esp + 8 + 16] ; rounding + + movq mm7, [rounding1_mmx + eax * 8] + + mov edi, [esp + 8 + 4] ; dst + mov esi, [esp + 8 + 8] ; src + mov edx, [esp + 8 + 12] ; stride + + pxor mm6, mm6 ; zero + + + COPY_V_MMX + COPY_V_MMX + COPY_V_MMX + COPY_V_MMX + + pop edi + pop esi + + ret +.endfunc + + +;----------------------------------------------------------------------------- +; +; void interpolate8x4_halfpel_hv_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +; +;----------------------------------------------------------------------------- + +ALIGN 16 +interpolate8x4_halfpel_hv_mmx: + + push esi + push edi + + mov eax, [esp + 8 + 16] ; rounding - movq mm0, [esi] - movq mm2, [esi + 1] + movq mm7, [rounding2_mmx + eax * 8] - movq mm1, mm0 - movq mm3, mm2 + mov edi, [esp + 8 + 4] ; dst + mov esi, [esp + 8 + 8] ; src - punpcklbw mm0, mm6 ; mm01 = [src] - punpcklbw mm2, mm6 ; mm23 = [src + 1] - punpckhbw mm1, mm6 - punpckhbw mm3, mm6 + mov eax, 8 - paddusw mm0, mm2 ; mm01 += mm23 - paddusw mm1, mm3 + pxor mm6, mm6 ; zero - ; next row + mov edx, [esp + 8 + 12] ; stride - movq mm4, [esi + edx] - movq mm2, [esi + edx + 1] - - movq mm5, mm4 - movq mm3, mm2 - - punpcklbw mm4, mm6 ; mm45 = [src + stride] - punpcklbw mm2, mm6 ; mm23 = [src + stride + 1] - punpckhbw mm5, mm6 - punpckhbw mm3, mm6 + COPY_HV_MMX + COPY_HV_MMX + COPY_HV_MMX + COPY_HV_MMX - paddusw mm4, mm2 ; mm45 += mm23 - paddusw mm5, mm3 + pop edi + pop esi - ; add current + next row + ret +.endfunc - paddusw mm0, mm4 ; mm01 += mm45 - paddusw mm1, mm5 - paddusw mm0, mm7 ; mm01 += rounding2 - paddusw mm1, mm7 - - psrlw mm0, 2 ; mm01 >>= 2 - psrlw mm1, 2 +;----------------------------------------------------------------------------- +; +; void interpolate8x8_avg2_mmx(uint8_t const *dst, +; const uint8_t * const src1, +; const uint8_t * const src2, 
+; const uint32_t stride, +; const uint32_t rounding, +; const uint32_t height); +; +;----------------------------------------------------------------------------- + +%macro AVG2_MMX_RND0 0 + movq mm0, [eax] ; src1 -> mm0 + movq mm1, [ebx] ; src2 -> mm1 + + movq mm4, [eax+edx] + movq mm5, [ebx+edx] + + movq mm2, mm0 ; src1 -> mm2 + movq mm3, mm1 ; src2 -> mm3 + + pand mm2, mm7 ; isolate the lsb + pand mm3, mm7 ; isolate the lsb + + por mm2, mm3 ; ODD(src1) OR ODD(src2) -> mm2 + + movq mm3, mm4 + movq mm6, mm5 + + pand mm3, mm7 + pand mm6, mm7 + + por mm3, mm6 - packuswb mm0, mm1 - movq [edi], mm0 ; [dst] = mm01 + pand mm0, [mmx_mask] + pand mm1, [mmx_mask] + pand mm4, [mmx_mask] + pand mm5, [mmx_mask] - add esi, edx ; src += stride - add edi, edx ; dst += stride + psrlq mm0, 1 ; src1 / 2 + psrlq mm1, 1 ; src2 / 2 + + psrlq mm4, 1 + psrlq mm5, 1 + + paddb mm0, mm1 ; src1/2 + src2/2 -> mm0 + paddb mm0, mm2 ; correct rounding error + + paddb mm4, mm5 + paddb mm4, mm3 + + lea eax, [eax+2*edx] + lea ebx, [ebx+2*edx] + + movq [ecx], mm0 ; (src1 + src2 + 1) / 2 -> dst + movq [ecx+edx], mm4 %endmacro -align 16 -cglobal interpolate8x8_halfpel_hv_mmx -interpolate8x8_halfpel_hv_mmx +%macro AVG2_MMX_RND1 0 + movq mm0, [eax] ; src1 -> mm0 + movq mm1, [ebx] ; src2 -> mm1 + + movq mm4, [eax+edx] + movq mm5, [ebx+edx] + + movq mm2, mm0 ; src1 -> mm2 + movq mm3, mm1 ; src2 -> mm3 + + pand mm2, mm7 ; isolate the lsb + pand mm3, mm7 ; isolate the lsb + + pand mm2, mm3 ; ODD(src1) AND ODD(src2) -> mm2 + + movq mm3, mm4 + movq mm6, mm5 + + pand mm3, mm7 + pand mm6, mm7 + + pand mm3, mm6 + + pand mm0, [mmx_mask] + pand mm1, [mmx_mask] + pand mm4, [mmx_mask] + pand mm5, [mmx_mask] + + psrlq mm0, 1 ; src1 / 2 + psrlq mm1, 1 ; src2 / 2 + + psrlq mm4, 1 + psrlq mm5, 1 + + paddb mm0, mm1 ; src1/2 + src2/2 -> mm0 + paddb mm0, mm2 ; correct rounding error + + paddb mm4, mm5 + paddb mm4, mm3 + + lea eax, [eax+2*edx] + lea ebx, [ebx+2*edx] + + movq [ecx], mm0 ; (src1 + src2 + 1) / 2 -> dst + movq [ecx+edx], mm4 +%endmacro + +ALIGN 16 +interpolate8x8_avg2_mmx: + + push ebx + + mov eax, [esp + 4 + 20] ; rounding + test eax, eax + + jnz near .rounding1 + + mov eax, [esp + 4 + 24] ; height -> eax + sub eax, 8 + test eax, eax + + mov ecx, [esp + 4 + 4] ; dst -> edi + mov eax, [esp + 4 + 8] ; src1 -> esi + mov ebx, [esp + 4 + 12] ; src2 -> eax + mov edx, [esp + 4 + 16] ; stride -> edx + + movq mm7, [mmx_one] + + jz near .start0 + + AVG2_MMX_RND0 + lea ecx, [ecx+2*edx] + +.start0 + + AVG2_MMX_RND0 + lea ecx, [ecx+2*edx] + AVG2_MMX_RND0 + lea ecx, [ecx+2*edx] + AVG2_MMX_RND0 + lea ecx, [ecx+2*edx] + AVG2_MMX_RND0 + + pop ebx + ret + +.rounding1 + mov eax, [esp + 4 + 24] ; height -> eax + sub eax, 8 + test eax, eax + + mov ecx, [esp + 4 + 4] ; dst -> edi + mov eax, [esp + 4 + 8] ; src1 -> esi + mov ebx, [esp + 4 + 12] ; src2 -> eax + mov edx, [esp + 4 + 16] ; stride -> edx + + movq mm7, [mmx_one] + + jz near .start1 + + AVG2_MMX_RND1 + lea ecx, [ecx+2*edx] + +.start1 + + AVG2_MMX_RND1 + lea ecx, [ecx+2*edx] + AVG2_MMX_RND1 + lea ecx, [ecx+2*edx] + AVG2_MMX_RND1 + lea ecx, [ecx+2*edx] + AVG2_MMX_RND1 + + pop ebx + ret +.endfunc + + +;----------------------------------------------------------------------------- +; +; void interpolate8x8_avg4_mmx(uint8_t const *dst, +; const uint8_t * const src1, +; const uint8_t * const src2, +; const uint8_t * const src3, +; const uint8_t * const src4, +; const uint32_t stride, +; const uint32_t rounding); +; +;----------------------------------------------------------------------------- + 
+%macro AVG4_MMX_RND0 0 + movq mm0, [eax] ; src1 -> mm0 + movq mm1, [ebx] ; src2 -> mm1 + + movq mm2, mm0 + movq mm3, mm1 + + pand mm2, [mmx_three] + pand mm3, [mmx_three] + + pand mm0, [mmx_mask2] + pand mm1, [mmx_mask2] + + psrlq mm0, 2 + psrlq mm1, 2 + + lea eax, [eax+edx] + lea ebx, [ebx+edx] + + paddb mm0, mm1 + paddb mm2, mm3 + + movq mm4, [esi] ; src3 -> mm0 + movq mm5, [edi] ; src4 -> mm1 + + movq mm1, mm4 + movq mm3, mm5 + + pand mm1, [mmx_three] + pand mm3, [mmx_three] + + pand mm4, [mmx_mask2] + pand mm5, [mmx_mask2] + + psrlq mm4, 2 + psrlq mm5, 2 + + paddb mm4, mm5 + paddb mm0, mm4 + + paddb mm1, mm3 + paddb mm2, mm1 + + paddb mm2, [mmx_two] + pand mm2, [mmx_mask2] + + psrlq mm2, 2 + paddb mm0, mm2 + + lea esi, [esi+edx] + lea edi, [edi+edx] + + movq [ecx], mm0 ; (src1 + src2 + src3 + src4 + 2) / 4 -> dst +%endmacro + +%macro AVG4_MMX_RND1 0 + movq mm0, [eax] ; src1 -> mm0 + movq mm1, [ebx] ; src2 -> mm1 - push esi - push edi + movq mm2, mm0 + movq mm3, mm1 - mov eax, [esp + 8 + 16] ; rounding -interpolate8x8_halfpel_hv_mmx.start + pand mm2, [mmx_three] + pand mm3, [mmx_three] + + pand mm0, [mmx_mask2] + pand mm1, [mmx_mask2] + + psrlq mm0, 2 + psrlq mm1, 2 + + lea eax,[eax+edx] + lea ebx,[ebx+edx] + + paddb mm0, mm1 + paddb mm2, mm3 + + movq mm4, [esi] ; src3 -> mm0 + movq mm5, [edi] ; src4 -> mm1 + + movq mm1, mm4 + movq mm3, mm5 + + pand mm1, [mmx_three] + pand mm3, [mmx_three] + + pand mm4, [mmx_mask2] + pand mm5, [mmx_mask2] + + psrlq mm4, 2 + psrlq mm5, 2 + + paddb mm4, mm5 + paddb mm0, mm4 + + paddb mm1, mm3 + paddb mm2, mm1 + + paddb mm2, [mmx_one] + pand mm2, [mmx_mask2] + + psrlq mm2, 2 + paddb mm0, mm2 + + lea esi,[esi+edx] + lea edi,[edi+edx] + + movq [ecx], mm0 ; (src1 + src2 + src3 + src4 + 2) / 4 -> dst +%endmacro + +ALIGN 16 +interpolate8x8_avg4_mmx: + + push ebx + push edi + push esi + + mov eax, [esp + 12 + 28] ; rounding + + test eax, eax + + mov ecx, [esp + 12 + 4] ; dst -> edi + mov eax, [esp + 12 + 8] ; src1 -> esi + mov ebx, [esp + 12 + 12] ; src2 -> eax + mov esi, [esp + 12 + 16] ; src3 -> esi + mov edi, [esp + 12 + 20] ; src4 -> edi + mov edx, [esp + 12 + 24] ; stride -> edx + + movq mm7, [mmx_one] + + jnz near .rounding1 + + AVG4_MMX_RND0 + lea ecx, [ecx+edx] + AVG4_MMX_RND0 + lea ecx, [ecx+edx] + AVG4_MMX_RND0 + lea ecx, [ecx+edx] + AVG4_MMX_RND0 + lea ecx, [ecx+edx] + AVG4_MMX_RND0 + lea ecx, [ecx+edx] + AVG4_MMX_RND0 + lea ecx, [ecx+edx] + AVG4_MMX_RND0 + lea ecx, [ecx+edx] + AVG4_MMX_RND0 + + pop esi + pop edi + pop ebx + ret + +.rounding1 + AVG4_MMX_RND1 + lea ecx, [ecx+edx] + AVG4_MMX_RND1 + lea ecx, [ecx+edx] + AVG4_MMX_RND1 + lea ecx, [ecx+edx] + AVG4_MMX_RND1 + lea ecx, [ecx+edx] + AVG4_MMX_RND1 + lea ecx, [ecx+edx] + AVG4_MMX_RND1 + lea ecx, [ecx+edx] + AVG4_MMX_RND1 + lea ecx, [ecx+edx] + AVG4_MMX_RND1 + + pop esi + pop edi + pop ebx + ret +.endfunc + + +;----------------------------------------------------------------------------- +; +; void interpolate8x8_6tap_lowpass_h_mmx(uint8_t const *dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +;----------------------------------------------------------------------------- + +%macro LOWPASS_6TAP_H_MMX 0 + movq mm0, [eax] + movq mm2, [eax+1] + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm7 + punpcklbw mm2, mm7 + + punpckhbw mm1, mm7 + punpckhbw mm3, mm7 + + paddw mm0, mm2 + paddw mm1, mm3 + + psllw mm0, 2 + psllw mm1, 2 + + movq mm2, [eax-1] + movq mm4, [eax+2] + + movq mm3, mm2 + movq mm5, mm4 + + punpcklbw mm2, mm7 + punpcklbw mm4, mm7 + + 
punpckhbw mm3, mm7 + punpckhbw mm5, mm7 + + paddw mm2, mm4 + paddw mm3, mm5 + + psubsw mm0, mm2 + psubsw mm1, mm3 + + pmullw mm0, [mmx_five] + pmullw mm1, [mmx_five] + + movq mm2, [eax-2] + movq mm4, [eax+3] + + movq mm3, mm2 + movq mm5, mm4 + + punpcklbw mm2, mm7 + punpcklbw mm4, mm7 + + punpckhbw mm3, mm7 + punpckhbw mm5, mm7 + + paddw mm2, mm4 + paddw mm3, mm5 + + paddsw mm0, mm2 + paddsw mm1, mm3 + + paddsw mm0, mm6 + paddsw mm1, mm6 + + psraw mm0, 5 + psraw mm1, 5 + + lea eax, [eax+edx] + packuswb mm0, mm1 + movq [ecx], mm0 +%endmacro + +ALIGN 16 +interpolate8x8_6tap_lowpass_h_mmx: + + mov eax, [esp + 16] ; rounding + + movq mm6, [rounding_lowpass_mmx + eax * 8] + + mov ecx, [esp + 4] ; dst -> edi + mov eax, [esp + 8] ; src -> esi + mov edx, [esp + 12] ; stride -> edx + + pxor mm7, mm7 + + LOWPASS_6TAP_H_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_H_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_H_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_H_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_H_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_H_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_H_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_H_MMX + + ret +.endfunc + +;----------------------------------------------------------------------------- +; +; void interpolate8x8_6tap_lowpass_v_mmx(uint8_t const *dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +;----------------------------------------------------------------------------- + +%macro LOWPASS_6TAP_V_MMX 0 + movq mm0, [eax] + movq mm2, [eax+edx] + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm7 + punpcklbw mm2, mm7 + + punpckhbw mm1, mm7 + punpckhbw mm3, mm7 + + paddw mm0, mm2 + paddw mm1, mm3 + + psllw mm0, 2 + psllw mm1, 2 + + movq mm4, [eax+2*edx] + sub eax, ebx + movq mm2, [eax+2*edx] + + movq mm3, mm2 + movq mm5, mm4 + + punpcklbw mm2, mm7 + punpcklbw mm4, mm7 + + punpckhbw mm3, mm7 + punpckhbw mm5, mm7 + + paddw mm2, mm4 + paddw mm3, mm5 + + psubsw mm0, mm2 + psubsw mm1, mm3 + + pmullw mm0, [mmx_five] + pmullw mm1, [mmx_five] + + movq mm2, [eax+edx] + movq mm4, [eax+2*ebx] + + movq mm3, mm2 + movq mm5, mm4 + + punpcklbw mm2, mm7 + punpcklbw mm4, mm7 + + punpckhbw mm3, mm7 + punpckhbw mm5, mm7 + + paddw mm2, mm4 + paddw mm3, mm5 + + paddsw mm0, mm2 + paddsw mm1, mm3 + + paddsw mm0, mm6 + paddsw mm1, mm6 + + psraw mm0, 5 + psraw mm1, 5 + + lea eax, [eax+4*edx] + packuswb mm0, mm1 + movq [ecx], mm0 +%endmacro + +ALIGN 16 +interpolate8x8_6tap_lowpass_v_mmx: + + push ebx + + mov eax, [esp + 4 + 16] ; rounding + + movq mm6, [rounding_lowpass_mmx + eax * 8] + + mov ecx, [esp + 4 + 4] ; dst -> edi + mov eax, [esp + 4 + 8] ; src -> esi + mov edx, [esp + 4 + 12] ; stride -> edx + + mov ebx, edx + shl ebx, 1 + add ebx, edx + + pxor mm7, mm7 + + LOWPASS_6TAP_V_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_V_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_V_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_V_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_V_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_V_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_V_MMX + lea ecx, [ecx+edx] + LOWPASS_6TAP_V_MMX + + pop ebx + ret +.endfunc + +;=========================================================================== +; +; The next functions combine both source halfpel interpolation step and the +; averaging (with rouding) step to avoid wasting memory bandwidth computing +; intermediate halfpel images and then averaging them. 
+; +;=========================================================================== + +%macro PROLOG0 0 + mov ecx, [esp+ 4] ; Dst + mov eax, [esp+ 8] ; Src + mov edx, [esp+12] ; BpS +%endmacro + +%macro PROLOG 2 ; %1: Rounder, %2 load Dst-Rounder + pxor mm6, mm6 + movq mm7, [%1] ; TODO: dangerous! (eax isn't checked) +%if %2 + movq mm5, [rounding1_mmx] +%endif + + PROLOG0 +%endmacro + + ; performs: mm0 == (mm0+mm2) mm1 == (mm1+mm3) +%macro MIX 0 + punpcklbw mm0, mm6 + punpcklbw mm2, mm6 + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + paddusw mm0, mm2 + paddusw mm1, mm3 +%endmacro + +%macro MIX_DST 0 + movq mm3, mm2 + paddusw mm0, mm7 ; rounder + paddusw mm1, mm7 ; rounder + punpcklbw mm2, mm6 + punpckhbw mm3, mm6 + psrlw mm0, 1 + psrlw mm1, 1 + + paddusw mm0, mm2 ; mix Src(mm0/mm1) with Dst(mm2/mm3) + paddusw mm1, mm3 + paddusw mm0, mm5 + paddusw mm1, mm5 + psrlw mm0, 1 + psrlw mm1, 1 + + packuswb mm0, mm1 +%endmacro + +%macro MIX2 0 + punpcklbw mm0, mm6 + punpcklbw mm2, mm6 + paddusw mm0, mm2 + paddusw mm0, mm7 + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + paddusw mm1, mm7 + paddusw mm1, mm3 + psrlw mm0, 1 + psrlw mm1, 1 + + packuswb mm0, mm1 +%endmacro + +;=========================================================================== +; +; void interpolate8x8_halfpel_add_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +; +;=========================================================================== + +%macro ADD_FF_MMX 1 + movq mm0, [eax] + movq mm2, [ecx] + movq mm1, mm0 + movq mm3, mm2 +%if (%1!=0) + lea eax,[eax+%1*edx] +%endif + MIX + paddusw mm0, mm5 ; rounder + paddusw mm1, mm5 ; rounder + psrlw mm0, 1 + psrlw mm1, 1 + + packuswb mm0, mm1 + movq [ecx], mm0 +%if (%1!=0) + lea ecx,[ecx+%1*edx] +%endif +%endmacro + +ALIGN 16 +interpolate8x8_halfpel_add_mmx: + PROLOG rounding1_mmx, 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 0 + ret +.endfunc + +;=========================================================================== +; +; void interpolate8x8_halfpel_h_add_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +; +;=========================================================================== + +%macro ADD_FH_MMX 0 + movq mm0, [eax] + movq mm2, [eax+1] + movq mm1, mm0 + movq mm3, mm2 + + lea eax,[eax+edx] + + MIX + movq mm2, [ecx] ; prepare mix with Dst[0] + MIX_DST + movq [ecx], mm0 +%endmacro + +ALIGN 16 +interpolate8x8_halfpel_h_add_mmx: + PROLOG rounding1_mmx, 1 + + ADD_FH_MMX + lea ecx,[ecx+edx] + ADD_FH_MMX + lea ecx,[ecx+edx] + ADD_FH_MMX + lea ecx,[ecx+edx] + ADD_FH_MMX + lea ecx,[ecx+edx] + ADD_FH_MMX + lea ecx,[ecx+edx] + ADD_FH_MMX + lea ecx,[ecx+edx] + ADD_FH_MMX + lea ecx,[ecx+edx] + ADD_FH_MMX + ret +.endfunc + +;=========================================================================== +; +; void interpolate8x8_halfpel_v_add_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +; +;=========================================================================== + +%macro ADD_HF_MMX 0 + movq mm0, [eax] + movq mm2, [eax+edx] + movq mm1, mm0 + movq mm3, mm2 + + lea eax,[eax+edx] + + MIX + movq mm2, [ecx] ; prepare mix with Dst[0] + MIX_DST + movq [ecx], mm0 + +%endmacro + +ALIGN 16 +interpolate8x8_halfpel_v_add_mmx: + PROLOG rounding1_mmx, 1 + + ADD_HF_MMX + lea ecx,[ecx+edx] + ADD_HF_MMX + lea ecx,[ecx+edx] + ADD_HF_MMX + lea 
ecx,[ecx+edx]
+  ADD_HF_MMX
+  lea ecx,[ecx+edx]
+  ADD_HF_MMX
+  lea ecx,[ecx+edx]
+  ADD_HF_MMX
+  lea ecx,[ecx+edx]
+  ADD_HF_MMX
+  ret
+.endfunc
+
+; The trick is to correct the result of 'pavgb' with some combination of the
+; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
+; The boolean relations are:
+;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
+;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
+;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
+;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
+; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
+
+; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
+
+;===========================================================================
+;
+; void interpolate8x8_halfpel_hv_add_mmx(uint8_t * const dst,
+;                                        const uint8_t * const src,
+;                                        const uint32_t stride,
+;                                        const uint32_t rounding);
+;
+;
+;===========================================================================
+
+%macro ADD_HH_MMX 0
+  lea eax,[eax+edx]
+
+  ; transfer the previous line to mm0/mm1
+  movq mm0, mm2
+  movq mm1, mm3
+
+  ; load the new line in mm2/mm3
+  movq mm2, [eax]
+  movq mm4, [eax+1]
+  movq mm3, mm2
+  movq mm5, mm4
+
+  punpcklbw mm2, mm6
+  punpcklbw mm4, mm6
+  paddusw mm2, mm4
+  punpckhbw mm3, mm6
+  punpckhbw mm5, mm6
+  paddusw mm3, mm5
+
+  ; mix the current line (mm2/mm3) with the previous one (mm0/mm1);
+  ; we'll preserve mm2/mm3 for the next line...
+
+  paddusw mm0, mm2
+  paddusw mm1, mm3
+
+  movq mm4, [ecx]        ; prepare mix with Dst[0]
+  movq mm5, mm4
+
+  paddusw mm0, mm7       ; finish mixing the current line
+  paddusw mm1, mm7
+
+  punpcklbw mm4, mm6
+  punpckhbw mm5, mm6
+
+  psrlw mm0, 2
+  psrlw mm1, 2
+
+  paddusw mm0, mm4       ; mix Src(mm0/mm1) with Dst(mm4/mm5)
+  paddusw mm1, mm5
+
+  paddusw mm0, [rounding1_mmx]
+  paddusw mm1, [rounding1_mmx]
+
+  psrlw mm0, 1
+  psrlw mm1, 1
+
+  packuswb mm0, mm1
+
+  movq [ecx], mm0
+%endmacro
-  movq mm7, [rounding2_mmx + eax * 8]
-
-  mov edi, [esp + 8 + 4]  ; dst
-  mov esi, [esp + 8 + 8]  ; src
-
-  mov eax, 8
-
-  pxor mm6, mm6           ; zero
-
-  mov edx, [esp + 8 + 12] ; stride
-
-  COPY_HV_MMX
-  COPY_HV_MMX
-  COPY_HV_MMX
-  COPY_HV_MMX
-  COPY_HV_MMX
-  COPY_HV_MMX
-  COPY_HV_MMX
-  COPY_HV_MMX
+ALIGN 16
+interpolate8x8_halfpel_hv_add_mmx:
+  PROLOG rounding2_mmx, 0 ; mm5 is busy. Don't load dst-rounder
+
+  ; preprocess the first line
+  movq mm0, [eax]
+  movq mm2, [eax+1]
+  movq mm1, mm0
+  movq mm3, mm2
+
+  punpcklbw mm0, mm6
+  punpcklbw mm2, mm6
+  punpckhbw mm1, mm6
+  punpckhbw mm3, mm6
+  paddusw mm2, mm0
+  paddusw mm3, mm1
+
+  ; Input: mm2/mm3 contain the value (Src[0]+Src[1]) of the previous line
+
+  ADD_HH_MMX
+  lea ecx,[ecx+edx]
+  ADD_HH_MMX
+  lea ecx,[ecx+edx]
+  ADD_HH_MMX
+  lea ecx,[ecx+edx]
+  ADD_HH_MMX
+  lea ecx,[ecx+edx]
+  ADD_HH_MMX
+  lea ecx,[ecx+edx]
+  ADD_HH_MMX
+  lea ecx,[ecx+edx]
+  ADD_HH_MMX
+  lea ecx,[ecx+edx]
+  ADD_HH_MMX
-  pop edi
-  pop esi
+  ret
+.endfunc
-  ret
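
A note for readers reviewing the patch without stepping through the MMX: the plain halfpel routines all reduce to the same per-pixel arithmetic. The h and v variants compute (a + b + 1 - rounding) >> 1, which is why rounding1_mmx holds 1 or 0, and the hv variant computes (a + b + c + d + 2 - rounding) >> 2, matching rounding2_mmx. The C below is only an illustrative scalar sketch of that arithmetic; the function names are invented for the example and are not code from the tree.

#include <stdint.h>

/* Scalar model of interpolate8x8_halfpel_h_mmx:
 * dst[x] = (src[x] + src[x+1] + 1 - rounding) >> 1
 * (for the v variant, replace src[x+1] with src[x+stride]). */
void halfpel8x8_h_ref(uint8_t *dst, const uint8_t *src,
                      uint32_t stride, uint32_t rounding)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = (uint8_t)((src[x] + src[x + 1] + 1 - rounding) >> 1);
        dst += stride;
        src += stride;
    }
}

/* Scalar model of interpolate8x8_halfpel_hv_mmx:
 * dst[x] = (a + b + c + d + 2 - rounding) >> 2 over the 2x2 neighbourhood,
 * which is what COPY_HV_MMX computes with the rounding2_mmx constant. */
void halfpel8x8_hv_ref(uint8_t *dst, const uint8_t *src,
                       uint32_t stride, uint32_t rounding)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = (uint8_t)((src[x] + src[x + 1] +
                                src[x + stride] + src[x + stride + 1] +
                                2 - rounding) >> 2);
        dst += stride;
        src += stride;
    }
}

The *_add_mmx functions at the end of the patch perform the same interpolation and then fold the result into dst with one extra (interp + dst + 1) >> 1 averaging step, using rounding1_mmx as the round-up constant.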
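
interpolate8x8_avg2_mmx and interpolate8x8_avg4_mmx average packed bytes without unpacking to words: each source byte is masked (mmx_mask = 0xFE, or mmx_mask2 = 0xFC in the 4-way case) and shifted, the discarded low bits are recombined separately, and an OR or AND of those low bits selects round-up versus round-down, which is how the rounding argument chooses between the RND0 and RND1 macros. Below is a single-byte model of the two-source case with an exhaustive check; the names are invented for the example.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Byte-wise model of AVG2_MMX_RND0 / AVG2_MMX_RND1:
 *   round-up:   (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1)
 *   round-down: (a + b)     >> 1 == (a >> 1) + (b >> 1) + ((a & b) & 1)
 * The MMX code gets "a >> 1" by masking with 0xFE (mmx_mask) before psrlq,
 * so the 64-bit shift cannot leak bits between neighbouring bytes. */
uint8_t avg2_model(uint8_t a, uint8_t b, int rounding)
{
    uint8_t half_a = (uint8_t)((a & 0xFE) >> 1);
    uint8_t half_b = (uint8_t)((b & 0xFE) >> 1);
    uint8_t carry  = rounding ? (uint8_t)(a & b & 1)      /* RND1: no round-up */
                              : (uint8_t)((a | b) & 1);   /* RND0: round-up    */
    return (uint8_t)(half_a + half_b + carry);
}

int main(void)
{
    for (int a = 0; a < 256; a++)
        for (int b = 0; b < 256; b++) {
            assert(avg2_model((uint8_t)a, (uint8_t)b, 0) == ((a + b + 1) >> 1));
            assert(avg2_model((uint8_t)a, (uint8_t)b, 1) == ((a + b) >> 1));
        }
    printf("avg2 model matches for all byte pairs\n");
    return 0;
}

The 4-way macros do the analogous thing with a 2-bit remainder: the four (src & 3) parts are summed, mmx_two (or mmx_one when rounding is on) is added, and that sum is shifted right by 2 and folded back into the sum of the (src >> 2) parts.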
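
The 6-tap lowpass routines implement a (1, -5, 20, 20, -5, 1)/32 filter: the psllw and pmullw-by-mmx_five steps build 20*(c+d) - 5*(b+e), the outer taps a and f are added, then the 16-or-15 constant from rounding_lowpass_mmx is added before the arithmetic shift by 5 and the packuswb clamp. The C below is a scalar sketch of the horizontal case under that reading; clamp255 and the function name are invented for the example.

#include <stdint.h>

uint8_t clamp255(int v)   /* models the packuswb saturation */
{
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar model of interpolate8x8_6tap_lowpass_h_mmx:
 * out = clip((s[-2] - 5*s[-1] + 20*s[0] + 20*s[1] - 5*s[2] + s[3]
 *             + 16 - rounding) >> 5) */
void lowpass6_h_ref(uint8_t *dst, const uint8_t *src,
                    uint32_t stride, uint32_t rounding)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
            int v = src[x - 2] - 5 * src[x - 1] + 20 * src[x] +
                    20 * src[x + 1] - 5 * src[x + 2] + src[x + 3];
            int w = v + 16 - (int)rounding;
            /* psraw is an arithmetic shift; negative values clamp to 0 anyway,
             * so handle them explicitly to keep the C well defined. */
            dst[x] = clamp255(w < 0 ? 0 : w >> 5);
        }
        dst += stride;
        src += stride;
    }
}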
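
The 'pavgb' comment block kept above states four boolean identities relating a 4-way rounded average to two 2-way averages s and t, presumably for the extended-MMX versions of these add routines, since plain MMX has no pavgb. They are easy to confirm by brute force; the program below is only a verification sketch, not part of the patch, and covers all 2^32 byte quadruples, so expect it to run for a short while.

#include <stdio.h>

/* Check the identities quoted in the comment above, with
 * s = (i+j+1)/2, t = (k+l+1)/2, ij = i^j, kl = k^l, st = s^t;
 * only the low bit of each correction term matters. */
int main(void)
{
    for (int i = 0; i < 256; i++)
    for (int j = 0; j < 256; j++)
    for (int k = 0; k < 256; k++)
    for (int l = 0; l < 256; l++) {
        int s = (i + j + 1) >> 1, t = (k + l + 1) >> 1;
        int u = (s + t + 1) >> 1;            /* the second averaging step */
        int ij = i ^ j, kl = k ^ l, st = s ^ t;
        int ok = (i+j+k+l+3)/4 == u - ((ij & kl & st) & 1)
              && (i+j+k+l+2)/4 == u - (((ij | kl) & st) & 1)
              && (i+j+k+l+1)/4 == u - (((ij & kl) | st) & 1)
              && (i+j+k+l+0)/4 == u - (((ij | kl) | st) & 1);
        if (!ok) {
            printf("counter-example: i=%d j=%d k=%d l=%d\n", i, j, k, l);
            return 1;
        }
    }
    printf("all four identities hold for every byte quadruple\n");
    return 0;
}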