--- interpolate8x8_mmx.asm 2003/10/28 22:23:03 1.12.2.1 +++ interpolate8x8_mmx.asm 2008/11/26 23:35:50 1.22 @@ -4,7 +4,7 @@ ; * - mmx 8x8 block-based halfpel interpolation - ; * ; * Copyright(C) 2001 Peter Ross -; * 2002 Michael Militzer +; * 2002-2008 Michael Militzer ; * ; * This program is free software ; you can redistribute it and/or modify ; * it under the terms of the GNU General Public License as published by @@ -22,28 +22,19 @@ ; * ; ****************************************************************************/ -BITS 32 - -%macro cglobal 1 - %ifdef PREFIX - global _%1 - %define %1 _%1 - %else - global %1 - %endif -%endmacro +%include "nasm.inc" ;============================================================================= ; Read only data ;============================================================================= -SECTION .rodata +DATA ;----------------------------------------------------------------------------- ; (16 - r) rounding table ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN rounding_lowpass_mmx: times 4 dw 16 times 4 dw 15 @@ -86,16 +77,27 @@ ; Code ;============================================================================= -SECTION .text +SECTION .rotext align=SECTION_ALIGN cglobal interpolate8x8_halfpel_h_mmx cglobal interpolate8x8_halfpel_v_mmx cglobal interpolate8x8_halfpel_hv_mmx + +cglobal interpolate8x4_halfpel_h_mmx +cglobal interpolate8x4_halfpel_v_mmx +cglobal interpolate8x4_halfpel_hv_mmx + cglobal interpolate8x8_avg4_mmx cglobal interpolate8x8_avg2_mmx + cglobal interpolate8x8_6tap_lowpass_h_mmx cglobal interpolate8x8_6tap_lowpass_v_mmx +cglobal interpolate8x8_halfpel_add_mmx +cglobal interpolate8x8_halfpel_h_add_mmx +cglobal interpolate8x8_halfpel_v_add_mmx +cglobal interpolate8x8_halfpel_hv_add_mmx + %macro CALC_AVG 6 punpcklbw %3, %6 punpckhbw %4, %6 @@ -120,8 +122,8 @@ ;----------------------------------------------------------------------------- %macro COPY_H_MMX 0 - movq mm0, [esi] - movq mm2, [esi + 1] + movq mm0, [TMP0] + movq mm2, [TMP0 + 1] movq mm1, mm0 movq mm3, mm2 @@ -131,26 +133,24 @@ CALC_AVG mm0, mm1, mm2, mm3, mm7, mm6 packuswb mm0, mm1 - movq [edi], mm0 ; [dst] = mm01 + movq [_EAX], mm0 ; [dst] = mm01 - add esi, edx ; src += stride - add edi, edx ; dst += stride + add TMP0, TMP1 ; src += stride + add _EAX, TMP1 ; dst += stride %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN interpolate8x8_halfpel_h_mmx: - push esi - push edi - mov eax, [esp + 8 + 16] ; rounding - - movq mm7, [rounding1_mmx + eax * 8] + mov _EAX, prm4 ; rounding + lea TMP0, [rounding1_mmx] + movq mm7, [TMP0 + _EAX * 8] + + mov _EAX, prm1 ; dst + mov TMP0, prm2 ; src + mov TMP1, prm3 ; stride - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src - mov edx, [esp + 8 + 12] ; stride - - pxor mm6, mm6 ; zero + pxor mm6, mm6 ; zero COPY_H_MMX COPY_H_MMX @@ -161,10 +161,8 @@ COPY_H_MMX COPY_H_MMX - pop edi - pop esi - ret +ENDFUNC ;----------------------------------------------------------------------------- @@ -177,8 +175,8 @@ ;----------------------------------------------------------------------------- %macro COPY_V_MMX 0 - movq mm0, [esi] - movq mm2, [esi + edx] + movq mm0, [TMP0] + movq mm2, [TMP0 + TMP1] movq mm1, mm0 movq mm3, mm2 @@ -188,27 +186,24 @@ CALC_AVG mm0, mm1, mm2, mm3, mm7, mm6 packuswb mm0, mm1 - movq [edi], mm0 ; [dst] = mm01 + movq [_EAX], mm0 ; [dst] = mm01 - add esi, edx ; src += stride - add edi, edx ; dst += stride + add TMP0, TMP1 ; src += stride + add _EAX, TMP1 ; dst += stride 
%endmacro -ALIGN 16 +ALIGN SECTION_ALIGN interpolate8x8_halfpel_v_mmx: - push esi - push edi - - mov eax, [esp + 8 + 16] ; rounding + mov _EAX, prm4 ; rounding + lea TMP0, [rounding1_mmx] + movq mm7, [TMP0 + _EAX * 8] + + mov _EAX, prm1 ; dst + mov TMP0, prm2 ; src + mov TMP1, prm3 ; stride - movq mm7, [rounding1_mmx + eax * 8] - - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src - mov edx, [esp + 8 + 12] ; stride - - pxor mm6, mm6 ; zero + pxor mm6, mm6 ; zero COPY_V_MMX @@ -220,10 +215,8 @@ COPY_V_MMX COPY_V_MMX - pop edi - pop esi - ret +ENDFUNC ;----------------------------------------------------------------------------- @@ -238,8 +231,8 @@ %macro COPY_HV_MMX 0 ; current row - movq mm0, [esi] - movq mm2, [esi + 1] + movq mm0, [TMP0] + movq mm2, [TMP0 + 1] movq mm1, mm0 movq mm3, mm2 @@ -253,8 +246,8 @@ paddusw mm1, mm3 ; next row - movq mm4, [esi + edx] - movq mm2, [esi + edx + 1] + movq mm4, [TMP0 + TMP1] + movq mm2, [TMP0 + TMP1 + 1] movq mm5, mm4 movq mm3, mm2 @@ -277,30 +270,25 @@ psrlw mm1, 2 packuswb mm0, mm1 - movq [edi], mm0 ; [dst] = mm01 + movq [_EAX], mm0 ; [dst] = mm01 - add esi, edx ; src += stride - add edi, edx ; dst += stride + add TMP0, TMP1 ; src += stride + add _EAX, TMP1 ; dst += stride %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN interpolate8x8_halfpel_hv_mmx: - push esi - push edi + mov _EAX, prm4 ; rounding + lea TMP0, [rounding2_mmx] + movq mm7, [TMP0 + _EAX * 8] - mov eax, [esp + 8 + 16] ; rounding + mov _EAX, prm1 ; dst + mov TMP0, prm2 ; src - movq mm7, [rounding2_mmx + eax * 8] + pxor mm6, mm6 ; zero - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src - - mov eax, 8 - - pxor mm6, mm6 ; zero - - mov edx, [esp + 8 + 12] ; stride + mov TMP1, prm3 ; stride COPY_HV_MMX COPY_HV_MMX @@ -311,10 +299,103 @@ COPY_HV_MMX COPY_HV_MMX - pop edi - pop esi + ret +ENDFUNC + +;----------------------------------------------------------------------------- +; +; void interpolate8x4_halfpel_h_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +;----------------------------------------------------------------------------- + +ALIGN SECTION_ALIGN +interpolate8x4_halfpel_h_mmx: + + mov _EAX, prm4 ; rounding + lea TMP0, [rounding1_mmx] + movq mm7, [TMP0 + _EAX * 8] + + mov _EAX, prm1 ; dst + mov TMP0, prm2 ; src + mov TMP1, prm3 ; stride + + pxor mm6, mm6 ; zero + + COPY_H_MMX + COPY_H_MMX + COPY_H_MMX + COPY_H_MMX + + ret +ENDFUNC + + +;----------------------------------------------------------------------------- +; +; void interpolate8x4_halfpel_v_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +;----------------------------------------------------------------------------- + +ALIGN SECTION_ALIGN +interpolate8x4_halfpel_v_mmx: + + mov _EAX, prm4 ; rounding + lea TMP0, [rounding1_mmx] + movq mm7, [TMP0 + _EAX * 8] + + mov _EAX, prm1 ; dst + mov TMP0, prm2 ; src + mov TMP1, prm3 ; stride + + pxor mm6, mm6 ; zero + + + COPY_V_MMX + COPY_V_MMX + COPY_V_MMX + COPY_V_MMX + + ret +ENDFUNC + + +;----------------------------------------------------------------------------- +; +; void interpolate8x4_halfpel_hv_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +; +;----------------------------------------------------------------------------- + +ALIGN SECTION_ALIGN +interpolate8x4_halfpel_hv_mmx: + + mov _EAX, prm4 ; rounding + lea TMP0, [rounding2_mmx] + movq mm7, [TMP0 + _EAX * 8] + + mov 
_EAX, prm1 ; dst + mov TMP0, prm2 ; src + + pxor mm6, mm6 ; zero + + mov TMP1, prm3 ; stride + + COPY_HV_MMX + COPY_HV_MMX + COPY_HV_MMX + COPY_HV_MMX ret +ENDFUNC ;----------------------------------------------------------------------------- ; @@ -328,11 +409,11 @@ ;----------------------------------------------------------------------------- %macro AVG2_MMX_RND0 0 - movq mm0, [eax] ; src1 -> mm0 - movq mm1, [ebx] ; src2 -> mm1 + movq mm0, [_EAX] ; src1 -> mm0 + movq mm1, [_EBX] ; src2 -> mm1 - movq mm4, [eax+edx] - movq mm5, [ebx+edx] + movq mm4, [_EAX+TMP1] + movq mm5, [_EBX+TMP1] movq mm2, mm0 ; src1 -> mm2 movq mm3, mm1 ; src2 -> mm3 @@ -367,19 +448,19 @@ paddb mm4, mm5 paddb mm4, mm3 - lea eax, [eax+2*edx] - lea ebx, [ebx+2*edx] + lea _EAX, [_EAX+2*TMP1] + lea _EBX, [_EBX+2*TMP1] - movq [ecx], mm0 ; (src1 + src2 + 1) / 2 -> dst - movq [ecx+edx], mm4 + movq [TMP0], mm0 ; (src1 + src2 + 1) / 2 -> dst + movq [TMP0+TMP1], mm4 %endmacro %macro AVG2_MMX_RND1 0 - movq mm0, [eax] ; src1 -> mm0 - movq mm1, [ebx] ; src2 -> mm1 + movq mm0, [_EAX] ; src1 -> mm0 + movq mm1, [_EBX] ; src2 -> mm1 - movq mm4, [eax+edx] - movq mm5, [ebx+edx] + movq mm4, [_EAX+TMP1] + movq mm5, [_EBX+TMP1] movq mm2, mm0 ; src1 -> mm2 movq mm3, mm1 ; src2 -> mm3 @@ -414,81 +495,92 @@ paddb mm4, mm5 paddb mm4, mm3 - lea eax, [eax+2*edx] - lea ebx, [ebx+2*edx] + lea _EAX, [_EAX+2*TMP1] + lea _EBX, [_EBX+2*TMP1] - movq [ecx], mm0 ; (src1 + src2 + 1) / 2 -> dst - movq [ecx+edx], mm4 + movq [TMP0], mm0 ; (src1 + src2 + 1) / 2 -> dst + movq [TMP0+TMP1], mm4 %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN interpolate8x8_avg2_mmx: - push ebx - - mov eax, [esp + 4 + 20] ; rounding - test eax, eax + mov eax, prm5d ; rounding + test _EAX, _EAX jnz near .rounding1 - mov eax, [esp + 4 + 24] ; height -> eax - sub eax, 8 - test eax, eax - - mov ecx, [esp + 4 + 4] ; dst -> edi - mov eax, [esp + 4 + 8] ; src1 -> esi - mov ebx, [esp + 4 + 12] ; src2 -> eax - mov edx, [esp + 4 + 16] ; stride -> edx + mov eax, prm6d ; height -> _EAX + sub _EAX, 8 + test _EAX, _EAX + + mov TMP0, prm1 ; dst -> edi + mov _EAX, prm2 ; src1 -> esi + mov TMP1, prm4 ; stride -> TMP1 + + push _EBX +%ifdef ARCH_IS_X86_64 + mov _EBX, prm3 +%else + mov _EBX, [esp + 4 + 12] ; src2 -> eax +%endif movq mm7, [mmx_one] jz near .start0 AVG2_MMX_RND0 - lea ecx, [ecx+2*edx] + lea TMP0, [TMP0+2*TMP1] -.start0 +.start0: AVG2_MMX_RND0 - lea ecx, [ecx+2*edx] + lea TMP0, [TMP0+2*TMP1] AVG2_MMX_RND0 - lea ecx, [ecx+2*edx] + lea TMP0, [TMP0+2*TMP1] AVG2_MMX_RND0 - lea ecx, [ecx+2*edx] + lea TMP0, [TMP0+2*TMP1] AVG2_MMX_RND0 - pop ebx + pop _EBX ret -.rounding1 - mov eax, [esp + 4 + 24] ; height -> eax - sub eax, 8 - test eax, eax - - mov ecx, [esp + 4 + 4] ; dst -> edi - mov eax, [esp + 4 + 8] ; src1 -> esi - mov ebx, [esp + 4 + 12] ; src2 -> eax - mov edx, [esp + 4 + 16] ; stride -> edx +.rounding1: + mov eax, prm6d ; height -> _EAX + sub _EAX, 8 + test _EAX, _EAX + + mov TMP0, prm1 ; dst -> edi + mov _EAX, prm2 ; src1 -> esi + mov TMP1, prm4 ; stride -> TMP1 + + push _EBX +%ifdef ARCH_IS_X86_64 + mov _EBX, prm3 +%else + mov _EBX, [esp + 4 + 12] ; src2 -> eax +%endif movq mm7, [mmx_one] jz near .start1 AVG2_MMX_RND1 - lea ecx, [ecx+2*edx] + lea TMP0, [TMP0+2*TMP1] -.start1 +.start1: AVG2_MMX_RND1 - lea ecx, [ecx+2*edx] + lea TMP0, [TMP0+2*TMP1] AVG2_MMX_RND1 - lea ecx, [ecx+2*edx] + lea TMP0, [TMP0+2*TMP1] AVG2_MMX_RND1 - lea ecx, [ecx+2*edx] + lea TMP0, [TMP0+2*TMP1] AVG2_MMX_RND1 - pop ebx + pop _EBX ret +ENDFUNC 
;----------------------------------------------------------------------------- @@ -504,8 +596,8 @@ ;----------------------------------------------------------------------------- %macro AVG4_MMX_RND0 0 - movq mm0, [eax] ; src1 -> mm0 - movq mm1, [ebx] ; src2 -> mm1 + movq mm0, [_EAX] ; src1 -> mm0 + movq mm1, [_EBX] ; src2 -> mm1 movq mm2, mm0 movq mm3, mm1 @@ -519,14 +611,14 @@ psrlq mm0, 2 psrlq mm1, 2 - lea eax, [eax+edx] - lea ebx, [ebx+edx] + lea _EAX, [_EAX+TMP1] + lea _EBX, [_EBX+TMP1] paddb mm0, mm1 paddb mm2, mm3 - movq mm4, [esi] ; src3 -> mm0 - movq mm5, [edi] ; src4 -> mm1 + movq mm4, [_ESI] ; src3 -> mm0 + movq mm5, [_EDI] ; src4 -> mm1 movq mm1, mm4 movq mm3, mm5 @@ -552,15 +644,15 @@ psrlq mm2, 2 paddb mm0, mm2 - lea esi, [esi+edx] - lea edi, [edi+edx] + lea _ESI, [_ESI+TMP1] + lea _EDI, [_EDI+TMP1] - movq [ecx], mm0 ; (src1 + src2 + src3 + src4 + 2) / 4 -> dst + movq [TMP0], mm0 ; (src1 + src2 + src3 + src4 + 2) / 4 -> dst %endmacro %macro AVG4_MMX_RND1 0 - movq mm0, [eax] ; src1 -> mm0 - movq mm1, [ebx] ; src2 -> mm1 + movq mm0, [_EAX] ; src1 -> mm0 + movq mm1, [_EBX] ; src2 -> mm1 movq mm2, mm0 movq mm3, mm1 @@ -574,14 +666,14 @@ psrlq mm0, 2 psrlq mm1, 2 - lea eax,[eax+edx] - lea ebx,[ebx+edx] + lea _EAX,[_EAX+TMP1] + lea _EBX,[_EBX+TMP1] paddb mm0, mm1 paddb mm2, mm3 - movq mm4, [esi] ; src3 -> mm0 - movq mm5, [edi] ; src4 -> mm1 + movq mm4, [_ESI] ; src3 -> mm0 + movq mm5, [_EDI] ; src4 -> mm1 movq mm1, mm4 movq mm3, mm5 @@ -607,76 +699,86 @@ psrlq mm2, 2 paddb mm0, mm2 - lea esi,[esi+edx] - lea edi,[edi+edx] + lea _ESI,[_ESI+TMP1] + lea _EDI,[_EDI+TMP1] - movq [ecx], mm0 ; (src1 + src2 + src3 + src4 + 2) / 4 -> dst + movq [TMP0], mm0 ; (src1 + src2 + src3 + src4 + 2) / 4 -> dst %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN interpolate8x8_avg4_mmx: - push ebx - push edi - push esi - - mov eax, [esp + 12 + 28] ; rounding - - test eax, eax - - mov ecx, [esp + 12 + 4] ; dst -> edi - mov eax, [esp + 12 + 8] ; src1 -> esi - mov ebx, [esp + 12 + 12] ; src2 -> eax - mov esi, [esp + 12 + 16] ; src3 -> esi - mov edi, [esp + 12 + 20] ; src4 -> edi - mov edx, [esp + 12 + 24] ; stride -> edx + mov eax, prm7d ; rounding + test _EAX, _EAX + + mov TMP0, prm1 ; dst -> edi + mov _EAX, prm5 ; src4 -> edi + mov TMP1d, prm6d ; stride -> TMP1 + + + push _EBX + push _EDI + push _ESI + + mov _EDI, _EAX + +%ifdef ARCH_IS_X86_64 + mov _EAX, prm2 + mov _EBX, prm3 + mov _ESI, prm4 +%else + mov _EAX, [esp + 12 + 8] ; src1 -> esi + mov _EBX, [esp + 12 + 12] ; src2 -> _EAX + mov _ESI, [esp + 12 + 16] ; src3 -> esi +%endif movq mm7, [mmx_one] jnz near .rounding1 AVG4_MMX_RND0 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND0 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND0 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND0 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND0 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND0 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND0 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND0 - pop esi - pop edi - pop ebx + pop _ESI + pop _EDI + pop _EBX ret -.rounding1 +.rounding1: AVG4_MMX_RND1 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND1 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND1 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND1 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND1 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND1 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND1 - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] AVG4_MMX_RND1 - pop esi - pop 
edi - pop ebx + pop _ESI + pop _EDI + pop _EBX ret +ENDFUNC ;----------------------------------------------------------------------------- @@ -689,8 +791,8 @@ ;----------------------------------------------------------------------------- %macro LOWPASS_6TAP_H_MMX 0 - movq mm0, [eax] - movq mm2, [eax+1] + movq mm0, [_EAX] + movq mm2, [_EAX+1] movq mm1, mm0 movq mm3, mm2 @@ -707,8 +809,8 @@ psllw mm0, 2 psllw mm1, 2 - movq mm2, [eax-1] - movq mm4, [eax+2] + movq mm2, [_EAX-1] + movq mm4, [_EAX+2] movq mm3, mm2 movq mm5, mm4 @@ -728,8 +830,8 @@ pmullw mm0, [mmx_five] pmullw mm1, [mmx_five] - movq mm2, [eax-2] - movq mm4, [eax+3] + movq mm2, [_EAX-2] + movq mm4, [_EAX+3] movq mm3, mm2 movq mm5, mm4 @@ -752,41 +854,43 @@ psraw mm0, 5 psraw mm1, 5 - lea eax, [eax+edx] + lea _EAX, [_EAX+TMP1] packuswb mm0, mm1 - movq [ecx], mm0 + movq [TMP0], mm0 %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN interpolate8x8_6tap_lowpass_h_mmx: - mov eax, [esp + 16] ; rounding + mov _EAX, prm4 ; rounding - movq mm6, [rounding_lowpass_mmx + eax * 8] + lea TMP0, [rounding_lowpass_mmx] + movq mm6, [TMP0 + _EAX * 8] - mov ecx, [esp + 4] ; dst -> edi - mov eax, [esp + 8] ; src -> esi - mov edx, [esp + 12] ; stride -> edx + mov TMP0, prm1 ; dst -> edi + mov _EAX, prm2 ; src -> esi + mov TMP1, prm3 ; stride -> edx pxor mm7, mm7 LOWPASS_6TAP_H_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_H_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_H_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_H_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_H_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_H_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_H_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_H_MMX ret +ENDFUNC ;----------------------------------------------------------------------------- ; @@ -798,8 +902,8 @@ ;----------------------------------------------------------------------------- %macro LOWPASS_6TAP_V_MMX 0 - movq mm0, [eax] - movq mm2, [eax+edx] + movq mm0, [_EAX] + movq mm2, [_EAX+TMP1] movq mm1, mm0 movq mm3, mm2 @@ -816,9 +920,9 @@ psllw mm0, 2 psllw mm1, 2 - movq mm4, [eax+2*edx] - sub eax, ebx - movq mm2, [eax+2*edx] + movq mm4, [_EAX+2*TMP1] + sub _EAX, _EBX + movq mm2, [_EAX+2*TMP1] movq mm3, mm2 movq mm5, mm4 @@ -838,8 +942,8 @@ pmullw mm0, [mmx_five] pmullw mm1, [mmx_five] - movq mm2, [eax+edx] - movq mm4, [eax+2*ebx] + movq mm2, [_EAX+TMP1] + movq mm4, [_EAX+2*_EBX] movq mm3, mm2 movq mm5, mm4 @@ -862,45 +966,370 @@ psraw mm0, 5 psraw mm1, 5 - lea eax, [eax+4*edx] + lea _EAX, [_EAX+4*TMP1] packuswb mm0, mm1 - movq [ecx], mm0 + movq [TMP0], mm0 %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN interpolate8x8_6tap_lowpass_v_mmx: - push ebx + mov _EAX, prm4 ; rounding - mov eax, [esp + 4 + 16] ; rounding + lea TMP0, [rounding_lowpass_mmx] + movq mm6, [TMP0 + _EAX * 8] - movq mm6, [rounding_lowpass_mmx + eax * 8] + mov TMP0, prm1 ; dst -> edi + mov _EAX, prm2 ; src -> esi + mov TMP1, prm3 ; stride -> edx - mov ecx, [esp + 4 + 4] ; dst -> edi - mov eax, [esp + 4 + 8] ; src -> esi - mov edx, [esp + 4 + 12] ; stride -> edx + push _EBX - mov ebx, edx - shl ebx, 1 - add ebx, edx + mov _EBX, TMP1 + shl _EBX, 1 + add _EBX, TMP1 pxor mm7, mm7 LOWPASS_6TAP_V_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_V_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_V_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_V_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_V_MMX - lea ecx, [ecx+edx] + lea 
TMP0, [TMP0+TMP1] LOWPASS_6TAP_V_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_V_MMX - lea ecx, [ecx+edx] + lea TMP0, [TMP0+TMP1] LOWPASS_6TAP_V_MMX - pop ebx + pop _EBX + ret +ENDFUNC + +;=========================================================================== +; +; The next functions combine both source halfpel interpolation step and the +; averaging (with rouding) step to avoid wasting memory bandwidth computing +; intermediate halfpel images and then averaging them. +; +;=========================================================================== + +%macro PROLOG0 0 + mov TMP0, prm1 ; Dst + mov _EAX, prm2 ; Src + mov TMP1, prm3 ; BpS +%endmacro + +%macro PROLOG 2 ; %1: Rounder, %2 load Dst-Rounder + pxor mm6, mm6 + movq mm7, [%1] ; TODO: dangerous! (eax isn't checked) +%if %2 + movq mm5, [rounding1_mmx] +%endif + + PROLOG0 +%endmacro + + ; performs: mm0 == (mm0+mm2) mm1 == (mm1+mm3) +%macro MIX 0 + punpcklbw mm0, mm6 + punpcklbw mm2, mm6 + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + paddusw mm0, mm2 + paddusw mm1, mm3 +%endmacro + +%macro MIX_DST 0 + movq mm3, mm2 + paddusw mm0, mm7 ; rounder + paddusw mm1, mm7 ; rounder + punpcklbw mm2, mm6 + punpckhbw mm3, mm6 + psrlw mm0, 1 + psrlw mm1, 1 + + paddusw mm0, mm2 ; mix Src(mm0/mm1) with Dst(mm2/mm3) + paddusw mm1, mm3 + paddusw mm0, mm5 + paddusw mm1, mm5 + psrlw mm0, 1 + psrlw mm1, 1 + + packuswb mm0, mm1 +%endmacro + +%macro MIX2 0 + punpcklbw mm0, mm6 + punpcklbw mm2, mm6 + paddusw mm0, mm2 + paddusw mm0, mm7 + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + paddusw mm1, mm7 + paddusw mm1, mm3 + psrlw mm0, 1 + psrlw mm1, 1 + + packuswb mm0, mm1 +%endmacro + +;=========================================================================== +; +; void interpolate8x8_halfpel_add_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +; +;=========================================================================== + +%macro ADD_FF_MMX 1 + movq mm0, [_EAX] + movq mm2, [TMP0] + movq mm1, mm0 + movq mm3, mm2 +%if (%1!=0) + lea _EAX,[_EAX+%1*TMP1] +%endif + MIX + paddusw mm0, mm5 ; rounder + paddusw mm1, mm5 ; rounder + psrlw mm0, 1 + psrlw mm1, 1 + + packuswb mm0, mm1 + movq [TMP0], mm0 +%if (%1!=0) + lea TMP0,[TMP0+%1*TMP1] +%endif +%endmacro + +ALIGN SECTION_ALIGN +interpolate8x8_halfpel_add_mmx: + PROLOG rounding1_mmx, 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 1 + ADD_FF_MMX 0 + ret +ENDFUNC + +;=========================================================================== +; +; void interpolate8x8_halfpel_h_add_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +; +;=========================================================================== + +%macro ADD_FH_MMX 0 + movq mm0, [_EAX] + movq mm2, [_EAX+1] + movq mm1, mm0 + movq mm3, mm2 + + lea _EAX,[_EAX+TMP1] + + MIX + movq mm2, [TMP0] ; prepare mix with Dst[0] + MIX_DST + movq [TMP0], mm0 +%endmacro + +ALIGN SECTION_ALIGN +interpolate8x8_halfpel_h_add_mmx: + PROLOG rounding1_mmx, 1 + + ADD_FH_MMX + lea TMP0,[TMP0+TMP1] + ADD_FH_MMX + lea TMP0,[TMP0+TMP1] + ADD_FH_MMX + lea TMP0,[TMP0+TMP1] + ADD_FH_MMX + lea TMP0,[TMP0+TMP1] + ADD_FH_MMX + lea TMP0,[TMP0+TMP1] + ADD_FH_MMX + lea TMP0,[TMP0+TMP1] + ADD_FH_MMX + lea TMP0,[TMP0+TMP1] + ADD_FH_MMX + ret +ENDFUNC + +;=========================================================================== +; +; void interpolate8x8_halfpel_v_add_mmx(uint8_t * const dst, +; 
const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +; +;=========================================================================== + +%macro ADD_HF_MMX 0 + movq mm0, [_EAX] + movq mm2, [_EAX+TMP1] + movq mm1, mm0 + movq mm3, mm2 + + lea _EAX,[_EAX+TMP1] + + MIX + movq mm2, [TMP0] ; prepare mix with Dst[0] + MIX_DST + movq [TMP0], mm0 + +%endmacro + +ALIGN SECTION_ALIGN +interpolate8x8_halfpel_v_add_mmx: + PROLOG rounding1_mmx, 1 + + ADD_HF_MMX + lea TMP0,[TMP0+TMP1] + ADD_HF_MMX + lea TMP0,[TMP0+TMP1] + ADD_HF_MMX + lea TMP0,[TMP0+TMP1] + ADD_HF_MMX + lea TMP0,[TMP0+TMP1] + ADD_HF_MMX + lea TMP0,[TMP0+TMP1] + ADD_HF_MMX + lea TMP0,[TMP0+TMP1] + ADD_HF_MMX + lea TMP0,[TMP0+TMP1] + ADD_HF_MMX + ret +ENDFUNC + +; The trick is to correct the result of 'pavgb' with some combination of the +; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). +; The boolean relations are: +; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st +; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st +; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st +; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st +; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t. + +; Moreover, we process 2 lines at a times, for better overlapping (~15% faster). + +;=========================================================================== +; +; void interpolate8x8_halfpel_hv_add_mmx(uint8_t * const dst, +; const uint8_t * const src, +; const uint32_t stride, +; const uint32_t rounding); +; +; +;=========================================================================== + +%macro ADD_HH_MMX 0 + lea _EAX,[_EAX+TMP1] + + ; transfert prev line to mm0/mm1 + movq mm0, mm2 + movq mm1, mm3 + + ; load new line in mm2/mm3 + movq mm2, [_EAX] + movq mm4, [_EAX+1] + movq mm3, mm2 + movq mm5, mm4 + + punpcklbw mm2, mm6 + punpcklbw mm4, mm6 + paddusw mm2, mm4 + punpckhbw mm3, mm6 + punpckhbw mm5, mm6 + paddusw mm3, mm5 + + ; mix current line (mm2/mm3) with previous (mm0,mm1); + ; we'll preserve mm2/mm3 for next line... + + paddusw mm0, mm2 + paddusw mm1, mm3 + + movq mm4, [TMP0] ; prepare mix with Dst[0] + movq mm5, mm4 + + paddusw mm0, mm7 ; finish mixing current line + paddusw mm1, mm7 + + punpcklbw mm4, mm6 + punpckhbw mm5, mm6 + + psrlw mm0, 2 + psrlw mm1, 2 + + paddusw mm0, mm4 ; mix Src(mm0/mm1) with Dst(mm2/mm3) + paddusw mm1, mm5 + + paddusw mm0, [rounding1_mmx] + paddusw mm1, [rounding1_mmx] + + psrlw mm0, 1 + psrlw mm1, 1 + + packuswb mm0, mm1 + + movq [TMP0], mm0 +%endmacro + +ALIGN SECTION_ALIGN +interpolate8x8_halfpel_hv_add_mmx: + PROLOG rounding2_mmx, 0 ; mm5 is busy. Don't load dst-rounder + + ; preprocess first line + movq mm0, [_EAX] + movq mm2, [_EAX+1] + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 + punpcklbw mm2, mm6 + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + paddusw mm2, mm0 + paddusw mm3, mm1 + + ; Input: mm2/mm3 contains the value (Src[0]+Src[1]) of previous line + + ADD_HH_MMX + lea TMP0,[TMP0+TMP1] + ADD_HH_MMX + lea TMP0,[TMP0+TMP1] + ADD_HH_MMX + lea TMP0,[TMP0+TMP1] + ADD_HH_MMX + lea TMP0,[TMP0+TMP1] + ADD_HH_MMX + lea TMP0,[TMP0+TMP1] + ADD_HH_MMX + lea TMP0,[TMP0+TMP1] + ADD_HH_MMX + lea TMP0,[TMP0+TMP1] + ADD_HH_MMX + ret +ENDFUNC + + +%ifidn __OUTPUT_FORMAT__,elf +section ".note.GNU-stack" noalloc noexec nowrite progbits +%endif +
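
For reference, the halfpel copy routines touched above (including the new interpolate8x4_* entry points, which simply run four COPY_* iterations instead of eight) implement the usual MPEG-4 half-pixel filter. Below is a minimal plain-C sketch of the intended arithmetic; the helper names (halfpel_h_c and friends) are invented for illustration, and it assumes rounding1_mmx holds the per-word rounders {1, 0} and rounding2_mmx holds {2, 1}, indexed by the rounding argument (those tables sit in context lines outside the hunks shown here).

#include <stdint.h>

/* Plain-C model of interpolate8x{8,4}_halfpel_{h,v,hv}_mmx.
 * Assumption: standard MPEG-4 halfpel rounding, i.e. the rounder is
 * (1 - rounding) for the 2-tap cases and (2 - rounding) for the hv case. */
static void halfpel_h_c(uint8_t *dst, const uint8_t *src,
                        uint32_t stride, uint32_t rounding, int rows)
{
    for (int y = 0; y < rows; y++)          /* rows = 8 (8x8) or 4 (8x4) */
        for (int x = 0; x < 8; x++)
            dst[y*stride + x] = (uint8_t)
                ((src[y*stride + x] + src[y*stride + x + 1] + 1 - rounding) >> 1);
}

static void halfpel_v_c(uint8_t *dst, const uint8_t *src,
                        uint32_t stride, uint32_t rounding, int rows)
{
    for (int y = 0; y < rows; y++)
        for (int x = 0; x < 8; x++)
            dst[y*stride + x] = (uint8_t)
                ((src[y*stride + x] + src[(y+1)*stride + x] + 1 - rounding) >> 1);
}

static void halfpel_hv_c(uint8_t *dst, const uint8_t *src,
                         uint32_t stride, uint32_t rounding, int rows)
{
    for (int y = 0; y < rows; y++)
        for (int x = 0; x < 8; x++)
            dst[y*stride + x] = (uint8_t)
                ((src[y*stride + x]     + src[y*stride + x + 1] +
                  src[(y+1)*stride + x] + src[(y+1)*stride + x + 1] +
                  2 - rounding) >> 2);
}

A test harness can run these against the MMX versions over random 8x8 blocks and compare the outputs byte for byte.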
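
The avg2/avg4 hunks only rename registers to the portable TMP0/TMP1/_EAX/_EBX/_ESI/_EDI aliases (with the extra source pointers loaded from the stack on x86_32). The bit-trick body of AVG2_MMX_RND*/AVG4_MMX_RND* is unchanged context and not visible here, so the following per-row sketch is an assumption based on the usual semantics of these kernels, not a transcription of the macros; the function names are illustrative only.

#include <stdint.h>

/* Assumed per-pixel behaviour of one 8-pixel row of the avg2/avg4 kernels:
 * a rounded average of 2 (resp. 4) sources, with MPEG-4 rounding control. */
static void avg2_row_c(uint8_t *dst, const uint8_t *s1, const uint8_t *s2,
                       uint32_t rounding)
{
    for (int x = 0; x < 8; x++)
        dst[x] = (uint8_t)((s1[x] + s2[x] + 1 - rounding) >> 1);
}

static void avg4_row_c(uint8_t *dst, const uint8_t *s1, const uint8_t *s2,
                       const uint8_t *s3, const uint8_t *s4, uint32_t rounding)
{
    for (int x = 0; x < 8; x++)
        dst[x] = (uint8_t)((s1[x] + s2[x] + s3[x] + s4[x] + 2 - rounding) >> 2);
}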
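
The new interpolate8x8_halfpel*_add routines fold the source halfpel filter and the averaging with the existing dst block into a single pass, as the section comment explains. Reading MIX/MIX_DST: the halfpel value is formed first (mm7 as the source rounder), then averaged with dst using mm5 as a second rounder; note the TODO in PROLOG, which points out that the rounding argument is never inspected and the first table entry is always used. A rough C equivalent of the h_add case is sketched below, under the assumption that both rounders resolve to 1; the helper name is illustrative.

#include <stdint.h>

/* Sketch of interpolate8x8_halfpel_h_add_mmx: horizontal halfpel of src,
 * then averaged into dst.  Assumes both rounders are 1 (per the PROLOG TODO,
 * the 'rounding' argument is not actually applied by these routines). */
static void halfpel_h_add_c(uint8_t *dst, const uint8_t *src, uint32_t stride)
{
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++) {
            int h = (src[y*stride + x] + src[y*stride + x + 1] + 1) >> 1;
            dst[y*stride + x] = (uint8_t)((dst[y*stride + x] + h + 1) >> 1);
        }
}

The v_add and hv_add variants follow the same pattern with the vertical 2-tap and the 4-tap (+2, >>2) filters respectively, and halfpel_add is the degenerate case where the "filter" is just the source pixel.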
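
The comment block ahead of interpolate8x8_halfpel_hv_add_mmx quotes four boolean identities that relate a 4-way rounded average to two pavgb results s and t plus the low bits of ij = i^j, kl = k^l, st = s^t. They can be checked exhaustively over all byte inputs with a small standalone brute-force program (a verification aid only, not part of the build):

#include <stdio.h>

/* Exhaustively verifies the pavgb correction identities quoted in the
 * comment, taking only the lsb of ij, kl and st for the correction term. */
int main(void)
{
    for (unsigned i = 0; i < 256; i++)
        for (unsigned j = 0; j < 256; j++)
            for (unsigned k = 0; k < 256; k++)
                for (unsigned l = 0; l < 256; l++) {
                    unsigned s  = (i + j + 1) >> 1;            /* pavgb(i,j) */
                    unsigned t  = (k + l + 1) >> 1;            /* pavgb(k,l) */
                    unsigned ij = (i ^ j) & 1;
                    unsigned kl = (k ^ l) & 1;
                    unsigned st = (s ^ t) & 1;
                    unsigned base = (s + t + 1) >> 1;          /* pavgb(s,t) */
                    unsigned ok =
                        (base - ((ij & kl) & st) == ((i + j + k + l + 3) >> 2)) &&
                        (base - ((ij | kl) & st) == ((i + j + k + l + 2) >> 2)) &&
                        (base - ((ij & kl) | st) == ((i + j + k + l + 1) >> 2)) &&
                        (base - ((ij | kl) | st) == ((i + j + k + l + 0) >> 2));
                    if (!ok) {
                        printf("mismatch at %u %u %u %u\n", i, j, k, l);
                        return 1;
                    }
                }
    puts("all four identities hold for every byte combination");
    return 0;
}

The loop covers 2^32 combinations, so build with optimization; it confirms the relations the comment relies on, even though this plain-MMX file computes the hv_add case via 16-bit unpacking rather than pavgb.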