--- interpolate8x8_xmm.asm 2004/08/10 21:58:55 1.7
+++ interpolate8x8_xmm.asm 2008/11/11 20:46:24 1.12
@@ -26,10 +26,23 @@
 
 %macro cglobal 1
   %ifdef PREFIX
-    global _%1
-    %define %1 _%1
+    %ifdef MARK_FUNCS
+      global _%1:function %1.endfunc-%1
+      %define %1 _%1:function %1.endfunc-%1
+      %define ENDFUNC .endfunc
+    %else
+      global _%1
+      %define %1 _%1
+      %define ENDFUNC
+    %endif
   %else
-    global %1
+    %ifdef MARK_FUNCS
+      global %1:function %1.endfunc-%1
+      %define ENDFUNC .endfunc
+    %else
+      global %1
+      %define ENDFUNC
+    %endif
   %endif
 %endmacro
 
@@ -53,6 +66,10 @@
 cglobal interpolate8x8_halfpel_v_xmm
 cglobal interpolate8x8_halfpel_hv_xmm
 
+cglobal interpolate8x4_halfpel_h_xmm
+cglobal interpolate8x4_halfpel_v_xmm
+cglobal interpolate8x4_halfpel_hv_xmm
+
 cglobal interpolate8x8_halfpel_add_xmm
 cglobal interpolate8x8_halfpel_h_add_xmm
 cglobal interpolate8x8_halfpel_v_add_xmm
@@ -117,7 +134,7 @@
   COPY_H_SSE_RND0
   ret
 
-.rounding1
+.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
   movq mm7, [mmx_one]
   COPY_H_SSE_RND1
@@ -128,6 +145,7 @@
   lea ecx,[ecx+2*edx]
   COPY_H_SSE_RND1
   ret
+ENDFUNC
 
 ;===========================================================================
 ;
@@ -188,7 +206,7 @@
   COPY_V_SSE_RND0
   ret
 
-.rounding1
+.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
   movq mm7, [mmx_one]
   movq mm2, [eax] ; loop invariant
@@ -202,6 +220,7 @@
   lea ecx,[ecx+2*edx]
   COPY_V_SSE_RND1
   ret
+ENDFUNC
 
 ;===========================================================================
 ;
@@ -331,7 +350,7 @@
   COPY_HV_SSE_RND0
   ret
 
-.rounding1
+.rounding1:
   COPY_HV_SSE_RND1
   add ecx, edx
   COPY_HV_SSE_RND1
@@ -340,6 +359,131 @@
   add ecx, edx
   COPY_HV_SSE_RND1
   ret
+ENDFUNC
+
+;===========================================================================
+;
+; void interpolate8x4_halfpel_h_xmm(uint8_t * const dst,
+;                                   const uint8_t * const src,
+;                                   const uint32_t stride,
+;                                   const uint32_t rounding);
+;
+;===========================================================================
+
+ALIGN 16
+interpolate8x4_halfpel_h_xmm:
+
+  mov eax, [esp+16] ; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax,eax
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  jnz near .rounding1
+
+  COPY_H_SSE_RND0
+  lea ecx,[ecx+2*edx]
+  COPY_H_SSE_RND0
+  ret
+
+.rounding1:
+ ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+  movq mm7, [mmx_one]
+  COPY_H_SSE_RND1
+  lea ecx, [ecx+2*edx]
+  COPY_H_SSE_RND1
+  ret
+ENDFUNC
+
+;===========================================================================
+;
+; void interpolate8x4_halfpel_v_xmm(uint8_t * const dst,
+;                                   const uint8_t * const src,
+;                                   const uint32_t stride,
+;                                   const uint32_t rounding);
+;
+;===========================================================================
+
+ALIGN 16
+interpolate8x4_halfpel_v_xmm:
+
+  mov eax, [esp+16]; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax,eax
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  ; we process 2 lines at a time
+  jnz near .rounding1
+
+  COPY_V_SSE_RND0
+  lea ecx, [ecx+2*edx]
+  COPY_V_SSE_RND0
+  ret
+
+.rounding1:
+ ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+  movq mm7, [mmx_one]
+  movq mm2, [eax] ; loop invariant
+  add eax, edx
+
+  COPY_V_SSE_RND1
+  lea ecx,[ecx+2*edx]
+  COPY_V_SSE_RND1
+  ret
+ENDFUNC
+
+;===========================================================================
+;
+; void interpolate8x4_halfpel_hv_xmm(uint8_t * const dst,
+;                                    const uint8_t * const src,
+;                                    const uint32_t stride,
+;                                    const uint32_t rounding);
+;
+;
+;===========================================================================
+
+; The trick is to correct the result of 'pavgb' with some combination of the
+; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
+; The boolean relations are:
+;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
+;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
+;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
+;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
+; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.

+; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
+
+ALIGN 16
+interpolate8x4_halfpel_hv_xmm:
+  mov eax, [esp+16] ; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax, eax
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  movq mm7, [mmx_one]
+
+  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
+  movq mm2, [eax]
+  movq mm3, [eax+1]
+  movq mm6, mm2
+  pavgb mm2, mm3
+  pxor mm3, mm6 ; mm2/mm3 ready
+
+  jnz near .rounding1
+
+  COPY_HV_SSE_RND0
+  add ecx, edx
+  COPY_HV_SSE_RND0
+  ret
+
+.rounding1:
+  COPY_HV_SSE_RND1
+  add ecx, edx
+  COPY_HV_SSE_RND1
+  ret
+ENDFUNC
 
 ;===========================================================================
 ;
@@ -407,6 +551,7 @@
   lea ecx,[ecx+2*edx]
   ADD_FF 0, edx
   EPILOG
+ENDFUNC
 
 ;===========================================================================
 ;
@@ -468,7 +613,7 @@
   ADD_FH_RND0 0, edx
   EPILOG
 
-.Loop1
+.Loop1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  ; movq mm7, [mmx_one]
   ADD_FH_RND1 0, edx
@@ -482,6 +627,7 @@
   lea ecx,[ecx+2*edx]
   ADD_FH_RND1 0, edx
   EPILOG
+ENDFUNC
 
 
 ;===========================================================================
@@ -542,7 +688,7 @@
   ADD_8_HF_RND0
   EPILOG
 
-.Loop1
+.Loop1:
   movq mm0, [eax] ; loop invariant
   movq mm7, [mmx_one]
 
@@ -557,6 +703,7 @@
   lea ecx,[ecx+2*edx]
   ADD_8_HF_RND1
   EPILOG
+ENDFUNC
 
 ; The trick is to correct the result of 'pavgb' with some combination of the
 ; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
@@ -686,7 +833,7 @@
   ADD_HH_RND0
   EPILOG
 
-.Loop1
+.Loop1:
   ADD_HH_RND1
   add ecx, edx
   ADD_HH_RND1
@@ -696,3 +843,10 @@
   ADD_HH_RND1
 
   EPILOG
+ENDFUNC
+
+
+%ifidn __OUTPUT_FORMAT__,elf
+section ".note.GNU-stack" noalloc noexec nowrite progbits
+%endif
+
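
For reference, the rounding==1 paths in this patch rely on the identity (i+j)/2 = (i+j+1)/2 - ((i^j)&1): pavgb produces the rounded-up average, and subtracting the low bit of i^j turns it into the truncating average, which is what the mmx_one mask loaded into mm7 is used for. A minimal scalar sketch in C, checking the identity exhaustively for byte inputs (the helper name is illustrative, not part of the patch):

#include <assert.h>
#include <stdio.h>

/* pavgb on one byte: the rounded-up average (a+b+1)/2 */
static unsigned avg_up(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

int main(void)
{
    for (unsigned i = 0; i < 256; i++)
        for (unsigned j = 0; j < 256; j++) {
            unsigned want = (i + j) >> 1;                 /* truncating average (rounding==1 case) */
            unsigned got  = avg_up(i, j) - ((i ^ j) & 1); /* pavgb corrected by the xor's low bit */
            assert(got == want);
        }
    puts("2-input identity holds for all byte pairs");
    return 0;
}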
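
The comment block added with interpolate8x4_halfpel_hv_xmm states the analogous four-input relations, with s=(i+j+1)/2, t=(k+l+1)/2, ij=i^j, kl=k^l and st=s^t, where only the low bit of each correction term matters. A companion sketch, again illustrative rather than part of the patch, brute-forces all four rounding variants over a reduced input range:

#include <assert.h>
#include <stdio.h>

/* pavgb on one byte: the rounded-up average (a+b+1)/2 */
static unsigned avg_up(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

int main(void)
{
    /* 0..63 keeps the brute force quick; the relations hold for any non-negative values */
    for (unsigned i = 0; i < 64; i++)
        for (unsigned j = 0; j < 64; j++)
            for (unsigned k = 0; k < 64; k++)
                for (unsigned l = 0; l < 64; l++) {
                    unsigned s = avg_up(i, j), t = avg_up(k, l);
                    unsigned ij = i ^ j, kl = k ^ l, st = s ^ t;
                    unsigned base = avg_up(s, t); /* (s+t+1)/2, what the cascaded pavgb gives */
                    assert((i + j + k + l + 3) / 4 == base - ( (ij & kl) & st  & 1));
                    assert((i + j + k + l + 2) / 4 == base - ( (ij | kl) & st  & 1));
                    assert((i + j + k + l + 1) / 4 == base - (((ij & kl) | st) & 1));
                    assert((i + j + k + l + 0) / 4 == base - (((ij | kl) | st) & 1));
                }
    puts("4-input pavgb correction identities hold");
    return 0;
}

Because each correction is just the low bit of a few AND/OR/XOR combinations, it can be applied per byte with a handful of logical operations and a masked subtract, which is what keeps the rounding-aware paths cheap in the MMX/SSE code.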