--- interpolate8x8_xmm.asm 2004/08/10 21:58:55 1.7
+++ interpolate8x8_xmm.asm 2008/11/11 20:46:24 1.12
@@ -26,10 +26,23 @@
 
 %macro cglobal 1
   %ifdef PREFIX
-    global _%1
-    %define %1 _%1
+    %ifdef MARK_FUNCS
+      global _%1:function %1.endfunc-%1
+      %define %1 _%1:function %1.endfunc-%1
+      %define ENDFUNC .endfunc
+    %else
+      global _%1
+      %define %1 _%1
+      %define ENDFUNC
+    %endif
   %else
-    global %1
+    %ifdef MARK_FUNCS
+      global %1:function %1.endfunc-%1
+      %define ENDFUNC .endfunc
+    %else
+      global %1
+      %define ENDFUNC
+    %endif
   %endif
 %endmacro
 
@@ -53,6 +66,10 @@
 cglobal interpolate8x8_halfpel_v_xmm
 cglobal interpolate8x8_halfpel_hv_xmm
 
+cglobal interpolate8x4_halfpel_h_xmm
+cglobal interpolate8x4_halfpel_v_xmm
+cglobal interpolate8x4_halfpel_hv_xmm
+
 cglobal interpolate8x8_halfpel_add_xmm
 cglobal interpolate8x8_halfpel_h_add_xmm
 cglobal interpolate8x8_halfpel_v_add_xmm
@@ -117,7 +134,7 @@
   COPY_H_SSE_RND0
   ret
 
-.rounding1
+.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
   movq mm7, [mmx_one]
   COPY_H_SSE_RND1
@@ -128,6 +145,7 @@
   lea ecx,[ecx+2*edx]
   COPY_H_SSE_RND1
   ret
+ENDFUNC
 
 ;===========================================================================
 ;
@@ -188,7 +206,7 @@
   COPY_V_SSE_RND0
   ret
 
-.rounding1
+.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
   movq mm7, [mmx_one]
   movq mm2, [eax] ; loop invariant
@@ -202,6 +220,7 @@
   lea ecx,[ecx+2*edx]
   COPY_V_SSE_RND1
   ret
+ENDFUNC
 
 ;===========================================================================
 ;
@@ -331,7 +350,7 @@
   COPY_HV_SSE_RND0
   ret
 
-.rounding1
+.rounding1:
   COPY_HV_SSE_RND1
   add ecx, edx
   COPY_HV_SSE_RND1
@@ -340,6 +359,131 @@
   add ecx, edx
   COPY_HV_SSE_RND1
   ret
+ENDFUNC
+
+;===========================================================================
+;
+; void interpolate8x4_halfpel_h_xmm(uint8_t * const dst,
+;                                   const uint8_t * const src,
+;                                   const uint32_t stride,
+;                                   const uint32_t rounding);
+;
+;===========================================================================
+
+ALIGN 16
+interpolate8x4_halfpel_h_xmm:
+
+  mov eax, [esp+16] ; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax,eax
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  jnz near .rounding1
+
+  COPY_H_SSE_RND0
+  lea ecx,[ecx+2*edx]
+  COPY_H_SSE_RND0
+  ret
+
+.rounding1:
+ ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+  movq mm7, [mmx_one]
+  COPY_H_SSE_RND1
+  lea ecx, [ecx+2*edx]
+  COPY_H_SSE_RND1
+  ret
+ENDFUNC
+
+;===========================================================================
+;
+; void interpolate8x4_halfpel_v_xmm(uint8_t * const dst,
+;                                   const uint8_t * const src,
+;                                   const uint32_t stride,
+;                                   const uint32_t rounding);
+;
+;===========================================================================
+
+ALIGN 16
+interpolate8x4_halfpel_v_xmm:
+
+  mov eax, [esp+16]; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax,eax
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  ; we process 2 lines at a time
+  jnz near .rounding1
+
+  COPY_V_SSE_RND0
+  lea ecx, [ecx+2*edx]
+  COPY_V_SSE_RND0
+  ret
+
+.rounding1:
+ ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+  movq mm7, [mmx_one]
+  movq mm2, [eax] ; loop invariant
+  add eax, edx
+
+  COPY_V_SSE_RND1
+  lea ecx,[ecx+2*edx]
+  COPY_V_SSE_RND1
+  ret
+ENDFUNC
+
+;===========================================================================
+;
+; void interpolate8x4_halfpel_hv_xmm(uint8_t * const dst,
+;                                    const uint8_t * const src,
+;                                    const uint32_t stride,
+;                                    const uint32_t rounding);
+;
+;
+;===========================================================================
+
+; The trick is to correct the result of 'pavgb' with some combination of the
+; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
+; The boolean relations are:
+;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
+;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
+;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
+;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
+; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.

+; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
+
+ALIGN 16
+interpolate8x4_halfpel_hv_xmm:
+  mov eax, [esp+16] ; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax, eax
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  movq mm7, [mmx_one]
+
+  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
+  movq mm2, [eax]
+  movq mm3, [eax+1]
+  movq mm6, mm2
+  pavgb mm2, mm3
+  pxor mm3, mm6 ; mm2/mm3 ready
+
+  jnz near .rounding1
+
+  COPY_HV_SSE_RND0
+  add ecx, edx
+  COPY_HV_SSE_RND0
+  ret
+
+.rounding1:
+  COPY_HV_SSE_RND1
+  add ecx, edx
+  COPY_HV_SSE_RND1
+  ret
+ENDFUNC
 
 ;===========================================================================
 ;
@@ -407,6 +551,7 @@
   lea ecx,[ecx+2*edx]
   ADD_FF 0, edx
   EPILOG
+ENDFUNC
 
 ;===========================================================================
 ;
@@ -468,7 +613,7 @@
   ADD_FH_RND0 0, edx
   EPILOG
 
-.Loop1
+.Loop1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  ; movq mm7, [mmx_one]
   ADD_FH_RND1 0, edx
@@ -482,6 +627,7 @@
   lea ecx,[ecx+2*edx]
   ADD_FH_RND1 0, edx
   EPILOG
+ENDFUNC
 
 
 ;===========================================================================
@@ -542,7 +688,7 @@
   ADD_8_HF_RND0
   EPILOG
 
-.Loop1
+.Loop1:
   movq mm0, [eax] ; loop invariant
   movq mm7, [mmx_one]
 
@@ -557,6 +703,7 @@
   lea ecx,[ecx+2*edx]
   ADD_8_HF_RND1
   EPILOG
+ENDFUNC
 
 ; The trick is to correct the result of 'pavgb' with some combination of the
 ; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
@@ -686,7 +833,7 @@
   ADD_HH_RND0
   EPILOG
 
-.Loop1
+.Loop1:
   ADD_HH_RND1
   add ecx, edx
   ADD_HH_RND1
@@ -696,3 +843,10 @@
   ADD_HH_RND1
 
   EPILOG
+ENDFUNC
+
+
+%ifidn __OUTPUT_FORMAT__,elf
+section ".note.GNU-stack" noalloc noexec nowrite progbits
+%endif
+
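
For reference, the rounding==1 paths in this patch rely on the identity (i+j)/2 = (i+j+1)/2 - ((i^j)&1): pavgb produces the rounded-up average, and subtracting the low bit of i^j turns it into the truncating average, which is what the mmx_one mask loaded into mm7 is used for. A minimal scalar sketch in C, checking the identity exhaustively for byte inputs (the helper name is illustrative, not part of the patch):

#include <assert.h>
#include <stdio.h>

/* pavgb on one byte: the rounded-up average (a+b+1)/2 */
static unsigned avg_up(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

int main(void)
{
    for (unsigned i = 0; i < 256; i++)
        for (unsigned j = 0; j < 256; j++) {
            unsigned want = (i + j) >> 1;                 /* truncating average (rounding==1 case) */
            unsigned got  = avg_up(i, j) - ((i ^ j) & 1); /* pavgb corrected by the xor's low bit */
            assert(got == want);
        }
    puts("2-input identity holds for all byte pairs");
    return 0;
}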
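
The comment block added with interpolate8x4_halfpel_hv_xmm states the analogous four-input relations, with s=(i+j+1)/2, t=(k+l+1)/2, ij=i^j, kl=k^l and st=s^t, where only the low bit of each correction term matters. A companion sketch, again illustrative rather than part of the patch, brute-forces all four rounding variants over a reduced input range:

#include <assert.h>
#include <stdio.h>

/* pavgb on one byte: the rounded-up average (a+b+1)/2 */
static unsigned avg_up(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

int main(void)
{
    /* 0..63 keeps the brute force quick; the relations hold for any non-negative values */
    for (unsigned i = 0; i < 64; i++)
        for (unsigned j = 0; j < 64; j++)
            for (unsigned k = 0; k < 64; k++)
                for (unsigned l = 0; l < 64; l++) {
                    unsigned s = avg_up(i, j), t = avg_up(k, l);
                    unsigned ij = i ^ j, kl = k ^ l, st = s ^ t;
                    unsigned base = avg_up(s, t); /* (s+t+1)/2, what the cascaded pavgb gives */
                    assert((i + j + k + l + 3) / 4 == base - ( (ij & kl) & st  & 1));
                    assert((i + j + k + l + 2) / 4 == base - ( (ij | kl) & st  & 1));
                    assert((i + j + k + l + 1) / 4 == base - (((ij & kl) | st) & 1));
                    assert((i + j + k + l + 0) / 4 == base - (((ij | kl) | st) & 1));
                }
    puts("4-input pavgb correction identities hold");
    return 0;
}

Because each correction is just the low bit of a few AND/OR/XOR combinations, it can be applied per byte with a handful of logical operations and a masked subtract, which is what keeps the rounding-aware paths cheap in the MMX/SSE code.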