--- qpel.h	2003/08/22 15:52:35	1.1.4.1
+++ qpel.h	2004/10/12 21:08:41	1.4
@@ -1,35 +1,32 @@
 /*****************************************************************************
-*
-*  XVID MPEG-4 VIDEO CODEC
-*  - QPel interpolation -
-*
-*  This program is free software ; you can redistribute it and/or modify
-*  it under the terms of the GNU General Public License as published by
-*  the Free Software Foundation ; either version 2 of the License, or
-*  (at your option) any later version.
-*
-*  This program is distributed in the hope that it will be useful,
-*  but WITHOUT ANY WARRANTY ; without even the implied warranty of
-*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-*  GNU General Public License for more details.
-*
-*  You should have received a copy of the GNU General Public License
-*  along with this program ; if not, write to the Free Software
-*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*
-*****************************************************************************/
-
-/**************************************************************************
 *
- *	History:
+ *  XVID MPEG-4 VIDEO CODEC
+ *  - QPel interpolation -
+ *
+ *  Copyright(C) 2003 Pascal Massimino
+ *
+ *  This program is free software ; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation ; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program ; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
- *	22.10.2002	initial coding - Skal -
+ *  $Id: qpel.h,v 1.4 2004/10/12 21:08:41 edgomez Exp $
 *
- *************************************************************************/
+ ****************************************************************************/
 
 #ifndef _XVID_QPEL_H_
 #define _XVID_QPEL_H_
 
+#include "interpolate8x8.h"
 #include "../utils/mem_transfer.h"
 
 /*****************************************************************************
@@ -41,44 +38,48 @@
 
 typedef XVID_QP_PASS_SIGNATURE(XVID_QP_PASS);
 
-	// We put everything in a single struct so it can easily be passed
-	// to prediction functions as a whole...
+/* We put everything in a single struct so it can easily be passed
+ * to prediction functions as a whole... */
 
-struct XVID_QP_FUNCS {
+typedef struct _XVID_QP_FUNCS {
 
-		// filter for QPel 16x? prediction
+	/* filter for QPel 16x? prediction */
 
-	XVID_QP_PASS *H_Pass;
-	XVID_QP_PASS *H_Pass_Avrg;
-	XVID_QP_PASS *H_Pass_Avrg_Up;
-	XVID_QP_PASS *V_Pass;
-	XVID_QP_PASS *V_Pass_Avrg;
-	XVID_QP_PASS *V_Pass_Avrg_Up;
-
-		// filter for QPel 8x? prediction
-
-	XVID_QP_PASS *H_Pass_8;
-	XVID_QP_PASS *H_Pass_Avrg_8;
-	XVID_QP_PASS *H_Pass_Avrg_Up_8;
-	XVID_QP_PASS *V_Pass_8;
-	XVID_QP_PASS *V_Pass_Avrg_8;
-	XVID_QP_PASS *V_Pass_Avrg_Up_8;
-};
-typedef struct XVID_QP_FUNCS XVID_QP_FUNCS;
+	XVID_QP_PASS *H_Pass;
+	XVID_QP_PASS *H_Pass_Avrg;
+	XVID_QP_PASS *H_Pass_Avrg_Up;
+	XVID_QP_PASS *V_Pass;
+	XVID_QP_PASS *V_Pass_Avrg;
+	XVID_QP_PASS *V_Pass_Avrg_Up;
+
+	/* filter for QPel 8x? prediction */
+
+	XVID_QP_PASS *H_Pass_8;
+	XVID_QP_PASS *H_Pass_Avrg_8;
+	XVID_QP_PASS *H_Pass_Avrg_Up_8;
+	XVID_QP_PASS *V_Pass_8;
+	XVID_QP_PASS *V_Pass_Avrg_8;
+	XVID_QP_PASS *V_Pass_Avrg_Up_8;
+} XVID_QP_FUNCS;
 
 /*****************************************************************************
  * fwd dcl
  ****************************************************************************/
 
+extern void xvid_Init_QP();
+
+extern XVID_QP_FUNCS xvid_QP_Funcs_C_ref;	/* for P-frames */
+extern XVID_QP_FUNCS xvid_QP_Add_Funcs_C_ref;	/* for B-frames */
 
-extern XVID_QP_FUNCS xvid_QP_Funcs_C;	// for P-frames
-extern XVID_QP_FUNCS xvid_QP_Add_Funcs_C;	// for B-frames
+extern XVID_QP_FUNCS xvid_QP_Funcs_C;	/* for P-frames */
+extern XVID_QP_FUNCS xvid_QP_Add_Funcs_C;	/* for B-frames */
 
+#ifdef ARCH_IS_IA32
 extern XVID_QP_FUNCS xvid_QP_Funcs_mmx;
 extern XVID_QP_FUNCS xvid_QP_Add_Funcs_mmx;
-extern void xvid_Init_QP_mmx();	// should be called at mmx initialization
+#endif
 
-extern XVID_QP_FUNCS *xvid_QP_Funcs;	// <- main pointer for enc/dec structure
-extern XVID_QP_FUNCS *xvid_QP_Add_Funcs;	// <- main pointer for enc/dec structure
+extern XVID_QP_FUNCS *xvid_QP_Funcs;	/* <- main pointer for enc/dec structure */
+extern XVID_QP_FUNCS *xvid_QP_Add_Funcs;	/* <- main pointer for enc/dec structure */
 
 /*****************************************************************************
  * macros
@@ -100,16 +101,16 @@
  ****************************************************************************/
 
-static __inline void new_interpolate16x16_quarterpel(
-	uint8_t * const cur,
-	uint8_t * const refn,
-	uint8_t * const refh,
-	uint8_t * const refv,
-	uint8_t * const refhv,
-	const uint32_t x, const uint32_t y,
-	const int32_t dx, const int dy,
-	const uint32_t stride,
-	const uint32_t rounding)
+static void __inline
+interpolate16x16_quarterpel(uint8_t * const cur,
+                            uint8_t * const refn,
+                            uint8_t * const refh,
+                            uint8_t * const refv,
+                            uint8_t * const refhv,
+                            const uint32_t x, const uint32_t y,
+                            const int32_t dx, const int dy,
+                            const uint32_t stride,
+                            const uint32_t rounding)
 {
 	const uint8_t *src;
 	uint8_t *dst;
@@ -119,199 +120,205 @@
 
 	int32_t x_int, y_int;
 
-	const int32_t xRef = x*4 + dx;
-	const int32_t yRef = y*4 + dy;
-
-	Ops = xvid_QP_Funcs; // TODO: pass as argument
-	quads = (dx&3) | ((dy&3)<<2);
-
-	x_int = xRef/4;
-	if (xRef < 0 && xRef % 4)
-		x_int--;
-
-	y_int = yRef/4;
-	if (yRef < 0 && yRef % 4)
-		y_int--;
-
-	dst = cur + y * stride + x;
-	src = refn + y_int * stride + x_int;
-
-	tmp = refh; // we need at least a 16 x stride scratch block
-
-	switch(quads) {
-	case 0:
-		transfer8x8_copy( dst, src, stride);
-		transfer8x8_copy( dst+8, src+8, stride);
-		transfer8x8_copy( dst+8*stride, src+8*stride, stride);
-		transfer8x8_copy( dst+8*stride+8, src+8*stride+8, stride);
-		break;
-	case 1:
-		Ops->H_Pass_Avrg(dst, src, 16, stride, rounding);
-		break;
-	case 2:
-		Ops->H_Pass(dst, src, 16, stride, rounding);
-		break;
-	case 3:
-		Ops->H_Pass_Avrg_Up(dst, src, 16, stride, rounding);
-		break;
-	case 4:
-		Ops->V_Pass_Avrg(dst, src, 16, stride, rounding);
-		break;
-	case 5:
-		Ops->H_Pass_Avrg(tmp, src, 17, stride, rounding);
-		Ops->V_Pass_Avrg(dst, tmp, 16, stride, rounding);
-		break;
-	case 6:
-		Ops->H_Pass(tmp, src, 17, stride, rounding);
-		Ops->V_Pass_Avrg(dst, tmp, 16, stride, rounding);
-		break;
-	case 7:
-		Ops->H_Pass_Avrg_Up(tmp, src, 17, stride, rounding);
-		Ops->V_Pass_Avrg(dst, tmp, 16, stride, rounding);
-		break;
-	case 8:
-		Ops->V_Pass(dst, src, 16, stride, rounding);
-		break;
-	case 9:
-		Ops->H_Pass_Avrg(tmp, src, 17, stride, rounding);
-		Ops->V_Pass(dst, tmp, 16, stride, rounding);
-		break;
-	case 10:
-		Ops->H_Pass(tmp, src, 17, stride, rounding);
-		Ops->V_Pass(dst, tmp, 16, stride, rounding);
-		break;
-	case 11:
-		Ops->H_Pass_Avrg_Up(tmp, src, 17, stride, rounding);
-		Ops->V_Pass(dst, tmp, 16, stride, rounding);
-		break;
-	case 12:
-		Ops->V_Pass_Avrg_Up(dst, src, 16, stride, rounding);
-		break;
-	case 13:
-		Ops->H_Pass_Avrg(tmp, src, 17, stride, rounding);
-		Ops->V_Pass_Avrg_Up(dst, tmp, 16, stride, rounding);
-		break;
-	case 14:
-		Ops->H_Pass(tmp, src, 17, stride, rounding);
-		Ops->V_Pass_Avrg_Up( dst, tmp, 16, stride, rounding);
-		break;
-	case 15:
-		Ops->H_Pass_Avrg_Up(tmp, src, 17, stride, rounding);
-		Ops->V_Pass_Avrg_Up(dst, tmp, 16, stride, rounding);
-		break;
-	}
+	const int32_t xRef = (int)x*4 + dx;
+	const int32_t yRef = (int)y*4 + dy;
+
+	Ops = xvid_QP_Funcs;
+	quads = (dx&3) | ((dy&3)<<2);
+
+	x_int = xRef/4;
+	if (xRef < 0 && xRef % 4)
+		x_int--;
+
+	y_int = yRef/4;
+	if (yRef < 0 && yRef % 4)
+		y_int--;
+
+	dst = cur + y * stride + x;
+	src = refn + y_int * (int)stride + x_int;
+
+	tmp = refh; /* we need at least a 16 x stride scratch block */
+
+	switch(quads) {
+	case 0:
+		transfer8x8_copy(dst, src, stride);
+		transfer8x8_copy(dst+8, src+8, stride);
+		transfer8x8_copy(dst+8*stride, src+8*stride, stride);
+		transfer8x8_copy(dst+8*stride+8, src+8*stride+8, stride);
+		break;
+	case 1:
+		Ops->H_Pass_Avrg(dst, src, 16, stride, rounding);
+		break;
+	case 2:
+		Ops->H_Pass(dst, src, 16, stride, rounding);
+		break;
+	case 3:
+		Ops->H_Pass_Avrg_Up(dst, src, 16, stride, rounding);
+		break;
+	case 4:
+		Ops->V_Pass_Avrg(dst, src, 16, stride, rounding);
+		break;
+	case 5:
+		Ops->H_Pass_Avrg(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg(dst, tmp, 16, stride, rounding);
+		break;
+	case 6:
+		Ops->H_Pass(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg(dst, tmp, 16, stride, rounding);
+		break;
+	case 7:
+		Ops->H_Pass_Avrg_Up(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg(dst, tmp, 16, stride, rounding);
+		break;
+	case 8:
+		Ops->V_Pass(dst, src, 16, stride, rounding);
+		break;
+	case 9:
+		Ops->H_Pass_Avrg(tmp, src, 17, stride, rounding);
+		Ops->V_Pass(dst, tmp, 16, stride, rounding);
+		break;
+	case 10:
+		Ops->H_Pass(tmp, src, 17, stride, rounding);
+		Ops->V_Pass(dst, tmp, 16, stride, rounding);
+		break;
+	case 11:
+		Ops->H_Pass_Avrg_Up(tmp, src, 17, stride, rounding);
+		Ops->V_Pass(dst, tmp, 16, stride, rounding);
+		break;
+	case 12:
+		Ops->V_Pass_Avrg_Up(dst, src, 16, stride, rounding);
+		break;
+	case 13:
+		Ops->H_Pass_Avrg(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg_Up(dst, tmp, 16, stride, rounding);
+		break;
+	case 14:
+		Ops->H_Pass(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg_Up( dst, tmp, 16, stride, rounding);
+		break;
+	case 15:
+		Ops->H_Pass_Avrg_Up(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg_Up(dst, tmp, 16, stride, rounding);
+		break;
+	}
 }
 
-static __inline void new_interpolate16x8_quarterpel(
-	uint8_t * const cur,
-	uint8_t * const refn,
-	uint8_t * const refh,
-	uint8_t * const refv,
-	uint8_t * const refhv,
-	const uint32_t x, const uint32_t y,
-	const int32_t dx, const int dy,
-	const uint32_t stride,
-	const uint32_t rounding)
+static void __inline
+interpolate16x16_add_quarterpel(uint8_t * const cur,
+                                uint8_t * const refn,
+                                uint8_t * const refh,
+                                uint8_t * const refv,
+                                uint8_t * const refhv,
+                                const uint32_t x, const uint32_t y,
+                                const int32_t dx, const int dy,
+                                const uint32_t stride,
+                                const uint32_t rounding)
 {
 	const uint8_t *src;
 	uint8_t *dst;
 	uint8_t *tmp;
 	int32_t quads;
 	const XVID_QP_FUNCS *Ops;
+	const XVID_QP_FUNCS *Ops_Copy;
 
 	int32_t x_int, y_int;
 
-	const int32_t xRef = x*4 + dx;
-	const int32_t yRef = y*4 + dy;
-
-	Ops = xvid_QP_Funcs; // TODO: pass as argument
-	quads = (dx&3) | ((dy&3)<<2);
-
-	x_int = xRef/4;
-	if (xRef < 0 && xRef % 4)
-		x_int--;
-
-	y_int = yRef/4;
-	if (yRef < 0 && yRef % 4)
-		y_int--;
-
-	dst = cur + y * stride + x;
-	src = refn + y_int * stride + x_int;
-
-	tmp = refh; // we need at least a 16 x stride scratch block
-
-	switch(quads) {
-	case 0:
-		transfer8x8_copy( dst, src, stride);
-		transfer8x8_copy( dst+8, src+8, stride);
-		break;
-	case 1:
-		Ops->H_Pass_Avrg(dst, src, 8, stride, rounding);
-		break;
-	case 2:
-		Ops->H_Pass(dst, src, 8, stride, rounding);
-		break;
-	case 3:
-		Ops->H_Pass_Avrg_Up(dst, src, 8, stride, rounding);
-		break;
-	case 4:
-		Ops->V_Pass_Avrg_8(dst, src, 16, stride, rounding);
-		break;
-	case 5:
-		Ops->H_Pass_Avrg(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_8(dst, tmp, 16, stride, rounding);
-		break;
-	case 6:
-		Ops->H_Pass(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_8(dst, tmp, 16, stride, rounding);
-		break;
-	case 7:
-		Ops->H_Pass_Avrg_Up(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_8(dst, tmp, 16, stride, rounding);
-		break;
-	case 8:
-		Ops->V_Pass_8(dst, src, 16, stride, rounding);
-		break;
-	case 9:
-		Ops->H_Pass_Avrg(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_8(dst, tmp, 16, stride, rounding);
-		break;
-	case 10:
-		Ops->H_Pass(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_8(dst, tmp, 16, stride, rounding);
-		break;
-	case 11:
-		Ops->H_Pass_Avrg_Up(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_8(dst, tmp, 16, stride, rounding);
-		break;
-	case 12:
-		Ops->V_Pass_Avrg_Up_8(dst, src, 16, stride, rounding);
-		break;
-	case 13:
-		Ops->H_Pass_Avrg(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_Up_8(dst, tmp, 16, stride, rounding);
-		break;
-	case 14:
-		Ops->H_Pass(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_Up_8( dst, tmp, 16, stride, rounding);
-		break;
-	case 15:
-		Ops->H_Pass_Avrg_Up(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_Up_8(dst, tmp, 16, stride, rounding);
-		break;
-	}
+	const int32_t xRef = (int)x*4 + dx;
+	const int32_t yRef = (int)y*4 + dy;
+
+	Ops = xvid_QP_Add_Funcs;
+	Ops_Copy = xvid_QP_Funcs;
+	quads = (dx&3) | ((dy&3)<<2);
+
+	x_int = xRef/4;
+	if (xRef < 0 && xRef % 4)
+		x_int--;
+
+	y_int = yRef/4;
+	if (yRef < 0 && yRef % 4)
+		y_int--;
+
+	dst = cur + y * stride + x;
+	src = refn + y_int * (int)stride + x_int;
+
+	tmp = refh; /* we need at least a 16 x stride scratch block */
+
+	switch(quads) {
+	case 0:
+		/* NB: there is no halfpel involved ! the name's function can be
+		 * misleading */
+		interpolate8x8_halfpel_add(dst, src, stride, rounding);
+		interpolate8x8_halfpel_add(dst+8, src+8, stride, rounding);
+		interpolate8x8_halfpel_add(dst+8*stride, src+8*stride, stride, rounding);
+		interpolate8x8_halfpel_add(dst+8*stride+8, src+8*stride+8, stride, rounding);
+		break;
+	case 1:
+		Ops->H_Pass_Avrg(dst, src, 16, stride, rounding);
+		break;
+	case 2:
+		Ops->H_Pass(dst, src, 16, stride, rounding);
+		break;
+	case 3:
+		Ops->H_Pass_Avrg_Up(dst, src, 16, stride, rounding);
+		break;
+	case 4:
+		Ops->V_Pass_Avrg(dst, src, 16, stride, rounding);
+		break;
+	case 5:
+		Ops_Copy->H_Pass_Avrg(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg(dst, tmp, 16, stride, rounding);
+		break;
+	case 6:
+		Ops_Copy->H_Pass(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg(dst, tmp, 16, stride, rounding);
+		break;
+	case 7:
+		Ops_Copy->H_Pass_Avrg_Up(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg(dst, tmp, 16, stride, rounding);
+		break;
+	case 8:
+		Ops->V_Pass(dst, src, 16, stride, rounding);
+		break;
+	case 9:
+		Ops_Copy->H_Pass_Avrg(tmp, src, 17, stride, rounding);
+		Ops->V_Pass(dst, tmp, 16, stride, rounding);
+		break;
+	case 10:
+		Ops_Copy->H_Pass(tmp, src, 17, stride, rounding);
+		Ops->V_Pass(dst, tmp, 16, stride, rounding);
+		break;
+	case 11:
+		Ops_Copy->H_Pass_Avrg_Up(tmp, src, 17, stride, rounding);
+		Ops->V_Pass(dst, tmp, 16, stride, rounding);
+		break;
+	case 12:
+		Ops->V_Pass_Avrg_Up(dst, src, 16, stride, rounding);
+		break;
+	case 13:
+		Ops_Copy->H_Pass_Avrg(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg_Up(dst, tmp, 16, stride, rounding);
+		break;
+	case 14:
+		Ops_Copy->H_Pass(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg_Up( dst, tmp, 16, stride, rounding);
+		break;
+	case 15:
+		Ops_Copy->H_Pass_Avrg_Up(tmp, src, 17, stride, rounding);
+		Ops->V_Pass_Avrg_Up(dst, tmp, 16, stride, rounding);
+		break;
+	}
 }
 
-static __inline void new_interpolate8x8_quarterpel(
-	uint8_t * const cur,
-	uint8_t * const refn,
-	uint8_t * const refh,
-	uint8_t * const refv,
-	uint8_t * const refhv,
-	const uint32_t x, const uint32_t y,
-	const int32_t dx, const int dy,
-	const uint32_t stride,
-	const uint32_t rounding)
+static void __inline
+interpolate16x8_quarterpel(uint8_t * const cur,
+                           uint8_t * const refn,
+                           uint8_t * const refh,
+                           uint8_t * const refv,
+                           uint8_t * const refhv,
+                           const uint32_t x, const uint32_t y,
+                           const int32_t dx, const int dy,
+                           const uint32_t stride,
+                           const uint32_t rounding)
 {
 	const uint8_t *src;
 	uint8_t *dst;
@@ -321,85 +328,287 @@
 
 	int32_t x_int, y_int;
 
-	const int32_t xRef = x*4 + dx;
-	const int32_t yRef = y*4 + dy;
-
-	Ops = xvid_QP_Funcs; // TODO: pass as argument
-	quads = (dx&3) | ((dy&3)<<2);
-
-	x_int = xRef/4;
-	if (xRef < 0 && xRef % 4)
-		x_int--;
-
-	y_int = yRef/4;
-	if (yRef < 0 && yRef % 4)
-		y_int--;
-
-	dst = cur + y * stride + x;
-	src = refn + y_int * stride + x_int;
-
-	tmp = refh; // we need at least a 16 x stride scratch block
-
-	switch(quads) {
-	case 0:
-		transfer8x8_copy( dst, src, stride);
-		break;
-	case 1:
-		Ops->H_Pass_Avrg_8(dst, src, 8, stride, rounding);
-		break;
-	case 2:
-		Ops->H_Pass_8(dst, src, 8, stride, rounding);
-		break;
-	case 3:
-		Ops->H_Pass_Avrg_Up_8(dst, src, 8, stride, rounding);
-		break;
-	case 4:
-		Ops->V_Pass_Avrg_8(dst, src, 8, stride, rounding);
-		break;
-	case 5:
-		Ops->H_Pass_Avrg_8(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_8(dst, tmp, 8, stride, rounding);
-		break;
-	case 6:
-		Ops->H_Pass_8(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_8(dst, tmp, 8, stride, rounding);
-		break;
-	case 7:
-		Ops->H_Pass_Avrg_Up_8(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_8(dst, tmp, 8, stride, rounding);
-		break;
-	case 8:
-		Ops->V_Pass_8(dst, src, 8, stride, rounding);
-		break;
-	case 9:
-		Ops->H_Pass_Avrg_8(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_8(dst, tmp, 8, stride, rounding);
-		break;
-	case 10:
-		Ops->H_Pass_8(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_8(dst, tmp, 8, stride, rounding);
-		break;
-	case 11:
-		Ops->H_Pass_Avrg_Up_8(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_8(dst, tmp, 8, stride, rounding);
-		break;
-	case 12:
-		Ops->V_Pass_Avrg_Up_8(dst, src, 8, stride, rounding);
-		break;
-	case 13:
-		Ops->H_Pass_Avrg_8(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_Up_8(dst, tmp, 8, stride, rounding);
-		break;
-	case 14:
-		Ops->H_Pass_8(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_Up_8( dst, tmp, 8, stride, rounding);
-		break;
-	case 15:
-		Ops->H_Pass_Avrg_Up_8(tmp, src, 9, stride, rounding);
-		Ops->V_Pass_Avrg_Up_8(dst, tmp, 8, stride, rounding);
-		break;
-	}
+	const int32_t xRef = (int)x*4 + dx;
+	const int32_t yRef = (int)y*4 + dy;
+
+	Ops = xvid_QP_Funcs;
+	quads = (dx&3) | ((dy&3)<<2);
+
+	x_int = xRef/4;
+	if (xRef < 0 && xRef % 4)
+		x_int--;
+
+	y_int = yRef/4;
+	if (yRef < 0 && yRef % 4)
+		y_int--;
+
+	dst = cur + y * stride + x;
+	src = refn + y_int * (int)stride + x_int;
+
+	tmp = refh; /* we need at least a 16 x stride scratch block */
+
+	switch(quads) {
+	case 0:
+		transfer8x8_copy( dst, src, stride);
+		transfer8x8_copy( dst+8, src+8, stride);
+		break;
+	case 1:
+		Ops->H_Pass_Avrg(dst, src, 8, stride, rounding);
+		break;
+	case 2:
+		Ops->H_Pass(dst, src, 8, stride, rounding);
+		break;
+	case 3:
+		Ops->H_Pass_Avrg_Up(dst, src, 8, stride, rounding);
+		break;
+	case 4:
+		Ops->V_Pass_Avrg_8(dst, src, 16, stride, rounding);
+		break;
+	case 5:
+		Ops->H_Pass_Avrg(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_8(dst, tmp, 16, stride, rounding);
+		break;
+	case 6:
+		Ops->H_Pass(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_8(dst, tmp, 16, stride, rounding);
+		break;
+	case 7:
+		Ops->H_Pass_Avrg_Up(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_8(dst, tmp, 16, stride, rounding);
+		break;
+	case 8:
+		Ops->V_Pass_8(dst, src, 16, stride, rounding);
+		break;
+	case 9:
+		Ops->H_Pass_Avrg(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_8(dst, tmp, 16, stride, rounding);
+		break;
+	case 10:
+		Ops->H_Pass(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_8(dst, tmp, 16, stride, rounding);
+		break;
+	case 11:
+		Ops->H_Pass_Avrg_Up(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_8(dst, tmp, 16, stride, rounding);
+		break;
+	case 12:
+		Ops->V_Pass_Avrg_Up_8(dst, src, 16, stride, rounding);
+		break;
+	case 13:
+		Ops->H_Pass_Avrg(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_Up_8(dst, tmp, 16, stride, rounding);
+		break;
+	case 14:
+		Ops->H_Pass(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_Up_8( dst, tmp, 16, stride, rounding);
+		break;
+	case 15:
+		Ops->H_Pass_Avrg_Up(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_Up_8(dst, tmp, 16, stride, rounding);
+		break;
+	}
+}
+
+static void __inline
+interpolate8x8_quarterpel(uint8_t * const cur,
+                          uint8_t * const refn,
+                          uint8_t * const refh,
+                          uint8_t * const refv,
+                          uint8_t * const refhv,
+                          const uint32_t x, const uint32_t y,
+                          const int32_t dx, const int dy,
+                          const uint32_t stride,
+                          const uint32_t rounding)
+{
+	const uint8_t *src;
+	uint8_t *dst;
+	uint8_t *tmp;
+	int32_t quads;
+	const XVID_QP_FUNCS *Ops;
+
+	int32_t x_int, y_int;
+
+	const int32_t xRef = (int)x*4 + dx;
+	const int32_t yRef = (int)y*4 + dy;
+
+	Ops = xvid_QP_Funcs;
+	quads = (dx&3) | ((dy&3)<<2);
+
+	x_int = xRef/4;
+	if (xRef < 0 && xRef % 4)
+		x_int--;
+
+	y_int = yRef/4;
+	if (yRef < 0 && yRef % 4)
+		y_int--;
+
+	dst = cur + y * stride + x;
+	src = refn + y_int * (int)stride + x_int;
+
+	tmp = refh; /* we need at least a 16 x stride scratch block */
+
+	switch(quads) {
+	case 0:
+		transfer8x8_copy( dst, src, stride);
+		break;
+	case 1:
+		Ops->H_Pass_Avrg_8(dst, src, 8, stride, rounding);
+		break;
+	case 2:
+		Ops->H_Pass_8(dst, src, 8, stride, rounding);
+		break;
+	case 3:
+		Ops->H_Pass_Avrg_Up_8(dst, src, 8, stride, rounding);
+		break;
+	case 4:
+		Ops->V_Pass_Avrg_8(dst, src, 8, stride, rounding);
+		break;
+	case 5:
+		Ops->H_Pass_Avrg_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 6:
+		Ops->H_Pass_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 7:
+		Ops->H_Pass_Avrg_Up_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 8:
+		Ops->V_Pass_8(dst, src, 8, stride, rounding);
+		break;
+	case 9:
+		Ops->H_Pass_Avrg_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 10:
+		Ops->H_Pass_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 11:
+		Ops->H_Pass_Avrg_Up_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 12:
+		Ops->V_Pass_Avrg_Up_8(dst, src, 8, stride, rounding);
+		break;
+	case 13:
+		Ops->H_Pass_Avrg_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_Up_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 14:
+		Ops->H_Pass_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_Up_8( dst, tmp, 8, stride, rounding);
+		break;
+	case 15:
+		Ops->H_Pass_Avrg_Up_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_Up_8(dst, tmp, 8, stride, rounding);
+		break;
+	}
+}
+
+static void __inline
+interpolate8x8_add_quarterpel(uint8_t * const cur,
+                              uint8_t * const refn,
+                              uint8_t * const refh,
+                              uint8_t * const refv,
+                              uint8_t * const refhv,
+                              const uint32_t x, const uint32_t y,
+                              const int32_t dx, const int dy,
+                              const uint32_t stride,
+                              const uint32_t rounding)
+{
+	const uint8_t *src;
+	uint8_t *dst;
+	uint8_t *tmp;
+	int32_t quads;
+	const XVID_QP_FUNCS *Ops;
+	const XVID_QP_FUNCS *Ops_Copy;
+
+	int32_t x_int, y_int;
+
+	const int32_t xRef = (int)x*4 + dx;
+	const int32_t yRef = (int)y*4 + dy;
+
+	Ops = xvid_QP_Add_Funcs;
+	Ops_Copy = xvid_QP_Funcs;
+	quads = (dx&3) | ((dy&3)<<2);
+
+	x_int = xRef/4;
+	if (xRef < 0 && xRef % 4)
+		x_int--;
+
+	y_int = yRef/4;
+	if (yRef < 0 && yRef % 4)
+		y_int--;
+
+	dst = cur + y * stride + x;
+	src = refn + y_int * (int)stride + x_int;
+
+	tmp = refh; /* we need at least a 16 x stride scratch block */
+
+	switch(quads) {
+	case 0:
+		/* Misleading function name, there is no halfpel involved
+		 * just dst and src averaging with rounding=0 */
+		interpolate8x8_halfpel_add(dst, src, stride, rounding);
+		break;
+	case 1:
+		Ops->H_Pass_Avrg_8(dst, src, 8, stride, rounding);
+		break;
+	case 2:
+		Ops->H_Pass_8(dst, src, 8, stride, rounding);
+		break;
+	case 3:
+		Ops->H_Pass_Avrg_Up_8(dst, src, 8, stride, rounding);
+		break;
+	case 4:
+		Ops->V_Pass_Avrg_8(dst, src, 8, stride, rounding);
+		break;
+	case 5:
+		Ops_Copy->H_Pass_Avrg_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 6:
+		Ops_Copy->H_Pass_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 7:
+		Ops_Copy->H_Pass_Avrg_Up_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 8:
+		Ops->V_Pass_8(dst, src, 8, stride, rounding);
+		break;
+	case 9:
+		Ops_Copy->H_Pass_Avrg_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 10:
+		Ops_Copy->H_Pass_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 11:
+		Ops_Copy->H_Pass_Avrg_Up_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 12:
+		Ops->V_Pass_Avrg_Up_8(dst, src, 8, stride, rounding);
+		break;
+	case 13:
+		Ops_Copy->H_Pass_Avrg_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_Up_8(dst, tmp, 8, stride, rounding);
+		break;
+	case 14:
+		Ops_Copy->H_Pass_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_Up_8( dst, tmp, 8, stride, rounding);
+		break;
+	case 15:
+		Ops_Copy->H_Pass_Avrg_Up_8(tmp, src, 9, stride, rounding);
+		Ops->V_Pass_Avrg_Up_8(dst, tmp, 8, stride, rounding);
+		break;
+	}
 }
 
-/*****************************************************************************/
 
 #endif /* _XVID_QPEL_H_ */
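
Note on the dispatch in the functions above: every interpolate*_quarterpel variant encodes the
quarter-pel phase as quads = (dx&3) | ((dy&3)<<2), i.e. the horizontal sub-pel offset in bits 0-1
and the vertical one in bits 2-3. Case 0 is therefore a plain copy (or an add/average in the _add
variants), cases 1-3 need only a horizontal pass, cases 4/8/12 only a vertical pass, and every
other case runs a horizontal pass over 17 (resp. 9) rows into the scratch block (refh) followed by
a vertical pass over the 16 (resp. 8) output rows. A minimal standalone sketch of that encoding,
for illustration only — qpel_pass_kind() is a hypothetical helper, not part of this patch or of
the XviD API:

	#include <stdint.h>
	#include <stdio.h>

	/* Illustration only: mirrors the `quads = (dx&3) | ((dy&3)<<2)` encoding
	 * used by the interpolate*_quarterpel() dispatchers above.
	 * qpel_pass_kind() is a hypothetical helper, not XviD code. */
	static const char *
	qpel_pass_kind(int32_t dx, int32_t dy)
	{
		const int32_t quads = (dx & 3) | ((dy & 3) << 2);

		if (quads == 0)
			return "integer-pel: plain copy (or add/average in the _add variants)";
		if ((quads & 0x0C) == 0)
			return "horizontal pass only";
		if ((quads & 0x03) == 0)
			return "vertical pass only";
		return "horizontal pass into scratch block, then vertical pass";
	}

	int main(void)
	{
		/* dx = -3, dy = 5 in quarter-pel units -> quads = 1 | (1<<2) = 5 */
		printf("%s\n", qpel_pass_kind(-3, 5));
		return 0;
	}

The same encoding explains why the _add variants need two function tables: the first (horizontal)
pass into the scratch block still uses the plain xvid_QP_Funcs, and only the final (vertical) pass,
which writes the prediction into dst, goes through xvid_QP_Add_Funcs.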