--- gmc.c	2003/09/30 18:20:31	1.1.2.5
+++ gmc.c	2008/11/27 16:31:48	1.9
@@ -19,7 +19,7 @@
  *  along with this program ; if not, write to the Free Software
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  *
- * $Id: gmc.c,v 1.1.2.5 2003/09/30 18:20:31 edgomez Exp $
+ * $Id: gmc.c,v 1.9 2008/11/27 16:31:48 Isibaar Exp $
  *
  ****************************************************************************/
 
@@ -27,9 +27,62 @@
 #include "../global.h"
 #include "../encoder.h"
 #include "gmc.h"
+#include "../utils/emms.h"
 
 #include <stdio.h>
 
+  /* Bound by init_GMC() to the C or MMX predictors (null until then); used for the 2/3 warp-point case */
+static
+void (*Predict_16x16_func)(const NEW_GMC_DATA * const This,
+                           uint8_t *dst, const uint8_t *src,
+                           int dststride, int srcstride, int x, int y, int rounding) = 0;
+static
+void (*Predict_8x8_func)(const NEW_GMC_DATA * const This,
+                         uint8_t *uDst, const uint8_t *uSrc,
+                         uint8_t *vDst, const uint8_t *vSrc,
+                         int dststride, int srcstride, int x, int y, int rounding) = 0;
+
+/****************************************************************************/
+/* log2bin(v): number of bits needed to hold v; borrowed from bitstream.c until we find a common solution */
+static uint32_t __inline
+log2bin(uint32_t value)
+{
+/* Changed by Chenm001. MSVC offers inline asm on 32-bit x86 only, so x64 builds take the C loop. */
+#if !defined(_MSC_VER) || defined(ARCH_IS_X86_64)
+  int n = 0;
+
+  while (value) {
+	value >>= 1;
+	n++;
+  }
+  return n;
+#else  /* 32-bit MSVC: result is left in eax as the implicit return value; bsr is undefined for value==0 */
+  __asm {
+	bsr eax, value
+	inc eax
+  }
+#endif
+}
+
+/* 16*sizeof(int) -> 1 or 2 cachelines */
+/* table lookup might be faster!  (still to be benchmarked) */
+
+/*
+static int log2bin_table[16] =
+	{ 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4};
+*/
+/*	1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 */
+
+#define RDIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b))  /* a/b rounded to nearest, halves away from zero (expects b>0) */
+#define RSHIFT(a,b) ( (a)>0 ? ((a) + (1<<((b)-1)))>>(b) : ((a) + (1<<((b)-1))-1)>>(b))  /* rounded a>>b; for a<0 relies on an arithmetic >> */
+
+#define MLT(i)  (((16-(i))<<16) + (i))  /* weight pair: (16-i) in the high 16 bits, i in the low 16 bits */
+static const uint32_t MTab[16] = {  /* indexed by a 4-bit (1/16) fraction, e.g. MTab[u&15] */
+  MLT( 0), MLT( 1), MLT( 2), MLT( 3), MLT( 4), MLT( 5), MLT( 6), MLT( 7),
+  MLT( 8), MLT( 9), MLT(10), MLT(11), MLT(12), MLT(13), MLT(14), MLT(15)
+};
+#undef MLT
+
 /* ************************************************************
  * Pts = 2 or 3
  *
@@ -38,9 +91,10 @@
  * Conversely, *dst is the macroblock top-left adress.
  */
 
+static
 void Predict_16x16_C(const NEW_GMC_DATA * const This,
-					 uint8_t *dst, const uint8_t *src,
-					 int dststride, int srcstride, int x, int y, int rounding)
+                     uint8_t *dst, const uint8_t *src,
+                     int dststride, int srcstride, int x, int y, int rounding)
 {
 	const int W = This->sW;
 	const int H	= This->sH;
@@ -70,12 +124,17 @@
 			U += dUx; V += dVx;
 
 			if (u > 0 && u <= W) { ri = MTab[u&15]; Offset = u>>4;	}
-			else if (u > W) Offset = W>>4;
-			else Offset = -1;
- 
+			else {
+				if (u > W) Offset = W>>4;
+				else Offset = 0;
+				ri = MTab[0];
+			}
+
 			if (v > 0 && v <= H) { rj = MTab[v&15]; Offset += (v>>4)*srcstride; }
-			else if (v > H) Offset += (H>>4)*srcstride;
-			else Offset -= srcstride;
+			else {
+				if (v > H) Offset += (H>>4)*srcstride;
+				rj = MTab[0];
+			}
 
 			f0	= src[Offset + 0];
 			f0 |= src[Offset + 1] << 16;
@@ -92,10 +151,11 @@
 	}
 }
 
+static
 void Predict_8x8_C(const NEW_GMC_DATA * const This,
-					 uint8_t *uDst, const uint8_t *uSrc,
-					 uint8_t *vDst, const uint8_t *vSrc,
-					 int dststride, int srcstride, int x, int y, int rounding)
+                   uint8_t *uDst, const uint8_t *uSrc,
+                   uint8_t *vDst, const uint8_t *vSrc,
+                   int dststride, int srcstride, int x, int y, int rounding)
 {
 	const int W	 = This->sW >> 1;
 	const int H	 = This->sH >> 1;
@@ -131,18 +191,17 @@
 				ri = MTab[u&15];
 				Offset = u>>4;
 			} else {
-				ri = 16;
 				if (u>W) Offset = W>>4;
-				else Offset = -1;
+				else Offset = 0;
+				ri = MTab[0];
 			}
-			
+
 			if (v > 0 && v <= H) {
 				rj = MTab[v&15];
 				Offset += (v>>4)*srcstride;
 			} else {
-				rj = 16;
 				if (v>H) Offset += (H>>4)*srcstride;
-				else Offset -= srcstride;
+				rj = MTab[0];
 			}
 
 			f0	= uSrc[Offset + 0];
@@ -160,9 +219,9 @@
 			f0 |= vSrc[Offset + 1] << 16;
 			f1	= vSrc[Offset + srcstride + 0];
 			f1 |= vSrc[Offset + srcstride + 1] << 16;
-			f0 = (ri*f0)>>16; 
+			f0 = (ri*f0)>>16;
 			f1 = (ri*f1) & 0x0fff0000;
-			f0 |= f1; 
+			f0 |= f1;
 			f0 = (rj*f0 + Rounder) >> 24;
 
 			vDst[i] = (uint8_t)f0;
@@ -172,8 +231,9 @@
 	}
 }
 
+static
 void get_average_mv_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv,
-						int x, int y, int qpel)
+                      int x, int y, int qpel)
 {
 	int i, j;
 	int vx = 0, vy = 0;
@@ -184,10 +244,10 @@
 	int32_t U, V;
 	U = uo; uo += Dsp->dU[1];
 	V = vo; vo += Dsp->dV[1];
-	for (i=16; i>0; --i)	 
+	for (i=16; i>0; --i)
 	{
 		int32_t u,v;
-		u = U >> 16; U += Dsp->dU[0]; vx += u; 
+		u = U >> 16; U += Dsp->dU[0]; vx += u;
 		v = V >> 16; V += Dsp->dV[0]; vy += v;
 	}
 	}
@@ -202,9 +262,10 @@
  * simplified version for 1 warp point
  */
 
+static
 void Predict_1pt_16x16_C(const NEW_GMC_DATA * const This,
-						 uint8_t *Dst, const uint8_t *Src, 
-						 int dststride, int srcstride, int x, int y, int rounding)
+                         uint8_t *Dst, const uint8_t *Src,
+                         int dststride, int srcstride, int x, int y, int rounding)
 {
 	const int W	 = This->sW;
 	const int H	 = This->sH;
@@ -214,17 +275,23 @@
 
 	int32_t uo = This->Uo + (x<<8);	 /* ((16*x)<<4) */
 	int32_t vo = This->Vo + (y<<8);
-	const uint32_t ri = MTab[uo & 15];
-	const uint32_t rj = MTab[vo & 15];
+	uint32_t ri = MTab[uo & 15];
+	uint32_t rj = MTab[vo & 15];
 	int i, j;
 
 	int32_t Offset;
-	if ((uint32_t)vo<=(uint32_t)H) Offset	= (vo>>4)*srcstride;
-	else if (vo>H)				 Offset	= ( H>>4)*srcstride;
-	else							 Offset	=-16*srcstride;
-	if ((uint32_t)uo<=(uint32_t)W) Offset += (uo>>4);
-	else if (uo>W)				 Offset += ( W>>4);
-	else							 Offset -= 16;
+	if (vo>=(-16<<4) && vo<=H) Offset = (vo>>4)*srcstride;
+	else {
+		if (vo>H) Offset = ( H>>4)*srcstride;
+		else Offset =-16*srcstride;
+		rj = MTab[0];
+	}
+	if (uo>=(-16<<4) && uo<=W) Offset += (uo>>4);
+	else {
+		if (uo>W) Offset += (W>>4);
+		else Offset -= 16;
+		ri = MTab[0];
+	}
 
 	Dst += 16;
 
@@ -239,18 +306,19 @@
 		f1 |= Src[ Offset+srcstride +1 ] << 16;
 		f0 = (ri*f0)>>16;
 		f1 = (ri*f1) & 0x0fff0000;
-		f0 |= f1; 
+		f0 |= f1;
 		f0 = ( rj*f0 + Rounder ) >> 24;
 		Dst[i] = (uint8_t)f0;
 	}
 	Dst += dststride;
 	}
-}	 
+}
 
+static
 void Predict_1pt_8x8_C(const NEW_GMC_DATA * const This,
-						 uint8_t *uDst, const uint8_t *uSrc,
-						 uint8_t *vDst, const uint8_t *vSrc,
-						 int dststride, int srcstride, int x, int y, int rounding)
+                       uint8_t *uDst, const uint8_t *uSrc,
+                       uint8_t *vDst, const uint8_t *vSrc,
+                       int dststride, int srcstride, int x, int y, int rounding)
 {
 	const int W	 = This->sW >> 1;
 	const int H	 = This->sH >> 1;
@@ -259,17 +327,23 @@
 
 	int32_t uo = This->Uco + (x<<7);
 	int32_t vo = This->Vco + (y<<7);
-	const uint32_t rri = MTab[uo & 15];
-	const uint32_t rrj = MTab[vo & 15];
+	uint32_t rri = MTab[uo & 15];
+	uint32_t rrj = MTab[vo & 15];
 	int i, j;
 
 	int32_t Offset;
-	if ((uint32_t)vo<=(uint32_t)H) Offset = (vo>>4)*srcstride;
-	else if (vo>H) Offset = ( H>>4)*srcstride;
-	else Offset =-8*srcstride;
-	if ((uint32_t)uo<=(uint32_t)W) Offset += (uo>>4);
-	else if (uo>W) Offset += (W>>4);
-	else Offset -= 8;
+	if (vo>=(-8<<4) && vo<=H) Offset = (vo>>4)*srcstride;
+	else {
+		if (vo>H) Offset = ( H>>4)*srcstride;
+		else Offset =-8*srcstride;
+		rrj = MTab[0];
+	}
+	if (uo>=(-8<<4) && uo<=W) Offset += (uo>>4);
+	else {
+		if (uo>W) Offset += ( W>>4);
+		else Offset -= 8;
+		rri = MTab[0];
+	}
 
 	uDst += 8;
 	vDst += 8;
@@ -278,17 +352,17 @@
 	for(i=-8; i<0; ++i, Offset++)
 	{
 		uint32_t f0, f1;
-		f0	= uSrc[ Offset + 0 ]; 
+		f0	= uSrc[ Offset + 0 ];
 		f0 |= uSrc[ Offset + 1 ] << 16;
 		f1	= uSrc[ Offset + srcstride + 0 ];
 		f1 |= uSrc[ Offset + srcstride + 1 ] << 16;
 		f0 = (rri*f0)>>16;
-		f1 = (rri*f1) & 0x0fff0000; 
-		f0 |= f1; 
+		f1 = (rri*f1) & 0x0fff0000;
+		f0 |= f1;
 		f0 = ( rrj*f0 + Rounder ) >> 24;
-		uDst[i] = (uint8_t)f0;	 
+		uDst[i] = (uint8_t)f0;
 
-		f0	= vSrc[ Offset + 0 ];	 
+		f0	= vSrc[ Offset + 0 ];
 		f0 |= vSrc[ Offset + 1 ] << 16;
 		f1	= vSrc[ Offset + srcstride + 0 ];
 		f1 |= vSrc[ Offset + srcstride + 1 ] << 16;
@@ -303,6 +377,7 @@
 	}
 }
 
+static
 void get_average_mv_1pt_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv,
 							int x, int y, int qpel)
 {
@@ -310,6 +385,230 @@
 	mv->y = RSHIFT(Dsp->Vo<<qpel, 3);
 }
 
+#if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64)
+/* *************************************************************
+ * 8-pixel core: GMC_Core_Lin_8 is bound by init_GMC() to one of the external MMX / SSE2 / SSE4.1 routines below
+ */
+
+static
+void (*GMC_Core_Lin_8)(uint8_t *Dst, const uint16_t * Offsets, 
+                       const uint8_t * const Src0, const int BpS, const int Rounder) = 0;
+
+extern void xvid_GMC_Core_Lin_8_mmx(uint8_t *Dst, const uint16_t * Offsets, 
+                                    const uint8_t * const Src0, const int BpS, const int Rounder);
+
+extern void xvid_GMC_Core_Lin_8_sse2(uint8_t *Dst, const uint16_t * Offsets, 
+                                     const uint8_t * const Src0, const int BpS, const int Rounder);
+
+extern void xvid_GMC_Core_Lin_8_sse41(uint8_t *Dst, const uint16_t * Offsets, 
+                                      const uint8_t * const Src0, const int BpS, const int Rounder);
+
+/* *************************************************************
+ * C fallback for one run of 8 pixels: every pixel is fetched and
+ * bilinearly filtered from its own Offsets[] position.
+ */
+static void GMC_Core_Non_Lin_8(uint8_t *Dst,
+                               const uint16_t * Offsets,
+                               const uint8_t * const Src0, const int srcstride,
+                               const int Rounder)
+{
+  int px = 0;
+  while (px < 8)
+  {
+    /* x position in Offsets[0..7], y position 16 entries further; low 4 bits are the fraction */
+    const uint32_t pos_x = Offsets[px   ];
+    const uint32_t pos_y = Offsets[px+16];
+    const uint32_t wx = MTab[pos_x&0x0f];
+    const uint32_t wy = MTab[pos_y&0x0f];
+    const uint8_t * const tap = Src0 + (pos_x>>4) + (pos_y>>4)*srcstride;
+    /* two horizontal neighbours packed into one word, for the upper and the lower row */
+    uint32_t upper = tap[0] | (tap[1] << 16);
+    uint32_t lower = tap[srcstride +0] | (tap[srcstride +1] << 16);
+    upper = (wx*upper)>>16;            /* horizontal blend of the upper row, low half */
+    lower = (wx*lower) & 0x0fff0000;   /* horizontal blend of the lower row, high half */
+    Dst[px] = (uint8_t)(( wy*(upper|lower) + Rounder ) >> 24);
+    ++px;
+  }
+}
+
+/* 16x16 predictor, SIMD build: fills Offsets[] for each row, then filters the row as two 8-pixel halves */

+static
+void Predict_16x16_mmx(const NEW_GMC_DATA * const This,
+                       uint8_t *dst, const uint8_t *src,
+                       int dststride, int srcstride, int x, int y, int rounding)
+{
+  const int W = This->sW;
+  const int H = This->sH;
+  const int rho = 3 - This->accuracy;
+  const int Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
+  const uint32_t W2 = W<<(16-rho);  /* W, H rescaled to the units of U, V. NOTE(review): signed shift, may overflow for very large W/H - confirm range */
+  const uint32_t H2 = H<<(16-rho);
+  
+  const int dUx = This->dU[0];  /* per-pixel step along a row */
+  const int dVx = This->dV[0];
+  const int dUy = This->dU[1];  /* per-row step */
+  const int dVy = This->dV[1];
+
+  int Uo = This->Uo + 16*(dUy*y + dUx*x);  /* coordinates of the top-left pixel of block (x,y) */
+  int Vo = This->Vo + 16*(dVy*y + dVx*x);
+
+  int i, j;
+
+  DECLARE_ALIGNED_MATRIX(Offsets, 2,16, uint16_t, CACHE_LINE);  /* [0..15] hold u, [16..31] hold v */
+  for(j=16; j>0; --j)
+  {
+    int32_t U = Uo, V = Vo;
+    Uo += dUy; Vo += dVy;
+    if ( W2>(uint32_t)U && W2>(uint32_t)(U+15*dUx) &&  /* unsigned compare rejects negative and too-large values at once; */
+         H2>(uint32_t)V && H2>(uint32_t)(V+15*dVx) )   /* first and last pixel of the row are both inside: no clipping needed */
+    {
+      uint32_t UV1, UV2;
+      for(i=0; i<16; ++i)
+      {
+        uint32_t u = ( U >> 16 ) << rho;  /* drop the 16 low bits, rescale so the low 4 bits are the fraction used by the cores */
+        uint32_t v = ( V >> 16 ) << rho;
+        U += dUx;  V += dVx;
+        Offsets[   i] = u;
+        Offsets[16+i] = v;
+      }
+      /* batch 8 input pixels when linearity says it's ok: same integer v at both ends and integer u exactly 7 apart */
+
+      UV1 = (Offsets[0] | (Offsets[16]<<16)) & 0xfff0fff0U;  /* u in the low half, v in the high half, fraction bits masked off */
+      UV2 = (Offsets[7] | (Offsets[23]<<16)) & 0xfff0fff0U;
+      if (UV1+7*16==UV2)
+        GMC_Core_Lin_8(dst,    Offsets,    src + (Offsets[0]>>4) + (Offsets[16]>>4)*srcstride, srcstride, Rounder);
+      else
+        GMC_Core_Non_Lin_8(dst,   Offsets,   src, srcstride, Rounder);
+      UV1 = (Offsets[ 8] | (Offsets[24]<<16)) & 0xfff0fff0U;  /* same test for pixels 8..15 */
+      UV2 = (Offsets[15] | (Offsets[31]<<16)) & 0xfff0fff0U;
+      if (UV1+7*16==UV2)
+        GMC_Core_Lin_8(dst+8,  Offsets+8,  src + (Offsets[8]>>4) + (Offsets[24]>>4)*srcstride, srcstride, Rounder);
+      else
+        GMC_Core_Non_Lin_8(dst+8, Offsets+8, src, srcstride, Rounder);
+    }
+    else
+    {
+      for(i=0; i<16; ++i)
+      {
+        int u = ( U >> 16 ) << rho;
+        int v = ( V >> 16 ) << rho;
+        U += dUx; V += dVx;
+
+        Offsets[   i] = (u<0) ? 0 : (u>=W) ? W : u;  /* clamp to [0, W] */
+        Offsets[16+i] = (v<0) ? 0 : (v>=H) ? H : v;  /* clamp to [0, H] */
+      }
+      /* Due to boundary clipping, we cannot infer the 8-pixel batchability
+       * simply by using the linearity. Oh well, not a big deal... */
+      GMC_Core_Non_Lin_8(dst,   Offsets,   src, srcstride, Rounder);
+      GMC_Core_Non_Lin_8(dst+8, Offsets+8, src, srcstride, Rounder);
+    }
+    dst += dststride;
+  }
+}
+/* 8x8 predictor, SIMD build: one Offsets[] row is computed and applied to both planes (uSrc->uDst, vSrc->vDst) */
+static
+void Predict_8x8_mmx(const NEW_GMC_DATA * const This,
+                     uint8_t *uDst, const uint8_t *uSrc,
+                     uint8_t *vDst, const uint8_t *vSrc,
+                     int dststride, int srcstride, int x, int y, int rounding)
+{
+  const int W   = This->sW >> 1;  /* half of the limits used by the 16x16 predictor */
+  const int H   = This->sH >> 1;
+  const int rho = 3-This->accuracy;
+  const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
+  const uint32_t W2 = W<<(16-rho);  /* W, H rescaled to the units of U, V */
+  const uint32_t H2 = H<<(16-rho);
+
+  const int dUx = This->dU[0];  /* per-pixel step along a row */
+  const int dVx = This->dV[0];
+  const int dUy = This->dU[1];  /* per-row step */
+  const int dVy = This->dV[1];
+
+  int Uo = This->Uco + 8*(dUy*y + dUx*x);  /* coordinates of the top-left pixel of block (x,y) */
+  int Vo = This->Vco + 8*(dVy*y + dVx*x);
+
+  DECLARE_ALIGNED_MATRIX(Offsets, 2,16, uint16_t, CACHE_LINE);  /* [0..7] hold u, [16..23] hold v */
+  int i, j;
+  for(j=8; j>0; --j)
+  {
+    int32_t U = Uo, V = Vo;
+    Uo += dUy; Vo += dVy;
+    if ( W2>(uint32_t)U && W2>(uint32_t)(U+15*dUx) &&  /* NOTE(review): checks 15 steps although only 8 pixels are produced per row; */
+         H2>(uint32_t)V && H2>(uint32_t)(V+15*dVx) )   /* looks conservative (more rows take the clipped path) - confirm intent */
+    {
+      uint32_t UV1, UV2;
+      for(i=0; i<8; ++i)
+      {
+        int32_t u = ( U >> 16 ) << rho;  /* drop the 16 low bits, rescale so the low 4 bits are the fraction used by the cores */
+        int32_t v = ( V >> 16 ) << rho;
+        U += dUx; V += dVx;
+        Offsets[   i] = u;
+        Offsets[16+i] = v;
+      }
+
+      /* batch 8 input pixels when linearity says it's ok: same integer v at both ends and integer u exactly 7 apart */
+			UV1 = (Offsets[ 0] | (Offsets[16]<<16)) & 0xfff0fff0U;
+			UV2 = (Offsets[ 7] | (Offsets[23]<<16)) & 0xfff0fff0U;
+			if (UV1+7*16==UV2)
+      {
+				const uint32_t Off = (Offsets[0]>>4) + (Offsets[16]>>4)*srcstride;  /* same start offset for both planes */
+				GMC_Core_Lin_8(uDst, Offsets, uSrc+Off, srcstride, Rounder);
+				GMC_Core_Lin_8(vDst, Offsets, vSrc+Off, srcstride, Rounder);
+      }
+      else {
+        GMC_Core_Non_Lin_8(uDst, Offsets, uSrc, srcstride, Rounder);
+        GMC_Core_Non_Lin_8(vDst, Offsets, vSrc, srcstride, Rounder);
+      }
+    }
+    else
+    {
+      for(i=0; i<8; ++i)
+      {
+        int u = ( U >> 16 ) << rho;
+        int v = ( V >> 16 ) << rho;
+        U += dUx; V += dVx;
+        Offsets[   i] = (u<0) ? 0 : (u>=W) ? W : u;  /* clamp to [0, W] */
+        Offsets[16+i] = (v<0) ? 0 : (v>=H) ? H : v;  /* clamp to [0, H] */
+      }
+      GMC_Core_Non_Lin_8(uDst, Offsets, uSrc, srcstride, Rounder);  /* clipped rows always take the per-pixel core */
+      GMC_Core_Non_Lin_8(vDst, Offsets, vSrc, srcstride, Rounder);
+    }
+    uDst += dststride;
+    vDst += dststride;
+  }
+}
+
+#endif /* ARCH_IS_IA32 */
+
+/* *************************************************************
+ * init_GMC(): binds the file-local predictor pointers according
+ * to the XVID_CPU_* bits in cpu_flags (portable C by default).
+ */
+void init_GMC(const unsigned int cpu_flags)
+{
+	Predict_8x8_func   = Predict_8x8_C;	/* portable C is the default */
+	Predict_16x16_func = Predict_16x16_C;
+
+#if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64)
+	{
+		/* any one of these capability bits selects the MMX-style predictors */
+		const unsigned int simd_flags =
+			XVID_CPU_MMX | XVID_CPU_MMXEXT | XVID_CPU_3DNOW | XVID_CPU_3DNOWEXT |
+			XVID_CPU_SSE | XVID_CPU_SSE2   | XVID_CPU_SSE3  | XVID_CPU_SSE41;
+		if ((cpu_flags & simd_flags) != 0) {
+			Predict_8x8_func   = Predict_8x8_mmx;
+			Predict_16x16_func = Predict_16x16_mmx;
+
+			/* 8-pixel linear core: prefer SSE4.1, then SSE2, otherwise plain MMX */
+			GMC_Core_Lin_8 = (cpu_flags & XVID_CPU_SSE41) ? xvid_GMC_Core_Lin_8_sse41 :
+			                 (cpu_flags & XVID_CPU_SSE2)  ? xvid_GMC_Core_Lin_8_sse2  :
+			                                                xvid_GMC_Core_Lin_8_mmx;
+		}
+	}
+#endif
+}
+
 /* *************************************************************
  * Warning! It's Accuracy being passed, not 'resolution'!
  */
@@ -325,17 +624,16 @@
 	gmc->num_wp = nb_pts;
 
 	/* reduce the number of points, if possible */
-	if (nb_pts<3 || (pts->duv[2].x==-pts->duv[1].y && pts->duv[2].y==pts->duv[1].x)) {
-	if (nb_pts<2 || (pts->duv[1].x==0 && pts->duv[1].y==0)) {
-		if (nb_pts<1 || (pts->duv[0].x==0 && pts->duv[0].y==0)) {
-		nb_pts = 0;
-		}
-		else nb_pts = 1;
-	}
-	else nb_pts = 2;
-	}
-	else nb_pts = 3;
-	
+	if (nb_pts<2 || (pts->duv[2].x==0 && pts->duv[2].y==0 && pts->duv[1].x==0 && pts->duv[1].y==0 )) {
+  	if (nb_pts<2 || (pts->duv[1].x==0 && pts->duv[1].y==0)) {
+	  	if (nb_pts<1 || (pts->duv[0].x==0 && pts->duv[0].y==0)) {
+		    nb_pts = 0;
+  		}
+	  	else nb_pts = 1;
+  	}
+	  else nb_pts = 2;
+  }
+
 	/* now, nb_pts stores the actual number of points required for interpolation */
 
 	if (nb_pts<=1)
@@ -364,15 +662,11 @@
 	gmc->dU[0] = 16*Ws + RDIV( 8*Ws*pts->duv[1].x, width );	 /* dU/dx */
 	gmc->dV[0] =		 RDIV( 8*Ws*pts->duv[1].y, width );	 /* dV/dx */
 
-/*	 disabled, because possibly buggy? */
-
-#if 0
 	if (nb_pts==2) {
 		gmc->dU[1] = -gmc->dV[0];	/* -Sin */
 		gmc->dV[1] =	gmc->dU[0] ;	/* Cos */
 	}
 	else
-#endif
 	{
 		const int Beta = log2bin(height-1);
 		const int Hs = 1<<Beta;
@@ -402,8 +696,8 @@
 	gmc->Uco = (gmc->Uco + gmc->dU[0] + gmc->dU[1])>>2;
 	gmc->Vco = (gmc->Vco + gmc->dV[0] + gmc->dV[1])>>2;
 
-	gmc->predict_16x16	= Predict_16x16_C;
-	gmc->predict_8x8	= Predict_8x8_C;
+	gmc->predict_16x16	= Predict_16x16_func;
+	gmc->predict_8x8	= Predict_8x8_func;
 	gmc->get_average_mv = get_average_mv_C;
 	}
 }
@@ -435,8 +729,8 @@
 			const int mbnum = mj*mb_width+mi;
 			if (pGMC)
 			{
-				gmc_data->predict_16x16(gmc_data, 
-							pGMC->y + mj*16*stride + mi*16, pRef->y, 
+				gmc_data->predict_16x16(gmc_data,
+							pGMC->y + mj*16*stride + mi*16, pRef->y,
 							stride, stride, mi, mj, rounding);
 
 				gmc_data->predict_8x8(gmc_data,
@@ -451,4 +745,5 @@
 
 			pMBs[mbnum].mcsel = 0; /* until mode decision */
 	}
+  emms();
 }