--- interpolate8x8.c	2003/10/01 23:23:01	1.10.2.5
+++ interpolate8x8.c	2003/03/04 16:33:41	1.11
@@ -1,33 +1,40 @@
-/*****************************************************************************
+/**************************************************************************
  *
  *	XVID MPEG-4 VIDEO CODEC
- *	- 8x8 block-based halfpel interpolation -
+ *	8x8 block-based halfpel interpolation
  *
- *  Copyright(C) 2001-2003 Peter Ross <pross@xvid.org>
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License as published by
+ *	the Free Software Foundation; either version 2 of the License, or
+ *	(at your option) any later version.
  *
- *  This program is free software ; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation ; either version 2 of the License, or
- *  (at your option) any later version.
+ *	This program is distributed in the hope that it will be useful,
+ *	but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *	GNU General Public License for more details.
  *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
+ *	You should have received a copy of the GNU General Public License
+ *	along with this program; if not, write to the Free Software
+ *	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
- *  You should have received a copy of the GNU General Public License
- *  along with this program ; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *************************************************************************/
+
+/**************************************************************************
+ *
+ *	History:
  *
- * $Id: interpolate8x8.c,v 1.10.2.5 2003/10/01 23:23:01 edgomez Exp $
+ *  05.10.2002	new bilinear and qpel interpolation code - Isibaar
+ *	27.12.2001	modified "compensate_halfpel"
+ *	05.11.2001	initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
  *
- ****************************************************************************/
+ *************************************************************************/
+
 
 #include "../portab.h"
 #include "../global.h"
 #include "interpolate8x8.h"
 
-/* function pointers */
+// function pointers
 INTERPOLATE8X8_PTR interpolate8x8_halfpel_h;
 INTERPOLATE8X8_PTR interpolate8x8_halfpel_v;
 INTERPOLATE8X8_PTR interpolate8x8_halfpel_hv;
@@ -84,7 +91,7 @@
         dst[5] = (src1[5] + src2[5] + src3[5] + src4[5] + round) >> 2;
         dst[6] = (src1[6] + src2[6] + src3[6] + src4[6] + round) >> 2;
         dst[7] = (src1[7] + src2[7] + src3[7] + src4[7] + round) >> 2;
-
+        
 		dst += stride;
         src1 += stride;
         src2 += stride;
@@ -93,7 +100,7 @@
     }
 }
 
-/* dst = interpolate(src) */
+// dst = interpolate(src)
 
 void
 interpolate8x8_halfpel_h_c(uint8_t * const dst,
@@ -101,10 +108,10 @@
 						   const uint32_t stride,
 						   const uint32_t rounding)
 {
-	uintptr_t j;
-
+	intptr_t j;
+	
 	if (rounding)
-		for (j = 0; j < 8*stride; j+=stride)
+		for (j = 7*stride; j >= 0; j-=stride)
 		{
 				dst[j + 0] = (uint8_t)((src[j + 0] + src[j + 1] )>>1);
 				dst[j + 1] = (uint8_t)((src[j + 1] + src[j + 2] )>>1);
@@ -137,8 +144,8 @@
 						   const uint32_t stride,
 						   const uint32_t rounding)
 {
-	uintptr_t j;
-
+	intptr_t j;
+//	const uint8_t * const src2 = src+stride;		/* using a second pointer is _not_ faster here */
 
 	if (rounding)
 		for (j = 0; j < 8*stride; j+=stride)		/* forward is better. Some automatic prefetch perhaps. */
@@ -173,10 +180,10 @@
 							const uint32_t stride,
 							const uint32_t rounding)
 {
-	uintptr_t j;
+	intptr_t j;
 
 	if (rounding)
-		for (j = 0; j < 8*stride; j+=stride)
+		for (j = 7*stride; j >= 0; j-=stride)
 		{
 				dst[j + 0] = (uint8_t)((src[j+0] + src[j+1] + src[j+stride+0] + src[j+stride+1] +1)>>2);
 				dst[j + 1] = (uint8_t)((src[j+1] + src[j+2] + src[j+stride+1] + src[j+stride+2] +1)>>2);
@@ -188,7 +195,7 @@
 				dst[j + 7] = (uint8_t)((src[j+7] + src[j+8] + src[j+stride+7] + src[j+stride+8] +1)>>2);
 		}
 	else
-		for (j = 0; j < 8*stride; j+=stride)
+		for (j = 7*stride; j >= 0; j-=stride)
 		{
 				dst[j + 0] = (uint8_t)((src[j+0] + src[j+1] + src[j+stride+0] + src[j+stride+1] +2)>>2);
 				dst[j + 1] = (uint8_t)((src[j+1] + src[j+2] + src[j+stride+1] + src[j+stride+2] +2)>>2);
@@ -201,6 +208,9 @@
 		}
 }
 
+
+
+
 /*************************************************************
  * QPEL STUFF STARTS HERE                                    *
  *************************************************************/
@@ -341,7 +351,7 @@
         int32_t src15 = src[15 * stride];
         int32_t src16 = src[16 * stride];
 
-
+        
         dst[0] = CLIP(((7 * ((src0<<1) - src2) +  23 * src1 + 3 * src3 - src4 + round_add) >> 5), 0, 255);
         dst[stride] = CLIP(((19 * src1 + 20 * src2 - src5 + 3 * (src4 - src0 - (src3<<1)) + round_add) >> 5), 0, 255);
         dst[2*stride] = CLIP(((20 * (src2 + src3) + (src0<<1) + 3 * (src5 - ((src1 + src4)<<1)) - src6 + round_add) >> 5), 0, 255);
@@ -382,7 +392,7 @@
         int32_t src6 = src[6 * stride];
         int32_t src7 = src[7 * stride];
         int32_t src8 = src[8 * stride];
-
+        
         dst[0]			= CLIP(((7 * ((src0<<1) - src2) + 23 * src1 + 3 * src3 - src4 + round_add) >> 5), 0, 255);
         dst[stride]		= CLIP(((19 * src1 + 20 * src2 - src5 + 3 * (src4 - src0 - (src3 << 1)) + round_add) >> 5), 0, 255);
         dst[2 * stride] = CLIP(((20 * (src2 + src3) + (src0<<1) + 3 * (src5 - ((src1 + src4) <<1 )) - src6 + round_add) >> 5), 0, 255);