--- sad_altivec.c	2004/03/22 22:36:24	1.7
+++ sad_altivec.c	2004/12/09 23:02:54	1.11
@@ -17,104 +17,63 @@
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
 
-    $Id: sad_altivec.c,v 1.7 2004/03/22 22:36:24 edgomez Exp $
-    $Source: /home/xvid/cvs/cvs-server-root/xvid/xvidcore/src/motion/ppc_asm/sad_altivec.c,v $
-    $Date: 2004/03/22 22:36:24 $
-    $Author: edgomez $
-
+    $Id: sad_altivec.c,v 1.11 2004/12/09 23:02:54 edgomez Exp $
 */
 
-#define G_REG
-
-#ifdef G_REG
-register vector unsigned char perm0 asm("%v29");
-register vector unsigned char perm1 asm("%v30");
-register vector unsigned int zerovec asm("%v31");
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
 #endif
 
-#include <stdio.h>
-
-#undef DEBUG
 
-static const vector unsigned char perms[2] = {
-	(vector unsigned char) (	/* Used when cur is aligned */
-							   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-							   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17),
-	(vector unsigned char) (	/* Used when cur is unaligned */
-							   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-							   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f),
-};
-
-#ifdef G_REG
-void
-sadInit_altivec(void)
-{
-	perm0 = perms[0];
-	perm1 = perms[1];
-	zerovec = (vector unsigned int) (0);
-}
-static inline const vector unsigned char
-get_perm(unsigned long i)
-{
-	return i ? perm1 : perm0;
-}
+#include "../../portab.h"
 
-#define ZERODEF
-#define ZEROVEC zerovec
-#else
-void
-sadInit_altivec(void)
-{
-}
-static inline const vector unsigned char
-get_perm(unsigned long i)
-{
-	return perms[i];
-}
-
-#define ZERODEF vector unsigned int zerovec = (vector unsigned int)(0)
-#define ZEROVEC zerovec
-#endif
+/* no debugging by default */
+#undef DEBUG
 
+#include <stdio.h>
 
 #define SAD16() \
 t1  = vec_perm(ref[0], ref[1], perm);  /* align current vector  */ \
 t2  = vec_max(t1, *cur);      	 /* find largest of two           */ \
-t3  = vec_min(t1, *cur); 	         /* find smaller of two           */ \
-t4  = vec_sub(t2, t3);                   /* find absolute difference      */ \
-sad = vec_sum4s(t4, sad);                /* accumulate sum of differences */ \
+t1  = vec_min(t1, *cur); 	         /* per-byte minimum of the two   */ \
+t1  = vec_sub(t2, t1);                   /* max - min = absolute difference */ \
+sad = vec_sum4s(t1, vec_splat_u32(0));                /* four partial sums for this row only */ \
+sumdiffs = (vector unsigned int)vec_sums((vector signed int)sad, (vector signed int)sumdiffs);    /* add them to the running total kept in element 3 */ \
+if(vec_any_ge(sumdiffs, best_vec)) /* running total reached the best_vec threshold: stop early */ \
+    goto bail; \
 cur += stride; ref += stride;
 
 /*
  * This function assumes cur and stride are 16 bytes aligned and ref is unaligned
  */
-unsigned long
-sad16_altivec(const vector unsigned char *cur,
-			  const vector unsigned char *ref,
-			  unsigned long stride,
-			  const unsigned long best_sad)
+/* SAD of 16-byte rows of cur vs. ref; exits early with the partial sum once it reaches best_sad. */
+uint32_t
+sad16_altivec_c(vector unsigned char *cur,
+			  vector unsigned char *ref,
+			  uint32_t stride,
+			  const uint32_t best_sad)
 {
 	vector unsigned char perm;
-	vector unsigned char t1, t2, t3, t4;
+	vector unsigned char t1, t2;
 	vector unsigned int sad;
-	vector signed int sumdiffs, best_vec;
-	unsigned long result;
-
-	ZERODEF;
+	vector unsigned int sumdiffs;	/* running total, kept in element 3 */
+	vector unsigned int best_vec;	/* best_sad replicated in every element */
+	uint32_t result;
 
+        
 #ifdef DEBUG
+        /* print alignment errors if DEBUG is on; casts match the %lx / %lu conversions */
 	if (((unsigned long) cur) & 0xf)
-		fprintf(stderr, "sad16_altivec:incorrect align, cur: %x\n", cur);
-//  if (((unsigned long)ref) & 0xf)
-//      fprintf(stderr, "sad16_altivec:incorrect align, ref: %x\n", ref);
+		fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (unsigned long)cur);
 	if (stride & 0xf)
-		fprintf(stderr, "sad16_altivec:incorrect align, stride: %x\n", stride);
+		fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", (unsigned long)stride);
 #endif
 	/* initialization */
-	sad = (vector unsigned int) (ZEROVEC);
+	sad = vec_splat_u32(0);
+	sumdiffs = sad;	/* running total starts at zero */
 	stride >>= 4;
 	perm = vec_lvsl(0, (unsigned char *) ref);
-	*((unsigned long *) &best_vec) = best_sad;
+	*((uint32_t*)&best_vec) = best_sad;	/* store into element 0, splatted on the next line */
 	best_vec = vec_splat(best_vec, 0);
 
 	/* perform sum of differences between current and previous */
@@ -122,181 +81,280 @@
 	SAD16();
 	SAD16();
 	SAD16();
-	/* Temp sum for exit */
-	sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC);
-	if (vec_all_ge(sumdiffs, best_vec))
-		goto bail;
+
 	SAD16();
 	SAD16();
 	SAD16();
 	SAD16();
-	sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC);
-	if (vec_all_ge(sumdiffs, best_vec))
-		goto bail;
+
 	SAD16();
 	SAD16();
 	SAD16();
 	SAD16();
+        
 	SAD16();
 	SAD16();
 	SAD16();
 	SAD16();
 
-	/* sum all parts of difference into one 32 bit quantity */
-	sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC);
   bail:
 	/* copy vector sum into unaligned result */
 	sumdiffs = vec_splat(sumdiffs, 3);
-	vec_ste(sumdiffs, 0, (int *) &result);
-	return (result);
+	vec_ste(sumdiffs, 0, (uint32_t*) &result);
+	return result;
 }
 
+
 #define SAD8() \
-t1  = vec_perm(cur[0], cur[stride], perm_cur);  /* align current vector  */ \
-t2  = vec_perm(ref[0], ref[1], perm_ref1);  /* align current vector  */ \
-tp  = vec_perm(ref[stride], ref[stride+1], perm_ref1);  /* align current vector  */ \
-t2  = vec_perm(t2,tp,perm_ref2); \
-t3  = vec_max(t1, t2);  	        /* find largest of two           */ \
-t4  = vec_min(t1, t2);	        	 /* find smaller of two           */ \
-t5  = vec_sub(t3, t4);                   /* find absolute difference      */ \
-sad = vec_sum4s(t5, sad);                /* accumulate sum of differences */ \
-cur += stride<<1; ref += stride<<1;
+	c = vec_perm(vec_ld(0,cur),vec_ld(16,cur),vec_lvsl(0,cur)); /* unaligned load of 16 bytes starting at cur */ \
+	r = vec_perm(vec_ld(0,ref),vec_ld(16,ref),vec_lvsl(0,ref)); /* unaligned load of 16 bytes starting at ref */ \
+	c = vec_sub(vec_max(c,r),vec_min(c,r)); /* max - min = per-byte absolute difference */ \
+	sad = vec_sum4s(c,sad); /* accumulate into four 32-bit partial sums */ \
+	cur += stride; /* both pointers advance by stride */ \
+	ref += stride
 
 /*
- * This function assumes cur is 8 bytes aligned, stride is 16 bytes
- * aligned and ref is unaligned
+ * This function makes no alignment assumptions about cur, ref or stride.
  */
-unsigned long
-sad8_altivec(const vector unsigned char *cur,
-			 const vector unsigned char *ref,
-			 unsigned long stride)
+/* Sum of absolute differences over 8 rows of 8 bytes; consecutive rows are stride bytes apart. */
+uint32_t
+sad8_altivec_c(const uint8_t * cur,
+	   const uint8_t *ref,
+	   const uint32_t stride)
 {
-	vector unsigned char t1, t2, t3, t4, t5, tp;
-	vector unsigned int sad;
-	vector signed int sumdiffs;
-	vector unsigned char perm_cur;
-	vector unsigned char perm_ref1, perm_ref2;
-	unsigned long result;
-
-	ZERODEF;
-
-#ifdef DEBUG
-	if (((unsigned long) cur) & 0x7)
-		fprintf(stderr, "sad8_altivec:incorrect align, cur: %x\n", cur);
-//  if (((unsigned long)ref) & 0x7)
-//      fprintf(stderr, "sad8_altivec:incorrect align, ref: %x\n", ref);
-	if (stride & 0xf)
-		fprintf(stderr, "sad8_altivec:incorrect align, stride: %x\n", stride);
-#endif
-
-	perm_cur = get_perm((((unsigned long) cur) >> 3) & 0x01);
-	perm_ref1 = vec_lvsl(0, (unsigned char *) ref);
-	perm_ref2 = get_perm(0);
-
-	/* initialization */
-	sad = (vector unsigned int) (ZEROVEC);
-	stride >>= 4;
-
-	/* perform sum of differences between current and previous */
+	uint32_t result = 0;
+	
+	register vector unsigned int sad;
+	register vector unsigned char c;
+	register vector unsigned char r;
+	
+	/* clear the four 32-bit partial sums */
+	sad = vec_splat_u32(0);
+	
+	/* one SAD8() per row, eight rows in total */
+	SAD8();
+	SAD8();
 	SAD8();
 	SAD8();
+	
 	SAD8();
 	SAD8();
+	SAD8();
+	SAD8();
+	
+	/* SAD8 summed all 16 loaded bytes: keep only the first two words (the 8 block bytes), then add them */
+	sad = vec_and(sad, (vector unsigned int)vec_pack(vec_splat_u16(-1),vec_splat_u16(0)));
+	sad = (vector unsigned int)vec_sums((vector signed int)sad, vec_splat_s32(0));
+	sad = vec_splat(sad,3); /* vec_sums leaves the total in element 3 */
+	vec_ste(sad, 0, &result);
+		
+	return result;
+}
 
-	/* sum all parts of difference into one 32 bit quantity */
-	sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC);
 
-	/* copy vector sum into unaligned result */
-	sumdiffs = vec_splat(sumdiffs, 3);
-	vec_ste(sumdiffs, 0, (int *) &result);
-	return (result);
-}
 
-#define MEAN16(i)\
-c##i=*cur;\
-mean = vec_sum4s(c##i,mean);\
-cur += stride;
-
-#define DEV16(i) \
-t2  = vec_max(c##i, mn);  	        /* find largest of two           */ \
-t3  = vec_min(c##i, mn);	        	 /* find smaller of two           */ \
-t4  = vec_sub(t2, t3);                   /* find absolute difference      */ \
-dev = vec_sum4s(t4, dev);
-
-unsigned long
-dev16_altivec(const vector unsigned char *cur,
-			  unsigned long stride)
+
+#define MEAN16() \
+mean = vec_sum4s(*ptr,mean); /* add this row's 16 bytes into four partial sums */ \
+ptr += stride
+
+#define DEV16() \
+t2  = vec_max(*ptr, mn);                    /* per-byte maximum of row and mean */ \
+t3  = vec_min(*ptr, mn);                    /* per-byte minimum of row and mean */ \
+t2  = vec_sub(t2, t3);                      /* max - min = absolute deviation   */ \
+dev = vec_sum4s(t2, dev); /* accumulate into four 32-bit partial sums */ \
+ptr += stride
+
+/*
+ * Returns the sum of absolute deviations of a 16x16 block from its mean.
+ * Assumes cur is 16-byte aligned and stride is a multiple of 16 bytes.
+ */
+
+uint32_t
+dev16_altivec_c(vector unsigned char *cur,
+			  uint32_t stride)
 {
-	vector unsigned char t2, t3, t4, mn;
+	vector unsigned char t2, t3, mn;
 	vector unsigned int mean, dev;
-	vector signed int sumdiffs;
-	vector unsigned char c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12,
-		c13, c14, c15;
-	unsigned long result;
+	vector unsigned int sumdiffs;
+	vector unsigned char *ptr;
+	uint32_t result;
 
-	ZERODEF;
+#ifdef DEBUG
+        /* print alignment errors if DEBUG is on (cur must be 16-byte aligned, hence 0xf) */
+        if(((unsigned long)cur) & 0xf)
+            fprintf(stderr, "dev16_altivec:incorrect align, cur: %lx\n", (unsigned long)cur);
+        if(stride & 0xf)
+            fprintf(stderr, "dev16_altivec:incorrect align, stride: %lu\n", (unsigned long)stride);
+#endif
 
-	mean = (vector unsigned int) (ZEROVEC);
-	dev = (vector unsigned int) (ZEROVEC);
+	dev = mean = vec_splat_u32(0);
 	stride >>= 4;
-
-	MEAN16(0);
-	MEAN16(1);
-	MEAN16(2);
-	MEAN16(3);
-	MEAN16(4);
-	MEAN16(5);
-	MEAN16(6);
-	MEAN16(7);
-	MEAN16(8);
-	MEAN16(9);
-	MEAN16(10);
-	MEAN16(11);
-	MEAN16(12);
-	MEAN16(13);
-	MEAN16(14);
-	MEAN16(15);
-
-	sumdiffs = vec_sums((vector signed int) mean, (vector signed int) ZEROVEC);
-	mn = vec_perm((vector unsigned char) sumdiffs,
-				  (vector unsigned char) sumdiffs, (vector unsigned char) (14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14,
-																		   14));
-	DEV16(0);
-	DEV16(1);
-	DEV16(2);
-	DEV16(3);
-	DEV16(4);
-	DEV16(5);
-	DEV16(6);
-	DEV16(7);
-	DEV16(8);
-	DEV16(9);
-	DEV16(10);
-	DEV16(11);
-	DEV16(12);
-	DEV16(13);
-	DEV16(14);
-	DEV16(15);
+        
+	/* first pass: set pointer to iterate through cur and sum all bytes */
+	ptr = cur;
+        
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+	MEAN16();
+        
+        /* Add all together in sumdiffs */
+	sumdiffs = (vector unsigned int)vec_sums((vector signed int) mean, vec_splat_s32(0));
+        /* divide by 16 * 16: byte 14 holds bits 8..15 of the total, i.e. total / 256 */
+        mn = vec_perm((vector unsigned char)sumdiffs, (vector unsigned char)sumdiffs, vec_splat_u8(14));
+
+        /* second pass: rewind the pointer to the start of cur */
+        ptr = cur;
+        
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
+	DEV16();
 
 	/* sum all parts of difference into one 32 bit quantity */
-	sumdiffs = vec_sums((vector signed int) dev, (vector signed int) ZEROVEC);
+	sumdiffs = (vector unsigned int)vec_sums((vector signed int) dev, vec_splat_s32(0));
 
 	/* copy vector sum into unaligned result */
 	sumdiffs = vec_splat(sumdiffs, 3);
-	vec_ste(sumdiffs, 0, (int *) &result);
-	return (result);
+	vec_ste(sumdiffs, 0, (uint32_t*) &result);
+	return result;
+}
+
+#define SAD16BI() \
+    t1 = vec_perm(ref1[0], ref1[1], mask1); /* realign 16 bytes of ref1 */ \
+    t2 = vec_perm(ref2[0], ref2[1], mask2); /* realign 16 bytes of ref2 */ \
+    t1 = vec_avg(t1, t2); /* per-byte rounded average of the two references */ \
+    t2 = vec_max(t1, *cur); \
+    t1 = vec_min(t1, *cur); \
+    sad = vec_sub(t2, t1); /* max - min = per-byte absolute difference */ \
+    sum = vec_sum4s(sad, sum); /* accumulate into four 32-bit partial sums */ \
+    cur += stride; \
+    ref1 += stride; \
+    ref2 += stride
+
+/*
+ * SAD of cur against the rounded average of ref1 and ref2, over 16 rows of 16 bytes.
+ * Assumes cur is 16-byte aligned and stride is a multiple of 16; ref1/ref2 may be unaligned.
+ */
+
+uint32_t
+sad16bi_altivec_c(vector unsigned char *cur,
+                        vector unsigned char *ref1,
+                        vector unsigned char *ref2,
+                        uint32_t stride)
+{
+    vector unsigned char t1, t2;
+    vector unsigned char mask1, mask2;
+    vector unsigned char sad;
+    vector unsigned int sum;
+    uint32_t result;
+    
+#ifdef DEBUG
+    /* print alignment errors if this is on; casts match the %lx / %lu conversions */
+    if((long)cur & 0xf)
+        fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lx\n", (unsigned long)cur);
+    if(stride & 0xf)
+        fprintf(stderr, "sad16bi_altivec:incorrect align, stride: %lu\n", (unsigned long)stride);
+#endif
+    
+    /* Initialisation: stride becomes a count of 16-byte vectors, masks realign ref1/ref2 */
+    stride >>= 4;
+    mask1 = vec_lvsl(0, (unsigned char*)ref1);
+    mask2 = vec_lvsl(0, (unsigned char*)ref2);
+    sad = vec_splat_u8(0);
+    sum = (vector unsigned int)sad;
+    
+    SAD16BI();
+    SAD16BI();
+    SAD16BI();
+    SAD16BI();
+    
+    SAD16BI();
+    SAD16BI();
+    SAD16BI();
+    SAD16BI();
+    
+    SAD16BI();
+    SAD16BI();
+    SAD16BI();
+    SAD16BI();
+    
+    SAD16BI();
+    SAD16BI();
+    SAD16BI();
+    SAD16BI();
+    
+    sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0)); /* total lands in element 3 */
+    sum = vec_splat(sum, 3);
+    vec_ste(sum, 0, (uint32_t*)&result);
+    
+    return result;
+}
+
+
+#define SSE8_16BIT() \
+b1_vec = vec_perm(vec_ld(0,b1), vec_ld(16,b1), vec_lvsl(0,b1)); /* unaligned load of 8 shorts from b1 */ \
+b2_vec = vec_perm(vec_ld(0,b2), vec_ld(16,b2), vec_lvsl(0,b2)); /* unaligned load of 8 shorts from b2 */ \
+diff = vec_sub(b1_vec,b2_vec);  \
+sum = vec_msum(diff,diff,sum);  /* add the squared differences to the partial sums */ \
+b1 = (const int16_t*)((int8_t*)b1+stride);  /* stride is applied in bytes */ \
+b2 = (const int16_t*)((int8_t*)b2+stride)
+/* Sum of squared differences between b1 and b2: 8 rows of 8 int16_t values, rows stride bytes apart. */
+uint32_t
+sse8_16bit_altivec_c(const int16_t * b1,
+			 const int16_t * b2,
+			 const uint32_t stride)
+{
+    register vector signed short b1_vec;
+    register vector signed short b2_vec;
+    register vector signed short diff;
+    register vector signed int sum;
+    uint32_t result;
+    
+    /* start from a zero accumulator */
+    sum = vec_splat_s32(0);
+    
+    SSE8_16BIT();
+    SSE8_16BIT();
+    SSE8_16BIT();
+    SSE8_16BIT();
+    
+    SSE8_16BIT();
+    SSE8_16BIT();
+    SSE8_16BIT();
+    SSE8_16BIT();
+        
+    /* add the four partial sums; the total ends up in element 3 */
+    sum = vec_sums(sum, vec_splat_s32(0));
+    sum = vec_splat(sum,3);
+    
+    vec_ste(sum,0,(int*)&result);
+    
+    /* and return */
+    return result;
 }