--- sad_altivec.c 2004/03/22 22:36:24 1.7 +++ sad_altivec.c 2004/12/09 23:02:54 1.11 @@ -17,104 +17,63 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - $Id: sad_altivec.c,v 1.7 2004/03/22 22:36:24 edgomez Exp $ - $Source: /home/xvid/cvs/cvs-server-root/xvid/xvidcore/src/motion/ppc_asm/sad_altivec.c,v $ - $Date: 2004/03/22 22:36:24 $ - $Author: edgomez $ - + $Id: sad_altivec.c,v 1.11 2004/12/09 23:02:54 edgomez Exp $ */ -#define G_REG - -#ifdef G_REG -register vector unsigned char perm0 asm("%v29"); -register vector unsigned char perm1 asm("%v30"); -register vector unsigned int zerovec asm("%v31"); +#ifdef HAVE_ALTIVEC_H +#include #endif -#include - -#undef DEBUG -static const vector unsigned char perms[2] = { - (vector unsigned char) ( /* Used when cur is aligned */ - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17), - (vector unsigned char) ( /* Used when cur is unaligned */ - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f), -}; - -#ifdef G_REG -void -sadInit_altivec(void) -{ - perm0 = perms[0]; - perm1 = perms[1]; - zerovec = (vector unsigned int) (0); -} -static inline const vector unsigned char -get_perm(unsigned long i) -{ - return i ? perm1 : perm0; -} +#include "../../portab.h" -#define ZERODEF -#define ZEROVEC zerovec -#else -void -sadInit_altivec(void) -{ -} -static inline const vector unsigned char -get_perm(unsigned long i) -{ - return perms[i]; -} - -#define ZERODEF vector unsigned int zerovec = (vector unsigned int)(0) -#define ZEROVEC zerovec -#endif +/* no debugging by default */ +#undef DEBUG +#include #define SAD16() \ t1 = vec_perm(ref[0], ref[1], perm); /* align current vector */ \ t2 = vec_max(t1, *cur); /* find largest of two */ \ -t3 = vec_min(t1, *cur); /* find smaller of two */ \ -t4 = vec_sub(t2, t3); /* find absolute difference */ \ -sad = vec_sum4s(t4, sad); /* accumulate sum of differences */ \ +t1 = vec_min(t1, *cur); /* find smaller of two */ \ +t1 = vec_sub(t2, t1); /* find absolute difference */ \ +sad = vec_sum4s(t1, vec_splat_u32(0)); /* sum of differences */ \ +sumdiffs = (vector unsigned int)vec_sums((vector signed int)sad, (vector signed int)sumdiffs); /* accumulate sumdiffs */ \ +if(vec_any_ge(sumdiffs, best_vec)) \ + goto bail; \ cur += stride; ref += stride; /* * This function assumes cur and stride are 16 bytes aligned and ref is unaligned */ -unsigned long -sad16_altivec(const vector unsigned char *cur, - const vector unsigned char *ref, - unsigned long stride, - const unsigned long best_sad) + +uint32_t +sad16_altivec_c(vector unsigned char *cur, + vector unsigned char *ref, + uint32_t stride, + const uint32_t best_sad) { vector unsigned char perm; - vector unsigned char t1, t2, t3, t4; + vector unsigned char t1, t2; vector unsigned int sad; - vector signed int sumdiffs, best_vec; - unsigned long result; - - ZERODEF; + vector unsigned int sumdiffs; + vector unsigned int best_vec; + uint32_t result; + #ifdef DEBUG + /* print alignment errors if DEBUG is on */ if (((unsigned long) cur) & 0xf) - fprintf(stderr, "sad16_altivec:incorrect align, cur: %x\n", cur); -// if (((unsigned long)ref) & 0xf) -// fprintf(stderr, "sad16_altivec:incorrect align, ref: %x\n", ref); + fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (long)cur); if (stride & 0xf) - fprintf(stderr, "sad16_altivec:incorrect align, stride: %x\n", stride); + fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", stride); #endif /* initialization */ - sad = (vector unsigned int) (ZEROVEC); + sad = vec_splat_u32(0); + sumdiffs = sad; stride >>= 4; perm = vec_lvsl(0, (unsigned char *) ref); - *((unsigned long *) &best_vec) = best_sad; + *((uint32_t*)&best_vec) = best_sad; best_vec = vec_splat(best_vec, 0); /* perform sum of differences between current and previous */ @@ -122,181 +81,280 @@ SAD16(); SAD16(); SAD16(); - /* Temp sum for exit */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); - if (vec_all_ge(sumdiffs, best_vec)) - goto bail; + SAD16(); SAD16(); SAD16(); SAD16(); - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); - if (vec_all_ge(sumdiffs, best_vec)) - goto bail; + SAD16(); SAD16(); SAD16(); SAD16(); + SAD16(); SAD16(); SAD16(); SAD16(); - /* sum all parts of difference into one 32 bit quantity */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); bail: /* copy vector sum into unaligned result */ sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, (int *) &result); - return (result); + vec_ste(sumdiffs, 0, (uint32_t*) &result); + return result; } + #define SAD8() \ -t1 = vec_perm(cur[0], cur[stride], perm_cur); /* align current vector */ \ -t2 = vec_perm(ref[0], ref[1], perm_ref1); /* align current vector */ \ -tp = vec_perm(ref[stride], ref[stride+1], perm_ref1); /* align current vector */ \ -t2 = vec_perm(t2,tp,perm_ref2); \ -t3 = vec_max(t1, t2); /* find largest of two */ \ -t4 = vec_min(t1, t2); /* find smaller of two */ \ -t5 = vec_sub(t3, t4); /* find absolute difference */ \ -sad = vec_sum4s(t5, sad); /* accumulate sum of differences */ \ -cur += stride<<1; ref += stride<<1; + c = vec_perm(vec_ld(0,cur),vec_ld(16,cur),vec_lvsl(0,cur));\ + r = vec_perm(vec_ld(0,ref),vec_ld(16,ref),vec_lvsl(0,ref));\ + c = vec_sub(vec_max(c,r),vec_min(c,r));\ + sad = vec_sum4s(c,sad);\ + cur += stride;\ + ref += stride /* - * This function assumes cur is 8 bytes aligned, stride is 16 bytes - * aligned and ref is unaligned + * This function assumes nothing */ -unsigned long -sad8_altivec(const vector unsigned char *cur, - const vector unsigned char *ref, - unsigned long stride) + +uint32_t +sad8_altivec_c(const uint8_t * cur, + const uint8_t *ref, + const uint32_t stride) { - vector unsigned char t1, t2, t3, t4, t5, tp; - vector unsigned int sad; - vector signed int sumdiffs; - vector unsigned char perm_cur; - vector unsigned char perm_ref1, perm_ref2; - unsigned long result; - - ZERODEF; - -#ifdef DEBUG - if (((unsigned long) cur) & 0x7) - fprintf(stderr, "sad8_altivec:incorrect align, cur: %x\n", cur); -// if (((unsigned long)ref) & 0x7) -// fprintf(stderr, "sad8_altivec:incorrect align, ref: %x\n", ref); - if (stride & 0xf) - fprintf(stderr, "sad8_altivec:incorrect align, stride: %x\n", stride); -#endif - - perm_cur = get_perm((((unsigned long) cur) >> 3) & 0x01); - perm_ref1 = vec_lvsl(0, (unsigned char *) ref); - perm_ref2 = get_perm(0); - - /* initialization */ - sad = (vector unsigned int) (ZEROVEC); - stride >>= 4; - - /* perform sum of differences between current and previous */ + uint32_t result = 0; + + register vector unsigned int sad; + register vector unsigned char c; + register vector unsigned char r; + + /* initialize */ + sad = vec_splat_u32(0); + + /* Perform sad operations */ + SAD8(); + SAD8(); SAD8(); SAD8(); + SAD8(); SAD8(); + SAD8(); + SAD8(); + + /* finish addition, add the first 2 together */ + sad = vec_and(sad, (vector unsigned int)vec_pack(vec_splat_u16(-1),vec_splat_u16(0))); + sad = (vector unsigned int)vec_sums((vector signed int)sad, vec_splat_s32(0)); + sad = vec_splat(sad,3); + vec_ste(sad, 0, &result); + + return result; +} - /* sum all parts of difference into one 32 bit quantity */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); - /* copy vector sum into unaligned result */ - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, (int *) &result); - return (result); -} -#define MEAN16(i)\ -c##i=*cur;\ -mean = vec_sum4s(c##i,mean);\ -cur += stride; - -#define DEV16(i) \ -t2 = vec_max(c##i, mn); /* find largest of two */ \ -t3 = vec_min(c##i, mn); /* find smaller of two */ \ -t4 = vec_sub(t2, t3); /* find absolute difference */ \ -dev = vec_sum4s(t4, dev); - -unsigned long -dev16_altivec(const vector unsigned char *cur, - unsigned long stride) + +#define MEAN16() \ +mean = vec_sum4s(*ptr,mean);\ +ptr += stride + +#define DEV16() \ +t2 = vec_max(*ptr, mn); /* find largest of two */ \ +t3 = vec_min(*ptr, mn); /* find smaller of two */ \ +t2 = vec_sub(t2, t3); /* find absolute difference */ \ +dev = vec_sum4s(t2, dev); \ +ptr += stride + +/* + * This function assumes cur is 16 bytes aligned and stride is 16 bytes + * aligned +*/ + +uint32_t +dev16_altivec_c(vector unsigned char *cur, + uint32_t stride) { - vector unsigned char t2, t3, t4, mn; + vector unsigned char t2, t3, mn; vector unsigned int mean, dev; - vector signed int sumdiffs; - vector unsigned char c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, - c13, c14, c15; - unsigned long result; + vector unsigned int sumdiffs; + vector unsigned char *ptr; + uint32_t result; - ZERODEF; +#ifdef DEBUG + /* print alignment errors if DEBUG is on */ + if(((unsigned long)cur) & 0x7) + fprintf(stderr, "dev16_altivec:incorrect align, cur: %lx\n", (long)cur); + if(stride & 0xf) + fprintf(stderr, "dev16_altivec:incorrect align, stride: %lu\n", stride); +#endif - mean = (vector unsigned int) (ZEROVEC); - dev = (vector unsigned int) (ZEROVEC); + dev = mean = vec_splat_u32(0); stride >>= 4; - - MEAN16(0); - MEAN16(1); - MEAN16(2); - MEAN16(3); - MEAN16(4); - MEAN16(5); - MEAN16(6); - MEAN16(7); - MEAN16(8); - MEAN16(9); - MEAN16(10); - MEAN16(11); - MEAN16(12); - MEAN16(13); - MEAN16(14); - MEAN16(15); - - sumdiffs = vec_sums((vector signed int) mean, (vector signed int) ZEROVEC); - mn = vec_perm((vector unsigned char) sumdiffs, - (vector unsigned char) sumdiffs, (vector unsigned char) (14, - 14, - 14, - 14, - 14, - 14, - 14, - 14, - 14, - 14, - 14, - 14, - 14, - 14, - 14, - 14)); - DEV16(0); - DEV16(1); - DEV16(2); - DEV16(3); - DEV16(4); - DEV16(5); - DEV16(6); - DEV16(7); - DEV16(8); - DEV16(9); - DEV16(10); - DEV16(11); - DEV16(12); - DEV16(13); - DEV16(14); - DEV16(15); + + /* set pointer to iterate through cur */ + ptr = cur; + + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + MEAN16(); + + /* Add all together in sumdiffs */ + sumdiffs = (vector unsigned int)vec_sums((vector signed int) mean, vec_splat_s32(0)); + /* teilen durch 16 * 16 */ + mn = vec_perm((vector unsigned char)sumdiffs, (vector unsigned char)sumdiffs, vec_splat_u8(14)); + + /* set pointer to iterate through cur */ + ptr = cur; + + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); + DEV16(); /* sum all parts of difference into one 32 bit quantity */ - sumdiffs = vec_sums((vector signed int) dev, (vector signed int) ZEROVEC); + sumdiffs = (vector unsigned int)vec_sums((vector signed int) dev, vec_splat_s32(0)); /* copy vector sum into unaligned result */ sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, (int *) &result); - return (result); + vec_ste(sumdiffs, 0, (uint32_t*) &result); + return result; +} + +#define SAD16BI() \ + t1 = vec_perm(ref1[0], ref1[1], mask1); \ + t2 = vec_perm(ref2[0], ref2[1], mask2); \ + t1 = vec_avg(t1, t2); \ + t2 = vec_max(t1, *cur); \ + t1 = vec_min(t1, *cur); \ + sad = vec_sub(t2, t1); \ + sum = vec_sum4s(sad, sum); \ + cur += stride; \ + ref1 += stride; \ + ref2 += stride + +/* + * This function assumes cur is 16 bytes aligned, stride is 16 bytes + * aligned and ref1 and ref2 is unaligned +*/ + +uint32_t +sad16bi_altivec_c(vector unsigned char *cur, + vector unsigned char *ref1, + vector unsigned char *ref2, + uint32_t stride) +{ + vector unsigned char t1, t2; + vector unsigned char mask1, mask2; + vector unsigned char sad; + vector unsigned int sum; + uint32_t result; + +#ifdef DEBUG + /* print alignment errors if this is on */ + if((long)cur & 0xf) + fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lx\n", (long)cur); + if(stride & 0xf) + fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lu\n", stride); +#endif + + /* Initialisation stuff */ + stride >>= 4; + mask1 = vec_lvsl(0, (unsigned char*)ref1); + mask2 = vec_lvsl(0, (unsigned char*)ref2); + sad = vec_splat_u8(0); + sum = (vector unsigned int)sad; + + SAD16BI(); + SAD16BI(); + SAD16BI(); + SAD16BI(); + + SAD16BI(); + SAD16BI(); + SAD16BI(); + SAD16BI(); + + SAD16BI(); + SAD16BI(); + SAD16BI(); + SAD16BI(); + + SAD16BI(); + SAD16BI(); + SAD16BI(); + SAD16BI(); + + sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0)); + sum = vec_splat(sum, 3); + vec_ste(sum, 0, (uint32_t*)&result); + + return result; +} + + +#define SSE8_16BIT() \ +b1_vec = vec_perm(vec_ld(0,b1), vec_ld(16,b1), vec_lvsl(0,b1)); \ +b2_vec = vec_perm(vec_ld(0,b2), vec_ld(16,b2), vec_lvsl(0,b2)); \ +diff = vec_sub(b1_vec,b2_vec); \ +sum = vec_msum(diff,diff,sum); \ +b1 = (const int16_t*)((int8_t*)b1+stride); \ +b2 = (const int16_t*)((int8_t*)b2+stride) + +uint32_t +sse8_16bit_altivec_c(const int16_t * b1, + const int16_t * b2, + const uint32_t stride) +{ + register vector signed short b1_vec; + register vector signed short b2_vec; + register vector signed short diff; + register vector signed int sum; + uint32_t result; + + /* initialize */ + sum = vec_splat_s32(0); + + SSE8_16BIT(); + SSE8_16BIT(); + SSE8_16BIT(); + SSE8_16BIT(); + + SSE8_16BIT(); + SSE8_16BIT(); + SSE8_16BIT(); + SSE8_16BIT(); + + /* sum the vector */ + sum = vec_sums(sum, vec_splat_s32(0)); + sum = vec_splat(sum,3); + + vec_ste(sum,0,(int*)&result); + + /* and return */ + return result; }