--- interpolate8x8.c 2002/10/15 22:27:07 1.4.2.3 +++ interpolate8x8.c 2003/03/27 15:00:34 1.10.2.1 @@ -31,6 +31,7 @@ #include "../portab.h" +#include "../global.h" #include "interpolate8x8.h" // function pointers @@ -53,12 +54,12 @@ INTERPOLATE8X8_6TAP_LOWPASS_PTR interpolate8x8_6tap_lowpass_h; INTERPOLATE8X8_6TAP_LOWPASS_PTR interpolate8x8_6tap_lowpass_v; -void interpolate8x8_avg2_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint32_t stride, const uint32_t rounding) +void interpolate8x8_avg2_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint32_t stride, const uint32_t rounding, const uint32_t height) { - int32_t i; + uint32_t i; const int32_t round = 1 - rounding; - for(i = 0; i < 9; i++) + for(i = 0; i < height; i++) { dst[0] = (src1[0] + src2[0] + round) >> 1; dst[1] = (src1[1] + src2[1] + round) >> 1; @@ -107,19 +108,32 @@ const uint32_t stride, const uint32_t rounding) { - uint32_t i, j; - - for (j = 0; j < 8; j++) { - for (i = 0; i < 8; i++) { - - int16_t tot = - (int32_t) src[j * stride + i] + (int32_t) src[j * stride + i + - 1]; - - tot = (int32_t) ((tot + 1 - rounding) >> 1); - dst[j * stride + i] = (uint8_t) tot; + intptr_t j; + + if (rounding) + for (j = 7*stride; j >= 0; j-=stride) + { + dst[j + 0] = (uint8_t)((src[j + 0] + src[j + 1] )>>1); + dst[j + 1] = (uint8_t)((src[j + 1] + src[j + 2] )>>1); + dst[j + 2] = (uint8_t)((src[j + 2] + src[j + 3] )>>1); + dst[j + 3] = (uint8_t)((src[j + 3] + src[j + 4] )>>1); + dst[j + 4] = (uint8_t)((src[j + 4] + src[j + 5] )>>1); + dst[j + 5] = (uint8_t)((src[j + 5] + src[j + 6] )>>1); + dst[j + 6] = (uint8_t)((src[j + 6] + src[j + 7] )>>1); + dst[j + 7] = (uint8_t)((src[j + 7] + src[j + 8] )>>1); + } + else + for (j = 0; j < 8*stride; j+=stride) /* forward or backwards? Who knows ... */ + { + dst[j + 0] = (uint8_t)((src[j + 0] + src[j + 1] + 1)>>1); + dst[j + 1] = (uint8_t)((src[j + 1] + src[j + 2] + 1)>>1); + dst[j + 2] = (uint8_t)((src[j + 2] + src[j + 3] + 1)>>1); + dst[j + 3] = (uint8_t)((src[j + 3] + src[j + 4] + 1)>>1); + dst[j + 4] = (uint8_t)((src[j + 4] + src[j + 5] + 1)>>1); + dst[j + 5] = (uint8_t)((src[j + 5] + src[j + 6] + 1)>>1); + dst[j + 6] = (uint8_t)((src[j + 6] + src[j + 7] + 1)>>1); + dst[j + 7] = (uint8_t)((src[j + 7] + src[j + 8] + 1)>>1); } - } } @@ -130,16 +144,33 @@ const uint32_t stride, const uint32_t rounding) { - uint32_t i, j; + intptr_t j; - for (j = 0; j < 8; j++) { - for (i = 0; i < 8; i++) { - int16_t tot = src[j * stride + i] + src[j * stride + i + stride]; - tot = ((tot + 1 - rounding) >> 1); - dst[j * stride + i] = (uint8_t) tot; + if (rounding) + for (j = 0; j < 8*stride; j+=stride) /* forward is better. Some automatic prefetch perhaps. */ + { + dst[j + 0] = (uint8_t)((src[j + 0] + src[j + stride + 0] )>>1); + dst[j + 1] = (uint8_t)((src[j + 1] + src[j + stride + 1] )>>1); + dst[j + 2] = (uint8_t)((src[j + 2] + src[j + stride + 2] )>>1); + dst[j + 3] = (uint8_t)((src[j + 3] + src[j + stride + 3] )>>1); + dst[j + 4] = (uint8_t)((src[j + 4] + src[j + stride + 4] )>>1); + dst[j + 5] = (uint8_t)((src[j + 5] + src[j + stride + 5] )>>1); + dst[j + 6] = (uint8_t)((src[j + 6] + src[j + stride + 6] )>>1); + dst[j + 7] = (uint8_t)((src[j + 7] + src[j + stride + 7] )>>1); + } + else + for (j = 0; j < 8*stride; j+=stride) + { + dst[j + 0] = (uint8_t)((src[j + 0] + src[j + stride + 0] + 1)>>1); + dst[j + 1] = (uint8_t)((src[j + 1] + src[j + stride + 1] + 1)>>1); + dst[j + 2] = (uint8_t)((src[j + 2] + src[j + stride + 2] + 1)>>1); + dst[j + 3] = (uint8_t)((src[j + 3] + src[j + stride + 3] + 1)>>1); + dst[j + 4] = (uint8_t)((src[j + 4] + src[j + stride + 4] + 1)>>1); + dst[j + 5] = (uint8_t)((src[j + 5] + src[j + stride + 5] + 1)>>1); + dst[j + 6] = (uint8_t)((src[j + 6] + src[j + stride + 6] + 1)>>1); + dst[j + 7] = (uint8_t)((src[j + 7] + src[j + stride + 7] + 1)>>1); } - } } @@ -149,26 +180,38 @@ const uint32_t stride, const uint32_t rounding) { - uint32_t i, j; + intptr_t j; - for (j = 0; j < 8; j++) { - for (i = 0; i < 8; i++) { - int16_t tot = - src[j * stride + i] + src[j * stride + i + 1] + - src[j * stride + i + stride] + src[j * stride + i + stride + - 1]; - tot = ((tot + 2 - rounding) >> 2); - dst[j * stride + i] = (uint8_t) tot; + if (rounding) + for (j = 7*stride; j >= 0; j-=stride) + { + dst[j + 0] = (uint8_t)((src[j+0] + src[j+1] + src[j+stride+0] + src[j+stride+1] +1)>>2); + dst[j + 1] = (uint8_t)((src[j+1] + src[j+2] + src[j+stride+1] + src[j+stride+2] +1)>>2); + dst[j + 2] = (uint8_t)((src[j+2] + src[j+3] + src[j+stride+2] + src[j+stride+3] +1)>>2); + dst[j + 3] = (uint8_t)((src[j+3] + src[j+4] + src[j+stride+3] + src[j+stride+4] +1)>>2); + dst[j + 4] = (uint8_t)((src[j+4] + src[j+5] + src[j+stride+4] + src[j+stride+5] +1)>>2); + dst[j + 5] = (uint8_t)((src[j+5] + src[j+6] + src[j+stride+5] + src[j+stride+6] +1)>>2); + dst[j + 6] = (uint8_t)((src[j+6] + src[j+7] + src[j+stride+6] + src[j+stride+7] +1)>>2); + dst[j + 7] = (uint8_t)((src[j+7] + src[j+8] + src[j+stride+7] + src[j+stride+8] +1)>>2); + } + else + for (j = 7*stride; j >= 0; j-=stride) + { + dst[j + 0] = (uint8_t)((src[j+0] + src[j+1] + src[j+stride+0] + src[j+stride+1] +2)>>2); + dst[j + 1] = (uint8_t)((src[j+1] + src[j+2] + src[j+stride+1] + src[j+stride+2] +2)>>2); + dst[j + 2] = (uint8_t)((src[j+2] + src[j+3] + src[j+stride+2] + src[j+stride+3] +2)>>2); + dst[j + 3] = (uint8_t)((src[j+3] + src[j+4] + src[j+stride+3] + src[j+stride+4] +2)>>2); + dst[j + 4] = (uint8_t)((src[j+4] + src[j+5] + src[j+stride+4] + src[j+stride+5] +2)>>2); + dst[j + 5] = (uint8_t)((src[j+5] + src[j+6] + src[j+stride+5] + src[j+stride+6] +2)>>2); + dst[j + 6] = (uint8_t)((src[j+6] + src[j+7] + src[j+stride+6] + src[j+stride+7] +2)>>2); + dst[j + 7] = (uint8_t)((src[j+7] + src[j+8] + src[j+stride+7] + src[j+stride+8] +2)>>2); } - } } /************************************************************* * QPEL STUFF STARTS HERE * *************************************************************/ -#define CLIP(X,A,B) (X < A) ? (A) : ((X > B) ? (B) : (X)) - void interpolate8x8_6tap_lowpass_h_c(uint8_t *dst, uint8_t *src, int32_t stride, int32_t rounding) { int32_t i; @@ -421,4 +464,4 @@ interpolate8x8_lowpass_v_c(dst1, dst2, stride, rounding); -} \ No newline at end of file +}