--- gmc.c 2003/09/30 18:20:31 1.1.2.5 +++ gmc.c 2008/11/27 16:31:48 1.9 @@ -19,7 +19,7 @@ * along with this program ; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * $Id: gmc.c,v 1.1.2.5 2003/09/30 18:20:31 edgomez Exp $ + * $Id: gmc.c,v 1.9 2008/11/27 16:31:48 Isibaar Exp $ * ****************************************************************************/ @@ -27,9 +27,62 @@ #include "../global.h" #include "../encoder.h" #include "gmc.h" +#include "../utils/emms.h" #include + /* initialized by init_GMC(), for 3points */ +static +void (*Predict_16x16_func)(const NEW_GMC_DATA * const This, + uint8_t *dst, const uint8_t *src, + int dststride, int srcstride, int x, int y, int rounding) = 0; +static +void (*Predict_8x8_func)(const NEW_GMC_DATA * const This, + uint8_t *uDst, const uint8_t *uSrc, + uint8_t *vDst, const uint8_t *vSrc, + int dststride, int srcstride, int x, int y, int rounding) = 0; + +/****************************************************************************/ +/* this is borrowed from bitstream.c until we find a common solution */ +static uint32_t __inline +log2bin(uint32_t value) +{ +/* Changed by Chenm001 */ +#if !defined(_MSC_VER) + int n = 0; + + while (value) { + value >>= 1; + n++; + } + return n; +#else + __asm { + bsr eax, value + inc eax + } +#endif +} + +/* 16*sizeof(int) -> 1 or 2 cachelines */ +/* table lookup might be faster! (still to be benchmarked) */ + +/* +static int log2bin_table[16] = + { 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4}; +*/ +/* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */ + +#define RDIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b)) +#define RSHIFT(a,b) ( (a)>0 ? ((a) + (1<<((b)-1)))>>(b) : ((a) + (1<<((b)-1))-1)>>(b)) + +#define MLT(i) (((16-(i))<<16) + (i)) +static const uint32_t MTab[16] = { + MLT( 0), MLT( 1), MLT( 2), MLT( 3), MLT( 4), MLT( 5), MLT( 6), MLT( 7), + MLT( 8), MLT( 9), MLT(10), MLT(11), MLT(12), MLT(13), MLT(14), MLT(15) +}; +#undef MLT + /* ************************************************************ * Pts = 2 or 3 * @@ -38,9 +91,10 @@ * Conversely, *dst is the macroblock top-left adress. */ +static void Predict_16x16_C(const NEW_GMC_DATA * const This, - uint8_t *dst, const uint8_t *src, - int dststride, int srcstride, int x, int y, int rounding) + uint8_t *dst, const uint8_t *src, + int dststride, int srcstride, int x, int y, int rounding) { const int W = This->sW; const int H = This->sH; @@ -70,12 +124,17 @@ U += dUx; V += dVx; if (u > 0 && u <= W) { ri = MTab[u&15]; Offset = u>>4; } - else if (u > W) Offset = W>>4; - else Offset = -1; - + else { + if (u > W) Offset = W>>4; + else Offset = 0; + ri = MTab[0]; + } + if (v > 0 && v <= H) { rj = MTab[v&15]; Offset += (v>>4)*srcstride; } - else if (v > H) Offset += (H>>4)*srcstride; - else Offset -= srcstride; + else { + if (v > H) Offset += (H>>4)*srcstride; + rj = MTab[0]; + } f0 = src[Offset + 0]; f0 |= src[Offset + 1] << 16; @@ -92,10 +151,11 @@ } } +static void Predict_8x8_C(const NEW_GMC_DATA * const This, - uint8_t *uDst, const uint8_t *uSrc, - uint8_t *vDst, const uint8_t *vSrc, - int dststride, int srcstride, int x, int y, int rounding) + uint8_t *uDst, const uint8_t *uSrc, + uint8_t *vDst, const uint8_t *vSrc, + int dststride, int srcstride, int x, int y, int rounding) { const int W = This->sW >> 1; const int H = This->sH >> 1; @@ -131,18 +191,17 @@ ri = MTab[u&15]; Offset = u>>4; } else { - ri = 16; if (u>W) Offset = W>>4; - else Offset = -1; + else Offset = 0; + ri = MTab[0]; } - + if (v > 0 && v <= H) { rj = MTab[v&15]; Offset += (v>>4)*srcstride; } else { - rj = 16; if (v>H) Offset += (H>>4)*srcstride; - else Offset -= srcstride; + rj = MTab[0]; } f0 = uSrc[Offset + 0]; @@ -160,9 +219,9 @@ f0 |= vSrc[Offset + 1] << 16; f1 = vSrc[Offset + srcstride + 0]; f1 |= vSrc[Offset + srcstride + 1] << 16; - f0 = (ri*f0)>>16; + f0 = (ri*f0)>>16; f1 = (ri*f1) & 0x0fff0000; - f0 |= f1; + f0 |= f1; f0 = (rj*f0 + Rounder) >> 24; vDst[i] = (uint8_t)f0; @@ -172,8 +231,9 @@ } } +static void get_average_mv_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv, - int x, int y, int qpel) + int x, int y, int qpel) { int i, j; int vx = 0, vy = 0; @@ -184,10 +244,10 @@ int32_t U, V; U = uo; uo += Dsp->dU[1]; V = vo; vo += Dsp->dV[1]; - for (i=16; i>0; --i) + for (i=16; i>0; --i) { int32_t u,v; - u = U >> 16; U += Dsp->dU[0]; vx += u; + u = U >> 16; U += Dsp->dU[0]; vx += u; v = V >> 16; V += Dsp->dV[0]; vy += v; } } @@ -202,9 +262,10 @@ * simplified version for 1 warp point */ +static void Predict_1pt_16x16_C(const NEW_GMC_DATA * const This, - uint8_t *Dst, const uint8_t *Src, - int dststride, int srcstride, int x, int y, int rounding) + uint8_t *Dst, const uint8_t *Src, + int dststride, int srcstride, int x, int y, int rounding) { const int W = This->sW; const int H = This->sH; @@ -214,17 +275,23 @@ int32_t uo = This->Uo + (x<<8); /* ((16*x)<<4) */ int32_t vo = This->Vo + (y<<8); - const uint32_t ri = MTab[uo & 15]; - const uint32_t rj = MTab[vo & 15]; + uint32_t ri = MTab[uo & 15]; + uint32_t rj = MTab[vo & 15]; int i, j; int32_t Offset; - if ((uint32_t)vo<=(uint32_t)H) Offset = (vo>>4)*srcstride; - else if (vo>H) Offset = ( H>>4)*srcstride; - else Offset =-16*srcstride; - if ((uint32_t)uo<=(uint32_t)W) Offset += (uo>>4); - else if (uo>W) Offset += ( W>>4); - else Offset -= 16; + if (vo>=(-16<<4) && vo<=H) Offset = (vo>>4)*srcstride; + else { + if (vo>H) Offset = ( H>>4)*srcstride; + else Offset =-16*srcstride; + rj = MTab[0]; + } + if (uo>=(-16<<4) && uo<=W) Offset += (uo>>4); + else { + if (uo>W) Offset += (W>>4); + else Offset -= 16; + ri = MTab[0]; + } Dst += 16; @@ -239,18 +306,19 @@ f1 |= Src[ Offset+srcstride +1 ] << 16; f0 = (ri*f0)>>16; f1 = (ri*f1) & 0x0fff0000; - f0 |= f1; + f0 |= f1; f0 = ( rj*f0 + Rounder ) >> 24; Dst[i] = (uint8_t)f0; } Dst += dststride; } -} +} +static void Predict_1pt_8x8_C(const NEW_GMC_DATA * const This, - uint8_t *uDst, const uint8_t *uSrc, - uint8_t *vDst, const uint8_t *vSrc, - int dststride, int srcstride, int x, int y, int rounding) + uint8_t *uDst, const uint8_t *uSrc, + uint8_t *vDst, const uint8_t *vSrc, + int dststride, int srcstride, int x, int y, int rounding) { const int W = This->sW >> 1; const int H = This->sH >> 1; @@ -259,17 +327,23 @@ int32_t uo = This->Uco + (x<<7); int32_t vo = This->Vco + (y<<7); - const uint32_t rri = MTab[uo & 15]; - const uint32_t rrj = MTab[vo & 15]; + uint32_t rri = MTab[uo & 15]; + uint32_t rrj = MTab[vo & 15]; int i, j; int32_t Offset; - if ((uint32_t)vo<=(uint32_t)H) Offset = (vo>>4)*srcstride; - else if (vo>H) Offset = ( H>>4)*srcstride; - else Offset =-8*srcstride; - if ((uint32_t)uo<=(uint32_t)W) Offset += (uo>>4); - else if (uo>W) Offset += (W>>4); - else Offset -= 8; + if (vo>=(-8<<4) && vo<=H) Offset = (vo>>4)*srcstride; + else { + if (vo>H) Offset = ( H>>4)*srcstride; + else Offset =-8*srcstride; + rrj = MTab[0]; + } + if (uo>=(-8<<4) && uo<=W) Offset += (uo>>4); + else { + if (uo>W) Offset += ( W>>4); + else Offset -= 8; + rri = MTab[0]; + } uDst += 8; vDst += 8; @@ -278,17 +352,17 @@ for(i=-8; i<0; ++i, Offset++) { uint32_t f0, f1; - f0 = uSrc[ Offset + 0 ]; + f0 = uSrc[ Offset + 0 ]; f0 |= uSrc[ Offset + 1 ] << 16; f1 = uSrc[ Offset + srcstride + 0 ]; f1 |= uSrc[ Offset + srcstride + 1 ] << 16; f0 = (rri*f0)>>16; - f1 = (rri*f1) & 0x0fff0000; - f0 |= f1; + f1 = (rri*f1) & 0x0fff0000; + f0 |= f1; f0 = ( rrj*f0 + Rounder ) >> 24; - uDst[i] = (uint8_t)f0; + uDst[i] = (uint8_t)f0; - f0 = vSrc[ Offset + 0 ]; + f0 = vSrc[ Offset + 0 ]; f0 |= vSrc[ Offset + 1 ] << 16; f1 = vSrc[ Offset + srcstride + 0 ]; f1 |= vSrc[ Offset + srcstride + 1 ] << 16; @@ -303,6 +377,7 @@ } } +static void get_average_mv_1pt_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv, int x, int y, int qpel) { @@ -310,6 +385,230 @@ mv->y = RSHIFT(Dsp->Vo<>4) + (v>>4)*srcstride; + f0 = Src[0]; + f0 |= Src[1] << 16; + f1 = Src[srcstride +0]; + f1 |= Src[srcstride +1] << 16; + f0 = (ri*f0)>>16; + f1 = (ri*f1) & 0x0fff0000; + f0 |= f1; + f0 = ( rj*f0 + Rounder ) >> 24; + Dst[i] = (uint8_t)f0; + } +} + +////////////////////////////////////////////////////////// + +static +void Predict_16x16_mmx(const NEW_GMC_DATA * const This, + uint8_t *dst, const uint8_t *src, + int dststride, int srcstride, int x, int y, int rounding) +{ + const int W = This->sW; + const int H = This->sH; + const int rho = 3 - This->accuracy; + const int Rounder = ( 128 - (rounding<<(2*rho)) ) << 16; + const uint32_t W2 = W<<(16-rho); + const uint32_t H2 = H<<(16-rho); + + const int dUx = This->dU[0]; + const int dVx = This->dV[0]; + const int dUy = This->dU[1]; + const int dVy = This->dV[1]; + + int Uo = This->Uo + 16*(dUy*y + dUx*x); + int Vo = This->Vo + 16*(dVy*y + dVx*x); + + int i, j; + + DECLARE_ALIGNED_MATRIX(Offsets, 2,16, uint16_t, CACHE_LINE); + for(j=16; j>0; --j) + { + int32_t U = Uo, V = Vo; + Uo += dUy; Vo += dVy; + if ( W2>(uint32_t)U && W2>(uint32_t)(U+15*dUx) && + H2>(uint32_t)V && H2>(uint32_t)(V+15*dVx) ) + { + uint32_t UV1, UV2; + for(i=0; i<16; ++i) + { + uint32_t u = ( U >> 16 ) << rho; + uint32_t v = ( V >> 16 ) << rho; + U += dUx; V += dVx; + Offsets[ i] = u; + Offsets[16+i] = v; + } + // batch 8 input pixels when linearity says it's ok + + UV1 = (Offsets[0] | (Offsets[16]<<16)) & 0xfff0fff0U; + UV2 = (Offsets[7] | (Offsets[23]<<16)) & 0xfff0fff0U; + if (UV1+7*16==UV2) + GMC_Core_Lin_8(dst, Offsets, src + (Offsets[0]>>4) + (Offsets[16]>>4)*srcstride, srcstride, Rounder); + else + GMC_Core_Non_Lin_8(dst, Offsets, src, srcstride, Rounder); + UV1 = (Offsets[ 8] | (Offsets[24]<<16)) & 0xfff0fff0U; + UV2 = (Offsets[15] | (Offsets[31]<<16)) & 0xfff0fff0U; + if (UV1+7*16==UV2) + GMC_Core_Lin_8(dst+8, Offsets+8, src + (Offsets[8]>>4) + (Offsets[24]>>4)*srcstride, srcstride, Rounder); + else + GMC_Core_Non_Lin_8(dst+8, Offsets+8, src, srcstride, Rounder); + } + else + { + for(i=0; i<16; ++i) + { + int u = ( U >> 16 ) << rho; + int v = ( V >> 16 ) << rho; + U += dUx; V += dVx; + + Offsets[ i] = (u<0) ? 0 : (u>=W) ? W : u; + Offsets[16+i] = (v<0) ? 0 : (v>=H) ? H : v; + } + // due to boundary clipping, we cannot infer the 8-pixels batchability + // simply by using the linearity. Oh well, not a big deal... + GMC_Core_Non_Lin_8(dst, Offsets, src, srcstride, Rounder); + GMC_Core_Non_Lin_8(dst+8, Offsets+8, src, srcstride, Rounder); + } + dst += dststride; + } +} + +static +void Predict_8x8_mmx(const NEW_GMC_DATA * const This, + uint8_t *uDst, const uint8_t *uSrc, + uint8_t *vDst, const uint8_t *vSrc, + int dststride, int srcstride, int x, int y, int rounding) +{ + const int W = This->sW >> 1; + const int H = This->sH >> 1; + const int rho = 3-This->accuracy; + const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16; + const uint32_t W2 = W<<(16-rho); + const uint32_t H2 = H<<(16-rho); + + const int dUx = This->dU[0]; + const int dVx = This->dV[0]; + const int dUy = This->dU[1]; + const int dVy = This->dV[1]; + + int Uo = This->Uco + 8*(dUy*y + dUx*x); + int Vo = This->Vco + 8*(dVy*y + dVx*x); + + DECLARE_ALIGNED_MATRIX(Offsets, 2,16, uint16_t, CACHE_LINE); + int i, j; + for(j=8; j>0; --j) + { + int32_t U = Uo, V = Vo; + Uo += dUy; Vo += dVy; + if ( W2>(uint32_t)U && W2>(uint32_t)(U+15*dUx) && + H2>(uint32_t)V && H2>(uint32_t)(V+15*dVx) ) + { + uint32_t UV1, UV2; + for(i=0; i<8; ++i) + { + int32_t u = ( U >> 16 ) << rho; + int32_t v = ( V >> 16 ) << rho; + U += dUx; V += dVx; + Offsets[ i] = u; + Offsets[16+i] = v; + } + + // batch 8 input pixels when linearity says it's ok + UV1 = (Offsets[ 0] | (Offsets[16]<<16)) & 0xfff0fff0U; + UV2 = (Offsets[ 7] | (Offsets[23]<<16)) & 0xfff0fff0U; + if (UV1+7*16==UV2) + { + const uint32_t Off = (Offsets[0]>>4) + (Offsets[16]>>4)*srcstride; + GMC_Core_Lin_8(uDst, Offsets, uSrc+Off, srcstride, Rounder); + GMC_Core_Lin_8(vDst, Offsets, vSrc+Off, srcstride, Rounder); + } + else { + GMC_Core_Non_Lin_8(uDst, Offsets, uSrc, srcstride, Rounder); + GMC_Core_Non_Lin_8(vDst, Offsets, vSrc, srcstride, Rounder); + } + } + else + { + for(i=0; i<8; ++i) + { + int u = ( U >> 16 ) << rho; + int v = ( V >> 16 ) << rho; + U += dUx; V += dVx; + Offsets[ i] = (u<0) ? 0 : (u>=W) ? W : u; + Offsets[16+i] = (v<0) ? 0 : (v>=H) ? H : v; + } + GMC_Core_Non_Lin_8(uDst, Offsets, uSrc, srcstride, Rounder); + GMC_Core_Non_Lin_8(vDst, Offsets, vSrc, srcstride, Rounder); + } + uDst += dststride; + vDst += dststride; + } +} + +#endif /* ARCH_IS_IA32 */ + +/* ************************************************************* + * will initialize internal pointers + */ + +void init_GMC(const unsigned int cpu_flags) +{ + Predict_16x16_func = Predict_16x16_C; + Predict_8x8_func = Predict_8x8_C; + +#if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64) + if ((cpu_flags & XVID_CPU_MMX) || (cpu_flags & XVID_CPU_MMXEXT) || + (cpu_flags & XVID_CPU_3DNOW) || (cpu_flags & XVID_CPU_3DNOWEXT) || + (cpu_flags & XVID_CPU_SSE) || (cpu_flags & XVID_CPU_SSE2) || + (cpu_flags & XVID_CPU_SSE3) || (cpu_flags & XVID_CPU_SSE41)) + { + Predict_16x16_func = Predict_16x16_mmx; + Predict_8x8_func = Predict_8x8_mmx; + + if (cpu_flags & XVID_CPU_SSE41) + GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_sse41; + else if (cpu_flags & XVID_CPU_SSE2) + GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_sse2; + else + GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_mmx; + } +#endif +} + /* ************************************************************* * Warning! It's Accuracy being passed, not 'resolution'! */ @@ -325,17 +624,16 @@ gmc->num_wp = nb_pts; /* reduce the number of points, if possible */ - if (nb_pts<3 || (pts->duv[2].x==-pts->duv[1].y && pts->duv[2].y==pts->duv[1].x)) { - if (nb_pts<2 || (pts->duv[1].x==0 && pts->duv[1].y==0)) { - if (nb_pts<1 || (pts->duv[0].x==0 && pts->duv[0].y==0)) { - nb_pts = 0; - } - else nb_pts = 1; - } - else nb_pts = 2; - } - else nb_pts = 3; - + if (nb_pts<2 || (pts->duv[2].x==0 && pts->duv[2].y==0 && pts->duv[1].x==0 && pts->duv[1].y==0 )) { + if (nb_pts<2 || (pts->duv[1].x==0 && pts->duv[1].y==0)) { + if (nb_pts<1 || (pts->duv[0].x==0 && pts->duv[0].y==0)) { + nb_pts = 0; + } + else nb_pts = 1; + } + else nb_pts = 2; + } + /* now, nb_pts stores the actual number of points required for interpolation */ if (nb_pts<=1) @@ -364,15 +662,11 @@ gmc->dU[0] = 16*Ws + RDIV( 8*Ws*pts->duv[1].x, width ); /* dU/dx */ gmc->dV[0] = RDIV( 8*Ws*pts->duv[1].y, width ); /* dV/dx */ -/* disabled, because possibly buggy? */ - -#if 0 if (nb_pts==2) { gmc->dU[1] = -gmc->dV[0]; /* -Sin */ gmc->dV[1] = gmc->dU[0] ; /* Cos */ } else -#endif { const int Beta = log2bin(height-1); const int Hs = 1<Uco = (gmc->Uco + gmc->dU[0] + gmc->dU[1])>>2; gmc->Vco = (gmc->Vco + gmc->dV[0] + gmc->dV[1])>>2; - gmc->predict_16x16 = Predict_16x16_C; - gmc->predict_8x8 = Predict_8x8_C; + gmc->predict_16x16 = Predict_16x16_func; + gmc->predict_8x8 = Predict_8x8_func; gmc->get_average_mv = get_average_mv_C; } } @@ -435,8 +729,8 @@ const int mbnum = mj*mb_width+mi; if (pGMC) { - gmc_data->predict_16x16(gmc_data, - pGMC->y + mj*16*stride + mi*16, pRef->y, + gmc_data->predict_16x16(gmc_data, + pGMC->y + mj*16*stride + mi*16, pRef->y, stride, stride, mi, mj, rounding); gmc_data->predict_8x8(gmc_data, @@ -451,4 +745,5 @@ pMBs[mbnum].mcsel = 0; /* until mode decision */ } + emms(); }