--- decoder.c	2003/01/03 16:25:14	1.37.2.25
+++ decoder.c	2003/05/03 23:23:38	1.49.4.1
@@ -1,16 +1,9 @@
-/**************************************************************************
+/*****************************************************************************
  *
  *  XVID MPEG-4 VIDEO CODEC
- *  -  Decoder main module  -
+ *  - Decoder Module -
  *
- *  This program is an implementation of a part of one or more MPEG-4
- *  Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
- *  to use this software module in hardware or software products are
- *  advised that its use may infringe existing patents or copyrights, and
- *  any such use would be at such party's own risk.  The original
- *  developer of this software module and his/her company, and subsequent
- *  editors and their companies, will have no liability for use of this
- *  software or modifications or derivatives thereof.
+ *  This file is part of XviD, a free MPEG-4 video encoder/decoder
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -26,39 +19,11 @@
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  *
- *************************************************************************/
-
-/**************************************************************************
- *
- *  History:
- *
- *  15.07.2002  fix a bug in B-frame decode at DIRECT mode
- *              MinChen <chenm001@163.com>
- *  10.07.2002  added BFRAMES_DEC_DEBUG support
- *              Fix a little bug for low_delay flage
- *              MinChen <chenm001@163.com>
- *  28.06.2002  added basic resync support to iframe/pframe_decode()
- *  22.06.2002	added primative N_VOP support
- *				#define BFRAMES_DEC now enables Minchen's bframe decoder
- *  08.05.2002  add low_delay support for B_VOP decode
- *              MinChen <chenm001@163.com>
- *  05.05.2002  fix some B-frame decode problem
- *  02.05.2002  add B-frame decode support(have some problem);
- *              MinChen <chenm001@163.com>
- *  22.04.2002  add some B-frame decode support;  chenm001 <chenm001@163.com>
- *  29.03.2002  interlacing fix - compensated block wasn't being used when
- *              reconstructing blocks, thus artifacts
- *              interlacing speedup - used transfers to re-interlace
- *              interlaced decoding should be as fast as progressive now
- *  26.03.2002  interlacing support - moved transfers outside decode loop
- *  26.12.2001  decoder_mbinter: dequant/idct moved within if(coded) block
- *  22.12.2001  lock based interpolation
- *  01.12.2001  inital version; (c)2001 peter ross <pross@cs.rmit.edu.au>
+ * $Id: decoder.c,v 1.49.4.1 2003/05/03 23:23:38 Isibaar Exp $
  *
- *  $Id: decoder.c,v 1.37.2.25 2003/01/03 16:25:14 suxen_drol Exp $
- *
- *************************************************************************/
+ ****************************************************************************/
 
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -68,6 +33,7 @@
 
 #include "xvid.h"
 #include "portab.h"
+#include "global.h"
 
 #include "decoder.h"
 #include "bitstream/bitstream.h"
@@ -91,25 +57,26 @@
 #include "image/image.h"
 #include "image/colorspace.h"
 #include "utils/mem_align.h"
+#include "image/postprocessing.h"
 
 int
 decoder_resize(DECODER * dec)
 {
 	/* free existing */
-
 	image_destroy(&dec->cur, dec->edged_width, dec->edged_height);
 	image_destroy(&dec->refn[0], dec->edged_width, dec->edged_height);
 	image_destroy(&dec->refn[1], dec->edged_width, dec->edged_height);
 	image_destroy(&dec->tmp, dec->edged_width, dec->edged_height);
 	image_destroy(&dec->qtmp, dec->edged_width, dec->edged_height);
 
+	image_destroy(&dec->gmc, dec->edged_width, dec->edged_height);
+
 	if (dec->last_mbs) 
 		xvid_free(dec->last_mbs);
 	if (dec->mbs)
 		xvid_free(dec->mbs);
 
 	/* realloc */
-
 	dec->mb_width = (dec->width + 15) / 16;
 	dec->mb_height = (dec->height + 15) / 16;
 
@@ -127,8 +94,7 @@
 		return XVID_ERR_MEMORY;
 	}
 
-	// add by chenm001 <chenm001@163.com>
-	// for support B-frame to reference last 2 frame
+	/* B-frames reference the last 2 frames, so a second reference image is needed */
 	if (image_create(&dec->refn[1], dec->edged_width, dec->edged_height)) {
 		image_destroy(&dec->cur, dec->edged_width, dec->edged_height);
 		image_destroy(&dec->refn[0], dec->edged_width, dec->edged_height);
@@ -152,6 +118,16 @@
 		return XVID_ERR_MEMORY;
 	}
 
+	if (image_create(&dec->gmc, dec->edged_width, dec->edged_height)) {
+		image_destroy(&dec->qtmp, dec->edged_width, dec->edged_height);
+		image_destroy(&dec->cur, dec->edged_width, dec->edged_height);
+		image_destroy(&dec->refn[0], dec->edged_width, dec->edged_height);
+		image_destroy(&dec->refn[1], dec->edged_width, dec->edged_height);
+		image_destroy(&dec->tmp, dec->edged_width, dec->edged_height);
+		xvid_free(dec);
+		return XVID_ERR_MEMORY;
+	}
+
 	dec->mbs =
 		xvid_malloc(sizeof(MACROBLOCK) * dec->mb_width * dec->mb_height,
 					CACHE_LINE);
@@ -166,8 +142,7 @@
 	}
 	memset(dec->mbs, 0, sizeof(MACROBLOCK) * dec->mb_width * dec->mb_height);
 
-	// add by chenm001 <chenm001@163.com>
-	// for skip MB flag
+	/* For skip MB flag */
 	dec->last_mbs =
 		xvid_malloc(sizeof(MACROBLOCK) * dec->mb_width * dec->mb_height,
 					CACHE_LINE);
@@ -210,13 +185,16 @@
 	image_null(&dec->tmp);
 	image_null(&dec->qtmp);
 
+	/* image based GMC */
+	image_null(&dec->gmc);
+
+
 	dec->mbs = NULL;
 	dec->last_mbs = NULL;
 
 	init_timer();
 
-	// add by chenm001 <chenm001@163.com>
-	// for support B-frame to save reference frame's time
+	/* For B-frame support (used to save the reference frame's time) */
 	dec->frames = 0;
 	dec->time = dec->time_base = dec->last_time_base = 0;
 	dec->low_delay = 0;
@@ -236,6 +214,10 @@
 {
 	xvid_free(dec->last_mbs);
 	xvid_free(dec->mbs);
+
+	/* image based GMC */
+	image_destroy(&dec->gmc, dec->edged_width, dec->edged_height);
+
 	image_destroy(&dec->refn[0], dec->edged_width, dec->edged_height);
 	image_destroy(&dec->refn[1], dec->edged_width, dec->edged_height);
 	image_destroy(&dec->tmp, dec->edged_width, dec->edged_height);
@@ -256,8 +238,7 @@
 
 
 
-// decode an intra macroblock
-
+/* decode an intra macroblock */
 void
 decoder_mbintra(DECODER * dec,
 				MACROBLOCK * pMB,
@@ -292,7 +273,7 @@
 		pV_Cur = dec->cur.v + (y_pos << 3) * stride2 + (x_pos << 3);
 	}
 
-	memset(block, 0, 6 * 64 * sizeof(int16_t));	// clear
+	memset(block, 0, 6 * 64 * sizeof(int16_t));	/* clear */
 
 	for (i = 0; i < 6; i++) {
 		uint32_t iDcScaler = get_dc_scaler(iQuant, i < 4);
@@ -315,7 +296,7 @@
 			dc_dif = dc_size ? get_dc_dif(bs, dc_size) : 0;
 
 			if (dc_size > 8) {
-				BitstreamSkip(bs, 1);	// marker
+				BitstreamSkip(bs, 1);	/* marker */
 			}
 
 			block[i * 64 + 0] = dc_dif;
@@ -327,7 +308,7 @@
 		}
 
 		start_timer();
-		if (cbp & (1 << (5 - i)))	// coded
+		if (cbp & (1 << (5 - i)))	/* coded */
 		{
 			int direction = dec->alternate_vertical_scan ?
 				2 : pMB->acpred_directions[i];
@@ -384,18 +365,13 @@
 
 
 
-
-#define SIGN(X) (((X)>0)?1:-1)
-#define ABS(X) (((X)>0)?(X):-(X))
-
-// decode an inter macroblock
-
+/* decode an inter macroblock */
 void
 decoder_mbinter(DECODER * dec,
 				const MACROBLOCK * pMB,
 				const uint32_t x_pos,
 				const uint32_t y_pos,
-				const uint32_t acpred_flag,
+				const uint32_t fcode,
 				const uint32_t cbp,
 				Bitstream * bs,
 				const uint32_t quant,
@@ -424,24 +400,19 @@
 			mv[i].x = RRV_MV_SCALEUP(pMB->mvs[i].x);
 			mv[i].y = RRV_MV_SCALEUP(pMB->mvs[i].y);
 		}
-	}else{
+	} else {
 		pY_Cur = dec->cur.y + (y_pos << 4) * stride + (x_pos << 4);
 		pU_Cur = dec->cur.u + (y_pos << 3) * stride2 + (x_pos << 3);
 		pV_Cur = dec->cur.v + (y_pos << 3) * stride2 + (x_pos << 3);
 		for (i = 0; i < 4; i++)
 			mv[i] = pMB->mvs[i];
 	}
-
+	
 	if (pMB->mode == MODE_INTER || pMB->mode == MODE_INTER_Q) {
-		uv_dx = mv[0].x;
-		uv_dy = mv[0].y;
-
-		if (dec->quarterpel)
-		{
-			uv_dx /= 2;
-			uv_dy /= 2;
-		}
 
+		uv_dx = mv[0].x / (1 + dec->quarterpel);
+		uv_dy = mv[0].y / (1 + dec->quarterpel);
+		
 		uv_dx = (uv_dx >> 1) + roundtab_79[uv_dx & 0x3];
 		uv_dy = (uv_dy >> 1) + roundtab_79[uv_dy & 0x3];
 
@@ -461,11 +432,11 @@
 			if(dec->quarterpel) {
 				interpolate16x16_quarterpel(dec->cur.y, dec->refn[0].y, dec->qtmp.y, dec->qtmp.y + 64,
 	 										dec->qtmp.y + 128, 16*x_pos, 16*y_pos,
-											mv[0].x, mv[0].y, stride,  rounding);
+											mv[0].x, mv[0].y, stride, rounding);
 			}
 			else {
 				interpolate16x16_switch(dec->cur.y, dec->refn[0].y, 16*x_pos, 16*y_pos,
-									  mv[0].x, mv[0].y, stride,  rounding);
+									  mv[0].x, mv[0].y, stride, rounding);
 			}
 
 			interpolate8x8_switch(dec->cur.u, dec->refn[0].u, 8 * x_pos, 8 * y_pos,
@@ -497,49 +468,49 @@
 		{
 			interpolate16x16_switch(dec->cur.y, dec->refn[0].y, 32*x_pos, 32*y_pos,
 								  mv[0].x, mv[0].y, stride,  rounding);
-			interpolate16x16_switch(dec->cur.y, dec->refn[0].y, 32*x_pos + 16, 32*y_pos,
+			interpolate16x16_switch(dec->cur.y, dec->refn[0].y , 32*x_pos + 16, 32*y_pos,
 								  mv[1].x, mv[1].y, stride,  rounding);
-			interpolate16x16_switch(dec->cur.y, dec->refn[0].y, 32*x_pos, 32*y_pos + 16,
+			interpolate16x16_switch(dec->cur.y, dec->refn[0].y , 32*x_pos, 32*y_pos + 16,
 								  mv[2].x, mv[2].y, stride,  rounding);
-			interpolate16x16_switch(dec->cur.y, dec->refn[0].y, 32*x_pos + 16, 32*y_pos + 16, 
+			interpolate16x16_switch(dec->cur.y, dec->refn[0].y , 32*x_pos + 16, 32*y_pos + 16, 
 								  mv[3].x, mv[3].y, stride,  rounding);
-			interpolate16x16_switch(dec->cur.u, dec->refn[0].u, 16 * x_pos, 16 * y_pos,
+			interpolate16x16_switch(dec->cur.u, dec->refn[0].u , 16 * x_pos, 16 * y_pos,
 								  uv_dx, uv_dy, stride2, rounding);
-			interpolate16x16_switch(dec->cur.v, dec->refn[0].v, 16 * x_pos, 16 * y_pos,
+			interpolate16x16_switch(dec->cur.v, dec->refn[0].v , 16 * x_pos, 16 * y_pos,
 								  uv_dx, uv_dy, stride2, rounding);
 
-			// set_block(pY_Cur, stride, 32, 32, 127);
+			/* set_block(pY_Cur, stride, 32, 32, 127); */
 		} 
 		else
 		{
 			if(dec->quarterpel) {
-				interpolate8x8_quarterpel(dec->cur.y, dec->refn[0].y, dec->qtmp.y, dec->qtmp.y + 64,
+				interpolate8x8_quarterpel(dec->cur.y, dec->refn[0].y , dec->qtmp.y, dec->qtmp.y + 64,
 										  dec->qtmp.y + 128, 16*x_pos, 16*y_pos,
 										  mv[0].x, mv[0].y, stride,  rounding);
-				interpolate8x8_quarterpel(dec->cur.y, dec->refn[0].y, dec->qtmp.y, dec->qtmp.y + 64,
+				interpolate8x8_quarterpel(dec->cur.y, dec->refn[0].y , dec->qtmp.y, dec->qtmp.y + 64,
 										  dec->qtmp.y + 128, 16*x_pos + 8, 16*y_pos,
 										  mv[1].x, mv[1].y, stride,  rounding);
-				interpolate8x8_quarterpel(dec->cur.y, dec->refn[0].y, dec->qtmp.y, dec->qtmp.y + 64,
+				interpolate8x8_quarterpel(dec->cur.y, dec->refn[0].y , dec->qtmp.y, dec->qtmp.y + 64,
 										  dec->qtmp.y + 128, 16*x_pos, 16*y_pos + 8,
 										  mv[2].x, mv[2].y, stride,  rounding);
-				interpolate8x8_quarterpel(dec->cur.y, dec->refn[0].y, dec->qtmp.y, dec->qtmp.y + 64,
+				interpolate8x8_quarterpel(dec->cur.y, dec->refn[0].y , dec->qtmp.y, dec->qtmp.y + 64,
 										  dec->qtmp.y + 128, 16*x_pos + 8, 16*y_pos + 8,
 										  mv[3].x, mv[3].y, stride,  rounding);
 			}
 			else {
-				interpolate8x8_switch(dec->cur.y, dec->refn[0].y, 16*x_pos, 16*y_pos,
+				interpolate8x8_switch(dec->cur.y, dec->refn[0].y , 16*x_pos, 16*y_pos,
 									  mv[0].x, mv[0].y, stride,  rounding);
-				interpolate8x8_switch(dec->cur.y, dec->refn[0].y, 16*x_pos + 8, 16*y_pos,
+				interpolate8x8_switch(dec->cur.y, dec->refn[0].y , 16*x_pos + 8, 16*y_pos,
 									  mv[1].x, mv[1].y, stride,  rounding);
-				interpolate8x8_switch(dec->cur.y, dec->refn[0].y, 16*x_pos, 16*y_pos + 8,
+				interpolate8x8_switch(dec->cur.y, dec->refn[0].y , 16*x_pos, 16*y_pos + 8,
 									  mv[2].x, mv[2].y, stride,  rounding);
-				interpolate8x8_switch(dec->cur.y, dec->refn[0].y, 16*x_pos + 8, 16*y_pos + 8, 
+				interpolate8x8_switch(dec->cur.y, dec->refn[0].y , 16*x_pos + 8, 16*y_pos + 8, 
 									  mv[3].x, mv[3].y, stride,  rounding);
 			}
 
-			interpolate8x8_switch(dec->cur.u, dec->refn[0].u, 8 * x_pos, 8 * y_pos,
+			interpolate8x8_switch(dec->cur.u, dec->refn[0].u , 8 * x_pos, 8 * y_pos,
 								  uv_dx, uv_dy, stride2, rounding);
-			interpolate8x8_switch(dec->cur.v, dec->refn[0].v, 8 * x_pos, 8 * y_pos,
+			interpolate8x8_switch(dec->cur.v, dec->refn[0].v , 8 * x_pos, 8 * y_pos,
 								  uv_dx, uv_dy, stride2, rounding);
 		}
 		stop_comp_timer();
@@ -548,9 +519,9 @@
 	for (i = 0; i < 6; i++) {
 		int direction = dec->alternate_vertical_scan ? 2 : 0;
 
-		if (cbp & (1 << (5 - i)))	// coded
+		if (cbp & (1 << (5 - i)))	/* coded */
 		{
-			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	// clear
+			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	/* clear */
 
 			start_timer();
 			get_inter_block(bs, &block[i * 64], direction);
@@ -609,6 +580,119 @@
 	stop_transfer_timer();
 }
 
+/* clamp a GMC vector component to [-(1 << (fcode+4)), (1 << (fcode+4)) - 1] */
+static __inline int gmc_sanitize(int value, int quarterpel, int fcode)
+{
+	const int limit = 1 << (fcode + 4);
+
+	/* quarterpel is not read here: the "value *= 2" scaling stays disabled */
+	if (value < -limit)
+		return -limit;
+	if (value >= limit)
+		return limit - 1;
+	return value;
+}
+
+/* decode a GMC macroblock: per-block warp via generate_GMCimageMB(), then add the coded residual into dec->cur */
+static void
+decoder_mbgmc(DECODER * dec,
+				MACROBLOCK * const pMB,
+				const uint32_t x_pos,
+				const uint32_t y_pos,
+				const uint32_t fcode,
+				const uint32_t cbp,
+				Bitstream * bs,
+				const uint32_t quant,
+				const uint32_t rounding,
+				const int reduced_resolution)	/* no reduced res support */
+{
+	/* block: coefficients read by get_inter_block(); data: dequantised blocks passed to idct() */
+	DECLARE_ALIGNED_MATRIX(block, 6, 64, int16_t, CACHE_LINE);
+	DECLARE_ALIGNED_MATRIX(data, 6, 64, int16_t, CACHE_LINE);
+	/* note: dequantisation uses pMB->quant (iQuant); the quant argument is not read in this function */
+	const uint32_t stride = dec->edged_width;
+	const uint32_t stride2 = stride / 2;
+	const uint32_t next_block = stride * (reduced_resolution ? 16 : 8);
+	uint32_t i;
+	const uint32_t iQuant = pMB->quant;
+	uint8_t *const pY_Cur=dec->cur.y + (y_pos << 4) * stride + (x_pos << 4);
+	uint8_t *const pU_Cur=dec->cur.u + (y_pos << 3) * stride2 + (x_pos << 3);
+	uint8_t *const pV_Cur=dec->cur.v + (y_pos << 3) * stride2 + (x_pos << 3);
+	/* seed all four block vectors from the current amv; they are assigned again once amv is recomputed below */
+	pMB->mvs[0] = pMB->mvs[1] = pMB->mvs[2] = pMB->mvs[3] = pMB->amv;
+
+	start_timer();
+	
+/* this is where the calculations are done */
+	
+	{
+		pMB->amv = generate_GMCimageMB(&dec->gmc_data, &dec->refn[0], x_pos, y_pos, 
+					stride, stride2, dec->quarterpel, rounding, &dec->cur);
+		/* clamp both components with gmc_sanitize(); the limit depends on fcode */
+		pMB->amv.x = gmc_sanitize(pMB->amv.x, dec->quarterpel, fcode);
+		pMB->amv.y = gmc_sanitize(pMB->amv.y, dec->quarterpel, fcode);
+	}
+	pMB->mvs[0] = pMB->mvs[1] = pMB->mvs[2] = pMB->mvs[3] = pMB->amv;
+	
+/*
+	transfer16x16_copy(pY_Cur, dec->gmc.y + (y_pos << 4)*stride + (x_pos  << 4), stride);
+	transfer8x8_copy(pU_Cur, dec->gmc.u + (y_pos << 3)*stride2 + (x_pos  << 3), stride2);
+	transfer8x8_copy(pV_Cur, dec->gmc.v + (y_pos << 3)*stride2 + (x_pos << 3), stride2);
+*/
+
+	/* NOTE(review): the GMC step above is closed with the transfer timer, presumably left over from the copies -- confirm */
+	stop_transfer_timer();
+	
+	if (!cbp) return;
+
+	for (i = 0; i < 6; i++) {
+		int direction = dec->alternate_vertical_scan ? 2 : 0;
+		/* bit (5 - i) of cbp marks block i as coded */
+		if (cbp & (1 << (5 - i)))	/* coded */
+		{
+			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	/* clear */
+
+			start_timer();
+			get_inter_block(bs, &block[i * 64], direction);
+			stop_coding_timer();
+
+			start_timer();
+			if (dec->quant_type == 0) {
+				dequant_inter(&data[i * 64], &block[i * 64], iQuant);
+			} else {
+				dequant4_inter(&data[i * 64], &block[i * 64], iQuant);
+			}
+			stop_iquant_timer();
+
+			start_timer();
+			idct(&data[i * 64]);
+			stop_idct_timer();
+		}
+	}
+
+/* interlace + GMC is this possible ??? */
+/*
+  if (dec->interlacing && pMB->field_dct) {
+	  next_block = stride;
+	  stride *= 2;
+  }
+*/
+	start_timer();
+	if (cbp & 32)
+		transfer_16to8add(pY_Cur, &data[0 * 64], stride);
+	if (cbp & 16)
+		transfer_16to8add(pY_Cur + 8, &data[1 * 64], stride);
+	if (cbp & 8)
+		transfer_16to8add(pY_Cur + next_block, &data[2 * 64], stride);
+	if (cbp & 4)
+		transfer_16to8add(pY_Cur + 8 + next_block, &data[3 * 64], stride);
+	if (cbp & 2)
+		transfer_16to8add(pU_Cur, &data[4 * 64], stride2);
+	if (cbp & 1)
+		transfer_16to8add(pV_Cur, &data[5 * 64], stride2);
+	stop_transfer_timer();
+}
+
 
 void
 decoder_iframe(DECODER * dec,
@@ -716,7 +800,7 @@
 	mv.x = get_mv(bs, fcode);
 	mv.y = get_mv(bs, fcode);
 
-	DPRINTF(DPRINTF_MV,"mv_diff (%i,%i) pred (%i,%i)", mv.x, mv.y, pmv.x, pmv.y);
+	DPRINTF(DPRINTF_MV,"mv_diff (%i,%i) pred (%i,%i) result (%i,%i)", mv.x, mv.y, pmv.x, pmv.y, mv.x+pmv.x, mv.y+pmv.y);
 
 	mv.x += pmv.x;
 	mv.y += pmv.y;
@@ -739,21 +823,9 @@
 
 
 
-static __inline int gmc_sanitize(int value, int quarterpel, int fcode)
-{
-	int length = 1 << (fcode+4);
 
-	if (quarterpel) value *= 2;
 
-	if (value < -length) 
-		return -length;
-	else if (value >= length) 
-		return length-1;
-	else return value;
-}
-
-
-/* for P_VOP set gmc_mv to NULL */
+/* for P_VOP set gmc_warp to NULL */
 void
 decoder_pframe(DECODER * dec,
 			   Bitstream * bs,
@@ -762,7 +834,7 @@
 			   int quant,
 			   int fcode,
 			   int intra_dc_threshold,
-			   VECTOR * gmc_mv)
+			   const WARPPOINTS *const gmc_warp)
 {
 
 	uint32_t x, y;
@@ -782,6 +854,31 @@
 				   dec->width, dec->height);
 	stop_edges_timer();
 
+	if (gmc_warp)
+	{	
+
+		/* accuracy:  0==1/2, 1=1/4, 2=1/8, 3=1/16 */
+		if ( (dec->sprite_warping_accuracy != 3) || (dec->sprite_warping_points != 2) )
+		{	
+			fprintf(stderr,"Wrong GMC parameters acc=%d(-> 1/%d), %d!!!\n",
+				dec->sprite_warping_accuracy,(2<<dec->sprite_warping_accuracy),
+				dec->sprite_warping_points);
+		}
+		
+		generate_GMCparameters(	dec->sprite_warping_points, 
+				(2 << dec->sprite_warping_accuracy), gmc_warp, 
+				dec->width, dec->height, &dec->gmc_data);
+
+/* image warping is done block-based  in decoder_mbgmc(), now */	
+/*
+	generate_GMCimage(&dec->gmc_data, &dec->refn[0], 
+					mb_width, mb_height, 
+					dec->edged_width, dec->edged_width/2,
+					fcode, dec->quarterpel, 0, 
+					rounding, dec->mbs, &dec->gmc);
+*/
+	}
+
 	bound = 0;
 
 	for (y = 0; y < mb_height; y++) {
@@ -789,7 +886,7 @@
 		for (x = 0; x < mb_width; x++) {
 			MACROBLOCK *mb;
 
-			// skip stuffing
+			/* skip stuffing */
 			while (BitstreamShowBits(bs, 10) == 1)
 				BitstreamSkip(bs, 10);
 
@@ -804,8 +901,8 @@
 
 			DPRINTF(DPRINTF_MB, "macroblock (%i,%i) %08x", x, y, BitstreamShowBits(bs, 32));
 
-			//if (!(dec->mb_skip[y*dec->mb_width + x]=BitstreamGetBit(bs)))         // not_coded
-			if (!(BitstreamGetBit(bs)))	// not_coded
+			/* if (!(dec->mb_skip[y*dec->mb_width + x]=BitstreamGetBit(bs))) */ /* not_coded */
+			if (!(BitstreamGetBit(bs)))	/* block _is_ coded */
 			{
 				uint32_t mcbpc;
 				uint32_t cbpc;
@@ -813,7 +910,7 @@
 				uint32_t cbpy;
 				uint32_t cbp;
 				uint32_t intra;
-				int mcsel = 0;		// mcsel: '0'=local motion, '1'=GMC
+				int mcsel = 0;		/* mcsel: '0'=local motion, '1'=GMC */
 
 				cp_mb++;
 				mcbpc = get_mcbpc_inter(bs);
@@ -830,13 +927,13 @@
 					acpred_flag = BitstreamGetBit(bs);
 				}
 
-				if (gmc_mv && (mb->mode == MODE_INTER || mb->mode == MODE_INTER_Q))
+				if (gmc_warp && (mb->mode == MODE_INTER || mb->mode == MODE_INTER_Q))
 				{
 					mcsel = BitstreamGetBit(bs);
 				}
 
 				cbpy = get_cbpy(bs, intra);
-				DPRINTF(DPRINTF_MB, "cbpy %i", cbpy);
+				DPRINTF(DPRINTF_MB, "cbpy %i  mcsel %i ", cbpy,mcsel);
 
 				cbp = (cbpy << 2) | cbpc;
 
@@ -871,15 +968,15 @@
 						}
 					}
 				}
+				
+				if (mcsel) {
+					decoder_mbgmc(dec, mb, x, y, fcode, cbp, bs, quant,
+								rounding, reduced_resolution);
+					continue;
 
-				if (mb->mode == MODE_INTER || mb->mode == MODE_INTER_Q) {
-
-					if (mcsel)
-					{
-						mb->mvs[0].x = mb->mvs[1].x = mb->mvs[2].x = mb->mvs[3].x = gmc_sanitize(gmc_mv[0].x, dec->quarterpel, fcode);
-						mb->mvs[0].y = mb->mvs[1].y = mb->mvs[2].y = mb->mvs[3].y = gmc_sanitize(gmc_mv[0].y, dec->quarterpel, fcode);
+				} else if (mb->mode == MODE_INTER || mb->mode == MODE_INTER_Q) {
 
-					} else if (dec->interlacing && mb->field_pred) {
+					if (dec->interlacing && mb->field_pred) {
 						get_motion_vector(dec, bs, x, y, 0, &mb->mvs[0],
 										  fcode, bound);
 						get_motion_vector(dec, bs, x, y, 0, &mb->mvs[1],
@@ -887,10 +984,7 @@
 					} else {
 						get_motion_vector(dec, bs, x, y, 0, &mb->mvs[0],
 										  fcode, bound);
-						mb->mvs[1].x = mb->mvs[2].x = mb->mvs[3].x =
-							mb->mvs[0].x;
-						mb->mvs[1].y = mb->mvs[2].y = mb->mvs[3].y =
-							mb->mvs[0].y;
+						mb->mvs[1] = mb->mvs[2] = mb->mvs[3] = mb->mvs[0];
 					}
 				} else if (mb->mode == MODE_INTER4V ) {
 
@@ -898,7 +992,7 @@
 					get_motion_vector(dec, bs, x, y, 1, &mb->mvs[1], fcode, bound);
 					get_motion_vector(dec, bs, x, y, 2, &mb->mvs[2], fcode, bound);
 					get_motion_vector(dec, bs, x, y, 3, &mb->mvs[3], fcode, bound);
-				} else			// MODE_INTRA, MODE_INTRA_Q
+				} else			/* MODE_INTRA, MODE_INTRA_Q */
 				{
 					mb->mvs[0].x = mb->mvs[1].x = mb->mvs[2].x = mb->mvs[3].x =
 						0;
@@ -909,16 +1003,26 @@
 					continue;
 				}
 
-				decoder_mbinter(dec, mb, x, y, acpred_flag, cbp, bs, quant,
+				decoder_mbinter(dec, mb, x, y, fcode, cbp, bs, quant,
 								rounding, reduced_resolution);
 
-			} 
-			else if (gmc_mv)	/* not coded S_VOP macroblock */
+			}
+			else if (gmc_warp)	/* a not coded S(GMC)-VOP macroblock */
 			{
 				mb->mode = MODE_NOT_CODED_GMC;
-				mb->mvs[0].x = mb->mvs[1].x = mb->mvs[2].x = mb->mvs[3].x = gmc_sanitize(gmc_mv[0].x, dec->quarterpel, fcode);
-				mb->mvs[0].y = mb->mvs[1].y = mb->mvs[2].y = mb->mvs[3].y = gmc_sanitize(gmc_mv[0].y, dec->quarterpel, fcode);
-				decoder_mbinter(dec, mb, x, y, 0, 0, bs, quant, rounding, reduced_resolution);
+
+				start_timer();
+
+				decoder_mbgmc(dec, mb, x, y, fcode, 0x00, bs, quant,
+								rounding, reduced_resolution);
+
+				stop_transfer_timer();
+
+				if(dec->out_frm && cp_mb > 0) {
+				  output_slice(&dec->cur, dec->edged_width,dec->width,dec->out_frm,st_mb,y,cp_mb);
+				  cp_mb = 0;
+				}
+				st_mb = x+1;
 			}
 			else	/* not coded P_VOP macroblock */
 			{
@@ -926,7 +1030,7 @@
 
 				mb->mvs[0].x = mb->mvs[1].x = mb->mvs[2].x = mb->mvs[3].x = 0;
 				mb->mvs[0].y = mb->mvs[1].y = mb->mvs[2].y = mb->mvs[3].y = 0;
-				// copy macroblock directly from ref to cur
+				/* copy macroblock directly from ref to cur */
 
 				start_timer();
 
@@ -974,8 +1078,7 @@
 }
 
 
-// add by MinChen <chenm001@163.com>
-// decode B-frame motion vector
+/* decode B-frame motion vector */
 void
 get_b_motion_vector(DECODER * dec,
 					Bitstream * bs,
@@ -1019,8 +1122,7 @@
 }
 
 
-// add by MinChen <chenm001@163.com>
-// decode an B-frame forward & backward inter macroblock
+/* decode an B-frame forward & backward inter macroblock */
 void
 decoder_bf_mbinter(DECODER * dec,
 				   const MACROBLOCK * pMB,
@@ -1104,9 +1206,9 @@
 	for (i = 0; i < 6; i++) {
 		int direction = dec->alternate_vertical_scan ? 2 : 0;
 
-		if (cbp & (1 << (5 - i)))	// coded
+		if (cbp & (1 << (5 - i)))	/* coded */
 		{
-			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	// clear
+			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	/* clear */
 
 			start_timer();
 			get_inter_block(bs, &block[i * 64], direction);
@@ -1147,8 +1249,7 @@
 	stop_transfer_timer();
 }
 
-// add by MinChen <chenm001@163.com>
-// decode an B-frame direct &  inter macroblock
+/* decode an B-frame direct &  inter macroblock */
 void
 decoder_bf_interpolate_mbinter(DECODER * dec,
 							   IMAGE forward,
@@ -1345,9 +1446,9 @@
 	for (i = 0; i < 6; i++) {
 		int direction = dec->alternate_vertical_scan ? 2 : 0;
 
-		if (cbp & (1 << (5 - i)))	// coded
+		if (cbp & (1 << (5 - i)))	/* coded */
 		{
-			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	// clear
+			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	/* clear */
 
 			start_timer();
 			get_inter_block(bs, &block[i * 64], direction);
@@ -1389,26 +1490,26 @@
 }
 
 
-// add by MinChen <chenm001@163.com>
-// for decode B-frame dbquant
+/* decode the B-frame dbquant code: '0' -> 0, '10' -> -2, '11' -> 2 */
 int32_t __inline
 get_dbquant(Bitstream * bs)
 {
-	if (!BitstreamGetBit(bs))	// '0'
+	if (!BitstreamGetBit(bs))      /*  '0' */
 		return (0);
-	else if (!BitstreamGetBit(bs))	// '10'
+	else if (!BitstreamGetBit(bs)) /* '10' */
 		return (-2);
-	else
-		return (2);				// '11'
+	else                           /* '11' */
+		return (2);
 }
 
-// add by MinChen <chenm001@163.com>
-// for decode B-frame mb_type
-// bit   ret_value
-// 1        0
-// 01       1
-// 001      2
-// 0001     3
+/*
+ * For decode B-frame mb_type
+ * bit   ret_value
+ * 1        0
+ * 01       1
+ * 001      2
+ * 0001     3
+ */
 int32_t __inline
 get_mbtype(Bitstream * bs)
 {
@@ -1457,7 +1558,7 @@
 #endif
 
 	for (y = 0; y < dec->mb_height; y++) {
-		// Initialize Pred Motion Vector
+		/* Initialize Pred Motion Vector */
 		dec->p_fmv = dec->p_bmv = zeromv;
 		for (x = 0; x < dec->mb_width; x++) {
 			MACROBLOCK *mb = &dec->mbs[y * dec->mb_width + x];
@@ -1467,11 +1568,14 @@
 			mb->b_mvs[0] = mb->b_mvs[1] = mb->b_mvs[2] = mb->b_mvs[3] =
 			mb->mvs[0] = mb->mvs[1] = mb->mvs[2] = mb->mvs[3] = zeromv;
 
-			// skip if the co-located P_VOP macroblock is not coded 
-			// note: gmc+not_coded isn't skipped
+			/*
+			 * Skip if the co-located P_VOP macroblock is not coded.
+			 * A not-coded macroblock in a co-located S_VOP is _not_
+			 * automatically skipped.
+			 */
 
 			if (last_mb->mode == MODE_NOT_CODED) {
-				//DEBUG2("Skip MB in B-frame at (X,Y)=!",x,y);
+				/* DEBUG2("Skip MB in B-frame at (X,Y)=!",x,y); */
 				mb->cbp = 0;
 #ifdef BFRAMES_DEC_DEBUG
 				mb->mb_type = MODE_NOT_CODED;
@@ -1479,19 +1583,21 @@
 #endif
 				mb->mb_type = MODE_FORWARD;
 				mb->quant = last_mb->quant;
-				//mb->mvs[1].x = mb->mvs[2].x = mb->mvs[3].x = mb->mvs[0].x;
-				//mb->mvs[1].y = mb->mvs[2].y = mb->mvs[3].y = mb->mvs[0].y;
+				/*
+				  mb->mvs[1].x = mb->mvs[2].x = mb->mvs[3].x = mb->mvs[0].x;
+				  mb->mvs[1].y = mb->mvs[2].y = mb->mvs[3].y = mb->mvs[0].y;
+				*/
 
 				decoder_bf_mbinter(dec, mb, x, y, mb->cbp, bs, mb->quant, 1);
 				continue;
 			}
 
-			if (!BitstreamGetBit(bs)) {	// modb=='0'
+			if (!BitstreamGetBit(bs)) {	/* modb=='0' */
 				const uint8_t modb2 = BitstreamGetBit(bs);
 
 				mb->mb_type = get_mbtype(bs);
 
-				if (!modb2) {	// modb=='00'
+				if (!modb2) {	/* modb=='00' */
 					mb->cbp = BitstreamGetBits(bs, 6);
 				} else {
 					mb->cbp = 0;
@@ -1512,7 +1618,7 @@
 
 			mb->quant = quant;
 			mb->mode = MODE_INTER4V;
-			//DEBUG1("Switch bm_type=",mb->mb_type);
+			/* DEBUG1("Switch bm_type=",mb->mb_type); */
 
 #ifdef BFRAMES_DEC_DEBUG
 	BFRAME_DEBUG
@@ -1541,7 +1647,7 @@
 										  / TRD
 									    : mb->mvs[i].y - last_mb->mvs[i].y);
 					}
-					//DEBUG("B-frame Direct!\n");
+					/* DEBUG("B-frame Direct!\n"); */
 				}
 				decoder_bf_interpolate_mbinter(dec, dec->refn[1], dec->refn[0],
 											   mb, x, y, bs);
@@ -1559,7 +1665,7 @@
 
 				decoder_bf_interpolate_mbinter(dec, dec->refn[1], dec->refn[0],
 											   mb, x, y, bs);
-				//DEBUG("B-frame Bidir!\n");
+				/* DEBUG("B-frame Bidir!\n"); */
 				break;
 
 			case MODE_BACKWARD:
@@ -1569,7 +1675,7 @@
 
 				mb->mode = MODE_INTER;
 				decoder_bf_mbinter(dec, mb, x, y, mb->cbp, bs, quant, 0);
-				//DEBUG("B-frame Backward!\n");
+				/* DEBUG("B-frame Backward!\n"); */
 				break;
 
 			case MODE_FORWARD:
@@ -1579,15 +1685,15 @@
 
 				mb->mode = MODE_INTER;
 				decoder_bf_mbinter(dec, mb, x, y, mb->cbp, bs, quant, 1);
-				//DEBUG("B-frame Forward!\n");
+				/* DEBUG("B-frame Forward!\n"); */
 				break;
 
 			default:
 				DPRINTF(DPRINTF_ERROR,"Not support B-frame mb_type = %i", mb->mb_type);
 			}
-
-		}						// end of FOR
+		} /* End of for */
 	}
+
 #ifdef BFRAMES_DEC_DEBUG
 	if (!first){
 		first=1;
@@ -1597,7 +1703,7 @@
 #endif
 }
 
-// swap two MACROBLOCK array
+/* swap two MACROBLOCK array */
 void
 mb_swap(MACROBLOCK ** mb1,
 		MACROBLOCK ** mb2)
@@ -1618,9 +1724,9 @@
 	{
 		/* note: image is stored to tmp */
 		image_copy(&dec->tmp, img, dec->edged_width, dec->height);
-		image_deblock_rrv(&dec->tmp, dec->edged_width, 
-						mbs, dec->mb_width, dec->mb_height, dec->mb_width,
-						8, frame->general);
+		image_deblock(&dec->tmp, dec->edged_width, 
+					  mbs, dec->mb_width, dec->mb_height, dec->mb_width,
+					  frame->general);
 		img = &dec->tmp;
 	}
 
@@ -1642,8 +1748,8 @@
 	uint32_t fcode_forward;
 	uint32_t fcode_backward;
 	uint32_t intra_dc_threshold;
-	VECTOR gmc_mv[5];
-	uint32_t vop_type;
+	WARPPOINTS gmc_warp;
+	int vop_type;
 	int success = 0;
 	int output = 0;
 	int seen_something = 0;
@@ -1671,7 +1777,7 @@
 		{
 			stats->notify = output ? XVID_DEC_VOP : XVID_DEC_NOTHING;
 			stats->data.vop.time_base = (int)dec->time_base;
-			stats->data.vop.time_increment = 0;	//XXX: todo
+			stats->data.vop.time_increment = 0;	/* XXX: todo */
 		}
 
 		emms();
@@ -1682,7 +1788,7 @@
 
 	BitstreamInit(&bs, frame->bitstream, frame->length);
 
-	// XXX: 0x7f is only valid whilst decoding vfw xvid/divx5 avi's
+	/* XXX: 0x7f is only valid whilst decoding vfw xvid/divx5 avi's */
 	if(dec->low_delay_default && frame->length == 1 && BitstreamShowBits(&bs, 8) == 0x7f)
 	{
 		if (stats)
@@ -1697,12 +1803,12 @@
 repeat:
 
 	vop_type =	BitstreamReadHeaders(&bs, dec, &rounding, &reduced_resolution, 
-			&quant, &fcode_forward, &fcode_backward, &intra_dc_threshold, gmc_mv);
+			&quant, &fcode_forward, &fcode_backward, &intra_dc_threshold, &gmc_warp);
 
-	DPRINTF(DPRINTF_HEADER, "vop_type=%i,  packed=%i,  time=%i,  time_pp=%i,  time_bp=%i", 
+	DPRINTF(DPRINTF_HEADER, "vop_type=%i,  packed=%i,  time=%lli,  time_pp=%i,  time_bp=%i", 
 							vop_type,	dec->packed_mode, dec->time, dec->time_pp, dec->time_bp);
 
-	if (vop_type == - 1)
+	if (vop_type == -1)
 	{
 		if (success) goto done;
 		emms();
@@ -1732,7 +1838,7 @@
 		goto repeat;
 	} 
 
-	dec->p_bmv.x = dec->p_bmv.y = dec->p_fmv.y = dec->p_fmv.y = 0;	// init pred vector to 0
+	dec->p_bmv.x = dec->p_bmv.y = dec->p_fmv.x = dec->p_fmv.y = 0;	/* init both pred vectors (x and y) to 0 */
 
 
 	/* packed_mode: special-N_VOP treament */
@@ -1758,7 +1864,7 @@
 			break;
 		case S_VOP :
 			decoder_pframe(dec, &bs, rounding, reduced_resolution, quant, 
-						fcode_forward, intra_dc_threshold, gmc_mv);
+						fcode_forward, intra_dc_threshold, &gmc_warp);
 			break;
 		case N_VOP :
 			image_copy(&dec->cur, &dec->refn[0], dec->edged_width, dec->height);
@@ -1862,7 +1968,7 @@
 	{
 		stats->notify = output ? XVID_DEC_VOP : XVID_DEC_NOTHING;
 		stats->data.vop.time_base = (int)dec->time_base;
-		stats->data.vop.time_increment = 0;	//XXX: todo
+		stats->data.vop.time_increment = 0;	/* XXX: todo */
 	}
 	
 	emms();