--- mbtransquant.c 2003/10/01 23:23:01 1.21.2.17 +++ mbtransquant.c 2004/12/19 12:49:05 1.27 @@ -21,7 +21,7 @@ * along with this program ; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * $Id: mbtransquant.c,v 1.21.2.17 2003/10/01 23:23:01 edgomez Exp $ + * $Id: mbtransquant.c,v 1.27 2004/12/19 12:49:05 edgomez Exp $ * ****************************************************************************/ @@ -39,11 +39,10 @@ #include "../bitstream/zigzag.h" #include "../dct/fdct.h" #include "../dct/idct.h" -#include "../quant/quant_mpeg4.h" -#include "../quant/quant_h263.h" +#include "../quant/quant.h" #include "../encoder.h" -#include "../image/reduced.h" +#include "../quant/quant_matrix.h" MBFIELDTEST_PTR MBFieldTest; @@ -126,10 +125,10 @@ int mpeg; int scaler_lum, scaler_chr; - quanth263_intraFuncPtr const quant[2] = + quant_intraFuncPtr const quant[2] = { - (quanth263_intraFuncPtr)quant_intra, - (quanth263_intraFuncPtr)quant4_intra + quant_h263_intra, + quant_mpeg_intra }; mpeg = !!(pParam->vol_flags & XVID_VOL_MPEGQUANT); @@ -138,12 +137,12 @@ /* Quantize the block */ start_timer(); - quant[mpeg](&data[0 * 64], &qcoeff[0 * 64], pMB->quant, scaler_lum); - quant[mpeg](&data[1 * 64], &qcoeff[1 * 64], pMB->quant, scaler_lum); - quant[mpeg](&data[2 * 64], &qcoeff[2 * 64], pMB->quant, scaler_lum); - quant[mpeg](&data[3 * 64], &qcoeff[3 * 64], pMB->quant, scaler_lum); - quant[mpeg](&data[4 * 64], &qcoeff[4 * 64], pMB->quant, scaler_chr); - quant[mpeg](&data[5 * 64], &qcoeff[5 * 64], pMB->quant, scaler_chr); + quant[mpeg](&data[0 * 64], &qcoeff[0 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices); + quant[mpeg](&data[1 * 64], &qcoeff[1 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices); + quant[mpeg](&data[2 * 64], &qcoeff[2 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices); + quant[mpeg](&data[3 * 64], &qcoeff[3 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices); + quant[mpeg](&data[4 * 64], &qcoeff[4 * 64], pMB->quant, scaler_chr, pParam->mpeg_quant_matrices); + quant[mpeg](&data[5 * 64], &qcoeff[5 * 64], pMB->quant, scaler_chr, pParam->mpeg_quant_matrices); stop_quant_timer(); } @@ -157,10 +156,10 @@ int mpeg; int scaler_lum, scaler_chr; - quanth263_intraFuncPtr const dequant[2] = + quant_intraFuncPtr const dequant[2] = { - (quanth263_intraFuncPtr)dequant_intra, - (quanth263_intraFuncPtr)dequant4_intra + dequant_h263_intra, + dequant_mpeg_intra }; mpeg = !!(pParam->vol_flags & XVID_VOL_MPEGQUANT); @@ -168,35 +167,23 @@ scaler_chr = get_dc_scaler(iQuant, 0); start_timer(); - dequant[mpeg](&qcoeff[0 * 64], &data[0 * 64], iQuant, scaler_lum); - dequant[mpeg](&qcoeff[1 * 64], &data[1 * 64], iQuant, scaler_lum); - dequant[mpeg](&qcoeff[2 * 64], &data[2 * 64], iQuant, scaler_lum); - dequant[mpeg](&qcoeff[3 * 64], &data[3 * 64], iQuant, scaler_lum); - dequant[mpeg](&qcoeff[4 * 64], &data[4 * 64], iQuant, scaler_chr); - dequant[mpeg](&qcoeff[5 * 64], &data[5 * 64], iQuant, scaler_chr); + dequant[mpeg](&qcoeff[0 * 64], &data[0 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices); + dequant[mpeg](&qcoeff[1 * 64], &data[1 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices); + dequant[mpeg](&qcoeff[2 * 64], &data[2 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices); + dequant[mpeg](&qcoeff[3 * 64], &data[3 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices); + dequant[mpeg](&qcoeff[4 * 64], &data[4 * 64], iQuant, scaler_chr, pParam->mpeg_quant_matrices); + dequant[mpeg](&qcoeff[5 * 64], &data[5 * 64], iQuant, scaler_chr, pParam->mpeg_quant_matrices); stop_iquant_timer(); } - -typedef int (*trellis_func_ptr_t)(int16_t *const Out, - const int16_t *const In, - int Q, - const uint16_t * const Zigzag, - int Non_Zero); - -static int -dct_quantize_trellis_h263_c(int16_t *const Out, - const int16_t *const In, - int Q, - const uint16_t * const Zigzag, - int Non_Zero); - static int -dct_quantize_trellis_mpeg_c(int16_t *const Out, - const int16_t *const In, - int Q, - const uint16_t * const Zigzag, - int Non_Zero); +dct_quantize_trellis_c(int16_t *const Out, + const int16_t *const In, + int Q, + const uint16_t * const Zigzag, + const uint16_t * const QuantMatrix, + int Non_Zero, + int Sum); /* Quantize all blocks -- Inter mode */ static __inline uint8_t @@ -214,16 +201,10 @@ int sum; int code_block, mpeg; - quanth263_interFuncPtr const quant[2] = + quant_interFuncPtr const quant[2] = { - (quanth263_interFuncPtr)quant_inter, - (quanth263_interFuncPtr)quant4_inter - }; - - trellis_func_ptr_t const trellis[2] = - { - (trellis_func_ptr_t)dct_quantize_trellis_h263_c, - (trellis_func_ptr_t)dct_quantize_trellis_mpeg_c + quant_h263_inter, + quant_mpeg_inter }; mpeg = !!(pParam->vol_flags & XVID_VOL_MPEGQUANT); @@ -233,10 +214,28 @@ /* Quantize the block */ start_timer(); - sum = quant[mpeg](&qcoeff[i*64], &data[i*64], pMB->quant); + sum = quant[mpeg](&qcoeff[i*64], &data[i*64], pMB->quant, pParam->mpeg_quant_matrices); - if(sum && (frame->vop_flags & XVID_VOP_TRELLISQUANT)) { - sum = trellis[mpeg](&qcoeff[i*64], &data[i*64], pMB->quant, &scan_tables[0][0], 63); + if(sum && (pMB->quant > 2) && (frame->vop_flags & XVID_VOP_TRELLISQUANT)) { + const uint16_t *matrix; + const static uint16_t h263matrix[] = + { + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16 + }; + + matrix = (mpeg)?get_inter_matrix(pParam->mpeg_quant_matrices):h263matrix; + sum = dct_quantize_trellis_c(&qcoeff[i*64], &data[i*64], + pMB->quant, &scan_tables[0][0], + matrix, + 63, + sum); } stop_quant_timer(); @@ -277,21 +276,21 @@ { int mpeg; - quanth263_interFuncPtr const dequant[2] = + quant_interFuncPtr const dequant[2] = { - (quanth263_interFuncPtr)dequant_inter, - (quanth263_interFuncPtr)dequant4_inter + dequant_h263_inter, + dequant_mpeg_inter }; mpeg = !!(pParam->vol_flags & XVID_VOL_MPEGQUANT); start_timer(); - if(cbp & (1 << (5 - 0))) dequant[mpeg](&data[0 * 64], &qcoeff[0 * 64], iQuant); - if(cbp & (1 << (5 - 1))) dequant[mpeg](&data[1 * 64], &qcoeff[1 * 64], iQuant); - if(cbp & (1 << (5 - 2))) dequant[mpeg](&data[2 * 64], &qcoeff[2 * 64], iQuant); - if(cbp & (1 << (5 - 3))) dequant[mpeg](&data[3 * 64], &qcoeff[3 * 64], iQuant); - if(cbp & (1 << (5 - 4))) dequant[mpeg](&data[4 * 64], &qcoeff[4 * 64], iQuant); - if(cbp & (1 << (5 - 5))) dequant[mpeg](&data[5 * 64], &qcoeff[5 * 64], iQuant); + if(cbp & (1 << (5 - 0))) dequant[mpeg](&data[0 * 64], &qcoeff[0 * 64], iQuant, pParam->mpeg_quant_matrices); + if(cbp & (1 << (5 - 1))) dequant[mpeg](&data[1 * 64], &qcoeff[1 * 64], iQuant, pParam->mpeg_quant_matrices); + if(cbp & (1 << (5 - 2))) dequant[mpeg](&data[2 * 64], &qcoeff[2 * 64], iQuant, pParam->mpeg_quant_matrices); + if(cbp & (1 << (5 - 3))) dequant[mpeg](&data[3 * 64], &qcoeff[3 * 64], iQuant, pParam->mpeg_quant_matrices); + if(cbp & (1 << (5 - 4))) dequant[mpeg](&data[4 * 64], &qcoeff[4 * 64], iQuant, pParam->mpeg_quant_matrices); + if(cbp & (1 << (5 - 5))) dequant[mpeg](&data[5 * 64], &qcoeff[5 * 64], iQuant, pParam->mpeg_quant_matrices); stop_iquant_timer(); } @@ -310,38 +309,22 @@ uint32_t stride = pParam->edged_width; uint32_t stride2 = stride / 2; uint32_t next_block = stride * 8; - int32_t cst; - int vop_reduced; uint8_t *pY_Cur, *pU_Cur, *pV_Cur; const IMAGE * const pCurrent = &frame->image; - transfer_operation_8to16_t * const functions[2] = - { - (transfer_operation_8to16_t *)transfer_8to16copy, - (transfer_operation_8to16_t *)filter_18x18_to_8x8 - }; - transfer_operation_8to16_t *transfer_op = NULL; - - vop_reduced = !!(frame->vop_flags & XVID_VOP_REDUCED); /* Image pointers */ - pY_Cur = pCurrent->y + (y_pos << (4+vop_reduced)) * stride + (x_pos << (4+vop_reduced)); - pU_Cur = pCurrent->u + (y_pos << (3+vop_reduced)) * stride2 + (x_pos << (3+vop_reduced)); - pV_Cur = pCurrent->v + (y_pos << (3+vop_reduced)) * stride2 + (x_pos << (3+vop_reduced)); - - /* Block size */ - cst = 8<y + (y_pos << 4) * stride + (x_pos << 4); + pU_Cur = pCurrent->u + (y_pos << 3) * stride2 + (x_pos << 3); + pV_Cur = pCurrent->v + (y_pos << 3) * stride2 + (x_pos << 3); /* Do the transfer */ start_timer(); - transfer_op(&data[0 * 64], pY_Cur, stride); - transfer_op(&data[1 * 64], pY_Cur + cst, stride); - transfer_op(&data[2 * 64], pY_Cur + next_block, stride); - transfer_op(&data[3 * 64], pY_Cur + next_block + cst, stride); - transfer_op(&data[4 * 64], pU_Cur, stride2); - transfer_op(&data[5 * 64], pV_Cur, stride2); + transfer_8to16copy(&data[0 * 64], pY_Cur, stride); + transfer_8to16copy(&data[1 * 64], pY_Cur + 8, stride); + transfer_8to16copy(&data[2 * 64], pY_Cur + next_block, stride); + transfer_8to16copy(&data[3 * 64], pY_Cur + next_block + 8, stride); + transfer_8to16copy(&data[4 * 64], pU_Cur, stride2); + transfer_8to16copy(&data[5 * 64], pV_Cur, stride2); stop_transfer_timer(); } @@ -359,47 +342,38 @@ uint32_t stride = pParam->edged_width; uint32_t stride2 = stride / 2; uint32_t next_block = stride * 8; - uint32_t cst; - int vop_reduced; const IMAGE * const pCurrent = &frame->image; - /* Array of function pointers, indexed by [vop_reduced<<1+add] */ - transfer_operation_16to8_t * const functions[4] = + + /* Array of function pointers, indexed by [add] */ + transfer_operation_16to8_t * const functions[2] = { (transfer_operation_16to8_t*)transfer_16to8copy, (transfer_operation_16to8_t*)transfer_16to8add, - (transfer_operation_16to8_t*)copy_upsampled_8x8_16to8, - (transfer_operation_16to8_t*)add_upsampled_8x8_16to8 }; transfer_operation_16to8_t *transfer_op = NULL; + /* Image pointers */ + pY_Cur = pCurrent->y + (y_pos << 4) * stride + (x_pos << 4); + pU_Cur = pCurrent->u + (y_pos << 3) * stride2 + (x_pos << 3); + pV_Cur = pCurrent->v + (y_pos << 3) * stride2 + (x_pos << 3); + if (pMB->field_dct) { next_block = stride; stride *= 2; } - /* Makes this vars booleans */ - vop_reduced = !!(frame->vop_flags & XVID_VOP_REDUCED); - - /* Image pointers */ - pY_Cur = pCurrent->y + (y_pos << (4+vop_reduced)) * stride + (x_pos << (4+vop_reduced)); - pU_Cur = pCurrent->u + (y_pos << (3+vop_reduced)) * stride2 + (x_pos << (3+vop_reduced)); - pV_Cur = pCurrent->v + (y_pos << (3+vop_reduced)) * stride2 + (x_pos << (3+vop_reduced)); - - /* Block size */ - cst = 8<plugin_flags & XVID_REQORIGINAL)) { @@ -637,10 +607,6 @@ MOVLINE(LINE(3, 3), tmp); } - - - - /***************************************************************************** * Trellis based R-D optimal quantization * @@ -648,19 +614,6 @@ * ****************************************************************************/ - -#if 0 -static int -dct_quantize_trellis_mpeg_c(int16_t *const Out, - const int16_t *const In, - int Q, - const uint16_t * const Zigzag, - int Non_Zero) -{ - return 63; -} -#endif - /*---------------------------------------------------------------------------- * * Trellis-Based quantization @@ -672,11 +625,11 @@ * IEEE Transactions on Image Processing, Vol.9, No.8, Aug. 2000. * * we are at stake with a simplified Bellmand-Ford / Dijkstra Single - * Source Shorted Path algo. But due to the underlying graph structure + * Source Shortest Path algo. But due to the underlying graph structure * ("Trellis"), it can be turned into a dynamic programming algo, * partially saving the explicit graph's nodes representation. And * without using a heap, since the open frontier of the DAG is always - * known, and of fixed sized. + * known, and of fixed size. *--------------------------------------------------------------------------*/ @@ -776,7 +729,14 @@ Code_Len24,Code_Len23,Code_Len22,Code_Len21, Code_Len3, Code_Len1, }; -#define TL(q) 0xfe00/(q*q) +/* TL_SHIFT controls the precision of the RD optimizations in trellis + * valid range is [10..16]. The bigger, the more trellis is vulnerable + * to overflows in cost formulas. + * - 10 allows ac values up to 2^11 == 2048 + * - 16 allows ac values up to 2^8 == 256 + */ +#define TL_SHIFT 11 +#define TL(q) ((0xfe00>>(16-TL_SHIFT))/(q*q)) static const int Trellis_Lambda_Tabs[31] = { TL( 1),TL( 2),TL( 3),TL( 4),TL( 5),TL( 6), TL( 7), @@ -796,55 +756,55 @@ return -1; } -static int __inline -Compute_Sum(const int16_t *C, int last) -{ - int sum = 0; - - while(last--) - sum += abs(C[last]); - - return(sum); -} /* this routine has been strippen of all debug code */ - static int -dct_quantize_trellis_h263_c(int16_t *const Out, const int16_t *const In, int Q, const uint16_t * const Zigzag, int Non_Zero) +dct_quantize_trellis_c(int16_t *const Out, + const int16_t *const In, + int Q, + const uint16_t * const Zigzag, + const uint16_t * const QuantMatrix, + int Non_Zero, + int Sum) { - /* - * Note: We should search last non-zero coeffs on *real* DCT input coeffs (In[]), - * not quantized one (Out[]). However, it only improves the result *very* - * slightly (~0.01dB), whereas speed drops to crawling level :) - * Well, actually, taking 1 more coeff past Non_Zero into account sometimes helps. - */ + /* Note: We should search last non-zero coeffs on *real* DCT input coeffs + * (In[]), not quantized one (Out[]). However, it only improves the result + * *very* slightly (~0.01dB), whereas speed drops to crawling level :) + * Well, actually, taking 1 more coeff past Non_Zero into account sometimes + * helps. */ typedef struct { int16_t Run, Level; } NODE; NODE Nodes[65], Last; uint32_t Run_Costs0[64+1]; uint32_t * const Run_Costs = Run_Costs0 + 1; - const int Mult = 2*Q; - const int Bias = (Q-1) | 1; - const int Lev0 = Mult + Bias; - const int Lambda = Trellis_Lambda_Tabs[Q-1]; /* it's 1/lambda, actually */ + + /* it's 1/lambda, actually */ + const int Lambda = Trellis_Lambda_Tabs[Q-1]; int Run_Start = -1; - uint32_t Min_Cost = 2<<16; + uint32_t Min_Cost = 2<>4); + const int Mult = 2*q; + const int Bias = (q-1) | 1; + const int Lev0 = Mult + Bias; + const int AC = In[Zigzag[i]]; const int Level1 = Out[Zigzag[i]]; - const int Dist0 = Lambda* AC*AC; + const unsigned int Dist0 = Lambda* AC*AC; uint32_t Best_Cost = 0xf0000000; Last_Cost += Dist0; @@ -864,22 +824,20 @@ Cost0 = Lambda*dQ*dQ; Nodes[i].Run = 1; - Best_Cost = (Code_Len20[0]<<16) + Run_Costs[i-1]+Cost0; + Best_Cost = (Code_Len20[0]<0; --Run) { const uint32_t Cost_Base = Cost0 + Run_Costs[i-Run]; - const uint32_t Cost = Cost_Base + (Code_Len20[Run-1]<<16); - const uint32_t lCost = Cost_Base + (Code_Len24[Run-1]<<16); + const uint32_t Cost = Cost_Base + (Code_Len20[Run-1]< hifreq errors (HVS) -- gruel ) */ + /* Gruel: I'd say, favour short runs => hifreq errors (HVS) */ if (Cost(uint32_t)(Level1+25)) { + /* "big" levels (not less than ESC3, though) */ const uint8_t *Tbl_L1, *Tbl_L2, *Tbl_L1_Last, *Tbl_L2_Last; int Level2; int dQ1, dQ2; @@ -903,16 +862,16 @@ dQ1 = Level1*Mult-AC + Bias; dQ2 = dQ1 - Mult; Level2 = Level1-1; - Tbl_L1 = (Level1<=24) ? B16_17_Code_Len[Level1-1] : Code_Len0; - Tbl_L2 = (Level2<=24) ? B16_17_Code_Len[Level2-1] : Code_Len0; + Tbl_L1 = (Level1<=24) ? B16_17_Code_Len[Level1-1] : Code_Len0; + Tbl_L2 = (Level2<=24) ? B16_17_Code_Len[Level2-1] : Code_Len0; Tbl_L1_Last = (Level1<=6) ? B16_17_Code_Len_Last[Level1-1] : Code_Len0; Tbl_L2_Last = (Level2<=6) ? B16_17_Code_Len_Last[Level2-1] : Code_Len0; } else { /* Level1<-1 */ dQ1 = Level1*Mult-AC - Bias; dQ2 = dQ1 + Mult; Level2 = Level1 + 1; - Tbl_L1 = (Level1>=-24) ? B16_17_Code_Len[Level1^-1] : Code_Len0; - Tbl_L2 = (Level2>=-24) ? B16_17_Code_Len[Level2^-1] : Code_Len0; + Tbl_L1 = (Level1>=-24) ? B16_17_Code_Len[Level1^-1] : Code_Len0; + Tbl_L2 = (Level2>=-24) ? B16_17_Code_Len[Level2^-1] : Code_Len0; Tbl_L1_Last = (Level1>=- 6) ? B16_17_Code_Len_Last[Level1^-1] : Code_Len0; Tbl_L2_Last = (Level2>=- 6) ? B16_17_Code_Len_Last[Level2^-1] : Code_Len0; } @@ -927,14 +886,13 @@ uint32_t Cost1, Cost2; int bLevel; - /* - * for sub-optimal (but slightly worth it, speed-wise) search, uncomment the following: - * if (Cost_Base>=Best_Cost) continue; - * (? doesn't seem to have any effect -- gruel ) - */ + /* for sub-optimal (but slightly worth it, speed-wise) search, + * uncomment the following: + * if (Cost_Base>=Best_Cost) continue; + * (? doesn't seem to have any effect -- gruel ) */ - Cost1 = Cost_Base + (Tbl_L1[Run-1]<<16); - Cost2 = Cost_Base + (Tbl_L2[Run-1]<<16) + dDist21; + Cost1 = Cost_Base + (Tbl_L1[Run-1]< Simply pick best Run. */ + int Run; + for(Run=i-Run_Start; Run>0; --Run) { + /* 30 bits + no distortion */ + const uint32_t Cost = (30<Min_Cost+(1<<16) ) + /* as noticed by Michael Niedermayer (michaelni at gmx.at), + * there's a code shorter by 1 bit for a larger run (!), same + * level. We give it a chance by not moving the left barrier too + * much. */ + while( Run_Costs[Run_Start]>Min_Cost+(1<=0) { Out[Zigzag[i]] = Nodes[i].Level; - sum += abs(Nodes[i].Level); + Sum += abs(Nodes[i].Level); i -= Nodes[i].Run; } - return sum; -} - -static int -dct_quantize_trellis_mpeg_c(int16_t *const Out, const int16_t *const In, int Q, const uint16_t * const Zigzag, int Non_Zero) -{ - /* ToDo: Ok ok it's just a place holder for Gruel -- damn write this one :-) */ - return Compute_Sum(Out, 63); + return Sum; } /* original version including heavy debugging info */ @@ -1072,7 +1042,7 @@ V -= Ref[Zigzag[i]]; Dist += V*V; } - Cost = Lambda*Dist + (Bits<<16); + Cost = Lambda*Dist + (Bits<>12= %d ", Last,Max, Bits, Dist, Cost, Cost>>12 ); return Cost; @@ -1104,8 +1074,8 @@ const int Lambda = Trellis_Lambda_Tabs[Q-1]; /* it's 1/lambda, actually */ int Run_Start = -1; - Run_Costs[-1] = 2<<16; /* source (w/ CBP penalty) */ - uint32_t Min_Cost = 2<<16; + Run_Costs[-1] = 2<0; --Run) { const uint32_t Cost_Base = Cost0 + Run_Costs[i-Run]; - const uint32_t Cost = Cost_Base + (Code_Len20[Run-1]<<16); - const uint32_t lCost = Cost_Base + (Code_Len24[Run-1]<<16); + const uint32_t Cost = Cost_Base + (Code_Len20[Run-1]<=Best_Cost) continue; */ - Cost1 = Cost_Base + (Tbl_L1[Run-1]<<16); - Cost2 = Cost_Base + (Tbl_L2[Run-1]<<16) + dDist21; + Cost1 = Cost_Base + (Tbl_L1[Run-1]<Min_Cost+(1<<16) ) + while( Run_Costs[Run_Start]>Min_Cost+(1<