--- encoder.c 2004/03/30 12:30:48 1.95.2.66 +++ encoder.c 2011/04/06 14:30:14 1.141 @@ -3,9 +3,9 @@ * XVID MPEG-4 VIDEO CODEC * - Encoder main module - * - * Copyright(C) 2002 Michael Militzer - * 2002-2003 Peter Ross - * 2002 Daniel Smith + * Copyright(C) 2002-2010 Michael Militzer + * 2002-2003 Peter Ross + * 2002 Daniel Smith * * This program is free software ; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,7 +21,7 @@ * along with this program ; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * $Id: encoder.c,v 1.95.2.66 2004/03/30 12:30:48 syskin Exp $ + * $Id: encoder.c,v 1.141 2011/04/06 14:30:14 Isibaar Exp $ * ****************************************************************************/ @@ -49,6 +49,9 @@ #include "quant/quant_matrix.h" #include "utils/mem_align.h" +# include "motion/motion_smp.h" + + /***************************************************************************** * Local function prototypes ****************************************************************************/ @@ -57,9 +60,7 @@ Bitstream * bs); static int FrameCodeP(Encoder * pEnc, - Bitstream * bs, - bool force_inter, - bool vol_header); + Bitstream * bs); static void FrameCodeB(Encoder * pEnc, FRAMEINFO * frame, @@ -87,26 +88,48 @@ /* * Simplify the "fincr/fbase" fraction */ +static int +gcd(int a, int b) +{ + int r ; + + if (b > a) { + r = a; + a = b; + b = r; + } + + while ((r = a % b)) { + a = b; + b = r; + } + return b; +} + static void simplify_time(int *inc, int *base) { /* common factor */ - int i = *inc; - while (i > 1) { - if (*inc % i == 0 && *base % i == 0) { - *inc /= i; - *base /= i; - i = *inc; - continue; - } - i--; - } - - /* if neccessary, round to 65535 accuracy */ - if (*base > 65535) { - float div = (float) *base / 65535; - *base = (int) (*base / div); - *inc = (int) (*inc / div); + const int s = gcd(*inc, *base); + *inc /= s; + 
*base /= s; + + if (*base > 65535 || *inc > 65535) { + int *biggest; + int *other; + float div; + + if (*base > *inc) { + biggest = base; + other = inc; + } else { + biggest = inc; + other = base; + } + + div = ((float)*biggest)/((float)65535); + *biggest = (unsigned int)(((float)*biggest)/div); + *other = (unsigned int)(((float)*other)/div); } } @@ -115,7 +138,7 @@ enc_create(xvid_enc_create_t * create) { Encoder *pEnc; - int n; + int n; if (XVID_VERSION_MAJOR(create->version) != 1) /* v1.x.x */ return XVID_ERR_VERSION; @@ -137,6 +160,8 @@ /* global flags */ pEnc->mbParam.global_flags = create->global; + if ((pEnc->mbParam.global_flags & XVID_GLOBAL_PACKED)) + pEnc->mbParam.global_flags |= XVID_GLOBAL_DIVX5_USERDATA; /* width, height */ pEnc->mbParam.width = create->width; @@ -150,7 +175,7 @@ pEnc->mbParam.fincr = MAX(create->fincr, 0); pEnc->mbParam.fbase = create->fincr <= 0 ? 25 : create->fbase; if (pEnc->mbParam.fincr>0) - simplify_time(&pEnc->mbParam.fincr, &pEnc->mbParam.fbase); + simplify_time((int*)&pEnc->mbParam.fincr, (int*)&pEnc->mbParam.fbase); /* zones */ if(create->num_zones > 0) { @@ -181,7 +206,7 @@ memset(&pinfo, 0, sizeof(xvid_plg_info_t)); pinfo.version = XVID_VERSION; - if (create->plugins[n].func(0, XVID_PLG_INFO, &pinfo, 0) >= 0) { + if (create->plugins[n].func(NULL, XVID_PLG_INFO, &pinfo, NULL) >= 0) { pEnc->mbParam.plugin_flags |= pinfo.flags; } @@ -198,7 +223,7 @@ pcreate.param = create->plugins[n].param; pEnc->plugins[n].func = NULL; /* disable plugins that fail */ - if (create->plugins[n].func(0, XVID_PLG_CREATE, &pcreate, &pEnc->plugins[n].param) >= 0) { + if (create->plugins[n].func(NULL, XVID_PLG_CREATE, &pcreate, &pEnc->plugins[n].param) >= 0) { pEnc->plugins[n].func = create->plugins[n].func; } } @@ -216,6 +241,14 @@ goto xvid_err_memory1a; } + /* temp lambdas */ + if (pEnc->mbParam.plugin_flags & XVID_REQLAMBDA) { + pEnc->temp_lambda = (float *) xvid_malloc(pEnc->mbParam.mb_width * + pEnc->mbParam.mb_height * 6 * sizeof(float), 
CACHE_LINE); + if (pEnc->temp_lambda == NULL) + goto xvid_err_memory1a; + } + /* bframes */ pEnc->mbParam.max_bframes = MAX(create->max_bframes, 0); pEnc->mbParam.bquant_ratio = MAX(create->bquant_ratio, 0); @@ -402,7 +435,7 @@ /* timestamp stuff */ pEnc->mbParam.m_stamp = 0; - pEnc->m_framenum = 0; + pEnc->m_framenum = create->start_frame_num; pEnc->current->stamp = 0; pEnc->reference->stamp = 0; @@ -411,6 +444,73 @@ pEnc->iFrameNum = 0; pEnc->fMvPrevSigma = -1; + /* slices */ + pEnc->num_slices = MIN(MAX(1, create->num_slices), (int) pEnc->mbParam.mb_height); + + /* multithreaded stuff */ + if (create->num_threads > 0) { +#ifndef HAVE_PTHREAD + int t = MAX(1, create->num_threads); +#else + int t = MIN(create->num_threads, (int) (pEnc->mbParam.mb_height>>1)); /* at least two rows per thread */ +#endif + int threads_per_slice = MAX(1, (t / pEnc->num_slices)); + int rows_per_thread = (pEnc->mbParam.mb_height + threads_per_slice - 1) / threads_per_slice; + + pEnc->num_threads = t; + pEnc->smpData = xvid_malloc(t*sizeof(SMPData), CACHE_LINE); + if (!pEnc->smpData) + goto xvid_err_nosmp; + + /* tmp bitstream buffer for slice coding */ + pEnc->smpData[0].tmp_buffer = xvid_malloc(16*pEnc->mbParam.edged_width*pEnc->mbParam.mb_height*sizeof(uint8_t), CACHE_LINE); + if (! 
pEnc->smpData[0].tmp_buffer) goto xvid_err_nosmp; + + for (n = 0; n < t; n++) { + int s = MIN(pEnc->num_threads, pEnc->num_slices); + + pEnc->smpData[n].complete_count_self = + xvid_malloc(rows_per_thread * sizeof(int), CACHE_LINE); + + if (!pEnc->smpData[n].complete_count_self) + goto xvid_err_nosmp; + + if (n > 0 && n < s) { + pEnc->smpData[n].bs = (Bitstream *) xvid_malloc(sizeof(Bitstream), CACHE_LINE); + if (!pEnc->smpData[n].bs) + goto xvid_err_nosmp; + + pEnc->smpData[n].sStat = (Statistics *) xvid_malloc(sizeof(Statistics), CACHE_LINE); + if (!pEnc->smpData[n].sStat) + goto xvid_err_nosmp; + + pEnc->smpData[n].tmp_buffer = pEnc->smpData[0].tmp_buffer + 16*(((n-1)*pEnc->mbParam.edged_width*pEnc->mbParam.mb_height)/s); + BitstreamInit(pEnc->smpData[n].bs, pEnc->smpData[n].tmp_buffer, 0); + } + + if (n != 0) + pEnc->smpData[n].complete_count_above = + pEnc->smpData[n-1].complete_count_self; + } + pEnc->smpData[0].complete_count_above = + pEnc->smpData[t-1].complete_count_self - 1; + + } else { + xvid_err_nosmp: + /* no SMP */ + if (pEnc->smpData) { + if (pEnc->smpData[0].tmp_buffer) + xvid_free(pEnc->smpData[0].tmp_buffer); + } + else { + pEnc->smpData = xvid_malloc(1*sizeof(SMPData), CACHE_LINE); + if (pEnc->smpData == NULL) + goto xvid_err_memory5; + } + + create->num_threads = 0; + } + create->handle = (void *) pEnc; init_timer(); @@ -497,10 +597,14 @@ xvid_free(pEnc->temp_dquants); } + if(pEnc->mbParam.plugin_flags & XVID_REQLAMBDA) { + xvid_free(pEnc->temp_lambda); + } + xvid_err_memory0: for (n=0; nnum_plugins;n++) { if (pEnc->plugins[n].func) { - pEnc->plugins[n].func(pEnc->plugins[n].param, XVID_PLG_DESTROY, 0, 0); + pEnc->plugins[n].func(pEnc->plugins[n].param, XVID_PLG_DESTROY, NULL, NULL); } } xvid_free(pEnc->plugins); @@ -595,6 +699,9 @@ xvid_free(pEnc->temp_dquants); } + if ((pEnc->mbParam.plugin_flags & XVID_REQLAMBDA)) { + xvid_free(pEnc->temp_lambda); + } if (pEnc->num_plugins>0) { xvid_plg_destroy_t pdestroy; @@ -605,7 +712,7 @@ for (i=0; 
inum_plugins;i++) { if (pEnc->plugins[i].func) { - pEnc->plugins[i].func(pEnc->plugins[i].param, XVID_PLG_DESTROY, &pdestroy, 0); + pEnc->plugins[i].func(pEnc->plugins[i].param, XVID_PLG_DESTROY, &pdestroy, NULL); } } xvid_free(pEnc->plugins); @@ -613,9 +720,21 @@ xvid_free(pEnc->mbParam.mpeg_quant_matrices); - if (pEnc->num_plugins>0) + if (pEnc->num_zones > 0) xvid_free(pEnc->zones); + if (pEnc->num_threads > 0) { + for (i = 1; i < MAX(1, MIN(pEnc->num_threads, pEnc->num_slices)); i++) { + xvid_free(pEnc->smpData[i].bs); + xvid_free(pEnc->smpData[i].sStat); + } + if (pEnc->smpData[0].tmp_buffer) xvid_free(pEnc->smpData[0].tmp_buffer); + + for (i = 0; i < pEnc->num_threads; i++) + xvid_free(pEnc->smpData[i].complete_count_self); + } + xvid_free(pEnc->smpData); + xvid_free(pEnc); return 0; /* ok */ @@ -629,7 +748,7 @@ static void call_plugins(Encoder * pEnc, FRAMEINFO * frame, IMAGE * original, int opt, int * type, int * quant, xvid_enc_stats_t * stats) { - unsigned int i, j; + unsigned int i, j, k; xvid_plg_data_t data; /* set data struct */ @@ -688,9 +807,19 @@ if ((pEnc->mbParam.plugin_flags & XVID_REQDQUANTS)) { data.dquant = pEnc->temp_dquants; data.dquant_stride = pEnc->mbParam.mb_width; - memset(data.dquant, 0, data.mb_width*data.mb_height); + memset(data.dquant, 0, data.mb_width*data.mb_height*sizeof(int)); } - + + if(pEnc->mbParam.plugin_flags & XVID_REQLAMBDA) { + int block = 0; + emms(); + data.lambda = pEnc->temp_lambda; + for(i = 0;i < pEnc->mbParam.mb_height; i++) + for(j = 0;j < pEnc->mbParam.mb_width; j++) + for (k = 0; k < 6; k++) + data.lambda[block++] = 1.0f; + } + } else { /* XVID_PLG_AFTER */ if ((pEnc->mbParam.plugin_flags & XVID_REQORIGINAL)) { data.original.csp = XVID_CSP_PLANAR; @@ -765,7 +894,7 @@ for (i=0; i<(unsigned int)pEnc->num_plugins;i++) { emms(); if (pEnc->plugins[i].func) { - if (pEnc->plugins[i].func(pEnc->plugins[i].param, opt, &data, 0) < 0) { + if (pEnc->plugins[i].func(pEnc->plugins[i].param, opt, &data, NULL) < 0) { 
continue; } } @@ -794,6 +923,23 @@ frame->mbs[j*pEnc->mbParam.mb_width + i].dquant = 0; } } + + if (pEnc->mbParam.plugin_flags & XVID_REQLAMBDA) { + for (j = 0; j < pEnc->mbParam.mb_height; j++) + for (i = 0; i < pEnc->mbParam.mb_width; i++) + for (k = 0; k < 6; k++) { + frame->mbs[j*pEnc->mbParam.mb_width + i].lambda[k] = + (int) ((float)(1<mbParam.mb_height; j++) + for (i = 0; imbParam.mb_width; i++) + for (k = 0; k < 6; k++) { + frame->mbs[j*pEnc->mbParam.mb_width + i].lambda[k] = 1<mbs[0].quant = data.quant; /* FRAME will not affect the quant in stats */ } @@ -863,24 +1009,6 @@ #endif } -static int -gcd(int a, int b) -{ - int r ; - - if (b > a) { - r = a; - a = b; - b = r; - } - - while ((r = a % b)) { - a = b; - b = r; - } - return b; -} - static void simplify_par(int *par_width, int *par_height) { @@ -911,7 +1039,6 @@ return; } - /***************************************************************************** * IPB frame encoder entry point * @@ -1006,7 +1133,7 @@ } FrameCodeB(pEnc, pEnc->bframes[pEnc->bframenum_head], &bs); - call_plugins(pEnc, pEnc->bframes[pEnc->bframenum_head], &pEnc->sOriginal2, XVID_PLG_AFTER, 0, 0, stats); + call_plugins(pEnc, pEnc->bframes[pEnc->bframenum_head], &pEnc->sOriginal2, XVID_PLG_AFTER, NULL, NULL, stats); pEnc->bframenum_head++; goto done; @@ -1038,7 +1165,7 @@ /* add the not-coded length to the reference frame size */ pEnc->current->length += (BitstreamPos(&bs) - bits) / 8; - call_plugins(pEnc, pEnc->current, &pEnc->sOriginal, XVID_PLG_AFTER, 0, 0, stats); + call_plugins(pEnc, pEnc->current, &pEnc->sOriginal, XVID_PLG_AFTER, NULL, NULL, stats); /* flush complete: reset counters */ pEnc->flush_bframes = 0; @@ -1066,7 +1193,7 @@ pEnc->queue_head, pEnc->queue_tail, pEnc->queue_size); if (!(pEnc->mbParam.global_flags & XVID_GLOBAL_PACKED) && pEnc->mbParam.max_bframes > 0) { - call_plugins(pEnc, pEnc->current, &pEnc->sOriginal, XVID_PLG_AFTER, 0, 0, stats); + call_plugins(pEnc, pEnc->current, &pEnc->sOriginal, XVID_PLG_AFTER, 
NULL, NULL, stats); } /* if the very last frame is to be b-vop, we must change it to a p-vop */ @@ -1091,11 +1218,11 @@ pEnc->queue_head, pEnc->queue_tail, pEnc->queue_size); pEnc->mbParam.frame_drop_ratio = -1; /* it must be a coded vop */ - FrameCodeP(pEnc, &bs, 1, 0); + FrameCodeP(pEnc, &bs); if ((pEnc->mbParam.global_flags & XVID_GLOBAL_PACKED) && pEnc->bframenum_tail==0) { - call_plugins(pEnc, pEnc->current, &pEnc->sOriginal, XVID_PLG_AFTER, 0, 0, stats); + call_plugins(pEnc, pEnc->current, &pEnc->sOriginal, XVID_PLG_AFTER, NULL, NULL, stats); }else{ pEnc->flush_bframes = 1; goto done; @@ -1144,7 +1271,7 @@ type = frame->type; pEnc->current->quant = frame->quant; - call_plugins(pEnc, pEnc->current, NULL, XVID_PLG_BEFORE, &type, &pEnc->current->quant, stats); + call_plugins(pEnc, pEnc->current, NULL, XVID_PLG_BEFORE, &type, (int*)&pEnc->current->quant, stats); if (type > 0){ /* XVID_TYPE_?VOP */ type = type2coding(type); /* convert XVID_TYPE_?VOP to bitstream coding type */ @@ -1219,10 +1346,11 @@ if (!(pEnc->mbParam.global_flags & XVID_GLOBAL_PACKED) && pEnc->mbParam.max_bframes > 0) { if (pEnc->current->stamp > 0) { - call_plugins(pEnc, pEnc->reference, &pEnc->sOriginal, XVID_PLG_AFTER, 0, 0, stats); + call_plugins(pEnc, pEnc->reference, &pEnc->sOriginal, XVID_PLG_AFTER, NULL, NULL, stats); } - else - stats->type = XVID_TYPE_NOTHING; + else if (stats) { + stats->type = XVID_TYPE_NOTHING; + } } /* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -1247,7 +1375,7 @@ SWAP(FRAMEINFO*, pEnc->current, pEnc->bframes[pEnc->bframenum_tail]); if ((pEnc->current->vop_flags & XVID_VOP_DEBUG)) { - image_printf(&pEnc->current->image, pEnc->mbParam.edged_width, pEnc->mbParam.height, 5, 100, "DX50 BVOP->PVOP"); + image_printf(&pEnc->current->image, pEnc->mbParam.edged_width, pEnc->mbParam.height, 5, 100, "CLOSED GOP BVOP->PVOP"); } /* convert B-VOP quant to P-VOP */ @@ -1308,9 +1436,6 @@ /* prevent vol/vop misuse */ - if (!(pEnc->current->vol_flags & 
XVID_VOL_REDUCED_ENABLE)) - pEnc->current->vop_flags &= ~XVID_VOP_REDUCED; - if (!(pEnc->current->vol_flags & XVID_VOL_INTERLACING)) pEnc->current->vop_flags &= ~(XVID_VOP_TOPFIELDFIRST|XVID_VOP_ALTERNATESCAN); @@ -1343,9 +1468,11 @@ pEnc->mbParam.edged_width, pEnc->mbParam.height); } - if ( FrameCodeP(pEnc, &bs, 1, 0) == 0 ) { + if ( FrameCodeP(pEnc, &bs) == 0 ) { /* N-VOP, we mustn't code b-frames yet */ - call_plugins(pEnc, pEnc->current, &pEnc->sOriginal, XVID_PLG_AFTER, 0, 0, stats); + if ((pEnc->mbParam.global_flags & XVID_GLOBAL_PACKED) || + pEnc->mbParam.max_bframes == 0) + call_plugins(pEnc, pEnc->current, &pEnc->sOriginal, XVID_PLG_AFTER, NULL, NULL, stats); goto done; } } @@ -1366,7 +1493,7 @@ /* packed or no-bframes or no-bframes-queued: output stats */ if ((pEnc->mbParam.global_flags & XVID_GLOBAL_PACKED) || pEnc->mbParam.max_bframes == 0 ) { - call_plugins(pEnc, pEnc->current, &pEnc->sOriginal, XVID_PLG_AFTER, 0, 0, stats); + call_plugins(pEnc, pEnc->current, &pEnc->sOriginal, XVID_PLG_AFTER, NULL, NULL, stats); } /* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -1406,10 +1533,8 @@ static __inline void -CodeIntraMB(Encoder * pEnc, - MACROBLOCK * pMB) +CodeIntraMB(MACROBLOCK * pMB) { - pMB->mode = MODE_INTRA; /* zero mv statistics */ @@ -1423,34 +1548,113 @@ } } - - -static int -FrameCodeI(Encoder * pEnc, - Bitstream * bs) +static void +SliceCodeI(SMPData *data) { - int bits = BitstreamPos(bs); + Encoder *pEnc = (Encoder *) data->pEnc; + Bitstream *bs = (Bitstream *) data->bs; + + uint16_t x, y; int mb_width = pEnc->mbParam.mb_width; int mb_height = pEnc->mbParam.mb_height; + int bound = 0, num_slices = pEnc->num_slices; + FRAMEINFO *const current = pEnc->current; + DECLARE_ALIGNED_MATRIX(dct_codes, 6, 64, int16_t, CACHE_LINE); DECLARE_ALIGNED_MATRIX(qcoeff, 6, 64, int16_t, CACHE_LINE); - uint16_t x, y; + if (data->start_y > 0) { /* write resync marker */ + bound = data->start_y*mb_width; + write_video_packet_header(bs, 
&pEnc->mbParam, current, bound); + } - if ((pEnc->current->vol_flags & XVID_VOL_REDUCED_ENABLE)) - { - mb_width = (pEnc->mbParam.width + 31) / 32; - mb_height = (pEnc->mbParam.height + 31) / 32; + for (y = data->start_y; y < data->stop_y; y++) { + int new_bound = mb_width * ((((y*num_slices) / mb_height) * mb_height + (num_slices-1)) / num_slices); + + if (new_bound > bound) { + bound = new_bound; + BitstreamPadAlways(bs); + write_video_packet_header(bs, &pEnc->mbParam, current, bound); + } - /* 16x16->8x8 downsample requires 1 additional edge pixel*/ - /* XXX: setedges is overkill */ - start_timer(); - image_setedges(&pEnc->current->image, - pEnc->mbParam.edged_width, pEnc->mbParam.edged_height, - pEnc->mbParam.width, pEnc->mbParam.height, 0); - stop_edges_timer(); + for (x = 0; x < mb_width; x++) { + MACROBLOCK *pMB = ¤t->mbs[x + y * mb_width]; + + CodeIntraMB(pMB); + + MBTransQuantIntra(&pEnc->mbParam, current, pMB, x, y, + dct_codes, qcoeff); + + start_timer(); + MBPrediction(current, x, y, mb_width, qcoeff, bound); + stop_prediction_timer(); + + start_timer(); + MBCoding(current, pMB, qcoeff, bs, data->sStat); + stop_coding_timer(); + + } + } + + emms(); + BitstreamPadAlways(bs); +} + +static __inline void +SerializeBitstreams(Encoder *pEnc, FRAMEINFO *current, Bitstream *bs, int num_threads) +{ + int k; + uint32_t pos = BitstreamLength(bs); + + for (k = 1; k < num_threads; k++) { + uint32_t len = BitstreamLength(pEnc->smpData[k].bs); + + memcpy((void *)((ptr_t)bs->start + pos), + (void *)((ptr_t)pEnc->smpData[k].bs->start), len); + + current->length += len; + pos += len; + + /* collect stats */ + current->sStat.iTextBits += pEnc->smpData[k].sStat->iTextBits; + current->sStat.kblks += pEnc->smpData[k].sStat->kblks; + current->sStat.mblks += pEnc->smpData[k].sStat->mblks; + current->sStat.ublks += pEnc->smpData[k].sStat->ublks; + current->sStat.iMVBits += pEnc->smpData[k].sStat->iMVBits; + } + + if (num_threads > 1) { + uint32_t pos32 = pos>>2; + bs->tail = 
bs->start + pos32; + bs->pos = 8*(pos - (pos32<<2)); + bs->buf = 0; + + if (bs->pos > 0) { + uint32_t pos8 = bs->pos/8; + memset((void *)((ptr_t)bs->tail+pos8), 0, (4-pos8)); + pos = *bs->tail; +#ifndef ARCH_IS_BIG_ENDIAN + BSWAP(pos); +#endif + bs->buf = pos; + } } +} + +static int +FrameCodeI(Encoder * pEnc, + Bitstream * bs) +{ + int bits = BitstreamPos(bs); + int bound = 0, num_slices = pEnc->num_slices; + int num_threads = MAX(1, MIN(pEnc->num_threads, num_slices)); + int slices_per_thread = (num_slices*1024 / num_threads); + int mb_height = pEnc->mbParam.mb_height; +#ifdef HAVE_PTHREAD + void * status = NULL; +#endif + uint16_t k; pEnc->mbParam.m_rounding_type = 1; pEnc->current->rounding_type = pEnc->mbParam.m_rounding_type; @@ -1460,7 +1664,7 @@ SetMacroblockQuants(&pEnc->mbParam, pEnc->current); - BitstreamWriteVolHeader(bs, &pEnc->mbParam, pEnc->current); + BitstreamWriteVolHeader(bs, &pEnc->mbParam, pEnc->current, num_slices); set_timecodes(pEnc->current,pEnc->reference,pEnc->mbParam.fbase); @@ -1469,44 +1673,52 @@ BitstreamWriteVopHeader(bs, &pEnc->mbParam, pEnc->current, 1, pEnc->current->mbs[0].quant); pEnc->current->sStat.iTextBits = 0; - pEnc->current->sStat.kblks = mb_width * mb_height; - pEnc->current->sStat.mblks = pEnc->current->sStat.ublks = 0; - for (y = 0; y < mb_height; y++) - for (x = 0; x < mb_width; x++) { - MACROBLOCK *pMB = - &pEnc->current->mbs[x + y * pEnc->mbParam.mb_width]; + /* multithreaded intra coding - dispatch threads */ + for (k = 0; k < num_threads; k++) { + int add = ((slices_per_thread + 512) >> 10); - CodeIntraMB(pEnc, pMB); + slices_per_thread += ((num_slices*1024 / num_threads) - add*1024); - MBTransQuantIntra(&pEnc->mbParam, pEnc->current, pMB, x, y, - dct_codes, qcoeff); + pEnc->smpData[k].pEnc = (void *) pEnc; + pEnc->smpData[k].stop_y = (((bound+add) * mb_height + (num_slices-1)) / num_slices); + pEnc->smpData[k].start_y = ((bound * mb_height + (num_slices-1)) / num_slices); - start_timer(); - 
MBPrediction(pEnc->current, x, y, pEnc->mbParam.mb_width, qcoeff); - stop_prediction_timer(); + bound += add; - start_timer(); - if (pEnc->current->vop_flags & XVID_VOP_GREYSCALE) - { pMB->cbp &= 0x3C; /* keep only bits 5-2 */ - qcoeff[4*64+0]=0; /* zero, because for INTRA MBs DC value is saved */ - qcoeff[5*64+0]=0; - } - MBCoding(pEnc->current, pMB, qcoeff, bs, &pEnc->current->sStat); - stop_coding_timer(); + if (k > 0) { + BitstreamReset(pEnc->smpData[k].bs); + pEnc->smpData[k].sStat->iTextBits = 0; } + } + pEnc->smpData[0].bs = bs; + pEnc->smpData[0].sStat = &pEnc->current->sStat; - if ((pEnc->current->vop_flags & XVID_VOP_REDUCED)) - { - image_deblock_rrv(&pEnc->current->image, pEnc->mbParam.edged_width, - pEnc->current->mbs, mb_width, mb_height, pEnc->mbParam.mb_width, - 16, 0); +#ifdef HAVE_PTHREAD + /* create threads */ + for (k = 1; k < num_threads; k++) { + pthread_create(&pEnc->smpData[k].handle, NULL, + (void*)SliceCodeI, (void*)&pEnc->smpData[k]); } - emms(); +#endif - BitstreamPadAlways(bs); /* next_start_code() at the end of VideoObjectPlane() */ + SliceCodeI(&pEnc->smpData[0]); + +#ifdef HAVE_PTHREAD + /* wait until all threads are finished */ + for (k = 1; k < num_threads; k++) { + pthread_join(pEnc->smpData[k].handle, &status); + } +#endif + + pEnc->current->length = BitstreamLength(bs) - (bits/8); + + /* reassemble the pieces together */ + SerializeBitstreams(pEnc, pEnc->current, bs, num_threads); - pEnc->current->length = (BitstreamPos(bs) - bits) / 8; + pEnc->current->sStat.iMVBits = 0; + pEnc->current->sStat.mblks = pEnc->current->sStat.ublks = 0; + pEnc->current->sStat.kblks = pEnc->mbParam.mb_width * pEnc->mbParam.mb_height; pEnc->fMvPrevSigma = -1; pEnc->mbParam.m_fcode = 2; @@ -1517,28 +1729,186 @@ return 1; /* intra */ } +static __inline void +updateFcode(Statistics * sStat, Encoder * pEnc) +{ + float fSigma; + int iSearchRange; + + if (sStat->iMvCount == 0) + sStat->iMvCount = 1; + + fSigma = (float) sqrt((float) sStat->iMvSum / 
sStat->iMvCount); + + iSearchRange = 16 << pEnc->mbParam.m_fcode; + + if ((3.0 * fSigma > iSearchRange) && (pEnc->mbParam.m_fcode <= 5) ) + pEnc->mbParam.m_fcode++; + + else if ((5.0 * fSigma < iSearchRange) + && (4.0 * pEnc->fMvPrevSigma < iSearchRange) + && (pEnc->mbParam.m_fcode >= 2) ) + pEnc->mbParam.m_fcode--; + + pEnc->fMvPrevSigma = fSigma; +} -#define INTRA_THRESHOLD 0.5 #define BFRAME_SKIP_THRESHHOLD 30 +static void +SliceCodeP(SMPData *data) +{ + Encoder *pEnc = (Encoder *) data->pEnc; + Bitstream *bs = (Bitstream *) data->bs; + + int x, y, k; + FRAMEINFO *const current = pEnc->current; + FRAMEINFO *const reference = pEnc->reference; + MBParam * const pParam = &pEnc->mbParam; + int mb_width = pParam->mb_width; + int mb_height = pParam->mb_height; + + DECLARE_ALIGNED_MATRIX(dct_codes, 6, 64, int16_t, CACHE_LINE); + DECLARE_ALIGNED_MATRIX(qcoeff, 6, 64, int16_t, CACHE_LINE); + + int bound = 0, num_slices = pEnc->num_slices; + + if (data->start_y > 0) { /* write resync marker */ + bound = data->start_y*mb_width; + write_video_packet_header(bs, pParam, current, bound); + } + + for (y = data->start_y; y < data->stop_y; y++) { + int new_bound = mb_width * ((((y*num_slices) / mb_height) * mb_height + (num_slices-1)) / num_slices); + + if (new_bound > bound) { + bound = new_bound; + BitstreamPadAlways(bs); + write_video_packet_header(bs, pParam, current, bound); + } + + for (x = 0; x < mb_width; x++) { + MACROBLOCK *pMB = ¤t->mbs[x + y * pParam->mb_width]; + int skip_possible; + + if (pMB->mode == MODE_INTRA || pMB->mode == MODE_INTRA_Q) { + CodeIntraMB(pMB); + MBTransQuantIntra(pParam, current, pMB, x, y, + dct_codes, qcoeff); + + start_timer(); + MBPrediction(current, x, y, pParam->mb_width, qcoeff, bound); + stop_prediction_timer(); + + data->sStat->kblks++; + + MBCoding(current, pMB, qcoeff, bs, data->sStat); + stop_coding_timer(); + continue; + } + + start_timer(); + MBMotionCompensation(pMB, x, y, &reference->image, + &pEnc->vInterH, &pEnc->vInterV, + 
&pEnc->vInterHV, &pEnc->vGMC, + ¤t->image, + dct_codes, pParam->width, + pParam->height, + pParam->edged_width, + (current->vol_flags & XVID_VOL_QUARTERPEL), + current->rounding_type, + data->RefQ); + + stop_comp_timer(); + + pMB->field_pred = 0; + + if (pMB->cbp != 0) { + pMB->cbp = MBTransQuantInter(pParam, current, pMB, x, y, + dct_codes, qcoeff); + } + + if (pMB->dquant != 0) + MBSetDquant(pMB, x, y, pParam); + + + if (pMB->cbp || pMB->mvs[0].x || pMB->mvs[0].y || + pMB->mvs[1].x || pMB->mvs[1].y || pMB->mvs[2].x || + pMB->mvs[2].y || pMB->mvs[3].x || pMB->mvs[3].y) { + data->sStat->mblks++; + } else { + data->sStat->ublks++; + } + + start_timer(); + + /* Finished processing the MB, now check if to CODE or SKIP */ + + skip_possible = (pMB->cbp == 0) && (pMB->mode == MODE_INTER); + + if (current->coding_type == S_VOP) + skip_possible &= (pMB->mcsel == 1); + else { /* PVOP */ + const VECTOR * const mv = (pParam->vol_flags & XVID_VOL_QUARTERPEL) ? + pMB->qmvs : pMB->mvs; + skip_possible &= ((mv->x|mv->y) == 0); + } + + if ((pMB->mode == MODE_NOT_CODED) || (skip_possible)) { + /* This is a candidate for SKIPping, but for P-VOPs check intermediate B-frames first */ + int bSkip = 1; + + if (current->coding_type == P_VOP) { /* special rule for P-VOP's SKIP */ + for (k = pEnc->bframenum_head; k < pEnc->bframenum_tail; k++) { + int iSAD; + iSAD = sad16(reference->image.y + 16*y*pParam->edged_width + 16*x, + pEnc->bframes[k]->image.y + 16*y*pParam->edged_width + 16*x, + pParam->edged_width, BFRAME_SKIP_THRESHHOLD * pMB->quant); + if (iSAD >= BFRAME_SKIP_THRESHHOLD * pMB->quant || ((bound > 1) && + ((y*mb_width+x == bound) || (y*mb_width+x == bound+1)))) { /* Some third-party decoders have problems with coloc skip MB before or after + resync marker in BVOP. 
We avoid any ambiguity and force no skip at slice boundary */ + bSkip = 0; /* could not SKIP */ + if (pParam->vol_flags & XVID_VOL_QUARTERPEL) { + VECTOR predMV = get_qpmv2(current->mbs, pParam->mb_width, bound, x, y, 0); + pMB->pmvs[0].x = - predMV.x; + pMB->pmvs[0].y = - predMV.y; + } else { + VECTOR predMV = get_pmv2(current->mbs, pParam->mb_width, bound, x, y, 0); + pMB->pmvs[0].x = - predMV.x; + pMB->pmvs[0].y = - predMV.y; + } + pMB->mode = MODE_INTER; + pMB->cbp = 0; + break; + } + } + } + + if (bSkip) { + /* do SKIP */ + pMB->mode = MODE_NOT_CODED; + MBSkip(bs); + stop_coding_timer(); + continue; /* next MB */ + } + } + + /* ordinary case: normal coded INTER/INTER4V block */ + MBCoding(current, pMB, qcoeff, bs, data->sStat); + stop_coding_timer(); + } + } + + BitstreamPadAlways(bs); /* next_start_code() at the end of VideoObjectPlane() */ + emms(); +} /* FrameCodeP also handles S(GMC)-VOPs */ static int -FrameCodeP(Encoder * pEnc, - Bitstream * bs, - bool force_inter, - bool vol_header) +FrameCodeP(Encoder * pEnc, Bitstream * bs) { - float fSigma; int bits = BitstreamPos(bs); - DECLARE_ALIGNED_MATRIX(dct_codes, 6, 64, int16_t, CACHE_LINE); - DECLARE_ALIGNED_MATRIX(qcoeff, 6, 64, int16_t, CACHE_LINE); - - int iLimit; - int x, y, k; - int iSearchRange; - int bIntra=0, skip_possible; FRAMEINFO *const current = pEnc->current; FRAMEINFO *const reference = pEnc->reference; MBParam * const pParam = &pEnc->mbParam; @@ -1546,21 +1916,20 @@ int mb_height = pParam->mb_height; int coded = 1; + int k = 0, bound = 0, num_slices = pEnc->num_slices; + int num_threads = MAX(1, MIN(pEnc->num_threads, num_slices)); +#ifdef HAVE_PTHREAD + void * status = NULL; + int threads_per_slice = (pEnc->num_threads*1024 / num_threads); +#endif + int slices_per_thread = (num_slices*1024 / num_threads); - /* IMAGE *pCurrent = ¤t->image; */ IMAGE *pRef = &reference->image; - if ((current->vop_flags & XVID_VOP_REDUCED)) - { - mb_width = (pParam->width + 31) / 32; - mb_height = 
(pParam->height + 31) / 32; - } - - if (!reference->is_edged) { start_timer(); image_setedges(pRef, pParam->edged_width, pParam->edged_height, - pParam->width, pParam->height, 0); + pParam->width, pParam->height, XVID_BS_VERSION); stop_edges_timer(); reference->is_edged = 1; } @@ -1569,16 +1938,11 @@ current->rounding_type = pParam->m_rounding_type; current->fcode = pParam->m_fcode; - if (!force_inter) - iLimit = (int)(mb_width * mb_height * INTRA_THRESHOLD); - else - iLimit = mb_width * mb_height + 1; - if ((current->vop_flags & XVID_VOP_HALFPEL)) { if (reference->is_interpolated != current->rounding_type) { start_timer(); - image_interpolate(pRef, &pEnc->vInterH, &pEnc->vInterV, - &pEnc->vInterHV, pParam->edged_width, + image_interpolate(pRef->y, pEnc->vInterH.y, pEnc->vInterV.y, + pEnc->vInterHV.y, pParam->edged_width, pParam->edged_height, (pParam->vol_flags & XVID_VOL_QUARTERPEL), current->rounding_type); @@ -1587,17 +1951,26 @@ } } + current->sStat.iTextBits = current->sStat.iMvSum = current->sStat.iMvCount = + current->sStat.kblks = current->sStat.mblks = current->sStat.ublks = + current->sStat.iMVBits = 0; + current->coding_type = P_VOP; + if (current->vop_flags & XVID_VOP_RD_PSNRHVSM) { + image_block_variance(¤t->image, pParam->edged_width, current->mbs, + pParam->mb_width, pParam->mb_height); + } + call_plugins(pEnc, pEnc->current, NULL, XVID_PLG_FRAME, NULL, NULL, NULL); SetMacroblockQuants(&pEnc->mbParam, current); start_timer(); - if (current->vol_flags & XVID_VOL_GMC ) /* GMC only for S(GMC)-VOPs */ + if (current->vol_flags & XVID_VOL_GMC) /* GMC only for S(GMC)-VOPs */ { int gmcval; current->warp = GlobalMotionEst( current->mbs, pParam, current, reference, - &pEnc->vInterH, &pEnc->vInterV, &pEnc->vInterHV); + &pEnc->vInterH, &pEnc->vInterV, &pEnc->vInterHV, num_slices); if (current->motion_flags & XVID_ME_GME_REFINE) { gmcval = GlobalMotionEstRefine(¤t->warp, @@ -1644,238 +2017,142 @@ } } - bIntra = - MotionEstimation(&pEnc->mbParam, current, 
reference, - &pEnc->vInterH, &pEnc->vInterV, &pEnc->vInterHV, - &pEnc->vGMC, iLimit); - - - stop_motion_timer(); - - if (bIntra == 1) return FrameCodeI(pEnc, bs); - - set_timecodes(current,reference,pParam->fbase); - if (vol_header) - { BitstreamWriteVolHeader(bs, &pEnc->mbParam, current); - BitstreamPad(bs); - } - - BitstreamWriteVopHeader(bs, &pEnc->mbParam, current, 1, current->mbs[0].quant); - - current->sStat.iTextBits = current->sStat.iMvSum = current->sStat.iMvCount = - current->sStat.kblks = current->sStat.mblks = current->sStat.ublks = 0; - - - for (y = 0; y < mb_height; y++) { - for (x = 0; x < mb_width; x++) { - MACROBLOCK *pMB = - ¤t->mbs[x + y * pParam->mb_width]; - - bIntra = (pMB->mode == MODE_INTRA) || (pMB->mode == MODE_INTRA_Q); - - if (bIntra) { - CodeIntraMB(pEnc, pMB); - MBTransQuantIntra(&pEnc->mbParam, current, pMB, x, y, - dct_codes, qcoeff); - - start_timer(); - MBPrediction(current, x, y, pParam->mb_width, qcoeff); - stop_prediction_timer(); - - current->sStat.kblks++; +#ifdef HAVE_PTHREAD + if (pEnc->num_threads > 0) { - if (pEnc->current->vop_flags & XVID_VOP_GREYSCALE) - { pMB->cbp &= 0x3C; /* keep only bits 5-2 */ - qcoeff[4*64+0]=0; /* zero, because for INTRA MBs DC value is saved */ - qcoeff[5*64+0]=0; - } - MBCoding(current, pMB, qcoeff, bs, ¤t->sStat); - stop_coding_timer(); - continue; - } - - start_timer(); - MBMotionCompensation(pMB, x, y, &reference->image, - &pEnc->vInterH, &pEnc->vInterV, - &pEnc->vInterHV, &pEnc->vGMC, - ¤t->image, - dct_codes, pParam->width, - pParam->height, - pParam->edged_width, - (current->vol_flags & XVID_VOL_QUARTERPEL), - (current->vop_flags & XVID_VOP_REDUCED), - current->rounding_type); - - stop_comp_timer(); - - pMB->field_pred = 0; - - if (pMB->mode != MODE_NOT_CODED) - { pMB->cbp = - MBTransQuantInter(&pEnc->mbParam, current, pMB, x, y, - dct_codes, qcoeff); + /* multithreaded motion estimation - dispatch threads */ + while (k < pEnc->num_threads) { + int i, add_s = (slices_per_thread + 512) >> 
10; + int add_t = (threads_per_slice + 512) >> 10; + + int start_y = (bound * mb_height + (num_slices-1)) / num_slices; + int stop_y = ((bound+add_s) * mb_height + (num_slices-1)) / num_slices; + int rows_per_thread = (stop_y - start_y + add_t - 1) / add_t; + + slices_per_thread += ((num_slices*1024 / num_threads) - add_s*1024); + threads_per_slice += ((pEnc->num_threads*1024 / num_threads) - add_t*1024); + + for (i = 0; i < add_t; i++) { + memset(pEnc->smpData[k+i].complete_count_self, 0, rows_per_thread * sizeof(int)); + + pEnc->smpData[k+i].pEnc = (void *) pEnc; + pEnc->smpData[k+i].y_row = i; + pEnc->smpData[k+i].y_step = add_t; + pEnc->smpData[k+i].stop_y = stop_y; + pEnc->smpData[k+i].start_y = start_y; + + /* todo: sort out temp space once and for all */ + pEnc->smpData[k+i].RefQ = (((k+i)&1) ? pEnc->vInterV.u : pEnc->vInterV.v) + + 16*((k+i)>>1)*pParam->edged_width; } + + pEnc->smpData[k].complete_count_above = + pEnc->smpData[k+add_t-1].complete_count_self - 1; - if (pMB->dquant != 0) - MBSetDquant(pMB, x, y, &pEnc->mbParam); - - - if (pMB->cbp || pMB->mvs[0].x || pMB->mvs[0].y || - pMB->mvs[1].x || pMB->mvs[1].y || pMB->mvs[2].x || - pMB->mvs[2].y || pMB->mvs[3].x || pMB->mvs[3].y) { - current->sStat.mblks++; - } else { - current->sStat.ublks++; - } + bound += add_s; + k += add_t; + } - start_timer(); + for (k = 1; k < pEnc->num_threads; k++) { + pthread_create(&pEnc->smpData[k].handle, NULL, + (void*)MotionEstimateSMP, (void*)&pEnc->smpData[k]); + } - /* Finished processing the MB, now check if to CODE or SKIP */ + MotionEstimateSMP(&pEnc->smpData[0]); - skip_possible = (pMB->cbp == 0) && (pMB->mode == MODE_INTER) && - (pMB->dquant == 0); + for (k = 1; k < pEnc->num_threads; k++) { + pthread_join(pEnc->smpData[k].handle, &status); + } - if (current->coding_type == S_VOP) - skip_possible &= (pMB->mcsel == 1); - else if (current->coding_type == P_VOP) { - if ((pParam->vol_flags & XVID_VOL_QUARTERPEL)) - skip_possible &= ( (pMB->qmvs[0].x == 0) && 
(pMB->qmvs[0].y == 0) ); - else - skip_possible &= ( (pMB->mvs[0].x == 0) && (pMB->mvs[0].y == 0) ); - } + current->fcode = 0; + for (k = 0; k < pEnc->num_threads; k++) { + current->sStat.iMvSum += pEnc->smpData[k].mvSum; + current->sStat.iMvCount += pEnc->smpData[k].mvCount; + if (pEnc->smpData[k].minfcode > current->fcode) + current->fcode = pEnc->smpData[k].minfcode; + } - if ( (pMB->mode == MODE_NOT_CODED) || (skip_possible)) { + } else +#endif + { -/* This is a candidate for SKIPping, but for P-VOPs check intermediate B-frames first */ + /* regular ME */ - if (current->coding_type == P_VOP) /* special rule for P-VOP's SKIP */ - { - int bSkip = 1; + MotionEstimation(&pEnc->mbParam, current, reference, + &pEnc->vInterH, &pEnc->vInterV, &pEnc->vInterHV, + &pEnc->vGMC, 256*4096, num_slices); - for (k=pEnc->bframenum_head; k< pEnc->bframenum_tail; k++) - { - int iSAD; - iSAD = sad16(reference->image.y + 16*y*pParam->edged_width + 16*x, - pEnc->bframes[k]->image.y + 16*y*pParam->edged_width + 16*x, - pParam->edged_width,BFRAME_SKIP_THRESHHOLD); - if (iSAD >= BFRAME_SKIP_THRESHHOLD * pMB->quant) - { bSkip = 0; - break; - } - } + } - if (!bSkip) { /* no SKIP, but trivial block */ - if((pParam->vol_flags & XVID_VOL_QUARTERPEL)) { - VECTOR predMV = get_qpmv2(current->mbs, pParam->mb_width, 0, x, y, 0); - pMB->pmvs[0].x = - predMV.x; - pMB->pmvs[0].y = - predMV.y; - } - else { - VECTOR predMV = get_pmv2(current->mbs, pParam->mb_width, 0, x, y, 0); - pMB->pmvs[0].x = - predMV.x; - pMB->pmvs[0].y = - predMV.y; - } - pMB->mode = MODE_INTER; - pMB->cbp = 0; - MBCoding(current, pMB, qcoeff, bs, &current->sStat); - stop_coding_timer(); + stop_motion_timer(); - continue; /* next MB */ - } - } - /* do SKIP */ + set_timecodes(current,reference,pParam->fbase); - pMB->mode = MODE_NOT_CODED; - MBSkip(bs); - stop_coding_timer(); - continue; /* next MB */ - } - /* ordinary case: normal coded INTER/INTER4V block */ + BitstreamWriteVopHeader(bs, &pEnc->mbParam, current, 1, 
current->mbs[0].quant); - if ((current->vop_flags & XVID_VOP_GREYSCALE)) - { pMB->cbp &= 0x3C; /* keep only bits 5-2 */ - qcoeff[4*64+0]=0; /* zero, because DC for INTRA MBs DC value is saved */ - qcoeff[5*64+0]=0; - } + /* multithreaded inter coding - dispatch threads */ - if((pParam->vol_flags & XVID_VOL_QUARTERPEL)) { - VECTOR predMV = get_qpmv2(current->mbs, pParam->mb_width, 0, x, y, 0); - pMB->pmvs[0].x = pMB->qmvs[0].x - predMV.x; - pMB->pmvs[0].y = pMB->qmvs[0].y - predMV.y; - DPRINTF(XVID_DEBUG_MV,"mv_diff (%i,%i) pred (%i,%i) result (%i,%i)\n", pMB->pmvs[0].x, pMB->pmvs[0].y, predMV.x, predMV.y, pMB->mvs[0].x, pMB->mvs[0].y); - } else { - VECTOR predMV = get_pmv2(current->mbs, pParam->mb_width, 0, x, y, 0); - pMB->pmvs[0].x = pMB->mvs[0].x - predMV.x; - pMB->pmvs[0].y = pMB->mvs[0].y - predMV.y; - DPRINTF(XVID_DEBUG_MV,"mv_diff (%i,%i) pred (%i,%i) result (%i,%i)\n", pMB->pmvs[0].x, pMB->pmvs[0].y, predMV.x, predMV.y, pMB->mvs[0].x, pMB->mvs[0].y); - } + bound = 0; + slices_per_thread = (num_slices*1024 / num_threads); + for (k = 0; k < num_threads; k++) { + int add = ((slices_per_thread + 512) >> 10); - if (pMB->mode == MODE_INTER4V) - { int k; - for (k=1;k<4;k++) - { - if((pParam->vol_flags & XVID_VOL_QUARTERPEL)) { - VECTOR predMV = get_qpmv2(current->mbs, pParam->mb_width, 0, x, y, k); - pMB->pmvs[k].x = pMB->qmvs[k].x - predMV.x; - pMB->pmvs[k].y = pMB->qmvs[k].y - predMV.y; - DPRINTF(XVID_DEBUG_MV,"mv_diff (%i,%i) pred (%i,%i) result (%i,%i)\n", pMB->pmvs[k].x, pMB->pmvs[k].y, predMV.x, predMV.y, pMB->mvs[k].x, pMB->mvs[k].y); - } else { - VECTOR predMV = get_pmv2(current->mbs, pParam->mb_width, 0, x, y, k); - pMB->pmvs[k].x = pMB->mvs[k].x - predMV.x; - pMB->pmvs[k].y = pMB->mvs[k].y - predMV.y; - DPRINTF(XVID_DEBUG_MV,"mv_diff (%i,%i) pred (%i,%i) result (%i,%i)\n", pMB->pmvs[k].x, pMB->pmvs[k].y, predMV.x, predMV.y, pMB->mvs[k].x, pMB->mvs[k].y); - } + slices_per_thread += ((num_slices*1024 / num_threads) - add*1024); - } - } + 
pEnc->smpData[k].pEnc = (void *) pEnc; + pEnc->smpData[k].stop_y = (((bound+add) * mb_height + (num_slices-1)) / num_slices); + pEnc->smpData[k].start_y = ((bound * mb_height + (num_slices-1)) / num_slices); + pEnc->smpData[k].RefQ = ((k&1) ? pEnc->vInterV.u : pEnc->vInterV.v) + 16*(k>>1)*pParam->edged_width; - MBCoding(current, pMB, qcoeff, bs, &pEnc->current->sStat); - stop_coding_timer(); + bound += add; + if (k > 0) { + pEnc->smpData[k].sStat->iTextBits = pEnc->smpData[k].sStat->kblks = + pEnc->smpData[k].sStat->mblks = pEnc->smpData[k].sStat->ublks = + pEnc->smpData[k].sStat->iMVBits = 0; + + BitstreamReset(pEnc->smpData[k].bs); } } + pEnc->smpData[0].bs = bs; + pEnc->smpData[0].sStat = &current->sStat; - if ((current->vop_flags & XVID_VOP_REDUCED)) - { - image_deblock_rrv(&current->image, pParam->edged_width, - current->mbs, mb_width, mb_height, pParam->mb_width, - 16, 0); +#ifdef HAVE_PTHREAD + /* create threads */ + for (k = 1; k < num_threads; k++) { + pthread_create(&pEnc->smpData[k].handle, NULL, + (void*)SliceCodeP, (void*)&pEnc->smpData[k]); } +#endif - emms(); + SliceCodeP(&pEnc->smpData[0]); - if (current->sStat.iMvCount == 0) - current->sStat.iMvCount = 1; +#ifdef HAVE_PTHREAD + /* wait until all threads are finished */ + for (k = 1; k < num_threads; k++) { + pthread_join(pEnc->smpData[k].handle, &status); + } +#endif - fSigma = (float) sqrt((float) current->sStat.iMvSum / current->sStat.iMvCount); + current->length = BitstreamLength(bs) - (bits/8); - iSearchRange = 1 << (3 + pParam->m_fcode); + /* reassemble the pieces together */ + SerializeBitstreams(pEnc, pEnc->current, bs, num_threads); - if ((fSigma > iSearchRange / 3) - && (pParam->m_fcode <= (3 + (pParam->vol_flags & XVID_VOL_QUARTERPEL?1:0) ))) /* maximum search range 128 */ - { - pParam->m_fcode++; - iSearchRange *= 2; - } else if ((fSigma < iSearchRange / 6) - && (pEnc->fMvPrevSigma >= 0) - && (pEnc->fMvPrevSigma < iSearchRange / 6) - && (pParam->m_fcode >= (2 + (pParam->vol_flags & 
XVID_VOL_QUARTERPEL?1:0) ))) /* minimum search range 16 */ - { - pParam->m_fcode--; - iSearchRange /= 2; - } - - pEnc->fMvPrevSigma = fSigma; + updateFcode(&current->sStat, pEnc); /* frame drop code */ #if 0 DPRINTF(XVID_DEBUG_DEBUG, "kmu %i %i %i\n", current->sStat.kblks, current->sStat.mblks, current->sStat.ublks); #endif - if (current->sStat.kblks + current->sStat.mblks <= - (pParam->frame_drop_ratio * mb_width * mb_height) / 100) + + if (current->sStat.kblks + current->sStat.mblks < + (pParam->frame_drop_ratio * mb_width * mb_height) / 100 && + ( (pEnc->bframenum_head >= pEnc->bframenum_tail) || !(pEnc->mbParam.global_flags & XVID_GLOBAL_CLOSED_GOP)) && + (current->coding_type == P_VOP) ) { - current->sStat.kblks = current->sStat.mblks = 0; + current->sStat.kblks = current->sStat.mblks = current->sStat.iTextBits = 0; current->sStat.ublks = mb_width * mb_height; BitstreamReset(bs); @@ -1894,6 +2171,10 @@ memcpy(current->mbs, reference->mbs, sizeof(MACROBLOCK) * mb_width * mb_height); coded = 0; + BitstreamPadAlways(bs); /* next_start_code() at the end of VideoObjectPlane() */ + + current->length = (BitstreamPos(bs) - bits) / 8; + } else { pEnc->current->is_edged = 0; /* not edged */ @@ -1922,13 +2203,90 @@ } */ - BitstreamPadAlways(bs); /* next_start_code() at the end of VideoObjectPlane() */ - - current->length = (BitstreamPos(bs) - bits) / 8; - return coded; } +static void +SliceCodeB(SMPData *data) +{ + Encoder *pEnc = (Encoder *) data->pEnc; + Bitstream *bs = (Bitstream *) data->bs; + + DECLARE_ALIGNED_MATRIX(dct_codes, 6, 64, int16_t, CACHE_LINE); + DECLARE_ALIGNED_MATRIX(qcoeff, 6, 64, int16_t, CACHE_LINE); + + int x, y; + FRAMEINFO * const frame = (FRAMEINFO * const) data->current; + MBParam * const pParam = &pEnc->mbParam; + int mb_width = pParam->mb_width; + int mb_height = pParam->mb_height; + IMAGE *f_ref = &pEnc->reference->image; + IMAGE *b_ref = &pEnc->current->image; + + int bound = data->start_y*mb_width; + int num_slices = pEnc->num_slices; + + if 
(data->start_y > 0) { /* write resync marker */ + write_video_packet_header(bs, pParam, frame, bound+1); + } + + for (y = data->start_y; y < MIN(data->stop_y+1, mb_height); y++) { + int new_bound = mb_width * ((((y*num_slices) / mb_height) * mb_height + (num_slices-1)) / num_slices); + int stop_x = (y == data->stop_y) ? 1 : mb_width; + int start_x = (y == data->start_y && y > 0) ? 1 : 0; + + for (x = start_x; x < stop_x; x++) { + MACROBLOCK * const mb = &frame->mbs[x + y * pEnc->mbParam.mb_width]; + + /* decoder ignores mb when refence block is INTER(0,0), CBP=0 */ + if (mb->mode == MODE_NOT_CODED) { + if (pParam->plugin_flags & XVID_REQORIGINAL) { + MBMotionCompensation(mb, x, y, f_ref, NULL, f_ref, NULL, NULL, &frame->image, + NULL, 0, 0, pParam->edged_width, 0, 0, data->RefQ); + } + continue; + } + + if (new_bound > bound && x > 0) { + bound = new_bound; + BitstreamPadAlways(bs); + write_video_packet_header(bs, pParam, frame, y*mb_width+x); + } + + mb->quant = frame->quant; + + if (mb->cbp != 0 || pParam->plugin_flags & XVID_REQORIGINAL) { + /* we have to motion-compensate, transfer etc, + because there might be blocks to code */ + + MBMotionCompensationBVOP(pParam, mb, x, y, &frame->image, + f_ref, &pEnc->f_refh, &pEnc->f_refv, + &pEnc->f_refhv, b_ref, &pEnc->vInterH, + &pEnc->vInterV, &pEnc->vInterHV, dct_codes, + data->RefQ); + + mb->cbp = MBTransQuantInterBVOP(pParam, frame, mb, x, y, dct_codes, qcoeff); + } + + if (mb->mode == MODE_DIRECT_NO4V) + mb->mode = MODE_DIRECT; + + if (mb->mode == MODE_DIRECT && (mb->cbp | mb->pmvs[3].x | mb->pmvs[3].y) == 0) + mb->mode = MODE_DIRECT_NONE_MV; /* skipped */ + else + if (frame->vop_flags & XVID_VOP_GREYSCALE) + /* keep only bits 5-2 -- Chroma blocks will just be skipped by MBCodingBVOP */ + mb->cbp &= 0x3C; + + start_timer(); + MBCodingBVOP(frame, mb, qcoeff, frame->fcode, frame->bcode, bs, data->sStat); + stop_coding_timer(); + } + } + + BitstreamPadAlways(bs); /* next_start_code() at the end of VideoObjectPlane() 
*/ + emms(); +} static void FrameCodeB(Encoder * pEnc, @@ -1936,13 +2294,20 @@ Bitstream * bs) { int bits = BitstreamPos(bs); - DECLARE_ALIGNED_MATRIX(dct_codes, 6, 64, int16_t, CACHE_LINE); - DECLARE_ALIGNED_MATRIX(qcoeff, 6, 64, int16_t, CACHE_LINE); - uint32_t x, y; + int k = 0, bound = 0, num_slices = pEnc->num_slices; + int num_threads = MAX(1, MIN(pEnc->num_threads, num_slices)); +#ifdef HAVE_PTHREAD + void * status = NULL; + int threads_per_slice = (pEnc->num_threads*1024 / num_threads); +#endif + int slices_per_thread = (num_slices*1024 / num_threads); IMAGE *f_ref = &pEnc->reference->image; IMAGE *b_ref = &pEnc->current->image; + MBParam * const pParam = &pEnc->mbParam; + int mb_height = pParam->mb_height; + #ifdef BFRAMES_DEC_DEBUG FILE *fp; static char first=0; @@ -1950,8 +2315,6 @@ fprintf(fp,"Y=%3d X=%3d MB=%2d CBP=%02X\n",y,x,mb->mode,mb->cbp); \ } - /* XXX: pEnc->current->global_flags &= ~XVID_VOP_REDUCED; reduced resoltion not yet supported */ - if (!first){ fp=fopen("C:\\XVIDDBGE.TXT","w"); } @@ -1961,13 +2324,13 @@ if (!pEnc->reference->is_edged) { image_setedges(f_ref, pEnc->mbParam.edged_width, pEnc->mbParam.edged_height, pEnc->mbParam.width, - pEnc->mbParam.height, 0); - pEnc->current->is_edged = 1; + pEnc->mbParam.height, XVID_BS_VERSION); + pEnc->reference->is_edged = 1; } if (pEnc->reference->is_interpolated != 0) { start_timer(); - image_interpolate(f_ref, &pEnc->f_refh, &pEnc->f_refv, &pEnc->f_refhv, + image_interpolate(f_ref->y, pEnc->f_refh.y, pEnc->f_refv.y, pEnc->f_refhv.y, pEnc->mbParam.edged_width, pEnc->mbParam.edged_height, (pEnc->mbParam.vol_flags & XVID_VOL_QUARTERPEL), 0); stop_inter_timer(); @@ -1978,13 +2341,13 @@ if (!pEnc->current->is_edged) { image_setedges(b_ref, pEnc->mbParam.edged_width, pEnc->mbParam.edged_height, pEnc->mbParam.width, - pEnc->mbParam.height, 0); + pEnc->mbParam.height, XVID_BS_VERSION); pEnc->current->is_edged = 1; } if (pEnc->current->is_interpolated != 0) { start_timer(); - image_interpolate(b_ref, 
&pEnc->vInterH, &pEnc->vInterV, &pEnc->vInterHV, + image_interpolate(b_ref->y, pEnc->vInterH.y, pEnc->vInterV.y, pEnc->vInterHV.y, pEnc->mbParam.edged_width, pEnc->mbParam.edged_height, (pEnc->mbParam.vol_flags & XVID_VOL_QUARTERPEL), 0); stop_inter_timer(); @@ -1992,80 +2355,147 @@ } frame->coding_type = B_VOP; - call_plugins(pEnc, pEnc->current, NULL, XVID_PLG_FRAME, NULL, NULL, NULL); + + if ((frame->vop_flags & XVID_VOP_RD_PSNRHVSM) && (frame->vop_flags & XVID_VOP_RD_BVOP)) { + image_block_variance(&frame->image, pEnc->mbParam.edged_width, frame->mbs, + pEnc->mbParam.mb_width, pEnc->mbParam.mb_height); + } + + call_plugins(pEnc, frame, NULL, XVID_PLG_FRAME, NULL, NULL, NULL); + + frame->fcode = frame->bcode = pEnc->current->fcode; start_timer(); - MotionEstimationBVOP(&pEnc->mbParam, frame, - ((int32_t)(pEnc->current->stamp - frame->stamp)), /* time_bp */ - ((int32_t)(pEnc->current->stamp - pEnc->reference->stamp)), /* time_pp */ - pEnc->reference->mbs, f_ref, - &pEnc->f_refh, &pEnc->f_refv, &pEnc->f_refhv, - pEnc->current, b_ref, &pEnc->vInterH, - &pEnc->vInterV, &pEnc->vInterHV); + +#ifdef HAVE_PTHREAD + if (pEnc->num_threads > 0) { + + /* multithreaded motion estimation - dispatch threads */ + while (k < pEnc->num_threads) { + int i, add_s = (slices_per_thread + 512) >> 10; + int add_t = (threads_per_slice + 512) >> 10; + + int start_y = (bound * mb_height + (num_slices-1)) / num_slices; + int stop_y = ((bound+add_s) * mb_height + (num_slices-1)) / num_slices; + int rows_per_thread = (stop_y - start_y + add_t - 1) / add_t; + + slices_per_thread += ((num_slices*1024 / num_threads) - add_s*1024); + threads_per_slice += ((pEnc->num_threads*1024 / num_threads) - add_t*1024); + + for (i = 0; i < add_t; i++) { + memset(pEnc->smpData[k+i].complete_count_self, 0, rows_per_thread * sizeof(int)); + + pEnc->smpData[k+i].pEnc = (void *) pEnc; + pEnc->smpData[k+i].current = frame; + + pEnc->smpData[k+i].y_row = i; + pEnc->smpData[k+i].y_step = add_t; + 
pEnc->smpData[k+i].stop_y = stop_y; + pEnc->smpData[k+i].start_y = start_y; + + /* todo: sort out temp space once and for all */ + pEnc->smpData[k+i].RefQ = (((k+i)&1) ? pEnc->vInterV.u : pEnc->vInterV.v) + + 16*((k+i)>>1)*pParam->edged_width; + } + + pEnc->smpData[k].complete_count_above = + pEnc->smpData[k+add_t-1].complete_count_self - 1; + + bound += add_s; + k += add_t; + } + + for (k = 1; k < pEnc->num_threads; k++) { + pthread_create(&pEnc->smpData[k].handle, NULL, + (void*)SMPMotionEstimationBVOP, (void*)&pEnc->smpData[k]); + } + + SMPMotionEstimationBVOP(&pEnc->smpData[0]); + + for (k = 1; k < pEnc->num_threads; k++) { + pthread_join(pEnc->smpData[k].handle, &status); + } + + frame->fcode = frame->bcode = 0; + for (k = 0; k < pEnc->num_threads; k++) { + if (pEnc->smpData[k].minfcode > frame->fcode) + frame->fcode = pEnc->smpData[k].minfcode; + if (pEnc->smpData[k].minbcode > frame->bcode) + frame->bcode = pEnc->smpData[k].minbcode; + } + } else +#endif + { + + MotionEstimationBVOP(&pEnc->mbParam, frame, + ((int32_t)(pEnc->current->stamp - frame->stamp)), /* time_bp */ + ((int32_t)(pEnc->current->stamp - pEnc->reference->stamp)), /* time_pp */ + pEnc->reference->mbs, f_ref, + &pEnc->f_refh, &pEnc->f_refv, &pEnc->f_refhv, + pEnc->current, b_ref, &pEnc->vInterH, + &pEnc->vInterV, &pEnc->vInterHV, + pEnc->num_slices); + } stop_motion_timer(); set_timecodes(frame, pEnc->reference,pEnc->mbParam.fbase); BitstreamWriteVopHeader(bs, &pEnc->mbParam, frame, 1, frame->quant); + /* reset stats */ frame->sStat.iTextBits = 0; + frame->sStat.iMVBits = 0; frame->sStat.iMvSum = 0; frame->sStat.iMvCount = 0; frame->sStat.kblks = frame->sStat.mblks = frame->sStat.ublks = 0; frame->sStat.mblks = pEnc->mbParam.mb_width * pEnc->mbParam.mb_height; frame->sStat.kblks = frame->sStat.ublks = 0; + + /* multithreaded inter coding - dispatch threads */ + bound = 0; + slices_per_thread = (num_slices*1024 / num_threads); + + for (k = 0; k < num_threads; k++) { + int add = 
((slices_per_thread + 512) >> 10); - for (y = 0; y < pEnc->mbParam.mb_height; y++) { - for (x = 0; x < pEnc->mbParam.mb_width; x++) { - MACROBLOCK * const mb = &frame->mbs[x + y * pEnc->mbParam.mb_width]; - - /* decoder ignores mb when refence block is INTER(0,0), CBP=0 */ - if (mb->mode == MODE_NOT_CODED) { - if (pEnc->mbParam.plugin_flags & XVID_REQORIGINAL) { - MBMotionCompensation(mb, x, y, f_ref, NULL, f_ref, NULL, NULL, &frame->image, - NULL, 0, 0, pEnc->mbParam.edged_width, 0, 0, 0); - } - - continue; - } + slices_per_thread += ((num_slices*1024 / num_threads) - add*1024); - if (mb->mode != MODE_DIRECT_NONE_MV || pEnc->mbParam.plugin_flags & XVID_REQORIGINAL) { - MBMotionCompensationBVOP(&pEnc->mbParam, mb, x, y, &frame->image, - f_ref, &pEnc->f_refh, &pEnc->f_refv, - &pEnc->f_refhv, b_ref, &pEnc->vInterH, - &pEnc->vInterV, &pEnc->vInterHV, - dct_codes); - - if (mb->mode == MODE_DIRECT_NO4V) mb->mode = MODE_DIRECT; - mb->quant = frame->quant; - - if (mb->mode != MODE_DIRECT_NONE_MV) - mb->cbp = MBTransQuantInterBVOP(&pEnc->mbParam, frame, mb, x, y, dct_codes, qcoeff); - - if ( (mb->mode == MODE_DIRECT) && (mb->cbp == 0) - && (mb->pmvs[3].x == 0) && (mb->pmvs[3].y == 0) ) { - mb->mode = MODE_DIRECT_NONE_MV; /* skipped */ - } - } + pEnc->smpData[k].pEnc = (void *) pEnc; + pEnc->smpData[k].current = frame; + pEnc->smpData[k].stop_y = (((bound+add) * mb_height + (num_slices-1)) / num_slices); + pEnc->smpData[k].start_y = ((bound * mb_height + (num_slices-1)) / num_slices); + bound += add; - /* keep only bits 5-2 -- Chroma blocks will just be skipped by the - * coding function for BFrames, that's why we don't zero teh DC - * coeffs */ - if ((frame->vop_flags & XVID_VOP_GREYSCALE)) - mb->cbp &= 0x3C; + /* todo: sort out temp space once and for all */ + pEnc->smpData[k].RefQ = ((k&1) ? 
pEnc->vInterV.u : pEnc->vInterV.v) + 16*(k>>1)*pParam->edged_width; - start_timer(); - MBCodingBVOP(frame, mb, qcoeff, frame->fcode, frame->bcode, bs, - &frame->sStat); - stop_coding_timer(); + if (k > 0) { + BitstreamReset(pEnc->smpData[k].bs); + pEnc->smpData[k].sStat->iTextBits = pEnc->smpData[k].sStat->kblks = + pEnc->smpData[k].sStat->mblks = pEnc->smpData[k].sStat->ublks = pEnc->smpData[k].sStat->iMVBits = 0; } } - emms(); +#ifdef HAVE_PTHREAD + for (k = 1; k < num_threads; k++) { + pthread_create(&pEnc->smpData[k].handle, NULL, + (void*)SliceCodeB, (void*)&pEnc->smpData[k]); + } +#endif - /* TODO: dynamic fcode/bcode ??? */ + pEnc->smpData[0].bs = bs; + pEnc->smpData[0].sStat = &frame->sStat; + SliceCodeB(&pEnc->smpData[0]); + +#ifdef HAVE_PTHREAD + for (k = 1; k < num_threads; k++) { + pthread_join(pEnc->smpData[k].handle, &status); + } +#endif - BitstreamPadAlways(bs); /* next_start_code() at the end of VideoObjectPlane() */ - frame->length = (BitstreamPos(bs) - bits) / 8; + frame->length = BitstreamLength(bs) - (bits/8); + + /* reassemble the pieces together */ + SerializeBitstreams(pEnc, frame, bs, num_threads); #ifdef BFRAMES_DEC_DEBUG if (!first){