--- plugin_psnrhvsm.c 2010/10/10 19:19:46 1.1 +++ plugin_psnrhvsm.c 2010/11/28 15:18:21 1.4 @@ -19,7 +19,7 @@ * along with this program ; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * $Id: plugin_psnrhvsm.c,v 1.1 2010/10/10 19:19:46 Isibaar Exp $ + * $Id: plugin_psnrhvsm.c,v 1.4 2010/11/28 15:18:21 Isibaar Exp $ * ****************************************************************************/ @@ -43,48 +43,53 @@ #include "../xvid.h" #include "../dct/fdct.h" #include "../image/image.h" +#include "../motion/sad.h" #include "../utils/mem_transfer.h" #include "../utils/emms.h" typedef struct { - int pixels; /* width*height */ - uint64_t mse_sum; /* for avrg psnrhvsm */ + uint64_t mse_sum_y; /* for avrg psnr-hvs-m */ + uint64_t mse_sum_u; + uint64_t mse_sum_v; + long frame_cnt; } psnrhvsm_data_t; /* internal plugin data */ -static const float CSF_Coeff[64] = - { 1.608443f, 2.339554f, 2.573509f, 1.608443f, 1.072295f, 0.643377f, 0.504610f, 0.421887f, - 2.144591f, 2.144591f, 1.838221f, 1.354478f, 0.989811f, 0.443708f, 0.428918f, 0.467911f, - 1.838221f, 1.979622f, 1.608443f, 1.072295f, 0.643377f, 0.451493f, 0.372972f, 0.459555f, - 1.838221f, 1.513829f, 1.169777f, 0.887417f, 0.504610f, 0.295806f, 0.321689f, 0.415082f, - 1.429727f, 1.169777f, 0.695543f, 0.459555f, 0.378457f, 0.236102f, 0.249855f, 0.334222f, - 1.072295f, 0.735288f, 0.467911f, 0.402111f, 0.317717f, 0.247453f, 0.227744f, 0.279729f, - 0.525206f, 0.402111f, 0.329937f, 0.295806f, 0.249855f, 0.212687f, 0.214459f, 0.254803f, - 0.357432f, 0.279729f, 0.270896f, 0.262603f, 0.229778f, 0.257351f, 0.249855f, 0.259950f - }; - -static const float Mask_Coeff[64] = - { 0.000000f, 0.826446f, 1.000000f, 0.390625f, 0.173611f, 0.062500f, 0.038447f, 0.026874f, - 0.694444f, 0.694444f, 0.510204f, 0.277008f, 0.147929f, 0.029727f, 0.027778f, 0.033058f, - 0.510204f, 0.591716f, 0.390625f, 0.173611f, 0.062500f, 0.030779f, 0.021004f, 0.031888f, - 0.510204f, 0.346021f, 0.206612f, 0.118906f, 0.038447f, 0.013212f, 0.015625f, 0.026015f, - 0.308642f, 0.206612f, 0.073046f, 0.031888f, 0.021626f, 0.008417f, 0.009426f, 0.016866f, - 0.173611f, 0.081633f, 0.033058f, 0.024414f, 0.015242f, 0.009246f, 0.007831f, 0.011815f, - 0.041649f, 0.024414f, 0.016437f, 0.013212f, 0.009426f, 0.006830f, 0.006944f, 0.009803f, - 0.019290f, 0.011815f, 0.011080f, 0.010412f, 0.007972f, 0.010000f, 0.009426f, 0.010203f - }; -#if 1 /* Floating-point implementation */ +#if 0 /* Floating-point implementation: Slow but accurate */ + +static const float CSF_Coeff[64] = { + 1.608443f, 2.339554f, 2.573509f, 1.608443f, 1.072295f, 0.643377f, 0.504610f, 0.421887f, + 2.144591f, 2.144591f, 1.838221f, 1.354478f, 0.989811f, 0.443708f, 0.428918f, 0.467911f, + 1.838221f, 1.979622f, 1.608443f, 1.072295f, 0.643377f, 0.451493f, 0.372972f, 0.459555f, + 1.838221f, 1.513829f, 1.169777f, 0.887417f, 0.504610f, 0.295806f, 0.321689f, 0.415082f, + 1.429727f, 1.169777f, 0.695543f, 0.459555f, 0.378457f, 0.236102f, 0.249855f, 0.334222f, + 1.072295f, 0.735288f, 0.467911f, 0.402111f, 0.317717f, 0.247453f, 0.227744f, 0.279729f, + 0.525206f, 0.402111f, 0.329937f, 0.295806f, 0.249855f, 0.212687f, 0.214459f, 0.254803f, + 0.357432f, 0.279729f, 0.270896f, 0.262603f, 0.229778f, 0.257351f, 0.249855f, 0.259950f +}; + +static const float Mask_Coeff[64] = { + 0.000000f, 0.826446f, 1.000000f, 0.390625f, 0.173611f, 0.062500f, 0.038447f, 0.026874f, + 0.694444f, 0.694444f, 0.510204f, 0.277008f, 0.147929f, 0.029727f, 0.027778f, 0.033058f, + 0.510204f, 0.591716f, 0.390625f, 0.173611f, 0.062500f, 0.030779f, 0.021004f, 0.031888f, + 0.510204f, 0.346021f, 0.206612f, 0.118906f, 0.038447f, 0.013212f, 0.015625f, 0.026015f, + 0.308642f, 0.206612f, 0.073046f, 0.031888f, 0.021626f, 0.008417f, 0.009426f, 0.016866f, + 0.173611f, 0.081633f, 0.033058f, 0.024414f, 0.015242f, 0.009246f, 0.007831f, 0.011815f, + 0.041649f, 0.024414f, 0.016437f, 0.013212f, 0.009426f, 0.006830f, 0.006944f, 0.009803f, + 0.019290f, 0.011815f, 0.011080f, 0.010412f, 0.007972f, 0.010000f, 0.009426f, 0.010203f +}; -static uint32_t Calc_MSE_H(int16_t *DCT_A, int16_t *DCT_B, uint8_t *IMG_A, uint8_t *IMG_B, int stride) +static uint32_t calc_SSE_H(int16_t *DCT_A, int16_t *DCT_B, uint8_t *IMG_A, uint8_t *IMG_B, int stride) { int x, y, i, j; uint32_t Global_A, Global_B, Sum_A = 0, Sum_B = 0; uint32_t Local[8] = {0, 0, 0, 0, 0, 0, 0, 0}; uint32_t Local_Square[8] = {0, 0, 0, 0, 0, 0, 0, 0}; float MASK_A = 0.f, MASK_B = 0.f; + float Mult1 = 1.f, Mult2 = 1.f; uint32_t MSE_H = 0; /* Step 1: Calculate CSF weighted energy of DCT coefficients */ @@ -98,14 +103,16 @@ /* Step 2: Determine local variances compared to entire block variance */ for (y = 0; y < 2; y++) { for (x = 0; x < 2; x++) { - for (i = 0; i < 4; i++) { - uint8_t A = IMG_A[y*4*stride + 4*x + i]; - uint8_t B = IMG_B[y*4*stride + 4*x + i]; - - Local[y*2 + x] += A; - Local[y*2 + x + 4] += B; - Local_Square[y*2 + x] += A*A; - Local_Square[y*2 + x + 4] += B*B; + for (j = 0; j < 4; j++) { + for (i = 0; i < 4; i++) { + uint8_t A = IMG_A[(y*4+j)*stride + 4*x + i]; + uint8_t B = IMG_B[(y*4+j)*stride + 4*x + i]; + + Local[y*2 + x] += A; + Local[y*2 + x + 4] += B; + Local_Square[y*2 + x] += A*A; + Local_Square[y*2 + x + 4] += B*B; + } } } } @@ -123,8 +130,14 @@ Global_B = (Local_Square[4]<<6) - Global_B*Global_B; /* 64*Var(D) */ /* Step 3: Calculate contrast masking threshold */ - MASK_A = (float)sqrt(MASK_A*((float)(Local[0]+Local[1]+Local[2]+Local[3])/((float)Global_A/4.f)))/32.f; - MASK_B = (float)sqrt(MASK_B*((float)(Local[4]+Local[5]+Local[6]+Local[7])/((float)Global_B/4.f)))/32.f; + if (Global_A) + Mult1 = (float)(Local[0]+Local[1]+Local[2]+Local[3])/((float)(Global_A)/4.f); + + if (Global_B) + Mult2 = (float)(Local[4]+Local[5]+Local[6]+Local[7])/((float)(Global_B)/4.f); + + MASK_A = (float)sqrt(MASK_A * Mult1) / 32.f; + MASK_B = (float)sqrt(MASK_B * Mult2) / 32.f; if (MASK_B > MASK_A) MASK_A = MASK_B; /* MAX(MASK_A, MASK_B) */ @@ -140,131 +153,59 @@ u -= (MASK_A / Mask_Coeff[j*8 + i]); } - MSE_H += (uint32_t) ((256.f*(u * CSF_Coeff[j*8 + i])*(u * CSF_Coeff[j*8 + i])) + 0.5f); + MSE_H += (uint32_t) ((16.f*(u * CSF_Coeff[j*8 + i])*(u * CSF_Coeff[j*8 + i])) + 0.5f); } } - return MSE_H; /* Fixed-point value right-shifted by eight */ + return MSE_H; /* Fixed-point value right-shifted by four */ } -#else /* First draft of a fixed-point implementation. - Might serve as a template for MMX/SSE code */ +#else -static const uint16_t iMask_Coeff[64] = - { 0, 59577, 65535, 40959, 27306, 16384, 12850, 10743, - 54612, 54612, 46811, 34492, 25206, 11299, 10923, 11915, - 46811, 50412, 40959, 27306, 16384, 11497, 9498, 11703, - 46811, 38550, 29789, 22598, 12850, 7533, 8192, 10570, - 36408, 29789, 17712, 11703, 9637, 6012, 6363, 8511, - 27306, 18724, 11915, 10240, 8091, 6302, 5799, 7123, - 13374, 10240, 8402, 7533, 6363, 5416, 5461, 6489, - 9102, 7123, 6898, 6687, 5851, 6553, 6363, 6620 - }; - -static const uint16_t Inv_iMask_Coeff[64] = - { 0, 310, 256, 655, 1475, 4096, 6659, 9526, - 369, 369, 502, 924, 1731, 8612, 9216, 7744, - 502, 433, 655, 1475, 4096, 8317, 12188, 8028, - 502, 740, 1239, 2153, 6659, 19376, 16384, 9840, - 829, 1239, 3505, 8028, 11838, 30415, 27159, 15178, - 1475, 3136, 7744, 10486, 16796, 27688, 32691, 21667, - 6147, 10486, 15575, 19376, 27159, 37482, 36866, 26114, - 13271, 21667, 23105, 24587, 32112, 25600, 27159, 25091 - }; - -static const uint16_t iCSF_Coeff[64] = - { 1647, 2396, 2635, 1647, 1098, 659, 517, 432, - 2196, 2196, 1882, 1387, 1014, 454, 439, 479, - 1882, 2027, 1647, 1098, 659, 462, 382, 471, - 1882, 1550, 1198, 909, 517, 303, 329, 425, - 1464, 1198, 712, 471, 388, 242, 256, 342, - 1098, 753, 479, 412, 325, 253, 233, 286, - 538, 412, 338, 303, 256, 218, 220, 261, - 366, 286, 277, 269, 235, 264, 256, 266 - }; - -static uint32_t Calc_MSE_H(int16_t *DCT_A, int16_t *DCT_B, uint8_t *IMG_A, uint8_t *IMG_B, int stride) +static uint32_t calc_SSE_H(int16_t *DCT_A, int16_t *DCT_B, uint8_t *IMG_A, uint8_t *IMG_B, int stride) { - int x, y, i, j; - uint32_t Global_A, Global_B, Sum_A = 0, Sum_B = 0; - uint32_t Local[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - uint32_t Local_Square[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - uint32_t MASK; - uint32_t MSE_H = 0; + DECLARE_ALIGNED_MATRIX(sums, 1, 8, uint16_t, CACHE_LINE); + DECLARE_ALIGNED_MATRIX(squares, 1, 8, uint32_t, CACHE_LINE); + uint32_t i, Global_A, Global_B, Sum_A = 0, Sum_B = 0; + uint32_t local[8], MASK_A, MASK_B, Mult1 = 64, Mult2 = 64; /* Step 1: Calculate CSF weighted energy of DCT coefficients */ - for (y = 0; y < 8; y++) { - for (x = 0; x < 8; x++) { - uint16_t A = (abs(DCT_A[y*8 + x]) * iMask_Coeff[y*8 + x] + 4096) >> 13; - uint16_t B = (abs(DCT_B[y*8 + x]) * iMask_Coeff[y*8 + x] + 4096) >> 13; - - Sum_A += ((A*A) >> 3); /* PMADDWD */ - Sum_B += ((B*B) >> 3); - } - } + + Sum_A = coeff8_energy(DCT_A); + Sum_B = coeff8_energy(DCT_B); /* Step 2: Determine local variances compared to entire block variance */ - for (y = 0; y < 2; y++) { - for (x = 0; x < 2; x++) { - for (i = 0; i < 4; i++) { - uint8_t A = IMG_A[y*4*stride + 4*x + i]; - uint8_t B = IMG_B[y*4*stride + 4*x + i]; - - Local[y*2 + x] += A; - Local[y*2 + x + 4] += B; - Local_Square[y*2 + x] += A*A; - Local_Square[y*2 + x + 4] += B*B; - } - } - } - - Global_A = Local[0] + Local[1] + Local[2] + Local[3]; - Global_B = Local[4] + Local[5] + Local[6] + Local[7]; + + Global_A = blocksum8(IMG_A, stride, sums, squares); + Global_B = blocksum8(IMG_B, stride, &sums[4], &squares[4]); for (i = 0; i < 8; i++) - Local[i] = (Local_Square[i]<<4) - (Local[i]*Local[i]); /* 16*Var(Di) */ + local[i] = (squares[i]<<4) - (sums[i]*sums[i]); /* 16*Var(Di) */ - Local_Square[0] += (Local_Square[1] + Local_Square[2] + Local_Square[3]); - Local_Square[4] += (Local_Square[5] + Local_Square[6] + Local_Square[7]); + squares[0] += (squares[1] + squares[2] + squares[3]); + squares[4] += (squares[5] + squares[6] + squares[7]); - Global_A = (Local_Square[0]<<6) - Global_A*Global_A; /* 64*Var(D) */ - Global_B = (Local_Square[4]<<6) - Global_B*Global_B; /* 64*Var(D) */ + Global_A = (squares[0]<<6) - Global_A*Global_A; /* 64*Var(D) */ + Global_B = (squares[4]<<6) - Global_B*Global_B; /* 64*Var(D) */ /* Step 3: Calculate contrast masking threshold */ - { - float MASK_A, MASK_B; /* TODO */ - - MASK_A = (float)((double)(Sum_A + 4) / 8.); - MASK_B = (float)((double)(Sum_B + 4) / 8.); - - MASK_A = sqrt(MASK_A*((float)(Local[0]+Local[1]+Local[2]+Local[3])/((float)Global_A/4.f)))/32.f; - MASK_B = sqrt(MASK_B*((float)(Local[4]+Local[5]+Local[6]+Local[7])/((float)Global_B/4.f)))/32.f; - - if (MASK_B > MASK_A) /* MAX(MASK_A, MASK_B) */ - MASK = (uint32_t) (MASK_B * 1024.f + 0.5f); - else - MASK = (uint32_t) (MASK_A * 1024.f + 0.5f); - } + + if (Global_A) + Mult1 = ((local[0]+local[1]+local[2]+local[3])<<8) / Global_A; + + if (Global_B) + Mult2 = ((local[4]+local[5]+local[6]+local[7])<<8) / Global_B; + + MASK_A = isqrt(2*Sum_A*Mult1) + 16; + MASK_B = isqrt(2*Sum_B*Mult2) + 16; + + if (MASK_B > MASK_A) /* MAX(MASK_A, MASK_B) */ + MASK_A = ((MASK_B + 32) >> 6); + else + MASK_A = ((MASK_A + 32) >> 6); /* Step 4: Calculate MSE of DCT coeffs reduced by masking effect */ - for (j = 0; j < 8; j++) { - for (i = 0; i < 8; i++) { - uint32_t Thresh = (MASK * Inv_iMask_Coeff[j*8 + i] + 128) >> 8; - uint32_t u = abs(DCT_A[j*8 + i] - DCT_B[j*8 + i]) << 10; - - if ((i|j)>0) { - if (u < Thresh) - u = 0; /* The error is not perceivable */ - else - u -= Thresh; - } - - { - uint64_t tmp = (u * iCSF_Coeff[j*8 + i] + 512) >> 10; - MSE_H += (uint32_t) ((tmp * tmp) >> 12); - } - } - } - return MSE_H; + + return sseh8_16bit(DCT_A, DCT_B, (uint16_t) MASK_A); } #endif @@ -272,17 +213,17 @@ static void psnrhvsm_after(xvid_plg_data_t *data, psnrhvsm_data_t *psnrhvsm) { DECLARE_ALIGNED_MATRIX(DCT, 2, 64, int16_t, CACHE_LINE); - int x, y, stride = data->original.stride[0]; + int32_t x, y, u, v; int16_t *DCT_A = &DCT[0], *DCT_B = &DCT[64]; - uint8_t *IMG_A = (uint8_t *) data->original.plane[0]; - uint8_t *IMG_B = (uint8_t *) data->current.plane[0]; - uint32_t MSE_H = 0; + uint64_t sse_y = 0, sse_u = 0, sse_v = 0; - psnrhvsm->pixels = data->width * data->height; + for (y = 0; y < data->height>>3; y++) { + uint8_t *IMG_A = (uint8_t *) data->original.plane[0]; + uint8_t *IMG_B = (uint8_t *) data->current.plane[0]; + uint32_t stride = data->original.stride[0]; - for (y = 0; y < data->height; y += 8) { - for (x = 0; x < data->width; x += 8) { - int offset = y*stride + x; + for (x = 0; x < data->width>>3; x++) { /* non multiple of 8 handling ?? */ + int offset = (y<<3)*stride + (x<<3); emms(); @@ -296,15 +237,63 @@ emms(); - /* Calculate MSE_H reduced by contrast masking effect */ - MSE_H += Calc_MSE_H(DCT_A, DCT_B, IMG_A + offset, IMG_B + offset, stride); + /* Calculate SSE_H reduced by contrast masking effect */ + sse_y += calc_SSE_H(DCT_A, DCT_B, IMG_A + offset, IMG_B + offset, stride); + } + } + + for (y = 0; y < data->height>>4; y++) { + uint8_t *U_A = (uint8_t *) data->original.plane[1]; + uint8_t *V_A = (uint8_t *) data->original.plane[2]; + uint8_t *U_B = (uint8_t *) data->current.plane[1]; + uint8_t *V_B = (uint8_t *) data->current.plane[2]; + uint32_t stride_uv = data->current.stride[1]; + + for (x = 0; x < data->width>>4; x++) { /* non multiple of 8 handling ?? */ + int offset = (y<<3)*stride_uv + (x<<3); + + emms(); + + /* Transfer data */ + transfer_8to16copy(DCT_A, U_A + offset, stride_uv); + transfer_8to16copy(DCT_B, U_B + offset, stride_uv); + + /* Perform DCT */ + fdct(DCT_A); + fdct(DCT_B); + + emms(); + + /* Calculate SSE_H reduced by contrast masking effect */ + sse_u += calc_SSE_H(DCT_A, DCT_B, U_A + offset, U_B + offset, stride_uv); + + emms(); + + /* Transfer data */ + transfer_8to16copy(DCT_A, V_A + offset, stride_uv); + transfer_8to16copy(DCT_B, V_B + offset, stride_uv); + + /* Perform DCT */ + fdct(DCT_A); + fdct(DCT_B); + + emms(); + + /* Calculate SSE_H reduced by contrast masking effect */ + sse_v += calc_SSE_H(DCT_A, DCT_B, V_A + offset, V_B + offset, stride_uv); } } - psnrhvsm->mse_sum += MSE_H; + y = (int32_t) ( 4*16*sse_y / (data->width * data->height)); + u = (int32_t) (16*16*sse_u / (data->width * data->height)); + v = (int32_t) (16*16*sse_v / (data->width * data->height)); + + psnrhvsm->mse_sum_y += y; + psnrhvsm->mse_sum_u += u; + psnrhvsm->mse_sum_v += v; psnrhvsm->frame_cnt++; - printf(" psnrhvsm: %2.2f\n", sse_to_PSNR(MSE_H, 256*psnrhvsm->pixels)); + printf(" psnrhvsm y: %2.2f, psnrhvsm u: %2.2f, psnrhvsm v: %2.2f\n", sse_to_PSNR(y, 1024), sse_to_PSNR(u, 1024), sse_to_PSNR(v, 1024)); } static int psnrhvsm_create(xvid_plg_create_t *create, void **handle) @@ -312,10 +301,11 @@ psnrhvsm_data_t *psnrhvsm; psnrhvsm = (psnrhvsm_data_t *) malloc(sizeof(psnrhvsm_data_t)); - psnrhvsm->mse_sum = 0; - psnrhvsm->frame_cnt = 0; + psnrhvsm->mse_sum_y = 0; + psnrhvsm->mse_sum_u = 0; + psnrhvsm->mse_sum_v = 0; - psnrhvsm->pixels = 0; + psnrhvsm->frame_cnt = 0; *(handle) = (void*) psnrhvsm; return 0; @@ -338,14 +328,17 @@ break; case(XVID_PLG_DESTROY): { - uint32_t MSE_H; + uint32_t y, u, v; psnrhvsm_data_t *psnrhvsm = (psnrhvsm_data_t *)handle; if (psnrhvsm) { - MSE_H = (uint32_t) (psnrhvsm->mse_sum / psnrhvsm->frame_cnt); + y = (uint32_t) (psnrhvsm->mse_sum_y / psnrhvsm->frame_cnt); + u = (uint32_t) (psnrhvsm->mse_sum_u / psnrhvsm->frame_cnt); + v = (uint32_t) (psnrhvsm->mse_sum_v / psnrhvsm->frame_cnt); emms(); - printf("Average psnrhvsm: %2.2f\n", sse_to_PSNR(MSE_H, 256*psnrhvsm->pixels)); + printf("Average psnrhvsm y: %2.2f, psnrhvsm u: %2.2f, psnrhvsm v: %2.2f\n", + sse_to_PSNR(y, 1024), sse_to_PSNR(u, 1024), sse_to_PSNR(v, 1024)); free(psnrhvsm); } }