--- quantize_mpeg_mmx.asm 2003/10/28 22:23:03 1.1.2.3 +++ quantize_mpeg_mmx.asm 2004/08/29 10:02:38 1.5 @@ -21,7 +21,7 @@ ; * along with this program ; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; * $Id: quantize_mpeg_mmx.asm,v 1.1.2.3 2003/10/28 22:23:03 edgomez Exp $ +; * $Id: quantize_mpeg_mmx.asm,v 1.5 2004/08/29 10:02:38 edgomez Exp $ ; * ; *************************************************************************/ @@ -31,10 +31,19 @@ %macro cglobal 1 %ifdef PREFIX - global _%1 - %define %1 _%1 + %ifdef MARK_FUNCS + global _%1:function %1.endfunc-%1 + %define %1 _%1:function %1.endfunc-%1 + %else + global _%1 + %define %1 _%1 + %endif %else - global %1 + %ifdef MARK_FUNCS + global %1:function %1.endfunc-%1 + %else + global %1 + %endif %endif %endmacro @@ -51,7 +60,11 @@ ; Local data (Read Only) ;============================================================================= +%ifdef FORMAT_COFF SECTION .rodata +%else +SECTION .rodata align=16 +%endif mmx_one: times 4 dw 1 @@ -72,21 +85,6 @@ %assign quant quant+1 %endrep -;----------------------------------------------------------------------------- -; intra matrix -;----------------------------------------------------------------------------- - -cextern intra_matrix -cextern intra_matrix_fix - -;----------------------------------------------------------------------------- -; inter matrix -;----------------------------------------------------------------------------- - -cextern inter_matrix -cextern inter_matrix_fix - - %define VM18P 3 %define VM18Q 4 @@ -146,7 +144,8 @@ ; uint32_t quant_mpeg_intra_mmx(int16_t * coeff, ; const int16_t const * data, ; const uint32_t quant, -; const uint32_t dcscalar); +; const uint32_t dcscalar, +; const uint16_t *mpeg_matrices); ; ;----------------------------------------------------------------------------- @@ -156,10 +155,12 @@ push ecx push esi push edi + push ebx - mov edi, [esp + 12 + 4] ; coeff - mov esi, [esp + 12 + 8] ; data - mov eax, [esp + 12 + 12] ; quant + mov edi, [esp + 16 + 4] ; coeff + mov esi, [esp + 16 + 8] ; data + mov eax, [esp + 16 + 12] ; quant + mov ebx, [esp + 16 + 20] ; mpeg_quant_matrices movq mm5, [quantd + eax * 8 - 8] ; quantd -> mm5 @@ -186,15 +187,15 @@ psubw mm3, mm4 ; psllw mm0, 4 ; level << 4 psllw mm3, 4 - movq mm2, [intra_matrix + 8*ecx] + movq mm2, [ebx + 8*ecx] psrlw mm2, 1 ; intra_matrix[i]>>1 paddw mm0, mm2 - movq mm2, [intra_matrix_fix + ecx*8] + movq mm2, [ebx + 256 + ecx*8] pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i] - movq mm2, [intra_matrix + 8*ecx + 8] + movq mm2, [ebx + 8*ecx + 8] psrlw mm2, 1 paddw mm3, mm2 - movq mm2, [intra_matrix_fix + ecx*8 + 8] + movq mm2, [ebx + 256 + ecx*8 + 8] pmulhw mm3, mm2 paddw mm0, mm5 ; + quantd paddw mm3, mm5 @@ -216,7 +217,7 @@ .done ; caclulate data[0] // (int32_t)dcscalar) - mov ecx, [esp + 12 + 16] ; dcscalar + mov ecx, [esp + 16 + 16] ; dcscalar mov edx, ecx movsx eax, word [esi] ; data[0] shr edx, 1 ; edx = dcscalar /2 @@ -233,6 +234,7 @@ mov [edi], ax ; coeff[0] = ax + pop ebx pop edi pop esi pop ecx @@ -254,15 +256,15 @@ psubw mm3, mm4 ; psllw mm0, 4 psllw mm3, 4 - movq mm2, [intra_matrix + 8*ecx] + movq mm2, [ebx + 8*ecx] psrlw mm2, 1 paddw mm0, mm2 - movq mm2, [intra_matrix_fix + ecx*8] + movq mm2, [ebx + 256 + ecx*8] pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i] - movq mm2, [intra_matrix + 8*ecx + 8] + movq mm2, [ebx + 8*ecx + 8] psrlw mm2, 1 paddw mm3, mm2 - movq mm2, [intra_matrix_fix + ecx*8 + 8] + movq mm2, [ebx + 256 + ecx*8 + 8] pmulhw mm3, mm2 paddw mm0, mm5 paddw mm3, mm5 @@ -295,15 +297,15 @@ psubw mm3, mm4 ; psllw mm0, 4 psllw mm3, 4 - movq mm2, [intra_matrix + 8*ecx] + movq mm2, [ebx + 8*ecx] psrlw mm2, 1 paddw mm0, mm2 - movq mm2, [intra_matrix_fix + ecx*8] + movq mm2, [ebx + 256 + ecx*8] pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i] - movq mm2, [intra_matrix + 8*ecx + 8] + movq mm2, [ebx + 8*ecx + 8] psrlw mm2, 1 paddw mm3, mm2 - movq mm2, [intra_matrix_fix + ecx*8 + 8] + movq mm2, [ebx + 256 + ecx*8 + 8] pmulhw mm3, mm2 paddw mm0, mm5 paddw mm3, mm5 @@ -320,13 +322,14 @@ cmp ecx,16 jnz near .q2loop jmp near .done - +.endfunc ;----------------------------------------------------------------------------- ; ; uint32_t quant_mpeg_inter_mmx(int16_t * coeff, ; const int16_t const * data, -; const uint32_t quant); +; const uint32_t quant, +; const uint16_t *mpeg_matrices); ; ;----------------------------------------------------------------------------- @@ -336,10 +339,12 @@ push ecx push esi push edi + push ebx - mov edi, [esp + 12 + 4] ; coeff - mov esi, [esp + 12 + 8] ; data - mov eax, [esp + 12 + 12] ; quant + mov edi, [esp + 16 + 4] ; coeff + mov esi, [esp + 16 + 8] ; data + mov eax, [esp + 16 + 12] ; quant + mov ebx, [esp + 16 + 16] ; mpeg_quant_matrices xor ecx, ecx @@ -367,15 +372,15 @@ psubw mm3, mm4 ; psllw mm0, 4 psllw mm3, 4 - movq mm2, [inter_matrix + 8*ecx] + movq mm2, [ebx + 512 + 8*ecx] psrlw mm2, 1 paddw mm0, mm2 - movq mm2, [inter_matrix_fix + ecx*8] + movq mm2, [ebx + 768 + ecx*8] pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i] - movq mm2, [inter_matrix + 8*ecx + 8] + movq mm2, [ebx + 512 + 8*ecx + 8] psrlw mm2, 1 paddw mm3, mm2 - movq mm2, [inter_matrix_fix + ecx*8 + 8] + movq mm2, [ebx + 768 + ecx*8 + 8] pmulhw mm3, mm2 pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16 pmulhw mm3, mm7 ; @@ -401,6 +406,7 @@ paddd mm0, mm5 movd eax, mm0 ; return sum + pop ebx pop edi pop esi pop ecx @@ -421,15 +427,15 @@ psubw mm3, mm4 ; psllw mm0, 4 psllw mm3, 4 - movq mm2, [inter_matrix + 8*ecx] + movq mm2, [ebx + 512 + 8*ecx] psrlw mm2, 1 paddw mm0, mm2 - movq mm2, [inter_matrix_fix + ecx*8] + movq mm2, [ebx + 768 + ecx*8] pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i] - movq mm2, [inter_matrix + 8*ecx + 8] + movq mm2, [ebx + 512 + 8*ecx + 8] psrlw mm2, 1 paddw mm3, mm2 - movq mm2, [inter_matrix_fix + ecx*8 + 8] + movq mm2, [ebx + 768 + ecx*8 + 8] pmulhw mm3, mm2 psrlw mm0, 1 ; mm0 >>= 1 (/2) psrlw mm3, 1 ; @@ -448,7 +454,6 @@ jmp .done - ALIGN 16 .q2loop movq mm0, [esi + 8*ecx] ; mm0 = [1st] @@ -463,15 +468,15 @@ psubw mm3, mm4 ; psllw mm0, 4 psllw mm3, 4 - movq mm2, [inter_matrix + 8*ecx] + movq mm2, [ebx + 512 + 8*ecx] psrlw mm2, 1 paddw mm0, mm2 - movq mm2, [inter_matrix_fix + ecx*8] + movq mm2, [ebx + 768 + ecx*8] pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i] - movq mm2, [inter_matrix + 8*ecx + 8] + movq mm2, [ebx + 512 + 8*ecx + 8] psrlw mm2, 1 paddw mm3, mm2 - movq mm2, [inter_matrix_fix + ecx*8 + 8] + movq mm2, [ebx + 768 + ecx*8 + 8] pmulhw mm3, mm2 psrlw mm0, 2 ; mm0 >>= 1 (/2) psrlw mm3, 2 ; @@ -489,6 +494,7 @@ jnz near .q2loop jmp .done +.endfunc ;----------------------------------------------------------------------------- @@ -496,7 +502,8 @@ ; uint32_t dequant_mpeg_intra_mmx(int16_t *data, ; const int16_t const *coeff, ; const uint32_t quant, -; const uint32_t dcscalar); +; const uint32_t dcscalar, +; const uint16_t *mpeg_matrices); ; ;----------------------------------------------------------------------------- @@ -519,7 +526,7 @@ psubw mm0, mm1 ; -> mm0 = abs(coeff[i]), mm1 = sign of coeff[i] movq mm2, mm7 ; mm2 = quant - pmullw mm2, [intra_matrix + 8*eax + 8*16 ] ; matrix[i]*quant. + pmullw mm2, [ebx + 8*eax + 8*16 ] ; matrix[i]*quant. movq mm6, mm2 pmulhw mm2, mm0 ; high of coeff*(matrix*quant) (should be 0 if no overflow) @@ -542,9 +549,12 @@ ALIGN 16 dequant_mpeg_intra_mmx: - mov edx, [esp+4] ; data - mov ecx, [esp+8] ; coeff - mov eax, [esp+12] ; quant + push ebx + + mov edx, [esp + 4 + 4] ; data + mov ecx, [esp + 4 + 8] ; coeff + mov eax, [esp + 4 + 12] ; quant + mov ebx, [esp + 4 + 20] ; mpeg_quant_matrices movq mm7, [mmx_mul_quant + eax*8 - 8] mov eax, -16 ; to keep ALIGNed, we regularly process coeff[0] @@ -561,7 +571,7 @@ movq mm2, mm7 ; mm2 = quant pcmpgtw mm4, mm3 ; mm4 = sgn(c') - pmullw mm2, [intra_matrix + 8*eax + 8*16 ] ; matrix[i]*quant + pmullw mm2, [ebx + 8*eax + 8*16 ] ; matrix[i]*quant pxor mm0, mm1 ; negate if negative pxor mm3, mm4 ; negate if negative @@ -577,7 +587,7 @@ pmulhw mm0, mm5 ; high of coeff*(matrix*quant) movq mm5, mm7 ; mm2 = quant - pmullw mm5, [intra_matrix + 8*eax + 8*16 +8] ; matrix[i+1]*quant + pmullw mm5, [ebx + 8*eax + 8*16 +8] ; matrix[i+1]*quant movq mm6, mm5 add eax,2 ; z-flag will be tested later @@ -609,7 +619,7 @@ ; deal with DC movd mm0, [ecx] - pmullw mm0, [esp+16] ; dcscalar + pmullw mm0, [esp + 4 + 16] ; dcscalar movq mm2, [mmx_32767_minus_2047] paddsw mm0, mm2 psubsw mm0, mm2 @@ -620,13 +630,18 @@ mov [edx], ax xor eax, eax + + pop ebx + ret +.endfunc ;----------------------------------------------------------------------------- ; ; uint32_t dequant_mpeg_inter_mmx(int16_t * data, ; const int16_t * const coeff, -; const uint32_t quant); +; const uint32_t quant, +; const uint16_t *mpeg_matrices); ; ;----------------------------------------------------------------------------- @@ -638,9 +653,13 @@ ALIGN 16 dequant_mpeg_inter_mmx: - mov edx, [esp+ 4] ; data - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant + push ebx + + mov edx, [esp + 4 + 4] ; data + mov ecx, [esp + 4 + 8] ; coeff + mov eax, [esp + 4 + 12] ; quant + mov ebx, [esp + 4 + 16] ; mpeg_quant_matrices + movq mm7, [mmx_mul_quant + eax*8 - 8] mov eax, -16 paddw mm7, mm7 ; << 1 @@ -675,13 +694,13 @@ ; we're short on register, here. Poor pairing... movq mm4, mm7 ; (matrix*quant) - pmullw mm4, [inter_matrix + 8*eax + 8*16 -2*8] + pmullw mm4, [ebx + 512 + 8*eax + 8*16 -2*8] movq mm5, mm4 pmulhw mm5, mm0 ; high of c*(matrix*quant) pmullw mm0, mm4 ; low of c*(matrix*quant) movq mm4, mm7 ; (matrix*quant) - pmullw mm4, [inter_matrix + 8*eax + 8*16 -2*8 + 8] + pmullw mm4, [ebx + 512 + 8*eax + 8*16 -2*8 + 8] pcmpgtw mm5, [zero] paddusw mm0, mm5 @@ -725,4 +744,9 @@ xor word [edx + 2*63], ax xor eax, eax + + pop ebx + ret +.endfunc +