--- quantize4_xmm.asm	2002/12/19 00:40:55	1.1
+++ quantize4_xmm.asm	2003/02/15 15:22:19	1.2
@@ -0,0 +1,818 @@

;/******************************************************************************
; *                                                                            *
; *  This file is part of XviD, a free MPEG-4 video encoder/decoder            *
; *                                                                            *
; *  XviD is an implementation of a part of one or more MPEG-4 Video tools     *
; *  as specified in ISO/IEC 14496-2 standard.  Those intending to use this    *
; *  software module in hardware or software products are advised that its     *
; *  use may infringe existing patents or copyrights, and any such use         *
; *  would be at such party's own risk.  The original developer of this        *
; *  software module and his/her company, and subsequent editors and their     *
; *  companies, will have no liability for use of this software or             *
; *  modifications or derivatives thereof.                                     *
; *                                                                            *
; *  XviD is free software; you can redistribute it and/or modify it           *
; *  under the terms of the GNU General Public License as published by         *
; *  the Free Software Foundation; either version 2 of the License, or         *
; *  (at your option) any later version.                                       *
; *                                                                            *
; *  XviD is distributed in the hope that it will be useful, but               *
; *  WITHOUT ANY WARRANTY; without even the implied warranty of                *
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             *
; *  GNU General Public License for more details.                              *
; *                                                                            *
; *  You should have received a copy of the GNU General Public License         *
; *  along with this program; if not, write to the Free Software               *
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA   *
; *                                                                            *
; ******************************************************************************/
;
;/******************************************************************************
; *  Two quant4 bugs have been fixed:  (a) the overflow bug for matrix         *
; *  elements equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw     *
; *  (iSSE) and using the multiplier 0ffffh instead of 10001h (for matrix      *
; *  element = 1; in that case, 1 is added before multiplying, and that        *
; *  additional 1 comes from intra_matrix1);  (b) the rounding error for       *
; *  large coefficients and matrix elements is fixed by a two-step approach:   *
; *  a first approximation (rounded down) is found as usual; that result is    *
; *  multiplied by the matrix element, and the mismatch is used to compute     *
; *  the correction.                                                           *
; ******************************************************************************/
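;------------------------------------------------------------------------------
; For reference only: a scalar C sketch of the two-step division in (b).
; It is a minimal illustration under an assumed helper name (div_two_step)
; and the FIXX-style reciprocal fix = (1 << 16) / m + 1; it is not the exact
; table layout or operand scaling used by the loops below.
;
;   #include <stdint.h>
;
;   static uint32_t div_two_step(uint32_t x /* < 65536 */, uint32_t m)
;   {
;       uint32_t fix      = (1u << 16) / m + 1;        /* FIXX(m)            */
;       uint32_t level    = (x * fix) >> 16;           /* 1st approximation  */
;       int32_t  mismatch = (int32_t)(x - level * m);  /* its rounding error */
;       /* second step: divide the mismatch the same way and correct */
;       return level + (uint32_t)(((int64_t)mismatch * (int64_t)fix) >> 16);
;   }
;------------------------------------------------------------------------------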
;
; _3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines
;
;------------------------------------------------------------------------------
; 09.12.2002  Athlon optimizations contributed by Jaan Kalda
;------------------------------------------------------------------------------


; data/text alignment
%define ALIGN 8
%define SATURATE

bits 32

%ifdef FORMAT_COFF
SECTION .data data
%else
SECTION .data data align=8
%endif

%macro cglobal 1
	%ifdef PREFIX
		global _%1
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

%macro cextern 1
	%ifdef PREFIX
		extern _%1
		%define %1 _%1
	%else
		extern %1
	%endif
%endmacro

align 8
mmzero	dd 0, 0

mmx_one	times 4 dw 1

;===========================================================================
;
; divide by 2Q table
;
;===========================================================================

align ALIGN
mmx_divs	; i > 2
%assign i 1
%rep 31
	times 4 dw ((1 << 15) / i + 1)
	%assign i i+1
%endrep

align ALIGN
mmx_div		; i > 2
%assign i 1
%rep 31
	times 4 dw ((1 << 16) / i + 1)
	%assign i i+1
%endrep

;===========================================================================
;
; intra matrix
;
;===========================================================================

%macro FIXX 1
dw (1 << 16) / (%1) + 1
%endmacro

cextern intra_matrix_fixl
cextern intra_matrix_fix
cextern intra_matrix1
cextern intra_matrix

;===========================================================================
;
; inter matrix
;
;===========================================================================

cextern inter_matrix1
cextern inter_matrix
cextern inter_matrix_fix
cextern inter_matrix_fixl


%define VM18P	3
%define VM18Q	4

; K7-friendly multi-byte no-ops, used below to pad the pipelines
%define nop4	db 08Dh, 074h, 026h, 0
%define nop3	add esp, byte 0
%define nop2	mov esp, esp
%define nop7	db 08Dh, 02Ch, 02Dh, 0, 0, 0, 0
%define nop6	add ebp, dword 0

;===========================================================================
;
; quantd table
;
;===========================================================================

quantd
%assign i 1
%rep 31
	times 4 dw (((VM18P * i) + (VM18Q / 2)) / VM18Q)
	%assign i i+1
%endrep

;===========================================================================
;
; multiply by 2Q table
;
;===========================================================================

mmx_mul_quant
%assign i 1
%rep 31
	times 4 dw i
	%assign i i+1
%endrep

;===========================================================================
;
; saturation limits
;
;===========================================================================

align 16

mmx_32767_minus_2047	times 4 dw (32767-2047)
mmx_32768_minus_2048	times 4 dw (32768-2048)
mmx_2047		times 4 dw 2047
mmx_minus_2048		times 4 dw (-2048)
zero			times 4 dw 0

int_div
dd 0
%assign i 1
%rep 255
	dd (1 << 17) / (i) + 1
	%assign i i+1
%endrep
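;------------------------------------------------------------------------------
; A quick scalar check of what the mmx_divs entries deliver: multiplying an
; unsigned 16-bit level by ((1 << 15) / q + 1) and keeping the high word
; (pmulhuw) approximates division by 2*q, overshooting by at most one step.
; Illustrative C only; mmx_div plays the same role with a (1 << 16) scale.
;
;   #include <assert.h>
;
;   int main(void)
;   {
;       for (unsigned q = 1; q <= 31; q++) {
;           unsigned fix = (1u << 15) / q + 1;       /* mmx_divs entry     */
;           for (unsigned x = 0; x < 65536; x++) {
;               unsigned d = (x * fix) >> 16;        /* pmulhuw equivalent */
;               assert(d == x / (2 * q) || d == x / (2 * q) + 1);
;           }
;       }
;       return 0;
;   }
;------------------------------------------------------------------------------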
section .text

;===========================================================================
;
; void quant4_intra_xmm(int16_t * coeff,
;                       const int16_t * const data,
;                       const uint32_t quant,
;                       const uint32_t dcscalar);
;
;===========================================================================

align ALIGN
cglobal quant4_intra_xmm
quant4_intra_xmm:

	mov	eax, [esp + 8]		; data
	mov	ecx, [esp + 12]		; quant
	mov	edx, [esp + 4]		; coeff
	push	esi
	push	edi
	push	ebx
	nop
	mov	edi, mmzero
	mov	esi, -14
	pxor	mm0, mm0
	pxor	mm3, mm3
	cmp	ecx, byte 1
	je	near .q1loop
	cmp	ecx, byte 19
	jg	near .lloop
	nop6

align ALIGN
.loop
	movq	mm1, [eax + 8*esi + 112]	; mm1 = [1st]
	psubw	mm0, mm1			; -mm1
	movq	mm4, [eax + 8*esi + 120]
	psubw	mm3, mm4			; -mm4
	pmaxsw	mm0, mm1			; |src|
	pmaxsw	mm3, mm4
	nop2
	psraw	mm1, 15				; sign of src
	psraw	mm4, 15
	psllw	mm0, 4				; level << 4
	psllw	mm3, 4
	paddw	mm0, [intra_matrix1 + 8*esi + 112]
	paddw	mm3, [intra_matrix1 + 8*esi + 120]
	movq	mm5, [intra_matrix_fixl + 8*esi + 112]
	movq	mm7, [intra_matrix_fixl + 8*esi + 120]
	pmulhuw	mm5, mm0			; first approximation
	pmulhuw	mm7, mm3
	mov	esp, esp
	movq	mm2, [intra_matrix + 8*esi + 112]
	movq	mm6, [intra_matrix + 8*esi + 120]
	pmullw	mm2, mm5			; approximation * matrix
	pmullw	mm6, mm7
	psubw	mm0, mm2			; mismatch
	psubw	mm3, mm6
	nop4
	movq	mm2, [quantd + ecx * 8 - 8]
	movq	mm6, [mmx_divs + ecx * 8 - 8]
	paddw	mm5, mm2			; + quantd
	paddw	mm7, mm2
	mov	esp, esp
	pmulhuw	mm0, [intra_matrix_fix + 8*esi + 112]	; correction
	pmulhuw	mm3, [intra_matrix_fix + 8*esi + 120]
	paddw	mm5, mm0			; corrected level
	paddw	mm7, mm3
	movq	mm0, [edi]			; mm0 = 0 again (edi -> mmzero)
	movq	mm3, [edi]
	pmulhuw	mm5, mm6			; mm5 = (level + quantd) / (2*quant)
	pmulhuw	mm7, mm6			; (0 < quant < 32)

; [... the rest of quant4_intra_xmm (.loop store and epilogue, the .lloop and
;      .q1loop variants) and the companion quant4/dequant4 routines are
;      missing from this copy of the patch ...]
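;------------------------------------------------------------------------------
; Both the quantisation loop above (psubw/pmaxsw plus psraw 15) and
; dequant4_inter_3dne below (pcmpgtw plus pxor/psubw) handle signs without
; branches.  A scalar C sketch of the idiom; the helper names are
; illustrative, not part of the codebase:
;
;   #include <stdint.h>
;
;   static int16_t sgn16(int16_t x)       /* pcmpgtw 0,x : 0 or -1        */
;   {
;       return (int16_t)(0 > x ? -1 : 0);
;   }
;
;   static int16_t abs16(int16_t x)       /* psubw + pmaxsw               */
;   {
;       int16_t neg = (int16_t)-x;        /* -32768 maps to itself, as    */
;       return x > neg ? x : neg;         /* with pmaxsw: max(x,-x) = |x| */
;   }
;
;   static int16_t apply_sign(int16_t mag, int16_t s /* 0 or -1 */)
;   {
;       return (int16_t)((mag ^ s) - s);  /* pxor + psubw: negate iff s   */
;   }
;------------------------------------------------------------------------------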
;===========================================================================
;
; void dequant4_inter_3dne(int16_t * data,
;                          const int16_t * const coeff,
;                          const uint32_t quant);
;
;===========================================================================

  ; Note: the multiplier is built as 2*c + sgn(c), so the three cases
  ; c < 0, c == 0 and c > 0 are handled in one shot.
  ; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x >= 0, -1 if x < 0.
  ; It's mixed with the extraction of the absolute value.

align 16
cglobal dequant4_inter_3dne
dequant4_inter_3dne:

	mov	edx, [esp + 4]		; data
	mov	ecx, [esp + 8]		; coeff
	mov	eax, [esp + 12]		; quant
	movq	mm7, [mmx_mul_quant + eax*8 - 8]
	mov	eax, -14
	paddw	mm7, mm7		; << 1
	pxor	mm6, mm6		; mismatch sum
	push	esi
	mov	esi, mmzero
	pxor	mm1, mm1
	pxor	mm3, mm3
	nop
	nop4

align 16
.loop
	movq	mm0, [ecx + 8*eax + 7*16    ]	; mm0 = coeff[i]
	pcmpgtw	mm1, mm0			; mm1 = sgn(c)  (preserved)
	movq	mm2, [ecx + 8*eax + 7*16 + 8]	; mm2 = coeff[i+1]
	pcmpgtw	mm3, mm2			; mm3 = sgn(c') (preserved)
	paddsw	mm0, mm1			; c  += sgn(c)
	paddsw	mm2, mm3			; c' += sgn(c')
	paddw	mm0, mm0			; c  *= 2
	paddw	mm2, mm2			; c' *= 2

	movq	mm4, [esi]
	movq	mm5, [esi]
	psubw	mm4, mm0			; -c
	psubw	mm5, mm2			; -c'

	psraw	mm4, 16				; mm4 = sgn(-c)
	psraw	mm5, 16				; mm5 = sgn(-c')
	psubsw	mm0, mm4			; c  -= sgn(-c)
	psubsw	mm2, mm5			; c' -= sgn(-c')
	pxor	mm0, mm1			; finish changing sign if needed
	pxor	mm2, mm3			; finish changing sign if needed

	; we're short on registers here. Poor pairing...

	movq	mm4, mm7			; (matrix * quant)
	nop
	pmullw	mm4, [inter_matrix + 8*eax + 7*16]
	movq	mm5, mm4
	pmulhw	mm5, mm0			; high of c*(matrix*quant)
	pmullw	mm0, mm4			; low  of c*(matrix*quant)

	movq	mm4, mm7			; (matrix * quant)
	pmullw	mm4, [inter_matrix + 8*eax + 7*16 + 8]
	add	eax, byte 2

	pcmpgtw	mm5, [esi]
	paddusw	mm0, mm5
	psrlw	mm0, 5
	pxor	mm0, mm1			; start restoring sign
	psubusw	mm1, mm5

	movq	mm5, mm4
	pmulhw	mm5, mm2			; high of c'*(matrix*quant)
	pmullw	mm2, mm4			; low  of c'*(matrix*quant)
	psubw	mm0, mm1			; finish restoring sign

	pcmpgtw	mm5, [esi]
	paddusw	mm2, mm5
	psrlw	mm2, 5
	pxor	mm2, mm3			; start restoring sign
	psubusw	mm3, mm5
	psubw	mm2, mm3			; finish restoring sign
	movq	mm1, [esi]
	movq	mm3, [byte esi]
	pxor	mm6, mm0			; mismatch control
	movq	[edx + 8*eax + 7*16 - 2*8    ], mm0	; data[i]
	pxor	mm6, mm2			; mismatch control
	movq	[edx + 8*eax + 7*16 - 2*8 + 8], mm2	; data[i+1]

	jng	.loop
	nop

	; mismatch control: fold the four words of mm6 into one parity bit

	pshufw	mm0, mm6, 01010101b
	pshufw	mm1, mm6, 10101010b
	pshufw	mm2, mm6, 11111111b
	pxor	mm6, mm0
	pxor	mm1, mm2
	pxor	mm6, mm1
	movd	eax, mm6
	and	eax, byte 1
	xor	eax, byte 1			; eax = 1 iff the LSB sum is even
	mov	esi, [esp]			; pop esi
	add	esp, byte 4
	xor	word [edx + 2*63], ax		; flip LSB of the last coefficient

	ret
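;------------------------------------------------------------------------------
; The pshufw/pxor tail above computes the parity of the low bits of all 64
; output words and conditionally flips bit 0 of the last coefficient, the
; MPEG-4 mismatch control.  A scalar C sketch of the same operation; the
; function name is illustrative:
;
;   #include <stdint.h>
;
;   static void mismatch_control(int16_t data[64])
;   {
;       unsigned parity = 0;
;       for (int i = 0; i < 64; i++)
;           parity ^= (unsigned)data[i];   /* running pxor; only bit 0 counts */
;       data[63] ^= (parity & 1) ^ 1;      /* flip the LSB if the sum is even */
;   }
;------------------------------------------------------------------------------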