Diff of /xvidcore/src/quant/x86_asm/quantize_3dne.asm

-revision 1.1, Thu Dec 19 00:40:50 2002 UTC
+revision 1.2, Sat Feb 15 15:22:19 2003 UTC
-Line 0
+Line 1
+ ;/**************************************************************************
+ ; *
+ ; *     XVID MPEG-4 VIDEO CODEC
+ ; *     mmx quantization/dequantization
+ ; *
+ ; *     This program is an implementation of a part of one or more MPEG-4
+ ; *     Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
+ ; *     to use this software module in hardware or software products are
+ ; *     advised that its use may infringe existing patents or copyrights, and
+ ; *     any such use would be at such party's own risk.  The original
+ ; *     developer of this software module and his/her company, and subsequent
+ ; *     editors and their companies, will have no liability for use of this
+ ; *     software or modifications or derivatives thereof.
+ ; *
+ ; *     This program is free software; you can redistribute it and/or modify
+ ; *     it under the terms of the GNU General Public License as published by
+ ; *     the Free Software Foundation; either version 2 of the License, or
+ ; *     (at your option) any later version.
+ ; *
+ ; *     This program is distributed in the hope that it will be useful,
+ ; *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ; *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ; *     GNU General Public License for more details.
+ ; *
+ ; *     You should have received a copy of the GNU General Public License
+ ; *     along with this program; if not, write to the Free Software
+ ; *     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ ; *
+ ; *************************************************************************/
+ ; these 3dne functions are compatible with iSSE, but are optimized specifically for
+ ; K7 pipelines
+ ;
+ ;------------------------------------------------------------------------------
+ ; 09.12.2002  Athlon optimizations contributed by Jaan Kalda
+ ;------------------------------------------------------------------------------
+ ; SATURATE: request dequant saturation to [-2048,2047] (test purposes only).
+ %define SATURATE
+ ; ALIGN: byte alignment used for data and text (see the `align ALIGN` lines).
+ %define ALIGN 16
+ bits 32
+ ; COFF output gets the plain data section; every other format also asks
+ ; for 16-byte section alignment.
+ %ifndef FORMAT_COFF
+ section .data data align=16
+ %else
+ section .data data
+ %endif
+ ; cglobal <name>: export <name>.  When PREFIX is defined the exported symbol
+ ; is _<name>, and <name> is redefined so later uses resolve to _<name>.
+ %macro cglobal 1
+ %ifndef PREFIX
+         global %1
+ %else
+         global _%1
+         %define %1 _%1
+ %endif
+ %endmacro
+ ; int_div[d] = (1 << 16) / d + 1 for d = 1..255; entry 0 is 0.
+ ; quant_intra_3dne multiplies by int_div[dcscalar] and then shifts right by
+ ; 16 instead of dividing.
+ align 4
+ int_div:
+         dd  0
+ %assign i 1
+ %rep 255
+         dd  (1 << 16) / ( i) + 1
+         %assign i i+1
+ %endrep
+ ; eight words of 1: pmaddwd operand used to add up the word lanes of a sum
+ align 16
+ plus_one:       times 8 dw 1
+ ;===========================================================================
+ ;
+ ; subtract by Q/2 table
+ ;
+ ;===========================================================================
+ ; MMX_SUB <q>: emit four words of q / 2
+ %macro MMX_SUB  1
+ times 4 dw %1 / 2
+ %endmacro
+ ; mmx_sub: one 8-byte entry per quantizer q = 1..31, entry q-1 = { q/2 } x 4
+ align 16
+ mmx_sub:
+ %assign i 1
+ %rep 31
+         MMX_SUB i
+         %assign i i+1
+ %endrep
+ ;===========================================================================
+ ;
+ ; divide by 2Q table
+ ;
+ ; use a shift of 16 to take full advantage of _pmulhw_
+ ; for q=1, _pmulhw_ will overflow so it is treated separately
+ ; (3dnow2 provides _pmulhuw_ which won't cause overflow)
+ ;
+ ;===========================================================================
+ ; mmx_div: one 8-byte entry per quantizer q = 1..31,
+ ; entry q-1 = { (1 << 16) / (2 * q) + 1 } x 4
+ align 16
+ mmx_div:
+ %assign i 1
+ %rep 31
+         times 4 dw  (1 << 16) / (i * 2) + 1
+         %assign i i+1
+ %endrep
+ ;===========================================================================
+ ;
+ ; add by (odd(Q) ? Q : Q - 1) table
+ ;
+ ;===========================================================================
+ ; MMX_ADD <q>: emit four words of q when q is odd, of q - 1 when q is even
+ %macro MMX_ADD  1
+ %if %1 % 2 != 0
+ times 4 dw %1
+ %else
+ times 4 dw %1 - 1
+ %endif
+ %endmacro
+ ; mmx_add: one 8-byte entry per quantizer q = 1..31 (entry q-1 built by MMX_ADD q)
+ align 16
+ mmx_add:
+ %assign i 1
+ %rep 31
+         MMX_ADD i
+         %assign i i+1
+ %endrep
+ ;===========================================================================
+ ;
+ ; multiply by 2Q table
+ ;
+ ;===========================================================================
+ ; MMX_MUL <q>: emit four words of 2 * q
+ %macro MMX_MUL  1
+ times 4 dw %1 * 2
+ %endmacro
+ ; mmx_mul: one 8-byte entry per quantizer q = 1..31, entry q-1 = { 2*q } x 4
+ align 16
+ mmx_mul:
+ %assign i 1
+ %rep 31
+         MMX_MUL i
+         %assign i i+1
+ %endrep
+ ;===========================================================================
+ ;
+ ; saturation limits
+ ;
+ ;===========================================================================
+ align 8
+ mmx_32768_minus_2048:   times 4 dw 32768 - 2048
+ mmx_32767_minus_2047:   times 4 dw 32767 - 2047
+ ; four words of 2047: pminsw operand in the dequantizers
+ align 16
+ mmx_2047:               times 4 dw 2047
+ ; eight zero bytes, loaded wherever an all-zero MMX register is needed
+ align 8
+ mmzero:                 times 2 dd 0
+ ; scalar limits applied to the intra DC value with cmovg / cmovl
+ int2047:                dd 2047
+ int_2048:               dd -2048
+ ; everything below is code
+ section .text
+ ;===========================================================================
+ ;
+ ; void quant_intra_3dne(int16_t * coeff,
+ ;                                       const int16_t const * data,
+ ;                                       const uint32_t quant,
+ ;                                       const uint32_t dcscalar);
+ ;
+ ;===========================================================================
+ ;This is Athlon-optimized code (ca 70 clk per call)
+ ;Optimized by Jaan, 30 Nov 2002
+ ; quant_intra1 <n>, n = 0..3: the quant == 1 variant of quant_intra.
+ ; Each expansion works on the 32 bytes (quadwords A, B, C, D) at offset n*32
+ ; of the input at ecx and stores results relative to edx.  The letter+number
+ ; tags in the trailing comments name the quadword and its pipeline stage:
+ ;   1 load coefficients      2 load zero from [ebx]     3 0 - x
+ ;   4 pmaxsw -> |x|          5 psraw 15 -> sign mask    6 psrlw 1 -> |x| >> 1
+ ;   7/8 xor with, then subtract, the sign mask          9 store
+ ; On entry mm0/mm2 must already hold quadwords A/B and mm1/mm3 zero.
+ ; Stages C8, D8, C9, D9 of one expansion are issued at the top of the next
+ ; one (n != 0); after n = 3 they are left to the code following the macro.
+  %macro quant_intra1  1
+                 psubw   mm1,mm0   ;A3
+                 psubw   mm3,mm2   ;B3
+ %if (%1)
+                 psubw   mm5, mm4        ;C8
+                 psubw   mm7, mm6        ;D8
+ %endif
+ align 8
+ ; The db lines are hand-encoded movq loads: opcode 0F 6F, ModRM + SIB with
+ ; base ecx, no index, and an 8-bit displacement given by the last byte.
+                 db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq      mm4, [ecx + %1 * 32 +16]        ;C1
+                 pmaxsw  mm1,mm0   ;A4
+                 db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24);movq       mm6, [ecx + %1 * 32 +24]        ;D1
+                 pmaxsw  mm3,mm2   ;B4
+                 psraw   mm0,15    ;A5
+                 psraw   mm2,15    ;B5
+ %if (%1)
+                 ; results of the previous expansion's C and D quadwords
+                 movq    [edx + %1 * 32 + 16-32], mm5 ;C9
+                 movq    [edx + %1 * 32 + 24-32], mm7 ;D9
+ %endif
+                 psrlw   mm1, 1  ;A6
+                 psrlw   mm3, 1  ;B6
+                 movq    mm5, [ebx]      ;C2
+                 movq    mm7, [ebx]      ;D2
+                 pxor    mm1, mm0        ;A7
+                 pxor    mm3, mm2        ;B7
+                 psubw   mm5,mm4   ;C3
+                 psubw   mm7,mm6   ;D3
+                 psubw   mm1, mm0        ;A8
+                 psubw   mm3, mm2        ;B8
+ %if (%1 == 0)
+                 ; first expansion also saves ebp; quant_intra_3dne reloads it
+                 ; from [esp] before returning
+                 push    ebp
+                 movq    mm0, [ecx + %1 * 32 +32]
+ %elif (%1 < 3)
+                 db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq      mm0, [ecx + %1 * 32 +32]        ;A1
+ %endif
+                 pmaxsw  mm5,mm4   ;C4
+ %if (%1 < 3)
+                 db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq   mm2, [ecx + %1 * 32 +8+32]      ;B1
+ %else
+  ; last expansion has no further quadword to load; this compare only sets
+  ; flags, which nothing in the macro reads
+  cmp esp,esp
+ %endif
+                 pmaxsw  mm7,mm6   ;D4
+                 psraw   mm4,15    ;C5
+                 psraw   mm6,15    ;D5
+                 movq    [byte edx + %1 * 32], mm1 ;A9
+                 movq    [edx + %1 * 32+8], mm3    ;B9
+                 psrlw   mm5, 1  ;C6
+                 psrlw   mm7, 1  ;D6
+ %if (%1 < 3)
+                 movq    mm1, [ebx]      ;A2
+                 movq    mm3, [ebx]      ;B2
+ %endif
+ %if (%1 == 3)
+                                 ; scalar work interleaved by the caller: eax *= int_div[edi]
+                                 imul    eax,[int_div+4*edi]
+ %endif
+                 pxor    mm5, mm4        ;C7
+                 pxor    mm7, mm6        ;D7
+ %endm
+ ; quant_intra <n>, n = 0..3: same pipeline as quant_intra1, except that
+ ; stage 6 is a pmulhw by the four words at [esi] rather than a shift.
+ ; Each expansion works on the 32 bytes (quadwords A, B, C, D) at offset n*32
+ ; of the input at ecx and stores results relative to edx; [ebx] supplies the
+ ; zero quadword.  On entry mm0/mm2 must hold quadwords A/B and mm1/mm3 zero.
+ ; Stages C8, D8, C9, D9 of one expansion are issued at the top of the next
+ ; one (n != 0); after n = 3 they are left to the code following the macro.
+ %macro quant_intra  1 ;rules for athlon: 1) schedule latencies, 2) add/mul and load/store in 2:1 proportion,
+                                                 ; 3) avoid splitting >3byte instructions over 8byte boundaries
+                 psubw   mm1,mm0   ;A3
+                 psubw   mm3,mm2   ;B3
+ %if (%1)
+                 psubw   mm5, mm4        ;C8
+                 psubw   mm7, mm6        ;D8
+ %endif
+ align 8
+ ; The db lines are hand-encoded movq loads: opcode 0F 6F, ModRM + SIB with
+ ; base ecx, no index, and an 8-bit displacement given by the last byte.
+                 db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq      mm4, [ecx + %1 * 32 +16]        ;C1
+                 pmaxsw  mm1,mm0   ;A4
+                 db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24);movq       mm6, [ecx + %1 * 32 +24]        ;D1
+                 pmaxsw  mm3,mm2   ;B4
+                 psraw   mm0,15    ;A5
+                 psraw   mm2,15    ;B5
+ %if (%1)
+                 ; results of the previous expansion's C and D quadwords
+                 movq    [edx + %1 * 32 + 16-32], mm5 ;C9
+                 movq    [edx + %1 * 32 + 24-32], mm7 ;D9
+ %endif
+                 pmulhw  mm1, [esi]  ;A6
+                 pmulhw  mm3, [esi]      ;B6
+                 movq    mm5, [ebx]      ;C2
+                 movq    mm7, [ebx]      ;D2
+                 nop
+                 nop
+                 pxor    mm1, mm0        ;A7
+                 pxor    mm3, mm2        ;B7
+                 psubw   mm5,mm4   ;C3
+                 psubw   mm7,mm6   ;D3
+                 psubw   mm1, mm0        ;A8
+                 psubw   mm3, mm2        ;B8
+ %if (%1 < 3)
+                 db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq      mm0, [ecx + %1 * 32 +32]        ;A1
+ %endif
+                 pmaxsw  mm5,mm4   ;C4
+ %if (%1 < 3)
+                 db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq   mm2, [ecx + %1 * 32 +8+32]      ;B1
+ %else
+  ; last expansion has no further quadword to load; this compare only sets
+  ; flags, which nothing in the macro reads
+  cmp esp,esp
+ %endif
+                 pmaxsw  mm7,mm6   ;D4
+                 psraw   mm4,15    ;C5
+                 psraw   mm6,15    ;D5
+                 movq    [byte edx + %1 * 32], mm1 ;A9
+                 movq    [edx + %1 * 32+8], mm3    ;B9
+                 pmulhw  mm5, [esi]      ;C6
+                 pmulhw  mm7, [esi]      ;D6
+ %if (%1 < 3)
+                 movq    mm1, [ebx]      ;A2
+                 movq    mm3, [ebx]      ;B2
+ %endif
+ %if (%1 == 0)
+                 ; first expansion also saves ebp; quant_intra_3dne reloads it
+                 ; from [esp] before returning
+                 push    ebp
+ %elif (%1 < 3)
+                 nop
+ %endif
+                 nop
+ %if (%1 == 3)
+                                 ; scalar work interleaved by the caller: eax *= int_div[edi]
+                                 imul    eax,[int_div+4*edi]
+ %endif
+                 pxor    mm5, mm4        ;C7
+                 pxor    mm7, mm6        ;D7
+ %endmacro
+ ; Stack arguments on entry: [esp+4] coeff, [esp+8] data, [esp+12] quant,
+ ; [esp+16] dcscalar.
+ ; Register roles: ecx = data (input), edx = coeff (output),
+ ;                 esi -> mmx_div entry for quant, ebx -> mmzero.
+ ; esi, ebx and edi are pushed below and ebp is pushed inside the first macro
+ ; expansion, so after expansion 0 the arguments sit 16 bytes higher and the
+ ; saved registers are at [esp] = ebp, [esp+4] = edi, [esp+8] = ebx,
+ ; [esp+12] = esi.
+ ; The 64 coefficients are processed by four macro expansions; the DC value is
+ ; computed in integer registers between them and finally written over
+ ; coeff[0].
+ align ALIGN
+ cglobal quant_intra_3dne
+ quant_intra_3dne:
+                 mov     eax, [esp + 12]         ; quant
+                 mov     ecx, [esp + 8]          ; data
+                 mov     edx, [esp + 4]          ; coeff
+                 cmp     al, 1                   ; ZF consumed by the jz below; nothing in between writes flags
+                 pxor    mm1,mm1
+                 pxor    mm3,mm3
+                 movq    mm0, [ecx ]             ; mm0 = [1st]
+                 movq    mm2, [ecx +8]
+                 push esi
+                 lea     esi, [mmx_div + eax * 8 - 8]
+                 push ebx
+                 mov     ebx,mmzero
+                 push edi
+                 jz      near .q1loop            ; quant == 1 takes the shift-based path
+ quant_intra 0
+ mov     ebp, [esp + 16 + 16]    ; dcscalar
+ movsx   eax, word [byte ecx] ;x
+ quant_intra 1
+ mov             edi,eax
+ sar             edi,31 ;sign(x)
+ shr     ebp,byte 1                      ; ebp = dcscalar /2
+ quant_intra 2
+ sub             eax,edi ; x (+1)
+ xor     ebp,edi ;sign(x) dcscalar /2  (-1)
+ mov             edi,[esp + 16 + 16]     ; edi = dcscalar, index into int_div inside expansion 3
+ lea             eax,[byte eax+ebp]  ;x + sign(x) dcscalar /2
+ mov             ebp,[byte esp]          ; restore ebp
+ quant_intra 3
+                 psubw   mm5, mm4        ;C8
+                 mov     esi,[esp+12]            ; restore esi, edi, ebx
+                 mov             edi,[esp+4]
+                 mov     ebx,[esp+8]
+                 add esp,byte 16
+                 sar     eax,16                  ; high part of (x +- dcscalar/2) * int_div[dcscalar]
+                 mov     [edx], ax               ; coeff[0] = ax
+                 psubw   mm7, mm6        ;D8
+                 movq    [edx + 3 * 32 + 16], mm5 ;C9
+                 movq    [edx + 3 * 32 + 24], mm7 ;D9
+                 ret
+ ; quant == 1: same sequence as above built from quant_intra1
+ align 16
+ .q1loop:
+ quant_intra1 0
+ mov     ebp, [esp + 16 + 16]    ; dcscalar
+ movsx   eax, word [byte ecx] ;x
+ quant_intra1 1
+ mov             edi,eax
+ sar             edi,31 ;sign(x)
+ shr     ebp,byte 1                      ; ebp = dcscalar /2
+ quant_intra1 2
+ sub             eax,edi ; x (+1)
+ xor     ebp,edi ;sign(x) dcscalar /2  (-1)
+ mov             edi,[esp + 16 + 16]     ; edi = dcscalar, index into int_div inside expansion 3
+ lea             eax,[byte eax+ebp]  ;x + sign(x) dcscalar /2
+ mov             ebp,[byte esp]          ; restore ebp
+ quant_intra1 3
+                 psubw   mm5, mm4        ;C8
+                 mov     esi,[dword esp+12]      ; restore esi, edi, ebx
+                 mov             edi,[esp+4]
+                 mov     ebx,[esp+8]
+                 add esp,byte 16
+                 sar     eax,16                  ; high part of (x +- dcscalar/2) * int_div[dcscalar]
+                 mov     [edx], ax               ; coeff[0] = ax
+                 psubw   mm7, mm6        ;D8
+                 movq    [edx + 3 * 32 + 16], mm5 ;C9
+                 movq    [edx + 3 * 32 + 24], mm7 ;D9
+                 ret
+ ;===========================================================================
+ ;
+ ; uint32_t quant_inter_3dne(int16_t * coeff,
+ ;                                       const int16_t const * data,
+ ;                                       const uint32_t quant);
+ ;
+ ;===========================================================================
+ ;This is Athlon-optimized code (ca 90 clk per call)
+ ;Optimized by Jaan, 30 Nov 2002
+ ; quantinter <n>, n = 0..4: pipelined inter quantization for quant != 1.
+ ; Expects: ecx = input, edx = output, eax -> zero quadword, ebx -> quadword
+ ; subtracted with unsigned saturation, mm7 = multiplier words, mm5 = running
+ ; sum.  The trailing tags name the quadword (A, B, C) and its stage:
+ ;   1 load        2 load zero      3 0 - x         4 pmaxsw -> |x|
+ ;   5 psubusw [ebx]                6 sign mask     7 pmulhw by mm7
+ ;   8 add to sum  9/10 xor with, then subtract, the sign mask   11 store
+ ; Each expansion loads A for the next expansion (n*24+24), B at n*24+8 and
+ ; C at n*24+16.  The B quadword it completes is the one loaded earlier:
+ ; for n = 0 that is the caller-preloaded quadword stored to [edx + 120],
+ ; otherwise the previous expansion's B.  C10/C11 finish the previous
+ ; expansion's C.
+ %macro quantinter 1
+                 movq    mm1, [eax] ;A2
+                 psraw   mm3,15                  ;B6
+ %if (%1)
+                 psubw   mm2, mm6                ;C10
+ %endif
+                 psubw   mm1,mm0 ;A3
+                 pmulhw  mm4, mm7                ; B7
+                 movq    mm6, [ecx + %1*24+16]   ; C1
+                 pmaxsw  mm1,mm0 ;A4
+                 paddw   mm5, mm4                ;B8
+ %if (%1)
+                 movq    [edx + %1*24+16-24], mm2 ;C11
+ %endif
+                 psubusw mm1, [ebx]              ; A5 mm1 -= sub (unsigned, does not go < 0)
+                 pxor    mm4, mm3                ;B9
+                 movq    mm2, [eax] ;C2
+                 psraw   mm0,15                  ;A6
+                 psubw   mm4, mm3                ;B10
+                 psubw   mm2,mm6  ;C3
+                 pmulhw  mm1, mm7                ; A7 mm1 = high word of mm1 * mm7
+                 movq    mm3, [ecx + %1*24+8]    ; B1
+                 pmaxsw  mm2,mm6 ;C4
+                 paddw   mm5, mm1                ; A8 sum += mm1
+ %if (%1)
+                 movq    [edx + %1*24+8-24], mm4 ;B11
+ %else
+                 movq    [edx + 120], mm4 ;B11
+ %endif
+                 psubusw mm2, [ebx]              ;C5
+                 pxor    mm1, mm0                ; A9 xor with sign mask in mm0
+                 movq    mm4, [eax] ;B2
+                 psraw   mm6,15                  ;C6
+                 psubw   mm1, mm0                ;A10 undisplace
+                 psubw   mm4,mm3 ;B3
+                 pmulhw  mm2, mm7                ; C7
+                 movq    mm0, [ecx + %1*24+24]           ;A1 A quadword for the next expansion
+                 pmaxsw  mm4,mm3 ;B4
+                 paddw   mm5, mm2                ;C8
+                 movq    [byte edx + %1*24], mm1 ;A11
+                 psubusw mm4, [ebx]              ;B5
+                 pxor    mm2, mm6                ;C9
+ %endmacro
+ ; quantinter1 <n>, n = 0..7: inter quantization step for quant == 1.
+ ; Handles the two quadwords at offsets n*16 and n*16+8 of the input at ecx
+ ; and stores the results at the same offsets from edx.
+ ; Expects: eax -> zero quadword, mm6 = words subtracted with unsigned
+ ; saturation, mm5 = running sum (updated before the sign is put back).
+ %macro quantinter1 1
+                 movq    mm0, [byte ecx + %1*16]         ; mm0 = [1st]
+                 movq    mm3, [ecx + %1*16+8]    ;
+                 movq    mm1, [eax]              ; zero
+                 movq    mm4, [eax]              ; zero
+                 psubw   mm1,mm0                 ; 0 - x
+                 psubw   mm4,mm3
+                 pmaxsw  mm1,mm0                 ; max(-x, x) = |x|
+                 pmaxsw  mm4,mm3
+                 psubusw mm1, mm6                ; mm1 -= sub (unsigned, does not go < 0)
+                 psubusw mm4, mm6                ;
+                 psraw   mm0,15                  ; sign mask
+                 psraw   mm3,15
+                 psrlw   mm1, 1          ; mm1 >>= 1
+                 psrlw   mm4, 1          ;
+                 paddw   mm5, mm1                ; sum += mm1
+                 pxor    mm1, mm0                ; xor with sign mask in mm0
+                 paddw   mm5, mm4
+                 pxor    mm4, mm3                ;
+                 psubw   mm1, mm0                ; undisplace
+                 psubw   mm4, mm3
+                 cmp     esp,esp                 ; only sets flags; nothing in the macro reads them
+                 movq    [byte edx + %1*16], mm1
+                 movq    [edx + %1*16+8], mm4
+ %endmacro
+ ; Stack arguments on entry: [esp+4] coeff, [esp+8] data, [esp+12] quant.
+ ; Returns in eax the sum accumulated in mm5 by the macro expansions.
+ ; Register roles: ecx = data (input), edx = coeff (output),
+ ;                 ebx -> mmx_sub entry for quant (saved and restored),
+ ;                 mm7 = mmx_div entry for quant, eax -> mmzero, mm5 = sum.
+ align ALIGN
+ cglobal quant_inter_3dne
+ quant_inter_3dne:
+                 mov     edx, [esp  + 4]         ; coeff
+                 mov     ecx, [esp  + 8]         ; data
+                 mov     eax, [esp  + 12]        ; quant
+                 push    ebx
+                 pxor mm5, mm5                           ; sum
+                 nop
+                 lea ebx,[mmx_sub + eax * 8 - 8] ; sub
+                 movq    mm7, [mmx_div + eax * 8 - 8]    ; divider
+                 cmp     al, 1
+                 lea     eax,[mmzero]            ; lea leaves the flags from cmp intact
+                 jz  near .q1loop                ; quant == 1 takes the shift-based path
+                 cmp     esp,esp
+ align 8
+                 ; stages B1..B5 for the quadword at offset 120 and the first
+                 ; A load, as expected by quantinter 0
+                 movq    mm3, [ecx + 120]        ; B1
+                 pxor    mm4,mm4 ;B2
+                 psubw   mm4,mm3 ;B3
+                 movq    mm0, [ecx]              ;A1 mm0 = [1st]
+                 pmaxsw  mm4,mm3 ;B4
+                 psubusw mm4, [ebx]              ;B5
+                 quantinter 0
+                 quantinter 1
+                 quantinter 2
+                 quantinter 3
+                 quantinter 4
+                 ; finish the B and C quadwords left in flight by quantinter 4
+                 psraw   mm3,15                  ;B6
+                 psubw   mm2, mm6                ;C10
+                 pmulhw  mm4, mm7                ; B7
+                 paddw   mm5, mm4                ;B8
+                 pxor    mm4, mm3                ;B9
+                 psubw   mm4, mm3                ;B10
+                 movq    [edx + 4*24+16], mm2 ;C11
+                 pop ebx
+                 movq    [edx + 4*24+8], mm4 ;B11
+                 ; add the four word lanes of the sum: pmaddwd by 1 gives two
+                 ; dwords, which are then added together
+                 pmaddwd mm5, [plus_one]
+                 movq    mm0, mm5
+                 punpckhdq   mm5, mm5
+                 paddd   mm0, mm5
+                 movd    eax, mm0                ; return sum
+                 ret
+ align ALIGN
+ .q1loop:
+                 movq mm6,[byte ebx]             ; mm6 = mmx_sub entry, used by quantinter1
+                 quantinter1 0
+                 quantinter1 1
+                 quantinter1 2
+                 quantinter1 3
+                 quantinter1 4
+                 quantinter1 5
+                 quantinter1 6
+                 quantinter1 7
+                 pmaddwd mm5, [plus_one]
+                 movq    mm0, mm5
+                 psrlq   mm5, 32
+                 paddd   mm0, mm5
+                 movd    eax, mm0                ; return sum
+                 pop ebx
+                 ret
+ ;===========================================================================
+ ;
+ ; void dequant_intra_3dne(int16_t *data,
+ ;                                       const int16_t const *coeff,
+ ;                                       const uint32_t quant,
+ ;                                       const uint32_t dcscalar);
+ ;
+ ;===========================================================================
+   ; this is the same as dequant_inter_3dne, except that we're
+   ; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
+ ;This is Athlon-optimized code (ca 106 clk per call)
+ ; dequant <n>, n = 0..4: pipelined dequantization step shared by
+ ; dequant_intra_3dne and dequant_inter_3dne.
+ ; Expects: ecx = input, edx = output, [esi] = zero quadword (both callers
+ ; point esi at mmzero), [edi] = multiplier words, [eax] = offset words,
+ ; [ebx] = pminsw limit.
+ ; Per quadword (A, B, C), following the stage tags: |c| is formed with
+ ; psubw/pmaxsw, multiplied by [edi]; the offset [eax] is kept only where
+ ; c != 0 (pcmpeqw + pandn); the sign mask is added to the offset, the sum is
+ ; limited by pminsw against [ebx] and finally xor-ed with the sign mask.
+ ; Each expansion loads A at n*24, B at n*24+8 and C at n*24+16.  The B
+ ; quadword it completes is the one loaded earlier: for n = 0 that is the
+ ; caller-preloaded quadword stored to [edx+120], otherwise the previous
+ ; expansion's B; C11..C14 finish the previous expansion's C.
+ ; `mov ebp,ebp`, `mov esp,esp` and `lea ebp,[byte ebp]` write a register with
+ ; its own value and do not touch the flags -- presumably padding for
+ ; instruction placement.
+ %macro dequant 1
+   movq mm1, [ecx+%1*24]     ;A2 ; c  = coeff[i]
+   psubw mm0,mm1 ;-c             ;A3 (1st dep)
+ %if (%1)
+   paddw mm4,mm6 ;                       C11 mm6 free (4th+)
+ %endif
+   pmaxsw mm0,mm1 ;|c|           ;A4 (2nd)
+ %if (%1)
+  mov ebp,ebp
+   pminsw mm4,[ebx] ;            C12 saturates to +2047 (5th+) 1ater
+ %endif
+   movq  mm6,[esi] ;0            ;A5  mm6 in use
+   pandn mm7,[eax] ;              B9 offset = isZero ? 0 : quant_add (2nd)
+ %if (%1)
+   pxor mm5, mm4 ;                       C13 (6th+) 1later
+ %endif
+   movq  mm4,[esi] ;                     C1 ;0
+   mov esp,esp
+   pcmpeqw mm6, [ecx+%1*24]  ;A6 (c ==0) ? -1 : 0 (1st)
+ align 4
+   psraw mm1,15 ; sign(c)        ;A7 (2nd)
+ %if (%1)
+   movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later
+ %endif
+   paddw mm7,mm3 ;                       B10  offset +negate back (3rd)
+   pmullw mm0, [edi] ;*= 2Q  ;A8 (3rd+)
+   paddw mm2,mm7 ;                       B11 mm7 free (4th+)
+   lea ebp,[byte ebp]
+   movq mm5, [ecx+%1*24+16]  ;C2 ; c  = coeff[i]
+   psubw mm4,mm5 ;-c             ;C3 (1st dep)
+   pandn mm6,[eax] ;              A9 offset = isZero ? 0 : quant_add (2nd)
+   pminsw mm2,[ebx] ;            B12 saturates to +2047 (5th+)
+   pxor mm3, mm2 ;                       B13 (6th+)
+   movq  mm2,[byte esi] ;                        B1 ;0
+ %if (%1)
+   movq [edx+%1*24+8-24], mm3 ;  B14 (7th)
+ %else
+   movq [edx+120], mm3 ;         B14 for the quadword preloaded by the caller
+ %endif
+   pmaxsw mm4,mm5 ;|c|           ;C4 (2nd)
+   paddw mm6,mm1 ;                       A10  offset +negate back (3rd)
+   movq mm3, [ecx+%1*24 + 8]  ;B2 ; c  = coeff[i]
+   psubw mm2,mm3 ;-c             ;B3 (1st dep)
+   paddw mm0,mm6 ;                       A11 mm6 free (4th+)
+   movq  mm6,[byte esi] ;0               ;C5  mm6 in use
+   pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st)
+   pminsw mm0,[ebx] ;            A12 saturates to +2047 (5th+)
+   pmaxsw mm2,mm3 ;|c|           ;B4 (2nd)
+   pxor mm1, mm0 ;                       A13 (6th+)
+   pmullw mm4, [edi] ;*= 2Q  ;C8 (3rd+)
+   psraw mm5,15 ; sign(c)        ;C7 (2nd)
+   movq  mm7,[byte esi] ;0               ;B5 mm7 in use
+   pcmpeqw mm7, [ecx+%1*24 + 8]                  ;B6 (c ==0) ? -1 : 0 (1st)
+ %if (%1 < 4)
+   movq  mm0,[byte esi] ;                        A1 ;0  (for the next expansion)
+ %endif
+   pandn mm6,[byte eax] ;                 C9 offset = isZero ? 0 : quant_add (2nd)
+   psraw mm3,15 ; sign(c)        ;B7 (2nd)
+   movq [byte edx+%1*24], mm1 ;  A14 (7th)
+   paddw mm6,mm5 ;                       C10  offset +negate back (3rd)
+   pmullw mm2, [edi] ;*= 2Q  ;B8 (3rd+)
+   mov   esp,esp
+ %endmacro
+ ; Stack arguments on entry: [esp+4] data, [esp+8] coeff, [esp+12] quant,
+ ; [esp+16] dcscalar.
+ ; edi, ebx, ebp and esi are pushed (16 bytes) and reloaded from the stack
+ ; before returning.  Register roles during the macro expansions:
+ ;   ecx = coeff (input), edx = data (output), edi -> mmx_mul entry,
+ ;   eax -> mmx_add entry, ebx -> mmx_2047, esi -> mmzero,
+ ;   ebp = DC value: sign-extended coeff[0] * dcscalar.
+ ; The DC value is limited to [-2048, 2047] with cmp/cmov pairs placed around
+ ; the expansions (the macro contains no flag-writing instruction, so each
+ ; cmov still sees its cmp) and is written over data[0] at the end.
+ align ALIGN
+ cglobal dequant_intra_3dne
+ dequant_intra_3dne:
+   mov    ecx, [esp+ 8]        ; coeff
+   mov    eax, [esp+12]        ; quant
+   pxor mm0,mm0
+   pxor mm2,mm2
+   push  edi
+   push  ebx
+   lea edi,[mmx_mul + eax*8 - 8]  ; 2*quant
+   push ebp
+   mov   ebx,mmx_2047
+   movsx ebp,word [ecx]           ; ebp = coeff[0]
+   lea eax,[mmx_add + eax*8 - 8]  ; quant or quant-1
+   push  esi
+   mov   esi,mmzero
+   ; stages B2..B8 for the quadword at offset 120, as expected by dequant 0
+   pxor mm7,mm7
+   movq mm3, [ecx+120]            ;B2 ; c  = coeff[i]
+   pcmpeqw mm7, [ecx+120]                ;B6 (c ==0) ? -1 : 0 (1st)
+   imul ebp,[esp+16+16]    ; dcscalar
+   psubw mm2,mm3 ;-c             ;B3 (1st dep)
+   pmaxsw mm2,mm3 ;|c|           ;B4 (2nd)
+   pmullw mm2, [edi] ;*= 2Q  ;B8 (3rd+)
+   psraw mm3,15 ; sign(c)        ;B7 (2nd)
+   mov   edx, [esp+ 4+16]        ; data
+ align 8
+ dequant 0
+   cmp   ebp,-2048               ; flags consumed by the cmovl after dequant 1
+   mov esp,esp
+ dequant 1
+   cmovl ebp,[int_2048]          ; ebp = -2048 if it was below
+   nop
+ dequant 2
+   cmp   ebp,2047                ; flags consumed by the cmovg after dequant 3
+   mov esp,esp
+ dequant 3
+   cmovg ebp,[int2047]           ; ebp = 2047 if it was above
+   nop
+ dequant 4
+   ; finish the B and C quadwords left in flight by dequant 4
+   paddw mm4,mm6 ;                       C11 mm6 free (4th+)
+   pminsw mm4,[ebx] ;            C12 saturates to +2047 (5th+)
+   pandn mm7,[eax] ;              B9 offset = isZero ? 0 : quant_add (2nd)
+         mov     eax,ebp         ; eax = limited DC value
+         mov     esi,[esp]       ; restore esi
+         mov     ebp,[esp+4]     ; restore ebp
+   pxor mm5, mm4 ;                       C13 (6th+)
+   paddw mm7,mm3 ;                       B10  offset +negate back (3rd)
+   movq [edx+4*24+16], mm5 ; C14 (7th)
+   paddw mm2,mm7 ;                       B11 mm7 free (4th+)
+   pminsw mm2,[ebx] ;            B12 saturates to +2047 (5th+)
+         mov     ebx,[esp+8]     ; restore ebx
+         mov     edi,[esp+12]    ; restore edi
+         add esp,byte 16
+   pxor mm3, mm2 ;                       B13 (6th+)
+   movq [edx+4*24+8], mm3 ;      B14 (7th)
+   mov [edx], ax                 ; data[0] = DC value
+   ret
+ ;===========================================================================
+ ;
+ ; void dequant_inter_3dne(int16_t * data,
+ ;                                       const int16_t * const coeff,
+ ;                                       const uint32_t quant);
+ ;
+ ;===========================================================================
+   ; this is the same as dequant_inter_3dne,
+   ; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
+ ;This is Athlon-optimized code (ca 100 clk per call)
+ ;Optimized by Jaan, 30 Nov 2002
+ ; Stack arguments on entry: [esp+4] data, [esp+8] coeff, [esp+12] quant.
+ ; edi, ebx and esi are pushed (12 bytes) and reloaded from the stack before
+ ; returning.  Register roles during the macro expansions:
+ ;   ecx = coeff (input), edx = data (output), edi -> mmx_mul entry,
+ ;   eax -> mmx_add entry, ebx -> mmx_2047, esi -> mmzero.
+ align ALIGN
+ cglobal dequant_inter_3dne
+ dequant_inter_3dne:
+   mov    ecx, [esp+ 8]        ; coeff
+   mov    eax, [esp+12]        ; quant
+   pxor mm0,mm0
+   pxor mm2,mm2
+   push  edi
+   push  ebx
+   push  esi
+   lea edi,[mmx_mul + eax*8 - 8]  ; 2*quant
+   mov   ebx,mmx_2047
+   ; stages B2..B8 for the quadword at offset 120, as expected by dequant 0
+   pxor mm7,mm7
+   movq mm3, [ecx+120]            ;B2 ; c  = coeff[i]
+   pcmpeqw mm7, [ecx+120]                ;B6 (c ==0) ? -1 : 0 (1st)
+   lea eax,[mmx_add + eax*8 - 8]  ; quant or quant-1
+   psubw mm2,mm3 ;-c             ;B3 (1st dep)
+   mov   esi,mmzero
+   pmaxsw mm2,mm3 ;|c|           ;B4 (2nd)
+   pmullw mm2, [edi] ;*= 2Q  ;B8 (3rd+)
+   psraw mm3,15 ; sign(c)        ;B7 (2nd)
+   mov   edx, [dword esp+ 4+12]        ; data
+ align 8
+ dequant 0
+ dequant 1
+ dequant 2
+ dequant 3
+ dequant 4
+   ; finish the B and C quadwords left in flight by dequant 4
+   paddw mm4,mm6 ;                       C11 mm6 free (4th+)
+   pminsw mm4,[ebx] ;            C12 saturates to +2047 (5th+)
+   pandn mm7,[eax] ;              B9 offset = isZero ? 0 : quant_add (2nd)
+         mov     esi,[esp]       ; restore esi
+   pxor mm5, mm4 ;                       C13 (6th+)
+   paddw mm7,mm3 ;                       B10  offset +negate back (3rd)
+   movq [edx+4*24+16], mm5 ; C14 (7th)
+   paddw mm2,mm7 ;                       B11 mm7 free (4th+)
+   pminsw mm2,[ebx] ;            B12 saturates to +2047 (5th+)
+         mov     ebx,[esp+4]     ; restore ebx
+         mov     edi,[esp+8]     ; restore edi
+         add esp,byte 12
+   pxor mm3, mm2 ;                       B13 (6th+)
+   movq [edx+4*24+8], mm3 ;      B14 (7th)
+   ret

 Legend:



Removed from v.1.1
 


changed lines


 
Added in v.1.2
 Legend:



Removed from v.1.1
 


changed lines


 
Added in v.1.2
-Removed from v.1.1
+Added in v.1.2

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4