Diff of /xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm

-revision 1.1.2.1, Tue Oct  7 13:02:35 2003 UTC
+revision 1.7, Tue Aug 19 09:06:48 2008 UTC
 Line 1
  ;/**************************************************************************
  ; *
  ; *  XVID MPEG-4 VIDEO CODEC
- ; *  - mmx quantization/dequantization -
+ ; *  - 3dne Quantization/Dequantization -
  ; *
- ; *  Copyright(C) 2001-2003 XviD Team <xvid-devel@xvid.org>
+ ; *  Copyright(C) 2002-2003 Jaan Kalda
  ; *
  ; *  This program is free software ; you can redistribute it and/or modify
  ; *  it under the terms of the GNU General Public License as published by
 Line 22
  ; * $Id$
  ; *
  ; *************************************************************************/
+ ;
  ; these 3dne functions are compatible with iSSE, but are optimized specifically for
  ; K7 pipelines
- ;
- ;------------------------------------------------------------------------------
- ; 09.12.2002  Athlon optimizations contributed by Jaan Kalda
- ;------------------------------------------------------------------------------
  ; enable dequant saturation to [-2048,2047]; for test purposes only.
  %define SATURATE
- ; data/text alignment
+ BITS 32
- %define ALIGN 16
- bits 32
- %ifdef FORMAT_COFF
- section .data data
- %else
- section .data data align=16
- %endif
  %macro cglobal 1
          %ifdef PREFIX
+                 %ifdef MARK_FUNCS
+                         global _%1:function %1.endfunc-%1
+                         %define %1 _%1:function %1.endfunc-%1
+                 %else
                  global _%1
                  %define %1 _%1
+                 %endif
+         %else
+                 %ifdef MARK_FUNCS
+                         global %1:function %1.endfunc-%1
          %else
                  global %1
          %endif
+         %endif
  %endmacro
+ ;=============================================================================
+ ; Local data
+ ;=============================================================================
+ %ifdef FORMAT_COFF
+ SECTION .rodata
+ %else
+ SECTION .rodata align=16
+ %endif
  align 4
- int_div
+ int_div:
  dd 0
  %assign i 1
  %rep 255
-Line 60
+Line 68
          %assign i i+1
  %endrep
- align 16
+ ALIGN 16
+ plus_one:
- plus_one times 8        dw       1
+         times 8 dw 1
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- ;
  ; subtract by Q/2 table
- ;
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
- %macro MMX_SUB  1
+ ALIGN 16
- times 4 dw %1 / 2
+ mmx_sub:
- %endmacro
- align 16
- mmx_sub
  %assign i 1
  %rep 31
          times 4 dw i / 2
-Line 84
+Line 85
  %endrep
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;
  ; divide by 2Q table
  ;
-Line 92
+Line 93
  ; for q=1, _pmulhw_ will overflow so it is treated separately
  ; (3dnow2 provides _pmulhuw_ which won't cause overflow)
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- align 16
- mmx_div
+ ALIGN 16
+ mmx_div:
  %assign i 1
  %rep 31
          times 4 dw  (1 << 16) / (i * 2) + 1
          %assign i i+1
  %endrep
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- ;
  ; add by (odd(Q) ? Q : Q - 1) table
- ;
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
- %macro MMX_ADD  1
- %if %1 % 2 != 0
- times 4 dw %1
- %else
- times 4 dw %1 - 1
- %endif
- %endmacro
- align 16
- mmx_add
+ ALIGN 16
+ mmx_add:
  %assign i 1
  %rep 31
-         MMX_ADD i
+         %if i % 2 != 0
+         times 4 dw i
+         %else
+         times 4 dw i - 1
+         %endif
          %assign i i+1
  %endrep
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- ;
  ; multiply by 2Q table
- ;
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
- %macro MMX_MUL  1
- times 4 dw %1 * 2
- %endmacro
- align 16
- mmx_mul
+ ALIGN 16
+ mmx_mul:
  %assign i 1
  %rep 31
          times 4 dw i * 2
          %assign i i+1
  %endrep
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- ;
  ; saturation limits
- ;
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
- align 8
+ ALIGN 8
- mmx_32768_minus_2048                            times 4 dw (32768-2048)
+ mmx_32768_minus_2048:
- mmx_32767_minus_2047                            times 4 dw (32767-2047)
+         times 4 dw (32768-2048)
+ mmx_32767_minus_2047:
+         times 4 dw (32767-2047)
- align 16
+ ALIGN 16
- mmx_2047 times 4 dw 2047
+ mmx_2047:
+         times 4 dw 2047
- align 8
+ ALIGN 8
- mmzero dd 0, 0
+ mmzero:
- int2047 dd 2047
+         dd 0, 0
- int_2048 dd -2048
+ int2047:
+         dd 2047
+ int_2048:
+         dd -2048
- section .text
+ ;=============================================================================
+ ; Code
+ ;=============================================================================
+ SECTION .text
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;
- ; void quant_intra_3dne(int16_t * coeff,
+ ; uint32_t quant_h263_intra_3dne(int16_t * coeff,
  ;                                       const int16_t * const data,
  ;                                       const uint32_t quant,
- ;                                       const uint32_t dcscalar);
+ ;                                const uint32_t dcscalar,
+ ;                                const uint16_t *mpeg_matrices);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;This is Athlon-optimized code (ca 70 clk per call)
- ;Optimized by Jaan, 30 Nov 2002
  %macro quant_intra1  1
          psubw   mm1, mm0        ;A3
          psubw   mm3, mm2        ;B3
  %if (%1)
-Line 186
+Line 179
          psubw   mm7, mm6        ;D8
  %endif
- align 8
+ ALIGN 8
          db      0Fh, 6Fh, 64h, 21h, (%1 * 32 +16)       ;movq   mm4, [ecx + %1 * 32 +16+32]     ;C1
          pmaxsw  mm1, mm0        ;A4
          db      0Fh, 6Fh, 74h, 21h, (%1 * 32 +24)       ;movq   mm6, [ecx + %1 * 32 +24+32]     ;D1
-Line 260
+Line 253
          psubw   mm7, mm6        ;D8
  %endif
- align 8
+ ALIGN 8
          db      0Fh, 6Fh, 64h, 21h, (%1 * 32 +16)       ;movq   mm4, [ecx + %1 * 32 +16+32]     ;C1
          pmaxsw  mm1, mm0        ;A4
          db      0Fh, 6Fh, 74h, 21h, (%1 * 32 +24)       ;movq   mm6, [ecx + %1 * 32 +24+32]     ;D1
-Line 327
+Line 320
  %endmacro
- align ALIGN
+ ALIGN 16
  cglobal quant_h263_intra_3dne
  quant_h263_intra_3dne:
-Line 379
+Line 372
          movq    [edx + 3 * 32 + 16], mm5        ;C9
          movq    [edx + 3 * 32 + 24], mm7        ;D9
+   xor eax, eax
          ret
-         align 16
+ ALIGN 16
  .q1loop
  quant_intra1 0
-Line 402
+Line 396
  quant_intra1 3
          psubw   mm5, mm4                        ;C8
-         mov     esi, [dword esp + 12]           ; pop back the register value
+   mov esi, [esp + 12]         ; pop back the register value
          mov     edi, [esp + 4]                  ; pop back the register value
          sar     eax, 16
          lea     ebx, [byte eax + 1]             ; workaround for eax < 0
-Line 414
+Line 408
          movq    [edx + 3 * 32 + 16], mm5        ;C9
          movq    [edx + 3 * 32 + 24], mm7        ;D9
+   xor eax, eax
          ret
+ .endfunc
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;
- ; uint32_t quant_inter_3dne(int16_t * coeff,
+ ; uint32_t quant_h263_inter_3dne(int16_t * coeff,
  ;                                       const int16_t * const data,
- ;                                       const uint32_t quant);
+ ;                                const uint32_t quant,
+ ;                                const uint16_t *mpeg_matrices);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;This is Athlon-optimized code (ca 90 clk per call)
  ;Optimized by Jaan, 30 Nov 2002
-Line 500
+Line 497
          movq    [edx + %1*16+8], mm4
  %endmacro
- align ALIGN
+ ALIGN 16
  cglobal quant_h263_inter_3dne
  quant_h263_inter_3dne:
          mov     edx, [esp  + 4]         ; coeff
-Line 517
+Line 514
          lea     eax, [mmzero]
          jz      near .q1loop
          cmp     esp, esp
- align 8
+ ALIGN 8
          movq    mm3, [ecx + 120]        ;B1
          pxor    mm4, mm4                ;B2
          psubw   mm4, mm3                ;B3
-Line 548
+Line 545
          ret
- align ALIGN
+ ALIGN 16
  .q1loop
          movq mm6, [byte ebx]
-Line 570
+Line 567
          pop ebx
          ret
+ .endfunc
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;
- ; void dequant_intra_3dne(int16_t *data,
+ ; uint32_t dequant_h263_intra_3dne(int16_t *data,
  ;                                       const int16_t * const coeff,
  ;                                       const uint32_t quant,
- ;                                       const uint32_t dcscalar);
+ ;                                  const uint32_t dcscalar,
+ ;                                  const uint16_t *mpeg_matrices);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
    ; this is the same as dequant_inter_3dne, except that we're
    ; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
-Line 604
+Line 603
          movq    mm4, [esi]              ;C1 ;0
          mov     esp, esp
          pcmpeqw mm6, [ecx+%1*24]        ;A6 (c ==0) ? -1 : 0 (1st)
- align 4
+ ALIGN 4
          psraw   mm1, 15                 ; sign(c)       ;A7 (2nd)
  %if (%1)
          movq    [edx+%1*24+16-24], mm5  ; C14 (7th) 2later
-Line 650
+Line 649
  %endmacro
- align ALIGN
+ ALIGN 16
  cglobal dequant_h263_intra_3dne
  dequant_h263_intra_3dne:
          mov     ecx, [esp+ 8]                   ; coeff
-Line 677
+Line 676
          psraw   mm3, 15                         ; sign(c)       ;B7 (2nd)
          mov     edx, [esp+ 4+16]                ; data
- align 8
+ ALIGN 8
          dequant 0
          cmp     ebp, -2048
-Line 717
+Line 716
          pxor    mm3, mm2                ;B13 (6th+)
          movq    [edx+4*24+8], mm3       ;B14 (7th)
          mov     [edx], ax
+   xor eax, eax
          ret
+ .endfunc
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;
- ; void dequant_inter_3dne(int16_t * data,
+ ; uint32_t dequant_h263_inter_3dne(int16_t * data,
  ;                                       const int16_t * const coeff,
- ;                                       const uint32_t quant);
+ ;                                  const uint32_t quant,
+ ;                                  const uint16_t *mpeg_matrices);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ; this is the same as dequant_inter_3dne,
  ; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
  ;This is Athlon-optimized code (ca 100 clk per call)
- ;Optimized by Jaan, 30 Nov 2002
- align ALIGN
+ ALIGN 16
  cglobal dequant_h263_inter_3dne
  dequant_h263_inter_3dne:
          mov     ecx, [esp+ 8]                   ; coeff
-Line 753
+Line 755
          pmaxsw  mm2, mm3                        ;|c|            ;B4 (2nd)
          pmullw  mm2, [edi]                      ;*= 2Q          ;B8 (3rd+)
          psraw   mm3, 15                         ; sign(c)       ;B7 (2nd)
-         mov     edx, [dword esp+ 4+12]          ; data
+   mov edx, [esp+ 4+12]        ; data
- align 8
+ ALIGN 8
          dequant 0
          dequant 1
-Line 778
+Line 780
          pxor    mm3, mm2                ;B13 (6th+)
          movq    [edx+4*24+8], mm3       ;B14 (7th)
+   xor eax, eax
          ret
+ .endfunc
+ %ifidn __OUTPUT_FORMAT__,elf
+ section ".note.GNU-stack" noalloc noexec nowrite progbits
+ %endif

 Legend:



Removed from v.1.1.2.1
 


changed lines


 
Added in v.1.7
 Legend:



Removed from v.1.1.2.1
 


changed lines


 
Added in v.1.7
-Removed from v.1.1.2.1
+Added in v.1.7

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4