Diff of /xvidcore/src/dct/x86_asm/fdct_mmx_ffmpeg.asm

-revision 1.1, Mon Oct 27 01:03:06 2003 UTC
+revision 1.2, Mon Mar 22 22:36:23 2004 UTC
-Line 0
+Line 1
+ ;/****************************************************************************
+ ; *
+ ; *  XVID MPEG-4 VIDEO CODEC
+ ; *  - MMX and XMM forward discrete cosine transform -
+ ; *
+ ; *  Copyright(C) 2003 Edouard Gomez <ed.gomez@free.fr>
+ ; *
+ ; *  This program is free software; you can redistribute it and/or modify it
+ ; *  under the terms of the GNU General Public License as published by
+ ; *  the Free Software Foundation; either version 2 of the License, or
+ ; *  (at your option) any later version.
+ ; *
+ ; *  This program is distributed in the hope that it will be useful,
+ ; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ; *  GNU General Public License for more details.
+ ; *
+ ; *  You should have received a copy of the GNU General Public License
+ ; *  along with this program; if not, write to the Free Software
+ ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ ; *
+ ; * $Id$
+ ; *
+ ; ***************************************************************************/
+ ;/****************************************************************************
+ ; *
+ ; *  Initial, but incomplete version provided by Intel at AppNote AP-922
+ ; *    http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+ ; *  Copyright (C) 1999 Intel Corporation
+ ; *
+ ; *  Completed and corrected in fdctmm32.c/fdctmm32.doc
+ ; *    http://members.tripod.com/~liaor/
+ ; *  Copyright (C) 2000 - Royce Shih-Wea Liao <liaor@iname.com>
+ ; *
+ ; *  Minimizing coefficients reordering changing the tables constants order
+ ; *    http://ffmpeg.sourceforge.net/
+ ; *  Copyright (C) 2001 Fabrice Bellard.
+ ; *
+ ; *  The version coded here is just a port to NASM syntax from the FFMPEG's
+ ; *  version. So all credits go to the previous authors for all their
+ ; *  respective work in order to have a nice/fast mmx fDCT.
+ ; ***************************************************************************/
+ BITS 32
+ ;=============================================================================
+ ; Macros and other preprocessor constants
+ ;=============================================================================
+ ;; cglobal symbol
+ ;;  Exports the given symbol. Without PREFIX the name is exported as is.
+ ;;  With PREFIX the exported name gets a leading underscore, and the plain
+ ;;  name is redefined to expand to the underscored one from then on.
+ %macro cglobal 1
+         %ifndef PREFIX
+                 global %1
+         %else
+                 global _%1
+                 %define %1 _%1
+         %endif
+ %endmacro
+ ;;; UNROLLED_LOOP selects the unrolled row pass in MAKE_FDCT_FUNC; remove the
+ ;;; definition to get the looping variant instead
+ %define UNROLLED_LOOP
+ ;;; Shift counts of the column and row passes and their rounding constants
+ ;;; (each rounding constant is half of the value 1 << shift)
+ %assign BITS_FRW_ACC   3
+ %assign SHIFT_FRW_COL  BITS_FRW_ACC
+ %assign SHIFT_FRW_ROW  (BITS_FRW_ACC + 17)
+ %assign RND_FRW_ROW    (1 << (SHIFT_FRW_ROW - 1))
+ %assign RND_FRW_COL    (1 << (SHIFT_FRW_COL - 1))
+ ;=============================================================================
+ ; Local Data (Read Only)
+ ;=============================================================================
+ ;; FORMAT_COFF builds declare the section without the align= qualifier
+ %ifdef FORMAT_COFF
+ SECTION .rodata data
+ %else
+ SECTION .rodata data align=16
+ %endif
+ ;; Coefficient table of the row pass: 8 rows of 32 words (64 bytes) each.
+ ;; MAKE_FDCT_FUNC hands the slice at tab_frw_01234567 + 2*32*i to the row
+ ;; macro for row i. Within a slice the row macros multiply words 0..15 with
+ ;; the sums of mirrored inputs and words 16..31 with their differences.
+ ALIGN 8
+ tab_frw_01234567:
+   ;; row 0
+   dw  16384,   16384,   -8867,  -21407
+   dw  16384,   16384,   21407,    8867
+   dw  16384,  -16384,   21407,   -8867
+   dw -16384,   16384,    8867,  -21407
+   dw  22725,   19266,  -22725,  -12873
+   dw  12873,    4520,   19266,   -4520
+   dw  12873,  -22725,   19266,  -22725
+   dw   4520,   19266,    4520,  -12873
+   ;; row 1
+   dw  22725,   22725,  -12299,  -29692
+   dw  22725,   22725,   29692,   12299
+   dw  22725,  -22725,   29692,  -12299
+   dw -22725,   22725,   12299,  -29692
+   dw  31521,   26722,  -31521,  -17855
+   dw  17855,    6270,   26722,   -6270
+   dw  17855,  -31521,   26722,  -31521
+   dw   6270,   26722,    6270,  -17855
+   ;; row 2
+   dw  21407,   21407,  -11585,  -27969
+   dw  21407,   21407,   27969,   11585
+   dw  21407,  -21407,   27969,  -11585
+   dw -21407,   21407,   11585,  -27969
+   dw  29692,   25172,  -29692,  -16819
+   dw  16819,    5906,   25172,   -5906
+   dw  16819,  -29692,   25172,  -29692
+   dw   5906,   25172,    5906,  -16819
+   ;; row 3
+   dw  19266,   19266,  -10426,  -25172
+   dw  19266,   19266,   25172,   10426
+   dw  19266,  -19266,   25172,  -10426
+   dw -19266,   19266,   10426,  -25172
+   dw  26722,   22654,  -26722,  -15137
+   dw  15137,    5315,   22654,   -5315
+   dw  15137,  -26722,   22654,  -26722
+   dw   5315,   22654,    5315,  -15137
+   ;; row 4 (same values as row 0)
+   dw  16384,   16384,   -8867,  -21407
+   dw  16384,   16384,   21407,    8867
+   dw  16384,  -16384,   21407,   -8867
+   dw -16384,   16384,    8867,  -21407
+   dw  22725,   19266,  -22725,  -12873
+   dw  12873,    4520,   19266,   -4520
+   dw  12873,  -22725,   19266,  -22725
+   dw   4520,   19266,    4520,  -12873
+   ;; row 5 (same values as row 3)
+   dw  19266,   19266,  -10426,  -25172
+   dw  19266,   19266,   25172,   10426
+   dw  19266,  -19266,   25172,  -10426
+   dw -19266,   19266,   10426,  -25172
+   dw  26722,   22654,  -26722,  -15137
+   dw  15137,    5315,   22654,   -5315
+   dw  15137,  -26722,   22654,  -26722
+   dw   5315,   22654,    5315,  -15137
+   ;; row 6 (same values as row 2)
+   dw  21407,   21407,  -11585,  -27969
+   dw  21407,   21407,   27969,   11585
+   dw  21407,  -21407,   27969,  -11585
+   dw -21407,   21407,   11585,  -27969
+   dw  29692,   25172,  -29692,  -16819
+   dw  16819,    5906,   25172,   -5906
+   dw  16819,  -29692,   25172,  -29692
+   dw   5906,   25172,    5906,  -16819
+   ;; row 7 (same values as row 1)
+   dw  22725,   22725,  -12299,  -29692
+   dw  22725,   22725,   29692,   12299
+   dw  22725,  -22725,   29692,  -12299
+   dw -22725,   22725,   12299,  -29692
+   dw  31521,   26722,  -31521,  -17855
+   dw  17855,    6270,   26722,   -6270
+   dw  17855,  -31521,   26722,  -31521
+   dw   6270,   26722,    6270,  -17855
+ ;; Four words of 1; FDCT_COLUMN_COMMON ORs this into some of its results
+ ALIGN 8
+ fdct_one_corr:
+   times 4 dw 1
+ ;; Three 4-word multiplier vectors, read by FDCT_COLUMN_COMMON at word
+ ;; offsets 0, 4 and 8.
+ ;; NOTE(review): the values look like tan(pi/16), tan(2*pi/16) and
+ ;; tan(3*pi/16)-1 scaled by 2^16 - confirm against the AP-922 note
+ ALIGN 8
+ fdct_tg_all_16:
+   times 4 dw  13036
+   times 4 dw  27146
+   times 4 dw -21746
+ ;; Not referenced by any macro in this file
+ ALIGN 8
+ cos_4_16:
+   times 4 dw -19195
+ ;; Multiplier applied by FDCT_COLUMN_COMMON to the rows 1/6 and 2/5 terms
+ ALIGN 8
+ ocos_4_16:
+   times 4 dw 23170
+ ;; Rounding constant added to each dword before the row macros shift it
+ ;; right by SHIFT_FRW_ROW
+ ALIGN 8
+ fdct_r_row:
+   times 2 dd RND_FRW_ROW
+ ;=============================================================================
+ ; Factorized parts of the code turned into macros for better understanding
+ ;=============================================================================
+         ;; Macro for column DCT
+         ;; FDCT_COLUMN_COMMON(int16_t *out, const int16_t *in, int offset);
+         ;;  - out, register name holding the out address
+         ;;  - in, register name holding the in address
+         ;;  - index of the first column to process
+         ;;
+         ;; Every movq covers four 16-bit columns, so one expansion handles the
+         ;; given column and the three after it. Consecutive rows are addressed
+         ;; 16 bytes apart. Each input row is read for the last time before the
+         ;; output row with the same index is stored, so out and in may hold
+         ;; the same address (MAKE_FDCT_FUNC calls it that way).
+         ;; Overwrites mm0-mm7. Additions and subtractions saturate.
+         ;;
+         ;; Notation in the comments: rK is input row K, S is SHIFT_FRW_COL,
+         ;; tg1/tg2/tg3 are the three vectors of fdct_tg_all_16, oc is
+         ;; ocos_4_16, and a*b stands for pmulhw (high 16 bits of the signed
+         ;; 16x16 product).
+ %macro FDCT_COLUMN_COMMON 3
+   movq mm0, [%2 + %3*2 + 1*16]  ; mm0 = r1
+   movq mm1, [%2 + %3*2 + 6*16]  ; mm1 = r6
+   movq mm2, mm0                 ; mm2 = r1
+   movq mm3, [%2 + %3*2 + 2*16]  ; mm3 = r2
+   paddsw mm0, mm1               ; mm0 = r1 + r6
+   movq mm4, [%2 + %3*2 + 5*16]  ; mm4 = r5
+   psllw mm0, SHIFT_FRW_COL      ; mm0 = t1 = (r1 + r6) << S
+   movq mm5, [%2 + %3*2 + 0*16]  ; mm5 = r0
+   paddsw mm4, mm3               ; mm4 = r2 + r5
+   paddsw mm5, [%2 + %3*2 + 7*16]  ; mm5 = r0 + r7
+   psllw mm4, SHIFT_FRW_COL      ; mm4 = t2 = (r2 + r5) << S
+   movq mm6, mm0                 ; mm6 = t1
+   psubsw mm2, mm1               ; mm2 = r1 - r6
+   movq mm1, [fdct_tg_all_16 + 4*2]  ; mm1 = tg2
+   psubsw mm0, mm4               ; mm0 = tm12 = t1 - t2
+   movq mm7, [%2 + %3*2 + 3*16]  ; mm7 = r3
+   pmulhw mm1, mm0               ; mm1 = tm12*tg2
+   paddsw mm7, [%2 + %3*2 + 4*16]  ; mm7 = r3 + r4
+   psllw mm5, SHIFT_FRW_COL      ; mm5 = t0 = (r0 + r7) << S
+   paddsw mm6, mm4               ; mm6 = tp12 = t1 + t2
+   psllw mm7, SHIFT_FRW_COL      ; mm7 = t3 = (r3 + r4) << S
+   movq mm4, mm5                 ; mm4 = t0
+   psubsw mm5, mm7               ; mm5 = tm03 = t0 - t3
+   paddsw mm1, mm5               ; mm1 = tm12*tg2 + tm03
+   paddsw mm4, mm7               ; mm4 = tp03 = t0 + t3
+   por mm1, [fdct_one_corr]      ; force bit 0 of every word to 1
+   psllw mm2, SHIFT_FRW_COL + 1  ; mm2 = d16 = (r1 - r6) << (S+1)
+   pmulhw mm5, [fdct_tg_all_16 + 4*2]  ; mm5 = tm03*tg2
+   movq mm7, mm4                 ; mm7 = tp03
+   psubsw mm3, [%2 + %3*2 + 5*16]  ; mm3 = r2 - r5
+   psubsw mm4, mm6               ; mm4 = tp03 - tp12
+   movq [%1 + %3*2 + 2*16], mm1  ; out row 2 = (tm12*tg2 + tm03) | 1
+   paddsw mm7, mm6               ; mm7 = tp03 + tp12
+   movq mm1, [%2 + %3*2 + 3*16]  ; mm1 = r3
+   psllw mm3, SHIFT_FRW_COL + 1  ; mm3 = d25 = (r2 - r5) << (S+1)
+   psubsw mm1, [%2 + %3*2 + 4*16]  ; mm1 = r3 - r4
+   movq mm6, mm2                 ; mm6 = d16
+   movq [%1 + %3*2 + 4*16], mm4  ; out row 4 = tp03 - tp12
+   paddsw mm2, mm3               ; mm2 = d16 + d25
+   pmulhw mm2, [ocos_4_16]       ; mm2 = tp65 = (d16 + d25)*oc
+   psubsw mm6, mm3               ; mm6 = d16 - d25
+   pmulhw mm6, [ocos_4_16]       ; mm6 = tm65 = (d16 - d25)*oc
+   psubsw mm5, mm0               ; mm5 = tm03*tg2 - tm12
+   por mm5, [fdct_one_corr]      ; force bit 0 of every word to 1
+   psllw mm1, SHIFT_FRW_COL      ; mm1 = d34 = (r3 - r4) << S
+   por mm2, [fdct_one_corr]      ; tp65 |= 1, applied before tp65 is used
+   movq mm4, mm1                 ; mm4 = d34
+   movq mm3, [%2 + %3*2 + 0*16]  ; mm3 = r0
+   paddsw mm1, mm6               ; mm1 = tp465 = d34 + tm65
+   psubsw mm3, [%2 + %3*2 + 7*16]  ; mm3 = r0 - r7
+   psubsw mm4, mm6               ; mm4 = tm465 = d34 - tm65
+   movq mm0, [fdct_tg_all_16 + 0*2]  ; mm0 = tg1
+   psllw mm3, SHIFT_FRW_COL      ; mm3 = d07 = (r0 - r7) << S
+   movq mm6, [fdct_tg_all_16 + 8*2]  ; mm6 = tg3
+   pmulhw mm0, mm1               ; mm0 = tp465*tg1
+   movq [%1 + %3*2 + 0*16], mm7  ; out row 0 = tp03 + tp12
+   pmulhw mm6, mm4               ; mm6 = tm465*tg3
+   movq [%1 + %3*2 + 6*16], mm5  ; out row 6 = (tm03*tg2 - tm12) | 1
+   movq mm7, mm3                 ; mm7 = d07
+   movq mm5, [fdct_tg_all_16 + 8*2]  ; mm5 = tg3
+   psubsw mm7, mm2               ; mm7 = tm765 = d07 - tp65
+   paddsw mm3, mm2               ; mm3 = tp765 = d07 + tp65
+   pmulhw mm5, mm7               ; mm5 = tm765*tg3
+   paddsw mm0, mm3               ; mm0 = tp465*tg1 + tp765
+   paddsw mm6, mm4               ; mm6 = tm465*tg3 + tm465
+   pmulhw mm3, [fdct_tg_all_16 + 0*2]  ; mm3 = tp765*tg1
+   por mm0, [fdct_one_corr]      ; force bit 0 of every word to 1
+   paddsw mm5, mm7               ; mm5 = tm765*tg3 + tm765
+   psubsw mm7, mm6               ; mm7 = tm765 - (tm465*tg3 + tm465)
+   movq [%1 + %3*2 + 1*16], mm0  ; out row 1 = (tp465*tg1 + tp765) | 1
+   paddsw mm5, mm4               ; mm5 = tm765*tg3 + tm765 + tm465
+   movq [%1 + %3*2 + 3*16], mm7  ; out row 3
+   psubsw mm3, mm1               ; mm3 = tp765*tg1 - tp465
+   movq [%1 + %3*2 + 5*16], mm5  ; out row 5
+   movq [%1 + %3*2 + 7*16], mm3  ; out row 7
+ %endmacro
+         ;; Macro for row DCT using only the MMX unpack instructions
+         ;; (punpcklwd, punpckldq, punpckhdq) to reorder the input words
+         ;; FDCT_ROW_MMX(int16_t *out, const int16_t *in, const int16_t *table);
+         ;;  - out, register name or address expression of the output row
+         ;;  - in, register name or address expression of the input row
+         ;;  - table coefficients address (register or absolute)
+         ;;
+         ;; Transforms one row of eight 16-bit words using 32 table words.
+         ;; All input words are loaded before the first store, so out and in
+         ;; may be the same address. Overwrites mm0-mm7.
+         ;;
+         ;; Notation in the comments: xK is input word K, vectors are listed
+         ;; from the lowest word (or dword) to the highest, s is the vector of
+         ;; sums, d the vector of differences, s' and d' are s and d with
+         ;; their two dwords swapped, t[a..b] are table words a..b and rnd is
+         ;; fdct_r_row.
+ %macro FDCT_ROW_MMX 3
+   movd mm1, [%2 + 6*2]          ; mm1 = [x6, x7, 0, 0]
+   punpcklwd mm1, [%2 + 4*2]     ; mm1 = [x6, x4, x7, x5]
+   movq mm2, mm1                 ; mm2 = [x6, x4, x7, x5]
+   psrlq mm1, 0x20               ; mm1 = [x7, x5, 0, 0]
+   movq mm0, [%2 + 0*2]          ; mm0 = [x0, x1, x2, x3]
+   punpcklwd mm1, mm2            ; mm1 = [x7, x6, x5, x4]
+   movq mm5, mm0                 ; mm5 = [x0, x1, x2, x3]
+   paddsw mm0, mm1               ; mm0 = s = [x0+x7, x1+x6, x2+x5, x3+x4]
+   psubsw mm5, mm1               ; mm5 = d = [x0-x7, x1-x6, x2-x5, x3-x4]
+   movq mm1, mm0                 ; mm1 = s
+   movq mm6, mm5                 ; mm6 = d
+   punpckldq mm3, mm5            ; high dword of mm3 = low dword of d (the old low dword of mm3 is kept but discarded by the next line)
+   punpckhdq mm6, mm3            ; mm6 = d' = [d high dword, d low dword]
+   movq mm3, [%3 + 0*2]          ; mm3 = t[0..3]
+   movq mm4, [%3 + 4*2]          ; mm4 = t[4..7]
+   punpckldq mm2, mm0            ; high dword of mm2 = low dword of s
+   pmaddwd mm3, mm0              ; mm3 = t[0..3] . s
+   punpckhdq mm1, mm2            ; mm1 = s' = [s high dword, s low dword]
+   movq mm2, [%3 + 16*2]         ; mm2 = t[16..19]
+   pmaddwd mm4, mm1              ; mm4 = t[4..7] . s'
+   pmaddwd mm0, [%3 + 8*2]       ; mm0 = s . t[8..11]
+   movq mm7, [%3 + 20*2]         ; mm7 = t[20..23]
+   pmaddwd mm2, mm5              ; mm2 = t[16..19] . d
+   paddd mm3, [fdct_r_row]       ; mm3 += rnd
+   pmaddwd mm7, mm6              ; mm7 = t[20..23] . d'
+   pmaddwd mm1, [%3 + 12*2]      ; mm1 = s' . t[12..15]
+   paddd mm3, mm4                ; mm3 = first pair of sum-based results
+   pmaddwd mm5, [%3 + 24*2]      ; mm5 = d . t[24..27]
+   pmaddwd mm6, [%3 + 28*2]      ; mm6 = d' . t[28..31]
+   paddd mm2, mm7                ; mm2 = first pair of difference-based results (before rounding)
+   paddd mm0, [fdct_r_row]       ; mm0 += rnd
+   psrad mm3, SHIFT_FRW_ROW      ; scale the first sum pair
+   paddd mm2, [fdct_r_row]       ; mm2 += rnd
+   paddd mm0, mm1                ; mm0 = second pair of sum-based results
+   paddd mm5, [fdct_r_row]       ; mm5 += rnd
+   psrad mm2, SHIFT_FRW_ROW      ; scale the first difference pair
+   paddd mm5, mm6                ; mm5 = second pair of difference-based results
+   psrad mm0, SHIFT_FRW_ROW      ; scale the second sum pair
+   psrad mm5, SHIFT_FRW_ROW      ; scale the second difference pair
+   packssdw mm3, mm0             ; mm3 = four sum-based words (saturated)
+   packssdw mm2, mm5             ; mm2 = four difference-based words (saturated)
+   movq mm6, mm3                 ; keep a copy for the high-half interleave
+   punpcklwd mm3, mm2            ; interleave: sum-based words go to even positions, difference-based to odd
+   punpckhwd mm6, mm2            ; same interleave for the upper halves
+   movq [%1 + 0*2], mm3          ; store output words 0..3
+   movq [%1 + 4*2], mm6          ; store output words 4..7
+ %endmacro
+         ;; Macro for row DCT using the pshufw instruction to reorder words
+         ;; FDCT_ROW_XMM(int16_t *out, const int16_t *in, const int16_t *table);
+         ;;  - out, register name or address expression of the output row
+         ;;  - in, register name or address expression of the input row
+         ;;  - table coefficient address
+         ;;
+         ;; Same arithmetic as FDCT_ROW_MMX; only the word reordering differs.
+         ;; All input words are loaded before the first store, so out and in
+         ;; may be the same address. Overwrites mm0-mm7.
+         ;;
+         ;; Notation in the comments: xK is input word K, vectors are listed
+         ;; from the lowest word to the highest, s is the vector of sums, d the
+         ;; vector of differences, s' and d' are s and d with their two dwords
+         ;; swapped, t[a..b] are table words a..b.
+ %macro FDCT_ROW_XMM 3
+         ;; NOTE(review): the next line looks like the signature of the routine
+         ;; this macro was ported from; the macro itself takes out first, then
+         ;; in, then table.
+         ;; fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
+   pshufw mm5, [%2 + 4*2], 0x1B  ; mm5 = [x7, x6, x5, x4] (0x1B reverses the four words)
+   movq mm0, [%2 + 0*2]          ; mm0 = [x0, x1, x2, x3]
+   movq mm1, mm0                 ; mm1 = [x0, x1, x2, x3]
+   paddsw mm0, mm5               ; mm0 = s = [x0+x7, x1+x6, x2+x5, x3+x4]
+   psubsw mm1, mm5               ; mm1 = d = [x0-x7, x1-x6, x2-x5, x3-x4]
+   pshufw mm2, mm0, 0x4E         ; mm2 = s' (0x4E swaps the two dwords)
+   pshufw mm3, mm1, 0x4E         ; mm3 = d'
+   movq mm4, [%3 +  0*2]         ; mm4 = t[0..3]
+   movq mm6, [%3 +  4*2]         ; mm6 = t[4..7]
+   movq mm5, [%3 + 16*2]         ; mm5 = t[16..19]
+   movq mm7, [%3 + 20*2]         ; mm7 = t[20..23]
+   pmaddwd mm4, mm0              ; mm4 = t[0..3] . s
+   pmaddwd mm5, mm1              ; mm5 = t[16..19] . d
+   pmaddwd mm6, mm2              ; mm6 = t[4..7] . s'
+   pmaddwd mm7, mm3              ; mm7 = t[20..23] . d'
+   pmaddwd mm0, [%3 +  8*2]      ; mm0 = s . t[8..11]
+   pmaddwd mm2, [%3 + 12*2]      ; mm2 = s' . t[12..15]
+   pmaddwd mm1, [%3 + 24*2]      ; mm1 = d . t[24..27]
+   pmaddwd mm3, [%3 + 28*2]      ; mm3 = d' . t[28..31]
+   paddd mm4, mm6                ; mm4 = first pair of sum-based results
+   paddd mm5, mm7                ; mm5 = first pair of difference-based results
+   paddd mm0, mm2                ; mm0 = second pair of sum-based results
+   paddd mm1, mm3                ; mm1 = second pair of difference-based results
+   movq mm7, [fdct_r_row]        ; mm7 = rounding constant
+   paddd mm4, mm7                ; add the rounding constant to all four pairs
+   paddd mm5, mm7
+   paddd mm0, mm7
+   paddd mm1, mm7
+   psrad mm4, SHIFT_FRW_ROW      ; scale all four pairs
+   psrad mm5, SHIFT_FRW_ROW
+   psrad mm0, SHIFT_FRW_ROW
+   psrad mm1, SHIFT_FRW_ROW
+   packssdw mm4, mm0             ; mm4 = four sum-based words (saturated)
+   packssdw mm5, mm1             ; mm5 = four difference-based words (saturated)
+   movq mm2, mm4                 ; keep a copy for the high-half interleave
+   punpcklwd mm4, mm5            ; interleave: sum-based words go to even positions, difference-based to odd
+   punpckhwd mm2, mm5            ; same interleave for the upper halves
+   movq [%1 + 0*2], mm4          ; store output words 0..3
+   movq [%1 + 4*2], mm2          ; store output words 4..7
+ %endmacro
+         ;; MAKE_FDCT_FUNC name, row_macro
+         ;;  - name of the function to emit (exported through cglobal)
+         ;;  - row macro to use, invoked with (out, in, table)
+         ;;
+         ;; The emitted function loads its first stack argument into eax and
+         ;; transforms the block at that address in place: two column passes
+         ;; (columns 0..3, then 4..7) followed by eight row passes. Row i lives
+         ;; at byte offset 2*8*i and uses the table slice at
+         ;; tab_frw_01234567 + 2*32*i.
+         ;; The function overwrites mm0-mm7 and does not execute emms. The
+         ;; looping variant also modifies eax, ecx, edx and the flags.
+ %macro MAKE_FDCT_FUNC 2
+ ALIGN 16
+ cglobal %1
+ %1:
+         ;; Move the destination/source address to the eax register
+   mov eax, [esp + 4]
+         ;; Process the columns (4 at a time)
+   FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
+   FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
+ %ifdef UNROLLED_LOOP
+         ; Unrolled loop version
+ %assign i 0
+ %rep 8
+         ;; Process the 'i'th row
+   %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
+         %assign i i+1
+ %endrep
+ %else
+         ; Looping version: ecx counts the remaining rows, eax walks the rows
+         ; and edx walks the table slices
+   mov ecx, 8
+   mov edx, tab_frw_01234567
+ ALIGN 8
+ .loop:
+   %2 eax, eax, edx
+   add eax, 2*8
+   add edx, 2*32
+   dec ecx
+   jne .loop
+ %endif
+   ret
+ %endmacro
+ ;=============================================================================
+ ; Code
+ ;=============================================================================
+ SECTION .text
+ ;-----------------------------------------------------------------------------
+ ; void fdct_mmx_ffmpeg(int16_t block[64]);
+ ;  Transforms the block in place; the row pass uses FDCT_ROW_MMX.
+ ;  The block address is read from the stack at [esp + 4].
+ ;-----------------------------------------------------------------------------
+ MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW_MMX
+ ;-----------------------------------------------------------------------------
+ ; void fdct_xmm_ffmpeg(int16_t block[64]);
+ ;  Same as fdct_mmx_ffmpeg, but the row pass uses FDCT_ROW_XMM (pshufw).
+ ;-----------------------------------------------------------------------------
+ MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM

 Legend:



Removed from v.1.1
 


changed lines


 
Added in v.1.2
 Legend:



Removed from v.1.1
 


changed lines


 
Added in v.1.2
-Removed from v.1.1
+Added in v.1.2

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4