--- fdct_mmx_ffmpeg.asm	2003/10/27 01:03:06	1.1
+++ fdct_mmx_ffmpeg.asm	2004/03/22 22:36:23	1.2
@@ -0,0 +1,414 @@
+;/****************************************************************************
+; *
+; *  XVID MPEG-4 VIDEO CODEC
+; *  - MMX and XMM forward discrete cosine transform -
+; *
+; *  Copyright(C) 2003 Edouard Gomez <ed.gomez@free.fr>
+; *
+; *  This program is free software; you can redistribute it and/or modify it
+; *  under the terms of the GNU General Public License as published by
+; *  the Free Software Foundation; either version 2 of the License, or
+; *  (at your option) any later version.
+; *
+; *  This program is distributed in the hope that it will be useful,
+; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+; *  GNU General Public License for more details.
+; *
+; *  You should have received a copy of the GNU General Public License
+; *  along with this program; if not, write to the Free Software
+; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+; *
+; * $Id: fdct_mmx_ffmpeg.asm,v 1.2 2004/03/22 22:36:23 edgomez Exp $
+; *
+; ***************************************************************************/
+
+;/****************************************************************************
+; *
+; *  Initial, but incomplete version provided by Intel at AppNote AP-922
+; *    http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+; *  Copyright (C) 1999 Intel Corporation
+; *
+; *  Completed and corrected in fdctmm32.c/fdctmm32.doc
+; *    http://members.tripod.com/~liaor/
+; *  Copyright (C) 2000 - Royce Shih-Wea Liao <liaor@iname.com>
+; *
+; *  Minimizing coefficients reordering changing the tables constants order
+; *    http://ffmpeg.sourceforge.net/
+; *  Copyright (C) 2001 Fabrice Bellard.
+; *
+; *  The version coded here is just a port to NASM syntax from the FFMPEG's
+; *  version. So all credits go to the previous authors for all their
+; *  respective work in order to have a nice/fast mmx fDCT.
+; ***************************************************************************/
+
+BITS 32		; assemble as 32-bit code; the generated functions read their argument from [esp + 4]
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1		; %1 = symbol name to export
+	%ifndef PREFIX
+		global %1	; no PREFIX: export the name unchanged
+	%else
+		global _%1	; PREFIX: export with a leading underscore, then alias the plain name to it
+		%define %1 _%1
+	%endif
+%endmacro
+
+;;; When UNROLLED_LOOP is defined the row pass is emitted unrolled; undefine it for the looped form
+%define UNROLLED_LOOP
+;;; Fixed-point shift counts and rounding terms for the column and row passes
+%define BITS_FRW_ACC   3
+%define SHIFT_FRW_ROW  (17 + BITS_FRW_ACC)
+%define SHIFT_FRW_COL  BITS_FRW_ACC
+%define RND_FRW_COL    (1 << (SHIFT_FRW_COL - 1))
+%define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW - 1))
+
+;=============================================================================
+; Local Data (Read Only)
+;=============================================================================
+
+%ifndef FORMAT_COFF
+SECTION .rodata data align=16	; default: request 16-byte section alignment
+%else
+SECTION .rodata data		; FORMAT_COFF: same section, declared without the align= qualifier
+%endif
+
+ALIGN 8
+tab_frw_01234567:	; 8 groups of 32 words (64 bytes), group i used for row i; first 4 dw lines multiply the sums, last 4 the differences -- row 0:
+  dw  16384,   16384,   -8867,  -21407
+  dw  16384,   16384,   21407,    8867
+  dw  16384,  -16384,   21407,   -8867
+  dw -16384,   16384,    8867,  -21407
+  dw  22725,   19266,  -22725,  -12873
+  dw  12873,    4520,   19266,   -4520
+  dw  12873,  -22725,   19266,  -22725
+  dw   4520,   19266,    4520,  -12873
+; row 1
+  dw  22725,   22725,  -12299,  -29692
+  dw  22725,   22725,   29692,   12299
+  dw  22725,  -22725,   29692,  -12299
+  dw -22725,   22725,   12299,  -29692
+  dw  31521,   26722,  -31521,  -17855
+  dw  17855,    6270,   26722,   -6270
+  dw  17855,  -31521,   26722,  -31521
+  dw   6270,   26722,    6270,  -17855
+; row 2
+  dw  21407,   21407,  -11585,  -27969
+  dw  21407,   21407,   27969,   11585
+  dw  21407,  -21407,   27969,  -11585
+  dw -21407,   21407,   11585,  -27969
+  dw  29692,   25172,  -29692,  -16819
+  dw  16819,    5906,   25172,   -5906
+  dw  16819,  -29692,   25172,  -29692
+  dw   5906,   25172,    5906,  -16819
+; row 3
+  dw  19266,   19266,  -10426,  -25172
+  dw  19266,   19266,   25172,   10426
+  dw  19266,  -19266,   25172,  -10426
+  dw -19266,   19266,   10426,  -25172
+  dw  26722,   22654,  -26722,  -15137
+  dw  15137,    5315,   22654,   -5315
+  dw  15137,  -26722,   22654,  -26722
+  dw   5315,   22654,    5315,  -15137
+; row 4 (same values as row 0)
+  dw  16384,   16384,   -8867,  -21407
+  dw  16384,   16384,   21407,    8867
+  dw  16384,  -16384,   21407,   -8867
+  dw -16384,   16384,    8867,  -21407
+  dw  22725,   19266,  -22725,  -12873
+  dw  12873,    4520,   19266,   -4520
+  dw  12873,  -22725,   19266,  -22725
+  dw   4520,   19266,    4520,  -12873
+; row 5 (same values as row 3)
+  dw  19266,   19266,  -10426,  -25172
+  dw  19266,   19266,   25172,   10426
+  dw  19266,  -19266,   25172,  -10426
+  dw -19266,   19266,   10426,  -25172
+  dw  26722,   22654,  -26722,  -15137
+  dw  15137,    5315,   22654,   -5315
+  dw  15137,  -26722,   22654,  -26722
+  dw   5315,   22654,    5315,  -15137
+; row 6 (same values as row 2)
+  dw  21407,   21407,  -11585,  -27969
+  dw  21407,   21407,   27969,   11585
+  dw  21407,  -21407,   27969,  -11585
+  dw -21407,   21407,   11585,  -27969
+  dw  29692,   25172,  -29692,  -16819
+  dw  16819,    5906,   25172,   -5906
+  dw  16819,  -29692,   25172,  -29692
+  dw   5906,   25172,    5906,  -16819
+; row 7 (same values as row 1)
+  dw  22725,   22725,  -12299,  -29692
+  dw  22725,   22725,   29692,   12299
+  dw  22725,  -22725,   29692,  -12299
+  dw -22725,   22725,   12299,  -29692
+  dw  31521,   26722,  -31521,  -17855
+  dw  17855,    6270,   26722,   -6270
+  dw  17855,  -31521,   26722,  -31521
+  dw   6270,   26722,    6270,  -17855
+
+ALIGN 8
+fdct_one_corr:		; word value 1 in every lane; OR-ed (por) into several column-pass results
+  times 4 dw 1
+
+ALIGN 8
+fdct_tg_all_16:		; three 4-word rows read at +0*2, +4*2, +8*2; values match tan(k*pi/16)*2^16 for k = 1, 2, 3 (k = 3 wrapped into a signed word)
+  times 4 dw  13036
+  times 4 dw  27146
+  times 4 dw -21746
+
+ALIGN 8
+cos_4_16:		; not referenced by the code visible in this file
+  times 4 dw -19195
+
+ALIGN 8
+ocos_4_16:		; pmulhw factor of the column pass; 23170 matches cos(pi/4)*2^15
+  times 4 dw 23170
+
+ALIGN 8
+fdct_r_row:		; RND_FRW_ROW in both dwords; added to the row sums before the psrad by SHIFT_FRW_ROW
+  times 2 dd RND_FRW_ROW
+
+;=============================================================================
+; Factorized parts of the code turned into macros for better understanding
+;=============================================================================
+
+	;; Column pass: 8-point transform down 4 adjacent columns at once (one 16-bit lane each); clobbers mm0-mm7
+	;; FDCT_COLUMN_COMMON out, in, column   -- rows are 16 bytes apart; adds/subs are saturating (paddsw/psubsw)
+	;;  - out, register name holding the out address (may equal in: every row is stored only after its last load)
+	;;  - in, register name holding the in address
+	;;  - column, index of the first of the 4 columns to process (scaled by 2 bytes per word)
+%macro FDCT_COLUMN_COMMON 3              ; notation: inK/outK = row K; tg1/tg2/tg3 = rows 0/1/2 of fdct_tg_all_16; "*h" = pmulhw (high word of product)
+  movq mm0, [%2 + %3*2 + 1*16]           ; mm0 = in1
+  movq mm1, [%2 + %3*2 + 6*16]           ; mm1 = in6
+  movq mm2, mm0                          ; mm2 = in1
+  movq mm3, [%2 + %3*2 + 2*16]           ; mm3 = in2
+  paddsw mm0, mm1                        ; mm0 = in1 + in6
+  movq mm4, [%2 + %3*2 + 5*16]           ; mm4 = in5
+  psllw mm0, SHIFT_FRW_COL               ; mm0 = s16 = (in1 + in6) << SHIFT_FRW_COL
+  movq mm5, [%2 + %3*2 + 0*16]           ; mm5 = in0
+  paddsw mm4, mm3                        ; mm4 = in2 + in5
+  paddsw mm5, [%2 + %3*2 + 7*16]         ; mm5 = in0 + in7
+  psllw mm4, SHIFT_FRW_COL               ; mm4 = s25 = (in2 + in5) << SHIFT_FRW_COL
+  movq mm6, mm0                          ; mm6 = s16
+  psubsw mm2, mm1                        ; mm2 = in1 - in6
+  movq mm1, [fdct_tg_all_16 + 4*2]       ; mm1 = tg2
+  psubsw mm0, mm4                        ; mm0 = a = s16 - s25
+  movq mm7, [%2 + %3*2 + 3*16]           ; mm7 = in3
+  pmulhw mm1, mm0                        ; mm1 = tg2 *h a
+  paddsw mm7, [%2 + %3*2 + 4*16]         ; mm7 = in3 + in4
+  psllw mm5, SHIFT_FRW_COL               ; mm5 = s07 = (in0 + in7) << SHIFT_FRW_COL
+  paddsw mm6, mm4                        ; mm6 = b = s16 + s25
+  psllw mm7, SHIFT_FRW_COL               ; mm7 = s34 = (in3 + in4) << SHIFT_FRW_COL
+  movq mm4, mm5                          ; mm4 = s07
+  psubsw mm5, mm7                        ; mm5 = c = s07 - s34
+  paddsw mm1, mm5                        ; mm1 = c + tg2 *h a
+  paddsw mm4, mm7                        ; mm4 = d = s07 + s34
+  por mm1, [fdct_one_corr]               ; force bit 0 of each lane
+  psllw mm2, SHIFT_FRW_COL + 1           ; mm2 = d16 = (in1 - in6) << (SHIFT_FRW_COL + 1)
+  pmulhw mm5, [fdct_tg_all_16 + 4*2]     ; mm5 = tg2 *h c
+  movq mm7, mm4                          ; mm7 = d
+  psubsw mm3, [%2 + %3*2 + 5*16]         ; mm3 = in2 - in5
+  psubsw mm4, mm6                        ; mm4 = d - b
+  movq [%1 + %3*2 + 2*16], mm1           ; out2 = (c + tg2 *h a) | 1
+  paddsw mm7, mm6                        ; mm7 = d + b
+  movq mm1, [%2 + %3*2 + 3*16]           ; mm1 = in3
+  psllw mm3, SHIFT_FRW_COL + 1           ; mm3 = d25 = (in2 - in5) << (SHIFT_FRW_COL + 1)
+  psubsw mm1, [%2 + %3*2 + 4*16]         ; mm1 = in3 - in4
+  movq mm6, mm2                          ; mm6 = d16
+  movq [%1 + %3*2 + 4*16], mm4           ; out4 = d - b
+  paddsw mm2, mm3                        ; mm2 = d16 + d25
+  pmulhw mm2, [ocos_4_16]                ; mm2 = (d16 + d25) *h ocos_4_16
+  psubsw mm6, mm3                        ; mm6 = d16 - d25
+  pmulhw mm6, [ocos_4_16]                ; mm6 = q = (d16 - d25) *h ocos_4_16
+  psubsw mm5, mm0                        ; mm5 = tg2 *h c - a
+  por mm5, [fdct_one_corr]               ; force bit 0 of each lane
+  psllw mm1, SHIFT_FRW_COL               ; mm1 = d34 = (in3 - in4) << SHIFT_FRW_COL
+  por mm2, [fdct_one_corr]               ; mm2 = p = ((d16 + d25) *h ocos_4_16) | 1
+  movq mm4, mm1                          ; mm4 = d34
+  movq mm3, [%2 + %3*2 + 0*16]           ; mm3 = in0
+  paddsw mm1, mm6                        ; mm1 = u = d34 + q
+  psubsw mm3, [%2 + %3*2 + 7*16]         ; mm3 = in0 - in7
+  psubsw mm4, mm6                        ; mm4 = v = d34 - q
+  movq mm0, [fdct_tg_all_16 + 0*2]       ; mm0 = tg1
+  psllw mm3, SHIFT_FRW_COL               ; mm3 = d07 = (in0 - in7) << SHIFT_FRW_COL
+  movq mm6, [fdct_tg_all_16 + 8*2]       ; mm6 = tg3
+  pmulhw mm0, mm1                        ; mm0 = tg1 *h u
+  movq [%1 + %3*2 + 0*16], mm7           ; out0 = d + b
+  pmulhw mm6, mm4                        ; mm6 = tg3 *h v
+  movq [%1 + %3*2 + 6*16], mm5           ; out6 = (tg2 *h c - a) | 1
+  movq mm7, mm3                          ; mm7 = d07
+  movq mm5, [fdct_tg_all_16 + 8*2]       ; mm5 = tg3
+  psubsw mm7, mm2                        ; mm7 = w = d07 - p
+  paddsw mm3, mm2                        ; mm3 = x = d07 + p
+  pmulhw mm5, mm7                        ; mm5 = tg3 *h w
+  paddsw mm0, mm3                        ; mm0 = x + tg1 *h u
+  paddsw mm6, mm4                        ; mm6 = tg3 *h v + v (the stored tg3 word is negative, so v is added back)
+  pmulhw mm3, [fdct_tg_all_16 + 0*2]     ; mm3 = tg1 *h x
+  por mm0, [fdct_one_corr]               ; force bit 0 of each lane
+  paddsw mm5, mm7                        ; mm5 = tg3 *h w + w
+  psubsw mm7, mm6                        ; mm7 = w - (tg3 *h v + v)
+  movq [%1 + %3*2 + 1*16], mm0           ; out1 = (x + tg1 *h u) | 1
+  paddsw mm5, mm4                        ; mm5 = (tg3 *h w + w) + v
+  movq [%1 + %3*2 + 3*16], mm7           ; out3 = w - (tg3 *h v + v)
+  psubsw mm3, mm1                        ; mm3 = tg1 *h x - u
+  movq [%1 + %3*2 + 5*16], mm5           ; out5 = (tg3 *h w + w) + v
+  movq [%1 + %3*2 + 7*16], mm3           ; out7 = tg1 *h x - u
+%endmacro
+
+	;; Row pass on one row of 8 words using only MMX instructions (word reversal built from punpcklwd/psrlq)
+	;; FDCT_ROW_MMX out, in, table   -- clobbers mm0-mm7
+	;;  - out, register name holding the out address
+	;;  - in, register name holding the in address
+	;;  - table coefficients address (register or absolute); 32 words are read from it
+%macro FDCT_ROW_MMX 3           ; notation: inK = word K of the row; word lists are written low lane first
+  movd mm1, [%2 + 6*2]          ; mm1 = [in6, in7, 0, 0]
+  punpcklwd mm1, [%2 + 4*2]     ; mm1 = [in6, in4, in7, in5]
+  movq mm2, mm1                 ; mm2 = copy
+  psrlq mm1, 0x20               ; mm1 = [in7, in5, 0, 0]
+  movq mm0, [%2 + 0*2]          ; mm0 = [in0, in1, in2, in3]
+  punpcklwd mm1, mm2            ; mm1 = [in7, in6, in5, in4]
+  movq mm5, mm0                 ; mm5 = [in0, in1, in2, in3]
+  paddsw mm0, mm1               ; mm0 = sums  [in0+in7, in1+in6, in2+in5, in3+in4]
+  psubsw mm5, mm1               ; mm5 = diffs [in0-in7, in1-in6, in2-in5, in3-in4]
+  movq mm1, mm0                 ; mm1 = sums
+  movq mm6, mm5                 ; mm6 = diffs
+  punpckldq mm3, mm5            ; mm3 high dword = low dword of diffs (old low dword of mm3 is not used afterwards)
+  punpckhdq mm6, mm3            ; mm6 = diffs with the two dwords swapped
+  movq mm3, [%3 + 0*2]          ; mm3 = table words 0..3
+  movq mm4, [%3 + 4*2]          ; mm4 = table words 4..7
+  punpckldq mm2, mm0            ; mm2 high dword = low dword of sums
+  pmaddwd mm3, mm0              ; mm3 = table[0..3] . sums
+  punpckhdq mm1, mm2            ; mm1 = sums with the two dwords swapped
+  movq mm2, [%3 + 16*2]         ; mm2 = table words 16..19
+  pmaddwd mm4, mm1              ; mm4 = table[4..7] . swapped sums
+  pmaddwd mm0, [%3 + 8*2]       ; mm0 = sums . table[8..11]
+  movq mm7, [%3 + 20*2]         ; mm7 = table words 20..23
+  pmaddwd mm2, mm5              ; mm2 = table[16..19] . diffs
+  paddd mm3, [fdct_r_row]       ; add rounding term
+  pmaddwd mm7, mm6              ; mm7 = table[20..23] . swapped diffs
+  pmaddwd mm1, [%3 + 12*2]      ; mm1 = swapped sums . table[12..15]
+  paddd mm3, mm4                ; mm3 = first pair of sum-derived results (plus rounding)
+  pmaddwd mm5, [%3 + 24*2]      ; mm5 = diffs . table[24..27]
+  pmaddwd mm6, [%3 + 28*2]      ; mm6 = swapped diffs . table[28..31]
+  paddd mm2, mm7                ; mm2 = first pair of diff-derived results
+  paddd mm0, [fdct_r_row]       ; add rounding term
+  psrad mm3, SHIFT_FRW_ROW      ; scale back down
+  paddd mm2, [fdct_r_row]       ; add rounding term
+  paddd mm0, mm1                ; mm0 = second pair of sum-derived results (plus rounding)
+  paddd mm5, [fdct_r_row]       ; add rounding term
+  psrad mm2, SHIFT_FRW_ROW      ; scale back down
+  paddd mm5, mm6                ; mm5 = second pair of diff-derived results (plus rounding)
+  psrad mm0, SHIFT_FRW_ROW      ; scale back down
+  psrad mm5, SHIFT_FRW_ROW      ; scale back down
+  packssdw mm3, mm0             ; mm3 = four sum-derived results as saturated words
+  packssdw mm2, mm5             ; mm2 = four diff-derived results as saturated words
+  movq mm6, mm3                 ; keep a copy for the high half
+  punpcklwd mm3, mm2            ; interleave low halves: sum-derived, diff-derived, ...
+  punpckhwd mm6, mm2            ; interleave high halves
+  movq [%1 + 0*2], mm3          ; store output words 0..3
+  movq [%1 + 4*2], mm6          ; store output words 4..7
+%endmacro
+
+	;; Row pass on one row of 8 words using the pshufw instruction for the word shuffles
+	;; FDCT_ROW_XMM out, in, table   -- clobbers mm0-mm7
+	;;  - out, register name holding the out address
+	;;  - in, register name holding the in address
+	;;  - table coefficient address; 32 words are read from it
+%macro FDCT_ROW_XMM 3           ; notation: inK = word K of the row; word lists are written low lane first
+	;; originally annotated as fdct_row_mmx2(in, out, table); this macro's argument order is out, in, table
+  pshufw mm5, [%2 + 4*2], 0x1B  ; mm5 = [in7, in6, in5, in4] (0x1B reverses the four words)
+  movq mm0, [%2 + 0*2]          ; mm0 = [in0, in1, in2, in3]
+  movq mm1, mm0                 ; mm1 = [in0, in1, in2, in3]
+  paddsw mm0, mm5               ; mm0 = sums  [in0+in7, in1+in6, in2+in5, in3+in4]
+  psubsw mm1, mm5               ; mm1 = diffs [in0-in7, in1-in6, in2-in5, in3-in4]
+  pshufw mm2, mm0, 0x4E         ; mm2 = sums with the two dwords swapped
+  pshufw mm3, mm1, 0x4E         ; mm3 = diffs with the two dwords swapped
+  movq mm4, [%3 +  0*2]         ; mm4 = table words 0..3
+  movq mm6, [%3 +  4*2]         ; mm6 = table words 4..7
+  movq mm5, [%3 + 16*2]         ; mm5 = table words 16..19
+  movq mm7, [%3 + 20*2]         ; mm7 = table words 20..23
+  pmaddwd mm4, mm0              ; mm4 = table[0..3] . sums
+  pmaddwd mm5, mm1              ; mm5 = table[16..19] . diffs
+  pmaddwd mm6, mm2              ; mm6 = table[4..7] . swapped sums
+  pmaddwd mm7, mm3              ; mm7 = table[20..23] . swapped diffs
+  pmaddwd mm0, [%3 +  8*2]      ; mm0 = sums . table[8..11]
+  pmaddwd mm2, [%3 + 12*2]      ; mm2 = swapped sums . table[12..15]
+  pmaddwd mm1, [%3 + 24*2]      ; mm1 = diffs . table[24..27]
+  pmaddwd mm3, [%3 + 28*2]      ; mm3 = swapped diffs . table[28..31]
+  paddd mm4, mm6                ; mm4 = first pair of sum-derived results
+  paddd mm5, mm7                ; mm5 = first pair of diff-derived results
+  paddd mm0, mm2                ; mm0 = second pair of sum-derived results
+  paddd mm1, mm3                ; mm1 = second pair of diff-derived results
+  movq mm7, [fdct_r_row]        ; mm7 = rounding term (RND_FRW_ROW in both dwords)
+  paddd mm4, mm7                ; add rounding to all four accumulators
+  paddd mm5, mm7
+  paddd mm0, mm7
+  paddd mm1, mm7
+  psrad mm4, SHIFT_FRW_ROW      ; scale all four accumulators back down
+  psrad mm5, SHIFT_FRW_ROW
+  psrad mm0, SHIFT_FRW_ROW
+  psrad mm1, SHIFT_FRW_ROW
+  packssdw mm4, mm0             ; mm4 = four sum-derived results as saturated words
+  packssdw mm5, mm1             ; mm5 = four diff-derived results as saturated words
+  movq mm2, mm4                 ; keep a copy for the high half
+  punpcklwd mm4, mm5            ; interleave low halves: sum-derived, diff-derived, ...
+  punpckhwd mm2, mm5            ; interleave high halves
+  movq [%1 + 0*2], mm4          ; store output words 0..3
+  movq [%1 + 4*2], mm2          ; store output words 4..7
+%endmacro
+
+%macro MAKE_FDCT_FUNC 2		; %1 = function name to define and export, %2 = row-pass macro (takes out, in, table)
+ALIGN 16
+cglobal %1
+%1:
+	;; eax = first stack argument: address of the block, transformed in place (out == in below); no emms is issued before ret
+  mov eax, [esp + 4]
+
+	;; Process the columns (4 at a time)
+  FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
+  FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
+
+%ifdef UNROLLED_LOOP
+	; Unrolled version: 8 straight-line copies of the row macro with constant offsets
+%assign i 0
+%rep 8
+	;; Process the 'i'th row: 8 words (16 bytes) of data and 32 words (64 bytes) of table per row
+  %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
+	%assign i i+1
+%endrep
+%else
+  mov ecx, 8			; ecx = rows left to process
+  mov edx, tab_frw_01234567	; edx = coefficient group of the current row
+ALIGN 8
+.loop:
+  %2 eax, eax, edx
+  add eax, 2*8			; advance to the next row of the block
+  add edx, 2*32			; advance to the next coefficient group
+  dec ecx
+  jne .loop
+%endif
+
+  ret
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void fdct_mmx_ffmpeg(int16_t block[64]);
+;-----------------------------------------------------------------------------
+; In-place transform of the block; the row pass is FDCT_ROW_MMX (plain MMX instructions only).
+MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW_MMX
+
+;-----------------------------------------------------------------------------
+; void fdct_xmm_ffmpeg(int16_t block[64]);
+;-----------------------------------------------------------------------------
+; Same column pass and coefficient table; the row pass is FDCT_ROW_XMM, which requires pshufw support.
+MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM
\ No newline at end of file