;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - MMX and XMM forward discrete cosine transform -
; *
; *  Copyright(C) 2003 Edouard Gomez
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
; *
; * $Id: fdct_mmx_ffmpeg.asm,v 1.2 2004/03/22 22:36:23 edgomez Exp $
; *
; ***************************************************************************/

;/****************************************************************************
; *
; *  Initial, but incomplete version provided by Intel at AppNote AP-922
; *  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
; *  Copyright (C) 1999 Intel Corporation
; *
; *  Completed and corrected in fdctmm32.c/fdctmm32.doc
; *  http://members.tripod.com/~liaor/
; *  Copyright (C) 2000 - Royce Shih-Wea Liao
; *
; *  Coefficient reordering minimized by changing the order of the table
; *  constants
; *  http://ffmpeg.sourceforge.net/
; *  Copyright (C) 2001 Fabrice Bellard.
; *
; *  The version coded here is just a port to NASM syntax from the FFMPEG's
; *  version. So all credits go to the previous authors for all their
; *  respective work in order to have a nice/fast mmx fDCT.
; ***************************************************************************/

;; Syntax: NASM (Intel operand order), 32-bit code.
;; ABI:    the single argument is read from the stack at [esp + 4].

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

;; cglobal <name>
;; Exports <name>. When PREFIX is defined the exported symbol gets a leading
;; underscore and <name> is redefined so that later uses pick up the
;; prefixed spelling.
%macro cglobal 1
  %ifdef PREFIX
    global _%1
    %define %1 _%1
  %else
    global %1
  %endif
%endmacro

;;; Define this if you want an unrolled version of the code
%define UNROLLED_LOOP

;; Fixed-point scaling used by the two passes:
;;  - the column pass shifts its inputs left by SHIFT_FRW_COL bits,
;;  - the row pass adds RND_FRW_ROW and shifts right by SHIFT_FRW_ROW bits.
%define BITS_FRW_ACC   3
%define SHIFT_FRW_COL  BITS_FRW_ACC
%define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17)
%define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
%define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))

;=============================================================================
; Local Data (Read Only)
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif

;; Row pass coefficients: one 32-word (64-byte) table per row, stored
;; back to back. Row i uses the table at tab_frw_01234567 + 2*32*i, so
;; every table must be exactly 32 words long.
ALIGN 8
tab_frw_01234567:
  ;; row 0
  dw  16384,  16384,  -8867, -21407
  dw  16384,  16384,  21407,   8867
  dw  16384, -16384,  21407,  -8867
  dw -16384,  16384,   8867, -21407
  dw  22725,  19266, -22725, -12873
  dw  12873,   4520,  19266,  -4520
  dw  12873, -22725,  19266, -22725
  dw   4520,  19266,   4520, -12873

  ;; row 1
  dw  22725,  22725, -12299, -29692
  dw  22725,  22725,  29692,  12299
  dw  22725, -22725,  29692, -12299
  dw -22725,  22725,  12299, -29692
  dw  31521,  26722, -31521, -17855
  dw  17855,   6270,  26722,  -6270
  dw  17855, -31521,  26722, -31521
  dw   6270,  26722,   6270, -17855

  ;; row 2
  dw  21407,  21407, -11585, -27969
  dw  21407,  21407,  27969,  11585
  dw  21407, -21407,  27969, -11585
  dw -21407,  21407,  11585, -27969
  dw  29692,  25172, -29692, -16819
  dw  16819,   5906,  25172,  -5906
  dw  16819, -29692,  25172, -29692
  dw   5906,  25172,   5906, -16819

  ;; row 3
  dw  19266,  19266, -10426, -25172
  dw  19266,  19266,  25172,  10426
  dw  19266, -19266,  25172, -10426
  dw -19266,  19266,  10426, -25172
  dw  26722,  22654, -26722, -15137
  dw  15137,   5315,  22654,  -5315
  dw  15137, -26722,  22654, -26722
  dw   5315,  22654,   5315, -15137

  ;; row 4 (same coefficients as row 0)
  dw  16384,  16384,  -8867, -21407
  dw  16384,  16384,  21407,   8867
  dw  16384, -16384,  21407,  -8867
  dw -16384,  16384,   8867, -21407
  dw  22725,  19266, -22725, -12873
  dw  12873,   4520,  19266,  -4520
  dw  12873, -22725,  19266, -22725
  dw   4520,  19266,   4520, -12873

  ;; row 5 (same coefficients as row 3)
  dw  19266,  19266, -10426, -25172
  dw  19266,  19266,  25172,  10426
  dw  19266, -19266,  25172, -10426
  dw -19266,  19266,  10426, -25172
  dw  26722,  22654, -26722, -15137
  dw  15137,   5315,  22654,  -5315
  dw  15137, -26722,  22654, -26722
  dw   5315,  22654,   5315, -15137

  ;; row 6 (same coefficients as row 2)
  dw  21407,  21407, -11585, -27969
  dw  21407,  21407,  27969,  11585
  dw  21407, -21407,  27969, -11585
  dw -21407,  21407,  11585, -27969
  dw  29692,  25172, -29692, -16819
  dw  16819,   5906,  25172,  -5906
  dw  16819, -29692,  25172, -29692
  dw   5906,  25172,   5906, -16819

  ;; row 7 (same coefficients as row 1)
  dw  22725,  22725, -12299, -29692
  dw  22725,  22725,  29692,  12299
  dw  22725, -22725,  29692, -12299
  dw -22725,  22725,  12299, -29692
  dw  31521,  26722, -31521, -17855
  dw  17855,   6270,  26722,  -6270
  dw  17855, -31521,  26722, -31521
  dw   6270,  26722,   6270, -17855

;; Word-wise constant 1, OR-ed into some column outputs (sets bit 0).
ALIGN 8
fdct_one_corr:
  dw 1, 1, 1, 1

;; Three 4-word multipliers used with pmulhw by the column pass, read at
;; byte offsets 0*2, 4*2 and 8*2.
ALIGN 8
fdct_tg_all_16:
  dw  13036,  13036,  13036,  13036
  dw  27146,  27146,  27146,  27146
  dw -21746, -21746, -21746, -21746

;; Not referenced by any code in this file.
ALIGN 8
cos_4_16:
  dw -19195, -19195, -19195, -19195

ALIGN 8
ocos_4_16:
  dw 23170, 23170, 23170, 23170

;; Rounding term added to each 32-bit row accumulator before the final
;; arithmetic shift right by SHIFT_FRW_ROW.
ALIGN 8
fdct_r_row:
  dd RND_FRW_ROW, RND_FRW_ROW

;=============================================================================
; Factorized parts of the code turned into macros for better understanding
;=============================================================================

  ;; Macro for column DCT
  ;; FDCT_COLUMN_COMMON(int16_t *out, const int16_t *in, int offset);
  ;;  - out, register name holding the out address
  ;;  - in, register name holding the in address
  ;;  - column number to process (four adjacent columns are handled per
  ;;    expansion, one per 16-bit lane)
  ;;
  ;; Rows are addressed with a 16-byte stride (8 words per row).
  ;; Clobbers mm0-mm7.
  ;;
  ;; The callers in this file pass the same register for out and in, so
  ;; the transform runs in place: every store to a given row is placed
  ;; after the last load of that row. Do not reorder the loads and stores.
%macro FDCT_COLUMN_COMMON 3
  movq      mm0, [%2 + %3*2 + 1*16]
  movq      mm1, [%2 + %3*2 + 6*16]
  movq      mm2, mm0
  movq      mm3, [%2 + %3*2 + 2*16]
  paddsw    mm0, mm1
  movq      mm4, [%2 + %3*2 + 5*16]
  psllw     mm0, SHIFT_FRW_COL
  movq      mm5, [%2 + %3*2 + 0*16]
  paddsw    mm4, mm3
  paddsw    mm5, [%2 + %3*2 + 7*16]
  psllw     mm4, SHIFT_FRW_COL
  movq      mm6, mm0
  psubsw    mm2, mm1
  movq      mm1, [fdct_tg_all_16 + 4*2]
  psubsw    mm0, mm4
  movq      mm7, [%2 + %3*2 + 3*16]
  pmulhw    mm1, mm0
  paddsw    mm7, [%2 + %3*2 + 4*16]
  psllw     mm5, SHIFT_FRW_COL
  paddsw    mm6, mm4
  psllw     mm7, SHIFT_FRW_COL
  movq      mm4, mm5
  psubsw    mm5, mm7
  paddsw    mm1, mm5
  paddsw    mm4, mm7
  por       mm1, [fdct_one_corr]
  psllw     mm2, SHIFT_FRW_COL + 1
  pmulhw    mm5, [fdct_tg_all_16 + 4*2]
  movq      mm7, mm4
  psubsw    mm3, [%2 + %3*2 + 5*16]
  psubsw    mm4, mm6
  movq      [%1 + %3*2 + 2*16], mm1
  paddsw    mm7, mm6
  movq      mm1, [%2 + %3*2 + 3*16]
  psllw     mm3, SHIFT_FRW_COL + 1
  psubsw    mm1, [%2 + %3*2 + 4*16]
  movq      mm6, mm2
  movq      [%1 + %3*2 + 4*16], mm4
  paddsw    mm2, mm3
  pmulhw    mm2, [ocos_4_16]
  psubsw    mm6, mm3
  pmulhw    mm6, [ocos_4_16]
  psubsw    mm5, mm0
  por       mm5, [fdct_one_corr]
  psllw     mm1, SHIFT_FRW_COL
  por       mm2, [fdct_one_corr]
  movq      mm4, mm1
  movq      mm3, [%2 + %3*2 + 0*16]
  paddsw    mm1, mm6
  psubsw    mm3, [%2 + %3*2 + 7*16]
  psubsw    mm4, mm6
  movq      mm0, [fdct_tg_all_16 + 0*2]
  psllw     mm3, SHIFT_FRW_COL
  movq      mm6, [fdct_tg_all_16 + 8*2]
  pmulhw    mm0, mm1
  movq      [%1 + %3*2 + 0*16], mm7
  pmulhw    mm6, mm4
  movq      [%1 + %3*2 + 6*16], mm5
  movq      mm7, mm3
  movq      mm5, [fdct_tg_all_16 + 8*2]
  psubsw    mm7, mm2
  paddsw    mm3, mm2
  pmulhw    mm5, mm7
  paddsw    mm0, mm3
  paddsw    mm6, mm4
  pmulhw    mm3, [fdct_tg_all_16 + 0*2]
  por       mm0, [fdct_one_corr]
  paddsw    mm5, mm7
  psubsw    mm7, mm6
  movq      [%1 + %3*2 + 1*16], mm0
  paddsw    mm5, mm4
  movq      [%1 + %3*2 + 3*16], mm7
  psubsw    mm3, mm1
  movq      [%1 + %3*2 + 5*16], mm5
  movq      [%1 + %3*2 + 7*16], mm3
%endmacro

  ;; Macro for row DCT using MMX punpcklw instructions
  ;; FDCT_ROW_MMX(int16_t *out, const int16_t *in, const int16_t *table);
  ;;  - out, register name holding the out address
  ;;  - in, register name holding the in address
  ;;  - table coefficients address (register or absolute)
  ;;
  ;; Processes one row of 8 words. Clobbers mm0-mm7.
%macro FDCT_ROW_MMX 3
  ;; Build the word-reversed upper half of the row (x7 x6 x5 x4) in mm1
  ;; using only plain MMX unpack and shift instructions.
  movd      mm1, [%2 + 6*2]
  punpcklwd mm1, [%2 + 4*2]
  movq      mm2, mm1
  psrlq     mm1, 0x20
  movq      mm0, [%2 + 0*2]
  punpcklwd mm1, mm2
  movq      mm5, mm0
  paddsw    mm0, mm1
  psubsw    mm5, mm1
  movq      mm1, mm0
  movq      mm6, mm5
  ;; The unpack pairs below produce dword-swapped copies of the sum and
  ;; difference vectors; only the halves written here are consumed later.
  punpckldq mm3, mm5
  punpckhdq mm6, mm3
  movq      mm3, [%3 + 0*2]
  movq      mm4, [%3 + 4*2]
  punpckldq mm2, mm0
  pmaddwd   mm3, mm0
  punpckhdq mm1, mm2
  movq      mm2, [%3 + 16*2]
  pmaddwd   mm4, mm1
  pmaddwd   mm0, [%3 + 8*2]
  movq      mm7, [%3 + 20*2]
  pmaddwd   mm2, mm5
  paddd     mm3, [fdct_r_row]
  pmaddwd   mm7, mm6
  pmaddwd   mm1, [%3 + 12*2]
  paddd     mm3, mm4
  pmaddwd   mm5, [%3 + 24*2]
  pmaddwd   mm6, [%3 + 28*2]
  paddd     mm2, mm7
  paddd     mm0, [fdct_r_row]
  psrad     mm3, SHIFT_FRW_ROW
  paddd     mm2, [fdct_r_row]
  paddd     mm0, mm1
  paddd     mm5, [fdct_r_row]
  psrad     mm2, SHIFT_FRW_ROW
  paddd     mm5, mm6
  psrad     mm0, SHIFT_FRW_ROW
  psrad     mm5, SHIFT_FRW_ROW
  ;; Saturate back to 16 bits and interleave the two result vectors.
  packssdw  mm3, mm0
  packssdw  mm2, mm5
  movq      mm6, mm3
  punpcklwd mm3, mm2
  punpckhwd mm6, mm2
  movq      [%1 + 0*2], mm3
  movq      [%1 + 4*2], mm6
%endmacro

  ;; Macro for row DCT using XMM instruction pshufw
  ;; FDCT_ROW_XMM(int16_t *out, const int16_t *in, const int16_t *table);
  ;;  - out, register name holding the out address
  ;;  - in, register name holding the in address
  ;;  - table coefficient address
  ;;
  ;; Same contract as FDCT_ROW_MMX; pshufw replaces the unpack/shift
  ;; shuffles. Processes one row of 8 words. Clobbers mm0-mm7.
%macro FDCT_ROW_XMM 3
  pshufw    mm5, [%2 + 4*2], 0x1B     ; upper half with its 4 words reversed
  movq      mm0, [%2 + 0*2]
  movq      mm1, mm0
  paddsw    mm0, mm5
  psubsw    mm1, mm5
  pshufw    mm2, mm0, 0x4E            ; sums with the two dwords swapped
  pshufw    mm3, mm1, 0x4E            ; differences with the two dwords swapped
  movq      mm4, [%3 + 0*2]
  movq      mm6, [%3 + 4*2]
  movq      mm5, [%3 + 16*2]
  movq      mm7, [%3 + 20*2]
  pmaddwd   mm4, mm0
  pmaddwd   mm5, mm1
  pmaddwd   mm6, mm2
  pmaddwd   mm7, mm3
  pmaddwd   mm0, [%3 + 8*2]
  pmaddwd   mm2, [%3 + 12*2]
  pmaddwd   mm1, [%3 + 24*2]
  pmaddwd   mm3, [%3 + 28*2]
  paddd     mm4, mm6
  paddd     mm5, mm7
  paddd     mm0, mm2
  paddd     mm1, mm3
  movq      mm7, [fdct_r_row]
  paddd     mm4, mm7
  paddd     mm5, mm7
  paddd     mm0, mm7
  paddd     mm1, mm7
  psrad     mm4, SHIFT_FRW_ROW
  psrad     mm5, SHIFT_FRW_ROW
  psrad     mm0, SHIFT_FRW_ROW
  psrad     mm1, SHIFT_FRW_ROW
  ;; Saturate back to 16 bits and interleave the two result vectors.
  packssdw  mm4, mm0
  packssdw  mm5, mm1
  movq      mm2, mm4
  punpcklwd mm4, mm5
  punpckhwd mm2, mm5
  movq      [%1 + 0*2], mm4
  movq      [%1 + 4*2], mm2
%endmacro

  ;; MAKE_FDCT_FUNC <function name>, <row macro>
  ;; Emits an exported function  void name(int16_t block[64])  that
  ;; transforms the block in place: two column passes (4 columns each)
  ;; followed by eight row passes done with <row macro>.
  ;;
  ;; In:    [esp + 4] = block address
  ;; Clobb: eax, mm0-mm7; additionally ecx, edx and flags when
  ;;        UNROLLED_LOOP is not defined
  ;;
  ;; No emms is executed before returning, so the MMX state is left
  ;; active. NOTE(review): callers presumably issue emms themselves
  ;; before using the x87 FPU -- confirm against the call sites.
%macro MAKE_FDCT_FUNC 2
ALIGN 16
cglobal %1
%1:
  ;; Move the destination/source address to the eax register
  mov eax, [esp + 4]

  ;; Process the columns (4 at a time)
  FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
  FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7

%ifdef UNROLLED_LOOP
  ; Unrolled loop version
%assign i 0
%rep 8
  ;; Process the 'i'th row
  %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
  %assign i i+1
%endrep
%else
  ;; Looped version: ecx = rows remaining, edx = current row table
  mov ecx, 8
  mov edx, tab_frw_01234567
ALIGN 8
.loop:
  %2 eax, eax, edx
  add eax, 2*8                  ; next row of the block (8 words)
  add edx, 2*32                 ; next row table (32 words)
  dec ecx
  jne .loop
%endif

  ret
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

;-----------------------------------------------------------------------------
; void fdct_mmx_ffmpeg(int16_t block[64]);
;-----------------------------------------------------------------------------

MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW_MMX

;-----------------------------------------------------------------------------
; void fdct_xmm_ffmpeg(int16_t block[64]);
;-----------------------------------------------------------------------------

MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM