[cvs] / xvidcore / src / dct / x86_asm / simple_idct_mmx.asm Repository:
ViewVC logotype

View of /xvidcore/src/dct/x86_asm/simple_idct_mmx.asm

Parent Directory Parent Directory | Revision Log Revision Log

Revision - (download) (annotate)
Mon Oct 27 01:03:06 2003 UTC (20 years, 7 months ago) by edgomez
Branch: dev-api-4
Changes since +214 -173 lines
* Ported the ffmpeg fDCT functions (mmx and xmm).
* Modified the skal's versions a bit to allow rolling loops.
* Activated Skal's fDCTs (unrolled versions) for mmx _and_ xmm
  (old code was ignoring xmm versions)
* Removed the SSE2 versions (they'll be back later)
* .data -> .rodata
* Applied announced asm CodingStyle to the dct dir
  (I'll have to add a section with the said CodingStyle)
; * Simple IDCT MMX
; *
; * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
; *
; * This library is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2 of the License, or (at your option) any later version.
; *
; * This library is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with this library; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
; *
; * Ported to nasm by Peter Ross <pross@xvid.org>
; */


; Macros and other preprocessor constants

; cglobal <symbol>
;
; Declares <symbol> as a global (exported) symbol.
;   * PREFIX defined:     exports "_<symbol>" and redefines <symbol> so that
;                         every later use of the bare name (including the
;                         "<symbol>:" label) expands to the prefixed name.
;   * PREFIX not defined: exports "<symbol>" unchanged.
%macro cglobal 1
	%ifdef PREFIX
		global _%1
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

; Fixed-point parameters of the transform.
;
; ROW_SHIFT / COL_SHIFT name the right-shift amounts of the two passes; the
; macro invocations further down pass the same values as the literals 11 and 20.
; ROW_SHIFT is also used to build the rounding constants at the start of the
; coefficient table (1 << (ROW_SHIFT-1)).
%define ROW_SHIFT 11
%define COL_SHIFT 20
; C0..C7: cosine constants scaled by sqrt(2) * 2^14, where i is the constant's
; index (Ci).  Each comment shows the real value before truncation to integer.
; Note that C4 is rounded down (- 0.5) rather than up, giving 16383.
%define C0 23170	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 23170.475006
%define C1 22725	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 22725.260826
%define C2 21407	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 21406.727617
%define C3 19266	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 19265.545870
%define C4 16383	;cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 = 16384.000000
%define C5 12873	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 12872.826198
%define C6 8867		;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 8866.956905
%define C7 4520		;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 4520.335430

; Data (Read Only)

SECTION .rodata

; Trigonometric Tables
;
; The three labels below are the symbols the code addresses as [wm1010],
; [d40000] and [coeffs+N]; the trailing number on each coeffs row is its byte
; offset N from the coeffs label.

; Word mask: keeps words 1 and 3 of a quadword and clears words 0 and 2.
; DC_COND_IDCT uses it to ignore the two low-position words when testing a row
; group for "nothing else is non-zero".
ALIGN 16
wm1010:
	dw 0, 0xffff, 0, 0xffff

; Constant added (to the low dword only) in the DC-only path of DC_COND_IDCT
; before its arithmetic right shift.
ALIGN 16
d40000:
	dd 0x40000, 0

ALIGN 16
coeffs:
	; Row-pass rounding constants, added with paddd before the shift.
	; [coeffs]   is used by Z_COND_IDCT, [coeffs+8] by DC_COND_IDCT.
	dw	1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0		; 0
	dw	1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0		; 8

	; Multiplier pairs for pmaddwd (each row holds the same pair twice).
	dw	C4,  C4,  C4,  C4		; 16
	dw	C4, -C4,  C4, -C4		; 24
	dw	C2,  C6,  C2,  C6		; 32
	dw	C6, -C2,  C6, -C2		; 40
	dw	C1,  C3,  C1,  C3		; 48
	dw	C5,  C7,  C5,  C7		; 56
	dw	C3, -C7,  C3, -C7		; 64
	dw	-C1, -C5, -C1, -C5		; 72
	dw	C5, -C1,  C5, -C1		; 80
	dw	C7,  C3,  C7,  C3		; 88
	dw	C7, -C5,  C7, -C5		; 96
	dw	C3, -C1,  C3, -C1		; 104

; Helper macros


; DC_COND_IDCT src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; First-pass transform of one group of four input quadwords, with a shortcut
; for the case where only the words excluded by the wm1010 mask (the "r0"/"R0"
; lanes in the comments below) are non-zero.
;
;   src0,src4,src1,src5  address expressions of the four input quadwords
;   dst                  address expression of the 32-byte output area
;                        (quadwords are written at dst, dst+8, dst+16, dst+24)
;   rounder_op/arg       instruction and source operand applied to the two
;                        even-part accumulators before they are combined
;                        (the callers in this file pass paddd, [coeffs+8])
;   shift                arithmetic right shift applied to every result
;
; Clobbers: eax, mm0-mm7, flags.
; The branch targets are macro-local (%%) labels, so the macro can be expanded
; any number of times without label clashes.
%macro	DC_COND_IDCT	8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm3,[src5]				; R7	R5	r7	r5
	; OR together everything except the words masked out of mm0; a zero
	; result selects the shortcut path.
	movq mm4,[wm1010]
	pand mm4,mm0
	por mm4,mm1
	por mm4,mm2
	por mm4,mm3
	packssdw mm4,mm4
	movd eax,mm4
	or eax,eax
	jz near %%dc_only
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1
	rounder_op	mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	paddd mm4,mm5				; A0		a0
	psubd mm6,mm5				; A3		a3
	movq mm5,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm5,mm3				; C7R7+C5R5	C7r7+C5r5
	rounder_op	mm0, rounder_arg
	paddd mm1,mm0				; A1		a1
	paddd mm0,mm0
	psubd mm0,mm1				; A2		a2
	pmaddwd mm2,[coeffs+64]		; -C7R3+C3R1	-C7r3+C3r1
	paddd mm7,mm5				; B0		b0
	movq mm5,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm5,mm3				; -C5R7-C1R5	-C5r7-C1r5
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	paddd mm5,mm2				; B1		b1
	psrad mm7,shift
	psrad mm4,shift
	movq mm2,mm1				; A1		a1
	paddd mm1,mm5				; A1+B1		a1+b1
	psubd mm2,mm5				; A1-B1		a1-b1
	psrad mm1,shift
	psrad mm2,shift
	packssdw mm7,mm1			; A1+B1	a1+b1	A0+B0	a0+b0
	packssdw mm2,mm4			; A0-B0	a0-b0	A1-B1	a1-b1
	movq [dst],mm7
	movq mm1,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+80]		;-C1	C5	-C1 	C5
	movq [dst + 24],mm2
	pmaddwd	mm4,mm1				; -C1R3+C5R1	-C1r3+C5r1
	movq mm7,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm1,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1
	pmaddwd mm7,mm3				; C3R7+C7R5	C3r7+C7r5
	movq mm2,mm0				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5
	paddd mm4,mm7				; B2		b2
	paddd mm2,mm4				; A2+B2		a2+b2
	psubd mm0,mm4				; A2-B2		a2-b2
	psrad mm2,shift
	psrad mm0,shift
	movq mm4,mm6				; A3		a3
	paddd mm3,mm1				; B3		b3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm4,mm3				; A3-B3		a3-b3
	psrad mm6,shift
	packssdw mm2,mm6			; A3+B3	a3+b3	A2+B2	a2+b2
	movq [ dst + 8],mm2
	psrad mm4,shift
	packssdw mm4,mm0			; A2-B2	a2-b2	A3-B3	a3-b3
	movq [ dst + 16],mm4
	jmp short %%done
%%dc_only:
	; Shortcut: derive one value per dword of mm0 (low word moved to the high
	; half, constant added, shifted right by 13), pack it and replicate the
	; same quadword into all four output slots.
	pslld mm0,16
	paddd mm0,[d40000]
	psrad mm0,13
	packssdw mm0,mm0
	movq [ dst ],mm0
	movq [ dst + 8],mm0
	movq [ dst + 16],mm0
	movq [ dst + 24],mm0
%%done:
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro


; Z_COND_IDCT src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift, bt
;
; First-pass transform of one group of four input quadwords.  If all four
; quadwords are zero, nothing is computed or stored and control jumps to the
; label given as `bt`; otherwise the 32 bytes at dst..dst+31 are written and
; execution falls through.
;
;   src0,src4,src1,src5  address expressions of the four input quadwords
;   dst                  address expression of the 32-byte output area
;   rounder_op/arg       instruction and source operand applied to the two
;                        even-part accumulators (callers pass paddd, [coeffs])
;   shift                arithmetic right shift applied to every result
;   bt                   branch target taken when the whole group is zero
;
; Clobbers: eax, mm0-mm7, flags.
%macro	Z_COND_IDCT	9
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
%define	bt			%9
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm3,[src5]				; R7	R5	r7	r5
	; OR all inputs together; an all-zero group is skipped entirely.
	movq mm4,mm0
	por mm4,mm1
	por mm4,mm2
	por mm4,mm3
	packssdw mm4,mm4
	movd eax,mm4
	or eax,eax
	jz near bt
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1
	rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	paddd mm4,mm5				; A0		a0
	psubd mm6,mm5				; A3		a3
	movq mm5,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm5,mm3				; C7R7+C5R5	C7r7+C5r5
	rounder_op mm0, rounder_arg
	paddd mm1,mm0				; A1		a1
	paddd mm0,mm0
	psubd mm0,mm1				; A2		a2
	pmaddwd mm2,[coeffs+64]		; -C7R3+C3R1	-C7r3+C3r1
	paddd mm7,mm5				; B0		b0
	movq mm5,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm5,mm3				; -C5R7-C1R5	-C5r7-C1r5
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	paddd mm5,mm2				; B1		b1
	psrad mm7,shift
	psrad mm4,shift
	movq mm2,mm1				; A1		a1
	paddd mm1,mm5				; A1+B1		a1+b1
	psubd mm2,mm5				; A1-B1		a1-b1
	psrad mm1,shift
	psrad mm2,shift
	packssdw mm7,mm1			; A1+B1	a1+b1	A0+B0	a0+b0
	packssdw mm2,mm4			; A0-B0	a0-b0	A1-B1	a1-b1
	movq [ dst ],mm7
	movq mm1,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+80]		; -C1	C5	-C1 	C5
	movq [ dst + 24 ],mm2
	pmaddwd mm4,mm1				; -C1R3+C5R1	-C1r3+C5r1
	movq mm7,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm1,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1
	pmaddwd mm7,mm3				; C3R7+C7R5	C3r7+C7r5
	movq mm2,mm0				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5
	paddd mm4,mm7				; B2		b2
	paddd mm2,mm4				; A2+B2		a2+b2
	psubd mm0,mm4				; A2-B2		a2-b2
	psrad mm2,shift
	psrad mm0,shift
	movq mm4,mm6				; A3		a3
	paddd mm3,mm1				; B3		b3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm4,mm3				; A3-B3		a3-b3
	psrad mm6,shift
	packssdw mm2,mm6			; A3+B3	a3+b3	A2+B2	a2+b2
	movq [ dst + 8],mm2
	psrad mm4,shift
	packssdw mm4,mm0			; A2-B2	a2-b2	A3-B3	a3-b3
	movq [dst + 16],mm4
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%undef	bt
%endmacro


; IDCT0 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Second-pass transform, general case: all four input quadwords are read.
; Produces eight 32-bit stores (two packed 16-bit results each) at
; dst + 16*k for k = 0..7.
;
;   src0,src4,src1,src5  address expressions of the four input quadwords
;   dst                  address expression of the first output dword
;   rounder_op/arg       accepted for signature compatibility with the
;                        first-pass macros; the rounding steps are commented
;                        out here, so both arguments are unused
;   shift                arithmetic right shift applied to every result
;
; Clobbers: mm0-mm7.
%macro	IDCT0		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm3,[src5]				; R7	R5	r7	r5
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	; rounder_op mm0, rounder_arg
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1
	paddd mm4,mm5				; A0		a0
	psubd mm6,mm5				; A3		a3
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	paddd mm0,mm1				; A1		a1
	psubd mm5,mm1				; A2		a2
	movq mm1,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm1,mm3				; C7R7+C5R5	C7r7+C5r5
	pmaddwd mm2,[coeffs+64]		; -C7R3+C3R1	-C7r3+C3r1
	paddd mm7,mm1				; B0		b0
	movq mm1,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm1,mm3				; -C5R7-C1R5	-C5r7-C1r5
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	paddd mm1,mm2				; B1		b1
	psrad mm7,shift
	psrad mm4,shift
	movq mm2,mm0				; A1		a1
	paddd mm0,mm1				; A1+B1		a1+b1
	psubd mm2,mm1				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm2,shift
	packssdw mm7,mm7			; A0+B0	a0+b0
	movd [ dst ],mm7
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [ dst + 16],mm0
	packssdw mm2,mm2			; A1-B1	a1-b1
	movd [ dst + 96 ],mm2
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [ dst + 112],mm4
	movq mm0,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+80]		; -C1	C5	-C1 	C5
	pmaddwd mm4,mm0				; -C1R3+C5R1	-C1r3+C5r1
	movq mm7,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm0,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1
	pmaddwd mm7,mm3				; C3R7+C7R5	C3r7+C7r5
	movq mm2,mm5				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5
	paddd mm4,mm7				; B2		b2
	paddd mm2,mm4				; A2+B2		a2+b2
	psubd mm5,mm4				; A2-B2		a2-b2
	psrad mm2,shift
	psrad mm5,shift
	movq mm4,mm6				; A3		a3
	paddd mm3,mm0				; B3		b3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm4,mm3				; A3-B3		a3-b3
	psrad mm6,shift
	psrad mm4,shift
	packssdw mm2,mm2			; A2+B2	a2+b2
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [ dst + 32 ],mm2
	packssdw mm4,mm4			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [ dst + 48 ],mm6
	movd [ dst + 64 ],mm4
	movd [ dst + 80 ],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro


; IDCT4 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Second-pass transform specialised for the case where the src1 quadword is
; known to be zero: only src0, src4 and src5 are read, and the src1 terms are
; dropped from the odd part.  Produces eight 32-bit stores at dst + 16*k,
; k = 0..7.
;
; src1, rounder_op and rounder_arg are accepted for signature compatibility
; and are unused.  `shift` is the arithmetic right shift applied to results.
;
; Clobbers: mm0-mm7.
%macro	IDCT4		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm3,[src5]				; R7	R5	r7	r5
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	; rounder_op mm0, rounder_arg
	paddd mm4,mm5				; A0		a0
	psubd mm6,mm5				; A3		a3
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	paddd mm0,mm1				; A1		a1
	psubd mm5,mm1				; A2		a2
	movq mm1,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm1,mm3				; C7R7+C5R5	C7r7+C5r5	(= B0)
	movq mm7,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm7,mm3				; -C5R7-C1R5	-C5r7-C1r5	(= B1)
	paddd mm1,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm1				; A0-B0		a0-b0
	psrad mm1,shift
	psrad mm4,shift
	movq mm2,mm0				; A1		a1
	paddd mm0,mm7				; A1+B1		a1+b1
	psubd mm2,mm7				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm2,shift
	packssdw mm1,mm1			; A0+B0	a0+b0
	movd [ dst ],mm1
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [ dst + 16 ],mm0
	packssdw mm2,mm2			; A1-B1	a1-b1
	movd [ dst + 96 ],mm2
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [ dst + 112 ],mm4
	movq mm1,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm1,mm3				; C3R7+C7R5	C3r7+C7r5	(= B2)
	movq mm2,mm5				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5	(= B3)
	paddd mm2,mm1				; A2+B2		a2+b2
	psubd mm5,mm1				; A2-B2		a2-b2
	psrad mm2,shift
	psrad mm5,shift
	movq mm1,mm6				; A3		a3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm1,mm3				; A3-B3		a3-b3
	psrad mm6,shift
	psrad mm1,shift
	packssdw mm2,mm2			; A2+B2	a2+b2
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [dst + 32],mm2
	packssdw mm1,mm1			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [dst + 48],mm6
	movd [dst + 64],mm1
	movd [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro


; IDCT6 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Second-pass transform specialised for the case where the src4 and src1
; quadwords are known to be zero: only src0 and src5 are read.  Produces eight
; 32-bit stores at dst + 16*k, k = 0..7.
;
; src4, src1, rounder_op and rounder_arg are accepted for signature
; compatibility and are unused.  `shift` is the arithmetic right shift applied
; to results.
;
; Clobbers: mm0-mm7.
%macro	IDCT6		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm3,[src5]				; R7	R5	r7	r5
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0	(= A0 = A3)
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0	(= A1 = A2)
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	; rounder_op mm0, rounder_arg
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	movq mm1,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm1,mm3				; C7R7+C5R5	C7r7+C5r5	(= B0)
	movq mm7,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm7,mm3				; -C5R7-C1R5	-C5r7-C1r5	(= B1)
	paddd mm1,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm1				; A0-B0		a0-b0
	psrad mm1,shift
	psrad mm4,shift
	movq mm2,mm0				; A1		a1
	paddd mm0,mm7				; A1+B1		a1+b1
	psubd mm2,mm7				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm2,shift
	packssdw mm1,mm1			; A0+B0	a0+b0
	movd [ dst ],mm1
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [ dst + 16 ],mm0
	packssdw mm2,mm2			; A1-B1	a1-b1
	movd [ dst + 96 ],mm2
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [ dst + 112 ],mm4
	movq mm1,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm1,mm3				; C3R7+C7R5	C3r7+C7r5	(= B2)
	movq mm2,mm5				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5	(= B3)
	paddd mm2,mm1				; A2+B2		a2+b2
	psubd mm5,mm1				; A2-B2		a2-b2
	psrad mm2,shift
	psrad mm5,shift
	movq mm1,mm6				; A3		a3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm1,mm3				; A3-B3		a3-b3
	psrad mm6,shift
	psrad mm1,shift
	packssdw mm2,mm2			; A2+B2	a2+b2
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [dst + 32],mm2
	packssdw mm1,mm1			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [dst + 48],mm6
	movd [dst + 64],mm1
	movd [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro


; IDCT2 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Second-pass transform specialised for the case where the src4 quadword is
; known to be zero: only src0, src1 and src5 are read, so the even part
; reduces to the two C4 products.  Produces eight 32-bit stores at
; dst + 16*k, k = 0..7.
;
; src4, rounder_op and rounder_arg are accepted for signature compatibility
; and are unused.  `shift` is the arithmetic right shift applied to results.
;
; Clobbers: mm0-mm7.
%macro	IDCT2		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm3,[src5]				; R7	R5	r7	r5
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0	(= A0 = A3)
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0	(= A1 = A2)
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	; rounder_op mm0, rounder_arg
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	movq mm1,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm1,mm3				; C7R7+C5R5	C7r7+C5r5
	pmaddwd mm2,[coeffs+64]		; -C7R3+C3R1	-C7r3+C3r1
	paddd mm7,mm1				; B0		b0
	movq mm1,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm1,mm3				; -C5R7-C1R5	-C5r7-C1r5
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	paddd mm1,mm2				; B1		b1
	psrad mm7,shift
	psrad mm4,shift
	movq mm2,mm0				; A1		a1
	paddd mm0,mm1				; A1+B1		a1+b1
	psubd mm2,mm1				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm2,shift
	packssdw mm7,mm7			; A0+B0	a0+b0
	movd [dst],mm7
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [dst + 16],mm0
	packssdw mm2,mm2			; A1-B1	a1-b1
	movd [dst + 96],mm2
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [dst + 112],mm4
	movq mm0,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+80]		; -C1	C5	-C1 	C5
	pmaddwd mm4,mm0				; -C1R3+C5R1	-C1r3+C5r1
	movq mm7,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm0,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1
	pmaddwd mm7,mm3				; C3R7+C7R5	C3r7+C7r5
	movq mm2,mm5				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5
	paddd mm4,mm7				; B2		b2
	paddd mm2,mm4				; A2+B2		a2+b2
	psubd mm5,mm4				; A2-B2		a2-b2
	psrad mm2,shift
	psrad mm5,shift
	movq mm4,mm6				; A3		a3
	paddd mm3,mm0				; B3		b3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm4,mm3				; A3-B3		a3-b3
	psrad mm6,shift
	psrad mm4,shift
	packssdw mm2,mm2			; A2+B2	a2+b2
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [dst + 32],mm2
	packssdw mm4,mm4			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [dst + 48],mm6
	movd [dst + 64],mm4
	movd [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro


; IDCT3 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Second-pass transform specialised for the case where the src4 and src5
; quadwords are known to be zero: only src0 and src1 are read.  Produces eight
; 32-bit stores at dst + 16*k, k = 0..7.
;
; src4, src5, rounder_op and rounder_arg are accepted for signature
; compatibility and are unused.  `shift` is the arithmetic right shift applied
; to results.
;
; Clobbers: mm0-mm7.
%macro	IDCT3		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0	(= A0 = A3)
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0	(= A1 = A2)
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	; rounder_op mm0, rounder_arg
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1	(= B0)
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	movq mm3,[coeffs+64]		; -C7	C3	-C7	C3
	pmaddwd mm3,mm2				; -C7R3+C3R1	-C7r3+C3r1	(= B1)
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	psrad mm7,shift
	psrad mm4,shift
	movq mm1,mm0				; A1		a1
	paddd mm0,mm3				; A1+B1		a1+b1
	psubd mm1,mm3				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm1,shift
	packssdw mm7,mm7			; A0+B0	a0+b0
	movd [dst],mm7
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [dst + 16],mm0
	packssdw mm1,mm1			; A1-B1	a1-b1
	movd [dst + 96],mm1
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [dst + 112],mm4
	movq mm4,[coeffs+80]		; -C1	C5	-C1 	C5
	pmaddwd mm4,mm2				; -C1R3+C5R1	-C1r3+C5r1	(= B2)
	pmaddwd mm2,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1	(= B3)
	movq mm1,mm5				; A2		a2
	paddd mm1,mm4				; A2+B2		a2+b2
	psubd mm5,mm4				; A2-B2		a2-b2
	psrad mm1,shift
	psrad mm5,shift
	movq mm4,mm6				; A3		a3
	paddd mm6,mm2				; A3+B3		a3+b3
	psubd mm4,mm2				; A3-B3		a3-b3
	psrad mm6,shift
	psrad mm4,shift
	packssdw mm1,mm1			; A2+B2	a2+b2
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [dst + 32],mm1
	packssdw mm4,mm4			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [dst + 48],mm6
	movd [dst + 64],mm4
	movd [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro


; IDCT5 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Second-pass transform specialised for the case where the src1 and src5
; quadwords are known to be zero, so there is no odd part at all and the
; outputs are mirror-symmetric (slot k equals slot 7-k).
;
; Unlike the other IDCTn macros, one expansion handles TWO adjacent quadword
; columns: it reads [src0], [src4] and also [src0 + 8], [src4 + 8], and writes
; full quadwords (movq) at dst + 16*k, k = 0..7.  This is why the callers
; invoke it for every other column only.
;
; src1, src5, rounder_op and rounder_arg are accepted for signature
; compatibility and are unused.  `shift` is the arithmetic right shift applied
; to results.
;
; Clobbers: mm0-mm7.
%macro	IDCT5		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	; --- first column pair ---
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	paddd mm4,mm5				; A0		a0
	; rounder_op mm0, rounder_arg
	psubd mm6,mm5				; A3		a3
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	paddd mm0,mm1				; A1		a1
	psubd mm5,mm1				; A2		a2
	; --- second column pair (8 bytes further on) ---
	movq mm2,[src0 + 8]			; R4	R0	r4	r0
	movq mm3,[src4 + 8]			; R6	R2	r6	r2
	movq mm1,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm1,mm2				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm2,mm7				; -C4R4+C4R0	-C4r4+C4r0
	movq mm7,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm7,mm3				; C6R6+C2R2	C6r6+C2r2
	pmaddwd mm3,[coeffs+40]		; -C2R6+C6R2	-C2r6+C6r2
	; rounder_op mm1, rounder_arg
	paddd mm7,mm1				; A0		a0
	paddd mm1,mm1				; 2C0		2c0
	; rounder_op mm2, rounder_arg
	psubd mm1,mm7				; A3		a3
	paddd mm3,mm2				; A1		a1
	paddd mm2,mm2				; 2C1		2c1
	psubd mm2,mm3				; A2		a2
	; --- shift, pack both pairs together and store symmetric outputs ---
	psrad mm4,shift
	psrad mm7,shift
	psrad mm3,shift
	packssdw mm4,mm7			; A0	a0
	movq [dst],mm4
	psrad mm0,shift
	packssdw mm0,mm3			; A1	a1
	movq [dst + 16],mm0
	movq [dst + 96],mm0
	movq [dst + 112],mm4
	psrad mm5,shift
	psrad mm6,shift
	psrad mm2,shift
	packssdw mm5,mm2			; A2	a2
	movq [dst + 32],mm5
	psrad mm1,shift
	packssdw mm6,mm1			; A3	a3
	movq [dst + 48],mm6
	movq [dst + 64],mm6
	movq [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro


; IDCT1 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Second-pass transform specialised for the case where the src5 quadword is
; known to be zero: only src0, src4 and src1 are read, and the src5 terms are
; dropped from the odd part.  Produces eight 32-bit stores at dst + 16*k,
; k = 0..7.
;
; src5, rounder_op and rounder_arg are accepted for signature compatibility
; and are unused.  `shift` is the arithmetic right shift applied to results.
;
; Clobbers: mm0-mm7.
%macro	IDCT1		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	; rounder_op mm0, rounder_arg
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1	(= B0)
	paddd mm4,mm5				; A0		a0
	psubd mm6,mm5				; A3		a3
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	paddd mm0,mm1				; A1		a1
	psubd mm5,mm1				; A2		a2
	movq mm1,[coeffs+64]		; -C7	C3	-C7	C3
	pmaddwd mm1,mm2				; -C7R3+C3R1	-C7r3+C3r1	(= B1)
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	psrad mm7,shift
	psrad mm4,shift
	movq mm3,mm0				; A1		a1
	paddd mm0,mm1				; A1+B1		a1+b1
	psubd mm3,mm1				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm3,shift
	packssdw mm7,mm7			; A0+B0	a0+b0
	movd [dst],mm7
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [dst + 16],mm0
	packssdw mm3,mm3			; A1-B1	a1-b1
	movd [dst + 96],mm3
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [dst + 112],mm4
	movq mm4,[coeffs+80]		; -C1	C5	-C1 	C5
	pmaddwd mm4,mm2				; -C1R3+C5R1	-C1r3+C5r1	(= B2)
	pmaddwd mm2,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1	(= B3)
	movq mm3,mm5				; A2		a2
	paddd mm3,mm4				; A2+B2		a2+b2
	psubd mm5,mm4				; A2-B2		a2-b2
	psrad mm3,shift
	psrad mm5,shift
	movq mm4,mm6				; A3		a3
	paddd mm6,mm2				; A3+B3		a3+b3
	psubd mm4,mm2				; A3-B3		a3-b3
	psrad mm6,shift
	packssdw mm3,mm3			; A2+B2	a2+b2
	movd [dst + 32],mm3
	psrad mm4,shift
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [dst + 48],mm6
	packssdw mm4,mm4			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [dst + 64],mm4
	movd [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro


; IDCT7 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Second-pass transform specialised for the case where only the src0 quadword
; carries data: the result is just the two C4 products, replicated over the
; eight output slots.
;
; Like IDCT5, one expansion handles TWO adjacent quadword columns: it reads
; [src0] and [src0 + 8] and writes full quadwords (movq) at dst + 16*k,
; k = 0..7, which is why the callers invoke it for every other column only.
;
; src4, src1, src5, rounder_op and rounder_arg are accepted for signature
; compatibility and are unused.  `shift` is the arithmetic right shift applied
; to results.
;
; Clobbers: mm0, mm1, mm2, mm4, mm5, mm7.
%macro	IDCT7		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	; rounder_op mm4, rounder_arg
	; rounder_op mm0, rounder_arg
	psrad mm4,shift
	psrad mm0,shift
	movq mm2,[src0 + 8]			; R4	R0	r4	r0
	movq mm1,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm1,mm2				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm2,mm7				; -C4R4+C4R0	-C4r4+C4r0
	; NOTE(review): mm7 is reloaded here but not read again inside this
	; macro; kept as in the original instruction stream.
	movq mm7,[coeffs+32]		; C6	C2	C6	C2
	; rounder_op mm1, rounder_arg
	; rounder_op mm2, rounder_arg
	psrad mm1,shift
	packssdw mm4,mm1			; A0	a0
	movq [dst],mm4
	psrad mm2,shift
	packssdw mm0,mm2			; A1	a1
	movq [dst + 16],mm0
	movq [dst + 96],mm0
	movq [dst + 112],mm4
	movq [dst + 32],mm0
	movq [dst + 48],mm4
	movq [dst + 64],mm4
	movq [dst + 80],mm0
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro

; Permutation helpers

; XLODA dest_index, src_index
; Starts a permutation cycle: copies word [src_index] into word [dest_index]
; of the array at srcP, leaving the displaced (old) destination word in ax.
; Indices are in 16-bit elements.  Clobbers ax, bx.
%macro XLODA 2 
	mov	bx, [srcP+2*%2]  	; get src contents
	mov	ax, [srcP+2*%1]  	; get dest contents
	mov	[srcP+2*%1], bx     ; store new dest val
%endmacro

; XCHGA dest_index, src_index
; Continues a permutation cycle when the value in flight is held in bx:
; saves the old word [dest_index] in ax, then stores bx there.
; The second argument is not used by the code; it documents which element the
; value in bx came from.  Clobbers ax.
%macro XCHGA 2 
	mov	ax, [srcP+2*%1]  	; get dest contents
	mov	[srcP+2*%1], bx     ; store new dest val
%endmacro

; XCHGB dest_index, src_index
; Continues a permutation cycle when the value in flight is held in ax:
; saves the old word [dest_index] in bx, then stores ax there.
; The second argument is not used by the code; it documents which element the
; value in ax came from.  Clobbers bx.
%macro XCHGB 2 
	mov	bx, [srcP+2*%1]	    ; get dest contents
	mov	[srcP+2*%1], ax     ; store new dest val
%endmacro

; XSTRA dest_index, src_index
; Ends a permutation cycle when the value in flight is held in bx: stores bx
; into word [dest_index].  The second argument is documentation only.
%macro XSTRA 2 
	mov	[srcP+2*%1], bx     ; store dest val
%endmacro

; XSTRB dest_index, src_index
; Ends a permutation cycle when the value in flight is held in ax: stores ax
; into word [dest_index].  The second argument is documentation only.
%macro XSTRB 2 
	mov	[srcP+2*%1], ax     ; store dest val
%endmacro

; Permutation macro

; PERMUTEP base
;
; Permutes, in place, the 64 16-bit elements of the array whose address is in
; register `base`.  The permutation is written out as its disjoint cycles;
; each cycle starts with XLODA, continues with alternating XCHGB / XCHGA
; (the element in flight alternates between ax and bx), and is closed by
; XSTRA or XSTRB.  In every line the first number is the destination index and
; the second the source index, i.e. "X.. d, s" means new[d] = old[s].
; Fixed points of the permutation appear only as commented-out lines.
;
; ebx is saved and restored around the sequence; eax (ax) is clobbered.
; `base` must not be eax or ebx, and must not be esp-relative (the push of ebx
; moves esp for the duration of the macro).
%macro PERMUTEP 1 
%define	srcP		%1
	push ebx

;	XCHGA  0x00, 0x00      ; nothing to do
	XLODA  0x08, 0x01 
	XCHGB  0x10, 0x08
	XCHGA  0x20, 0x10 
	XCHGB  0x02, 0x20
	XCHGA  0x04, 0x02 
	XSTRB  0x01, 0x04

	XLODA  0x09, 0x03
	XCHGB  0x18, 0x09 
	XCHGA  0x12, 0x18 
	XCHGB  0x24, 0x12 
	XSTRA  0x03, 0x24 

	XLODA  0x0C, 0x05
	XCHGB  0x11, 0x0C 
	XCHGA  0x28, 0x11 
	XCHGB  0x30, 0x28
	XCHGA  0x22, 0x30 
	XCHGB  0x06, 0x22 
	XSTRA  0x05, 0x06

	XLODA  0x0D, 0x07 
	XCHGB  0x1C, 0x0D 
	XCHGA  0x13, 0x1C 
	XCHGB  0x29, 0x13 
	XCHGA  0x38, 0x29
	XCHGB  0x32, 0x38
	XCHGA  0x26, 0x32
	XSTRB  0x07, 0x26 

	XLODA  0x14, 0x0A 
	XCHGB  0x21, 0x14 
	XSTRA  0x0A, 0x21 

	XLODA  0x19, 0x0B 
	XCHGB  0x1A, 0x19 
	XCHGA  0x16, 0x1A
	XCHGB  0x25, 0x16 
	XCHGA  0x0E, 0x25 
	XCHGB  0x15, 0x0E 
	XCHGA  0x2C, 0x15 
	XCHGB  0x31, 0x2C
	XCHGA  0x2A, 0x31
	XCHGB  0x34, 0x2A
	XCHGA  0x23, 0x34 
	XSTRB  0x0B, 0x23 

	XLODA  0x1D, 0x0F 
	XCHGB  0x1E, 0x1D 
	XCHGA  0x17, 0x1E
	XCHGB  0x2D, 0x17 
	XCHGA  0x3C, 0x2D
	XCHGB  0x33, 0x3C
	XCHGA  0x2B, 0x33 
	XCHGB  0x39, 0x2B
	XCHGA  0x3A, 0x39
	XCHGB  0x36, 0x3A
	XCHGA  0x27, 0x36 
	XSTRB  0x0F, 0x27 

;	XCHGA  0x1B, 0x1B 

;	XCHGA  0x1F, 0x1F 

	XLODA  0x35, 0x2E
	XSTRB  0x2E, 0x35 

	XLODA  0x3D, 0x2F 
	XCHGB  0x3E, 0x3D
	XCHGA  0x37, 0x3E
	XSTRB  0x2F, 0x37 

;	XCHGA  0x3B, 0x3B

;	XCHGA  0x3F, 0x3F
    pop  ebx
%undef	srcP
%endmacro

;  Code


; void simple_idct_mmx_P(int16_t * const block)
; expects input data to be permutated
;
; In-place 8x8 inverse DCT on the 64 int16 coefficients at `block`.
;
; ABI:     32-bit, argument on the stack ([esp+4] on entry).
; Stack:   128 bytes of scratch for the first-pass output.
; Clobbers: eax, edx, mm0-mm7, flags.  No emms is executed here, so the MMX
;          state is left active for the caller to clear.
;
; Structure: the first pass transforms the input in four 32-byte groups
; (edx+0, +32, +64, +96) into the scratch area.  Group 0 always runs
; (DC_COND_IDCT); groups 1..3 are skipped when entirely zero (Z_COND_IDCT),
; and each skip branches to a second-pass variant that ignores the
; corresponding scratch slot:
;
;   label    groups known zero    second pass
;   (none)   -                    IDCT0
;   .four    1                    IDCT4
;   .two     2                    IDCT2
;   .one     3                    IDCT1
;   .six     1, 2                 IDCT6
;   .five    1, 3                 IDCT5
;   .three   2, 3                 IDCT3
;   .seven   1, 2, 3              IDCT7
;
; The shift arguments 11 and 20 equal ROW_SHIFT and COL_SHIFT.

SECTION .text

cglobal simple_idct_mmx_P

ALIGN 16
simple_idct_mmx_P:
	sub esp, 128				; scratch for first-pass results
	mov edx, [esp+128+4]		; edx = block

;				src0,	src4,	src1,	src5,	dst,	rndop,	rndarg,		shift,	bt
	DC_COND_IDCT edx+0,	edx+8,	edx+16,	edx+24,	esp,	paddd,	[coeffs+8],	11
	Z_COND_IDCT	edx+32,	edx+40,	edx+48,	edx+56,	esp+32,	paddd,	[coeffs],	11,		.four
	Z_COND_IDCT	edx+64,	edx+72,	edx+80,	edx+88,	esp+64,	paddd,	[coeffs],	11,		.two
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.one
	IDCT0		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT0		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT0		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT0		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

.four:							; group 1 is zero
	Z_COND_IDCT	edx+64,	edx+72,	edx+80,	edx+88,	esp+64,	paddd,	[coeffs],	11,		.six
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.five
	IDCT4		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT4		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT4		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT4		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

.six:							; groups 1 and 2 are zero
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.seven
	IDCT6		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT6		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT6		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT6		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

.two:							; group 2 is zero
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.three
	IDCT2		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT2		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT2		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT2		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

.three:							; groups 2 and 3 are zero
	IDCT3		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT3		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT3		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT3		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

.five:							; groups 1 and 3 are zero
	; IDCT5 processes two quadword columns per expansion, hence only the
	; even-numbered invocations are needed.
	IDCT5		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	; IDCT5		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT5		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	; IDCT5		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

.one:							; group 3 is zero
	IDCT1		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT1		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT1		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT1		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

.seven:							; groups 1, 2 and 3 are zero
	; IDCT7 processes two quadword columns per expansion (see IDCT5 above).
	IDCT7		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	; IDCT7		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT7		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	; IDCT7		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20

.ret:
	add esp, 128				; release scratch
	ret


; void simple_idct_mmx(int16_t * const block)
; simple_idct_mmx is the same function as simple_idct_mmx_P above except that
; on entry it will do a fast in-line and in-place permutation on the iDCT parm
; list.  This means that same parm list will also not have to be copied on the
; way out. - trbarry 6/2003
;
; ABI:     32-bit, argument on the stack ([esp+4] on entry).
; Stack:   128 bytes of scratch for the first-pass output (plus a transient
;          push/pop of ebx inside PERMUTEP).
; Clobbers: eax, edx, mm0-mm7, flags.  ebx is preserved.  No emms is executed
;          here, so the MMX state is left active for the caller to clear.
;
; After the permutation the flow is: first pass over four 32-byte input
; groups into the scratch area, then a second-pass variant chosen by which of
; groups 1..3 were entirely zero:
;
;   label     groups known zero    second pass
;   (none)    -                    IDCT0
;   .fourP    1                    IDCT4
;   .twoP     2                    IDCT2
;   .oneP     3                    IDCT1
;   .sixP     1, 2                 IDCT6
;   .fiveP    1, 3                 IDCT5
;   .threeP   2, 3                 IDCT3
;   .sevenP   1, 2, 3              IDCT7
;
; The shift arguments 11 and 20 equal ROW_SHIFT and COL_SHIFT.

SECTION .text

cglobal simple_idct_mmx

ALIGN 16
simple_idct_mmx:
	sub esp, 128				; scratch for first-pass results
	mov edx, [esp+128+4]		; edx = block
	PERMUTEP edx			; permute parm list in place

;				src0,	src4,	src1,	src5,	dst,	rndop,	rndarg,		shift,	bt
	DC_COND_IDCT edx+0,	edx+8,	edx+16,	edx+24,	esp,	paddd,	[coeffs+8],	11
	Z_COND_IDCT	edx+32,	edx+40,	edx+48,	edx+56,	esp+32,	paddd,	[coeffs],	11,		.fourP
	Z_COND_IDCT	edx+64,	edx+72,	edx+80,	edx+88,	esp+64,	paddd,	[coeffs],	11,		.twoP
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.oneP
	IDCT0		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT0		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT0		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT0		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.retP

.fourP:							; group 1 is zero
	Z_COND_IDCT	edx+64,	edx+72,	edx+80,	edx+88,	esp+64,	paddd,	[coeffs],	11,		.sixP
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.fiveP
	IDCT4		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT4		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT4		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT4		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.retP

.sixP:							; groups 1 and 2 are zero
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.sevenP
	IDCT6		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT6		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT6		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT6		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.retP

.twoP:							; group 2 is zero
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.threeP
	IDCT2		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT2		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT2		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT2		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.retP

.threeP:						; groups 2 and 3 are zero
	IDCT3		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT3		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT3		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT3		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.retP

.fiveP:							; groups 1 and 3 are zero
	; IDCT5 processes two quadword columns per expansion, hence only the
	; even-numbered invocations are needed.
	IDCT5		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	; IDCT5		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT5		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	; IDCT5		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.retP

.oneP:							; group 3 is zero
	IDCT1		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT1		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT1		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT1		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.retP

.sevenP:						; groups 1, 2 and 3 are zero
	; IDCT7 processes two quadword columns per expansion (see IDCT5 above).
	IDCT7		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	; IDCT7		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT7		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	; IDCT7		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20

.retP:
	add	esp, 128				; release scratch
	ret


No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4