@@ 32 @@
 ; *
 ; *************************************************************************/

-bits 32
+%include "nasm.inc"

-%macro cglobal 1
-  %ifdef PREFIX
-    %ifdef MARK_FUNCS
-      global _%1:function %1.endfunc-%1
-      %define %1 _%1:function %1.endfunc-%1
-      %define ENDFUNC .endfunc
-    %else
-      global _%1
-      %define %1 _%1
-      %define ENDFUNC
-    %endif
-  %else
-    %ifdef MARK_FUNCS
-      global %1:function %1.endfunc-%1
-      %define ENDFUNC .endfunc
-    %else
-      global %1
-      %define ENDFUNC
-    %endif
-  %endif
-%endmacro
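The cglobal macro removed above is not lost: it moves into nasm.inc, which the new side includes in place of the hard-coded "bits 32". The macro declares an exported function symbol, prepending an underscore on targets that define PREFIX and, when MARK_FUNCS is defined, marking the symbol as an ELF function whose size runs up to the local .endfunc label; ENDFUNC is defined to match so each function can close with a single token. A minimal usage sketch (the function name here is hypothetical):

  cglobal my_func   ; "global _my_func" under PREFIX, "global my_func" otherwise
  my_func:
    ret
  ENDFUNC           ; expands to ".endfunc" only when MARK_FUNCS is defined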
|
@@ 36 @@

 ;//////////////////////////////////////////////////////////////////////

@@ 42 @@

 ;//////////////////////////////////////////////////////////////////////

-%ifdef FORMAT_COFF
-SECTION .rodata
-%else
-SECTION .rodata align=16
-%endif
+DATA

-align 16
+align SECTION_ALIGN
 Cst16:
 times 8 dw 16

-SECTION .text
+SECTION .rotext align=SECTION_ALIGN
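On the new side, the FORMAT_COFF section boilerplate collapses into the DATA macro from nasm.inc, and the hard-coded alignments become the per-target SECTION_ALIGN constant. A plausible shape for that macro, sketched from the very code it replaces (the authoritative definition lives in nasm.inc):

  %macro DATA 0
    %ifdef FORMAT_COFF
      SECTION .rodata
    %else
      SECTION .rodata align=SECTION_ALIGN
    %endif
  %endmacro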
|
|
 ;//////////////////////////////////////////////////////////////////////
 ;// mmx version
@@ 56 @@
 %macro GMC_4_SSE 2   ; %1: i   %2: out reg (mm5 or mm6)

   pcmpeqw   mm0, mm0
-  movq      mm1, [eax+2*(%1) ]        ; u0 | u1 | u2 | u3
+  movq      mm1, [_EAX+2*(%1) ]       ; u0 | u1 | u2 | u3
   psrlw     mm0, 12                   ; mask 0x000f
-  movq      mm2, [eax+2*(%1)+2*16]    ; v0 | v1 | v2 | v3
+  movq      mm2, [_EAX+2*(%1)+2*16]   ; v0 | v1 | v2 | v3

   pand      mm1, mm0                  ; u0
   pand      mm2, mm0                  ; v0
@@ 73 @@
   pmullw    mm0, mm4                  ; (16-u).(16-v)
   pmullw    mm1, mm4                  ; u .(16-v)

-  movd      mm4, [ecx+edx +%1]        ; src2
-  movd      %2,  [ecx+edx+1+%1]       ; src3
+  movd      mm4, [TMP0+TMP1 +%1]      ; src2
+  movd      %2,  [TMP0+TMP1+1+%1]     ; src3
   punpcklbw mm4, mm7
   punpcklbw %2,  mm7
   pmullw    mm2, mm4
   pmullw    mm3, %2

-  movd      mm4, [ecx +%1]            ; src0
-  movd      %2,  [ecx +1+%1]          ; src1
+  movd      mm4, [TMP0 +%1]           ; src0
+  movd      %2,  [TMP0 +1+%1]         ; src1
   punpcklbw mm4, mm7
   punpcklbw %2,  mm7
   pmullw    mm4, mm0
@@ 93 @@
   paddw     %2,  mm2
 %endmacro
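For each of the four pixels it handles per invocation, GMC_4_SSE evaluates the standard bilinear GMC blend, where u and v are the 4-bit fractional offsets unpacked from the Offsets table (hence the 0x000f mask) and src0..src3 are the four neighbouring source pixels:

  out = (16-u)*(16-v)*src0 + u*(16-v)*src1 + (16-u)*v*src2 + u*v*src3

Since the four weights always sum to 16*16 = 256, the caller only has to add the rounder and shift the word sums right by 8 to get back to pixel range.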

-align 16
+align SECTION_ALIGN
 xvid_GMC_Core_Lin_8_mmx:
-  mov eax, [esp + 8]   ; Offsets
-  mov ecx, [esp +12]   ; Src0
-  mov edx, [esp +16]   ; BpS
+  mov _EAX, prm2       ; Offsets
+  mov TMP0, prm3       ; Src0
+  mov TMP1, prm4       ; BpS

   pxor mm7, mm7

   GMC_4_SSE 0, mm5
   GMC_4_SSE 4, mm6

-; pshufw mm4, [esp +20], 01010101b   ; Rounder (bits [16..31])
-  movd mm4, [esp+20]                 ; Rounder (bits [16..31])
-  mov eax, [esp + 4]                 ; Dst
+; pshufw mm4, prm5, 01010101b        ; Rounder (bits [16..31])
+  movd mm4, prm5d                    ; Rounder (bits [16..31])
+  mov _EAX, prm1                     ; Dst
   punpcklwd mm4, mm4
   punpckhdq mm4, mm4

@@ 115 @@
   psrlw mm5, 8
   psrlw mm6, 8
   packuswb mm5, mm6
-  movq [eax], mm5
+  movq [_EAX], mm5

   ret
 ENDFUNC
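The only substantive change inside the functions is how arguments are reached: explicit [esp + N] loads become the prmN macros and eax/ecx/edx become _EAX/TMP0/TMP1, so the same source now assembles under both 32-bit and 64-bit calling conventions. On ia32 the mapping can be pictured like this (a sketch inferred from the replaced lines; the authoritative definitions are in nasm.inc):

  %define _EAX  eax
  %define TMP0  ecx
  %define TMP1  edx
  %define prm1  [esp + 4]         ; Dst
  %define prm2  [esp + 8]         ; Offsets
  %define prm3  [esp +12]         ; Src0
  %define prm4  [esp +16]         ; BpS
  %define prm5d dword [esp +20]   ; Rounder; the d suffix selects the 32-bit view

On x86_64 the same names resolve to registers according to the platform ABI. The movd/punpcklwd/punpckhdq triple broadcasts the rounder word (bits [16..31]) into all four words of mm4; the commented-out pshufw would do the same in one instruction, but pshufw requires SSE, presumably why the _mmx entry point leaves it disabled.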
@@ 126 @@
 %macro GMC_8_SSE2 1

   pcmpeqw   xmm0, xmm0
-  movdqa    xmm1, [eax ]        ; u...
+  movdqa    xmm1, [_EAX ]       ; u...
   psrlw     xmm0, 12            ; mask = 0x000f
-  movdqa    xmm2, [eax+2*16]    ; v...
+  movdqa    xmm2, [_EAX+2*16]   ; v...
   pand      xmm1, xmm0
   pand      xmm2, xmm0

@@ 143 @@
   pmullw    xmm1, xmm4          ; u .(16-v)

 %if (%1!=0)   ; SSE41
-  pmovzxbw  xmm4, [ecx+edx ]      ; src2
-  pmovzxbw  xmm5, [ecx+edx+1]     ; src3
+  pmovzxbw  xmm4, [TMP0+TMP1 ]    ; src2
+  pmovzxbw  xmm5, [TMP0+TMP1+1]   ; src3
 %else
-  movq      xmm4, [ecx+edx ]      ; src2
-  movq      xmm5, [ecx+edx+1]     ; src3
+  movq      xmm4, [TMP0+TMP1 ]    ; src2
+  movq      xmm5, [TMP0+TMP1+1]   ; src3
   punpcklbw xmm4, xmm7
   punpcklbw xmm5, xmm7
 %endif
@@ 155 @@
   pmullw    xmm3, xmm5

 %if (%1!=0)   ; SSE41
-  pmovzxbw  xmm4, [ecx ]      ; src0
-  pmovzxbw  xmm5, [ecx +1]    ; src1
+  pmovzxbw  xmm4, [TMP0 ]     ; src0
+  pmovzxbw  xmm5, [TMP0 +1]   ; src1
 %else
-  movq      xmm4, [ecx ]      ; src0
-  movq      xmm5, [ecx +1]    ; src1
+  movq      xmm4, [TMP0 ]     ; src0
+  movq      xmm5, [TMP0 +1]   ; src1
   punpcklbw xmm4, xmm7
   punpcklbw xmm5, xmm7
 %endif
@@ 172 @@
   paddw     xmm5, xmm2
 %endmacro
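GMC_8_SSE2 is the eight-pixel-wide version of the same blend; its single parameter only selects the load sequence. The SSE4.1 branch fuses into one instruction the byte load plus zero-extension that the SSE2 branch spells out against the zeroed xmm7, roughly (sketch):

  pmovzxbw  xmm4, [TMP0]   ; SSE4.1: load 8 bytes, zero-extend to 8 words
  ; is equivalent to
  movq      xmm4, [TMP0]   ; load 8 bytes into the low quadword
  punpcklbw xmm4, xmm7     ; interleave with zero (xmm7 = 0) -> 8 words

The SSE4.1 form also drops the dependency on xmm7, which is presumably why xvid_GMC_Core_Lin_8_sse41 below skips the pxor xmm7, xmm7 entirely.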

-align 16
+align SECTION_ALIGN
 xvid_GMC_Core_Lin_8_sse2:
-  mov eax, [esp + 8]   ; Offsets
-  mov ecx, [esp +12]   ; Src0
-  mov edx, [esp +16]   ; BpS
+  mov _EAX, prm2       ; Offsets
+  mov TMP0, prm3       ; Src0
+  mov TMP1, prm4       ; BpS

   pxor xmm7, xmm7

   GMC_8_SSE2 0

-  movd xmm4, [esp +20]
+  movd xmm4, prm5d
   pshuflw xmm4, xmm4, 01010101b   ; Rounder (bits [16..31])
   punpckldq xmm4, xmm4
-  mov eax, [esp + 4]   ; Dst
+  mov _EAX, prm1       ; Dst

   paddw xmm5, xmm4
   psrlw xmm5, 8
   packuswb xmm5, xmm5
-  movq [eax], xmm5
+  movq [_EAX], xmm5

   ret
 ENDFUNC
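Here the rounder broadcast uses SSE2 shuffles instead: movd loads the 32-bit argument, pshuflw with 01010101b replicates word 1 (bits [16..31]) across the four low words, and punpckldq then fills all eight words of xmm4. The closing paddw/psrlw/packuswb sequence rounds the word sums, scales them back down by 256, and packs the eight results to bytes.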

-align 16
+align SECTION_ALIGN
 xvid_GMC_Core_Lin_8_sse41:
-  mov eax, [esp + 8]   ; Offsets
-  mov ecx, [esp +12]   ; Src0
-  mov edx, [esp +16]   ; BpS
+  mov _EAX, prm2       ; Offsets
+  mov TMP0, prm3       ; Src0
+  mov TMP1, prm4       ; BpS

   GMC_8_SSE2 1

-  movd xmm4, [esp +20]
+  movd xmm4, prm5d
   pshuflw xmm4, xmm4, 01010101b   ; Rounder (bits [16..31])
   punpckldq xmm4, xmm4
-  mov eax, [esp + 4]   ; Dst
+  mov _EAX, prm1       ; Dst

   paddw xmm5, xmm4
   psrlw xmm5, 8
   packuswb xmm5, xmm5
-  movq [eax], xmm5
+  movq [_EAX], xmm5

   ret
 ENDFUNC