4 |
; * - GMC core functions - |
; * - GMC core functions - |
5 |
; * Copyright(C) 2006 Pascal Massimino <skal@planet-d.net> |
; * Copyright(C) 2006 Pascal Massimino <skal@planet-d.net> |
6 |
; * |
; * |
7 |
; * This file is part of XviD, a free MPEG-4 video encoder/decoder |
; * This file is part of Xvid, a free MPEG-4 video encoder/decoder |
8 |
; * |
; * |
9 |
; * XviD is free software; you can redistribute it and/or modify it |
; * Xvid is free software; you can redistribute it and/or modify it |
10 |
; * under the terms of the GNU General Public License as published by |
; * under the terms of the GNU General Public License as published by |
11 |
; * the Free Software Foundation; either version 2 of the License, or |
; * the Free Software Foundation; either version 2 of the License, or |
12 |
; * (at your option) any later version. |
; * (at your option) any later version. |
32 |
; * |
; * |
33 |
; *************************************************************************/ |
; *************************************************************************/ |
34 |
|
|
; Bring in the Xvid nasm abstraction layer.  This replaces the old
; explicit "bits 32" directive and the local cglobal macro: nasm.inc is
; expected to supply cglobal/ENDFUNC, DATA/TEXT, SECTION_ALIGN,
; PUSH_XMM6_XMM7/POP_XMM6_XMM7, the prm*/TMP*/_EAX argument-and-register
; aliases and NON_EXEC_STACK used below -- confirm against src/nasm.inc.
%include "nasm.inc"
;//////////////////////////////////////////////////////////////////////

cglobal xvid_GMC_Core_Lin_8_mmx
cglobal xvid_GMC_Core_Lin_8_sse2
cglobal xvid_GMC_Core_Lin_8_sse41

;//////////////////////////////////////////////////////////////////////

; Read-only data.  DATA (from nasm.inc) replaces the old
; FORMAT_COFF-conditional "SECTION .rodata" selection.
DATA

align SECTION_ALIGN
Cst16:                  ; eight words of 16 -- presumably the constant used
times 8 dw 16           ;  to form (16-u)/(16-v); its use is outside this view

TEXT
; NOTE(review): this span is a two-column diff scrape (line-number cell,
; old 32-bit line, new ABI-abstracted line).  Original source lines 65-72
; and 88-92 are absent from the scrape, so the macro body below is
; incomplete and cannot be reconstructed safely from this view alone.
52 |


53 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
54 |
;// mmx version |
;// mmx version |
56 |
%macro GMC_4_SSE 2 ; %1: i %2: out reg (mm5 or mm6) |
%macro GMC_4_SSE 2 ; %1: i %2: out reg (mm5 or mm6) |
57 |


58 |
pcmpeqw mm0, mm0 |
pcmpeqw mm0, mm0 |
59 |
movq mm1, [eax+2*(%1) ] ; u0 | u1 | u2 | u3 |
movq mm1, [_EAX+2*(%1) ] ; u0 | u1 | u2 | u3 |
60 |
psrlw mm0, 12 ; mask 0x000f |
psrlw mm0, 12 ; mask 0x000f |
61 |
movq mm2, [eax+2*(%1)+2*16] ; v0 | v1 | v2 | v3 |
movq mm2, [_EAX+2*(%1)+2*16] ; v0 | v1 | v2 | v3 |
62 |


63 |
pand mm1, mm0 ; u0 |
pand mm1, mm0 ; u0 |
64 |
pand mm2, mm0 ; v0 |
pand mm2, mm0 ; v0 |
; NOTE(review): original lines 65-72 (presumably the (16-u)/(16-v)
; setup that feeds mm3/mm4 below, likely using Cst16) are missing
; from the scrape here -- confirm against upstream gmc_mmx.asm.
73 |
pmullw mm0, mm4 ; (16-u).(16-v) |
pmullw mm0, mm4 ; (16-u).(16-v) |
74 |
pmullw mm1, mm4 ; u .(16-v) |
pmullw mm1, mm4 ; u .(16-v) |
75 |


76 |
movd mm4, [ecx+edx +%1] ; src2 |
movd mm4, [TMP0+TMP1 +%1] ; src2 |
77 |
movd %2, [ecx+edx+1+%1] ; src3 |
movd %2, [TMP0+TMP1+1+%1] ; src3 |
78 |
punpcklbw mm4, mm7 |
punpcklbw mm4, mm7 |
79 |
punpcklbw %2, mm7 |
punpcklbw %2, mm7 |
80 |
pmullw mm2, mm4 |
pmullw mm2, mm4 |
81 |
pmullw mm3, %2 |
pmullw mm3, %2 |
82 |


83 |
movd mm4, [ecx +%1] ; src0 |
movd mm4, [TMP0 +%1] ; src0 |
84 |
movd %2, [ecx +1+%1] ; src1 |
movd %2, [TMP0 +1+%1] ; src1 |
85 |
punpcklbw mm4, mm7 |
punpcklbw mm4, mm7 |
86 |
punpcklbw %2, mm7 |
punpcklbw %2, mm7 |
87 |
pmullw mm4, mm0 |
pmullw mm4, mm0 |
; NOTE(review): original lines 88-92 (presumably the accumulation of the
; weighted taps into %2) are missing from the scrape here.
93 |
paddw %2, mm2 |
paddw %2, mm2 |
94 |
%endmacro |
%endmacro |
; NOTE(review): diff scrape of the MMX entry point; the port maps
; eax/ecx/edx -> _EAX/TMP0/TMP1 and [esp+..] arguments -> prm1..prm5d
; (nasm.inc ABI wrappers).  Original lines 113-114 are missing from the
; scrape -- by symmetry with the SSE2 function below they likely add the
; rounder (paddw mm5, mm4 / paddw mm6, mm4); confirm against upstream
; before relying on this listing.
95 |


96 |
align 16 |
align SECTION_ALIGN |
97 |
xvid_GMC_Core_Lin_8_mmx: |
xvid_GMC_Core_Lin_8_mmx: |
98 |
mov eax, [esp + 8] ; Offsets |
mov _EAX, prm2 ; Offsets |
99 |
mov ecx, [esp +12] ; Src0 |
mov TMP0, prm3 ; Src0 |
100 |
mov edx, [esp +16] ; BpS |
mov TMP1, prm4 ; BpS |
101 |


102 |
pxor mm7, mm7 |
pxor mm7, mm7 |
103 |


104 |
GMC_4_SSE 0, mm5 |
GMC_4_SSE 0, mm5 |
105 |
GMC_4_SSE 4, mm6 |
GMC_4_SSE 4, mm6 |
106 |


107 |
; pshufw mm4, [esp +20], 01010101b ; Rounder (bits [16..31]) |
; pshufw mm4, prm5d, 01010101b ; Rounder (bits [16..31]) |
108 |
movd mm4, [esp+20] ; Rounder (bits [16..31]) |
movd mm4, prm5d ; Rounder (bits [16..31]) |
109 |
mov eax, [esp + 4] ; Dst |
mov _EAX, prm1 ; Dst |
110 |
punpcklwd mm4, mm4 |
punpcklwd mm4, mm4 |
111 |
punpckhdq mm4, mm4 |
punpckhdq mm4, mm4 |
112 |


115 |
psrlw mm5, 8 |
psrlw mm5, 8 |
116 |
psrlw mm6, 8 |
psrlw mm6, 8 |
117 |
packuswb mm5, mm6 |
packuswb mm5, mm6 |
118 |
movq [eax], mm5 |
movq [_EAX], mm5 |
119 |


120 |
ret |
ret |
121 |
.endfunc |
ENDFUNC |
; NOTE(review): diff scrape of the GMC_8_SSE2 macro.  The new version
; takes one parameter: %1 != 0 selects the SSE4.1 pmovzxbw load path,
; which no longer needs xmm7 as a zero register for the byte->word
; expansion.  Original lines 135-141 and 169-171 are missing from the
; scrape, so the body below is incomplete.
122 |


123 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
124 |
;// SSE2 version |
;// SSE2 version |
125 |


126 |
%macro GMC_8_SSE2 0 |
%macro GMC_8_SSE2 1 |
127 |


128 |
pcmpeqw xmm0, xmm0 |
pcmpeqw xmm0, xmm0 |
129 |
movdqa xmm1, [eax ] ; u... |
movdqa xmm1, [_EAX ] ; u... |
130 |
psrlw xmm0, 12 ; mask = 0x000f |
psrlw xmm0, 12 ; mask = 0x000f |
131 |
movdqa xmm2, [eax+2*16] ; v... |
movdqa xmm2, [_EAX+2*16] ; v... |
132 |
pand xmm1, xmm0 |
pand xmm1, xmm0 |
133 |
pand xmm2, xmm0 |
pand xmm2, xmm0 |
134 |


; NOTE(review): original lines 135-141 (presumably the (16-u)/(16-v)
; setup feeding xmm3/xmm4 below) are missing from the scrape here.
142 |
pmullw xmm0, xmm4 ; (16-u).(16-v) |
pmullw xmm0, xmm4 ; (16-u).(16-v) |
143 |
pmullw xmm1, xmm4 ; u .(16-v) |
pmullw xmm1, xmm4 ; u .(16-v) |
144 |


145 |
movq xmm4, [ecx+edx ] ; src2 |
%if (%1!=0) ; SSE41 |
146 |
movq xmm5, [ecx+edx+1] ; src3 |
pmovzxbw xmm4, [TMP0+TMP1 ] ; src2 |
147 |

pmovzxbw xmm5, [TMP0+TMP1+1] ; src3 |
148 |

%else |
149 |

movq xmm4, [TMP0+TMP1 ] ; src2 |
150 |

movq xmm5, [TMP0+TMP1+1] ; src3 |
151 |
punpcklbw xmm4, xmm7 |
punpcklbw xmm4, xmm7 |
152 |
punpcklbw xmm5, xmm7 |
punpcklbw xmm5, xmm7 |
153 |

%endif |
154 |
pmullw xmm2, xmm4 |
pmullw xmm2, xmm4 |
155 |
pmullw xmm3, xmm5 |
pmullw xmm3, xmm5 |
156 |


157 |
movq xmm4, [ecx ] ; src0 |
%if (%1!=0) ; SSE41 |
158 |
movq xmm5, [ecx +1] ; src1 |
pmovzxbw xmm4, [TMP0 ] ; src0 |
159 |

pmovzxbw xmm5, [TMP0 +1] ; src1 |
160 |

%else |
161 |

movq xmm4, [TMP0 ] ; src0 |
162 |

movq xmm5, [TMP0 +1] ; src1 |
163 |
punpcklbw xmm4, xmm7 |
punpcklbw xmm4, xmm7 |
164 |
punpcklbw xmm5, xmm7 |
punpcklbw xmm5, xmm7 |
165 |

%endif |
166 |
pmullw xmm4, xmm0 |
pmullw xmm4, xmm0 |
167 |
pmullw xmm5, xmm1 |
pmullw xmm5, xmm1 |
168 |


; NOTE(review): original lines 169-171 (presumably the accumulation of
; the weighted taps into xmm5) are missing from the scrape here.
172 |
paddw xmm5, xmm2 |
paddw xmm5, xmm2 |
173 |
%endmacro |
%endmacro |
;----------------------------------------------------------------------
; SSE2 entry point.
; Arguments (via nasm.inc prm* aliases):
;   prm1 = Dst, prm2 = Offsets, prm3 = Src0, prm4 = BpS,
;   prm5d = Rounder (useful bits are [16..31])
; NOTE(review): argument meanings taken from the inline comments;
; confirm the C prototype against the project's gmc header.
;----------------------------------------------------------------------

align SECTION_ALIGN
xvid_GMC_Core_Lin_8_sse2:
  PUSH_XMM6_XMM7                 ; xmm6/xmm7 are callee-saved on Win64

  mov _EAX, prm2                 ; Offsets
  mov TMP0, prm3                 ; Src0
  mov TMP1, prm4                 ; BpS

  pxor xmm7, xmm7                ; zero, for punpcklbw byte->word expansion

  GMC_8_SSE2 0                   ; 0 = plain SSE2 load path (uses xmm7)

  movd xmm4, prm5d               ; Rounder (bits [16..31])
  pshuflw xmm4, xmm4, 01010101b  ; replicate word #1 across the low words
  punpckldq xmm4, xmm4           ; ...and across the whole register
  mov _EAX, prm1                 ; Dst

  paddw xmm5, xmm4               ; add rounder
  psrlw xmm5, 8                  ; /256
  packuswb xmm5, xmm5
  movq [_EAX], xmm5              ; store the 8 output pixels

  POP_XMM6_XMM7
  ret
ENDFUNC
;----------------------------------------------------------------------
; SSE4.1 entry point: GMC_8_SSE2 1 selects the pmovzxbw load path, so
; no xmm7 zero register is set up and xmm6/xmm7 are apparently never
; touched (no PUSH_XMM6_XMM7 in the original listing -- confirm against
; the macro's full body, part of which is missing from this scrape).
; Same arguments as the SSE2 version:
;   prm1 = Dst, prm2 = Offsets, prm3 = Src0, prm4 = BpS, prm5d = Rounder
;----------------------------------------------------------------------

align SECTION_ALIGN
xvid_GMC_Core_Lin_8_sse41:
  mov _EAX, prm2                 ; Offsets
  mov TMP0, prm3                 ; Src0
  mov TMP1, prm4                 ; BpS

  GMC_8_SSE2 1                   ; 1 = SSE4.1 pmovzxbw load path

  movd xmm4, prm5d               ; Rounder (bits [16..31])
  pshuflw xmm4, xmm4, 01010101b  ; broadcast the rounder word
  punpckldq xmm4, xmm4
  mov _EAX, prm1                 ; Dst

  paddw xmm5, xmm4               ; add rounder
  psrlw xmm5, 8                  ; /256
  packuswb xmm5, xmm5
  movq [_EAX], xmm5              ; store the 8 output pixels

  ret
ENDFUNC
;//////////////////////////////////////////////////////////////////////

; nasm.inc macro; replaces the old explicit ".note.GNU-stack" section
; (marks the stack non-executable on ELF targets).
NON_EXEC_STACK