; (ViewVC navigation residue removed: Parent Directory | Revision Log, Revision 1.3)
;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  3dnow 8x8 block-based halfpel interpolation
; *
; *  Copyright(C) 2002 Peter Ross <pross@xvid.org>
; *  Copyright(C) 2002 Michael Militzer <michael@xvid.org>
; *
; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; *  XviD is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; *  Under section 8 of the GNU General Public License, the copyright
; *  holders of XVID explicitly forbid distribution in the following
; *  countries:
; *
; *    - Japan
; *    - United States of America
; *
; *  Linking XviD statically or dynamically with other modules is making a
; *  combined work based on XviD.  Thus, the terms and conditions of the
; *  GNU General Public License cover the whole combination.
; *
; *  As a special exception, the copyright holders of XviD give you
; *  permission to link XviD with independent modules that communicate with
; *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; *  license terms of these independent modules, and to copy and distribute
; *  the resulting combined work under terms of your choice, provided that
; *  every copy of the combined work is accompanied by a complete copy of
; *  the source code of XviD (the version of XviD used to produce the
; *  combined work), being distributed under the terms of the GNU General
; *  Public License plus this exception.  An independent module is a module
; *  which is not derived from or based on XviD.
; *
; *  Note that people who make modified versions of XviD are not obligated
; *  to grant this special exception for their modified versions; it is
; *  their choice whether to do so.  The GNU General Public License gives
; *  permission to release a modified version without this exception; this
; *  exception also makes it possible to release a modified version which
; *  carries forward this exception.
; *
; * $Id$
; *
; ****************************************************************************/
57 : | Isibaar | 1.1 | |
58 : | bits 32 | ||
59 : | |||
60 : | %macro cglobal 1 | ||
61 : | %ifdef PREFIX | ||
62 : | global _%1 | ||
63 : | %define %1 _%1 | ||
64 : | %else | ||
65 : | global %1 | ||
66 : | %endif | ||
67 : | %endmacro | ||
68 : | |||
69 : | section .data | ||
70 : | |||
71 : | |||
72 : | align 16 | ||
73 : | |||
74 : | mmx_one | ||
75 : | times 8 db 1 | ||
76 : | |||
77 : | section .text | ||
78 : | |||
79 : | cglobal interpolate8x8_halfpel_h_3dn | ||
80 : | cglobal interpolate8x8_halfpel_v_3dn | ||
81 : | cglobal interpolate8x8_halfpel_hv_3dn | ||
82 : | |||
83 : | |||
;===========================================================================
;
; void interpolate8x8_halfpel_h_3dn(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
;===========================================================================
92 : | |||
93 : | %macro COPY_H_3DN_RND0 0 | ||
94 : | movq mm0, [eax] | ||
95 : | pavgusb mm0, [eax+1] | ||
96 : | movq mm1, [eax+edx] | ||
97 : | pavgusb mm1, [eax+edx+1] | ||
98 : | lea eax,[eax+2*edx] | ||
99 : | movq [ecx],mm0 | ||
100 : | movq [ecx+edx],mm1 | ||
101 : | %endmacro | ||
102 : | |||
103 : | %macro COPY_H_3DN_RND1 0 | ||
104 : | movq mm0, [eax] | ||
105 : | movq mm1, [eax+edx] | ||
106 : | movq mm4, mm0 | ||
107 : | movq mm5, mm1 | ||
108 : | movq mm2, [eax+1] | ||
109 : | movq mm3, [eax+edx+1] | ||
110 : | pavgusb mm0, mm2 | ||
111 : | pxor mm2, mm4 | ||
112 : | pavgusb mm1, mm3 | ||
113 : | lea eax,[eax+2*edx] | ||
114 : | pxor mm3, mm5 | ||
115 : | pand mm2, mm7 | ||
116 : | pand mm3, mm7 | ||
117 : | psubb mm0, mm2 | ||
118 : | movq [ecx], mm0 | ||
119 : | psubb mm1, mm3 | ||
120 : | movq [ecx+edx], mm1 | ||
121 : | %endmacro | ||
122 : | |||
123 : | align 16 | ||
124 : | interpolate8x8_halfpel_h_3dn: | ||
125 : | |||
126 : | mov eax, [esp+16] ; rounding | ||
127 : | mov ecx, [esp+ 4] ; Dst | ||
128 : | test eax,eax | ||
129 : | mov eax, [esp+ 8] ; Src | ||
130 : | mov edx, [esp+12] ; stride | ||
131 : | |||
132 : | jnz near .rounding1 | ||
133 : | |||
134 : | COPY_H_3DN_RND0 | ||
135 : | lea ecx,[ecx+2*edx] | ||
136 : | COPY_H_3DN_RND0 | ||
137 : | lea ecx,[ecx+2*edx] | ||
138 : | COPY_H_3DN_RND0 | ||
139 : | lea ecx,[ecx+2*edx] | ||
140 : | COPY_H_3DN_RND0 | ||
141 : | ret | ||
142 : | |||
143 : | .rounding1 | ||
144 : | ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 | ||
145 : | movq mm7, [mmx_one] | ||
146 : | COPY_H_3DN_RND1 | ||
147 : | lea ecx, [ecx+2*edx] | ||
148 : | COPY_H_3DN_RND1 | ||
149 : | lea ecx,[ecx+2*edx] | ||
150 : | COPY_H_3DN_RND1 | ||
151 : | lea ecx,[ecx+2*edx] | ||
152 : | COPY_H_3DN_RND1 | ||
153 : | ret | ||
154 : | |||
155 : | |||
;===========================================================================
;
; void interpolate8x8_halfpel_v_3dn(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
;===========================================================================
164 : | |||
165 : | %macro COPY_V_3DN_RND0 0 | ||
166 : | movq mm0, [eax] | ||
167 : | movq mm1, [eax+edx] | ||
168 : | pavgusb mm0, mm1 | ||
169 : | pavgusb mm1, [eax+2*edx] | ||
170 : | lea eax,[eax+2*edx] | ||
171 : | movq [ecx],mm0 | ||
172 : | movq [ecx+edx],mm1 | ||
173 : | %endmacro | ||
174 : | |||
175 : | %macro COPY_V_3DN_RND1 0 | ||
176 : | movq mm0, mm2 | ||
177 : | movq mm1, [eax] | ||
178 : | movq mm2, [eax+edx] | ||
179 : | lea eax,[eax+2*edx] | ||
180 : | movq mm4, mm0 | ||
181 : | movq mm5, mm1 | ||
182 : | pavgusb mm0, mm1 | ||
183 : | pxor mm4, mm1 | ||
184 : | pavgusb mm1, mm2 | ||
185 : | pxor mm5, mm2 | ||
186 : | pand mm4, mm7 ; lsb's of (i^j)... | ||
187 : | pand mm5, mm7 ; lsb's of (i^j)... | ||
188 : | psubb mm0, mm4 ; ...are substracted from result of pavgusb | ||
189 : | movq [ecx], mm0 | ||
190 : | psubb mm1, mm5 ; ...are substracted from result of pavgusb | ||
191 : | movq [ecx+edx], mm1 | ||
192 : | %endmacro | ||
193 : | |||
194 : | align 16 | ||
195 : | interpolate8x8_halfpel_v_3dn: | ||
196 : | |||
197 : | mov eax, [esp+16] ; rounding | ||
198 : | mov ecx, [esp+ 4] ; Dst | ||
199 : | test eax,eax | ||
200 : | mov eax, [esp+ 8] ; Src | ||
201 : | mov edx, [esp+12] ; stride | ||
202 : | |||
203 : | ; we process 2 line at a time | ||
204 : | |||
205 : | jnz near .rounding1 | ||
206 : | |||
207 : | COPY_V_3DN_RND0 | ||
208 : | lea ecx, [ecx+2*edx] | ||
209 : | COPY_V_3DN_RND0 | ||
210 : | lea ecx, [ecx+2*edx] | ||
211 : | COPY_V_3DN_RND0 | ||
212 : | lea ecx, [ecx+2*edx] | ||
213 : | COPY_V_3DN_RND0 | ||
214 : | ret | ||
215 : | |||
216 : | .rounding1 | ||
217 : | ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 | ||
218 : | movq mm7, [mmx_one] | ||
219 : | movq mm2, [eax] ; loop invariant | ||
220 : | add eax, edx | ||
221 : | |||
222 : | COPY_V_3DN_RND1 | ||
223 : | lea ecx,[ecx+2*edx] | ||
224 : | COPY_V_3DN_RND1 | ||
225 : | lea ecx,[ecx+2*edx] | ||
226 : | COPY_V_3DN_RND1 | ||
227 : | lea ecx,[ecx+2*edx] | ||
228 : | COPY_V_3DN_RND1 | ||
229 : | ret | ||
230 : | |||
231 : | |||
;===========================================================================
;
; void interpolate8x8_halfpel_hv_3dn(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
;
;===========================================================================

; The trick is to correct the result of 'pavgusb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgusb' (s and t).
; The boolean relations are:
;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.

; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
252 : | |||
253 : | %macro COPY_HV_3DN_RND0 0 | ||
254 : | lea eax,[eax+edx] | ||
255 : | |||
256 : | movq mm0, [eax] | ||
257 : | movq mm1, [eax+1] | ||
258 : | |||
259 : | movq mm6, mm0 | ||
260 : | pavgusb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step | ||
261 : | lea eax,[eax+edx] | ||
262 : | pxor mm1, mm6 ; mm1=(j^k). preserved for next step | ||
263 : | |||
264 : | por mm3, mm1 ; ij |= jk | ||
265 : | movq mm6, mm2 | ||
266 : | pxor mm6, mm0 ; mm6 = s^t | ||
267 : | pand mm3, mm6 ; (ij|jk) &= st | ||
268 : | pavgusb mm2, mm0 ; mm2 = (s+t+1)/2 | ||
269 : | pand mm3, mm7 ; mask lsb | ||
270 : | psubb mm2, mm3 ; apply. | ||
271 : | |||
272 : | movq [ecx], mm2 | ||
273 : | |||
274 : | movq mm2, [eax] | ||
275 : | movq mm3, [eax+1] | ||
276 : | movq mm6, mm2 | ||
277 : | pavgusb mm2, mm3 ; preserved for next iteration | ||
278 : | lea ecx,[ecx+edx] | ||
279 : | pxor mm3, mm6 ; preserved for next iteration | ||
280 : | |||
281 : | por mm1, mm3 | ||
282 : | movq mm6, mm0 | ||
283 : | pxor mm6, mm2 | ||
284 : | pand mm1, mm6 | ||
285 : | pavgusb mm0, mm2 | ||
286 : | |||
287 : | pand mm1, mm7 | ||
288 : | psubb mm0, mm1 | ||
289 : | |||
290 : | movq [ecx], mm0 | ||
291 : | %endmacro | ||
292 : | |||
293 : | %macro COPY_HV_3DN_RND1 0 | ||
294 : | lea eax,[eax+edx] | ||
295 : | |||
296 : | movq mm0, [eax] | ||
297 : | movq mm1, [eax+1] | ||
298 : | |||
299 : | movq mm6, mm0 | ||
300 : | pavgusb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step | ||
301 : | lea eax,[eax+edx] | ||
302 : | pxor mm1, mm6 ; mm1=(j^k). preserved for next step | ||
303 : | |||
304 : | pand mm3, mm1 | ||
305 : | movq mm6, mm2 | ||
306 : | pxor mm6, mm0 | ||
307 : | por mm3, mm6 | ||
308 : | pavgusb mm2, mm0 | ||
309 : | pand mm3, mm7 | ||
310 : | psubb mm2, mm3 | ||
311 : | |||
312 : | movq [ecx], mm2 | ||
313 : | |||
314 : | movq mm2, [eax] | ||
315 : | movq mm3, [eax+1] | ||
316 : | movq mm6, mm2 | ||
317 : | pavgusb mm2, mm3 ; preserved for next iteration | ||
318 : | lea ecx,[ecx+edx] | ||
319 : | pxor mm3, mm6 ; preserved for next iteration | ||
320 : | |||
321 : | pand mm1, mm3 | ||
322 : | movq mm6, mm0 | ||
323 : | pxor mm6, mm2 | ||
324 : | por mm1, mm6 | ||
325 : | pavgusb mm0, mm2 | ||
326 : | pand mm1, mm7 | ||
327 : | psubb mm0, mm1 | ||
328 : | |||
329 : | movq [ecx], mm0 | ||
330 : | %endmacro | ||
331 : | |||
332 : | align 16 | ||
333 : | interpolate8x8_halfpel_hv_3dn | ||
334 : | mov eax, [esp+16] ; rounding | ||
335 : | mov ecx, [esp+ 4] ; Dst | ||
336 : | test eax,eax | ||
337 : | mov eax, [esp+ 8] ; Src | ||
338 : | mov edx, [esp+12] ; stride | ||
339 : | |||
340 : | movq mm7, [mmx_one] | ||
341 : | |||
342 : | ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j | ||
343 : | movq mm2, [eax] | ||
344 : | movq mm3, [eax+1] | ||
345 : | movq mm6, mm2 | ||
346 : | pavgusb mm2, mm3 | ||
347 : | pxor mm3, mm6 ; mm2/mm3 ready | ||
348 : | |||
349 : | jnz near .rounding1 | ||
350 : | |||
351 : | COPY_HV_3DN_RND0 | ||
352 : | add ecx, edx | ||
353 : | COPY_HV_3DN_RND0 | ||
354 : | add ecx, edx | ||
355 : | COPY_HV_3DN_RND0 | ||
356 : | add ecx, edx | ||
357 : | COPY_HV_3DN_RND0 | ||
358 : | ret | ||
359 : | |||
360 : | .rounding1 | ||
361 : | COPY_HV_3DN_RND1 | ||
362 : | add ecx, edx | ||
363 : | COPY_HV_3DN_RND1 | ||
364 : | add ecx, edx | ||
365 : | COPY_HV_3DN_RND1 | ||
366 : | add ecx, edx | ||
367 : | COPY_HV_3DN_RND1 | ||
368 : | ret |
; (ViewVC footer residue removed: "No admin address has been configured | ViewVC Help | Powered by ViewVC 1.0.4")