Annotation of /xvidcore/src/image/x86_asm/interpolate8x8_3dn.asm

Revision 1.1 - (view) (download)

1 :	Isibaar	1.1	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * 3dnow 8x8 block-based halfpel interpolation
5 :			; *
6 :			; * This program is free software; you can redistribute it and/or modify
7 :			; * it under the terms of the GNU General Public License as published by
8 :			; * the Free Software Foundation; either version 2 of the License, or
9 :			; * (at your option) any later version.
10 :			; *
11 :			; * This program is distributed in the hope that it will be useful,
12 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 :			; * GNU General Public License for more details.
15 :			; *
16 :			; * You should have received a copy of the GNU General Public License
17 :			; * along with this program; if not, write to the Free Software
18 :			; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 :			; *
20 :			; *************************************************************************/
21 :
22 :			;/**************************************************************************
23 :			; *
24 :			; * History:
25 :			; *
26 :			; * 06.07.2002 applied Skal's XMM changes to the 3dnow code (Isibaar)
27 :			; * 04.02.2002 additional 3dnow optimizations (Isibaar)
28 :			; * 22.12.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
29 :			; *
30 :			; *************************************************************************/
31 :
32 :
33 :			bits 32 ; assemble for 32-bit mode
34 :
35 :			%macro cglobal 1
			; cglobal NAME -- declare NAME as a global (exported) symbol.
			; If PREFIX is defined, the symbol is exported as _NAME and NAME is
			; %define'd to _NAME, so later uses of NAME in this file refer to it.
			; Otherwise NAME is exported unchanged.
36 :			%ifdef PREFIX
37 :			global _%1
38 :			%define %1 _%1
39 :			%else
40 :			global %1
41 :			%endif
42 :			%endmacro
43 :
44 :			section .data
45 :
46 :
47 :			align 16
48 :
			; mmx_one: eight bytes of 0x01. Loaded into mm7 by the routines in this
			; file and used with pand to keep only the least significant bit of each
			; byte when correcting the result of pavgusb.
49 :			mmx_one:
50 :			times 8 db 1
51 :
52 :			section .text
53 :
			; Export the three entry points defined in this file (through the
			; cglobal macro, which adds a leading underscore when PREFIX is defined).
54 :			cglobal interpolate8x8_halfpel_h_3dn
55 :			cglobal interpolate8x8_halfpel_v_3dn
56 :			cglobal interpolate8x8_halfpel_hv_3dn
57 :
58 :
59 :			;===========================================================================
60 :			;
61 :			; void interpolate8x8_halfpel_h_3dn(uint8_t * const dst,
62 :			; const uint8_t * const src,
63 :			; const uint32_t stride,
64 :			; const uint32_t rounding);
65 :			;
66 :			;===========================================================================
67 :
68 :			%macro COPY_H_3DN_RND0 0
			; Horizontal half-pel average of two rows, rounding up: (i+j+1)/2.
			; In:  eax = source row, ecx = destination row, edx = stride.
			; Out: 8 bytes written to [ecx] and to [ecx+edx]; eax advanced by 2*edx.
			;      ecx is NOT advanced here (the entry point does that).
			; Uses mm0, mm1.
69 :			movq mm0, [eax] ; row 0, bytes x..x+7
70 :			pavgusb mm0, [eax+1] ; averaged with bytes x+1..x+8
71 :			movq mm1, [eax+edx] ; row 1, bytes x..x+7
72 :			pavgusb mm1, [eax+edx+1] ; averaged with bytes x+1..x+8
73 :			lea eax,[eax+2*edx] ; advance source by two rows
74 :			movq [ecx],mm0
75 :			movq [ecx+edx],mm1
76 :			%endmacro
77 :
78 :			%macro COPY_H_3DN_RND1 0
			; Horizontal half-pel average of two rows, rounding down: (i+j)/2,
			; computed as (i+j+1)/2 - ((i^j)&1).
			; In:  eax = source row, ecx = destination row, edx = stride,
			;      mm7 = per-byte lsb mask (the entry point loads it from mmx_one).
			; Out: 8 bytes written to [ecx] and to [ecx+edx]; eax advanced by 2*edx.
			;      ecx is NOT advanced here.
			; Uses mm0-mm5.
79 :			movq mm0, [eax] ; mm0 = i, row 0
80 :			movq mm1, [eax+edx] ; mm1 = i, row 1
81 :			movq mm4, mm0 ; copies of i, needed for i^j below
82 :			movq mm5, mm1
83 :			movq mm2, [eax+1] ; mm2 = j, row 0 (one byte to the right)
84 :			movq mm3, [eax+edx+1] ; mm3 = j, row 1
85 :			pavgusb mm0, mm2 ; mm0 = (i+j+1)/2, row 0
86 :			pxor mm2, mm4 ; mm2 = i^j, row 0
87 :			pavgusb mm1, mm3 ; mm1 = (i+j+1)/2, row 1
88 :			lea eax,[eax+2*edx] ; advance source by two rows
89 :			pxor mm3, mm5 ; mm3 = i^j, row 1
90 :			pand mm2, mm7 ; keep (i^j)&1 per byte
91 :			pand mm3, mm7
92 :			psubb mm0, mm2 ; (i+j+1)/2 - ((i^j)&1) = (i+j)/2
93 :			movq [ecx], mm0
94 :			psubb mm1, mm3
95 :			movq [ecx+edx], mm1
96 :			%endmacro
97 :
98 :			align 16
99 :			interpolate8x8_halfpel_h_3dn:
			; Horizontal half-pel interpolation of one 8x8 block.
			; Arguments are read from the stack on entry:
			;   [esp+ 4] dst, [esp+ 8] src, [esp+12] stride, [esp+16] rounding
			; rounding == 0: dst[x] = (src[x] + src[x+1] + 1) / 2
			; rounding != 0: dst[x] = (src[x] + src[x+1]) / 2
			; Each macro expansion handles two rows; four expansions cover 8 rows.
			; Uses eax, ecx, edx, mm0-mm1 (plus mm2-mm5 and mm7 on the rounding path).
			; NOTE(review): no femms/emms is executed here; presumably the caller
			; clears the MMX state -- confirm.
100 :
101 :			mov eax, [esp+16] ; rounding
102 :			mov ecx, [esp+ 4] ; Dst
103 :			test eax,eax ; ZF = (rounding == 0); the mov's below do not change flags
104 :			mov eax, [esp+ 8] ; Src
105 :			mov edx, [esp+12] ; stride
106 :
107 :			jnz near .rounding1
108 :
109 :			COPY_H_3DN_RND0
110 :			lea ecx,[ecx+2*edx] ; the macro advances eax only; step dst by two rows
111 :			COPY_H_3DN_RND0
112 :			lea ecx,[ecx+2*edx]
113 :			COPY_H_3DN_RND0
114 :			lea ecx,[ecx+2*edx]
115 :			COPY_H_3DN_RND0
116 :			ret
117 :
118 :			.rounding1:
119 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
120 :			movq mm7, [mmx_one] ; mm7 = per-byte lsb mask used by COPY_H_3DN_RND1
121 :			COPY_H_3DN_RND1
122 :			lea ecx, [ecx+2*edx]
123 :			COPY_H_3DN_RND1
124 :			lea ecx,[ecx+2*edx]
125 :			COPY_H_3DN_RND1
126 :			lea ecx,[ecx+2*edx]
127 :			COPY_H_3DN_RND1
128 :			ret
129 :
130 :
131 :			;===========================================================================
132 :			;
133 :			; void interpolate8x8_halfpel_v_3dn(uint8_t * const dst,
134 :			; const uint8_t * const src,
135 :			; const uint32_t stride,
136 :			; const uint32_t rounding);
137 :			;
138 :			;===========================================================================
139 :
140 :			%macro COPY_V_3DN_RND0 0
			; Vertical half-pel average of two output rows, rounding up: (i+j+1)/2.
			; In:  eax = source row, ecx = destination row, edx = stride.
			; Out: [ecx]     = avg(row 0, row 1)
			;      [ecx+edx] = avg(row 1, row 2)
			;      eax advanced by 2*edx; ecx is NOT advanced here.
			; Uses mm0, mm1.
141 :			movq mm0, [eax] ; row 0
142 :			movq mm1, [eax+edx] ; row 1
143 :			pavgusb mm0, mm1 ; mm0 = avg(row 0, row 1)
144 :			pavgusb mm1, [eax+2*edx] ; mm1 = avg(row 1, row 2)
145 :			lea eax,[eax+2*edx] ; advance source by two rows
146 :			movq [ecx],mm0
147 :			movq [ecx+edx],mm1
148 :			%endmacro
149 :
150 :			%macro COPY_V_3DN_RND1 0
			; Vertical half-pel average of two output rows, rounding down: (i+j)/2,
			; computed as (i+j+1)/2 - ((i^j)&1).
			; In:  mm2 = the source row just above [eax] (carried between expansions),
			;      eax = next source row, ecx = destination row, edx = stride,
			;      mm7 = per-byte lsb mask (the entry point loads it from mmx_one).
			; Out: 8 bytes written to [ecx] and to [ecx+edx]; eax advanced by 2*edx;
			;      mm2 = last row loaded, ready for the next expansion.
			;      ecx is NOT advanced here.
			; Uses mm0, mm1, mm4, mm5.
151 :			movq mm0, mm2 ; mm0 = row n (carried in mm2)
152 :			movq mm1, [eax] ; mm1 = row n+1
153 :			movq mm2, [eax+edx] ; mm2 = row n+2 (also carried to the next expansion)
154 :			lea eax,[eax+2*edx] ; advance source by two rows
155 :			movq mm4, mm0
156 :			movq mm5, mm1
157 :			pavgusb mm0, mm1 ; mm0 = avg(row n, row n+1), rounded up
158 :			pxor mm4, mm1 ; mm4 = row n ^ row n+1
159 :			pavgusb mm1, mm2 ; mm1 = avg(row n+1, row n+2), rounded up
160 :			pxor mm5, mm2 ; mm5 = row n+1 ^ row n+2
161 :			pand mm4, mm7 ; lsb's of (i^j)...
162 :			pand mm5, mm7 ; lsb's of (i^j)...
163 :			psubb mm0, mm4 ; ...are subtracted from result of pavgusb
164 :			movq [ecx], mm0
165 :			psubb mm1, mm5 ; ...are subtracted from result of pavgusb
166 :			movq [ecx+edx], mm1
167 :			%endmacro
168 :
169 :			align 16
170 :			interpolate8x8_halfpel_v_3dn:
			; Vertical half-pel interpolation of one 8x8 block.
			; Arguments are read from the stack on entry:
			;   [esp+ 4] dst, [esp+ 8] src, [esp+12] stride, [esp+16] rounding
			; rounding == 0: dst[y] = (src[y] + src[y+1] + 1) / 2
			; rounding != 0: dst[y] = (src[y] + src[y+1]) / 2
			; Nine source rows are read to produce the eight output rows.
			; Uses eax, ecx, edx, mm0-mm1 (plus mm2, mm4, mm5, mm7 on the rounding path).
			; NOTE(review): no femms/emms is executed here; presumably the caller
			; clears the MMX state -- confirm.
171 :
172 :			mov eax, [esp+16] ; rounding
173 :			mov ecx, [esp+ 4] ; Dst
174 :			test eax,eax ; ZF = (rounding == 0); the mov's below do not change flags
175 :			mov eax, [esp+ 8] ; Src
176 :			mov edx, [esp+12] ; stride
177 :
178 :			; we process 2 lines at a time
179 :
180 :			jnz near .rounding1
181 :
182 :			COPY_V_3DN_RND0
183 :			lea ecx, [ecx+2*edx] ; the macro advances eax only; step dst by two rows
184 :			COPY_V_3DN_RND0
185 :			lea ecx, [ecx+2*edx]
186 :			COPY_V_3DN_RND0
187 :			lea ecx, [ecx+2*edx]
188 :			COPY_V_3DN_RND0
189 :			ret
190 :
191 :			.rounding1:
192 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
193 :			movq mm7, [mmx_one] ; mm7 = per-byte lsb mask used by COPY_V_3DN_RND1
194 :			movq mm2, [eax] ; first source row; COPY_V_3DN_RND1 carries the previous row in mm2
195 :			add eax, edx ; eax -> second source row
196 :
197 :			COPY_V_3DN_RND1
198 :			lea ecx,[ecx+2*edx]
199 :			COPY_V_3DN_RND1
200 :			lea ecx,[ecx+2*edx]
201 :			COPY_V_3DN_RND1
202 :			lea ecx,[ecx+2*edx]
203 :			COPY_V_3DN_RND1
204 :			ret
205 :
206 :
207 :			;===========================================================================
208 :			;
209 :			; void interpolate8x8_halfpel_hv_3dn(uint8_t * const dst,
210 :			; const uint8_t * const src,
211 :			; const uint32_t stride,
212 :			; const uint32_t rounding);
213 :			;
214 :			;
215 :			;===========================================================================
216 :
217 :			; The trick is to correct the result of 'pavgusb' with some combination of the
218 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgusb' (s and t).
219 :			; The boolean relations are:
220 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
221 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
222 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
223 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
224 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
225 :
226 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
227 :
228 :			%macro COPY_HV_3DN_RND0 0
			; Diagonal (h+v) half-pel, two output rows per expansion, computing
			; (i+j+k+l+2)/4 as (s+t+1)/2 - ((ij|kl)&st)&1.
			; In:  eax = source row n (already processed), ecx = destination row,
			;      edx = stride, mm7 = per-byte lsb mask,
			;      mm2 = pavgusb of row n's horizontal pairs, mm3 = xor of those pairs.
			; Out: first result stored at [ecx]; ecx is then advanced by edx once and
			;      the second result stored there (the entry point adds the second
			;      step). eax advanced by 2*edx. mm2/mm3 hold the average/xor of
			;      row n+2, ready for the next expansion.
			; Uses mm0, mm1, mm6.
229 :			lea eax,[eax+edx] ; eax -> row n+1
230 :
231 :			movq mm0, [eax]
232 :			movq mm1, [eax+1]
233 :
234 :			movq mm6, mm0
235 :			pavgusb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
236 :			lea eax,[eax+edx] ; eax -> row n+2
237 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
238 :
239 :			por mm3, mm1 ; ij |= jk
240 :			movq mm6, mm2
241 :			pxor mm6, mm0 ; mm6 = s^t
242 :			pand mm3, mm6 ; (ij|jk) &= st
243 :			pavgusb mm2, mm0 ; mm2 = (s+t+1)/2
244 :			pand mm3, mm7 ; mask lsb
245 :			psubb mm2, mm3 ; apply.
246 :
247 :			movq [ecx], mm2 ; first output row
248 :
249 :			movq mm2, [eax] ; row n+2 (eax was advanced twice above)
250 :			movq mm3, [eax+1]
251 :			movq mm6, mm2
252 :			pavgusb mm2, mm3 ; preserved for next iteration
253 :			lea ecx,[ecx+edx] ; ecx -> second output row
254 :			pxor mm3, mm6 ; preserved for next iteration
255 :
256 :			por mm1, mm3 ; same correction as above, for rows n+1 and n+2
257 :			movq mm6, mm0
258 :			pxor mm6, mm2
259 :			pand mm1, mm6
260 :			pavgusb mm0, mm2
261 :
262 :			pand mm1, mm7
263 :			psubb mm0, mm1
264 :
265 :			movq [ecx], mm0 ; second output row
266 :			%endmacro
267 :
268 :			%macro COPY_HV_3DN_RND1 0
			; Diagonal (h+v) half-pel, two output rows per expansion, computing
			; (i+j+k+l+1)/4 as (s+t+1)/2 - ((ij&kl)|st)&1.
			; In:  eax = source row n (already processed), ecx = destination row,
			;      edx = stride, mm7 = per-byte lsb mask,
			;      mm2 = pavgusb of row n's horizontal pairs, mm3 = xor of those pairs.
			; Out: first result stored at [ecx]; ecx is then advanced by edx once and
			;      the second result stored there (the entry point adds the second
			;      step). eax advanced by 2*edx. mm2/mm3 hold the average/xor of
			;      row n+2, ready for the next expansion.
			; Uses mm0, mm1, mm6.
269 :			lea eax,[eax+edx] ; eax -> row n+1
270 :
271 :			movq mm0, [eax]
272 :			movq mm1, [eax+1]
273 :
274 :			movq mm6, mm0
275 :			pavgusb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
276 :			lea eax,[eax+edx] ; eax -> row n+2
277 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
278 :
279 :			pand mm3, mm1 ; ij &= kl
280 :			movq mm6, mm2
281 :			pxor mm6, mm0 ; mm6 = s^t
282 :			por mm3, mm6 ; (ij&kl) |= st
283 :			pavgusb mm2, mm0 ; mm2 = (s+t+1)/2
284 :			pand mm3, mm7 ; mask lsb
285 :			psubb mm2, mm3 ; apply the correction
286 :
287 :			movq [ecx], mm2 ; first output row
288 :
289 :			movq mm2, [eax] ; row n+2 (eax was advanced twice above)
290 :			movq mm3, [eax+1]
291 :			movq mm6, mm2
292 :			pavgusb mm2, mm3 ; preserved for next iteration
293 :			lea ecx,[ecx+edx] ; ecx -> second output row
294 :			pxor mm3, mm6 ; preserved for next iteration
295 :
296 :			pand mm1, mm3 ; same correction as above, for rows n+1 and n+2
297 :			movq mm6, mm0
298 :			pxor mm6, mm2
299 :			por mm1, mm6
300 :			pavgusb mm0, mm2
301 :			pand mm1, mm7
302 :			psubb mm0, mm1
303 :
304 :			movq [ecx], mm0 ; second output row
305 :			%endmacro
306 :
307 :			align 16
308 :			interpolate8x8_halfpel_hv_3dn:
			; Diagonal (horizontal+vertical) half-pel interpolation of one 8x8 block.
			; Arguments are read from the stack on entry:
			;   [esp+ 4] dst, [esp+ 8] src, [esp+12] stride, [esp+16] rounding
			; rounding == 0: each output byte is (i+j+k+l+2)/4
			; rounding != 0: each output byte is (i+j+k+l+1)/4
			; where i,j are horizontally adjacent source bytes and k,l the two bytes
			; directly below them. Nine source rows of nine bytes each are read.
			; Uses eax, ecx, edx, mm0-mm3, mm6, mm7.
			; NOTE(review): no femms/emms is executed here; presumably the caller
			; clears the MMX state -- confirm.
309 :			mov eax, [esp+16] ; rounding
310 :			mov ecx, [esp+ 4] ; Dst
311 :			test eax,eax ; ZF = (rounding == 0); consumed by the jnz below
312 :			mov eax, [esp+ 8] ; Src
313 :			mov edx, [esp+12] ; stride
314 :
315 :			movq mm7, [mmx_one] ; mm7 = per-byte lsb mask used by both macros
316 :
317 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
318 :			movq mm2, [eax]
319 :			movq mm3, [eax+1]
320 :			movq mm6, mm2
321 :			pavgusb mm2, mm3
322 :			pxor mm3, mm6 ; mm2/mm3 ready
323 :
			; mov and the MMX instructions above leave EFLAGS untouched, so ZF
			; still reflects the test of 'rounding'.
324 :			jnz near .rounding1
325 :
326 :			COPY_HV_3DN_RND0
327 :			add ecx, edx ; the macro advances ecx by one row; add the second
328 :			COPY_HV_3DN_RND0
329 :			add ecx, edx
330 :			COPY_HV_3DN_RND0
331 :			add ecx, edx
332 :			COPY_HV_3DN_RND0
333 :			ret
334 :
335 :			.rounding1:
336 :			COPY_HV_3DN_RND1
337 :			add ecx, edx
338 :			COPY_HV_3DN_RND1
339 :			add ecx, edx
340 :			COPY_HV_3DN_RND1
341 :			add ecx, edx
342 :			COPY_HV_3DN_RND1
343 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4