Annotation of /xvidcore/src/image/x86_asm/interpolate8x8_3dne.asm

Revision 1.8 - (view) (download)

1 :	edgomez	1.3	;/*****************************************************************************
2 :	edgomez	1.2	; *
3 :	edgomez	1.3	; * XVID MPEG-4 VIDEO CODEC
4 :			; * - 3dne pipeline optimized 8x8 block-based halfpel interpolation -
5 :	edgomez	1.2	; *
6 :	edgomez	1.3	; * Copyright(C) 2002 Jaan Kalda
7 :	edgomez	1.2	; *
8 :	edgomez	1.3	; * This program is free software ; you can redistribute it and/or modify
9 :			; * it under the terms of the GNU General Public License as published by
10 :			; * the Free Software Foundation ; either version 2 of the License, or
11 :			; * (at your option) any later version.
12 :	edgomez	1.2	; *
13 :	edgomez	1.3	; * This program is distributed in the hope that it will be useful,
14 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
15 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			; * GNU General Public License for more details.
17 :	edgomez	1.2	; *
18 :	edgomez	1.3	; * You should have received a copy of the GNU General Public License
19 :			; * along with this program ; if not, write to the Free Software
20 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :	edgomez	1.2	; *
22 :	edgomez	1.3	; ****************************************************************************/
23 :	edgomez	1.2
24 :	edgomez	1.3	; these 3dne functions are compatible with iSSE, but are optimized specifically
25 :			; for K7 pipelines
26 :	edgomez	1.2
27 :	edgomez	1.3	BITS 32
28 :	edgomez	1.2
29 :	edgomez	1.3	%macro cglobal 1
			; cglobal NAME: export NAME as a global symbol.
			;   PREFIX defined     -> the exported symbol is _NAME and NAME is aliased to it,
			;                         so the later "NAME:" label defines _NAME.
			;   MARK_FUNCS defined -> the symbol is tagged ":function" with its size taken
			;                         from the routine's local .endfunc label.
30 :	edgomez	1.2	%ifdef PREFIX
31 :	edgomez	1.5	%ifdef MARK_FUNCS
			; The size expression uses the decorated name: once NAME is aliased to _NAME
			; the end label of the routine is _NAME.endfunc, not NAME.endfunc.
32 :	edgomez	1.6	global _%1:function _%1.endfunc-_%1
			; Alias the bare symbol only. Carrying ":function ..." in the alias would
			; paste that text into every later "NAME:" label definition.
33 :			%define %1 _%1
34 :	edgomez	1.5	%else
35 :			global _%1
36 :			%define %1 _%1
37 :			%endif
38 :	edgomez	1.2	%else
39 :	edgomez	1.5	%ifdef MARK_FUNCS
40 :	edgomez	1.6	global %1:function %1.endfunc-%1
41 :	edgomez	1.5	%else
42 :			global %1
43 :			%endif
44 :	edgomez	1.2	%endif
45 :			%endmacro
46 :	edgomez	1.3
47 :			;=============================================================================
48 :			; Read only data
49 :			;=============================================================================
50 :
51 :	edgomez	1.2	%ifdef FORMAT_COFF
52 :	edgomez	1.4	SECTION .rodata ; COFF build: section declared without an align= attribute
53 :	edgomez	1.2	%else
54 :	edgomez	1.4	SECTION .rodata align=16 ; other formats: request 16-byte section alignment
55 :	edgomez	1.2	%endif
56 :
57 :	edgomez	1.3	ALIGN 16
58 :			mmx_one: ; 8-byte constant, loaded into mm7 by the h (rounding=1) and hv routines
59 :			times 8 db 1 ; 01h in every byte: keeps only the LSB of each byte when used with pand
60 :
61 :			ALIGN 8
62 :			mm_minusone: ; 8-byte constant, read through esi by the vertical rounding=1 paths
63 :			dd -1,-1 ; 0FFh in every byte: psubusb from it yields 255-x per byte
64 :	edgomez	1.2
65 :	edgomez	1.3	;=============================================================================
66 :			; Macros
67 :			;=============================================================================
68 :	edgomez	1.2
69 :	edgomez	1.3	%macro nop4 0
70 :			DB 08Dh,074h,026h,0 ; bytes 8D 74 26 00 encode lea esi,[esi+0] (disp8 form): a 4-byte filler that leaves esi unchanged; not invoked in this listing
71 :			%endmacro
72 :
73 :			;=============================================================================
74 :			; Code
75 :			;=============================================================================
76 :	edgomez	1.2
77 :	edgomez	1.3	SECTION .text
78 :	edgomez	1.2	; Exported entry points for 8x8 blocks (h = horizontal, v = vertical, hv = both)
79 :			cglobal interpolate8x8_halfpel_h_3dne
80 :			cglobal interpolate8x8_halfpel_v_3dne
81 :			cglobal interpolate8x8_halfpel_hv_3dne
82 :			; Exported entry points for 8-wide, 4-row blocks
83 :	suxen_drol	1.7	cglobal interpolate8x4_halfpel_h_3dne
84 :			cglobal interpolate8x4_halfpel_v_3dne
85 :			cglobal interpolate8x4_halfpel_hv_3dne
86 :
87 :	edgomez	1.3	;-----------------------------------------------------------------------------
88 :	edgomez	1.2	;
89 :			; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst,
90 :	edgomez	1.3	; const uint8_t * const src,
91 :			; const uint32_t stride,
92 :			; const uint32_t rounding);
93 :	edgomez	1.2	;
94 :	edgomez	1.3	;-----------------------------------------------------------------------------
95 :	edgomez	1.2
96 :			%macro COPY_H_SSE_RND0 1
			; Two rows of horizontal half-pel with round-up averaging:
			;   dst[x] = (src[x] + src[x+1] + 1) / 2   (pavgb)
			; In:  eax = source row, ecx = destination row, edx = stride.
			; Out: eax advanced by two rows; ecx is left unchanged (the caller steps it).
			; Clobbers mm0, mm1.
			; The argument only selects the textual form of the first load;
			; both forms read the 8 bytes at eax.
97 :			%if (%1)
98 :	edgomez	1.3	movq mm0, [eax]
99 :	edgomez	1.2	%else
100 :	suxen_drol	1.8	movq mm0, [eax+0]
101 :			; ---
102 :			; nasm >0.99.x rejects the original statement:
103 :			; movq mm0, [dword eax]
104 :			; as it is ambiguous. for this statement nasm <0.99.x would
105 :			; generate "movq mm0,[eax+0]"
106 :			; ---
107 :	edgomez	1.2	%endif
108 :			pavgb mm0, [eax+1] ; row 0: average with the right-hand neighbours
109 :	edgomez	1.3	movq mm1, [eax+edx] ; row 1 pixels
110 :	edgomez	1.2	pavgb mm1, [eax+edx+1] ; row 1: average with the right-hand neighbours
111 :	edgomez	1.3	lea eax, [eax+2*edx] ; step the source two rows down
112 :			movq [ecx], mm0 ; store row 0
113 :			movq [ecx+edx], mm1 ; store row 1
114 :	edgomez	1.2	%endmacro
115 :
116 :			%macro COPY_H_SSE_RND1 0
			; Two rows of horizontal half-pel with truncating averaging:
			;   dst[x] = (src[x] + src[x+1]) / 2 = pavgb(a,b) - ((a ^ b) & 1)
			; In:  eax = source row, ecx = destination row, edx = stride,
			;      mm7 = 01h in every byte (LSB mask).
			; Out: eax advanced by two rows; ecx is left unchanged (the caller steps it).
			; Clobbers mm0-mm5.
117 :			movq mm0, [eax] ; row 0: a
118 :			movq mm1, [eax+edx] ; row 1: a
119 :			movq mm4, mm0 ; keep a copy of row 0 for the xor
120 :			movq mm5, mm1 ; keep a copy of row 1 for the xor
121 :	edgomez	1.3	movq mm2, [eax+1] ; row 0: b (right-hand neighbours)
122 :	edgomez	1.2	movq mm3, [eax+edx+1] ; row 1: b
123 :			pavgb mm0, mm2 ; row 0: (a+b+1)/2
124 :			pxor mm2, mm4 ; row 0: a ^ b
125 :			pavgb mm1, mm3 ; row 1: (a+b+1)/2
126 :	edgomez	1.3	lea eax, [eax+2*edx] ; step the source two rows down
127 :	edgomez	1.2	pxor mm3, mm5 ; row 1: a ^ b
128 :			pand mm2, mm7 ; row 0: (a ^ b) & 1 = 1 where pavgb rounded up
129 :			pand mm3, mm7 ; row 1: same correction term
130 :			psubb mm0, mm2 ; row 0: remove the round-up
131 :			movq [ecx], mm0 ; store row 0
132 :			psubb mm1, mm3 ; row 1: remove the round-up
133 :	edgomez	1.3	movq [ecx+edx], mm1 ; store row 1
134 :	edgomez	1.2	%endmacro
135 :
136 :	edgomez	1.3	ALIGN 16
137 :	edgomez	1.2	interpolate8x8_halfpel_h_3dne:
138 :			; Horizontal half-pel interpolation of an 8x8 block:
			;   dst[y][x] = average of src[y][x] and src[y][x+1], for 8 rows of 8 bytes.
			; 32-bit stack arguments: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding.
			; rounding == 1 selects the truncating average, any other value the round-up average.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here.
139 :			mov eax, [esp+ 8] ; Src
140 :			mov edx, [esp+12] ; stride
141 :			dec dword [esp+16]; rounding (decremented in place on the stack; ZF set iff it was 1)
142 :
143 :			jz .rounding1
144 :			mov ecx, [esp+ 4] ; Dst
145 :
146 :			COPY_H_SSE_RND0 0 ; rows 0-1 (the macro advances eax only)
147 :			lea ecx,[ecx+2*edx] ; step dst two rows
148 :			COPY_H_SSE_RND0 1 ; rows 2-3
149 :			lea ecx,[ecx+2*edx]
150 :			COPY_H_SSE_RND0 1 ; rows 4-5
151 :			lea ecx,[ecx+2*edx]
152 :			COPY_H_SSE_RND0 1 ; rows 6-7
153 :			ret
154 :
155 :			.rounding1:
156 :	edgomez	1.3	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
157 :	edgomez	1.2	mov ecx, [esp+ 4] ; Dst
158 :			movq mm7, [mmx_one] ; LSB mask required by COPY_H_SSE_RND1
159 :			COPY_H_SSE_RND1 ; rows 0-1
160 :			lea ecx, [ecx+2*edx]
161 :			COPY_H_SSE_RND1 ; rows 2-3
162 :			lea ecx,[ecx+2*edx]
163 :			COPY_H_SSE_RND1 ; rows 4-5
164 :			lea ecx,[ecx+2*edx]
165 :			COPY_H_SSE_RND1 ; rows 6-7
166 :			ret
167 :	edgomez	1.6	.endfunc: ; end marker referenced by the cglobal size expression
168 :	edgomez	1.2
169 :	edgomez	1.3	;-----------------------------------------------------------------------------
170 :	edgomez	1.2	;
171 :			; void interpolate8x8_halfpel_v_3dne(uint8_t * const dst,
172 :	edgomez	1.3	; const uint8_t * const src,
173 :			; const uint32_t stride,
174 :			; const uint32_t rounding);
175 :	edgomez	1.2	;
176 :	edgomez	1.3	;-----------------------------------------------------------------------------
177 :	edgomez	1.2
178 :	edgomez	1.3	ALIGN 16
179 :	edgomez	1.2	interpolate8x8_halfpel_v_3dne:
180 :			; Vertical half-pel interpolation of an 8x8 block:
			;   dst row y = average of src row y and src row y+1, so src rows 0..8 are read.
			; 32-bit stack arguments: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding.
			; rounding == 1 selects the truncating average, any other value the round-up average.
			; Uses eax, ecx, edx, mm0-mm7; esi is saved and restored on the rounding=1 path.
			; No emms is executed here.
181 :			mov eax, [esp+ 8] ; Src
182 :			mov edx, [esp+12] ; stride
183 :			dec dword [esp+16]; rounding (decremented in place on the stack; ZF set iff it was 1)
184 :
185 :			; we process 2 lines at a time
186 :
187 :			jz .rounding1
188 :			pxor mm2,mm2 ; mm2 = 0 so that the por below acts as a load
189 :	edgomez	1.3	movq mm0, [eax] ; row 0
190 :			movq mm1, [eax+edx] ; row 1
191 :			por mm2, [eax+2*edx] ; row 2
192 :	edgomez	1.2	mov ecx, [esp+ 4] ; Dst
193 :	edgomez	1.3	lea eax, [eax+2*edx] ; eax -> row 2
194 :			pxor mm4, mm4 ; cleared for the later por-load of row 4
195 :	edgomez	1.2	pavgb mm0, mm1 ; (row0 + row1 + 1) / 2
196 :	edgomez	1.3	pavgb mm1, mm2 ; (row1 + row2 + 1) / 2
197 :			movq [byte ecx], mm0 ; dst row 0 (explicit byte displacement form)
198 :			movq [ecx+edx], mm1 ; dst row 1
199 :			pxor mm6, mm6 ; cleared for the later por-load of row 6
200 :			add eax, edx ; eax -> row 3
201 :			lea ecx, [ecx+2*edx] ; dst -> row 2
202 :			movq mm3, [byte eax] ; row 3
203 :			por mm4, [eax+edx] ; row 4
204 :			lea eax, [eax+2*edx] ; eax -> row 5
205 :	edgomez	1.2	pavgb mm2, mm3 ; (row2 + row3 + 1) / 2
206 :			pavgb mm3, mm4 ; (row3 + row4 + 1) / 2
207 :	edgomez	1.3	movq [ecx], mm2 ; dst row 2
208 :			movq [ecx+edx], mm3 ; dst row 3
209 :			lea ecx, [byte ecx+2*edx] ; dst -> row 4
210 :			movq mm5, [byte eax] ; row 5
211 :			por mm6, [eax+edx] ; row 6
212 :			lea eax, [eax+2*edx] ; eax -> row 7
213 :	edgomez	1.2	pavgb mm4, mm5 ; (row4 + row5 + 1) / 2
214 :			pavgb mm5, mm6 ; (row5 + row6 + 1) / 2
215 :	edgomez	1.3	movq [ecx], mm4 ; dst row 4
216 :			movq [ecx+edx], mm5 ; dst row 5
217 :			lea ecx, [ecx+2*edx] ; dst -> row 6
218 :			movq mm7, [eax] ; row 7
219 :			movq mm0, [eax+edx] ; row 8
220 :	edgomez	1.2	pavgb mm6, mm7 ; (row6 + row7 + 1) / 2
221 :			pavgb mm7, mm0 ; (row7 + row8 + 1) / 2
222 :	edgomez	1.3	movq [ecx], mm6 ; dst row 6
223 :			movq [ecx+edx], mm7 ; dst row 7
224 :	edgomez	1.2	ret
225 :
226 :	edgomez	1.3	ALIGN 8
227 :	edgomez	1.2	.rounding1:
			; Truncating average through complements:
			;   (a+b)/2 = 255 - pavgb(255-a, 255-b)
			; pcmpeqb r,r sets r to all ones; psubusb then gives 255-x per byte.
228 :	edgomez	1.3	pcmpeqb mm0, mm0
229 :			psubusb mm0, [eax] ; 255 - row 0
230 :			add eax, edx ; eax -> row 1
231 :	edgomez	1.2	mov ecx, [esp+ 4] ; Dst (read before the push below moves esp)
232 :			push esi ; esi is used as the constant pointer; restored before ret
233 :	edgomez	1.3	pcmpeqb mm1, mm1
234 :			pcmpeqb mm2, mm2
235 :			mov esi, mm_minusone ; esi -> 8 bytes of 0FFh
236 :			psubusb mm1, [byte eax] ; 255 - row 1
237 :			psubusb mm2, [eax+edx] ; 255 - row 2
238 :			lea eax, [eax+2*edx] ; eax -> row 3
239 :	edgomez	1.2	movq mm6, [esi] ; 0FFh bytes, minuend for dst row 0
240 :			movq mm7, [esi] ; 0FFh bytes, minuend for dst row 1
241 :			pavgb mm0, mm1 ; round-up average of the complements (rows 0,1)
242 :			pavgb mm1, mm2 ; same for rows 1,2
243 :	edgomez	1.3	psubusb mm6, mm0 ; complement back: (row0 + row1) / 2
244 :			psubusb mm7, mm1 ; (row1 + row2) / 2
245 :	edgomez	1.2	movq [ecx], mm6 ; dst row 0
246 :			movq [ecx+edx], mm7 ; dst row 1
247 :	edgomez	1.3	lea ecx, [ecx+2*edx] ; dst -> row 2
248 :			pcmpeqb mm3, mm3
249 :			pcmpeqb mm4, mm4
250 :			psubusb mm3, [eax] ; 255 - row 3
251 :			psubusb mm4, [eax+edx] ; 255 - row 4
252 :			lea eax, [eax+2*edx] ; eax -> row 5
253 :	edgomez	1.2	pavgb mm2, mm3
254 :			pavgb mm3, mm4
255 :			movq mm0, [esi]
256 :			movq mm1, [esi]
257 :	edgomez	1.3	psubusb mm0, mm2 ; (row2 + row3) / 2
258 :			psubusb mm1, mm3 ; (row3 + row4) / 2
259 :	edgomez	1.2	movq [ecx], mm0 ; dst row 2
260 :			movq [ecx+edx], mm1 ; dst row 3
261 :			lea ecx,[ecx+2*edx] ; dst -> row 4
262 :
263 :	edgomez	1.3	pcmpeqb mm5, mm5
264 :			pcmpeqb mm6, mm6
265 :			psubusb mm5, [eax] ; 255 - row 5
266 :			psubusb mm6, [eax+edx] ; 255 - row 6
267 :			lea eax, [eax+2*edx] ; eax -> row 7
268 :	edgomez	1.2	pavgb mm4, mm5
269 :			pavgb mm5, mm6
270 :			movq mm2, [esi]
271 :			movq mm3, [esi]
272 :	edgomez	1.3	psubusb mm2, mm4 ; (row4 + row5) / 2
273 :			psubusb mm3, mm5 ; (row5 + row6) / 2
274 :	edgomez	1.2	movq [ecx], mm2 ; dst row 4
275 :			movq [ecx+edx], mm3 ; dst row 5
276 :	edgomez	1.3	lea ecx, [ecx+2*edx] ; dst -> row 6
277 :			pcmpeqb mm7, mm7
278 :			pcmpeqb mm0, mm0
279 :			psubusb mm7, [eax] ; 255 - row 7
280 :			psubusb mm0, [eax+edx] ; 255 - row 8
281 :	edgomez	1.2	pavgb mm6, mm7
282 :			pavgb mm7, mm0
283 :			movq mm4, [esi]
284 :			movq mm5, [esi] ; last use of esi
285 :	edgomez	1.3	psubusb mm4, mm6 ; (row6 + row7) / 2
286 :	edgomez	1.2	pop esi ; restore the caller's esi
287 :	edgomez	1.3	psubusb mm5, mm7 ; (row7 + row8) / 2
288 :	edgomez	1.2	movq [ecx], mm4 ; dst row 6
289 :			movq [ecx+edx], mm5 ; dst row 7
290 :			ret
291 :	edgomez	1.6	.endfunc: ; end marker referenced by the cglobal size expression
292 :	edgomez	1.3
293 :			;-----------------------------------------------------------------------------
294 :	edgomez	1.2	;
295 :			; void interpolate8x8_halfpel_hv_3dne(uint8_t * const dst,
296 :	edgomez	1.3	; const uint8_t * const src,
297 :			; const uint32_t stride,
298 :			; const uint32_t rounding);
299 :	edgomez	1.2	;
300 :			;
301 :	edgomez	1.3	;-----------------------------------------------------------------------------
302 :	edgomez	1.2
303 :			; The trick is to correct the result of 'pavgb' with some combination of the
304 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
305 :			; The boolean relations are:
306 :	edgomez	1.3	; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
307 :	edgomez	1.2	; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
308 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
309 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
310 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
311 :
312 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
313 :
314 :			%macro COPY_HV_SSE_RND0 0
315 :			; Two output rows of the 2x2 (hv) half-pel average in the (sum+2)/4 form:
			;   result = pavgb(s,t) - (((ij | kl) & (s ^ t)) & 1)
			; In:  eax = current source row, ecx = destination row, edx = stride,
			;      mm2 = pavgb(row[x], row[x+1]) and mm3 = row[x] ^ row[x+1] for the row at eax,
			;      mm7 = 01h in every byte.
			; Out: eax advanced by two rows, mm2/mm3 recomputed for the new row at eax,
			;      ecx unchanged (the caller steps it). Clobbers mm0, mm1, mm6.
316 :	edgomez	1.3	movq mm0, [eax+edx] ; next row, pixels x
317 :			movq mm1, [eax+edx+1] ; next row, pixels x+1
318 :	edgomez	1.2
319 :	edgomez	1.3	movq mm6, mm0
320 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
321 :			lea eax, [eax+2*edx]
322 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
323 :	edgomez	1.2
324 :	edgomez	1.3	por mm3, mm1 ; ij |= jk
325 :			movq mm6, mm2
326 :			pxor mm6, mm0 ; mm6 = s^t
327 :			pand mm3, mm6 ; (ij|jk) &= st
328 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
329 :			movq mm6, [eax] ; preload the row two below (eax already advanced) for the xor further down
330 :			pand mm3, mm7 ; mask lsb
331 :			psubb mm2, mm3 ; apply.
332 :
333 :			movq [ecx], mm2 ; store first output row
334 :	edgomez	1.2
335 :	edgomez	1.3	movq mm2, [eax]
336 :			movq mm3, [eax+1]
337 :			pavgb mm2, mm3 ; preserved for next iteration
338 :			pxor mm3, mm6 ; preserved for next iteration
339 :
340 :			por mm1, mm3 ; combine the xor terms of the two rows (same form as above)
341 :			movq mm6, mm0
342 :			pxor mm6, mm2 ; s^t for the second output row
343 :			pand mm1, mm6
344 :			pavgb mm0, mm2 ; (s+t+1)/2 for the second output row
345 :
346 :			pand mm1, mm7 ; mask lsb
347 :			psubb mm0, mm1 ; apply the correction
348 :
349 :			movq [ecx+edx], mm0 ; store second output row
350 :	edgomez	1.2	%endmacro
351 :
352 :			%macro COPY_HV_SSE_RND1 0
			; Two output rows of the 2x2 (hv) half-pel average in the (sum+1)/4 form:
			;   result = pavgb(s,t) - (((ij & kl) | (s ^ t)) & 1)
			; In:  eax = current source row, ecx = destination row, edx = stride,
			;      mm2 = pavgb(row[x], row[x+1]) and mm3 = row[x] ^ row[x+1] for the row at eax,
			;      mm7 = 01h in every byte.
			; Out: eax advanced by two rows, mm2/mm3 recomputed for the new row at eax,
			;      ecx unchanged (the caller steps it). Clobbers mm0, mm1, mm6.
353 :	edgomez	1.3	movq mm0, [eax+edx] ; next row, pixels x
354 :			movq mm1, [eax+edx+1] ; next row, pixels x+1
355 :
356 :			movq mm6, mm0
357 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
358 :			lea eax,[eax+2*edx]
359 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
360 :	edgomez	1.2
361 :	edgomez	1.3	pand mm3, mm1 ; ij & kl
362 :			movq mm6, mm2
363 :			pxor mm6, mm0 ; s^t
364 :			por mm3, mm6 ; (ij & kl) | st
365 :			pavgb mm2, mm0 ; (s+t+1)/2
366 :			movq mm6, [eax] ; preload the row two below (eax already advanced) for the xor further down
367 :			pand mm3, mm7 ; mask lsb
368 :			psubb mm2, mm3 ; apply the correction
369 :
370 :			movq [ecx], mm2 ; store first output row
371 :
372 :			movq mm2, [eax]
373 :			movq mm3, [eax+1]
374 :			pavgb mm2, mm3 ; preserved for next iteration
375 :			pxor mm3, mm6 ; preserved for next iteration
376 :
377 :			pand mm1, mm3 ; and of the xor terms of the two rows
378 :			movq mm6, mm0
379 :			pxor mm6, mm2 ; s^t for the second output row
380 :			por mm1, mm6
381 :			pavgb mm0, mm2 ; (s+t+1)/2 for the second output row
382 :			pand mm1, mm7 ; mask lsb
383 :			psubb mm0, mm1 ; apply the correction
384 :			movq [ecx+edx], mm0 ; store second output row
385 :	edgomez	1.2	%endmacro
386 :
387 :	edgomez	1.3	ALIGN 16
388 :	edgomez	1.2	interpolate8x8_halfpel_hv_3dne:
			; Half-pel interpolation in both directions for an 8x8 block: every output
			; byte combines the 2x2 neighbourhood src[y][x], src[y][x+1], src[y+1][x],
			; src[y+1][x+1], so 9 source rows of 9 bytes are read.
			; 32-bit stack arguments: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding.
			; rounding == 1 runs COPY_HV_SSE_RND1, any other value COPY_HV_SSE_RND0.
			; Uses eax, ecx, edx and mm0-mm3, mm6, mm7; no emms is executed here.
389 :	edgomez	1.3	mov eax, [esp+ 8] ; Src
390 :			mov edx, [esp+12] ; stride
391 :			dec dword [esp+16] ; rounding (decremented in place on the stack; ZF set iff it was 1)
392 :	edgomez	1.2
393 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
			; (the mov and MMX instructions below leave EFLAGS alone, so ZF from the
			; dec is still valid at the jz)
394 :			movq mm2, [eax]
395 :			movq mm3, [eax+1]
396 :			movq mm6, mm2
397 :			pavgb mm2, mm3
398 :	edgomez	1.3	pxor mm3, mm6 ; mm2/mm3 ready
399 :			mov ecx, [esp+ 4] ; Dst
400 :	edgomez	1.2	movq mm7, [mmx_one] ; LSB mask used by both macros
401 :
402 :			jz near .rounding1
403 :	edgomez	1.3	lea ebp,[byte ebp] ; ebp = ebp + 0: leaves ebp unchanged, acts as a filler instruction
404 :	edgomez	1.2	COPY_HV_SSE_RND0 ; rows 0-1 (the macro advances eax only)
405 :	edgomez	1.3	lea ecx,[ecx+2*edx] ; step dst two rows
406 :	edgomez	1.2	COPY_HV_SSE_RND0 ; rows 2-3
407 :	edgomez	1.3	lea ecx,[ecx+2*edx]
408 :	edgomez	1.2	COPY_HV_SSE_RND0 ; rows 4-5
409 :	edgomez	1.3	lea ecx,[ecx+2*edx]
410 :	edgomez	1.2	COPY_HV_SSE_RND0 ; rows 6-7
411 :			ret
412 :
413 :	edgomez	1.3	ALIGN 16
414 :	edgomez	1.2	.rounding1:
415 :			COPY_HV_SSE_RND1 ; rows 0-1
416 :	edgomez	1.3	lea ecx,[ecx+2*edx]
417 :	edgomez	1.2	COPY_HV_SSE_RND1 ; rows 2-3
418 :	edgomez	1.3	lea ecx,[ecx+2*edx]
419 :	edgomez	1.2	COPY_HV_SSE_RND1 ; rows 4-5
420 :	edgomez	1.3	lea ecx,[ecx+2*edx]
421 :	edgomez	1.2	COPY_HV_SSE_RND1 ; rows 6-7
422 :	edgomez	1.3	ret
423 :	edgomez	1.6	.endfunc: ; end marker referenced by the cglobal size expression
424 :
425 :	suxen_drol	1.7	;-----------------------------------------------------------------------------
426 :			;
427 :			; void interpolate8x4_halfpel_h_3dne(uint8_t * const dst,
428 :			; const uint8_t * const src,
429 :			; const uint32_t stride,
430 :			; const uint32_t rounding);
431 :			;
432 :			;-----------------------------------------------------------------------------
433 :
434 :			ALIGN 16
435 :			interpolate8x4_halfpel_h_3dne:
436 :			; Horizontal half-pel interpolation of an 8-wide, 4-row block:
			;   dst[y][x] = average of src[y][x] and src[y][x+1], for 4 rows of 8 bytes.
			; 32-bit stack arguments: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding.
			; rounding == 1 selects the truncating average, any other value the round-up average.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here.
437 :			mov eax, [esp+ 8] ; Src
438 :			mov edx, [esp+12] ; stride
439 :			dec dword [esp+16]; rounding (decremented in place on the stack; ZF set iff it was 1)
440 :
441 :			jz .rounding1
442 :			mov ecx, [esp+ 4] ; Dst
443 :
444 :			COPY_H_SSE_RND0 0 ; rows 0-1 (the macro advances eax only)
445 :			lea ecx,[ecx+2*edx] ; step dst two rows
446 :			COPY_H_SSE_RND0 1 ; rows 2-3
447 :			ret
448 :
449 :			.rounding1:
450 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
451 :			mov ecx, [esp+ 4] ; Dst
452 :			movq mm7, [mmx_one] ; LSB mask required by COPY_H_SSE_RND1
453 :			COPY_H_SSE_RND1 ; rows 0-1
454 :			lea ecx, [ecx+2*edx]
455 :			COPY_H_SSE_RND1 ; rows 2-3
456 :			ret
457 :			.endfunc: ; end marker referenced by the cglobal size expression
458 :
459 :			;-----------------------------------------------------------------------------
460 :			;
461 :			; void interpolate8x4_halfpel_v_3dne(uint8_t * const dst,
462 :			; const uint8_t * const src,
463 :			; const uint32_t stride,
464 :			; const uint32_t rounding);
465 :			;
466 :			;-----------------------------------------------------------------------------
467 :
468 :			ALIGN 16
469 :			interpolate8x4_halfpel_v_3dne:
470 :			; Vertical half-pel interpolation of an 8-wide, 4-row block:
			;   dst row y = average of src row y and src row y+1, so src rows 0..4 are read.
			; 32-bit stack arguments: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding.
			; rounding == 1 selects the truncating average, any other value the round-up average.
			; Uses eax, ecx, edx and MMX registers; esi is saved and restored on the
			; rounding=1 path. No emms is executed here.
471 :			mov eax, [esp+ 8] ; Src
472 :			mov edx, [esp+12] ; stride
473 :			dec dword [esp+16]; rounding (decremented in place on the stack; ZF set iff it was 1)
474 :
475 :			; we process 2 lines at a time
476 :
477 :			jz .rounding1
478 :			pxor mm2,mm2 ; mm2 = 0 so that the por below acts as a load
479 :			movq mm0, [eax] ; row 0
480 :			movq mm1, [eax+edx] ; row 1
481 :			por mm2, [eax+2*edx] ; row 2. Something like preload (pipelining)
482 :			mov ecx, [esp+ 4] ; Dst
483 :			lea eax, [eax+2*edx] ; eax -> row 2
484 :			pxor mm4, mm4 ; cleared for the por-load of row 4
485 :			pavgb mm0, mm1 ; (row0 + row1 + 1) / 2
486 :			pavgb mm1, mm2 ; (row1 + row2 + 1) / 2
487 :			movq [byte ecx], mm0 ; dst row 0
488 :			movq [ecx+edx], mm1 ; dst row 1
489 :
491 :			add eax, edx ; eax -> row 3
492 :			lea ecx, [ecx+2*edx] ; dst -> row 2
493 :			movq mm3, [byte eax] ; row 3
494 :			por mm4, [eax+edx] ; row 4 (last source row needed)
496 :			pavgb mm2, mm3 ; (row2 + row3 + 1) / 2
497 :			pavgb mm3, mm4 ; (row3 + row4 + 1) / 2
498 :			movq [ecx], mm2 ; dst row 2
499 :			movq [ecx+edx], mm3 ; dst row 3
500 :
501 :			ret
502 :
503 :			ALIGN 8
504 :			.rounding1:
			; Truncating average through complements:
			;   (a+b)/2 = 255 - pavgb(255-a, 255-b)
505 :			pcmpeqb mm0, mm0 ; all ones
506 :			psubusb mm0, [eax] ; eax==line0
507 :			add eax, edx ; eax==line1
508 :			mov ecx, [esp+ 4] ; Dst (read before the push below moves esp)
509 :
510 :			push esi ; esi is used as the constant pointer; restored before ret
511 :
512 :			pcmpeqb mm1, mm1
513 :			pcmpeqb mm2, mm2
514 :			mov esi, mm_minusone ; esi -> 8 bytes of 0FFh
515 :			psubusb mm1, [byte eax] ; line1
516 :			psubusb mm2, [eax+edx] ; line2
517 :			lea eax, [eax+2*edx] ; eax==line3
518 :			movq mm6, [esi]
519 :			movq mm7, [esi]
520 :			pavgb mm0, mm1 ; round-up average of the complements
521 :			pavgb mm1, mm2
522 :			psubusb mm6, mm0 ; complement back: (line0 + line1) / 2
523 :			psubusb mm7, mm1 ; (line1 + line2) / 2
524 :			movq [ecx], mm6 ; store line0
525 :			movq [ecx+edx], mm7 ; store line1
526 :
527 :			lea ecx, [ecx+2*edx]
528 :			pcmpeqb mm3, mm3
529 :			pcmpeqb mm4, mm4
530 :			psubusb mm3, [eax] ; line3
531 :			psubusb mm4, [eax+edx] ; line4 (last source row needed)
533 :			pavgb mm2, mm3
534 :			pavgb mm3, mm4
535 :			movq mm0, [esi]
536 :			movq mm1, [esi]
537 :			psubusb mm0, mm2 ; (line2 + line3) / 2
538 :			psubusb mm1, mm3 ; (line3 + line4) / 2
539 :			movq [ecx], mm0 ; store line2
540 :			movq [ecx+edx], mm1 ; store line3
541 :
542 :			pop esi ; restore the caller's esi
543 :
544 :			ret
545 :
546 :			.endfunc: ; end marker referenced by the cglobal size expression
547 :
548 :			;-----------------------------------------------------------------------------
549 :			;
550 :			; void interpolate8x4_halfpel_hv_3dne(uint8_t * const dst,
551 :			; const uint8_t * const src,
552 :			; const uint32_t stride,
553 :			; const uint32_t rounding);
554 :			;
555 :			;
556 :			;-----------------------------------------------------------------------------
557 :
558 :			ALIGN 16
559 :			interpolate8x4_halfpel_hv_3dne:
			; Half-pel interpolation in both directions for an 8-wide, 4-row block: every
			; output byte combines the 2x2 neighbourhood src[y][x], src[y][x+1],
			; src[y+1][x], src[y+1][x+1], so 5 source rows of 9 bytes are read.
			; 32-bit stack arguments: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding.
			; rounding == 1 runs COPY_HV_SSE_RND1, any other value COPY_HV_SSE_RND0.
			; Uses eax, ecx, edx and mm0-mm3, mm6, mm7; no emms is executed here.
560 :			mov eax, [esp+ 8] ; Src
561 :			mov edx, [esp+12] ; stride
562 :			dec dword [esp+16] ; rounding (decremented in place on the stack; ZF set iff it was 1)
563 :
564 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
			; (the mov and MMX instructions below leave EFLAGS alone, so ZF from the
			; dec is still valid at the jz)
565 :			movq mm2, [eax]
566 :			movq mm3, [eax+1]
567 :			movq mm6, mm2
568 :			pavgb mm2, mm3
569 :			pxor mm3, mm6 ; mm2/mm3 ready
570 :			mov ecx, [esp+ 4] ; Dst
571 :			movq mm7, [mmx_one] ; LSB mask used by both macros
572 :
573 :			jz near .rounding1
574 :			lea ebp,[byte ebp] ; ebp = ebp + 0: leaves ebp unchanged, acts as a filler instruction
575 :			COPY_HV_SSE_RND0 ; rows 0-1 (the macro advances eax only)
576 :			lea ecx,[ecx+2*edx] ; step dst two rows
577 :			COPY_HV_SSE_RND0 ; rows 2-3
578 :			ret
579 :
580 :			ALIGN 16
581 :			.rounding1:
582 :			COPY_HV_SSE_RND1 ; rows 0-1
583 :			lea ecx,[ecx+2*edx]
584 :			COPY_HV_SSE_RND1 ; rows 2-3
585 :			ret
586 :			.endfunc: ; end marker referenced by the cglobal size expression
587 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4