Annotation of /xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm

Revision 1.8 - (view) (download)

1 :	Isibaar	1.1	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx 8bit<->16bit transfers
5 :			; *
6 :	edgomez	1.8	; * This file is part of XviD, a free MPEG-4 video encoder/decoder
7 :			; *
8 :			; * XviD is free software; you can redistribute it and/or modify it
9 :			; * under the terms of the GNU General Public License as published by
10 :			; * the Free Software Foundation; either version 2 of the License, or
11 :			; * (at your option) any later version.
12 :			; *
13 :			; * This program is distributed in the hope that it will be useful,
14 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			; * GNU General Public License for more details.
17 :			; *
18 :			; * You should have received a copy of the GNU General Public License
19 :			; * along with this program; if not, write to the Free Software
20 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :			; *
22 :			; * Under section 8 of the GNU General Public License, the copyright
23 :			; * holders of XVID explicitly forbid distribution in the following
24 :			; * countries:
25 :			; *
26 :			; * - Japan
27 :			; * - United States of America
28 :			; *
29 :			; * Linking XviD statically or dynamically with other modules is making a
30 :			; * combined work based on XviD. Thus, the terms and conditions of the
31 :			; * GNU General Public License cover the whole combination.
32 :			; *
33 :			; * As a special exception, the copyright holders of XviD give you
34 :			; * permission to link XviD with independent modules that communicate with
35 :			; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
36 :			; * license terms of these independent modules, and to copy and distribute
37 :			; * the resulting combined work under terms of your choice, provided that
38 :			; * every copy of the combined work is accompanied by a complete copy of
39 :			; * the source code of XviD (the version of XviD used to produce the
40 :			; * combined work), being distributed under the terms of the GNU General
41 :			; * Public License plus this exception. An independent module is a module
42 :			; * which is not derived from or based on XviD.
43 :			; *
44 :			; * Note that people who make modified versions of XviD are not obligated
45 :			; * to grant this special exception for their modified versions; it is
46 :			; * their choice whether to do so. The GNU General Public License gives
47 :			; * permission to release a modified version without this exception; this
48 :			; * exception also makes it possible to release a modified version which
49 :			; * carries forward this exception.
50 :			; *
51 :			; * $Id$
52 :	Isibaar	1.1	; *
53 :			; *************************************************************************/
54 :
55 :			;/**************************************************************************
56 :			; *
57 :			; * History:
58 :			; *
59 :	Isibaar	1.5	; * 04.06.2002 speed enhancement (unroll+overlap). -Skal-
60 :			; * + added transfer_8to16sub2_mmx/xmm
61 :	Isibaar	1.1	; * 07.01.2002 merge functions from compensate_mmx; rename functions
62 :	suxen_drol	1.7	; * 07.11.2001 initial version; (c)2001 peter ross <pross@xvid.org>
63 :	Isibaar	1.1	; *
64 :			; *************************************************************************/
65 :
66 :
bits 32

;---------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
; When PREFIX is defined, the exported symbol is _<name> instead, and a
; single-line macro maps every later use of <name> in this file to _<name>.
;---------------------------------------------------------------------------
%macro cglobal 1
%ifndef PREFIX
        global  %1
%else
        global  _%1
        %define %1 _%1
%endif
%endmacro
77 :
78 :
section .text

; Entry points exported from this file, grouped by transfer direction.

; 8-bit source -> 16-bit destination
cglobal transfer_8to16copy_mmx
cglobal transfer_8to16sub_mmx
cglobal transfer_8to16sub2_mmx
cglobal transfer_8to16sub2_xmm

; 16-bit source -> 8-bit destination
cglobal transfer_16to8copy_mmx
cglobal transfer_16to8add_mmx

; 8-bit source -> 8-bit destination
cglobal transfer8x8_copy_mmx
88 :	Isibaar	1.1
89 :			;===========================================================================
90 :			;
91 :			; void transfer_8to16copy_mmx(int16_t * const dst,
92 :			; const uint8_t * const src,
93 :			; uint32_t stride);
94 :			;
95 :			;===========================================================================
96 :
;---------------------------------------------------------------------------
; COPY_8_TO_16 <n>
;
; Widens two 8-byte rows into 16-bit words and stores them as one 32-byte
; group.
;
;   In:    eax = source row, edx = source stride, ecx = destination base,
;          mm7 = interleave operand (transfer_8to16copy_mmx zeroes it, so
;          each byte is zero-extended)
;   Out:   [ecx + 32*n .. + 31] written; eax advanced by 2*edx
;   Clobb: mm0-mm3
;---------------------------------------------------------------------------
%macro COPY_8_TO_16 1
        movq      mm0, [eax]                    ; row 0, 8 bytes
        movq      mm1, [eax + edx]              ; row 1, 8 bytes
        movq      mm2, mm0                      ; keep copies for the high halves
        movq      mm3, mm1
        punpcklbw mm0, mm7                      ; row 0, bytes 0-3 -> words
        movq      [ecx + 32*%1], mm0
        punpcklbw mm1, mm7                      ; row 1, bytes 0-3 -> words
        movq      [ecx + 32*%1 + 16], mm1
        punpckhbw mm2, mm7                      ; row 0, bytes 4-7 -> words
        punpckhbw mm3, mm7                      ; row 1, bytes 4-7 -> words
        lea       eax, [eax + 2*edx]            ; step source past both rows
        movq      [ecx + 32*%1 + 8], mm2
        movq      [ecx + 32*%1 + 24], mm3
%endmacro
112 :
;---------------------------------------------------------------------------
; transfer_8to16copy_mmx
;
;   Stack args: [esp+4] = dst, [esp+8] = src, [esp+12] = stride
;   Expands COPY_8_TO_16 four times (two source rows each), i.e. 8 rows of
;   8 bytes are widened into 128 contiguous bytes at dst.
;   Clobb: eax, ecx, edx, mm0-mm3, mm7. No emms is issued here.
;---------------------------------------------------------------------------
align 16
transfer_8to16copy_mmx:
        mov       ecx, [esp + 4]                ; dst
        mov       eax, [esp + 8]                ; src
        mov       edx, [esp + 12]               ; stride
        pxor      mm7, mm7                      ; zero operand for the byte->word unpack

%assign ROWPAIR 0
%rep 4
        COPY_8_TO_16 ROWPAIR
%assign ROWPAIR ROWPAIR+1
%endrep
%undef ROWPAIR

        ret
126 :	Isibaar	1.1
127 :			;===========================================================================
128 :			;
129 :			; void transfer_16to8copy_mmx(uint8_t * const dst,
130 :			; const int16_t * const src,
131 :			; uint32_t stride);
132 :			;
133 :			;===========================================================================
134 :
;---------------------------------------------------------------------------
; COPY_16_TO_8 <n>
;
; Narrows one 32-byte group of signed 16-bit words into two 8-byte rows,
; saturating each word to the unsigned byte range (packuswb).
;
;   In:    eax = source base, ecx = destination row, edx = destination stride
;   Out:   [ecx] and [ecx + edx] written (8 bytes each)
;   Clobb: mm0-mm3
;   Note:  ecx is not advanced; the caller steps it between expansions.
;---------------------------------------------------------------------------
%macro COPY_16_TO_8 1
        movq      mm0, [eax + 32*%1]            ; row 0, words 0-3
        movq      mm1, [eax + 32*%1 + 8]        ; row 0, words 4-7
        packuswb  mm0, mm1                      ; row 0 as 8 saturated bytes
        movq      [ecx], mm0
        movq      mm2, [eax + 32*%1 + 16]       ; row 1, words 0-3
        movq      mm3, [eax + 32*%1 + 24]       ; row 1, words 4-7
        packuswb  mm2, mm3                      ; row 1 as 8 saturated bytes
        movq      [ecx + edx], mm2
%endmacro
145 :
;---------------------------------------------------------------------------
; transfer_16to8copy_mmx
;
;   Stack args: [esp+4] = dst, [esp+8] = src, [esp+12] = stride
;   Expands COPY_16_TO_8 four times; dst is stepped by two rows between
;   expansions, but not after the last one.
;   Clobb: eax, ecx, edx, mm0-mm3. No emms is issued here.
;---------------------------------------------------------------------------
align 16
transfer_16to8copy_mmx:
        mov       ecx, [esp + 4]                ; dst
        mov       eax, [esp + 8]                ; src
        mov       edx, [esp + 12]               ; stride

%assign ROWPAIR 0
%rep 3
        COPY_16_TO_8 ROWPAIR
        lea       ecx, [ecx + 2*edx]            ; next pair of destination rows
%assign ROWPAIR ROWPAIR+1
%endrep
%undef ROWPAIR
        COPY_16_TO_8 3                          ; last pair, no pointer step needed

        ret
161 :	Isibaar	1.1
162 :			;===========================================================================
163 :			;
164 :			; void transfer_8to16sub_mmx(int16_t * const dct,
165 :			; uint8_t * const cur,
166 :			; const uint8_t * const ref,
167 :			; const uint32_t stride);
168 :			;
169 :			;===========================================================================
170 :			;/**************************************************************************
171 :			; *
172 :			; * History:
173 :			; *
174 :			; * 27.12.2001 renamed from 'compensate' to 'transfer_8to16sub'
175 :			; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar)
176 :			; * 30.11.2001 16 pixels are processed per iteration (Isibaar)
177 :			; * 30.11.2001 .text missing
178 :	suxen_drol	1.7	; * 06.11.2001 initial version; (c)2001 peter ross <pross@xvid.org>
179 :	Isibaar	1.1	; *
180 :			; *************************************************************************/
181 :
;---------------------------------------------------------------------------
; COPY_8_TO_16_SUB <n>
;
; For two 8-byte rows: stores (cur - ref) as 16-bit words and overwrites the
; cur rows with the ref rows.
;
;   In:    eax = cur row, ebx = ref row, edx = stride, ecx = word output base,
;          mm7 = interleave operand (transfer_8to16sub_mmx zeroes it)
;   Out:   [ecx + 32*n .. + 31] = differences; [eax], [eax + edx] = ref rows;
;          eax and ebx advanced by 2*edx
;   Clobb: mm0-mm6
;---------------------------------------------------------------------------
%macro COPY_8_TO_16_SUB 1
        movq      mm0, [eax]                    ; cur row 0
        movq      mm2, [eax + edx]              ; cur row 1
        movq      mm1, mm0
        movq      mm3, mm2

        punpcklbw mm0, mm7                      ; cur row 0, low 4 bytes -> words
        punpcklbw mm2, mm7                      ; cur row 1, low 4 bytes -> words
        movq      mm4, [ebx]                    ; ref row 0
        punpckhbw mm1, mm7                      ; cur row 0, high 4 bytes -> words
        punpckhbw mm3, mm7                      ; cur row 1, high 4 bytes -> words
        movq      mm5, [ebx + edx]              ; ref row 1

        movq      mm6, mm4
        movq      [eax], mm4                    ; cur row 0 <- ref row 0
        movq      [eax + edx], mm5              ; cur row 1 <- ref row 1
        punpcklbw mm4, mm7
        punpckhbw mm6, mm7
        psubsw    mm0, mm4                      ; row 0 low:  cur - ref
        psubsw    mm1, mm6                      ; row 0 high: cur - ref
        movq      mm6, mm5
        punpcklbw mm5, mm7
        punpckhbw mm6, mm7
        psubsw    mm2, mm5                      ; row 1 low:  cur - ref
        lea       eax, [eax + 2*edx]            ; step cur past both rows
        psubsw    mm3, mm6                      ; row 1 high: cur - ref
        lea       ebx, [ebx + 2*edx]            ; step ref past both rows

        movq      [ecx + 32*%1], mm0            ; word output
        movq      [ecx + 32*%1 + 8], mm1
        movq      [ecx + 32*%1 + 16], mm2
        movq      [ecx + 32*%1 + 24], mm3
%endmacro
215 :
;---------------------------------------------------------------------------
; transfer_8to16sub_mmx
;
;   Stack args: [esp+4] = dct (word output), [esp+8] = cur,
;               [esp+12] = ref, [esp+16] = stride
;   Expands COPY_8_TO_16_SUB four times (two rows each).
;   ebx is saved and restored; the ref/stride loads happen after the push,
;   hence the extra +4 in their stack offsets.
;   Clobb: eax, ecx, edx, mm0-mm7. No emms is issued here.
;---------------------------------------------------------------------------
align 16
transfer_8to16sub_mmx:
        mov       ecx, [esp + 4]                ; dct
        mov       eax, [esp + 8]                ; cur
        push      ebx
        mov       ebx, [esp + 4 + 12]           ; ref
        mov       edx, [esp + 4 + 16]           ; stride
        pxor      mm7, mm7                      ; zero operand for the byte->word unpack

%assign ROWPAIR 0
%rep 4
        COPY_8_TO_16_SUB ROWPAIR
%assign ROWPAIR ROWPAIR+1
%endrep
%undef ROWPAIR

        pop       ebx
        ret
232 :	Isibaar	1.1
233 :	Isibaar	1.5	;===========================================================================
234 :			;
235 :			; void transfer_8to16sub2_mmx(int16_t * const dct,
236 :			; uint8_t * const cur,
237 :			; const uint8_t * ref1,
238 :			; const uint8_t * ref2,
239 :			; const uint32_t stride)
240 :			;
241 :			;===========================================================================
242 :
;---------------------------------------------------------------------------
; COPY_8_TO_16_SUB2_MMX <n>
;
; For two 8-byte rows: stores cur - ((ref1 + ref2 + 1) / 2) as 16-bit words.
;
;   In:    eax = cur row, ebx = ref1 row, esi = ref2 row, edx = stride,
;          ecx = word output base,
;          mm7 = interleave operand (transfer_8to16sub2_mmx zeroes it)
;   Out:   [ecx + 32*n .. + 31] = differences;
;          eax, ebx and esi advanced by 2*edx
;   Clobb: mm0-mm6
;
; The average is rounded up on ties, i.e. (a + b + 1) >> 1, which is the
; same result pavgb produces in COPY_8_TO_16_SUB2_SSE. The +1 comes from a
; per-word constant built in mm0; cur is therefore loaded only after both
; averages are done. Unlike COPY_8_TO_16_SUB, cur is not written back.
;---------------------------------------------------------------------------
%macro COPY_8_TO_16_SUB2_MMX 1
        ; mm0 <- 0x0001 in each word (rounding term)
        pcmpeqw   mm0, mm0                      ; every word = 0xFFFF
        psrlw     mm0, 15                       ; every word = 0x0001

        ; mm4 <- (ref1 + ref2 + 1) / 2, row 0
        movq      mm4, [ebx]                    ; ref1 row 0
        movq      mm1, [esi]                    ; ref2 row 0
        movq      mm6, mm4
        movq      mm3, mm1
        punpcklbw mm4, mm7
        punpcklbw mm1, mm7
        punpckhbw mm6, mm7
        punpckhbw mm3, mm7
        paddusw   mm4, mm1                      ; low:  ref1 + ref2 (<= 510)
        paddusw   mm6, mm3                      ; high: ref1 + ref2
        paddusw   mm4, mm0                      ; + 1 (<= 511, cannot saturate)
        paddusw   mm6, mm0
        psrlw     mm4, 1
        psrlw     mm6, 1
        packuswb  mm4, mm6                      ; back to 8 bytes (values <= 255)

        ; mm5 <- (ref1 + ref2 + 1) / 2, row 1
        movq      mm5, [ebx + edx]              ; ref1 row 1
        movq      mm1, [esi + edx]              ; ref2 row 1
        movq      mm6, mm5
        movq      mm3, mm1
        punpcklbw mm5, mm7
        punpcklbw mm1, mm7
        punpckhbw mm6, mm7
        punpckhbw mm3, mm7
        paddusw   mm5, mm1
        paddusw   mm6, mm3
        paddusw   mm5, mm0                      ; + 1 before halving
        paddusw   mm6, mm0
        lea       esi, [esi + 2*edx]            ; step ref2 past both rows
        psrlw     mm5, 1
        psrlw     mm6, 1
        packuswb  mm5, mm6

        ; cur rows -> words (mm0 is free again now)
        movq      mm0, [eax]                    ; cur row 0
        movq      mm2, [eax + edx]              ; cur row 1
        movq      mm1, mm0
        movq      mm3, mm2
        punpcklbw mm0, mm7
        punpcklbw mm2, mm7
        punpckhbw mm1, mm7
        punpckhbw mm3, mm7

        ; subtract the averaged reference
        movq      mm6, mm4
        punpcklbw mm4, mm7
        punpckhbw mm6, mm7
        psubsw    mm0, mm4                      ; row 0 low
        psubsw    mm1, mm6                      ; row 0 high
        movq      mm6, mm5
        punpcklbw mm5, mm7
        punpckhbw mm6, mm7
        psubsw    mm2, mm5                      ; row 1 low
        lea       eax, [eax + 2*edx]            ; step cur past both rows
        psubsw    mm3, mm6                      ; row 1 high
        lea       ebx, [ebx + 2*edx]            ; step ref1 past both rows

        movq      [ecx + 32*%1], mm0            ; word output
        movq      [ecx + 32*%1 + 8], mm1
        movq      [ecx + 32*%1 + 16], mm2
        movq      [ecx + 32*%1 + 24], mm3
%endmacro
304 :	Isibaar	1.1
;---------------------------------------------------------------------------
; transfer_8to16sub2_mmx
;
;   Stack args: [esp+4] = dct (word output), [esp+8] = cur,
;               [esp+12] = ref1, [esp+16] = ref2, [esp+20] = stride
;   Expands COPY_8_TO_16_SUB2_MMX four times (two rows each).
;   ebx and esi are saved and restored; stack offsets grow by 4 after each
;   push.
;   Clobb: eax, ecx, edx, mm0-mm7. No emms is issued here.
;---------------------------------------------------------------------------
align 16
transfer_8to16sub2_mmx:
        mov       ecx, [esp + 4]                ; dct
        mov       eax, [esp + 8]                ; cur
        push      ebx
        mov       ebx, [esp + 4 + 12]           ; ref1
        push      esi
        mov       esi, [esp + 8 + 16]           ; ref2
        mov       edx, [esp + 8 + 20]           ; stride
        pxor      mm7, mm7                      ; zero operand for the byte->word unpack

%assign ROWPAIR 0
%rep 4
        COPY_8_TO_16_SUB2_MMX ROWPAIR
%assign ROWPAIR ROWPAIR+1
%endrep
%undef ROWPAIR

        pop       esi
        pop       ebx
        ret
324 :	Isibaar	1.1
325 :	edgomez	1.2	;===========================================================================
326 :			;
327 :			; void transfer_8to16sub2_xmm(int16_t * const dct,
328 :	Isibaar	1.5	; uint8_t * const cur,
329 :			; const uint8_t * ref1,
330 :			; const uint8_t * ref2,
331 :			; const uint32_t stride)
332 :	edgomez	1.2	;
333 :			;===========================================================================
334 :
;---------------------------------------------------------------------------
; COPY_8_TO_16_SUB2_SSE <n>
;
; For two 8-byte rows: stores cur - avg(ref1, ref2) as 16-bit words, where
; the average is taken per byte with pavgb.
;
;   In:    eax = cur row, ebx = ref1 row, esi = ref2 row, edx = stride,
;          ecx = word output base,
;          mm7 = interleave operand (transfer_8to16sub2_xmm zeroes it)
;   Out:   [ecx + 32*n .. + 31] = differences;
;          eax, ebx and esi advanced by 2*edx
;   Clobb: mm0-mm6
;---------------------------------------------------------------------------
%macro COPY_8_TO_16_SUB2_SSE 1
        movq      mm0, [eax]                    ; cur row 0
        movq      mm2, [eax + edx]              ; cur row 1
        movq      mm1, mm0
        movq      mm3, mm2

        punpcklbw mm0, mm7                      ; cur row 0, low 4 bytes -> words
        punpcklbw mm2, mm7                      ; cur row 1, low 4 bytes -> words
        movq      mm4, [ebx]                    ; ref1 row 0
        pavgb     mm4, [esi]                    ; averaged with ref2 row 0
        punpckhbw mm1, mm7                      ; cur row 0, high 4 bytes -> words
        punpckhbw mm3, mm7                      ; cur row 1, high 4 bytes -> words
        movq      mm5, [ebx + edx]              ; ref1 row 1
        pavgb     mm5, [esi + edx]              ; averaged with ref2 row 1

        movq      mm6, mm4
        punpcklbw mm4, mm7
        punpckhbw mm6, mm7
        psubsw    mm0, mm4                      ; row 0 low:  cur - avg
        psubsw    mm1, mm6                      ; row 0 high: cur - avg
        lea       esi, [esi + 2*edx]            ; step ref2 past both rows
        movq      mm6, mm5
        punpcklbw mm5, mm7
        punpckhbw mm6, mm7
        psubsw    mm2, mm5                      ; row 1 low:  cur - avg
        lea       eax, [eax + 2*edx]            ; step cur past both rows
        psubsw    mm3, mm6                      ; row 1 high: cur - avg
        lea       ebx, [ebx + 2*edx]            ; step ref1 past both rows

        movq      [ecx + 32*%1], mm0            ; word output
        movq      [ecx + 32*%1 + 8], mm1
        movq      [ecx + 32*%1 + 16], mm2
        movq      [ecx + 32*%1 + 24], mm3
%endmacro
369 :
;---------------------------------------------------------------------------
; transfer_8to16sub2_xmm
;
;   Stack args: [esp+4] = dct (word output), [esp+8] = cur,
;               [esp+12] = ref1, [esp+16] = ref2, [esp+20] = stride
;   Expands COPY_8_TO_16_SUB2_SSE four times (two rows each).
;   ebx and esi are saved and restored; stack offsets grow by 4 after each
;   push.
;   Clobb: eax, ecx, edx, mm0-mm7. No emms is issued here.
;---------------------------------------------------------------------------
align 16
transfer_8to16sub2_xmm:
        mov       ecx, [esp + 4]                ; dct
        mov       eax, [esp + 8]                ; cur
        push      ebx
        mov       ebx, [esp + 4 + 12]           ; ref1
        push      esi
        mov       esi, [esp + 8 + 16]           ; ref2
        mov       edx, [esp + 8 + 20]           ; stride
        pxor      mm7, mm7                      ; zero operand for the byte->word unpack

%assign ROWPAIR 0
%rep 4
        COPY_8_TO_16_SUB2_SSE ROWPAIR
%assign ROWPAIR ROWPAIR+1
%endrep
%undef ROWPAIR

        pop       esi
        pop       ebx
        ret
389 :	Isibaar	1.1
390 :			;===========================================================================
391 :			;
392 :			; void transfer_16to8add_mmx(uint8_t * const dst,
393 :			; const int16_t * const src,
394 :			; uint32_t stride);
395 :			;
396 :			;===========================================================================
397 :
;---------------------------------------------------------------------------
; COPY_16_TO_8_ADD <n>
;
; Adds one 32-byte group of signed 16-bit words onto two 8-byte destination
; rows, in place. The sums are packed back with unsigned saturation.
;
;   In:    ecx = destination row, edx = destination stride,
;          eax = word source base,
;          mm7 = interleave operand (transfer_16to8add_mmx zeroes it)
;   Out:   [ecx] and [ecx + edx] updated (8 bytes each)
;   Clobb: mm0-mm3
;   Note:  ecx is not advanced; the caller steps it between expansions.
;---------------------------------------------------------------------------
%macro COPY_16_TO_8_ADD 1
        movq      mm0, [ecx]                    ; dst row 0
        movq      mm2, [ecx + edx]              ; dst row 1
        movq      mm1, mm0
        movq      mm3, mm2
        punpcklbw mm0, mm7                      ; dst row 0, low 4 bytes -> words
        punpcklbw mm2, mm7                      ; dst row 1, low 4 bytes -> words
        punpckhbw mm1, mm7                      ; dst row 0, high 4 bytes -> words
        punpckhbw mm3, mm7                      ; dst row 1, high 4 bytes -> words
        paddsw    mm0, [eax + 32*%1]            ; + src words (signed saturating)
        paddsw    mm1, [eax + 32*%1 + 8]
        paddsw    mm2, [eax + 32*%1 + 16]
        paddsw    mm3, [eax + 32*%1 + 24]
        packuswb  mm0, mm1                      ; row 0 clamped to bytes
        movq      [ecx], mm0
        packuswb  mm2, mm3                      ; row 1 clamped to bytes
        movq      [ecx + edx], mm2
%endmacro
416 :	Isibaar	1.1
417 :
;---------------------------------------------------------------------------
; transfer_16to8add_mmx
;
;   Stack args: [esp+4] = dst, [esp+8] = src, [esp+12] = stride
;   Expands COPY_16_TO_8_ADD four times; dst is stepped by two rows between
;   expansions, but not after the last one.
;   Clobb: eax, ecx, edx, mm0-mm3, mm7. No emms is issued here.
;---------------------------------------------------------------------------
align 16
transfer_16to8add_mmx:
        mov       ecx, [esp + 4]                ; dst
        mov       eax, [esp + 8]                ; src
        mov       edx, [esp + 12]               ; stride
        pxor      mm7, mm7                      ; zero operand for the byte->word unpack

%assign ROWPAIR 0
%rep 3
        COPY_16_TO_8_ADD ROWPAIR
        lea       ecx, [ecx + 2*edx]            ; next pair of destination rows
%assign ROWPAIR ROWPAIR+1
%endrep
%undef ROWPAIR
        COPY_16_TO_8_ADD 3                      ; last pair, no pointer step needed

        ret
433 :	Isibaar	1.1
434 :			;===========================================================================
435 :			;
436 :			; void transfer8x8_copy_mmx(uint8_t * const dst,
437 :			; const uint8_t * const src,
438 :			; const uint32_t stride);
439 :			;
440 :			;
441 :			;===========================================================================
442 :
;---------------------------------------------------------------------------
; COPY_8_TO_8
;
; Copies two 8-byte rows from [eax] / [eax + edx] to [ecx] / [ecx + edx].
;
;   In:    eax = source row, ecx = destination row,
;          edx = stride (the same register offsets both pointers)
;   Out:   eax advanced by 2*edx
;   Clobb: mm0, mm1
;   Note:  ecx is not advanced; the caller steps it between expansions.
;---------------------------------------------------------------------------
%macro COPY_8_TO_8 0
        movq      mm0, [eax]                    ; row 0
        movq      mm1, [eax + edx]              ; row 1
        movq      [ecx], mm0
        lea       eax, [eax + 2*edx]            ; step source past both rows
        movq      [ecx + edx], mm1
%endmacro
450 :
;---------------------------------------------------------------------------
; transfer8x8_copy_mmx
;
;   Stack args: [esp+4] = dst, [esp+8] = src, [esp+12] = stride
;   Expands COPY_8_TO_8 four times (8 rows of 8 bytes); dst is stepped by
;   two rows between expansions, but not after the last one.
;   Clobb: eax, ecx, edx, mm0, mm1. No emms is issued here.
;---------------------------------------------------------------------------
align 16
transfer8x8_copy_mmx:
        mov       ecx, [esp + 4]                ; dst
        mov       eax, [esp + 8]                ; src
        mov       edx, [esp + 12]               ; stride

%rep 3
        COPY_8_TO_8
        lea       ecx, [ecx + 2*edx]            ; next pair of destination rows
%endrep
        COPY_8_TO_8                             ; last pair, no pointer step needed

        ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4