Annotation of /xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm

Revision 1.9 - (view) (download)

1 :	Isibaar	1.1	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx 8bit<->16bit transfers
5 :			; *
6 :	edgomez	1.9	; * This program is an implementation of a part of one or more MPEG-4
7 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
8 :			; * to use this software module in hardware or software products are
9 :			; * advised that its use may infringe existing patents or copyrights, and
10 :			; * any such use would be at such party's own risk. The original
11 :			; * developer of this software module and his/her company, and subsequent
12 :			; * editors and their companies, will have no liability for use of this
13 :			; * software or modifications or derivatives thereof.
14 :			; *
15 :			; * This program is free software; you can redistribute it and/or modify
16 :			; * it under the terms of the GNU General Public License as published by
17 :			; * the Free Software Foundation; either version 2 of the License, or
18 :			; * (at your option) any later version.
19 :			; *
20 :			; * This program is distributed in the hope that it will be useful,
21 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 :			; * GNU General Public License for more details.
24 :			; *
25 :			; * You should have received a copy of the GNU General Public License
26 :			; * along with this program; if not, write to the Free Software
27 :			; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 :	Isibaar	1.1	; *
29 :			; *************************************************************************/
30 :
31 :			;/**************************************************************************
32 :			; *
33 :			; * History:
34 :			; *
35 :	Isibaar	1.5	; * 04.06.2002 speed enhancement (unroll+overlap). -Skal-
36 :			; * + added transfer_8to16sub2_mmx/xmm
37 :	Isibaar	1.1	; * 07.01.2002 merge functions from compensate_mmx; rename functions
38 :	edgomez	1.9	; * 07.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
39 :	Isibaar	1.1	; *
40 :			; *************************************************************************/
41 :
42 :
43 :			bits 32 ; NASM directive: assemble everything below as 32-bit code
44 :
45 :			%macro cglobal 1
			; cglobal <symbol>
			; Declares <symbol> with the `global` directive. When PREFIX is
			; defined, the exported name gets a leading underscore and <symbol>
			; is %define'd to that underscored name, so later uses of the
			; plain name in this file refer to the exported one.
			%ifndef PREFIX
			global %1
			%else
			global _%1
			%define %1 _%1
			%endif
52 :			%endmacro
53 :
54 :
55 :			section .text
56 :
			; Entry points exported from this file, one cglobal line per symbol
			; (cglobal adds a leading underscore when PREFIX is defined).
57 :	Isibaar	1.5	cglobal transfer_8to16copy_mmx
58 :			cglobal transfer_16to8copy_mmx
59 :			cglobal transfer_8to16sub_mmx
60 :	edgomez	1.9	cglobal transfer_8to16subro_mmx
61 :	Isibaar	1.5	cglobal transfer_8to16sub2_mmx
62 :			cglobal transfer_8to16sub2_xmm
63 :			cglobal transfer_16to8add_mmx
64 :			cglobal transfer8x8_copy_mmx
65 :	Isibaar	1.1
66 :			;===========================================================================
67 :			;
68 :			; void transfer_8to16copy_mmx(int16_t * const dst,
69 :			; const uint8_t * const src,
70 :			; uint32_t stride);
71 :			;
72 :			;===========================================================================
73 :
74 :	Isibaar	1.5	%macro COPY_8_TO_16 1
			; COPY_8_TO_16 <n>
			; Widens two 8-byte rows into 16-bit words.
			; In: eax = current source row, edx = byte offset to the next row,
			; ecx = destination base, mm7 = 0 (the calling function clears it
			; with pxor), %1 = row-pair index: output goes to
			; [ecx+%1*32] .. [ecx+%1*32+31].
			; Out: eax advanced by 2*edx. Overwrites mm0-mm3.
75 :			movq mm0, [eax] ; first row, 8 bytes
76 :			movq mm1, [eax+edx] ; second row, 8 bytes
77 :			movq mm2, mm0 ; keep copies for the high halves
78 :			movq mm3, mm1
79 :			punpcklbw mm0, mm7 ; low 4 bytes interleaved with mm7 -> 4 words
80 :			movq [ecx+%1*32], mm0 ; first row, words 0-3
81 :			punpcklbw mm1, mm7
82 :			movq [ecx+%1*32+16], mm1 ; second row, words 0-3
83 :			punpckhbw mm2, mm7 ; high 4 bytes interleaved with mm7 -> 4 words
84 :			punpckhbw mm3, mm7
85 :			lea eax,[eax+2*edx] ; step the source pointer two rows down
86 :			movq [ecx+%1*32+8], mm2 ; first row, words 4-7
87 :			movq [ecx+%1*32+24], mm3 ; second row, words 4-7
88 :			%endmacro
89 :
90 :	Isibaar	1.1	align 16
91 :	Isibaar	1.5	transfer_8to16copy_mmx:
			; Expands 8 rows of 8 bytes from src into 64 consecutive 16-bit
			; words at dst. Arguments are read from the stack:
			; [esp+4] = dst, [esp+8] = src, [esp+12] = stride (bytes per src row).
			; Uses eax, ecx, edx and mm0-mm3, mm7; no emms is executed here.
			pxor mm7, mm7 ; zero register the macro unpacks against
			mov edx, [esp+12] ; stride
			mov eax, [esp+8] ; src
			mov ecx, [esp+4] ; dst

			; four row pairs -> 8 rows
			%assign ROW_PAIR 0
			%rep 4
			COPY_8_TO_16 ROW_PAIR
			%assign ROW_PAIR ROW_PAIR+1
			%endrep
			%undef ROW_PAIR
102 :			ret
103 :	Isibaar	1.1
104 :			;===========================================================================
105 :			;
106 :			; void transfer_16to8copy_mmx(uint8_t * const dst,
107 :			; const int16_t * const src,
108 :			; uint32_t stride);
109 :			;
110 :			;===========================================================================
111 :
112 :	Isibaar	1.5	%macro COPY_16_TO_8 1
			; COPY_16_TO_8 <n>
			; Narrows two rows of eight 16-bit words to two rows of 8 bytes.
			; In: eax = source base (row pair %1 is read from
			; [eax+%1*32] .. [eax+%1*32+31]), ecx = destination row,
			; edx = byte offset to the next destination row.
			; packuswb converts signed words to bytes with unsigned saturation.
			; Neither eax nor ecx is advanced here; the caller steps ecx.
			; Overwrites mm0-mm3.
113 :			movq mm0, [eax+%1*32] ; first row, words 0-3
114 :			movq mm1, [eax+%1*32+8] ; first row, words 4-7
115 :			packuswb mm0, mm1 ; 8 words -> 8 saturated bytes
116 :			movq [ecx], mm0
117 :			movq mm2, [eax+%1*32+16] ; second row, words 0-3
118 :			movq mm3, [eax+%1*32+24] ; second row, words 4-7
119 :			packuswb mm2, mm3
120 :			movq [ecx+edx], mm2
121 :			%endmacro
122 :
123 :	Isibaar	1.1	align 16
124 :	Isibaar	1.5	transfer_16to8copy_mmx:
			; Packs 64 consecutive 16-bit words from src into 8 rows of 8 bytes
			; at dst (unsigned saturation is done by packuswb in the macro).
			; Arguments are read from the stack:
			; [esp+4] = dst, [esp+8] = src, [esp+12] = stride (bytes per dst row).
			; Uses eax, ecx, edx and mm0-mm3; no emms is executed here.
			mov edx, [esp+12] ; stride
			mov eax, [esp+8] ; src
			mov ecx, [esp+4] ; dst

			; row pairs 0-2 are each followed by a two-row step of dst
			%assign ROW_PAIR 0
			%rep 3
			COPY_16_TO_8 ROW_PAIR
			lea ecx, [ecx+2*edx]
			%assign ROW_PAIR ROW_PAIR+1
			%endrep
			%undef ROW_PAIR
			; last row pair: no pointer step needed afterwards
			COPY_16_TO_8 3
137 :			ret
138 :	Isibaar	1.1
139 :			;===========================================================================
140 :			;
141 :			; void transfer_8to16sub_mmx(int16_t * const dct,
142 :			; uint8_t * const cur,
143 :			; const uint8_t * const ref,
144 :			; const uint32_t stride);
145 :			;
146 :			;===========================================================================
147 :			;/**************************************************************************
148 :			; *
149 :			; * History:
150 :			; *
151 :			; * 27.12.2001 renamed from 'compensate' to 'transfer_8to16sub'
152 :			; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar)
153 :			; * 30.11.2001 16 pixels are processed per iteration (Isibaar)
154 :			; * 30.11.2001 .text missing
155 :	edgomez	1.9	; * 06.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
156 :	Isibaar	1.1	; *
157 :			; *************************************************************************/
158 :
159 :	edgomez	1.9	; when second argument == 1, the reference block (ebx) is copied to the current block (eax)
160 :			%macro COPY_8_TO_16_SUB 2
			; COPY_8_TO_16_SUB <n>, <write_back>
			; For two 8-byte rows, stores (cur - ref) as 16-bit words.
			; In: eax = cur row, ebx = ref row, edx = byte offset to the next
			; row, ecx = output base, mm7 = 0 (cleared by the calling
			; function), %1 = row-pair index: output goes to
			; [ecx+%1*32] .. [ecx+%1*32+31].
			; %2 == 1: the two ref rows are also written over the cur rows.
			; Out: eax and ebx advanced by 2*edx. Overwrites mm0-mm6.
161 :	Isibaar	1.5	movq mm0, [eax] ; cur, first row
162 :			movq mm2, [eax+edx] ; cur, second row
163 :			movq mm1, mm0 ; copies for the high halves
164 :			movq mm3, mm2
165 :
166 :			punpcklbw mm0, mm7 ; cur low bytes -> words
167 :			punpcklbw mm2, mm7
168 :			movq mm4, [ebx] ; ref, first row
169 :	edgomez	1.9	punpckhbw mm1, mm7 ; cur high bytes -> words
170 :			punpckhbw mm3, mm7
171 :	Isibaar	1.5	movq mm5, [ebx+edx] ; ref, second row
172 :
173 :			movq mm6, mm4 ; copy of ref first row for its high half
174 :	edgomez	1.9	%if %2 == 1
175 :	Isibaar	1.5	movq [eax], mm4 ; cur rows were loaded above, so overwriting is safe
176 :			movq [eax+edx], mm5
177 :	edgomez	1.9	%endif
178 :	Isibaar	1.5	punpcklbw mm4, mm7 ; ref first row, low bytes -> words
179 :			punpckhbw mm6, mm7 ; ref first row, high bytes -> words
180 :			psubsw mm0, mm4 ; cur - ref (signed saturating word subtract)
181 :			psubsw mm1, mm6
182 :			movq mm6, mm5 ; same again for the second row
183 :			punpcklbw mm5, mm7
184 :			punpckhbw mm6, mm7
185 :			psubsw mm2, mm5
186 :			lea eax,[eax+2*edx] ; step cur two rows
187 :			psubsw mm3, mm6
188 :			lea ebx,[ebx+2*edx] ; step ref two rows
189 :
190 :			movq [ecx+%1*32+ 0], mm0 ; dst: first row, words 0-3
191 :	edgomez	1.9	movq [ecx+%1*32+ 8], mm1 ; first row, words 4-7
192 :			movq [ecx+%1*32+16], mm2 ; second row, words 0-3
193 :			movq [ecx+%1*32+24], mm3 ; second row, words 4-7
194 :	Isibaar	1.5	%endmacro
195 :
196 :	Isibaar	1.1	align 16
197 :	Isibaar	1.5	transfer_8to16sub_mmx:
			; Writes cur - ref for 8 rows of 8 bytes as 64 consecutive 16-bit
			; words at dct, and copies each ref row over the matching cur row
			; (second macro argument = 1).
			; Stack arguments on entry: [esp+4] = dct, [esp+8] = cur,
			; [esp+12] = ref, [esp+16] = stride (bytes per row).
			; ebx is saved and restored; eax, ecx, edx and mm0-mm7 are
			; overwritten; no emms is executed here.
			push ebx ; from here on every argument sits 4 bytes higher
			pxor mm7, mm7 ; zero register the macro unpacks against
			mov edx, [esp+4+16] ; stride
			mov ebx, [esp+4+12] ; ref
			mov eax, [esp+4+ 8] ; cur
			mov ecx, [esp+4+ 4] ; dct

			; four row pairs -> 8 rows
			%assign ROW_PAIR 0
			%rep 4
			COPY_8_TO_16_SUB ROW_PAIR, 1
			%assign ROW_PAIR ROW_PAIR+1
			%endrep
			%undef ROW_PAIR

210 :			pop ebx
211 :			ret
212 :
213 :
214 :			align 16
215 :			transfer_8to16subro_mmx:
			; Same subtraction as transfer_8to16sub_mmx, but cur is only read:
			; the macro is invoked with its second argument = 0, so the ref
			; rows are not written back over cur.
			; Stack arguments on entry: [esp+4] = dct, [esp+8] = cur,
			; [esp+12] = ref, [esp+16] = stride (bytes per row).
			; ebx is saved and restored; eax, ecx, edx and mm0-mm7 are
			; overwritten; no emms is executed here.
			push ebx ; from here on every argument sits 4 bytes higher
			pxor mm7, mm7 ; zero register the macro unpacks against
			mov edx, [esp+4+16] ; stride
			mov ebx, [esp+4+12] ; ref
			mov eax, [esp+4+ 8] ; cur
			mov ecx, [esp+4+ 4] ; dct

			; four row pairs -> 8 rows
			%assign ROW_PAIR 0
			%rep 4
			COPY_8_TO_16_SUB ROW_PAIR, 0
			%assign ROW_PAIR ROW_PAIR+1
			%endrep
			%undef ROW_PAIR

228 :	Isibaar	1.5	pop ebx
229 :			ret
230 :	edgomez	1.9
231 :	Isibaar	1.1
232 :	Isibaar	1.5	;===========================================================================
233 :			;
234 :			; void transfer_8to16sub2_mmx(int16_t * const dct,
235 :			; uint8_t * const cur,
236 :			; const uint8_t * ref1,
237 :			; const uint8_t * ref2,
238 :			; const uint32_t stride)
239 :			;
240 :			;===========================================================================
241 :
242 :			%macro COPY_8_TO_16_SUB2_MMX 1
			; COPY_8_TO_16_SUB2_MMX <n>
			; For two 8-byte rows, stores cur - ((ref1 + ref2 + 1) >> 1) as
			; 16-bit words.
			; In: eax = cur row, ebx = ref1 row, esi = ref2 row, edx = byte
			; offset to the next row, ecx = output base, mm7 = 0,
			; %1 = row-pair index: output goes to [ecx+%1*32] .. [ecx+%1*32+31].
			; Out: eax, ebx and esi advanced by 2*edx; mm7 is 0 again on exit.
			; Overwrites mm0-mm6.
			;
			; The average is rounded up, i.e. the same result pavgb gives in
			; COPY_8_TO_16_SUB2_SSE. The rounding "+1" used to be missing, so
			; this path truncated and disagreed with the xmm path whenever
			; ref1+ref2 was odd. No MMX register is free while the sums are
			; live, so mm7 is briefly set to all ones (-1 per word),
			; subtracted from the sums, and cleared again.
243 :			movq mm0, [eax] ; cur
244 :			movq mm2, [eax+edx]
245 :
246 :			; mm4 <- (ref1+ref2+1) / 2
247 :			movq mm4, [ebx] ; ref1
248 :			movq mm1, [esi] ; ref2
249 :			movq mm6, mm4
250 :			movq mm3, mm1
251 :			punpcklbw mm4, mm7
252 :			punpcklbw mm1, mm7
253 :			punpckhbw mm6, mm7
254 :			punpckhbw mm3, mm7
255 :			paddusw mm4, mm1
256 :			paddusw mm6, mm3
			pcmpeqw mm7, mm7 ; mm7 = 0xFFFF in every word (-1)
			psubw mm4, mm7 ; sum - (-1) = sum + 1; sums are <= 510, no wrap
			psubw mm6, mm7
			pxor mm7, mm7 ; back to zero for the unpacks that follow
257 :			psrlw mm4,1
258 :			psrlw mm6,1
259 :			packuswb mm4, mm6
260 :
261 :			; mm5 <- (ref1+ref2+1) / 2
262 :			movq mm5, [ebx+edx] ; ref1
263 :			movq mm1, [esi+edx] ; ref2
264 :			movq mm6, mm5
265 :			movq mm3, mm1
266 :			punpcklbw mm5, mm7
267 :			punpcklbw mm1, mm7
268 :			punpckhbw mm6, mm7
269 :			punpckhbw mm3, mm7
270 :			paddusw mm5, mm1
271 :			paddusw mm6, mm3
272 :			lea esi,[esi+2*edx]
			pcmpeqw mm7, mm7 ; mm7 = 0xFFFF in every word (-1)
			psubw mm5, mm7 ; add the rounding 1 to each word sum
			psubw mm6, mm7
			pxor mm7, mm7 ; back to zero for the unpacks that follow
273 :			psrlw mm5,1
274 :			psrlw mm6,1
275 :			packuswb mm5, mm6
276 :
277 :
278 :			movq mm1, mm0
279 :			movq mm3, mm2
280 :			punpcklbw mm0, mm7
281 :			punpcklbw mm2, mm7
282 :			punpckhbw mm1, mm7
283 :			punpckhbw mm3, mm7
284 :
285 :			movq mm6, mm4
286 :			punpcklbw mm4, mm7
287 :			punpckhbw mm6, mm7
288 :			psubsw mm0, mm4
289 :			psubsw mm1, mm6
290 :			movq mm6, mm5
291 :			punpcklbw mm5, mm7
292 :			punpckhbw mm6, mm7
293 :			psubsw mm2, mm5
294 :			lea eax,[eax+2*edx]
295 :			psubsw mm3, mm6
296 :			lea ebx,[ebx+2*edx]
297 :
298 :			movq [ecx+%1*32+ 0], mm0 ; dst
299 :			movq [ecx+%1*32+ 8], mm1
300 :			movq [ecx+%1*32+16], mm2
301 :			movq [ecx+%1*32+24], mm3
302 :			%endmacro
303 :	Isibaar	1.1
304 :	Isibaar	1.5	align 16
305 :			transfer_8to16sub2_mmx:
			; For 8 rows of 8 bytes, writes cur minus the average of ref1 and
			; ref2 (as computed by COPY_8_TO_16_SUB2_MMX) as 64 consecutive
			; 16-bit words at dct. cur is only read.
			; Stack arguments on entry: [esp+4] = dct, [esp+8] = cur,
			; [esp+12] = ref1, [esp+16] = ref2, [esp+20] = stride.
			; ebx and esi are saved and restored; eax, ecx, edx and mm0-mm7
			; are overwritten; no emms is executed here.
			push ebx
			push esi ; two pushes: every argument now sits 8 bytes higher
			pxor mm7, mm7 ; zero register the macro unpacks against
			mov edx, [esp+8+20] ; stride
			mov esi, [esp+8+16] ; ref2
			mov ebx, [esp+8+12] ; ref1
			mov eax, [esp+8+ 8] ; cur
			mov ecx, [esp+8+ 4] ; dct

			; four row pairs -> 8 rows
			%assign ROW_PAIR 0
			%rep 4
			COPY_8_TO_16_SUB2_MMX ROW_PAIR
			%assign ROW_PAIR ROW_PAIR+1
			%endrep
			%undef ROW_PAIR

320 :			pop esi
321 :			pop ebx
322 :			ret
323 :	Isibaar	1.1
324 :	edgomez	1.2	;===========================================================================
325 :			;
326 :			; void transfer_8to16sub2_xmm(int16_t * const dct,
327 :	Isibaar	1.5	; uint8_t * const cur,
328 :			; const uint8_t * ref1,
329 :			; const uint8_t * ref2,
330 :			; const uint32_t stride)
331 :	edgomez	1.2	;
332 :			;===========================================================================
333 :
334 :	Isibaar	1.5	%macro COPY_8_TO_16_SUB2_SSE 1
			; COPY_8_TO_16_SUB2_SSE <n>
			; For two 8-byte rows, stores cur - avg(ref1, ref2) as 16-bit
			; words, where avg is pavgb (per-byte average, rounded up).
			; In: eax = cur row, ebx = ref1 row, esi = ref2 row, edx = byte
			; offset to the next row, ecx = output base, mm7 = 0 (cleared by
			; the calling function), %1 = row-pair index: output goes to
			; [ecx+%1*32] .. [ecx+%1*32+31].
			; Out: eax, ebx and esi advanced by 2*edx. Overwrites mm0-mm6.
335 :			movq mm0, [eax] ; cur, first row
336 :			movq mm2, [eax+edx] ; cur, second row
337 :			movq mm1, mm0 ; copies for the high halves
338 :			movq mm3, mm2
339 :
340 :			punpcklbw mm0, mm7 ; cur low bytes -> words
341 :			punpcklbw mm2, mm7
342 :			movq mm4, [ebx] ; ref1, first row
343 :			pavgb mm4, [esi] ; averaged with ref2, first row
344 :			punpckhbw mm1, mm7 ; cur high bytes -> words
345 :			punpckhbw mm3, mm7
346 :			movq mm5, [ebx+edx] ; ref1, second row
347 :			pavgb mm5, [esi+edx] ; averaged with ref2, second row
348 :
349 :			movq mm6, mm4 ; copy of the first-row average for its high half
350 :			punpcklbw mm4, mm7
351 :			punpckhbw mm6, mm7
352 :			psubsw mm0, mm4 ; cur - average (signed saturating word subtract)
353 :			psubsw mm1, mm6
354 :			lea esi,[esi+2*edx] ; step ref2 two rows
355 :			movq mm6, mm5 ; same again for the second row
356 :			punpcklbw mm5, mm7
357 :			punpckhbw mm6, mm7
358 :			psubsw mm2, mm5
359 :			lea eax,[eax+2*edx] ; step cur two rows
360 :			psubsw mm3, mm6
361 :			lea ebx,[ebx+2*edx] ; step ref1 two rows
362 :
363 :			movq [ecx+%1*32+ 0], mm0 ; dst: first row, words 0-3
364 :			movq [ecx+%1*32+ 8], mm1 ; first row, words 4-7
365 :			movq [ecx+%1*32+16], mm2 ; second row, words 0-3
366 :			movq [ecx+%1*32+24], mm3 ; second row, words 4-7
367 :			%endmacro
368 :
369 :	edgomez	1.2	align 16
370 :	Isibaar	1.5	transfer_8to16sub2_xmm:
			; For 8 rows of 8 bytes, writes cur minus the pavgb average of
			; ref1 and ref2 as 64 consecutive 16-bit words at dct. cur is
			; only read.
			; Stack arguments on entry: [esp+4] = dct, [esp+8] = cur,
			; [esp+12] = ref1, [esp+16] = ref2, [esp+20] = stride.
			; ebx and esi are saved and restored; eax, ecx, edx and mm0-mm7
			; are overwritten; no emms is executed here.
			push ebx
			push esi ; two pushes: every argument now sits 8 bytes higher
			pxor mm7, mm7 ; zero register the macro unpacks against
			mov edx, [esp+8+20] ; stride
			mov esi, [esp+8+16] ; ref2
			mov ebx, [esp+8+12] ; ref1
			mov eax, [esp+8+ 8] ; cur
			mov ecx, [esp+8+ 4] ; dct

			; four row pairs -> 8 rows
			%assign ROW_PAIR 0
			%rep 4
			COPY_8_TO_16_SUB2_SSE ROW_PAIR
			%assign ROW_PAIR ROW_PAIR+1
			%endrep
			%undef ROW_PAIR

385 :			pop esi
386 :			pop ebx
387 :			ret
388 :	Isibaar	1.1
389 :			;===========================================================================
390 :			;
391 :			; void transfer_16to8add_mmx(uint8_t * const dst,
392 :			; const int16_t * const src,
393 :			; uint32_t stride);
394 :			;
395 :			;===========================================================================
396 :
397 :	Isibaar	1.5	%macro COPY_16_TO_8_ADD 1
			; COPY_16_TO_8_ADD <n>
			; Adds two rows of eight 16-bit words to two 8-byte rows in place.
			; In: ecx = destination row (read and written), edx = byte offset
			; to the next destination row, eax = word source base (row pair
			; %1 is read from [eax+%1*32] .. [eax+%1*32+31]), mm7 = 0
			; (cleared by the calling function).
			; Neither ecx nor eax is advanced here; the caller steps ecx.
			; Overwrites mm0-mm3.
398 :			movq mm0, [ecx] ; dst, first row
399 :			movq mm2, [ecx+edx] ; dst, second row
400 :			movq mm1, mm0 ; copies for the high halves
401 :			movq mm3, mm2
402 :			punpcklbw mm0, mm7 ; dst low bytes -> words
403 :			punpcklbw mm2, mm7
404 :			punpckhbw mm1, mm7 ; dst high bytes -> words
405 :			punpckhbw mm3, mm7
406 :			paddsw mm0, [eax+%1*32+ 0] ; signed saturating add of the source words
407 :			paddsw mm1, [eax+%1*32+ 8]
408 :			paddsw mm2, [eax+%1*32+16]
409 :			paddsw mm3, [eax+%1*32+24]
410 :			packuswb mm0, mm1 ; words -> bytes with unsigned saturation
411 :			movq [ecx], mm0
412 :			packuswb mm2, mm3
413 :			movq [ecx+edx], mm2
414 :			%endmacro
415 :	Isibaar	1.1
416 :
417 :	Isibaar	1.5	align 16
418 :			transfer_16to8add_mmx:
			; Adds 64 consecutive 16-bit words from src to 8 rows of 8 bytes
			; at dst, in place; results are packed back to bytes with
			; unsigned saturation by the macro.
			; Arguments are read from the stack:
			; [esp+4] = dst, [esp+8] = src, [esp+12] = stride (bytes per dst row).
			; Uses eax, ecx, edx and mm0-mm3, mm7; no emms is executed here.
			pxor mm7, mm7 ; zero register the macro unpacks against
			mov edx, [esp+12] ; stride
			mov eax, [esp+8] ; src
			mov ecx, [esp+4] ; dst

			; row pairs 0-2 are each followed by a two-row step of dst
			%assign ROW_PAIR 0
			%rep 3
			COPY_16_TO_8_ADD ROW_PAIR
			lea ecx, [ecx+2*edx]
			%assign ROW_PAIR ROW_PAIR+1
			%endrep
			%undef ROW_PAIR
			; last row pair: no pointer step needed afterwards
			COPY_16_TO_8_ADD 3
431 :			ret
432 :	Isibaar	1.1
433 :			;===========================================================================
434 :			;
435 :			; void transfer8x8_copy_mmx(uint8_t * const dst,
436 :			; const uint8_t * const src,
437 :			; const uint32_t stride);
438 :			;
439 :			;
440 :			;===========================================================================
441 :
442 :	Isibaar	1.5	%macro COPY_8_TO_8 0
			; COPY_8_TO_8
			; Copies two 8-byte rows.
			; In: eax = source row, ecx = destination row, edx = byte offset
			; to the next row (the same offset is used for both pointers).
			; Out: eax advanced by 2*edx; ecx is NOT advanced (the caller
			; steps it between invocations). Overwrites mm0 and mm1.
443 :			movq mm0, [eax] ; first source row
444 :			movq mm1, [eax+edx] ; second source row
445 :			movq [ecx], mm0
446 :			lea eax,[eax+2*edx] ; step the source pointer two rows down
447 :			movq [ecx+edx], mm1
448 :			%endmacro
449 :
450 :	Isibaar	1.1	align 16
451 :	Isibaar	1.5	transfer8x8_copy_mmx:
			; Copies 8 rows of 8 bytes from src to dst; both are walked with
			; the same stride.
			; Arguments are read from the stack:
			; [esp+4] = dst, [esp+8] = src, [esp+12] = stride (bytes per row).
			; Uses eax, ecx, edx and mm0, mm1; no emms is executed here.
			mov edx, [esp+12] ; stride
			mov eax, [esp+8] ; src
			mov ecx, [esp+4] ; dst

			; the macro advances src itself; dst is stepped here
			%rep 3
			COPY_8_TO_8
			lea ecx, [ecx+2*edx]
			%endrep
			; last two rows: no pointer step needed afterwards
			COPY_8_TO_8
463 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4