Annotation of /xvidcore/src/utils/x86_asm/mem_transfer_3dne.asm

Revision 1.2 - (view) (download)

1 :	edgomez	1.2	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx 8bit<->16bit transfers
5 :			; *
6 :			; * This program is an implementation of a part of one or more MPEG-4
7 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
8 :			; * to use this software module in hardware or software products are
9 :			; * advised that its use may infringe existing patents or copyrights, and
10 :			; * any such use would be at such party's own risk. The original
11 :			; * developer of this software module and his/her company, and subsequent
12 :			; * editors and their companies, will have no liability for use of this
13 :			; * software or modifications or derivatives thereof.
14 :			; *
15 :			; * This program is free software; you can redistribute it and/or modify
16 :			; * it under the terms of the GNU General Public License as published by
17 :			; * the Free Software Foundation; either version 2 of the License, or
18 :			; * (at your option) any later version.
19 :			; *
20 :			; * This program is distributed in the hope that it will be useful,
21 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 :			; * GNU General Public License for more details.
24 :			; *
25 :			; * You should have received a copy of the GNU General Public License
26 :			; * along with this program; if not, write to the Free Software
27 :			; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 :			; *
29 :			; *************************************************************************/
30 :
31 :			; these 3dne functions are compatible with iSSE, but are optimized specifically for
32 :			; K7 pipelines
33 :			;
34 :			;------------------------------------------------------------------------------
35 :			; 09.12.2002 Athlon optimizations contributed by Jaan Kalda
36 :			;------------------------------------------------------------------------------
37 :
38 :			; 32-bit code; the data section gets an explicit 16-byte alignment request except when FORMAT_COFF is defined.
39 :			bits 32
40 :			%ifdef FORMAT_COFF
41 :			section .data data
42 :			%else
43 :			section .data data align=16
44 :			%endif
45 :			; mm_zero: eight zero bytes on an 8-byte boundary.
46 :			; NOTE(review): nothing in the visible listing references mm_zero (the routines clear mm7 with pxor) - confirm before removing.
47 :			align 8
48 :			mm_zero:
49 :			dd 0,0
50 :
51 :			; cglobal NAME: declare NAME global. When PREFIX is defined, the exported symbol is _NAME and NAME is redefined to expand to _NAME.
52 :			%macro cglobal 1
53 :			%ifdef PREFIX
54 :			global _%1
55 :			%define %1 _%1
56 :			%else
57 :			global %1
58 :			%endif
59 :			%endmacro
60 :			%macro nop4 0
61 :			DB 08Dh,074h,026h,0 ; raw bytes 8D 74 26 00 decode to lea esi,[esi+0] (SIB form, disp8 = 0): a 4-byte instruction that changes nothing, used as padding
62 :			%endmacro
63 :
64 :			section .text
65 :			; Public entry points of this file, exported through the cglobal macro (underscore-prefixed when PREFIX is defined).
66 :			cglobal transfer_8to16copy_3dne
67 :			cglobal transfer_16to8copy_3dne
68 :			cglobal transfer_8to16sub_3dne
69 :			cglobal transfer_8to16subro_3dne
70 :			cglobal transfer_8to16sub2_3dne
71 :			cglobal transfer_16to8add_3dne
72 :			cglobal transfer8x8_copy_3dne
73 :
74 :			;===========================================================================
75 :			; Expand an 8x8 block of unsigned bytes into 64 zero-extended 16-bit words.
76 :			; void transfer_8to16copy_3dne(int16_t * const dst,
77 :			; const uint8_t * const src,
78 :			; uint32_t stride);
79 :			; src rows are stride bytes apart; dst receives 8 contiguous rows of 16 bytes (128 bytes in total).
80 :			;===========================================================================
81 :
82 :			align 16
83 :			transfer_8to16copy_3dne:
84 :			; Register roles: eax = src (advanced two rows at a time), edx = stride, ecx = dst. Overwrites mm0-mm7 and does not execute emms.
85 :			mov eax, [esp+ 8] ; Src
86 :			mov edx, [esp+12] ; Stride
87 :			mov ecx, [esp+ 4] ; Dst
88 :			punpcklbw mm0, [byte eax] ; row 0, pixels 0-3 land in the HIGH byte of each word (low bytes = stale mm0); the psrlw 8 below turns that into a zero-extended word. 'byte' forces a disp8 encoding - presumably instruction-length tuning for K7, TODO confirm
89 :			punpcklbw mm1, [eax+4] ; row 0, pixels 4-7 (same high-byte trick)
90 :			movq mm2,[eax+edx] ; row 1, all 8 pixels
91 :			movq mm3,[eax+edx] ; row 1 again, for the high half
92 :			pxor mm7,mm7 ; mm7 = 0, needed only for the classic unpack of row 1
93 :			lea eax,[eax+2*edx] ; eax = src + 2*stride
94 :			punpcklbw mm2,mm7 ; row 1, pixels 0-3 as words
95 :			punpckhbw mm3,mm7 ; row 1, pixels 4-7 as words
96 :			psrlw mm0,8 ; finish row 0 low: shift the pixel byte down, zero on top
97 :			psrlw mm1,8 ; finish row 0 high
98 :			punpcklbw mm4, [eax] ; row 2, pixels 0-3
99 :			punpcklbw mm5, [eax+edx+4] ; row 3, pixels 4-7
100 :			movq [byte ecx+0*64], mm0 ; dst row 0, words 0-3
101 :			movq [ecx+0*64+8], mm1 ; dst row 0, words 4-7
102 :			punpcklbw mm6, [eax+edx] ; row 3, pixels 0-3
103 :			punpcklbw mm7, [eax+4] ; row 2, pixels 4-7 (mm7 is no longer zero from here on)
104 :			lea eax,[byte eax+2*edx] ; eax = src + 4*stride
105 :			psrlw mm4,8
106 :			psrlw mm5,8
107 :			punpcklbw mm0, [eax] ; row 4, pixels 0-3
108 :			punpcklbw mm1, [eax+edx+4] ; row 5, pixels 4-7
109 :			movq [ecx+0*64+16], mm2 ; dst row 1, words 0-3
110 :			movq [ecx+0*64+24], mm3 ; dst row 1, words 4-7
111 :			psrlw mm6,8
112 :			psrlw mm7,8
113 :			punpcklbw mm2, [eax+edx] ; row 5, pixels 0-3
114 :			punpcklbw mm3, [eax+4] ; row 4, pixels 4-7
115 :			lea eax,[byte eax+2*edx] ; eax = src + 6*stride
116 :			movq [byte ecx+0*64+32], mm4 ; dst row 2, words 0-3
117 :			movq [ecx+0*64+56], mm5 ; dst row 3, words 4-7
118 :			psrlw mm0,8
119 :			psrlw mm1,8
120 :			punpcklbw mm4, [eax] ; row 6, pixels 0-3
121 :			punpcklbw mm5, [eax+edx+4] ; row 7, pixels 4-7
122 :			movq [byte ecx+0*64+48], mm6 ; dst row 3, words 0-3
123 :			movq [ecx+0*64+40], mm7 ; dst row 2, words 4-7
124 :			psrlw mm2,8
125 :			psrlw mm3,8
126 :			punpcklbw mm6, [eax+edx] ; row 7, pixels 0-3
127 :			punpcklbw mm7, [eax+4] ; row 6, pixels 4-7
128 :			movq [byte ecx+1*64], mm0 ; dst row 4, words 0-3
129 :			movq [ecx+1*64+24], mm1 ; dst row 5, words 4-7
130 :			psrlw mm4,8
131 :			psrlw mm5,8
132 :			movq [ecx+1*64+16], mm2 ; dst row 5, words 0-3
133 :			movq [ecx+1*64+8], mm3 ; dst row 4, words 4-7
134 :			psrlw mm6,8
135 :			psrlw mm7,8
136 :			movq [byte ecx+1*64+32], mm4 ; dst row 6, words 0-3
137 :			movq [ecx+1*64+56], mm5 ; dst row 7, words 4-7
138 :			movq [byte ecx+1*64+48], mm6 ; dst row 7, words 0-3
139 :			movq [ecx+1*64+40], mm7 ; dst row 6, words 4-7
140 :			ret
141 :
142 :
143 :
144 :			;===========================================================================
145 :			; Pack an 8x8 block of signed 16-bit words into unsigned bytes, clamping each value to 0..255 (packuswb).
146 :			; void transfer_16to8copy_3dne(uint8_t * const dst,
147 :			; const int16_t * const src,
148 :			; uint32_t stride);
149 :			; src is read as 128 contiguous bytes (8 rows of 8 words); dst rows are written stride bytes apart.
150 :			;===========================================================================
151 :
152 :			align 16
153 :			transfer_16to8copy_3dne:
154 :			; Register roles: eax = src (later reused as dst + 3*stride), ecx = dst, edx = stride. Overwrites mm0-mm7 and does not execute emms.
155 :			mov eax, [esp+ 8] ; Src
156 :			mov ecx, [esp+ 4] ; Dst
157 :			mov edx, [esp+12] ; Stride
158 :			; Load phase: each mmN ends up holding one packed output row; loads and packs of different rows are interleaved.
159 :			movq mm0, [byte eax+0*32] ; row 0, words 0-3
160 :			packuswb mm0,[eax+0*32+8] ; mm0 = row 0 as 8 bytes
161 :			movq mm1, [eax+0*32+16] ; row 1, words 0-3
162 :			packuswb mm1,[eax+0*32+24] ; mm1 = row 1
163 :			movq mm5, [eax+2*32+16] ; row 5, words 0-3 (packed further down)
164 :			movq mm2, [eax+1*32] ; row 2, words 0-3
165 :			packuswb mm2, [eax+1*32+8] ; mm2 = row 2
166 :			movq mm3, [eax+1*32+16] ; row 3, words 0-3
167 :			packuswb mm3, [eax+1*32+24] ; mm3 = row 3
168 :			movq mm6, [eax+3*32] ; row 6, words 0-3 (packed further down)
169 :			movq mm4, [eax+2*32] ; row 4, words 0-3
170 :			packuswb mm4, [eax+2*32+8] ; mm4 = row 4
171 :			packuswb mm5, [eax+2*32+24] ; mm5 = row 5
172 :			movq mm7, [eax+3*32+16] ; row 7, words 0-3
173 :			packuswb mm7, [eax+3*32+24] ; mm7 = row 7
174 :			packuswb mm6, [eax+3*32+8] ; mm6 = row 6; last read through eax as the src pointer
175 :			movq [ecx], mm0 ; dst row 0
176 :			lea eax,[3*edx] ; eax = 3*stride
177 :			add eax,ecx ; eax = dst + 3*stride
178 :			movq [ecx+edx], mm1 ; dst row 1
179 :			movq [ecx+2*edx], mm2 ; dst row 2
180 :			movq [byte eax], mm3 ; dst row 3
181 :			movq [ecx+4*edx], mm4 ; dst row 4
182 :			lea ecx,[byte ecx+4*edx] ; ecx = dst + 4*stride
183 :			movq [eax+2*edx], mm5 ; dst row 5 (3*stride + 2*stride)
184 :			movq [eax+4*edx], mm7 ; dst row 7 (3*stride + 4*stride)
185 :			movq [ecx+2*edx], mm6 ; dst row 6 (4*stride + 2*stride)
186 :			ret
187 :
188 :			;===========================================================================
189 :			;
190 :			; void transfer_8to16sub_3dne(int16_t * const dct,
191 :			; uint8_t * const cur,
192 :			; const uint8_t * const ref,
193 :			; const uint32_t stride);
194 :			;
195 :			;===========================================================================
196 :			;/**************************************************************************
197 :			; *
198 :			; * History:
199 :			; *
200 :			; * 27.12.2001 renamed from 'compensate' to 'transfer_8to16sub'
201 :			; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar)
202 :			; * 30.11.2001 16 pixels are processed per iteration (Isibaar)
203 :			; * 30.11.2001 .text missing
204 :			; * 06.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
205 :			; *
206 :			; *************************************************************************/
207 :			; COPY_8_TO_16_SUB row_pair, write_back: emits code for two 8-pixel rows: dst words = cur - ref (psubsw on zero-extended bytes). Expects eax = cur, ecx = ref, edx = stride, edi = dst, mm7 = 0. row_pair (0..3) selects the 32-byte dst slot; for row_pair 3 the pointers are not advanced and the dword on top of the stack is popped into ecx instead.
208 :			; when second argument == 1, the reference (ecx) rows are also written over the current (eax) rows
209 :			%macro COPY_8_TO_16_SUB 2
210 :			movq mm1, [eax] ; cur
211 :			movq mm0, mm1 ; second copy of the cur row for the low half
212 :			movq mm4, [ecx] ; ref
213 :			movq mm6, mm4 ; second copy of the ref row for the high half
214 :			%if %2 == 1
215 :			movq [eax], mm4 ; write-back: cur row = ref row
216 :			%endif
217 :			punpckhbw mm1, mm7 ; cur pixels 4-7 as words
218 :			punpckhbw mm6, mm7 ; ref pixels 4-7 as words
219 :			punpcklbw mm4, mm7 ; ref pixels 0-3 as words
220 :			align 8
221 :			movq mm2, [byte eax+edx] ; next cur row (the align 8 just above pads the code stream to an 8-byte boundary at every expansion)
222 :			punpcklbw mm0, mm7 ; cur pixels 0-3 as words
223 :			movq mm3, [byte eax+edx] ; next cur row again, for the high half
224 :			punpcklbw mm2, mm7
225 :			movq mm5, [byte ecx+edx] ; next ref row
226 :			punpckhbw mm3, mm7
227 :			%if %2 == 1
228 :			movq [byte eax+edx], mm5 ; write-back: next cur row = next ref row
229 :			%endif
230 :			psubsw mm1, mm6 ; first row, pixels 4-7: cur - ref
231 :
232 :			movq mm6, mm5 ; mm6 is free again: copy of the next ref row for its high half
233 :			psubsw mm0, mm4 ; first row, pixels 0-3: cur - ref
234 :			%if (%1 < 3)
235 :			lea eax,[eax+2*edx] ; not the last pair: step cur down two rows
236 :			lea ecx,[ecx+2*edx] ; ...and ref as well
237 :			%else
238 :			mov ecx,[esp] ; last pair: both ref rows are already loaded, so ecx is free to take the dword on top of the stack (the edi pushed by the callers in this file)
239 :			add esp,byte 4 ; drop that dword; together with the mov this acts as a pop into ecx
240 :			%endif
241 :			movq [edi+%1*32+ 8], mm1 ; dst first row, words 4-7
242 :			movq [byte edi+%1*32+ 0], mm0 ; dst first row, words 0-3
243 :			punpcklbw mm5, mm7
244 :			punpckhbw mm6, mm7
245 :			psubsw mm2, mm5 ; second row, pixels 0-3: cur - ref
246 :			psubsw mm3, mm6 ; second row, pixels 4-7: cur - ref
247 :			movq [edi+%1*32+16], mm2 ; dst second row, words 0-3
248 :			movq [edi+%1*32+24], mm3 ; dst second row, words 4-7
249 :			%endmacro
250 :			; transfer_8to16sub_3dne: dct = cur - ref for an 8x8 block (four two-row COPY_8_TO_16_SUB expansions); with write-back argument 1 the cur block is also overwritten with ref. Overwrites eax, ecx, edx and mm0-mm7; edi is preserved; no emms is executed.
251 :			align 16
252 :			transfer_8to16sub_3dne:
253 :			mov eax, [esp + 8] ; Cur
254 :			mov ecx, [esp +12] ; Ref
255 :			push edi ; edi becomes the dst pointer; this saved copy is popped into ecx inside the last macro expansion
256 :			mov edx, [dword esp+4+16] ; Stride (+4 accounts for the push); 'dword' forces a 32-bit displacement - presumably code-size padding, TODO confirm
257 :			mov edi, [esp+4+ 4] ; Dst
258 :			pxor mm7, mm7 ; mm7 = 0 for the byte-to-word unpacks in the macro
259 :			nop
260 :			align 4
261 :			COPY_8_TO_16_SUB 0, 1
262 :			COPY_8_TO_16_SUB 1, 1
263 :			COPY_8_TO_16_SUB 2, 1
264 :			COPY_8_TO_16_SUB 3, 1
265 :			mov edi,ecx ; restore the caller's edi (loaded into ecx and removed from the stack by the last expansion)
266 :			ret
267 :			; transfer_8to16subro_3dne: same computation as transfer_8to16sub_3dne (dct = cur - ref over 8x8), but with write-back argument 0, so no store to the cur block is emitted. Overwrites eax, ecx, edx and mm0-mm7; edi is preserved.
268 :			align 16
269 :			transfer_8to16subro_3dne:
270 :			mov eax, [esp + 8] ; Cur
271 :			mov ecx, [esp +12] ; Ref
272 :			push edi ; edi becomes the dst pointer; this saved copy is popped into ecx inside the last macro expansion
273 :			mov edx, [dword esp+4+16] ; Stride (+4 accounts for the push)
274 :			mov edi, [esp+4+ 4] ; Dst
275 :			pxor mm7, mm7 ; mm7 = 0 for the byte-to-word unpacks in the macro
276 :			nop
277 :			align 4
278 :			COPY_8_TO_16_SUB 0, 0
279 :			COPY_8_TO_16_SUB 1, 0
280 :			COPY_8_TO_16_SUB 2, 0
281 :			COPY_8_TO_16_SUB 3, 0
282 :			mov edi,ecx ; restore the caller's edi (loaded into ecx and removed from the stack by the last expansion)
283 :			ret
284 :
285 :
286 :			;===========================================================================
287 :			;
288 :			; void transfer_8to16sub2_3dne(int16_t * const dct,
289 :			; uint8_t * const cur,
290 :			; const uint8_t * ref1,
291 :			; const uint8_t * ref2,
292 :			; const uint32_t stride)
293 :			;
294 :			;===========================================================================
295 :			; COPY_8_TO_16_SUB2_SSE row_pair: emits code for two 8-pixel rows: dst words = cur - pavgb(ref1, ref2). Expects eax = cur, ebx = ref1, esi = ref2, ecx = dst, edx = stride, mm7 = 0. For row_pair 3 the pointers are not advanced; esi and ebx are reloaded from the stack and 8 bytes are released instead.
296 :			%macro COPY_8_TO_16_SUB2_SSE 1
297 :			db 0Fh, 6Fh, 44h, 20h, 00 ;movq mm0, [byte eax] ; cur (hand-encoded 5-byte form with a SIB byte and zero disp8)
298 :			punpcklbw mm0, mm7 ; cur first row, pixels 0-3 as words
299 :			movq mm2, [byte eax+edx]
300 :			punpcklbw mm2, mm7 ; cur second row, pixels 0-3 as words
301 :			db 0Fh, 6Fh, 4ch, 20h, 00 ;movq mm1, [byte eax] (same hand-encoded form, destination mm1)
302 :			punpckhbw mm1, mm7 ; cur first row, pixels 4-7 as words
303 :			movq mm3, [byte eax+edx]
304 :			punpckhbw mm3, mm7 ; cur second row, pixels 4-7 as words
305 :			; prediction = byte-wise pavgb average of the two reference blocks
306 :			movq mm4, [byte ebx] ; ref1
307 :			pavgb mm4, [byte esi] ; ref2
308 :			movq mm5, [ebx+edx] ; ref1, second row
309 :			pavgb mm5, [esi+edx] ; ref2, second row
310 :			movq mm6, mm4
311 :			punpcklbw mm4, mm7 ; first-row prediction, pixels 0-3 as words
312 :			punpckhbw mm6, mm7 ; first-row prediction, pixels 4-7 as words
313 :			%if (%1 < 3)
314 :			lea esi,[esi+2*edx] ; not the last pair: step ref2, ref1 and cur down two rows
315 :			lea ebx,[byte ebx+2*edx]
316 :			lea eax,[eax+2*edx]
317 :			%else
318 :			mov esi,[esp] ; last pair: all reference loads are done, so reload the esi saved on the stack
319 :			mov ebx,[esp+4] ; ...and the saved ebx beneath it
320 :			add esp,byte 8 ; release both stack slots
321 :			%endif
322 :			psubsw mm0, mm4 ; first row, pixels 0-3: cur - prediction
323 :			psubsw mm1, mm6 ; first row, pixels 4-7
324 :			movq mm6, mm5
325 :			punpcklbw mm5, mm7
326 :			punpckhbw mm6, mm7
327 :			psubsw mm2, mm5 ; second row, pixels 0-3
328 :			psubsw mm3, mm6 ; second row, pixels 4-7
329 :			movq [byte ecx+%1*32+ 0], mm0 ; dst
330 :			movq [ecx+%1*32+ 8], mm1
331 :			movq [ecx+%1*32+16], mm2
332 :			movq [ecx+%1*32+24], mm3
333 :			%endmacro
334 :			; transfer_8to16sub2_3dne: dct = cur - pavgb(ref1, ref2) for an 8x8 block (four two-row macro expansions). No store to cur is emitted. Overwrites eax, ecx, edx and mm0-mm7; ebx and esi are preserved; no emms is executed.
335 :			align 16
336 :			transfer_8to16sub2_3dne:
337 :			mov edx, [esp +20] ; Stride
338 :			mov ecx, [esp + 4] ; Dst
339 :			mov eax, [esp + 8] ; Cur
340 :			push ebx ; saved here, reloaded inside the last macro expansion
341 :			lea ebp,[byte ebp] ; ebp = ebp + 0: changes nothing, acts as filler
342 :			mov ebx, [esp+4+12] ; Ref1
343 :			push esi ; saved here, reloaded inside the last macro expansion
344 :			pxor mm7, mm7 ; mm7 = 0 for the byte-to-word unpacks in the macro
345 :			mov esi, [esp+8+16] ; Ref2
346 :			nop4
347 :			COPY_8_TO_16_SUB2_SSE 0
348 :			COPY_8_TO_16_SUB2_SSE 1
349 :			COPY_8_TO_16_SUB2_SSE 2
350 :			COPY_8_TO_16_SUB2_SSE 3
351 :			; esi and ebx were reloaded and the 8 stack bytes released by the last expansion, so esp points at the return address again
352 :			ret
353 :
354 :
355 :			;===========================================================================
356 :			;
357 :			; void transfer_16to8add_3dne(uint8_t * const dst,
358 :			; const int16_t * const src,
359 :			; uint32_t stride);
360 :			;
361 :			;===========================================================================
362 :			; COPY_16_TO_8_ADD row_pair: emits code for two 8-pixel rows: dst bytes = clamp to 0..255 of (dst + src word), via paddsw then packuswb. Expects ecx = dst (not advanced here), eax = src, edx = stride, mm7 = 0; row_pair selects the 32-byte src slot.
363 :			%macro COPY_16_TO_8_ADD 1
364 :			db 0Fh, 6Fh, 44h, 21h, 00 ;movq mm0, [byte ecx] (hand-encoded 5-byte form with a SIB byte and zero disp8)
365 :			punpcklbw mm0, mm7 ; dst first row, pixels 0-3 as words
366 :			movq mm2, [byte ecx+edx]
367 :			punpcklbw mm2, mm7 ; dst second row, pixels 0-3 as words
368 :			db 0Fh, 6Fh, 4ch, 21h, 00 ;movq mm1, [byte ecx] (same hand-encoded form, destination mm1)
369 :			punpckhbw mm1, mm7 ; dst first row, pixels 4-7 as words
370 :			movq mm3, [byte ecx+edx]
371 :			punpckhbw mm3, mm7 ; dst second row, pixels 4-7 as words
372 :			paddsw mm0, [byte eax+%1*32+ 0] ; add the src words (signed saturating add)
373 :			paddsw mm1, [eax+%1*32+ 8]
374 :			paddsw mm2, [eax+%1*32+16]
375 :			paddsw mm3, [eax+%1*32+24]
376 :			packuswb mm0, mm1 ; first row back to 8 bytes, clamped to 0..255
377 :			packuswb mm2, mm3 ; second row back to 8 bytes, clamped to 0..255
378 :			mov esp,esp ; changes nothing, acts as filler
379 :			movq [byte ecx], mm0 ; store first row
380 :			movq [ecx+edx], mm2 ; store second row
381 :			%endmacro
382 :
383 :			; transfer_16to8add_3dne: adds an 8x8 block of 16-bit words (src, 128 contiguous bytes) to the 8x8 byte block at dst (rows stride apart), clamping each result to 0..255. Overwrites eax, ecx, edx, mm0-mm3 and mm7; no emms is executed.
384 :			align 16
385 :			transfer_16to8add_3dne:
386 :			mov ecx, [esp+ 4] ; Dst
387 :			mov edx, [esp+12] ; Stride
388 :			mov eax, [esp+ 8] ; Src
389 :			pxor mm7, mm7 ; mm7 = 0 for the byte-to-word unpacks in the macro
390 :			nop
391 :			; four two-row steps; the macro leaves ecx untouched, so dst is advanced between expansions
392 :			COPY_16_TO_8_ADD 0
393 :			lea ecx,[byte ecx+2*edx] ; dst += 2 rows
394 :			COPY_16_TO_8_ADD 1
395 :			lea ecx,[byte ecx+2*edx] ; dst += 2 rows
396 :			COPY_16_TO_8_ADD 2
397 :			lea ecx,[byte ecx+2*edx] ; dst += 2 rows
398 :			COPY_16_TO_8_ADD 3
399 :			ret
400 :
401 :			;===========================================================================
402 :			;
403 :			; void transfer8x8_copy_3dne(uint8_t * const dst,
404 :			; const uint8_t * const src,
405 :			; const uint32_t stride);
406 :			;
407 :			;
408 :			;===========================================================================
409 :			; COPY_8_TO_8: copies two 8-byte rows from eax (src) to ecx (dst), rows edx bytes apart. Advances eax by two rows; ecx is left unchanged. Uses mm0 and mm1.
410 :			%macro COPY_8_TO_8 0
411 :			movq mm0, [byte eax] ; first src row
412 :			movq mm1, [eax+edx] ; second src row
413 :			movq [byte ecx], mm0 ; first dst row
414 :			lea eax,[byte eax+2*edx] ; src += 2 rows
415 :			movq [ecx+edx], mm1 ; second dst row
416 :			%endmacro
417 :			; transfer8x8_copy_3dne: copies an 8x8 byte block from src to dst, using the same stride for both. Overwrites eax, ecx, edx, mm0 and mm1; no emms is executed.
418 :			align 16
419 :			transfer8x8_copy_3dne:
420 :			mov eax, [esp+ 8] ; Src
421 :			mov edx, [esp+12] ; Stride
422 :			mov ecx, [esp+ 4] ; Dst
423 :			; four two-row copies; the macro advances src itself, dst is advanced between expansions
424 :			COPY_8_TO_8
425 :			lea ecx,[byte ecx+2*edx] ; dst += 2 rows
426 :			COPY_8_TO_8
427 :			lea ecx,[byte ecx+2*edx] ; dst += 2 rows
428 :			COPY_8_TO_8
429 :			lea ecx,[byte ecx+2*edx] ; dst += 2 rows
430 :			COPY_8_TO_8
431 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4