Annotation of /xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm

Revision 1.2 - (view) (download)

1 :	Isibaar	1.1	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx 8bit<->16bit transfers
5 :			; *
6 :			; * This program is an implementation of a part of one or more MPEG-4
7 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
8 :			; * to use this software module in hardware or software products are
9 :			; * advised that its use may infringe existing patents or copyrights, and
10 :			; * any such use would be at such party's own risk. The original
11 :			; * developer of this software module and his/her company, and subsequent
12 :			; * editors and their companies, will have no liability for use of this
13 :			; * software or modifications or derivatives thereof.
14 :			; *
15 :			; * This program is free software; you can redistribute it and/or modify
16 :			; * it under the terms of the GNU General Public License as published by
17 :			; * the Free Software Foundation; either version 2 of the License, or
18 :			; * (at your option) any later version.
19 :			; *
20 :			; * This program is distributed in the hope that it will be useful,
21 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 :			; * GNU General Public License for more details.
24 :			; *
25 :			; * You should have received a copy of the GNU General Public License
26 :			; * along with this program; if not, write to the Free Software
27 :			; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 :			; *
29 :			; *************************************************************************/
30 :
31 :			;/**************************************************************************
32 :			; *
33 :			; * History:
34 :			; *
35 :			; * 07.01.2002 merge functions from compensate_mmx; rename functions
36 :			; * 07.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
37 :			; *
38 :			; *************************************************************************/
39 :
40 :
bits 32

;---------------------------------------------------------------------------
; cglobal <symbol>
;
; Declares <symbol> as global. When PREFIX is defined at assembly time the
; exported name gets a leading underscore, and <symbol> is additionally
; %define'd to the underscored form so that the label written after the
; macro invocation resolves to the exported name.
;---------------------------------------------------------------------------
%macro cglobal 1
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
%endmacro


section .text
54 :
55 :
;===========================================================================
;
; void transfer_8to16copy_mmx(int16_t * const dst,
;                             const uint8_t * const src,
;                             uint32_t stride);
;
; Widens 8 rows of 8 unsigned bytes to 16-bit words (zero-extended).
;   - src rows are 'stride' bytes apart
;   - dst rows are packed back to back (16 bytes per row, 128 bytes total)
;
; Arguments are read from the stack (32-bit code).
; Clobbers: eax, ecx, mm0, mm1, mm7, flags. esi/edi are saved and restored.
; No emms is issued here.
;
;===========================================================================

align 16
cglobal transfer_8to16copy_mmx
transfer_8to16copy_mmx:

        push    esi
        push    edi

        mov     edi, [esp + 8 + 4]      ; edi = dst
        mov     esi, [esp + 8 + 8]      ; esi = src
        mov     ecx, [esp + 8 + 12]     ; ecx = stride

        add     edi, 8 * 16             ; edi = one past the last dst row
        mov     eax, -(8 * 16)          ; eax = negative byte offset, counts up to 0
        pxor    mm7, mm7                ; mm7 = 0, high byte of every widened word

.next_row:
        movq    mm0, [esi]              ; 8 source pixels
        movq    mm1, mm0
        punpcklbw mm0, mm7              ; mm0 = pixels 0..3 as words
        punpckhbw mm1, mm7              ; mm1 = pixels 4..7 as words

        movq    [edi + eax], mm0
        movq    [edi + eax + 8], mm1

        add     esi, ecx                ; next source row
        add     eax, 16                 ; next destination row; ZF set after row 8
        jnz     .next_row

        pop     edi
        pop     esi

        ret
97 :
98 :
99 :
;===========================================================================
;
; void transfer_16to8copy_mmx(uint8_t * const dst,
;                             const int16_t * const src,
;                             uint32_t stride);
;
; Narrows 8 rows of 8 signed 16-bit words to bytes with unsigned
; saturation (packuswb).
;   - src rows are packed back to back (16 bytes per row)
;   - dst rows are 'stride' bytes apart
;
; Arguments are read from the stack (32-bit code).
; Clobbers: eax, ecx, mm0, mm1, flags. esi/edi are saved and restored.
; No emms is issued here.
;
;===========================================================================

align 16
cglobal transfer_16to8copy_mmx
transfer_16to8copy_mmx:

        push    esi
        push    edi

        mov     edi, [esp + 8 + 4]      ; edi = dst
        mov     esi, [esp + 8 + 8]      ; esi = src
        mov     ecx, [esp + 8 + 12]     ; ecx = stride

        mov     eax, 8                  ; eax = rows left

.next_row:
        movq    mm0, [esi]              ; words 0..3
        movq    mm1, [esi + 8]          ; words 4..7
        packuswb mm0, mm1               ; mm0 = 8 saturated bytes

        movq    [edi], mm0

        add     esi, 16                 ; next packed source row
        add     edi, ecx                ; next destination row
        dec     eax
        jnz     .next_row

        pop     edi
        pop     esi

        ret
136 :
137 :
;===========================================================================
;
; void transfer_8to16sub_mmx(int16_t * const dct,
;                            uint8_t * const cur,
;                            const uint8_t * const ref,
;                            const uint32_t stride);
;
; For an 8x8 block:
;   dct[row*8 + col] = cur[row*stride + col] - ref[row*stride + col]
;   cur[row*stride + col] = ref[row*stride + col]
;
; dct is written as 64 packed words (128 bytes); cur and ref rows are
; 'stride' bytes apart. Two rows are handled per step, four steps total.
;
; Arguments are read from the stack (32-bit code).
; Clobbers: eax, ecx, edx, mm0-mm7, flags. esi/edi/ebx are saved and restored.
; No emms is issued here.
;
;===========================================================================
;/**************************************************************************
; *
; * History:
; *
; * 27.12.2001 renamed from 'compensate' to 'transfer_8to16sub'
; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar)
; * 30.11.2001 16 pixels are processed per iteration (Isibaar)
; * 30.11.2001 .text missing
; * 06.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
; *
; *************************************************************************/

;---------------------------------------------------------------------------
; T8TO16SUB_PAIR <dct byte offset>
;
; Processes two rows. Expects:
;   edx = cur row n      eax = cur row n+1
;   esi = ref row n      ebx = ref row n+1
;   edi = dct base       mm7 = 0
; Both cur rows and both ref rows are loaded before cur is overwritten.
;---------------------------------------------------------------------------
%macro T8TO16SUB_PAIR 1
        movq    mm0, [edx]              ; cur row n
        movq    mm1, mm0
        punpcklbw mm0, mm7
        punpckhbw mm1, mm7

        movq    mm4, [eax]              ; cur row n+1
        movq    mm5, mm4
        punpcklbw mm4, mm7
        punpckhbw mm5, mm7

        movq    mm2, [esi]              ; ref row n
        movq    mm3, mm2
        movq    mm6, [ebx]              ; ref row n+1

        movq    [edx], mm2              ; cur rows <- ref rows
        movq    [eax], mm6

        punpcklbw mm2, mm7
        punpckhbw mm3, mm7
        psubsw  mm0, mm2                ; row n,   pixels 0..3

        movq    mm2, mm6
        punpcklbw mm2, mm7
        punpckhbw mm6, mm7

        psubsw  mm1, mm3                ; row n,   pixels 4..7
        psubsw  mm4, mm2                ; row n+1, pixels 0..3
        psubsw  mm5, mm6                ; row n+1, pixels 4..7

        movq    [edi + %1 +  0], mm0
        movq    [edi + %1 +  8], mm1
        movq    [edi + %1 + 16], mm4
        movq    [edi + %1 + 24], mm5
%endmacro

;---------------------------------------------------------------------------
; T8TO16SUB_NEXT
;
; Advances all four row pointers by ecx (= 2 * stride).
;---------------------------------------------------------------------------
%macro T8TO16SUB_NEXT 0
        add     edx, ecx
        add     esi, ecx
        add     eax, ecx
        add     ebx, ecx
%endmacro

align 16
cglobal transfer_8to16sub_mmx
transfer_8to16sub_mmx:

        push    esi
        push    edi
        push    ebx

        mov     edi, [esp + 12 + 4]     ; edi = dct   [out]
        mov     edx, [esp + 12 + 8]     ; edx = cur   [in/out]
        mov     esi, [esp + 12 + 12]    ; esi = ref   [in]
        mov     ecx, [esp + 12 + 16]    ; ecx = stride

        lea     eax, [edx + ecx]        ; eax = cur + stride (odd rows)
        lea     ebx, [esi + ecx]        ; ebx = ref + stride (odd rows)
        add     ecx, ecx                ; ecx = 2 * stride

        pxor    mm7, mm7                ; mm7 = 0

        T8TO16SUB_PAIR 0                ; rows 0, 1
        T8TO16SUB_NEXT
        T8TO16SUB_PAIR 32               ; rows 2, 3
        T8TO16SUB_NEXT
        T8TO16SUB_PAIR 64               ; rows 4, 5
        T8TO16SUB_NEXT
        T8TO16SUB_PAIR 96               ; rows 6, 7

        pop     ebx
        pop     edi
        pop     esi

        ret
359 :
360 :
;===========================================================================
;
; void transfer_8to16sub2_xmm(int16_t * const dct,
;                             uint8_t * const cur,
;                             const uint8_t * ref1,
;                             const uint8_t * ref2,
;                             const uint32_t stride);
;
; For an 8x8 block:
;   avg = (ref1[row*stride + col] + ref2[row*stride + col] + 1) >> 1
;   dct[row*8 + col] = cur[row*stride + col] - avg
;
; dct is written as 64 packed words (row r at byte offset 16*r), the same
; layout transfer_8to16sub_mmx produces. cur, ref1 and ref2 rows are
; 'stride' bytes apart.
;
; Fix: 'stride' used to be applied to the dct pointer (which advanced by
; 2*stride + 16 bytes per row) while cur/ref1/ref2 were read at fixed
; offsets +0, +8, ... +56, i.e. as 64 contiguous bytes. The pixel pointers
; are now stepped by 'stride' and dct is addressed contiguously.
;
; cur is only read; it is NOT overwritten with the averaged reference.
; NOTE(review): transfer_8to16sub_mmx does overwrite cur with ref - confirm
; that callers of this function do not expect the same.
;
; Uses pavgb, so it needs more than baseline MMX (hence the _xmm suffix).
;
; Arguments are read from the stack (32-bit code).
; Clobbers: eax, edx, mm0-mm3, mm7, flags. edi/esi/ebx are saved and restored.
; No emms is issued here.
;
;===========================================================================

;---------------------------------------------------------------------------
; T8TO16SUB2_ROW <dct byte offset>
;
; Processes one row. Expects:
;   esi = cur row   ebx = ref1 row   edx = ref2 row
;   edi = dct base  mm7 = 0
;---------------------------------------------------------------------------
%macro T8TO16SUB2_ROW 1
        movq    mm0, [esi]              ; mm0 = cur row
        movq    mm2, [ebx]              ; mm2 = ref1 row
        movq    mm3, [edx]              ; mm3 = ref2 row
        movq    mm1, mm0                ; mm1 = cur row

        pavgb   mm2, mm3                ; mm2 = (ref1 + ref2 + 1) / 2
        punpcklbw mm0, mm7              ; mm0 = cur(3-0) as words

        movq    mm3, mm2                ; mm3 = avg
        punpckhbw mm1, mm7              ; mm1 = cur(7-4) as words

        punpcklbw mm2, mm7              ; mm2 = avg(3-0) as words
        punpckhbw mm3, mm7              ; mm3 = avg(7-4) as words

        psubw   mm0, mm2                ; cur(3-0) - avg(3-0)
        psubw   mm1, mm3                ; cur(7-4) - avg(7-4)

        movq    [edi + %1 + 0], mm0     ; dct(3-0)
        movq    [edi + %1 + 8], mm1     ; dct(7-4)
%endmacro

;---------------------------------------------------------------------------
; T8TO16SUB2_NEXT
;
; Steps cur, ref1 and ref2 down one row (eax = stride).
;---------------------------------------------------------------------------
%macro T8TO16SUB2_NEXT 0
        add     esi, eax
        add     ebx, eax
        add     edx, eax
%endmacro

align 16
cglobal transfer_8to16sub2_xmm
transfer_8to16sub2_xmm:

        push    edi
        push    esi
        push    ebx

        mov     edi, [esp + 12 + 4]     ; edi = dct
        mov     esi, [esp + 12 + 8]     ; esi = cur
        mov     ebx, [esp + 12 + 12]    ; ebx = ref1
        mov     edx, [esp + 12 + 16]    ; edx = ref2
        mov     eax, [esp + 12 + 20]    ; eax = stride (bytes between pixel rows)

        pxor    mm7, mm7                ; mm7 = 0

        T8TO16SUB2_ROW 0                ; row 0
        T8TO16SUB2_NEXT
        T8TO16SUB2_ROW 16               ; row 1
        T8TO16SUB2_NEXT
        T8TO16SUB2_ROW 32               ; row 2
        T8TO16SUB2_NEXT
        T8TO16SUB2_ROW 48               ; row 3
        T8TO16SUB2_NEXT
        T8TO16SUB2_ROW 64               ; row 4
        T8TO16SUB2_NEXT
        T8TO16SUB2_ROW 80               ; row 5
        T8TO16SUB2_NEXT
        T8TO16SUB2_ROW 96               ; row 6
        T8TO16SUB2_NEXT
        T8TO16SUB2_ROW 112              ; row 7

        pop     ebx
        pop     esi
        pop     edi

        ret
593 :	Isibaar	1.1
;===========================================================================
;
; void transfer_16to8add_mmx(uint8_t * const dst,
;                            const int16_t * const src,
;                            uint32_t stride);
;
; For an 8x8 block: dst = clamp_to_u8(dst + src).
;   - dst bytes are widened to words, src words are added with signed
;     saturation (paddsw), and the sums are packed back to bytes with
;     unsigned saturation (packuswb)
;   - src rows are packed back to back (16 bytes per row)
;   - dst rows are 'stride' bytes apart
;
; Arguments are read from the stack (32-bit code).
; Clobbers: eax, ecx, mm0, mm1, mm7, flags. esi/edi are saved and restored.
; No emms is issued here.
;
;===========================================================================

align 16
cglobal transfer_16to8add_mmx
transfer_16to8add_mmx:

        push    esi
        push    edi

        mov     edi, [esp + 8 + 4]      ; edi = dst
        mov     esi, [esp + 8 + 8]      ; esi = src
        mov     ecx, [esp + 8 + 12]     ; ecx = stride

        mov     eax, 8                  ; eax = rows left
        pxor    mm7, mm7                ; mm7 = 0

.next_row:
        movq    mm0, [edi]              ; 8 destination pixels
        movq    mm1, mm0
        punpcklbw mm0, mm7              ; mm0 = dst pixels 0..3 as words
        punpckhbw mm1, mm7              ; mm1 = dst pixels 4..7 as words

        paddsw  mm0, [esi]              ; += src words 0..3
        paddsw  mm1, [esi + 8]          ; += src words 4..7

        packuswb mm0, mm1               ; back to 8 bytes, clamped to 0..255
        movq    [edi], mm0

        add     esi, 16                 ; next packed source row
        add     edi, ecx                ; next destination row
        dec     eax
        jnz     .next_row

        pop     edi
        pop     esi

        ret
641 :
642 :
;===========================================================================
;
; void transfer8x8_copy_mmx(uint8_t * const dst,
;                           const uint8_t * const src,
;                           const uint32_t stride);
;
; Copies an 8x8 block of bytes from src to dst. Rows of both buffers are
; 'stride' bytes apart. Two rows are handled per step: both are loaded
; before either is stored.
;
; Only scratch registers are used, so nothing is pushed. The pointer
; increments that used to follow the last row pair had no effect and are
; gone.
;
; Arguments are read from the stack (32-bit code).
; Clobbers: eax, ecx, edx, mm0, mm1.
; No emms is issued here.
;
;===========================================================================

align 16
cglobal transfer8x8_copy_mmx
transfer8x8_copy_mmx:

        mov     edx, [esp + 4]          ; edx = dst    [out]
        mov     ecx, [esp + 8]          ; ecx = src    [in]
        mov     eax, [esp + 12]         ; eax = stride [in]

%rep 3                                  ; rows 0-1, 2-3, 4-5
        movq    mm0, [ecx]
        movq    mm1, [ecx + eax]
        movq    [edx], mm0
        movq    [edx + eax], mm1

        lea     ecx, [ecx + 2 * eax]    ; skip the two rows just copied
        lea     edx, [edx + 2 * eax]
%endrep

        movq    mm0, [ecx]              ; rows 6-7, no pointer update needed
        movq    mm1, [ecx + eax]
        movq    [edx], mm0
        movq    [edx + eax], mm1

        ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4