1 |
|
;/************************************************************************** |
2 |
|
; * |
3 |
|
; * XVID MPEG-4 VIDEO CODEC |
4 |
|
; * mmx 8bit<->16bit transfers |
5 |
|
; * |
6 |
|
; * This program is an implementation of a part of one or more MPEG-4 |
7 |
|
; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending |
8 |
|
; * to use this software module in hardware or software products are |
9 |
|
; * advised that its use may infringe existing patents or copyrights, and |
10 |
|
; * any such use would be at such party's own risk. The original |
11 |
|
; * developer of this software module and his/her company, and subsequent |
12 |
|
; * editors and their companies, will have no liability for use of this |
13 |
|
; * software or modifications or derivatives thereof. |
14 |
|
; * |
15 |
|
; * This program is free software; you can redistribute it and/or modify |
16 |
|
; * it under the terms of the GNU General Public License as published by |
17 |
|
; * the Free Software Foundation; either version 2 of the License, or |
18 |
|
; * (at your option) any later version. |
19 |
|
; * |
20 |
|
; * This program is distributed in the hope that it will be useful, |
21 |
|
; * but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 |
|
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 |
|
; * GNU General Public License for more details. |
24 |
|
; * |
25 |
|
; * You should have received a copy of the GNU General Public License |
26 |
|
; * along with this program; if not, write to the Free Software |
27 |
|
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
28 |
|
; * |
29 |
|
; *************************************************************************/ |
30 |
|
|
31 |
|
; these 3dne functions are compatible with iSSE, but are optimized specifically for |
32 |
|
; K7 pipelines |
33 |
|
; |
34 |
|
;------------------------------------------------------------------------------ |
35 |
|
; 09.12.2002 Athlon optimizations contributed by Jaan Kalda |
36 |
|
;------------------------------------------------------------------------------ |
37 |
|
|
38 |
|
|
39 |
|
bits 32

; COFF cannot take an align qualifier on the section directive,
; so it is only emitted for the other object formats.
%ifdef FORMAT_COFF
section .data data
%else
section .data data align=16
%endif

align 8
; 64-bit all-zero MMX constant (two zero dwords).
; NOTE(review): not referenced in this chunk -- presumably used elsewhere
; in the file or kept for symmetry with sibling files; confirm before removing.
mm_zero:
dd 0,0
50 |
|
|
51 |
|
|
52 |
|
; cglobal SYMBOL
; Declares SYMBOL global, adding a leading underscore when the target
; ABI mangles C names (PREFIX defined, e.g. Win32 / a.out style builds);
; the %define makes all later uses of the bare name resolve to _name.
%macro cglobal 1
%ifdef PREFIX
global _%1
%define %1 _%1
%else
global %1
%endif
%endmacro
60 |
|
; nop4
; Emits a 4-byte no-op: 8D 74 26 00 = lea esi,[esi+0] (SIB form).
; Used as decode-alignment padding; has no architectural effect.
%macro nop4 0
DB 08Dh,074h,026h,0
%endmacro
63 |
|
|
64 |
|
section .text

; Exported entry points; C prototypes are given in the banner comment
; above each implementation below.
cglobal transfer_8to16copy_3dne
cglobal transfer_16to8copy_3dne
cglobal transfer_8to16sub_3dne
cglobal transfer_8to16subro_3dne
cglobal transfer_8to16sub2_3dne
cglobal transfer_16to8add_3dne
cglobal transfer8x8_copy_3dne
73 |
|
|
74 |
|
;=========================================================================== |
75 |
|
; |
76 |
|
; void transfer_8to16copy_3dne(int16_t * const dst, |
77 |
|
; const uint8_t * const src, |
78 |
|
; uint32_t stride); |
79 |
|
; |
80 |
|
;=========================================================================== |
81 |
|
|
82 |
|
align 16
;-----------------------------------------------------------------------
; transfer_8to16copy_3dne
; Widens an 8x8 block of unsigned bytes into 8x8 signed 16-bit words.
; In:   [esp+4]=dst (int16_t*, 128 contiguous bytes),
;       [esp+8]=src (uint8_t*), [esp+12]=stride (bytes between src rows)
; Clobbers: eax, ecx, edx, mm0-mm7 (cdecl; no callee-saved regs touched).
;
; Trick: `punpcklbw mmX,[mem]` merges the 4 source bytes into the HIGH
; byte of each word; the later `psrlw mmX,8` shifts them down, giving
; zero-extended bytes without needing a zero register per unpack.
; Only the first row pair uses the classic mm7=0 unpack.
; `[byte ...]` forces a disp8 encoding to control instruction length;
; stores are interleaved out of row order for K7 scheduling, but each
; row r still lands at dst + r*16.
;-----------------------------------------------------------------------
transfer_8to16copy_3dne:

mov eax, [esp+ 8] ; Src
mov edx, [esp+12] ; Stride
mov ecx, [esp+ 4] ; Dst
punpcklbw mm0, [byte eax]       ; row 0, bytes 0..3 -> high bytes
punpcklbw mm1, [eax+4]          ; row 0, bytes 4..7
movq mm2,[eax+edx]              ; row 1 (full 8 bytes)
movq mm3,[eax+edx]              ; same 8 bytes reloaded (scheduling; avoids reg-reg copy)
pxor mm7,mm7                    ; mm7 = 0 for the classic unpack of row 1
lea eax,[eax+2*edx]             ; advance src to row 2
punpcklbw mm2,mm7               ; row 1 low half -> words
punpckhbw mm3,mm7               ; row 1 high half -> words
psrlw mm0,8                     ; finish zero-extension of row 0
psrlw mm1,8
punpcklbw mm4, [eax]            ; row 2 low
punpcklbw mm5, [eax+edx+4]      ; row 3 high
movq [byte ecx+0*64], mm0       ; row 0 -> dst+0
movq [ecx+0*64+8], mm1
punpcklbw mm6, [eax+edx]        ; row 3 low
punpcklbw mm7, [eax+4]          ; row 2 high (mm7 reused; psrlw discards old bits)
lea eax,[byte eax+2*edx]        ; advance src to row 4
psrlw mm4,8
psrlw mm5,8
punpcklbw mm0, [eax]            ; row 4 low
punpcklbw mm1, [eax+edx+4]      ; row 5 high
movq [ecx+0*64+16], mm2         ; row 1 -> dst+16
movq [ecx+0*64+24], mm3
psrlw mm6,8
psrlw mm7,8
punpcklbw mm2, [eax+edx]        ; row 5 low
punpcklbw mm3, [eax+4]          ; row 4 high
lea eax,[byte eax+2*edx]        ; advance src to row 6
movq [byte ecx+0*64+32], mm4    ; row 2 -> dst+32
movq [ecx+0*64+56], mm5         ; row 3 high -> dst+56
psrlw mm0,8
psrlw mm1,8
punpcklbw mm4, [eax]            ; row 6 low
punpcklbw mm5, [eax+edx+4]      ; row 7 high
movq [byte ecx+0*64+48], mm6    ; row 3 low -> dst+48
movq [ecx+0*64+40], mm7         ; row 2 high -> dst+40
psrlw mm2,8
psrlw mm3,8
punpcklbw mm6, [eax+edx]        ; row 7 low
punpcklbw mm7, [eax+4]          ; row 6 high
movq [byte ecx+1*64], mm0       ; row 4 -> dst+64
movq [ecx+1*64+24], mm1         ; row 5 high -> dst+88
psrlw mm4,8
psrlw mm5,8
movq [ecx+1*64+16], mm2         ; row 5 low -> dst+80
movq [ecx+1*64+8], mm3          ; row 4 high -> dst+72
psrlw mm6,8
psrlw mm7,8
movq [byte ecx+1*64+32], mm4    ; row 6 -> dst+96
movq [ecx+1*64+56], mm5         ; row 7 high -> dst+120
movq [byte ecx+1*64+48], mm6    ; row 7 low -> dst+112
movq [ecx+1*64+40], mm7         ; row 6 high -> dst+104
ret
141 |
|
|
142 |
|
|
143 |
|
|
144 |
|
;=========================================================================== |
145 |
|
; |
146 |
|
; void transfer_16to8copy_3dne(uint8_t * const dst, |
147 |
|
; const int16_t * const src, |
148 |
|
; uint32_t stride); |
149 |
|
; |
150 |
|
;=========================================================================== |
151 |
|
|
152 |
|
align 16
;-----------------------------------------------------------------------
; transfer_16to8copy_3dne
; Narrows an 8x8 block of 16-bit words back to unsigned bytes with
; saturation (packuswb clamps to 0..255).
; In:   [esp+4]=dst (uint8_t*, strided), [esp+8]=src (int16_t*, 128
;       contiguous bytes), [esp+12]=stride (bytes between dst rows)
; Clobbers: eax, ecx, edx, mm0-mm7 (cdecl; no callee-saved regs touched).
; Loads/stores are interleaved for K7 scheduling; each packed row r is
; still written to dst + r*stride (eax is repurposed as dst+3*stride,
; ecx later advanced to dst+4*stride to reach rows 5..7 with short
; addressing forms).
;-----------------------------------------------------------------------
transfer_16to8copy_3dne:

mov eax, [esp+ 8] ; Src
mov ecx, [esp+ 4] ; Dst
mov edx, [esp+12] ; Stride

movq mm0, [byte eax+0*32]       ; row 0, words 0..3
packuswb mm0,[eax+0*32+8]       ; + words 4..7, clamped to bytes
movq mm1, [eax+0*32+16]         ; row 1
packuswb mm1,[eax+0*32+24]
movq mm5, [eax+2*32+16]         ; row 5 (hoisted load)
movq mm2, [eax+1*32]            ; row 2
packuswb mm2, [eax+1*32+8]
movq mm3, [eax+1*32+16]         ; row 3
packuswb mm3, [eax+1*32+24]
movq mm6, [eax+3*32]            ; row 6 (hoisted load)
movq mm4, [eax+2*32]            ; row 4
packuswb mm4, [eax+2*32+8]
packuswb mm5, [eax+2*32+24]     ; finish row 5
movq mm7, [eax+3*32+16]         ; row 7
packuswb mm7, [eax+3*32+24]
packuswb mm6, [eax+3*32+8]      ; finish row 6
movq [ecx], mm0                 ; row 0
lea eax,[3*edx]
add eax,ecx                     ; eax = dst + 3*stride
movq [ecx+edx], mm1             ; row 1
movq [ecx+2*edx], mm2           ; row 2
movq [byte eax], mm3            ; row 3
movq [ecx+4*edx], mm4           ; row 4
lea ecx,[byte ecx+4*edx]        ; ecx = dst + 4*stride
movq [eax+2*edx], mm5           ; row 5 (3+2)
movq [eax+4*edx], mm7           ; row 7 (3+4)
movq [ecx+2*edx], mm6           ; row 6 (4+2)
ret
187 |
|
|
188 |
|
;=========================================================================== |
189 |
|
; |
190 |
|
; void transfer_8to16sub_3dne(int16_t * const dct, |
191 |
|
; uint8_t * const cur, |
192 |
|
; const uint8_t * const ref, |
193 |
|
; const uint32_t stride); |
194 |
|
; |
195 |
|
;=========================================================================== |
196 |
|
;/************************************************************************** |
197 |
|
; * |
198 |
|
; * History: |
199 |
|
; * |
200 |
|
; * 27.12.2001 renamed from 'compensate' to 'transfer_8to16sub' |
201 |
|
; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar) |
202 |
|
; * 30.11.2001 16 pixels are processed per iteration (Isibaar) |
203 |
|
; * 30.11.2001 .text missing |
204 |
|
; * 06.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
205 |
|
; * |
206 |
|
; *************************************************************************/ |
207 |
|
|
208 |
|
; when the second argument == 1, the reference (ecx) block is copied to the current (eax) block
209 |
|
; COPY_8_TO_16_SUB %1=row-pair index (0..3), %2=1 to also copy ref into cur
; Computes one two-row slice of (cur - ref) as saturated 16-bit words
; into the dct block at edi (+%1*32), optionally overwriting cur with ref
; (in-place motion compensation) when %2 == 1.
; Registers: eax=cur, ecx=ref, edx=stride, edi=dct, mm7=0 (set by caller).
; On the last slice (%1 == 3) it pops the caller's pushed edi off the
; stack into ecx instead of advancing pointers; the caller then restores
; it with `mov edi,ecx` just before ret.
%macro COPY_8_TO_16_SUB 2
movq mm1, [eax] ; cur
movq mm0, mm1
movq mm4, [ecx] ; ref
movq mm6, mm4
%if %2 == 1
movq [eax], mm4                 ; cur row 0 <- ref row 0
%endif
punpckhbw mm1, mm7              ; cur high half -> words
punpckhbw mm6, mm7              ; ref high half -> words
punpcklbw mm4, mm7              ; ref low half -> words
align 8
movq mm2, [byte eax+edx]        ; cur row 1
punpcklbw mm0, mm7              ; cur low half -> words
movq mm3, [byte eax+edx]        ; same 8 bytes reloaded (scheduling; avoids reg-reg copy)
punpcklbw mm2, mm7
movq mm5, [byte ecx+edx] ; ref
punpckhbw mm3, mm7
%if %2 == 1
movq [byte eax+edx], mm5        ; cur row 1 <- ref row 1
%endif
psubsw mm1, mm6                 ; (cur - ref) high, saturated

movq mm6, mm5
psubsw mm0, mm4                 ; (cur - ref) low, saturated
%if (%1 < 3)
lea eax,[eax+2*edx]             ; advance cur/ref to the next row pair
lea ecx,[ecx+2*edx]
%else
mov ecx,[esp]                   ; fetch caller's saved edi into ecx ...
add esp,byte 4                  ; ... and drop it from the stack
%endif
movq [edi+%1*32+ 8], mm1
movq [byte edi+%1*32+ 0], mm0 ; dst
punpcklbw mm5, mm7
punpckhbw mm6, mm7
psubsw mm2, mm5                 ; row 1 low
psubsw mm3, mm6                 ; row 1 high
movq [edi+%1*32+16], mm2
movq [edi+%1*32+24], mm3
%endmacro
250 |
|
|
251 |
|
align 16
;-----------------------------------------------------------------------
; transfer_8to16sub_3dne
; dct = cur - ref (8x8, 16-bit saturated), and copies ref into cur.
; In:   [esp+4]=dct, [esp+8]=cur, [esp+12]=ref, [esp+16]=stride
; edi is pushed here and popped inside the last COPY_8_TO_16_SUB
; expansion (which leaves the saved value in ecx); `mov edi,ecx`
; restores it, so callee-saved edi is preserved across the call.
;-----------------------------------------------------------------------
transfer_8to16sub_3dne:
mov eax, [esp + 8] ; Cur
mov ecx, [esp +12] ; Ref
push edi                        ; preserve callee-saved edi
mov edx, [dword esp+4+16] ; Stride
mov edi, [esp+4+ 4] ; Dst
pxor mm7, mm7                   ; zero register for byte->word unpacks
nop
align 4
COPY_8_TO_16_SUB 0, 1
COPY_8_TO_16_SUB 1, 1
COPY_8_TO_16_SUB 2, 1
COPY_8_TO_16_SUB 3, 1           ; also pops saved edi into ecx
mov edi,ecx                     ; restore edi
ret
267 |
|
|
268 |
|
align 16
;-----------------------------------------------------------------------
; transfer_8to16subro_3dne
; dct = cur - ref (8x8, 16-bit saturated), read-only variant: the
; "ro" means cur is NOT overwritten with ref (macro arg %2 = 0).
; In:   [esp+4]=dct, [esp+8]=cur, [esp+12]=ref, [esp+16]=stride
; Same edi save/restore dance as transfer_8to16sub_3dne above.
;-----------------------------------------------------------------------
transfer_8to16subro_3dne:
mov eax, [esp + 8] ; Cur
mov ecx, [esp +12] ; Ref
push edi                        ; preserve callee-saved edi
mov edx, [dword esp+4+16] ; Stride
mov edi, [esp+4+ 4] ; Dst
pxor mm7, mm7                   ; zero register for byte->word unpacks
nop
align 4
COPY_8_TO_16_SUB 0, 0
COPY_8_TO_16_SUB 1, 0
COPY_8_TO_16_SUB 2, 0
COPY_8_TO_16_SUB 3, 0           ; also pops saved edi into ecx
mov edi,ecx                     ; restore edi
ret
284 |
|
|
285 |
|
|
286 |
|
;=========================================================================== |
287 |
|
; |
288 |
|
; void transfer_8to16sub2_3dne(int16_t * const dct, |
289 |
|
; uint8_t * const cur, |
290 |
|
; const uint8_t * ref1, |
291 |
|
; const uint8_t * ref2, |
292 |
|
; const uint32_t stride) |
293 |
|
; |
294 |
|
;=========================================================================== |
295 |
|
|
296 |
|
; COPY_8_TO_16_SUB2_SSE %1=row-pair index (0..3)
; One two-row slice of cur - avg(ref1, ref2), saturated to 16 bits, into
; the dct block at ecx (+%1*32). pavgb is the iSSE averaging this file's
; "3dne" (K7-compatible) variants rely on.
; Registers: eax=cur, ebx=ref1, esi=ref2, edx=stride, ecx=dct, mm7=0.
; On the last slice (%1 == 3) it restores the caller's pushed esi and
; ebx from the stack instead of advancing pointers.
; The db sequences are hand-encoded long forms (SIB + disp8) of
; `movq mmN,[eax]` -- NOTE(review): presumably length padding for K7
; decode alignment; do not replace with the short encoding.
%macro COPY_8_TO_16_SUB2_SSE 1
db 0Fh, 6Fh, 44h, 20h, 00 ;movq mm0, [byte eax] ; cur
punpcklbw mm0, mm7
movq mm2, [byte eax+edx]        ; cur row 1
punpcklbw mm2, mm7
db 0Fh, 6Fh, 4ch, 20h, 00 ;movq mm1, [byte eax]
punpckhbw mm1, mm7
movq mm3, [byte eax+edx]        ; cur row 1 reloaded (scheduling)
punpckhbw mm3, mm7

movq mm4, [byte ebx] ; ref1
pavgb mm4, [byte esi] ; ref2
movq mm5, [ebx+edx] ; ref
pavgb mm5, [esi+edx] ; ref2
movq mm6, mm4
punpcklbw mm4, mm7
punpckhbw mm6, mm7
%if (%1 < 3)
lea esi,[esi+2*edx]             ; advance all three source pointers
lea ebx,[byte ebx+2*edx]
lea eax,[eax+2*edx]
%else
mov esi,[esp]                   ; restore callee-saved esi ...
mov ebx,[esp+4]                 ; ... and ebx pushed by the caller
add esp,byte 8
%endif
psubsw mm0, mm4                 ; cur - avg(ref1,ref2), row 0 low
psubsw mm1, mm6                 ; row 0 high
movq mm6, mm5
punpcklbw mm5, mm7
punpckhbw mm6, mm7
psubsw mm2, mm5                 ; row 1 low
psubsw mm3, mm6                 ; row 1 high
movq [byte ecx+%1*32+ 0], mm0 ; dst
movq [ecx+%1*32+ 8], mm1
movq [ecx+%1*32+16], mm2
movq [ecx+%1*32+24], mm3
%endmacro
334 |
|
|
335 |
|
align 16
;-----------------------------------------------------------------------
; transfer_8to16sub2_3dne
; dct = cur - avg(ref1, ref2) (8x8, 16-bit saturated); cur is unchanged.
; In:   [esp+4]=dct, [esp+8]=cur, [esp+12]=ref1, [esp+16]=ref2,
;       [esp+20]=stride
; Callee-saved ebx/esi are pushed here and restored inside the last
; COPY_8_TO_16_SUB2_SSE expansion, so a plain ret suffices.
;-----------------------------------------------------------------------
transfer_8to16sub2_3dne:
mov edx, [esp +20] ; Stride
mov ecx, [esp + 4] ; Dst
mov eax, [esp + 8] ; Cur
push ebx                        ; preserve callee-saved ebx
lea ebp,[byte ebp]              ; 3-byte no-op (lea ebp,[ebp+0]) used as padding
mov ebx, [esp+4+12] ; Ref1
push esi                        ; preserve callee-saved esi
pxor mm7, mm7                   ; zero register for byte->word unpacks
mov esi, [esp+8+16] ; Ref2
nop4                            ; 4-byte alignment padding
COPY_8_TO_16_SUB2_SSE 0
COPY_8_TO_16_SUB2_SSE 1
COPY_8_TO_16_SUB2_SSE 2
COPY_8_TO_16_SUB2_SSE 3        ; also restores esi/ebx and fixes esp

ret
353 |
|
|
354 |
|
|
355 |
|
;=========================================================================== |
356 |
|
; |
357 |
|
; void transfer_16to8add_3dne(uint8_t * const dst, |
358 |
|
; const int16_t * const src, |
359 |
|
; uint32_t stride); |
360 |
|
; |
361 |
|
;=========================================================================== |
362 |
|
|
363 |
|
; COPY_16_TO_8_ADD %1=row-pair index (0..3)
; Adds one two-row slice of the 16-bit block at eax (+%1*32) to the
; bytes at ecx/ecx+edx, saturating the result back to 0..255.
; Registers: ecx=dst, eax=src, edx=stride, mm7=0 (set by caller).
; The db sequences are hand-encoded long forms (SIB + disp8) of
; `movq mmN,[ecx]` -- NOTE(review): presumably length padding for K7
; decode alignment; do not replace with the short encoding.
%macro COPY_16_TO_8_ADD 1
db 0Fh, 6Fh, 44h, 21h, 00 ;movq mm0, [byte ecx]
punpcklbw mm0, mm7
movq mm2, [byte ecx+edx]        ; dst row 1
punpcklbw mm2, mm7
db 0Fh, 6Fh, 4ch, 21h, 00 ;movq mm1, [byte ecx]
punpckhbw mm1, mm7
movq mm3, [byte ecx+edx]        ; dst row 1 reloaded (scheduling)
punpckhbw mm3, mm7
paddsw mm0, [byte eax+%1*32+ 0] ; add 16-bit residue, signed-saturated
paddsw mm1, [eax+%1*32+ 8]
paddsw mm2, [eax+%1*32+16]
paddsw mm3, [eax+%1*32+24]
packuswb mm0, mm1               ; clamp back to unsigned bytes
packuswb mm2, mm3
mov esp,esp                     ; 2-byte filler; NOTE(review): presumably scheduling padding -- confirm
movq [byte ecx], mm0
movq [ecx+edx], mm2
%endmacro
382 |
|
|
383 |
|
|
384 |
|
align 16
;-----------------------------------------------------------------------
; transfer_16to8add_3dne
; dst += src (8x8): adds the 16-bit residue block to the byte block,
; saturating to 0..255 (paddsw then packuswb in the macro above).
; In:   [esp+4]=dst (uint8_t*, strided), [esp+8]=src (int16_t*,
;       contiguous), [esp+12]=stride
; Clobbers: eax, ecx, edx, mm0-mm7 (cdecl; no callee-saved regs touched).
;-----------------------------------------------------------------------
transfer_16to8add_3dne:
mov ecx, [esp+ 4] ; Dst
mov edx, [esp+12] ; Stride
mov eax, [esp+ 8] ; Src
pxor mm7, mm7                   ; zero register for byte->word unpacks
nop

COPY_16_TO_8_ADD 0
lea ecx,[byte ecx+2*edx]        ; next row pair
COPY_16_TO_8_ADD 1
lea ecx,[byte ecx+2*edx]
COPY_16_TO_8_ADD 2
lea ecx,[byte ecx+2*edx]
COPY_16_TO_8_ADD 3
ret
400 |
|
|
401 |
|
;=========================================================================== |
402 |
|
; |
403 |
|
; void transfer8x8_copy_3dne(uint8_t * const dst, |
404 |
|
; const uint8_t * const src, |
405 |
|
; const uint32_t stride); |
406 |
|
; |
407 |
|
; |
408 |
|
;=========================================================================== |
409 |
|
|
410 |
|
; COPY_8_TO_8
; Copies two 8-byte rows from eax to ecx (same stride edx) and advances
; eax by 2*stride; the caller advances ecx between expansions.
%macro COPY_8_TO_8 0
movq mm0, [byte eax]
movq mm1, [eax+edx]
movq [byte ecx], mm0
lea eax,[byte eax+2*edx]        ; advance src; dst advanced by caller
movq [ecx+edx], mm1
%endmacro
417 |
|
|
418 |
|
align 16
;-----------------------------------------------------------------------
; transfer8x8_copy_3dne
; Plain 8x8 byte-block copy, two rows per COPY_8_TO_8 expansion.
; In:   [esp+4]=dst, [esp+8]=src, [esp+12]=stride (shared by src/dst)
; Clobbers: eax, ecx, edx, mm0, mm1 (cdecl; no callee-saved regs touched).
;-----------------------------------------------------------------------
transfer8x8_copy_3dne:
mov eax, [esp+ 8] ; Src
mov edx, [esp+12] ; Stride
mov ecx, [esp+ 4] ; Dst

COPY_8_TO_8
lea ecx,[byte ecx+2*edx]        ; next dst row pair (src advanced in macro)
COPY_8_TO_8
lea ecx,[byte ecx+2*edx]
COPY_8_TO_8
lea ecx,[byte ecx+2*edx]
COPY_8_TO_8
ret