5 |
; * |
; * |
6 |
; * Copyright(C) 2002-2003 Jaan Kalda |
; * Copyright(C) 2002-2003 Jaan Kalda |
7 |
; * |
; * |
8 |
; * This program is free software ; you can redistribute it and/or modify |
; * This program is free software ; you can redistribute it and/or modify |
9 |
; * it under the terms of the GNU General Public License as published by |
; * it under the terms of the GNU General Public License as published by |
10 |
; * the Free Software Foundation ; either version 2 of the License, or |
; * the Free Software Foundation ; either version 2 of the License, or |
11 |
; * (at your option) any later version. |
; * (at your option) any later version. |
29 |
; enable dequant saturate [-2048,2047], test purposes only. |
; enable dequant saturate [-2048,2047], test purposes only. |
30 |
%define SATURATE |
%define SATURATE |
31 |
|
|
32 |
BITS 32 |
%include "nasm.inc" |
|
|
|
|
%macro cglobal 1 |
|
|
%ifdef PREFIX |
|
|
global _%1 |
|
|
%define %1 _%1 |
|
|
%else |
|
|
global %1 |
|
|
%endif |
|
|
%endmacro |
|
33 |
|
|
34 |
;============================================================================= |
;============================================================================= |
35 |
; Local data |
; Local data |
36 |
;============================================================================= |
;============================================================================= |
37 |
|
|
38 |
%ifdef FORMAT_COFF |
DATA |
|
SECTION .rodata |
|
|
%else |
|
|
SECTION .rodata align=16 |
|
|
%endif |
|
39 |
|
|
40 |
align 4 |
align SECTION_ALIGN |
41 |
int_div: |
int_div: |
42 |
dd 0 |
dd 0 |
43 |
%assign i 1 |
%assign i 1 |
46 |
%assign i i+1 |
%assign i i+1 |
47 |
%endrep |
%endrep |
48 |
|
|
49 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
50 |
plus_one: |
plus_one: |
51 |
times 8 dw 1 |
times 8 dw 1 |
52 |
|
|
54 |
; subtract by Q/2 table |
; subtract by Q/2 table |
55 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
56 |
|
|
57 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
58 |
mmx_sub: |
mmx_sub: |
59 |
%assign i 1 |
%assign i 1 |
60 |
%rep 31 |
%rep 31 |
73 |
; |
; |
74 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
75 |
|
|
76 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
77 |
mmx_div: |
mmx_div: |
78 |
%assign i 1 |
%assign i 1 |
79 |
%rep 31 |
%rep 31 |
85 |
; add by (odd(Q) ? Q : Q - 1) table |
; add by (odd(Q) ? Q : Q - 1) table |
86 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
87 |
|
|
88 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
89 |
mmx_add: |
mmx_add: |
90 |
%assign i 1 |
%assign i 1 |
91 |
%rep 31 |
%rep 31 |
101 |
; multiple by 2Q table |
; multiply by 2Q table |
102 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
103 |
|
|
104 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
105 |
mmx_mul: |
mmx_mul: |
106 |
%assign i 1 |
%assign i 1 |
107 |
%rep 31 |
%rep 31 |
113 |
; saturation limits |
; saturation limits |
114 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
115 |
|
|
116 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
117 |
mmx_32768_minus_2048: |
mmx_32768_minus_2048: |
118 |
times 4 dw (32768-2048) |
times 4 dw (32768-2048) |
119 |
mmx_32767_minus_2047: |
mmx_32767_minus_2047: |
120 |
times 4 dw (32767-2047) |
times 4 dw (32767-2047) |
121 |
|
|
122 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
123 |
mmx_2047: |
mmx_2047: |
124 |
times 4 dw 2047 |
times 4 dw 2047 |
125 |
|
|
126 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
127 |
mmzero: |
mmzero: |
128 |
dd 0, 0 |
dd 0, 0 |
129 |
int2047: |
int2047: |
135 |
; Code |
; Code |
136 |
;============================================================================= |
;============================================================================= |
137 |
|
|
138 |
SECTION .text |
TEXT |
139 |
|
|
140 |
|
|
141 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
157 |
psubw mm7, mm6 ;D8 |
psubw mm7, mm6 ;D8 |
158 |
%endif |
%endif |
159 |
|
|
160 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
161 |
db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 |
movq mm4, [_ECX + %1 * 32 +16] ;C1 |
162 |
pmaxsw mm1, mm0 ;A4 |
pmaxsw mm1, mm0 ;A4 |
163 |
db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 |
movq mm6, [_ECX + %1 * 32 +24] ;D1 |
164 |
pmaxsw mm3, mm2 ;B4 |
pmaxsw mm3, mm2 ;B4 |
165 |
|
|
166 |
|
|
167 |
psraw mm0, 15 ;A5 |
psraw mm0, 15 ;A5 |
168 |
psraw mm2, 15 ;B5 |
psraw mm2, 15 ;B5 |
169 |
%if (%1) |
%if (%1) |
170 |
movq [edx + %1 * 32 + 16-32], mm5 ;C9 |
movq [_EDX + %1 * 32 + 16-32], mm5 ;C9 |
171 |
movq [edx + %1 * 32 + 24-32], mm7 ;D9 |
movq [_EDX + %1 * 32 + 24-32], mm7 ;D9 |
172 |
%endif |
%endif |
173 |
|
|
174 |
psrlw mm1, 1 ;A6 |
psrlw mm1, 1 ;A6 |
175 |
psrlw mm3, 1 ;B6 |
psrlw mm3, 1 ;B6 |
176 |
movq mm5, [ebx] ;C2 |
movq mm5, [_EBX] ;C2 |
177 |
movq mm7, [ebx] ;D2 |
movq mm7, [_EBX] ;D2 |
178 |
|
|
179 |
pxor mm1, mm0 ;A7 |
pxor mm1, mm0 ;A7 |
180 |
pxor mm3, mm2 ;B7 |
pxor mm3, mm2 ;B7 |
185 |
psubw mm3, mm2 ;B8 |
psubw mm3, mm2 ;B8 |
186 |
|
|
187 |
%if (%1 == 0) |
%if (%1 == 0) |
188 |
push ebp |
push _EBP |
189 |
movq mm0, [ecx + %1 * 32 +32] |
movq mm0, [_ECX + %1 * 32 +32] |
190 |
%elif (%1 < 3) |
%elif (%1 < 3) |
191 |
db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 |
movq mm0, [_ECX + %1 * 32 +32] ;A1 |
192 |
%endif |
%endif |
193 |
pmaxsw mm5, mm4 ;C4 |
pmaxsw mm5, mm4 ;C4 |
194 |
%if (%1 < 3) |
%if (%1 < 3) |
195 |
db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 |
movq mm2, [_ECX + %1 * 32 +8+32] ;B1 |
196 |
%else |
%else |
197 |
cmp esp, esp |
cmp _ESP, _ESP |
198 |
%endif |
%endif |
199 |
pmaxsw mm7, mm6 ;D4 |
pmaxsw mm7, mm6 ;D4 |
200 |
|
|
201 |
psraw mm4, 15 ;C5 |
psraw mm4, 15 ;C5 |
202 |
psraw mm6, 15 ;D5 |
psraw mm6, 15 ;D5 |
203 |
movq [byte edx + %1 * 32], mm1 ;A9 |
movq [byte _EDX + %1 * 32], mm1 ;A9 |
204 |
movq [edx + %1 * 32+8], mm3 ;B9 |
movq [_EDX + %1 * 32+8], mm3 ;B9 |
205 |
|
|
206 |
|
|
207 |
psrlw mm5, 1 ;C6 |
psrlw mm5, 1 ;C6 |
208 |
psrlw mm7, 1 ;D6 |
psrlw mm7, 1 ;D6 |
209 |
%if (%1 < 3) |
%if (%1 < 3) |
210 |
movq mm1, [ebx] ;A2 |
movq mm1, [_EBX] ;A2 |
211 |
movq mm3, [ebx] ;B2 |
movq mm3, [_EBX] ;B2 |
212 |
%endif |
%endif |
213 |
%if (%1 == 3) |
%if (%1 == 3) |
214 |
imul eax, [int_div+4*edi] |
%ifdef ARCH_IS_X86_64 |
215 |
|
lea r9, [int_div] |
216 |
|
imul eax, dword [r9+4*_EDI] |
217 |
|
%else |
218 |
|
imul _EAX, [int_div+4*_EDI] |
219 |
|
%endif |
220 |
%endif |
%endif |
221 |
pxor mm5, mm4 ;C7 |
pxor mm5, mm4 ;C7 |
222 |
pxor mm7, mm6 ;D7 |
pxor mm7, mm6 ;D7 |
236 |
psubw mm7, mm6 ;D8 |
psubw mm7, mm6 ;D8 |
237 |
%endif |
%endif |
238 |
|
|
239 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
240 |
db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 |
movq mm4, [_ECX + %1 * 32 +16] ;C1 |
241 |
pmaxsw mm1, mm0 ;A4 |
pmaxsw mm1, mm0 ;A4 |
242 |
db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 |
movq mm6, [_ECX + %1 * 32 +24] ;D1 |
243 |
pmaxsw mm3, mm2 ;B4 |
pmaxsw mm3, mm2 ;B4 |
244 |
|
|
245 |
|
|
246 |
psraw mm0, 15 ;A5 |
psraw mm0, 15 ;A5 |
247 |
psraw mm2, 15 ;B5 |
psraw mm2, 15 ;B5 |
248 |
%if (%1) |
%if (%1) |
249 |
movq [edx + %1 * 32 + 16-32], mm5 ;C9 |
movq [_EDX + %1 * 32 + 16-32], mm5 ;C9 |
250 |
movq [edx + %1 * 32 + 24-32], mm7 ;D9 |
movq [_EDX + %1 * 32 + 24-32], mm7 ;D9 |
251 |
%endif |
%endif |
252 |
|
|
253 |
pmulhw mm1, [esi] ;A6 |
pmulhw mm1, [_ESI] ;A6 |
254 |
pmulhw mm3, [esi] ;B6 |
pmulhw mm3, [_ESI] ;B6 |
255 |
movq mm5, [ebx] ;C2 |
movq mm5, [_EBX] ;C2 |
256 |
movq mm7, [ebx] ;D2 |
movq mm7, [_EBX] ;D2 |
257 |
|
|
258 |
nop |
nop |
259 |
nop |
nop |
267 |
|
|
268 |
|
|
269 |
%if (%1 < 3) |
%if (%1 < 3) |
270 |
db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 |
movq mm0, [_ECX + %1 * 32 +32] ;A1 |
271 |
%endif |
%endif |
272 |
pmaxsw mm5, mm4 ;C4 |
pmaxsw mm5, mm4 ;C4 |
273 |
%if (%1 < 3) |
%if (%1 < 3) |
274 |
db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 |
movq mm2, [_ECX + %1 * 32 +8+32] ;B1 |
275 |
%else |
%else |
276 |
cmp esp, esp |
cmp _ESP, _ESP |
277 |
%endif |
%endif |
278 |
pmaxsw mm7,mm6 ;D4 |
pmaxsw mm7,mm6 ;D4 |
279 |
|
|
280 |
psraw mm4, 15 ;C5 |
psraw mm4, 15 ;C5 |
281 |
psraw mm6, 15 ;D5 |
psraw mm6, 15 ;D5 |
282 |
movq [byte edx + %1 * 32], mm1 ;A9 |
movq [byte _EDX + %1 * 32], mm1 ;A9 |
283 |
movq [edx + %1 * 32+8], mm3 ;B9 |
movq [_EDX + %1 * 32+8], mm3 ;B9 |
284 |
|
|
285 |
|
|
286 |
pmulhw mm5, [esi] ;C6 |
pmulhw mm5, [_ESI] ;C6 |
287 |
pmulhw mm7, [esi] ;D6 |
pmulhw mm7, [_ESI] ;D6 |
288 |
%if (%1 < 3) |
%if (%1 < 3) |
289 |
movq mm1, [ebx] ;A2 |
movq mm1, [_EBX] ;A2 |
290 |
movq mm3, [ebx] ;B2 |
movq mm3, [_EBX] ;B2 |
291 |
%endif |
%endif |
292 |
%if (%1 == 0) |
%if (%1 == 0) |
293 |
push ebp |
push _EBP |
294 |
%elif (%1 < 3) |
%elif (%1 < 3) |
295 |
nop |
nop |
296 |
%endif |
%endif |
297 |
nop |
nop |
298 |
%if (%1 == 3) |
%if (%1 == 3) |
299 |
imul eax, [int_div+4*edi] |
%ifdef ARCH_IS_X86_64 |
300 |
|
lea r9, [int_div] |
301 |
|
imul eax, dword [r9+4*_EDI] |
302 |
|
%else |
303 |
|
imul _EAX, [int_div+4*_EDI] |
304 |
|
%endif |
305 |
%endif |
%endif |
306 |
pxor mm5, mm4 ;C7 |
pxor mm5, mm4 ;C7 |
307 |
pxor mm7, mm6 ;D7 |
pxor mm7, mm6 ;D7 |
308 |
%endmacro |
%endmacro |
309 |
|
|
310 |
|
|
311 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
312 |
cglobal quant_h263_intra_3dne |
cglobal quant_h263_intra_3dne |
313 |
quant_h263_intra_3dne: |
quant_h263_intra_3dne: |
314 |
|
|
315 |
mov eax, [esp + 12] ; quant |
%ifdef ARCH_IS_X86_64 |
316 |
mov ecx, [esp + 8] ; data |
mov TMP0, [_ESP] |
317 |
mov edx, [esp + 4] ; coeff |
add _ESP, PTR_SIZE |
318 |
|
%ifndef WINDOWS |
319 |
|
push prm6 |
320 |
|
push prm5 |
321 |
|
%endif |
322 |
|
push prm4 |
323 |
|
push prm3 |
324 |
|
push prm2 |
325 |
|
push prm1 |
326 |
|
sub _ESP, PTR_SIZE |
327 |
|
mov [_ESP], TMP0 |
328 |
|
%endif |
329 |
|
|
330 |
|
mov _EAX, [_ESP + 3*PTR_SIZE] ; quant |
331 |
|
mov _ECX, [_ESP + 2*PTR_SIZE] ; data |
332 |
|
mov _EDX, [_ESP + 1*PTR_SIZE] ; coeff |
333 |
cmp al, 1 |
cmp al, 1 |
334 |
pxor mm1, mm1 |
pxor mm1, mm1 |
335 |
pxor mm3, mm3 |
pxor mm3, mm3 |
336 |
movq mm0, [ecx] ; mm0 = [1st] |
movq mm0, [_ECX] ; mm0 = [1st] |
337 |
movq mm2, [ecx + 8] |
movq mm2, [_ECX + 8] |
338 |
push esi |
push _ESI |
339 |
lea esi, [mmx_div + eax*8 - 8] |
%ifdef ARCH_IS_X86_64 |
340 |
|
lea _ESI, [mmx_div] |
341 |
push ebx |
lea _ESI, [_ESI + _EAX*8 - 8] |
342 |
mov ebx, mmzero |
%else |
343 |
push edi |
lea _ESI, [mmx_div + _EAX*8 - 8] |
344 |
|
%endif |
345 |
|
|
346 |
|
push _EBX |
347 |
|
lea _EBX, [mmzero] |
348 |
|
push _EDI |
349 |
jz near .q1loop |
jz near .q1loop |
350 |
|
|
351 |
quant_intra 0 |
quant_intra 0 |
352 |
mov ebp, [esp + 16 + 16] ; dcscalar |
mov _EBP, [_ESP + (4+4)*PTR_SIZE] ; dcscalar |
353 |
; NB -- there are 3 pushes in the function preambule and one more |
; NB -- there are 3 pushes in the function preamble and one more |
354 |
; in "quant_intra 0", thus an added offset of 16 bytes |
; in "quant_intra 0", thus an added offset of 4*PTR_SIZE bytes (16 on x86-32) |
355 |
movsx eax, word [byte ecx] ; DC |
movsx _EAX, word [byte _ECX] ; DC |
356 |
|
|
357 |
quant_intra 1 |
quant_intra 1 |
358 |
mov edi, eax |
mov _EDI, _EAX |
359 |
sar edi, 31 ; sign(DC) |
sar _EDI, 31 ; sign(DC) |
360 |
shr ebp, byte 1 ; ebp = dcscalar/2 |
shr _EBP, byte 1 ; _EBP = dcscalar/2 |
361 |
|
|
362 |
quant_intra 2 |
quant_intra 2 |
363 |
sub eax, edi ; DC (+1) |
sub _EAX, _EDI ; DC (+1) |
364 |
xor ebp, edi ; sign(DC) dcscalar /2 (-1) |
xor _EBP, _EDI ; sign(DC) dcscalar /2 (-1) |
365 |
mov edi, [esp + 16 + 16] ; dscalar |
mov _EDI, [_ESP + (4+4)*PTR_SIZE] ; dcscalar |
366 |
lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2 |
lea _EAX, [byte _EAX + _EBP] ; DC + sign(DC) dcscalar/2 |
367 |
mov ebp, [byte esp] |
mov _EBP, [byte _ESP] |
368 |
|
|
369 |
quant_intra 3 |
quant_intra 3 |
370 |
psubw mm5, mm4 ;C8 |
psubw mm5, mm4 ;C8 |
371 |
mov esi, [esp + 12] ; pop back the register value |
mov _ESI, [_ESP + 3*PTR_SIZE] ; pop back the register value |
372 |
mov edi, [esp + 4] ; pop back the register value |
mov _EDI, [_ESP + 1*PTR_SIZE] ; pop back the register value |
373 |
sar eax, 16 |
sar _EAX, 16 |
374 |
lea ebx, [byte eax + 1] ; workaround for eax < 0 |
lea _EBX, [byte _EAX + 1] ; workaround for _EAX < 0 |
375 |
cmovs eax, ebx ; conditionnaly move the corrected value |
cmovs _EAX, _EBX ; conditionally move the corrected value |
376 |
mov [edx], ax ; coeff[0] = ax |
mov [_EDX], ax ; coeff[0] = ax |
377 |
mov ebx, [esp + 8] ; pop back the register value |
mov _EBX, [_ESP + 2*PTR_SIZE] ; pop back the register value |
378 |
add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 |
add _ESP, byte 4*PTR_SIZE ; "quant_intra 0" pushed _EBP, but we don't restore that one, just correct the stack offset by 4*PTR_SIZE |
379 |
psubw mm7, mm6 ;D8 |
psubw mm7, mm6 ;D8 |
380 |
movq [edx + 3 * 32 + 16], mm5 ;C9 |
movq [_EDX + 3 * 32 + 16], mm5 ;C9 |
381 |
movq [edx + 3 * 32 + 24], mm7 ;D9 |
movq [_EDX + 3 * 32 + 24], mm7 ;D9 |
382 |
|
|
383 |
|
xor _EAX, _EAX |
384 |
|
|
385 |
|
%ifdef ARCH_IS_X86_64 |
386 |
|
mov TMP0, [_ESP] |
387 |
|
%ifndef WINDOWS |
388 |
|
add _ESP, 6*PTR_SIZE |
389 |
|
%else |
390 |
|
add _ESP, 4*PTR_SIZE |
391 |
|
%endif |
392 |
|
mov [_ESP], TMP0 |
393 |
|
%endif |
394 |
|
|
|
xor eax, eax |
|
395 |
ret |
ret |
396 |
|
|
397 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
398 |
|
|
399 |
.q1loop |
.q1loop: |
400 |
quant_intra1 0 |
quant_intra1 0 |
401 |
mov ebp, [esp + 16 + 16] ; dcscalar |
mov _EBP, [_ESP + (4+4)*PTR_SIZE] ; dcscalar |
402 |
movsx eax, word [byte ecx] ; DC |
movsx _EAX, word [byte _ECX] ; DC |
403 |
|
|
404 |
quant_intra1 1 |
quant_intra1 1 |
405 |
mov edi, eax |
mov _EDI, _EAX |
406 |
sar edi, 31 ; sign(DC) |
sar _EDI, 31 ; sign(DC) |
407 |
shr ebp, byte 1 ; ebp = dcscalar /2 |
shr _EBP, byte 1 ; _EBP = dcscalar /2 |
408 |
|
|
409 |
quant_intra1 2 |
quant_intra1 2 |
410 |
sub eax, edi ; DC (+1) |
sub _EAX, _EDI ; DC (+1) |
411 |
xor ebp, edi ; sign(DC) dcscalar /2 (-1) |
xor _EBP, _EDI ; sign(DC) dcscalar /2 (-1) |
412 |
mov edi, [esp + 16 + 16] ; dcscalar |
mov _EDI, [_ESP + (4+4)*PTR_SIZE] ; dcscalar |
413 |
lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2 |
lea _EAX, [byte _EAX + _EBP] ; DC + sign(DC) dcscalar /2 |
414 |
mov ebp, [byte esp] |
mov _EBP, [byte _ESP] |
415 |
|
|
416 |
quant_intra1 3 |
quant_intra1 3 |
417 |
psubw mm5, mm4 ;C8 |
psubw mm5, mm4 ;C8 |
418 |
mov esi, [dword esp + 12] ; pop back the register value |
mov _ESI, [_ESP + 3*PTR_SIZE] ; pop back the register value |
419 |
mov edi, [esp + 4] ; pop back the register value |
mov _EDI, [_ESP + 1*PTR_SIZE] ; pop back the register value |
420 |
sar eax, 16 |
sar _EAX, 16 |
421 |
lea ebx, [byte eax + 1] ; workaround for eax < 0 |
lea _EBX, [byte _EAX + 1] ; workaround for _EAX < 0 |
422 |
cmovs eax, ebx ; conditionnaly move the corrected value |
cmovs _EAX, _EBX ; conditionally move the corrected value |
423 |
mov [edx], ax ; coeff[0] = ax |
mov [_EDX], ax ; coeff[0] = ax |
424 |
mov ebx, [esp + 8] ; pop back the register value |
mov _EBX, [_ESP + 2*PTR_SIZE] ; pop back the register value |
425 |
add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 |
add _ESP, byte 4*PTR_SIZE ; "quant_intra 0" pushed _EBP, but we don't restore that one, just correct the stack offset by 4*PTR_SIZE |
426 |
psubw mm7, mm6 ;D8 |
psubw mm7, mm6 ;D8 |
427 |
movq [edx + 3 * 32 + 16], mm5 ;C9 |
movq [_EDX + 3 * 32 + 16], mm5 ;C9 |
428 |
movq [edx + 3 * 32 + 24], mm7 ;D9 |
movq [_EDX + 3 * 32 + 24], mm7 ;D9 |
429 |
|
|
430 |
xor eax, eax |
xor _EAX, _EAX |
|
ret |
|
431 |
|
|
432 |
|
%ifdef ARCH_IS_X86_64 |
433 |
|
mov TMP0, [_ESP] |
434 |
|
%ifndef WINDOWS |
435 |
|
add _ESP, 6*PTR_SIZE |
436 |
|
%else |
437 |
|
add _ESP, 4*PTR_SIZE |
438 |
|
%endif |
439 |
|
mov [_ESP], TMP0 |
440 |
|
%endif |
441 |
|
|
442 |
|
ret |
443 |
|
ENDFUNC |
444 |
|
|
445 |
|
|
446 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
456 |
|
|
457 |
|
|
458 |
%macro quantinter 1 |
%macro quantinter 1 |
459 |
movq mm1, [eax] ;A2 |
movq mm1, [_EAX] ;A2 |
460 |
psraw mm3, 15 ;B6 |
psraw mm3, 15 ;B6 |
461 |
%if (%1) |
%if (%1) |
462 |
psubw mm2, mm6 ;C10 |
psubw mm2, mm6 ;C10 |
463 |
%endif |
%endif |
464 |
psubw mm1, mm0 ;A3 |
psubw mm1, mm0 ;A3 |
465 |
pmulhw mm4, mm7 ;B7 |
pmulhw mm4, mm7 ;B7 |
466 |
movq mm6, [ecx + %1*24+16] ;C1 |
movq mm6, [_ECX + %1*24+16] ;C1 |
467 |
pmaxsw mm1, mm0 ;A4 |
pmaxsw mm1, mm0 ;A4 |
468 |
paddw mm5, mm4 ;B8 |
paddw mm5, mm4 ;B8 |
469 |
%if (%1) |
%if (%1) |
470 |
movq [edx + %1*24+16-24], mm2 ;C11 |
movq [_EDX + %1*24+16-24], mm2 ;C11 |
471 |
%endif |
%endif |
472 |
psubusw mm1, [ebx] ;A5 mm0 -= sub (unsigned, dont go < 0) |
psubusw mm1, [_EBX] ;A5 mm0 -= sub (unsigned, don't go < 0) |
473 |
pxor mm4, mm3 ;B9 |
pxor mm4, mm3 ;B9 |
474 |
movq mm2, [eax] ;C2 |
movq mm2, [_EAX] ;C2 |
475 |
psraw mm0, 15 ;A6 |
psraw mm0, 15 ;A6 |
476 |
psubw mm4, mm3 ;B10 |
psubw mm4, mm3 ;B10 |
477 |
psubw mm2, mm6 ;C3 |
psubw mm2, mm6 ;C3 |
478 |
pmulhw mm1, mm7 ;A7 mm0 = (mm0 / 2Q) >> 24 |
pmulhw mm1, mm7 ;A7 mm0 = (mm0 / 2Q) >> 24 |
479 |
movq mm3, [ecx + %1*24+8] ;B1 |
movq mm3, [_ECX + %1*24+8] ;B1 |
480 |
pmaxsw mm2, mm6 ;C4 |
pmaxsw mm2, mm6 ;C4 |
481 |
paddw mm5, mm1 ;A8 sum += mm0 |
paddw mm5, mm1 ;A8 sum += mm0 |
482 |
%if (%1) |
%if (%1) |
483 |
movq [edx + %1*24+8-24], mm4 ;B11 |
movq [_EDX + %1*24+8-24], mm4 ;B11 |
484 |
%else |
%else |
485 |
movq [edx + 120], mm4 ;B11 |
movq [_EDX + 120], mm4 ;B11 |
486 |
%endif |
%endif |
487 |
psubusw mm2, [ebx] ;C5 |
psubusw mm2, [_EBX] ;C5 |
488 |
pxor mm1, mm0 ;A9 mm0 *= sign(mm0) |
pxor mm1, mm0 ;A9 mm0 *= sign(mm0) |
489 |
movq mm4, [eax] ;B2 |
movq mm4, [_EAX] ;B2 |
490 |
psraw mm6, 15 ;C6 |
psraw mm6, 15 ;C6 |
491 |
psubw mm1, mm0 ;A10 undisplace |
psubw mm1, mm0 ;A10 undisplace |
492 |
psubw mm4, mm3 ;B3 |
psubw mm4, mm3 ;B3 |
493 |
pmulhw mm2, mm7 ;C7 |
pmulhw mm2, mm7 ;C7 |
494 |
movq mm0, [ecx + %1*24+24] ;A1 mm0 = [1st] |
movq mm0, [_ECX + %1*24+24] ;A1 mm0 = [1st] |
495 |
pmaxsw mm4, mm3 ;B4 |
pmaxsw mm4, mm3 ;B4 |
496 |
paddw mm5, mm2 ;C8 |
paddw mm5, mm2 ;C8 |
497 |
movq [byte edx + %1*24], mm1 ;A11 |
movq [byte _EDX + %1*24], mm1 ;A11 |
498 |
psubusw mm4, [ebx] ;B5 |
psubusw mm4, [_EBX] ;B5 |
499 |
pxor mm2, mm6 ;C9 |
pxor mm2, mm6 ;C9 |
500 |
%endmacro |
%endmacro |
501 |
|
|
502 |
%macro quantinter1 1 |
%macro quantinter1 1 |
503 |
movq mm0, [byte ecx + %1*16] ;mm0 = [1st] |
movq mm0, [byte _ECX + %1*16] ;mm0 = [1st] |
504 |
movq mm3, [ecx + %1*16+8] ; |
movq mm3, [_ECX + %1*16+8] ; |
505 |
movq mm1, [eax] |
movq mm1, [_EAX] |
506 |
movq mm4, [eax] |
movq mm4, [_EAX] |
507 |
psubw mm1, mm0 |
psubw mm1, mm0 |
508 |
psubw mm4, mm3 |
psubw mm4, mm3 |
509 |
pmaxsw mm1, mm0 |
pmaxsw mm1, mm0 |
520 |
pxor mm4, mm3 ; |
pxor mm4, mm3 ; |
521 |
psubw mm1, mm0 ; undisplace |
psubw mm1, mm0 ; undisplace |
522 |
psubw mm4, mm3 |
psubw mm4, mm3 |
523 |
cmp esp, esp |
cmp _ESP, _ESP |
524 |
movq [byte edx + %1*16], mm1 |
movq [byte _EDX + %1*16], mm1 |
525 |
movq [edx + %1*16+8], mm4 |
movq [_EDX + %1*16+8], mm4 |
526 |
%endmacro |
%endmacro |
527 |
|
|
528 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
529 |
cglobal quant_h263_inter_3dne |
cglobal quant_h263_inter_3dne |
530 |
quant_h263_inter_3dne: |
quant_h263_inter_3dne: |
531 |
mov edx, [esp + 4] ; coeff |
|
532 |
mov ecx, [esp + 8] ; data |
%ifdef ARCH_IS_X86_64 |
533 |
mov eax, [esp + 12] ; quant |
mov TMP0, [_ESP] |
534 |
push ebx |
add _ESP, PTR_SIZE |
535 |
|
%ifndef WINDOWS |
536 |
|
push prm6 |
537 |
|
push prm5 |
538 |
|
%endif |
539 |
|
push prm4 |
540 |
|
push prm3 |
541 |
|
push prm2 |
542 |
|
push prm1 |
543 |
|
sub _ESP, PTR_SIZE |
544 |
|
mov [_ESP], TMP0 |
545 |
|
%endif |
546 |
|
|
547 |
|
mov _EDX, [_ESP + 1*PTR_SIZE] ; coeff |
548 |
|
mov _ECX, [_ESP + 2*PTR_SIZE] ; data |
549 |
|
mov _EAX, [_ESP + 3*PTR_SIZE] ; quant |
550 |
|
push _EBX |
551 |
|
|
552 |
pxor mm5, mm5 ; sum |
pxor mm5, mm5 ; sum |
553 |
nop |
nop |
554 |
lea ebx,[mmx_sub + eax * 8 - 8] ; sub |
%ifdef ARCH_IS_X86_64 |
555 |
movq mm7, [mmx_div + eax * 8 - 8] ; divider |
lea _EBX, [mmx_div] |
556 |
|
movq mm7, [_EBX + _EAX * 8 - 8] |
557 |
|
lea _EBX, [mmx_sub] |
558 |
|
lea _EBX, [_EBX + _EAX * 8 - 8] |
559 |
|
%else |
560 |
|
lea _EBX,[mmx_sub + _EAX * 8 - 8] ; sub |
561 |
|
movq mm7, [mmx_div + _EAX * 8 - 8] ; divider |
562 |
|
%endif |
563 |
|
|
564 |
cmp al, 1 |
cmp al, 1 |
565 |
lea eax, [mmzero] |
lea _EAX, [mmzero] |
566 |
jz near .q1loop |
jz near .q1loop |
567 |
cmp esp, esp |
cmp _ESP, _ESP |
568 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
569 |
movq mm3, [ecx + 120] ;B1 |
movq mm3, [_ECX + 120] ;B1 |
570 |
pxor mm4, mm4 ;B2 |
pxor mm4, mm4 ;B2 |
571 |
psubw mm4, mm3 ;B3 |
psubw mm4, mm3 ;B3 |
572 |
movq mm0, [ecx] ;A1 mm0 = [1st] |
movq mm0, [_ECX] ;A1 mm0 = [1st] |
573 |
pmaxsw mm4, mm3 ;B4 |
pmaxsw mm4, mm3 ;B4 |
574 |
psubusw mm4, [ebx] ;B5 |
psubusw mm4, [_EBX] ;B5 |
575 |
|
|
576 |
quantinter 0 |
quantinter 0 |
577 |
quantinter 1 |
quantinter 1 |
585 |
paddw mm5, mm4 ;B8 |
paddw mm5, mm4 ;B8 |
586 |
pxor mm4, mm3 ;B9 |
pxor mm4, mm3 ;B9 |
587 |
psubw mm4, mm3 ;B10 |
psubw mm4, mm3 ;B10 |
588 |
movq [edx + 4*24+16], mm2 ;C11 |
movq [_EDX + 4*24+16], mm2 ;C11 |
589 |
pop ebx |
pop _EBX |
590 |
movq [edx + 4*24+8], mm4 ;B11 |
movq [_EDX + 4*24+8], mm4 ;B11 |
591 |
pmaddwd mm5, [plus_one] |
pmaddwd mm5, [plus_one] |
592 |
movq mm0, mm5 |
movq mm0, mm5 |
593 |
punpckhdq mm5, mm5 |
punpckhdq mm5, mm5 |
594 |
paddd mm0, mm5 |
paddd mm0, mm5 |
595 |
movd eax, mm0 ; return sum |
movd eax, mm0 ; return sum |
596 |
|
|
597 |
|
%ifdef ARCH_IS_X86_64 |
598 |
|
mov TMP0, [_ESP] |
599 |
|
%ifndef WINDOWS |
600 |
|
add _ESP, 6*PTR_SIZE |
601 |
|
%else |
602 |
|
add _ESP, 4*PTR_SIZE |
603 |
|
%endif |
604 |
|
mov [_ESP], TMP0 |
605 |
|
%endif |
606 |
|
|
607 |
ret |
ret |
608 |
|
|
609 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
610 |
.q1loop |
.q1loop: |
611 |
movq mm6, [byte ebx] |
movq mm6, [byte _EBX] |
612 |
|
|
613 |
quantinter1 0 |
quantinter1 0 |
614 |
quantinter1 1 |
quantinter1 1 |
625 |
paddd mm0, mm5 |
paddd mm0, mm5 |
626 |
movd eax, mm0 ; return sum |
movd eax, mm0 ; return sum |
627 |
|
|
628 |
pop ebx |
pop _EBX |
629 |
|
|
630 |
|
%ifdef ARCH_IS_X86_64 |
631 |
|
mov TMP0, [_ESP] |
632 |
|
%ifndef WINDOWS |
633 |
|
add _ESP, 6*PTR_SIZE |
634 |
|
%else |
635 |
|
add _ESP, 4*PTR_SIZE |
636 |
|
%endif |
637 |
|
mov [_ESP], TMP0 |
638 |
|
%endif |
639 |
|
|
640 |
ret |
ret |
641 |
|
ENDFUNC |
642 |
|
|
643 |
|
|
644 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
645 |
; |
; |
657 |
;This is Athlon-optimized code (ca 106 clk per call) |
;This is Athlon-optimized code (ca 106 clk per call) |
658 |
|
|
659 |
%macro dequant 1 |
%macro dequant 1 |
660 |
movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2 |
movq mm1, [_ECX+%1*24] ; c = coeff[i] ;A2 |
661 |
psubw mm0, mm1 ;-c ;A3 (1st dep) |
psubw mm0, mm1 ;-c ;A3 (1st dep) |
662 |
%if (%1) |
%if (%1) |
663 |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
664 |
%endif |
%endif |
665 |
pmaxsw mm0, mm1 ;|c| ;A4 (2nd) |
pmaxsw mm0, mm1 ;|c| ;A4 (2nd) |
666 |
%if (%1) |
%if (%1) |
667 |
mov ebp, ebp |
mov _EBP, _EBP |
668 |
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later |
pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) later |
669 |
%endif |
%endif |
670 |
movq mm6, [esi] ;0 ;A5 mm6 in use |
movq mm6, [_ESI] ;0 ;A5 mm6 in use |
671 |
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm7, [_EAX] ;B9 offset = isZero ? 0 : quant_add (2nd) |
672 |
%if (%1) |
%if (%1) |
673 |
pxor mm5, mm4 ;C13 (6th+) 1later |
pxor mm5, mm4 ;C13 (6th+) 1later |
674 |
%endif |
%endif |
675 |
movq mm4, [esi] ;C1 ;0 |
movq mm4, [_ESI] ;C1 ;0 |
676 |
mov esp, esp |
mov _ESP, _ESP |
677 |
pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st) |
pcmpeqw mm6, [_ECX+%1*24] ;A6 (c ==0) ? -1 : 0 (1st) |
678 |
ALIGN 4 |
ALIGN SECTION_ALIGN |
679 |
psraw mm1, 15 ; sign(c) ;A7 (2nd) |
psraw mm1, 15 ; sign(c) ;A7 (2nd) |
680 |
%if (%1) |
%if (%1) |
681 |
movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later |
movq [_EDX+%1*24+16-24], mm5 ; C14 (7th) 2later |
682 |
%endif |
%endif |
683 |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
684 |
pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+) |
pmullw mm0, [_EDI] ;*= 2Q ;A8 (3rd+) |
685 |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
686 |
lea ebp, [byte ebp] |
lea _EBP, [byte _EBP] |
687 |
movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i] |
movq mm5, [_ECX+%1*24+16] ;C2 ; c = coeff[i] |
688 |
psubw mm4, mm5 ;-c ;C3 (1st dep) |
psubw mm4, mm5 ;-c ;C3 (1st dep) |
689 |
pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm6, [_EAX] ;A9 offset = isZero ? 0 : quant_add (2nd) |
690 |
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) |
pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) |
691 |
pxor mm3, mm2 ;B13 (6th+) |
pxor mm3, mm2 ;B13 (6th+) |
692 |
movq mm2, [byte esi] ;B1 ;0 |
movq mm2, [byte _ESI] ;B1 ;0 |
693 |
%if (%1) |
%if (%1) |
694 |
movq [edx+%1*24+8-24], mm3 ;B14 (7th) |
movq [_EDX+%1*24+8-24], mm3 ;B14 (7th) |
695 |
%else |
%else |
696 |
movq [edx+120], mm3 |
movq [_EDX+120], mm3 |
697 |
%endif |
%endif |
698 |
pmaxsw mm4, mm5 ;|c| ;C4 (2nd) |
pmaxsw mm4, mm5 ;|c| ;C4 (2nd) |
699 |
paddw mm6, mm1 ;A10 offset +negate back (3rd) |
paddw mm6, mm1 ;A10 offset +negate back (3rd) |
700 |
movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i] |
movq mm3, [_ECX+%1*24 + 8] ;B2 ; c = coeff[i] |
701 |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
702 |
paddw mm0, mm6 ;A11 mm6 free (4th+) |
paddw mm0, mm6 ;A11 mm6 free (4th+) |
703 |
movq mm6, [byte esi] ;0 ;C5 mm6 in use |
movq mm6, [byte _ESI] ;0 ;C5 mm6 in use |
704 |
pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st) |
pcmpeqw mm6, [_ECX+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st) |
705 |
pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+) |
pminsw mm0, [_EBX] ;A12 saturates to +2047 (5th+) |
706 |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
707 |
pxor mm1, mm0 ;A13 (6th+) |
pxor mm1, mm0 ;A13 (6th+) |
708 |
pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+) |
pmullw mm4, [_EDI] ;*= 2Q ;C8 (3rd+) |
709 |
psraw mm5, 15 ; sign(c) ;C7 (2nd) |
psraw mm5, 15 ; sign(c) ;C7 (2nd) |
710 |
movq mm7, [byte esi] ;0 ;B5 mm7 in use |
movq mm7, [byte _ESI] ;0 ;B5 mm7 in use |
711 |
pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st) |
pcmpeqw mm7, [_ECX+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st) |
712 |
%if (%1 < 4) |
%if (%1 < 4) |
713 |
movq mm0, [byte esi] ;A1 ;0 |
movq mm0, [byte _ESI] ;A1 ;0 |
714 |
%endif |
%endif |
715 |
pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm6, [byte _EAX] ;C9 offset = isZero ? 0 : quant_add (2nd) |
716 |
psraw mm3, 15 ;sign(c) ;B7 (2nd) |
psraw mm3, 15 ;sign(c) ;B7 (2nd) |
717 |
movq [byte edx+%1*24], mm1 ;A14 (7th) |
movq [byte _EDX+%1*24], mm1 ;A14 (7th) |
718 |
paddw mm6, mm5 ;C10 offset +negate back (3rd) |
paddw mm6, mm5 ;C10 offset +negate back (3rd) |
719 |
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) |
pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) |
720 |
mov esp, esp |
mov _ESP, _ESP |
721 |
%endmacro |
%endmacro |
722 |
|
|
723 |
|
|
724 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
725 |
cglobal dequant_h263_intra_3dne |
cglobal dequant_h263_intra_3dne |
726 |
dequant_h263_intra_3dne: |
dequant_h263_intra_3dne: |
727 |
mov ecx, [esp+ 8] ; coeff |
|
728 |
mov eax, [esp+12] ; quant |
%ifdef ARCH_IS_X86_64 |
729 |
|
mov TMP0, [_ESP] |
730 |
|
add _ESP, PTR_SIZE |
731 |
|
%ifndef WINDOWS |
732 |
|
push prm6 |
733 |
|
push prm5 |
734 |
|
%endif |
735 |
|
push prm4 |
736 |
|
push prm3 |
737 |
|
push prm2 |
738 |
|
push prm1 |
739 |
|
sub _ESP, PTR_SIZE |
740 |
|
mov [_ESP], TMP0 |
741 |
|
%endif |
742 |
|
|
743 |
|
mov _ECX, [_ESP+ 2*PTR_SIZE] ; coeff |
744 |
|
mov _EAX, [_ESP+ 3*PTR_SIZE] ; quant |
745 |
pxor mm0, mm0 |
pxor mm0, mm0 |
746 |
pxor mm2, mm2 |
pxor mm2, mm2 |
747 |
push edi |
push _EDI |
748 |
push ebx |
push _EBX |
749 |
lea edi, [mmx_mul + eax*8 - 8] ; 2*quant |
%ifdef ARCH_IS_X86_64 |
750 |
push ebp |
lea _EDI, [mmx_mul] |
751 |
mov ebx, mmx_2047 |
lea _EDI, [_EDI + _EAX*8 - 8] ; 2*quant |
752 |
movsx ebp, word [ecx] |
%else |
753 |
lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 |
lea _EDI, [mmx_mul + _EAX*8 - 8] ; 2*quant |
754 |
push esi |
%endif |
755 |
mov esi, mmzero |
push _EBP |
756 |
|
lea _EBX, [mmx_2047] |
757 |
|
movsx _EBP, word [_ECX] |
758 |
|
%ifdef ARCH_IS_X86_64 |
759 |
|
lea r9, [mmx_add] |
760 |
|
lea _EAX, [r9 + _EAX*8 - 8] ; quant or quant-1 |
761 |
|
%else |
762 |
|
lea _EAX, [mmx_add + _EAX*8 - 8] ; quant or quant-1 |
763 |
|
%endif |
764 |
|
push _ESI |
765 |
|
lea _ESI, [mmzero] |
766 |
pxor mm7, mm7 |
pxor mm7, mm7 |
767 |
movq mm3, [ecx+120] ;B2 ; c = coeff[i] |
movq mm3, [_ECX+120] ;B2 ; c = coeff[i] |
768 |
pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) |
pcmpeqw mm7, [_ECX+120] ;B6 (c ==0) ? -1 : 0 (1st) |
769 |
|
|
770 |
imul ebp, [esp+16+16] ; dcscalar |
imul _EBP, [_ESP+(4+4)*PTR_SIZE] ; dcscalar |
771 |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
772 |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
773 |
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) |
pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) |
774 |
psraw mm3, 15 ; sign(c) ;B7 (2nd) |
psraw mm3, 15 ; sign(c) ;B7 (2nd) |
775 |
mov edx, [esp+ 4+16] ; data |
mov _EDX, [_ESP+ (1+4)*PTR_SIZE] ; data |
776 |
|
|
777 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
778 |
dequant 0 |
dequant 0 |
779 |
|
|
780 |
cmp ebp, -2048 |
cmp _EBP, -2048 |
781 |
mov esp, esp |
mov _ESP, _ESP |
782 |
|
|
783 |
dequant 1 |
dequant 1 |
784 |
|
|
785 |
cmovl ebp, [int_2048] |
cmovl _EBP, [int_2048] |
786 |
nop |
nop |
787 |
|
|
788 |
dequant 2 |
dequant 2 |
789 |
|
|
790 |
cmp ebp, 2047 |
cmp _EBP, 2047 |
791 |
mov esp, esp |
mov _ESP, _ESP |
792 |
|
|
793 |
dequant 3 |
dequant 3 |
794 |
|
|
795 |
cmovg ebp, [int2047] |
cmovg _EBP, [int2047] |
796 |
nop |
nop |
797 |
|
|
798 |
dequant 4 |
dequant 4 |
799 |
|
|
800 |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
801 |
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) |
pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) |
802 |
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm7, [_EAX] ;B9 offset = isZero ? 0 : quant_add (2nd) |
803 |
mov eax, ebp |
mov _EAX, _EBP |
804 |
mov esi, [esp] |
mov _ESI, [_ESP] |
805 |
mov ebp, [esp+4] |
mov _EBP, [_ESP+PTR_SIZE] |
806 |
pxor mm5, mm4 ;C13 (6th+) |
pxor mm5, mm4 ;C13 (6th+) |
807 |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
808 |
movq [edx+4*24+16], mm5 ;C14 (7th) |
movq [_EDX+4*24+16], mm5 ;C14 (7th) |
809 |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
810 |
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) |
pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) |
811 |
mov ebx, [esp+8] |
mov _EBX, [_ESP+2*PTR_SIZE] |
812 |
mov edi, [esp+12] |
mov _EDI, [_ESP+3*PTR_SIZE] |
813 |
add esp, byte 16 |
add _ESP, byte 4*PTR_SIZE |
814 |
pxor mm3, mm2 ;B13 (6th+) |
pxor mm3, mm2 ;B13 (6th+) |
815 |
movq [edx+4*24+8], mm3 ;B14 (7th) |
movq [_EDX+4*24+8], mm3 ;B14 (7th) |
816 |
mov [edx], ax |
mov [_EDX], ax |
817 |
|
|
818 |
|
xor _EAX, _EAX |
819 |
|
|
820 |
|
%ifdef ARCH_IS_X86_64 |
821 |
|
mov TMP0, [_ESP] |
822 |
|
%ifndef WINDOWS |
823 |
|
add _ESP, 6*PTR_SIZE |
824 |
|
%else |
825 |
|
add _ESP, 4*PTR_SIZE |
826 |
|
%endif |
827 |
|
mov [_ESP], TMP0 |
828 |
|
%endif |
829 |
|
|
|
xor eax, eax |
|
830 |
ret |
ret |
831 |
|
ENDFUNC |
832 |
|
|
833 |
|
|
834 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
835 |
; |
; |
844 |
; except that we're saturating using 'pminsw' (saves 2 cycles/loop) |
; except that we're saturating using 'pminsw' (saves 2 cycles/loop) |
845 |
; This is Athlon-optimized code (ca 100 clk per call) |
; This is Athlon-optimized code (ca 100 clk per call) |
846 |
|
|
847 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
848 |
cglobal dequant_h263_inter_3dne |
cglobal dequant_h263_inter_3dne |
849 |
dequant_h263_inter_3dne: |
dequant_h263_inter_3dne: |
850 |
mov ecx, [esp+ 8] ; coeff |
|
851 |
mov eax, [esp+12] ; quant |
%ifdef ARCH_IS_X86_64 |
852 |
|
mov TMP0, [_ESP] |
853 |
|
add _ESP, PTR_SIZE |
854 |
|
%ifndef WINDOWS |
855 |
|
push prm6 |
856 |
|
push prm5 |
857 |
|
%endif |
858 |
|
push prm4 |
859 |
|
push prm3 |
860 |
|
push prm2 |
861 |
|
push prm1 |
862 |
|
sub _ESP, PTR_SIZE |
863 |
|
mov [_ESP], TMP0 |
864 |
|
%endif |
865 |
|
|
866 |
|
mov _ECX, [_ESP+ 2*PTR_SIZE] ; coeff |
867 |
|
mov _EAX, [_ESP+ 3*PTR_SIZE] ; quant |
868 |
pxor mm0, mm0 |
pxor mm0, mm0 |
869 |
pxor mm2, mm2 |
pxor mm2, mm2 |
870 |
push edi |
push _EDI |
871 |
push ebx |
push _EBX |
872 |
push esi |
push _ESI |
873 |
lea edi, [mmx_mul + eax*8 - 8] ; 2*quant |
%ifdef ARCH_IS_X86_64 |
874 |
mov ebx, mmx_2047 |
lea _EDI, [mmx_mul] |
875 |
|
lea _EDI, [_EDI + _EAX*8 - 8] ; 2*quant |
876 |
|
%else |
877 |
|
lea _EDI, [mmx_mul + _EAX*8 - 8] ; 2*quant |
878 |
|
%endif |
879 |
|
lea _EBX, [mmx_2047] |
880 |
pxor mm7, mm7 |
pxor mm7, mm7 |
881 |
movq mm3, [ecx+120] ;B2 ; c = coeff[i] |
movq mm3, [_ECX+120] ;B2 ; c = coeff[i] |
882 |
pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) |
pcmpeqw mm7, [_ECX+120] ;B6 (c ==0) ? -1 : 0 (1st) |
883 |
lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 |
%ifdef ARCH_IS_X86_64 |
884 |
|
lea r9, [mmx_add] |
885 |
|
lea _EAX, [r9 + _EAX*8 - 8] ; quant or quant-1 |
886 |
|
%else |
887 |
|
lea _EAX, [mmx_add + _EAX*8 - 8] ; quant or quant-1 |
888 |
|
%endif |
889 |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
890 |
mov esi, mmzero |
lea _ESI, [mmzero] |
891 |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
892 |
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) |
pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) |
893 |
psraw mm3, 15 ; sign(c) ;B7 (2nd) |
psraw mm3, 15 ; sign(c) ;B7 (2nd) |
894 |
mov edx, [dword esp+ 4+12] ; data |
mov _EDX, [_ESP+ (1+3)*PTR_SIZE] ; data |
895 |
|
|
896 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
897 |
|
|
898 |
dequant 0 |
dequant 0 |
899 |
dequant 1 |
dequant 1 |
902 |
dequant 4 |
dequant 4 |
903 |
|
|
904 |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
905 |
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) |
pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) |
906 |
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm7, [_EAX] ;B9 offset = isZero ? 0 : quant_add (2nd) |
907 |
mov esi, [esp] |
mov _ESI, [_ESP] |
908 |
pxor mm5, mm4 ;C13 (6th+) |
pxor mm5, mm4 ;C13 (6th+) |
909 |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
910 |
movq [edx+4*24+16], mm5 ;C14 (7th) |
movq [_EDX+4*24+16], mm5 ;C14 (7th) |
911 |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
912 |
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) |
pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) |
913 |
mov ebx, [esp+4] |
mov _EBX, [_ESP+PTR_SIZE] |
914 |
mov edi, [esp+8] |
mov _EDI, [_ESP+2*PTR_SIZE] |
915 |
add esp, byte 12 |
add _ESP, byte 3*PTR_SIZE |
916 |
pxor mm3, mm2 ;B13 (6th+) |
pxor mm3, mm2 ;B13 (6th+) |
917 |
movq [edx+4*24+8], mm3 ;B14 (7th) |
movq [_EDX+4*24+8], mm3 ;B14 (7th) |
918 |
|
|
919 |
|
xor _EAX, _EAX |
920 |
|
|
921 |
|
%ifdef ARCH_IS_X86_64 |
922 |
|
mov TMP0, [_ESP] |
923 |
|
%ifndef WINDOWS |
924 |
|
add _ESP, 6*PTR_SIZE |
925 |
|
%else |
926 |
|
add _ESP, 4*PTR_SIZE |
927 |
|
%endif |
928 |
|
mov [_ESP], TMP0 |
929 |
|
%endif |
930 |
|
|
|
xor eax, eax |
|
931 |
ret |
ret |
932 |
|
ENDFUNC |
933 |
|
|
934 |
|
%ifidn __OUTPUT_FORMAT__,elf |
935 |
|
section ".note.GNU-stack" noalloc noexec nowrite progbits |
936 |
|
%endif |
937 |
|
|