5 |
; * |
; * |
6 |
; * Copyright(C) 2002-2003 Jaan Kalda |
; * Copyright(C) 2002-2003 Jaan Kalda |
7 |
; * |
; * |
8 |
; * This program is free software ; you can redistribute it and/or modify |
; * This program is free software ; you can redistribute it and/or modify |
9 |
; * it under the terms of the GNU General Public License as published by |
; * it under the terms of the GNU General Public License as published by |
10 |
; * the Free Software Foundation ; either version 2 of the License, or |
; * the Free Software Foundation ; either version 2 of the License, or |
11 |
; * (at your option) any later version. |
; * (at your option) any later version. |
29 |
; enable dequant saturate [-2048,2047], test purposes only. |
; enable dequant saturate [-2048,2047], test purposes only. |
30 |
%define SATURATE |
%define SATURATE |
31 |
|
|
32 |
BITS 32 |
%include "nasm.inc" |
|
|
|
|
%macro cglobal 1 |
|
|
%ifdef PREFIX |
|
|
global _%1 |
|
|
%define %1 _%1 |
|
|
%else |
|
|
global %1 |
|
|
%endif |
|
|
%endmacro |
|
33 |
|
|
34 |
;============================================================================= |
;============================================================================= |
35 |
; Local data |
; Local data |
36 |
;============================================================================= |
;============================================================================= |
37 |
|
|
38 |
%ifdef FORMAT_COFF |
DATA |
|
SECTION .rodata data |
|
|
%else |
|
|
SECTION .rodata data align=16 |
|
|
%endif |
|
39 |
|
|
40 |
align 4 |
align SECTION_ALIGN |
41 |
int_div: |
int_div: |
42 |
dd 0 |
dd 0 |
43 |
%assign i 1 |
%assign i 1 |
46 |
%assign i i+1 |
%assign i i+1 |
47 |
%endrep |
%endrep |
48 |
|
|
49 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
50 |
plus_one: |
plus_one: |
51 |
times 8 dw 1 |
times 8 dw 1 |
52 |
|
|
54 |
; subtract by Q/2 table |
; subtract by Q/2 table |
55 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
56 |
|
|
57 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
58 |
mmx_sub: |
mmx_sub: |
59 |
%assign i 1 |
%assign i 1 |
60 |
%rep 31 |
%rep 31 |
73 |
; |
; |
74 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
75 |
|
|
76 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
77 |
mmx_div: |
mmx_div: |
78 |
%assign i 1 |
%assign i 1 |
79 |
%rep 31 |
%rep 31 |
85 |
; add by (odd(Q) ? Q : Q - 1) table |
; add by (odd(Q) ? Q : Q - 1) table |
86 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
87 |
|
|
88 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
89 |
mmx_add: |
mmx_add: |
90 |
%assign i 1 |
%assign i 1 |
91 |
%rep 31 |
%rep 31 |
101 |
; multiple by 2Q table |
; multiple by 2Q table |
102 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
103 |
|
|
104 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
105 |
mmx_mul: |
mmx_mul: |
106 |
%assign i 1 |
%assign i 1 |
107 |
%rep 31 |
%rep 31 |
113 |
; saturation limits |
; saturation limits |
114 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
115 |
|
|
116 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
117 |
mmx_32768_minus_2048: |
mmx_32768_minus_2048: |
118 |
times 4 dw (32768-2048) |
times 4 dw (32768-2048) |
119 |
mmx_32767_minus_2047: |
mmx_32767_minus_2047: |
120 |
times 4 dw (32767-2047) |
times 4 dw (32767-2047) |
121 |
|
|
122 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
123 |
mmx_2047: |
mmx_2047: |
124 |
times 4 dw 2047 |
times 4 dw 2047 |
125 |
|
|
126 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
127 |
mmzero: |
mmzero: |
128 |
dd 0, 0 |
dd 0, 0 |
129 |
int2047: |
int2047: |
135 |
; Code |
; Code |
136 |
;============================================================================= |
;============================================================================= |
137 |
|
|
138 |
SECTION .text |
TEXT |
|
|
|
139 |
|
|
140 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
141 |
; |
; |
156 |
psubw mm7, mm6 ;D8 |
psubw mm7, mm6 ;D8 |
157 |
%endif |
%endif |
158 |
|
|
159 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
160 |
db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 |
movq mm4, [_ECX + %1 * 32 +16] ;C1 |
161 |
pmaxsw mm1, mm0 ;A4 |
pmaxsw mm1, mm0 ;A4 |
162 |
db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 |
movq mm6, [_ECX + %1 * 32 +24] ;D1 |
163 |
pmaxsw mm3, mm2 ;B4 |
pmaxsw mm3, mm2 ;B4 |
164 |
|
|
165 |
|
|
166 |
psraw mm0, 15 ;A5 |
psraw mm0, 15 ;A5 |
167 |
psraw mm2, 15 ;B5 |
psraw mm2, 15 ;B5 |
168 |
%if (%1) |
%if (%1) |
169 |
movq [edx + %1 * 32 + 16-32], mm5 ;C9 |
movq [_EDX + %1 * 32 + 16-32], mm5 ;C9 |
170 |
movq [edx + %1 * 32 + 24-32], mm7 ;D9 |
movq [_EDX + %1 * 32 + 24-32], mm7 ;D9 |
171 |
%endif |
%endif |
172 |
|
|
173 |
psrlw mm1, 1 ;A6 |
psrlw mm1, 1 ;A6 |
174 |
psrlw mm3, 1 ;B6 |
psrlw mm3, 1 ;B6 |
175 |
movq mm5, [ebx] ;C2 |
movq mm5, [_EBX] ;C2 |
176 |
movq mm7, [ebx] ;D2 |
movq mm7, [_EBX] ;D2 |
177 |
|
|
178 |
pxor mm1, mm0 ;A7 |
pxor mm1, mm0 ;A7 |
179 |
pxor mm3, mm2 ;B7 |
pxor mm3, mm2 ;B7 |
184 |
psubw mm3, mm2 ;B8 |
psubw mm3, mm2 ;B8 |
185 |
|
|
186 |
%if (%1 == 0) |
%if (%1 == 0) |
187 |
push ebp |
push _EBP |
188 |
movq mm0, [ecx + %1 * 32 +32] |
movq mm0, [_ECX + %1 * 32 +32] |
189 |
%elif (%1 < 3) |
%elif (%1 < 3) |
190 |
db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 |
movq mm0, [_ECX + %1 * 32 +32] ;A1 |
191 |
%endif |
%endif |
192 |
pmaxsw mm5, mm4 ;C4 |
pmaxsw mm5, mm4 ;C4 |
193 |
%if (%1 < 3) |
%if (%1 < 3) |
194 |
db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 |
movq mm2, [_ECX + %1 * 32 +8+32] ;B1 |
195 |
%else |
%else |
196 |
cmp esp, esp |
cmp _ESP, _ESP |
197 |
%endif |
%endif |
198 |
pmaxsw mm7, mm6 ;D4 |
pmaxsw mm7, mm6 ;D4 |
199 |
|
|
200 |
psraw mm4, 15 ;C5 |
psraw mm4, 15 ;C5 |
201 |
psraw mm6, 15 ;D5 |
psraw mm6, 15 ;D5 |
202 |
movq [byte edx + %1 * 32], mm1 ;A9 |
movq [byte _EDX + %1 * 32], mm1 ;A9 |
203 |
movq [edx + %1 * 32+8], mm3 ;B9 |
movq [_EDX + %1 * 32+8], mm3 ;B9 |
204 |
|
|
205 |
|
|
206 |
psrlw mm5, 1 ;C6 |
psrlw mm5, 1 ;C6 |
207 |
psrlw mm7, 1 ;D6 |
psrlw mm7, 1 ;D6 |
208 |
%if (%1 < 3) |
%if (%1 < 3) |
209 |
movq mm1, [ebx] ;A2 |
movq mm1, [_EBX] ;A2 |
210 |
movq mm3, [ebx] ;B2 |
movq mm3, [_EBX] ;B2 |
211 |
%endif |
%endif |
212 |
%if (%1 == 3) |
%if (%1 == 3) |
213 |
imul eax, [int_div+4*edi] |
%ifdef ARCH_IS_X86_64 |
214 |
|
lea r9, [int_div] |
215 |
|
imul eax, dword [r9+4*_EDI] |
216 |
|
%else |
217 |
|
imul _EAX, [int_div+4*_EDI] |
218 |
|
%endif |
219 |
%endif |
%endif |
220 |
pxor mm5, mm4 ;C7 |
pxor mm5, mm4 ;C7 |
221 |
pxor mm7, mm6 ;D7 |
pxor mm7, mm6 ;D7 |
235 |
psubw mm7, mm6 ;D8 |
psubw mm7, mm6 ;D8 |
236 |
%endif |
%endif |
237 |
|
|
238 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
239 |
db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 |
movq mm4, [_ECX + %1 * 32 +16] ;C1 |
240 |
pmaxsw mm1, mm0 ;A4 |
pmaxsw mm1, mm0 ;A4 |
241 |
db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 |
movq mm6, [_ECX + %1 * 32 +24] ;D1 |
242 |
pmaxsw mm3, mm2 ;B4 |
pmaxsw mm3, mm2 ;B4 |
243 |
|
|
244 |
|
|
245 |
psraw mm0, 15 ;A5 |
psraw mm0, 15 ;A5 |
246 |
psraw mm2, 15 ;B5 |
psraw mm2, 15 ;B5 |
247 |
%if (%1) |
%if (%1) |
248 |
movq [edx + %1 * 32 + 16-32], mm5 ;C9 |
movq [_EDX + %1 * 32 + 16-32], mm5 ;C9 |
249 |
movq [edx + %1 * 32 + 24-32], mm7 ;D9 |
movq [_EDX + %1 * 32 + 24-32], mm7 ;D9 |
250 |
%endif |
%endif |
251 |
|
|
252 |
pmulhw mm1, [esi] ;A6 |
pmulhw mm1, [_ESI] ;A6 |
253 |
pmulhw mm3, [esi] ;B6 |
pmulhw mm3, [_ESI] ;B6 |
254 |
movq mm5, [ebx] ;C2 |
movq mm5, [_EBX] ;C2 |
255 |
movq mm7, [ebx] ;D2 |
movq mm7, [_EBX] ;D2 |
256 |
|
|
257 |
nop |
nop |
258 |
nop |
nop |
266 |
|
|
267 |
|
|
268 |
%if (%1 < 3) |
%if (%1 < 3) |
269 |
db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 |
movq mm0, [_ECX + %1 * 32 +32] ;A1 |
270 |
%endif |
%endif |
271 |
pmaxsw mm5, mm4 ;C4 |
pmaxsw mm5, mm4 ;C4 |
272 |
%if (%1 < 3) |
%if (%1 < 3) |
273 |
db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 |
movq mm2, [_ECX + %1 * 32 +8+32] ;B1 |
274 |
%else |
%else |
275 |
cmp esp, esp |
cmp _ESP, _ESP |
276 |
%endif |
%endif |
277 |
pmaxsw mm7,mm6 ;D4 |
pmaxsw mm7,mm6 ;D4 |
278 |
|
|
279 |
psraw mm4, 15 ;C5 |
psraw mm4, 15 ;C5 |
280 |
psraw mm6, 15 ;D5 |
psraw mm6, 15 ;D5 |
281 |
movq [byte edx + %1 * 32], mm1 ;A9 |
movq [byte _EDX + %1 * 32], mm1 ;A9 |
282 |
movq [edx + %1 * 32+8], mm3 ;B9 |
movq [_EDX + %1 * 32+8], mm3 ;B9 |
283 |
|
|
284 |
|
|
285 |
pmulhw mm5, [esi] ;C6 |
pmulhw mm5, [_ESI] ;C6 |
286 |
pmulhw mm7, [esi] ;D6 |
pmulhw mm7, [_ESI] ;D6 |
287 |
%if (%1 < 3) |
%if (%1 < 3) |
288 |
movq mm1, [ebx] ;A2 |
movq mm1, [_EBX] ;A2 |
289 |
movq mm3, [ebx] ;B2 |
movq mm3, [_EBX] ;B2 |
290 |
%endif |
%endif |
291 |
%if (%1 == 0) |
%if (%1 == 0) |
292 |
push ebp |
push _EBP |
293 |
%elif (%1 < 3) |
%elif (%1 < 3) |
294 |
nop |
nop |
295 |
%endif |
%endif |
296 |
nop |
nop |
297 |
%if (%1 == 3) |
%if (%1 == 3) |
298 |
imul eax, [int_div+4*edi] |
%ifdef ARCH_IS_X86_64 |
299 |
|
lea r9, [int_div] |
300 |
|
imul eax, dword [r9+4*_EDI] |
301 |
|
%else |
302 |
|
imul _EAX, [int_div+4*_EDI] |
303 |
|
%endif |
304 |
%endif |
%endif |
305 |
pxor mm5, mm4 ;C7 |
pxor mm5, mm4 ;C7 |
306 |
pxor mm7, mm6 ;D7 |
pxor mm7, mm6 ;D7 |
307 |
%endmacro |
%endmacro |
308 |
|
|
309 |
|
|
310 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
311 |
cglobal quant_h263_intra_3dne |
cglobal quant_h263_intra_3dne |
312 |
quant_h263_intra_3dne: |
quant_h263_intra_3dne: |
313 |
|
|
314 |
mov eax, [esp + 12] ; quant |
%ifdef ARCH_IS_X86_64 |
315 |
mov ecx, [esp + 8] ; data |
mov TMP0, [_ESP] |
316 |
mov edx, [esp + 4] ; coeff |
add _ESP, PTR_SIZE |
317 |
|
%ifndef WINDOWS |
318 |
|
push prm6 |
319 |
|
push prm5 |
320 |
|
%endif |
321 |
|
push prm4 |
322 |
|
push prm3 |
323 |
|
push prm2 |
324 |
|
push prm1 |
325 |
|
sub _ESP, PTR_SIZE |
326 |
|
mov [_ESP], TMP0 |
327 |
|
%endif |
328 |
|
|
329 |
|
mov _EAX, [_ESP + 3*PTR_SIZE] ; quant |
330 |
|
mov _ECX, [_ESP + 2*PTR_SIZE] ; data |
331 |
|
mov _EDX, [_ESP + 1*PTR_SIZE] ; coeff |
332 |
cmp al, 1 |
cmp al, 1 |
333 |
pxor mm1, mm1 |
pxor mm1, mm1 |
334 |
pxor mm3, mm3 |
pxor mm3, mm3 |
335 |
movq mm0, [ecx] ; mm0 = [1st] |
movq mm0, [_ECX] ; mm0 = [1st] |
336 |
movq mm2, [ecx + 8] |
movq mm2, [_ECX + 8] |
337 |
push esi |
push _ESI |
338 |
lea esi, [mmx_div + eax*8 - 8] |
%ifdef ARCH_IS_X86_64 |
339 |
|
lea _ESI, [mmx_div] |
340 |
push ebx |
lea _ESI, [_ESI + _EAX*8 - 8] |
341 |
mov ebx, mmzero |
%else |
342 |
push edi |
lea _ESI, [mmx_div + _EAX*8 - 8] |
343 |
|
%endif |
344 |
|
|
345 |
|
push _EBX |
346 |
|
lea _EBX, [mmzero] |
347 |
|
push _EDI |
348 |
jz near .q1loop |
jz near .q1loop |
349 |
|
|
350 |
quant_intra 0 |
quant_intra 0 |
351 |
mov ebp, [esp + 16 + 16] ; dcscalar |
mov _EBP, [_ESP + (4+4)*PTR_SIZE] ; dcscalar |
352 |
; NB -- there are 3 pushes in the function preamble and one more |
; NB -- there are 3 pushes in the function preamble and one more |
353 |
; in "quant_intra 0", thus an added offset of 16 bytes |
; in "quant_intra 0", thus an added offset of 16 bytes |
354 |
movsx eax, word [byte ecx] ; DC |
movsx _EAX, word [byte _ECX] ; DC |
355 |
|
|
356 |
quant_intra 1 |
quant_intra 1 |
357 |
mov edi, eax |
mov _EDI, _EAX |
358 |
sar edi, 31 ; sign(DC) |
sar _EDI, 31 ; sign(DC) |
359 |
shr ebp, byte 1 ; ebp = dcscalar/2 |
shr _EBP, byte 1 ; _EBP = dcscalar/2 |
360 |
|
|
361 |
quant_intra 2 |
quant_intra 2 |
362 |
sub eax, edi ; DC (+1) |
sub _EAX, _EDI ; DC (+1) |
363 |
xor ebp, edi ; sign(DC) dcscalar /2 (-1) |
xor _EBP, _EDI ; sign(DC) dcscalar /2 (-1) |
364 |
mov edi, [esp + 16 + 16] ; dscalar |
mov _EDI, [_ESP + (4+4)*PTR_SIZE] ; dscalar |
365 |
lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2 |
lea _EAX, [byte _EAX + _EBP] ; DC + sign(DC) dcscalar/2 |
366 |
mov ebp, [byte esp] |
mov _EBP, [byte _ESP] |
367 |
|
|
368 |
quant_intra 3 |
quant_intra 3 |
369 |
psubw mm5, mm4 ;C8 |
psubw mm5, mm4 ;C8 |
370 |
mov esi, [esp + 12] ; pop back the register value |
mov _ESI, [_ESP + 3*PTR_SIZE] ; pop back the register value |
371 |
mov edi, [esp + 4] ; pop back the register value |
mov _EDI, [_ESP + 1*PTR_SIZE] ; pop back the register value |
372 |
sar eax, 16 |
sar _EAX, 16 |
373 |
lea ebx, [byte eax + 1] ; workaround for eax < 0 |
lea _EBX, [byte _EAX + 1] ; workaround for _EAX < 0 |
374 |
cmovs eax, ebx ; conditionnaly move the corrected value |
cmovs _EAX, _EBX ; conditionnaly move the corrected value |
375 |
mov [edx], ax ; coeff[0] = ax |
mov [_EDX], ax ; coeff[0] = ax |
376 |
mov ebx, [esp + 8] ; pop back the register value |
mov _EBX, [_ESP + 2*PTR_SIZE] ; pop back the register value |
377 |
add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 |
add _ESP, byte 4*PTR_SIZE ; "quant_intra 0" pushed _EBP, but we don't restore that one, just correct the stack offset by 16 |
378 |
psubw mm7, mm6 ;D8 |
psubw mm7, mm6 ;D8 |
379 |
movq [edx + 3 * 32 + 16], mm5 ;C9 |
movq [_EDX + 3 * 32 + 16], mm5 ;C9 |
380 |
movq [edx + 3 * 32 + 24], mm7 ;D9 |
movq [_EDX + 3 * 32 + 24], mm7 ;D9 |
381 |
|
|
382 |
|
xor _EAX, _EAX |
383 |
|
|
384 |
|
%ifdef ARCH_IS_X86_64 |
385 |
|
mov TMP0, [_ESP] |
386 |
|
%ifndef WINDOWS |
387 |
|
add _ESP, 6*PTR_SIZE |
388 |
|
%else |
389 |
|
add _ESP, 4*PTR_SIZE |
390 |
|
%endif |
391 |
|
mov [_ESP], TMP0 |
392 |
|
%endif |
393 |
|
|
|
xor eax, eax |
|
394 |
ret |
ret |
395 |
|
|
396 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
397 |
|
|
398 |
.q1loop |
.q1loop: |
399 |
quant_intra1 0 |
quant_intra1 0 |
400 |
mov ebp, [esp + 16 + 16] ; dcscalar |
mov _EBP, [_ESP + (4+4)*PTR_SIZE] ; dcscalar |
401 |
movsx eax, word [byte ecx] ; DC |
movsx _EAX, word [byte _ECX] ; DC |
402 |
|
|
403 |
quant_intra1 1 |
quant_intra1 1 |
404 |
mov edi, eax |
mov _EDI, _EAX |
405 |
sar edi, 31 ; sign(DC) |
sar _EDI, 31 ; sign(DC) |
406 |
shr ebp, byte 1 ; ebp = dcscalar /2 |
shr _EBP, byte 1 ; _EBP = dcscalar /2 |
407 |
|
|
408 |
quant_intra1 2 |
quant_intra1 2 |
409 |
sub eax, edi ; DC (+1) |
sub _EAX, _EDI ; DC (+1) |
410 |
xor ebp, edi ; sign(DC) dcscalar /2 (-1) |
xor _EBP, _EDI ; sign(DC) dcscalar /2 (-1) |
411 |
mov edi, [esp + 16 + 16] ; dcscalar |
mov _EDI, [_ESP + (4+4)*PTR_SIZE] ; dcscalar |
412 |
lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2 |
lea _EAX, [byte _EAX + _EBP] ; DC + sign(DC) dcscalar /2 |
413 |
mov ebp, [byte esp] |
mov _EBP, [byte _ESP] |
414 |
|
|
415 |
quant_intra1 3 |
quant_intra1 3 |
416 |
psubw mm5, mm4 ;C8 |
psubw mm5, mm4 ;C8 |
417 |
mov esi, [dword esp + 12] ; pop back the register value |
mov _ESI, [_ESP + 3*PTR_SIZE] ; pop back the register value |
418 |
mov edi, [esp + 4] ; pop back the register value |
mov _EDI, [_ESP + 1*PTR_SIZE] ; pop back the register value |
419 |
sar eax, 16 |
sar _EAX, 16 |
420 |
lea ebx, [byte eax + 1] ; workaround for eax < 0 |
lea _EBX, [byte _EAX + 1] ; workaround for _EAX < 0 |
421 |
cmovs eax, ebx ; conditionnaly move the corrected value |
cmovs _EAX, _EBX ; conditionnaly move the corrected value |
422 |
mov [edx], ax ; coeff[0] = ax |
mov [_EDX], ax ; coeff[0] = ax |
423 |
mov ebx, [esp + 8] ; pop back the register value |
mov _EBX, [_ESP + 2*PTR_SIZE] ; pop back the register value |
424 |
add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 |
add _ESP, byte 4*PTR_SIZE ; "quant_intra 0" pushed _EBP, but we don't restore that one, just correct the stack offset by 16 |
425 |
psubw mm7, mm6 ;D8 |
psubw mm7, mm6 ;D8 |
426 |
movq [edx + 3 * 32 + 16], mm5 ;C9 |
movq [_EDX + 3 * 32 + 16], mm5 ;C9 |
427 |
movq [edx + 3 * 32 + 24], mm7 ;D9 |
movq [_EDX + 3 * 32 + 24], mm7 ;D9 |
428 |
|
|
429 |
xor eax, eax |
xor _EAX, _EAX |
|
ret |
|
430 |
|
|
431 |
|
%ifdef ARCH_IS_X86_64 |
432 |
|
mov TMP0, [_ESP] |
433 |
|
%ifndef WINDOWS |
434 |
|
add _ESP, 6*PTR_SIZE |
435 |
|
%else |
436 |
|
add _ESP, 4*PTR_SIZE |
437 |
|
%endif |
438 |
|
mov [_ESP], TMP0 |
439 |
|
%endif |
440 |
|
|
441 |
|
ret |
442 |
|
ENDFUNC |
443 |
|
|
444 |
|
|
445 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
455 |
|
|
456 |
|
|
457 |
%macro quantinter 1 |
%macro quantinter 1 |
458 |
movq mm1, [eax] ;A2 |
movq mm1, [_EAX] ;A2 |
459 |
psraw mm3, 15 ;B6 |
psraw mm3, 15 ;B6 |
460 |
%if (%1) |
%if (%1) |
461 |
psubw mm2, mm6 ;C10 |
psubw mm2, mm6 ;C10 |
462 |
%endif |
%endif |
463 |
psubw mm1, mm0 ;A3 |
psubw mm1, mm0 ;A3 |
464 |
pmulhw mm4, mm7 ;B7 |
pmulhw mm4, mm7 ;B7 |
465 |
movq mm6, [ecx + %1*24+16] ;C1 |
movq mm6, [_ECX + %1*24+16] ;C1 |
466 |
pmaxsw mm1, mm0 ;A4 |
pmaxsw mm1, mm0 ;A4 |
467 |
paddw mm5, mm4 ;B8 |
paddw mm5, mm4 ;B8 |
468 |
%if (%1) |
%if (%1) |
469 |
movq [edx + %1*24+16-24], mm2 ;C11 |
movq [_EDX + %1*24+16-24], mm2 ;C11 |
470 |
%endif |
%endif |
471 |
psubusw mm1, [ebx] ;A5 mm0 -= sub (unsigned, dont go < 0) |
psubusw mm1, [_EBX] ;A5 mm0 -= sub (unsigned, dont go < 0) |
472 |
pxor mm4, mm3 ;B9 |
pxor mm4, mm3 ;B9 |
473 |
movq mm2, [eax] ;C2 |
movq mm2, [_EAX] ;C2 |
474 |
psraw mm0, 15 ;A6 |
psraw mm0, 15 ;A6 |
475 |
psubw mm4, mm3 ;B10 |
psubw mm4, mm3 ;B10 |
476 |
psubw mm2, mm6 ;C3 |
psubw mm2, mm6 ;C3 |
477 |
pmulhw mm1, mm7 ;A7 mm0 = (mm0 / 2Q) >> 24 |
pmulhw mm1, mm7 ;A7 mm0 = (mm0 / 2Q) >> 24 |
478 |
movq mm3, [ecx + %1*24+8] ;B1 |
movq mm3, [_ECX + %1*24+8] ;B1 |
479 |
pmaxsw mm2, mm6 ;C4 |
pmaxsw mm2, mm6 ;C4 |
480 |
paddw mm5, mm1 ;A8 sum += mm0 |
paddw mm5, mm1 ;A8 sum += mm0 |
481 |
%if (%1) |
%if (%1) |
482 |
movq [edx + %1*24+8-24], mm4 ;B11 |
movq [_EDX + %1*24+8-24], mm4 ;B11 |
483 |
%else |
%else |
484 |
movq [edx + 120], mm4 ;B11 |
movq [_EDX + 120], mm4 ;B11 |
485 |
%endif |
%endif |
486 |
psubusw mm2, [ebx] ;C5 |
psubusw mm2, [_EBX] ;C5 |
487 |
pxor mm1, mm0 ;A9 mm0 *= sign(mm0) |
pxor mm1, mm0 ;A9 mm0 *= sign(mm0) |
488 |
movq mm4, [eax] ;B2 |
movq mm4, [_EAX] ;B2 |
489 |
psraw mm6, 15 ;C6 |
psraw mm6, 15 ;C6 |
490 |
psubw mm1, mm0 ;A10 undisplace |
psubw mm1, mm0 ;A10 undisplace |
491 |
psubw mm4, mm3 ;B3 |
psubw mm4, mm3 ;B3 |
492 |
pmulhw mm2, mm7 ;C7 |
pmulhw mm2, mm7 ;C7 |
493 |
movq mm0, [ecx + %1*24+24] ;A1 mm0 = [1st] |
movq mm0, [_ECX + %1*24+24] ;A1 mm0 = [1st] |
494 |
pmaxsw mm4, mm3 ;B4 |
pmaxsw mm4, mm3 ;B4 |
495 |
paddw mm5, mm2 ;C8 |
paddw mm5, mm2 ;C8 |
496 |
movq [byte edx + %1*24], mm1 ;A11 |
movq [byte _EDX + %1*24], mm1 ;A11 |
497 |
psubusw mm4, [ebx] ;B5 |
psubusw mm4, [_EBX] ;B5 |
498 |
pxor mm2, mm6 ;C9 |
pxor mm2, mm6 ;C9 |
499 |
%endmacro |
%endmacro |
500 |
|
|
501 |
%macro quantinter1 1 |
%macro quantinter1 1 |
502 |
movq mm0, [byte ecx + %1*16] ;mm0 = [1st] |
movq mm0, [byte _ECX + %1*16] ;mm0 = [1st] |
503 |
movq mm3, [ecx + %1*16+8] ; |
movq mm3, [_ECX + %1*16+8] ; |
504 |
movq mm1, [eax] |
movq mm1, [_EAX] |
505 |
movq mm4, [eax] |
movq mm4, [_EAX] |
506 |
psubw mm1, mm0 |
psubw mm1, mm0 |
507 |
psubw mm4, mm3 |
psubw mm4, mm3 |
508 |
pmaxsw mm1, mm0 |
pmaxsw mm1, mm0 |
519 |
pxor mm4, mm3 ; |
pxor mm4, mm3 ; |
520 |
psubw mm1, mm0 ; undisplace |
psubw mm1, mm0 ; undisplace |
521 |
psubw mm4, mm3 |
psubw mm4, mm3 |
522 |
cmp esp, esp |
cmp _ESP, _ESP |
523 |
movq [byte edx + %1*16], mm1 |
movq [byte _EDX + %1*16], mm1 |
524 |
movq [edx + %1*16+8], mm4 |
movq [_EDX + %1*16+8], mm4 |
525 |
%endmacro |
%endmacro |
526 |
|
|
527 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
528 |
cglobal quant_h263_inter_3dne |
cglobal quant_h263_inter_3dne |
529 |
quant_h263_inter_3dne: |
quant_h263_inter_3dne: |
530 |
mov edx, [esp + 4] ; coeff |
|
531 |
mov ecx, [esp + 8] ; data |
%ifdef ARCH_IS_X86_64 |
532 |
mov eax, [esp + 12] ; quant |
mov TMP0, [_ESP] |
533 |
push ebx |
add _ESP, PTR_SIZE |
534 |
|
%ifndef WINDOWS |
535 |
|
push prm6 |
536 |
|
push prm5 |
537 |
|
%endif |
538 |
|
push prm4 |
539 |
|
push prm3 |
540 |
|
push prm2 |
541 |
|
push prm1 |
542 |
|
sub _ESP, PTR_SIZE |
543 |
|
mov [_ESP], TMP0 |
544 |
|
%endif |
545 |
|
|
546 |
|
mov _EDX, [_ESP + 1*PTR_SIZE] ; coeff |
547 |
|
mov _ECX, [_ESP + 2*PTR_SIZE] ; data |
548 |
|
mov _EAX, [_ESP + 3*PTR_SIZE] ; quant |
549 |
|
push _EBX |
550 |
|
|
551 |
pxor mm5, mm5 ; sum |
pxor mm5, mm5 ; sum |
552 |
nop |
nop |
553 |
lea ebx,[mmx_sub + eax * 8 - 8] ; sub |
%ifdef ARCH_IS_X86_64 |
554 |
movq mm7, [mmx_div + eax * 8 - 8] ; divider |
lea _EBX, [mmx_div] |
555 |
|
movq mm7, [_EBX + _EAX * 8 - 8] |
556 |
|
lea _EBX, [mmx_sub] |
557 |
|
lea _EBX, [_EBX + _EAX * 8 - 8] |
558 |
|
%else |
559 |
|
lea _EBX,[mmx_sub + _EAX * 8 - 8] ; sub |
560 |
|
movq mm7, [mmx_div + _EAX * 8 - 8] ; divider |
561 |
|
%endif |
562 |
|
|
563 |
cmp al, 1 |
cmp al, 1 |
564 |
lea eax, [mmzero] |
lea _EAX, [mmzero] |
565 |
jz near .q1loop |
jz near .q1loop |
566 |
cmp esp, esp |
cmp _ESP, _ESP |
567 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
568 |
movq mm3, [ecx + 120] ;B1 |
movq mm3, [_ECX + 120] ;B1 |
569 |
pxor mm4, mm4 ;B2 |
pxor mm4, mm4 ;B2 |
570 |
psubw mm4, mm3 ;B3 |
psubw mm4, mm3 ;B3 |
571 |
movq mm0, [ecx] ;A1 mm0 = [1st] |
movq mm0, [_ECX] ;A1 mm0 = [1st] |
572 |
pmaxsw mm4, mm3 ;B4 |
pmaxsw mm4, mm3 ;B4 |
573 |
psubusw mm4, [ebx] ;B5 |
psubusw mm4, [_EBX] ;B5 |
574 |
|
|
575 |
quantinter 0 |
quantinter 0 |
576 |
quantinter 1 |
quantinter 1 |
584 |
paddw mm5, mm4 ;B8 |
paddw mm5, mm4 ;B8 |
585 |
pxor mm4, mm3 ;B9 |
pxor mm4, mm3 ;B9 |
586 |
psubw mm4, mm3 ;B10 |
psubw mm4, mm3 ;B10 |
587 |
movq [edx + 4*24+16], mm2 ;C11 |
movq [_EDX + 4*24+16], mm2 ;C11 |
588 |
pop ebx |
pop _EBX |
589 |
movq [edx + 4*24+8], mm4 ;B11 |
movq [_EDX + 4*24+8], mm4 ;B11 |
590 |
pmaddwd mm5, [plus_one] |
pmaddwd mm5, [plus_one] |
591 |
movq mm0, mm5 |
movq mm0, mm5 |
592 |
punpckhdq mm5, mm5 |
punpckhdq mm5, mm5 |
593 |
paddd mm0, mm5 |
paddd mm0, mm5 |
594 |
movd eax, mm0 ; return sum |
movd eax, mm0 ; return sum |
595 |
|
|
596 |
|
%ifdef ARCH_IS_X86_64 |
597 |
|
mov TMP0, [_ESP] |
598 |
|
%ifndef WINDOWS |
599 |
|
add _ESP, 6*PTR_SIZE |
600 |
|
%else |
601 |
|
add _ESP, 4*PTR_SIZE |
602 |
|
%endif |
603 |
|
mov [_ESP], TMP0 |
604 |
|
%endif |
605 |
|
|
606 |
ret |
ret |
607 |
|
|
608 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
609 |
.q1loop |
.q1loop: |
610 |
movq mm6, [byte ebx] |
movq mm6, [byte _EBX] |
611 |
|
|
612 |
quantinter1 0 |
quantinter1 0 |
613 |
quantinter1 1 |
quantinter1 1 |
624 |
paddd mm0, mm5 |
paddd mm0, mm5 |
625 |
movd eax, mm0 ; return sum |
movd eax, mm0 ; return sum |
626 |
|
|
627 |
pop ebx |
pop _EBX |
628 |
|
|
629 |
|
%ifdef ARCH_IS_X86_64 |
630 |
|
mov TMP0, [_ESP] |
631 |
|
%ifndef WINDOWS |
632 |
|
add _ESP, 6*PTR_SIZE |
633 |
|
%else |
634 |
|
add _ESP, 4*PTR_SIZE |
635 |
|
%endif |
636 |
|
mov [_ESP], TMP0 |
637 |
|
%endif |
638 |
|
|
639 |
ret |
ret |
640 |
|
ENDFUNC |
641 |
|
|
642 |
|
|
643 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
644 |
; |
; |
656 |
;This is Athlon-optimized code (ca 106 clk per call) |
;This is Athlon-optimized code (ca 106 clk per call) |
657 |
|
|
658 |
%macro dequant 1 |
%macro dequant 1 |
659 |
movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2 |
movq mm1, [_ECX+%1*24] ; c = coeff[i] ;A2 |
660 |
psubw mm0, mm1 ;-c ;A3 (1st dep) |
psubw mm0, mm1 ;-c ;A3 (1st dep) |
661 |
%if (%1) |
%if (%1) |
662 |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
663 |
%endif |
%endif |
664 |
pmaxsw mm0, mm1 ;|c| ;A4 (2nd) |
pmaxsw mm0, mm1 ;|c| ;A4 (2nd) |
665 |
%if (%1) |
%if (%1) |
666 |
mov ebp, ebp |
mov _EBP, _EBP |
667 |
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later |
pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) later |
668 |
%endif |
%endif |
669 |
movq mm6, [esi] ;0 ;A5 mm6 in use |
movq mm6, [_ESI] ;0 ;A5 mm6 in use |
670 |
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm7, [_EAX] ;B9 offset = isZero ? 0 : quant_add (2nd) |
671 |
%if (%1) |
%if (%1) |
672 |
pxor mm5, mm4 ;C13 (6th+) 1later |
pxor mm5, mm4 ;C13 (6th+) 1later |
673 |
%endif |
%endif |
674 |
movq mm4, [esi] ;C1 ;0 |
movq mm4, [_ESI] ;C1 ;0 |
675 |
mov esp, esp |
mov _ESP, _ESP |
676 |
pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st) |
pcmpeqw mm6, [_ECX+%1*24] ;A6 (c ==0) ? -1 : 0 (1st) |
677 |
ALIGN 4 |
ALIGN SECTION_ALIGN |
678 |
psraw mm1, 15 ; sign(c) ;A7 (2nd) |
psraw mm1, 15 ; sign(c) ;A7 (2nd) |
679 |
%if (%1) |
%if (%1) |
680 |
movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later |
movq [_EDX+%1*24+16-24], mm5 ; C14 (7th) 2later |
681 |
%endif |
%endif |
682 |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
683 |
pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+) |
pmullw mm0, [_EDI] ;*= 2Q ;A8 (3rd+) |
684 |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
685 |
lea ebp, [byte ebp] |
lea _EBP, [byte _EBP] |
686 |
movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i] |
movq mm5, [_ECX+%1*24+16] ;C2 ; c = coeff[i] |
687 |
psubw mm4, mm5 ;-c ;C3 (1st dep) |
psubw mm4, mm5 ;-c ;C3 (1st dep) |
688 |
pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm6, [_EAX] ;A9 offset = isZero ? 0 : quant_add (2nd) |
689 |
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) |
pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) |
690 |
pxor mm3, mm2 ;B13 (6th+) |
pxor mm3, mm2 ;B13 (6th+) |
691 |
movq mm2, [byte esi] ;B1 ;0 |
movq mm2, [byte _ESI] ;B1 ;0 |
692 |
%if (%1) |
%if (%1) |
693 |
movq [edx+%1*24+8-24], mm3 ;B14 (7th) |
movq [_EDX+%1*24+8-24], mm3 ;B14 (7th) |
694 |
%else |
%else |
695 |
movq [edx+120], mm3 |
movq [_EDX+120], mm3 |
696 |
%endif |
%endif |
697 |
pmaxsw mm4, mm5 ;|c| ;C4 (2nd) |
pmaxsw mm4, mm5 ;|c| ;C4 (2nd) |
698 |
paddw mm6, mm1 ;A10 offset +negate back (3rd) |
paddw mm6, mm1 ;A10 offset +negate back (3rd) |
699 |
movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i] |
movq mm3, [_ECX+%1*24 + 8] ;B2 ; c = coeff[i] |
700 |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
701 |
paddw mm0, mm6 ;A11 mm6 free (4th+) |
paddw mm0, mm6 ;A11 mm6 free (4th+) |
702 |
movq mm6, [byte esi] ;0 ;C5 mm6 in use |
movq mm6, [byte _ESI] ;0 ;C5 mm6 in use |
703 |
pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st) |
pcmpeqw mm6, [_ECX+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st) |
704 |
pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+) |
pminsw mm0, [_EBX] ;A12 saturates to +2047 (5th+) |
705 |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
706 |
pxor mm1, mm0 ;A13 (6th+) |
pxor mm1, mm0 ;A13 (6th+) |
707 |
pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+) |
pmullw mm4, [_EDI] ;*= 2Q ;C8 (3rd+) |
708 |
psraw mm5, 15 ; sign(c) ;C7 (2nd) |
psraw mm5, 15 ; sign(c) ;C7 (2nd) |
709 |
movq mm7, [byte esi] ;0 ;B5 mm7 in use |
movq mm7, [byte _ESI] ;0 ;B5 mm7 in use |
710 |
pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st) |
pcmpeqw mm7, [_ECX+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st) |
711 |
%if (%1 < 4) |
%if (%1 < 4) |
712 |
movq mm0, [byte esi] ;A1 ;0 |
movq mm0, [byte _ESI] ;A1 ;0 |
713 |
%endif |
%endif |
714 |
pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm6, [byte _EAX] ;C9 offset = isZero ? 0 : quant_add (2nd) |
715 |
psraw mm3, 15 ;sign(c) ;B7 (2nd) |
psraw mm3, 15 ;sign(c) ;B7 (2nd) |
716 |
movq [byte edx+%1*24], mm1 ;A14 (7th) |
movq [byte _EDX+%1*24], mm1 ;A14 (7th) |
717 |
paddw mm6, mm5 ;C10 offset +negate back (3rd) |
paddw mm6, mm5 ;C10 offset +negate back (3rd) |
718 |
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) |
pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) |
719 |
mov esp, esp |
mov _ESP, _ESP |
720 |
%endmacro |
%endmacro |
721 |
|
|
722 |
|
|
723 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
724 |
cglobal dequant_h263_intra_3dne |
cglobal dequant_h263_intra_3dne |
725 |
dequant_h263_intra_3dne: |
dequant_h263_intra_3dne: |
726 |
mov ecx, [esp+ 8] ; coeff |
|
727 |
mov eax, [esp+12] ; quant |
%ifdef ARCH_IS_X86_64 |
728 |
|
mov TMP0, [_ESP] |
729 |
|
add _ESP, PTR_SIZE |
730 |
|
%ifndef WINDOWS |
731 |
|
push prm6 |
732 |
|
push prm5 |
733 |
|
%endif |
734 |
|
push prm4 |
735 |
|
push prm3 |
736 |
|
push prm2 |
737 |
|
push prm1 |
738 |
|
sub _ESP, PTR_SIZE |
739 |
|
mov [_ESP], TMP0 |
740 |
|
%endif |
741 |
|
|
742 |
|
mov _ECX, [_ESP+ 2*PTR_SIZE] ; coeff |
743 |
|
mov _EAX, [_ESP+ 3*PTR_SIZE] ; quant |
744 |
pxor mm0, mm0 |
pxor mm0, mm0 |
745 |
pxor mm2, mm2 |
pxor mm2, mm2 |
746 |
push edi |
push _EDI |
747 |
push ebx |
push _EBX |
748 |
lea edi, [mmx_mul + eax*8 - 8] ; 2*quant |
%ifdef ARCH_IS_X86_64 |
749 |
push ebp |
lea _EDI, [mmx_mul] |
750 |
mov ebx, mmx_2047 |
lea _EDI, [_EDI + _EAX*8 - 8] ; 2*quant |
751 |
movsx ebp, word [ecx] |
%else |
752 |
lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 |
lea _EDI, [mmx_mul + _EAX*8 - 8] ; 2*quant |
753 |
push esi |
%endif |
754 |
mov esi, mmzero |
push _EBP |
755 |
|
lea _EBX, [mmx_2047] |
756 |
|
movsx _EBP, word [_ECX] |
757 |
|
%ifdef ARCH_IS_X86_64 |
758 |
|
lea r9, [mmx_add] |
759 |
|
lea _EAX, [r9 + _EAX*8 - 8] ; quant or quant-1 |
760 |
|
%else |
761 |
|
lea _EAX, [mmx_add + _EAX*8 - 8] ; quant or quant-1 |
762 |
|
%endif |
763 |
|
push _ESI |
764 |
|
lea _ESI, [mmzero] |
765 |
pxor mm7, mm7 |
pxor mm7, mm7 |
766 |
movq mm3, [ecx+120] ;B2 ; c = coeff[i] |
movq mm3, [_ECX+120] ;B2 ; c = coeff[i] |
767 |
pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) |
pcmpeqw mm7, [_ECX+120] ;B6 (c ==0) ? -1 : 0 (1st) |
768 |
|
|
769 |
imul ebp, [esp+16+16] ; dcscalar |
imul _EBP, [_ESP+(4+4)*PTR_SIZE] ; dcscalar |
770 |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
771 |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
772 |
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) |
pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) |
773 |
psraw mm3, 15 ; sign(c) ;B7 (2nd) |
psraw mm3, 15 ; sign(c) ;B7 (2nd) |
774 |
mov edx, [esp+ 4+16] ; data |
mov _EDX, [_ESP+ (1+4)*PTR_SIZE] ; data |
775 |
|
|
776 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
777 |
dequant 0 |
dequant 0 |
778 |
|
|
779 |
cmp ebp, -2048 |
cmp _EBP, -2048 |
780 |
mov esp, esp |
mov _ESP, _ESP |
781 |
|
|
782 |
dequant 1 |
dequant 1 |
783 |
|
|
784 |
cmovl ebp, [int_2048] |
cmovl _EBP, [int_2048] |
785 |
nop |
nop |
786 |
|
|
787 |
dequant 2 |
dequant 2 |
788 |
|
|
789 |
cmp ebp, 2047 |
cmp _EBP, 2047 |
790 |
mov esp, esp |
mov _ESP, _ESP |
791 |
|
|
792 |
dequant 3 |
dequant 3 |
793 |
|
|
794 |
cmovg ebp, [int2047] |
cmovg _EBP, [int2047] |
795 |
nop |
nop |
796 |
|
|
797 |
dequant 4 |
dequant 4 |
798 |
|
|
799 |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
800 |
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) |
pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) |
801 |
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm7, [_EAX] ;B9 offset = isZero ? 0 : quant_add (2nd) |
802 |
mov eax, ebp |
mov _EAX, _EBP |
803 |
mov esi, [esp] |
mov _ESI, [_ESP] |
804 |
mov ebp, [esp+4] |
mov _EBP, [_ESP+PTR_SIZE] |
805 |
pxor mm5, mm4 ;C13 (6th+) |
pxor mm5, mm4 ;C13 (6th+) |
806 |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
807 |
movq [edx+4*24+16], mm5 ;C14 (7th) |
movq [_EDX+4*24+16], mm5 ;C14 (7th) |
808 |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
809 |
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) |
pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) |
810 |
mov ebx, [esp+8] |
mov _EBX, [_ESP+2*PTR_SIZE] |
811 |
mov edi, [esp+12] |
mov _EDI, [_ESP+3*PTR_SIZE] |
812 |
add esp, byte 16 |
add _ESP, byte 4*PTR_SIZE |
813 |
pxor mm3, mm2 ;B13 (6th+) |
pxor mm3, mm2 ;B13 (6th+) |
814 |
movq [edx+4*24+8], mm3 ;B14 (7th) |
movq [_EDX+4*24+8], mm3 ;B14 (7th) |
815 |
mov [edx], ax |
mov [_EDX], ax |
816 |
|
|
817 |
|
xor _EAX, _EAX |
818 |
|
|
819 |
|
%ifdef ARCH_IS_X86_64 |
820 |
|
mov TMP0, [_ESP] |
821 |
|
%ifndef WINDOWS |
822 |
|
add _ESP, 6*PTR_SIZE |
823 |
|
%else |
824 |
|
add _ESP, 4*PTR_SIZE |
825 |
|
%endif |
826 |
|
mov [_ESP], TMP0 |
827 |
|
%endif |
828 |
|
|
|
xor eax, eax |
|
829 |
ret |
ret |
830 |
|
ENDFUNC |
831 |
|
|
832 |
|
|
833 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
834 |
; |
; |
843 |
; except that we're saturating using 'pminsw' (saves 2 cycles/loop) |
; except that we're saturating using 'pminsw' (saves 2 cycles/loop) |
844 |
; This is Athlon-optimized code (ca 100 clk per call) |
; This is Athlon-optimized code (ca 100 clk per call) |
845 |
|
|
846 |
;-----------------------------------------------------------------------------
; NOTE(review): this region is a flattened two-column diff — each row holds the
; old-revision line, the new-revision line, and the original file's line number,
; all suffixed with a table '|'. Every instruction therefore appears twice.
; Only comments are added below; no instruction text has been changed.
;
; dequant_h263_inter_3dne(data, coeff, quant)
;   data  = first stack arg ([esp+4] before the reg pushes — see '; data' line)
;   coeff = [esp+8], quant = [esp+12] (old column's own comments)
; Dequantises an inter block: |c| is scaled by 2*quant (pmullw with the
; mmx_mul table entry), an offset of quant-or-quant-1 (mmx_add) is added but
; forced to 0 where coeff[i]==0 (pcmpeqw/pandn), the result saturates to
; +2047 (pminsw against mmx_2047), and the sign of c is restored via the
; sign-mask add/xor trick (psraw 15 / paddw / pxor). Returns 0 (xor eax,eax).
; Athlon-tuned; the A/B/C stage tags in the comments track the hand-built
; software pipeline — do not reorder instructions.
;-----------------------------------------------------------------------------
ALIGN 16 |
ALIGN SECTION_ALIGN |
847 |
cglobal dequant_h263_inter_3dne |
cglobal dequant_h263_inter_3dne |
848 |
dequant_h263_inter_3dne: |
dequant_h263_inter_3dne: |
849 |
mov ecx, [esp+ 8] ; coeff |
 |
850 |
; (new column) x86-64 entry shim: stash the return address in TMP0 and push
; the register args (prm1..prm6 — presumably defined in nasm.inc; TODO confirm)
; so the 32-bit [esp+N] addressing below can be reused unchanged.
mov eax, [esp+12] ; quant |
%ifdef ARCH_IS_X86_64 |
851 |
 |
mov TMP0, [_ESP] |
852 |
 |
add _ESP, PTR_SIZE |
853 |
 |
%ifndef WINDOWS |
854 |
 |
push prm6 |
855 |
 |
push prm5 |
856 |
 |
%endif |
857 |
 |
push prm4 |
858 |
 |
push prm3 |
859 |
 |
push prm2 |
860 |
 |
push prm1 |
861 |
 |
sub _ESP, PTR_SIZE |
862 |
 |
mov [_ESP], TMP0 |
863 |
 |
%endif |
864 |
 |
 |
865 |
 |
mov _ECX, [_ESP+ 2*PTR_SIZE] ; coeff |
866 |
 |
mov _EAX, [_ESP+ 3*PTR_SIZE] ; quant |
867 |
pxor mm0, mm0 |
pxor mm0, mm0 |
868 |
pxor mm2, mm2 |
pxor mm2, mm2 |
869 |
; Save the callee-saved regs this routine uses (restored before ret below).
push edi |
push _EDI |
870 |
push ebx |
push _EBX |
871 |
push esi |
push _ESI |
872 |
; mmx_mul is indexed by quant: entry = 2*quant broadcast per word.
; (new column) x86-64 splits the lea so the table base is taken RIP-relative
; before the index is applied — RIP-relative can't combine with an index reg.
lea edi, [mmx_mul + eax*8 - 8] ; 2*quant |
%ifdef ARCH_IS_X86_64 |
873 |
mov ebx, mmx_2047 |
lea _EDI, [mmx_mul] |
874 |
 |
lea _EDI, [_EDI + _EAX*8 - 8] ; 2*quant |
875 |
 |
%else |
876 |
 |
lea _EDI, [mmx_mul + _EAX*8 - 8] ; 2*quant |
877 |
 |
%endif |
878 |
 |
lea _EBX, [mmx_2047] |
879 |
pxor mm7, mm7 |
pxor mm7, mm7 |
880 |
movq mm3, [ecx+120] ;B2 ; c = coeff[i] |
movq mm3, [_ECX+120] ;B2 ; c = coeff[i] |
881 |
pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) |
pcmpeqw mm7, [_ECX+120] ;B6 (c ==0) ? -1 : 0 (1st) |
882 |
lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 |
%ifdef ARCH_IS_X86_64 |
883 |
 |
lea r9, [mmx_add] |
884 |
 |
lea _EAX, [r9 + _EAX*8 - 8] ; quant or quant-1 |
885 |
 |
%else |
886 |
 |
lea _EAX, [mmx_add + _EAX*8 - 8] ; quant or quant-1 |
887 |
 |
%endif |
888 |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
psubw mm2, mm3 ;-c ;B3 (1st dep) |
889 |
mov esi, mmzero |
lea _ESI, [mmzero] |
890 |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
pmaxsw mm2, mm3 ;|c| ;B4 (2nd) |
891 |
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) |
pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) |
892 |
psraw mm3, 15 ; sign(c) ;B7 (2nd) |
psraw mm3, 15 ; sign(c) ;B7 (2nd) |
893 |
mov edx, [dword esp+ 4+12] ; data |
mov _EDX, [_ESP+ (1+3)*PTR_SIZE] ; data |
894 |
 |
 |
895 |
ALIGN 8 |
ALIGN SECTION_ALIGN |
896 |
 |
 |
897 |
; Unrolled main body: each 'dequant n' expands a macro defined earlier in the
; file (not visible in this span) that processes one slice of the 8x8 block.
dequant 0 |
dequant 0 |
898 |
dequant 1 |
dequant 1 |
901 |
dequant 4 |
dequant 4 |
902 |
 |
 |
903 |
; Drain the pipeline: finish the last B/C stages interleaved with restoring
; the saved registers from the stack.
paddw mm4, mm6 ;C11 mm6 free (4th+) |
paddw mm4, mm6 ;C11 mm6 free (4th+) |
904 |
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) |
pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) |
905 |
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm7, [_EAX] ;B9 offset = isZero ? 0 : quant_add (2nd) |
906 |
mov esi, [esp] |
mov _ESI, [_ESP] |
907 |
pxor mm5, mm4 ;C13 (6th+) |
pxor mm5, mm4 ;C13 (6th+) |
908 |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
paddw mm7, mm3 ;B10 offset +negate back (3rd) |
909 |
movq [edx+4*24+16], mm5 ;C14 (7th) |
movq [_EDX+4*24+16], mm5 ;C14 (7th) |
910 |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
paddw mm2, mm7 ;B11 mm7 free (4th+) |
911 |
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) |
pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) |
912 |
mov ebx, [esp+4] |
mov _EBX, [_ESP+PTR_SIZE] |
913 |
mov edi, [esp+8] |
mov _EDI, [_ESP+2*PTR_SIZE] |
914 |
add esp, byte 12 |
add _ESP, byte 3*PTR_SIZE |
915 |
pxor mm3, mm2 ;B13 (6th+) |
pxor mm3, mm2 ;B13 (6th+) |
916 |
movq [edx+4*24+8], mm3 ;B14 (7th) |
movq [_EDX+4*24+8], mm3 ;B14 (7th) |
917 |
 |
 |
918 |
; Return value 0 (both columns; old column zeroes eax just before ret).
 |
xor _EAX, _EAX |
919 |
 |
 |
920 |
; (new column) x86-64 exit shim: discard the spilled args while keeping the
; saved return address on top of the stack (mirrors the entry shim above).
 |
%ifdef ARCH_IS_X86_64 |
921 |
 |
mov TMP0, [_ESP] |
922 |
 |
%ifndef WINDOWS |
923 |
 |
add _ESP, 6*PTR_SIZE |
924 |
 |
%else |
925 |
 |
add _ESP, 4*PTR_SIZE |
926 |
 |
%endif |
927 |
 |
mov [_ESP], TMP0 |
928 |
 |
%endif |
929 |
 |
 |
 |
xor eax, eax |
 |
930 |
ret |
ret |
931 |
 |
ENDFUNC |
932 |
|
|
933 |
|
NON_EXEC_STACK |