1 |
;/************************************************************************** |
;/**************************************************************************** |
2 |
; * |
; * |
3 |
; * XVID MPEG-4 VIDEO CODEC |
; * XVID MPEG-4 VIDEO CODEC |
4 |
; * - mmx quantization/dequantization - |
; * - 3dne Quantization/Dequantization - |
5 |
; * |
; * |
6 |
; * Copyright(C) 2001-2003 XviD Team <xvid-devel@xvid.org> |
; * Copyright (C) 2002-2003 Peter Ross <pross@xvid.org> |
7 |
|
; * 2002 Jaan Kalda |
8 |
; * |
; * |
9 |
; * This program is free software ; you can redistribute it and/or modify |
; * This program is free software ; you can redistribute it and/or modify |
10 |
; * it under the terms of the GNU General Public License as published by |
; * it under the terms of the GNU General Public License as published by |
22 |
; * |
; * |
23 |
; * $Id$ |
; * $Id$ |
24 |
; * |
; * |
25 |
; *************************************************************************/ |
; ***************************************************************************/ |
26 |
;/************************************************************************** |
|
|
; * quant4 bugs have been fixed: (a) overflow bug for matrix elements |
|
|
; * equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw (iSSE) |
|
|
; * and using multiplier 0ffffh instead of 10001h (for matrix element = 1; |
|
|
; * in that case, 1 is added before multiplying, that additional 1 comes |
|
|
; * from intra_matrix1); (b) rounding error for large coefficients and matrix |
|
|
; * elements is fixed by two-step approach: first approximation (rounded |
|
|
; * down) is found as usual; the result is multiplied by the matrix element |
|
|
; * and mismatch is used to calculate the correction. |
|
|
; *************************************************************************/ |
|
27 |
; _3dne functions are compatible with iSSE, but are optimized specifically |
; _3dne functions are compatible with iSSE, but are optimized specifically |
28 |
; for K7 pipelines |
; for K7 pipelines |
|
; |
|
|
;--------------------------------------------------------------------------- |
|
|
; 09.12.2002 Athlon optimizations contributed by Jaan Kalda |
|
|
;--------------------------------------------------------------------------- |
|
|
|
|
29 |
|
|
|
; data/text alignment |
|
|
%define ALIGN 8 |
|
30 |
%define SATURATE |
%define SATURATE |
31 |
|
|
32 |
bits 32 |
BITS 32 |
|
|
|
|
%ifdef FORMAT_COFF |
|
|
SECTION .data data |
|
|
%else |
|
|
SECTION .data data align=8 |
|
|
%endif |
|
33 |
|
|
34 |
%macro cglobal 1 |
%macro cglobal 1 |
35 |
%ifdef PREFIX |
%ifdef PREFIX |
36 |
|
%ifdef MARK_FUNCS |
37 |
|
global _%1:function |
38 |
|
%define %1 _%1:function |
39 |
|
%else |
40 |
global _%1 |
global _%1 |
41 |
%define %1 _%1 |
%define %1 _%1 |
42 |
|
%endif |
43 |
|
%else |
44 |
|
%ifdef MARK_FUNCS |
45 |
|
global %1:function |
46 |
%else |
%else |
47 |
global %1 |
global %1 |
48 |
%endif |
%endif |
49 |
|
%endif |
50 |
%endmacro |
%endmacro |
51 |
|
|
52 |
%macro cextern 1 |
%macro cextern 1 |
57 |
extern %1 |
extern %1 |
58 |
%endif |
%endif |
59 |
%endmacro |
%endmacro |
|
align 8 |
|
|
mmzero dd 0,0 |
|
60 |
|
|
61 |
mmx_one times 4 dw 1 |
;============================================================================= |
62 |
|
; Local data |
63 |
|
;============================================================================= |
64 |
|
|
65 |
;=========================================================================== |
%ifdef FORMAT_COFF |
66 |
; |
SECTION .rodata |
67 |
|
%else |
68 |
|
SECTION .rodata align=16 |
69 |
|
%endif |
70 |
|
|
71 |
|
ALIGN 8 |
72 |
|
mmzero: |
73 |
|
dd 0,0 |
74 |
|
mmx_one: |
75 |
|
times 4 dw 1 |
76 |
|
|
77 |
|
;----------------------------------------------------------------------------- |
78 |
; divide by 2Q table |
; divide by 2Q table |
79 |
; |
;----------------------------------------------------------------------------- |
|
;=========================================================================== |
|
80 |
|
|
81 |
align ALIGN |
ALIGN 16 |
82 |
mmx_divs ;i>2 |
mmx_divs: ;i>2 |
83 |
%assign i 1 |
%assign i 1 |
84 |
%rep 31 |
%rep 31 |
85 |
times 4 dw ((1 << 15) / i + 1) |
times 4 dw ((1 << 15) / i + 1) |
86 |
%assign i i+1 |
%assign i i+1 |
87 |
%endrep |
%endrep |
88 |
|
|
89 |
align ALIGN |
ALIGN 16 |
90 |
mmx_div ;i>2 |
mmx_div: ;quant>2 |
91 |
%assign i 1 |
times 4 dw 65535 ; the div by 2 formula will overflow for the case |
92 |
|
; quant=1 but we don't care much because quant=1 |
93 |
|
; is handled by a different piece of code that |
94 |
|
; doesn't use this table. |
95 |
|
%assign quant 2 |
96 |
%rep 31 |
%rep 31 |
97 |
times 4 dw ((1 << 16) / i + 1) |
times 4 dw ((1 << 16) / quant + 1) |
98 |
%assign i i+1 |
%assign quant quant+1 |
99 |
%endrep |
%endrep |
100 |
|
|
|
|
|
|
;=========================================================================== |
|
|
; |
|
|
; intra matrix |
|
|
; |
|
|
;=========================================================================== |
|
|
|
|
101 |
%macro FIXX 1 |
%macro FIXX 1 |
102 |
dw (1 << 16) / (%1) + 1 |
dw (1 << 16) / (%1) + 1 |
103 |
%endmacro |
%endmacro |
104 |
|
|
|
cextern intra_matrix_fixl |
|
|
cextern intra_matrix_fix |
|
|
cextern intra_matrix1 |
|
|
cextern intra_matrix |
|
|
|
|
|
;=========================================================================== |
|
|
; |
|
|
; inter matrix |
|
|
; |
|
|
;=========================================================================== |
|
|
|
|
|
cextern inter_matrix1 |
|
|
cextern inter_matrix |
|
|
cextern inter_matrix_fix |
|
|
cextern inter_matrix_fixl |
|
|
|
|
|
|
|
|
%define VM18P 3 |
|
|
%define VM18Q 4 |
|
105 |
%define nop4 db 08Dh,074h,026h,0 |
%define nop4 db 08Dh,074h,026h,0 |
106 |
%define nop3 add esp,byte 0 |
%define nop3 add esp,byte 0 |
107 |
%define nop2 mov esp,esp |
%define nop2 mov esp,esp |
108 |
%define nop7 db 08dh,02ch,02dh,0,0,0,0 |
%define nop7 db 08dh,02ch,02dh,0,0,0,0 |
109 |
%define nop6 add ebp,dword 0 |
%define nop6 add ebp,dword 0 |
110 |
|
|
111 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
|
; |
|
112 |
; quantd table |
; quantd table |
113 |
; |
;----------------------------------------------------------------------------- |
|
;=========================================================================== |
|
114 |
|
|
115 |
|
%define VM18P 3 |
116 |
|
%define VM18Q 4 |
117 |
|
|
118 |
quantd |
ALIGN 16 |
119 |
|
quantd: |
120 |
%assign i 1 |
%assign i 1 |
121 |
%rep 31 |
%rep 31 |
122 |
times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q) |
times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q) |
123 |
%assign i i+1 |
%assign i i+1 |
124 |
%endrep |
%endrep |
125 |
|
|
126 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
|
; |
|
127 |
; multiply by 2Q table |
; multiply by 2Q table |
128 |
; |
;----------------------------------------------------------------------------- |
|
;=========================================================================== |
|
129 |
|
|
130 |
|
ALIGN 16 |
131 |
mmx_mul_quant |
mmx_mul_quant: |
132 |
%assign i 1 |
%assign i 1 |
133 |
%rep 31 |
%rep 31 |
134 |
times 4 dw i |
times 4 dw i |
135 |
%assign i i+1 |
%assign i i+1 |
136 |
%endrep |
%endrep |
137 |
|
|
138 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
|
; |
|
139 |
; saturation limits |
; saturation limits |
140 |
; |
;----------------------------------------------------------------------------- |
|
;=========================================================================== |
|
141 |
|
|
142 |
align 16 |
ALIGN 16 |
143 |
|
mmx_32767_minus_2047: |
144 |
|
times 4 dw (32767-2047) |
145 |
|
mmx_32768_minus_2048: |
146 |
|
times 4 dw (32768-2048) |
147 |
|
mmx_2047: |
148 |
|
times 4 dw 2047 |
149 |
|
mmx_minus_2048: |
150 |
|
times 4 dw (-2048) |
151 |
|
zero: |
152 |
|
times 4 dw 0 |
153 |
|
|
154 |
mmx_32767_minus_2047 times 4 dw (32767-2047) |
int_div: |
|
mmx_32768_minus_2048 times 4 dw (32768-2048) |
|
|
mmx_2047 times 4 dw 2047 |
|
|
mmx_minus_2048 times 4 dw (-2048) |
|
|
zero times 4 dw 0 |
|
|
|
|
|
int_div |
|
155 |
dd 0 |
dd 0 |
156 |
%assign i 1 |
%assign i 1 |
157 |
%rep 255 |
%rep 255 |
159 |
%assign i i+1 |
%assign i i+1 |
160 |
%endrep |
%endrep |
161 |
|
|
162 |
section .text |
;============================================================================= |
163 |
|
; Code |
164 |
|
;============================================================================= |
165 |
|
|
166 |
|
SECTION .text |
167 |
|
|
168 |
|
cglobal quant_mpeg_intra_xmm |
169 |
|
cglobal quant_mpeg_inter_xmm |
170 |
|
cglobal dequant_mpeg_intra_3dne |
171 |
|
cglobal dequant_mpeg_inter_3dne |
172 |
|
|
173 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
174 |
; |
; |
175 |
; void quant4_intra_xmm(int16_t * coeff, |
; uint32_t quant_mpeg_intra_xmm(int16_t * coeff, |
176 |
; const int16_t const * data, |
; const int16_t const * data, |
177 |
; const uint32_t quant, |
; const uint32_t quant, |
178 |
; const uint32_t dcscalar); |
; const uint32_t dcscalar, |
179 |
|
; const uint16_t *mpeg_matrices); |
180 |
; |
; |
181 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
182 |
|
|
183 |
align ALIGN |
ALIGN 16 |
|
cglobal quant_mpeg_intra_xmm |
|
184 |
quant_mpeg_intra_xmm: |
quant_mpeg_intra_xmm: |
185 |
mov eax, [esp + 8] ; data |
mov eax, [esp + 8] ; data |
186 |
mov ecx, [esp + 12] ; quant |
mov ecx, [esp + 12] ; quant |
189 |
push edi |
push edi |
190 |
push ebx |
push ebx |
191 |
nop |
nop |
192 |
mov edi,mmzero |
mov edi, [esp + 12 + 20] ; mpeg_quant_matrices |
193 |
mov esi,-14 |
mov esi,-14 |
194 |
pxor mm0,mm0 |
pxor mm0,mm0 |
195 |
pxor mm3,mm3 |
pxor mm3,mm3 |
199 |
jg near .lloop |
jg near .lloop |
200 |
nop6 |
nop6 |
201 |
|
|
202 |
|
ALIGN 16 |
|
align ALIGN |
|
203 |
.loop |
.loop |
204 |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
205 |
psubw mm0, mm1 ;-mm1 |
psubw mm0, mm1 ;-mm1 |
212 |
psraw mm4, 15 |
psraw mm4, 15 |
213 |
psllw mm0, 4 ;level << 4 ; |
psllw mm0, 4 ;level << 4 ; |
214 |
psllw mm3, 4 |
psllw mm3, 4 |
215 |
paddw mm0, [intra_matrix1 + 8*esi+112] |
paddw mm0, [edi + 128 + 8*esi+112] |
216 |
paddw mm3, [intra_matrix1 + 8*esi+120] |
paddw mm3, [edi + 128 + 8*esi+120] |
217 |
movq mm5, [intra_matrix_fixl + 8*esi+112] |
movq mm5, [edi + 384 + 8*esi+112] |
218 |
movq mm7, [intra_matrix_fixl + 8*esi+120] |
movq mm7, [edi + 384 + 8*esi+120] |
219 |
pmulhuw mm5, mm0 |
pmulhuw mm5, mm0 |
220 |
pmulhuw mm7, mm3 |
pmulhuw mm7, mm3 |
221 |
mov esp, esp |
mov esp, esp |
222 |
movq mm2, [intra_matrix + 8*esi+112] |
movq mm2, [edi + 8*esi+112] |
223 |
movq mm6, [intra_matrix + 8*esi+120] |
movq mm6, [edi + 8*esi+120] |
224 |
pmullw mm2, mm5 |
pmullw mm2, mm5 |
225 |
pmullw mm6, mm7 |
pmullw mm6, mm7 |
226 |
psubw mm0, mm2 |
psubw mm0, mm2 |
231 |
paddw mm5, mm2 |
paddw mm5, mm2 |
232 |
paddw mm7, mm2 |
paddw mm7, mm2 |
233 |
mov esp, esp |
mov esp, esp |
234 |
pmulhuw mm0, [intra_matrix_fix + 8*esi+112] |
pmulhuw mm0, [edi + 256 + 8*esi+112] |
235 |
pmulhuw mm3, [intra_matrix_fix + 8*esi+120] |
pmulhuw mm3, [edi + 256 + 8*esi+120] |
236 |
paddw mm5, mm0 |
paddw mm5, mm0 |
237 |
paddw mm7, mm3 |
paddw mm7, mm3 |
238 |
movq mm0, [edi] |
pxor mm0, mm0 |
239 |
movq mm3, [edi] |
pxor mm3, mm3 |
240 |
pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 |
pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 |
241 |
pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32) |
pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32) |
242 |
pxor mm5, mm1 ; mm0 *= sign(mm0) |
pxor mm5, mm1 ; mm0 *= sign(mm0) |
273 |
add esp, byte 12 |
add esp, byte 12 |
274 |
mov [edx], cx ; coeff[0] = ax |
mov [edx], cx ; coeff[0] = ax |
275 |
|
|
276 |
|
xor eax, eax |
277 |
ret |
ret |
278 |
|
|
279 |
align ALIGN |
ALIGN 16 |
280 |
.q1loop |
.q1loop |
281 |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
282 |
psubw mm0, mm1 ;-mm1 |
psubw mm0, mm1 ;-mm1 |
289 |
psraw mm4, 15 |
psraw mm4, 15 |
290 |
psllw mm0, 4 ; level << 4 |
psllw mm0, 4 ; level << 4 |
291 |
psllw mm3, 4 |
psllw mm3, 4 |
292 |
paddw mm0, [intra_matrix1 + 8*esi+112] ;mm0 is to be divided |
paddw mm0, [edi + 128 + 8*esi+112] ;mm0 is to be divided |
293 |
paddw mm3, [intra_matrix1 + 8*esi+120] ;intra1 contains fix for division by 1 |
paddw mm3, [edi + 128 + 8*esi+120] ;intra1 contains fix for division by 1 |
294 |
movq mm5, [intra_matrix_fixl + 8*esi+112] ;with rounding down |
movq mm5, [edi + 384 + 8*esi+112] ;with rounding down |
295 |
movq mm7, [intra_matrix_fixl + 8*esi+120] |
movq mm7, [edi + 384 + 8*esi+120] |
296 |
pmulhuw mm5, mm0 |
pmulhuw mm5, mm0 |
297 |
pmulhuw mm7, mm3 ;mm7: first approx of division |
pmulhuw mm7, mm3 ;mm7: first approx of division |
298 |
mov esp, esp |
mov esp, esp |
299 |
movq mm2, [intra_matrix + 8*esi+112] |
movq mm2, [edi + 8*esi+112] |
300 |
movq mm6, [intra_matrix + 8*esi+120] ; divs for q<=16 |
movq mm6, [edi + 8*esi+120] ; divs for q<=16 |
301 |
pmullw mm2, mm5 ;test value <= original |
pmullw mm2, mm5 ;test value <= original |
302 |
pmullw mm6, mm7 |
pmullw mm6, mm7 |
303 |
psubw mm0, mm2 ;mismatch |
psubw mm0, mm2 ;mismatch |
307 |
paddw mm5, mm2 ;first approx with quantd |
paddw mm5, mm2 ;first approx with quantd |
308 |
paddw mm7, mm2 |
paddw mm7, mm2 |
309 |
mov esp, esp |
mov esp, esp |
310 |
pmulhuw mm0, [intra_matrix_fix + 8*esi+112] ;correction |
pmulhuw mm0, [edi + 256 + 8*esi+112] ;correction |
311 |
pmulhuw mm3, [intra_matrix_fix + 8*esi+120] |
pmulhuw mm3, [edi + 256 + 8*esi+120] |
312 |
paddw mm5, mm0 ;final result with quantd |
paddw mm5, mm0 ;final result with quantd |
313 |
paddw mm7, mm3 |
paddw mm7, mm3 |
314 |
movq mm0, [edi] |
pxor mm0, mm0 |
315 |
movq mm3, [edi] |
pxor mm3, mm3 |
316 |
mov esp, esp |
mov esp, esp |
317 |
psrlw mm5, 1 ; (level + quantd) /2 (quant = 1) |
psrlw mm5, 1 ; (level + quantd) /2 (quant = 1) |
318 |
psrlw mm7, 1 |
psrlw mm7, 1 |
326 |
jng near .q1loop |
jng near .q1loop |
327 |
jmp near .done |
jmp near .done |
328 |
|
|
329 |
align 8 |
ALIGN 8 |
330 |
.lloop |
.lloop |
331 |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
332 |
psubw mm0, mm1 ;-mm1 |
psubw mm0, mm1 ;-mm1 |
333 |
movq mm4, [eax + 8*esi+120] ; |
movq mm4, [eax + 8*esi+120] |
334 |
psubw mm3, mm4 ;-mm4 |
psubw mm3, mm4 ;-mm4 |
335 |
pmaxsw mm0, mm1 ;|src| |
pmaxsw mm0, mm1 ;|src| |
336 |
pmaxsw mm3, mm4 |
pmaxsw mm3, mm4 |
339 |
psraw mm4, 15 |
psraw mm4, 15 |
340 |
psllw mm0, 4 ; level << 4 |
psllw mm0, 4 ; level << 4 |
341 |
psllw mm3, 4 ; |
psllw mm3, 4 ; |
342 |
paddw mm0, [intra_matrix1 + 8*esi+112] ;mm0 is to be divided intra1 contains fix for division by 1 |
paddw mm0, [edi + 128 + 8*esi+112] ;mm0 is to be divided intra1 contains fix for division by 1 |
343 |
paddw mm3, [intra_matrix1 + 8*esi+120] |
paddw mm3, [edi + 128 + 8*esi+120] |
344 |
movq mm5, [intra_matrix_fixl + 8*esi+112] |
movq mm5, [edi + 384 + 8*esi+112] |
345 |
movq mm7, [intra_matrix_fixl + 8*esi+120] |
movq mm7, [edi + 384 + 8*esi+120] |
346 |
pmulhuw mm5, mm0 |
pmulhuw mm5, mm0 |
347 |
pmulhuw mm7, mm3 ;mm7: first approx of division |
pmulhuw mm7, mm3 ;mm7: first approx of division |
348 |
mov esp, esp |
mov esp, esp |
349 |
movq mm2, [intra_matrix + 8*esi+112] |
movq mm2, [edi + 8*esi+112] |
350 |
movq mm6, [intra_matrix + 8*esi+120] |
movq mm6, [edi + 8*esi+120] |
351 |
pmullw mm2, mm5 ;test value <= original |
pmullw mm2, mm5 ;test value <= original |
352 |
pmullw mm6, mm7 |
pmullw mm6, mm7 |
353 |
psubw mm0, mm2 ;mismatch |
psubw mm0, mm2 ;mismatch |
358 |
paddw mm5, mm2 ;first approx with quantd |
paddw mm5, mm2 ;first approx with quantd |
359 |
paddw mm7, mm2 |
paddw mm7, mm2 |
360 |
mov esp, esp |
mov esp, esp |
361 |
pmulhuw mm0, [intra_matrix_fix + 8*esi+112] ;correction |
pmulhuw mm0, [edi + 256 + 8*esi+112] ;correction |
362 |
pmulhuw mm3, [intra_matrix_fix + 8*esi+120] |
pmulhuw mm3, [edi + 256 + 8*esi+120] |
363 |
paddw mm5, mm0 ;final result with quantd |
paddw mm5, mm0 ;final result with quantd |
364 |
paddw mm7, mm3 |
paddw mm7, mm3 |
365 |
movq mm0, [edi] |
pxor mm0, mm0 |
366 |
movq mm3, [edi] |
pxor mm3, mm3 |
367 |
mov esp, esp |
mov esp, esp |
368 |
pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 |
pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 |
369 |
pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32) |
pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32) |
379 |
jng near .lloop |
jng near .lloop |
380 |
jmp near .done |
jmp near .done |
381 |
|
|
382 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
383 |
; |
; |
384 |
; uint32_t quant4_inter_xmm(int16_t * coeff, |
; uint32_t quant_mpeg_inter_xmm(int16_t * coeff, |
385 |
; const int16_t const * data, |
; const int16_t const * data, |
386 |
; const uint32_t quant); |
; const uint32_t quant, |
387 |
|
; const uint16_t *mpeg_matrices); |
388 |
; |
; |
389 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
390 |
|
|
391 |
align ALIGN |
ALIGN 16 |
|
cglobal quant_mpeg_inter_xmm |
|
392 |
quant_mpeg_inter_xmm: |
quant_mpeg_inter_xmm: |
393 |
mov eax, [esp + 8] ; data |
mov eax, [esp + 8] ; data |
394 |
mov ecx, [esp + 12] ; quant |
mov ecx, [esp + 12] ; quant |
397 |
push edi |
push edi |
398 |
push ebx |
push ebx |
399 |
nop |
nop |
400 |
mov edi,mmzero |
mov edi, [esp + 12 + 16] |
401 |
mov esi,-14 |
mov esi,-14 |
402 |
mov ebx,esp |
mov ebx,esp |
403 |
sub esp,byte 24 |
sub esp,byte 24 |
404 |
lea ebx,[esp+8] |
lea ebx,[esp+8] |
405 |
and ebx,byte -8 ;align 8 |
and ebx, byte -8 ;ALIGN 8 |
406 |
pxor mm0,mm0 |
pxor mm0,mm0 |
407 |
pxor mm3,mm3 |
pxor mm3,mm3 |
408 |
movq [byte ebx],mm0 |
movq [byte ebx],mm0 |
413 |
jg near .lloop |
jg near .lloop |
414 |
nop |
nop |
415 |
|
|
416 |
align ALIGN |
ALIGN 16 |
417 |
.loop |
.loop |
418 |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
419 |
psubw mm0,mm1 ;-mm1 |
psubw mm0,mm1 ;-mm1 |
426 |
psraw mm4,15 |
psraw mm4,15 |
427 |
psllw mm0, 4 ; level << 4 |
psllw mm0, 4 ; level << 4 |
428 |
psllw mm3, 4 ; |
psllw mm3, 4 ; |
429 |
paddw mm0, [inter_matrix1 + 8*esi+112] |
paddw mm0, [edi + 640 + 8*esi+112] |
430 |
paddw mm3, [inter_matrix1 + 8*esi+120] |
paddw mm3, [edi + 640 + 8*esi+120] |
431 |
movq mm5,[inter_matrix_fixl + 8*esi+112] |
movq mm5, [edi + 896 + 8*esi+112] |
432 |
movq mm7,[inter_matrix_fixl + 8*esi+120] |
movq mm7, [edi + 896 + 8*esi+120] |
433 |
pmulhuw mm5,mm0 |
pmulhuw mm5,mm0 |
434 |
pmulhuw mm7,mm3 |
pmulhuw mm7,mm3 |
435 |
mov esp,esp |
mov esp,esp |
436 |
movq mm2,[inter_matrix + 8*esi+112] |
movq mm2, [edi + 512 + 8*esi+112] |
437 |
movq mm6,[inter_matrix + 8*esi+120] |
movq mm6, [edi + 512 + 8*esi+120] |
438 |
pmullw mm2,mm5 |
pmullw mm2,mm5 |
439 |
pmullw mm6,mm7 |
pmullw mm6,mm7 |
440 |
psubw mm0,mm2 |
psubw mm0,mm2 |
441 |
psubw mm3,mm6 |
psubw mm3,mm6 |
442 |
movq mm2,[byte ebx] |
movq mm2,[byte ebx] |
443 |
movq mm6,[mmx_divs + ecx * 8 - 8] |
movq mm6,[mmx_divs + ecx * 8 - 8] |
444 |
pmulhuw mm0,[inter_matrix_fix + 8*esi+112] |
pmulhuw mm0, [edi + 768 + 8*esi+112] |
445 |
pmulhuw mm3,[inter_matrix_fix + 8*esi+120] |
pmulhuw mm3, [edi + 768 + 8*esi+120] |
446 |
paddw mm2,[ebx+8] ;sum |
paddw mm2,[ebx+8] ;sum |
447 |
paddw mm5,mm0 |
paddw mm5,mm0 |
448 |
paddw mm7,mm3 |
paddw mm7,mm3 |
449 |
movq mm0,[edi] |
pxor mm0, mm0 |
450 |
movq mm3,[edi] |
pxor mm3, mm3 |
451 |
pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 |
pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 |
452 |
pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32) |
pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32) |
453 |
add esi,byte 2 |
add esi,byte 2 |
477 |
|
|
478 |
ret |
ret |
479 |
|
|
480 |
align ALIGN |
ALIGN 16 |
481 |
.q1loop |
.q1loop |
482 |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
483 |
psubw mm0,mm1 ;-mm1 |
psubw mm0,mm1 ;-mm1 |
484 |
movq mm4, [eax + 8*esi+120] ; |
movq mm4, [eax + 8*esi+120] |
485 |
psubw mm3,mm4 ;-mm4 |
psubw mm3,mm4 ;-mm4 |
486 |
pmaxsw mm0,mm1 ;|src| |
pmaxsw mm0,mm1 ;|src| |
487 |
pmaxsw mm3,mm4 |
pmaxsw mm3,mm4 |
490 |
psraw mm4,15 |
psraw mm4,15 |
491 |
psllw mm0, 4 ; level << 4 |
psllw mm0, 4 ; level << 4 |
492 |
psllw mm3, 4 |
psllw mm3, 4 |
493 |
paddw mm0, [inter_matrix1 + 8*esi+112] ;mm0 is to be divided |
paddw mm0, [edi + 640 + 8*esi+112] ;mm0 is to be divided |
494 |
paddw mm3, [inter_matrix1 + 8*esi+120] ; inter1 contains fix for division by 1 |
paddw mm3, [edi + 640 + 8*esi+120] ; inter1 contains fix for division by 1 |
495 |
movq mm5,[inter_matrix_fixl + 8*esi+112] ;with rounding down |
movq mm5, [edi + 896 + 8*esi+112] ;with rounding down |
496 |
movq mm7,[inter_matrix_fixl + 8*esi+120] |
movq mm7, [edi + 896 + 8*esi+120] |
497 |
pmulhuw mm5,mm0 |
pmulhuw mm5,mm0 |
498 |
pmulhuw mm7,mm3 ;mm7: first approx of division |
pmulhuw mm7,mm3 ;mm7: first approx of division |
499 |
mov esp,esp |
mov esp,esp |
500 |
movq mm2,[inter_matrix + 8*esi+112] |
movq mm2, [edi + 512 + 8*esi+112] |
501 |
movq mm6,[inter_matrix + 8*esi+120] ; divs for q<=16 |
movq mm6, [edi + 512 + 8*esi+120] ; divs for q<=16 |
502 |
pmullw mm2,mm5 ;test value <= original |
pmullw mm2,mm5 ;test value <= original |
503 |
pmullw mm6,mm7 |
pmullw mm6,mm7 |
504 |
psubw mm0,mm2 ;mismatch |
psubw mm0,mm2 ;mismatch |
505 |
psubw mm3,mm6 |
psubw mm3,mm6 |
506 |
movq mm2,[byte ebx] |
movq mm2,[byte ebx] |
507 |
pmulhuw mm0,[inter_matrix_fix + 8*esi+112] ;correction |
pmulhuw mm0, [edi + 768 + 8*esi+112] ;correction |
508 |
pmulhuw mm3,[inter_matrix_fix + 8*esi+120] |
pmulhuw mm3, [edi + 768 + 8*esi+120] |
509 |
paddw mm2,[ebx+8] ;sum |
paddw mm2,[ebx+8] ;sum |
510 |
paddw mm5,mm0 ;final result |
paddw mm5,mm0 ;final result |
511 |
paddw mm7,mm3 |
paddw mm7,mm3 |
512 |
movq mm0,[edi] |
pxor mm0, mm0 |
513 |
movq mm3,[edi] |
pxor mm3, mm3 |
514 |
psrlw mm5, 1 ; (level ) /2 (quant = 1) |
psrlw mm5, 1 ; (level ) /2 (quant = 1) |
515 |
psrlw mm7, 1 |
psrlw mm7, 1 |
516 |
add esi,byte 2 |
add esi,byte 2 |
526 |
jng near .q1loop |
jng near .q1loop |
527 |
jmp near .done |
jmp near .done |
528 |
|
|
529 |
align 8 |
ALIGN 8 |
530 |
.lloop |
.lloop |
531 |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
532 |
psubw mm0,mm1 ;-mm1 |
psubw mm0,mm1 ;-mm1 |
533 |
movq mm4, [eax + 8*esi+120] ; |
movq mm4, [eax + 8*esi+120] |
534 |
psubw mm3,mm4 ;-mm4 |
psubw mm3,mm4 ;-mm4 |
535 |
pmaxsw mm0,mm1 ;|src| |
pmaxsw mm0,mm1 ;|src| |
536 |
pmaxsw mm3,mm4 |
pmaxsw mm3,mm4 |
539 |
psraw mm4,15 |
psraw mm4,15 |
540 |
psllw mm0, 4 ; level << 4 |
psllw mm0, 4 ; level << 4 |
541 |
psllw mm3, 4 ; |
psllw mm3, 4 ; |
542 |
paddw mm0, [inter_matrix1 + 8*esi+112] ;mm0 is to be divided inter1 contains fix for division by 1 |
paddw mm0, [edi + 640 + 8*esi+112] ;mm0 is to be divided inter1 contains fix for division by 1 |
543 |
paddw mm3, [inter_matrix1 + 8*esi+120] |
paddw mm3, [edi + 640 + 8*esi+120] |
544 |
movq mm5,[inter_matrix_fixl + 8*esi+112] |
movq mm5,[edi + 896 + 8*esi+112] |
545 |
movq mm7,[inter_matrix_fixl + 8*esi+120] |
movq mm7,[edi + 896 + 8*esi+120] |
546 |
pmulhuw mm5,mm0 |
pmulhuw mm5,mm0 |
547 |
pmulhuw mm7,mm3 ;mm7: first approx of division |
pmulhuw mm7,mm3 ;mm7: first approx of division |
548 |
mov esp,esp |
mov esp,esp |
549 |
movq mm2,[inter_matrix + 8*esi+112] |
movq mm2,[edi + 512 + 8*esi+112] |
550 |
movq mm6,[inter_matrix + 8*esi+120] |
movq mm6,[edi + 512 + 8*esi+120] |
551 |
pmullw mm2,mm5 ;test value <= original |
pmullw mm2,mm5 ;test value <= original |
552 |
pmullw mm6,mm7 |
pmullw mm6,mm7 |
553 |
psubw mm0,mm2 ;mismatch |
psubw mm0,mm2 ;mismatch |
554 |
psubw mm3,mm6 |
psubw mm3,mm6 |
555 |
movq mm2,[byte ebx] |
movq mm2,[byte ebx] |
556 |
movq mm6,[mmx_div + ecx * 8 - 8] ; divs for q<=16 |
movq mm6,[mmx_div + ecx * 8 - 8] ; divs for q<=16 |
557 |
pmulhuw mm0,[inter_matrix_fix + 8*esi+112] ;correction |
pmulhuw mm0,[edi + 768 + 8*esi+112] ;correction |
558 |
pmulhuw mm3,[inter_matrix_fix + 8*esi+120] |
pmulhuw mm3,[edi + 768 + 8*esi+120] |
559 |
paddw mm2,[ebx+8] ;sum |
paddw mm2,[ebx+8] ;sum |
560 |
paddw mm5,mm0 ;final result |
paddw mm5,mm0 ;final result |
561 |
paddw mm7,mm3 |
paddw mm7,mm3 |
562 |
movq mm0,[edi] |
pxor mm0,mm0 |
563 |
movq mm3,[edi] |
pxor mm3,mm3 |
564 |
pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 |
pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 |
565 |
pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32) |
pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32) |
566 |
add esi,byte 2 |
add esi,byte 2 |
579 |
jmp near .done |
jmp near .done |
580 |
|
|
581 |
|
|
582 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
583 |
; |
; |
584 |
; void dequant4_intra_mmx(int16_t *data, |
; uint32_t dequant_mpeg_intra_3dne(int16_t *data, |
585 |
; const int16_t const *coeff, |
; const int16_t const *coeff, |
586 |
; const uint32_t quant, |
; const uint32_t quant, |
587 |
; const uint32_t dcscalar); |
; const uint32_t dcscalar, |
588 |
|
; const uint16_t *mpeg_matrices); |
589 |
; |
; |
590 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
591 |
|
|
592 |
; Note: in order to saturate 'easily', we pre-shift the quantifier |
; Note: in order to saturate 'easily', we pre-shift the quantifier |
593 |
; by 4. Then, the high-word of (coeff[]*matrix[i]*quant) are used to |
; by 4. Then, the high-word of (coeff[]*matrix[i]*quant) are used to |
598 |
; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255], |
; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255], |
599 |
; and quant in [1..31]. |
; and quant in [1..31]. |
600 |
; |
; |
601 |
;******************************************************************** |
|
602 |
%macro DEQUANT4INTRAMMX 1 |
%macro DEQUANT4INTRAMMX 1 |
603 |
movq mm1, [byte ecx+ 16 * %1] ; mm0 = c = coeff[i] |
movq mm1, [byte ecx+ 16 * %1] ; mm0 = c = coeff[i] |
604 |
movq mm4, [ecx+ 16 * %1 +8]; mm3 = c' = coeff[i+1] |
movq mm4, [ecx+ 16 * %1 +8]; mm3 = c' = coeff[i+1] |
612 |
movq mm2,[eax+8] ;preshifted quant |
movq mm2,[eax+8] ;preshifted quant |
613 |
movq mm7,[eax+8] |
movq mm7,[eax+8] |
614 |
%endif |
%endif |
615 |
pmullw mm2, [intra_matrix + 16 * %1 ] ; matrix[i]*quant |
pmullw mm2, [edi + 16 * %1 ] ; matrix[i]*quant |
616 |
pmullw mm7, [intra_matrix + 16 * %1 +8] ; matrix[i+1]*quant |
pmullw mm7, [edi + 16 * %1 +8] ; matrix[i+1]*quant |
617 |
movq mm5,mm0 |
movq mm5,mm0 |
618 |
movq mm6,mm3 |
movq mm6,mm3 |
619 |
pmulhw mm0, mm2 ; high of coeff*(matrix*quant) |
pmulhw mm0, mm2 ; high of coeff*(matrix*quant) |
638 |
movq [edx + 16 * %1 +8], mm7 ; data[i+1] |
movq [edx + 16 * %1 +8], mm7 ; data[i+1] |
639 |
%endmacro |
%endmacro |
640 |
|
|
641 |
align 16 |
ALIGN 16 |
|
cglobal dequant_mpeg_intra_3dne |
|
642 |
dequant_mpeg_intra_3dne: |
dequant_mpeg_intra_3dne: |
643 |
mov eax, [esp+12] ; quant |
mov eax, [esp+12] ; quant |
644 |
mov ecx, [esp+8] ; coeff |
mov ecx, [esp+8] ; coeff |
652 |
push esi |
push esi |
653 |
lea eax,[esp-28] |
lea eax,[esp-28] |
654 |
sub esp,byte 32 |
sub esp,byte 32 |
655 |
and eax,byte -8 ;points to qword aligned space on stack |
and eax, byte -8 ;points to qword ALIGNed space on stack |
656 |
movq [eax],mm0 |
movq [eax],mm0 |
657 |
movq [eax+8],mm7 |
movq [eax+8],mm7 |
658 |
imul ebx,[esp+16+8+32] ; dcscalar |
imul ebx,[esp+16+8+32] ; dcscalar |
659 |
movq mm2,mm7 |
movq mm2,mm7 |
660 |
|
push edi |
661 |
|
mov edi, [esp + 32 + 12 + 20] ; mpeg_quant_matrices |
662 |
align 4 |
ALIGN 4 |
663 |
|
|
664 |
DEQUANT4INTRAMMX 0 |
DEQUANT4INTRAMMX 0 |
665 |
|
|
681 |
|
|
682 |
DEQUANT4INTRAMMX 3 |
DEQUANT4INTRAMMX 3 |
683 |
|
|
684 |
mov esi, [esp+32] |
mov esi, [esp+36] |
685 |
mov [byte edx], bx |
mov [byte edx], bx |
686 |
mov ebx, [esp+32+4] |
mov ebx, [esp+36+4] |
687 |
|
|
688 |
DEQUANT4INTRAMMX 4 |
DEQUANT4INTRAMMX 4 |
689 |
DEQUANT4INTRAMMX 5 |
DEQUANT4INTRAMMX 5 |
690 |
DEQUANT4INTRAMMX 6 |
DEQUANT4INTRAMMX 6 |
691 |
DEQUANT4INTRAMMX 7 |
DEQUANT4INTRAMMX 7 |
692 |
|
|
693 |
|
pop edi |
694 |
|
|
695 |
add esp, byte 32+8 |
add esp, byte 32+8 |
696 |
|
|
697 |
|
xor eax, eax |
698 |
ret |
ret |
699 |
|
|
700 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
701 |
; |
; |
702 |
; void dequant4_inter_3dne(int16_t * data, |
; uint32_t dequant_mpeg_inter_3dne(int16_t * data, |
703 |
; const int16_t * const coeff, |
; const int16_t * const coeff, |
704 |
; const uint32_t quant); |
; const uint32_t quant, |
705 |
|
; const uint16_t *mpeg_matrices); |
706 |
; |
; |
707 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
708 |
|
|
709 |
; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier |
; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier |
710 |
; so we handle the 3 cases: c<0, c==0, and c>0 in one shot. |
; so we handle the 3 cases: c<0, c==0, and c>0 in one shot. |
711 |
; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0. |
; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0. |
712 |
; It's mixed with the extraction of the absolute value. |
; It's mixed with the extraction of the absolute value. |
713 |
|
|
714 |
align 16 |
ALIGN 16 |
|
cglobal dequant_mpeg_inter_3dne |
|
715 |
dequant_mpeg_inter_3dne: |
dequant_mpeg_inter_3dne: |
716 |
mov edx, [esp+ 4] ; data |
mov edx, [esp+ 4] ; data |
717 |
mov ecx, [esp+ 8] ; coeff |
mov ecx, [esp+ 8] ; coeff |
721 |
paddw mm7, mm7 ; << 1 |
paddw mm7, mm7 ; << 1 |
722 |
pxor mm6, mm6 ; mismatch sum |
pxor mm6, mm6 ; mismatch sum |
723 |
push esi |
push esi |
724 |
|
push edi |
725 |
mov esi,mmzero |
mov esi,mmzero |
726 |
pxor mm1,mm1 |
pxor mm1,mm1 |
727 |
pxor mm3,mm3 |
pxor mm3,mm3 |
728 |
|
mov edi, [esp + 8 + 16] ; mpeg_quant_matrices |
729 |
nop |
nop |
730 |
nop4 |
nop4 |
731 |
|
|
732 |
align 16 |
ALIGN 16 |
733 |
.loop |
.loop |
734 |
movq mm0, [ecx+8*eax + 7*16 ] ; mm0 = coeff[i] |
movq mm0, [ecx+8*eax + 7*16 ] ; mm0 = coeff[i] |
735 |
pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved) |
pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved) |
756 |
|
|
757 |
movq mm4, mm7 ; (matrix*quant) |
movq mm4, mm7 ; (matrix*quant) |
758 |
nop |
nop |
759 |
pmullw mm4, [inter_matrix + 8*eax + 7*16] |
pmullw mm4, [edi + 512 + 8*eax + 7*16] |
760 |
movq mm5, mm4 |
movq mm5, mm4 |
761 |
pmulhw mm5, mm0 ; high of c*(matrix*quant) |
pmulhw mm5, mm0 ; high of c*(matrix*quant) |
762 |
pmullw mm0, mm4 ; low of c*(matrix*quant) |
pmullw mm0, mm4 ; low of c*(matrix*quant) |
763 |
|
|
764 |
movq mm4, mm7 ; (matrix*quant) |
movq mm4, mm7 ; (matrix*quant) |
765 |
pmullw mm4, [inter_matrix + 8*eax + 7*16 + 8] |
pmullw mm4, [edi + 512 + 8*eax + 7*16 + 8] |
766 |
add eax,byte 2 |
add eax,byte 2 |
767 |
|
|
768 |
pcmpgtw mm5, [esi] |
pcmpgtw mm5, [esi] |
801 |
pxor mm1, mm2 |
pxor mm1, mm2 |
802 |
pxor mm6, mm1 |
pxor mm6, mm1 |
803 |
movd eax, mm6 |
movd eax, mm6 |
804 |
|
pop edi |
805 |
and eax,byte 1 |
and eax,byte 1 |
806 |
xor eax,byte 1 |
xor eax,byte 1 |
807 |
mov esi,[esp] |
mov esi,[esp] |
808 |
add esp,byte 4 |
add esp,byte 4 |
809 |
xor word [edx + 2*63], ax |
xor word [edx + 2*63], ax |
810 |
|
|
811 |
|
xor eax, eax |
812 |
ret |
ret |