Annotation of /xvidcore/src/quant/x86_asm/quantize_mpeg_mmx.asm

Revision 1.5 - (view) (download)

1 :	edgomez	1.2	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - MMX MPEG Quantization/Dequantization -
5 :			; *
6 :			; * Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
7 :			; * 2002-2003 Michael Militzer <isibaar@xvid.org>
8 :			; * 2002-2003 Pascal Massimino <skal@planet-d.net>
9 :			; *
10 :			; * This program is free software ; you can redistribute it and/or modify
11 :			; * it under the terms of the GNU General Public License as published by
12 :			; * the Free Software Foundation ; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :			; *
15 :			; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :			; *
20 :			; * You should have received a copy of the GNU General Public License
21 :			; * along with this program ; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :			; *
24 :			; * $Id$
25 :			; *
26 :			; *************************************************************************/
27 :
28 :			%define SATURATE
29 :			; NOTE(review): SATURATE is not tested by any %ifdef in the code shown here - confirm before removing
30 :			BITS 32
31 :
32 :			%macro cglobal 1
33 :			%ifdef PREFIX	; PREFIX: the exported symbol gets a leading underscore
34 :	edgomez	1.4	%ifdef MARK_FUNCS	; MARK_FUNCS: also tag the symbol as a function with a size
35 :	edgomez	1.5	global _%1:function _%1.endfunc-_%1	; size = distance from the prefixed label to its local .endfunc
36 :			%define %1 _%1
37 :	edgomez	1.4	%else
38 :			global _%1
39 :			%define %1 _%1
40 :			%endif
41 :	edgomez	1.2	%else
42 :	edgomez	1.4	%ifdef MARK_FUNCS
43 :	edgomez	1.5	global %1:function %1.endfunc-%1	; size = distance from the label to its local .endfunc
44 :	edgomez	1.4	%else
45 :			global %1
46 :			%endif
47 :	edgomez	1.2	%endif
48 :			%endmacro
49 :
50 :			%macro cextern 1
51 :			%ifndef PREFIX	; plain symbol names: import %1 unchanged
52 :			extern %1
53 :			%else	; PREFIX builds: import _%1 and let %1 refer to it
54 :			extern _%1
55 :			%define %1 _%1
56 :			%endif
57 :			%endmacro
58 :
59 :			;=============================================================================
60 :			; Local data (Read Only)
61 :			;=============================================================================
62 :
63 :			%ifdef FORMAT_COFF
64 :	edgomez	1.3	SECTION .rodata
65 :	edgomez	1.2	%else
66 :	edgomez	1.3	SECTION .rodata align=16
67 :	edgomez	1.2	%endif
68 :
69 :			mmx_one:	; four words of 1; pmaddwd against it sums adjacent word pairs
70 :			times 4 dw 1
71 :
72 :			;-----------------------------------------------------------------------------
73 :			; divide by 2Q table: entry for quant q lives at mmx_div + q*8 - 8
74 :			;-----------------------------------------------------------------------------
75 :
76 :			ALIGN 16
77 :			mmx_div:
78 :			times 4 dw 65535 ; the div by 2 formula will overflow for the case
79 :			; quant=1 but we don't care much because quant=1
80 :			; is handled by a different piece of code that
81 :			; doesn't use this table.
82 :			%assign quant 2
83 :			%rep 30
84 :			times 4 dw (1<<17) / (quant*2) + 1	; 2^17/(2*quant) + 1, replicated in all four words
85 :			%assign quant quant+1
86 :			%endrep
87 :
88 :			%define VM18P 3
89 :			%define VM18Q 4
90 :
91 :
92 :			;-----------------------------------------------------------------------------
93 :			; quantd table: rounded VM18P/VM18Q * quant, entry for quant q at quantd + q*8 - 8
94 :			;-----------------------------------------------------------------------------
95 :
96 :			quantd:
97 :			%assign quant 1
98 :			%rep 31
99 :			times 4 dw ((VM18P*quant) + (VM18Q/2)) / VM18Q
100 :			%assign quant quant+1
101 :			%endrep
102 :
103 :			;-----------------------------------------------------------------------------
104 :			; multiplier table: plain quant in all four words (the dequantizers shift it after loading)
105 :			;-----------------------------------------------------------------------------
106 :
107 :			mmx_mul_quant:
108 :			%assign quant 1
109 :			%rep 31
110 :			times 4 dw quant
111 :			%assign quant quant+1
112 :			%endrep
113 :
114 :			;-----------------------------------------------------------------------------
115 :			; saturation limits
116 :			;-----------------------------------------------------------------------------
117 :
118 :			ALIGN 16
119 :
120 :			mmx_32767_minus_2047:	; add then subtract with signed saturation => clamp to <= 2047
121 :			times 4 dw (32767-2047)
122 :			mmx_32768_minus_2048:	; subtract then add with signed saturation => clamp to >= -2048
123 :			times 4 dw (32768-2048)
124 :			mmx_2047:
125 :			times 4 dw 2047
126 :			mmx_minus_2048:
127 :			times 4 dw (-2048)
128 :			zero:	; all-zero qword used as a pcmpgtw memory operand
129 :			times 4 dw 0
130 :
131 :			;=============================================================================
132 :			; Code
133 :			;=============================================================================
134 :
135 :			SECTION .text
136 :
137 :			cglobal quant_mpeg_intra_mmx	; exported entry points defined in this file
138 :			cglobal quant_mpeg_inter_mmx
139 :			cglobal dequant_mpeg_intra_mmx
140 :			cglobal dequant_mpeg_inter_mmx
141 :
142 :			;-----------------------------------------------------------------------------
143 :			;
144 :			; uint32_t quant_mpeg_intra_mmx(int16_t * coeff,
145 :			; const int16_t const * data,
146 :			; const uint32_t quant,
147 :			; const uint32_t dcscalar,
148 :			; const uint16_t *mpeg_matrices);
149 :			;
150 :			;-----------------------------------------------------------------------------
151 :
152 :			ALIGN 16
153 :			quant_mpeg_intra_mmx:
154 :			; Quantizes 64 words from data into coeff (8 passes x 8 words), then recomputes coeff[0] from dcscalar; returns 0.
155 :			push ecx
156 :			push esi
157 :			push edi
158 :			push ebx
159 :
160 :			mov edi, [esp + 16 + 4] ; coeff
161 :			mov esi, [esp + 16 + 8] ; data
162 :			mov eax, [esp + 16 + 12] ; quant
163 :			mov ebx, [esp + 16 + 20] ; mpeg_quant_matrices
164 :
165 :			movq mm5, [quantd + eax * 8 - 8] ; quantd -> mm5
166 :
167 :			xor ecx, ecx ; ecx = qword index into data/coeff/matrices
168 :			cmp al, 1
169 :			jz near .q1loop ; quant == 1: dedicated loop, no mmx_div lookup
170 :
171 :			cmp al, 2
172 :			jz near .q2loop ; quant == 2: dedicated loop, no mmx_div lookup
173 :
174 :			movq mm7, [mmx_div + eax * 8 - 8] ; multipliers[quant] -> mm7
175 :
176 :			ALIGN 16
177 :			.loop
178 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
179 :			movq mm3, [esi + 8*ecx + 8] ; mm3 = [2nd]
180 :			pxor mm1, mm1 ; mm1 = 0
181 :			pxor mm4, mm4
182 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
183 :			pcmpgtw mm4, mm3
184 :			pxor mm0, mm1 ; mm0 = |mm0|
185 :			pxor mm3, mm4 ;
186 :			psubw mm0, mm1 ; displace
187 :			psubw mm3, mm4 ;
188 :			psllw mm0, 4 ; level << 4
189 :			psllw mm3, 4
190 :			movq mm2, [ebx + 8*ecx]
191 :			psrlw mm2, 1 ; intra_matrix[i]>>1
192 :			paddw mm0, mm2
193 :			movq mm2, [ebx + 256 + ecx*8] ; second table, 256 bytes in: presumably fixed-point reciprocals of intra_matrix - TODO confirm
194 :			pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
195 :			movq mm2, [ebx + 8*ecx + 8]
196 :			psrlw mm2, 1
197 :			paddw mm3, mm2
198 :			movq mm2, [ebx + 256 + ecx*8 + 8]
199 :			pmulhw mm3, mm2
200 :			paddw mm0, mm5 ; + quantd
201 :			paddw mm3, mm5
202 :			pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16
203 :			pmulhw mm3, mm7 ;
204 :			psrlw mm0, 1 ; additional shift by 1 => 16 + 1 = 17
205 :			psrlw mm3, 1
206 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
207 :			pxor mm3, mm4 ;
208 :			psubw mm0, mm1 ; undisplace
209 :			psubw mm3, mm4 ;
210 :
211 :			movq [edi + 8*ecx], mm0
212 :			movq [edi + 8*ecx + 8], mm3
213 :
214 :			add ecx,2
215 :			cmp ecx,16
216 :			jnz near .loop
217 :
218 :			.done
219 :			; calculate coeff[0] = data[0] / (int32_t)dcscalar, with +/- dcscalar/2 added first so the truncating idiv rounds to nearest
220 :			mov ecx, [esp + 16 + 16] ; dcscalar
221 :			mov edx, ecx
222 :			movsx eax, word [esi] ; data[0]
223 :			shr edx, 1 ; edx = dcscalar /2
224 :			cmp eax, 0
225 :			jg .gtzero
226 :
227 :			sub eax, edx ; data[0] <= 0: bias downwards
228 :			jmp short .mul
229 :			.gtzero
230 :			add eax, edx ; data[0] > 0: bias upwards
231 :			.mul
232 :			cdq ; expand eax -> edx:eax
233 :			idiv ecx ; eax = edx:eax / dcscalar
234 :
235 :			mov [edi], ax ; coeff[0] = ax
236 :
237 :			pop ebx
238 :			pop edi
239 :			pop esi
240 :			pop ecx
241 :
242 :			xor eax, eax ; return(0);
243 :			ret
244 :
245 :			ALIGN 16
246 :			.q1loop
247 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
248 :			movq mm3, [esi + 8*ecx + 8] ; mm3 = [2nd]
249 :			pxor mm1, mm1 ; mm1 = 0
250 :			pxor mm4, mm4 ;
251 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
252 :			pcmpgtw mm4, mm3 ;
253 :			pxor mm0, mm1 ; mm0 = |mm0|
254 :			pxor mm3, mm4 ;
255 :			psubw mm0, mm1 ; displace
256 :			psubw mm3, mm4 ;
257 :			psllw mm0, 4
258 :			psllw mm3, 4
259 :			movq mm2, [ebx + 8*ecx]
260 :			psrlw mm2, 1
261 :			paddw mm0, mm2
262 :			movq mm2, [ebx + 256 + ecx*8]
263 :			pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
264 :			movq mm2, [ebx + 8*ecx + 8]
265 :			psrlw mm2, 1
266 :			paddw mm3, mm2
267 :			movq mm2, [ebx + 256 + ecx*8 + 8]
268 :			pmulhw mm3, mm2
269 :			paddw mm0, mm5 ; + quantd
270 :			paddw mm3, mm5
271 :			psrlw mm0, 1 ; mm0 >>= 1 (/2)
272 :			psrlw mm3, 1 ;
273 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
274 :			pxor mm3, mm4 ;
275 :			psubw mm0, mm1 ; undisplace
276 :			psubw mm3, mm4 ;
277 :			movq [edi + 8*ecx], mm0
278 :			movq [edi + 8*ecx + 8], mm3
279 :
280 :			add ecx, 2
281 :			cmp ecx, 16
282 :			jnz near .q1loop
283 :			jmp near .done
284 :
285 :
286 :			ALIGN 16
287 :			.q2loop
288 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
289 :			movq mm3, [esi + 8*ecx + 8] ; mm3 = [2nd]
290 :			pxor mm1, mm1 ; mm1 = 0
291 :			pxor mm4, mm4 ;
292 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
293 :			pcmpgtw mm4, mm3 ;
294 :			pxor mm0, mm1 ; mm0 = |mm0|
295 :			pxor mm3, mm4 ;
296 :			psubw mm0, mm1 ; displace
297 :			psubw mm3, mm4 ;
298 :			psllw mm0, 4
299 :			psllw mm3, 4
300 :			movq mm2, [ebx + 8*ecx]
301 :			psrlw mm2, 1
302 :			paddw mm0, mm2
303 :			movq mm2, [ebx + 256 + ecx*8]
304 :			pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
305 :			movq mm2, [ebx + 8*ecx + 8]
306 :			psrlw mm2, 1
307 :			paddw mm3, mm2
308 :			movq mm2, [ebx + 256 + ecx*8 + 8]
309 :			pmulhw mm3, mm2
310 :			paddw mm0, mm5 ; + quantd
311 :			paddw mm3, mm5
312 :			psrlw mm0, 2 ; mm0 >>= 2 (/4)
313 :			psrlw mm3, 2 ;
314 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
315 :			pxor mm3, mm4 ;
316 :			psubw mm0, mm1 ; undisplace
317 :			psubw mm3, mm4 ;
318 :			movq [edi + 8*ecx], mm0
319 :			movq [edi + 8*ecx + 8], mm3
320 :
321 :			add ecx,2
322 :			cmp ecx,16
323 :			jnz near .q2loop
324 :			jmp near .done
325 :	edgomez	1.5	.endfunc
326 :	edgomez	1.2
327 :			;-----------------------------------------------------------------------------
328 :			;
329 :			; uint32_t quant_mpeg_inter_mmx(int16_t * coeff,
330 :			; const int16_t const * data,
331 :			; const uint32_t quant,
332 :			; const uint16_t *mpeg_matrices);
333 :			;
334 :			;-----------------------------------------------------------------------------
335 :
336 :			ALIGN 16
337 :			quant_mpeg_inter_mmx:
338 :			; Quantizes 64 words from data into coeff (8 passes x 8 words); returns the sum of the quantized magnitudes in eax.
339 :			push ecx
340 :			push esi
341 :			push edi
342 :			push ebx
343 :
344 :			mov edi, [esp + 16 + 4] ; coeff
345 :			mov esi, [esp + 16 + 8] ; data
346 :			mov eax, [esp + 16 + 12] ; quant
347 :			mov ebx, [esp + 16 + 16] ; mpeg_quant_matrices
348 :
349 :			xor ecx, ecx ; ecx = qword index into data/coeff/matrices
350 :
351 :			pxor mm5, mm5 ; sum
352 :
353 :			cmp al, 1
354 :			jz near .q1loop ; quant == 1: dedicated loop, no mmx_div lookup
355 :
356 :			cmp al, 2
357 :			jz near .q2loop ; quant == 2: dedicated loop, no mmx_div lookup
358 :
359 :			movq mm7, [mmx_div + eax * 8 - 8] ; divider
360 :
361 :			ALIGN 16
362 :			.loop
363 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
364 :			movq mm3, [esi + 8*ecx + 8] ; mm3 = [2nd]
365 :			pxor mm1, mm1 ; mm1 = 0
366 :			pxor mm4, mm4 ;
367 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
368 :			pcmpgtw mm4, mm3 ;
369 :			pxor mm0, mm1 ; mm0 = |mm0|
370 :			pxor mm3, mm4 ;
371 :			psubw mm0, mm1 ; displace
372 :			psubw mm3, mm4 ;
373 :			psllw mm0, 4
374 :			psllw mm3, 4
375 :			movq mm2, [ebx + 512 + 8*ecx] ; inter tables start 512 bytes into mpeg_quant_matrices
376 :			psrlw mm2, 1
377 :			paddw mm0, mm2
378 :			movq mm2, [ebx + 768 + ecx*8] ; presumably fixed-point reciprocals of inter_matrix - TODO confirm
379 :			pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
380 :			movq mm2, [ebx + 512 + 8*ecx + 8]
381 :			psrlw mm2, 1
382 :			paddw mm3, mm2
383 :			movq mm2, [ebx + 768 + ecx*8 + 8]
384 :			pmulhw mm3, mm2
385 :			pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16
386 :			pmulhw mm3, mm7 ;
387 :			psrlw mm0, 1 ; additional shift by 1 => 16 + 1 = 17
388 :			psrlw mm3, 1
389 :			paddw mm5, mm0 ; sum += mm0 (still unsigned magnitudes here)
390 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
391 :			paddw mm5, mm3 ;
392 :			pxor mm3, mm4 ;
393 :			psubw mm0, mm1 ; undisplace
394 :			psubw mm3, mm4
395 :			movq [edi + 8*ecx], mm0
396 :			movq [edi + 8*ecx + 8], mm3
397 :
398 :			add ecx, 2
399 :			cmp ecx, 16
400 :			jnz near .loop
401 :
402 :			.done
403 :			pmaddwd mm5, [mmx_one] ; fold the four word sums into two dword sums
404 :			movq mm0, mm5
405 :			psrlq mm5, 32
406 :			paddd mm0, mm5 ; low dword = total of both halves
407 :			movd eax, mm0 ; return sum
408 :
409 :			pop ebx
410 :			pop edi
411 :			pop esi
412 :			pop ecx
413 :
414 :			ret
415 :
416 :			ALIGN 16
417 :			.q1loop
418 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
419 :			movq mm3, [esi + 8*ecx+ 8]
420 :			pxor mm1, mm1 ; mm1 = 0
421 :			pxor mm4, mm4 ;
422 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
423 :			pcmpgtw mm4, mm3 ;
424 :			pxor mm0, mm1 ; mm0 = |mm0|
425 :			pxor mm3, mm4 ;
426 :			psubw mm0, mm1 ; displace
427 :			psubw mm3, mm4 ;
428 :			psllw mm0, 4
429 :			psllw mm3, 4
430 :			movq mm2, [ebx + 512 + 8*ecx]
431 :			psrlw mm2, 1
432 :			paddw mm0, mm2
433 :			movq mm2, [ebx + 768 + ecx*8]
434 :			pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
435 :			movq mm2, [ebx + 512 + 8*ecx + 8]
436 :			psrlw mm2, 1
437 :			paddw mm3, mm2
438 :			movq mm2, [ebx + 768 + ecx*8 + 8]
439 :			pmulhw mm3, mm2
440 :			psrlw mm0, 1 ; mm0 >>= 1 (/2)
441 :			psrlw mm3, 1 ;
442 :			paddw mm5, mm0 ; sum += mm0
443 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
444 :			paddw mm5, mm3 ;
445 :			pxor mm3, mm4 ;
446 :			psubw mm0, mm1 ; undisplace
447 :			psubw mm3, mm4
448 :			movq [edi + 8*ecx], mm0
449 :			movq [edi + 8*ecx + 8], mm3
450 :
451 :			add ecx, 2
452 :			cmp ecx, 16
453 :			jnz near .q1loop
454 :
455 :			jmp .done
456 :
457 :			ALIGN 16
458 :			.q2loop
459 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
460 :			movq mm3, [esi + 8*ecx+ 8]
461 :			pxor mm1, mm1 ; mm1 = 0
462 :			pxor mm4, mm4 ;
463 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
464 :			pcmpgtw mm4, mm3 ;
465 :			pxor mm0, mm1 ; mm0 = |mm0|
466 :			pxor mm3, mm4 ;
467 :			psubw mm0, mm1 ; displace
468 :			psubw mm3, mm4 ;
469 :			psllw mm0, 4
470 :			psllw mm3, 4
471 :			movq mm2, [ebx + 512 + 8*ecx]
472 :			psrlw mm2, 1
473 :			paddw mm0, mm2
474 :			movq mm2, [ebx + 768 + ecx*8]
475 :			pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
476 :			movq mm2, [ebx + 512 + 8*ecx + 8]
477 :			psrlw mm2, 1
478 :			paddw mm3, mm2
479 :			movq mm2, [ebx + 768 + ecx*8 + 8]
480 :			pmulhw mm3, mm2
481 :			psrlw mm0, 2 ; mm0 >>= 2 (/4)
482 :			psrlw mm3, 2 ;
483 :			paddw mm5, mm0 ; sum += mm0
484 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
485 :			paddw mm5, mm3 ;
486 :			pxor mm3, mm4 ;
487 :			psubw mm0, mm1 ; undisplace
488 :			psubw mm3, mm4
489 :			movq [edi + 8*ecx], mm0
490 :			movq [edi + 8*ecx + 8], mm3
491 :
492 :			add ecx, 2
493 :			cmp ecx, 16
494 :			jnz near .q2loop
495 :
496 :			jmp .done
497 :	edgomez	1.5	.endfunc
498 :	edgomez	1.2
499 :
500 :			;-----------------------------------------------------------------------------
501 :			;
502 :			; uint32_t dequant_mpeg_intra_mmx(int16_t *data,
503 :			; const int16_t const *coeff,
504 :			; const uint32_t quant,
505 :			; const uint32_t dcscalar,
506 :			; const uint16_t *mpeg_matrices);
507 :			;
508 :			;-----------------------------------------------------------------------------
509 :
510 :			; Note: in order to saturate 'easily', we pre-multiply the quantifier
511 :			; by 4 (shift left by 2). Then, the high-word of (coeff[]*matrix[i]*quant)
512 :			; is used to build a saturating mask. It is non-zero only when an overflow occurred.
513 :			; We thus avoid packing/unpacking toward double-word.
514 :			; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
515 :			; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not
516 :			; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],
517 :			; and quant in [1..31].
518 :			;
519 :			; The original loop is:
520 :			;
521 :			%if 0
522 :			movq mm0, [ecx+8*eax + 8*16] ; mm0 = coeff[i]
523 :			pxor mm1, mm1
524 :			pcmpgtw mm1, mm0
525 :			pxor mm0, mm1 ; change sign if negative
526 :			psubw mm0, mm1 ; -> mm0 = abs(coeff[i]), mm1 = sign of coeff[i]
527 :
528 :			movq mm2, mm7 ; mm2 = quant
529 :			pmullw mm2, [ebx + 8*eax + 8*16 ] ; matrix[i]*quant.
530 :
531 :			movq mm6, mm2
532 :			pmulhw mm2, mm0 ; high of coeff*(matrix*quant) (should be 0 if no overflow)
533 :			pmullw mm0, mm6 ; low of coeff*(matrix*quant)
534 :
535 :			pxor mm5, mm5
536 :			pcmpgtw mm2, mm5 ; overflow?
537 :			psrlw mm2, 5 ; =0 if no clamp, 2047 otherwise
538 :			psrlw mm0, 5
539 :			paddw mm0, mm1 ; start restoring sign
540 :			por mm0, mm2 ; saturate to 2047 if needed
541 :			pxor mm0, mm1 ; finish negating back
542 :
543 :			movq [edx + 8*eax + 8*16], mm0 ; data[i]
544 :			add eax, 1
545 :			%endif
546 :
547 :			;********************************************************************
548 :
549 :			ALIGN 16
550 :			dequant_mpeg_intra_mmx:
551 :			; Dequantizes 64 coeff words into data (8 passes x 8 words, clamped), then rewrites data[0] as coeff[0]*dcscalar clamped to [-2048,2047]; returns 0.
552 :			push ebx
553 :
554 :			mov edx, [esp + 4 + 4] ; data
555 :			mov ecx, [esp + 4 + 8] ; coeff
556 :			mov eax, [esp + 4 + 12] ; quant
557 :			mov ebx, [esp + 4 + 20] ; mpeg_quant_matrices
558 :
559 :			movq mm7, [mmx_mul_quant + eax*8 - 8]
560 :			mov eax, -16 ; to keep ALIGNed, we regularly process coeff[0]
561 :			psllw mm7, 2 ; << 2. See comment.
562 :			pxor mm6, mm6 ; this is a NOP
563 :
564 :			ALIGN 16
565 :			.loop
566 :			movq mm0, [ecx+8*eax + 8*16] ; mm0 = c = coeff[i] (eax runs -16..0, so 8*16 rebases to the block start)
567 :			movq mm3, [ecx+8*eax + 8*16 +8]; mm3 = c' = coeff[i+1]
568 :			pxor mm1, mm1
569 :			pxor mm4, mm4
570 :			pcmpgtw mm1, mm0 ; mm1 = sgn(c)
571 :			movq mm2, mm7 ; mm2 = quant
572 :
573 :			pcmpgtw mm4, mm3 ; mm4 = sgn(c')
574 :			pmullw mm2, [ebx + 8*eax + 8*16 ] ; matrix[i]*quant
575 :
576 :			pxor mm0, mm1 ; negate if negative
577 :			pxor mm3, mm4 ; negate if negative
578 :
579 :			psubw mm0, mm1
580 :			psubw mm3, mm4
581 :
582 :			; we're short on register, here. Poor pairing...
583 :
584 :			movq mm5, mm2
585 :			pmullw mm2, mm0 ; low of coeff*(matrix*quant)
586 :
587 :			pmulhw mm0, mm5 ; high of coeff*(matrix*quant)
588 :			movq mm5, mm7 ; mm5 = quant
589 :
590 :			pmullw mm5, [ebx + 8*eax + 8*16 +8] ; matrix[i+1]*quant
591 :
592 :			movq mm6, mm5
593 :			add eax,2 ; z-flag will be tested later (MMX ops leave flags alone)
594 :
595 :			pmullw mm6, mm3 ; low of coeff*(matrix*quant)
596 :			pmulhw mm3, mm5 ; high of coeff*(matrix*quant)
597 :
598 :			pcmpgtw mm0, [zero] ; mm0 = 0xFFFF where the high word is > 0 (overflow)
599 :			paddusw mm2, mm0 ; overflowed lanes saturate to 0xFFFF
600 :			psrlw mm2, 5 ; ... which becomes 2047 after the shift
601 :
602 :			pcmpgtw mm3, [zero]
603 :			paddusw mm6, mm3
604 :			psrlw mm6, 5
605 :
606 :			pxor mm2, mm1 ; start negating back
607 :			pxor mm6, mm4 ; start negating back
608 :
609 :			psubusw mm1, mm0 ; drop the +1 on overflowed lanes, so negative overflow ends at -2048
610 :			psubusw mm4, mm3
611 :
612 :			psubw mm2, mm1 ; finish negating back
613 :			psubw mm6, mm4 ; finish negating back
614 :
615 :			movq [edx + 8*eax + 8*16 -2*8 ], mm2 ; data[i] (-2*8 undoes the early add eax,2)
616 :			movq [edx + 8*eax + 8*16 -2*8 +8], mm6 ; data[i+1]
617 :
618 :			jnz near .loop
619 :
620 :			; deal with DC
621 :			movd mm0, [ecx]
622 :			pmullw mm0, [esp + 4 + 16] ; dcscalar
623 :			movq mm2, [mmx_32767_minus_2047]
624 :			paddsw mm0, mm2 ; saturating add/sub pair clamps to <= 2047
625 :			psubsw mm0, mm2
626 :			movq mm2, [mmx_32768_minus_2048]
627 :			psubsw mm0, mm2 ; saturating sub/add pair clamps to >= -2048
628 :			paddsw mm0, mm2
629 :			movd eax, mm0
630 :			mov [edx], ax ; data[0]
631 :
632 :			xor eax, eax ; return 0
633 :
634 :			pop ebx
635 :
636 :			ret
637 :	edgomez	1.5	.endfunc
638 :	edgomez	1.2
639 :			;-----------------------------------------------------------------------------
640 :			;
641 :			; uint32_t dequant_mpeg_inter_mmx(int16_t * data,
642 :			; const int16_t * const coeff,
643 :			; const uint32_t quant,
644 :			; const uint16_t *mpeg_matrices);
645 :			;
646 :			;-----------------------------------------------------------------------------
647 :
648 :			; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
649 :			; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
650 :			; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
651 :			; It's mixed with the extraction of the absolute value.
652 :
653 :			ALIGN 16
654 :			dequant_mpeg_inter_mmx:
655 :			; Dequantizes 64 coeff words into data (8 passes x 8 words, clamped), then applies mismatch control to data[63]; returns 0.
656 :			push ebx
657 :
658 :			mov edx, [esp + 4 + 4] ; data
659 :			mov ecx, [esp + 4 + 8] ; coeff
660 :			mov eax, [esp + 4 + 12] ; quant
661 :			mov ebx, [esp + 4 + 16] ; mpeg_quant_matrices
662 :
663 :			movq mm7, [mmx_mul_quant + eax*8 - 8]
664 :			mov eax, -16 ; index runs -16..0 so the loop can end on the zero flag
665 :			paddw mm7, mm7 ; << 1
666 :			pxor mm6, mm6 ; mismatch sum
667 :
668 :			ALIGN 16
669 :			.loop
670 :			movq mm0, [ecx+8*eax + 8*16 ] ; mm0 = coeff[i]
671 :			movq mm2, [ecx+8*eax + 8*16 +8] ; mm2 = coeff[i+1]
672 :			add eax, 2 ; z-flag is tested by the jnz at the bottom (MMX ops leave flags alone)
673 :
674 :			pxor mm1, mm1
675 :			pxor mm3, mm3
676 :			pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved)
677 :			pcmpgtw mm3, mm2 ; mm3 = sgn(c') (preserved)
678 :			paddsw mm0, mm1 ; c += sgn(c)
679 :			paddsw mm2, mm3 ; c' += sgn(c')
680 :			paddw mm0, mm0 ; c *= 2
681 :			paddw mm2, mm2 ; c'*= 2
682 :
683 :			pxor mm4, mm4
684 :			pxor mm5, mm5
685 :			psubw mm4, mm0 ; -c
686 :			psubw mm5, mm2 ; -c'
687 :			psraw mm4, 16 ; mm4 = sgn(-c)
688 :			psraw mm5, 16 ; mm5 = sgn(-c')
689 :			psubsw mm0, mm4 ; c -= sgn(-c)
690 :			psubsw mm2, mm5 ; c' -= sgn(-c')
691 :			pxor mm0, mm1 ; finish changing sign if needed
692 :			pxor mm2, mm3 ; finish changing sign if needed
693 :
694 :			; we're short on register, here. Poor pairing...
695 :
696 :			movq mm4, mm7 ; (matrix*quant)
697 :			pmullw mm4, [ebx + 512 + 8*eax + 8*16 -2*8] ; inter matrix, 512 bytes in (-2*8 undoes the early add eax,2)
698 :			movq mm5, mm4
699 :			pmulhw mm5, mm0 ; high of c*(matrix*quant)
700 :			pmullw mm0, mm4 ; low of c*(matrix*quant)
701 :
702 :			movq mm4, mm7 ; (matrix*quant)
703 :			pmullw mm4, [ebx + 512 + 8*eax + 8*16 -2*8 + 8]
704 :
705 :			pcmpgtw mm5, [zero] ; mm5 = 0xFFFF where the high word is > 0 (overflow)
706 :			paddusw mm0, mm5 ; overflowed lanes saturate to 0xFFFF
707 :			psrlw mm0, 5 ; ... which becomes 2047 after the shift
708 :			pxor mm0, mm1 ; start restoring sign
709 :			psubusw mm1, mm5 ; drop the +1 on overflowed lanes, so negative overflow ends at -2048
710 :
711 :			movq mm5, mm4
712 :			pmulhw mm5, mm2 ; high of c'*(matrix*quant)
713 :			pmullw mm2, mm4 ; low of c'*(matrix*quant)
714 :			psubw mm0, mm1 ; finish restoring sign
715 :
716 :			pcmpgtw mm5, [zero]
717 :			paddusw mm2, mm5
718 :			psrlw mm2, 5
719 :			pxor mm2, mm3 ; start restoring sign
720 :			psubusw mm3, mm5
721 :			psubw mm2, mm3 ; finish restoring sign
722 :
723 :			pxor mm6, mm0 ; mismatch control
724 :			movq [edx + 8*eax + 8*16 -2*8 ], mm0 ; data[i]
725 :			pxor mm6, mm2 ; mismatch control
726 :			movq [edx + 8*eax + 8*16 -2*8 +8], mm2 ; data[i+1]
727 :
728 :			jnz near .loop
729 :
730 :			; mismatch control: bit 0 of the XOR of all outputs is the parity of their sum
731 :
732 :			movq mm0, mm6
733 :			psrlq mm0, 48
734 :			movq mm1, mm6
735 :			movq mm2, mm6
736 :			psrlq mm1, 32
737 :			pxor mm6, mm0
738 :			psrlq mm2, 16
739 :			pxor mm6, mm1
740 :			pxor mm6, mm2 ; low word = XOR of all four lanes
741 :			movd eax, mm6
742 :			and eax, 1
743 :			xor eax, 1 ; eax = 1 when the parity is even
744 :			xor word [edx + 2*63], ax ; flip the LSB of data[63] in that case
745 :
746 :			xor eax, eax ; return 0
747 :
748 :			pop ebx
749 :
750 :			ret
751 :	edgomez	1.5	.endfunc
752 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4