Annotation of /xvidcore/src/quant/x86_asm/quantize_mpeg_mmx.asm

Revision 1.2 - (view) (download)

1 :	edgomez	1.2	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - MMX MPEG Quantization/Dequantization -
5 :			; *
6 :			; * Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
7 :			; * 2002-2003 Michael Militzer <isibaar@xvid.org>
8 :			; * 2002-2003 Pascal Massimino <skal@planet-d.net>
9 :			; *
10 :			; * This program is free software ; you can redistribute it and/or modify
11 :			; * it under the terms of the GNU General Public License as published by
12 :			; * the Free Software Foundation ; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :			; *
15 :			; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :			; *
20 :			; * You should have received a copy of the GNU General Public License
21 :			; * along with this program ; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :			; *
24 :			; * $Id$
25 :			; *
26 :			; *************************************************************************/
27 :
28 :			%define SATURATE
29 :
30 :			BITS 32 ; everything below is assembled as 32-bit code
31 :
32 :			%macro cglobal 1
33 :			%ifdef PREFIX
34 :			global _%1 ; PREFIX defined: export the symbol with a leading underscore
35 :			%define %1 _%1
36 :			%else
37 :			global %1 ; PREFIX not defined: export the symbol name unchanged
38 :			%endif
39 :			%endmacro
40 :
41 :			%macro cextern 1
42 :			%ifdef PREFIX
43 :			extern _%1 ; PREFIX defined: import the symbol with a leading underscore
44 :			%define %1 _%1
45 :			%else
46 :			extern %1 ; PREFIX not defined: import the symbol name unchanged
47 :			%endif
48 :			%endmacro
49 :
50 :			;=============================================================================
51 :			; Local data (Read Only)
52 :			;=============================================================================
53 :
54 :			%ifdef FORMAT_COFF
55 :			SECTION .rodata data ; COFF build: same section, declared without the align=16 qualifier
56 :			%else
57 :			SECTION .rodata data align=16
58 :			%endif
59 :
60 :			mmx_one:
61 :			times 4 dw 1 ; four words of 1; pmaddwd against this adds adjacent word pairs into dwords
62 :
63 :			;-----------------------------------------------------------------------------
64 :			; divide by 2Q table (one 8-byte entry per quant, read as mmx_div + quant*8 - 8)
65 :			;-----------------------------------------------------------------------------
66 :
67 :			ALIGN 16
68 :			mmx_div:
69 :			times 4 dw 65535 ; the div by 2 formula will overflow for the case
70 :			; quant=1 but we don't care much because quant=1
71 :			; is handled by a different piece of code that
72 :			; doesn't use this table.
73 :			%assign quant 2
74 :			%rep 30
75 :			times 4 dw (1<<17) / (quant*2) + 1 ; entries for quant = 2..31: 2^17 / (2*quant), plus 1
76 :			%assign quant quant+1
77 :			%endrep
78 :
79 :			%define VM18P 3
80 :			%define VM18Q 4
81 :
82 :
83 :			;-----------------------------------------------------------------------------
84 :			; quantd table (one 8-byte entry per quant, read as quantd + quant*8 - 8)
85 :			;-----------------------------------------------------------------------------
86 :
87 :			quantd:
88 :			%assign quant 1
89 :			%rep 31
90 :			times 4 dw ((VM18P*quant) + (VM18Q/2)) / VM18Q ; entries for quant = 1..31: (3*quant + 2) / 4
91 :			%assign quant quant+1
92 :			%endrep
93 :
94 :			;-----------------------------------------------------------------------------
95 :			; multiply by quant table (each entry holds quant itself; users shift it after loading)
96 :			;-----------------------------------------------------------------------------
97 :
98 :			mmx_mul_quant:
99 :			%assign quant 1
100 :			%rep 31
101 :			times 4 dw quant ; entries for quant = 1..31, four identical words each
102 :			%assign quant quant+1
103 :			%endrep
104 :
105 :			;-----------------------------------------------------------------------------
106 :			; saturation limits
107 :			;-----------------------------------------------------------------------------
108 :
109 :			ALIGN 16
110 :
111 :			mmx_32767_minus_2047:
112 :			times 4 dw (32767-2047) ; paddsw then psubsw with this clamps a word to at most 2047
113 :			mmx_32768_minus_2048:
114 :			times 4 dw (32768-2048) ; psubsw then paddsw with this clamps a word to at least -2048
115 :			mmx_2047:
116 :			times 4 dw 2047 ; upper limit as packed words
117 :			mmx_minus_2048:
118 :			times 4 dw (-2048) ; lower limit as packed words
119 :			zero:
120 :			times 4 dw 0 ; four zero words, used as the pcmpgtw operand in the dequant loops
121 :
122 :			;=============================================================================
123 :			; Code
124 :			;=============================================================================
125 :
126 :			SECTION .text ; code section; the four cglobal lines below export the public entry points
127 :
128 :			cglobal quant_mpeg_intra_mmx
129 :			cglobal quant_mpeg_inter_mmx
130 :			cglobal dequant_mpeg_intra_mmx
131 :			cglobal dequant_mpeg_inter_mmx
132 :
133 :			;-----------------------------------------------------------------------------
134 :			;
135 :			; uint32_t quant_mpeg_intra_mmx(int16_t * coeff,
136 :			; const int16_t const * data,
137 :			; const uint32_t quant,
138 :			; const uint32_t dcscalar,
139 :			; const uint16_t *mpeg_matrices);
140 :			;
141 :			;-----------------------------------------------------------------------------
142 :
143 :			ALIGN 16
144 :			quant_mpeg_intra_mmx: ; quantize the 64 words of data[] into coeff[], then redo coeff[0] with dcscalar; returns 0
145 :
146 :			push ecx ; save the four registers used as pointers/counters below
147 :			push esi
148 :			push edi
149 :			push ebx
150 :
151 :			mov edi, [esp + 16 + 4] ; edi = coeff (output); the +16 skips the four pushes
152 :			mov esi, [esp + 16 + 8] ; esi = data (input)
153 :			mov eax, [esp + 16 + 12] ; eax = quant
154 :			mov ebx, [esp + 16 + 20] ; ebx = mpeg_matrices (5th argument)
155 :
156 :			movq mm5, [quantd + eax * 8 - 8] ; mm5 = quantd entry for this quant (four identical words)
157 :
158 :			xor ecx, ecx ; ecx = position in the block, in 8-byte units
159 :			cmp al, 1 ; quant == 1 has its own loop (only the low byte of quant is compared)
160 :			jz near .q1loop
161 :
162 :			cmp al, 2 ; quant == 2 has its own loop as well
163 :			jz near .q2loop
164 :
165 :			movq mm7, [mmx_div + eax * 8 - 8] ; mm7 = mmx_div entry for this quant (divide-by-2Q multiplier)
166 :
167 :			ALIGN 16
168 :			.loop ; general case (quant is neither 1 nor 2): 8 coefficients per iteration
169 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st] four input words
170 :			movq mm3, [esi + 8*ecx + 8] ; mm3 = [2nd] four input words
171 :			pxor mm1, mm1 ; mm1 = 0
172 :			pxor mm4, mm4
173 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0), i.e. the sign mask of mm0
174 :			pcmpgtw mm4, mm3 ; mm4 = sign mask of mm3
175 :			pxor mm0, mm1 ; mm0 = |mm0| (xor with the mask, then subtract the mask)
176 :			pxor mm3, mm4 ;
177 :			psubw mm0, mm1 ; displace
178 :			psubw mm3, mm4 ;
179 :			psllw mm0, 4 ; level << 4
180 :			psllw mm3, 4
181 :			movq mm2, [ebx + 8*ecx] ; four words from the start of mpeg_matrices (intra_matrix[i])
182 :			psrlw mm2, 1 ; intra_matrix[i]>>1
183 :			paddw mm0, mm2
184 :			movq mm2, [ebx + 256 + ecx*8] ; four words at byte offset 256; presumably the reciprocals of intra_matrix[] - TODO confirm table layout
185 :			pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
186 :			movq mm2, [ebx + 8*ecx + 8] ; same steps for the second group of four words
187 :			psrlw mm2, 1
188 :			paddw mm3, mm2
189 :			movq mm2, [ebx + 256 + ecx*8 + 8]
190 :			pmulhw mm3, mm2
191 :			paddw mm0, mm5 ; + quantd
192 :			paddw mm3, mm5
193 :			pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16
194 :			pmulhw mm3, mm7 ;
195 :			psrlw mm0, 1 ; additional shift by 1 => 16 + 1 = 17
196 :			psrlw mm3, 1
197 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
198 :			pxor mm3, mm4 ;
199 :			psubw mm0, mm1 ; undisplace
200 :			psubw mm3, mm4 ;
201 :
202 :			movq [edi + 8*ecx], mm0 ; store 8 quantized words
203 :			movq [edi + 8*ecx + 8], mm3
204 :
205 :			add ecx,2 ; advance by 16 bytes
206 :			cmp ecx,16 ; 16 * 8 bytes = 64 words processed?
207 :			jnz near .loop
208 :
209 :			.done ; all three loops end up here
210 :			; calculate coeff[0] = data[0] / (int32_t)dcscalar, with dcscalar/2 added or subtracted first so the result is rounded
211 :			mov ecx, [esp + 16 + 16] ; dcscalar
212 :			mov edx, ecx
213 :			movsx eax, word [esi] ; data[0]
214 :			shr edx, 1 ; edx = dcscalar /2
215 :			cmp eax, 0
216 :			jg .gtzero ; positive DC: add the half divisor
217 :
218 :			sub eax, edx ; zero or negative DC: subtract the half divisor
219 :			jmp short .mul
220 :			.gtzero
221 :			add eax, edx
222 :			.mul
223 :			cdq ; expand eax -> edx:eax
224 :			idiv ecx ; eax = edx:eax / dcscalar
225 :
226 :			mov [edi], ax ; coeff[0] = ax (overwrites the value the loop stored)
227 :
228 :			pop ebx ; restore in reverse push order
229 :			pop edi
230 :			pop esi
231 :			pop ecx
232 :
233 :			xor eax, eax ; return(0);
234 :			ret
235 :
236 :			ALIGN 16
237 :			.q1loop ; quant == 1: same as .loop but without the mmx_div multiply
238 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
239 :			movq mm3, [esi + 8*ecx + 8] ;
240 :			pxor mm1, mm1 ; mm1 = 0
241 :			pxor mm4, mm4 ;
242 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
243 :			pcmpgtw mm4, mm3 ;
244 :			pxor mm0, mm1 ; mm0 = |mm0|
245 :			pxor mm3, mm4 ;
246 :			psubw mm0, mm1 ; displace
247 :			psubw mm3, mm4 ;
248 :			psllw mm0, 4 ; level << 4
249 :			psllw mm3, 4
250 :			movq mm2, [ebx + 8*ecx]
251 :			psrlw mm2, 1 ; intra_matrix[i]>>1
252 :			paddw mm0, mm2
253 :			movq mm2, [ebx + 256 + ecx*8]
254 :			pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
255 :			movq mm2, [ebx + 8*ecx + 8]
256 :			psrlw mm2, 1
257 :			paddw mm3, mm2
258 :			movq mm2, [ebx + 256 + ecx*8 + 8]
259 :			pmulhw mm3, mm2
260 :			paddw mm0, mm5 ; + quantd
261 :			paddw mm3, mm5
262 :			psrlw mm0, 1 ; mm0 >>= 1 (/2)
263 :			psrlw mm3, 1 ;
264 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
265 :			pxor mm3, mm4 ;
266 :			psubw mm0, mm1 ; undisplace
267 :			psubw mm3, mm4 ;
268 :			movq [edi + 8*ecx], mm0
269 :			movq [edi + 8*ecx + 8], mm3
270 :
271 :			add ecx, 2
272 :			cmp ecx, 16
273 :			jnz near .q1loop
274 :			jmp near .done ; go handle the DC coefficient
275 :
276 :
277 :			ALIGN 16
278 :			.q2loop ; quant == 2: same as .q1loop but with a final shift by 2
279 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
280 :			movq mm3, [esi + 8*ecx + 8] ;
281 :			pxor mm1, mm1 ; mm1 = 0
282 :			pxor mm4, mm4 ;
283 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
284 :			pcmpgtw mm4, mm3 ;
285 :			pxor mm0, mm1 ; mm0 = |mm0|
286 :			pxor mm3, mm4 ;
287 :			psubw mm0, mm1 ; displace
288 :			psubw mm3, mm4 ;
289 :			psllw mm0, 4 ; level << 4
290 :			psllw mm3, 4
291 :			movq mm2, [ebx + 8*ecx]
292 :			psrlw mm2, 1 ; intra_matrix[i]>>1
293 :			paddw mm0, mm2
294 :			movq mm2, [ebx + 256 + ecx*8]
295 :			pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
296 :			movq mm2, [ebx + 8*ecx + 8]
297 :			psrlw mm2, 1
298 :			paddw mm3, mm2
299 :			movq mm2, [ebx + 256 + ecx*8 + 8]
300 :			pmulhw mm3, mm2
301 :			paddw mm0, mm5 ; + quantd
302 :			paddw mm3, mm5
303 :			psrlw mm0, 2 ; mm0 >>= 2 (/4)
304 :			psrlw mm3, 2 ;
305 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
306 :			pxor mm3, mm4 ;
307 :			psubw mm0, mm1 ; undisplace
308 :			psubw mm3, mm4 ;
309 :			movq [edi + 8*ecx], mm0
310 :			movq [edi + 8*ecx + 8], mm3
311 :
312 :			add ecx,2
313 :			cmp ecx,16
314 :			jnz near .q2loop
315 :			jmp near .done ; go handle the DC coefficient
316 :
317 :
318 :			;-----------------------------------------------------------------------------
319 :			;
320 :			; uint32_t quant_mpeg_inter_mmx(int16_t * coeff,
321 :			; const int16_t const * data,
322 :			; const uint32_t quant,
323 :			; const uint16_t *mpeg_matrices);
324 :			;
325 :			;-----------------------------------------------------------------------------
326 :
327 :			ALIGN 16
328 :			quant_mpeg_inter_mmx: ; quantize the 64 words of data[] into coeff[]; returns the sum of the absolute quantized values
329 :
330 :			push ecx ; save the four registers used as pointers/counters below
331 :			push esi
332 :			push edi
333 :			push ebx
334 :
335 :			mov edi, [esp + 16 + 4] ; edi = coeff (output); the +16 skips the four pushes
336 :			mov esi, [esp + 16 + 8] ; esi = data (input)
337 :			mov eax, [esp + 16 + 12] ; eax = quant
338 :			mov ebx, [esp + 16 + 16] ; ebx = mpeg_matrices (4th argument)
339 :
340 :			xor ecx, ecx ; ecx = position in the block, in 8-byte units
341 :
342 :			pxor mm5, mm5 ; sum (four word-sized partial sums)
343 :
344 :			cmp al, 1 ; quant == 1 has its own loop (only the low byte of quant is compared)
345 :			jz near .q1loop
346 :
347 :			cmp al, 2 ; quant == 2 has its own loop as well
348 :			jz near .q2loop
349 :
350 :			movq mm7, [mmx_div + eax * 8 - 8] ; divider: mmx_div entry for this quant
351 :
352 :			ALIGN 16
353 :			.loop ; general case (quant is neither 1 nor 2): 8 coefficients per iteration
354 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
355 :			movq mm3, [esi + 8*ecx + 8] ;
356 :			pxor mm1, mm1 ; mm1 = 0
357 :			pxor mm4, mm4 ;
358 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
359 :			pcmpgtw mm4, mm3 ;
360 :			pxor mm0, mm1 ; mm0 = |mm0|
361 :			pxor mm3, mm4 ;
362 :			psubw mm0, mm1 ; displace
363 :			psubw mm3, mm4 ;
364 :			psllw mm0, 4 ; level << 4
365 :			psllw mm3, 4
366 :			movq mm2, [ebx + 512 + 8*ecx] ; four words at byte offset 512 of mpeg_matrices (inter_matrix[i])
367 :			psrlw mm2, 1 ; inter_matrix[i]>>1
368 :			paddw mm0, mm2
369 :			movq mm2, [ebx + 768 + ecx*8] ; four words at byte offset 768; presumably the reciprocals of inter_matrix[] - TODO confirm table layout
370 :			pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
371 :			movq mm2, [ebx + 512 + 8*ecx + 8] ; same steps for the second group of four words
372 :			psrlw mm2, 1
373 :			paddw mm3, mm2
374 :			movq mm2, [ebx + 768 + ecx*8 + 8]
375 :			pmulhw mm3, mm2
376 :			pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16
377 :			pmulhw mm3, mm7 ;
378 :			psrlw mm0, 1 ; additional shift by 1 => 16 + 1 = 17
379 :			psrlw mm3, 1
380 :			paddw mm5, mm0 ; sum += mm0 (still the absolute value here)
381 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
382 :			paddw mm5, mm3 ;
383 :			pxor mm3, mm4 ;
384 :			psubw mm0, mm1 ; undisplace
385 :			psubw mm3, mm4
386 :			movq [edi + 8*ecx], mm0 ; store 8 quantized words
387 :			movq [edi + 8*ecx + 8], mm3
388 :
389 :			add ecx, 2 ; advance by 16 bytes
390 :			cmp ecx, 16 ; 16 * 8 bytes = 64 words processed?
391 :			jnz near .loop
392 :
393 :			.done ; all three loops end up here
394 :			pmaddwd mm5, [mmx_one] ; fold the four word sums into two dword sums
395 :			movq mm0, mm5
396 :			psrlq mm5, 32 ; bring the upper dword down
397 :			paddd mm0, mm5 ; low dword of mm0 = total
398 :			movd eax, mm0 ; return sum
399 :
400 :			pop ebx ; restore in reverse push order
401 :			pop edi
402 :			pop esi
403 :			pop ecx
404 :
405 :			ret
406 :
407 :			ALIGN 16
408 :			.q1loop ; quant == 1: same as .loop but without the mmx_div multiply
409 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
410 :			movq mm3, [esi + 8*ecx+ 8]
411 :			pxor mm1, mm1 ; mm1 = 0
412 :			pxor mm4, mm4 ;
413 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
414 :			pcmpgtw mm4, mm3 ;
415 :			pxor mm0, mm1 ; mm0 = |mm0|
416 :			pxor mm3, mm4 ;
417 :			psubw mm0, mm1 ; displace
418 :			psubw mm3, mm4 ;
419 :			psllw mm0, 4 ; level << 4
420 :			psllw mm3, 4
421 :			movq mm2, [ebx + 512 + 8*ecx]
422 :			psrlw mm2, 1 ; inter_matrix[i]>>1
423 :			paddw mm0, mm2
424 :			movq mm2, [ebx + 768 + ecx*8]
425 :			pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
426 :			movq mm2, [ebx + 512 + 8*ecx + 8]
427 :			psrlw mm2, 1
428 :			paddw mm3, mm2
429 :			movq mm2, [ebx + 768 + ecx*8 + 8]
430 :			pmulhw mm3, mm2
431 :			psrlw mm0, 1 ; mm0 >>= 1 (/2)
432 :			psrlw mm3, 1 ;
433 :			paddw mm5, mm0 ; sum += mm0
434 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
435 :			paddw mm5, mm3 ;
436 :			pxor mm3, mm4 ;
437 :			psubw mm0, mm1 ; undisplace
438 :			psubw mm3, mm4
439 :			movq [edi + 8*ecx], mm0
440 :			movq [edi + 8*ecx + 8], mm3
441 :
442 :			add ecx, 2
443 :			cmp ecx, 16
444 :			jnz near .q1loop
445 :
446 :			jmp .done ; go reduce the sum and return
447 :
448 :
449 :			ALIGN 16
450 :			.q2loop ; quant == 2: same as .q1loop but with a final shift by 2
451 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
452 :			movq mm3, [esi + 8*ecx+ 8]
453 :			pxor mm1, mm1 ; mm1 = 0
454 :			pxor mm4, mm4 ;
455 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
456 :			pcmpgtw mm4, mm3 ;
457 :			pxor mm0, mm1 ; mm0 = |mm0|
458 :			pxor mm3, mm4 ;
459 :			psubw mm0, mm1 ; displace
460 :			psubw mm3, mm4 ;
461 :			psllw mm0, 4 ; level << 4
462 :			psllw mm3, 4
463 :			movq mm2, [ebx + 512 + 8*ecx]
464 :			psrlw mm2, 1 ; inter_matrix[i]>>1
465 :			paddw mm0, mm2
466 :			movq mm2, [ebx + 768 + ecx*8]
467 :			pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
468 :			movq mm2, [ebx + 512 + 8*ecx + 8]
469 :			psrlw mm2, 1
470 :			paddw mm3, mm2
471 :			movq mm2, [ebx + 768 + ecx*8 + 8]
472 :			pmulhw mm3, mm2
473 :			psrlw mm0, 2 ; mm0 >>= 2 (/4)
474 :			psrlw mm3, 2 ;
475 :			paddw mm5, mm0 ; sum += mm0
476 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
477 :			paddw mm5, mm3 ;
478 :			pxor mm3, mm4 ;
479 :			psubw mm0, mm1 ; undisplace
480 :			psubw mm3, mm4
481 :			movq [edi + 8*ecx], mm0
482 :			movq [edi + 8*ecx + 8], mm3
483 :
484 :			add ecx, 2
485 :			cmp ecx, 16
486 :			jnz near .q2loop
487 :
488 :			jmp .done ; go reduce the sum and return
489 :
490 :
491 :			;-----------------------------------------------------------------------------
492 :			;
493 :			; uint32_t dequant_mpeg_intra_mmx(int16_t *data,
494 :			; const int16_t const *coeff,
495 :			; const uint32_t quant,
496 :			; const uint32_t dcscalar,
497 :			; const uint16_t *mpeg_matrices);
498 :			;
499 :			;-----------------------------------------------------------------------------
500 :
501 :			; Note: in order to saturate 'easily', we pre-multiply the quantizer
502 :			; by 4 (psllw by 2). Then, the high word of (coeff[]*matrix[i]*quant) is used to
503 :			; build a saturating mask. It is non-zero only when an overflow occurred.
504 :			; We thus avoid packing/unpacking toward double-word.
505 :			; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
506 :			; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not
507 :			; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],
508 :			; and quant in [1..31].
509 :			;
510 :			; The original loop is:
511 :			;
512 :			%if 0
513 :			movq mm0, [ecx+8*eax + 8*16] ; mm0 = coeff[i]
514 :			pxor mm1, mm1
515 :			pcmpgtw mm1, mm0
516 :			pxor mm0, mm1 ; change sign if negative
517 :			psubw mm0, mm1 ; -> mm0 = abs(coeff[i]), mm1 = sign of coeff[i]
518 :
519 :			movq mm2, mm7 ; mm2 = quant
520 :			pmullw mm2, [ebx + 8*eax + 8*16 ] ; matrix[i]*quant.
521 :
522 :			movq mm6, mm2
523 :			pmulhw mm2, mm0 ; high of coeff*(matrix*quant) (should be 0 if no overflow)
524 :			pmullw mm0, mm6 ; low of coeff*(matrix*quant)
525 :
526 :			pxor mm5, mm5
527 :			pcmpgtw mm2, mm5 ; overflow?
528 :			psrlw mm2, 5 ; =0 if no clamp, 2047 otherwise
529 :			psrlw mm0, 5
530 :			paddw mm0, mm1 ; start restoring sign
531 :			por mm0, mm2 ; saturate to 2047 if needed
532 :			pxor mm0, mm1 ; finish negating back
533 :
534 :			movq [edx + 8*eax + 8*16], mm0 ; data[i]
535 :			add eax, 1
536 :			%endif
537 :
538 :			;********************************************************************
539 :
540 :			ALIGN 16
541 :			dequant_mpeg_intra_mmx: ; dequantize the 64 words of coeff[] into data[], clamped to [-2048, 2047]; returns 0
542 :
543 :			push ebx ; ebx is used as the matrix pointer
544 :
545 :			mov edx, [esp + 4 + 4] ; data
546 :			mov ecx, [esp + 4 + 8] ; coeff
547 :			mov eax, [esp + 4 + 12] ; quant
548 :			mov ebx, [esp + 4 + 20] ; mpeg_quant_matrices
549 :
550 :			movq mm7, [mmx_mul_quant + eax*8 - 8] ; mm7 = quant in all four words
551 :			mov eax, -16 ; to keep ALIGNed, we regularly process coeff[0]
552 :			psllw mm7, 2 ; << 2. See comment.
553 :			pxor mm6, mm6 ; this is a NOP
554 :
555 :			ALIGN 16
556 :			.loop ; eax runs from -16 up to 0 in steps of 2, so 8*eax + 8*16 walks bytes 0..127
557 :			movq mm0, [ecx+8*eax + 8*16] ; mm0 = c = coeff[i]
558 :			movq mm3, [ecx+8*eax + 8*16 +8]; mm3 = c' = coeff[i+1]
559 :			pxor mm1, mm1
560 :			pxor mm4, mm4
561 :			pcmpgtw mm1, mm0 ; mm1 = sgn(c)
562 :			movq mm2, mm7 ; mm2 = quant
563 :
564 :			pcmpgtw mm4, mm3 ; mm4 = sgn(c')
565 :			pmullw mm2, [ebx + 8*eax + 8*16 ] ; matrix[i]*quant
566 :
567 :			pxor mm0, mm1 ; negate if negative
568 :			pxor mm3, mm4 ; negate if negative
569 :
570 :			psubw mm0, mm1 ; mm0 = |c|
571 :			psubw mm3, mm4 ; mm3 = |c'|
572 :
573 :			; we're short on register, here. Poor pairing...
574 :
575 :			movq mm5, mm2
576 :			pmullw mm2, mm0 ; low of coeff*(matrix*quant)
577 :
578 :			pmulhw mm0, mm5 ; high of coeff*(matrix*quant)
579 :			movq mm5, mm7 ; mm5 = quant
580 :
581 :			pmullw mm5, [ebx + 8*eax + 8*16 +8] ; matrix[i+1]*quant
582 :
583 :			movq mm6, mm5
584 :			add eax,2 ; z-flag will be tested later
585 :
586 :			pmullw mm6, mm3 ; low of coeff*(matrix*quant)
587 :			pmulhw mm3, mm5 ; high of coeff*(matrix*quant)
588 :
589 :			pcmpgtw mm0, [zero] ; mm0 = all ones where the high word is > 0 (overflow)
590 :			paddusw mm2, mm0 ; overflowed lanes saturate to 0xFFFF
591 :			psrlw mm2, 5 ; final scaling; overflowed lanes become 2047
592 :
593 :			pcmpgtw mm3, [zero] ; same three steps for the second group
594 :			paddusw mm6, mm3
595 :			psrlw mm6, 5
596 :
597 :			pxor mm2, mm1 ; start negating back
598 :			pxor mm6, mm4 ; start negating back
599 :
600 :			psubusw mm1, mm0 ; clear the sign mask on overflowed lanes, so negatives end at -2048
601 :			psubusw mm4, mm3
602 :
603 :			psubw mm2, mm1 ; finish negating back
604 :			psubw mm6, mm4 ; finish negating back
605 :
606 :			movq [edx + 8*eax + 8*16 -2*8 ], mm2 ; data[i]
607 :			movq [edx + 8*eax + 8*16 -2*8 +8], mm6 ; data[i+1]
608 :
609 :			jnz near .loop ; flags still come from 'add eax,2'; MMX instructions leave EFLAGS alone
610 :
611 :			; deal with DC
612 :			movd mm0, [ecx] ; low word = coeff[0]
613 :			pmullw mm0, [esp + 4 + 16] ; dcscalar
614 :			movq mm2, [mmx_32767_minus_2047]
615 :			paddsw mm0, mm2 ; saturating add then subtract: clamp to at most 2047
616 :			psubsw mm0, mm2
617 :			movq mm2, [mmx_32768_minus_2048]
618 :			psubsw mm0, mm2 ; saturating subtract then add: clamp to at least -2048
619 :			paddsw mm0, mm2
620 :			movd eax, mm0
621 :			mov [edx], ax ; data[0] = clamped coeff[0]*dcscalar (replaces the loop's value)
622 :
623 :			xor eax, eax ; return 0
624 :
625 :			pop ebx
626 :
627 :			ret
628 :
629 :			;-----------------------------------------------------------------------------
630 :			;
631 :			; uint32_t dequant_mpeg_inter_mmx(int16_t * data,
632 :			; const int16_t * const coeff,
633 :			; const uint32_t quant,
634 :			; const uint16_t *mpeg_matrices);
635 :			;
636 :			;-----------------------------------------------------------------------------
637 :
638 :			; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
639 :			; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
640 :			; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
641 :			; It's mixed with the extraction of the absolute value.
642 :
643 :			ALIGN 16
644 :			dequant_mpeg_inter_mmx: ; dequantize the 64 words of coeff[] into data[], then apply mismatch control to data[63]; returns 0
645 :
646 :			push ebx ; ebx is used as the matrix pointer
647 :
648 :			mov edx, [esp + 4 + 4] ; data
649 :			mov ecx, [esp + 4 + 8] ; coeff
650 :			mov eax, [esp + 4 + 12] ; quant
651 :			mov ebx, [esp + 4 + 16] ; mpeg_quant_matrices
652 :
653 :			movq mm7, [mmx_mul_quant + eax*8 - 8] ; mm7 = quant in all four words
654 :			mov eax, -16 ; eax runs from -16 up to 0 in steps of 2
655 :			paddw mm7, mm7 ; << 1
656 :			pxor mm6, mm6 ; mismatch sum
657 :
658 :			ALIGN 16
659 :			.loop
660 :			movq mm0, [ecx+8*eax + 8*16 ] ; mm0 = coeff[i]
661 :			movq mm2, [ecx+8*eax + 8*16 +8] ; mm2 = coeff[i+1]
662 :			add eax, 2 ; z-flag is tested by the jnz at the end of the loop
663 :
664 :			pxor mm1, mm1
665 :			pxor mm3, mm3
666 :			pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved)
667 :			pcmpgtw mm3, mm2 ; mm3 = sgn(c') (preserved)
668 :			paddsw mm0, mm1 ; c += sgn(c)
669 :			paddsw mm2, mm3 ; c' += sgn(c')
670 :			paddw mm0, mm0 ; c *= 2
671 :			paddw mm2, mm2 ; c'*= 2
672 :
673 :			pxor mm4, mm4
674 :			pxor mm5, mm5
675 :			psubw mm4, mm0 ; -c
676 :			psubw mm5, mm2 ; -c'
677 :			psraw mm4, 16 ; mm4 = sgn(-c)
678 :			psraw mm5, 16 ; mm5 = sgn(-c')
679 :			psubsw mm0, mm4 ; c -= sgn(-c)
680 :			psubsw mm2, mm5 ; c' -= sgn(-c')
681 :			pxor mm0, mm1 ; finish changing sign if needed: mm0 = 2*|c| + 1, or 0 when c == 0
682 :			pxor mm2, mm3 ; finish changing sign if needed
683 :
684 :			; we're short on register, here. Poor pairing...
685 :
686 :			movq mm4, mm7 ; (matrix*quant)
687 :			pmullw mm4, [ebx + 512 + 8*eax + 8*16 -2*8] ; -2*8 undoes the early 'add eax, 2'
688 :			movq mm5, mm4
689 :			pmulhw mm5, mm0 ; high of c*(matrix*quant)
690 :			pmullw mm0, mm4 ; low of c*(matrix*quant)
691 :
692 :			movq mm4, mm7 ; (matrix*quant)
693 :			pmullw mm4, [ebx + 512 + 8*eax + 8*16 -2*8 + 8]
694 :
695 :			pcmpgtw mm5, [zero] ; mm5 = all ones where the high word is > 0 (overflow)
696 :			paddusw mm0, mm5 ; overflowed lanes saturate to 0xFFFF
697 :			psrlw mm0, 5 ; final scaling; overflowed lanes become 2047
698 :			pxor mm0, mm1 ; start restoring sign
699 :			psubusw mm1, mm5 ; clear the sign mask on overflowed lanes, so negatives end at -2048
700 :
701 :			movq mm5, mm4
702 :			pmulhw mm5, mm2 ; high of c*(matrix*quant)
703 :			pmullw mm2, mm4 ; low of c*(matrix*quant)
704 :			psubw mm0, mm1 ; finish restoring sign
705 :
706 :			pcmpgtw mm5, [zero] ; same saturation steps for the second group
707 :			paddusw mm2, mm5
708 :			psrlw mm2, 5
709 :			pxor mm2, mm3 ; start restoring sign
710 :			psubusw mm3, mm5
711 :			psubw mm2, mm3 ; finish restoring sign
712 :
713 :			pxor mm6, mm0 ; mismatch control
714 :			movq [edx + 8*eax + 8*16 -2*8 ], mm0 ; data[i]
715 :			pxor mm6, mm2 ; mismatch control
716 :			movq [edx + 8*eax + 8*16 -2*8 +8], mm2 ; data[i+1]
717 :
718 :			jnz near .loop ; flags still come from 'add eax, 2'; MMX instructions leave EFLAGS alone
719 :
720 :			; mismatch control: xor the four words of mm6 together, then toggle bit 0 of data[63] when bit 0 of the result is clear
721 :
722 :			movq mm0, mm6
723 :			psrlq mm0, 48 ; word 3 -> word 0
724 :			movq mm1, mm6
725 :			movq mm2, mm6
726 :			psrlq mm1, 32 ; word 2 -> word 0
727 :			pxor mm6, mm0
728 :			psrlq mm2, 16 ; word 1 -> word 0
729 :			pxor mm6, mm1
730 :			pxor mm6, mm2 ; low word of mm6 = xor of all four words
731 :			movd eax, mm6
732 :			and eax, 1 ; keep bit 0 only
733 :			xor eax, 1 ; eax = 1 when that bit was 0
734 :			xor word [edx + 2*63], ax ; flip bit 0 of data[63] in that case
735 :
736 :			xor eax, eax ; return 0
737 :
738 :			pop ebx
739 :
740 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4