Annotation of /xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm

Revision 1.2 - (view) (download)

1 :	edgomez	1.2	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - 3dne Quantization/Dequantization -
5 :			; *
6 :			; * Copyright(C) 2002-2003 Jaan Kalda
7 :			; *
8 :			; * This program is free software ; you can redistribute it and/or modify
9 :			; * it under the terms of the GNU General Public License as published by
10 :			; * the Free Software Foundation ; either version 2 of the License, or
11 :			; * (at your option) any later version.
12 :			; *
13 :			; * This program is distributed in the hope that it will be useful,
14 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
15 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			; * GNU General Public License for more details.
17 :			; *
18 :			; * You should have received a copy of the GNU General Public License
19 :			; * along with this program ; if not, write to the Free Software
20 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :			; *
22 :			; * $Id$
23 :			; *
24 :			; *************************************************************************/
25 :			;
26 :			; these 3dne functions are compatible with iSSE, but are optimized specifically for
27 :			; K7 pipelines
28 :
29 :			; enable dequant saturate [-2048,2047], test purposes only.
30 :			%define SATURATE
31 :
32 :			BITS 32
33 :
34 :			%macro cglobal 1
35 :			%ifndef PREFIX
36 :			global %1 ; PREFIX not defined: export the symbol under its plain name
37 :			%else
38 :			global _%1 ; PREFIX defined: export with a leading underscore; the next line aliases the plain name to it
39 :			%define %1 _%1
40 :			%endif
41 :			%endmacro
42 :
43 :			;=============================================================================
44 :			; Local data
45 :			;=============================================================================
46 :
47 :			%ifndef FORMAT_COFF
48 :			SECTION .rodata data align=16
49 :			%else
50 :			SECTION .rodata data
51 :			%endif
52 :
53 :			align 4
54 :			int_div: ; int_div[n] = 65536 / n + 1 for n = 1..255; entry 0 is a zero placeholder
55 :			dd 0
56 :			%assign i 1
57 :			%rep 255
58 :			dd 0x10000 / i + 1
59 :			%assign i i+1
60 :			%endrep
61 :
62 :			ALIGN 16
63 :			plus_one: ; eight words of 1
64 :			dw 1, 1, 1, 1, 1, 1, 1, 1
65 :
66 :			;-----------------------------------------------------------------------------
67 :			; subtract-by-Q/2 table: four words of Q/2 (integer division) for Q = 1..31
68 :			;-----------------------------------------------------------------------------
69 :
70 :			ALIGN 16
71 :			mmx_sub:
72 :			%assign i 1
73 :			%rep 31
74 :			times 4 dw i >> 1
75 :			%assign i i+1
76 :			%endrep
77 :
78 :
79 :			;-----------------------------------------------------------------------------
80 :			;
81 :			; divide-by-2Q table: four words of 65536 / (2Q) + 1 for Q = 1..31
82 :			;
83 :			; a shift of 16 is used to take full advantage of _pmulhw_
84 :			; for q=1, _pmulhw_ will overflow, so that case is treated separately
85 :			; (3dnow2 provides _pmulhuw_, which won't cause overflow)
86 :			;
87 :			;-----------------------------------------------------------------------------
88 :
89 :			ALIGN 16
90 :			mmx_div:
91 :			%assign i 1
92 :			%rep 31
93 :			times 4 dw (1 << 15) / i + 1
94 :			%assign i i+1
95 :			%endrep
96 :
97 :			;-----------------------------------------------------------------------------
98 :			; add table: four words of (odd(Q) ? Q : Q - 1) for Q = 1..31
99 :			;-----------------------------------------------------------------------------
100 :
101 :			ALIGN 16
102 :			mmx_add:
103 :			%assign i 1
104 :			%rep 31
105 :			%if (i & 1) == 0
106 :			times 4 dw i - 1
107 :			%else
108 :			times 4 dw i
109 :			%endif
110 :			%assign i i+1
111 :			%endrep
112 :
113 :			;-----------------------------------------------------------------------------
114 :			; multiply-by-2Q table: four words of 2Q for Q = 1..31
115 :			;-----------------------------------------------------------------------------
116 :
117 :			ALIGN 16
118 :			mmx_mul:
119 :			%assign i 1
120 :			%rep 31
121 :			times 4 dw i + i
122 :			%assign i i+1
123 :			%endrep
124 :
125 :			;-----------------------------------------------------------------------------
126 :			; saturation limits and small constants
127 :			;-----------------------------------------------------------------------------
128 :
129 :			ALIGN 8
130 :			mmx_32768_minus_2048:
131 :			times 4 dw 0x8000 - 0x0800
132 :			mmx_32767_minus_2047:
133 :			times 4 dw 0x7FFF - 0x07FF
134 :
135 :			ALIGN 16
136 :			mmx_2047:
137 :			times 4 dw 0x07FF
138 :
139 :			ALIGN 8
140 :			mmzero: ; 8 bytes of zero
141 :			times 2 dd 0
142 :			int2047:
143 :			dd 0x07FF
144 :			int_2048:
145 :			dd -0x0800
146 :
147 :			;=============================================================================
148 :			; Code
149 :			;=============================================================================
150 :
151 :			SECTION .text
152 :
153 :
154 :			;-----------------------------------------------------------------------------
155 :			;
156 :			; uint32_t quant_h263_intra_3dne(int16_t * coeff,
157 :			; const int16_t const * data,
158 :			; const uint32_t quant,
159 :			; const uint32_t dcscalar,
160 :			; const uint16_t *mpeg_matrices);
161 :			;
162 :			;-----------------------------------------------------------------------------
163 :			;This is Athlon-optimized code (ca 70 clk per call)
164 :			; quant_intra1 n (n = 0..3): quantizer == 1 variant of quant_intra; the division by 2Q is a plain shift right by one (A6..D6)
165 :			%macro quant_intra1 1
166 :			psubw mm1, mm0 ;A3 mm1 = 0 - x
167 :			psubw mm3, mm2 ;B3
168 :			%if (%1)
169 :			psubw mm5, mm4 ;C8 (groups C and D started by the previous invocation are completed here)
170 :			psubw mm7, mm6 ;D8
171 :			%endif
172 :
173 :			ALIGN 8
174 :			db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1 (hand-encoded: ModRM 64h, SIB 21h, disp8)
175 :			pmaxsw mm1, mm0 ;A4 mm1 = max(-x, x) = |x|
176 :			db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1 (hand-encoded like C1)
177 :			pmaxsw mm3, mm2 ;B4
178 :
179 :
180 :			psraw mm0, 15 ;A5 mm0 = per-word sign mask of x (0 or -1)
181 :			psraw mm2, 15 ;B5
182 :			%if (%1)
183 :			movq [edx + %1 * 32 + 16-32], mm5 ;C9 store group C of the previous 32-byte chunk
184 :			movq [edx + %1 * 32 + 24-32], mm7 ;D9 store group D of the previous 32-byte chunk
185 :			%endif
186 :
187 :			psrlw mm1, 1 ;A6 |x| >> 1
188 :			psrlw mm3, 1 ;B6
189 :			movq mm5, [ebx] ;C2 (the caller points ebx at mmzero)
190 :			movq mm7, [ebx] ;D2
191 :
192 :			pxor mm1, mm0 ;A7 A7+A8 re-apply the sign: (v xor mask) - mask
193 :			pxor mm3, mm2 ;B7
194 :
195 :			psubw mm5, mm4 ;C3
196 :			psubw mm7, mm6 ;D3
197 :			psubw mm1, mm0 ;A8
198 :			psubw mm3, mm2 ;B8
199 :
200 :			%if (%1 == 0)
201 :			push ebp ; first invocation only: save ebp (the caller's stack offsets account for this push)
202 :			movq mm0, [ecx + %1 * 32 +32] ;A1 for the next chunk, assembler-encoded here
203 :			%elif (%1 < 3)
204 :			db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1
205 :			%endif
206 :			pmaxsw mm5, mm4 ;C4
207 :			%if (%1 < 3)
208 :			db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
209 :			%else
210 :			cmp esp, esp ; last invocation: no B1 load; presumably a filler instruction, it only changes EFLAGS
211 :			%endif
212 :			pmaxsw mm7, mm6 ;D4
213 :
214 :			psraw mm4, 15 ;C5
215 :			psraw mm6, 15 ;D5
216 :			movq [byte edx + %1 * 32], mm1 ;A9 store group A of this chunk
217 :			movq [edx + %1 * 32+8], mm3 ;B9 store group B of this chunk
218 :
219 :
220 :			psrlw mm5, 1 ;C6
221 :			psrlw mm7, 1 ;D6
222 :			%if (%1 < 3)
223 :			movq mm1, [ebx] ;A2
224 :			movq mm3, [ebx] ;B2
225 :			%endif
226 :			%if (%1 == 3)
227 :			imul eax, [int_div+4*edi] ; last invocation: scalar DC path, eax *= int_div[edi] (the caller loads edi with dcscalar)
228 :			%endif
229 :			pxor mm5, mm4 ;C7 (C8/C9 and D8/D9 are done by the next invocation or by the caller's tail code)
230 :			pxor mm7, mm6 ;D7
231 :			%endm
232 :
233 :			; quant_intra n (n = 0..3): general-quantizer variant; |x| is multiplied by the word table at [esi] with pmulhw (A6..D6)
234 :			%macro quant_intra 1
235 :			; Rules for athlon:
236 :			; 1) schedule latencies
237 :			; 2) add/mul and load/store in 2:1 proportion
238 :			; 3) avoid splitting >3byte instructions over 8byte boundaries
239 :			; Step tags: the letter is the 8-byte group (A,B,C,D at offsets 0,8,16,24 of a 32-byte chunk), the number is the step within that group
240 :			psubw mm1, mm0 ;A3 mm1 = 0 - x
241 :			psubw mm3, mm2 ;B3
242 :			%if (%1)
243 :			psubw mm5, mm4 ;C8 (groups C and D started by the previous invocation are completed here)
244 :			psubw mm7, mm6 ;D8
245 :			%endif
246 :
247 :			ALIGN 8
248 :			db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1 (hand-encoded: ModRM 64h, SIB 21h, disp8)
249 :			pmaxsw mm1, mm0 ;A4 mm1 = max(-x, x) = |x|
250 :			db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1 (hand-encoded like C1)
251 :			pmaxsw mm3, mm2 ;B4
252 :
253 :
254 :			psraw mm0, 15 ;A5 mm0 = per-word sign mask of x (0 or -1)
255 :			psraw mm2, 15 ;B5
256 :			%if (%1)
257 :			movq [edx + %1 * 32 + 16-32], mm5 ;C9 store group C of the previous 32-byte chunk
258 :			movq [edx + %1 * 32 + 24-32], mm7 ;D9 store group D of the previous 32-byte chunk
259 :			%endif
260 :
261 :			pmulhw mm1, [esi] ;A6 high word of |x| * table word (the caller points esi into mmx_div)
262 :			pmulhw mm3, [esi] ;B6
263 :			movq mm5, [ebx] ;C2 (the caller points ebx at mmzero)
264 :			movq mm7, [ebx] ;D2
265 :
266 :			nop
267 :			nop
268 :			pxor mm1, mm0 ;A7 A7+A8 re-apply the sign: (v xor mask) - mask
269 :			pxor mm3, mm2 ;B7
270 :
271 :			psubw mm5, mm4 ;C3
272 :			psubw mm7, mm6 ;D3
273 :			psubw mm1, mm0 ;A8
274 :			psubw mm3, mm2 ;B8
275 :
276 :
277 :			%if (%1 < 3)
278 :			db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1
279 :			%endif
280 :			pmaxsw mm5, mm4 ;C4
281 :			%if (%1 < 3)
282 :			db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
283 :			%else
284 :			cmp esp, esp ; last invocation: no B1 load; presumably a filler instruction, it only changes EFLAGS
285 :			%endif
286 :			pmaxsw mm7,mm6 ;D4
287 :
288 :			psraw mm4, 15 ;C5
289 :			psraw mm6, 15 ;D5
290 :			movq [byte edx + %1 * 32], mm1 ;A9 store group A of this chunk
291 :			movq [edx + %1 * 32+8], mm3 ;B9 store group B of this chunk
292 :
293 :
294 :			pmulhw mm5, [esi] ;C6
295 :			pmulhw mm7, [esi] ;D6
296 :			%if (%1 < 3)
297 :			movq mm1, [ebx] ;A2
298 :			movq mm3, [ebx] ;B2
299 :			%endif
300 :			%if (%1 == 0)
301 :			push ebp ; first invocation only: save ebp (the caller's stack offsets account for this push)
302 :			%elif (%1 < 3)
303 :			nop
304 :			%endif
305 :			nop
306 :			%if (%1 == 3)
307 :			imul eax, [int_div+4*edi] ; last invocation: scalar DC path, eax *= int_div[edi] (the caller loads edi with dcscalar)
308 :			%endif
309 :			pxor mm5, mm4 ;C7 (C8/C9 and D8/D9 are done by the next invocation or by the caller's tail code)
310 :			pxor mm7, mm6 ;D7
311 :			%endmacro
312 :
313 :
314 :			ALIGN 16
315 :			cglobal quant_h263_intra_3dne
316 :			quant_h263_intra_3dne:
317 :			; stack arguments on entry: [esp+4] = coeff (written), [esp+8] = data (read), [esp+12] = quant, [esp+16] = dcscalar; eax is 0 on return
318 :			mov eax, [esp + 12] ; quant
319 :			mov ecx, [esp + 8] ; data
320 :			mov edx, [esp + 4] ; coeff
321 :			cmp al, 1 ; quant == 1 ? (consumed by the jz below; nothing in between writes EFLAGS)
322 :			pxor mm1, mm1
323 :			pxor mm3, mm3
324 :			movq mm0, [ecx] ; mm0 = [1st]
325 :			movq mm2, [ecx + 8]
326 :			push esi
327 :			lea esi, [mmx_div + eax*8 - 8] ; esi -> mmx_div entry for this quant (8 bytes per entry, first entry is quant 1)
328 :
329 :			push ebx
330 :			mov ebx, mmzero ; ebx -> 8 zero bytes, read by the macros as the constant 0
331 :			push edi
332 :			jz near .q1loop
333 :
334 :			quant_intra 0
335 :			mov ebp, [esp + 16 + 16] ; dcscalar
336 :			; NB -- there are 3 pushes in the function preamble and one more
337 :			; in "quant_intra 0", thus an added offset of 16 bytes
338 :			movsx eax, word [byte ecx] ; DC
339 :
340 :			quant_intra 1
341 :			mov edi, eax
342 :			sar edi, 31 ; sign(DC)
343 :			shr ebp, byte 1 ; ebp = dcscalar/2
344 :
345 :			quant_intra 2
346 :			sub eax, edi ; DC (+1)
347 :			xor ebp, edi ; sign(DC) dcscalar /2 (-1)
348 :			mov edi, [esp + 16 + 16] ; dcscalar (index into int_div for the imul inside "quant_intra 3")
349 :			lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2
350 :			mov ebp, [byte esp] ; reload ebp from the slot pushed by "quant_intra 0"
351 :
352 :			quant_intra 3
353 :			psubw mm5, mm4 ;C8
354 :			mov esi, [esp + 12] ; pop back the register value
355 :			mov edi, [esp + 4] ; pop back the register value
356 :			sar eax, 16 ; drop the 16 fraction bits of the int_div product
357 :			lea ebx, [byte eax + 1] ; workaround for eax < 0
358 :			cmovs eax, ebx ; conditionally move the corrected value
359 :			mov [edx], ax ; coeff[0] = ax
360 :			mov ebx, [esp + 8] ; pop back the register value
361 :			add esp, byte 16 ; drop the four saved slots (3 prologue pushes + ebp pushed by "quant_intra 0"); all four registers were reloaded with mov above
362 :			psubw mm7, mm6 ;D8
363 :			movq [edx + 3 * 32 + 16], mm5 ;C9
364 :			movq [edx + 3 * 32 + 24], mm7 ;D9
365 :
366 :			xor eax, eax
367 :			ret
368 :
369 :			ALIGN 16
370 :			; quant == 1 path: same sequence as above, built from quant_intra1 (shift) instead of quant_intra (multiply)
371 :			.q1loop
372 :			quant_intra1 0
373 :			mov ebp, [esp + 16 + 16] ; dcscalar
374 :			movsx eax, word [byte ecx] ; DC
375 :
376 :			quant_intra1 1
377 :			mov edi, eax
378 :			sar edi, 31 ; sign(DC)
379 :			shr ebp, byte 1 ; ebp = dcscalar /2
380 :
381 :			quant_intra1 2
382 :			sub eax, edi ; DC (+1)
383 :			xor ebp, edi ; sign(DC) dcscalar /2 (-1)
384 :			mov edi, [esp + 16 + 16] ; dcscalar (index into int_div for the imul inside "quant_intra1 3")
385 :			lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2
386 :			mov ebp, [byte esp] ; reload ebp from the slot pushed by "quant_intra1 0"
387 :
388 :			quant_intra1 3
389 :			psubw mm5, mm4 ;C8
390 :			mov esi, [dword esp + 12] ; pop back the register value
391 :			mov edi, [esp + 4] ; pop back the register value
392 :			sar eax, 16 ; drop the 16 fraction bits of the int_div product
393 :			lea ebx, [byte eax + 1] ; workaround for eax < 0
394 :			cmovs eax, ebx ; conditionally move the corrected value
395 :			mov [edx], ax ; coeff[0] = ax
396 :			mov ebx, [esp + 8] ; pop back the register value
397 :			add esp, byte 16 ; drop the four saved slots (3 prologue pushes + ebp pushed by "quant_intra1 0"); all four registers were reloaded with mov above
398 :			psubw mm7, mm6 ;D8
399 :			movq [edx + 3 * 32 + 16], mm5 ;C9
400 :			movq [edx + 3 * 32 + 24], mm7 ;D9
401 :
402 :			xor eax, eax
403 :			ret
404 :
405 :
406 :
407 :
408 :			;-----------------------------------------------------------------------------
409 :			;
410 :			; uint32_t quant_h263_inter_3dne(int16_t * coeff,
411 :			; const int16_t const * data,
412 :			; const uint32_t quant,
413 :			; const uint16_t *mpeg_matrices);
414 :			;
415 :			;-----------------------------------------------------------------------------
416 :			;This is Athlon-optimized code (ca 90 clk per call)
417 :			;Optimized by Jaan, 30 Nov 2002
418 :
419 :			; quantinter n (n = 0..4): one software-pipelined step over the 24-byte chunk n (groups A,B,C at offsets 0,8,16); expects eax -> zero, ebx -> sub table entry, mm7 = divider, mm5 = running sum
420 :			%macro quantinter 1
421 :			movq mm1, [eax] ;A2
422 :			psraw mm3, 15 ;B6
423 :			%if (%1)
424 :			psubw mm2, mm6 ;C10
425 :			%endif
426 :			psubw mm1, mm0 ;A3 mm1 = 0 - x
427 :			pmulhw mm4, mm7 ;B7
428 :			movq mm6, [ecx + %1*24+16] ;C1
429 :			pmaxsw mm1, mm0 ;A4 mm1 = max(-x, x) = |x|
430 :			paddw mm5, mm4 ;B8
431 :			%if (%1)
432 :			movq [edx + %1*24+16-24], mm2 ;C11 store group C of the previous chunk
433 :			%endif
434 :			psubusw mm1, [ebx] ;A5 mm1 -= sub (unsigned saturating, so it cannot go below 0)
435 :			pxor mm4, mm3 ;B9
436 :			movq mm2, [eax] ;C2
437 :			psraw mm0, 15 ;A6 mm0 = per-word sign mask of x
438 :			psubw mm4, mm3 ;B10
439 :			psubw mm2, mm6 ;C3
440 :			pmulhw mm1, mm7 ;A7 mm1 = (mm1 * divider) >> 16
441 :			movq mm3, [ecx + %1*24+8] ;B1
442 :			pmaxsw mm2, mm6 ;C4
443 :			paddw mm5, mm1 ;A8 sum += mm1
444 :			%if (%1)
445 :			movq [edx + %1*24+8-24], mm4 ;B11 store group B of the previous chunk
446 :			%else
447 :			movq [edx + 120], mm4 ;B11 first invocation: this B group is the one the caller preloaded from [ecx + 120]
448 :			%endif
449 :			psubusw mm2, [ebx] ;C5
450 :			pxor mm1, mm0 ;A9 A9+A10 re-apply the sign: (v xor mask) - mask
451 :			movq mm4, [eax] ;B2
452 :			psraw mm6, 15 ;C6
453 :			psubw mm1, mm0 ;A10 undisplace
454 :			psubw mm4, mm3 ;B3
455 :			pmulhw mm2, mm7 ;C7
456 :			movq mm0, [ecx + %1*24+24] ;A1 mm0 = group A of the next chunk
457 :			pmaxsw mm4, mm3 ;B4
458 :			paddw mm5, mm2 ;C8
459 :			movq [byte edx + %1*24], mm1 ;A11 store group A of this chunk
460 :			psubusw mm4, [ebx] ;B5
461 :			pxor mm2, mm6 ;C9
462 :			%endmacro
463 :			; quantinter1 n (n = 0..7): quantizer == 1 variant; handles the 16 bytes at offset n*16 in one pass; expects eax -> zero, mm6 = sub words, mm5 = running sum
464 :			%macro quantinter1 1
465 :			movq mm0, [byte ecx + %1*16] ;mm0 = [1st]
466 :			movq mm3, [ecx + %1*16+8] ;
467 :			movq mm1, [eax]
468 :			movq mm4, [eax]
469 :			psubw mm1, mm0 ; mm1 = 0 - x
470 :			psubw mm4, mm3
471 :			pmaxsw mm1, mm0 ; mm1 = max(-x, x) = |x|
472 :			pmaxsw mm4, mm3
473 :			psubusw mm1, mm6 ; mm1 -= sub (unsigned saturating, so it cannot go below 0)
474 :			psubusw mm4, mm6 ;
475 :			psraw mm0, 15 ; mm0 = per-word sign mask of x
476 :			psraw mm3, 15
477 :			psrlw mm1, 1 ; mm1 >>= 1 (shift used in place of the multiply of the general path)
478 :			psrlw mm4, 1 ;
479 :			paddw mm5, mm1 ; sum += mm1
480 :			pxor mm1, mm0 ; with the psubw below: re-apply the sign, (v xor mask) - mask
481 :			paddw mm5, mm4
482 :			pxor mm4, mm3 ;
483 :			psubw mm1, mm0 ; undisplace
484 :			psubw mm4, mm3
485 :			cmp esp, esp ; presumably a filler instruction; it only changes EFLAGS
486 :			movq [byte edx + %1*16], mm1
487 :			movq [edx + %1*16+8], mm4
488 :			%endmacro
489 :
490 :			ALIGN 16
491 :			cglobal quant_h263_inter_3dne
492 :			quant_h263_inter_3dne:
493 :			mov edx, [esp + 4] ; coeff
494 :			mov ecx, [esp + 8] ; data
495 :			mov eax, [esp + 12] ; quant
496 :			push ebx
497 :			; returns in eax the sum of the four 16-bit accumulator words of mm5 (the quantized magnitudes added with paddw)
498 :			pxor mm5, mm5 ; sum
499 :			nop
500 :			lea ebx,[mmx_sub + eax * 8 - 8] ; sub
501 :			movq mm7, [mmx_div + eax * 8 - 8] ; divider
502 :
503 :			cmp al, 1 ; quant == 1 ? (consumed by the jz below; the lea in between leaves EFLAGS alone)
504 :			lea eax, [mmzero] ; eax -> 8 zero bytes, read by the macros as the constant 0
505 :			jz near .q1loop
506 :			cmp esp, esp ; presumably a filler instruction; it only changes EFLAGS
507 :			ALIGN 8
508 :			movq mm3, [ecx + 120] ;B1 prime the pipeline with the last 8 bytes of the 128-byte block
509 :			pxor mm4, mm4 ;B2
510 :			psubw mm4, mm3 ;B3
511 :			movq mm0, [ecx] ;A1 mm0 = [1st]
512 :			pmaxsw mm4, mm3 ;B4
513 :			psubusw mm4, [ebx] ;B5
514 :
515 :			quantinter 0
516 :			quantinter 1
517 :			quantinter 2
518 :			quantinter 3
519 :			quantinter 4
520 :			; tail: finish the B and C groups left in flight by "quantinter 4"
521 :			psraw mm3, 15 ;B6
522 :			psubw mm2, mm6 ;C10
523 :			pmulhw mm4, mm7 ;B7
524 :			paddw mm5, mm4 ;B8
525 :			pxor mm4, mm3 ;B9
526 :			psubw mm4, mm3 ;B10
527 :			movq [edx + 4*24+16], mm2 ;C11
528 :			pop ebx
529 :			movq [edx + 4*24+8], mm4 ;B11
530 :			pmaddwd mm5, [plus_one] ; multiply by words of 1: adjacent sum words are added into two dwords
531 :			movq mm0, mm5
532 :			punpckhdq mm5, mm5 ; high dword copied into the low dword
533 :			paddd mm0, mm5
534 :			movd eax, mm0 ; return sum
535 :
536 :			ret
537 :
538 :			ALIGN 16
539 :			.q1loop
540 :			movq mm6, [byte ebx] ; mm6 = sub words from the mmx_sub entry selected above
541 :
542 :			quantinter1 0
543 :			quantinter1 1
544 :			quantinter1 2
545 :			quantinter1 3
546 :			quantinter1 4
547 :			quantinter1 5
548 :			quantinter1 6
549 :			quantinter1 7
550 :
551 :			pmaddwd mm5, [plus_one] ; multiply by words of 1: adjacent sum words are added into two dwords
552 :			movq mm0, mm5
553 :			psrlq mm5, 32 ; high dword moved into the low dword
554 :			paddd mm0, mm5
555 :			movd eax, mm0 ; return sum
556 :
557 :			pop ebx
558 :
559 :			ret
560 :
561 :			;-----------------------------------------------------------------------------
562 :			;
563 :			; uint32_t dequant_h263_intra_3dne(int16_t *data,
564 :			; const int16_t const *coeff,
565 :			; const uint32_t quant,
566 :			; const uint32_t dcscalar,
567 :			; const uint16_t *mpeg_matrices);
568 :			;
569 :			;-----------------------------------------------------------------------------
570 :
571 :			; this is the same as dequant_inter_3dne, except that we're
572 :			; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
573 :
574 :			;This is Athlon-optimized code (ca 106 clk per call)
575 :			; dequant n (n = 0..4): one software-pipelined step over the 24-byte chunk n (groups A,B,C at offsets 0,8,16); expects esi -> zero, eax -> add words, edi -> 2Q words, ebx -> 2047 words. Contains no flag-writing instruction (callers keep a cmp result live across it)
576 :			%macro dequant 1
577 :			movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2
578 :			psubw mm0, mm1 ;-c ;A3 (1st dep)
579 :			%if (%1)
580 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
581 :			%endif
582 :			pmaxsw mm0, mm1 ;|c| ;A4 (2nd)
583 :			%if (%1)
584 :			mov ebp, ebp ; register moved onto itself: no register or flag change, presumably padding
585 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later
586 :			%endif
587 :			movq mm6, [esi] ;0 ;A5 mm6 in use
588 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
589 :			%if (%1)
590 :			pxor mm5, mm4 ;C13 (6th+) 1later
591 :			%endif
592 :			movq mm4, [esi] ;C1 ;0
593 :			mov esp, esp ; register moved onto itself: no register or flag change, presumably padding
594 :			pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st)
595 :			ALIGN 4
596 :			psraw mm1, 15 ; sign(c) ;A7 (2nd)
597 :			%if (%1)
598 :			movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later
599 :			%endif
600 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
601 :			pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+)
602 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
603 :			lea ebp, [byte ebp] ; ebp = ebp + 0: no register or flag change, presumably padding
604 :			movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i]
605 :			psubw mm4, mm5 ;-c ;C3 (1st dep)
606 :			pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd)
607 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
608 :			pxor mm3, mm2 ;B13 (6th+) xor with the sign mask turns the saturated magnitude back into a signed value
609 :			movq mm2, [byte esi] ;B1 ;0
610 :			%if (%1)
611 :			movq [edx+%1*24+8-24], mm3 ;B14 (7th)
612 :			%else
613 :			movq [edx+120], mm3 ;B14 first invocation: this B group is the one the caller preloaded from [ecx+120]
614 :			%endif
615 :			pmaxsw mm4, mm5 ;|c| ;C4 (2nd)
616 :			paddw mm6, mm1 ;A10 offset +negate back (3rd)
617 :			movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i]
618 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
619 :			paddw mm0, mm6 ;A11 mm6 free (4th+)
620 :			movq mm6, [byte esi] ;0 ;C5 mm6 in use
621 :			pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st)
622 :			pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+)
623 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
624 :			pxor mm1, mm0 ;A13 (6th+)
625 :			pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+)
626 :			psraw mm5, 15 ; sign(c) ;C7 (2nd)
627 :			movq mm7, [byte esi] ;0 ;B5 mm7 in use
628 :			pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st)
629 :			%if (%1 < 4)
630 :			movq mm0, [byte esi] ;A1 ;0
631 :			%endif
632 :			pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd)
633 :			psraw mm3, 15 ;sign(c) ;B7 (2nd)
634 :			movq [byte edx+%1*24], mm1 ;A14 (7th)
635 :			paddw mm6, mm5 ;C10 offset +negate back (3rd)
636 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
637 :			mov esp, esp ; register moved onto itself: no register or flag change, presumably padding
638 :			%endmacro
639 :
640 :
641 :			ALIGN 16
642 :			cglobal dequant_h263_intra_3dne
643 :			dequant_h263_intra_3dne:
644 :			mov ecx, [esp+ 8] ; coeff
645 :			mov eax, [esp+12] ; quant
646 :			pxor mm0, mm0
647 :			pxor mm2, mm2
648 :			push edi
649 :			push ebx
650 :			lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
651 :			push ebp
652 :			mov ebx, mmx_2047 ; ebx -> words of 2047 (upper saturation bound used by pminsw)
653 :			movsx ebp, word [ecx] ; ebp = coeff[0] (DC), sign-extended
654 :			lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
655 :			push esi
656 :			mov esi, mmzero ; esi -> 8 zero bytes, read by the macro as the constant 0
657 :			pxor mm7, mm7
658 :			movq mm3, [ecx+120] ;B2 ; c = coeff[i]
659 :			pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
660 :			; four registers have been pushed, so the original [esp+N] arguments now sit at [esp+N+16]
661 :			imul ebp, [esp+16+16] ; dcscalar
662 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
663 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
664 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
665 :			psraw mm3, 15 ; sign(c) ;B7 (2nd)
666 :			mov edx, [esp+ 4+16] ; data
667 :
668 :			ALIGN 8
669 :			dequant 0
670 :			; clamp DC * dcscalar to [-2048, 2047]; each cmp result stays live across the following dequant step, which writes no flags
671 :			cmp ebp, -2048
672 :			mov esp, esp
673 :
674 :			dequant 1
675 :
676 :			cmovl ebp, [int_2048]
677 :			nop
678 :
679 :			dequant 2
680 :
681 :			cmp ebp, 2047
682 :			mov esp, esp
683 :
684 :			dequant 3
685 :
686 :			cmovg ebp, [int2047]
687 :			nop
688 :
689 :			dequant 4
690 :			; tail: finish the B and C groups left in flight by "dequant 4" and reload the saved registers
691 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
692 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
693 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
694 :			mov eax, ebp ; eax = clamped DC value
695 :			mov esi, [esp] ; reload saved esi
696 :			mov ebp, [esp+4] ; reload saved ebp
697 :			pxor mm5, mm4 ;C13 (6th+)
698 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
699 :			movq [edx+4*24+16], mm5 ;C14 (7th)
700 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
701 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
702 :			mov ebx, [esp+8] ; reload saved ebx
703 :			mov edi, [esp+12] ; reload saved edi
704 :			add esp, byte 16 ; drop the four saved slots
705 :			pxor mm3, mm2 ;B13 (6th+)
706 :			movq [edx+4*24+8], mm3 ;B14 (7th)
707 :			mov [edx], ax ; data[0] = clamped DC (overwrites the value the macro stored there)
708 :
709 :			xor eax, eax
710 :			ret
711 :
712 :			;-----------------------------------------------------------------------------
713 :			;
714 :			; uint32_t dequant_h263_inter_3dne(int16_t * data,
715 :			; const int16_t * const coeff,
716 :			; const uint32_t quant,
717 :			; const uint16_t *mpeg_matrices);
718 :			;
719 :			;-----------------------------------------------------------------------------
720 :
721 :			; this is the same as dequant_inter_3dne,
722 :			; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
723 :			; This is Athlon-optimized code (ca 100 clk per call)
724 :
725 :			ALIGN 16
726 :			cglobal dequant_h263_inter_3dne
727 :			dequant_h263_inter_3dne:
728 :			mov ecx, [esp+ 8] ; coeff
729 :			mov eax, [esp+12] ; quant
730 :			pxor mm0, mm0
731 :			pxor mm2, mm2
732 :			push edi
733 :			push ebx
734 :			push esi
735 :			lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
736 :			mov ebx, mmx_2047 ; ebx -> words of 2047 (upper saturation bound used by pminsw)
737 :			pxor mm7, mm7
738 :			movq mm3, [ecx+120] ;B2 ; c = coeff[i]
739 :			pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
740 :			lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
741 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
742 :			mov esi, mmzero ; esi -> 8 zero bytes, read by the macro as the constant 0
743 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
744 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
745 :			psraw mm3, 15 ; sign(c) ;B7 (2nd)
746 :			mov edx, [dword esp+ 4+12] ; data (three registers pushed, hence the extra 12)
747 :
748 :			ALIGN 8
749 :
750 :			dequant 0
751 :			dequant 1
752 :			dequant 2
753 :			dequant 3
754 :			dequant 4
755 :			; tail: finish the B and C groups left in flight by "dequant 4" and reload the saved registers
756 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
757 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
758 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
759 :			mov esi, [esp] ; reload saved esi
760 :			pxor mm5, mm4 ;C13 (6th+)
761 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
762 :			movq [edx+4*24+16], mm5 ;C14 (7th)
763 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
764 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
765 :			mov ebx, [esp+4] ; reload saved ebx
766 :			mov edi, [esp+8] ; reload saved edi
767 :			add esp, byte 12 ; drop the three saved slots
768 :			pxor mm3, mm2 ;B13 (6th+)
769 :			movq [edx+4*24+8], mm3 ;B14 (7th)
770 :
771 :			xor eax, eax
772 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4