Annotation of /xvidcore/src/quant/x86_64_asm/quantize_mpeg_xmm.asm

Revision 1.1 - (view) (download)

1 :	edgomez	1.1	;/****************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - 3dne Quantization/Dequantization -
5 :			; *
6 :			; * Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
7 :			; * 2002 Jaan Kalda
8 :			; * 2004 Andre Werthmann <wertmann@aei.mpg.de>
9 :			; *
10 :			; * This program is free software ; you can redistribute it and/or modify
11 :			; * it under the terms of the GNU General Public License as published by
12 :			; * the Free Software Foundation ; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :			; *
15 :			; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :			; *
20 :			; * You should have received a copy of the GNU General Public License
21 :			; * along with this program ; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :			; *
24 :			; * $Id$
25 :			; *
26 :			; ***************************************************************************/
27 :
28 :			; _3dne functions are compatible with iSSE, but are optimized specifically
29 :			; for K7 pipelines
30 :
31 :			%define SATURATE
32 :
33 :			BITS 64
34 :
35 :			%macro cglobal 1 ; declare the argument as a global symbol; two build switches alter the form
36 :			%ifdef PREFIX ; PREFIX: export with a leading underscore and alias the plain name to it
37 :			%ifdef MARK_FUNCS ; MARK_FUNCS: also attach ':function <size>' to the global declaration
38 :			global _%1:function %1.endfunc-%1
39 :			%define %1 _%1:function %1.endfunc-%1
40 :			%else ; NOTE(review): the alias above carries the ':function ...' suffix into later uses of the name; confirm intended
41 :			global _%1
42 :			%define %1 _%1
43 :			%endif
44 :			%else ; no PREFIX: export the name unchanged
45 :			%ifdef MARK_FUNCS ; size is <name>.endfunc - <name>, so every exported routine needs a local .endfunc label
46 :			global %1:function %1.endfunc-%1
47 :			%else
48 :			global %1
49 :			%endif
50 :			%endif
51 :			%endmacro
52 :
53 :			;=============================================================================
54 :			; Local data
55 :			;=============================================================================
56 :
57 :			%ifdef FORMAT_COFF
58 :			SECTION .rodata
59 :			%else
60 :			SECTION .rodata align=16
61 :			%endif
62 :
63 :			ALIGN 8
64 :			mmzero: ; 8 zero bytes, used as an all-zero 64-bit MMX memory operand
65 :			dd 0,0
66 :			mmx_one: ; four words of 1 (pmaddwd operand for summing word lanes)
67 :			times 4 dw 1
68 :
69 :			;-----------------------------------------------------------------------------
70 :			; mmx_divs: 31 entries, entry i-1 = four copies of (1 << 15) / i + 1, i = 1..31
71 :			;-----------------------------------------------------------------------------
72 :
73 :			ALIGN 16
74 :			mmx_divs: ; indexed as [mmx_divs + quant*8 - 8]
75 :			%assign i 1
76 :			%rep 31
77 :			times 4 dw ((1 << 15) / i + 1)
78 :			%assign i i+1
79 :			%endrep
80 :			; mmx_div: 32 entries; entry 0 is 65535, then (1 << 16) / quant + 1 for quant = 2..32
81 :			ALIGN 16
82 :			mmx_div: ; indexed as [mmx_div + quant*8 - 8]
83 :			times 4 dw 65535 ; the div by 2 formula will overflow for the case
84 :			; quant=1 but we don't care much because quant=1
85 :			; is handled by a different piece of code that
86 :			; doesn't use this table.
87 :			%assign quant 2
88 :			%rep 31
89 :			times 4 dw ((1 << 16) / quant + 1)
90 :			%assign quant quant+1
91 :			%endrep
92 :			; FIXX n: emits one word holding (1 << 16) / n + 1; not invoked in the code visible here
93 :			%macro FIXX 1
94 :			dw (1 << 16) / (%1) + 1
95 :			%endmacro
96 :			; padding helpers inherited from the 32-bit source; they only appear in commented-out lines below
97 :			%define nop4 db 08Dh, 074h, 026h,0
98 :			%define nop3 add esp, byte 0
99 :			%define nop2 mov esp, esp
100 :			%define nop7 db 08dh, 02ch, 02dh,0,0,0,0
101 :			%define nop6 add ebp, dword 0
102 :
103 :			;-----------------------------------------------------------------------------
104 :			; quantd table: entry i-1 = four copies of (VM18P*i + VM18Q/2) / VM18Q, i = 1..31
105 :			;-----------------------------------------------------------------------------
106 :
107 :			%define VM18P 3
108 :			%define VM18Q 4
109 :
110 :			ALIGN 16
111 :			quantd: ; indexed as [quantd + quant*8 - 8]
112 :			%assign i 1
113 :			%rep 31
114 :			times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q)
115 :			%assign i i+1
116 :			%endrep
117 :
118 :			;-----------------------------------------------------------------------------
119 :			; multiply-by-quant table: entry i-1 = four copies of i, i = 1..31
120 :			;-----------------------------------------------------------------------------
121 :
122 :			ALIGN 16
123 :			mmx_mul_quant: ; indexed as [mmx_mul_quant + quant*8 - 8]
124 :			%assign i 1
125 :			%rep 31
126 :			times 4 dw i
127 :			%assign i i+1
128 :			%endrep
129 :
130 :			;-----------------------------------------------------------------------------
131 :			; saturation limits (none of these labels is referenced by the code visible in this file)
132 :			;-----------------------------------------------------------------------------
133 :
134 :			ALIGN 16
135 :			mmx_32767_minus_2047:
136 :			times 4 dw (32767-2047)
137 :			mmx_32768_minus_2048:
138 :			times 4 dw (32768-2048)
139 :			mmx_2047:
140 :			times 4 dw 2047
141 :			mmx_minus_2048:
142 :			times 4 dw (-2048)
143 :			zero:
144 :			times 4 dw 0
145 :			; int_div: 256 dwords; entry 0 is 0, entry i is (1 << 17) / i + 1 (reciprocal used for the intra DC divide)
146 :			int_div:
147 :			dd 0
148 :			%assign i 1
149 :			%rep 255
150 :			dd (1 << 17) / ( i) + 1
151 :			%assign i i+1
152 :			%endrep
153 :
154 :			;=============================================================================
155 :			; Code
156 :			;=============================================================================
157 :
158 :			SECTION .text align=16
159 :			; export the four entry points defined below through the cglobal macro
160 :			cglobal quant_mpeg_intra_x86_64
161 :			cglobal quant_mpeg_inter_x86_64
162 :			cglobal dequant_mpeg_intra_x86_64
163 :			cglobal dequant_mpeg_inter_x86_64
164 :
165 :			;-----------------------------------------------------------------------------
166 :			;
167 :			; uint32_t quant_mpeg_intra_x86_64(int16_t * coeff,
168 :			; const int16_t const * data,
169 :			; const uint32_t quant,
170 :			; const uint32_t dcscalar,
171 :			; const uint16_t *mpeg_matrices);
172 :			; Ported from its 32bit xmm cousin
173 :			;-----------------------------------------------------------------------------
174 :
175 :			ALIGN 16
176 :			quant_mpeg_intra_x86_64:
			; Intra quantisation of one block: 8 passes of 8 int16 values each,
			; then a separate reciprocal divide for the DC term written to coeff[0].
			; In (System V AMD64): rdi = coeff (output), rsi = data (input),
			;     edx = quant, ecx = dcscalar, r8 = mpeg_matrices.
			; Out: rax = 0.
			; After the shuffle below: rax = data, rdx = coeff, rcx = quant,
			;     rdi = matrices, r9 = dcscalar, rsi = loop index (-14..0 step 2).
			; Tables read per coefficient: [rdi+0], [rdi+128], [rdi+256], [rdi+384].
			; Dispatch: quant == 1 -> .q1loop, quant > 19 -> .lloop, otherwise .loop.
			; Clobbers rcx, rdx, rsi, rdi, r9, r11, mm0-mm7, flags; rbx is saved and
			; restored. No emms is executed in this routine.
177 :			mov rax, rsi ; data
178 :			mov r9d, ecx ; save dcscalar, zero-extended: bits 63:32 of a 32-bit argument register are unspecified
179 :			mov ecx, edx ; quant, zero-extended so the 64-bit compares and [r11 + rcx*8] lookups are safe
180 :			mov rdx, rdi ; coeff

182 :			push rbx

184 :			mov rdi, r8 ; mpeg_quant_matrices

186 :			mov rsi, -14 ; 8*rsi+112 walks byte offsets 0, 16, ..., 112
187 :			pxor mm0, mm0
188 :			pxor mm3, mm3
189 :			cmp rcx, byte 1
190 :			je near .q1loop
191 :			cmp rcx, byte 19
192 :			jg near .lloop

194 :			ALIGN 16
195 :			.loop:
196 :			movq mm1, [rax + 8*rsi+112] ; mm1 = first four source words of this pass
197 :			psubw mm0, mm1 ;-mm1
198 :			movq mm4, [rax + 8*rsi + 120] ; mm4 = next four source words
199 :			psubw mm3, mm4 ;-mm4
200 :			pmaxsw mm0, mm1 ;|src|
201 :			pmaxsw mm3,mm4
202 :			; nop2
203 :			psraw mm1, 15 ;sign src
204 :			psraw mm4, 15
205 :			psllw mm0, 4 ;level << 4 ;
206 :			psllw mm3, 4
207 :			paddw mm0, [rdi + 128 + 8*rsi+112] ; mm0 is to be divided
208 :			paddw mm3, [rdi + 128 + 8*rsi+120]
209 :			movq mm5, [rdi + 384 + 8*rsi+112]
210 :			movq mm7, [rdi + 384 + 8*rsi+120]
211 :			pmulhuw mm5, mm0 ; first approx of division
212 :			pmulhuw mm7, mm3
213 :			; mov esp, esp
214 :			movq mm2, [rdi + 8*rsi+112]
215 :			movq mm6, [rdi + 8*rsi+120]
216 :			pmullw mm2, mm5 ; test value <= original
217 :			pmullw mm6, mm7
218 :			psubw mm0, mm2 ; mismatch
219 :			psubw mm3, mm6
220 :			; nop4
221 :			lea r11, [quantd wrt rip]
222 :			movq mm2, [r11 + rcx * 8 - 8] ; quantd entry for this quant
223 :			lea r11, [mmx_divs wrt rip]
224 :			movq mm6, [r11 + rcx * 8 - 8] ; (1 << 15) / quant + 1
225 :			paddw mm5, mm2 ; first approx with quantd
226 :			paddw mm7, mm2
227 :			; mov esp, esp
228 :			pmulhuw mm0, [rdi + 256 + 8*rsi+112] ; correction
229 :			pmulhuw mm3, [rdi + 256 + 8*rsi+120]
230 :			paddw mm5, mm0 ; final result with quantd
231 :			paddw mm7, mm3
232 :			pxor mm0, mm0 ; re-zero for the next pass's negate
233 :			pxor mm3, mm3
234 :			pmulhuw mm5, mm6 ; mm5 = (mm5 / 2Q) >> 16
235 :			pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32)
236 :			pxor mm5, mm1 ; mm5 *= sign(src)
237 :			pxor mm7, mm4 ;
238 :			psubw mm5, mm1 ; undisplace
239 :			psubw mm7, mm4 ;
240 :			movq [rdx + 8*rsi+112], mm5
241 :			movq [rdx + 8*rsi +120], mm7
242 :			add rsi, byte 2
243 :			jng near .loop ; loop while the updated index is <= 0

245 :			.done:
246 :			; DC term: coeff[0] = (data[0] +/- dcscalar/2) / dcscalar, using the int_div reciprocal table
247 :			; mov esi, [esp + 12 + 16] ; dcscalar
248 :			mov rsi, r9 ; dcscalar
249 :			movsx rcx, word [rax]
250 :			mov rdi, rcx
251 :			; mov edx, [esp + 12 + 16]
252 :			mov r11, rdx ; save rdx
253 :			mov rdx, r9 ;
254 :			shr edx, 1 ; edx = dcscalar /2
255 :			sar edi, 31 ; edi = sign mask of data[0] (cdq is vectorpath)
256 :			xor edx, edi ; edx = d/2 if data[0] >= 0, else -(d/2) - 1
257 :			sub ecx, edi
258 :			add ecx, edx ; ecx = data[0] + d/2, or data[0] - d/2 when negative
259 :			;; mov rdx, [dword esp + 12 + 4]
260 :			mov rdx, r11 ; restore rdx
261 :			lea r11, [int_div wrt rip]
262 :			mov esi, [r11+4*rsi] ; int_div holds dwords: a qword load would read past the table at its last entry
263 :			imul ecx, esi
264 :			sar ecx, 17
265 :			lea rbx, [byte rcx + 1] ; lea leaves the flags from sar intact
266 :			cmovs rcx, rbx ; negative quotient: add 1
267 :			; idiv cx ; ecx = edi:ecx / dcscalar

269 :			; mov ebx, [esp]
270 :			; mov edi, [esp+4]
271 :			; mov esi, [esp+8]
272 :			; add esp, byte 12 ; pops...
273 :			pop rbx
274 :			; mov [rdx], rcx ; coeff[0] = ax
275 :			mov [rdx], cx ; coeff[0] = cx

277 :			xor rax, rax
278 :			ret

280 :			ALIGN 16
281 :			.q1loop:
282 :			movq mm1, [rax + 8*rsi+112] ; mm1 = first four source words of this pass
283 :			psubw mm0, mm1 ;-mm1
284 :			movq mm4, [rax + 8*rsi+120] ;
285 :			psubw mm3, mm4 ;-mm4
286 :			pmaxsw mm0, mm1 ;|src|
287 :			pmaxsw mm3, mm4
288 :			; nop2
289 :			psraw mm1, 15 ;sign src
290 :			psraw mm4, 15
291 :			psllw mm0, 4 ; level << 4
292 :			psllw mm3, 4
293 :			paddw mm0, [rdi + 128 + 8*rsi+112] ;mm0 is to be divided
294 :			paddw mm3, [rdi + 128 + 8*rsi+120] ;intra1 contains fix for division by 1
295 :			movq mm5, [rdi + 384 + 8*rsi+112] ;with rounding down
296 :			movq mm7, [rdi + 384 + 8*rsi+120]
297 :			pmulhuw mm5, mm0
298 :			pmulhuw mm7, mm3 ;mm7: first approx of division
299 :			; mov esp, esp
300 :			movq mm2, [rdi + 8*rsi+112]
301 :			movq mm6, [rdi + 8*rsi+120]
302 :			pmullw mm2, mm5 ;test value <= original
303 :			pmullw mm6, mm7
304 :			psubw mm0, mm2 ;mismatch
305 :			psubw mm3, mm6
306 :			; nop4
307 :			lea r11, [quantd wrt rip]
308 :			movq mm2, [r11 + rcx * 8 - 8]
309 :			paddw mm5, mm2 ;first approx with quantd
310 :			paddw mm7, mm2
311 :			; mov esp, esp
312 :			pmulhuw mm0, [rdi + 256 + 8*rsi+112] ;correction
313 :			pmulhuw mm3, [rdi + 256 + 8*rsi+120]
314 :			paddw mm5, mm0 ;final result with quantd
315 :			paddw mm7, mm3
316 :			pxor mm0, mm0
317 :			pxor mm3, mm3
318 :			; mov esp, esp
319 :			psrlw mm5, 1 ; (level + quantd) /2 (quant = 1)
320 :			psrlw mm7, 1
321 :			pxor mm5, mm1 ; mm5 *= sign(src)
322 :			pxor mm7, mm4 ;
323 :			psubw mm5, mm1 ; undisplace
324 :			psubw mm7, mm4 ;
325 :			movq [rdx + 8*rsi+112], mm5
326 :			movq [rdx + 8*rsi +120], mm7
327 :			add rsi, byte 2
328 :			jng near .q1loop
329 :			jmp near .done

331 :			ALIGN 8
332 :			.lloop:
333 :			movq mm1, [rax + 8*rsi+112] ; mm1 = first four source words of this pass
334 :			psubw mm0, mm1 ;-mm1
335 :			movq mm4, [rax + 8*rsi+120]
336 :			psubw mm3, mm4 ;-mm4
337 :			pmaxsw mm0, mm1 ;|src|
338 :			pmaxsw mm3, mm4
339 :			; nop2
340 :			psraw mm1, 15 ;sign src
341 :			psraw mm4, 15
342 :			psllw mm0, 4 ; level << 4
343 :			psllw mm3, 4 ;
344 :			paddw mm0, [rdi + 128 + 8*rsi+112] ;mm0 is to be divided intra1 contains fix for division by 1
345 :			paddw mm3, [rdi + 128 + 8*rsi+120]
346 :			movq mm5, [rdi + 384 + 8*rsi+112]
347 :			movq mm7, [rdi + 384 + 8*rsi+120]
348 :			pmulhuw mm5, mm0
349 :			pmulhuw mm7, mm3 ;mm7: first approx of division
350 :			; mov esp, esp
351 :			movq mm2, [rdi + 8*rsi+112]
352 :			movq mm6, [rdi + 8*rsi+120]
353 :			pmullw mm2, mm5 ;test value <= original
354 :			pmullw mm6, mm7
355 :			psubw mm0, mm2 ;mismatch
356 :			psubw mm3, mm6
357 :			; nop4
358 :			lea r11, [quantd wrt rip]
359 :			movq mm2, [r11 + rcx * 8 - 8]
360 :			lea r11, [mmx_div wrt rip]
361 :			movq mm6, [r11 + rcx * 8 - 8] ; (1 << 16) / quant + 1
362 :			paddw mm5, mm2 ;first approx with quantd
363 :			paddw mm7, mm2
364 :			; mov esp, esp
365 :			pmulhuw mm0, [rdi + 256 + 8*rsi+112] ;correction
366 :			pmulhuw mm3, [rdi + 256 + 8*rsi+120]
367 :			paddw mm5, mm0 ;final result with quantd
368 :			paddw mm7, mm3
369 :			pxor mm0, mm0
370 :			pxor mm3, mm3
371 :			; mov esp, esp
372 :			pmulhuw mm5, mm6 ; mm5 = (mm5 / Q) >> 16
373 :			pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32)
374 :			psrlw mm5, 1 ; (level + quantd) / (2*quant)
375 :			psrlw mm7, 1
376 :			pxor mm5, mm1 ; mm5 *= sign(src)
377 :			pxor mm7, mm4 ;
378 :			psubw mm5, mm1 ; undisplace
379 :			psubw mm7, mm4 ;
380 :			movq [rdx + 8*rsi+112], mm5
381 :			movq [rdx + 8*rsi +120], mm7
382 :			add rsi,byte 2
383 :			jng near .lloop
384 :			jmp near .done
385 :			.endfunc:
386 :
387 :			;-----------------------------------------------------------------------------
388 :			;
389 :			; uint32_t quant_mpeg_inter_x86_64(int16_t * coeff,
390 :			; const int16_t const * data,
391 :			; const uint32_t quant,
392 :			; const uint16_t *mpeg_matrices);
393 :			; Ported from its 32bit xmm cousin
394 :			;-----------------------------------------------------------------------------
395 :
396 :			ALIGN 16
397 :			quant_mpeg_inter_x86_64:
			; Inter quantisation of one block: 8 passes of 8 int16 values each.
			; In (System V AMD64): rdi = coeff (output), rsi = data (input),
			;     edx = quant, rcx = mpeg_matrices.
			; Out: eax = sum of the quantised magnitudes (accumulated in 16-bit lanes
			;     with paddw, then folded to one dword at .done).
			; After the shuffle below: rax = data, rdx = coeff, rcx = quant,
			;     rdi = matrices, rsi = loop index (-14..0 step 2).
			; Tables read per coefficient: [rdi+512], [rdi+640], [rdi+768], [rdi+896].
			; Scratch: 24 bytes of stack; [rbx] = second half-result of the previous
			;     pass, [rbx+8] = running sum.
			; Dispatch: quant == 1 -> .q1loop, quant > 19 -> .lloop, otherwise .loop.
			; Clobbers rcx, rdx, rsi, rdi, r8, r11, mm0-mm7, flags; rbx is saved and
			; restored. No emms is executed in this routine.
398 :			mov rax, rsi ; data
399 :			mov r8, rdi ; save coeff
400 :			mov rdi, rcx ; mpeg_matrices
401 :			mov ecx, edx ; quant, zero-extended: bits 63:32 of a 32-bit argument register are unspecified
402 :			mov rdx, r8 ; coeff

404 :			push rbx

406 :			mov rsi, -14 ; 8*rsi+112 walks byte offsets 0, 16, ..., 112
407 :			; rbx is pointed at the scratch area by the lea/and pair below
408 :			sub rsp, byte 24 ; 16 would be enough, but it isn't important
409 :			lea rbx, [rsp+8]
410 :			and rbx, byte -8 ;ALIGN 8
411 :			pxor mm0, mm0
412 :			pxor mm3, mm3
413 :			movq [byte rbx],mm0 ; previous half-result = 0
414 :			movq [rbx+8],mm0 ; running sum = 0
415 :			cmp rcx, byte 1
416 :			je near .q1loop
417 :			cmp rcx, byte 19
418 :			jg near .lloop

420 :			ALIGN 16
421 :			.loop:
422 :			movq mm1, [rax + 8*rsi+112] ; mm1 = first four source words of this pass
423 :			psubw mm0, mm1 ;-mm1
424 :			movq mm4, [rax + 8*rsi + 120] ; mm4 = next four source words
425 :			psubw mm3, mm4 ;-mm4
426 :			pmaxsw mm0, mm1 ;|src|
427 :			pmaxsw mm3, mm4
428 :			; nop2
429 :			psraw mm1, 15 ;sign src
430 :			psraw mm4, 15
431 :			psllw mm0, 4 ; level << 4
432 :			psllw mm3, 4 ;
433 :			paddw mm0, [rdi + 640 + 8*rsi+112] ; mm0 is to be divided
434 :			paddw mm3, [rdi + 640 + 8*rsi+120]
435 :			movq mm5, [rdi + 896 + 8*rsi+112]
436 :			movq mm7, [rdi + 896 + 8*rsi+120]
437 :			pmulhuw mm5, mm0 ; first approx of division
438 :			pmulhuw mm7, mm3
439 :			; mov esp, esp
440 :			movq mm2, [rdi + 512 + 8*rsi+112]
441 :			movq mm6, [rdi + 512 + 8*rsi+120]
442 :			pmullw mm2, mm5 ; test value <= original
443 :			pmullw mm6, mm7
444 :			psubw mm0, mm2 ; mismatch
445 :			psubw mm3, mm6
446 :			movq mm2, [byte rbx] ; second half-result of the previous pass
447 :			lea r11, [mmx_divs wrt rip]
448 :			movq mm6, [r11 + rcx * 8 - 8] ; (1 << 15) / quant + 1
449 :			pmulhuw mm0, [rdi + 768 + 8*rsi+112] ; correction
450 :			pmulhuw mm3, [rdi + 768 + 8*rsi+120]
451 :			paddw mm2, [rbx+8] ;sum
452 :			paddw mm5, mm0 ; final result
453 :			paddw mm7, mm3
454 :			pxor mm0, mm0
455 :			pxor mm3, mm3
456 :			pmulhuw mm5, mm6 ; mm5 = (mm5 / 2Q) >> 16
457 :			pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32)
458 :			add rsi, byte 2 ; flags stay live until the jng: only MMX instructions follow
459 :			paddw mm2, mm5 ;sum += x1
460 :			movq [rbx], mm7 ;store x2
461 :			pxor mm5, mm1 ; mm5 *= sign(src)
462 :			pxor mm7, mm4 ;
463 :			psubw mm5, mm1 ; undisplace
464 :			psubw mm7, mm4 ;
465 :			; db 0Fh, 7Fh, 54h, 23h, 08 ;movq [ebx+8],mm2 ;store sum
466 :			movq [rbx+8], mm2 ;store sum
467 :			movq [rdx + 8*rsi+112-16], mm5 ; -16 compensates for the index update above
468 :			movq [rdx + 8*rsi +120-16], mm7
469 :			jng near .loop

471 :			.done:
472 :			; fold the running sum of quantised magnitudes into the return value
473 :			paddw mm2, [rbx] ; add the last pass's second half-result
474 :			add rsp, byte 24
475 :			pop rbx
476 :			pmaddwd mm2, [mmx_one wrt rip] ; four word lanes -> two dword sums
477 :			punpckldq mm0, mm2 ;get low dw to mm0:high
478 :			paddd mm0,mm2
479 :			punpckhdq mm0, mm0 ;get result to low
480 :			movd rax, mm0

482 :			ret

484 :			ALIGN 16
485 :			.q1loop:
486 :			movq mm1, [rax + 8*rsi+112] ; mm1 = first four source words of this pass
487 :			psubw mm0, mm1 ;-mm1
488 :			movq mm4, [rax + 8*rsi+120]
489 :			psubw mm3, mm4 ;-mm4
490 :			pmaxsw mm0, mm1 ;|src|
491 :			pmaxsw mm3, mm4
492 :			; nop2
493 :			psraw mm1, 15 ; sign src
494 :			psraw mm4, 15
495 :			psllw mm0, 4 ; level << 4
496 :			psllw mm3, 4
497 :			paddw mm0, [rdi + 640 + 8*rsi+112] ;mm0 is to be divided
498 :			paddw mm3, [rdi + 640 + 8*rsi+120] ; inter1 contains fix for division by 1
499 :			movq mm5, [rdi + 896 + 8*rsi+112] ;with rounding down
500 :			movq mm7, [rdi + 896 + 8*rsi+120]
501 :			pmulhuw mm5, mm0
502 :			pmulhuw mm7, mm3 ;mm7: first approx of division
503 :			; mov esp, esp
504 :			movq mm2, [rdi + 512 + 8*rsi+112]
505 :			movq mm6, [rdi + 512 + 8*rsi+120]
506 :			pmullw mm2, mm5 ;test value <= original
507 :			pmullw mm6, mm7
508 :			psubw mm0, mm2 ;mismatch
509 :			psubw mm3, mm6
510 :			movq mm2, [byte rbx]
511 :			pmulhuw mm0, [rdi + 768 + 8*rsi+112] ;correction
512 :			pmulhuw mm3, [rdi + 768 + 8*rsi+120]
513 :			paddw mm2, [rbx+8] ;sum
514 :			paddw mm5, mm0 ;final result
515 :			paddw mm7, mm3
516 :			pxor mm0, mm0
517 :			pxor mm3, mm3
518 :			psrlw mm5, 1 ; (level ) /2 (quant = 1)
519 :			psrlw mm7, 1
520 :			add rsi, byte 2
521 :			paddw mm2, mm5 ;sum += x1
522 :			movq [rbx], mm7 ;store x2
523 :			pxor mm5, mm1 ; mm5 *= sign(src)
524 :			pxor mm7, mm4 ;
525 :			psubw mm5, mm1 ; undisplace
526 :			psubw mm7, mm4 ;
527 :			movq [rbx+8], mm2 ;store sum
528 :			movq [rdx + 8*rsi+112-16], mm5
529 :			movq [rdx + 8*rsi +120-16], mm7
530 :			jng near .q1loop
531 :			jmp near .done

533 :			ALIGN 8
534 :			.lloop:
535 :			movq mm1, [rax + 8*rsi+112] ; mm1 = first four source words of this pass
536 :			psubw mm0,mm1 ;-mm1
537 :			movq mm4, [rax + 8*rsi+120]
538 :			psubw mm3,mm4 ;-mm4
539 :			pmaxsw mm0,mm1 ;|src|
540 :			pmaxsw mm3,mm4
541 :			; nop2
542 :			psraw mm1,15 ;sign src
543 :			psraw mm4,15
544 :			psllw mm0, 4 ; level << 4
545 :			psllw mm3, 4 ;
546 :			paddw mm0, [rdi + 640 + 8*rsi+112] ;mm0 is to be divided inter1 contains fix for division by 1
547 :			paddw mm3, [rdi + 640 + 8*rsi+120]
548 :			movq mm5,[rdi + 896 + 8*rsi+112]
549 :			movq mm7,[rdi + 896 + 8*rsi+120]
550 :			pmulhuw mm5,mm0
551 :			pmulhuw mm7,mm3 ;mm7: first approx of division
552 :			; mov esp,esp
553 :			movq mm2,[rdi + 512 + 8*rsi+112]
554 :			movq mm6,[rdi + 512 + 8*rsi+120]
555 :			pmullw mm2,mm5 ;test value <= original
556 :			pmullw mm6,mm7
557 :			psubw mm0,mm2 ;mismatch
558 :			psubw mm3,mm6
559 :			movq mm2,[byte rbx]
560 :			lea r11, [mmx_div wrt rip]
561 :			movq mm6,[r11 + rcx * 8 - 8] ; (1 << 16) / quant + 1
562 :			pmulhuw mm0,[rdi + 768 + 8*rsi+112] ;correction
563 :			pmulhuw mm3,[rdi + 768 + 8*rsi+120]
564 :			paddw mm2,[rbx+8] ;sum
565 :			paddw mm5,mm0 ;final result
566 :			paddw mm7,mm3
567 :			pxor mm0,mm0
568 :			pxor mm3,mm3
569 :			pmulhuw mm5, mm6 ; mm5 = (mm5 / Q) >> 16
570 :			pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32)
571 :			add rsi,byte 2
572 :			psrlw mm5, 1 ; (level ) / (2*quant)
573 :			paddw mm2,mm5 ;sum += x1
574 :			psrlw mm7, 1
575 :			movq [rbx],mm7 ;store x2
576 :			pxor mm5, mm1 ; mm5 *= sign(src)
577 :			pxor mm7, mm4 ;
578 :			psubw mm5, mm1 ; undisplace
579 :			psubw mm7, mm4 ;
580 :			; db 0Fh, 7Fh, 54h, 23h, 08 ;movq [ebx+8],mm2 ;store sum
581 :			movq [rbx+8], mm2 ;store sum
582 :			movq [rdx + 8*rsi+112-16], mm5
583 :			movq [rdx + 8*rsi +120-16], mm7
584 :			jng near .lloop
585 :			jmp near .done
586 :			.endfunc:
587 :
588 :			;-----------------------------------------------------------------------------
589 :			;
590 :			; uint32_t dequant_mpeg_intra_x86_64(int16_t *data,
591 :			; const int16_t const *coeff,
592 :			; const uint32_t quant,
593 :			; const uint32_t dcscalar,
594 :			; const uint16_t *mpeg_matrices);
595 :			; Ported from the 32bit 3dne cousin
596 :			;-----------------------------------------------------------------------------
597 :
598 :			; Note: in order to saturate 'easily', we pre-shift the quantifier
599 :			; by 4. Then, the high-word of (coeff[]*matrix[i]*quant) is used to
600 :			; build a saturating mask. It is non-zero only when an overflow occurred.
601 :			; We thus avoid packing/unpacking toward double-word.
602 :			; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
603 :			; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not
604 :			; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],
605 :			; and quant in [1..31].
606 :			;
607 :
608 :			%macro DEQUANT4INTRAMMX 1 ; dequantise the eight words at byte offset 16*arg; expects rcx = coeff, rdx = data, rdi = matrix
609 :			movq mm1, [byte rcx+ 16 * %1] ; mm1 = c = first four coeffs of this group
610 :			movq mm4, [rcx+ 16 * %1 +8] ; mm4 = c' = next four coeffs
611 :			psubw mm0, mm1 ; mm0/mm3 must be zero on entry: 0 - c
612 :			psubw mm3, mm4
613 :			pmaxsw mm0, mm1 ; |c|
614 :			pmaxsw mm3, mm4
615 :			psraw mm1, 15 ; sign mask of c
616 :			psraw mm4, 15
617 :			%if %1 ; group 0 relies on the caller having preloaded mm2 and mm7 with the multiplier
618 :			movq mm2, [rsp-16] ; reload the multiplier saved by the caller below rsp
619 :			movq mm7, [rsp-16]
620 :			%endif
621 :			pmullw mm2, [rdi + 16 * %1 ] ; matrix[i]*quant
622 :			pmullw mm7, [rdi + 16 * %1 +8] ; matrix[i+1]*quant
623 :			movq mm5, mm0
624 :			movq mm6, mm3
625 :			pmulhw mm0, mm2 ; high of coeff*(matrix*quant)
626 :			pmulhw mm3, mm7 ; high of coeff*(matrix*quant)
627 :			pmullw mm2, mm5 ; low of coeff*(matrix*quant)
628 :			pmullw mm7, mm6 ; low of coeff*(matrix*quant)
629 :			pcmpgtw mm0, [rsp-8] ; [rsp-8] holds zero: mask = all-ones where the high word is > 0
630 :			pcmpgtw mm3, [rsp-8]
631 :			paddusw mm2, mm0 ; unsigned-saturating add of the mask forces those lanes to 0xFFFF
632 :			paddusw mm7, mm3
633 :			psrlw mm2, 5
634 :			psrlw mm7, 5
635 :			pxor mm2, mm1 ; start negating back
636 :			pxor mm7, mm4 ; start negating back
637 :			psubusw mm1, mm0
638 :			psubusw mm4, mm3
639 :			movq mm0, [rsp-8] ; re-zero mm0/mm3 for the next expansion
640 :			movq mm3, [rsp-8]
641 :			psubw mm2, mm1 ; finish negating back
642 :			psubw mm7, mm4 ; finish negating back
643 :			movq [byte rdx + 16 * %1], mm2 ; store first four results of this group
644 :			movq [rdx + 16 * %1 +8], mm7 ; store next four results
645 :			%endmacro
646 :
647 :			ALIGN 16
648 :			dequant_mpeg_intra_x86_64:
			; Intra dequantisation of one block: eight DEQUANT4INTRAMMX expansions
			; (8 words each) interleaved with the scalar DC computation.
			; In (System V AMD64): rdi = data (output), rsi = coeff (input),
			;     edx = quant, ecx = dcscalar, r8 = mpeg_matrices.
			; Out: rax = 0.
			; DC: data[0] = coeff[0] * dcscalar clamped to [-2048, 2047]; it is stored
			;     after expansion 0 has written data[0..7], so it overwrites that word.
			; Scratch: [rsp-8] = zero, [rsp-16] = multiplier, both below rsp (relies on
			;     the SysV red zone; this routine makes no calls).
			; Clobbers rcx, rdx, rsi, rdi, r9, r11, mm0-mm7, flags; rbx is saved and
			; restored. No emms is executed in this routine.
649 :			mov eax, edx ; quant, zero-extended: it indexes mmx_mul_quant and bits 63:32 of a 32-bit argument are unspecified
650 :			mov rdx, rdi ; data
651 :			mov r9, rcx ; dcscalar (only r9d is used)
652 :			mov rcx, rsi ; coeff
653 :
654 :			lea r11, [mmx_mul_quant wrt rip]
655 :			movq mm7, [r11 + rax*8 - 8] ; four copies of quant
656 :			psllw mm7, 2 ; << 2. See comment.
657 :			push rbx
658 :
659 :			movsx ebx, word [rcx] ; ebx = coeff[0]
660 :			pxor mm0, mm0
661 :			pxor mm3, mm3
662 :
663 :			movq [rsp-8], mm0 ; zero constant for the macro
664 :			movq [rsp-16], mm7 ; multiplier reloaded by expansions 1..7
665 :
666 :			imul ebx, r9d ; ebx = coeff[0] * dcscalar
667 :			movq mm2, mm7 ; expansion 0 takes the multiplier from mm2/mm7 directly
668 :			mov rdi, r8 ; mpeg_quant_matrices
669 :			ALIGN 4
670 :
671 :			DEQUANT4INTRAMMX 0
672 :
673 :			mov esi, -2048
674 :			; nop
675 :			cmp ebx, esi ; flags survive the next expansion: it contains only MMX instructions
676 :
677 :			DEQUANT4INTRAMMX 1
678 :
679 :			cmovl ebx, esi ; clamp low
680 :			neg esi
681 :			sub esi, byte 1 ;2047
682 :
683 :			DEQUANT4INTRAMMX 2
684 :
685 :			cmp ebx, esi
686 :			cmovg ebx, esi ; clamp high
687 :
688 :			DEQUANT4INTRAMMX 3
689 :
690 :			mov [byte rdx], bx ; data[0] = clamped DC
691 :
692 :			DEQUANT4INTRAMMX 4
693 :			DEQUANT4INTRAMMX 5
694 :			DEQUANT4INTRAMMX 6
695 :			DEQUANT4INTRAMMX 7
696 :
697 :			pop rbx
698 :
699 :			xor rax, rax
700 :			ret
701 :			.endfunc:
702 :
703 :			;-----------------------------------------------------------------------------
704 :			;
705 :			; uint32_t dequant_mpeg_inter_x86_64(int16_t * data,
706 :			; const int16_t * const coeff,
707 :			; const uint32_t quant,
708 :			; const uint16_t *mpeg_matrices);
709 :			; Ported from 32bit 3dne cousin
710 :			;-----------------------------------------------------------------------------
711 :
712 :			; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
713 :			; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
714 :			; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
715 :			; It's mixed with the extraction of the absolute value.
716 :
717 :			ALIGN 16
718 :			dequant_mpeg_inter_x86_64:
			; Inter dequantisation of one block: 8 passes of 8 int16 values each,
			; followed by mismatch control on the last output word.
			; In (System V AMD64): rdi = data (output), rsi = coeff (input),
			;     edx = quant, rcx = mpeg_matrices.
			; Out: rax = 0.
			; After the shuffle below: rdx = data, rcx = coeff, rdi = matrices,
			;     rax = loop index (-14..0 step 2), mm7 = 2*quant in four lanes,
			;     mm6 = XOR of every output word group (mismatch accumulator).
			; Matrix words are read from [rdi + 512 + ...].
			; Clobbers rcx, rdx, rdi, r11, mm0-mm7, flags. No emms is executed here.
719 :			mov eax, edx ; quant, zero-extended: it indexes mmx_mul_quant and bits 63:32 of a 32-bit argument are unspecified
720 :			mov rdx, rdi ; data
721 :			mov rdi, rcx ; mpeg_matrices
722 :			mov rcx, rsi ; coeff
723 :
724 :			lea r11, [mmx_mul_quant wrt rip]
725 :			movq mm7, [r11 + rax*8 - 8]
726 :			mov rax, -14 ; 8*rax + 7*16 walks byte offsets 0, 16, ..., 112
727 :			paddw mm7, mm7 ; << 1
728 :			pxor mm6, mm6 ; mismatch sum
729 :			pxor mm1, mm1
730 :			pxor mm3, mm3
731 :
732 :			ALIGN 16
733 :			.loop:
734 :			movq mm0, [rcx+8*rax + 7*16 ] ; mm0 = coeff[i]
735 :			pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved)
736 :			movq mm2, [rcx+8*rax + 7*16 +8] ; mm2 = coeff[i+1]
737 :			pcmpgtw mm3, mm2 ; mm3 = sgn(c') (preserved)
738 :			paddsw mm0, mm1 ; c += sgn(c)
739 :			paddsw mm2, mm3 ; c += sgn(c')
740 :			paddw mm0, mm0 ; c *= 2
741 :			paddw mm2, mm2 ; c'*= 2
742 :
743 :			movq mm4, [mmzero wrt rip]
744 :			movq mm5, [mmzero wrt rip]
745 :			psubw mm4, mm0 ; -c
746 :			psubw mm5, mm2 ; -c'
747 :
748 :			psraw mm4, 16 ; mm4 = sgn(-c)
749 :			psraw mm5, 16 ; mm5 = sgn(-c')
750 :			psubsw mm0, mm4 ; c -= sgn(-c)
751 :			psubsw mm2, mm5 ; c' -= sgn(-c')
752 :			pxor mm0, mm1 ; finish changing sign if needed
753 :			pxor mm2, mm3 ; finish changing sign if needed
754 :
755 :			; we're short on register, here. Poor pairing...
756 :
757 :			movq mm4, mm7 ; (matrix*quant)
758 :			; nop
759 :			pmullw mm4, [rdi + 512 + 8*rax + 7*16]
760 :			movq mm5, mm4
761 :			pmulhw mm5, mm0 ; high of c*(matrix*quant)
762 :			pmullw mm0, mm4 ; low of c*(matrix*quant)
763 :
764 :			movq mm4, mm7 ; (matrix*quant)
765 :			pmullw mm4, [rdi + 512 + 8*rax + 7*16 + 8]
766 :			add rax, byte 2 ; flags stay live until the jng: only MMX instructions follow
767 :
768 :			pcmpgtw mm5, [mmzero wrt rip] ; mask = all-ones where the high word is > 0
769 :			paddusw mm0, mm5 ; unsigned-saturating add of the mask forces those lanes to 0xFFFF
770 :			psrlw mm0, 5
771 :			pxor mm0, mm1 ; start restoring sign
772 :			psubusw mm1, mm5
773 :
774 :			movq mm5, mm4
775 :			pmulhw mm5, mm2 ; high of c*(matrix*quant)
776 :			pmullw mm2, mm4 ; low of c*(matrix*quant)
777 :			psubw mm0, mm1 ; finish restoring sign
778 :
779 :			pcmpgtw mm5, [mmzero wrt rip]
780 :			paddusw mm2, mm5
781 :			psrlw mm2, 5
782 :			pxor mm2, mm3 ; start restoring sign
783 :			psubusw mm3, mm5
784 :			psubw mm2, mm3 ; finish restoring sign
785 :			movq mm1, [mmzero wrt rip] ; re-zero mm1/mm3 for the next pass's pcmpgtw
786 :			movq mm3, [mmzero wrt rip] ; no 'byte' hint: a RIP-relative operand always uses a 32-bit displacement
787 :			pxor mm6, mm0 ; mismatch control
788 :			movq [rdx + 8*rax + 7*16 -2*8 ], mm0 ; data[i] (-2*8 compensates for the index update above)
789 :			pxor mm6, mm2 ; mismatch control
790 :			movq [rdx + 8*rax + 7*16 -2*8 +8], mm2 ; data[i+1]
791 :
792 :			jng .loop ; loop while the updated index is <= 0
793 :			; nop
794 :
795 :			; mismatch control: fold the four lanes of mm6, then flip bit 0 of data[63] when the folded bit 0 is 0
796 :
797 :			pshufw mm0, mm6, 01010101b
798 :			pshufw mm1, mm6, 10101010b
799 :			pshufw mm2, mm6, 11111111b
800 :			pxor mm6, mm0
801 :			pxor mm1, mm2
802 :			pxor mm6, mm1
803 :			movd rax, mm6
804 :			and rax, byte 1
805 :			xor rax, byte 1
806 :			xor word [rdx + 2*63], ax
807 :
808 :			xor rax, rax
809 :			ret
810 :			.endfunc:

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4