Annotation of /xvidcore/src/quant/x86_64_asm/quantize_mpeg_xmm.asm

Revision 1.4 - (view) (download)

1 :	edgomez	1.1	;/****************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - 3dne Quantization/Dequantization -
5 :			; *
6 :			; * Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
7 :			; * 2002 Jaan Kalda
8 :			; * 2004 Andre Werthmann <wertmann@aei.mpg.de>
9 :			; *
10 :			; * This program is free software ; you can redistribute it and/or modify
11 :			; * it under the terms of the GNU General Public License as published by
12 :			; * the Free Software Foundation ; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :			; *
15 :			; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :			; *
20 :			; * You should have received a copy of the GNU General Public License
21 :			; * along with this program ; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :			; *
24 :	Isibaar	1.4	; * $Id: quantize_mpeg_xmm.asm,v 1.3 2008/11/11 20:46:24 Isibaar Exp $
25 :	edgomez	1.1	; *
26 :			; ***************************************************************************/
27 :
28 :			; _3dne functions are compatible with iSSE, but are optimized specifically
29 :			; for K7 pipelines
30 :
31 :			%define SATURATE
32 :
33 :			BITS 64
34 :
35 :			%macro cglobal 1
			; cglobal NAME: export NAME as a global symbol.
			;   PREFIX defined     -> the exported symbol is _NAME, and NAME is
			;                         redefined so that later uses of NAME expand
			;                         to the underscored form.
			;   MARK_FUNCS defined -> the symbol is declared with the `function`
			;                         type and a size of NAME.endfunc-NAME, and
			;                         ENDFUNC expands to the `.endfunc` label placed
			;                         after each function body; otherwise ENDFUNC
			;                         expands to nothing.
36 :			%ifdef PREFIX
37 :			%ifdef MARK_FUNCS
38 :			global _%1:function %1.endfunc-%1
39 :			%define %1 _%1:function %1.endfunc-%1
40 :	Isibaar	1.3	%define ENDFUNC .endfunc
41 :	edgomez	1.1	%else
42 :			global _%1
43 :			%define %1 _%1
44 :	Isibaar	1.3	%define ENDFUNC
45 :	edgomez	1.1	%endif
46 :			%else
47 :			%ifdef MARK_FUNCS
48 :			global %1:function %1.endfunc-%1
49 :	Isibaar	1.3	%define ENDFUNC .endfunc
50 :	edgomez	1.1	%else
51 :			global %1
52 :	Isibaar	1.3	%define ENDFUNC
53 :	edgomez	1.1	%endif
54 :			%endif
55 :			%endmacro
56 :
57 :			;=============================================================================
58 :			; Local data
59 :			;=============================================================================
60 :
61 :			%ifdef FORMAT_COFF
62 :			SECTION .rodata
63 :			%else
64 :			SECTION .rodata align=16
65 :			%endif
66 :
67 :			ALIGN 8
			; mmzero: 8 zero bytes, used as an all-zero MMX memory operand by
			; dequant_mpeg_inter_x86_64.
68 :			mmzero:
69 :			dd 0,0
			; mmx_one: four words of 1; quant_mpeg_inter_x86_64 feeds it to
			; pmaddwd to add adjacent 16-bit lanes into 32-bit sums.
70 :			mmx_one:
71 :			times 4 dw 1
72 :
73 :			;-----------------------------------------------------------------------------
74 :			; divide by 2Q table
			;
			; Both tables hold one 8-byte entry (4 identical words) per quantizer
			; and are addressed as [table + quant*8 - 8].
			;   mmx_divs[i] = (1 << 15) / i + 1   for i = 1..31
			;                 (used by the .loop variants; pmulhuw by this value
			;                  already includes the division by 2)
			;   mmx_div[q]  = (1 << 16) / q + 1   for q = 2..32, entry for q = 1
			;                 is 65535 (used by the .lloop variants, which follow
			;                 the pmulhuw with an explicit shift right by 1)
75 :			;-----------------------------------------------------------------------------
76 :
77 :			ALIGN 16
78 :			mmx_divs: ;i>2
79 :			%assign i 1
80 :			%rep 31
81 :			times 4 dw ((1 << 15) / i + 1)
82 :			%assign i i+1
83 :			%endrep
84 :
85 :			ALIGN 16
86 :			mmx_div: ;quant>2
87 :			times 4 dw 65535 ; the div by 2 formula will overflow for the case
88 :			; quant=1 but we don't care much because quant=1
89 :			; is handled by a different piece of code that
90 :			; doesn't use this table.
91 :			%assign quant 2
92 :			%rep 31
93 :			times 4 dw ((1 << 16) / quant + 1)
94 :			%assign quant quant+1
95 :			%endrep
96 :
97 :			%macro FIXX 1
			; FIXX n: emit one word holding (1 << 16) / n + 1.
			; Not invoked anywhere in this listing.
98 :			dw (1 << 16) / (%1) + 1
99 :			%endmacro
100 :
			; Multi-byte padding sequences. Every use in this listing is commented
			; out ("; nop2", "; nop4", "; mov esp, esp").
			; NOTE(review): these are 32-bit register forms; in 64-bit mode a write
			; to esp/ebp clears the upper half of rsp/rbp, so they should not be
			; re-enabled as they stand.
101 :			%define nop4 db 08Dh, 074h, 026h,0
102 :			%define nop3 add esp, byte 0
103 :			%define nop2 mov esp, esp
104 :			%define nop7 db 08dh, 02ch, 02dh,0,0,0,0
105 :			%define nop6 add ebp, dword 0
106 :
107 :			;-----------------------------------------------------------------------------
108 :			; quantd table
			;
			; quantd[i] = (VM18P*i + VM18Q/2) / VM18Q = (3*i + 2) / 4 for i = 1..31,
			; replicated into 4 words. quant_mpeg_intra_x86_64 loads the entry at
			; [quantd + quant*8 - 8] and adds it to the level before the final
			; division by the quantizer.
109 :			;-----------------------------------------------------------------------------
110 :
111 :			%define VM18P 3
112 :			%define VM18Q 4
113 :
114 :			ALIGN 16
115 :			quantd:
116 :			%assign i 1
117 :			%rep 31
118 :			times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q)
119 :			%assign i i+1
120 :			%endrep
121 :
122 :			;-----------------------------------------------------------------------------
123 :			; multiply by Q table
			;
			; mmx_mul_quant[i] = i replicated into 4 words, for i = 1..31, addressed
			; as [mmx_mul_quant + quant*8 - 8]. The dequant functions scale the
			; loaded value themselves (<< 2 for intra, << 1 for inter).
124 :			;-----------------------------------------------------------------------------
125 :
126 :			ALIGN 16
127 :			mmx_mul_quant:
128 :			%assign i 1
129 :			%rep 31
130 :			times 4 dw i
131 :			%assign i i+1
132 :			%endrep
133 :
134 :			;-----------------------------------------------------------------------------
135 :			; saturation limits
			;
			; The mmx_* constants and `zero` below are not referenced by the code
			; in this listing.
136 :			;-----------------------------------------------------------------------------
137 :
138 :			ALIGN 16
139 :			mmx_32767_minus_2047:
140 :			times 4 dw (32767-2047)
141 :			mmx_32768_minus_2048:
142 :			times 4 dw (32768-2048)
143 :			mmx_2047:
144 :			times 4 dw 2047
145 :			mmx_minus_2048:
146 :			times 4 dw (-2048)
147 :			zero:
148 :			times 4 dw 0
149 :
			; int_div: 256 dwords. Entry 0 is 0; entry i (1..255) is
			; (1 << 17) / i + 1. quant_mpeg_intra_x86_64 multiplies by
			; int_div[dcscalar] and shifts right by 17 to divide the DC term.
150 :			int_div:
151 :			dd 0
152 :			%assign i 1
153 :			%rep 255
154 :			dd (1 << 17) / ( i) + 1
155 :			%assign i i+1
156 :			%endrep
157 :
158 :			;=============================================================================
159 :			; Code
160 :			;=============================================================================
161 :
162 :			SECTION .text align=16
163 :
			; Export the four entry points defined below (see the cglobal macro for
			; the effect of PREFIX / MARK_FUNCS on the emitted symbol).
164 :			cglobal quant_mpeg_intra_x86_64
165 :			cglobal quant_mpeg_inter_x86_64
166 :			cglobal dequant_mpeg_intra_x86_64
167 :			cglobal dequant_mpeg_inter_x86_64
168 :
169 :			;-----------------------------------------------------------------------------
170 :			;
171 :			; uint32_t quant_mpeg_intra_x86_64(int16_t * coeff,
172 :			; const int16_t * const data,
173 :			; const uint32_t quant,
174 :			; const uint32_t dcscalar,
175 :			; const uint16_t *mpeg_matrices);
176 :			; Ported from its 32bit xmm cousin
			;
			; ABI: System V AMD64 (rdi = coeff, rsi = data, edx = quant,
			; ecx = dcscalar, r8 = mpeg_matrices). Always returns 0 in rax.
			;
			; Quantizes 64 coefficients, 8 words per loop iteration. One of three
			; loop variants is selected by quant:
			;   .q1loop  quant == 1         (final division is a shift right by 1)
			;   .loop    2 <= quant <= 19   (reciprocal taken from mmx_divs)
			;   .lloop   quant > 19         (reciprocal from mmx_div, then >> 1)
			; Afterwards coeff[0] is overwritten with data[0] divided by dcscalar,
			; rounded, using the int_div reciprocal table.
			;
			; quant indexes 31-entry tables and dcscalar the 256-entry int_div
			; table; neither value is range-checked here.
			; The four tables read at mpeg_matrices +0, +128, +256 and +384 bytes
			; are defined by the caller's matrix layout, which is not visible in
			; this file.
			; Clobbers rcx, rdx, rsi, rdi, r9, r11, mm0-mm7 and flags; rbx is saved
			; and restored. No emms is executed.
177 :			;-----------------------------------------------------------------------------
178 :
179 :			ALIGN 16
180 :			quant_mpeg_intra_x86_64:
181 :			mov rax, rsi ; rax = data
182 :			mov r9d, ecx ; r9 = dcscalar, zero-extended (upper half of a 32-bit arg is unspecified)
183 :			mov ecx, edx ; rcx = quant, zero-extended because it is used as a 64-bit index
184 :			mov rdx, rdi ; rdx = coeff
185 :
186 :			push rbx
187 :
188 :			mov rdi, r8 ; mpeg_quant_matrices
189 :
190 :			mov rsi, -14 ; loop index: -14, -12, ..., 0 (8 iterations of 16 bytes)
191 :			pxor mm0, mm0
192 :			pxor mm3, mm3
193 :			cmp rcx, byte 1
194 :			je near .q1loop
195 :			cmp rcx, byte 19
196 :			jg near .lloop
197 :
198 :			ALIGN 16
199 :	Isibaar	1.3	.loop:
200 :	edgomez	1.1	movq mm1, [rax + 8*rsi+112] ; mm1 = 4 source words
201 :			psubw mm0, mm1 ;-mm1
202 :			movq mm4, [rax + 8*rsi + 120] ; mm4 = next 4 source words
203 :			psubw mm3, mm4 ;-mm4
204 :			pmaxsw mm0, mm1 ;|src|
205 :			pmaxsw mm3,mm4
206 :			; nop2
207 :			psraw mm1, 15 ;sign src
208 :			psraw mm4, 15
209 :			psllw mm0, 4 ;level << 4 ;
210 :			psllw mm3, 4
211 :			paddw mm0, [rdi + 128 + 8*rsi+112]
212 :			paddw mm3, [rdi + 128 + 8*rsi+120]
213 :			movq mm5, [rdi + 384 + 8*rsi+112]
214 :			movq mm7, [rdi + 384 + 8*rsi+120]
215 :			pmulhuw mm5, mm0 ; first approximation of the division
216 :			pmulhuw mm7, mm3
217 :			; mov esp, esp
218 :			movq mm2, [rdi + 8*rsi+112]
219 :			movq mm6, [rdi + 8*rsi+120]
220 :			pmullw mm2, mm5 ; multiply the approximation back
221 :			pmullw mm6, mm7
222 :			psubw mm0, mm2 ; mismatch
223 :			psubw mm3, mm6
224 :			; nop4
225 :			lea r11, [quantd wrt rip]
226 :			movq mm2, [r11 + rcx * 8 - 8] ; rounding term for this quant
227 :			lea r11, [mmx_divs wrt rip]
228 :			movq mm6, [r11 + rcx * 8 - 8] ; reciprocal for this quant
229 :			paddw mm5, mm2
230 :			paddw mm7, mm2
231 :			; mov esp, esp
232 :			pmulhuw mm0, [rdi + 256 + 8*rsi+112] ; correction from the mismatch
233 :			pmulhuw mm3, [rdi + 256 + 8*rsi+120]
234 :			paddw mm5, mm0
235 :			paddw mm7, mm3
236 :			pxor mm0, mm0 ; re-zero for the next iteration's negate
237 :			pxor mm3, mm3
238 :			pmulhuw mm5, mm6 ; mm5 = (mm5 * reciprocal) >> 16
239 :			pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32)
240 :			pxor mm5, mm1 ; restore the sign: (x ^ s) - s
241 :			pxor mm7, mm4 ;
242 :			psubw mm5, mm1 ; undisplace
243 :			psubw mm7, mm4 ;
244 :			movq [rdx + 8*rsi+112], mm5
245 :			movq [rdx + 8*rsi +120], mm7
246 :			add rsi, byte 2
247 :			jng near .loop
248 :
249 :	Isibaar	1.3	.done:
250 :	edgomez	1.1	; calculate data[0] // (int32_t)dcscalar)
251 :			; mov esi, [esp + 12 + 16] ; dcscalar
252 :			mov rsi, r9 ; dcscalar
253 :			movsx rcx, word [rax] ; ecx = data[0]
254 :			mov rdi, rcx
255 :			; mov edx, [esp + 12 + 16]
256 :			mov r11, rdx ; save rdx
257 :			mov rdx, r9 ;
258 :			shr edx, 1 ; edx = dcscalar / 2
259 :			sar edi, 31 ; edi = sign mask of data[0] (cdq is vectorpath)
260 :			xor edx, edi ; edx = dcscalar/2, or -(dcscalar/2)-1 when data[0] < 0
261 :			sub ecx, edi
262 :			add ecx, edx ; ecx = data[0] +/- dcscalar/2
263 :			;; mov rdx, [dword esp + 12 + 4]
264 :			mov rdx, r11 ; restore rdx
265 :			lea r11, [int_div wrt rip]
266 :			mov esi, [r11+4*rsi] ; 32-bit load: entries are dwords, a qword load overruns the table at index 255
267 :			imul ecx, esi
268 :			sar ecx, 17
269 :			lea rbx, [byte rcx + 1]
270 :			cmovs rcx, rbx ; negative quotient: add 1 (sign flag comes from the sar above)
271 :			; idiv cx ; ecx = edi:ecx / dcscalar
272 :
273 :			; mov ebx, [esp]
274 :			; mov edi, [esp+4]
275 :			; mov esi, [esp+8]
276 :			; add esp, byte 12 ; pops...
277 :			pop rbx
278 :			; mov [rdx], rcx ; coeff[0] = ax
279 :			mov [rdx], cx ; coeff[0] = cx
280 :
281 :			xor rax, rax
282 :			ret
283 :
284 :			ALIGN 16
285 :	Isibaar	1.3	.q1loop:
286 :	edgomez	1.1	movq mm1, [rax + 8*rsi+112] ; mm1 = 4 source words
287 :			psubw mm0, mm1 ;-mm1
288 :			movq mm4, [rax + 8*rsi+120] ;
289 :			psubw mm3, mm4 ;-mm4
290 :			pmaxsw mm0, mm1 ;|src|
291 :			pmaxsw mm3, mm4
292 :			; nop2
293 :			psraw mm1, 15 ;sign src
294 :			psraw mm4, 15
295 :			psllw mm0, 4 ; level << 4
296 :			psllw mm3, 4
297 :			paddw mm0, [rdi + 128 + 8*rsi+112] ;mm0 is to be divided
298 :			paddw mm3, [rdi + 128 + 8*rsi+120] ;intra1 contains fix for division by 1
299 :			movq mm5, [rdi + 384 + 8*rsi+112] ;with rounding down
300 :			movq mm7, [rdi + 384 + 8*rsi+120]
301 :			pmulhuw mm5, mm0
302 :			pmulhuw mm7, mm3 ;mm7: first approx of division
303 :			; mov esp, esp
304 :			movq mm2, [rdi + 8*rsi+112]
305 :			movq mm6, [rdi + 8*rsi+120]
306 :			pmullw mm2, mm5 ;test value <= original
307 :			pmullw mm6, mm7
308 :			psubw mm0, mm2 ;mismatch
309 :			psubw mm3, mm6
310 :			; nop4
311 :			lea r11, [quantd wrt rip]
312 :			movq mm2, [r11 + rcx * 8 - 8]
313 :			paddw mm5, mm2 ;first approx with quantd
314 :			paddw mm7, mm2
315 :			; mov esp, esp
316 :			pmulhuw mm0, [rdi + 256 + 8*rsi+112] ;correction
317 :			pmulhuw mm3, [rdi + 256 + 8*rsi+120]
318 :			paddw mm5, mm0 ;final result with quantd
319 :			paddw mm7, mm3
320 :			pxor mm0, mm0
321 :			pxor mm3, mm3
322 :			; mov esp, esp
323 :			psrlw mm5, 1 ; (level + quantd) /2 (quant = 1)
324 :			psrlw mm7, 1
325 :			pxor mm5, mm1 ; restore the sign: (x ^ s) - s
326 :			pxor mm7, mm4 ;
327 :			psubw mm5, mm1 ; undisplace
328 :			psubw mm7, mm4 ;
329 :			movq [rdx + 8*rsi+112], mm5
330 :			movq [rdx + 8*rsi +120], mm7
331 :			add rsi, byte 2
332 :			jng near .q1loop
333 :			jmp near .done
334 :
335 :			ALIGN 8
336 :	Isibaar	1.3	.lloop:
337 :	edgomez	1.1	movq mm1, [rax + 8*rsi+112] ; mm1 = 4 source words
338 :			psubw mm0, mm1 ;-mm1
339 :			movq mm4, [rax + 8*rsi+120]
340 :			psubw mm3, mm4 ;-mm4
341 :			pmaxsw mm0, mm1 ;|src|
342 :			pmaxsw mm3, mm4
343 :			; nop2
344 :			psraw mm1, 15 ;sign src
345 :			psraw mm4, 15
346 :			psllw mm0, 4 ; level << 4
347 :			psllw mm3, 4 ;
348 :			paddw mm0, [rdi + 128 + 8*rsi+112] ;mm0 is to be divided intra1 contains fix for division by 1
349 :			paddw mm3, [rdi + 128 + 8*rsi+120]
350 :			movq mm5, [rdi + 384 + 8*rsi+112]
351 :			movq mm7, [rdi + 384 + 8*rsi+120]
352 :			pmulhuw mm5, mm0
353 :			pmulhuw mm7, mm3 ;mm7: first approx of division
354 :			; mov esp, esp
355 :			movq mm2, [rdi + 8*rsi+112]
356 :			movq mm6, [rdi + 8*rsi+120]
357 :			pmullw mm2, mm5 ;test value <= original
358 :			pmullw mm6, mm7
359 :			psubw mm0, mm2 ;mismatch
360 :			psubw mm3, mm6
361 :			; nop4
362 :			lea r11, [quantd wrt rip]
363 :			movq mm2, [r11 + rcx * 8 - 8]
364 :			lea r11, [mmx_div wrt rip]
365 :			movq mm6, [r11 + rcx * 8 - 8] ; reciprocal for this quant
366 :			paddw mm5, mm2 ;first approx with quantd
367 :			paddw mm7, mm2
368 :			; mov esp, esp
369 :			pmulhuw mm0, [rdi + 256 + 8*rsi+112] ;correction
370 :			pmulhuw mm3, [rdi + 256 + 8*rsi+120]
371 :			paddw mm5, mm0 ;final result with quantd
372 :			paddw mm7, mm3
373 :			pxor mm0, mm0
374 :			pxor mm3, mm3
375 :			; mov esp, esp
376 :			pmulhuw mm5, mm6 ; mm5 = (mm5 * reciprocal) >> 16
377 :			pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32)
378 :			psrlw mm5, 1 ; (level + quantd) / (2*quant)
379 :			psrlw mm7, 1
380 :			pxor mm5, mm1 ; restore the sign: (x ^ s) - s
381 :			pxor mm7, mm4 ;
382 :			psubw mm5, mm1 ; undisplace
383 :			psubw mm7, mm4 ;
384 :			movq [rdx + 8*rsi+112], mm5
385 :			movq [rdx + 8*rsi +120], mm7
386 :			add rsi,byte 2
387 :			jng near .lloop
388 :			jmp near .done
389 :	Isibaar	1.3	ENDFUNC
390 :	edgomez	1.1
391 :			;-----------------------------------------------------------------------------
392 :			;
393 :			; uint32_t quant_mpeg_inter_x86_64(int16_t * coeff,
394 :			; const int16_t * const data,
395 :			; const uint32_t quant,
396 :			; const uint16_t *mpeg_matrices);
397 :			; Ported from its 32bit xmm cousin
			;
			; ABI: System V AMD64 (rdi = coeff, rsi = data, edx = quant,
			; rcx = mpeg_matrices).
			; Returns in rax the sum of the absolute values of the 64 quantized
			; coefficients. The sum is accumulated in four 16-bit lanes (paddw) and
			; folded to 32 bits only at the end.
			;
			; Same three loop variants as the intra quantizer (.q1loop for
			; quant == 1, .loop for 2..19, .lloop above 19), reading the tables at
			; mpeg_matrices +512, +640, +768 and +896 bytes. quant is not
			; range-checked.
			;
			; A 16-byte scratch area is carved out of the stack at rbx:
			;   [rbx]   = unsigned result of the second half of the previous
			;             iteration (added to the sum one iteration later)
			;   [rbx+8] = running sum
			; Clobbers rcx, rdx, rsi, rdi, r8, r11, mm0-mm7 and flags; rbx is saved
			; and restored. No emms is executed.
398 :			;-----------------------------------------------------------------------------
399 :
400 :			ALIGN 16
401 :			quant_mpeg_inter_x86_64:
402 :			mov rax, rsi ; data
403 :			mov r8, rdi ; save coeff
404 :			mov rdi, rcx ; mpeg_matrices
405 :			mov ecx, edx ; rcx = quant, zero-extended because it is used as a 64-bit index
406 :			mov rdx, r8 ; coeff
407 :
408 :			push rbx
409 :
410 :			mov rsi, -14 ; loop index: -14, -12, ..., 0 (8 iterations of 16 bytes)
411 :			mov rbx, rsp
412 :			sub rsp, byte 24 ; 16 would be enough, but it isn't important
413 :			lea rbx, [rsp+8]
414 :			and rbx, byte -8 ;ALIGN 8
415 :			pxor mm0, mm0
416 :			pxor mm3, mm3
417 :			movq [byte rbx],mm0 ; pending second-half result = 0
418 :			movq [rbx+8],mm0 ; running sum = 0
419 :			cmp rcx, byte 1
420 :			je near .q1loop
421 :			cmp rcx, byte 19
422 :			jg near .lloop
423 :
424 :			ALIGN 16
425 :	Isibaar	1.3	.loop:
426 :	edgomez	1.1	movq mm1, [rax + 8*rsi+112] ; mm1 = 4 source words
427 :			psubw mm0, mm1 ;-mm1
428 :			movq mm4, [rax + 8*rsi + 120] ;
429 :			psubw mm3, mm4 ;-mm4
430 :			pmaxsw mm0, mm1 ;|src|
431 :			pmaxsw mm3, mm4
432 :			; nop2
433 :			psraw mm1, 15 ;sign src
434 :			psraw mm4, 15
435 :			psllw mm0, 4 ; level << 4
436 :			psllw mm3, 4 ;
437 :			paddw mm0, [rdi + 640 + 8*rsi+112]
438 :			paddw mm3, [rdi + 640 + 8*rsi+120]
439 :			movq mm5, [rdi + 896 + 8*rsi+112]
440 :			movq mm7, [rdi + 896 + 8*rsi+120]
441 :			pmulhuw mm5, mm0 ; first approximation of the division
442 :			pmulhuw mm7, mm3
443 :			; mov esp, esp
444 :			movq mm2, [rdi + 512 + 8*rsi+112]
445 :			movq mm6, [rdi + 512 + 8*rsi+120]
446 :			pmullw mm2, mm5 ; multiply the approximation back
447 :			pmullw mm6, mm7
448 :			psubw mm0, mm2 ; mismatch
449 :			psubw mm3, mm6
450 :			movq mm2, [byte rbx] ; pending second-half result
451 :			lea r11, [mmx_divs wrt rip]
452 :			movq mm6, [r11 + rcx * 8 - 8] ; reciprocal for this quant
453 :			pmulhuw mm0, [rdi + 768 + 8*rsi+112] ; correction from the mismatch
454 :			pmulhuw mm3, [rdi + 768 + 8*rsi+120]
455 :			paddw mm2, [rbx+8] ;sum
456 :			paddw mm5, mm0
457 :			paddw mm7, mm3
458 :			pxor mm0, mm0
459 :			pxor mm3, mm3
460 :			pmulhuw mm5, mm6 ; mm5 = (mm5 * reciprocal) >> 16
461 :			pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32)
462 :			add rsi, byte 2 ; sets the flags tested by jng below; nothing in between alters them
463 :			paddw mm2, mm5 ;sum += x1
464 :			movq [rbx], mm7 ;store x2
465 :			pxor mm5, mm1 ; restore the sign: (x ^ s) - s
466 :			pxor mm7, mm4 ;
467 :			psubw mm5, mm1 ; undisplace
468 :			psubw mm7, mm4 ;
469 :			; db 0Fh, 7Fh, 54h, 23h, 08 ;movq [ebx+8],mm2 ;store sum
470 :			movq [rbx+8], mm2 ;store sum
471 :			movq [rdx + 8*rsi+112-16], mm5 ; -16: rsi was already advanced
472 :			movq [rdx + 8*rsi +120-16], mm7
473 :			jng near .loop
474 :
475 :			.done:
476 :			; fold the running sum and the last pending result into one 32-bit total
477 :			paddw mm2, [rbx]
478 :			add rsp, byte 24
479 :			pop rbx
480 :			pmaddwd mm2, [mmx_one wrt rip] ; two dwords, each the sum of two adjacent words
481 :			punpckldq mm0, mm2 ;get low dw to mm0:high
482 :			paddd mm0,mm2
483 :			punpckhdq mm0, mm0 ;get result to low
484 :			movd rax, mm0
485 :
486 :			ret
487 :
488 :			ALIGN 16
489 :	Isibaar	1.3	.q1loop:
490 :	edgomez	1.1	movq mm1, [rax + 8*rsi+112] ; mm1 = 4 source words
491 :			psubw mm0, mm1 ;-mm1
492 :			movq mm4, [rax + 8*rsi+120]
493 :			psubw mm3, mm4 ;-mm4
494 :			pmaxsw mm0, mm1 ;|src|
495 :			pmaxsw mm3, mm4
496 :			; nop2
497 :			psraw mm1, 15 ; sign src
498 :			psraw mm4, 15
499 :			psllw mm0, 4 ; level << 4
500 :			psllw mm3, 4
501 :			paddw mm0, [rdi + 640 + 8*rsi+112] ;mm0 is to be divided
502 :			paddw mm3, [rdi + 640 + 8*rsi+120] ; inter1 contains fix for division by 1
503 :			movq mm5, [rdi + 896 + 8*rsi+112] ;with rounding down
504 :			movq mm7, [rdi + 896 + 8*rsi+120]
505 :			pmulhuw mm5, mm0
506 :			pmulhuw mm7, mm3 ;mm7: first approx of division
507 :			; mov esp, esp
508 :			movq mm2, [rdi + 512 + 8*rsi+112]
509 :			movq mm6, [rdi + 512 + 8*rsi+120]
510 :			pmullw mm2, mm5 ;test value <= original
511 :			pmullw mm6, mm7
512 :			psubw mm0, mm2 ;mismatch
513 :			psubw mm3, mm6
514 :			movq mm2, [byte rbx]
515 :			pmulhuw mm0, [rdi + 768 + 8*rsi+112] ;correction
516 :			pmulhuw mm3, [rdi + 768 + 8*rsi+120]
517 :			paddw mm2, [rbx+8] ;sum
518 :			paddw mm5, mm0 ;final result
519 :			paddw mm7, mm3
520 :			pxor mm0, mm0
521 :			pxor mm3, mm3
522 :			psrlw mm5, 1 ; (level ) /2 (quant = 1)
523 :			psrlw mm7, 1
524 :			add rsi, byte 2
525 :			paddw mm2, mm5 ;sum += x1
526 :			movq [rbx], mm7 ;store x2
527 :			pxor mm5, mm1 ; restore the sign: (x ^ s) - s
528 :			pxor mm7, mm4 ;
529 :			psubw mm5, mm1 ; undisplace
530 :			psubw mm7, mm4 ;
531 :			movq [rbx+8], mm2 ;store sum
532 :			movq [rdx + 8*rsi+112-16], mm5
533 :			movq [rdx + 8*rsi +120-16], mm7
534 :			jng near .q1loop
535 :			jmp near .done
536 :
537 :			ALIGN 8
538 :	Isibaar	1.3	.lloop:
539 :	edgomez	1.1	movq mm1, [rax + 8*rsi+112] ; mm1 = 4 source words
540 :			psubw mm0,mm1 ;-mm1
541 :			movq mm4, [rax + 8*rsi+120]
542 :			psubw mm3,mm4 ;-mm4
543 :			pmaxsw mm0,mm1 ;|src|
544 :			pmaxsw mm3,mm4
545 :			; nop2
546 :			psraw mm1,15 ;sign src
547 :			psraw mm4,15
548 :			psllw mm0, 4 ; level << 4
549 :			psllw mm3, 4 ;
550 :			paddw mm0, [rdi + 640 + 8*rsi+112] ;mm0 is to be divided inter1 contains fix for division by 1
551 :			paddw mm3, [rdi + 640 + 8*rsi+120]
552 :			movq mm5,[rdi + 896 + 8*rsi+112]
553 :			movq mm7,[rdi + 896 + 8*rsi+120]
554 :			pmulhuw mm5,mm0
555 :			pmulhuw mm7,mm3 ;mm7: first approx of division
556 :			; mov esp,esp
557 :			movq mm2,[rdi + 512 + 8*rsi+112]
558 :			movq mm6,[rdi + 512 + 8*rsi+120]
559 :			pmullw mm2,mm5 ;test value <= original
560 :			pmullw mm6,mm7
561 :			psubw mm0,mm2 ;mismatch
562 :			psubw mm3,mm6
563 :			movq mm2,[byte rbx]
564 :			lea r11, [mmx_div wrt rip]
565 :			movq mm6,[r11 + rcx * 8 - 8] ; reciprocal for this quant
566 :			pmulhuw mm0,[rdi + 768 + 8*rsi+112] ;correction
567 :			pmulhuw mm3,[rdi + 768 + 8*rsi+120]
568 :			paddw mm2,[rbx+8] ;sum
569 :			paddw mm5,mm0 ;final result
570 :			paddw mm7,mm3
571 :			pxor mm0,mm0
572 :			pxor mm3,mm3
573 :			pmulhuw mm5, mm6 ; mm5 = (mm5 * reciprocal) >> 16
574 :			pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32)
575 :			add rsi,byte 2
576 :			psrlw mm5, 1 ; (level ) / (2*quant)
577 :			paddw mm2,mm5 ;sum += x1
578 :			psrlw mm7, 1
579 :			movq [rbx],mm7 ;store x2
580 :			pxor mm5, mm1 ; restore the sign: (x ^ s) - s
581 :			pxor mm7, mm4 ;
582 :			psubw mm5, mm1 ; undisplace
583 :			psubw mm7, mm4 ;
584 :			; db 0Fh, 7Fh, 54h, 23h, 08 ;movq [ebx+8],mm2 ;store sum
585 :			movq [rbx+8], mm2 ;store sum
586 :			movq [rdx + 8*rsi+112-16], mm5
587 :			movq [rdx + 8*rsi +120-16], mm7
588 :			jng near .lloop
589 :			jmp near .done
590 :	Isibaar	1.3	ENDFUNC
591 :	edgomez	1.1
592 :			;-----------------------------------------------------------------------------
593 :			;
594 :			; uint32_t dequant_mpeg_intra_x86_64(int16_t *data,
595 :			; const int16_t const *coeff,
596 :			; const uint32_t quant,
597 :			; const uint32_t dcscalar,
598 :			; const uint16_t *mpeg_matrices);
599 :			; Ported from the 32bit 3dne cousin
600 :			;-----------------------------------------------------------------------------
601 :
602 :			; Note: in order to saturate 'easily', we pre-shift the quantifier
603 :			; by 4. Then, the high-word of (coeff[i]*matrix[i]*quant) is used to
604 :			; build a saturating mask. It is non-zero only when an overflow occurred.
605 :			; We thus avoid packing/unpacking toward double-word.
606 :			; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
607 :			; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not
608 :			; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],
609 :			; and quant in [1..31].
			; NOTE(review): the intra code below pre-shifts quant left by 2 and
			; shifts the product right by 5; confirm what "by 4" above refers to.
610 :			;
611 :
612 :			%macro DEQUANT4INTRAMMX 1
			; DEQUANT4INTRAMMX n: dequantize the 8 coefficients at byte offset 16*n.
			; Expects on entry:
			;   rcx = coeff, rdx = data, rdi = matrix base
			;   mm0 = mm3 = 0
			;   [rsp-8]  = 0 (four zero words)
			;   [rsp-16] = pre-shifted quant replicated into four words
			;   for n == 0 only: mm2 = mm7 = the same pre-shifted quant (the reload
			;   from [rsp-16] is skipped when the argument is 0)
			; Leaves mm0 = mm3 = 0 again. Uses only MMX instructions and moves, so
			; integer flags set by the caller survive the expansion.
613 :			movq mm1, [byte rcx+ 16 * %1] ; mm1 = c = coeff[i]
614 :			movq mm4, [rcx+ 16 * %1 +8] ; mm4 = c' = coeff[i+1]
615 :			psubw mm0, mm1 ; mm0 = -c
616 :			psubw mm3, mm4
617 :			pmaxsw mm0, mm1 ; mm0 = |c|
618 :			pmaxsw mm3, mm4
619 :			psraw mm1, 15 ; mm1 = sign mask of c
620 :			psraw mm4, 15
621 :			%if %1
622 :			movq mm2, [rsp-16]
623 :			movq mm7, [rsp-16]
624 :			%endif
625 :			pmullw mm2, [rdi + 16 * %1 ] ; matrix[i]*quant
626 :			pmullw mm7, [rdi + 16 * %1 +8] ; matrix[i+1]*quant
627 :			movq mm5, mm0
628 :			movq mm6, mm3
629 :			pmulhw mm0, mm2 ; high of coeff*(matrix*quant)
630 :			pmulhw mm3, mm7 ; high of coeff*(matrix*quant)
631 :			pmullw mm2, mm5 ; low of coeff*(matrix*quant)
632 :			pmullw mm7, mm6 ; low of coeff*(matrix*quant)
633 :			pcmpgtw mm0, [rsp-8] ; overflow mask: all ones where the high word > 0
634 :			pcmpgtw mm3, [rsp-8]
635 :			paddusw mm2, mm0 ; overflowed lanes saturate to 0xFFFF
636 :			paddusw mm7, mm3
637 :			psrlw mm2, 5 ; saturated lanes become 2047
638 :			psrlw mm7, 5
639 :			pxor mm2, mm1 ; start negating back
640 :			pxor mm7, mm4 ; start negating back
641 :			psubusw mm1, mm0 ; drop the +1 of the negation on overflowed lanes (gives -2048)
642 :			psubusw mm4, mm3
643 :			movq mm0, [rsp-8] ; re-zero for the next expansion
644 :			movq mm3, [rsp-8]
645 :			psubw mm2, mm1 ; finish negating back
646 :			psubw mm7, mm4 ; finish negating back
647 :			movq [byte rdx + 16 * %1], mm2 ; data[i]
648 :			movq [rdx + 16 * %1 +8], mm7 ; data[i+1]
649 :			%endmacro
650 :
651 :			ALIGN 16
			; dequant_mpeg_intra_x86_64 (System V AMD64: rdi = data, rsi = coeff,
			; edx = quant, ecx = dcscalar, r8 = mpeg_matrices). Returns 0 in rax.
			;
			; Dequantizes 64 coefficients with eight expansions of
			; DEQUANT4INTRAMMX, using the matrix that starts at mpeg_matrices+0.
			; data[0] is then overwritten with coeff[0]*dcscalar clamped to
			; [-2048, 2047]; that scalar work is interleaved between the macro
			; expansions, which do not touch the integer flags.
			; The two 8-byte temporaries live below rsp (red zone), which is valid
			; because this function makes no calls. quant is not range-checked.
			; Clobbers rcx, rdx, rsi, rdi, r9, r11, mm0-mm7 and flags; rbx is saved
			; and restored. No emms is executed.
652 :			dequant_mpeg_intra_x86_64:
653 :			mov eax, edx ; rax = quant, zero-extended because it is used as a 64-bit index
654 :			mov rdx, rdi ; data
655 :			mov r9, rcx ; dcscalar
656 :			mov rcx, rsi ; coeff
657 :
658 :			lea r11, [mmx_mul_quant wrt rip]
659 :			movq mm7, [r11 + rax*8 - 8]
660 :			psllw mm7, 2 ; << 2. See comment.
661 :			push rbx
662 :
663 :			movsx ebx, word [rcx] ; ebx = coeff[0]
664 :			pxor mm0, mm0
665 :			pxor mm3, mm3
666 :
667 :			movq [rsp-8], mm0 ; zero constant read by the macro
668 :			movq [rsp-16], mm7 ; pre-shifted quant read by the macro
669 :
670 :			imul ebx, r9d ; ebx = coeff[0] * dcscalar
671 :			movq mm2, mm7 ; the first expansion expects quant in mm2 and mm7
672 :			mov rdi, r8 ; mpeg_quant_matrices
673 :			ALIGN 4
674 :
675 :			DEQUANT4INTRAMMX 0
676 :
677 :			mov esi, -2048
678 :			; nop
679 :			cmp ebx, esi
680 :
681 :			DEQUANT4INTRAMMX 1
682 :
683 :			cmovl ebx, esi ; clamp DC from below (flags from the cmp before the macro)
684 :			neg esi
685 :			sub esi, byte 1 ;2047
686 :
687 :			DEQUANT4INTRAMMX 2
688 :
689 :			cmp ebx, esi
690 :			cmovg ebx, esi ; clamp DC from above
691 :
692 :			DEQUANT4INTRAMMX 3
693 :
694 :			mov [byte rdx], bx ; data[0] = clamped DC (replaces the value the first expansion stored)
695 :
696 :			DEQUANT4INTRAMMX 4
697 :			DEQUANT4INTRAMMX 5
698 :			DEQUANT4INTRAMMX 6
699 :			DEQUANT4INTRAMMX 7
700 :
701 :			pop rbx
702 :
703 :			xor rax, rax
704 :			ret
705 :	Isibaar	1.3	ENDFUNC
706 :	edgomez	1.1
707 :			;-----------------------------------------------------------------------------
708 :			;
709 :			; uint32_t dequant_mpeg_inter_x86_64(int16_t * data,
710 :			; const int16_t * const coeff,
711 :			; const uint32_t quant,
712 :			; const uint16_t *mpeg_matrices);
713 :			; Ported from 32bit 3dne cousin
			;
			; ABI: System V AMD64 (rdi = data, rsi = coeff, edx = quant,
			; rcx = mpeg_matrices). Always returns 0 in rax.
			;
			; Dequantizes 64 coefficients, 8 words per iteration, using the matrix
			; at mpeg_matrices+512. All output words are XOR-ed together for
			; mismatch control: when the low bit of that XOR is 0, the low bit of
			; data[63] is toggled. quant is not range-checked.
			; Clobbers rcx, rdx, rdi, r11, mm0-mm7 and flags. No emms is executed.
714 :			;-----------------------------------------------------------------------------
715 :
716 :			; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
717 :			; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
718 :			; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
719 :			; It's mixed with the extraction of the absolute value.
720 :
721 :			ALIGN 16
722 :			dequant_mpeg_inter_x86_64:
723 :			mov eax, edx ; rax = quant, zero-extended because it is used as a 64-bit index
724 :			mov rdx, rdi ; data
725 :			mov rdi, rcx ; mpeg_matrices
726 :			mov rcx, rsi ; coeff
727 :
728 :			lea r11, [mmx_mul_quant wrt rip]
729 :			movq mm7, [r11 + rax*8 - 8]
730 :			mov rax, -14 ; loop index: -14, -12, ..., 0 (8 iterations of 16 bytes)
731 :			paddw mm7, mm7 ; << 1
732 :			pxor mm6, mm6 ; mismatch sum
733 :			pxor mm1, mm1
734 :			pxor mm3, mm3
735 :
736 :			ALIGN 16
737 :	Isibaar	1.3	.loop:
738 :	edgomez	1.1	movq mm0, [rcx+8*rax + 7*16 ] ; mm0 = coeff[i]
739 :			pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved)
740 :			movq mm2, [rcx+8*rax + 7*16 +8] ; mm2 = coeff[i+1]
741 :			pcmpgtw mm3, mm2 ; mm3 = sgn(c') (preserved)
742 :			paddsw mm0, mm1 ; c += sgn(c)
743 :			paddsw mm2, mm3 ; c += sgn(c')
744 :			paddw mm0, mm0 ; c *= 2
745 :			paddw mm2, mm2 ; c'*= 2
746 :
747 :			movq mm4, [mmzero wrt rip]
748 :			movq mm5, [mmzero wrt rip]
749 :			psubw mm4, mm0 ; -c
750 :			psubw mm5, mm2 ; -c'
751 :
752 :			psraw mm4, 16 ; mm4 = sgn(-c)
753 :			psraw mm5, 16 ; mm5 = sgn(-c')
754 :			psubsw mm0, mm4 ; c -= sgn(-c)
755 :			psubsw mm2, mm5 ; c' -= sgn(-c')
756 :			pxor mm0, mm1 ; finish changing sign if needed
757 :			pxor mm2, mm3 ; finish changing sign if needed
758 :
759 :			; we're short on register, here. Poor pairing...
760 :
761 :			movq mm4, mm7 ; (matrix*quant)
762 :			; nop
763 :			pmullw mm4, [rdi + 512 + 8*rax + 7*16]
764 :			movq mm5, mm4
765 :			pmulhw mm5, mm0 ; high of c*(matrix*quant)
766 :			pmullw mm0, mm4 ; low of c*(matrix*quant)
767 :
768 :			movq mm4, mm7 ; (matrix*quant)
769 :			pmullw mm4, [rdi + 512 + 8*rax + 7*16 + 8]
770 :			add rax, byte 2 ; sets the flags tested by jng below; nothing in between alters them
771 :
772 :			pcmpgtw mm5, [mmzero wrt rip] ; overflow mask: all ones where the high word > 0
773 :			paddusw mm0, mm5
774 :			psrlw mm0, 5
775 :			pxor mm0, mm1 ; start restoring sign
776 :			psubusw mm1, mm5
777 :
778 :			movq mm5, mm4
779 :			pmulhw mm5, mm2 ; high of c*(matrix*quant)
780 :			pmullw mm2, mm4 ; low of c*(matrix*quant)
781 :			psubw mm0, mm1 ; finish restoring sign
782 :
783 :			pcmpgtw mm5, [mmzero wrt rip]
784 :			paddusw mm2, mm5
785 :			psrlw mm2, 5
786 :			pxor mm2, mm3 ; start restoring sign
787 :			psubusw mm3, mm5
788 :			psubw mm2, mm3 ; finish restoring sign
789 :			movq mm1, [mmzero wrt rip] ; re-zero the sign registers for the next iteration
790 :			movq mm3, [byte mmzero wrt rip]
791 :			pxor mm6, mm0 ; mismatch control
792 :			movq [rdx + 8*rax + 7*16 -2*8 ], mm0 ; data[i] (-2*8: rax was already advanced)
793 :			pxor mm6, mm2 ; mismatch control
794 :			movq [rdx + 8*rax + 7*16 -2*8 +8], mm2 ; data[i+1]
795 :
796 :			jng .loop
797 :			; nop
798 :
799 :			; mismatch control
			; fold the four 16-bit lanes of mm6 into lane 0 with XOR
800 :
801 :			pshufw mm0, mm6, 01010101b
802 :			pshufw mm1, mm6, 10101010b
803 :			pshufw mm2, mm6, 11111111b
804 :			pxor mm6, mm0
805 :			pxor mm1, mm2
806 :			pxor mm6, mm1
807 :			movd rax, mm6
808 :			and rax, byte 1
809 :			xor rax, byte 1 ; rax = 1 when the folded low bit is 0
810 :			xor word [rdx + 2*63], ax ; toggle the low bit of data[63] in that case
811 :
812 :			xor rax, rax
813 :			ret
814 :	Isibaar	1.3	ENDFUNC
815 :	Isibaar	1.2
816 :			%ifidn __OUTPUT_FORMAT__,elf
817 :			section ".note.GNU-stack" noalloc noexec nowrite progbits
818 :			%endif
			; This is 64-bit code, so the object may also be built with the output
			; format named elf64; emit the same empty, non-executable
			; .note.GNU-stack section there so the linker does not mark the stack
			; executable. The two conditions are mutually exclusive.
			%ifidn __OUTPUT_FORMAT__,elf64
			section ".note.GNU-stack" noalloc noexec nowrite progbits
			%endif
819 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4