23 |
; * |
; * |
24 |
; ***************************************************************************/ |
; ***************************************************************************/ |
25 |
|
|
26 |
BITS 32 |
%include "nasm.inc" |
|
|
|
|
;-----------------------------------------------------------------------------
; cglobal <name>
;   Declares <name> as a global symbol.
;   When PREFIX is defined, the symbol that gets exported is _<name>, and
;   <name> is %define'd to _<name> so that later uses of the bare name in this
;   file expand to the underscore-prefixed symbol.
;   When PREFIX is not defined, <name> is exported unchanged.
;-----------------------------------------------------------------------------
%macro cglobal 1 |
|
|
%ifdef PREFIX |
|
|
global _%1 |
|
|
%define %1 _%1 |
|
|
%else |
|
|
global %1 |
|
|
%endif |
|
|
%endmacro |
|
27 |
|
|
28 |
;;; Define this if you want an unrolled version of the code |
;;; Define this if you want an unrolled version of the code |
29 |
%define UNROLLED_LOOP |
%define UNROLLED_LOOP |
95 |
; Read only data |
; Read only data |
96 |
;============================================================================= |
;============================================================================= |
97 |
|
|
98 |
; Switch to the read-only data section that holds the constant tables below.
; With FORMAT_COFF defined the section is declared as `.rodata data`; in the
; other branch the same declaration additionally carries align=16.
; NOTE(review): DATA is not defined in the visible source -- presumably a
; section-selection macro supplied by nasm.inc; confirm.
%ifdef FORMAT_COFF |
DATA |
|
SECTION .rodata data |
|
|
%else |
|
|
SECTION .rodata data align=16 |
|
|
%endif |
|
99 |
|
|
100 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
101 |
tan1: |
tan1: |
102 |
dw 0x32ec,0x32ec,0x32ec,0x32ec ; tan( pi/16) |
dw 0x32ec,0x32ec,0x32ec,0x32ec ; tan( pi/16) |
103 |
tan2: |
tan2: |
107 |
sqrt2: |
sqrt2: |
108 |
dw 0x5a82,0x5a82,0x5a82,0x5a82 ; 0.5/sqrt(2) |
dw 0x5a82,0x5a82,0x5a82,0x5a82 ; 0.5/sqrt(2) |
109 |
|
|
110 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
111 |
fdct_table: |
fdct_table: |
112 |
;fTab1: |
;fTab1: |
113 |
dw 0x4000, 0x4000, 0x58c5, 0x4b42 |
dw 0x4000, 0x4000, 0x58c5, 0x4b42 |
189 |
dw 0x300b, 0x8c04, 0x187e, 0xba41 |
dw 0x300b, 0x8c04, 0x187e, 0xba41 |
190 |
dw 0x73fc, 0xcff5, 0x6862, 0x84df |
dw 0x73fc, 0xcff5, 0x6862, 0x84df |
191 |
|
|
192 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
193 |
fdct_rounding_1: |
fdct_rounding_1: |
194 |
dw 6, 8, 8, 8 |
dw 6, 8, 8, 8 |
195 |
dw 10, 8, 8, 8 |
dw 10, 8, 8, 8 |
200 |
dw 8, 8, 8, 8 |
dw 8, 8, 8, 8 |
201 |
dw 8, 8, 8, 8 |
dw 8, 8, 8, 8 |
202 |
|
|
203 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
204 |
fdct_rounding_2: |
fdct_rounding_2: |
205 |
dw 6, 8, 8, 8 |
dw 6, 8, 8, 8 |
206 |
dw 8, 8, 8, 8 |
dw 8, 8, 8, 8 |
211 |
dw 8, 8, 8, 8 |
dw 8, 8, 8, 8 |
212 |
dw 8, 8, 8, 8 |
dw 8, 8, 8, 8 |
213 |
|
|
214 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
215 |
MMX_One: |
MMX_One: |
216 |
dw 1, 1, 1, 1 |
dw 1, 1, 1, 1 |
217 |
|
|
356 |
paddd mm2, mm3 ; [ out0 | out1 ] |
paddd mm2, mm3 ; [ out0 | out1 ] |
357 |
pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27] |
pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27] |
358 |
psrad mm2, 16 |
psrad mm2, 16 |
359 |
pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29] |
pmaddwd mm0, [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29] |
360 |
paddd mm4, mm5 ; [ out2 | out3 ] |
paddd mm4, mm5 ; [ out2 | out3 ] |
361 |
pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31] |
pmaddwd mm1, [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31] |
362 |
psrad mm4, 16 |
psrad mm4, 16 |
363 |
|
|
364 |
paddd mm6, mm7 ; [ out4 | out5 ] |
paddd mm6, mm7 ; [ out4 | out5 ] |
414 |
paddd mm2, mm3 ; [ out0 | out1 ] |
paddd mm2, mm3 ; [ out0 | out1 ] |
415 |
pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27] |
pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27] |
416 |
psrad mm2, 16 |
psrad mm2, 16 |
417 |
pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29] |
pmaddwd mm0, [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29] |
418 |
paddd mm4, mm5 ; [ out2 | out3 ] |
paddd mm4, mm5 ; [ out2 | out3 ] |
419 |
pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31] |
pmaddwd mm1, [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31] |
420 |
psrad mm4, 16 |
psrad mm4, 16 |
421 |
|
|
422 |
paddd mm6, mm7 ; [ out4 | out5 ] |
paddd mm6, mm7 ; [ out4 | out5 ] |
442 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
443 |
|
|
444 |
; MAKE_FDCT_FUNC <name>, <step_macro>
;   %1  name of the function to emit; it is exported through cglobal and used
;       as the entry label.
;   %2  name of the macro that is invoked for each of the steps that follow
;       the two fLLM_PASS invocations.
;
; Emitted body:
;   1. Load the block pointer from the first argument.
;   2. Invoke fLLM_PASS at pointer+0 and pointer+8, each with 3 as last operand.
;   3. Invoke %2 repeatedly:
;      - UNROLLED_LOOP defined: %rep 8 expands eight invocations with constant
;        offsets; per step the block offset grows by 16, the fdct_table offset
;        by 64 and each rounding-table offset by 8.
;      - otherwise: a counted loop (counter initialised to 8, dec/jne) that
;        keeps the table pointers in registers; ebx and edi are pushed on
;        entry and popped before ret.
;
; NOTE(review): every statement below appears twice -- once with raw 32-bit
; register names (ecx/edx/ebx/edi/eax) and once with TMP0/TMP1/_EBX/_EDI/_EAX
; and prm1, none of which are defined in the visible source (presumably they
; come from nasm.inc). This looks like two revisions of the macro interleaved;
; confirm which one is meant to be assembled.
%macro MAKE_FDCT_FUNC 2 |
%macro MAKE_FDCT_FUNC 2 |
445 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
446 |
cglobal %1 |
cglobal %1 |
447 |
%1: |
%1: |
448 |
%ifdef UNROLLED_LOOP |
mov TMP0, prm1 |
449 |
; no registers pushed yet: the first stack argument sits just above the
; return address
mov ecx, [esp + 4]
%ifndef UNROLLED_LOOP |
450 |
%else
push _EBX |
451 |
push ebx
push _EDI |
|
push edi
|
|
; two registers were pushed above, so the same argument is now 8 bytes
; further from esp
mov ecx, [esp + 8 + 4]
|
452 |
%endif |
%endif |
453 |
|
|
454 |
fLLM_PASS ecx+0, ecx+0, 3 |
fLLM_PASS TMP0+0, TMP0+0, 3 |
455 |
fLLM_PASS ecx+8, ecx+8, 3 |
fLLM_PASS TMP0+8, TMP0+8, 3 |
456 |
|
|
457 |
%ifdef UNROLLED_LOOP |
%ifdef UNROLLED_LOOP |
458 |
%assign i 0 |
%assign i 0 |
459 |
%rep 8 |
%rep 8 |
460 |
; operands: source, destination (same address), table slice, rounding slices
%2 ecx+i*16, ecx+i*16, fdct_table+i*64, fdct_rounding_1+i*8, fdct_rounding_2+i*8 |
%2 TMP0+i*16, TMP0+i*16, fdct_table+i*64, fdct_rounding_1+i*8, fdct_rounding_2+i*8 |
461 |
%assign i i+1 |
%assign i i+1 |
462 |
%endrep |
%endrep |
463 |
%else |
%else |
464 |
mov eax, 8 |
mov _EAX, 8 |
465 |
mov edx, fdct_table |
mov TMP1, fdct_table |
466 |
mov ebx, fdct_rounding_1 |
mov _EBX, fdct_rounding_1 |
467 |
mov edi, fdct_rounding_2 |
mov _EDI, fdct_rounding_2 |
468 |
.loop |
.loop |
469 |
%2 ecx, ecx, edx, ebx, edi |
%2 TMP0, TMP0, TMP1, _EBX, _EDI |
470 |
; NOTE(review): the first of the next two lines adds 2*16 to eax, which is the
; loop counter decremented below, and leaves the data pointer (ecx) untouched.
; Its twin advances the data pointer by 2*8 instead, which matches the 16-byte
; step used by the unrolled path. The eax form looks like a bug -- confirm.
add eax, 2*16 |
add TMP0, 2*8 |
471 |
add edx, 2*32 |
add TMP1, 2*32 |
472 |
add ebx, 2*4 |
add _EBX, 2*4 |
473 |
add edi, 2*4 |
add _EDI, 2*4 |
474 |
dec eax |
dec _EAX |
475 |
jne .loop |
jne .loop |
476 |
|
|
477 |
; restore the registers saved on entry, in reverse push order
pop edi |
pop _EDI |
478 |
pop ebx |
pop _EBX |
479 |
%endif |
%endif |
480 |
|
|
481 |
ret |
ret |
482 |
|
ENDFUNC |
483 |
%endmacro |
%endmacro |
484 |
|
|
485 |
;============================================================================= |
;============================================================================= |
486 |
; Code |
; Code |
487 |
;============================================================================= |
;============================================================================= |
488 |
|
|
489 |
SECTION .text |
TEXT |
490 |
|
|
491 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
492 |
; void fdct_mmx_skal(int16_t block[64]); |
; void fdct_mmx_skal(int16_t block[64]); |
499 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
500 |
|
|
501 |
; Emit the fdct_xmm_skal entry point, passing fMTX_MULT_XMM as the per-step
; macro (%2) of MAKE_FDCT_FUNC.
MAKE_FDCT_FUNC fdct_xmm_skal, fMTX_MULT_XMM |
MAKE_FDCT_FUNC fdct_xmm_skal, fMTX_MULT_XMM |
502 |
|
|
503 |
|
NON_EXEC_STACK |