20 |
; * Ported to nasm by Peter Ross <pross@xvid.org> |
; * Ported to nasm by Peter Ross <pross@xvid.org> |
21 |
; */ |
; */ |
22 |
|
|
23 |
bits 32 |
BITS 32 |
24 |
|
|
25 |
|
;============================================================================= |
26 |
|
; Macros and other preprocessor constants |
27 |
|
;============================================================================= |
28 |
|
|
29 |
;=========================================================================== |
;-----------------------------------------------------------------------------
; cglobal name
;
; Exports `name` as a global symbol and defines ENDFUNC, which is placed
; after a function body.
;
;   PREFIX     - when defined, the exported symbol carries a leading
;                underscore and `name` is %define'd accordingly.
;   MARK_FUNCS - when defined, the symbol is exported with the `:function`
;                type plus the size expression `name.endfunc-name`, and
;                ENDFUNC expands to the local label `.endfunc`. Otherwise
;                ENDFUNC expands to nothing.
;
; NOTE(review): in the PREFIX+MARK_FUNCS branch `name` is %define'd to the
; whole `_name:function ...` text rather than to `_name` alone - confirm
; that this combination is actually built before relying on it.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
      %define ENDFUNC .endfunc
    %else
      global _%1
      %define %1 _%1
      %define ENDFUNC
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
      %define ENDFUNC .endfunc
    %else
      global %1
      %define ENDFUNC
    %endif
  %endif
%endmacro
|
d40000 dd 0x40000, 0 |
|
|
|
|
50 |
|
|
51 |
%define ROW_SHIFT 11 |
%define ROW_SHIFT 11 |
52 |
%define COL_SHIFT 20 |
%define COL_SHIFT 20 |
59 |
%define C6 8867 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 8866.956905 |
%define C6 8867 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 8866.956905 |
60 |
%define C7 4520 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 4520.335430 |
%define C7 4520 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 4520.335430 |
61 |
|
|
62 |
coeffs |
;=========================================================================== |
63 |
|
; Data (Read Only) |
64 |
|
;=========================================================================== |
65 |
|
|
66 |
|
%ifdef FORMAT_COFF |
67 |
|
SECTION .rodata |
68 |
|
%else |
69 |
|
SECTION .rodata align=16 |
70 |
|
%endif |
71 |
|
|
72 |
|
;----------------------------------------------------------------------------- |
73 |
|
; Trigonometric Tables |
74 |
|
;----------------------------------------------------------------------------- |
75 |
|
|
76 |
|
ALIGN 16
wm1010:                         ; four 16-bit words: 0, 0xffff, 0, 0xffff
  dw 0, 0xffff, 0, 0xffff

ALIGN 16
d40000:                         ; two dwords {0x40000, 0}; added with paddd in
  dd 0x40000, 0                 ; DC_COND_IDCT's .skip1 path before psrad 13
83 |
|
|
84 |
|
ALIGN 16 |
85 |
|
coeffs: |
86 |
dw 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, ; 0 |
dw 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, ; 0 |
87 |
dw 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, ; 8 |
dw 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, ; 8 |
88 |
|
|
106 |
|
|
107 |
|
|
108 |
;=========================================================================== |
;=========================================================================== |
109 |
; text |
; Helper macros |
110 |
;=========================================================================== |
;=========================================================================== |
111 |
section .text |
|
112 |
|
;--------------------------------------------------------------------------- |
113 |
|
; DC_COND_IDCT |
114 |
|
;--------------------------------------------------------------------------- |
115 |
|
|
116 |
%macro DC_COND_IDCT 8 |
%macro DC_COND_IDCT 8 |
117 |
%define src0 %1 |
%define src0 %1 |
198 |
packssdw mm4,mm0 ; A2-B2 a2-b2 A3-B3 a3-b3 |
packssdw mm4,mm0 ; A2-B2 a2-b2 A3-B3 a3-b3 |
199 |
movq [ dst + 16],mm4 |
movq [ dst + 16],mm4 |
200 |
jmp short .skip2 |
jmp short .skip2 |
201 |
.skip1 |
.skip1: |
202 |
pslld mm0,16 |
pslld mm0,16 |
203 |
paddd mm0,[d40000] |
paddd mm0,[d40000] |
204 |
psrad mm0,13 |
psrad mm0,13 |
207 |
movq [ dst + 8],mm0 |
movq [ dst + 8],mm0 |
208 |
movq [ dst + 16],mm0 |
movq [ dst + 16],mm0 |
209 |
movq [ dst + 24],mm0 |
movq [ dst + 24],mm0 |
210 |
.skip2 |
.skip2: |
211 |
%undef src0 |
%undef src0 |
212 |
%undef src4 |
%undef src4 |
213 |
%undef src1 |
%undef src1 |
218 |
%undef shift |
%undef shift |
219 |
%endmacro |
%endmacro |
220 |
|
|
221 |
|
;--------------------------------------------------------------------------- |
222 |
|
; Z_COND_IDCT |
223 |
|
;--------------------------------------------------------------------------- |
224 |
|
|
225 |
%macro Z_COND_IDCT 9 |
%macro Z_COND_IDCT 9 |
226 |
%define src0 %1 |
%define src0 %1 |
317 |
%undef bt |
%undef bt |
318 |
%endmacro |
%endmacro |
319 |
|
|
320 |
|
;--------------------------------------------------------------------------- |
321 |
|
; IDCT0 |
322 |
|
;--------------------------------------------------------------------------- |
323 |
|
|
324 |
%macro IDCT0 8 |
%macro IDCT0 8 |
325 |
%define src0 %1 |
%define src0 %1 |
414 |
%undef shift |
%undef shift |
415 |
%endmacro |
%endmacro |
416 |
|
|
417 |
|
;--------------------------------------------------------------------------- |
418 |
|
; IDCT4 |
419 |
|
;--------------------------------------------------------------------------- |
420 |
|
|
421 |
%macro IDCT4 8 |
%macro IDCT4 8 |
422 |
%define src0 %1 |
%define src0 %1 |
499 |
%undef shift |
%undef shift |
500 |
%endmacro |
%endmacro |
501 |
|
|
502 |
|
;--------------------------------------------------------------------------- |
503 |
|
; IDCT6 |
504 |
|
;--------------------------------------------------------------------------- |
505 |
|
|
506 |
%macro IDCT6 8 |
%macro IDCT6 8 |
507 |
%define src0 %1 |
%define src0 %1 |
575 |
%undef shift |
%undef shift |
576 |
%endmacro |
%endmacro |
577 |
|
|
578 |
|
;--------------------------------------------------------------------------- |
579 |
|
; IDCT2 |
580 |
|
;--------------------------------------------------------------------------- |
581 |
|
|
582 |
%macro IDCT2 8 |
%macro IDCT2 8 |
583 |
%define src0 %1 |
%define src0 %1 |
663 |
%undef shift |
%undef shift |
664 |
%endmacro |
%endmacro |
665 |
|
|
666 |
|
;--------------------------------------------------------------------------- |
667 |
|
; IDCT3 |
668 |
|
;--------------------------------------------------------------------------- |
669 |
|
|
670 |
%macro IDCT3 8 |
%macro IDCT3 8 |
671 |
%define src0 %1 |
%define src0 %1 |
739 |
%undef shift |
%undef shift |
740 |
%endmacro |
%endmacro |
741 |
|
|
742 |
|
;--------------------------------------------------------------------------- |
743 |
|
; IDCT5 |
744 |
|
;--------------------------------------------------------------------------- |
745 |
|
|
746 |
%macro IDCT5 8 |
%macro IDCT5 8 |
747 |
%define src0 %1 |
%define src0 %1 |
817 |
%undef shift |
%undef shift |
818 |
%endmacro |
%endmacro |
819 |
|
|
820 |
|
;--------------------------------------------------------------------------- |
821 |
|
; IDCT1 |
822 |
|
;--------------------------------------------------------------------------- |
823 |
|
|
824 |
%macro IDCT1 8 |
%macro IDCT1 8 |
825 |
%define src0 %1 |
%define src0 %1 |
902 |
%undef shift |
%undef shift |
903 |
%endmacro |
%endmacro |
904 |
|
|
905 |
|
;--------------------------------------------------------------------------- |
906 |
|
; IDCT7 |
907 |
|
;--------------------------------------------------------------------------- |
908 |
|
|
909 |
%macro IDCT7 8 |
%macro IDCT7 8 |
910 |
%define src0 %1 |
%define src0 %1 |
954 |
%undef shift |
%undef shift |
955 |
%endmacro |
%endmacro |
956 |
|
|
957 |
|
;--------------------------------------------------------------------------- |
958 |
|
; Permutation helpers |
959 |
|
;--------------------------------------------------------------------------- |
960 |
|
|
961 |
|
; XLODA dst, src - start of a permutation cycle.
;   bx <- word at index src, ax <- word at index dst, then word[dst] <- bx.
;   On exit ax holds the value displaced from dst.
;   Indices are in 16-bit units relative to srcP (defined by the caller).
%macro XLODA 2
  mov bx, [srcP+2*%2]           ; get src contents
  mov ax, [srcP+2*%1]           ; get dest contents
  mov [srcP+2*%1], bx           ; store new dest val
%endmacro
966 |
|
|
967 |
|
; XCHGA dst, src - cycle step when the value in flight is in bx.
;   ax <- word at index dst, then word[dst] <- bx.
;   The second argument is not referenced by the body; call sites pass the
;   index the in-flight value came from.
%macro XCHGA 2
  mov ax, [srcP+2*%1]           ; get dest contents
  mov [srcP+2*%1], bx           ; store new dest val
%endmacro
971 |
|
|
972 |
; XCHGB dst, src - cycle step when the value in flight is in ax.
;   bx <- word at index dst, then word[dst] <- ax.
;   The second argument is not referenced by the body; call sites pass the
;   index the in-flight value came from.
%macro XCHGB 2
  mov bx, [srcP+2*%1]           ; get dest contents
  mov [srcP+2*%1], ax           ; store new dest val
%endmacro
976 |
|
|
977 |
|
; XSTRA dst, src - end of a cycle when the value in flight is in bx.
;   word[dst] <- bx. The second argument is not referenced by the body.
%macro XSTRA 2
  mov [srcP+2*%1], bx           ; store dest val
%endmacro
980 |
|
|
981 |
; XSTRB dst, src - end of a cycle when the value in flight is in ax.
;   word[dst] <- ax. The second argument is not referenced by the body.
%macro XSTRB 2
  mov [srcP+2*%1], ax           ; store dest val
%endmacro
984 |
|
|
985 |
|
;--------------------------------------------------------------------------- |
986 |
|
; Permutation macro |
987 |
|
;--------------------------------------------------------------------------- |
988 |
|
|
989 |
|
; PERMUTEP base
;
; Permutes, in place, the 16-bit words addressed by `base`. Each group below
; is one cycle of the permutation: XLODA opens it, XCHGB/XCHGA alternate
; (the displaced word ping-pongs between ax and bx), and XSTRA/XSTRB closes
; it by writing the last displaced word into the slot the cycle started
; from. Every line reads "OP dst, src": the word from index src ends up at
; index dst.
;
; Indices 0x00, 0x1B, 0x1F, 0x3B and 0x3F map to themselves; they are kept
; as commented-out lines for reference.
;
; Clobbers ax (low half of eax). ebx is saved and restored around the body,
; so `base` must not be expressed in terms of esp, eax or ebx.
%macro PERMUTEP 1
%define srcP %1
  push ebx

;  XCHGA 0x00, 0x00             ; nothing to do

  XLODA 0x08, 0x01
  XCHGB 0x10, 0x08
  XCHGA 0x20, 0x10
  XCHGB 0x02, 0x20
  XCHGA 0x04, 0x02
  XSTRB 0x01, 0x04

  XLODA 0x09, 0x03
  XCHGB 0x18, 0x09
  XCHGA 0x12, 0x18
  XCHGB 0x24, 0x12
  XSTRA 0x03, 0x24

  XLODA 0x0C, 0x05
  XCHGB 0x11, 0x0C
  XCHGA 0x28, 0x11
  XCHGB 0x30, 0x28
  XCHGA 0x22, 0x30
  XCHGB 0x06, 0x22
  XSTRA 0x05, 0x06

  XLODA 0x0D, 0x07
  XCHGB 0x1C, 0x0D
  XCHGA 0x13, 0x1C
  XCHGB 0x29, 0x13
  XCHGA 0x38, 0x29
  XCHGB 0x32, 0x38
  XCHGA 0x26, 0x32
  XSTRB 0x07, 0x26

  XLODA 0x14, 0x0A
  XCHGB 0x21, 0x14
  XSTRA 0x0A, 0x21

  XLODA 0x19, 0x0B
  XCHGB 0x1A, 0x19
  XCHGA 0x16, 0x1A
  XCHGB 0x25, 0x16
  XCHGA 0x0E, 0x25
  XCHGB 0x15, 0x0E
  XCHGA 0x2C, 0x15
  XCHGB 0x31, 0x2C
  XCHGA 0x2A, 0x31
  XCHGB 0x34, 0x2A
  XCHGA 0x23, 0x34
  XSTRB 0x0B, 0x23

  XLODA 0x1D, 0x0F
  XCHGB 0x1E, 0x1D
  XCHGA 0x17, 0x1E
  XCHGB 0x2D, 0x17
  XCHGA 0x3C, 0x2D
  XCHGB 0x33, 0x3C
  XCHGA 0x2B, 0x33
  XCHGB 0x39, 0x2B
  XCHGA 0x3A, 0x39
  XCHGB 0x36, 0x3A
  XCHGA 0x27, 0x36
  XSTRB 0x0F, 0x27

;  XCHGA 0x1B, 0x1B

;  XCHGA 0x1F, 0x1F

  XLODA 0x35, 0x2E
  XSTRB 0x2E, 0x35

  XLODA 0x3D, 0x2F
  XCHGB 0x3E, 0x3D
  XCHGA 0x37, 0x3E
  XSTRB 0x2F, 0x37

;  XCHGA 0x3B, 0x3B

;  XCHGA 0x3F, 0x3F

  pop ebx
%undef srcP
%endmacro
1073 |
|
|
1074 |
|
;============================================================================= |
1075 |
|
; Code |
1076 |
|
;============================================================================= |
1077 |
|
|
1078 |
|
SECTION .text

; Exported entry points (see cglobal for the PREFIX / MARK_FUNCS handling).
cglobal simple_idct_mmx_P
cglobal simple_idct_mmx
|
1083 |
|
;----------------------------------------------------------------------------- |
1084 |
|
; void simple_idct_mmx_P(int16_t * const block) |
1085 |
|
; expects input data to be permutated |
1086 |
|
;----------------------------------------------------------------------------- |
1087 |
|
|
1088 |
|
ALIGN 16 |
1089 |
|
simple_idct_mmx_P: |
1090 |
sub esp, 128 |
sub esp, 128 |
1091 |
mov edx, [esp+128+4] |
mov edx, [esp+128+4] |
1092 |
|
|
1093 |
; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt |
; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt |
|
|
|
1094 |
DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11 |
DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11 |
1095 |
Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .four |
Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .four |
1096 |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .two |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .two |
1101 |
IDCT0 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
IDCT0 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1102 |
jmp .ret |
jmp .ret |
1103 |
|
|
1104 |
align 16 |
ALIGN 16 |
1105 |
.four |
.four: |
1106 |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .six |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .six |
1107 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .five |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .five |
1108 |
IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1111 |
IDCT4 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
IDCT4 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1112 |
jmp .ret |
jmp .ret |
1113 |
|
|
1114 |
align 16 |
ALIGN 16 |
1115 |
.six |
.six: |
1116 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .seven |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .seven |
1117 |
IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1118 |
IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1120 |
IDCT6 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
IDCT6 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1121 |
jmp .ret |
jmp .ret |
1122 |
|
|
1123 |
align 16 |
ALIGN 16 |
1124 |
.two |
.two: |
1125 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .three |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .three |
1126 |
IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1127 |
IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1129 |
IDCT2 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
IDCT2 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1130 |
jmp .ret |
jmp .ret |
1131 |
|
|
1132 |
align 16 |
ALIGN 16 |
1133 |
.three |
.three: |
1134 |
IDCT3 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
IDCT3 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1135 |
IDCT3 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
IDCT3 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1136 |
IDCT3 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
IDCT3 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1137 |
IDCT3 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
IDCT3 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1138 |
jmp .ret |
jmp .ret |
1139 |
|
|
1140 |
align 16 |
ALIGN 16 |
1141 |
.five |
.five: |
1142 |
IDCT5 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
IDCT5 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1143 |
; IDCT5 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
; IDCT5 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1144 |
IDCT5 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
IDCT5 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1145 |
; IDCT5 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
; IDCT5 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1146 |
jmp .ret |
jmp .ret |
1147 |
|
|
1148 |
align 16 |
ALIGN 16 |
1149 |
.one |
.one: |
1150 |
IDCT1 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
IDCT1 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1151 |
IDCT1 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
IDCT1 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1152 |
IDCT1 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
IDCT1 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1153 |
IDCT1 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
IDCT1 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1154 |
jmp .ret |
jmp .ret |
1155 |
|
|
1156 |
align 16 |
ALIGN 16 |
1157 |
.seven |
.seven: |
1158 |
IDCT7 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
IDCT7 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1159 |
; IDCT7 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
; IDCT7 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1160 |
IDCT7 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
IDCT7 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1161 |
; IDCT7 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
; IDCT7 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1162 |
|
|
1163 |
.ret |
.ret: |
1164 |
add esp, 128 |
add esp, 128 |
1165 |
|
|
1166 |
|
ret |
1167 |
|
ENDFUNC |
1168 |
|
|
1169 |
|
|
1170 |
|
;-----------------------------------------------------------------------------
; void simple_idct_mmx(int16_t * const block)
;
; simple_idct_mmx is the same function as simple_idct_mmx_P above except that
; on entry it will do a fast in-line and in-place permutation on the iDCT parm
; list. This means that same parm list will also not have to be copied on the
; way out. - trbarry 6/2003
;
; In:    [esp+4] = block (first stack argument)
; Stack: 128 bytes of scratch at esp..esp+127; the *_COND_IDCT macros are
;        given esp+N as dst and the IDCTn macros are given esp+N as sources
;        with edx+N as dst.
; Regs:  edx = block for the whole routine; PERMUTEP uses ax and saves and
;        restores ebx. Registers used by the IDCT macros are not visible
;        from here.
; NOTE(review): the last Z_COND_IDCT argument (`bt`) is a local label,
;        presumably the branch target taken when the tested inputs are all
;        zero - confirm against the macro body.
;-----------------------------------------------------------------------------

ALIGN 16
simple_idct_mmx:
  sub esp, 128                  ; reserve the scratch buffer
  mov edx, [esp+128+4]          ; edx = block (skip scratch + return address)
  PERMUTEP edx                  ; permute parm list in place

  ; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt
  DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11
  Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .fourP
  Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .twoP
  Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .oneP
  IDCT0 esp,    esp+64, esp+32, esp+96, edx,    nop, 0, 20
  IDCT0 esp+8,  esp+72, esp+40, esp+104,edx+4,  nop, 0, 20
  IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8,  nop, 0, 20
  IDCT0 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
  jmp .retP

ALIGN 16
.fourP:
  Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .sixP
  Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .fiveP
  IDCT4 esp,    esp+64, esp+32, esp+96, edx,    nop, 0, 20
  IDCT4 esp+8,  esp+72, esp+40, esp+104,edx+4,  nop, 0, 20
  IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8,  nop, 0, 20
  IDCT4 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
  jmp .retP

ALIGN 16
.sixP:
  Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .sevenP
  IDCT6 esp,    esp+64, esp+32, esp+96, edx,    nop, 0, 20
  IDCT6 esp+8,  esp+72, esp+40, esp+104,edx+4,  nop, 0, 20
  IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8,  nop, 0, 20
  IDCT6 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
  jmp .retP

ALIGN 16
.twoP:
  Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .threeP
  IDCT2 esp,    esp+64, esp+32, esp+96, edx,    nop, 0, 20
  IDCT2 esp+8,  esp+72, esp+40, esp+104,edx+4,  nop, 0, 20
  IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8,  nop, 0, 20
  IDCT2 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
  jmp .retP

ALIGN 16
.threeP:
  IDCT3 esp,    esp+64, esp+32, esp+96, edx,    nop, 0, 20
  IDCT3 esp+8,  esp+72, esp+40, esp+104,edx+4,  nop, 0, 20
  IDCT3 esp+16, esp+80, esp+48, esp+112,edx+8,  nop, 0, 20
  IDCT3 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
  jmp .retP

ALIGN 16
.fiveP:
  ; only the +0 and +16 invocations are active here
  IDCT5 esp,    esp+64, esp+32, esp+96, edx,    nop, 0, 20
  ; IDCT5 esp+8,  esp+72, esp+40, esp+104,edx+4,  nop, 0, 20
  IDCT5 esp+16, esp+80, esp+48, esp+112,edx+8,  nop, 0, 20
  ; IDCT5 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
  jmp .retP

ALIGN 16
.oneP:
  IDCT1 esp,    esp+64, esp+32, esp+96, edx,    nop, 0, 20
  IDCT1 esp+8,  esp+72, esp+40, esp+104,edx+4,  nop, 0, 20
  IDCT1 esp+16, esp+80, esp+48, esp+112,edx+8,  nop, 0, 20
  IDCT1 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
  jmp .retP

ALIGN 16
.sevenP:
  ; only the +0 and +16 invocations are active here; falls through to .retP
  IDCT7 esp,    esp+64, esp+32, esp+96, edx,    nop, 0, 20
  ; IDCT7 esp+8,  esp+72, esp+40, esp+104,edx+4,  nop, 0, 20
  IDCT7 esp+16, esp+80, esp+48, esp+112,edx+8,  nop, 0, 20
  ; IDCT7 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20

.retP:
  add esp, 128                  ; release the scratch buffer

  ret
ENDFUNC
1260 |
|
|
1261 |
|
|
1262 |
|
; For ELF output, emit an empty .note.GNU-stack section (noalloc, noexec,
; nowrite); by convention this tells the linker that this object does not
; need an executable stack.
%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif
1265 |
|
|