913 |
%endmacro |
%endmacro |
914 |
|
|
915 |
|
|
916 |
; void simple_idct_mmx(int16_t * const block); |
; void simple_idct_mmx_P(int16_t * const block); |
917 |
|
; expects the input block to be already permuted by the caller |
918 |
|
; |
919 |
align 16 |
align 16 |
920 |
cglobal simple_idct_mmx |
cglobal simple_idct_mmx_P |
921 |
simple_idct_mmx |
simple_idct_mmx_P |
922 |
sub esp, 128 |
sub esp, 128 |
923 |
mov edx, [esp+128+4] |
mov edx, [esp+128+4] |
924 |
|
|
998 |
ret |
ret |
999 |
|
|
1000 |
|
|
1001 |
|
;------------------ again with permuted parms -------- |
1002 |
|
; |
1003 |
|
; simple_idct_mmx is the same function as simple_idct_mmx_P above, except that on entry it |
1004 |
|
; performs a fast inline, in-place permutation of the iDCT parameter block. This means the same |
1005 |
|
; block also does not have to be copied on the way out. - trbarry 6/2003 |
1006 |
|
|
1007 |
|
; XLODA dst, src -- first step of a permutation chain. |
; Copies the 16-bit word at index src (%2) to index dst (%1), both relative to srcP. |
; The word that was at dst is left in ax, to be consumed by a following XCHGB or XSTRB. |
; Clobbers ax and bx; srcP must be %define'd by the caller (see PERMUTEP). |
%macro XLODA 2 |
1008 |
|
mov bx, [srcP+2*%2] ; bx = word at src index |
1009 |
|
mov ax, [srcP+2*%1] ; ax = old word at dst index (carried to the next step) |
1010 |
|
mov [srcP+2*%1], bx ; dst index now holds the src word |
1011 |
|
%endmacro |
1012 |
|
|
1013 |
|
; XCHGA dst, src -- middle step of a chain when the carried word is in bx. |
; Saves the word at index dst (%1) into ax, then stores bx at dst. |
; The next step must therefore consume ax (XCHGB or XSTRB). |
; The second parameter (%2) is not referenced by the body; at the call sites it |
; names the index the carried word came from. |
%macro XCHGA 2 |
1014 |
|
mov ax, [srcP+2*%1] ; ax = old word at dst index (carried to the next step) |
1015 |
|
mov [srcP+2*%1], bx ; dst index now holds the word carried in bx |
1016 |
|
%endmacro |
1017 |
|
|
1018 |
|
; XCHGB dst, src -- middle step of a chain when the carried word is in ax. |
; Saves the word at index dst (%1) into bx, then stores ax at dst. |
; The next step must therefore consume bx (XCHGA or XSTRA). |
; The second parameter (%2) is not referenced by the body; at the call sites it |
; names the index the carried word came from. |
%macro XCHGB 2 |
1019 |
|
mov bx, [srcP+2*%1] ; bx = old word at dst index (carried to the next step) |
1020 |
|
mov [srcP+2*%1], ax ; dst index now holds the word carried in ax |
1021 |
|
%endmacro |
1022 |
|
|
1023 |
|
; XSTRA dst, src -- last step of a chain when the carried word is in bx. |
; Stores bx at index dst (%1) and carries nothing forward. |
; The second parameter (%2) is not referenced by the body. |
%macro XSTRA 2 |
1024 |
|
mov [srcP+2*%1], bx ; close the chain: dst index receives the word carried in bx |
1025 |
|
%endmacro |
1026 |
|
|
1027 |
|
; XSTRB dst, src -- last step of a chain when the carried word is in ax. |
; Stores ax at index dst (%1) and carries nothing forward. |
; The second parameter (%2) is not referenced by the body. |
%macro XSTRB 2 |
1028 |
|
mov [srcP+2*%1], ax ; close the chain: dst index receives the word carried in ax |
1029 |
|
%endmacro |
1030 |
|
|
1031 |
|
; PERMUTEP base -- permute 64 16-bit words in place at [base]. |
; The permutation is written out as nine closed chains of word moves; each chain |
; starts with XLODA, alternates XCHGB/XCHGA, and ends with XSTRA or XSTRB. |
; In the chain comments below "a -> b" means the word at index a moves to index b. |
; Indices 0x00, 0x1B, 0x1F, 0x3B and 0x3F stay where they are (commented-out lines). |
; ebx is saved and restored around the body; ax is clobbered. |
; NOTE: the push of ebx moves esp, so base must not be an esp-relative expression. |
%macro PERMUTEP 1 |
1032 |
|
%define srcP %1 |
1033 |
|
push ebx |
1034 |
|
|
1035 |
|
; XCHGA 0x00, 0x00 ; nothing to do |
1036 |
|
|
1037 |
|
; chain: 01 -> 08 -> 10 -> 20 -> 02 -> 04 -> 01 |
XLODA 0x08, 0x01 |
1038 |
|
XCHGB 0x10, 0x08 |
1039 |
|
XCHGA 0x20, 0x10 |
1040 |
|
XCHGB 0x02, 0x20 |
1041 |
|
XCHGA 0x04, 0x02 |
1042 |
|
XSTRB 0x01, 0x04 |
1043 |
|
|
1044 |
|
; chain: 03 -> 09 -> 18 -> 12 -> 24 -> 03 |
XLODA 0x09, 0x03 |
1045 |
|
XCHGB 0x18, 0x09 |
1046 |
|
XCHGA 0x12, 0x18 |
1047 |
|
XCHGB 0x24, 0x12 |
1048 |
|
XSTRA 0x03, 0x24 |
1049 |
|
|
1050 |
|
; chain: 05 -> 0C -> 11 -> 28 -> 30 -> 22 -> 06 -> 05 |
XLODA 0x0C, 0x05 |
1051 |
|
XCHGB 0x11, 0x0C |
1052 |
|
XCHGA 0x28, 0x11 |
1053 |
|
XCHGB 0x30, 0x28 |
1054 |
|
XCHGA 0x22, 0x30 |
1055 |
|
XCHGB 0x06, 0x22 |
1056 |
|
XSTRA 0x05, 0x06 |
1057 |
|
|
1058 |
|
; chain: 07 -> 0D -> 1C -> 13 -> 29 -> 38 -> 32 -> 26 -> 07 |
XLODA 0x0D, 0x07 |
1059 |
|
XCHGB 0x1C, 0x0D |
1060 |
|
XCHGA 0x13, 0x1C |
1061 |
|
XCHGB 0x29, 0x13 |
1062 |
|
XCHGA 0x38, 0x29 |
1063 |
|
XCHGB 0x32, 0x38 |
1064 |
|
XCHGA 0x26, 0x32 |
1065 |
|
XSTRB 0x07, 0x26 |
1066 |
|
|
1067 |
|
; chain: 0A -> 14 -> 21 -> 0A |
XLODA 0x14, 0x0A |
1068 |
|
XCHGB 0x21, 0x14 |
1069 |
|
XSTRA 0x0A, 0x21 |
1070 |
|
|
1071 |
|
; chain: 0B -> 19 -> 1A -> 16 -> 25 -> 0E -> 15 -> 2C -> 31 -> 2A -> 34 -> 23 -> 0B |
XLODA 0x19, 0x0B |
1072 |
|
XCHGB 0x1A, 0x19 |
1073 |
|
XCHGA 0x16, 0x1A |
1074 |
|
XCHGB 0x25, 0x16 |
1075 |
|
XCHGA 0x0E, 0x25 |
1076 |
|
XCHGB 0x15, 0x0E |
1077 |
|
XCHGA 0x2C, 0x15 |
1078 |
|
XCHGB 0x31, 0x2C |
1079 |
|
XCHGA 0x2A, 0x31 |
1080 |
|
XCHGB 0x34, 0x2A |
1081 |
|
XCHGA 0x23, 0x34 |
1082 |
|
XSTRB 0x0B, 0x23 |
1083 |
|
|
1084 |
|
; chain: 0F -> 1D -> 1E -> 17 -> 2D -> 3C -> 33 -> 2B -> 39 -> 3A -> 36 -> 27 -> 0F |
XLODA 0x1D, 0x0F |
1085 |
|
XCHGB 0x1E, 0x1D |
1086 |
|
XCHGA 0x17, 0x1E |
1087 |
|
XCHGB 0x2D, 0x17 |
1088 |
|
XCHGA 0x3C, 0x2D |
1089 |
|
XCHGB 0x33, 0x3C |
1090 |
|
XCHGA 0x2B, 0x33 |
1091 |
|
XCHGB 0x39, 0x2B |
1092 |
|
XCHGA 0x3A, 0x39 |
1093 |
|
XCHGB 0x36, 0x3A |
1094 |
|
XCHGA 0x27, 0x36 |
1095 |
|
XSTRB 0x0F, 0x27 |
1096 |
|
|
1097 |
|
; XCHGA 0x1B, 0x1B |
1098 |
|
|
1099 |
|
; XCHGA 0x1F, 0x1F |
1100 |
|
|
1101 |
|
; chain: 2E <-> 35 (plain swap) |
XLODA 0x35, 0x2E |
1102 |
|
XSTRB 0x2E, 0x35 |
1103 |
|
|
1104 |
|
; chain: 2F -> 3D -> 3E -> 37 -> 2F |
XLODA 0x3D, 0x2F |
1105 |
|
XCHGB 0x3E, 0x3D |
1106 |
|
XCHGA 0x37, 0x3E |
1107 |
|
XSTRB 0x2F, 0x37 |
1108 |
|
|
1109 |
|
; XCHGA 0x3B, 0x3B |
1110 |
|
|
1111 |
|
; XCHGA 0x3F, 0x3F |
1112 |
|
pop ebx |
1113 |
|
%undef srcP |
1114 |
|
%endmacro |
1115 |
|
|
1116 |
|
; void simple_idct_mmx(int16_t * const block); |
; |
; Entry point that first permutes the 64-word block in place (PERMUTEP) and then |
; runs the iDCT macro sequence on it. |
; Stack: reserves 128 bytes of scratch at [esp .. esp+127], released at .retP. |
; Registers: edx = block pointer for the whole function; PERMUTEP clobbers ax. |
; DC_COND_IDCT, Z_COND_IDCT, IDCT0..IDCT7 and coeffs are defined outside this |
; view, so anything said about them below is hedged. |
1117 |
|
align 16 |
1118 |
|
cglobal simple_idct_mmx |
1119 |
|
|
1120 |
|
simple_idct_mmx |
1121 |
|
sub esp, 128 ; reserve 128 bytes of scratch on the stack |
1122 |
|
mov edx, [esp+128+4] ; edx = first stack argument (block, per the prototype above) |
1123 |
|
PERMUTEP edx ; permute parm list in place |
1124 |
|
|
1125 |
|
; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt |
; (argument legend for the *_COND_IDCT lines; "bt" is presumably the branch-target label) |
1126 |
|
|
1127 |
|
; First stage: four invocations read 32-byte groups of the block (edx+0/32/64/96) |
; and target the stack scratch (esp+0/32/64/96). The label passed as the last |
; argument selects one of the specialised second-stage variants below. |
DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11 |
1128 |
|
Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .fourP |
1129 |
|
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .twoP |
1130 |
|
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .oneP |
1131 |
|
; Second stage, general variant: sources are the stack scratch, destination is the block (edx). |
IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1132 |
|
IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1133 |
|
IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1134 |
|
IDCT0 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1135 |
|
jmp .retP |
1136 |
|
|
1137 |
|
align 16 |
1138 |
|
; Reached via the label argument of the edx+32 Z_COND_IDCT above. |
.fourP |
1139 |
|
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .sixP |
1140 |
|
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .fiveP |
1141 |
|
IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1142 |
|
IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1143 |
|
IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1144 |
|
IDCT4 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1145 |
|
jmp .retP |
1146 |
|
|
1147 |
|
align 16 |
1148 |
|
; Reached from .fourP via the label argument of its edx+64 Z_COND_IDCT. |
.sixP |
1149 |
|
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .sevenP |
1150 |
|
IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1151 |
|
IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1152 |
|
IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1153 |
|
IDCT6 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1154 |
|
jmp .retP |
1155 |
|
|
1156 |
|
align 16 |
1157 |
|
; Reached via the label argument of the edx+64 Z_COND_IDCT in the main path. |
.twoP |
1158 |
|
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .threeP |
1159 |
|
IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1160 |
|
IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1161 |
|
IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1162 |
|
IDCT2 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1163 |
|
jmp .retP |
1164 |
|
|
1165 |
|
align 16 |
1166 |
|
; Reached from .twoP via the label argument of its edx+96 Z_COND_IDCT. |
.threeP |
1167 |
|
IDCT3 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1168 |
|
IDCT3 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1169 |
|
IDCT3 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1170 |
|
IDCT3 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1171 |
|
jmp .retP |
1172 |
|
|
1173 |
|
align 16 |
1174 |
|
; Reached from .fourP via the label argument of its edx+96 Z_COND_IDCT. |
; Only the esp and esp+16 invocations are active; the other two are commented out in the original. |
.fiveP |
1175 |
|
IDCT5 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1176 |
|
; IDCT5 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1177 |
|
IDCT5 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1178 |
|
; IDCT5 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1179 |
|
jmp .retP |
1180 |
|
|
1181 |
|
align 16 |
1182 |
|
; Reached via the label argument of the edx+96 Z_COND_IDCT in the main path. |
.oneP |
1183 |
|
IDCT1 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1184 |
|
IDCT1 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1185 |
|
IDCT1 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1186 |
|
IDCT1 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1187 |
|
jmp .retP |
1188 |
|
|
1189 |
|
align 16 |
1190 |
|
; Reached from .sixP via the label argument of its edx+96 Z_COND_IDCT. |
; Only the esp and esp+16 invocations are active; this variant falls through into .retP. |
.sevenP |
1191 |
|
IDCT7 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1192 |
|
; IDCT7 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1193 |
|
IDCT7 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1194 |
|
; IDCT7 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20 |
1195 |
|
|
1196 |
|
; Common exit for every variant. |
.retP |
1197 |
|
add esp, 128 ; release the 128-byte scratch area reserved on entry |
1198 |
|
ret |
1199 |
|
|
1200 |
|
|