32 |
; * |
; * |
33 |
; * History: |
; * History: |
34 |
; * |
; * |
35 |
|
; * 17.04.2002 sse2 stuff |
36 |
; * 22.03.2002 0.01 ; Min Chen <chenm001@163.com> |
; * 22.03.2002 0.01 ; Min Chen <chenm001@163.com> |
37 |
; * ; use 386 cpu's 'BTS' to replace 'cbp |= 1 << (edx-1)' |
; * ; use 386 cpu's 'BTS' to replace 'cbp |= 1 << (edx-1)' |
38 |
; * 24.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au> |
; * 24.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au> |
53 |
%endif |
%endif |
54 |
%endmacro |
%endmacro |
55 |
|
|
56 |
; Mask that clears coefficient 0 (the DC value) of a block and keeps
; coefficients 1..7 when ANDed with the first 8 words of that block.
; calc_cbp_sse2 loads it with movdqa, which reads 16 bytes and faults on an
; address that is not 16-byte aligned, hence the alignment and the 8 words.
align 16
ignore_dc       dw      0, -1, -1, -1, -1, -1, -1, -1
59 |
|
|
60 |
|
|
61 |
section .text |
section .text |
137 |
pop ebx |
pop ebx |
138 |
|
|
139 |
ret |
ret |
140 |
|
|
141 |
|
|
142 |
|
|
143 |
|
;===========================================================================
;
; uint32_t calc_cbp_sse2(const int16_t coeff[6][64]);
;
; not enabled - slower than mmx?
;
; Returns the coded block pattern in eax: bit (5 - n) is set when block n
; (n = 0..5) contains at least one non-zero coefficient other than its
; first (DC) coefficient.
;
; In:    [esp + 4] = coeff. Every block is read with movdqa / por memory
;        operands, so coeff must be 16-byte aligned.
; Out:   eax = cbp
; Clobb: ecx, xmm0, xmm1, xmm6, xmm7, flags (esi is saved and restored)
; Needs: ignore_dc to be 16 bytes long and 16-byte aligned (movdqa load)
;
;===========================================================================

; CBP_SSE2_BLOCK n
;   Tests block n (the 128 bytes at esi + n*128) and sets bit (5 - n) of eax
;   when any coefficient except the DC one is non-zero.
;   Expects:  esi = coeff, xmm7 = ignore_dc mask, xmm6 = 0
;   Clobbers: ecx, xmm0, xmm1, flags
%macro CBP_SSE2_BLOCK 1
        movdqa  xmm0, [esi + (%1)*128]          ; coefficients 0..7
        pand    xmm0, xmm7                      ; mask out the DC coefficient
        movdqa  xmm1, [esi + (%1)*128 + 16]

        por     xmm0, [esi + (%1)*128 + 32]     ; OR the remaining 16-byte groups
        por     xmm1, [esi + (%1)*128 + 48]     ; into two accumulators
        por     xmm0, [esi + (%1)*128 + 64]
        por     xmm1, [esi + (%1)*128 + 80]
        por     xmm0, [esi + (%1)*128 + 96]
        por     xmm1, [esi + (%1)*128 + 112]

        por     xmm0, xmm1                      ; xmm0 != 0 <=> a tested coefficient != 0
        psadbw  xmm0, xmm6                      ; byte sum of each 64-bit half (max 8*255, cannot wrap)
        movhlps xmm1, xmm0                      ; high qword of xmm0 -> low qword of xmm1
        por     xmm0, xmm1                      ; merge both sums into the low qword
        movd    ecx, xmm0                       ; ecx != 0 <=> values were found

        test    ecx, ecx
        jz      %%empty
        or      eax, 1 << (5 - (%1))            ; mark block n as coded
%%empty:
%endmacro

align 16
cglobal calc_cbp_sse2
calc_cbp_sse2:
        push    esi

        mov     esi, [esp + 4 + 4]              ; coeff (above saved esi + return address)
        movdqa  xmm7, [ignore_dc]               ; mask to ignore dc value

        xor     eax, eax                        ; cbp = 0
        pxor    xmm6, xmm6                      ; zeroes to help psadbw

        ; blocks 0..5 map to cbp bits 5..0
        CBP_SSE2_BLOCK 0
        CBP_SSE2_BLOCK 1
        CBP_SSE2_BLOCK 2
        CBP_SSE2_BLOCK 3
        CBP_SSE2_BLOCK 4
        CBP_SSE2_BLOCK 5

        pop     esi
        ret