32 |
; * |
; * |
33 |
; * History: |
; * History: |
34 |
; * |
; * |
35 |
|
; * 24.04.2002 had to use sse2's movdqu instead of movdqa (???) |
36 |
|
; * 17.04.2002 sse2 stuff |
37 |
; * 22.03.2002 0.01 ; Min Chen <chenm001@163.com> |
; * 22.03.2002 0.01 ; Min Chen <chenm001@163.com> |
38 |
; * ; use 386 cpu's 'BTS' to replace 'cbp |= 1 << (edx-1)' |
; * ; use 386 cpu's 'BTS' to replace 'cbp |= 1 << (edx-1)' |
39 |
; * 24.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au> |
; * 24.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au> |
54 |
%endif |
%endif |
55 |
%endmacro |
%endmacro |
56 |
|
|
57 |
align 16

; Word mask ANDed with the first 16-byte row of each coefficient block:
; word 0 (the DC coefficient) is cleared, the other seven words pass through.
; All eight words must be present because calc_cbp_sse2 loads a full 16 bytes
; from this label; a 4-word table would leave the upper half of the mask
; undefined.
ignore_dc	dw	0, -1, -1, -1, -1, -1, -1, -1
60 |
|
|
61 |
|
|
62 |
section .text |
section .text |
138 |
pop ebx |
pop ebx |
139 |
|
|
140 |
ret |
ret |
141 |
|
|
142 |
|
|
143 |
|
|
144 |
|
;===========================================================================
;
; uint32_t calc_cbp_sse2(const int16_t coeff[6][64]);
;
; Builds a 6-bit coded-block pattern: bit (5 - i) of the result is set when
; block i holds at least one non-zero coefficient other than coeff[i][0]
; (the DC term, which is masked out with ignore_dc).
;
; In:    [esp + 4] = coeff (stack argument). Every 16-byte row is accessed
;        with movdqa / por xmm, m128, so coeff must be 16-byte aligned.
; Out:   eax = cbp
; Clobb: ecx, xmm0, xmm1, xmm6, xmm7, flags (esi is saved and restored)
;
; not enabled - slower than mmx?
;
;===========================================================================

%define CBP_BLOCK_BYTES	128		; 64 coefficients * 2 bytes

;---------------------------------------------------------------------------
; CBP_BLOCK_SSE2 bit, offset
;
; ORs together the eight 16-byte rows of the block at [esi + offset], with
; the first word of row 0 masked by xmm7, and sets bit %1 of eax when the
; result is non-zero.
;
; Expects:  esi = coeff, xmm6 = 0, xmm7 = ignore_dc mask
; Clobbers: ecx, xmm0, xmm1, flags
;---------------------------------------------------------------------------
%macro CBP_BLOCK_SSE2 2
	movdqa	xmm0, [esi + %2]
	pand	xmm0, xmm7		; drop the DC coefficient
	movdqa	xmm1, [esi + %2 + 16]

	por	xmm0, [esi + %2 + 32]	; two independent OR chains (xmm0 / xmm1)
	por	xmm1, [esi + %2 + 48]
	por	xmm0, [esi + %2 + 64]
	por	xmm1, [esi + %2 + 80]
	por	xmm0, [esi + %2 + 96]
	por	xmm1, [esi + %2 + 112]

	por	xmm0, xmm1		; xmm0 = OR of all 64 coefficients (DC masked)
	psadbw	xmm0, xmm6		; vs. zero: one 16-bit byte-sum per 64-bit half
	movhlps	xmm1, xmm0		; low qword of xmm1 = high qword of xmm0
	por	xmm0, xmm1		; low qword is non-zero iff any byte was set
	movd	ecx, xmm0		; each sum is <= 8 * 255, so the low dword suffices

	test	ecx, ecx
	jz	%%clear
	bts	eax, %1			; cbp |= 1 << %1
%%clear:
%endmacro

align 16
cglobal calc_cbp_sse2
calc_cbp_sse2:
	push	esi

	mov	esi, [esp + 4 + 4]	; coeff (skip saved esi + return address)
	; Unaligned load kept deliberately: the file history records that movdqa
	; could not be used here, and nothing in this routine guarantees that
	; ignore_dc ends up 16-byte aligned in the final image.
	movdqu	xmm7, [ignore_dc]	; mask to ignore dc value

	xor	eax, eax		; cbp = 0
	pxor	xmm6, xmm6		; zeroes to help psadbw

	CBP_BLOCK_SSE2	5, 0 * CBP_BLOCK_BYTES
	CBP_BLOCK_SSE2	4, 1 * CBP_BLOCK_BYTES
	CBP_BLOCK_SSE2	3, 2 * CBP_BLOCK_BYTES
	CBP_BLOCK_SSE2	2, 3 * CBP_BLOCK_BYTES
	CBP_BLOCK_SSE2	1, 4 * CBP_BLOCK_BYTES
	CBP_BLOCK_SSE2	0, 5 * CBP_BLOCK_BYTES

	pop	esi
	ret