;/**************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * xmm 8x8 block-based halfpel interpolation
; *
; * This program is an implementation of a part of one or more MPEG-4
; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
; * to use this software module in hardware or software products are
; * advised that its use may infringe existing patents or copyrights, and
; * any such use would be at such party's own risk. The original
; * developer of this software module and his/her company, and subsequent
; * editors and their companies, will have no liability for use of this
; * software or modifications or derivatives thereof.
; *
; * This program is free software; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; *************************************************************************/

; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 (Athlon) pipelines
;
;------------------------------------------------------------------------------
; 09.12.2002 Athlon optimizations contributed by Jaan Kalda
;------------------------------------------------------------------------------

bits 32 |
39 |
|
|
40 |
|
%macro cglobal 1 |
41 |
|
%ifdef PREFIX |
42 |
|
global _%1 |
43 |
|
%define %1 _%1 |
44 |
|
%else |
45 |
|
global %1 |
46 |
|
%endif |
47 |
|
%endmacro |
48 |
|
%macro nop4 0 |
49 |
|
DB 08Dh,074h,026h,0 |
50 |
|
%endmacro |
51 |
|
%ifdef FORMAT_COFF |
52 |
|
section .data data |
53 |
|
%else |
54 |
|
section .data data align=16 |
55 |
|
%endif |
56 |
|
|
57 |
|
|
58 |
|
align 16 |
59 |
|
mmx_one |
60 |
|
times 8 db 1 |
61 |
|
|
62 |
|
align 8 |
63 |
|
mm_minusone: |
64 |
|
dd -1,-1 |
65 |
|
|
66 |
|
section .text |
67 |
|
|
68 |
|
cglobal interpolate8x8_halfpel_h_3dne |
69 |
|
cglobal interpolate8x8_halfpel_v_3dne |
70 |
|
cglobal interpolate8x8_halfpel_hv_3dne |
71 |
|
|
72 |
|
;===========================================================================
;
; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
;===========================================================================

%macro COPY_H_SSE_RND0 1 |
82 |
|
%if (%1) |
83 |
|
movq mm0, [eax] |
84 |
|
%else |
85 |
|
movq mm0, [dword eax] |
86 |
|
%endif |
87 |
|
pavgb mm0, [eax+1] |
88 |
|
movq mm1, [eax+edx] |
89 |
|
pavgb mm1, [eax+edx+1] |
90 |
|
lea eax,[eax+2*edx] |
91 |
|
movq [ecx],mm0 |
92 |
|
movq [ecx+edx],mm1 |
93 |
|
%endmacro |
94 |
|
|
95 |
|
%macro COPY_H_SSE_RND1 0 |
96 |
|
movq mm0, [eax] |
97 |
|
movq mm1, [eax+edx] |
98 |
|
movq mm4, mm0 |
99 |
|
movq mm5, mm1 |
100 |
|
movq mm2, [eax+1] |
101 |
|
movq mm3, [eax+edx+1] |
102 |
|
pavgb mm0, mm2 |
103 |
|
pxor mm2, mm4 |
104 |
|
pavgb mm1, mm3 |
105 |
|
lea eax,[eax+2*edx] |
106 |
|
pxor mm3, mm5 |
107 |
|
pand mm2, mm7 |
108 |
|
pand mm3, mm7 |
109 |
|
psubb mm0, mm2 |
110 |
|
movq [ecx], mm0 |
111 |
|
psubb mm1, mm3 |
112 |
|
movq [ecx+edx], mm1 |
113 |
|
%endmacro |
114 |
|
|
115 |
|
align 16 |
116 |
|
interpolate8x8_halfpel_h_3dne: |
117 |
|
|
118 |
|
mov eax, [esp+ 8] ; Src |
119 |
|
mov edx, [esp+12] ; stride |
120 |
|
dec dword [esp+16]; rounding |
121 |
|
|
122 |
|
jz .rounding1 |
123 |
|
mov ecx, [esp+ 4] ; Dst |
124 |
|
|
125 |
|
COPY_H_SSE_RND0 0 |
126 |
|
lea ecx,[ecx+2*edx] |
127 |
|
COPY_H_SSE_RND0 1 |
128 |
|
lea ecx,[ecx+2*edx] |
129 |
|
COPY_H_SSE_RND0 1 |
130 |
|
lea ecx,[ecx+2*edx] |
131 |
|
COPY_H_SSE_RND0 1 |
132 |
|
ret |
133 |
|
|
134 |
|
.rounding1 |
135 |
|
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
136 |
|
mov ecx, [esp+ 4] ; Dst |
137 |
|
movq mm7, [mmx_one] |
138 |
|
COPY_H_SSE_RND1 |
139 |
|
lea ecx, [ecx+2*edx] |
140 |
|
COPY_H_SSE_RND1 |
141 |
|
lea ecx,[ecx+2*edx] |
142 |
|
COPY_H_SSE_RND1 |
143 |
|
lea ecx,[ecx+2*edx] |
144 |
|
COPY_H_SSE_RND1 |
145 |
|
ret |
146 |
|
|
147 |
|
;===========================================================================
;
; void interpolate8x8_halfpel_v_3dne(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
;===========================================================================

align 16 |
157 |
|
interpolate8x8_halfpel_v_3dne: |
158 |
|
|
159 |
|
mov eax, [esp+ 8] ; Src |
160 |
|
mov edx, [esp+12] ; stride |
161 |
|
dec dword [esp+16]; rounding |
162 |
|
|
163 |
|
; we process 2 line at a time |
164 |
|
|
165 |
|
jz .rounding1 |
166 |
|
pxor mm2,mm2 |
167 |
|
movq mm0, [eax] |
168 |
|
movq mm1, [eax+edx] |
169 |
|
por mm2, [eax+2*edx] |
170 |
|
mov ecx, [esp+ 4] ; Dst |
171 |
|
lea eax,[eax+2*edx] |
172 |
|
pxor mm4,mm4 |
173 |
|
pavgb mm0, mm1 |
174 |
|
pavgb mm1, mm2 |
175 |
|
movq [byte ecx],mm0 |
176 |
|
movq [ecx+edx],mm1 |
177 |
|
pxor mm6,mm6 |
178 |
|
add eax,edx |
179 |
|
lea ecx,[ecx+2*edx] |
180 |
|
movq mm3, [byte eax] |
181 |
|
por mm4, [eax+edx] |
182 |
|
lea eax,[eax+2*edx] |
183 |
|
pavgb mm2, mm3 |
184 |
|
pavgb mm3, mm4 |
185 |
|
movq [ecx],mm2 |
186 |
|
movq [ecx+edx],mm3 |
187 |
|
lea ecx,[byte ecx+2*edx] |
188 |
|
movq mm5, [byte eax] |
189 |
|
por mm6, [eax+edx] |
190 |
|
lea eax,[eax+2*edx] |
191 |
|
pavgb mm4, mm5 |
192 |
|
pavgb mm5, mm6 |
193 |
|
movq [ecx],mm4 |
194 |
|
movq [ecx+edx],mm5 |
195 |
|
lea ecx,[ecx+2*edx] |
196 |
|
movq mm7, [eax] |
197 |
|
movq mm0, [eax+edx] |
198 |
|
pavgb mm6, mm7 |
199 |
|
pavgb mm7, mm0 |
200 |
|
movq [ecx],mm6 |
201 |
|
movq [ecx+edx],mm7 |
202 |
|
ret |
203 |
|
|
204 |
|
align 8 |
205 |
|
.rounding1 |
206 |
|
pcmpeqb mm0,mm0 |
207 |
|
psubusb mm0,[eax] |
208 |
|
add eax,edx |
209 |
|
mov ecx, [esp+ 4] ; Dst |
210 |
|
push esi |
211 |
|
pcmpeqb mm1,mm1 |
212 |
|
pcmpeqb mm2,mm2 |
213 |
|
mov esi,mm_minusone |
214 |
|
psubusb mm1,[byte eax] |
215 |
|
psubusb mm2,[eax+edx] |
216 |
|
lea eax,[eax+2*edx] |
217 |
|
movq mm6, [esi] |
218 |
|
movq mm7, [esi] |
219 |
|
pavgb mm0, mm1 |
220 |
|
pavgb mm1, mm2 |
221 |
|
psubusb mm6,mm0 |
222 |
|
psubusb mm7,mm1 |
223 |
|
movq [ecx], mm6 |
224 |
|
movq [ecx+edx], mm7 |
225 |
|
lea ecx,[ecx+2*edx] |
226 |
|
pcmpeqb mm3,mm3 |
227 |
|
pcmpeqb mm4,mm4 |
228 |
|
psubusb mm3,[eax] |
229 |
|
psubusb mm4,[eax+edx] |
230 |
|
lea eax,[eax+2*edx] |
231 |
|
pavgb mm2, mm3 |
232 |
|
pavgb mm3, mm4 |
233 |
|
movq mm0, [esi] |
234 |
|
movq mm1, [esi] |
235 |
|
psubusb mm0,mm2 |
236 |
|
psubusb mm1,mm3 |
237 |
|
movq [ecx], mm0 |
238 |
|
movq [ecx+edx], mm1 |
239 |
|
lea ecx,[ecx+2*edx] |
240 |
|
|
241 |
|
pcmpeqb mm5,mm5 |
242 |
|
pcmpeqb mm6,mm6 |
243 |
|
psubusb mm5,[eax] |
244 |
|
psubusb mm6,[eax+edx] |
245 |
|
lea eax,[eax+2*edx] |
246 |
|
pavgb mm4, mm5 |
247 |
|
pavgb mm5, mm6 |
248 |
|
movq mm2, [esi] |
249 |
|
movq mm3, [esi] |
250 |
|
psubusb mm2,mm4 |
251 |
|
psubusb mm3,mm5 |
252 |
|
movq [ecx], mm2 |
253 |
|
movq [ecx+edx], mm3 |
254 |
|
lea ecx,[ecx+2*edx] |
255 |
|
pcmpeqb mm7,mm7 |
256 |
|
pcmpeqb mm0,mm0 |
257 |
|
psubusb mm7,[eax] |
258 |
|
psubusb mm0,[eax+edx] |
259 |
|
pavgb mm6, mm7 |
260 |
|
pavgb mm7, mm0 |
261 |
|
movq mm4, [esi] |
262 |
|
movq mm5, [esi] |
263 |
|
psubusb mm4,mm6 |
264 |
|
pop esi |
265 |
|
psubusb mm5,mm7 |
266 |
|
movq [ecx], mm4 |
267 |
|
movq [ecx+edx], mm5 |
268 |
|
ret |
269 |
|
;===========================================================================
;
; void interpolate8x8_halfpel_hv_3dne(uint8_t * const dst,
;                                     const uint8_t * const src,
;                                     const uint32_t stride,
;                                     const uint32_t rounding);
;
;===========================================================================

; The trick is to correct the result of 'pavgb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
; The boolean relations are:
;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.

; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).

%macro COPY_HV_SSE_RND0 0 |
291 |
|
|
292 |
|
movq mm0, [eax+edx] |
293 |
|
movq mm1, [eax+edx+1] |
294 |
|
|
295 |
|
movq mm6, mm0 |
296 |
|
pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step |
297 |
|
lea eax,[eax+2*edx] |
298 |
|
pxor mm1, mm6 ; mm1=(j^k). preserved for next step |
299 |
|
|
300 |
|
por mm3, mm1 ; ij |= jk |
301 |
|
movq mm6, mm2 |
302 |
|
pxor mm6, mm0 ; mm6 = s^t |
303 |
|
pand mm3, mm6 ; (ij|jk) &= st |
304 |
|
pavgb mm2, mm0 ; mm2 = (s+t+1)/2 |
305 |
|
movq mm6, [eax] |
306 |
|
pand mm3, mm7 ; mask lsb |
307 |
|
psubb mm2, mm3 ; apply. |
308 |
|
|
309 |
|
movq [ecx], mm2 |
310 |
|
|
311 |
|
movq mm2, [eax] |
312 |
|
movq mm3, [eax+1] |
313 |
|
pavgb mm2, mm3 ; preserved for next iteration |
314 |
|
pxor mm3, mm6 ; preserved for next iteration |
315 |
|
|
316 |
|
por mm1, mm3 |
317 |
|
movq mm6, mm0 |
318 |
|
pxor mm6, mm2 |
319 |
|
pand mm1, mm6 |
320 |
|
pavgb mm0, mm2 |
321 |
|
|
322 |
|
pand mm1, mm7 |
323 |
|
psubb mm0, mm1 |
324 |
|
|
325 |
|
movq [ecx+edx], mm0 |
326 |
|
%endmacro |
327 |
|
|
328 |
|
%macro COPY_HV_SSE_RND1 0 |
329 |
|
movq mm0, [eax+edx] |
330 |
|
movq mm1, [eax+edx+1] |
331 |
|
|
332 |
|
movq mm6, mm0 |
333 |
|
pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step |
334 |
|
lea eax,[eax+2*edx] |
335 |
|
pxor mm1, mm6 ; mm1=(j^k). preserved for next step |
336 |
|
|
337 |
|
pand mm3, mm1 |
338 |
|
movq mm6, mm2 |
339 |
|
pxor mm6, mm0 |
340 |
|
por mm3, mm6 |
341 |
|
pavgb mm2, mm0 |
342 |
|
movq mm6, [eax] |
343 |
|
pand mm3, mm7 |
344 |
|
psubb mm2, mm3 |
345 |
|
|
346 |
|
movq [ecx], mm2 |
347 |
|
|
348 |
|
movq mm2, [eax] |
349 |
|
movq mm3, [eax+1] |
350 |
|
pavgb mm2, mm3 ; preserved for next iteration |
351 |
|
pxor mm3, mm6 ; preserved for next iteration |
352 |
|
|
353 |
|
pand mm1, mm3 |
354 |
|
movq mm6, mm0 |
355 |
|
pxor mm6, mm2 |
356 |
|
por mm1, mm6 |
357 |
|
pavgb mm0, mm2 |
358 |
|
pand mm1, mm7 |
359 |
|
psubb mm0, mm1 |
360 |
|
movq [ecx+edx], mm0 |
361 |
|
%endmacro |
362 |
|
|
363 |
|
align 16 |
364 |
|
interpolate8x8_halfpel_hv_3dne: |
365 |
|
mov eax, [esp+ 8] ; Src |
366 |
|
mov edx, [esp+12] ; stride |
367 |
|
dec dword [esp+16] ; rounding |
368 |
|
|
369 |
|
; loop invariants: mm2=(i+j+1)/2 and mm3= i^j |
370 |
|
movq mm2, [eax] |
371 |
|
movq mm3, [eax+1] |
372 |
|
movq mm6, mm2 |
373 |
|
pavgb mm2, mm3 |
374 |
|
pxor mm3, mm6 ; mm2/mm3 ready |
375 |
|
mov ecx, [esp+ 4] ; Dst |
376 |
|
movq mm7, [mmx_one] |
377 |
|
|
378 |
|
jz near .rounding1 |
379 |
|
lea ebp,[byte ebp] |
380 |
|
COPY_HV_SSE_RND0 |
381 |
|
lea ecx,[ecx+2*edx] |
382 |
|
COPY_HV_SSE_RND0 |
383 |
|
lea ecx,[ecx+2*edx] |
384 |
|
COPY_HV_SSE_RND0 |
385 |
|
lea ecx,[ecx+2*edx] |
386 |
|
COPY_HV_SSE_RND0 |
387 |
|
ret |
388 |
|
|
389 |
|
align 16 |
390 |
|
.rounding1 |
391 |
|
COPY_HV_SSE_RND1 |
392 |
|
lea ecx,[ecx+2*edx] |
393 |
|
COPY_HV_SSE_RND1 |
394 |
|
lea ecx,[ecx+2*edx] |
395 |
|
COPY_HV_SSE_RND1 |
396 |
|
lea ecx,[ecx+2*edx] |
397 |
|
COPY_HV_SSE_RND1 |
398 |
|
ret |