Parent Directory | Revision Log
Revision 1.14 - (view) (download)
1 : | edgomez | 1.3 | ;/***************************************************************************** |
2 : | edgomez | 1.2 | ; * |
3 : | edgomez | 1.3 | ; * XVID MPEG-4 VIDEO CODEC |
4 : | ; * - 3dne pipeline optimized 8x8 block-based halfpel interpolation - | ||
5 : | edgomez | 1.2 | ; * |
6 : | edgomez | 1.3 | ; * Copyright(C) 2002 Jaan Kalda |
7 : | edgomez | 1.2 | ; * |
8 : | edgomez | 1.3 | ; * This program is free software ; you can redistribute it and/or modify |
9 : | ; * it under the terms of the GNU General Public License as published by | ||
10 : | ; * the Free Software Foundation ; either version 2 of the License, or | ||
11 : | ; * (at your option) any later version. | ||
12 : | edgomez | 1.2 | ; * |
13 : | edgomez | 1.3 | ; * This program is distributed in the hope that it will be useful, |
14 : | ; * but WITHOUT ANY WARRANTY ; without even the implied warranty of | ||
15 : | ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 : | ; * GNU General Public License for more details. | ||
17 : | edgomez | 1.2 | ; * |
18 : | edgomez | 1.3 | ; * You should have received a copy of the GNU General Public License |
19 : | ; * along with this program ; if not, write to the Free Software | ||
20 : | ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
21 : | edgomez | 1.2 | ; * |
22 : | edgomez | 1.3 | ; ****************************************************************************/ |
23 : | edgomez | 1.2 | |
24 : | edgomez | 1.3 | ; these 3dne functions are compatible with iSSE, but are optimized specifically |
25 : | ; for K7 pipelines | ||
26 : | edgomez | 1.2 | |
27 : | Isibaar | 1.11 | %include "nasm.inc" |
28 : | edgomez | 1.3 | |
29 : | ;============================================================================= | ||
30 : | ; Read only data | ||
31 : | ;============================================================================= | ||
32 : | |||
33 : | Isibaar | 1.11 | DATA |
34 : | edgomez | 1.2 | |
; mmx_one: eight bytes of 0x01. Loaded into mm7 by the h and hv routines in
; this file and used as a per-byte LSB mask for the rounding correction.
35 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
36 : | edgomez | 1.3 | mmx_one: |
37 : | times 8 db 1 | ||
38 : | |||
; mm_minusone: two dwords of -1, i.e. eight 0xFF bytes. The vertical
; rounding==1 paths load it and subtract an averaged value from it, which
; yields the bitwise complement of that value.
39 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
40 : | edgomez | 1.3 | mm_minusone: |
41 : | dd -1,-1 | ||
42 : | edgomez | 1.2 | |
43 : | edgomez | 1.3 | ;============================================================================= |
44 : | ; Macros | ||
45 : | ;============================================================================= | ||
46 : | edgomez | 1.2 | |
; nop4: emits the raw bytes 8D 74 26 00, which decode in 32-bit mode as
; "lea esi,[esi+0]" -- a 4-byte instruction with no architectural effect.
; The macro is not invoked anywhere in the visible part of this file.
; NOTE(review): in 64-bit mode the same bytes write esi and therefore clear
; the upper half of rsi -- confirm before using the macro in 64-bit builds.
47 : | edgomez | 1.3 | %macro nop4 0 |
48 : | DB 08Dh,074h,026h,0 | ||
49 : | %endmacro | ||
50 : | |||
51 : | ;============================================================================= | ||
52 : | ; Code | ||
53 : | ;============================================================================= | ||
54 : | edgomez | 1.2 | |
55 : | Isibaar | 1.12 | TEXT |
56 : | edgomez | 1.2 | |
57 : | cglobal interpolate8x8_halfpel_h_3dne | ||
58 : | cglobal interpolate8x8_halfpel_v_3dne | ||
59 : | cglobal interpolate8x8_halfpel_hv_3dne | ||
60 : | |||
61 : | suxen_drol | 1.7 | cglobal interpolate8x4_halfpel_h_3dne |
62 : | cglobal interpolate8x4_halfpel_v_3dne | ||
63 : | cglobal interpolate8x4_halfpel_hv_3dne | ||
64 : | |||
65 : | edgomez | 1.3 | ;----------------------------------------------------------------------------- |
66 : | edgomez | 1.2 | ; |
67 : | ; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst, | ||
68 : | edgomez | 1.3 | ; const uint8_t * const src, |
69 : | ; const uint32_t stride, | ||
70 : | ; const uint32_t rounding); | ||
71 : | edgomez | 1.2 | ; |
72 : | edgomez | 1.3 | ;----------------------------------------------------------------------------- |
73 : | edgomez | 1.2 | |
; COPY_H_SSE_RND0 <sel>
; Emits the code for two output rows of the horizontal half-pel filter with
; upward rounding: each output byte is pavgb(src[x], src[x+1]) = (a+b+1)>>1.
;   In:       _EAX = current source row, TMP0 = current destination row,
;             TMP1 = stride
;   Out:      _EAX advanced by 2*TMP1; TMP0 is NOT advanced (callers do it)
;   Clobbers: mm0, mm1
; <sel> only selects how the first load is spelled ([_EAX] vs [_EAX+0]);
; both forms address the same memory.
74 : | %macro COPY_H_SSE_RND0 1 | ||
75 : | %if (%1) | ||
76 : | Isibaar | 1.11 | movq mm0, [_EAX] |
77 : | edgomez | 1.2 | %else |
78 : | Isibaar | 1.11 | movq mm0, [_EAX+0] |
79 : | suxen_drol | 1.8 | ; --- |
80 : | ; nasm >0.99.x rejects the original statement: | ||
81 : | Isibaar | 1.11 | ; movq mm0, [dword _EAX] |
82 : | suxen_drol | 1.8 | ; as it is ambiguous. for this statement nasm <0.99.x would |
83 : | Isibaar | 1.11 | ; generate "movq mm0,[_EAX+0]" |
84 : | suxen_drol | 1.8 | ; --- |
85 : | edgomez | 1.2 | %endif |
; row n: average each pixel with its right-hand neighbour
86 : | Isibaar | 1.11 | pavgb mm0, [_EAX+1] |
; row n+1: same operation one stride further down
87 : | movq mm1, [_EAX+TMP1] | ||
88 : | pavgb mm1, [_EAX+TMP1+1] | ||
; step the source pointer two rows ahead before the stores
89 : | lea _EAX, [_EAX+2*TMP1] | ||
90 : | movq [TMP0], mm0 | ||
91 : | movq [TMP0+TMP1], mm1 | ||
92 : | edgomez | 1.2 | %endmacro |
93 : | |||
; COPY_H_SSE_RND1
; Emits the code for two output rows of the horizontal half-pel filter with
; downward rounding. pavgb yields (a+b+1)>>1; subtracting (a^b)&1 from it
; gives (a+b)>>1.
;   In:       _EAX = current source row, TMP0 = current destination row,
;             TMP1 = stride, mm7 = 0x01 in every byte (callers load mmx_one)
;   Out:      _EAX advanced by 2*TMP1; TMP0 is NOT advanced (callers do it)
;   Clobbers: mm0-mm5
94 : | %macro COPY_H_SSE_RND1 0 | ||
95 : | Isibaar | 1.11 | movq mm0, [_EAX] |
96 : | movq mm1, [_EAX+TMP1] | ||
; keep unmodified copies of both rows for the xor below
97 : | edgomez | 1.2 | movq mm4, mm0 |
98 : | movq mm5, mm1 | ||
; right-hand neighbours of both rows
99 : | Isibaar | 1.11 | movq mm2, [_EAX+1] |
100 : | movq mm3, [_EAX+TMP1+1] | ||
; mm0/mm1 = rounded-up averages, mm2/mm3 = a^b of each pixel pair
101 : | edgomez | 1.2 | pavgb mm0, mm2 |
102 : | pxor mm2, mm4 | ||
103 : | pavgb mm1, mm3 | ||
104 : | Isibaar | 1.11 | lea _EAX, [_EAX+2*TMP1] |
105 : | edgomez | 1.2 | pxor mm3, mm5 |
; keep only bit 0 of each xor byte: 1 exactly where a+b was odd
106 : | pand mm2, mm7 | ||
107 : | pand mm3, mm7 | ||
; undo the +1 rounding where it took effect, then store
108 : | psubb mm0, mm2 | ||
109 : | Isibaar | 1.11 | movq [TMP0], mm0 |
110 : | edgomez | 1.2 | psubb mm1, mm3 |
111 : | Isibaar | 1.11 | movq [TMP0+TMP1], mm1 |
112 : | edgomez | 1.2 | %endmacro |
113 : | |||
; Horizontal half-pel interpolation of an 8x8 block.
; Arguments (see the C prototype above): prm1 = dst, prm2 = src,
; prm3 = stride, prm4 = rounding.
; Each COPY_H_SSE_* expansion produces two rows, so four expansions cover
; the eight rows of the block.
114 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
115 : | edgomez | 1.2 | interpolate8x8_halfpel_h_3dne: |
116 : | |||
117 : | Isibaar | 1.11 | mov _EAX, prm2 ; Src |
118 : | mov TMP1, prm3 ; stride | ||
; dec sets ZF when rounding-1 == 0, i.e. when rounding == 1; the
; decremented value is written back to prm4 itself.
119 : | dec PTR_TYPE prm4; rounding | ||
120 : | edgomez | 1.2 | |
121 : | Isibaar | 1.11 | jz near .rounding1 |
; rounding != 1: plain pavgb (rounds up)
122 : | mov TMP0, prm1 ; Dst | ||
123 : | edgomez | 1.2 | |
124 : | COPY_H_SSE_RND0 0 | ||
125 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
126 : | edgomez | 1.2 | COPY_H_SSE_RND0 1 |
127 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
128 : | edgomez | 1.2 | COPY_H_SSE_RND0 1 |
129 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
130 : | edgomez | 1.2 | COPY_H_SSE_RND0 1 |
131 : | ret | ||
132 : | |||
133 : | Isibaar | 1.10 | .rounding1: |
134 : | edgomez | 1.3 | ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
135 : | Isibaar | 1.11 | mov TMP0, prm1 ; Dst |
; mm7 = 0x01 in every byte, the LSB mask COPY_H_SSE_RND1 expects
136 : | edgomez | 1.2 | movq mm7, [mmx_one] |
137 : | COPY_H_SSE_RND1 | ||
138 : | Isibaar | 1.11 | lea TMP0, [TMP0+2*TMP1] |
139 : | edgomez | 1.2 | COPY_H_SSE_RND1 |
140 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
141 : | edgomez | 1.2 | COPY_H_SSE_RND1 |
142 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
143 : | edgomez | 1.2 | COPY_H_SSE_RND1 |
144 : | ret | ||
145 : | Isibaar | 1.10 | ENDFUNC |
146 : | edgomez | 1.2 | |
147 : | edgomez | 1.3 | ;----------------------------------------------------------------------------- |
148 : | edgomez | 1.2 | ; |
149 : | ; void interpolate8x8_halfpel_v_3dne(uint8_t * const dst, | ||
150 : | edgomez | 1.3 | ; const uint8_t * const src, |
151 : | ; const uint32_t stride, | ||
152 : | ; const uint32_t rounding); | ||
153 : | edgomez | 1.2 | ; |
154 : | edgomez | 1.3 | ;----------------------------------------------------------------------------- |
155 : | edgomez | 1.2 | |
; Vertical half-pel interpolation of an 8x8 block.
; Arguments (see the C prototype above): prm1 = dst, prm2 = src,
; prm3 = stride, prm4 = rounding.
; Output row n is the average of source rows n and n+1, so nine source rows
; (0..8) are read; each row is loaded once and reused for two averages.
156 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
157 : | edgomez | 1.2 | interpolate8x8_halfpel_v_3dne: |
158 : | |||
159 : | Isibaar | 1.11 | mov _EAX, prm2 ; Src |
160 : | mov TMP1, prm3 ; stride | ||
; ZF is set when rounding-1 == 0, i.e. rounding == 1
161 : | dec PTR_TYPE prm4; rounding | ||
162 : | edgomez | 1.2 | |
163 : | ; we process 2 lines at a time | ||
164 : | |||
165 : | Isibaar | 1.11 | jz near .rounding1 |
; ---- rounding != 1: out = pavgb(a, b) = (a+b+1)>>1 ----
; A "pxor mmN,mmN" followed later by "por mmN,[mem]" leaves the memory
; operand in mmN, i.e. the pair acts as a load of that row.
; NOTE(review): the explicit "byte" size overrides on some operands look
; like instruction-length tuning -- confirm.
166 : | edgomez | 1.2 | pxor mm2,mm2 |
167 : | Isibaar | 1.11 | movq mm0, [_EAX] |
168 : | movq mm1, [_EAX+TMP1] | ||
169 : | por mm2, [_EAX+2*TMP1] | ||
170 : | mov TMP0, prm1 ; Dst | ||
; mm0/mm1/mm2 = rows 0/1/2; _EAX -> row 2
171 : | lea _EAX, [_EAX+2*TMP1] | ||
172 : | edgomez | 1.3 | pxor mm4, mm4 |
173 : | edgomez | 1.2 | pavgb mm0, mm1 |
174 : | edgomez | 1.3 | pavgb mm1, mm2 |
; store output rows 0 and 1
175 : | Isibaar | 1.11 | movq [byte TMP0], mm0 |
176 : | movq [TMP0+TMP1], mm1 | ||
177 : | edgomez | 1.3 | pxor mm6, mm6 |
; _EAX -> row 3
178 : | Isibaar | 1.11 | add _EAX, TMP1 |
179 : | lea TMP0, [TMP0+2*TMP1] | ||
; mm3/mm4 = rows 3/4; afterwards _EAX -> row 5
180 : | movq mm3, [byte _EAX] | ||
181 : | por mm4, [_EAX+TMP1] | ||
182 : | lea _EAX, [_EAX+2*TMP1] | ||
183 : | edgomez | 1.2 | pavgb mm2, mm3 |
184 : | pavgb mm3, mm4 | ||
; store output rows 2 and 3
185 : | Isibaar | 1.11 | movq [TMP0], mm2 |
186 : | movq [TMP0+TMP1], mm3 | ||
187 : | lea TMP0, [byte TMP0+2*TMP1] | ||
; mm5/mm6 = rows 5/6; afterwards _EAX -> row 7
188 : | movq mm5, [byte _EAX] | ||
189 : | por mm6, [_EAX+TMP1] | ||
190 : | lea _EAX, [_EAX+2*TMP1] | ||
191 : | edgomez | 1.2 | pavgb mm4, mm5 |
192 : | pavgb mm5, mm6 | ||
; store output rows 4 and 5
193 : | Isibaar | 1.11 | movq [TMP0], mm4 |
194 : | movq [TMP0+TMP1], mm5 | ||
195 : | lea TMP0, [TMP0+2*TMP1] | ||
; mm7/mm0 = rows 7/8
196 : | movq mm7, [_EAX] | ||
197 : | movq mm0, [_EAX+TMP1] | ||
198 : | edgomez | 1.2 | pavgb mm6, mm7 |
199 : | pavgb mm7, mm0 | ||
; store output rows 6 and 7
200 : | Isibaar | 1.11 | movq [TMP0], mm6 |
201 : | movq [TMP0+TMP1], mm7 | ||
202 : | edgomez | 1.2 | ret |
203 : | |||
204 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
205 : | Isibaar | 1.10 | .rounding1: |
; ---- rounding == 1: out = (a+b)>>1 ----
; Works on complemented pixels: pcmpeqb sets a register to all ones, and
; psubusb from all ones gives 255-x. pavgb of two complemented rows,
; subtracted again from 0xFF (mm_minusone), equals (a+b)>>1.
; _ESI holds the address of mm_minusone and is saved/restored with push/pop.
206 : | edgomez | 1.3 | pcmpeqb mm0, mm0 |
; mm0 = ~row0; _EAX -> row 1
207 : | Isibaar | 1.11 | psubusb mm0, [_EAX] |
208 : | add _EAX, TMP1 | ||
; NOTE(review): Dst is fetched before the push, presumably because prmN can
; be stack-relative -- confirm against nasm.inc.
209 : | mov TMP0, prm1 ; Dst | ||
210 : | push _ESI | ||
211 : | edgomez | 1.3 | pcmpeqb mm1, mm1 |
212 : | pcmpeqb mm2, mm2 | ||
213 : | Isibaar | 1.13 | lea _ESI, [mm_minusone] |
; mm1/mm2 = ~row1/~row2; afterwards _EAX -> row 3
214 : | Isibaar | 1.11 | psubusb mm1, [byte _EAX] |
215 : | psubusb mm2, [_EAX+TMP1] | ||
216 : | lea _EAX, [_EAX+2*TMP1] | ||
217 : | movq mm6, [_ESI] | ||
218 : | movq mm7, [_ESI] | ||
219 : | edgomez | 1.2 | pavgb mm0, mm1 |
220 : | pavgb mm1, mm2 | ||
; complement back and store output rows 0 and 1
221 : | edgomez | 1.3 | psubusb mm6, mm0 |
222 : | psubusb mm7, mm1 | ||
223 : | Isibaar | 1.11 | movq [TMP0], mm6 |
224 : | movq [TMP0+TMP1], mm7 | ||
225 : | lea TMP0, [TMP0+2*TMP1] | ||
226 : | edgomez | 1.3 | pcmpeqb mm3, mm3 |
227 : | pcmpeqb mm4, mm4 | ||
; mm3/mm4 = ~row3/~row4; afterwards _EAX -> row 5
228 : | Isibaar | 1.11 | psubusb mm3, [_EAX] |
229 : | psubusb mm4, [_EAX+TMP1] | ||
230 : | lea _EAX, [_EAX+2*TMP1] | ||
231 : | edgomez | 1.2 | pavgb mm2, mm3 |
232 : | pavgb mm3, mm4 | ||
233 : | Isibaar | 1.11 | movq mm0, [_ESI] |
234 : | movq mm1, [_ESI] | ||
; complement back and store output rows 2 and 3
235 : | edgomez | 1.3 | psubusb mm0, mm2 |
236 : | psubusb mm1, mm3 | ||
237 : | Isibaar | 1.11 | movq [TMP0], mm0 |
238 : | movq [TMP0+TMP1], mm1 | ||
239 : | lea TMP0,[TMP0+2*TMP1] | ||
240 : | edgomez | 1.2 | |
241 : | edgomez | 1.3 | pcmpeqb mm5, mm5 |
242 : | pcmpeqb mm6, mm6 | ||
; mm5/mm6 = ~row5/~row6; afterwards _EAX -> row 7
243 : | Isibaar | 1.11 | psubusb mm5, [_EAX] |
244 : | psubusb mm6, [_EAX+TMP1] | ||
245 : | lea _EAX, [_EAX+2*TMP1] | ||
246 : | edgomez | 1.2 | pavgb mm4, mm5 |
247 : | pavgb mm5, mm6 | ||
248 : | Isibaar | 1.11 | movq mm2, [_ESI] |
249 : | movq mm3, [_ESI] | ||
; complement back and store output rows 4 and 5
250 : | edgomez | 1.3 | psubusb mm2, mm4 |
251 : | psubusb mm3, mm5 | ||
252 : | Isibaar | 1.11 | movq [TMP0], mm2 |
253 : | movq [TMP0+TMP1], mm3 | ||
254 : | lea TMP0, [TMP0+2*TMP1] | ||
255 : | edgomez | 1.3 | pcmpeqb mm7, mm7 |
256 : | pcmpeqb mm0, mm0 | ||
; mm7/mm0 = ~row7/~row8
257 : | Isibaar | 1.11 | psubusb mm7, [_EAX] |
258 : | psubusb mm0, [_EAX+TMP1] | ||
259 : | edgomez | 1.2 | pavgb mm6, mm7 |
260 : | pavgb mm7, mm0 | ||
; last use of _ESI; it is restored a few instructions below
261 : | Isibaar | 1.11 | movq mm4, [_ESI] |
262 : | movq mm5, [_ESI] | ||
263 : | edgomez | 1.3 | psubusb mm4, mm6 |
264 : | Isibaar | 1.11 | pop _ESI |
265 : | edgomez | 1.3 | psubusb mm5, mm7 |
; store output rows 6 and 7
266 : | Isibaar | 1.11 | movq [TMP0], mm4 |
267 : | movq [TMP0+TMP1], mm5 | ||
268 : | edgomez | 1.2 | ret |
269 : | Isibaar | 1.10 | ENDFUNC |
270 : | edgomez | 1.3 | |
271 : | ;----------------------------------------------------------------------------- | ||
272 : | edgomez | 1.2 | ; |
273 : | ; void interpolate8x8_halfpel_hv_3dne(uint8_t * const dst, | ||
274 : | edgomez | 1.3 | ; const uint8_t * const src, |
275 : | ; const uint32_t stride, | ||
276 : | ; const uint32_t rounding); | ||
277 : | edgomez | 1.2 | ; |
278 : | ; | ||
279 : | edgomez | 1.3 | ;----------------------------------------------------------------------------- |
280 : | edgomez | 1.2 | |
281 : | ; The trick is to correct the result of 'pavgb' with some combination of the | ||
282 : | ; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). | ||
283 : | ; The boolean relations are: | ||
284 : | edgomez | 1.3 | ; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st |
285 : | edgomez | 1.2 | ; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st |
286 : | ; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st | ||
287 : | ; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st | ||
288 : | ; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t. | ||
289 : | |||
290 : | ; Moreover, we process 2 lines at a time, for better overlapping (~15% faster). | ||
291 : | |||
; COPY_HV_SSE_RND0
; Emits the code for two output rows of the 2D (h+v) half-pel filter using
; the "(s+t+1)/2 - (ij|kl)&st" correction from the relations listed above,
; i.e. (i+j+k+l+2)/4.
;   In:       _EAX = current source row, TMP0 = current destination row,
;             TMP1 = stride, mm7 = 0x01 in every byte,
;             mm2 = pavgb(row[x], row[x+1]) of the current source row,
;             mm3 = row[x] ^ row[x+1] of the current source row
;   Out:      _EAX advanced by 2*TMP1; mm2/mm3 hold the same two values for
;             the new current row; TMP0 is NOT advanced (callers do it)
;   Clobbers: mm0, mm1, mm6
292 : | %macro COPY_HV_SSE_RND0 0 | ||
293 : | |||
294 : | Isibaar | 1.11 | movq mm0, [_EAX+TMP1] |
295 : | movq mm1, [_EAX+TMP1+1] | ||
296 : | edgomez | 1.2 | |
297 : | edgomez | 1.3 | movq mm6, mm0 |
298 : | pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step | ||
299 : | Isibaar | 1.11 | lea _EAX, [_EAX+2*TMP1] |
300 : | edgomez | 1.3 | pxor mm1, mm6 ; mm1=(j^k). preserved for next step |
301 : | edgomez | 1.2 | |
302 : | edgomez | 1.3 | por mm3, mm1 ; ij |= jk |
303 : | movq mm6, mm2 | ||
304 : | pxor mm6, mm0 ; mm6 = s^t | ||
305 : | pand mm3, mm6 ; (ij|jk) &= st | ||
306 : | pavgb mm2, mm0 ; mm2 = (s+t+1)/2 | ||
; mm6 is free again: preload the next source row for the xor further down
307 : | Isibaar | 1.11 | movq mm6, [_EAX] |
308 : | edgomez | 1.3 | pand mm3, mm7 ; mask lsb |
309 : | psubb mm2, mm3 ; apply. | ||
310 : | |||
311 : | Isibaar | 1.11 | movq [TMP0], mm2 |
312 : | edgomez | 1.2 | |
; second output row: same computation, with mm0/mm1 as the upper row and
; the freshly loaded row as the lower one
313 : | Isibaar | 1.11 | movq mm2, [_EAX] |
314 : | movq mm3, [_EAX+1] | ||
315 : | edgomez | 1.3 | pavgb mm2, mm3 ; preserved for next iteration |
316 : | pxor mm3, mm6 ; preserved for next iteration | ||
317 : | |||
318 : | por mm1, mm3 | ||
319 : | movq mm6, mm0 | ||
320 : | pxor mm6, mm2 | ||
321 : | pand mm1, mm6 | ||
322 : | pavgb mm0, mm2 | ||
323 : | |||
324 : | pand mm1, mm7 | ||
325 : | psubb mm0, mm1 | ||
326 : | |||
327 : | Isibaar | 1.11 | movq [TMP0+TMP1], mm0 |
328 : | edgomez | 1.2 | %endmacro |
329 : | |||
; COPY_HV_SSE_RND1
; Emits the code for two output rows of the 2D (h+v) half-pel filter using
; the "(s+t+1)/2 - (ij&kl)|st" correction from the relations listed above,
; i.e. (i+j+k+l+1)/4. Identical to COPY_HV_SSE_RND0 except that the
; por/pand pair combining the xor terms is swapped to pand/por.
;   In:       _EAX = current source row, TMP0 = current destination row,
;             TMP1 = stride, mm7 = 0x01 in every byte,
;             mm2 = pavgb(row[x], row[x+1]) of the current source row,
;             mm3 = row[x] ^ row[x+1] of the current source row
;   Out:      _EAX advanced by 2*TMP1; mm2/mm3 hold the same two values for
;             the new current row; TMP0 is NOT advanced (callers do it)
;   Clobbers: mm0, mm1, mm6
330 : | %macro COPY_HV_SSE_RND1 0 | ||
331 : | Isibaar | 1.11 | movq mm0, [_EAX+TMP1] |
332 : | movq mm1, [_EAX+TMP1+1] | ||
333 : | edgomez | 1.3 | |
334 : | movq mm6, mm0 | ||
335 : | pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step | ||
336 : | Isibaar | 1.11 | lea _EAX,[_EAX+2*TMP1] |
337 : | edgomez | 1.3 | pxor mm1, mm6 ; mm1=(j^k). preserved for next step |
338 : | edgomez | 1.2 | |
; mm3 = ((ij & kl) | st) & 1, then subtracted from (s+t+1)/2
339 : | edgomez | 1.3 | pand mm3, mm1 |
340 : | movq mm6, mm2 | ||
341 : | pxor mm6, mm0 | ||
342 : | por mm3, mm6 | ||
343 : | pavgb mm2, mm0 | ||
; mm6 is free again: preload the next source row for the xor further down
344 : | Isibaar | 1.11 | movq mm6, [_EAX] |
345 : | edgomez | 1.3 | pand mm3, mm7 |
346 : | psubb mm2, mm3 |
347 : | |||
348 : | Isibaar | 1.11 | movq [TMP0], mm2 |
349 : | edgomez | 1.3 | |
; second output row: same computation, with mm0/mm1 as the upper row and
; the freshly loaded row as the lower one
350 : | Isibaar | 1.11 | movq mm2, [_EAX] |
351 : | movq mm3, [_EAX+1] | ||
352 : | edgomez | 1.3 | pavgb mm2, mm3 ; preserved for next iteration |
353 : | pxor mm3, mm6 ; preserved for next iteration | ||
354 : | |||
355 : | pand mm1, mm3 | ||
356 : | movq mm6, mm0 | ||
357 : | pxor mm6, mm2 | ||
358 : | por mm1, mm6 | ||
359 : | pavgb mm0, mm2 | ||
360 : | pand mm1, mm7 | ||
361 : | psubb mm0, mm1 | ||
362 : | Isibaar | 1.11 | movq [TMP0+TMP1], mm0 |
363 : | edgomez | 1.2 | %endmacro |
364 : | |||
; 2D (horizontal + vertical) half-pel interpolation of an 8x8 block.
; Arguments (see the C prototype above): prm1 = dst, prm2 = src,
; prm3 = stride, prm4 = rounding.
; Each COPY_HV_SSE_* expansion produces two rows, so four expansions cover
; the eight rows of the block.
365 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
366 : | edgomez | 1.2 | interpolate8x8_halfpel_hv_3dne: |
367 : | Isibaar | 1.11 | mov _EAX, prm2 ; Src |
368 : | mov TMP1, prm3 ; stride | ||
; ZF set here (rounding == 1) is consumed by the jz several instructions
; below; the mov/movq/pavgb/pxor in between do not modify the flags.
369 : | dec PTR_TYPE prm4 ; rounding | ||
370 : | edgomez | 1.2 | |
371 : | ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j | ||
372 : | Isibaar | 1.11 | movq mm2, [_EAX] |
373 : | movq mm3, [_EAX+1] | ||
374 : | edgomez | 1.2 | movq mm6, mm2 |
375 : | pavgb mm2, mm3 | ||
376 : | edgomez | 1.3 | pxor mm3, mm6 ; mm2/mm3 ready |
377 : | Isibaar | 1.11 | mov TMP0, prm1 ; Dst |
; mm7 = 0x01 in every byte, the LSB mask both COPY_HV_SSE_* macros expect
378 : | edgomez | 1.2 | movq mm7, [mmx_one] |
379 : | |||
380 : | jz near .rounding1 | ||
; The lea below loads _EBP with its own value, so it changes nothing.
; NOTE(review): presumably padding for code alignment -- confirm.
381 : | Isibaar | 1.11 | lea _EBP,[byte _EBP] |
382 : | edgomez | 1.2 | COPY_HV_SSE_RND0 |
383 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
384 : | edgomez | 1.2 | COPY_HV_SSE_RND0 |
385 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
386 : | edgomez | 1.2 | COPY_HV_SSE_RND0 |
387 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
388 : | edgomez | 1.2 | COPY_HV_SSE_RND0 |
389 : | ret | ||
390 : | |||
391 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
392 : | Isibaar | 1.10 | .rounding1: |
393 : | edgomez | 1.2 | COPY_HV_SSE_RND1 |
394 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
395 : | edgomez | 1.2 | COPY_HV_SSE_RND1 |
396 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
397 : | edgomez | 1.2 | COPY_HV_SSE_RND1 |
398 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
399 : | edgomez | 1.2 | COPY_HV_SSE_RND1 |
400 : | edgomez | 1.3 | ret |
401 : | Isibaar | 1.10 | ENDFUNC |
402 : | edgomez | 1.6 | |
403 : | suxen_drol | 1.7 | ;----------------------------------------------------------------------------- |
404 : | ; | ||
405 : | ; void interpolate8x4_halfpel_h_3dne(uint8_t * const dst, | ||
406 : | ; const uint8_t * const src, | ||
407 : | ; const uint32_t stride, | ||
408 : | ; const uint32_t rounding); | ||
409 : | ; | ||
410 : | ;----------------------------------------------------------------------------- | ||
411 : | |||
; Horizontal half-pel interpolation of an 8-wide, 4-row block.
; Arguments (see the C prototype above): prm1 = dst, prm2 = src,
; prm3 = stride, prm4 = rounding.
; Same scheme as the 8x8 routine with two macro expansions (2 rows each)
; instead of four.
412 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
413 : | suxen_drol | 1.7 | interpolate8x4_halfpel_h_3dne: |
414 : | |||
415 : | Isibaar | 1.11 | mov _EAX, prm2 ; Src |
416 : | mov TMP1, prm3 ; stride | ||
; ZF is set when rounding-1 == 0, i.e. rounding == 1
417 : | dec PTR_TYPE prm4; rounding | ||
418 : | suxen_drol | 1.7 | |
419 : | jz .rounding1 | ||
; rounding != 1: plain pavgb (rounds up)
420 : | Isibaar | 1.11 | mov TMP0, prm1 ; Dst |
421 : | suxen_drol | 1.7 | |
422 : | COPY_H_SSE_RND0 0 | ||
423 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
424 : | suxen_drol | 1.7 | COPY_H_SSE_RND0 1 |
425 : | ret | ||
426 : | |||
427 : | Isibaar | 1.10 | .rounding1: |
428 : | suxen_drol | 1.7 | ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
429 : | Isibaar | 1.11 | mov TMP0, prm1 ; Dst |
; mm7 = 0x01 in every byte, the LSB mask COPY_H_SSE_RND1 expects
430 : | suxen_drol | 1.7 | movq mm7, [mmx_one] |
431 : | COPY_H_SSE_RND1 | ||
432 : | Isibaar | 1.11 | lea TMP0, [TMP0+2*TMP1] |
433 : | suxen_drol | 1.7 | COPY_H_SSE_RND1 |
434 : | ret | ||
435 : | Isibaar | 1.10 | ENDFUNC |
436 : | suxen_drol | 1.7 | |
437 : | ;----------------------------------------------------------------------------- | ||
438 : | ; | ||
439 : | ; void interpolate8x4_halfpel_v_3dne(uint8_t * const dst, | ||
440 : | ; const uint8_t * const src, | ||
441 : | ; const uint32_t stride, | ||
442 : | ; const uint32_t rounding); | ||
443 : | ; | ||
444 : | ;----------------------------------------------------------------------------- | ||
445 : | |||
; Vertical half-pel interpolation of an 8-wide, 4-row block.
; Arguments (see the C prototype above): prm1 = dst, prm2 = src,
; prm3 = stride, prm4 = rounding.
; Output row n is the average of source rows n and n+1, so five source rows
; (0..4) are read.
446 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
447 : | suxen_drol | 1.7 | interpolate8x4_halfpel_v_3dne: |
448 : | |||
449 : | Isibaar | 1.11 | mov _EAX, prm2 ; Src |
450 : | mov TMP1, prm3 ; stride | ||
; ZF is set when rounding-1 == 0, i.e. rounding == 1
451 : | dec PTR_TYPE prm4; rounding | ||
452 : | suxen_drol | 1.7 | |
453 : | ; we process 2 lines at a time | ||
454 : | |||
455 : | jz .rounding1 | ||
; ---- rounding != 1: out = pavgb(a, b) = (a+b+1)>>1 ----
456 : | pxor mm2,mm2 | ||
457 : | Isibaar | 1.11 | movq mm0, [_EAX] |
458 : | movq mm1, [_EAX+TMP1] | ||
459 : | por mm2, [_EAX+2*TMP1] ; Something like preload (pipelining) | ||
460 : | Isibaar | 1.11 | mov TMP0, prm1 ; Dst |
; mm0/mm1/mm2 = rows 0/1/2; _EAX -> row 2
461 : | lea _EAX, [_EAX+2*TMP1] | ||
462 : | suxen_drol | 1.7 | pxor mm4, mm4 |
463 : | pavgb mm0, mm1 | ||
464 : | pavgb mm1, mm2 | ||
; store output rows 0 and 1
465 : | Isibaar | 1.11 | movq [byte TMP0], mm0 |
466 : | movq [TMP0+TMP1], mm1 | ||
467 : | suxen_drol | 1.7 | |
; mm6 is cleared here but never read again before the ret on this path
468 : | pxor mm6, mm6 | ||
; _EAX -> row 3
469 : | Isibaar | 1.11 | add _EAX, TMP1 |
470 : | lea TMP0, [TMP0+2*TMP1] | ||
; mm3/mm4 = rows 3/4
471 : | movq mm3, [byte _EAX] | ||
472 : | por mm4, [_EAX+TMP1] | ||
; _EAX is not used again on this path after this update
473 : | lea _EAX, [_EAX+2*TMP1] | ||
474 : | suxen_drol | 1.7 | pavgb mm2, mm3 |
475 : | pavgb mm3, mm4 | ||
; store output rows 2 and 3
476 : | Isibaar | 1.11 | movq [TMP0], mm2 |
477 : | movq [TMP0+TMP1], mm3 | ||
478 : | suxen_drol | 1.7 | |
479 : | ret | ||
480 : | |||
481 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
482 : | Isibaar | 1.10 | .rounding1: |
; ---- rounding == 1: out = (a+b)>>1 ----
; Works on complemented pixels: pcmpeqb sets a register to all ones, and
; psubusb from all ones gives 255-x. pavgb of two complemented rows,
; subtracted again from 0xFF (mm_minusone), equals (a+b)>>1.
483 : | suxen_drol | 1.7 | pcmpeqb mm0, mm0 |
484 : | Isibaar | 1.11 | psubusb mm0, [_EAX] ; _EAX==line0 |
485 : | add _EAX, TMP1 ; _EAX==line1 | ||
486 : | mov TMP0, prm1 ; Dst | ||
487 : | suxen_drol | 1.7 | |
; _ESI will hold the address of mm_minusone; saved here, restored before ret
488 : | Isibaar | 1.11 | push _ESI |
489 : | suxen_drol | 1.7 | |
490 : | pcmpeqb mm1, mm1 | ||
491 : | pcmpeqb mm2, mm2 | ||
492 : | Isibaar | 1.13 | lea _ESI, [mm_minusone] |
493 : | Isibaar | 1.11 | psubusb mm1, [byte _EAX] ; line1 |
494 : | psubusb mm2, [_EAX+TMP1] ; line2 | ||
495 : | lea _EAX, [_EAX+2*TMP1] ; _EAX==line3 | ||
496 : | movq mm6, [_ESI] | ||
497 : | movq mm7, [_ESI] | ||
498 : | suxen_drol | 1.7 | pavgb mm0, mm1 |
499 : | pavgb mm1, mm2 | ||
; complement the averages back: 0xFF - avg
500 : | psubusb mm6, mm0 | ||
501 : | psubusb mm7, mm1 | ||
502 : | Isibaar | 1.11 | movq [TMP0], mm6 ; store line0 |
503 : | movq [TMP0+TMP1], mm7 ; store line1 | ||
504 : | suxen_drol | 1.7 | |
505 : | Isibaar | 1.11 | lea TMP0, [TMP0+2*TMP1] |
506 : | suxen_drol | 1.7 | pcmpeqb mm3, mm3 |
507 : | pcmpeqb mm4, mm4 | ||
508 : | Isibaar | 1.11 | psubusb mm3, [_EAX] ; line3 |
509 : | psubusb mm4, [_EAX+TMP1] ; line4 | ||
510 : | lea _EAX, [_EAX+2*TMP1] ; _EAX==line 5 | ||
511 : | suxen_drol | 1.7 | pavgb mm2, mm3 |
512 : | pavgb mm3, mm4 | ||
513 : | Isibaar | 1.11 | movq mm0, [_ESI] |
514 : | movq mm1, [_ESI] | ||
515 : | suxen_drol | 1.7 | psubusb mm0, mm2 |
516 : | psubusb mm1, mm3 | ||
; store output rows 2 and 3
517 : | Isibaar | 1.11 | movq [TMP0], mm0 |
518 : | movq [TMP0+TMP1], mm1 | ||
519 : | suxen_drol | 1.7 | |
520 : | Isibaar | 1.11 | pop _ESI |
521 : | suxen_drol | 1.7 | |
522 : | ret | ||
523 : | |||
524 : | Isibaar | 1.10 | ENDFUNC |
525 : | suxen_drol | 1.7 | |
526 : | ;----------------------------------------------------------------------------- | ||
527 : | ; | ||
528 : | ; void interpolate8x4_halfpel_hv_3dne(uint8_t * const dst, | ||
529 : | ; const uint8_t * const src, | ||
530 : | ; const uint32_t stride, | ||
531 : | ; const uint32_t rounding); | ||
532 : | ; | ||
533 : | ; | ||
534 : | ;----------------------------------------------------------------------------- | ||
535 : | |||
; 2D (horizontal + vertical) half-pel interpolation of an 8-wide, 4-row
; block.
; Arguments (see the C prototype above): prm1 = dst, prm2 = src,
; prm3 = stride, prm4 = rounding.
; Same scheme as the 8x8 routine with two macro expansions (2 rows each)
; instead of four.
536 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
537 : | suxen_drol | 1.7 | interpolate8x4_halfpel_hv_3dne: |
538 : | Isibaar | 1.11 | mov _EAX, prm2 ; Src |
539 : | mov TMP1, prm3 ; stride | ||
; ZF set here (rounding == 1) is consumed by the jz several instructions
; below; the mov/movq/pavgb/pxor in between do not modify the flags.
540 : | dec PTR_TYPE prm4 ; rounding | ||
541 : | suxen_drol | 1.7 | |
542 : | ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j | ||
543 : | Isibaar | 1.11 | movq mm2, [_EAX] |
544 : | movq mm3, [_EAX+1] | ||
545 : | suxen_drol | 1.7 | movq mm6, mm2 |
546 : | pavgb mm2, mm3 | ||
547 : | pxor mm3, mm6 ; mm2/mm3 ready | ||
548 : | Isibaar | 1.11 | mov TMP0, prm1 ; Dst |
; mm7 = 0x01 in every byte, the LSB mask both COPY_HV_SSE_* macros expect
549 : | suxen_drol | 1.7 | movq mm7, [mmx_one] |
550 : | |||
551 : | jz near .rounding1 | ||
; The lea below loads _EBP with its own value, so it changes nothing.
; NOTE(review): presumably padding for code alignment -- confirm.
552 : | Isibaar | 1.11 | lea _EBP,[byte _EBP] |
553 : | suxen_drol | 1.7 | COPY_HV_SSE_RND0 |
554 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
555 : | suxen_drol | 1.7 | COPY_HV_SSE_RND0 |
556 : | ret | ||
557 : | |||
558 : | Isibaar | 1.11 | ALIGN SECTION_ALIGN |
559 : | Isibaar | 1.10 | .rounding1: |
560 : | suxen_drol | 1.7 | COPY_HV_SSE_RND1 |
561 : | Isibaar | 1.11 | lea TMP0,[TMP0+2*TMP1] |
562 : | suxen_drol | 1.7 | COPY_HV_SSE_RND1 |
563 : | ret | ||
564 : | Isibaar | 1.10 | ENDFUNC |
565 : | suxen_drol | 1.7 | |
566 : | Isibaar | 1.14 | NON_EXEC_STACK |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |