Parent Directory | Revision Log
Revision 1.3 - (view) (download)
1 : | edgomez | 1.1 | /***************************************************************************** |
2 : | * | ||
3 : | * XVID MPEG-4 VIDEO CODEC | ||
4 : | * - 8x8 block-based halfpel interpolation with altivec optimization - | ||
5 : | * | ||
6 : | * Copyright(C) 2004 Christoph Naegeli <chn@kbw.ch> | ||
7 : | * | ||
8 : | * This program is free software ; you can redistribute it and/or modify | ||
9 : | * it under the terms of the GNU General Public License as published by | ||
10 : | * the Free Software Foundation ; either version 2 of the License, or | ||
11 : | * (at your option) any later version. | ||
12 : | * | ||
13 : | * This program is distributed in the hope that it will be useful, | ||
14 : | * but WITHOUT ANY WARRANTY ; without even the implied warranty of | ||
15 : | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 : | * GNU General Public License for more details. | ||
17 : | * | ||
18 : | * You should have received a copy of the GNU General Public License | ||
19 : | * along with this program ; if not, write to the Free Software | ||
20 : | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
21 : | * | ||
22 : | * $Id$ | ||
23 : | * | ||
24 : | ****************************************************************************/ | ||
25 : | |||
26 : | |||
27 : | #ifdef HAVE_ALTIVEC_H | ||
28 : | #include <altivec.h> | ||
29 : | #endif | ||
30 : | |||
31 : | #include "../../portab.h" | ||
32 : | |||
33 : | #undef DEBUG | ||
34 : | #include <stdio.h> | ||
35 : | |||
/*
 * Build a 32-bit control word for the AltiVec data-stream-touch
 * (vec_dst family) prefetch instructions.
 *
 * block_size:  vectors per block; values > 31 are clamped to 0,
 *              which the hardware interprets as the maximum (32)
 * block_count: number of blocks (bits 16..23)
 * stride:      signed byte distance between blocks, stored in the
 *              low 16 bits of the control word
 *
 * Returns the packed control word: size<<24 | count<<16 | stride.
 */
static inline unsigned
build_prefetch(unsigned char block_size, unsigned char block_count, short stride)
{
	if(block_size > 31)
		block_size = 0;

	/* Mask the (possibly negative) stride to its 16-bit field so the
	 * sign extension of a negative short cannot clobber the count and
	 * size fields above it. */
	return (((unsigned)block_size << 24) |
	        ((unsigned)block_count << 16) |
	        ((unsigned)stride & 0xFFFFu));
}
44 : | |||
/* Expands to nothing; plugged into the interpolation macros for the
 * rounding == 0 variant. */
#define NO_ROUNDING

/* Rounding correction for rounding == 1.  vec_avg computes
 * (a + b + 1) >> 1 (rounds up); the MPEG-4 rounding case wants
 * (a + b) >> 1, so subtract 1 from every lane where a + b was odd.
 * The parity is (s1 + s2) & 1 per byte lane.  NOTE: clobbers s1. */
#define ROUNDING \
s1 = vec_and(vec_add(s1, s2), vec_splat_u8(1)); \
d = vec_sub(d, s1);

/* One row of horizontal halfpel interpolation:
 *  - load 16 unaligned source bytes into s1
 *  - s2 = the same row advanced by one byte (via the precomputed
 *    s2_mask rotate permute)
 *  - d = vec_avg(s1, s2), then apply `round` (ROUNDING or NO_ROUNDING)
 *  - rotate d to dst's alignment and use vec_sel with mask_stencil to
 *    write only the 8 block bytes of dst's 16-byte line, preserving
 *    the neighbouring 8 bytes
 *  - advance dst and src to the next row.
 * Requires s1, s2, d, mask, s2_mask, mask_stencil, dst, src, stride
 * in scope at the expansion site. */
#define INTERPLATE8X8_HALFPEL_H(round) \
s1 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
s2 = vec_perm(s1, s1, s2_mask); \
d = vec_avg(s1, s2); \
round; \
mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst)); \
d = vec_perm(d, d, vec_lvsl(0, dst)); \
d = vec_sel(d, vec_ld(0, dst), mask); \
vec_st(d, 0, dst); \
dst += stride; \
src += stride
62 : | |||
63 : | |||
/* Horizontal halfpel interpolation of an 8x8 block:
 *   dst[y][x] = (src[y][x] + src[y][x+1] + 1 - rounding) >> 1
 * The eight rows are fully unrolled via INTERPLATE8X8_HALFPEL_H.
 *
 * This function assumes:
 *	dst is 8 byte aligned
 *	src is unaligned
 *	stride is a multiple of 8
 */
void
interpolate8x8_halfpel_h_altivec_c( uint8_t *dst,
                                    uint8_t *src,
                                    const uint32_t stride,
                                    const uint32_t rounding)
{
	register vector unsigned char s1, s2;       /* source row / row shifted by one byte */
	register vector unsigned char d;            /* interpolated result */
	register vector unsigned char mask;         /* per-row select mask aligned to dst */
	register vector unsigned char s2_mask;      /* rotate-left-by-one-byte permute */
	register vector unsigned char mask_stencil; /* 8 bytes 0x00 then 8 bytes 0xff */

#ifdef DEBUG
	/* Dump alignment errors if DEBUG is defined */
	if(((unsigned long)dst) & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_h_altivec_c:incorrect align, dst: %lx\n", (long)dst);
	if(stride & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_h_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

	/* vec_lvsl(1, NULL) yields the permute {1,2,...,16}: applied to a
	 * row it produces the same row advanced by one byte. */
	s2_mask = vec_lvsl(1, (unsigned char*)0);
	/* Stencil for vec_sel: keeps the 8 bytes of the destination cache
	 * line that do not belong to this 8-byte-wide block. */
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

	if(rounding) {
		INTERPLATE8X8_HALFPEL_H(ROUNDING);
		INTERPLATE8X8_HALFPEL_H(ROUNDING);
		INTERPLATE8X8_HALFPEL_H(ROUNDING);
		INTERPLATE8X8_HALFPEL_H(ROUNDING);

		INTERPLATE8X8_HALFPEL_H(ROUNDING);
		INTERPLATE8X8_HALFPEL_H(ROUNDING);
		INTERPLATE8X8_HALFPEL_H(ROUNDING);
		INTERPLATE8X8_HALFPEL_H(ROUNDING);
	}
	else {
		INTERPLATE8X8_HALFPEL_H(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_H(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_H(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_H(NO_ROUNDING);

		INTERPLATE8X8_HALFPEL_H(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_H(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_H(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_H(NO_ROUNDING);
	}
}
115 : | |||
/* One row of vertical halfpel interpolation: load the unaligned rows
 * at src and src + stride, average them with vec_avg (rounds up),
 * optionally apply the ROUNDING correction, then merge the 8 result
 * bytes into dst's 16-byte line (mask_stencil/vec_sel preserve the
 * other 8 bytes) and advance dst/src by one row.  Requires s1, s2, d,
 * mask, mask_stencil, dst, src, stride in scope. */
#define INTERPLATE8X8_HALFPEL_V(round) \
s1 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
s2 = vec_perm(vec_ld(0, src + stride), vec_ld(16, src + stride), vec_lvsl(0, src + stride)); \
d = vec_avg(s1, s2); \
round; \
mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst)); \
d = vec_perm(d, d, vec_lvsl(0, dst)); \
d = vec_sel(d, vec_ld(0, dst), mask); \
vec_st(d, 0, dst); \
dst += stride; \
src += stride
127 : | |||
/* Vertical halfpel interpolation of an 8x8 block:
 *   dst[y][x] = (src[y][x] + src[y+1][x] + 1 - rounding) >> 1
 * The eight rows are fully unrolled via INTERPLATE8X8_HALFPEL_V.
 *
 * This function assumes
 *	dst is 8 byte aligned
 *	src is unaligned
 *	stride is a multiple of 8
 */
void
interpolate8x8_halfpel_v_altivec_c( uint8_t *dst,
                                    uint8_t *src,
                                    const uint32_t stride,
                                    const uint32_t rounding)
{
	vector unsigned char s1, s2;       /* current row / next row */
	vector unsigned char d;            /* interpolated result */
	vector unsigned char mask;         /* per-row select mask aligned to dst */
	vector unsigned char mask_stencil; /* 8 bytes 0x00 then 8 bytes 0xff */

#ifdef DEBUG
	/* if this is on, print alignment errors */
	if(((unsigned long)dst) & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_v_altivec_c:incorrect align, dst: %lx\n", (long)dst);
	if(stride & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_v_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

	/* Stencil for vec_sel: keeps the 8 bytes of the destination cache
	 * line that do not belong to this block. */
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

	if(rounding) {
		INTERPLATE8X8_HALFPEL_V(ROUNDING);
		INTERPLATE8X8_HALFPEL_V(ROUNDING);
		INTERPLATE8X8_HALFPEL_V(ROUNDING);
		INTERPLATE8X8_HALFPEL_V(ROUNDING);

		INTERPLATE8X8_HALFPEL_V(ROUNDING);
		INTERPLATE8X8_HALFPEL_V(ROUNDING);
		INTERPLATE8X8_HALFPEL_V(ROUNDING);
		INTERPLATE8X8_HALFPEL_V(ROUNDING);
	}
	else {
		INTERPLATE8X8_HALFPEL_V(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_V(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_V(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_V(NO_ROUNDING);

		INTERPLATE8X8_HALFPEL_V(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_V(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_V(NO_ROUNDING);
		INTERPLATE8X8_HALFPEL_V(NO_ROUNDING);
	}
}
178 : | |||
179 : | |||
/* One row of horizontal+vertical halfpel interpolation.  The four
 * neighbours src[x], src[x+1], src[x+stride], src[x+stride+1] are
 * zero-extended to 16 bits (vec_mergeh with zerovec), summed, the
 * rounding term `adding` (1 when rounding, 2 otherwise) is added, and
 * the sum is shifted right by 2.  vec_pack(s1, s1) replicates the 8
 * result bytes into both halves of t, so no extra alignment permute of
 * t is needed before the vec_sel merge into dst's line.  Requires t,
 * s1..s4, mask, zerovec, two, mask_stencil, dst, src, stride in scope. */
#define INTERPOLATE8X8_HALFPEL_HV(adding) \
t = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
s1 = (vector unsigned short)vec_mergeh(zerovec, t); \
t = vec_perm(vec_ld(1, src), vec_ld(17, src), vec_lvsl(1, src)); \
s2 = (vector unsigned short)vec_mergeh(zerovec, t); \
t = vec_perm(vec_ld(0, src + stride), vec_ld(16, src + stride), vec_lvsl(0, src + stride)); \
s3 = (vector unsigned short)vec_mergeh(zerovec, t); \
t = vec_perm(vec_ld(1, src + stride), vec_ld(17, src + stride), vec_lvsl(1, src + stride)); \
s4 = (vector unsigned short)vec_mergeh(zerovec, t); \
s1 = vec_add(s1,s2);\
s3 = vec_add(s3,s4);\
s1 = vec_add(s1,s3);\
s1 = vec_add(s1, adding); \
s1 = vec_sr(s1, two); \
t = vec_pack(s1, s1); \
mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst)); \
t = vec_sel(t, vec_ld(0, dst), mask); \
vec_st(t, 0, dst); \
dst += stride; \
src += stride
200 : | |||
/* Diagonal (h+v) halfpel interpolation of an 8x8 block:
 *   dst[y][x] = (src[y][x] + src[y][x+1] + src[y+1][x] + src[y+1][x+1]
 *                + 2 - rounding) >> 2
 * implemented by selecting the rounding constant (1 or 2) passed to
 * INTERPOLATE8X8_HALFPEL_HV.  Same alignment assumptions as the other
 * halfpel functions: dst 8-byte aligned, src unaligned, stride a
 * multiple of 8. */
void
interpolate8x8_halfpel_hv_altivec_c(uint8_t *dst,
                                    uint8_t *src,
                                    const uint32_t stride,
                                    const uint32_t rounding)
{
	vector unsigned short s1, s2, s3, s4; /* zero-extended neighbour rows */
	vector unsigned char t;               /* raw loads / packed result */
	vector unsigned short one, two;       /* rounding adders: 1 (rounding) / 2 */
	vector unsigned char zerovec;
	vector unsigned char mask;            /* per-row select mask aligned to dst */
	vector unsigned char mask_stencil;    /* 8 bytes 0x00 then 8 bytes 0xff */

	/* Initialisation stuff */
	zerovec = vec_splat_u8(0);
	one = vec_splat_u16(1);
	two = vec_splat_u16(2); /* also reused as the >>2 shift count */
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

	if(rounding) {
		INTERPOLATE8X8_HALFPEL_HV(one);
		INTERPOLATE8X8_HALFPEL_HV(one);
		INTERPOLATE8X8_HALFPEL_HV(one);
		INTERPOLATE8X8_HALFPEL_HV(one);

		INTERPOLATE8X8_HALFPEL_HV(one);
		INTERPOLATE8X8_HALFPEL_HV(one);
		INTERPOLATE8X8_HALFPEL_HV(one);
		INTERPOLATE8X8_HALFPEL_HV(one);
	}
	else {
		INTERPOLATE8X8_HALFPEL_HV(two);
		INTERPOLATE8X8_HALFPEL_HV(two);
		INTERPOLATE8X8_HALFPEL_HV(two);
		INTERPOLATE8X8_HALFPEL_HV(two);

		INTERPOLATE8X8_HALFPEL_HV(two);
		INTERPOLATE8X8_HALFPEL_HV(two);
		INTERPOLATE8X8_HALFPEL_HV(two);
		INTERPOLATE8X8_HALFPEL_HV(two);
	}
}
243 : | |||
/* Average two 8-pixel-wide source blocks into dst:
 *   dst[y][x] = (src1[y][x] + src2[y][x] + 1 - rounding) >> 1
 * for `height` rows.
 *
 * This function assumes:
 *	dst is 8 byte aligned
 *	src1 is unaligned
 *	src2 is unaligned
 *	stride is a multiple of 8
 *	rounding is smaller than max signed short + 2
 */

void
interpolate8x8_avg2_altivec_c( uint8_t *dst,
                               const uint8_t *src1,
                               const uint8_t *src2,
                               const uint32_t stride,
                               const uint32_t rounding,
                               const uint32_t height)
{
	uint32_t i;
	vector unsigned char t;            /* raw loads / packed result */
	vector unsigned char mask;         /* per-row select mask aligned to dst */
	vector unsigned char mask_stencil; /* 8 bytes 0x00 then 8 bytes 0xff */
	vector unsigned char zerovec;
	vector signed short s1, s2;        /* zero-extended source rows */
	vector signed short d;             /* 16-bit accumulator */
	vector signed short round;         /* splatted (1 - rounding) */

#ifdef DEBUG
	/* If this is on, print alignment errors */
	if(((unsigned long)dst) & 0x7)
		fprintf(stderr, "interpolate8x8_avg2_altivec_c:incorrect align, dst: %lx\n", (long)dst);
	if(stride & 0x7)
		fprintf(stderr, "interpolate8x8_avg2_altivec_c:incorrect stride, stride: %u\n", stride);
	if(rounding > (32767 + 2))
		fprintf(stderr, "interpolate8x8_avg2_altivec_c:incorrect rounding, rounding: %d\n", rounding);
#endif

	/* initialisation */
	zerovec = vec_splat_u8(0);
	/* Scalar store into the first element of the vector, then splat it
	 * to all lanes.  Assumes element 0 starts at the vector's lowest
	 * address (big-endian PPC layout) — TODO confirm if ever ported. */
	*((short*)&round) = 1 - rounding;
	round = vec_splat(round, 0);
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

	for(i = 0; i < height; i++) {

		/* d = (1 - rounding) + src1 + src2, in 16-bit lanes */
		t = vec_perm(vec_ld(0, src1), vec_ld(16, src1), vec_lvsl(0, src1));
		d = vec_add((vector signed short)zerovec, round);
		s1 = (vector signed short)vec_mergeh(zerovec, t);

		t = vec_perm(vec_ld(0, src2), vec_ld(16, src2), vec_lvsl(0, src2));
		d = vec_add(d, s1);
		s2 = (vector signed short)vec_mergeh(zerovec, t);

		d = vec_add(d, s2);
		d = vec_sr(d, vec_splat_u16(1));

		/* Pack back to bytes, rotate to dst's alignment, and write only
		 * the 8 block bytes of dst's cache line. */
		t = vec_pack((vector unsigned short)d, (vector unsigned short)zerovec);
		mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst));
		t = vec_perm(t, t, vec_lvsl(0, dst));
		t = vec_sel(t, vec_ld(0, dst), mask);
		vec_st(t, 0, dst);

		dst += stride;
		src1 += stride;
		src2 += stride;
	}
}
310 : | |||
311 : | |||
/* One row of the 4-way average: start the 16-bit accumulator from the
 * splatted rounding term r = (2 - rounding), add the four zero-extended
 * source rows, shift right by 2, pack back to bytes and merge the 8
 * result bytes into dst's line, then advance all five pointers by one
 * row.  Requires d, r, s, t, shift, zerovec, mask, mask_stencil,
 * dst, src1..src4, stride in scope. */
#define INTERPOLATE8X8_AVG4() \
d = r; \
\
t = vec_perm(vec_ld(0, src1), vec_ld(16, src1), vec_lvsl(0, src1)); \
s = (vector signed short)vec_mergeh(zerovec, t); \
d = vec_add(d, s); \
\
t = vec_perm(vec_ld(0, src2), vec_ld(16, src2), vec_lvsl(0, src2)); \
s = (vector signed short)vec_mergeh(zerovec, t); \
d = vec_add(d, s); \
\
t = vec_perm(vec_ld(0, src3), vec_ld(16, src3), vec_lvsl(0, src3)); \
s = (vector signed short)vec_mergeh(zerovec, t); \
d = vec_add(d, s); \
\
t = vec_perm(vec_ld(0, src4), vec_ld(16, src4), vec_lvsl(0, src4)); \
s = (vector signed short)vec_mergeh(zerovec, t); \
d = vec_add(d, s); \
\
d = vec_sr(d, shift); \
\
t = vec_pack((vector unsigned short)d, (vector unsigned short)zerovec); \
mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst)); \
t = vec_perm(t, t, vec_lvsl(0, dst)); \
t = vec_sel(t, vec_ld(0, dst), mask); \
vec_st(t, 0, dst); \
\
dst += stride; \
src1 += stride; \
src2 += stride; \
src3 += stride; \
src4 += stride
344 : | |||
/* Average four 8-pixel-wide source blocks into dst:
 *   dst[y][x] = (src1 + src2 + src3 + src4 + 2 - rounding) >> 2
 * for 8 rows (fully unrolled via INTERPOLATE8X8_AVG4).
 *
 * This function assumes:
 *	dst is 8 byte aligned
 *	src1, src2, src3, src4 are unaligned
 *	stride is a multiple of 8
 */

void
interpolate8x8_avg4_altivec_c(uint8_t *dst,
                              const uint8_t *src1, const uint8_t *src2,
                              const uint8_t *src3, const uint8_t *src4,
                              const uint32_t stride, const uint32_t rounding)
{
	vector signed short r;                      /* splatted (2 - rounding) */
	register vector signed short s, d;          /* widened source / accumulator */
	register vector unsigned short shift;       /* >>2 shift count */
	register vector unsigned char t;
	register vector unsigned char zerovec;
	register vector unsigned char mask;         /* per-row select mask */
	register vector unsigned char mask_stencil; /* 8 bytes 0x00 then 8 bytes 0xff */

#ifdef DEBUG
	/* if debug is set, print alignment errors */
	if(((unsigned)dst) & 0x7)
		fprintf(stderr, "interpolate8x8_avg4_altivec_c:incorrect align, dst: %lx\n", (long)dst);
	if(stride & 0x7)
		fprintf(stderr, "interpolate8x8_avg4_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

	/* Initialization */
	zerovec = vec_splat_u8(0);
	/* Scalar store into element 0 then splat — assumes big-endian
	 * element layout, same trick as interpolate8x8_avg2_altivec_c. */
	*((short*)&r) = 2 - rounding;
	r = vec_splat(r, 0);
	shift = vec_splat_u16(2);
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

	/* interpolate */
	INTERPOLATE8X8_AVG4();
	INTERPOLATE8X8_AVG4();
	INTERPOLATE8X8_AVG4();
	INTERPOLATE8X8_AVG4();

	INTERPOLATE8X8_AVG4();
	INTERPOLATE8X8_AVG4();
	INTERPOLATE8X8_AVG4();
	INTERPOLATE8X8_AVG4();
}
391 : | |||
392 : | edgomez | 1.2 | /* |
393 : | * This function assumes: | ||
394 : | * dst is 8 byte aligned | ||
395 : | * src is unaligned | ||
396 : | * stirde is a multiple of 8 | ||
397 : | * rounding is ignored | ||
398 : | */ | ||
399 : | void | ||
400 : | interpolate8x8_halfpel_add_altivec_c(uint8_t *dst, const uint8_t *src, const uint32_t stride, const uint32_t rouding) | ||
401 : | { | ||
402 : | interpolate8x8_avg2_altivec_c(dst, dst, src, stride, 0, 8); | ||
403 : | } | ||
404 : | |||
/* One row of "horizontal halfpel then average with dst", rounding
 * variant:
 *   interp  = (src[i] + src[i+1]) >> 1      (exact floor, see below)
 *   dst[i]  = (dst[i] + interp + 1) >> 1    (vec_avg rounds up)
 * vec_avg alone gives (a + b + 1) >> 1; the floor is recovered by
 * subtracting the parity bit ((s1 ^ s2) & 1).  Clobbers s1/s2/tmp. */
#define INTERPOLATE8X8_HALFPEL_H_ADD_ROUND() \
mask_dst = vec_lvsl(0,dst); \
s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src)); \
d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst); \
\
s2 = vec_perm(s1,s1,rot1);\
tmp = vec_avg(s1,s2);\
s1 = vec_xor(s1,s2);\
s1 = vec_sub(tmp,vec_and(s1,one));\
\
d = vec_avg(s1,d);\
\
mask = vec_perm(mask_stencil, mask_stencil, mask_dst); \
d = vec_perm(d,d,mask_dst); \
d = vec_sel(d,vec_ld(0,dst),mask); \
vec_st(d,0,dst); \
\
dst += stride; \
src += stride

/* Same as above without the floor correction: both stages use the
 * rounding-up vec_avg, i.e.
 *   dst[i] = (dst[i] + ((src[i] + src[i+1] + 1) >> 1) + 1) >> 1. */
#define INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND() \
mask_dst = vec_lvsl(0,dst); \
s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src)); \
d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst); \
\
s1 = vec_avg(s1, vec_perm(s1,s1,rot1));\
d = vec_avg(s1,d);\
\
mask = vec_perm(mask_stencil,mask_stencil,mask_dst);\
d = vec_perm(d,d,mask_dst);\
d = vec_sel(d,vec_ld(0,dst),mask);\
vec_st(d,0,dst);\
\
dst += stride;\
src += stride
440 : | |||
/* Horizontal halfpel interpolation of src averaged into dst, 8 rows
 * fully unrolled.  The `rounding` flag selects the floor-corrected
 * interpolation stage (see the _ROUND macro above).
 *
 * This function assumes:
 *	dst is 8 byte aligned
 *	src is unaligned
 *	stride is a multiple of 8
 */
void
interpolate8x8_halfpel_h_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
	register vector unsigned char s1,s2; /* source row / row shifted by one */
	register vector unsigned char d;     /* destination row / result */
	register vector unsigned char tmp;

	register vector unsigned char mask_dst; /* dst alignment permute */
	register vector unsigned char one;      /* per-byte constant 1 */
	register vector unsigned char rot1;     /* rotate-left-by-one-byte permute */

	register vector unsigned char mask_stencil; /* 8 bytes 0x00 then 8 bytes 0xff */
	register vector unsigned char mask;

#ifdef DEBUG
	if(((unsigned)dst) & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_h_add_altivec_c:incorrect align, dst: %lx\n", (long)dst);
	if(stride & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_h_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

	/* initialization */
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
	one = vec_splat_u8(1);
	rot1 = vec_lvsl(1,(unsigned char*)0);

	if(rounding) {
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();

		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
	}
	else {

		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();

		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
	}
}
497 : | |||
498 : | |||
499 : | edgomez | 1.1 | |
500 : | |||
/* One row of "vertical halfpel then average with dst", rounding
 * variant.  s1 holds the previous source row and is carried across
 * expansions (the caller loads the first row by hand); src is advanced
 * first, the new row is loaded into s2, then:
 *   interp = (s1 + s2) >> 1                 (floor via avg/xor trick)
 *   dst    = (dst + interp + 1) >> 1        (vec_avg rounds up)
 * Finally s1 = s2 so the next expansion reuses this row. */
#define INTERPOLATE8X8_HALFPEL_V_ADD_ROUND()\
src += stride;\
mask_dst = vec_lvsl(0,dst);\
s2 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));\
d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst);\
\
tmp = vec_avg(s1,s2);\
s1 = vec_xor(s1,s2);\
s1 = vec_sub(tmp,vec_and(s1,vec_splat_u8(1)));\
d = vec_avg(s1,d);\
\
mask = vec_perm(mask_stencil,mask_stencil,mask_dst);\
d = vec_perm(d,d,mask_dst);\
d = vec_sel(d,vec_ld(0,dst),mask);\
vec_st(d,0,dst);\
\
s1 = s2;\
\
dst += stride

/* Same as above without the floor correction: both stages use the
 * rounding-up vec_avg. */
#define INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND()\
src += stride;\
mask_dst = vec_lvsl(0,dst);\
s2 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));\
d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst);\
\
s1 = vec_avg(s1,s2);\
d = vec_avg(s1,d);\
\
mask = vec_perm(mask_stencil,mask_stencil,mask_dst);\
d = vec_perm(d,d,mask_dst);\
d = vec_sel(d,vec_ld(0,dst),mask);\
vec_st(d,0,dst);\
\
s1 = s2;\
dst += stride
537 : | |||
/* Vertical halfpel interpolation of src averaged into dst.  The first
 * source row is loaded once by hand; each macro expansion then loads
 * only the next row and reuses the previous one via s1.
 *
 * This function assumes:
 *	dst: 8 byte aligned
 *	src: unaligned
 *	stride is a multiple of 8
 */

void
interpolate8x8_halfpel_v_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
	register vector unsigned char s1,s2; /* previous row / current row */
	register vector unsigned char tmp;
	register vector unsigned char d;     /* destination row / result */

	register vector unsigned char mask;
	register vector unsigned char mask_dst;     /* dst alignment permute */
	register vector unsigned char mask_stencil; /* 8 bytes 0x00 then 8 bytes 0xff */

#ifdef DEBUG
	if(((unsigned)dst) & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_v_add_altivec_c:incorrect align, dst: %lx\n", (long)dst);
	if(stride & 0x7)
		/* NOTE(review): message text says "align, dst" but this reports
		 * the stride value. */
		fprintf(stderr, "interpolate8x8_halfpel_v_add_altivec_c:incorrect align, dst: %u\n", stride);
#endif

	/* initialization */
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

	if(rounding) {

		/* Interpolate vertical with rounding */
		s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));

		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();

		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
	}
	else {
		/* Interpolate vertical without rounding */
		s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));

		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();

		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
	}
}
596 : | |||
597 : | |||
598 : | |||
/* One row of "diagonal halfpel then average with dst", rounding
 * variant.  c00/c01 (current row and its 1-byte-shifted copy) are
 * carried across expansions; src is advanced first and the next row
 * pair c10/c11 is loaded.  All four are zero-extended to 16 bits and:
 *   interp = (c00 + c01 + c10 + c11 + 1) >> 2
 *   dst    = (interp + dst) >> 1        (truncating add, no +1 here)
 * The 16-bit path keeps precision through the second stage.  Finally
 * the row pair is rolled (c00 = c10, c01 = c11). */
#define INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND()\
src += stride;\
mask_dst = vec_lvsl(0,dst);\
c10 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));\
d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst);\
c11 = vec_perm(c10,c10,rot1);\
\
s00 = (vector unsigned short)vec_mergeh(zero,c00);\
s01 = (vector unsigned short)vec_mergeh(zero,c01);\
s10 = (vector unsigned short)vec_mergeh(zero,c10);\
s11 = (vector unsigned short)vec_mergeh(zero,c11);\
\
s00 = vec_add(s00,s10);\
s01 = vec_add(s01,s11);\
s00 = vec_add(s00,s01);\
s00 = vec_add(s00,one);\
\
s00 = vec_sr(s00,two);\
s00 = vec_add(s00, (vector unsigned short)vec_mergeh(zero,d));\
s00 = vec_sr(s00,one);\
\
d = vec_pack(s00,s00);\
mask = vec_perm(mask_stencil,mask_stencil,mask_dst);\
d = vec_sel(d,vec_ld(0,dst),mask);\
vec_st(d,0,dst);\
\
c00 = c10;\
c01 = c11;\
dst += stride


/* No-rounding variant:
 *   interp = (c00 + c01 + c10 + c11 + 2) >> 2
 *   dst    = (dst + interp + 1) >> 1    (byte-wide vec_avg, rounds up)
 * and the row pair is rolled as above. */
#define INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND()\
src += stride;\
mask_dst = vec_lvsl(0,dst);\
c10 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));\
d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst);\
c11 = vec_perm(c10,c10,rot1);\
\
s00 = (vector unsigned short)vec_mergeh(zero,c00);\
s01 = (vector unsigned short)vec_mergeh(zero,c01);\
s10 = (vector unsigned short)vec_mergeh(zero,c10);\
s11 = (vector unsigned short)vec_mergeh(zero,c11);\
\
s00 = vec_add(s00,s10);\
s01 = vec_add(s01,s11);\
s00 = vec_add(s00,s01);\
s00 = vec_add(s00,two);\
s00 = vec_sr(s00,two);\
\
c00 = vec_pack(s00,s00);\
d = vec_avg(d,c00);\
\
mask = vec_perm(mask_stencil,mask_stencil,mask_dst);\
d = vec_perm(d,d,mask_dst);\
d = vec_sel(d,vec_ld(0,dst),mask);\
vec_st(d,0,dst);\
\
c00 = c10;\
c01 = c11;\
dst += stride
659 : | |||
660 : | |||
/* Diagonal (h+v) halfpel interpolation of src averaged into dst.  The
 * first row pair (row and its 1-byte-shifted copy) is loaded once by
 * hand; each macro expansion loads only the next row and reuses the
 * previous pair via c00/c01.
 *
 * This function assumes:
 *	dst: 8 byte aligned
 *	src: unaligned
 *	stride: multiple of 8
 */

void
interpolate8x8_halfpel_hv_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
	register vector unsigned char c00,c10,c01,c11; /* prev/cur row, plain and shifted */
	register vector unsigned short s00,s10,s01,s11; /* zero-extended copies */
	register vector unsigned char d;

	register vector unsigned char mask;
	register vector unsigned char mask_stencil; /* 8 bytes 0x00 then 8 bytes 0xff */

	register vector unsigned char rot1;     /* rotate-left-by-one-byte permute */
	register vector unsigned char mask_dst; /* dst alignment permute */
	register vector unsigned char zero;
	register vector unsigned short one,two; /* shift counts / rounding adders */

#ifdef DEBUG
	if(((unsigned)dst) & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_hv_add_altivec_c:incorrect align, dst: %lx\n", (long)dst);
	if(stride & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_hv_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

	/* initialization */
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
	rot1 = vec_lvsl(1,(unsigned char*)0);
	zero = vec_splat_u8(0);
	one = vec_splat_u16(1);
	two = vec_splat_u16(2);

	if(rounding) {

		/* Load the first row 'manually' */
		c00 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));
		c01 = vec_perm(c00,c00,rot1);

		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();

		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
	}
	else {

		/* Load the first row 'manually' */
		c00 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));
		c01 = vec_perm(c00,c00,rot1);

		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();

		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
	}
}
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |