18 |
|
|
19 |
|
|
20 |
$Id$ |
$Id$ |
|
$Source$ |
|
|
$Date$ |
|
|
$Author$ |
|
|
|
|
21 |
*/ |
*/ |
22 |
|
|
23 |
#define G_REG |
#ifdef HAVE_ALTIVEC_H |
24 |
|
#include <altivec.h> |
|
#ifdef G_REG |
|
|
register vector unsigned char perm0 asm("%v29"); |
|
|
register vector unsigned char perm1 asm("%v30"); |
|
|
register vector unsigned int zerovec asm("%v31"); |
|
25 |
#endif |
#endif |
26 |
|
|
|
#include <stdio.h> |
|
|
|
|
|
#undef DEBUG |
|
|
|
|
|
static const vector unsigned char perms[2] = { |
|
|
(vector unsigned char) ( /* Used when cur is aligned */ |
|
|
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
|
|
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17), |
|
|
(vector unsigned char) ( /* Used when cur is unaligned */ |
|
|
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, |
|
|
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f), |
|
|
}; |
|
|
|
|
|
#ifdef G_REG |
|
|
void |
|
|
sadInit_altivec(void) |
|
|
{ |
|
|
perm0 = perms[0]; |
|
|
perm1 = perms[1]; |
|
|
zerovec = (vector unsigned int) (0); |
|
|
} |
|
|
static inline const vector unsigned char |
|
|
get_perm(unsigned long i) |
|
|
{ |
|
|
return i ? perm1 : perm0; |
|
|
} |
|
27 |
|
|
28 |
#define ZERODEF |
#include "../../portab.h" |
|
#define ZEROVEC zerovec |
|
|
#else |
|
|
void |
|
|
sadInit_altivec(void) |
|
|
{ |
|
|
} |
|
|
static inline const vector unsigned char |
|
|
get_perm(unsigned long i) |
|
|
{ |
|
|
return perms[i]; |
|
|
} |
|
29 |
|
|
30 |
#define ZERODEF vector unsigned int zerovec = (vector unsigned int)(0) |
/* no debugging by default */ |
31 |
#define ZEROVEC zerovec |
#undef DEBUG |
|
#endif |
|
32 |
|
|
33 |
|
#include <stdio.h> |
34 |
|
|
35 |
#define SAD16() \ |
#define SAD16() \ |
36 |
t1 = vec_perm(ref[0], ref[1], perm); /* align current vector */ \ |
t1 = vec_perm(ref[0], ref[1], perm); /* align current vector */ \ |
37 |
t2 = vec_max(t1, *cur); /* find largest of two */ \ |
t2 = vec_max(t1, *cur); /* find largest of two */ \ |
38 |
t3 = vec_min(t1, *cur); /* find smaller of two */ \ |
t1 = vec_min(t1, *cur); /* find smaller of two */ \ |
39 |
t4 = vec_sub(t2, t3); /* find absolute difference */ \ |
t1 = vec_sub(t2, t1); /* find absolute difference */ \ |
40 |
sad = vec_sum4s(t4, sad); /* accumulate sum of differences */ \ |
sad = vec_sum4s(t1, vec_splat_u32(0)); /* sum of differences */ \ |
41 |
|
sumdiffs = (vector unsigned int)vec_sums((vector signed int)sad, (vector signed int)sumdiffs); /* accumulate sumdiffs */ \ |
42 |
|
if(vec_any_ge(sumdiffs, best_vec)) \ |
43 |
|
goto bail; \ |
44 |
cur += stride; ref += stride; |
cur += stride; ref += stride; |
45 |
|
|
46 |
/* |
/* |
47 |
* This function assumes cur and stride are 16 bytes aligned and ref is unaligned |
* This function assumes cur and stride are 16 bytes aligned and ref is unaligned |
48 |
*/ |
*/ |
49 |
unsigned long |
|
50 |
sad16_altivec(const vector unsigned char *cur, |
uint32_t |
51 |
const vector unsigned char *ref, |
sad16_altivec_c(vector unsigned char *cur, |
52 |
unsigned long stride, |
vector unsigned char *ref, |
53 |
const unsigned long best_sad) |
uint32_t stride, |
54 |
|
const uint32_t best_sad) |
55 |
{ |
{ |
56 |
vector unsigned char perm; |
vector unsigned char perm; |
57 |
vector unsigned char t1, t2, t3, t4; |
vector unsigned char t1, t2; |
58 |
vector unsigned int sad; |
vector unsigned int sad; |
59 |
vector signed int sumdiffs, best_vec; |
vector unsigned int sumdiffs; |
60 |
unsigned long result; |
vector unsigned int best_vec; |
61 |
|
uint32_t result; |
62 |
|
|
|
ZERODEF; |
|
63 |
|
|
64 |
#ifdef DEBUG |
#ifdef DEBUG |
65 |
|
/* print alignment errors if DEBUG is on */ |
66 |
if (((unsigned long) cur) & 0xf) |
if (((unsigned long) cur) & 0xf) |
67 |
fprintf(stderr, "sad16_altivec:incorrect align, cur: %x\n", cur); |
fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (long)cur); |
|
// if (((unsigned long)ref) & 0xf) |
|
|
// fprintf(stderr, "sad16_altivec:incorrect align, ref: %x\n", ref); |
|
68 |
if (stride & 0xf) |
if (stride & 0xf) |
69 |
fprintf(stderr, "sad16_altivec:incorrect align, stride: %x\n", stride); |
fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", stride); |
70 |
#endif |
#endif |
71 |
/* initialization */ |
/* initialization */ |
72 |
sad = (vector unsigned int) (ZEROVEC); |
sad = vec_splat_u32(0); |
73 |
|
sumdiffs = sad; |
74 |
stride >>= 4; |
stride >>= 4; |
75 |
perm = vec_lvsl(0, (unsigned char *) ref); |
perm = vec_lvsl(0, (unsigned char *) ref); |
76 |
*((unsigned long *) &best_vec) = best_sad; |
*((uint32_t*)&best_vec) = best_sad; |
77 |
best_vec = vec_splat(best_vec, 0); |
best_vec = vec_splat(best_vec, 0); |
78 |
|
|
79 |
/* perform sum of differences between current and previous */ |
/* perform sum of differences between current and previous */ |
81 |
SAD16(); |
SAD16(); |
82 |
SAD16(); |
SAD16(); |
83 |
SAD16(); |
SAD16(); |
84 |
/* Temp sum for exit */ |
|
|
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
|
|
if (vec_all_ge(sumdiffs, best_vec)) |
|
|
goto bail; |
|
85 |
SAD16(); |
SAD16(); |
86 |
SAD16(); |
SAD16(); |
87 |
SAD16(); |
SAD16(); |
88 |
SAD16(); |
SAD16(); |
89 |
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
|
|
if (vec_all_ge(sumdiffs, best_vec)) |
|
|
goto bail; |
|
90 |
SAD16(); |
SAD16(); |
91 |
SAD16(); |
SAD16(); |
92 |
SAD16(); |
SAD16(); |
93 |
SAD16(); |
SAD16(); |
94 |
|
|
95 |
SAD16(); |
SAD16(); |
96 |
SAD16(); |
SAD16(); |
97 |
SAD16(); |
SAD16(); |
98 |
SAD16(); |
SAD16(); |
99 |
|
|
|
/* sum all parts of difference into one 32 bit quantity */ |
|
|
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
|
100 |
bail: |
bail: |
101 |
/* copy vector sum into unaligned result */ |
/* copy vector sum into unaligned result */ |
102 |
sumdiffs = vec_splat(sumdiffs, 3); |
sumdiffs = vec_splat(sumdiffs, 3); |
103 |
vec_ste(sumdiffs, 0, (int *) &result); |
vec_ste(sumdiffs, 0, (uint32_t*) &result); |
104 |
return (result); |
return result; |
105 |
} |
} |
106 |
|
|
107 |
|
|
108 |
#define SAD8() \ |
#define SAD8() \ |
109 |
t1 = vec_perm(cur[0], cur[stride], perm_cur); /* align current vector */ \ |
c = vec_perm(vec_ld(0,cur),vec_ld(16,cur),vec_lvsl(0,cur));\ |
110 |
t2 = vec_perm(ref[0], ref[1], perm_ref1); /* align current vector */ \ |
r = vec_perm(vec_ld(0,ref),vec_ld(16,ref),vec_lvsl(0,ref));\ |
111 |
tp = vec_perm(ref[stride], ref[stride+1], perm_ref1); /* align current vector */ \ |
c = vec_sub(vec_max(c,r),vec_min(c,r));\ |
112 |
t2 = vec_perm(t2,tp,perm_ref2); \ |
sad = vec_sum4s(c,sad);\ |
113 |
t3 = vec_max(t1, t2); /* find largest of two */ \ |
cur += stride;\ |
114 |
t4 = vec_min(t1, t2); /* find smaller of two */ \ |
ref += stride |
|
t5 = vec_sub(t3, t4); /* find absolute difference */ \ |
|
|
sad = vec_sum4s(t5, sad); /* accumulate sum of differences */ \ |
|
|
cur += stride<<1; ref += stride<<1; |
|
115 |
|
|
116 |
/* |
/* |
117 |
* This function assumes cur is 8 bytes aligned, stride is 16 bytes |
* This function assumes nothing |
|
* aligned and ref is unaligned |
|
118 |
*/ |
*/ |
|
unsigned long |
|
|
sad8_altivec(const vector unsigned char *cur, |
|
|
const vector unsigned char *ref, |
|
|
unsigned long stride) |
|
|
{ |
|
|
vector unsigned char t1, t2, t3, t4, t5, tp; |
|
|
vector unsigned int sad; |
|
|
vector signed int sumdiffs; |
|
|
vector unsigned char perm_cur; |
|
|
vector unsigned char perm_ref1, perm_ref2; |
|
|
unsigned long result; |
|
119 |
|
|
120 |
ZERODEF; |
uint32_t |
121 |
|
sad8_altivec_c(const uint8_t * cur, |
122 |
|
const uint8_t *ref, |
123 |
|
const uint32_t stride) |
124 |
|
{ |
125 |
|
uint32_t result = 0; |
126 |
|
|
127 |
#ifdef DEBUG |
register vector unsigned int sad; |
128 |
if (((unsigned long) cur) & 0x7) |
register vector unsigned char c; |
129 |
fprintf(stderr, "sad8_altivec:incorrect align, cur: %x\n", cur); |
register vector unsigned char r; |
|
// if (((unsigned long)ref) & 0x7) |
|
|
// fprintf(stderr, "sad8_altivec:incorrect align, ref: %x\n", ref); |
|
|
if (stride & 0xf) |
|
|
fprintf(stderr, "sad8_altivec:incorrect align, stride: %x\n", stride); |
|
|
#endif |
|
130 |
|
|
131 |
perm_cur = get_perm((((unsigned long) cur) >> 3) & 0x01); |
/* initialize */ |
132 |
perm_ref1 = vec_lvsl(0, (unsigned char *) ref); |
sad = vec_splat_u32(0); |
|
perm_ref2 = get_perm(0); |
|
133 |
|
|
134 |
/* initialization */ |
/* Perform sad operations */ |
135 |
sad = (vector unsigned int) (ZEROVEC); |
SAD8(); |
136 |
stride >>= 4; |
SAD8(); |
137 |
|
SAD8(); |
138 |
|
SAD8(); |
139 |
|
|
|
/* perform sum of differences between current and previous */ |
|
140 |
SAD8(); |
SAD8(); |
141 |
SAD8(); |
SAD8(); |
142 |
SAD8(); |
SAD8(); |
143 |
SAD8(); |
SAD8(); |
144 |
|
|
145 |
/* sum all parts of difference into one 32 bit quantity */ |
/* finish addition, add the first 2 together */ |
146 |
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
sad = vec_and(sad, (vector unsigned int)vec_pack(vec_splat_u16(-1),vec_splat_u16(0))); |
147 |
|
sad = (vector unsigned int)vec_sums((vector signed int)sad, vec_splat_s32(0)); |
148 |
|
sad = vec_splat(sad,3); |
149 |
|
vec_ste(sad, 0, &result); |
150 |
|
|
151 |
/* copy vector sum into unaligned result */ |
return result; |
|
sumdiffs = vec_splat(sumdiffs, 3); |
|
|
vec_ste(sumdiffs, 0, (int *) &result); |
|
|
return (result); |
|
152 |
} |
} |
153 |
|
|
154 |
#define MEAN16(i)\ |
|
155 |
c##i=*cur;\ |
|
156 |
mean = vec_sum4s(c##i,mean);\ |
|
157 |
cur += stride; |
#define MEAN16() \ |
158 |
|
mean = vec_sum4s(*ptr,mean);\ |
159 |
#define DEV16(i) \ |
ptr += stride |
160 |
t2 = vec_max(c##i, mn); /* find largest of two */ \ |
|
161 |
t3 = vec_min(c##i, mn); /* find smaller of two */ \ |
#define DEV16() \ |
162 |
t4 = vec_sub(t2, t3); /* find absolute difference */ \ |
t2 = vec_max(*ptr, mn); /* find largest of two */ \ |
163 |
dev = vec_sum4s(t4, dev); |
t3 = vec_min(*ptr, mn); /* find smaller of two */ \ |
164 |
|
t2 = vec_sub(t2, t3); /* find absolute difference */ \ |
165 |
unsigned long |
dev = vec_sum4s(t2, dev); \ |
166 |
dev16_altivec(const vector unsigned char *cur, |
ptr += stride |
167 |
unsigned long stride) |
|
168 |
|
/* |
169 |
|
* This function assumes cur is 16 bytes aligned and stride is 16 bytes |
170 |
|
* aligned |
171 |
|
*/ |
172 |
|
|
173 |
|
uint32_t |
174 |
|
dev16_altivec_c(vector unsigned char *cur, |
175 |
|
uint32_t stride) |
176 |
{ |
{ |
177 |
vector unsigned char t2, t3, t4, mn; |
vector unsigned char t2, t3, mn; |
178 |
vector unsigned int mean, dev; |
vector unsigned int mean, dev; |
179 |
vector signed int sumdiffs; |
vector unsigned int sumdiffs; |
180 |
vector unsigned char c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, |
vector unsigned char *ptr; |
181 |
c13, c14, c15; |
uint32_t result; |
|
unsigned long result; |
|
182 |
|
|
183 |
ZERODEF; |
#ifdef DEBUG |
184 |
|
/* print alignment errors if DEBUG is on */ |
185 |
|
if(((unsigned long)cur) & 0x7) |
186 |
|
fprintf(stderr, "dev16_altivec:incorrect align, cur: %lx\n", (long)cur); |
187 |
|
if(stride & 0xf) |
188 |
|
fprintf(stderr, "dev16_altivec:incorrect align, stride: %lu\n", stride); |
189 |
|
#endif |
190 |
|
|
191 |
mean = (vector unsigned int) (ZEROVEC); |
dev = mean = vec_splat_u32(0); |
|
dev = (vector unsigned int) (ZEROVEC); |
|
192 |
stride >>= 4; |
stride >>= 4; |
193 |
|
|
194 |
MEAN16(0); |
/* set pointer to iterate through cur */ |
195 |
MEAN16(1); |
ptr = cur; |
196 |
MEAN16(2); |
|
197 |
MEAN16(3); |
MEAN16(); |
198 |
MEAN16(4); |
MEAN16(); |
199 |
MEAN16(5); |
MEAN16(); |
200 |
MEAN16(6); |
MEAN16(); |
201 |
MEAN16(7); |
MEAN16(); |
202 |
MEAN16(8); |
MEAN16(); |
203 |
MEAN16(9); |
MEAN16(); |
204 |
MEAN16(10); |
MEAN16(); |
205 |
MEAN16(11); |
MEAN16(); |
206 |
MEAN16(12); |
MEAN16(); |
207 |
MEAN16(13); |
MEAN16(); |
208 |
MEAN16(14); |
MEAN16(); |
209 |
MEAN16(15); |
MEAN16(); |
210 |
|
MEAN16(); |
211 |
sumdiffs = vec_sums((vector signed int) mean, (vector signed int) ZEROVEC); |
MEAN16(); |
212 |
mn = vec_perm((vector unsigned char) sumdiffs, |
MEAN16(); |
213 |
(vector unsigned char) sumdiffs, (vector unsigned char) (14, |
|
214 |
14, |
/* Add all together in sumdiffs */ |
215 |
14, |
sumdiffs = (vector unsigned int)vec_sums((vector signed int) mean, vec_splat_s32(0)); |
216 |
14, |
/* teilen durch 16 * 16 */ |
217 |
14, |
mn = vec_perm((vector unsigned char)sumdiffs, (vector unsigned char)sumdiffs, vec_splat_u8(14)); |
218 |
14, |
|
219 |
14, |
/* set pointer to iterate through cur */ |
220 |
14, |
ptr = cur; |
221 |
14, |
|
222 |
14, |
DEV16(); |
223 |
14, |
DEV16(); |
224 |
14, |
DEV16(); |
225 |
14, |
DEV16(); |
226 |
14, |
DEV16(); |
227 |
14, |
DEV16(); |
228 |
14)); |
DEV16(); |
229 |
DEV16(0); |
DEV16(); |
230 |
DEV16(1); |
DEV16(); |
231 |
DEV16(2); |
DEV16(); |
232 |
DEV16(3); |
DEV16(); |
233 |
DEV16(4); |
DEV16(); |
234 |
DEV16(5); |
DEV16(); |
235 |
DEV16(6); |
DEV16(); |
236 |
DEV16(7); |
DEV16(); |
237 |
DEV16(8); |
DEV16(); |
|
DEV16(9); |
|
|
DEV16(10); |
|
|
DEV16(11); |
|
|
DEV16(12); |
|
|
DEV16(13); |
|
|
DEV16(14); |
|
|
DEV16(15); |
|
238 |
|
|
239 |
/* sum all parts of difference into one 32 bit quantity */ |
/* sum all parts of difference into one 32 bit quantity */ |
240 |
sumdiffs = vec_sums((vector signed int) dev, (vector signed int) ZEROVEC); |
sumdiffs = (vector unsigned int)vec_sums((vector signed int) dev, vec_splat_s32(0)); |
241 |
|
|
242 |
/* copy vector sum into unaligned result */ |
/* copy vector sum into unaligned result */ |
243 |
sumdiffs = vec_splat(sumdiffs, 3); |
sumdiffs = vec_splat(sumdiffs, 3); |
244 |
vec_ste(sumdiffs, 0, (int *) &result); |
vec_ste(sumdiffs, 0, (uint32_t*) &result); |
245 |
return (result); |
return result; |
246 |
|
} |
247 |
|
|
248 |
|
#define SAD16BI() \ |
249 |
|
t1 = vec_perm(ref1[0], ref1[1], mask1); \ |
250 |
|
t2 = vec_perm(ref2[0], ref2[1], mask2); \ |
251 |
|
t1 = vec_avg(t1, t2); \ |
252 |
|
t2 = vec_max(t1, *cur); \ |
253 |
|
t1 = vec_min(t1, *cur); \ |
254 |
|
sad = vec_sub(t2, t1); \ |
255 |
|
sum = vec_sum4s(sad, sum); \ |
256 |
|
cur += stride; \ |
257 |
|
ref1 += stride; \ |
258 |
|
ref2 += stride |
259 |
|
|
260 |
|
/* |
261 |
|
* This function assumes cur is 16 bytes aligned, stride is 16 bytes |
262 |
|
* aligned and ref1 and ref2 is unaligned |
263 |
|
*/ |
264 |
|
|
265 |
|
uint32_t |
266 |
|
sad16bi_altivec_c(vector unsigned char *cur, |
267 |
|
vector unsigned char *ref1, |
268 |
|
vector unsigned char *ref2, |
269 |
|
uint32_t stride) |
270 |
|
{ |
271 |
|
vector unsigned char t1, t2; |
272 |
|
vector unsigned char mask1, mask2; |
273 |
|
vector unsigned char sad; |
274 |
|
vector unsigned int sum; |
275 |
|
uint32_t result; |
276 |
|
|
277 |
|
#ifdef DEBUG |
278 |
|
/* print alignment errors if this is on */ |
279 |
|
if((long)cur & 0xf) |
280 |
|
fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lx\n", (long)cur); |
281 |
|
if(stride & 0xf) |
282 |
|
fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lu\n", stride); |
283 |
|
#endif |
284 |
|
|
285 |
|
/* Initialisation stuff */ |
286 |
|
stride >>= 4; |
287 |
|
mask1 = vec_lvsl(0, (unsigned char*)ref1); |
288 |
|
mask2 = vec_lvsl(0, (unsigned char*)ref2); |
289 |
|
sad = vec_splat_u8(0); |
290 |
|
sum = (vector unsigned int)sad; |
291 |
|
|
292 |
|
SAD16BI(); |
293 |
|
SAD16BI(); |
294 |
|
SAD16BI(); |
295 |
|
SAD16BI(); |
296 |
|
|
297 |
|
SAD16BI(); |
298 |
|
SAD16BI(); |
299 |
|
SAD16BI(); |
300 |
|
SAD16BI(); |
301 |
|
|
302 |
|
SAD16BI(); |
303 |
|
SAD16BI(); |
304 |
|
SAD16BI(); |
305 |
|
SAD16BI(); |
306 |
|
|
307 |
|
SAD16BI(); |
308 |
|
SAD16BI(); |
309 |
|
SAD16BI(); |
310 |
|
SAD16BI(); |
311 |
|
|
312 |
|
sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0)); |
313 |
|
sum = vec_splat(sum, 3); |
314 |
|
vec_ste(sum, 0, (uint32_t*)&result); |
315 |
|
|
316 |
|
return result; |
317 |
|
} |
318 |
|
|
319 |
|
|
320 |
|
#define SSE8_16BIT() \ |
321 |
|
b1_vec = vec_perm(vec_ld(0,b1), vec_ld(16,b1), vec_lvsl(0,b1)); \ |
322 |
|
b2_vec = vec_perm(vec_ld(0,b2), vec_ld(16,b2), vec_lvsl(0,b2)); \ |
323 |
|
diff = vec_sub(b1_vec,b2_vec); \ |
324 |
|
sum = vec_msum(diff,diff,sum); \ |
325 |
|
b1 = (const int16_t*)((int8_t*)b1+stride); \ |
326 |
|
b2 = (const int16_t*)((int8_t*)b2+stride) |
327 |
|
|
328 |
|
uint32_t |
329 |
|
sse8_16bit_altivec_c(const int16_t * b1, |
330 |
|
const int16_t * b2, |
331 |
|
const uint32_t stride) |
332 |
|
{ |
333 |
|
register vector signed short b1_vec; |
334 |
|
register vector signed short b2_vec; |
335 |
|
register vector signed short diff; |
336 |
|
register vector signed int sum; |
337 |
|
uint32_t result; |
338 |
|
|
339 |
|
/* initialize */ |
340 |
|
sum = vec_splat_s32(0); |
341 |
|
|
342 |
|
SSE8_16BIT(); |
343 |
|
SSE8_16BIT(); |
344 |
|
SSE8_16BIT(); |
345 |
|
SSE8_16BIT(); |
346 |
|
|
347 |
|
SSE8_16BIT(); |
348 |
|
SSE8_16BIT(); |
349 |
|
SSE8_16BIT(); |
350 |
|
SSE8_16BIT(); |
351 |
|
|
352 |
|
/* sum the vector */ |
353 |
|
sum = vec_sums(sum, vec_splat_s32(0)); |
354 |
|
sum = vec_splat(sum,3); |
355 |
|
|
356 |
|
vec_ste(sum,0,(int*)&result); |
357 |
|
|
358 |
|
/* and return */ |
359 |
|
return result; |
360 |
} |
} |