1 |
/***************************************************************************** |
/* |
|
* |
|
|
* XVID MPEG-4 VIDEO CODEC |
|
|
* - altivec sum of absolute difference (C version) |
|
|
* |
|
|
* Copyright (C) 2002 Benjamin Herrenschmidt <benh@kernel.crashing.org> |
|
|
* |
|
|
* This program is an implementation of a part of one or more MPEG-4 |
|
|
* Video tools as specified in ISO/IEC 14496-2 standard. Those intending |
|
|
* to use this software module in hardware or software products are |
|
|
* advised that its use may infringe existing patents or copyrights, and |
|
|
* any such use would be at such party's own risk. The original |
|
|
* developer of this software module and his/her company, and subsequent |
|
|
* editors and their companies, will have no liability for use of this |
|
|
* software or modifications or derivatives thereof. |
|
|
* |
|
|
* This program is free software; you can redistribute it and/or modify |
|
|
* it under the terms of the GNU General Public License as published by |
|
|
* the Free Software Foundation; either version 2 of the License, or |
|
|
* (at your option) any later version. |
|
|
* |
|
|
* This program is distributed in the hope that it will be useful, |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
|
* GNU General Public License for more details. |
|
|
* |
|
|
* You should have received a copy of the GNU General Public License |
|
|
* along with this program; if not, write to the Free Software |
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
|
* |
|
|
* $Id$ |
|
|
* |
|
|
****************************************************************************/ |
|
|
|
|
|
#define G_REG |
|
|
|
|
|
#ifdef G_REG |
|
|
register vector unsigned char perm0 asm("%v29"); |
|
|
register vector unsigned char perm1 asm("%v30"); |
|
|
register vector unsigned int zerovec asm("%v31"); |
|
|
#endif |
|
2 |
|
|
3 |
#include <stdio.h> |
Copyright (C) 2002 Benjamin Herrenschmidt <benh@kernel.crashing.org> |
4 |
|
|
5 |
#undef DEBUG |
This program is free software; you can redistribute it and/or modify |
6 |
|
it under the terms of the GNU General Public License as published by |
7 |
|
the Free Software Foundation; either version 2 of the License, or |
8 |
|
(at your option) any later version. |
9 |
|
|
10 |
static const vector unsigned char perms[2] = { |
This program is distributed in the hope that it will be useful, |
11 |
(vector unsigned char) ( /* Used when cur is aligned */ |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 |
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 |
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17), |
GNU General Public License for more details. |
|
(vector unsigned char) ( /* Used when cur is unaligned */ |
|
|
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, |
|
|
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f), |
|
|
}; |
|
|
|
|
|
#ifdef G_REG |
|
|
void |
|
|
sadInit_altivec(void) |
|
|
{ |
|
|
perm0 = perms[0]; |
|
|
perm1 = perms[1]; |
|
|
zerovec = (vector unsigned int) (0); |
|
|
} |
|
|
static inline const vector unsigned char |
|
|
get_perm(unsigned long i) |
|
|
{ |
|
|
return i ? perm1 : perm0; |
|
|
} |
|
14 |
|
|
15 |
#define ZERODEF |
You should have received a copy of the GNU General Public License |
16 |
#define ZEROVEC zerovec |
along with this program; if not, write to the Free Software |
17 |
#else |
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
18 |
void |
|
19 |
sadInit_altivec(void) |
|
20 |
{ |
$Id$ |
21 |
} |
*/ |
|
static inline const vector unsigned char |
|
|
get_perm(unsigned long i) |
|
|
{ |
|
|
return perms[i]; |
|
|
} |
|
22 |
|
|
23 |
#define ZERODEF vector unsigned int zerovec = (vector unsigned int)(0) |
#ifdef HAVE_ALTIVEC_H |
24 |
#define ZEROVEC zerovec |
#include <altivec.h> |
25 |
#endif |
#endif |
26 |
|
|
27 |
|
|
28 |
|
#include "../../portab.h" |
29 |
|
|
30 |
|
/* no debugging by default */ |
31 |
|
#undef DEBUG |
32 |
|
|
33 |
|
#include <stdio.h> |
34 |
|
|
35 |
#define SAD16() \ |
#define SAD16() \ |
36 |
t1 = vec_perm(ref[0], ref[1], perm); /* align current vector */ \ |
t1 = vec_perm(ref[0], ref[1], perm); /* align current vector */ \ |
37 |
t2 = vec_max(t1, *cur); /* find largest of two */ \ |
t2 = vec_max(t1, *cur); /* find largest of two */ \ |
38 |
t3 = vec_min(t1, *cur); /* find smaller of two */ \ |
t1 = vec_min(t1, *cur); /* find smaller of two */ \ |
39 |
t4 = vec_sub(t2, t3); /* find absolute difference */ \ |
t1 = vec_sub(t2, t1); /* find absolute difference */ \ |
40 |
sad = vec_sum4s(t4, sad); /* accumulate sum of differences */ \ |
sad = vec_sum4s(t1, vec_splat_u32(0)); /* sum of differences */ \ |
41 |
|
sumdiffs = (vector unsigned int)vec_sums((vector signed int)sad, (vector signed int)sumdiffs); /* accumulate sumdiffs */ \ |
42 |
|
if(vec_any_ge(sumdiffs, best_vec)) \ |
43 |
|
goto bail; \ |
44 |
cur += stride; ref += stride; |
cur += stride; ref += stride; |
45 |
|
|
46 |
/* |
/* |
47 |
* This function assumes cur and stride are 16 bytes aligned and ref is unaligned |
* This function assumes cur and stride are 16 bytes aligned and ref is unaligned |
48 |
*/ |
*/ |
49 |
unsigned long |
unsigned long |
50 |
sad16_altivec(const vector unsigned char *cur, |
sad16_altivec_c(const vector unsigned char *cur, |
51 |
const vector unsigned char *ref, |
const vector unsigned char *ref, |
52 |
unsigned long stride, |
unsigned long stride, |
53 |
const unsigned long best_sad) |
const unsigned long best_sad) |
54 |
{ |
{ |
55 |
vector unsigned char perm; |
vector unsigned char perm; |
56 |
vector unsigned char t1, t2, t3, t4; |
vector unsigned char t1, t2; |
57 |
vector unsigned int sad; |
vector unsigned int sad; |
58 |
vector signed int sumdiffs, best_vec; |
vector unsigned int sumdiffs; |
59 |
|
vector unsigned int best_vec; |
60 |
unsigned long result; |
unsigned long result; |
61 |
|
|
|
ZERODEF; |
|
62 |
|
|
63 |
#ifdef DEBUG |
#ifdef DEBUG |
64 |
|
/* print alignment errors if DEBUG is on */ |
65 |
if (((unsigned long) cur) & 0xf) |
if (((unsigned long) cur) & 0xf) |
66 |
fprintf(stderr, "sad16_altivec:incorrect align, cur: %x\n", cur); |
fprintf(stderr, "sad16_altivec:incorrect align, cur: %x\n", cur); |
|
// if (((unsigned long)ref) & 0xf) |
|
|
// fprintf(stderr, "sad16_altivec:incorrect align, ref: %x\n", ref); |
|
67 |
if (stride & 0xf) |
if (stride & 0xf) |
68 |
fprintf(stderr, "sad16_altivec:incorrect align, stride: %x\n", stride); |
fprintf(stderr, "sad16_altivec:incorrect align, stride: %x\n", stride); |
69 |
#endif |
#endif |
70 |
/* initialization */ |
/* initialization */ |
71 |
sad = (vector unsigned int) (ZEROVEC); |
sad = vec_splat_u32(0); |
72 |
|
sumdiffs = sad; |
73 |
stride >>= 4; |
stride >>= 4; |
74 |
perm = vec_lvsl(0, (unsigned char *) ref); |
perm = vec_lvsl(0, (unsigned char *) ref); |
75 |
*((unsigned long *) &best_vec) = best_sad; |
*((unsigned long *) &best_vec) = best_sad; |
80 |
SAD16(); |
SAD16(); |
81 |
SAD16(); |
SAD16(); |
82 |
SAD16(); |
SAD16(); |
83 |
/* Temp sum for exit */ |
|
|
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
|
|
if (vec_all_ge(sumdiffs, best_vec)) |
|
|
goto bail; |
|
84 |
SAD16(); |
SAD16(); |
85 |
SAD16(); |
SAD16(); |
86 |
SAD16(); |
SAD16(); |
87 |
SAD16(); |
SAD16(); |
88 |
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
|
|
if (vec_all_ge(sumdiffs, best_vec)) |
|
|
goto bail; |
|
89 |
SAD16(); |
SAD16(); |
90 |
SAD16(); |
SAD16(); |
91 |
SAD16(); |
SAD16(); |
92 |
SAD16(); |
SAD16(); |
93 |
|
|
94 |
SAD16(); |
SAD16(); |
95 |
SAD16(); |
SAD16(); |
96 |
SAD16(); |
SAD16(); |
97 |
SAD16(); |
SAD16(); |
98 |
|
|
|
/* sum all parts of difference into one 32 bit quantity */ |
|
|
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
|
99 |
bail: |
bail: |
100 |
/* copy vector sum into unaligned result */ |
/* copy vector sum into unaligned result */ |
101 |
sumdiffs = vec_splat(sumdiffs, 3); |
sumdiffs = vec_splat(sumdiffs, 3); |
102 |
vec_ste(sumdiffs, 0, (int *) &result); |
vec_ste(sumdiffs, 0, (unsigned long *) &result); |
103 |
return (result); |
return result; |
104 |
} |
} |
105 |
|
|
106 |
|
|
107 |
#define SAD8() \ |
#define SAD8() \ |
108 |
t1 = vec_perm(cur[0], cur[stride], perm_cur); /* align current vector */ \ |
t1 = vec_perm(cur[0], cur[stride], perm_cur); /* align current vector */ \ |
109 |
t2 = vec_perm(ref[0], ref[1], perm_ref1); /* align current vector */ \ |
t2 = vec_perm(ref[0], ref[1], perm_ref1); /* align current vector */ \ |
110 |
tp = vec_perm(ref[stride], ref[stride+1], perm_ref1); /* align current vector */ \ |
tp = vec_perm(ref[stride], ref[stride+1], perm_ref1); /* align current vector */ \ |
111 |
t2 = vec_perm(t2,tp,perm_ref2); \ |
t2 = vec_perm(t2,tp,perm_ref2); \ |
112 |
t3 = vec_max(t1, t2); /* find largest of two */ \ |
tp = vec_max(t1, t2); /* find largest of two */ \ |
113 |
t4 = vec_min(t1, t2); /* find smaller of two */ \ |
t1 = vec_min(t1, t2); /* find smaller of two */ \ |
114 |
t5 = vec_sub(t3, t4); /* find absolute difference */ \ |
tp = vec_sub(tp, t1); /* find absolute difference */ \ |
115 |
sad = vec_sum4s(t5, sad); /* accumulate sum of differences */ \ |
sad = vec_sum4s(tp, sad); /* accumulate sum of differences */ \ |
116 |
cur += stride<<1; ref += stride<<1; |
cur += stride<<1; ref += stride<<1; |
117 |
|
|
118 |
/* |
/* |
120 |
* aligned and ref is unaligned |
* aligned and ref is unaligned |
121 |
*/ |
*/ |
122 |
unsigned long |
unsigned long |
123 |
sad8_altivec(const vector unsigned char *cur, |
sad8_altivec_c(const vector unsigned char *cur, |
124 |
const vector unsigned char *ref, |
const vector unsigned char *ref, |
125 |
unsigned long stride) |
unsigned long stride) |
126 |
{ |
{ |
127 |
vector unsigned char t1, t2, t3, t4, t5, tp; |
vector unsigned char t1, t2, tp; |
128 |
vector unsigned int sad; |
vector unsigned int sad; |
129 |
vector signed int sumdiffs; |
vector unsigned int sumdiffs; |
130 |
vector unsigned char perm_cur; |
vector unsigned char perm_cur; |
131 |
vector unsigned char perm_ref1, perm_ref2; |
vector unsigned char perm_ref1, perm_ref2; |
132 |
unsigned long result; |
unsigned long result; |
133 |
|
|
|
ZERODEF; |
|
|
|
|
134 |
#ifdef DEBUG |
#ifdef DEBUG |
135 |
|
/* print alignment errors if DEBUG is on */ |
136 |
if (((unsigned long) cur) & 0x7) |
if (((unsigned long) cur) & 0x7) |
137 |
fprintf(stderr, "sad8_altivec:incorrect align, cur: %x\n", cur); |
fprintf(stderr, "sad8_altivec:incorrect align, cur: %x\n", cur); |
|
// if (((unsigned long)ref) & 0x7) |
|
|
// fprintf(stderr, "sad8_altivec:incorrect align, ref: %x\n", ref); |
|
138 |
if (stride & 0xf) |
if (stride & 0xf) |
139 |
fprintf(stderr, "sad8_altivec:incorrect align, stride: %x\n", stride); |
fprintf(stderr, "sad8_altivec:incorrect align, stride: %x\n", stride); |
140 |
#endif |
#endif |
141 |
|
|
142 |
perm_cur = get_perm((((unsigned long) cur) >> 3) & 0x01); |
/* check if cur is 8 or 16 bytes aligned an create the perm_cur vector */ |
143 |
perm_ref1 = vec_lvsl(0, (unsigned char *) ref); |
perm_ref1 = vec_lvsl(0, (unsigned char *) ref); |
144 |
perm_ref2 = get_perm(0); |
perm_ref2 = vec_add(vec_lvsl(0, (unsigned char*)NULL), vec_pack(vec_splat_u16(0), vec_splat_u16(8))); |
145 |
|
perm_cur = vec_add(perm_ref2, vec_splat(vec_lvsl(0, (unsigned char*)cur), 0)); |
146 |
|
|
147 |
/* initialization */ |
/* initialization */ |
148 |
sad = (vector unsigned int) (ZEROVEC); |
sad = vec_splat_u32(0); |
149 |
stride >>= 4; |
stride >>= 4; |
150 |
|
|
151 |
/* perform sum of differences between current and previous */ |
/* perform sum of differences between current and previous */ |
155 |
SAD8(); |
SAD8(); |
156 |
|
|
157 |
/* sum all parts of difference into one 32 bit quantity */ |
/* sum all parts of difference into one 32 bit quantity */ |
158 |
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
sumdiffs = (vector unsigned int)vec_sums((vector signed int) sad, vec_splat_s32(0)); |
159 |
|
|
160 |
/* copy vector sum into unaligned result */ |
/* copy vector sum into unaligned result */ |
161 |
sumdiffs = vec_splat(sumdiffs, 3); |
sumdiffs = vec_splat(sumdiffs, 3); |
162 |
vec_ste(sumdiffs, 0, (int *) &result); |
vec_ste(sumdiffs, 0, (unsigned int *) &result); |
163 |
return (result); |
return result; |
164 |
} |
} |
165 |
|
|
|
#define MEAN16(i)\ |
|
|
c##i=*cur;\ |
|
|
mean = vec_sum4s(c##i,mean);\ |
|
|
cur += stride; |
|
|
|
|
|
#define DEV16(i) \ |
|
|
t2 = vec_max(c##i, mn); /* find largest of two */ \ |
|
|
t3 = vec_min(c##i, mn); /* find smaller of two */ \ |
|
|
t4 = vec_sub(t2, t3); /* find absolute difference */ \ |
|
|
dev = vec_sum4s(t4, dev); |
|
166 |
|
|
167 |
|
#define MEAN16() \ |
168 |
|
mean = vec_sum4s(*ptr,mean);\ |
169 |
|
ptr += stride |
170 |
|
|
171 |
|
#define DEV16() \ |
172 |
|
t2 = vec_max(*ptr, mn); /* find largest of two */ \ |
173 |
|
t3 = vec_min(*ptr, mn); /* find smaller of two */ \ |
174 |
|
t2 = vec_sub(t2, t3); /* find absolute difference */ \ |
175 |
|
dev = vec_sum4s(t2, dev); \ |
176 |
|
ptr += stride |
177 |
|
|
178 |
|
/* |
179 |
|
* This function assumes cur is 16 bytes aligned and stride is 16 bytes |
180 |
|
* aligned |
181 |
|
*/ |
182 |
unsigned long |
unsigned long |
183 |
dev16_altivec(const vector unsigned char *cur, |
dev16_altivec_c(const vector unsigned char *cur, |
184 |
unsigned long stride) |
unsigned long stride) |
185 |
{ |
{ |
186 |
vector unsigned char t2, t3, t4, mn; |
vector unsigned char t2, t3, mn; |
187 |
vector unsigned int mean, dev; |
vector unsigned int mean, dev; |
188 |
vector signed int sumdiffs; |
vector unsigned int sumdiffs; |
189 |
vector unsigned char c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, |
const vector unsigned char *ptr; |
|
c13, c14, c15; |
|
190 |
unsigned long result; |
unsigned long result; |
191 |
|
|
192 |
ZERODEF; |
#ifdef DEBUG |
193 |
|
/* print alignment errors if DEBUG is on */ |
194 |
|
if(((unsigned long)cur) & 0x7) |
195 |
|
fprintf(stderr, "dev16_altivec:incorrect align, cur: %x\n", cur); |
196 |
|
if(stride & 0xf) |
197 |
|
fprintf(stderr, "dev16_altivec:incorrect align, stride: %ld\n", stride); |
198 |
|
#endif |
199 |
|
|
200 |
mean = (vector unsigned int) (ZEROVEC); |
dev = mean = vec_splat_u32(0); |
|
dev = (vector unsigned int) (ZEROVEC); |
|
201 |
stride >>= 4; |
stride >>= 4; |
202 |
|
|
203 |
MEAN16(0); |
/* set pointer to iterate through cur */ |
204 |
MEAN16(1); |
ptr = cur; |
205 |
MEAN16(2); |
|
206 |
MEAN16(3); |
MEAN16(); |
207 |
MEAN16(4); |
MEAN16(); |
208 |
MEAN16(5); |
MEAN16(); |
209 |
MEAN16(6); |
MEAN16(); |
210 |
MEAN16(7); |
MEAN16(); |
211 |
MEAN16(8); |
MEAN16(); |
212 |
MEAN16(9); |
MEAN16(); |
213 |
MEAN16(10); |
MEAN16(); |
214 |
MEAN16(11); |
MEAN16(); |
215 |
MEAN16(12); |
MEAN16(); |
216 |
MEAN16(13); |
MEAN16(); |
217 |
MEAN16(14); |
MEAN16(); |
218 |
MEAN16(15); |
MEAN16(); |
219 |
|
MEAN16(); |
220 |
sumdiffs = vec_sums((vector signed int) mean, (vector signed int) ZEROVEC); |
MEAN16(); |
221 |
mn = vec_perm((vector unsigned char) sumdiffs, |
MEAN16(); |
222 |
(vector unsigned char) sumdiffs, (vector unsigned char) (14, |
|
223 |
14, |
/* Add all together in sumdiffs */ |
224 |
14, |
sumdiffs = (vector unsigned int)vec_sums((vector signed int) mean, vec_splat_s32(0)); |
225 |
14, |
/* teilen durch 16 * 16 */ |
226 |
14, |
mn = vec_perm((vector unsigned char)sumdiffs, (vector unsigned char)sumdiffs, vec_splat_u8(14)); |
227 |
14, |
|
228 |
14, |
/* set pointer to iterate through cur */ |
229 |
14, |
ptr = cur; |
230 |
14, |
|
231 |
14, |
DEV16(); |
232 |
14, |
DEV16(); |
233 |
14, |
DEV16(); |
234 |
14, |
DEV16(); |
235 |
14, |
DEV16(); |
236 |
14, |
DEV16(); |
237 |
14)); |
DEV16(); |
238 |
DEV16(0); |
DEV16(); |
239 |
DEV16(1); |
DEV16(); |
240 |
DEV16(2); |
DEV16(); |
241 |
DEV16(3); |
DEV16(); |
242 |
DEV16(4); |
DEV16(); |
243 |
DEV16(5); |
DEV16(); |
244 |
DEV16(6); |
DEV16(); |
245 |
DEV16(7); |
DEV16(); |
246 |
DEV16(8); |
DEV16(); |
|
DEV16(9); |
|
|
DEV16(10); |
|
|
DEV16(11); |
|
|
DEV16(12); |
|
|
DEV16(13); |
|
|
DEV16(14); |
|
|
DEV16(15); |
|
247 |
|
|
248 |
/* sum all parts of difference into one 32 bit quantity */ |
/* sum all parts of difference into one 32 bit quantity */ |
249 |
sumdiffs = vec_sums((vector signed int) dev, (vector signed int) ZEROVEC); |
sumdiffs = (vector unsigned int)vec_sums((vector signed int) dev, vec_splat_s32(0)); |
250 |
|
|
251 |
/* copy vector sum into unaligned result */ |
/* copy vector sum into unaligned result */ |
252 |
sumdiffs = vec_splat(sumdiffs, 3); |
sumdiffs = vec_splat(sumdiffs, 3); |
253 |
vec_ste(sumdiffs, 0, (int *) &result); |
vec_ste(sumdiffs, 0, (unsigned int *) &result); |
254 |
return (result); |
return result; |
255 |
|
} |
256 |
|
|
257 |
|
#define SAD16BI() \ |
258 |
|
t1 = vec_perm(ref1[0], ref1[1], mask1); \ |
259 |
|
t2 = vec_perm(ref2[0], ref2[1], mask2); \ |
260 |
|
t1 = vec_avg(t1, t2); \ |
261 |
|
t2 = vec_max(t1, *cur); \ |
262 |
|
t1 = vec_min(t1, *cur); \ |
263 |
|
sad = vec_sub(t2, t1); \ |
264 |
|
sum = vec_sum4s(sad, sum); \ |
265 |
|
cur += stride; \ |
266 |
|
ref1 += stride; \ |
267 |
|
ref2 += stride |
268 |
|
|
269 |
|
/* |
270 |
|
* This function assumes cur is 16 bytes aligned, stride is 16 bytes |
271 |
|
* aligned and ref1 and ref2 is unaligned |
272 |
|
*/ |
273 |
|
unsigned long |
274 |
|
sad16bi_altivec_c(vector unsigned char *cur, |
275 |
|
vector unsigned char *ref1, |
276 |
|
vector unsigned char *ref2, |
277 |
|
unsigned long stride) |
278 |
|
{ |
279 |
|
vector unsigned char t1, t2; |
280 |
|
vector unsigned char mask1, mask2; |
281 |
|
vector unsigned char sad; |
282 |
|
vector unsigned int sum; |
283 |
|
unsigned long result; |
284 |
|
|
285 |
|
#ifdef DEBUG |
286 |
|
/* print alignment errors if this is on */ |
287 |
|
if(cur & 0xf) |
288 |
|
fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %x\n", cur); |
289 |
|
if(stride & 0xf) |
290 |
|
fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %ld\n", stride); |
291 |
|
#endif |
292 |
|
|
293 |
|
/* Initialisation stuff */ |
294 |
|
stride >>= 4; |
295 |
|
mask1 = vec_lvsl(0, (unsigned char*)ref1); |
296 |
|
mask2 = vec_lvsl(0, (unsigned char*)ref2); |
297 |
|
sad = vec_splat_u8(0); |
298 |
|
sum = (vector unsigned int)sad; |
299 |
|
|
300 |
|
SAD16BI(); |
301 |
|
SAD16BI(); |
302 |
|
SAD16BI(); |
303 |
|
SAD16BI(); |
304 |
|
|
305 |
|
SAD16BI(); |
306 |
|
SAD16BI(); |
307 |
|
SAD16BI(); |
308 |
|
SAD16BI(); |
309 |
|
|
310 |
|
SAD16BI(); |
311 |
|
SAD16BI(); |
312 |
|
SAD16BI(); |
313 |
|
SAD16BI(); |
314 |
|
|
315 |
|
SAD16BI(); |
316 |
|
SAD16BI(); |
317 |
|
SAD16BI(); |
318 |
|
SAD16BI(); |
319 |
|
|
320 |
|
sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0)); |
321 |
|
sum = vec_splat(sum, 3); |
322 |
|
vec_ste(sum, 0, (unsigned int*)&result); |
323 |
|
|
324 |
|
return result; |
325 |
|
} |
326 |
|
|
327 |
|
|
328 |
|
#define SSE8_16BIT() \ |
329 |
|
b1_vec = vec_perm(vec_ld(0,b1), vec_ld(16,b1), vec_lvsl(0,b1)); \ |
330 |
|
b2_vec = vec_perm(vec_ld(0,b2), vec_ld(16,b2), vec_lvsl(0,b2)); \ |
331 |
|
diff = vec_sub(b1_vec,b2_vec); \ |
332 |
|
sum = vec_msum(diff,diff,sum); \ |
333 |
|
b1 = (const int16_t*)((int8_t*)b1+stride); \ |
334 |
|
b2 = (const int16_t*)((int8_t*)b2+stride) |
335 |
|
|
336 |
|
uint32_t |
337 |
|
sse8_16bit_altivec_c(const int16_t * b1, |
338 |
|
const int16_t * b2, |
339 |
|
const uint32_t stride) |
340 |
|
{ |
341 |
|
register vector signed short b1_vec; |
342 |
|
register vector signed short b2_vec; |
343 |
|
register vector signed short diff; |
344 |
|
register vector signed int sum; |
345 |
|
uint32_t result; |
346 |
|
|
347 |
|
/* initialize */ |
348 |
|
sum = vec_splat_s32(0); |
349 |
|
|
350 |
|
SSE8_16BIT(); |
351 |
|
SSE8_16BIT(); |
352 |
|
SSE8_16BIT(); |
353 |
|
SSE8_16BIT(); |
354 |
|
|
355 |
|
SSE8_16BIT(); |
356 |
|
SSE8_16BIT(); |
357 |
|
SSE8_16BIT(); |
358 |
|
SSE8_16BIT(); |
359 |
|
|
360 |
|
/* sum the vector */ |
361 |
|
sum = vec_sums(sum, vec_splat_s32(0)); |
362 |
|
sum = vec_splat(sum,3); |
363 |
|
|
364 |
|
vec_ste(sum,0,(int*)&result); |
365 |
|
|
366 |
|
/* and return */ |
367 |
|
return result; |
368 |
} |
} |