1 |
/*****************************************************************************
|
* |
|
|
* XVID MPEG-4 VIDEO CODEC |
|
|
* - altivec sum of absolute difference (C version) |
|
|
* |
|
|
* Copyright (C) 2002 Benjamin Herrenschmidt <benh@kernel.crashing.org> |
|
|
* |
|
|
* This file is part of XviD, a free MPEG-4 video encoder/decoder |
|
|
* |
|
|
* XviD is free software; you can redistribute it and/or modify it |
|
|
* under the terms of the GNU General Public License as published by |
|
|
* the Free Software Foundation; either version 2 of the License, or |
|
|
* (at your option) any later version. |
|
|
* |
|
|
* This program is distributed in the hope that it will be useful, |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
|
* GNU General Public License for more details. |
|
|
* |
|
|
* You should have received a copy of the GNU General Public License |
|
|
* along with this program; if not, write to the Free Software |
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
|
* |
|
|
* Under section 8 of the GNU General Public License, the copyright |
|
|
* holders of XVID explicitly forbid distribution in the following |
|
|
* countries: |
|
|
* |
|
|
* - Japan |
|
|
* - United States of America |
|
|
* |
|
|
* Linking XviD statically or dynamically with other modules is making a |
|
|
* combined work based on XviD. Thus, the terms and conditions of the |
|
|
* GNU General Public License cover the whole combination. |
|
|
* |
|
|
* As a special exception, the copyright holders of XviD give you |
|
|
* permission to link XviD with independent modules that communicate with |
|
|
* XviD solely through the VFW1.1 and DShow interfaces, regardless of the |
|
|
* license terms of these independent modules, and to copy and distribute |
|
|
* the resulting combined work under terms of your choice, provided that |
|
|
* every copy of the combined work is accompanied by a complete copy of |
|
|
* the source code of XviD (the version of XviD used to produce the |
|
|
* combined work), being distributed under the terms of the GNU General |
|
|
* Public License plus this exception. An independent module is a module |
|
|
* which is not derived from or based on XviD. |
|
|
* |
|
|
* Note that people who make modified versions of XviD are not obligated |
|
|
* to grant this special exception for their modified versions; it is |
|
|
* their choice whether to do so. The GNU General Public License gives |
|
|
* permission to release a modified version without this exception; this |
|
|
* exception also makes it possible to release a modified version which |
|
|
* carries forward this exception. |
|
|
* |
|
|
* $Id$ |
|
|
* |
|
|
****************************************************************************/ |
|
|
|
|
|
/* When G_REG is defined, three AltiVec registers are dedicated to holding
 * the permutation tables and a zero vector across calls; they are filled
 * in once by sadInit_altivec(). */
#define G_REG

#ifdef G_REG
/* NOTE(review): this pins v29-v31 for the whole translation unit; assumes
 * the target ABI permits reserving these registers -- confirm for the
 * toolchain in use. */
register vector unsigned char perm0 asm("%v29");
register vector unsigned char perm1 asm("%v30");
register vector unsigned int zerovec asm("%v31");
#endif
|
2 |
|
|
3 |
#include <stdio.h> |
Copyright (C) 2002 Benjamin Herrenschmidt <benh@kernel.crashing.org> |
4 |
|
|
5 |
#undef DEBUG |
This program is free software; you can redistribute it and/or modify |
6 |
|
it under the terms of the GNU General Public License as published by |
7 |
|
the Free Software Foundation; either version 2 of the License, or |
8 |
|
(at your option) any later version. |
9 |
|
|
10 |
static const vector unsigned char perms[2] = { |
This program is distributed in the hope that it will be useful, |
11 |
(vector unsigned char) ( /* Used when cur is aligned */ |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 |
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 |
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17), |
GNU General Public License for more details. |
|
(vector unsigned char) ( /* Used when cur is unaligned */ |
|
|
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, |
|
|
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f), |
|
|
}; |
|
|
|
|
|
#ifdef G_REG |
|
|
void |
|
|
sadInit_altivec(void) |
|
|
{ |
|
|
perm0 = perms[0]; |
|
|
perm1 = perms[1]; |
|
|
zerovec = (vector unsigned int) (0); |
|
|
} |
|
|
static inline const vector unsigned char |
|
|
get_perm(unsigned long i) |
|
|
{ |
|
|
return i ? perm1 : perm0; |
|
|
} |
|
14 |
|
|
15 |
#define ZERODEF |
You should have received a copy of the GNU General Public License |
16 |
#define ZEROVEC zerovec |
along with this program; if not, write to the Free Software |
17 |
#else |
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
18 |
void |
|
19 |
sadInit_altivec(void) |
|
20 |
{ |
$Id$ |
21 |
} |
*/ |
|
static inline const vector unsigned char |
|
|
get_perm(unsigned long i) |
|
|
{ |
|
|
return perms[i]; |
|
|
} |
|
22 |
|
|
23 |
#define ZERODEF vector unsigned int zerovec = (vector unsigned int)(0) |
#ifdef HAVE_ALTIVEC_H |
24 |
#define ZEROVEC zerovec |
#include <altivec.h> |
25 |
#endif |
#endif |
#include "../../portab.h"	/* uint32_t, int16_t, ... */

/* no debugging by default */
#undef DEBUG

#include <stdio.h>	/* fprintf for the DEBUG alignment checks */
35 |
#define SAD16() \ |
#define SAD16() \ |
36 |
t1 = vec_perm(ref[0], ref[1], perm); /* align current vector */ \ |
t1 = vec_perm(ref[0], ref[1], perm); /* align current vector */ \ |
37 |
t2 = vec_max(t1, *cur); /* find largest of two */ \ |
t2 = vec_max(t1, *cur); /* find largest of two */ \ |
38 |
t3 = vec_min(t1, *cur); /* find smaller of two */ \ |
t1 = vec_min(t1, *cur); /* find smaller of two */ \ |
39 |
t4 = vec_sub(t2, t3); /* find absolute difference */ \ |
t1 = vec_sub(t2, t1); /* find absolute difference */ \ |
40 |
sad = vec_sum4s(t4, sad); /* accumulate sum of differences */ \ |
sad = vec_sum4s(t1, vec_splat_u32(0)); /* sum of differences */ \ |
41 |
|
sumdiffs = (vector unsigned int)vec_sums((vector signed int)sad, (vector signed int)sumdiffs); /* accumulate sumdiffs */ \ |
42 |
|
if(vec_any_ge(sumdiffs, best_vec)) \ |
43 |
|
goto bail; \ |
44 |
cur += stride; ref += stride; |
cur += stride; ref += stride; |
45 |
|
|
46 |
/* |
/* |
47 |
* This function assumes cur and stride are 16 bytes aligned and ref is unaligned |
* This function assumes cur and stride are 16 bytes aligned and ref is unaligned |
48 |
*/ |
*/ |
49 |
unsigned long |
|
50 |
sad16_altivec(const vector unsigned char *cur, |
uint32_t |
51 |
const vector unsigned char *ref, |
sad16_altivec_c(vector unsigned char *cur, |
52 |
unsigned long stride, |
vector unsigned char *ref, |
53 |
const unsigned long best_sad) |
uint32_t stride, |
54 |
|
const uint32_t best_sad) |
55 |
{ |
{ |
56 |
vector unsigned char perm; |
vector unsigned char perm; |
57 |
vector unsigned char t1, t2, t3, t4; |
vector unsigned char t1, t2; |
58 |
vector unsigned int sad; |
vector unsigned int sad; |
59 |
vector signed int sumdiffs, best_vec; |
vector unsigned int sumdiffs; |
60 |
unsigned long result; |
vector unsigned int best_vec; |
61 |
|
uint32_t result; |
62 |
|
|
|
ZERODEF; |
|
63 |
|
|
64 |
#ifdef DEBUG |
#ifdef DEBUG |
65 |
|
/* print alignment errors if DEBUG is on */ |
66 |
if (((unsigned long) cur) & 0xf) |
if (((unsigned long) cur) & 0xf) |
67 |
fprintf(stderr, "sad16_altivec:incorrect align, cur: %x\n", cur); |
fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (long)cur); |
|
// if (((unsigned long)ref) & 0xf) |
|
|
// fprintf(stderr, "sad16_altivec:incorrect align, ref: %x\n", ref); |
|
68 |
if (stride & 0xf) |
if (stride & 0xf) |
69 |
fprintf(stderr, "sad16_altivec:incorrect align, stride: %x\n", stride); |
fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", stride); |
70 |
#endif |
#endif |
71 |
/* initialization */ |
/* initialization */ |
72 |
sad = (vector unsigned int) (ZEROVEC); |
sad = vec_splat_u32(0); |
73 |
|
sumdiffs = sad; |
74 |
stride >>= 4; |
stride >>= 4; |
75 |
perm = vec_lvsl(0, (unsigned char *) ref); |
perm = vec_lvsl(0, (unsigned char *) ref); |
76 |
*((unsigned long *) &best_vec) = best_sad; |
*((uint32_t*)&best_vec) = best_sad; |
77 |
best_vec = vec_splat(best_vec, 0); |
best_vec = vec_splat(best_vec, 0); |
78 |
|
|
79 |
/* perform sum of differences between current and previous */ |
/* perform sum of differences between current and previous */ |
81 |
SAD16(); |
SAD16(); |
82 |
SAD16(); |
SAD16(); |
83 |
SAD16(); |
SAD16(); |
84 |
/* Temp sum for exit */ |
|
|
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
|
|
if (vec_all_ge(sumdiffs, best_vec)) |
|
|
goto bail; |
|
85 |
SAD16(); |
SAD16(); |
86 |
SAD16(); |
SAD16(); |
87 |
SAD16(); |
SAD16(); |
88 |
SAD16(); |
SAD16(); |
89 |
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
|
|
if (vec_all_ge(sumdiffs, best_vec)) |
|
|
goto bail; |
|
90 |
SAD16(); |
SAD16(); |
91 |
SAD16(); |
SAD16(); |
92 |
SAD16(); |
SAD16(); |
93 |
SAD16(); |
SAD16(); |
94 |
|
|
95 |
SAD16(); |
SAD16(); |
96 |
SAD16(); |
SAD16(); |
97 |
SAD16(); |
SAD16(); |
98 |
SAD16(); |
SAD16(); |
99 |
|
|
|
/* sum all parts of difference into one 32 bit quantity */ |
|
|
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
|
100 |
bail: |
bail: |
101 |
/* copy vector sum into unaligned result */ |
/* copy vector sum into unaligned result */ |
102 |
sumdiffs = vec_splat(sumdiffs, 3); |
sumdiffs = vec_splat(sumdiffs, 3); |
103 |
vec_ste(sumdiffs, 0, (int *) &result); |
vec_ste(sumdiffs, 0, (uint32_t*) &result); |
104 |
return (result); |
return result; |
105 |
} |
} |
106 |
|
|
107 |
|
|
108 |
#define SAD8() \ |
#define SAD8() \ |
109 |
t1 = vec_perm(cur[0], cur[stride], perm_cur); /* align current vector */ \ |
c = vec_perm(vec_ld(0,cur),vec_ld(16,cur),vec_lvsl(0,cur));\ |
110 |
t2 = vec_perm(ref[0], ref[1], perm_ref1); /* align current vector */ \ |
r = vec_perm(vec_ld(0,ref),vec_ld(16,ref),vec_lvsl(0,ref));\ |
111 |
tp = vec_perm(ref[stride], ref[stride+1], perm_ref1); /* align current vector */ \ |
c = vec_sub(vec_max(c,r),vec_min(c,r));\ |
112 |
t2 = vec_perm(t2,tp,perm_ref2); \ |
sad = vec_sum4s(c,sad);\ |
113 |
t3 = vec_max(t1, t2); /* find largest of two */ \ |
cur += stride;\ |
114 |
t4 = vec_min(t1, t2); /* find smaller of two */ \ |
ref += stride |
|
t5 = vec_sub(t3, t4); /* find absolute difference */ \ |
|
|
sad = vec_sum4s(t5, sad); /* accumulate sum of differences */ \ |
|
|
cur += stride<<1; ref += stride<<1; |
|
115 |
|
|
116 |
/* |
/* |
117 |
* This function assumes cur is 8 bytes aligned, stride is 16 bytes |
* This function assumes nothing |
|
* aligned and ref is unaligned |
|
118 |
*/ |
*/ |
119 |
unsigned long |
|
120 |
sad8_altivec(const vector unsigned char *cur, |
uint32_t |
121 |
const vector unsigned char *ref, |
sad8_altivec_c(const uint8_t * cur, |
122 |
unsigned long stride) |
const uint8_t *ref, |
123 |
|
const uint32_t stride) |
124 |
{ |
{ |
125 |
vector unsigned char t1, t2, t3, t4, t5, tp; |
uint32_t result = 0; |
|
vector unsigned int sad; |
|
|
vector signed int sumdiffs; |
|
|
vector unsigned char perm_cur; |
|
|
vector unsigned char perm_ref1, perm_ref2; |
|
|
unsigned long result; |
|
126 |
|
|
127 |
ZERODEF; |
register vector unsigned int sad; |
128 |
|
register vector unsigned char c; |
129 |
|
register vector unsigned char r; |
130 |
|
|
131 |
#ifdef DEBUG |
/* initialize */ |
132 |
if (((unsigned long) cur) & 0x7) |
sad = vec_splat_u32(0); |
|
fprintf(stderr, "sad8_altivec:incorrect align, cur: %x\n", cur); |
|
|
// if (((unsigned long)ref) & 0x7) |
|
|
// fprintf(stderr, "sad8_altivec:incorrect align, ref: %x\n", ref); |
|
|
if (stride & 0xf) |
|
|
fprintf(stderr, "sad8_altivec:incorrect align, stride: %x\n", stride); |
|
|
#endif |
|
133 |
|
|
134 |
perm_cur = get_perm((((unsigned long) cur) >> 3) & 0x01); |
/* Perform sad operations */ |
135 |
perm_ref1 = vec_lvsl(0, (unsigned char *) ref); |
SAD8(); |
136 |
perm_ref2 = get_perm(0); |
SAD8(); |
137 |
|
SAD8(); |
138 |
/* initialization */ |
SAD8(); |
|
sad = (vector unsigned int) (ZEROVEC); |
|
|
stride >>= 4; |
|
139 |
|
|
|
/* perform sum of differences between current and previous */ |
|
140 |
SAD8(); |
SAD8(); |
141 |
SAD8(); |
SAD8(); |
142 |
SAD8(); |
SAD8(); |
143 |
SAD8(); |
SAD8(); |
144 |
|
|
145 |
/* sum all parts of difference into one 32 bit quantity */ |
/* finish addition, add the first 2 together */ |
146 |
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC); |
sad = vec_and(sad, (vector unsigned int)vec_pack(vec_splat_u16(-1),vec_splat_u16(0))); |
147 |
|
sad = (vector unsigned int)vec_sums((vector signed int)sad, vec_splat_s32(0)); |
148 |
|
sad = vec_splat(sad,3); |
149 |
|
vec_ste(sad, 0, &result); |
150 |
|
|
151 |
/* copy vector sum into unaligned result */ |
return result; |
|
sumdiffs = vec_splat(sumdiffs, 3); |
|
|
vec_ste(sumdiffs, 0, (int *) &result); |
|
|
return (result); |
|
152 |
} |
} |
153 |
|
|
154 |
#define MEAN16(i)\ |
|
155 |
c##i=*cur;\ |
|
156 |
mean = vec_sum4s(c##i,mean);\ |
|
157 |
cur += stride; |
#define MEAN16() \ |
158 |
|
mean = vec_sum4s(*ptr,mean);\ |
159 |
#define DEV16(i) \ |
ptr += stride |
160 |
t2 = vec_max(c##i, mn); /* find largest of two */ \ |
|
161 |
t3 = vec_min(c##i, mn); /* find smaller of two */ \ |
#define DEV16() \ |
162 |
t4 = vec_sub(t2, t3); /* find absolute difference */ \ |
t2 = vec_max(*ptr, mn); /* find largest of two */ \ |
163 |
dev = vec_sum4s(t4, dev); |
t3 = vec_min(*ptr, mn); /* find smaller of two */ \ |
164 |
|
t2 = vec_sub(t2, t3); /* find absolute difference */ \ |
165 |
unsigned long |
dev = vec_sum4s(t2, dev); \ |
166 |
dev16_altivec(const vector unsigned char *cur, |
ptr += stride |
167 |
unsigned long stride) |
|
168 |
|
/* |
169 |
|
* This function assumes cur is 16 bytes aligned and stride is 16 bytes |
170 |
|
* aligned |
171 |
|
*/ |
172 |
|
|
173 |
|
uint32_t |
174 |
|
dev16_altivec_c(vector unsigned char *cur, |
175 |
|
uint32_t stride) |
176 |
{ |
{ |
177 |
vector unsigned char t2, t3, t4, mn; |
vector unsigned char t2, t3, mn; |
178 |
vector unsigned int mean, dev; |
vector unsigned int mean, dev; |
179 |
vector signed int sumdiffs; |
vector unsigned int sumdiffs; |
180 |
vector unsigned char c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, |
vector unsigned char *ptr; |
181 |
c13, c14, c15; |
uint32_t result; |
|
unsigned long result; |
|
182 |
|
|
183 |
ZERODEF; |
#ifdef DEBUG |
184 |
|
/* print alignment errors if DEBUG is on */ |
185 |
|
if(((unsigned long)cur) & 0x7) |
186 |
|
fprintf(stderr, "dev16_altivec:incorrect align, cur: %lx\n", (long)cur); |
187 |
|
if(stride & 0xf) |
188 |
|
fprintf(stderr, "dev16_altivec:incorrect align, stride: %lu\n", stride); |
189 |
|
#endif |
190 |
|
|
191 |
mean = (vector unsigned int) (ZEROVEC); |
dev = mean = vec_splat_u32(0); |
|
dev = (vector unsigned int) (ZEROVEC); |
|
192 |
stride >>= 4; |
stride >>= 4; |
193 |
|
|
194 |
MEAN16(0); |
/* set pointer to iterate through cur */ |
195 |
MEAN16(1); |
ptr = cur; |
196 |
MEAN16(2); |
|
197 |
MEAN16(3); |
MEAN16(); |
198 |
MEAN16(4); |
MEAN16(); |
199 |
MEAN16(5); |
MEAN16(); |
200 |
MEAN16(6); |
MEAN16(); |
201 |
MEAN16(7); |
MEAN16(); |
202 |
MEAN16(8); |
MEAN16(); |
203 |
MEAN16(9); |
MEAN16(); |
204 |
MEAN16(10); |
MEAN16(); |
205 |
MEAN16(11); |
MEAN16(); |
206 |
MEAN16(12); |
MEAN16(); |
207 |
MEAN16(13); |
MEAN16(); |
208 |
MEAN16(14); |
MEAN16(); |
209 |
MEAN16(15); |
MEAN16(); |
210 |
|
MEAN16(); |
211 |
sumdiffs = vec_sums((vector signed int) mean, (vector signed int) ZEROVEC); |
MEAN16(); |
212 |
mn = vec_perm((vector unsigned char) sumdiffs, |
MEAN16(); |
213 |
(vector unsigned char) sumdiffs, (vector unsigned char) (14, |
|
214 |
14, |
/* Add all together in sumdiffs */ |
215 |
14, |
sumdiffs = (vector unsigned int)vec_sums((vector signed int) mean, vec_splat_s32(0)); |
216 |
14, |
/* teilen durch 16 * 16 */ |
217 |
14, |
mn = vec_perm((vector unsigned char)sumdiffs, (vector unsigned char)sumdiffs, vec_splat_u8(14)); |
218 |
14, |
|
219 |
14, |
/* set pointer to iterate through cur */ |
220 |
14, |
ptr = cur; |
221 |
14, |
|
222 |
14, |
DEV16(); |
223 |
14, |
DEV16(); |
224 |
14, |
DEV16(); |
225 |
14, |
DEV16(); |
226 |
14, |
DEV16(); |
227 |
14, |
DEV16(); |
228 |
14)); |
DEV16(); |
229 |
DEV16(0); |
DEV16(); |
230 |
DEV16(1); |
DEV16(); |
231 |
DEV16(2); |
DEV16(); |
232 |
DEV16(3); |
DEV16(); |
233 |
DEV16(4); |
DEV16(); |
234 |
DEV16(5); |
DEV16(); |
235 |
DEV16(6); |
DEV16(); |
236 |
DEV16(7); |
DEV16(); |
237 |
DEV16(8); |
DEV16(); |
|
DEV16(9); |
|
|
DEV16(10); |
|
|
DEV16(11); |
|
|
DEV16(12); |
|
|
DEV16(13); |
|
|
DEV16(14); |
|
|
DEV16(15); |
|
238 |
|
|
239 |
/* sum all parts of difference into one 32 bit quantity */ |
/* sum all parts of difference into one 32 bit quantity */ |
240 |
sumdiffs = vec_sums((vector signed int) dev, (vector signed int) ZEROVEC); |
sumdiffs = (vector unsigned int)vec_sums((vector signed int) dev, vec_splat_s32(0)); |
241 |
|
|
242 |
/* copy vector sum into unaligned result */ |
/* copy vector sum into unaligned result */ |
243 |
sumdiffs = vec_splat(sumdiffs, 3); |
sumdiffs = vec_splat(sumdiffs, 3); |
244 |
vec_ste(sumdiffs, 0, (int *) &result); |
vec_ste(sumdiffs, 0, (uint32_t*) &result); |
245 |
return (result); |
return result; |
246 |
|
} |
247 |
|
|
248 |
|
#define SAD16BI() \ |
249 |
|
t1 = vec_perm(ref1[0], ref1[1], mask1); \ |
250 |
|
t2 = vec_perm(ref2[0], ref2[1], mask2); \ |
251 |
|
t1 = vec_avg(t1, t2); \ |
252 |
|
t2 = vec_max(t1, *cur); \ |
253 |
|
t1 = vec_min(t1, *cur); \ |
254 |
|
sad = vec_sub(t2, t1); \ |
255 |
|
sum = vec_sum4s(sad, sum); \ |
256 |
|
cur += stride; \ |
257 |
|
ref1 += stride; \ |
258 |
|
ref2 += stride |
259 |
|
|
260 |
|
/* |
261 |
|
* This function assumes cur is 16 bytes aligned, stride is 16 bytes |
262 |
|
* aligned and ref1 and ref2 is unaligned |
263 |
|
*/ |
264 |
|
|
265 |
|
uint32_t |
266 |
|
sad16bi_altivec_c(vector unsigned char *cur, |
267 |
|
vector unsigned char *ref1, |
268 |
|
vector unsigned char *ref2, |
269 |
|
uint32_t stride) |
270 |
|
{ |
271 |
|
vector unsigned char t1, t2; |
272 |
|
vector unsigned char mask1, mask2; |
273 |
|
vector unsigned char sad; |
274 |
|
vector unsigned int sum; |
275 |
|
uint32_t result; |
276 |
|
|
277 |
|
#ifdef DEBUG |
278 |
|
/* print alignment errors if this is on */ |
279 |
|
if((long)cur & 0xf) |
280 |
|
fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lx\n", (long)cur); |
281 |
|
if(stride & 0xf) |
282 |
|
fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lu\n", stride); |
283 |
|
#endif |
284 |
|
|
285 |
|
/* Initialisation stuff */ |
286 |
|
stride >>= 4; |
287 |
|
mask1 = vec_lvsl(0, (unsigned char*)ref1); |
288 |
|
mask2 = vec_lvsl(0, (unsigned char*)ref2); |
289 |
|
sad = vec_splat_u8(0); |
290 |
|
sum = (vector unsigned int)sad; |
291 |
|
|
292 |
|
SAD16BI(); |
293 |
|
SAD16BI(); |
294 |
|
SAD16BI(); |
295 |
|
SAD16BI(); |
296 |
|
|
297 |
|
SAD16BI(); |
298 |
|
SAD16BI(); |
299 |
|
SAD16BI(); |
300 |
|
SAD16BI(); |
301 |
|
|
302 |
|
SAD16BI(); |
303 |
|
SAD16BI(); |
304 |
|
SAD16BI(); |
305 |
|
SAD16BI(); |
306 |
|
|
307 |
|
SAD16BI(); |
308 |
|
SAD16BI(); |
309 |
|
SAD16BI(); |
310 |
|
SAD16BI(); |
311 |
|
|
312 |
|
sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0)); |
313 |
|
sum = vec_splat(sum, 3); |
314 |
|
vec_ste(sum, 0, (uint32_t*)&result); |
315 |
|
|
316 |
|
return result; |
317 |
|
} |
318 |
|
|
319 |
|
|
320 |
|
#define SSE8_16BIT() \ |
321 |
|
b1_vec = vec_perm(vec_ld(0,b1), vec_ld(16,b1), vec_lvsl(0,b1)); \ |
322 |
|
b2_vec = vec_perm(vec_ld(0,b2), vec_ld(16,b2), vec_lvsl(0,b2)); \ |
323 |
|
diff = vec_sub(b1_vec,b2_vec); \ |
324 |
|
sum = vec_msum(diff,diff,sum); \ |
325 |
|
b1 = (const int16_t*)((int8_t*)b1+stride); \ |
326 |
|
b2 = (const int16_t*)((int8_t*)b2+stride) |
327 |
|
|
328 |
|
uint32_t |
329 |
|
sse8_16bit_altivec_c(const int16_t * b1, |
330 |
|
const int16_t * b2, |
331 |
|
const uint32_t stride) |
332 |
|
{ |
333 |
|
register vector signed short b1_vec; |
334 |
|
register vector signed short b2_vec; |
335 |
|
register vector signed short diff; |
336 |
|
register vector signed int sum; |
337 |
|
uint32_t result; |
338 |
|
|
339 |
|
/* initialize */ |
340 |
|
sum = vec_splat_s32(0); |
341 |
|
|
342 |
|
SSE8_16BIT(); |
343 |
|
SSE8_16BIT(); |
344 |
|
SSE8_16BIT(); |
345 |
|
SSE8_16BIT(); |
346 |
|
|
347 |
|
SSE8_16BIT(); |
348 |
|
SSE8_16BIT(); |
349 |
|
SSE8_16BIT(); |
350 |
|
SSE8_16BIT(); |
351 |
|
|
352 |
|
/* sum the vector */ |
353 |
|
sum = vec_sums(sum, vec_splat_s32(0)); |
354 |
|
sum = vec_splat(sum,3); |
355 |
|
|
356 |
|
vec_ste(sum,0,(int*)&result); |
357 |
|
|
358 |
|
/* and return */ |
359 |
|
return result; |
360 |
} |
} |