Annotation of /xvidcore/src/motion/ppc_asm/sad_altivec.c

Revision 1.10 - (view) (download)

1 :	edgomez	1.6	/*
2 :
3 :			Copyright (C) 2002 Benjamin Herrenschmidt <benh@kernel.crashing.org>
4 :
5 :			This program is free software; you can redistribute it and/or modify
6 :			it under the terms of the GNU General Public License as published by
7 :			the Free Software Foundation; either version 2 of the License, or
8 :			(at your option) any later version.
9 :
10 :			This program is distributed in the hope that it will be useful,
11 :			but WITHOUT ANY WARRANTY; without even the implied warranty of
12 :			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 :			GNU General Public License for more details.
14 :
15 :			You should have received a copy of the GNU General Public License
16 :			along with this program; if not, write to the Free Software
17 :			Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 :
19 :
20 :	edgomez	1.10	$Id$
21 :	edgomez	1.6	*/
22 :	canard	1.1
23 :	edgomez	1.9	#ifdef HAVE_ALTIVEC_H
24 :			#include <altivec.h>
25 :			#endif
26 :	canard	1.2
27 :
28 :	edgomez	1.9	#include "../../portab.h"
29 :	canard	1.1
30 :	edgomez	1.9	/* no debugging by default */
31 :	canard	1.1	#undef DEBUG
32 :
33 :	edgomez	1.9	#include <stdio.h>
34 :	canard	1.2
/*
 * SAD16: add the absolute differences of one 16-pixel row to the running sum.
 *
 * Expands inside sad16_altivec_c and uses that function's locals (cur, ref,
 * stride, perm, t1, t2, sad, sumdiffs, best_vec) and its "bail" label.
 * Jumps to bail, without advancing cur/ref, as soon as any element of
 * sumdiffs is >= the matching element of best_vec.
 */
#define SAD16() \
	do { \
		t1 = vec_perm(ref[0], ref[1], perm);    /* realign the unaligned reference row */ \
		t2 = vec_max(t1, *cur);                 /* find largest of two */ \
		t1 = vec_min(t1, *cur);                 /* find smaller of two */ \
		t1 = vec_sub(t2, t1);                   /* absolute difference = max - min */ \
		sad = vec_sum4s(t1, vec_splat_u32(0));  /* four partial sums for this row */ \
		sumdiffs = (vector unsigned int) vec_sums((vector signed int) sad, \
		                                          (vector signed int) sumdiffs); /* accumulate sumdiffs */ \
		if (vec_any_ge(sumdiffs, best_vec)) \
			goto bail; \
		cur += stride; \
		ref += stride; \
	} while (0)
45 :
46 :			/*
47 :			* This function assumes cur and stride are 16 bytes aligned and ref is unaligned
48 :			*/
49 :			unsigned long
50 :	edgomez	1.9	sad16_altivec_c(const vector unsigned char *cur,
51 :	edgomez	1.3	const vector unsigned char *ref,
52 :			unsigned long stride,
53 :			const unsigned long best_sad)
54 :	canard	1.1	{
55 :	edgomez	1.3	vector unsigned char perm;
56 :	edgomez	1.9	vector unsigned char t1, t2;
57 :	edgomez	1.3	vector unsigned int sad;
58 :	edgomez	1.9	vector unsigned int sumdiffs;
59 :			vector unsigned int best_vec;
60 :	edgomez	1.3	unsigned long result;
61 :
62 :	edgomez	1.9
63 :	canard	1.1	#ifdef DEBUG
64 :	edgomez	1.9	/* print alignment errors if DEBUG is on */
65 :	edgomez	1.3	if (((unsigned long) cur) & 0xf)
66 :			fprintf(stderr, "sad16_altivec:incorrect align, cur: %x\n", cur);
67 :			if (stride & 0xf)
68 :			fprintf(stderr, "sad16_altivec:incorrect align, stride: %x\n", stride);
69 :			#endif
70 :			/* initialization */
71 :	edgomez	1.9	sad = vec_splat_u32(0);
72 :			sumdiffs = sad;
73 :	edgomez	1.3	stride >>= 4;
74 :			perm = vec_lvsl(0, (unsigned char *) ref);
75 :			((unsigned long ) &best_vec) = best_sad;
76 :			best_vec = vec_splat(best_vec, 0);
77 :
78 :			/* perform sum of differences between current and previous */
79 :			SAD16();
80 :			SAD16();
81 :			SAD16();
82 :			SAD16();
83 :	edgomez	1.9
84 :	edgomez	1.3	SAD16();
85 :			SAD16();
86 :			SAD16();
87 :			SAD16();
88 :	edgomez	1.9
89 :	edgomez	1.3	SAD16();
90 :			SAD16();
91 :			SAD16();
92 :			SAD16();
93 :	edgomez	1.9
94 :	edgomez	1.3	SAD16();
95 :			SAD16();
96 :			SAD16();
97 :			SAD16();
98 :
99 :			bail:
100 :			/* copy vector sum into unaligned result */
101 :			sumdiffs = vec_splat(sumdiffs, 3);
102 :	edgomez	1.9	vec_ste(sumdiffs, 0, (unsigned long *) &result);
103 :			return result;
104 :	canard	1.1	}
105 :
106 :	edgomez	1.9
/*
 * SAD8: add the absolute differences of two 8-pixel rows to sad.
 *
 * Expands inside sad8_altivec_c and uses that function's locals (cur, ref,
 * stride, perm_cur, perm_ref1, perm_ref2, t1, t2, tp, sad). Two consecutive
 * rows are gathered into one vector on each side, so every expansion
 * advances cur and ref by two rows.
 */
#define SAD8() \
	do { \
		t1 = vec_perm(cur[0], cur[stride], perm_cur);            /* gather two rows of cur */ \
		t2 = vec_perm(ref[0], ref[1], perm_ref1);                /* realign first reference row */ \
		tp = vec_perm(ref[stride], ref[stride + 1], perm_ref1);  /* realign second reference row */ \
		t2 = vec_perm(t2, tp, perm_ref2);                        /* gather both reference rows */ \
		tp = vec_max(t1, t2);                                    /* find largest of two */ \
		t1 = vec_min(t1, t2);                                    /* find smaller of two */ \
		tp = vec_sub(tp, t1);                                    /* absolute difference = max - min */ \
		sad = vec_sum4s(tp, sad);                                /* accumulate sum of differences */ \
		cur += 2 * stride; \
		ref += 2 * stride; \
	} while (0)
117 :
118 :			/*
119 :			* This function assumes cur is 8 bytes aligned, stride is 16 bytes
120 :			* aligned and ref is unaligned
121 :			*/
122 :			unsigned long
123 :	edgomez	1.9	sad8_altivec_c(const vector unsigned char *cur,
124 :	edgomez	1.3	const vector unsigned char *ref,
125 :			unsigned long stride)
126 :	canard	1.1	{
127 :	edgomez	1.9	vector unsigned char t1, t2, tp;
128 :	edgomez	1.3	vector unsigned int sad;
129 :	edgomez	1.9	vector unsigned int sumdiffs;
130 :	edgomez	1.3	vector unsigned char perm_cur;
131 :			vector unsigned char perm_ref1, perm_ref2;
132 :			unsigned long result;
133 :
134 :	canard	1.1	#ifdef DEBUG
135 :	edgomez	1.9	/* print alignment errors if DEBUG is on */
136 :	edgomez	1.3	if (((unsigned long) cur) & 0x7)
137 :			fprintf(stderr, "sad8_altivec:incorrect align, cur: %x\n", cur);
138 :			if (stride & 0xf)
139 :			fprintf(stderr, "sad8_altivec:incorrect align, stride: %x\n", stride);
140 :			#endif
141 :	edgomez	1.9
142 :			/* check if cur is 8 or 16 bytes aligned an create the perm_cur vector */
143 :			perm_ref1 = vec_lvsl(0, (unsigned char*)ref);
144 :			perm_ref2 = vec_add(vec_lvsl(0, (unsigned char*)NULL), vec_pack(vec_splat_u16(0), vec_splat_u16(8)));
145 :			perm_cur = vec_add(perm_ref2, vec_splat(vec_lvsl(0, (unsigned char*)cur), 0));
146 :
147 :	edgomez	1.3	/* initialization */
148 :	edgomez	1.9	sad = vec_splat_u32(0);
149 :	edgomez	1.3	stride >>= 4;
150 :
151 :			/* perform sum of differences between current and previous */
152 :			SAD8();
153 :			SAD8();
154 :			SAD8();
155 :			SAD8();
156 :
157 :			/* sum all parts of difference into one 32 bit quantity */
158 :	edgomez	1.9	sumdiffs = (vector unsigned int)vec_sums((vector signed int) sad, vec_splat_s32(0));
159 :	edgomez	1.3
160 :			/* copy vector sum into unaligned result */
161 :			sumdiffs = vec_splat(sumdiffs, 3);
162 :	edgomez	1.9	vec_ste(sumdiffs, 0, (unsigned int *) &result);
163 :			return result;
164 :	canard	1.1	}
165 :
166 :
/*
 * MEAN16: add the 16 pixels of the row at ptr to the four partial sums in
 * mean, then step ptr to the next row. Uses the locals ptr, mean and stride
 * of the function it expands in.
 */
#define MEAN16() \
	do { \
		mean = vec_sum4s(ptr[0], mean); \
		ptr += stride; \
	} while (0)
170 :
/*
 * DEV16: add the absolute deviations |pixel - mn| of the row at ptr to the
 * four partial sums in dev, then step ptr to the next row. Uses the locals
 * ptr, mn, t2, t3, dev and stride of the function it expands in.
 */
#define DEV16() \
	do { \
		t2 = vec_max(*ptr, mn);    /* find largest of two */ \
		t3 = vec_min(*ptr, mn);    /* find smaller of two */ \
		t2 = vec_sub(t2, t3);      /* absolute difference = max - min */ \
		dev = vec_sum4s(t2, dev);  /* accumulate */ \
		ptr += stride; \
	} while (0)
177 :
178 :			/*
179 :			* This function assumes cur is 16 bytes aligned and stride is 16 bytes
180 :			* aligned
181 :			*/
182 :	canard	1.1	unsigned long
183 :	edgomez	1.9	dev16_altivec_c(const vector unsigned char *cur,
184 :	edgomez	1.3	unsigned long stride)
185 :	canard	1.1	{
186 :	edgomez	1.9	vector unsigned char t2, t3, mn;
187 :	edgomez	1.3	vector unsigned int mean, dev;
188 :	edgomez	1.9	vector unsigned int sumdiffs;
189 :			const vector unsigned char *ptr;
190 :	edgomez	1.3	unsigned long result;
191 :
192 :	edgomez	1.9	#ifdef DEBUG
193 :			/* print alignment errors if DEBUG is on */
194 :			if(((unsigned long)cur) & 0x7)
195 :			fprintf(stderr, "dev16_altivec:incorrect align, cur: %x\n", cur);
196 :			if(stride & 0xf)
197 :			fprintf(stderr, "dev16_altivec:incorrect align, stride: %ld\n", stride);
198 :			#endif
199 :	edgomez	1.3
200 :	edgomez	1.9	dev = mean = vec_splat_u32(0);
201 :	edgomez	1.3	stride >>= 4;
202 :	edgomez	1.9
203 :			/* set pointer to iterate through cur */
204 :			ptr = cur;
205 :
206 :			MEAN16();
207 :			MEAN16();
208 :			MEAN16();
209 :			MEAN16();
210 :			MEAN16();
211 :			MEAN16();
212 :			MEAN16();
213 :			MEAN16();
214 :			MEAN16();
215 :			MEAN16();
216 :			MEAN16();
217 :			MEAN16();
218 :			MEAN16();
219 :			MEAN16();
220 :			MEAN16();
221 :			MEAN16();
222 :
223 :			/* Add all together in sumdiffs */
224 :			sumdiffs = (vector unsigned int)vec_sums((vector signed int) mean, vec_splat_s32(0));
225 :			/* teilen durch 16 * 16 */
226 :			mn = vec_perm((vector unsigned char)sumdiffs, (vector unsigned char)sumdiffs, vec_splat_u8(14));
227 :
228 :			/* set pointer to iterate through cur */
229 :			ptr = cur;
230 :
231 :			DEV16();
232 :			DEV16();
233 :			DEV16();
234 :			DEV16();
235 :			DEV16();
236 :			DEV16();
237 :			DEV16();
238 :			DEV16();
239 :			DEV16();
240 :			DEV16();
241 :			DEV16();
242 :			DEV16();
243 :			DEV16();
244 :			DEV16();
245 :			DEV16();
246 :			DEV16();
247 :	edgomez	1.3
248 :			/* sum all parts of difference into one 32 bit quantity */
249 :	edgomez	1.9	sumdiffs = (vector unsigned int)vec_sums((vector signed int) dev, vec_splat_s32(0));
250 :	edgomez	1.3
251 :			/* copy vector sum into unaligned result */
252 :			sumdiffs = vec_splat(sumdiffs, 3);
253 :	edgomez	1.9	vec_ste(sumdiffs, 0, (unsigned int *) &result);
254 :			return result;
255 :			}
256 :
/*
 * SAD16BI: add, for one 16-pixel row, the absolute differences between cur
 * and the average of the two realigned reference rows to sum, then step all
 * three pointers to the next row. Uses the locals cur, ref1, ref2, stride,
 * mask1, mask2, t1, t2, sad and sum of the function it expands in.
 */
#define SAD16BI() \
	do { \
		t1 = vec_perm(ref1[0], ref1[1], mask1);  /* realign first reference row */ \
		t2 = vec_perm(ref2[0], ref2[1], mask2);  /* realign second reference row */ \
		t1 = vec_avg(t1, t2);                    /* average of the two references */ \
		t2 = vec_max(t1, cur[0]);                /* find largest of two */ \
		t1 = vec_min(t1, cur[0]);                /* find smaller of two */ \
		sad = vec_sub(t2, t1);                   /* absolute difference = max - min */ \
		sum = vec_sum4s(sad, sum);               /* accumulate */ \
		cur += stride; \
		ref1 += stride; \
		ref2 += stride; \
	} while (0)
268 :
269 :			/*
270 :			* This function assumes cur is 16 bytes aligned, stride is 16 bytes
271 :			* aligned and ref1 and ref2 is unaligned
272 :			*/
273 :			unsigned long
274 :			sad16bi_altivec_c(vector unsigned char *cur,
275 :			vector unsigned char *ref1,
276 :			vector unsigned char *ref2,
277 :			unsigned long stride)
278 :			{
279 :			vector unsigned char t1, t2;
280 :			vector unsigned char mask1, mask2;
281 :			vector unsigned char sad;
282 :			vector unsigned int sum;
283 :			unsigned long result;
284 :
285 :			#ifdef DEBUG
286 :			/* print alignment errors if this is on */
287 :			if(cur & 0xf)
288 :			fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %x\n", cur);
289 :			if(stride & 0xf)
290 :			fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %ld\n", stride);
291 :			#endif
292 :
293 :			/* Initialisation stuff */
294 :			stride >>= 4;
295 :			mask1 = vec_lvsl(0, (unsigned char*)ref1);
296 :			mask2 = vec_lvsl(0, (unsigned char*)ref2);
297 :			sad = vec_splat_u8(0);
298 :			sum = (vector unsigned int)sad;
299 :
300 :			SAD16BI();
301 :			SAD16BI();
302 :			SAD16BI();
303 :			SAD16BI();
304 :
305 :			SAD16BI();
306 :			SAD16BI();
307 :			SAD16BI();
308 :			SAD16BI();
309 :
310 :			SAD16BI();
311 :			SAD16BI();
312 :			SAD16BI();
313 :			SAD16BI();
314 :
315 :			SAD16BI();
316 :			SAD16BI();
317 :			SAD16BI();
318 :			SAD16BI();
319 :
320 :			sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0));
321 :			sum = vec_splat(sum, 3);
322 :			vec_ste(sum, 0, (unsigned int*)&result);
323 :
324 :			return result;
325 :			}
326 :
327 :
/*
 * SSE8_16BIT: add the squared differences of one row of eight 16-bit values
 * to sum, then step b1 and b2 to the next row. stride is applied as a byte
 * offset, hence the detour through a byte pointer. Uses the locals b1, b2,
 * stride, b1_vec, b2_vec, diff and sum of the function it expands in.
 */
#define SSE8_16BIT() \
	do { \
		b1_vec = vec_perm(vec_ld(0, b1), vec_ld(16, b1), vec_lvsl(0, b1));  /* unaligned load of row b1 */ \
		b2_vec = vec_perm(vec_ld(0, b2), vec_ld(16, b2), vec_lvsl(0, b2));  /* unaligned load of row b2 */ \
		diff = vec_sub(b1_vec, b2_vec); \
		sum = vec_msum(diff, diff, sum);  /* sum += diff * diff, pairwise into 32-bit lanes */ \
		b1 = (const int16_t *) ((const int8_t *) b1 + stride); \
		b2 = (const int16_t *) ((const int8_t *) b2 + stride); \
	} while (0)
335 :
336 :			uint32_t
337 :			sse8_16bit_altivec_c(const int16_t * b1,
338 :			const int16_t * b2,
339 :			const uint32_t stride)
340 :			{
341 :			register vector signed short b1_vec;
342 :			register vector signed short b2_vec;
343 :			register vector signed short diff;
344 :			register vector signed int sum;
345 :			uint32_t result;
346 :
347 :			/* initialize */
348 :			sum = vec_splat_s32(0);
349 :
350 :			SSE8_16BIT();
351 :			SSE8_16BIT();
352 :			SSE8_16BIT();
353 :			SSE8_16BIT();
354 :
355 :			SSE8_16BIT();
356 :			SSE8_16BIT();
357 :			SSE8_16BIT();
358 :			SSE8_16BIT();
359 :
360 :			/* sum the vector */
361 :			sum = vec_sums(sum, vec_splat_s32(0));
362 :			sum = vec_splat(sum,3);
363 :
364 :			vec_ste(sum,0,(int*)&result);
365 :
366 :			/* and return */
367 :			return result;
368 :	canard	1.1	}

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4