Parent Directory | Revision Log
Revision 1.7 - (view) (download)
1 : | ia64p | 1.3 | // ------------------------------------------------------------------------------ |
2 : | // * | ||
3 : | // * Optimized Assembler Versions of sad8 and sad16 | ||
4 : | // * | ||
5 : | // ------------------------------------------------------------------------------ | ||
6 : | // * | ||
7 : | // * Hannes Jütting and Christopher Özbek | ||
8 : | // * {s_juetti,s_oezbek}@ira.uka.de | ||
9 : | // * | ||
10 : | // * Programmed for the IA64 laboratory held at University Karlsruhe 2002 | ||
11 : | // * http://www.info.uni-karlsruhe.de/~rubino/ia64p/ | ||
12 : | // * | ||
13 : | // ------------------------------------------------------------------------------ | ||
14 : | // * | ||
15 : | // * These are the optimized assembler versions of sad8 and sad16, which calculate | ||
16 : | // * the sum of absolute differences between two 8x8/16x16 block matrices. | ||
17 : | // * | ||
18 : | // * Our approach uses: | ||
19 : | // * - The Itanium command psad1, which solves the problem in hardware. | ||
20 : | // * - Modulo-scheduled loops as the best way to unroll loops on the IA64 | ||
21 : | // * EPIC architecture | ||
22 : | // * - Alignment resolving to avoid memory faults | ||
23 : | // * | ||
24 : | // ------------------------------------------------------------------------------ | ||
25 : | edgomez | 1.7 | |
26 : | |||
27 : | |||
28 : | ia64p | 1.1 | |
// ------------------------------------------------------------------------------
// sad16bi_ia64
//   In:   r32, r33, r34 = byte pointers, r35 = row stride (only the low 32 bits
//         are used, see zxt4 below).
//   Out:  r8 = sum over 16 rows x 16 bytes of
//              | [r32] - (([r33] + [r34] + 1) >> 1) |
//   Uses: r2 (saved ar.lc), r14-r23, p6/p7, ar.lc (restored before returning).
//   NOTE(review): presumably r32 is the current block and r33/r34 are the two
//   reference blocks that get averaged -- confirm against the C prototype.
// ------------------------------------------------------------------------------
29 : | .common sad16bi#,8,8 | ||
30 : | .align 16 | ||
31 : | .global sad16bi_ia64# | ||
32 : | .proc sad16bi_ia64# | ||
33 : | sad16bi_ia64: | ||
34 : | .prologue | ||
35 : | .save ar.lc, r2 | ||
36 : | mov r2 = ar.lc | ||
37 : | .body | ||
// r35 = zero-extended stride, r8 = running sum, r23 = row counter,
// r22 = 255 (upper bound used by the clamp below)
38 : | zxt4 r35 = r35 | ||
39 : | mov r8 = r0 | ||
40 : | mov r23 = r0 | ||
41 : | addl r22 = 255, r0 | ||
// ---- outer loop: one pass per row -------------------------------------------
// ar.lc = 7 gives 8 inner iterations; each inner iteration handles 2 bytes.
// r19 walks the r32 row, r20 the r33 row, r21 the r34 row.
42 : | .L21: | ||
43 : | addl r14 = 7, r0 | ||
44 : | mov r19 = r32 | ||
45 : | mov r21 = r34 | ||
46 : | mov r20 = r33 | ||
47 : | ;; | ||
48 : | mov ar.lc = r14 | ||
49 : | ;; | ||
// ---- inner loop: first byte of the pair --------------------------------------
50 : | .L105: | ||
51 : | mov r17 = r20 | ||
52 : | mov r18 = r21 | ||
53 : | ;; | ||
// load one byte from each averaged source; r17/r18 then point at the next byte
54 : | ld1 r14 = [r17], 1 | ||
55 : | ld1 r15 = [r18], 1 | ||
56 : | ;; | ||
// r16 = (a + b + 1) >> 1  (rounded average)
57 : | add r14 = r14, r15 | ||
58 : | ;; | ||
59 : | adds r14 = 1, r14 | ||
60 : | ;; | ||
61 : | shr.u r16 = r14, 1 | ||
62 : | ;; | ||
// clamp r16 to [0, 255]: below zero -> 0, above r22 (255) -> 255
63 : | cmp4.le p6, p7 = r0, r16 | ||
64 : | ;; | ||
65 : | (p7) mov r16 = r0 | ||
66 : | (p7) br.cond.dpnt .L96 | ||
67 : | ;; | ||
68 : | cmp4.ge p6, p7 = r22, r16 | ||
69 : | ;; | ||
70 : | (p7) addl r16 = 255, r0 | ||
// r8 += | [r19] - r16 |; r20/r21 advance by the two bytes of this iteration
71 : | .L96: | ||
72 : | ld1 r14 = [r19] | ||
73 : | adds r20 = 2, r20 | ||
74 : | adds r21 = 2, r21 | ||
75 : | ;; | ||
76 : | sub r15 = r14, r16 | ||
77 : | ;; | ||
78 : | cmp4.ge p6, p7 = 0, r15 | ||
79 : | ;; | ||
// p6: difference <= 0, add (r16 - r14); p7: difference > 0, add it directly
80 : | (p6) sub r14 = r16, r14 | ||
81 : | (p7) add r8 = r8, r15 | ||
82 : | ;; | ||
83 : | (p6) add r8 = r8, r14 | ||
// ---- second byte of the pair (r17/r18 were post-incremented above) -----------
84 : | ld1 r15 = [r18] | ||
85 : | ld1 r14 = [r17] | ||
86 : | ;; | ||
87 : | add r14 = r14, r15 | ||
// r17 is reused as the address of the second byte in the r19 row
88 : | adds r17 = 1, r19 | ||
89 : | ;; | ||
90 : | adds r14 = 1, r14 | ||
91 : | ;; | ||
92 : | shr.u r16 = r14, 1 | ||
93 : | ;; | ||
// same clamp to [0, 255] as for the first byte
94 : | cmp4.le p6, p7 = r0, r16 | ||
95 : | ;; | ||
96 : | (p7) mov r16 = r0 | ||
97 : | (p7) br.cond.dpnt .L102 | ||
98 : | ;; | ||
99 : | cmp4.ge p6, p7 = r22, r16 | ||
100 : | ;; | ||
101 : | (p7) addl r16 = 255, r0 | ||
// r8 += | [r19 + 1] - r16 |; r19 advances by two bytes
102 : | .L102: | ||
103 : | ld1 r14 = [r17] | ||
104 : | adds r19 = 2, r19 | ||
105 : | ;; | ||
106 : | sub r15 = r14, r16 | ||
107 : | ;; | ||
108 : | cmp4.ge p6, p7 = 0, r15 | ||
109 : | ;; | ||
110 : | (p7) add r8 = r8, r15 | ||
111 : | (p6) sub r14 = r16, r14 | ||
112 : | ;; | ||
113 : | (p6) add r8 = r8, r14 | ||
114 : | br.cloop.sptk.few .L105 | ||
// ---- end of row: bump the row counter and step all three pointers by stride --
115 : | adds r23 = 1, r23 | ||
116 : | add r32 = r32, r35 | ||
117 : | add r33 = r33, r35 | ||
118 : | add r34 = r34, r35 | ||
119 : | ;; | ||
// continue while r23 <= 15, i.e. 16 rows in total
120 : | cmp4.geu p6, p7 = 15, r23 | ||
121 : | (p6) br.cond.dptk .L21 | ||
// restore the caller's loop counter and return the sum in r8
122 : | mov ar.lc = r2 | ||
123 : | br.ret.sptk.many b0 | ||
124 : | .endp sad16bi_ia64# | ||
125 : | |||
126 : | |||
127 : | ia64p | 1.2 | |
128 : | |||
129 : | |||
130 : | |||
131 : | |||
132 : | |||
// ------------------------------------------------------------------------------
// dev16_ia64
//   In:   in0 = address of the block, in1 = stride between rows.
//   Out:  ret0 = sum over all 256 bytes of | byte - mean |,
//         where mean = (sum of all 256 bytes) >> 8.
//   The block is read as 16 rows of two 8-byte words (32 x ld8) into c[0..31].
//   Pass 1 sums the bytes with psad1 against r0, pass 2 runs psad1 against the
//   mean broadcast into every byte of mean0.
//   NOTE(review): the loads are plain ld8 with no realignment, so in0 and the
//   stride are presumably expected to be 8-byte aligned -- confirm with callers.
// ------------------------------------------------------------------------------
133 : | .text | ||
134 : | ia64p | 1.1 | .align 16 |
135 : | .global dev16_ia64# | ||
136 : | .proc dev16_ia64# | ||
137 : | ia64p | 1.2 | .auto |
138 : | ia64p | 1.1 | dev16_ia64: |
139 : | ia64p | 1.2 | // renamings for better readability |
140 : | stride = r18 | ||
141 : | pfs = r19 //for saving previous function state | ||
142 : | cura0 = r20 //address of first 8-byte block of cur | ||
143 : | cura1 = r21 //address of second 8-byte block of cur | ||
144 : | mean0 = r22 //registers for calculating the sum in parallel | ||
145 : | mean1 = r23 | ||
146 : | mean2 = r24 | ||
147 : | mean3 = r25 | ||
148 : | dev0 = r26 //same for the deviation | ||
149 : | dev1 = r27 | ||
150 : | dev2 = r28 | ||
151 : | dev3 = r29 | ||
152 : | |||
153 : | ia64p | 1.1 | .body |
// 2 inputs + 38 locals = 40 registers, all 40 declared rotating so that
// .rotr below can name them as arrays
154 : | ia64p | 1.2 | alloc pfs = ar.pfs, 2, 38, 0, 40 |
155 : | |||
// copy the inputs out of the stacked registers before the loads into c[]
// overwrite them; cura1 addresses the second 8 bytes of each row
156 : | mov cura0 = in0 | ||
157 : | mov stride = in1 | ||
158 : | add cura1 = 8, cura0 | ||
159 : | |||
160 : | .rotr c[32], psad[8] // just using rotating registers to get an array ;-) | ||
161 : | |||
// ---- pass 1: load all 32 words and sum their bytes ---------------------------
// From here on the bundles and stops are written by hand (.explicit).
// psad1 x = c[i], r0 yields the sum of the 8 bytes in c[i]; the partial sums
// are accumulated into mean0..mean3, with psad[0..7] as temporaries.
162 : | .explicit | ||
163 : | {.mmi | ||
164 : | ld8 c[0] = [cura0], stride // load them ... | ||
165 : | ld8 c[1] = [cura1], stride | ||
166 : | ;; | ||
167 : | } | ||
168 : | {.mmi | ||
169 : | ld8 c[2] = [cura0], stride | ||
170 : | ld8 c[3] = [cura1], stride | ||
171 : | ;; | ||
172 : | } | ||
173 : | {.mmi | ||
174 : | ld8 c[4] = [cura0], stride | ||
175 : | ld8 c[5] = [cura1], stride | ||
176 : | ;; | ||
177 : | } | ||
178 : | {.mmi | ||
179 : | ld8 c[6] = [cura0], stride | ||
180 : | ld8 c[7] = [cura1], stride | ||
181 : | ;; | ||
182 : | } | ||
183 : | {.mmi | ||
184 : | ld8 c[8] = [cura0], stride | ||
185 : | ld8 c[9] = [cura1], stride | ||
186 : | ;; | ||
187 : | } | ||
188 : | {.mmi | ||
189 : | ld8 c[10] = [cura0], stride | ||
190 : | ld8 c[11] = [cura1], stride | ||
191 : | ;; | ||
192 : | } | ||
// the first four words seed mean0..mean3 directly
193 : | {.mii | ||
194 : | ld8 c[12] = [cura0], stride | ||
195 : | psad1 mean0 = c[0], r0 // get the sum of them ... | ||
196 : | psad1 mean1 = c[1], r0 | ||
197 : | } | ||
198 : | {.mmi | ||
199 : | ld8 c[13] = [cura1], stride | ||
200 : | ;; | ||
201 : | ld8 c[14] = [cura0], stride | ||
202 : | psad1 mean2 = c[2], r0 | ||
203 : | } | ||
204 : | {.mii | ||
205 : | ld8 c[15] = [cura1], stride | ||
206 : | psad1 mean3 = c[3], r0 | ||
207 : | ;; | ||
208 : | psad1 psad[0] = c[4], r0 | ||
209 : | } | ||
210 : | {.mmi | ||
211 : | ld8 c[16] = [cura0], stride | ||
212 : | ld8 c[17] = [cura1], stride | ||
213 : | psad1 psad[1] = c[5], r0 | ||
214 : | ;; | ||
215 : | } | ||
216 : | {.mii | ||
217 : | ld8 c[18] = [cura0], stride | ||
218 : | psad1 psad[2] = c[6], r0 | ||
219 : | psad1 psad[3] = c[7], r0 | ||
220 : | } | ||
221 : | {.mmi | ||
222 : | ld8 c[19] = [cura1], stride | ||
223 : | ;; | ||
224 : | ld8 c[20] = [cura0], stride | ||
225 : | psad1 psad[4] = c[8], r0 | ||
226 : | } | ||
227 : | {.mii | ||
228 : | ld8 c[21] = [cura1], stride | ||
229 : | psad1 psad[5] = c[9], r0 | ||
230 : | ;; | ||
231 : | add mean0 = mean0, psad[0] | ||
232 : | } | ||
233 : | {.mmi | ||
234 : | ld8 c[22] = [cura0], stride | ||
235 : | ld8 c[23] = [cura1], stride | ||
236 : | add mean1 = mean1, psad[1] | ||
237 : | ;; | ||
238 : | } | ||
239 : | {.mii | ||
240 : | ld8 c[24] = [cura0], stride | ||
241 : | psad1 psad[0] = c[10], r0 | ||
242 : | psad1 psad[1] = c[11], r0 | ||
243 : | } | ||
244 : | {.mmi | ||
245 : | ld8 c[25] = [cura1], stride | ||
246 : | ;; | ||
247 : | ld8 c[26] = [cura0], stride | ||
248 : | add mean2 = mean2, psad[2] | ||
249 : | } | ||
250 : | {.mii | ||
251 : | ld8 c[27] = [cura1], stride | ||
252 : | add mean3 = mean3, psad[3] | ||
253 : | ;; | ||
254 : | psad1 psad[2] = c[12], r0 | ||
255 : | } | ||
256 : | {.mmi | ||
257 : | ld8 c[28] = [cura0], stride | ||
258 : | ld8 c[29] = [cura1], stride | ||
259 : | psad1 psad[3] = c[13], r0 | ||
260 : | ;; | ||
261 : | } | ||
// last row: no post-increment needed
262 : | {.mii | ||
263 : | ld8 c[30] = [cura0] | ||
264 : | psad1 psad[6] = c[14], r0 | ||
265 : | psad1 psad[7] = c[15], r0 | ||
266 : | } | ||
267 : | {.mmi | ||
268 : | ld8 c[31] = [cura1] | ||
269 : | ;; | ||
270 : | add mean0 = mean0, psad[0] | ||
271 : | add mean1 = mean1, psad[1] | ||
272 : | } | ||
273 : | {.mii | ||
274 : | add mean2 = mean2, psad[4] | ||
275 : | add mean3 = mean3, psad[5] | ||
276 : | ;; | ||
277 : | psad1 psad[0] = c[16], r0 | ||
278 : | } | ||
279 : | {.mmi | ||
280 : | add mean0 = mean0, psad[2] | ||
281 : | add mean1 = mean1, psad[3] | ||
282 : | psad1 psad[1] = c[17], r0 | ||
283 : | ;; | ||
284 : | } | ||
285 : | {.mii | ||
286 : | add mean2 = mean2, psad[6] | ||
287 : | psad1 psad[2] = c[18], r0 | ||
288 : | psad1 psad[3] = c[19], r0 | ||
289 : | } | ||
290 : | {.mmi | ||
291 : | add mean3 = mean3, psad[7] | ||
292 : | ;; | ||
293 : | add mean0 = mean0, psad[0] | ||
294 : | psad1 psad[4] = c[20], r0 | ||
295 : | } | ||
296 : | {.mii | ||
297 : | add mean1 = mean1, psad[1] | ||
298 : | psad1 psad[5] = c[21], r0 | ||
299 : | ;; | ||
300 : | psad1 psad[6] = c[22], r0 | ||
301 : | } | ||
302 : | {.mmi | ||
303 : | add mean2 = mean2, psad[2] | ||
304 : | add mean3 = mean3, psad[3] | ||
305 : | psad1 psad[7] = c[23], r0 | ||
306 : | ;; | ||
307 : | } | ||
308 : | {.mii | ||
309 : | add mean0 = mean0, psad[4] | ||
310 : | psad1 psad[0] = c[24], r0 | ||
311 : | psad1 psad[1] = c[25], r0 | ||
312 : | } | ||
313 : | {.mmi | ||
314 : | add mean1 = mean1, psad[5] | ||
315 : | ;; | ||
316 : | add mean2 = mean2, psad[6] | ||
317 : | psad1 psad[2] = c[26], r0 | ||
318 : | } | ||
319 : | {.mii | ||
320 : | add mean3 = mean3, psad[7] | ||
321 : | psad1 psad[3] = c[27], r0 | ||
322 : | ;; | ||
323 : | psad1 psad[4] = c[28], r0 | ||
324 : | } | ||
325 : | {.mmi | ||
326 : | add mean0 = mean0, psad[0] | ||
327 : | add mean1 = mean1, psad[1] | ||
328 : | psad1 psad[5] = c[29], r0 | ||
329 : | ;; | ||
330 : | } | ||
331 : | {.mii | ||
332 : | add mean2 = mean2, psad[2] | ||
333 : | psad1 psad[6] = c[30], r0 | ||
334 : | psad1 psad[7] = c[31], r0 | ||
335 : | } | ||
336 : | {.mmi | ||
337 : | add mean3 = mean3, psad[3] | ||
338 : | ;; | ||
339 : | add mean0 = mean0, psad[4] | ||
340 : | add mean1 = mean1, psad[5] | ||
341 : | } | ||
// fold the four partial sums (plus the last two temporaries) into mean0
342 : | {.mbb | ||
343 : | add mean2 = mean2, mean3 | ||
344 : | nop.b 1 | ||
345 : | nop.b 1 | ||
346 : | ;; | ||
347 : | } | ||
348 : | {.mib | ||
349 : | add mean0 = mean0, psad[6] | ||
350 : | add mean1 = mean1, psad[7] | ||
351 : | nop.b 1 | ||
352 : | ;; | ||
353 : | } | ||
354 : | {.mib | ||
355 : | add mean0 = mean0, mean1 | ||
356 : | ia64p | 1.4 | // add mean2 = 127, mean2 // this could make our division more exactly, but does not help much |
357 : | ia64p | 1.2 | ;; |
358 : | } | ||
359 : | {.mib | ||
360 : | add mean0 = mean0, mean2 | ||
361 : | ;; | ||
362 : | } | ||
363 : | |||
// mean = total >> 8 (truncating division by the 256 bytes of the block)
364 : | {.mib | ||
365 : | shr.u mean0 = mean0, 8 // divide them ... | ||
366 : | ;; | ||
367 : | } | ||
// replicate the low byte of mean0 into all 8 byte lanes for psad1
368 : | {.mib | ||
369 : | mux1 mean0 = mean0, @brcst | ||
370 : | ;; | ||
371 : | } | ||
// ---- pass 2: sum of | byte - mean | over the same 32 words --------------------
// same accumulation scheme as pass 1, with dev0..dev3 as accumulators
372 : | {.mii | ||
373 : | nop.m 0 | ||
374 : | psad1 dev0 = c[0], mean0 // and do a sad again ... | ||
375 : | psad1 dev1 = c[1], mean0 | ||
376 : | } | ||
377 : | {.mii | ||
378 : | nop.m 0 | ||
379 : | psad1 dev2 = c[2], mean0 | ||
380 : | psad1 dev3 = c[3], mean0 | ||
381 : | } | ||
382 : | {.mii | ||
383 : | nop.m 0 | ||
384 : | psad1 psad[0] = c[4], mean0 | ||
385 : | psad1 psad[1] = c[5], mean0 | ||
386 : | } | ||
387 : | {.mii | ||
388 : | nop.m 0 | ||
389 : | psad1 psad[2] = c[6], mean0 | ||
390 : | psad1 psad[3] = c[7], mean0 | ||
391 : | } | ||
392 : | {.mii | ||
393 : | nop.m 0 | ||
394 : | psad1 psad[4] = c[8], mean0 | ||
395 : | psad1 psad[5] = c[9], mean0 | ||
396 : | ;; | ||
397 : | } | ||
398 : | {.mii | ||
399 : | add dev0 = dev0, psad[0] | ||
400 : | psad1 psad[6] = c[10], mean0 | ||
401 : | psad1 psad[7] = c[11], mean0 | ||
402 : | } | ||
403 : | {.mmi | ||
404 : | add dev1 = dev1, psad[1] | ||
405 : | |||
406 : | add dev2 = dev2, psad[2] | ||
407 : | psad1 psad[0] = c[12], mean0 | ||
408 : | } | ||
409 : | {.mii | ||
410 : | add dev3 = dev3, psad[3] | ||
411 : | psad1 psad[1] = c[13], mean0 | ||
412 : | ;; | ||
413 : | psad1 psad[2] = c[14], mean0 | ||
414 : | } | ||
415 : | {.mmi | ||
416 : | add dev0 = dev0, psad[4] | ||
417 : | add dev1 = dev1, psad[5] | ||
418 : | psad1 psad[3] = c[15], mean0 | ||
419 : | } | ||
420 : | {.mii | ||
421 : | add dev2 = dev2, psad[6] | ||
422 : | psad1 psad[4] = c[16], mean0 | ||
423 : | psad1 psad[5] = c[17], mean0 | ||
424 : | } | ||
425 : | {.mmi | ||
426 : | add dev3 = dev3, psad[7] | ||
427 : | ;; | ||
428 : | add dev0 = dev0, psad[0] | ||
429 : | psad1 psad[6] = c[18], mean0 | ||
430 : | } | ||
431 : | {.mii | ||
432 : | add dev1 = dev1, psad[1] | ||
433 : | psad1 psad[7] = c[19], mean0 | ||
434 : | |||
435 : | psad1 psad[0] = c[20], mean0 | ||
436 : | } | ||
437 : | {.mmi | ||
438 : | add dev2 = dev2, psad[2] | ||
439 : | add dev3 = dev3, psad[3] | ||
440 : | psad1 psad[1] = c[21], mean0 | ||
441 : | ;; | ||
442 : | } | ||
443 : | {.mii | ||
444 : | add dev0 = dev0, psad[4] | ||
445 : | psad1 psad[2] = c[22], mean0 | ||
446 : | psad1 psad[3] = c[23], mean0 | ||
447 : | } | ||
448 : | {.mmi | ||
449 : | add dev1 = dev1, psad[5] | ||
450 : | |||
451 : | add dev2 = dev2, psad[6] | ||
452 : | psad1 psad[4] = c[24], mean0 | ||
453 : | } | ||
454 : | {.mii | ||
455 : | add dev3 = dev3, psad[7] | ||
456 : | psad1 psad[5] = c[25], mean0 | ||
457 : | ;; | ||
458 : | psad1 psad[6] = c[26], mean0 | ||
459 : | } | ||
460 : | {.mmi | ||
461 : | add dev0 = dev0, psad[0] | ||
462 : | add dev1 = dev1, psad[1] | ||
463 : | psad1 psad[7] = c[27], mean0 | ||
464 : | } | ||
465 : | {.mii | ||
466 : | add dev2 = dev2, psad[2] | ||
467 : | psad1 psad[0] = c[28], mean0 | ||
468 : | psad1 psad[1] = c[29], mean0 | ||
469 : | } | ||
470 : | {.mmi | ||
471 : | add dev3 = dev3, psad[3] | ||
472 : | ;; | ||
473 : | add dev0 = dev0, psad[4] | ||
474 : | psad1 psad[2] = c[30], mean0 | ||
475 : | } | ||
476 : | {.mii | ||
477 : | add dev1 = dev1, psad[5] | ||
478 : | psad1 psad[3] = c[31], mean0 | ||
479 : | ;; | ||
480 : | add dev2 = dev2, psad[6] | ||
481 : | } | ||
482 : | {.mmi | ||
483 : | add dev3 = dev3, psad[7] | ||
484 : | add dev0 = dev0, psad[0] | ||
485 : | add dev1 = dev1, psad[1] | ||
486 : | ;; | ||
487 : | } | ||
// fold dev0..dev3 into the return register
488 : | {.mii | ||
489 : | add dev2 = dev2, psad[2] | ||
490 : | add dev3 = dev3, psad[3] | ||
491 : | add ret0 = dev0, dev1 | ||
492 : | ;; | ||
493 : | } | ||
494 : | {.mib | ||
495 : | add dev2 = dev2, dev3 | ||
496 : | nop.i 1 | ||
497 : | nop.b 1 | ||
498 : | ;; | ||
499 : | } | ||
500 : | {.mib | ||
501 : | add ret0 = ret0, dev2 | ||
502 : | nop.i 1 | ||
503 : | ia64p | 1.1 | br.ret.sptk.many b0 |
504 : | ia64p | 1.2 | } |
505 : | ia64p | 1.1 | .endp dev16_ia64# |
506 : | ia64p | 1.5 | |
507 : | |||
508 : | // ########################################################### | ||
509 : | // ########################################################### | ||
510 : | // New version by group 01 ################################## | ||
511 : | // ########################################################### | ||
512 : | // ########################################################### | ||
513 : | |||
514 : | |||
515 : | |||
// ------------------------------------------------------------------------------
// sad16_ia64
//   In:   r32 = cur, r33 = ref, r34 = stride. The fourth input (r35) is never
//         read; it is overwritten with a load address.
//   Out:  r8 = sum of absolute byte differences over 16 rows x 16 bytes.
//   cur is read directly with ld8. ref is rounded down to an 8-byte boundary,
//   up to three 8-byte words per row are loaded, and shrp shifts the wanted
//   16 bytes into place according to (ref mod 8).
//   NOTE(review): cur is loaded with plain ld8, so it is presumably expected
//   to be 8-byte aligned -- confirm with callers.
//   ar.pfs is saved in the scratch register r3. It must not be kept in r1:
//   r1 is the global pointer (gp), which has to be valid at procedure return.
//   Predicates p16-p29 are written, so pr is saved in r2 and restored on exit.
// ------------------------------------------------------------------------------
516 : | .text | ||
517 : | .align 16 | ||
518 : | .global sad16_ia64# | ||
519 : | .proc sad16_ia64# | ||
520 : | sad16_ia64: | ||
521 : | alloc r3 = ar.pfs, 4, 76, 0, 0 | ||
522 : | mov r2 = pr | ||
523 : | dep r14 = r0, r33, 0, 3 // r14 = (r33 div 8)*8 (aligned version of ref) | ||
524 : | dep.z r31 = r33, 0, 3 // r31 = r33 mod 8 (misalignment of ref) | ||
525 : | ;; | ||
526 : | mov r64 = r34 //(1) calculate multiples of stride | ||
527 : | shl r65 = r34, 1 //(2) for being able to load all the | ||
528 : | shladd r66 = r34, 1, r34 //(3) data at once | ||
529 : | shl r67 = r34, 2 //(4) | ||
530 : | shladd r68 = r34, 2, r34 //(5) | ||
531 : | shl r71 = r34, 3 //(8) | ||
532 : | shladd r72 = r34, 3, r34 //(9) | ||
533 : | ;; | ||
534 : | shl r69 = r66, 1 //(6) | ||
535 : | shladd r70 = r66, 1, r34 //(7) | ||
536 : | shl r73 = r68, 1 //(10) | ||
537 : | shladd r74 = r68, 1, r34 //(11) | ||
538 : | shl r75 = r66, 2 //(12) | ||
539 : | shladd r76 = r66, 2, r34 //(13) | ||
540 : | shladd r77 = r66, 2, r65 //(14) | ||
541 : | shladd r78 = r66, 2, r66 //(15) | ||
542 : | ;; | ||
543 : | cmp.eq p16, p17 = 0, r31 // prepare predicates according to the misalignment | ||
544 : | cmp.eq p18, p19 = 2, r31 // ref | ||
545 : | cmp.eq p20, p21 = 4, r31 | ||
546 : | cmp.eq p22, p23 = 6, r31 | ||
547 : | cmp.eq p24, p25 = 1, r31 | ||
548 : | cmp.eq p26, p27 = 3, r31 | ||
549 : | cmp.eq p28, p29 = 5, r31 | ||
// odd registers r33..r63 hold the cur row addresses, r96..r111 the aligned
// ref row addresses
550 : | mov r96 = r14 // and calculate all the addresses where we have | ||
551 : | mov r33 = r32 // to load from | ||
552 : | add r97 = r14, r64 | ||
553 : | add r35 = r32, r64 | ||
554 : | add r98 = r14, r65 | ||
555 : | add r37 = r32, r65 | ||
556 : | add r99 = r14, r66 | ||
557 : | add r39 = r32, r66 | ||
558 : | add r100 = r14, r67 | ||
559 : | add r41 = r32, r67 | ||
560 : | add r101 = r14, r68 | ||
561 : | add r43 = r32, r68 | ||
562 : | add r102 = r14, r69 | ||
563 : | add r45 = r32, r69 | ||
564 : | add r103 = r14, r70 | ||
565 : | add r47 = r32, r70 | ||
566 : | add r104 = r14, r71 | ||
567 : | add r49 = r32, r71 | ||
568 : | add r105 = r14, r72 | ||
569 : | add r51 = r32, r72 | ||
570 : | add r106 = r14, r73 | ||
571 : | add r53 = r32, r73 | ||
572 : | add r107 = r14, r74 | ||
573 : | add r55 = r32, r74 | ||
574 : | add r108 = r14, r75 | ||
575 : | add r57 = r32, r75 | ||
576 : | add r109 = r14, r76 | ||
577 : | add r59 = r32, r76 | ||
578 : | add r110 = r14, r77 | ||
579 : | add r61 = r32, r77 | ||
580 : | add r111 = r14, r78 | ||
581 : | add r63 = r32, r78 | ||
582 : | ;; | ||
583 : | ld8 r32 = [r33], 8 // Load all the data which is needed for the sad | ||
584 : | ld8 r34 = [r35], 8 // in the registers. the goal is to have the array | ||
585 : | ld8 r36 = [r37], 8 // addressed by cur in the registers r32 - r63 and | ||
586 : | ld8 r38 = [r39], 8 // the array addressed by ref in the registers | ||
587 : | ld8 r40 = [r41], 8 // r64 - r95. The registers r96 - r111 are needed | ||
588 : | ld8 r42 = [r43], 8 // to load the aligned 24 bytes in which the | ||
589 : | ld8 r44 = [r45], 8 // needed misaligned 16 bytes must be. | ||
590 : | ld8 r46 = [r47], 8 // After loading we start a preprocessing which | ||
591 : | ld8 r48 = [r49], 8 // guarantees that the data addressed by ref is in | ||
592 : | ld8 r50 = [r51], 8 // the registers r64 - r95. | ||
593 : | ld8 r52 = [r53], 8 | ||
594 : | ld8 r54 = [r55], 8 | ||
595 : | ld8 r56 = [r57], 8 | ||
596 : | ld8 r58 = [r59], 8 | ||
597 : | ld8 r60 = [r61], 8 | ||
598 : | ld8 r62 = [r63], 8 | ||
599 : | ld8 r64 = [r96], 8 | ||
600 : | ld8 r66 = [r97], 8 | ||
601 : | ld8 r68 = [r98], 8 | ||
602 : | ld8 r70 = [r99], 8 | ||
603 : | ld8 r72 = [r100], 8 | ||
604 : | ld8 r74 = [r101], 8 | ||
605 : | ld8 r76 = [r102], 8 | ||
606 : | ld8 r78 = [r103], 8 | ||
607 : | ld8 r80 = [r104], 8 | ||
608 : | ld8 r82 = [r105], 8 | ||
609 : | ld8 r84 = [r106], 8 | ||
610 : | ld8 r86 = [r107], 8 | ||
611 : | ld8 r88 = [r108], 8 | ||
612 : | ld8 r90 = [r109], 8 | ||
613 : | ld8 r92 = [r110], 8 | ||
614 : | ld8 r94 = [r111], 8 | ||
615 : | ;; | ||
// second 8 bytes of every cur row and of every aligned ref row
616 : | ld8 r33 = [r33] | ||
617 : | ld8 r35 = [r35] | ||
618 : | ld8 r37 = [r37] | ||
619 : | ld8 r39 = [r39] | ||
620 : | ld8 r41 = [r41] | ||
621 : | ld8 r43 = [r43] | ||
622 : | ld8 r45 = [r45] | ||
623 : | ld8 r47 = [r47] | ||
624 : | ld8 r49 = [r49] | ||
625 : | ld8 r51 = [r51] | ||
626 : | ld8 r53 = [r53] | ||
627 : | ld8 r55 = [r55] | ||
628 : | ld8 r57 = [r57] | ||
629 : | ld8 r59 = [r59] | ||
630 : | ld8 r61 = [r61] | ||
631 : | ld8 r63 = [r63] | ||
632 : | ld8 r65 = [r96], 8 | ||
633 : | ld8 r67 = [r97], 8 | ||
634 : | ld8 r69 = [r98], 8 | ||
635 : | ld8 r71 = [r99], 8 | ||
636 : | ld8 r73 = [r100], 8 | ||
637 : | ld8 r75 = [r101], 8 | ||
638 : | ld8 r77 = [r102], 8 | ||
639 : | ld8 r79 = [r103], 8 | ||
640 : | ld8 r81 = [r104], 8 | ||
641 : | ld8 r83 = [r105], 8 | ||
642 : | ld8 r85 = [r106], 8 | ||
643 : | ld8 r87 = [r107], 8 | ||
644 : | ld8 r89 = [r108], 8 | ||
645 : | ld8 r91 = [r109], 8 | ||
646 : | ld8 r93 = [r110], 8 | ||
647 : | ld8 r95 = [r111], 8 | ||
648 : | (p16) br.cond.dptk.many .Lber // If ref is aligned, everything is loaded and we can start the calculation | ||
649 : | ;; | ||
// third 8 bytes of every ref row, needed only when ref is misaligned
650 : | ld8 r96 = [r96] // If not, we have to load a bit more | ||
651 : | ld8 r97 = [r97] | ||
652 : | ld8 r98 = [r98] | ||
653 : | ld8 r99 = [r99] | ||
654 : | ld8 r100 = [r100] | ||
655 : | ld8 r101 = [r101] | ||
656 : | ld8 r102 = [r102] | ||
657 : | ld8 r103 = [r103] | ||
658 : | ld8 r104 = [r104] | ||
659 : | ld8 r105 = [r105] | ||
660 : | ld8 r106 = [r106] | ||
661 : | ld8 r107 = [r107] | ||
662 : | ld8 r108 = [r108] | ||
663 : | ld8 r109 = [r109] | ||
664 : | ld8 r110 = [r110] | ||
665 : | ld8 r111 = [r111] | ||
666 : | (p24) br.cond.dptk.many .Lmod1 // according to the misalignment, we have | ||
667 : | (p18) br.cond.dpnt.many .Lmod2 // to jump to different preprocessing routines | ||
668 : | (p26) br.cond.dpnt.many .Lmod3 | ||
669 : | (p20) br.cond.dpnt.many .Lmod4 | ||
670 : | (p28) br.cond.dpnt.many .Lmod5 | ||
671 : | (p22) br.cond.dpnt.many .Lmod6 | ||
672 : | ;; | ||
// Each .LmodN block shifts every ref row right by 8*N bits across its three
// loaded words, so the row's 16 wanted bytes end up in the even/odd register
// pair. Misalignment 7 is reached by falling through the branches above.
673 : | .Lmod7: // this jump point is not needed | ||
674 : | shrp r64 = r65, r64, 56 // in these blocks, we do the preprocessing | ||
675 : | shrp r65 = r96, r65, 56 | ||
676 : | shrp r66 = r67, r66, 56 | ||
677 : | shrp r67 = r97, r67, 56 | ||
678 : | shrp r68 = r69, r68, 56 | ||
679 : | shrp r69 = r98, r69, 56 | ||
680 : | shrp r70 = r71, r70, 56 | ||
681 : | shrp r71 = r99, r71, 56 | ||
682 : | shrp r72 = r73, r72, 56 | ||
683 : | shrp r73 = r100, r73, 56 | ||
684 : | shrp r74 = r75, r74, 56 | ||
685 : | shrp r75 = r101, r75, 56 | ||
686 : | shrp r76 = r77, r76, 56 | ||
687 : | shrp r77 = r102, r77, 56 | ||
688 : | shrp r78 = r79, r78, 56 | ||
689 : | shrp r79 = r103, r79, 56 | ||
690 : | shrp r80 = r81, r80, 56 | ||
691 : | shrp r81 = r104, r81, 56 | ||
692 : | shrp r82 = r83, r82, 56 | ||
693 : | shrp r83 = r105, r83, 56 | ||
694 : | shrp r84 = r85, r84, 56 | ||
695 : | shrp r85 = r106, r85, 56 | ||
696 : | shrp r86 = r87, r86, 56 | ||
697 : | shrp r87 = r107, r87, 56 | ||
698 : | shrp r88 = r89, r88, 56 | ||
699 : | shrp r89 = r108, r89, 56 | ||
700 : | shrp r90 = r91, r90, 56 | ||
701 : | shrp r91 = r109, r91, 56 | ||
702 : | shrp r92 = r93, r92, 56 | ||
703 : | shrp r93 = r110, r93, 56 | ||
704 : | shrp r94 = r95, r94, 56 | ||
705 : | shrp r95 = r111, r95, 56 | ||
706 : | br.cond.sptk.many .Lber // and then we jump to the calculation | ||
707 : | ;; | ||
708 : | .Lmod6: | ||
709 : | shrp r64 = r65, r64, 48 | ||
710 : | shrp r65 = r96, r65, 48 | ||
711 : | shrp r66 = r67, r66, 48 | ||
712 : | shrp r67 = r97, r67, 48 | ||
713 : | shrp r68 = r69, r68, 48 | ||
714 : | shrp r69 = r98, r69, 48 | ||
715 : | shrp r70 = r71, r70, 48 | ||
716 : | shrp r71 = r99, r71, 48 | ||
717 : | shrp r72 = r73, r72, 48 | ||
718 : | shrp r73 = r100, r73, 48 | ||
719 : | shrp r74 = r75, r74, 48 | ||
720 : | shrp r75 = r101, r75, 48 | ||
721 : | shrp r76 = r77, r76, 48 | ||
722 : | shrp r77 = r102, r77, 48 | ||
723 : | shrp r78 = r79, r78, 48 | ||
724 : | shrp r79 = r103, r79, 48 | ||
725 : | shrp r80 = r81, r80, 48 | ||
726 : | shrp r81 = r104, r81, 48 | ||
727 : | shrp r82 = r83, r82, 48 | ||
728 : | shrp r83 = r105, r83, 48 | ||
729 : | shrp r84 = r85, r84, 48 | ||
730 : | shrp r85 = r106, r85, 48 | ||
731 : | shrp r86 = r87, r86, 48 | ||
732 : | shrp r87 = r107, r87, 48 | ||
733 : | shrp r88 = r89, r88, 48 | ||
734 : | shrp r89 = r108, r89, 48 | ||
735 : | shrp r90 = r91, r90, 48 | ||
736 : | shrp r91 = r109, r91, 48 | ||
737 : | shrp r92 = r93, r92, 48 | ||
738 : | shrp r93 = r110, r93, 48 | ||
739 : | shrp r94 = r95, r94, 48 | ||
740 : | shrp r95 = r111, r95, 48 | ||
741 : | br.cond.sptk.many .Lber | ||
742 : | ;; | ||
743 : | .Lmod5: | ||
744 : | shrp r64 = r65, r64, 40 | ||
745 : | shrp r65 = r96, r65, 40 | ||
746 : | shrp r66 = r67, r66, 40 | ||
747 : | shrp r67 = r97, r67, 40 | ||
748 : | shrp r68 = r69, r68, 40 | ||
749 : | shrp r69 = r98, r69, 40 | ||
750 : | shrp r70 = r71, r70, 40 | ||
751 : | shrp r71 = r99, r71, 40 | ||
752 : | shrp r72 = r73, r72, 40 | ||
753 : | shrp r73 = r100, r73, 40 | ||
754 : | shrp r74 = r75, r74, 40 | ||
755 : | shrp r75 = r101, r75, 40 | ||
756 : | shrp r76 = r77, r76, 40 | ||
757 : | shrp r77 = r102, r77, 40 | ||
758 : | shrp r78 = r79, r78, 40 | ||
759 : | shrp r79 = r103, r79, 40 | ||
760 : | shrp r80 = r81, r80, 40 | ||
761 : | shrp r81 = r104, r81, 40 | ||
762 : | shrp r82 = r83, r82, 40 | ||
763 : | shrp r83 = r105, r83, 40 | ||
764 : | shrp r84 = r85, r84, 40 | ||
765 : | shrp r85 = r106, r85, 40 | ||
766 : | shrp r86 = r87, r86, 40 | ||
767 : | shrp r87 = r107, r87, 40 | ||
768 : | shrp r88 = r89, r88, 40 | ||
769 : | shrp r89 = r108, r89, 40 | ||
770 : | shrp r90 = r91, r90, 40 | ||
771 : | shrp r91 = r109, r91, 40 | ||
772 : | shrp r92 = r93, r92, 40 | ||
773 : | shrp r93 = r110, r93, 40 | ||
774 : | shrp r94 = r95, r94, 40 | ||
775 : | shrp r95 = r111, r95, 40 | ||
776 : | br.cond.sptk.many .Lber | ||
777 : | ;; | ||
778 : | .Lmod4: | ||
779 : | shrp r64 = r65, r64, 32 | ||
780 : | shrp r65 = r96, r65, 32 | ||
781 : | shrp r66 = r67, r66, 32 | ||
782 : | shrp r67 = r97, r67, 32 | ||
783 : | shrp r68 = r69, r68, 32 | ||
784 : | shrp r69 = r98, r69, 32 | ||
785 : | shrp r70 = r71, r70, 32 | ||
786 : | shrp r71 = r99, r71, 32 | ||
787 : | shrp r72 = r73, r72, 32 | ||
788 : | shrp r73 = r100, r73, 32 | ||
789 : | shrp r74 = r75, r74, 32 | ||
790 : | shrp r75 = r101, r75, 32 | ||
791 : | shrp r76 = r77, r76, 32 | ||
792 : | shrp r77 = r102, r77, 32 | ||
793 : | shrp r78 = r79, r78, 32 | ||
794 : | shrp r79 = r103, r79, 32 | ||
795 : | shrp r80 = r81, r80, 32 | ||
796 : | shrp r81 = r104, r81, 32 | ||
797 : | shrp r82 = r83, r82, 32 | ||
798 : | shrp r83 = r105, r83, 32 | ||
799 : | shrp r84 = r85, r84, 32 | ||
800 : | shrp r85 = r106, r85, 32 | ||
801 : | shrp r86 = r87, r86, 32 | ||
802 : | shrp r87 = r107, r87, 32 | ||
803 : | shrp r88 = r89, r88, 32 | ||
804 : | shrp r89 = r108, r89, 32 | ||
805 : | shrp r90 = r91, r90, 32 | ||
806 : | shrp r91 = r109, r91, 32 | ||
807 : | shrp r92 = r93, r92, 32 | ||
808 : | shrp r93 = r110, r93, 32 | ||
809 : | shrp r94 = r95, r94, 32 | ||
810 : | shrp r95 = r111, r95, 32 | ||
811 : | br.cond.sptk.many .Lber | ||
812 : | ;; | ||
813 : | .Lmod3: | ||
814 : | shrp r64 = r65, r64, 24 | ||
815 : | shrp r65 = r96, r65, 24 | ||
816 : | shrp r66 = r67, r66, 24 | ||
817 : | shrp r67 = r97, r67, 24 | ||
818 : | shrp r68 = r69, r68, 24 | ||
819 : | shrp r69 = r98, r69, 24 | ||
820 : | shrp r70 = r71, r70, 24 | ||
821 : | shrp r71 = r99, r71, 24 | ||
822 : | shrp r72 = r73, r72, 24 | ||
823 : | shrp r73 = r100, r73, 24 | ||
824 : | shrp r74 = r75, r74, 24 | ||
825 : | shrp r75 = r101, r75, 24 | ||
826 : | shrp r76 = r77, r76, 24 | ||
827 : | shrp r77 = r102, r77, 24 | ||
828 : | shrp r78 = r79, r78, 24 | ||
829 : | shrp r79 = r103, r79, 24 | ||
830 : | shrp r80 = r81, r80, 24 | ||
831 : | shrp r81 = r104, r81, 24 | ||
832 : | shrp r82 = r83, r82, 24 | ||
833 : | shrp r83 = r105, r83, 24 | ||
834 : | shrp r84 = r85, r84, 24 | ||
835 : | shrp r85 = r106, r85, 24 | ||
836 : | shrp r86 = r87, r86, 24 | ||
837 : | shrp r87 = r107, r87, 24 | ||
838 : | shrp r88 = r89, r88, 24 | ||
839 : | shrp r89 = r108, r89, 24 | ||
840 : | shrp r90 = r91, r90, 24 | ||
841 : | shrp r91 = r109, r91, 24 | ||
842 : | shrp r92 = r93, r92, 24 | ||
843 : | shrp r93 = r110, r93, 24 | ||
844 : | shrp r94 = r95, r94, 24 | ||
845 : | shrp r95 = r111, r95, 24 | ||
846 : | br.cond.sptk.many .Lber | ||
847 : | ;; | ||
848 : | .Lmod2: | ||
849 : | shrp r64 = r65, r64, 16 | ||
850 : | shrp r65 = r96, r65, 16 | ||
851 : | shrp r66 = r67, r66, 16 | ||
852 : | shrp r67 = r97, r67, 16 | ||
853 : | shrp r68 = r69, r68, 16 | ||
854 : | shrp r69 = r98, r69, 16 | ||
855 : | shrp r70 = r71, r70, 16 | ||
856 : | shrp r71 = r99, r71, 16 | ||
857 : | shrp r72 = r73, r72, 16 | ||
858 : | shrp r73 = r100, r73, 16 | ||
859 : | shrp r74 = r75, r74, 16 | ||
860 : | shrp r75 = r101, r75, 16 | ||
861 : | shrp r76 = r77, r76, 16 | ||
862 : | shrp r77 = r102, r77, 16 | ||
863 : | shrp r78 = r79, r78, 16 | ||
864 : | shrp r79 = r103, r79, 16 | ||
865 : | shrp r80 = r81, r80, 16 | ||
866 : | shrp r81 = r104, r81, 16 | ||
867 : | shrp r82 = r83, r82, 16 | ||
868 : | shrp r83 = r105, r83, 16 | ||
869 : | shrp r84 = r85, r84, 16 | ||
870 : | shrp r85 = r106, r85, 16 | ||
871 : | shrp r86 = r87, r86, 16 | ||
872 : | shrp r87 = r107, r87, 16 | ||
873 : | shrp r88 = r89, r88, 16 | ||
874 : | shrp r89 = r108, r89, 16 | ||
875 : | shrp r90 = r91, r90, 16 | ||
876 : | shrp r91 = r109, r91, 16 | ||
877 : | shrp r92 = r93, r92, 16 | ||
878 : | shrp r93 = r110, r93, 16 | ||
879 : | shrp r94 = r95, r94, 16 | ||
880 : | shrp r95 = r111, r95, 16 | ||
881 : | br.cond.sptk.many .Lber | ||
882 : | ;; | ||
// .Lmod1 needs no branch: it falls straight through into .Lber
883 : | .Lmod1: | ||
884 : | shrp r64 = r65, r64, 8 | ||
885 : | shrp r65 = r96, r65, 8 | ||
886 : | shrp r66 = r67, r66, 8 | ||
887 : | shrp r67 = r97, r67, 8 | ||
888 : | shrp r68 = r69, r68, 8 | ||
889 : | shrp r69 = r98, r69, 8 | ||
890 : | shrp r70 = r71, r70, 8 | ||
891 : | shrp r71 = r99, r71, 8 | ||
892 : | shrp r72 = r73, r72, 8 | ||
893 : | shrp r73 = r100, r73, 8 | ||
894 : | shrp r74 = r75, r74, 8 | ||
895 : | shrp r75 = r101, r75, 8 | ||
896 : | shrp r76 = r77, r76, 8 | ||
897 : | shrp r77 = r102, r77, 8 | ||
898 : | shrp r78 = r79, r78, 8 | ||
899 : | shrp r79 = r103, r79, 8 | ||
900 : | shrp r80 = r81, r80, 8 | ||
901 : | shrp r81 = r104, r81, 8 | ||
902 : | shrp r82 = r83, r82, 8 | ||
903 : | shrp r83 = r105, r83, 8 | ||
904 : | shrp r84 = r85, r84, 8 | ||
905 : | shrp r85 = r106, r85, 8 | ||
906 : | shrp r86 = r87, r86, 8 | ||
907 : | shrp r87 = r107, r87, 8 | ||
908 : | shrp r88 = r89, r88, 8 | ||
909 : | shrp r89 = r108, r89, 8 | ||
910 : | shrp r90 = r91, r90, 8 | ||
911 : | shrp r91 = r109, r91, 8 | ||
912 : | shrp r92 = r93, r92, 8 | ||
913 : | shrp r93 = r110, r93, 8 | ||
914 : | shrp r94 = r95, r94, 8 | ||
915 : | shrp r95 = r111, r95, 8 | ||
916 : | .Lber: | ||
917 : | ;; | ||
918 : | psad1 r32 = r32, r64 // Here we do the calculation. | ||
919 : | psad1 r33 = r33, r65 // The machine is providing a fast method | ||
920 : | psad1 r34 = r34, r66 // for calculating sad, so we use it | ||
921 : | psad1 r35 = r35, r67 | ||
922 : | psad1 r36 = r36, r68 | ||
923 : | psad1 r37 = r37, r69 | ||
924 : | psad1 r38 = r38, r70 | ||
925 : | psad1 r39 = r39, r71 | ||
926 : | psad1 r40 = r40, r72 | ||
927 : | psad1 r41 = r41, r73 | ||
928 : | psad1 r42 = r42, r74 | ||
929 : | psad1 r43 = r43, r75 | ||
930 : | psad1 r44 = r44, r76 | ||
931 : | psad1 r45 = r45, r77 | ||
932 : | psad1 r46 = r46, r78 | ||
933 : | psad1 r47 = r47, r79 | ||
934 : | psad1 r48 = r48, r80 | ||
935 : | psad1 r49 = r49, r81 | ||
936 : | psad1 r50 = r50, r82 | ||
937 : | psad1 r51 = r51, r83 | ||
938 : | psad1 r52 = r52, r84 | ||
939 : | psad1 r53 = r53, r85 | ||
940 : | psad1 r54 = r54, r86 | ||
941 : | psad1 r55 = r55, r87 | ||
942 : | psad1 r56 = r56, r88 | ||
943 : | psad1 r57 = r57, r89 | ||
944 : | psad1 r58 = r58, r90 | ||
945 : | psad1 r59 = r59, r91 | ||
946 : | psad1 r60 = r60, r92 | ||
947 : | psad1 r61 = r61, r93 | ||
948 : | psad1 r62 = r62, r94 | ||
949 : | psad1 r63 = r63, r95 | ||
950 : | ;; | ||
951 : | add r32 = r32, r63 // at last, we have to sum up | ||
952 : | add r33 = r33, r62 // in 5 stages | ||
953 : | add r34 = r34, r61 | ||
954 : | add r35 = r35, r60 | ||
955 : | add r36 = r36, r59 | ||
956 : | add r37 = r37, r58 | ||
957 : | add r38 = r38, r57 | ||
958 : | add r39 = r39, r56 | ||
959 : | add r40 = r40, r55 | ||
960 : | add r41 = r41, r54 | ||
961 : | add r42 = r42, r53 | ||
962 : | add r43 = r43, r52 | ||
963 : | add r44 = r44, r51 | ||
964 : | add r45 = r45, r50 | ||
965 : | add r46 = r46, r49 | ||
966 : | add r47 = r47, r48 | ||
967 : | ;; | ||
968 : | add r32 = r32, r47 | ||
969 : | add r33 = r33, r46 | ||
970 : | add r34 = r34, r45 | ||
971 : | add r35 = r35, r44 | ||
972 : | add r36 = r36, r43 | ||
973 : | add r37 = r37, r42 | ||
974 : | add r38 = r38, r41 | ||
975 : | add r39 = r39, r40 | ||
976 : | ;; | ||
977 : | add r32 = r32, r39 | ||
978 : | add r33 = r33, r38 | ||
979 : | add r34 = r34, r37 | ||
980 : | add r35 = r35, r36 | ||
981 : | ;; | ||
982 : | add r32 = r32, r35 | ||
983 : | add r33 = r33, r34 | ||
984 : | ;; | ||
// restore the caller's predicates and ar.pfs (saved in r2 / r3 on entry)
985 : | add r8 = r32, r33 // and store the result in r8 | ||
986 : | mov pr = r2, -1 | ||
987 : | mov ar.pfs = r3 | ||
988 : | br.ret.sptk.many b0 | ||
989 : | .endp sad16_ia64# | ||
990 : | |||
991 : | |||
992 : | |||
993 : | |||
// ------------------------------------------------------------------------------
// * sad8_ia64
// *
// * Sum of absolute differences between two blocks of 8 rows x 8 bytes.
// *
// * In:    r32 = cur     address of the first row of the current block.
// *                      Rows are fetched with plain ld8 at cur + n*stride,
// *                      NOTE(review): so cur and stride are presumably
// *                      expected to be multiples of 8 -- confirm with callers.
// *        r33 = ref     address of the first row of the reference block.
// *                      Any byte misalignment (ref & 7) is handled below.
// *        r34 = stride  byte distance between consecutive rows (used for
// *                      both blocks).
// *                      NOTE(review): r34 is used as passed. sad16bi_ia64 in
// *                      this file applies zxt4 to one of its arguments; if
// *                      stride is a 32-bit type a zxt4 may be needed here
// *                      too -- confirm against the C prototype.
// * Out:   r8  = sum of the eight per-row psad1 results
// * Uses:  r2  (saved pr), r3 (saved ar.pfs), r14, r31, r32-r55, p16-p29
// *
// * ar.pfs is kept in the scratch register r3. r1 must not be used for this:
// * r1 is the global pointer (gp) and has to be intact when the procedure
// * returns.
// ------------------------------------------------------------------------------
	.align 16
	.global sad8_ia64#
	.proc sad8_ia64#
sad8_ia64:
	alloc r3 = ar.pfs, 3, 21, 0, 0		// frame: r32-r34 in, r35-r55 local; old pfs -> r3 (not gp!)
	mov r2 = pr				// p16-p29 are written below, pr is restored before return
	dep r14 = r0, r33, 0, 3			// r14 = ref with the low 3 bits cleared (8-byte aligned ref)
	dep.z r31 = r33, 0, 3			// r31 = ref & 7 (misalignment of ref in bytes)
	;;
	mov r40 = r34				// r40 = 1 * stride
	shl r41 = r34, 1			// r41 = 2 * stride
	shladd r42 = r34, 1, r34		// r42 = 3 * stride
	shl r43 = r34, 2			// r43 = 4 * stride
	shladd r44 = r34, 2, r34		// r44 = 5 * stride
	;;
	cmp.eq p16, p17 = 0, r31		// one predicate per possible misalignment of ref:
	cmp.eq p18, p19 = 2, r31		//   p16 <-> 0, p24 <-> 1, p18 <-> 2, p26 <-> 3,
	shl r45 = r42, 1			// r45 = 6 * stride
	cmp.eq p20, p21 = 4, r31		//   p20 <-> 4, p28 <-> 5, p22 <-> 6
	cmp.eq p22, p23 = 6, r31		//   (7 is the fall-through case, no predicate needed)
	shladd r46 = r42, 1, r34		// r46 = 7 * stride
	cmp.eq p24, p25 = 1, r31
	cmp.eq p26, p27 = 3, r31
	cmp.eq p28, p29 = 5, r31
	;;
	mov r48 = r14				// row addresses: r32-r39 -> cur rows 0..7,
	add r33 = r32, r40			//                r48-r55 -> aligned ref rows 0..7
	add r49 = r14, r40
	add r34 = r32, r41
	add r50 = r14, r41
	add r35 = r32, r42
	add r51 = r14, r42
	add r36 = r32, r43
	add r52 = r14, r43
	add r37 = r32, r44
	add r53 = r14, r44
	add r38 = r32, r45
	add r54 = r14, r45
	add r39 = r32, r46
	add r55 = r14, r46
	;;
	ld8 r32 = [r32]				// load everything:
	ld8 r33 = [r33]				//   cur rows end up in r32 - r39
	ld8 r34 = [r34]				//   ref rows (aligned words) in r40 - r47
	ld8 r35 = [r35]
	ld8 r36 = [r36]
	ld8 r37 = [r37]
	ld8 r38 = [r38]
	ld8 r39 = [r39]
	ld8 r40 = [r48] ,8			// post-increment: r48-r55 now point at the next
	ld8 r41 = [r49] ,8			// 8-byte word of each ref row
	ld8 r42 = [r50] ,8
	ld8 r43 = [r51] ,8
	ld8 r44 = [r52] ,8
	ld8 r45 = [r53] ,8
	ld8 r46 = [r54] ,8
	ld8 r47 = [r55] ,8
(p16)	br.cond.dptk.many .Lber2		// if ref is aligned, we can start the calculation
	;;
	ld8 r48 = [r48]				// if not, each ref row straddles two aligned words,
	ld8 r49 = [r49]				// so the following word of every row is loaded too
	ld8 r50 = [r50]
	ld8 r51 = [r51]
	ld8 r52 = [r52]
	ld8 r53 = [r53]
	ld8 r54 = [r54]
	ld8 r55 = [r55]
(p24)	br.cond.dptk.many .Lmode1		// dispatch on the misalignment of ref
(p18)	br.cond.dpnt.many .Lmode2
(p26)	br.cond.dpnt.many .Lmode3
(p20)	br.cond.dpnt.many .Lmode4
(p28)	br.cond.dpnt.many .Lmode5
(p22)	br.cond.dpnt.many .Lmode6
	;;
.Lmode7:					// this jump point is not needed, it is here for readability
	shrp r40 = r48, r40, 56			// merge the two loaded words of each ref row:
	shrp r41 = r49, r41, 56			// shift the pair right by 8 * misalignment bits
	shrp r42 = r50, r42, 56			// (here 7 bytes) to get the 8 wanted bytes
	shrp r43 = r51, r43, 56
	shrp r44 = r52, r44, 56
	shrp r45 = r53, r45, 56
	shrp r46 = r54, r46, 56
	shrp r47 = r55, r47, 56
	br.cond.sptk.many .Lber2
	;;
.Lmode6:					// ref misaligned by 6 bytes
	shrp r40 = r48, r40, 48
	shrp r41 = r49, r41, 48
	shrp r42 = r50, r42, 48
	shrp r43 = r51, r43, 48
	shrp r44 = r52, r44, 48
	shrp r45 = r53, r45, 48
	shrp r46 = r54, r46, 48
	shrp r47 = r55, r47, 48
	br.cond.sptk.many .Lber2
	;;
.Lmode5:					// ref misaligned by 5 bytes
	shrp r40 = r48, r40, 40
	shrp r41 = r49, r41, 40
	shrp r42 = r50, r42, 40
	shrp r43 = r51, r43, 40
	shrp r44 = r52, r44, 40
	shrp r45 = r53, r45, 40
	shrp r46 = r54, r46, 40
	shrp r47 = r55, r47, 40
	br.cond.sptk.many .Lber2
	;;
.Lmode4:					// ref misaligned by 4 bytes
	shrp r40 = r48, r40, 32
	shrp r41 = r49, r41, 32
	shrp r42 = r50, r42, 32
	shrp r43 = r51, r43, 32
	shrp r44 = r52, r44, 32
	shrp r45 = r53, r45, 32
	shrp r46 = r54, r46, 32
	shrp r47 = r55, r47, 32
	br.cond.sptk.many .Lber2
	;;
.Lmode3:					// ref misaligned by 3 bytes
	shrp r40 = r48, r40, 24
	shrp r41 = r49, r41, 24
	shrp r42 = r50, r42, 24
	shrp r43 = r51, r43, 24
	shrp r44 = r52, r44, 24
	shrp r45 = r53, r45, 24
	shrp r46 = r54, r46, 24
	shrp r47 = r55, r47, 24
	br.cond.sptk.many .Lber2
	;;
.Lmode2:					// ref misaligned by 2 bytes
	shrp r40 = r48, r40, 16
	shrp r41 = r49, r41, 16
	shrp r42 = r50, r42, 16
	shrp r43 = r51, r43, 16
	shrp r44 = r52, r44, 16
	shrp r45 = r53, r45, 16
	shrp r46 = r54, r46, 16
	shrp r47 = r55, r47, 16
	br.cond.sptk.many .Lber2
	;;
.Lmode1:					// ref misaligned by 1 byte; falls through to .Lber2
	shrp r40 = r48, r40, 8
	shrp r41 = r49, r41, 8
	shrp r42 = r50, r42, 8
	shrp r43 = r51, r43, 8
	shrp r44 = r52, r44, 8
	shrp r45 = r53, r45, 8
	shrp r46 = r54, r46, 8
	shrp r47 = r55, r47, 8
.Lber2:
	;;
	psad1 r32 = r32, r40			// per-row sum of absolute byte differences,
	psad1 r33 = r33, r41			// using the psad1 instruction of IA64
	psad1 r34 = r34, r42
	psad1 r35 = r35, r43
	psad1 r36 = r36, r44
	psad1 r37 = r37, r45
	psad1 r38 = r38, r46
	psad1 r39 = r39, r47
	;;
	add r32 = r32, r33			// then we sum up everything (8 -> 4 partial sums)
	add r33 = r34, r35
	add r34 = r36, r37
	add r35 = r38, r39
	;;
	add r32 = r32, r33			// 4 -> 2 partial sums
	add r33 = r34, r35
	;;
	add r8 = r32, r33			// and store the result in r8
	mov pr = r2, -1				// restore all predicates saved on entry
	mov ar.pfs = r3				// restore ar.pfs from the scratch copy
	br.ret.sptk.many b0
	.endp sad8_ia64#
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |