1 |
|
// ------------------------------------------------------------------------------ |
2 |
|
// * |
3 |
|
// * Optimized Assembler Versions of sad8 and sad16 |
4 |
|
// * |
5 |
|
// ------------------------------------------------------------------------------ |
6 |
|
// * |
7 |
|
// * Hannes Jütting and Christopher Özbek |
8 |
|
// * {s_juetti,s_oezbek}@ira.uka.de |
9 |
|
// * |
10 |
|
// * Programmed for the IA64 laboratory held at University Karlsruhe 2002 |
11 |
|
// * http://www.info.uni-karlsruhe.de/~rubino/ia64p/ |
12 |
|
// * |
13 |
|
// ------------------------------------------------------------------------------ |
14 |
|
// * |
15 |
|
// * These are the optimized assembler versions of sad8 and sad16, which calculate |
16 |
|
// * the sum of absolute differences between two 8x8/16x16 block matrices. |
17 |
|
// * |
18 |
|
// * Our approach uses: |
19 |
|
// * - The Itanium command psad1, which solves the problem in hardware. |
20 |
|
// * - Modulo-Scheduled Loops as the best way to loop unrolling on the IA64 |
21 |
|
// * EPIC architecture |
22 |
|
// * - Alignment resolving to avoid memory faults |
23 |
|
// * |
24 |
|
// ------------------------------------------------------------------------------ |
25 |
|
|
26 |
.text |
.text |
27 |
|
|
28 |
|
// ------------------------------------------------------------------------------ |
29 |
|
// * SAD16_IA64 |
30 |
|
// * |
31 |
|
// * In: |
32 |
|
// * r32 = cur (aligned) |
33 |
|
// * r33 = ref (not aligned) |
34 |
|
// * r34 = stride |
35 |
|
// * r35 = bestsad |
36 |
|
// * Out: |
37 |
|
// * r8 = sum of absolute differences |
38 |
|
// * |
39 |
|
// ------------------------------------------------------------------------------ |
40 |
|
|
41 |
.align 16 |
.align 16 |
42 |
.global sad16_ia64# |
.global sad16_ia64# |
43 |
.proc sad16_ia64# |
.proc sad16_ia64# |
44 |
sad16_ia64: |
sad16_ia64: |
45 |
|
|
|
_LL=3 |
|
|
_SL=1 |
|
|
_OL=1 |
|
|
_PL=1 |
|
|
_AL=1 |
|
46 |
|
|
47 |
alloc r9=ar.pfs,4,44,0,48 |
// Define Latencies |
48 |
|
LL16=3 // load latency |
49 |
|
SL16=1 // shift latency |
50 |
|
OL16=1 // or latency |
51 |
|
PL16=1 // psad latency |
52 |
|
AL16=1 // add latency |
53 |
|
|
54 |
mov r8 = r0 |
// Allocate Registern in RSE |
55 |
|
alloc r9=ar.pfs,4,36,0,40 |
56 |
|
|
57 |
mov r20 = ar.lc |
// lfetch [r32] // might help |
|
mov r21 = pr |
|
58 |
|
|
59 |
dep.z r22 = r32, 3, 3 // erste 3 Bit mit 8 multiplizieren |
mov r8 = r0 // clear the return reg |
|
dep.z r23 = r33, 3, 3 // in r22 und r23 -> Schiebeflags |
|
60 |
|
|
61 |
and r14 = -8, r32 // Parameter in untere Register kopieren |
// Save LC and predicates |
62 |
and r15 = -8, r33 // Ref Cur mit 11111...1000 and-en |
mov r20 = ar.lc |
63 |
mov r16 = r34 |
mov r21 = pr |
|
mov r17 = r35 |
|
|
;; |
|
|
add r18 = 8, r14 // Adressenvorausberechnen |
|
|
add r19 = 8, r15 |
|
64 |
|
|
65 |
sub r24 = 64, r22 // Schiftanzahl ausrechnen |
dep.z r23 = r33, 3, 3 // get the # of bits ref is misaligned |
66 |
sub r25 = 64, r23 |
and r15 = -8, r33 // align the ref pointer by deleting the last 3 bit |
67 |
|
|
68 |
add r26 = 16, r14 // Adressenvorausberechnen |
mov r14 = r32 // save the cur pointer |
69 |
add r27 = 16, r15 |
mov r16 = r34 // save stride |
70 |
|
mov r17 = r35 // save bestsad |
71 |
|
|
72 |
// Loop-counter initialisieren |
;; |
73 |
mov ar.lc = 15 // Loop 16 mal durchlaufen |
add r18 = 8, r14 // precalc second cur pointer |
74 |
mov ar.ec = _LL + _SL + _OL + _PL + _AL + _AL // Die Loop am Schluss noch neun mal durchlaufen |
add r19 = 8, r15 // precalc second ref pointer |
75 |
|
add r27 = 16, r15 // precalc third ref pointer |
76 |
|
sub r25 = 64, r23 // # of right shifts |
77 |
|
|
78 |
// Rotating Predicate Register zuruecksetzen und P16 auf 1 |
// Initialize Loop-counters |
79 |
mov pr.rot = 1 << 16 |
mov ar.lc = 15 // loop 16 times |
80 |
|
mov ar.ec = LL16 + SL16 + OL16 + PL16 + AL16 + AL16 |
81 |
|
mov pr.rot = 1 << 16 // reseting rotating predicate regs and set p16 to 1 |
82 |
;; |
;; |
83 |
|
|
84 |
// Array-Konstrukte initialisieren |
// Intialize Arrays for Register Rotation |
85 |
.rotr _ald1[_LL+1], _ald2[_LL+1], _ald3[_LL+1], _ald4[_LL+1], _ald5[_LL+1], _ald6[_LL+1], _shru1[_SL+1], _shl1[_SL+1], _shru2[_SL], _shl2[_SL], _shru3[_SL], _shl3[_SL], _shru4[_SL], _shl4[_SL+1], _or1[_OL], _or2[_OL], _or3[_OL], _or4[_OL+1], _psadr1[_PL+1], _psadr2[_PL+1], _addr1[_AL+1] |
.rotr r_cur_ld1[LL16+SL16+OL16+1], r_cur_ld2[LL16+SL16+OL16+1], r_ref_16_ld1[LL16+1], r_ref_16_ld2[LL16+1], r_ref_16_ld3[LL16+1], r_ref_16_shru1[SL16], r_ref_16_shl1[SL16], r_ref_16_shru2[SL16], r_ref_16_shl2[SL16+1], r_ref_16_or1[OL16], r_ref_16_or2[OL16+1], r_psad1[PL16+1], r_psad2[PL16+1], r_add_16[AL16+1] |
86 |
.rotp _aldp[_LL], _shp[_SL], _orp[_OL], _psadrp[_PL], _addrp1[_AL], _addrp2[_AL] |
.rotp p_ld_16[LL16], p_sh_16[SL16], p_or_16[OL16], p_psad_16[PL16], p_add1_16[AL16], p_add2_16[AL16] |
87 |
|
|
88 |
.L_loop_16: |
.L_loop16: |
89 |
{.mmi |
{.mmi |
90 |
(_aldp[0]) ld8 _ald1[0] = [r14], r16 // Cur Erste 8 Byte |
(p_ld_16[0]) ld8 r_cur_ld1[0] = [r14], r16 // Cur load first 8 Byte |
91 |
(_aldp[0]) ld8 _ald2[0] = [r18], r16 // Cur Zweite 8 Byte |
(p_ld_16[0]) ld8 r_cur_ld2[0] = [r18], r16 // Cur load next 8 Byte |
92 |
(_psadrp[0]) psad1 _psadr1[0] = _or2[0], _or4[0] // Psadden |
(p_psad_16[0]) psad1 r_psad1[0] = r_cur_ld1[LL16+SL16+OL16], r_ref_16_or2[0] // psad of cur and ref |
93 |
} |
} |
94 |
{.mmi |
{.mmi |
95 |
(_aldp[0]) ld8 _ald3[0] = [r26], r16 // Cur Dritte 8 Byte |
(p_ld_16[0]) ld8 r_ref_16_ld1[0] = [r15], r16 // Ref load first 8 Byte (unaligned) |
96 |
(_aldp[0]) ld8 _ald4[0] = [r15], r16 // Ref Erste 8 Byte |
(p_ld_16[0]) ld8 r_ref_16_ld2[0] = [r19], r16 // Ref load next 8 Byte (unaligned) |
97 |
(_psadrp[0]) psad1 _psadr2[0] = _or3[0], _or4[_OL] // _or2 +1 |
(p_psad_16[0]) psad1 r_psad2[0] = r_cur_ld2[LL16+SL16+OL16], r_ref_16_or2[OL16] // psad of cur_2 and ref_2 |
|
} |
|
|
{.mmi |
|
|
(_aldp[0]) ld8 _ald5[0] = [r19], r16 // Ref Zweite 8 Byte |
|
|
(_aldp[0]) ld8 _ald6[0] = [r27], r16 // Ref Dritte 8 Byte |
|
|
(_shp[0]) shr.u _shru1[0] = _ald1[_LL], r22 |
|
|
} |
|
|
{.mii |
|
|
(_orp[0]) or _or1[0] = _shl2[0], _shru3[0] // _shru2 + 1 und _shl2 + 1 |
|
|
(_shp[0]) shl _shl1[0] = _ald2[_LL], r24 |
|
|
(_shp[0]) shr.u _shru2[0] = _ald2[_LL], r22 |
|
98 |
} |
} |
99 |
{.mii |
{.mii |
100 |
(_orp[0]) or _or2[0] = _shl3[0], _shru4[0] // _shru3 + 1 und _shl3 + 1 |
(p_ld_16[0]) ld8 r_ref_16_ld3[0] = [r27], r16 // Ref load third 8 Byte (unaligned) |
101 |
(_shp[0]) shl _shl2[0] = _ald3[_LL], r24 |
(p_or_16[0]) or r_ref_16_or1[0] = r_ref_16_shl1[0], r_ref_16_shru2[0] // Ref or r_ref_16_shl1 + 1 and r_ref_16_shl1 + 1 |
102 |
(_shp[0]) shr.u _shru3[0] = _ald4[_LL], r23 |
(p_sh_16[0]) shr.u r_ref_16_shru1[0] = r_ref_16_ld1[LL16], r23 // Ref shift |
103 |
} |
} |
104 |
{.mii |
{.mii |
105 |
(_orp[0]) or _or3[0] = _shl4[0], _shl4[_SL] //_shru4 + 1 und _shl4 + 1 |
(p_or_16[0]) or r_ref_16_or2[0] = r_ref_16_shl2[0], r_ref_16_shl2[SL16] // Ref or r_ref_shru2 + 1 and r_ref_shl2 + 1 |
106 |
(_shp[0]) shl _shl3[0] = _ald5[_LL], r25 |
(p_sh_16[0]) shl r_ref_16_shl1[0] = r_ref_16_ld2[LL16], r25 // Ref shift |
107 |
(_shp[0]) shr.u _shru4[0] = _ald5[_LL], r23 |
(p_sh_16[0]) shr.u r_ref_16_shru2[0] = r_ref_16_ld2[LL16], r23 // Ref shift |
108 |
} |
} |
109 |
{.mmi |
{.mib |
110 |
(_orp[0]) or _or4[0] = _shru1[_SL], _shl1[_SL] |
(p_add2_16[0]) cmp.ge.unc p6, p7 = r8, r17 |
111 |
(_shp[0]) shl _shl4[0]= _ald6[_LL], r25 |
(p_sh_16[0]) shl r_ref_16_shl2[0]= r_ref_16_ld3[LL16], r25 // Ref shift |
112 |
|
(p6) br.spnt.few .L_loop_exit16 |
113 |
} |
} |
114 |
{.mmb |
{.mmb |
115 |
(_addrp1[0]) add _addr1[0] = _psadr1[_PL], _psadr2[_PL] // Aufsummieren |
(p_add1_16[0]) add r_add_16[0] = r_psad1[PL16], r_psad2[PL16] // add the psad results |
116 |
(_addrp2[0]) add r8 = r8, _addr1[_AL] |
(p_add2_16[0]) add r8 = r8, r_add_16[AL16] // add the results to the sum |
117 |
br.ctop.sptk.few .L_loop_16 |
br.ctop.sptk.few .L_loop16 |
118 |
;; |
;; |
119 |
} |
} |
120 |
// Register zurueckschreiben |
.L_loop_exit16: |
121 |
|
|
122 |
|
// Restore LC and predicates |
123 |
mov ar.lc = r20 |
mov ar.lc = r20 |
124 |
mov pr = r21,-1 |
mov pr = r21,-1 |
125 |
|
|
126 |
|
// Return |
127 |
br.ret.sptk.many rp |
br.ret.sptk.many rp |
128 |
.endp sad16_ia64# |
.endp sad16_ia64# |
129 |
|
|
130 |
|
// ------------------------------------------------------------------------------ |
131 |
|
// * SAD8_IA64 |
132 |
|
// * |
133 |
|
// * In: |
134 |
|
// * r32 = cur (aligned) |
135 |
|
// * r33 = ref (not aligned) |
136 |
|
// * r34 = stride |
137 |
|
// * Out: |
138 |
|
// * r8 = sum of absolute differences |
139 |
|
// * |
140 |
|
// ------------------------------------------------------------------------------ |
141 |
|
|
142 |
.align 16 |
.align 16 |
143 |
.global sad8_ia64# |
.global sad8_ia64# |
145 |
|
|
146 |
sad8_ia64: |
sad8_ia64: |
147 |
|
|
|
LL=3 |
|
|
SL=1 |
|
|
OL=1 |
|
|
PL=1 |
|
|
AL=1 |
|
148 |
|
|
149 |
alloc r9=ar.pfs,3,29,0,32 |
// Define Latencies |
150 |
mov r20 = ar.lc |
LL8=3 // load latency |
151 |
mov r21 = pr |
SL8=1 // shift latency |
152 |
|
OL8=1 // or latency |
153 |
|
PL8=1 // psad latency |
154 |
|
AL8=1 // add latency |
155 |
|
|
156 |
dep.z r22 = r32, 3, 3 // erste 3 Bit mit 8 multiplizieren |
// Allocate Registers in RSE |
157 |
dep.z r23 = r33, 3, 3 // in r22 und r23 -> Schiebeflags |
alloc r9 = ar.pfs,3,21,0,24 |
158 |
|
|
159 |
mov r8 = r0 // . . . . |
// lfetch [r32] // Maybe this helps? |
160 |
and r14 = -8, r32 // 0xFFFFFFFFFFFFFFF8, r32 |
|
161 |
and r15 = -8, r33 // 0xFFFFFFFFFFFFFFF8, r33 |
mov r8 = r0 // Initialize result |
|
mov r16 = r34 |
|
|
// mov r17 = r35 |
|
|
;; |
|
162 |
|
|
163 |
add r18 = 8, r14 |
mov r14 = r32 // Save Cur |
164 |
add r19 = 8, r15 |
and r15 = -8, r33 // Align the Ref pointer by deleting the last 3 bit |
165 |
|
mov r16 = r34 // Save Stride |
166 |
|
|
167 |
sub r24 = 64, r22 |
// Save LC and predicates |
168 |
sub r25 = 64, r23 |
mov r20 = ar.lc |
169 |
|
mov r21 = pr |
170 |
|
|
171 |
// Loop-counter initialisieren |
dep.z r23 = r33, 3, 3 // get the # of bits ref is misaligned |
|
mov ar.lc = 7 // Loop 7 mal durchlaufen |
|
|
mov ar.ec = LL + SL + OL + PL + AL // Die Loop am Schluss noch zehn mal durchlaufen |
|
172 |
|
|
|
// Rotating Predicate Register zuruecksetzen und P16 auf 1 |
|
|
mov pr.rot = 1 << 16 |
|
173 |
;; |
;; |
174 |
.rotr ald1[LL+1], ald2[LL+1], ald3[LL+1], ald4[LL+1], shru1[SL+1], shl1[SL+1], shru2[SL+1], shl2[SL+1], or1[OL+1], or2[OL+1], psadr[PL+1], addr[AL+1] |
|
175 |
.rotp aldp[LL], shp[SL], orp[OL], psadrp[PL], addrp[AL] |
add r19 = 8, r15 // Precalculate second load-offset |
176 |
.L_loop_8: |
sub r25 = 64, r23 // Precalculate # of shifts |
177 |
{.mmi |
|
178 |
(aldp[0]) ld8 ald1[0] = [r14], r16 // Cur laden |
// Initialize Loop-Counters |
179 |
(aldp[0]) ld8 ald2[0] = [r18], r16 |
mov ar.lc = 7 // Loop 7 times |
180 |
(shp[0]) shr.u shru1[0] = ald1[LL], r22 // mergen |
mov ar.ec = LL8 + SL8 + OL8 + PL8 + AL8 // Epiloque |
181 |
} |
mov pr.rot = 1 << 16 // Reset Predicate Registers and initialize with P16 |
182 |
{.mii |
|
183 |
(orp[0]) or or1[0] = shru1[SL], shl1[SL] |
// Initalize Arrays for Register Rotation |
184 |
(shp[0]) shl shl1[0] = ald2[LL], r24 |
.rotr r_cur_ld[LL8+SL8+OL8+1], r_ref_ld1[LL8+1], r_ref_ld2[LL8+1], r_shru[SL8+1], r_shl[SL8+1], r_or[OL8+1], r_psad[PL8+1] |
185 |
(shp[0]) shr.u shru2[0] = ald3[LL], r23 // mergen |
.rotp p_ld[LL8], p_sh[SL8], p_or[OL8], p_psad[PL8], p_add[AL8] |
186 |
} |
|
187 |
{.mmi |
;; |
188 |
(aldp[0]) ld8 ald3[0] = [r15], r16 // Ref laden |
.L_loop8: |
189 |
(aldp[0]) ld8 ald4[0] = [r19], r16 |
// {.mmi |
190 |
(shp[0]) shl shl2[0] = ald4[LL], r25 |
(p_ld[0]) ld8 r_ref_ld1[0] = [r15], r16 // Load 1st 8Byte from Ref |
191 |
} |
(p_ld[0]) ld8 r_cur_ld[0] = [r14], r16 // Load Cur |
192 |
{.mmi |
(p_psad[0]) psad1 r_psad[0] = r_cur_ld[LL8+SL8+OL8], r_or[OL8] // Do the Calculation |
193 |
(orp[0]) or or2[0] = shru2[SL], shl2[SL] |
// } |
194 |
(addrp[0]) add r8 = r8, psadr[PL] |
// {.mii |
195 |
(psadrp[0]) psad1 psadr[0] = or1[OL], or2[OL] |
(p_ld[0]) ld8 r_ref_ld2[0] = [r19], r16 // Load 2nd 8Byte from Ref |
196 |
} |
(p_sh[0]) shr.u r_shru[0] = r_ref_ld1[LL8], r23 // Shift unaligned Ref parts |
197 |
{.mbb |
(p_sh[0]) shl r_shl[0] = r_ref_ld2[LL8], r25 // Shift unaligned Ref parts |
198 |
br.ctop.sptk.few .L_loop_8 |
// } |
199 |
|
// {.mib |
200 |
|
(p_or[0]) or r_or[0] = r_shru[SL8], r_shl[SL8] // Combine unaligned Ref parts |
201 |
|
(p_add[0]) add r8 = r8, r_psad[PL8] // Sum psad result |
202 |
|
br.ctop.sptk.few .L_loop8 |
203 |
;; |
;; |
204 |
} |
// } |
205 |
|
|
206 |
|
// Restore Loop counters |
207 |
mov ar.lc = r20 |
mov ar.lc = r20 |
208 |
mov pr = r21,-1 |
mov pr = r21,-1 |
209 |
|
|
210 |
|
// Return |
211 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
212 |
.endp sad8_ia64# |
.endp sad8_ia64# |
213 |
|
|
310 |
.endp sad16bi_ia64# |
.endp sad16bi_ia64# |
311 |
|
|
312 |
|
|
313 |
.common dev16#,8,8 |
|
314 |
|
|
315 |
|
|
316 |
|
|
317 |
|
|
318 |
|
|
319 |
|
.text |
320 |
.align 16 |
.align 16 |
321 |
.global dev16_ia64# |
.global dev16_ia64# |
322 |
.proc dev16_ia64# |
.proc dev16_ia64# |
323 |
|
.auto |
324 |
dev16_ia64: |
dev16_ia64: |
325 |
.prologue |
// renamings for better readability |
326 |
zxt4 r33 = r33 |
stride = r18 |
327 |
.save ar.lc, r2 |
pfs = r19 //for saving previous function state |
328 |
mov r2 = ar.lc |
cura0 = r20 //address of first 8-byte block of cur |
329 |
|
cura1 = r21 //address of second 8-byte block of cur |
330 |
|
mean0 = r22 //registers for calculating the sum in parallel |
331 |
|
mean1 = r23 |
332 |
|
mean2 = r24 |
333 |
|
mean3 = r25 |
334 |
|
dev0 = r26 //same for the deviation |
335 |
|
dev1 = r27 |
336 |
|
dev2 = r28 |
337 |
|
dev3 = r29 |
338 |
|
|
339 |
.body |
.body |
340 |
mov r21 = r0 |
alloc pfs = ar.pfs, 2, 38, 0, 40 |
341 |
mov r8 = r0 |
|
342 |
mov r23 = r32 |
mov cura0 = in0 |
343 |
mov r24 = r0 |
mov stride = in1 |
344 |
|
add cura1 = 8, cura0 |
345 |
|
|
346 |
|
.rotr c[32], psad[8] // just using rotating registers to get an array ;-) |
347 |
|
|
348 |
|
.explicit |
349 |
|
{.mmi |
350 |
|
ld8 c[0] = [cura0], stride // load them ... |
351 |
|
ld8 c[1] = [cura1], stride |
352 |
;; |
;; |
353 |
mov r25 = r33 |
} |
354 |
.L50: |
{.mmi |
355 |
mov r22 = r0 |
ld8 c[2] = [cura0], stride |
356 |
mov r20 = r23 |
ld8 c[3] = [cura1], stride |
|
;; |
|
|
.L54: |
|
|
mov r16 = r20 |
|
|
adds r14 = 2, r20 |
|
|
adds r15 = 3, r20 |
|
|
;; |
|
|
ld1 r17 = [r16], 1 |
|
|
ld1 r18 = [r14] |
|
|
ld1 r19 = [r15] |
|
|
;; |
|
|
ld1 r14 = [r16] |
|
|
add r21 = r17, r21 |
|
|
adds r15 = 4, r20 |
|
|
;; |
|
|
add r21 = r14, r21 |
|
|
ld1 r16 = [r15] |
|
|
adds r22 = 8, r22 |
|
|
;; |
|
|
add r21 = r18, r21 |
|
|
adds r14 = 5, r20 |
|
|
adds r15 = 6, r20 |
|
|
;; |
|
|
add r21 = r19, r21 |
|
|
ld1 r17 = [r14] |
|
|
ld1 r18 = [r15] |
|
|
;; |
|
|
add r21 = r16, r21 |
|
|
adds r14 = 7, r20 |
|
|
cmp4.geu p6, p7 = 15, r22 |
|
|
;; |
|
|
add r21 = r17, r21 |
|
|
ld1 r15 = [r14] |
|
|
adds r20 = 8, r20 |
|
|
;; |
|
|
add r21 = r18, r21 |
|
|
;; |
|
|
add r21 = r15, r21 |
|
|
(p6) br.cond.dptk .L54 |
|
|
adds r24 = 1, r24 |
|
|
add r23 = r23, r25 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 15, r24 |
|
|
(p6) br.cond.dptk .L50 |
|
|
extr.u r14 = r21, 8, 24 |
|
|
mov r23 = r32 |
|
|
mov r24 = r0 |
|
|
;; |
|
|
mov r21 = r14 |
|
|
.L60: |
|
|
addl r14 = 3, r0 |
|
|
mov r17 = r23 |
|
357 |
;; |
;; |
358 |
mov ar.lc = r14 |
} |
359 |
|
{.mmi |
360 |
|
ld8 c[4] = [cura0], stride |
361 |
|
ld8 c[5] = [cura1], stride |
362 |
;; |
;; |
363 |
.L144: |
} |
364 |
mov r16 = r17 |
{.mmi |
365 |
|
ld8 c[6] = [cura0], stride |
366 |
|
ld8 c[7] = [cura1], stride |
367 |
;; |
;; |
368 |
ld1 r14 = [r16], 1 |
} |
369 |
|
{.mmi |
370 |
|
ld8 c[8] = [cura0], stride |
371 |
|
ld8 c[9] = [cura1], stride |
372 |
;; |
;; |
373 |
sub r15 = r14, r21 |
} |
374 |
|
{.mmi |
375 |
|
ld8 c[10] = [cura0], stride |
376 |
|
ld8 c[11] = [cura1], stride |
377 |
;; |
;; |
378 |
cmp4.ge p6, p7 = 0, r15 |
} |
379 |
|
{.mii |
380 |
|
ld8 c[12] = [cura0], stride |
381 |
|
psad1 mean0 = c[0], r0 // get the sum of them ... |
382 |
|
psad1 mean1 = c[1], r0 |
383 |
|
} |
384 |
|
{.mmi |
385 |
|
ld8 c[13] = [cura1], stride |
386 |
;; |
;; |
387 |
(p7) add r8 = r8, r15 |
ld8 c[14] = [cura0], stride |
388 |
(p6) sub r14 = r21, r14 |
psad1 mean2 = c[2], r0 |
389 |
|
} |
390 |
|
{.mii |
391 |
|
ld8 c[15] = [cura1], stride |
392 |
|
psad1 mean3 = c[3], r0 |
393 |
;; |
;; |
394 |
(p6) add r8 = r8, r14 |
psad1 psad[0] = c[4], r0 |
395 |
ld1 r14 = [r16] |
} |
396 |
|
{.mmi |
397 |
|
ld8 c[16] = [cura0], stride |
398 |
|
ld8 c[17] = [cura1], stride |
399 |
|
psad1 psad[1] = c[5], r0 |
400 |
;; |
;; |
401 |
sub r15 = r14, r21 |
} |
402 |
adds r16 = 2, r17 |
{.mii |
403 |
|
ld8 c[18] = [cura0], stride |
404 |
|
psad1 psad[2] = c[6], r0 |
405 |
|
psad1 psad[3] = c[7], r0 |
406 |
|
} |
407 |
|
{.mmi |
408 |
|
ld8 c[19] = [cura1], stride |
409 |
;; |
;; |
410 |
cmp4.ge p6, p7 = 0, r15 |
ld8 c[20] = [cura0], stride |
411 |
|
psad1 psad[4] = c[8], r0 |
412 |
|
} |
413 |
|
{.mii |
414 |
|
ld8 c[21] = [cura1], stride |
415 |
|
psad1 psad[5] = c[9], r0 |
416 |
;; |
;; |
417 |
(p7) add r8 = r8, r15 |
add mean0 = mean0, psad[0] |
418 |
(p6) sub r14 = r21, r14 |
} |
419 |
|
{.mmi |
420 |
|
ld8 c[22] = [cura0], stride |
421 |
|
ld8 c[23] = [cura1], stride |
422 |
|
add mean1 = mean1, psad[1] |
423 |
;; |
;; |
424 |
(p6) add r8 = r8, r14 |
} |
425 |
ld1 r14 = [r16] |
{.mii |
426 |
|
ld8 c[24] = [cura0], stride |
427 |
|
psad1 psad[0] = c[10], r0 |
428 |
|
psad1 psad[1] = c[11], r0 |
429 |
|
} |
430 |
|
{.mmi |
431 |
|
ld8 c[25] = [cura1], stride |
432 |
;; |
;; |
433 |
sub r15 = r14, r21 |
ld8 c[26] = [cura0], stride |
434 |
adds r16 = 3, r17 |
add mean2 = mean2, psad[2] |
435 |
|
} |
436 |
|
{.mii |
437 |
|
ld8 c[27] = [cura1], stride |
438 |
|
add mean3 = mean3, psad[3] |
439 |
;; |
;; |
440 |
cmp4.ge p6, p7 = 0, r15 |
psad1 psad[2] = c[12], r0 |
441 |
adds r17 = 4, r17 |
} |
442 |
|
{.mmi |
443 |
|
ld8 c[28] = [cura0], stride |
444 |
|
ld8 c[29] = [cura1], stride |
445 |
|
psad1 psad[3] = c[13], r0 |
446 |
;; |
;; |
447 |
(p7) add r8 = r8, r15 |
} |
448 |
(p6) sub r14 = r21, r14 |
{.mii |
449 |
|
ld8 c[30] = [cura0] |
450 |
|
psad1 psad[6] = c[14], r0 |
451 |
|
psad1 psad[7] = c[15], r0 |
452 |
|
} |
453 |
|
{.mmi |
454 |
|
ld8 c[31] = [cura1] |
455 |
;; |
;; |
456 |
(p6) add r8 = r8, r14 |
add mean0 = mean0, psad[0] |
457 |
ld1 r14 = [r16] |
add mean1 = mean1, psad[1] |
458 |
|
} |
459 |
|
{.mii |
460 |
|
add mean2 = mean2, psad[4] |
461 |
|
add mean3 = mean3, psad[5] |
462 |
;; |
;; |
463 |
sub r15 = r14, r21 |
psad1 psad[0] = c[16], r0 |
464 |
|
} |
465 |
|
{.mmi |
466 |
|
add mean0 = mean0, psad[2] |
467 |
|
add mean1 = mean1, psad[3] |
468 |
|
psad1 psad[1] = c[17], r0 |
469 |
;; |
;; |
470 |
cmp4.ge p6, p7 = 0, r15 |
} |
471 |
|
{.mii |
472 |
|
add mean2 = mean2, psad[6] |
473 |
|
psad1 psad[2] = c[18], r0 |
474 |
|
psad1 psad[3] = c[19], r0 |
475 |
|
} |
476 |
|
{.mmi |
477 |
|
add mean3 = mean3, psad[7] |
478 |
;; |
;; |
479 |
(p7) add r8 = r8, r15 |
add mean0 = mean0, psad[0] |
480 |
(p6) sub r14 = r21, r14 |
psad1 psad[4] = c[20], r0 |
481 |
|
} |
482 |
|
{.mii |
483 |
|
add mean1 = mean1, psad[1] |
484 |
|
psad1 psad[5] = c[21], r0 |
485 |
;; |
;; |
486 |
(p6) add r8 = r8, r14 |
psad1 psad[6] = c[22], r0 |
487 |
br.cloop.sptk.few .L144 |
} |
488 |
adds r24 = 1, r24 |
{.mmi |
489 |
add r23 = r23, r33 |
add mean2 = mean2, psad[2] |
490 |
|
add mean3 = mean3, psad[3] |
491 |
|
psad1 psad[7] = c[23], r0 |
492 |
;; |
;; |
493 |
cmp4.geu p6, p7 = 15, r24 |
} |
494 |
(p6) br.cond.dptk .L60 |
{.mii |
495 |
mov ar.lc = r2 |
add mean0 = mean0, psad[4] |
496 |
|
psad1 psad[0] = c[24], r0 |
497 |
|
psad1 psad[1] = c[25], r0 |
498 |
|
} |
499 |
|
{.mmi |
500 |
|
add mean1 = mean1, psad[5] |
501 |
|
;; |
502 |
|
add mean2 = mean2, psad[6] |
503 |
|
psad1 psad[2] = c[26], r0 |
504 |
|
} |
505 |
|
{.mii |
506 |
|
add mean3 = mean3, psad[7] |
507 |
|
psad1 psad[3] = c[27], r0 |
508 |
|
;; |
509 |
|
psad1 psad[4] = c[28], r0 |
510 |
|
} |
511 |
|
{.mmi |
512 |
|
add mean0 = mean0, psad[0] |
513 |
|
add mean1 = mean1, psad[1] |
514 |
|
psad1 psad[5] = c[29], r0 |
515 |
|
;; |
516 |
|
} |
517 |
|
{.mii |
518 |
|
add mean2 = mean2, psad[2] |
519 |
|
psad1 psad[6] = c[30], r0 |
520 |
|
psad1 psad[7] = c[31], r0 |
521 |
|
} |
522 |
|
{.mmi |
523 |
|
add mean3 = mean3, psad[3] |
524 |
|
;; |
525 |
|
add mean0 = mean0, psad[4] |
526 |
|
add mean1 = mean1, psad[5] |
527 |
|
} |
528 |
|
{.mbb |
529 |
|
add mean2 = mean2, mean3 |
530 |
|
nop.b 1 |
531 |
|
nop.b 1 |
532 |
|
;; |
533 |
|
} |
534 |
|
{.mib |
535 |
|
add mean0 = mean0, psad[6] |
536 |
|
add mean1 = mean1, psad[7] |
537 |
|
nop.b 1 |
538 |
|
;; |
539 |
|
} |
540 |
|
{.mib |
541 |
|
add mean0 = mean0, mean1 |
542 |
|
// add mean2 = 127, mean2 // this could make our division more exact, but does not help much |
543 |
|
;; |
544 |
|
} |
545 |
|
{.mib |
546 |
|
add mean0 = mean0, mean2 |
547 |
|
;; |
548 |
|
} |
549 |
|
|
550 |
|
{.mib |
551 |
|
shr.u mean0 = mean0, 8 // divide them ... |
552 |
|
;; |
553 |
|
} |
554 |
|
{.mib |
555 |
|
mux1 mean0 = mean0, @brcst |
556 |
|
;; |
557 |
|
} |
558 |
|
{.mii |
559 |
|
nop.m 0 |
560 |
|
psad1 dev0 = c[0], mean0 // and do a sad again ... |
561 |
|
psad1 dev1 = c[1], mean0 |
562 |
|
} |
563 |
|
{.mii |
564 |
|
nop.m 0 |
565 |
|
psad1 dev2 = c[2], mean0 |
566 |
|
psad1 dev3 = c[3], mean0 |
567 |
|
} |
568 |
|
{.mii |
569 |
|
nop.m 0 |
570 |
|
psad1 psad[0] = c[4], mean0 |
571 |
|
psad1 psad[1] = c[5], mean0 |
572 |
|
} |
573 |
|
{.mii |
574 |
|
nop.m 0 |
575 |
|
psad1 psad[2] = c[6], mean0 |
576 |
|
psad1 psad[3] = c[7], mean0 |
577 |
|
} |
578 |
|
{.mii |
579 |
|
nop.m 0 |
580 |
|
psad1 psad[4] = c[8], mean0 |
581 |
|
psad1 psad[5] = c[9], mean0 |
582 |
|
;; |
583 |
|
} |
584 |
|
{.mii |
585 |
|
add dev0 = dev0, psad[0] |
586 |
|
psad1 psad[6] = c[10], mean0 |
587 |
|
psad1 psad[7] = c[11], mean0 |
588 |
|
} |
589 |
|
{.mmi |
590 |
|
add dev1 = dev1, psad[1] |
591 |
|
|
592 |
|
add dev2 = dev2, psad[2] |
593 |
|
psad1 psad[0] = c[12], mean0 |
594 |
|
} |
595 |
|
{.mii |
596 |
|
add dev3 = dev3, psad[3] |
597 |
|
psad1 psad[1] = c[13], mean0 |
598 |
|
;; |
599 |
|
psad1 psad[2] = c[14], mean0 |
600 |
|
} |
601 |
|
{.mmi |
602 |
|
add dev0 = dev0, psad[4] |
603 |
|
add dev1 = dev1, psad[5] |
604 |
|
psad1 psad[3] = c[15], mean0 |
605 |
|
} |
606 |
|
{.mii |
607 |
|
add dev2 = dev2, psad[6] |
608 |
|
psad1 psad[4] = c[16], mean0 |
609 |
|
psad1 psad[5] = c[17], mean0 |
610 |
|
} |
611 |
|
{.mmi |
612 |
|
add dev3 = dev3, psad[7] |
613 |
|
;; |
614 |
|
add dev0 = dev0, psad[0] |
615 |
|
psad1 psad[6] = c[18], mean0 |
616 |
|
} |
617 |
|
{.mii |
618 |
|
add dev1 = dev1, psad[1] |
619 |
|
psad1 psad[7] = c[19], mean0 |
620 |
|
|
621 |
|
psad1 psad[0] = c[20], mean0 |
622 |
|
} |
623 |
|
{.mmi |
624 |
|
add dev2 = dev2, psad[2] |
625 |
|
add dev3 = dev3, psad[3] |
626 |
|
psad1 psad[1] = c[21], mean0 |
627 |
|
;; |
628 |
|
} |
629 |
|
{.mii |
630 |
|
add dev0 = dev0, psad[4] |
631 |
|
psad1 psad[2] = c[22], mean0 |
632 |
|
psad1 psad[3] = c[23], mean0 |
633 |
|
} |
634 |
|
{.mmi |
635 |
|
add dev1 = dev1, psad[5] |
636 |
|
|
637 |
|
add dev2 = dev2, psad[6] |
638 |
|
psad1 psad[4] = c[24], mean0 |
639 |
|
} |
640 |
|
{.mii |
641 |
|
add dev3 = dev3, psad[7] |
642 |
|
psad1 psad[5] = c[25], mean0 |
643 |
|
;; |
644 |
|
psad1 psad[6] = c[26], mean0 |
645 |
|
} |
646 |
|
{.mmi |
647 |
|
add dev0 = dev0, psad[0] |
648 |
|
add dev1 = dev1, psad[1] |
649 |
|
psad1 psad[7] = c[27], mean0 |
650 |
|
} |
651 |
|
{.mii |
652 |
|
add dev2 = dev2, psad[2] |
653 |
|
psad1 psad[0] = c[28], mean0 |
654 |
|
psad1 psad[1] = c[29], mean0 |
655 |
|
} |
656 |
|
{.mmi |
657 |
|
add dev3 = dev3, psad[3] |
658 |
|
;; |
659 |
|
add dev0 = dev0, psad[4] |
660 |
|
psad1 psad[2] = c[30], mean0 |
661 |
|
} |
662 |
|
{.mii |
663 |
|
add dev1 = dev1, psad[5] |
664 |
|
psad1 psad[3] = c[31], mean0 |
665 |
|
;; |
666 |
|
add dev2 = dev2, psad[6] |
667 |
|
} |
668 |
|
{.mmi |
669 |
|
add dev3 = dev3, psad[7] |
670 |
|
add dev0 = dev0, psad[0] |
671 |
|
add dev1 = dev1, psad[1] |
672 |
|
;; |
673 |
|
} |
674 |
|
{.mii |
675 |
|
add dev2 = dev2, psad[2] |
676 |
|
add dev3 = dev3, psad[3] |
677 |
|
add ret0 = dev0, dev1 |
678 |
|
;; |
679 |
|
} |
680 |
|
{.mib |
681 |
|
add dev2 = dev2, dev3 |
682 |
|
nop.i 1 |
683 |
|
nop.b 1 |
684 |
|
;; |
685 |
|
} |
686 |
|
{.mib |
687 |
|
add ret0 = ret0, dev2 |
688 |
|
nop.i 1 |
689 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
690 |
|
} |
691 |
.endp dev16_ia64# |
.endp dev16_ia64# |