Parent Directory | Revision Log
Revision 1.2 - (view) (download)
/****************************************************************************
 *
 * mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel,
 * University of Karlsruhe, Germany, 03.06.2002, during the laboratory
 * "IA-64 Video Codec Assembler Praktikum" at IPD Goos.
 *
 * Annotations:
 * ===========
 *
 * - All functions work on 8x8 matrices. While the C functions treat each
 *   element separately, the functions in this assembler code treat a whole
 *   line simultaneously, so one loop is saved.
 *   The remaining loop is realized by using software pipelining with
 *   rotating registers.
 * - Register renaming is used for better readability.
 * - To load 8 bytes of misaligned data, two 8-byte blocks are loaded, both
 *   parts are shifted and joined together with an "OR" instruction.
 * - The first parameter is passed in GR 32, the next in GR 33, and so on.
 *   They must be saved, as these GRs are used for register rotation.
 * - Some of the original German comments used during development are left
 *   in the code. They shouldn't bother anyone.
 *
 * Anmerkungen:
 * ============
 *
 * - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im
 *   C-Code jedes Element einzeln bearbeiten, bearbeiten die Funktionen
 *   dieses Assembler-Codes eine Zeile gleichzeitig. Dadurch kann eine
 *   Schleife eingespart werden.
 *   Die verbleibende Schleife wird unter Benutzung von Softwarepipelining
 *   mit rotierenden Registern realisiert.
 * - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
 * - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei
 *   8-Byte-Blöcke geladen, beide Teile mit "shift"-Operationen
 *   zurechtgerückt und mit einem logischen Oder zusammenkopiert.
 * - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen
 *   gesichert werden, da die Register für die Register-Rotation benötigt
 *   werden.
 * - Einige der ursprünglichen, deutschen Kommentare aus der
 *   Entwicklungsphase sind im Code verblieben. Sie sollten niemanden stören.
 *
 ****************************************************************************/
41 : | |||
42 : | |||
// *** Latencies for the software pipelines ***
// These symbols are used below to size the rotating register arrays (.rotr),
// the stage predicate arrays (.rotp) and the epilogue counts (ar.ec).

LL	= 3	// Load
SL	= 3	// Store
PL	= 1	// Pack
SHL	= 1	// Shift
OL	= 1	// Or
UL	= 1	// Unpack
PAL	= 1	// Parallel Add
PSL	= 1	// Parallel Subtract
PAVGL	= 1	// Parallel Average

	.text
56 : | |||
57 : | |||
/****************************************************************************
 *
 * transfer8x8_copy_ia64
 *
 * Copies eight rows of 8 bytes each from SRC to DST.
 *
 * Inputs (stacked input registers):
 *   r32  dst     destination address; written with st8, advanced by stride
 *   r33  src     source address; may be misaligned
 *   r34  stride  added to the source and destination address after each row
 *
 * SRC may be misaligned. For every row the two aligned 8-byte words that
 * contain it are loaded, shifted and joined with OR; the joined word is then
 * stored to the destination address. The second aligned word is loaded
 * unconditionally, i.e. also when src is already 8-byte aligned.
 *
 * Uses r2, r3, r9, r14-r19 and rotating registers/predicates. ar.lc and the
 * predicate registers are saved on entry and restored before returning.
 *
 ****************************************************************************/

	.align 16
	.global transfer8x8_copy_ia64#
	.proc transfer8x8_copy_ia64#

transfer8x8_copy_ia64:
	.prologue

	// *** register renaming ***
	zero	= r0

	oldLC	= r2
	oldPR	= r3

	src_1	= r14	// aligned address of the word holding the start of a row
	src_2	= r15	// aligned address of the following word (src_1 + 8)
	dst	= r16	// destination address
	stride	= r17

	offset	= r18	// shift-right count in bits: (src & 7) * 8
	aoffset	= r19	// shift-left count in bits: 64 - offset

	// *** Allocate the register frame, save ar.pfs, LC and PRs ***
	// alloc is placed first so that it starts its instruction group; the
	// .save annotations are kept inside the prologue region.
	.save ar.pfs, r9
	alloc r9 = ar.pfs, 3, 29, 0, 32
	.save ar.lc, r2
	mov oldLC = ar.lc
	.save pr, r3
	mov oldPR = pr

	.body

	// *** Saving Parameters (r32+ rotate once the loop is running) ***
	mov dst = r32
	mov stride = r34

	// *** Misalignment treatment ***
	and src_1 = -8, r33		// address of the first aligned block containing src values
	dep offset = r33, zero, 3, 3	// (src & 7) << 3: shift count for shr.u
	;;
	sub aoffset = 64, offset	// counterpart of offset ("anti-offset"), used for shl
	add src_2 = 8, src_1		// address of the second aligned block containing src values

	// *** init loop: set loop counter, epilogue counter, predicates ***
	mov ar.lc = 7			// 8 rows
	mov ar.ec = LL + SHL + OL + 1	// number of pipeline stages
	mov pr.rot = 1 << 16		// only p16 (first stage) set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	// src_v1/src_v2 = loaded source words, shd_r = shifted right, shd_l = shifted left
	.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
	.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]

	/* Software pipelined loop:
	 *  Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_V1 and SRC_V2
	 *  Stage 2: Shift both source words into SHD_R and SHD_L
	 *  Stage 3: Join both parts together with OR
	 *  Stage 4: Store the aligned data to the destination and add stride to
	 *           the destination address */
.Loop_8x8copy:
	{.mii
	(ld_stage[0])	ld8 src_v1[0] = [src_1], stride
	(sh_stage[0])	shr.u shd_r[0] = src_v1[LL], offset
	}
	{.mii
	(ld_stage[0])	ld8 src_v2[0] = [src_2], stride
	(sh_stage[0])	shl shd_l[0] = src_v2[LL], aoffset
	(or_stage[0])	or value[0] = shd_l[SHL], shd_r[SHL]
	}
	{.mib
	(st_stage[0])	st8 [dst] = value[OL]
	(st_stage[0])	add dst = dst, stride
			br.ctop.sptk.few .Loop_8x8copy
	;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0

	.endp transfer8x8_copy_ia64#
150 : | |||
151 : | |||
152 : | |||
153 : | |||
/*****************************************************************************
 *
 * transfer_8to16copy_ia64
 *
 * Inputs (stacked input registers):
 *   r32  dst     destination; 16 bytes are written per row and the address
 *                advances by 16 per row
 *   r33  src     source; read with ld8, treated as aligned
 *   r34  stride  added to the source address after each row
 *
 * SRC is aligned. To convert 8 bit unsigned values to 16 bit values,
 * UNPACK with r0 (zero) is used. So 8 bytes are loaded from source, unpacked
 * to two 4 x 16 bit values and stored to the destination. Destination is a
 * continuous array of 64 x 16 bit data. To store the next line, only 16
 * must be added to the destination address.
 *
 * Uses r2, r3, r9, r14-r17 and rotating registers/predicates. ar.lc and the
 * predicate registers are saved on entry and restored before returning.
 *
 *****************************************************************************/

	.align 16
	.global transfer_8to16copy_ia64#
	.proc transfer_8to16copy_ia64#

transfer_8to16copy_ia64:
	.prologue

	// *** register renaming ***
	oldLC	= r2
	oldPR	= r3

	zero	= r0	// r0 always reads as 0

	dst_1	= r14	// destination address for first 4 x 16 bit values
	dst_2	= r15	// destination address for second 4 x 16 bit values
	src	= r16
	stride	= r17

	// *** Allocate the register frame, save ar.pfs, LC and PRs ***
	// alloc is placed first so that it starts its instruction group; the
	// .save annotations are kept inside the prologue region.
	.save ar.pfs, r9
	alloc r9 = ar.pfs, 4, 92, 0, 96
	.save ar.lc, r2
	mov oldLC = ar.lc
	.save pr, r3
	mov oldPR = pr

	.body

	// *** Saving Parameters (r32+ rotate once the loop is running) ***
	mov dst_1 = r32		// first 4 x 16 bit values
	add dst_2 = 8, r32	// second 4 x 16 bit values
	mov src = r33
	mov stride = r34

	// *** init loop: set loop counter, epilogue counter, predicates ***
	mov ar.lc = 7			// 8 rows
	mov ar.ec = LL + UL + 1		// number of pipeline stages
	mov pr.rot = 1 << 16		// only p16 (first stage) set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	// src_v = source value, dst_v1/dst_v2 = unpacked destination values
	.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1]
	.rotp ld_stage[LL], upack_stage[UL], st_stage[1]

	/* Software pipelined loop:
	 *  Stage 1: Load value of SRC
	 *  Stage 2: Unpack SRC_V to two words of 4 x 16 bit data
	 *  Stage 3: Store both 8-byte words of 16 bit data */
.Loop_8to16copy:
	{.mii
	(ld_stage[0])		ld8 src_v[0] = [src], stride
	(upack_stage[0])	unpack1.l dst_v1[0] = zero, src_v[LL]
	(upack_stage[0])	unpack1.h dst_v2[0] = zero, src_v[LL]
	}
	{.mmb
	(st_stage[0])		st8 [dst_1] = dst_v1[UL], 16
	(st_stage[0])		st8 [dst_2] = dst_v2[UL], 16
				br.ctop.sptk.few .Loop_8to16copy
	;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0
	.endp transfer_8to16copy_ia64#
235 : | ia64p | 1.2 | |
236 : | |||
237 : | |||
238 : | |||
/*****************************************************************************
 *
 * transfer_16to8copy_ia64
 *
 * Inputs (stacked input registers):
 *   r32  dst     destination; one 8-byte word is stored per row and the
 *                address advances by stride
 *   r33  src     source; 16 bytes are read per row and the address advances
 *                by 16 per row
 *   r34  stride  added to the destination address after each row
 *
 * src is a 64 x 16 bit signed continuous array. To convert the 16 bit
 * values to 8 bit unsigned data, PACK is used. So two 8-byte words of
 * 4 x 16 bit signed data are loaded, packed together and stored as one
 * 8-byte word of 8 x 8 bit unsigned data to the destination.
 *
 * Uses r2, r3, r9, r14-r17 and rotating registers/predicates. ar.lc and the
 * predicate registers are saved on entry and restored before returning.
 *
 ****************************************************************************/

	.align 16
	.global transfer_16to8copy_ia64#
	.proc transfer_16to8copy_ia64#
transfer_16to8copy_ia64:
	.prologue

	// *** register renaming ***
	// oldLC/oldPR are defined here as well so that this procedure does not
	// depend on the aliases set up by another procedure in this file.
	oldLC	= r2
	oldPR	= r3

	dst	= r14
	src_1	= r15	// address of the first 4 x 16 bit values of a row
	src_2	= r17	// address of the second 4 x 16 bit values of a row
	stride	= r16

	// *** Allocate the register frame, save ar.pfs, LC and PRs ***
	// alloc is placed first so that it starts its instruction group; the
	// .save annotations are kept inside the prologue region.
	.save ar.pfs, r9
	alloc r9 = ar.pfs, 4, 92, 0, 96
	.save ar.lc, r2
	mov oldLC = ar.lc
	.save pr, r3
	mov oldPR = pr

	.body

	// *** Saving Parameters (r32+ rotate once the loop is running) ***
	mov dst = r32
	mov src_1 = r33
	add src_2 = 8, r33
	mov stride = r34

	// *** init loop: set loop counter, epilogue counter, predicates ***
	mov ar.lc = 7			// 8 rows
	mov ar.ec = LL + PL + 1		// number of pipeline stages
	mov pr.rot = 1 << 16		// only p16 (first stage) set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	// src_v1/src_v2 = source values, dst_v = destination value
	.rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1]
	.rotp ld_stage[LL], pack_stage[PL], st_stage[1]

	/* Software pipelined loop:
	 *  Stage 1: Load two 8-byte words of 4 x 16 bit signed source data
	 *  Stage 2: Pack them together to one 8-byte word of 8 x 8 bit unsigned data
	 *  Stage 3: Store the 8 bytes to the destination address and add stride to
	 *           the destination address (to get the next 8-byte line of
	 *           the destination) */
.Loop_16to8copy:
	{.mmi
	(ld_stage[0])	ld8 src_v1[0] = [src_1], 16
	(ld_stage[0])	ld8 src_v2[0] = [src_2], 16
	(pack_stage[0])	pack2.uss dst_v[0] = src_v1[LL], src_v2[LL]
	}
	{.mib
	(st_stage[0])	st8 [dst] = dst_v[PL]
	(st_stage[0])	add dst = dst, stride
			br.ctop.sptk.few .Loop_16to8copy
	;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0
	.endp transfer_16to8copy_ia64#
314 : | ia64p | 1.2 | |
315 : | |||
316 : | |||
/*****************************************************************************
 *
 * transfer_16to8add_ia64
 *
 * Inputs (stacked input registers):
 *   r32  dst     8-bit destination; each row is read, updated and written
 *                back; the address advances by stride
 *   r33  src     16-bit source; 16 bytes are read per row and the address
 *                advances by 16 per row
 *   r34  stride  added to the destination address after each row
 *
 * The 8-bit values of dst are "unpacked" into two 8-byte blocks containing
 * 16-bit values. These are "parallel-added" to the values of src. The result
 * is converted into 8-bit values using "PACK" and stored at the address of
 * dst. We assume that there is no misalignment.
 *
 * Uses r2, r3, r9, r14-r17 and rotating registers/predicates. ar.lc and the
 * predicate registers are saved on entry and restored before returning.
 *
 *****************************************************************************/

	.align 16
	.global transfer_16to8add_ia64#
	.proc transfer_16to8add_ia64#

transfer_16to8add_ia64:
	.prologue

	// *** register renaming ***
	// oldLC/oldPR are defined here as well so that this procedure does not
	// depend on the aliases set up by another procedure in this file.
	oldLC	= r2
	oldPR	= r3

	dst	= r14
	src	= r15	// address of the first half (i = 0..3) of a src row
	stride	= r16

	_src	= r17	// address of the second half (i = 4..7) of a src row

	// *** Allocate the register frame, save ar.pfs, LC and PRs ***
	// alloc is placed first so that it starts its instruction group; the
	// .save annotations are kept inside the prologue region.
	.save ar.pfs, r9
	alloc r9 = ar.pfs, 4, 92, 0, 96
	.save ar.lc, r2
	mov oldLC = ar.lc
	.save pr, r3
	mov oldPR = pr

	.body

	// *** Saving Parameters (r32+ rotate once the loop is running) ***
	mov dst = r32
	mov src = r33
	mov stride = r34
	add _src = 8, r33

	// *** init loop: set loop counter, epilogue counter, predicates ***
	mov ar.lc = 7				// 8 rows
	mov ar.ec = LL + UL + PAL + PL + 1	// number of pipeline stages
	mov pr.rot = 1 << 16			// only p16 (first stage) set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	.rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1]
	.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1]

	/* Software pipelined loop:
	 *  s1_p: The values of src and dst are loaded; the row address of dst is
	 *        remembered in _dst for the store in s5_p
	 *  s2_p: The dst values are converted to 16-bit values
	 *  s3_p: The values of src and dst are added
	 *  s4_p: The results are packed into 8-bit values
	 *  s5_p: The 8-bit values are stored at the remembered dst address
	 */

.Loop_16to8add:
	{.mii
	(s1_p[0])	ld8 w_src_1[0] = [src], 16		// load the 1st half of row j of src (i = 0..3)
	(s1_p[0])	mov _dst[0] = dst			// remember this row's dst address (dst itself is advanced by the ld8 below)
	(s3_p[0])	padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL]	// parallel add of src and dst
	}
	{.mii
	(s1_p[0])	ld8 w_dst8[0] = [dst], stride		// load row j of dst, then advance dst by stride
	(s2_p[0])	unpack1.l w_dst16_1[0] = r0, w_dst8[LL]	// dst is converted to 16 bit for i = 0..3
	(s2_p[0])	unpack1.h w_dst16_2[0] = r0, w_dst8[LL]	// dst is converted to 16 bit for i = 4..7
	}
	{.mii
	(s1_p[0])	ld8 w_src_2[0] = [_src], 16		// load the 2nd half of row j of src (i = 4..7)
	(s3_p[0])	padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL]	// parallel add of src and dst
	(s4_p[0])	pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL]	// convert the sums (pixels) to 8-bit values; range limiting is done by the pack instruction
	}
	{.mmb
	(s5_p[0])	st8 [_dst[LL+UL+PAL+PL]] = dst8[PL]	// store the result at the remembered dst row address
	(s1_p[0])	nop.m 0
			br.ctop.sptk.few .Loop_16to8add
	;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0
	.endp transfer_16to8add_ia64#
407 : | |||
408 : | |||
409 : | |||
/*****************************************************************************
 *
 * transfer_8to16sub_ia64
 *
 * Inputs (stacked input registers):
 *   r32  dct     16-bit destination; 16 bytes are written per row and the
 *                address advances by 16 per row
 *   r33  cur     8-bit current block; each row is read and then overwritten
 *   r34  ref     8-bit reference block; may be misaligned
 *   r35  stride  added to the cur and ref addresses after each row
 *
 * The 8-bit values of ref and cur are loaded and both are converted to
 * 16-bit. The difference cur - ref is stored at the dct addresses, and the
 * (realigned) ref row is stored into the cur array.
 *
 * The data addressed by 'ref' must be assumed to be misaligned in memory.
 * The other data are assumed to be aligned.
 *
 * Uses r2, r3, r9, r14-r21 and rotating registers/predicates. ar.lc and the
 * predicate registers are saved on entry and restored before returning.
 *
 ****************************************************************************/

	.align 16
	.global transfer_8to16sub_ia64#
	.proc transfer_8to16sub_ia64#

transfer_8to16sub_ia64:
	.prologue

	// *** register renaming ***
	oldLC	= r2
	oldPR	= r3

	zero	= r0	// r0 always reads as 0

	// The following registers are named after the variables of the C version
	dct	= r14
	cur	= r15
	ref	= r34	// only read before the loop starts, so it is left in its input register
	stride	= r16

	offset	= r17	// shift-right count in bits: (ref & 7) * 8
	aoffset	= r18	// counterpart of offset: 64 - offset
	ref_a1	= r19	// address of the first aligned 64-bit block of ref
	ref_a2	= r20	// address of the second aligned 64-bit block of ref

	_dct	= r21	// destination address of the second dct block of a row

	// *** Allocate the register frame, save ar.pfs, LC and PRs ***
	// alloc is placed first so that it starts its instruction group; the
	// .save annotations are kept inside the prologue region.
	.save ar.pfs, r9
	alloc r9 = ar.pfs, 4, 92, 0, 96
	.save ar.lc, r2
	mov oldLC = ar.lc
	.save pr, r3
	mov oldPR = pr

	.body

	// *** Saving Parameters (r32+ rotate once the loop is running) ***
	mov dct = r32
	mov cur = r33
	// ref (r34) is unaligned; the aligned addresses are derived below
	mov stride = r35

	and ref_a1 = -8, ref		// address of the first aligned 64-bit block that contains ref
	dep offset = ref, zero, 3, 3	// (ref & 7) << 3
	;;
	add ref_a2 = 8, ref_a1
	sub aoffset = 64, offset	// counterpart of offset
	add _dct = 8, dct		// address of the 2nd dct block: 8 bytes (= 64 bit) above the 1st

	// *** init loop: set loop counter, epilogue counter, predicates ***
	mov ar.lc = 7					// 8 rows
	mov ar.ec = LL + SHL + OL + UL + PSL + 1	// number of pipeline stages
	mov pr.rot = 1 << 16				// only p16 (first stage) set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	.rotr c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1], dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1]
	.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1]

	/* Software pipelined loop:
	 *  s1_p: The values of ref and cur are loaded; the row address of cur is
	 *        remembered in _cur.
	 *  s2_p: cur is converted to 16-bit and the misaligned values of ref are
	 *        shifted...
	 *  s3_p: ... and joined together.
	 *  s4_p: This ref value is converted to 16-bit. It is also stored at the
	 *        remembered cur address.
	 *  s5_p: The ref values are subtracted from the cur values...
	 *  s6_p: ...and the result is stored at the dct addresses.
	 */

.Loop_8to16sub:
	{.mii
	(s1_p[0])	ld8 ref_v1[0] = [ref_a1], stride	// load the 1st 64-bit block holding part of the ref data
	(s1_p[0])	mov _cur[0] = cur			// remember this row's cur address for the store in s4_p
	(s2_p[0])	shr.u ref_shdr[0] = ref_v1[LL], offset	// move the part from the 1st block into place
	}
	{.mii
	(s1_p[0])	ld8 ref_v2[0] = [ref_a2], stride	// load the 2nd 64-bit block
	(s2_p[0])	shl ref_shdl[0] = ref_v2[LL], aoffset	// move the part from the 2nd block into place
	(s3_p[0])	or r[0] = ref_shdr[SHL], ref_shdl[SHL]	// join the shifted parts into r
	}
	{.mii
	(s1_p[0])	ld8 c[0] = [cur], stride		// load row j of cur completely
	(s2_p[0])	unpack1.l c16_1[0] = zero, c[LL]	// c is converted to 16 bit for i = 0..3
	(s2_p[0])	unpack1.h c16_2[0] = zero, c[LL]	// c is converted to 16 bit for i = 4..7
	}
	{.mii
	(s4_p[0])	st8 [_cur[LL+SHL+OL]] = r[OL]		// the cur row is overwritten with r
	// convert the 8-bit r values to 16-bit values
	(s4_p[0])	unpack1.l r16_1[0] = zero, r[OL]	// r is converted to 16 bit for i = 0..3
	(s4_p[0])	unpack1.h r16_2[0] = zero, r[OL]	// r is converted to 16 bit for i = 4..7
	}
	{.mii
	(s5_p[0])	psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL]	// subtraction for the 1st half of row j
	(s5_p[0])	psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL]	// subtraction for the 2nd half
	}
	{.mmb
	(s6_p[0])	st8 [dct] = dct_1[PSL], 16	// store the 1st 64-bit block, then advance the address by 16 bytes for the next row
	(s6_p[0])	st8 [_dct] = dct_2[PSL], 16	// store the 2nd 64-bit block, then advance the address by 16 bytes for the next row
			br.ctop.sptk.few .Loop_8to16sub
	;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0
	.endp transfer_8to16sub_ia64#
535 : | ia64p | 1.2 | |
536 : | |||
537 : | |||
538 : | |||
539 : | |||
/*****************************************************************************
 *
 * transfer_8to16sub2_ia64
 *
 * Inputs (stacked input registers):
 *   r32  dct     16-bit destination; 16 bytes are written per row and the
 *                address advances by 16 per row
 *   r33  cur     8-bit current block; read with ld8
 *   r34  ref1    first 8-bit reference block; may be misaligned
 *   r35  ref2    second 8-bit reference block; may be misaligned
 *   r36  stride  added to the cur, ref1 and ref2 addresses after each row
 *
 * At the time this function was written, it was not yet in use.
 * We assume that the values of ref1/2 are misaligned.
 *
 * The values of ref1/2 and cur are loaded; the ref values need misalignment
 * treatment. The average of ref1 and ref2 is computed with pavg; the average
 * and cur are converted to 16-bit using unpack, and the average is subtracted
 * from cur. The results are stored at the dct addresses.
 * pavg1.raz is used to get the same results as the C-code function.
 *
 * Uses r2, r3, r9, r14-r25 and rotating registers/predicates. ar.lc and the
 * predicate registers are saved on entry and restored before returning.
 *
 *****************************************************************************/

	.text
	.align 16
	.global transfer_8to16sub2_ia64#
	.proc transfer_8to16sub2_ia64#

transfer_8to16sub2_ia64:
	.prologue

	// *** register renaming ***
	// We've tried to keep the C-code names as often as possible, at least as
	// part of register names
	oldLC	= r2
	oldPR	= r3

	zero	= r0

	dct_al	= r14	// dct + 8: second 8-byte block of a row
	dct_ar	= r15	// dct: first 8-byte block of a row
	cur	= r16
	ref1_al	= r17	// ref1: aligned address of lower part
	ref1_ah	= r18	// ref1: aligned address of higher part
	ref2_al	= r19	// ref2: aligned address of lower part
	ref2_ah	= r20	// ref2: aligned address of higher part
	stride	= r21

	offset_1	= r22	// (ref1 & 7) * 8
	offset_2	= r23	// (ref2 & 7) * 8
	aoffset_1	= r24	// 64 - offset_1
	aoffset_2	= r25	// 64 - offset_2

	// *** Allocate the register frame, save ar.pfs, LC and PRs ***
	// alloc is placed first so that it starts its instruction group; the
	// .save annotations are kept inside the prologue region.
	.save ar.pfs, r9
	alloc r9 = ar.pfs, 5, 91, 0, 96
	.save ar.lc, r2
	mov oldLC = ar.lc
	.save pr, r3
	mov oldPR = pr

	.body

	// *** Saving Parameters ***
	// *** (as input registers r32+ are needed for register rotation) ***
	mov dct_ar = r32
	add dct_al = 8, r32
	mov cur = r33

	and ref1_al = -8, r34		// ref1: aligned address of lower part
	and ref2_al = -8, r35		// ref2: aligned address of lower part

	mov stride = r36

	// *** Calculations for misalignment handling ***
	dep offset_1 = r34, zero, 3, 3
	dep offset_2 = r35, zero, 3, 3
	;;
	add ref1_ah = 8, ref1_al
	add ref2_ah = 8, ref2_al
	sub aoffset_1 = 64, offset_1
	sub aoffset_2 = 64, offset_2

	// *** init loop: set loop counter, epilogue counter, predicates ***
	mov ar.lc = 7						// 8 rows
	mov ar.ec = LL + SHL + OL + PAVGL + UL + PSL + 1	// number of pipeline stages
	mov pr.rot = 1 << 16					// only p16 (first stage) set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1]
	.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1]

	/* Software pipelined loop:
	 *  ld_stage:   The values of ref1, ref2, cur are loaded
	 *  sh_stage:   The misaligned values of ref1/2 are shifted...
	 *  or_stage:   ...and joined together.
	 *  pavg_stage: The average of ref1 and ref2 is computed.
	 *  up_stage:   The result and the cur values are converted to 16-bit.
	 *  psub_stage: Those values are subtracted...
	 *  st_stage:   ...and stored at the dct addresses.
	 */

.Loop_8to16sub2:
	{.mii
	(ld_stage[0])	ld8 c[0] = [cur], stride
	(sh_stage[0])	shr.u ref1_l[0] = ref1_vl[LL], offset_1
	(sh_stage[0])	shl ref1_h[0] = ref1_vh[LL], aoffset_1
	}
	{.mii
	(ld_stage[0])	ld8 ref1_vl[0] = [ref1_al], stride
	(sh_stage[0])	shr.u ref2_l[0] = ref2_vl[LL], offset_2
	(sh_stage[0])	shl ref2_h[0] = ref2_vh[LL], aoffset_2
	}
	{.mii
	(ld_stage[0])	ld8 ref1_vh[0] = [ref1_ah], stride
	(or_stage[0])	or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL]
	(or_stage[0])	or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL]
	}
	{.mii
	(ld_stage[0])	ld8 ref2_vl[0] = [ref2_al], stride
	(pavg_stage[0])	pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL]
	(up_stage[0])	unpack1.l r16_r[0] = zero, r[PAVGL]
	}
	{.mii
	(ld_stage[0])	ld8 ref2_vh[0] = [ref2_ah], stride
	(up_stage[0])	unpack1.h r16_l[0] = zero, r[PAVGL]
	(up_stage[0])	unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL]
	}
	{.mii
	(st_stage[0])	st8 [dct_ar] = dct16_r[PSL], 16
	(up_stage[0])	unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL]
	(psub_stage[0])	psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL]
	}
	{.mib
	(st_stage[0])	st8 [dct_al] = dct16_l[PSL], 16
	(psub_stage[0])	psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL]
			br.ctop.sptk.few .Loop_8to16sub2
	;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0
	.endp transfer_8to16sub2_ia64#
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |