[cvs] / xvidcore / src / utils / ia64_asm / mem_transfer_ia64.s Repository:
ViewVC logotype

Annotation of /xvidcore/src/utils/ia64_asm/mem_transfer_ia64.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download)

1 : ia64p 1.2 /****************************************************************************
2 :     *
3 :     * mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel,
4 :     * University of Karlsruhe, Germany, 03.06.2002, during the laboratory
5 :     * "IA-64 Video Codec Assember Parktikum" at IPD Goos.
6 :     *
7 :     * Annotations:
8 :     * ===========
9 :     *
10 :     * - All functions work on 8x8-matrices. While the C-code-functions treat each
11 :     * element seperatly, the functions in this assembler-code treat a whole line
12 :     * simultaneously. So one loop is saved.
13 :     * The remaining loop is relized by using softwarepipelining with rotating
14 :     * rregisters.
15 :     * - Register renaming is used for better readability
16 :     * - To load 8 bytes of missaligned data, two 8-byte-blocks are loaded, both
17 :     * parts are shifted and joined together with an "OR"-Instruction.
18 :     * - First parameter is stored in GR 32, next in GR 33, and so on. They must be
19 :     * saved, as these GRs are used for register-rotation.
20 :     * - Some of the orininal, German comments used during development are left in
21 :     * in the code. They shouldn't bother anyone.
22 :     *
23 :     * Anmerkungen:
24 :     * ============
25 :     *
26 :     * - Alle Funtionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code
27 :     * jedes Element einzeln bearbeiten, bearbeiten die Funtionen dieses Assembler-
28 :     * Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden.
29 :     * Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit
30 :     * rotierenden Registern realisiert.
31 :     * - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
32 :     * - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke
33 :     * geladen, beide Teile mit "shift"-Operationen zurechterückt und mit einem
34 :     * logischen Oder zusammenkopiert.
35 :     * - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge-
36 :     * sichert werden, da die Register für die register-Rotation benötigt werden.
37 :     * - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase
38 :     * sind im Code verblieben. Sie sollten niemanden stören.
39 :     *
40 :     ****************************************************************************/
41 :    
42 :    
43 :     // *** Define latencies for the software pipelines ***
44 :    
45 :     LL = 3 // Load: number of pipeline stages between a load and the first use of its result
46 :     SL = 3 // Store
47 :     PL = 1 // Pack
48 :     SHL = 1 // Shift
49 :     OL = 1 // Or
50 :     UL = 1 // Unpack
51 :     PAL = 1 // Parallel Add
52 :     PSL = 1 // Parallel Subtract
53 :     PAVGL = 1 // Parallel Average
54 :    
55 :     .text
56 :    
57 :    
/****************************************************************************
 *
 * transfer8x8_copy_ia64
 *
 * Copies eight rows of 8 bytes from src to dst. After every row the source
 * and the destination address are both advanced by stride.
 *
 * In:  r32 = dst, r33 = src, r34 = stride
 *
 * src is treated as misaligned: for every row the two aligned 8-byte words
 * that contain the row are loaded, shifted and joined with an OR; the joined
 * word is written to dst with a single st8. The second aligned word is
 * loaded for every row, even when src is already aligned.
 *
 * Registers written: r2, r3, r9, r14-r19, the rotating registers starting
 * at r32, the rotating predicates and ar.ec. ar.lc and pr are restored
 * before returning.
 *
 * NOTE(review): dst is written with st8, so it is presumably 8-byte aligned,
 * and stride is used as a full 64-bit value - confirm both against the C
 * prototype and the callers.
 *
 ****************************************************************************/

        .align 16
        .global transfer8x8_copy_ia64#
        .proc transfer8x8_copy_ia64#

transfer8x8_copy_ia64:
        .prologue

        // *** register renaming ***
        zero = r0

        oldLC = r2
        oldPR = r3

        src_1 = r14             // aligned address of the 8-byte word holding the start of the row
        src_2 = r15             // aligned address of the following 8-byte word
        dst = r16               // destination address
        stride = r17

        offset = r18            // shift-right count in bits: (src & 7) * 8
        aoffset = r19           // shift-left count in bits: 64 - offset

        // *** Save the old loop counter (LC) and predicate registers (PR) ***
        // The saves and their unwind annotations belong to the prologue region.
        .save ar.lc, oldLC
        mov oldLC = ar.lc
        .save pr, oldPR
        mov oldPR = pr
        ;;                      // alloc has to be the first instruction of its group

        .body

        // *** Allocate the register frame: 3 inputs, 29 locals, 32 rotating ***
        alloc r9 = ar.pfs, 3, 29, 0, 32

        // *** Save the parameters (r32+ is reused by the register rotation) ***
        mov dst = r32
        mov stride = r34

        // *** Misalignment treatment ***
        and src_1 = -8, r33             // address of the first aligned word containing src data
        dep offset = r33, zero, 3, 3    // offset = (src & 7) << 3, the shift count for shr.u
        ;;
        sub aoffset = 64, offset        // counterpart of offset ("anti-offset"), used for shl
        add src_2 = 8, src_1            // address of the second aligned word containing src data

        // *** init loop: loop counter (8 rows), epilog counter (number of stages), predicates ***
        mov ar.lc = 7
        mov ar.ec = LL + SHL + OL + 1
        mov pr.rot = 1 << 16
        ;;

        // *** register arrays and predicate array for the software pipeline ***
        // src_v1/src_v2 = the two loaded source words, shd_r = shifted right,
        // shd_l = shifted left, value = joined row
        .rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
        .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]

        /* Software pipelined loop:
         * Stage 1: Load two 8-byte words from SRC_1 and SRC_2 into SRC_V1 and SRC_V2
         * Stage 2: Shift both source words into SHD_R and SHD_L
         * Stage 3: Join both parts with OR
         * Stage 4: Store the aligned data to the destination and add stride to
         *          the destination address */
.Loop_8x8copy:
        {.mii
        (ld_stage[0]) ld8 src_v1[0] = [src_1], stride
        (sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset
        }
        {.mii
        (ld_stage[0]) ld8 src_v2[0] = [src_2], stride
        (sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset
        (or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL]
        }
        {.mib
        (st_stage[0]) st8 [dst] = value[OL]
        (st_stage[0]) add dst = dst, stride
        br.ctop.sptk.few .Loop_8x8copy
        ;;
        }

        // *** Restore old LC and PRs ***
        mov ar.lc = oldLC
        mov pr = oldPR, -1

        br.ret.sptk.many b0

        .endp transfer8x8_copy_ia64#
150 :    
151 :    
152 :    
153 :    
/*****************************************************************************
 *
 * transfer_8to16copy_ia64
 *
 * Converts eight rows of 8 unsigned bytes into 16-bit values.
 *
 * In:  r32 = dst, r33 = src, r34 = stride
 *
 * SRC is assumed to be aligned. Each row is loaded with one ld8 and widened
 * with UNPACK against zero into two words of 4 x 16 bit, which are stored at
 * dst and dst + 8. The destination is a continuous array of 64 x 16 bit
 * values, so both destination addresses advance by 16 per row, while the
 * source address advances by stride.
 *
 * Registers written: r2, r3, r9, r14-r17, the rotating registers starting
 * at r32, the rotating predicates and ar.ec. ar.lc and pr are restored
 * before returning.
 *
 * NOTE(review): stride is used as a full 64-bit value - confirm against the
 * C prototype.
 *
 *****************************************************************************/

        .align 16
        .global transfer_8to16copy_ia64#
        .proc transfer_8to16copy_ia64#

transfer_8to16copy_ia64:
        .prologue

        // *** register renaming ***
        oldLC = r2
        oldPR = r3

        zero = r0               // the constant 0

        dst_1 = r14             // destination address for the first 4 x 16 bit values
        dst_2 = r15             // destination address for the second 4 x 16 bit values
        src = r16
        stride = r17

        // *** Save the old loop counter (LC) and predicate registers (PR) ***
        // The saves and their unwind annotations belong to the prologue region.
        .save ar.lc, oldLC
        mov oldLC = ar.lc
        .save pr, oldPR
        mov oldPR = pr
        ;;                      // alloc has to be the first instruction of its group

        .body

        // *** Allocate the register frame, define the rotating registers ***
        alloc r9 = ar.pfs, 4, 92, 0, 96

        // *** Save the parameters (r32+ is reused by the register rotation) ***
        mov dst_1 = r32         // first 4 x 16 bit values
        add dst_2 = 8, r32      // second 4 x 16 bit values
        mov src = r33
        mov stride = r34

        // *** init loop: loop counter (8 rows), epilog counter (number of stages), predicates ***
        mov ar.lc = 7
        mov ar.ec = LL + UL + 1
        mov pr.rot = 1 << 16
        ;;

        // *** register arrays and predicate array for the software pipeline ***
        // src_v = source value, dst_v1/dst_v2 = the two widened half-rows
        .rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1]
        .rotp ld_stage[LL], upack_stage[UL], st_stage[1]

        /* Software pipelined loop:
         * Stage 1: Load one row of SRC
         * Stage 2: Unpack SRC_V into two words of 4 x 16 bit
         * Stage 3: Store both 8-byte words of 16-bit data */
.Loop_8to16copy:
        {.mii
        (ld_stage[0]) ld8 src_v[0] = [src], stride
        (upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL]
        (upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL]
        }
        {.mmb
        (st_stage[0]) st8 [dst_1] = dst_v1[UL], 16
        (st_stage[0]) st8 [dst_2] = dst_v2[UL], 16
        br.ctop.sptk.few .Loop_8to16copy
        ;;
        }

        // *** Restore old LC and PRs ***
        mov ar.lc = oldLC
        mov pr = oldPR, -1

        br.ret.sptk.many b0
        .endp transfer_8to16copy_ia64#
235 : ia64p 1.2
236 :    
237 :    
238 :    
/*****************************************************************************
 *
 * transfer_16to8copy_ia64
 *
 * Converts eight rows of 8 signed 16-bit values into unsigned bytes.
 *
 * In:  r32 = dst, r33 = src, r34 = stride
 *
 * src is a continuous array of 64 x 16 bit signed values. For every row two
 * 8-byte words of 4 x 16 bit are loaded (from src and src + 8, both advancing
 * by 16 per row), combined with PACK (pack2.uss) into one 8-byte word of
 * 8 x 8 bit unsigned data and stored to dst, which advances by stride.
 *
 * Registers written: r2, r3, r9, r14-r17, the rotating registers starting
 * at r32, the rotating predicates and ar.ec. ar.lc and pr are restored
 * before returning.
 *
 * NOTE(review): stride is used as a full 64-bit value - confirm against the
 * C prototype.
 *
 ****************************************************************************/

        .align 16
        .global transfer_16to8copy_ia64#
        .proc transfer_16to8copy_ia64#

transfer_16to8copy_ia64:
        .prologue

        // *** register renaming ***
        // oldLC/oldPR are defined here so this procedure does not depend on
        // the aliases of another procedure in the file.
        oldLC = r2
        oldPR = r3

        dst = r14
        src_1 = r15             // address of the first 4 x 16 bit values of a row
        src_2 = r17             // address of the second 4 x 16 bit values of a row
        stride = r16

        // *** Save the old loop counter (LC) and predicate registers (PR) ***
        // The saves and their unwind annotations belong to the prologue region.
        .save ar.lc, oldLC
        mov oldLC = ar.lc
        .save pr, oldPR
        mov oldPR = pr
        ;;                      // alloc has to be the first instruction of its group

        .body

        // *** Allocate the register frame, define the rotating registers ***
        alloc r9 = ar.pfs, 4, 92, 0, 96

        // *** Save the parameters (r32+ is reused by the register rotation) ***
        mov dst = r32
        mov src_1 = r33
        add src_2 = 8, r33
        mov stride = r34

        // *** init loop: loop counter (8 rows), epilog counter (number of stages), predicates ***
        mov ar.lc = 7
        mov ar.ec = LL + PL + 1
        mov pr.rot = 1 << 16
        ;;

        // *** register arrays and predicate array for the software pipeline ***
        // src_v1/src_v2 = the two loaded source words, dst_v = packed destination value
        .rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1]
        .rotp ld_stage[LL], pack_stage[PL], st_stage[1]

        /* Software pipelined loop:
         * Stage 1: Load two 8-byte words of 4 x 16 bit signed source data
         * Stage 2: Pack them into one 8-byte word of 8 x 8 bit unsigned data
         * Stage 3: Store the 8 bytes to the destination address and add stride
         *          to it (to reach the next 8-byte row of the destination) */
.Loop_16to8copy:
        {.mmi
        (ld_stage[0]) ld8 src_v1[0] = [src_1], 16
        (ld_stage[0]) ld8 src_v2[0] = [src_2], 16
        (pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL]
        }
        {.mib
        (st_stage[0]) st8 [dst] = dst_v[PL]
        (st_stage[0]) add dst = dst, stride
        br.ctop.sptk.few .Loop_16to8copy
        ;;
        }

        // *** Restore old LC and PRs ***
        mov ar.lc = oldLC
        mov pr = oldPR, -1

        br.ret.sptk.many b0
        .endp transfer_16to8copy_ia64#
314 : ia64p 1.2
315 :    
316 :    
/*****************************************************************************
 *
 * transfer_16to8add_ia64
 *
 * Adds eight rows of 8 signed 16-bit values from src to the 8-bit values
 * already stored at dst and writes the sums back to dst.
 *
 * In:  r32 = dst, r33 = src, r34 = stride
 *
 * The 8-bit values of a dst row are "unpacked" into two 8-byte words of
 * 16-bit values. These are "parallel-added" (padd2.sss) to the values of
 * src. The result is converted back into 8-bit values with "PACK"
 * (pack2.uss) and stored at the address the row was loaded from.
 * src advances by 16 bytes per row, dst by stride.
 * We assume that there is no misalignment.
 *
 * Registers written: r2, r3, r9, r14-r17, the rotating registers starting
 * at r32, the rotating predicates and ar.ec. ar.lc and pr are restored
 * before returning.
 *
 * NOTE(review): stride is used as a full 64-bit value - confirm against the
 * C prototype.
 *
 *****************************************************************************/

        .align 16
        .global transfer_16to8add_ia64#
        .proc transfer_16to8add_ia64#

transfer_16to8add_ia64:
        .prologue

        // *** register renaming ***
        // oldLC/oldPR are defined here so this procedure does not depend on
        // the aliases of another procedure in the file.
        oldLC = r2
        oldPR = r3

        dst = r14
        src = r15               // address of the first 4 x 16 bit values of a src row
        stride = r16

        _src = r17              // address of the second 4 x 16 bit values of a src row

        // *** Save the old loop counter (LC) and predicate registers (PR) ***
        // The saves and their unwind annotations belong to the prologue region.
        .save ar.lc, oldLC
        mov oldLC = ar.lc
        .save pr, oldPR
        mov oldPR = pr
        ;;                      // alloc has to be the first instruction of its group

        .body

        // *** Allocate the register frame, define the rotating registers ***
        alloc r9 = ar.pfs, 4, 92, 0, 96

        // *** Save the parameters (r32+ is reused by the register rotation) ***
        mov dst = r32
        mov src = r33
        mov stride = r34
        add _src = 8, r33

        // *** init loop: loop counter (8 rows), epilog counter (number of stages), predicates ***
        mov ar.lc = 7
        mov ar.ec = LL + UL + PAL + PL + 1
        mov pr.rot = 1 << 16
        ;;

        // *** register arrays and predicate array for the software pipeline ***
        .rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1]
        .rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1]

        /* Software pipelined loop:
         * s1_p: The values of src and dst are loaded; the dst row address is
         *       remembered in _dst for the store stage
         * s2_p: The dst values are converted to 16-bit values
         * s3_p: The values of src and dst are added
         * s4_p: The results are packed into 8-bit values
         * s5_p: The 8-bit values are stored at the remembered dst address
         */

.Loop_16to8add:
        {.mii
        (s1_p[0]) ld8 w_src_1[0] = [src], 16                                    // load the first half of src row j (i = 0..3)
        (s1_p[0]) mov _dst[0] = dst                                             // remember the address of dst row j for the store in s5_p
        (s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL]          // parallel add of src and dst (i = 0..3)
        }
        {.mii
        (s1_p[0]) ld8 w_dst8[0] = [dst], stride                                 // load dst row j, then advance dst by stride
        (s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]                       // widen dst to 16 bit for i = 0..3
        (s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]                       // widen dst to 16 bit for i = 4..7
        }
        {.mii
        (s1_p[0]) ld8 w_src_2[0] = [_src], 16                                   // load the second half of src row j (i = 4..7)
        (s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL]          // parallel add of src and dst (i = 4..7)
        (s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL]                // pack the sums into 8-bit values; the range check is done by the pack instruction itself
        }
        {.mmb
        (s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL]                           // store the result row at the address remembered in s1_p
        (s1_p[0]) nop.m 0
        br.ctop.sptk.few .Loop_16to8add
        ;;
        }

        // *** Restore old LC and PRs ***
        mov ar.lc = oldLC
        mov pr = oldPR, -1

        br.ret.sptk.many b0
        .endp transfer_16to8add_ia64#
407 :    
408 :    
409 :    
/*****************************************************************************
 *
 * transfer_8to16sub_ia64
 *
 * For eight rows of 8 bytes: stores cur - ref as 16-bit values at dct and
 * overwrites cur with ref.
 *
 * In:  r32 = dct, r33 = cur, r34 = ref, r35 = stride
 *
 * The 8-bit values of ref and cur are loaded and both converted to 16 bit.
 * The difference of cur and ref is stored at the dct addresses (two 8-byte
 * words per row, dct advances by 16 bytes per row) and the ref row is
 * written into the cur array. cur and ref advance by stride per row.
 *
 * The data addressed by 'ref' must be assumed to be misaligned in memory,
 * so every ref row is assembled from two aligned 8-byte words. The other
 * data are assumed to be aligned.
 *
 * Registers written: r2, r3, r9, r14-r21, the rotating registers starting
 * at r32, the rotating predicates and ar.ec. ar.lc and pr are restored
 * before returning.
 *
 * NOTE(review): stride is used as a full 64-bit value - confirm against the
 * C prototype.
 *
 ****************************************************************************/

        .align 16
        .global transfer_8to16sub_ia64#
        .proc transfer_8to16sub_ia64#

transfer_8to16sub_ia64:
        .prologue

        // *** register renaming ***
        oldLC = r2
        oldPR = r3

        zero = r0               // the constant 0

        // The following registers are named after the variables of the C reference code
        dct = r14
        cur = r15
        ref = r34               // only read before the loop starts, so the incoming parameter register is used directly
        stride = r16

        offset = r17            // bit offset of the misaligned ref data, shift count for shr.u
        aoffset = r18           // counterpart of offset (64 - offset), shift count for shl
        ref_a1 = r19            // address of the first aligned 64-bit word of ref
        ref_a2 = r20            // address of the second aligned 64-bit word of ref

        _dct = r21              // destination address of the second dct word of a row

        // *** Save the old loop counter (LC) and predicate registers (PR) ***
        // The saves and their unwind annotations belong to the prologue region.
        .save ar.lc, oldLC
        mov oldLC = ar.lc
        .save pr, oldPR
        mov oldPR = pr
        ;;                      // alloc has to be the first instruction of its group

        .body

        // *** Allocate the register frame, define the rotating registers ***
        alloc r9 = ar.pfs, 4, 92, 0, 96

        // *** Save the parameters (r32+ is reused by the register rotation) ***
        mov dct = r32
        mov cur = r33
        // ref (r34) is unaligned; the aligned addresses are derived below
        mov stride = r35

        and ref_a1 = -8, ref            // address of the first aligned 64-bit word that contains ref data
        dep offset = ref, zero, 3, 3    // offset = (ref & 7) << 3
        ;;
        add ref_a2 = 8, ref_a1
        sub aoffset = 64, offset        // counterpart of offset
        add _dct = 8, dct               // second dct word of a row lies 8 bytes (= 64 bit) above the first

        // *** init loop: loop counter (8 rows), epilog counter (number of stages), predicates ***
        mov ar.lc = 7
        mov ar.ec = LL + SHL + OL + UL + PSL + 1
        mov pr.rot = 1 << 16
        ;;

        // *** register arrays and predicate array for the software pipeline ***
        .rotr c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1], dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1]
        .rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1]

        /* Software pipelined loop:
         * s1_p: The values of ref and cur are loaded, the cur row address is
         *       remembered in _cur.
         * s2_p: cur is converted to 16 bit and the misaligned words of ref are
         *       shifted...
         * s3_p: ...and joined.
         * s4_p: The joined ref value is converted to 16 bit and stored at the
         *       remembered cur address.
         * s5_p: ref is subtracted from cur...
         * s6_p: ...and the result is stored at the dct addresses.
         */

.Loop_8to16sub:
        {.mii
        (s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride                      // load the first 64-bit word holding part of the ref row
        (s1_p[0]) mov _cur[0] = cur                                     // remember the cur row address for the store in s4_p
        (s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset                // shift the part from the first word into place
        }
        {.mii
        (s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride                      // load the second 64-bit word
        (s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset                 // shift the part from the second word into place
        (s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL]                // join the shifted parts into r
        }
        {.mii
        (s1_p[0]) ld8 c[0] = [cur], stride                              // load the complete row j of cur
        (s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]                      // widen c to 16 bit for i = 0..3
        (s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]                      // widen c to 16 bit for i = 4..7
        }
        {.mii
        (s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL]                         // overwrite the cur row with the value of r
        // widen the 8-bit r values to 16 bit
        (s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]                      // widen r to 16 bit for i = 0..3
        (s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]                      // widen r to 16 bit for i = 4..7
        }
        {.mii
        (s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL]      // subtraction for the first half of row j
        (s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL]      // subtraction for the second half of row j
        }
        {.mmb
        (s6_p[0]) st8 [dct] = dct_1[PSL], 16                            // store the first 64-bit word, advance by 16 bytes to the next row
        (s6_p[0]) st8 [_dct] = dct_2[PSL], 16                           // store the second 64-bit word, advance by 16 bytes to the next row
        br.ctop.sptk.few .Loop_8to16sub
        ;;
        }

        // *** Restore old LC and PRs ***
        mov ar.lc = oldLC
        mov pr = oldPR, -1

        br.ret.sptk.many b0
        .endp transfer_8to16sub_ia64#
535 : ia64p 1.2
536 :    
537 :    
538 :    
539 :    
/*****************************************************************************
 *
 * transfer_8to16sub2_ia64
 *
 * For eight rows of 8 bytes: stores cur - avg(ref1, ref2) as 16-bit values
 * at dct.
 *
 * In:  r32 = dct, r33 = cur, r34 = ref1, r35 = ref2, r36 = stride
 *
 * At the time this function was written, it was not yet in use.
 * We assume that the values of ref1/2 are misaligned.
 *
 * The values of ref1/2 and cur are loaded; the ref values need misalignment
 * treatment (two aligned loads, shift, OR). The average of ref1 and ref2 is
 * computed with pavg, converted to 16 bit using unpack and subtracted from
 * the widened cur values. The results are stored at the dct addresses (two
 * 8-byte words per row, advancing by 16 bytes per row); cur and both ref
 * addresses advance by stride.
 * pavg1.raz is used to get the same results as the C-code function.
 *
 * Registers written: r2, r3, r9, r14-r25, the rotating registers starting
 * at r32, the rotating predicates and ar.ec. ar.lc and pr are restored
 * before returning.
 *
 * NOTE(review): stride is used as a full 64-bit value - confirm against the
 * C prototype.
 *
 *****************************************************************************/

        .text
        .align 16
        .global transfer_8to16sub2_ia64#
        .proc transfer_8to16sub2_ia64#

transfer_8to16sub2_ia64:
        .prologue

        // *** register renaming ***
        // We've tried to keep the C-code names as often as possible, at least as
        // part of the register names
        oldLC = r2
        oldPR = r3

        zero = r0

        dct_al = r14            // dct + 8: receives the values unpacked with unpack1.h (dct16_l)
        dct_ar = r15            // dct + 0: receives the values unpacked with unpack1.l (dct16_r)
        cur = r16
        ref1_al = r17           // ref1: aligned address of lower part
        ref1_ah = r18           // ref1: aligned address of higher part
        ref2_al = r19           // ref2: aligned address of lower part
        ref2_ah = r20           // ref2: aligned address of higher part
        stride = r21

        offset_1 = r22          // (ref1 & 7) << 3, shift count for shr.u
        offset_2 = r23          // (ref2 & 7) << 3, shift count for shr.u
        aoffset_1 = r24         // 64 - offset_1, shift count for shl
        aoffset_2 = r25         // 64 - offset_2, shift count for shl

        // *** Save the old loop counter (LC) and predicate registers (PR) ***
        // The saves and their unwind annotations belong to the prologue region.
        .save ar.lc, oldLC
        mov oldLC = ar.lc
        .save pr, oldPR
        mov oldPR = pr

        .body

        // *** Save the parameters ***
        // *** (the input registers r32+ are needed for the register rotation) ***
        mov dct_ar = r32
        add dct_al = 8, r32
        mov cur = r33

        and ref1_al = -8, r34
        and ref2_al = -8, r35   // ref2: aligned address of lower part

        mov stride = r36

        // *** Calculations for the misalignment handling ***
        dep offset_1 = r34, zero, 3, 3
        dep offset_2 = r35, zero, 3, 3
        ;;
        add ref1_ah = 8, ref1_al
        add ref2_ah = 8, ref2_al
        sub aoffset_1 = 64, offset_1
        sub aoffset_2 = 64, offset_2
        ;;

        // *** Allocate the register frame, define the rotating registers ***
        alloc r9 = ar.pfs, 5, 91, 0, 96

        // *** init loop: loop counter (8 rows), epilog counter (number of stages), predicates ***
        mov ar.lc = 7
        mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1
        mov pr.rot = 1 << 16
        ;;

        // *** register arrays and predicate array for the software pipeline ***
        .rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1]
        .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1]

        /* Software pipelined loop:
         * ld_stage:   The values of ref1, ref2, cur are loaded
         * sh_stage:   The misaligned values of ref1/2 are shifted...
         * or_stage:   ...and joined.
         * pavg_stage: The average of ref1 and ref2 is computed.
         * up_stage:   The result and the cur values are converted to 16 bit.
         * psub_stage: Those values are subtracted...
         * st_stage:   ...and stored at the dct addresses.
         */

.Loop_8to16sub2:
        {.mii
        (ld_stage[0]) ld8 c[0] = [cur], stride
        (sh_stage[0]) shr.u ref1_l[0] = ref1_vl[LL], offset_1
        (sh_stage[0]) shl ref1_h[0] = ref1_vh[LL], aoffset_1
        }
        {.mii
        (ld_stage[0]) ld8 ref1_vl[0] = [ref1_al], stride
        (sh_stage[0]) shr.u ref2_l[0] = ref2_vl[LL], offset_2
        (sh_stage[0]) shl ref2_h[0] = ref2_vh[LL], aoffset_2
        }
        {.mii
        (ld_stage[0]) ld8 ref1_vh[0] = [ref1_ah], stride
        (or_stage[0]) or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL]
        (or_stage[0]) or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL]
        }
        {.mii
        (ld_stage[0]) ld8 ref2_vl[0] = [ref2_al], stride
        (pavg_stage[0]) pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL]
        (up_stage[0]) unpack1.l r16_r[0] = zero, r[PAVGL]
        }
        {.mii
        (ld_stage[0]) ld8 ref2_vh[0] = [ref2_ah], stride
        (up_stage[0]) unpack1.h r16_l[0] = zero, r[PAVGL]
        (up_stage[0]) unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL]
        }
        {.mii
        (st_stage[0]) st8 [dct_ar] = dct16_r[PSL], 16
        (up_stage[0]) unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL]
        (psub_stage[0]) psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL]
        }
        {.mib
        (st_stage[0]) st8 [dct_al] = dct16_l[PSL], 16
        (psub_stage[0]) psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL]
        br.ctop.sptk.few .Loop_8to16sub2
        ;;
        }

        // *** Restore old LC and PRs ***
        mov ar.lc = oldLC
        mov pr = oldPR, -1

        br.ret.sptk.many b0
        .endp transfer_8to16sub2_ia64#

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4