Parent Directory | Revision Log
Revision 1.6 - (view) (download)
1 : | Isibaar | 1.6 | // **************************************************************************** |
2 : | // * | ||
3 : | // * XVID MPEG-4 VIDEO CODEC | ||
4 : | // * - IA64 8bit<->16bit transfer - | ||
5 : | // * | ||
6 : | // * Copyright(C) 2002 Sebastian Felis, Max Stengel | ||
7 : | // * | ||
8 : | // * This program is free software; you can redistribute it and/or modify it | ||
9 : | // * under the terms of the GNU General Public License as published by | ||
10 : | // * the Free Software Foundation; either version 2 of the License, or | ||
11 : | // * (at your option) any later version. | ||
12 : | // * | ||
13 : | // * This program is distributed in the hope that it will be useful, | ||
14 : | // * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 : | // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 : | // * GNU General Public License for more details. | ||
17 : | // * | ||
18 : | // * You should have received a copy of the GNU General Public License | ||
19 : | // * along with this program; if not, write to the Free Software | ||
20 : | // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
21 : | // * | ||
22 : | // * $Id: mem_transfer_ia64.s,v 1.5 2008/12/04 14:41:50 Isibaar Exp $ | ||
23 : | // * | ||
24 : | // ***************************************************************************/ | ||
25 : | // | ||
26 : | // **************************************************************************** | ||
27 : | // * | ||
28 : | // * mem_transfer_ia64.s, IA-64 8bit<->16bit transfer | ||
29 : | // * | ||
30 : | // * This version was implemented during an IA-64 practical training at | ||
31 : | // * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) | ||
32 : | // * | ||
33 : | // **************************************************************************** | ||
34 : | |||
///////////////////////////////////////////////////////////////////////////////
//
// mem_transfer.c optimized for IA-64 by Sebastian Felis and Max Stengel,
// University of Karlsruhe, Germany, 03.06.2002, during the laboratory
// "IA-64 Video Codec Assembler Praktikum" at IPD Goos.
40 : | Isibaar | 1.6 | |
41 : | ia64p | 1.3 | ///// History ///////////////////////////////////////////////////////////////// |
42 : | // | ||
43 : | // - 16.07.2002: several minor changes for ecc-conformity | ||
44 : | // - 03.06.2002: initial version | ||
45 : | // | ||
46 : | Isibaar | 1.6 | |
47 : | ia64p | 1.3 | /////////////////////////////////////////////////////////////////////////////// |
48 : | // | ||
// Annotations:
// ============
//
// - All functions work on 8x8 matrices. While the C functions treat each
//   element separately, the functions in this assembler code treat a whole
//   line simultaneously, so one loop is saved.
//   The remaining loop is realized by software pipelining with rotating
//   registers.
// - Register renaming is used for better readability.
// - To load 8 bytes of misaligned data, two 8-byte blocks are loaded; both
//   parts are shifted and joined together with an "OR" instruction.
// - The first parameter is passed in GR 32, the next in GR 33, and so on.
//   They must be saved, as these GRs are used for register rotation.
// - Some of the original German comments used during development are left
//   in the code. They shouldn't bother anyone.
//
65 : | // Anmerkungen: | ||
66 : | // ============ | ||
67 : | // | ||
68 : | // - Alle Funtionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code | ||
69 : | // jedes Element einzeln bearbeiten, bearbeiten die Funtionen dieses Assembler- | ||
70 : | // Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. | ||
71 : | // Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit | ||
72 : | // rotierenden Registern realisiert. | ||
73 : | // - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. | ||
74 : | // - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke | ||
75 : | // geladen, beide Teile mit "shift"-Operationen zurechterückt und mit einem | ||
76 : | // logischen Oder zusammenkopiert. | ||
77 : | // - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- | ||
78 : | // sichert werden, da die Register für die register-Rotation benötigt werden. | ||
79 : | // - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase | ||
80 : | // sind im Code verblieben. Sie sollten niemanden stören. | ||
81 : | // | ||
82 : | /////////////////////////////////////////////////////////////////////////////// | ||
83 : | ia64p | 1.2 | |
84 : | |||
// *** Latencies used to size the stages of the software pipelines ***

// memory operations
LL	= 3	// load
SL	= 3	// store

// shift and logical operations
SHL	= 1	// shift
OL	= 1	// or

// parallel (multimedia) operations
PL	= 1	// pack
UL	= 1	// unpack
PAL	= 1	// parallel add
PSL	= 1	// parallel subtract
PAVGL	= 1	// parallel average

	.text
98 : | |||
99 : | |||
///////////////////////////////////////////////////////////////////////////////
//
// transfer8x8_copy_ia64
//
// Copies 8 rows of 8 bytes from src to dst. After every row both pointers
// advance by stride.
//
// Arguments (read from the input registers):
//   r32 = dst     destination address, written with st8
//   r33 = src     source address, may be misaligned
//   r34 = stride  byte distance between two rows
//
// Because src may be misaligned, each row is read as the two aligned 8-byte
// words that contain it; the words are shifted and joined with OR before the
// row is stored to dst.
// NOTE(review): dst is written with a plain st8, so it is presumably 8-byte
// aligned - confirm with the callers.
//
// ar.lc and the predicate registers are saved in r2/r3 and restored before
// returning.
//
///////////////////////////////////////////////////////////////////////////////

	.align 16
	.global transfer8x8_copy_ia64#
	.proc transfer8x8_copy_ia64#

transfer8x8_copy_ia64:
	.prologue

	// *** register renaming ***
	zero = r0

	oldLC = r2		// saved ar.lc
	oldPR = r3		// saved predicate registers

	src_1 = r14		// address of the aligned word containing the start of the row
	src_2 = r15		// address of the following aligned word
	dst = r16		// destination address
	stride = r17

	offset = r18		// right-shift count in bits: (src & 7) * 8
	aoffset = r19		// left-shift count in bits: 64 - offset

	// *** Allocating the register frame (32 registers, all rotating) ***
	// alloc has to be the first instruction of an instruction group, so it
	// is issued before anything else.
	alloc r9 = ar.pfs, 3, 29, 0, 32

	// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
	// Both saves are announced to the unwinder.
	.save ar.lc, oldLC
	mov oldLC = ar.lc
	.save pr, oldPR
	mov oldPR = pr

	.body

	// *** Saving parameters (r32+ are reused as rotating registers) ***
	mov dst = r32
	mov stride = r34

	// *** Misalignment treatment ***
	and src_1 = -8, r33		// aligned word that contains the first source byte
	dep offset = r33, zero, 3, 3	// offset = (src & 7) << 3, the shr count
	;;
	sub aoffset = 64, offset	// "anti-offset", the shl count
	add src_2 = 8, src_1		// next aligned word

	// *** init loop: set loop counter, epilog counter, predicates ***
	mov ar.lc = 7			// 8 rows
	mov ar.ec = LL + SHL + OL + 1	// number of pipeline stages
	mov pr.rot = 1 << 16		// only p16 (first stage) is set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	// src_v1/src_v2 = loaded source words, shd_r = shifted right,
	// shd_l = shifted left, value = joined row
	.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
	.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]


	// Software pipelined loop:
	// Stage 1: Load two aligned 8-byte words from SRC_1, SRC_2 into SRC_V1 and SRC_V2
	// Stage 2: Shift both words into SHD_R and SHD_L
	// Stage 3: Join both parts together with OR
	// Stage 4: Store the joined row to the destination and add stride to the
	//          destination address


.Loop_8x8copy:
	{.mii
		(ld_stage[0])	ld8 src_v1[0] = [src_1], stride
		(sh_stage[0])	shr.u shd_r[0] = src_v1[LL], offset
	}
	{.mii
		(ld_stage[0])	ld8 src_v2[0] = [src_2], stride
		(sh_stage[0])	shl shd_l[0] = src_v2[LL], aoffset
		(or_stage[0])	or value[0] = shd_l[SHL], shd_r[SHL]
	}
	{.mib
		(st_stage[0])	st8 [dst] = value[OL]
		(st_stage[0])	add dst = dst, stride
		br.ctop.sptk.few .Loop_8x8copy
		;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0

	.endp transfer8x8_copy_ia64#
194 : | |||
195 : | |||
196 : | |||
197 : | |||
///////////////////////////////////////////////////////////////////////////////
//
// transfer_8to16copy_ia64
//
// Arguments (read from the input registers):
//   r32 = dst     destination: continuous array of 64 x 16 bit values
//   r33 = src     source address, read with ld8 (treated as aligned)
//   r34 = stride  byte distance between two source rows
//
// To widen the 8 bit unsigned source values to 16 bit, UNPACK with a zero
// operand is used: 8 bytes are loaded from the source, unpacked into two
// words of 4 x 16 bit and stored to the destination. As the destination is
// continuous, both store addresses simply advance by 16 bytes per row.
//
// ar.lc and the predicate registers are saved in r2/r3 and restored before
// returning.
//
///////////////////////////////////////////////////////////////////////////////

	.align 16
	.global transfer_8to16copy_ia64#
	.proc transfer_8to16copy_ia64#


transfer_8to16copy_ia64:
	.prologue

	// *** register renaming ***
	oldLC = r2		// saved ar.lc
	oldPR = r3		// saved predicate registers

	zero = r0		// the constant 0

	dst_1 = r14		// destination address for first 4 x 16 bit values
	dst_2 = r15		// destination address for second 4 x 16 bit values
	src = r16
	stride = r17

	// *** Allocating the register frame (96 registers, all rotating) ***
	// alloc has to be the first instruction of an instruction group, so it
	// is issued before anything else.
	alloc r9 = ar.pfs, 4, 92, 0, 96

	// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
	// Both saves are announced to the unwinder.
	.save ar.lc, oldLC
	mov oldLC = ar.lc
	.save pr, oldPR
	mov oldPR = pr

	.body

	// *** Saving parameters (r32+ are reused as rotating registers) ***
	mov dst_1 = r32		// first 4 x 16 bit values
	add dst_2 = 8, r32	// second 4 x 16 bit values
	mov src = r33
	mov stride = r34

	// *** init loop: set loop counter, epilog counter, predicates ***
	mov ar.lc = 7			// 8 rows
	mov ar.ec = LL + UL + 1		// number of pipeline stages
	mov pr.rot = 1 << 16		// only p16 (first stage) is set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	// src_v = source value, dst_v1/dst_v2 = unpacked destination values
	.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1]
	.rotp ld_stage[LL], upack_stage[UL], st_stage[1]


	// Software pipelined loop:
	// Stage 1: Load 8 bytes from SRC
	// Stage 2: Unpack SRC_V into two words of 4 x 16 bit
	// Stage 3: Store both 8-byte words of 16 bit data


.Loop_8to16copy:
	{.mii
		(ld_stage[0])		ld8 src_v[0] = [src], stride
		(upack_stage[0])	unpack1.l dst_v1[0] = zero, src_v[LL]
		(upack_stage[0])	unpack1.h dst_v2[0] = zero, src_v[LL]
	}
	{.mmb
		(st_stage[0])		st8 [dst_1] = dst_v1[UL], 16
		(st_stage[0])		st8 [dst_2] = dst_v2[UL], 16
		br.ctop.sptk.few .Loop_8to16copy
		;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0
	.endp transfer_8to16copy_ia64#
282 : | ia64p | 1.2 | |
283 : | |||
284 : | |||
285 : | |||
///////////////////////////////////////////////////////////////////////////////
//
// transfer_16to8copy_ia64
//
// Arguments (read from the input registers):
//   r32 = dst     destination address, written with st8
//   r33 = src     source: continuous array of 64 x 16 bit signed values
//   r34 = stride  byte distance between two destination rows
//
// To convert the 16 bit values to 8 bit unsigned data, PACK (pack2.uss) is
// used: two 8-byte words of 4 x 16 bit signed data are loaded, packed
// together and stored as one 8-byte word of 8 x 8 bit unsigned data to the
// destination.
//
// ar.lc and the predicate registers are saved in r2/r3 and restored before
// returning.
//
///////////////////////////////////////////////////////////////////////////////

	.align 16
	.global transfer_16to8copy_ia64#
	.proc transfer_16to8copy_ia64#
transfer_16to8copy_ia64:
	.prologue

	// *** register renaming ***
	// oldLC/oldPR are defined here as well, so that this procedure does not
	// depend on aliases left over from another procedure.
	oldLC = r2		// saved ar.lc
	oldPR = r3		// saved predicate registers

	dst = r14
	src_1 = r15		// address of the first 4 x 16 bit values of a row
	src_2 = r17		// address of the second 4 x 16 bit values of a row
	stride = r16

	// *** Allocating the register frame (96 registers, all rotating) ***
	// alloc has to be the first instruction of an instruction group, so it
	// is issued before anything else.
	alloc r9 = ar.pfs, 4, 92, 0, 96

	// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
	// Both saves are announced to the unwinder.
	.save ar.lc, oldLC
	mov oldLC = ar.lc
	.save pr, oldPR
	mov oldPR = pr

	.body

	// *** Saving parameters (r32+ are reused as rotating registers) ***
	mov dst = r32
	mov src_1 = r33
	add src_2 = 8, r33
	mov stride = r34

	// *** init loop: set loop counter, epilog counter, predicates ***
	mov ar.lc = 7			// 8 rows
	mov ar.ec = LL + PL + 1		// number of pipeline stages
	mov pr.rot = 1 << 16		// only p16 (first stage) is set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	// src_v1/src_v2 = source values, dst_v = destination value
	.rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1]
	.rotp ld_stage[LL], pack_stage[PL], st_stage[1]


	// Software pipelined loop:
	// Stage 1: Load two 8-byte words of 4 x 16 bit signed source data
	// Stage 2: Pack them together into one 8-byte word of 8 x 8 bit unsigned data
	// Stage 3: Store the 8 bytes to the destination address and add stride to
	//          the destination address (to get the next 8 byte line of destination)


.Loop_16to8copy:
	{.mmi
		(ld_stage[0])	ld8 src_v1[0] = [src_1], 16
		(ld_stage[0])	ld8 src_v2[0] = [src_2], 16
		(pack_stage[0])	pack2.uss dst_v[0] = src_v1[LL], src_v2[LL]
	}
	{.mib
		(st_stage[0])	st8 [dst] = dst_v[PL]
		(st_stage[0])	add dst = dst, stride
		br.ctop.sptk.few .Loop_16to8copy
		;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0
	.endp transfer_16to8copy_ia64#
363 : | ia64p | 1.2 | |
364 : | |||
365 : | |||
///////////////////////////////////////////////////////////////////////////////
//
// transfer_16to8add_ia64
//
// Arguments (read from the input registers):
//   r32 = dst     8 bit data, read and written in place, rows stride apart
//   r33 = src     16 bit data, read continuously (16 bytes per row)
//   r34 = stride  byte distance between two dst rows
//
// The 8 bit values of dst are "unpacked" into two 8-byte blocks containing
// 16 bit values. These are "parallel-added" (padd2.sss) to the values of src.
// The result is converted back into 8 bit values using "PACK" (pack2.uss)
// and stored at the address the dst row was loaded from.
// We assume that there is no misalignment.
//
// ar.lc and the predicate registers are saved in r2/r3 and restored before
// returning.
//
///////////////////////////////////////////////////////////////////////////////

	.align 16
	.global transfer_16to8add_ia64#
	.proc transfer_16to8add_ia64#

transfer_16to8add_ia64:
	.prologue

	// *** register renaming ***
	// oldLC/oldPR are defined here as well, so that this procedure does not
	// depend on aliases left over from another procedure.
	oldLC = r2		// saved ar.lc
	oldPR = r3		// saved predicate registers

	dst = r14
	src = r15		// address of the first 4 x 16 bit values of a src row
	stride = r16

	_src = r17		// address of the second 4 x 16 bit values of a src row

	// *** Allocating the register frame (96 registers, all rotating) ***
	// alloc has to be the first instruction of an instruction group, so it
	// is issued before anything else.
	alloc r9 = ar.pfs, 4, 92, 0, 96

	// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
	// Both saves are announced to the unwinder.
	.save ar.lc, oldLC
	mov oldLC = ar.lc
	.save pr, oldPR
	mov oldPR = pr

	.body

	// *** Saving parameters (r32+ are reused as rotating registers) ***
	mov dst = r32
	mov src = r33
	mov stride = r34
	add _src = 8, r33

	// *** init loop: set loop counter, epilog counter, predicates ***
	mov ar.lc = 7				// 8 rows
	mov ar.ec = LL + UL + PAL + PL + 1	// number of pipeline stages
	mov pr.rot = 1 << 16			// only p16 (first stage) is set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	.rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1]
	.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1]


	// Software pipelined loop:
	// s1_p: The values of src and dst are loaded, the dst address is kept
	// s2_p: The dst values are converted to 16 bit values
	// s3_p: The values of src and dst are added
	// s4_p: The results are packed into 8 bit values
	// s5_p: The 8 bit values are stored at the kept dst address


.Loop_16to8add:
	{.mii
		(s1_p[0])	ld8 w_src_1[0] = [src], 16				// load the first half of src row j (i = 0..3)
		(s1_p[0])	mov _dst[0] = dst					// keep the address of dst row j for the store in the last stage
		(s3_p[0])	padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL]	// parallel addition of src and dst
	}
	{.mii
		(s1_p[0])	ld8 w_dst8[0] = [dst], stride				// load dst row j, then advance dst by stride
		(s2_p[0])	unpack1.l w_dst16_1[0] = r0, w_dst8[LL]			// widen dst to 16 bit for i = 0..3
		(s2_p[0])	unpack1.h w_dst16_2[0] = r0, w_dst8[LL]			// widen dst to 16 bit for i = 4..7
	}
	{.mii
		(s1_p[0])	ld8 w_src_2[0] = [_src], 16				// load the second half of src row j (i = 4..7)
		(s3_p[0])	padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL]	// parallel addition of src and dst
		(s4_p[0])	pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL]		// pack the sums into 8 bit values; the saturating pack takes care of the value range
	}
	{.mmb
		(s5_p[0])	st8 [_dst[LL+UL+PAL+PL]] = dst8[PL]			// store the result at the kept dst address
		(s1_p[0])	nop.m 0
		br.ctop.sptk.few .Loop_16to8add
		;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0
	.endp transfer_16to8add_ia64#
456 : | |||
457 : | |||
458 : | |||
///////////////////////////////////////////////////////////////////////////////
//
// transfer_8to16sub_ia64
//
// Arguments (read from the input registers):
//   r32 = dct     destination of the 16 bit differences (16 bytes per row)
//   r33 = cur     8 bit data, read and then overwritten, rows stride apart
//   r34 = ref     8 bit data, may be misaligned, rows stride apart
//   r35 = stride  byte distance between two rows of cur / ref
//
// The 8 bit values of ref and cur are loaded and both are converted to
// 16 bit. The difference cur - ref is stored at the dct addresses, and the
// ref row is written over the cur row.
//
// The data addressed by 'ref' must be assumed to be misaligned in memory, so
// every ref row is assembled from two aligned 8-byte words. cur and dct are
// accessed with plain ld8/st8, i.e. they are treated as aligned.
//
// ar.lc and the predicate registers are saved in r2/r3 and restored before
// returning.
//
///////////////////////////////////////////////////////////////////////////////

	.align 16
	.global transfer_8to16sub_ia64#
	.proc transfer_8to16sub_ia64#


transfer_8to16sub_ia64:
	.prologue

	// *** register renaming ***
	oldLC = r2		// saved ar.lc
	oldPR = r3		// saved predicate registers

	zero = r0		// the constant 0

	// The following registers carry the same names as the variables of the
	// C reference implementation
	dct = r14
	cur = r15
	ref = r34		// only read before the loop starts, so the input register is used directly
	stride = r16

	offset = r17		// right-shift count in bits for the misaligned ref data: (ref & 7) * 8
	aoffset = r18		// counterpart of offset (left-shift count): 64 - offset
	ref_a1 = r19		// address of the first aligned 64 bit block of ref
	ref_a2 = r20		// address of the second aligned 64 bit block of ref

	_dct = r21		// destination address of the second dct block of a row

	// *** Allocating the register frame (96 registers, all rotating) ***
	// alloc has to be the first instruction of an instruction group, so it
	// is issued before anything else.
	alloc r9 = ar.pfs, 4, 92, 0, 96

	// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
	// Both saves are announced to the unwinder.
	.save ar.lc, oldLC
	mov oldLC = ar.lc
	.save pr, oldPR
	mov oldPR = pr

	.body

	// *** Saving parameters (r32+ are reused as rotating registers) ***
	mov dct = r32
	mov cur = r33
	// ref (r34) is unaligned; its aligned addresses are computed below
	mov stride = r35

	and ref_a1 = -8, ref		// address of the first aligned 64 bit block that contains ref data
	dep offset = ref, zero, 3, 3	// offset = (ref & 7) << 3
	;;
	add ref_a2 = 8, ref_a1
	sub aoffset = 64, offset	// counterpart of offset
	add _dct = 8, dct		// the second dct block lies 8 bytes (= 64 bit) above the first

	// *** init loop: set loop counter, epilog counter, predicates ***
	mov ar.lc = 7					// 8 rows
	mov ar.ec = LL + SHL + OL + UL + PSL + 1	// number of pipeline stages
	mov pr.rot = 1 << 16				// only p16 (first stage) is set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	.rotr c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1], dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1]
	.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1]


	// Software pipelined loop:
	// s1_p: The values of ref and cur are loaded, the cur address is kept.
	// s2_p: cur is converted to 16 bit and the misaligned values of ref are
	//       shifted...
	// s3_p: ... and joined together.
	// s4_p: This ref value is converted to 16 bit and also stored at the
	//       kept cur address.
	// s5_p: The ref and cur values are subtracted...
	// s6_p: ...and the result is stored at the dct addresses.


.Loop_8to16sub:
	{.mii
		(s1_p[0])	ld8 ref_v1[0] = [ref_a1], stride		// load the first 64 bit block holding part of the ref row
		(s1_p[0])	mov _cur[0] = cur				// keep the address of the cur row for the store in stage s4_p
		(s2_p[0])	shr.u ref_shdr[0] = ref_v1[LL], offset		// move the part from the first block into place
	}
	{.mii
		(s1_p[0])	ld8 ref_v2[0] = [ref_a2], stride		// load the second 64 bit block
		(s2_p[0])	shl ref_shdl[0] = ref_v2[LL], aoffset		// move the part from the second block into place
		(s3_p[0])	or r[0] = ref_shdr[SHL], ref_shdl[SHL]		// join both parts into r
	}
	{.mii
		(s1_p[0])	ld8 c[0] = [cur], stride			// load row j of cur completely, then advance cur
		(s2_p[0])	unpack1.l c16_1[0] = zero, c[LL]		// widen c to 16 bit for i = 0..3
		(s2_p[0])	unpack1.h c16_2[0] = zero, c[LL]		// widen c to 16 bit for i = 4..7
	}
	{.mii
		(s4_p[0])	st8 [_cur[LL+SHL+OL]] = r[OL]			// overwrite the cur row with the value of r
		// widen the 8 bit values of r to 16 bit
		(s4_p[0])	unpack1.l r16_1[0] = zero, r[OL]		// widen r to 16 bit for i = 0..3
		(s4_p[0])	unpack1.h r16_2[0] = zero, r[OL]		// widen r to 16 bit for i = 4..7
	}
	{.mii
		(s5_p[0])	psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL]	// subtraction for the first half of row j
		(s5_p[0])	psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL]	// subtraction for the second half
	}
	{.mmb
		(s6_p[0])	st8 [dct] = dct_1[PSL], 16			// store the first 64 bit block, advance the address by 16 bytes for the next row
		(s6_p[0])	st8 [_dct] = dct_2[PSL], 16			// store the second 64 bit block, advance the address by 16 bytes for the next row
		br.ctop.sptk.few .Loop_8to16sub
		;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0
	.endp transfer_8to16sub_ia64#
584 : | ia64p | 1.2 | |
585 : | |||
586 : | |||
587 : | |||
588 : | |||
///////////////////////////////////////////////////////////////////////////////
//
// transfer_8to16sub2_ia64
//
// At the time this function was written, it was not yet in use.
//
// Arguments (read from the input registers):
//   r32 = dct     destination of the 16 bit differences (16 bytes per row)
//   r33 = cur     8 bit data, read with ld8, rows stride apart
//   r34 = ref1    8 bit data, may be misaligned, rows stride apart
//   r35 = ref2    8 bit data, may be misaligned, rows stride apart
//   r36 = stride  byte distance between two rows of cur / ref1 / ref2
//
// The values of ref1/2 and cur are loaded; the ref values get the
// misalignment treatment (two aligned loads, shift, OR). The bytewise average
// of ref1 and ref2 is computed with pavg1.raz, which is used to get the same
// results as the C function. The average and cur are then converted to
// 16 bit using unpack, the average is subtracted from cur, and the results
// are stored at the dct addresses.
//
// Naming: the suffixes _l/_r ("left"/"right") follow the register picture.
// The _r values come from unpack1.l and are stored at the start of a dct row;
// the _l values come from unpack1.h and are stored 8 bytes further.
//
// ar.lc and the predicate registers are saved in r2/r3 and restored before
// returning.
//
///////////////////////////////////////////////////////////////////////////////

	.text
	.align 16
	.global transfer_8to16sub2_ia64#
	.proc transfer_8to16sub2_ia64#

transfer_8to16sub2_ia64:
	.prologue

	// *** register renaming ***
	// We've tried to keep the C-Code names as often as possible, at least as
	// part of register-names
	oldLC = r2		// saved ar.lc
	oldPR = r3		// saved predicate registers

	zero = r0

	dct_al = r14		// dct: row start + 8, receives the unpack1.h halves
	dct_ar = r15		// dct: row start, receives the unpack1.l halves
	cur = r16
	ref1_al = r17		// ref1: aligned address of lower part
	ref1_ah = r18		// ref1: aligned address of higher part
	ref2_al = r19		// ref2: aligned address of lower part
	ref2_ah = r20		// ref2: aligned address of higher part
	stride = r21

	offset_1 = r22		// right-shift count in bits for ref1: (ref1 & 7) * 8
	offset_2 = r23		// right-shift count in bits for ref2: (ref2 & 7) * 8
	aoffset_1 = r24		// left-shift count for ref1: 64 - offset_1
	aoffset_2 = r25		// left-shift count for ref2: 64 - offset_2

	// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
	// Both saves are announced to the unwinder.
	.save ar.lc, oldLC
	mov oldLC = ar.lc
	.save pr, oldPR
	mov oldPR = pr


	.body

	// *** Saving Parameters ***
	// *** (as input registers r32+ are needed for register-rotation) ***
	mov dct_ar = r32
	add dct_al = 8, r32
	mov cur = r33

	and ref1_al = -8, r34		// ref1: aligned address of lower part
	and ref2_al = -8, r35		// ref2: aligned address of lower part

	mov stride = r36

	// *** Calculations for Misalignment-Handling ***
	dep offset_1 = r34, zero, 3, 3	// offset_1 = (ref1 & 7) << 3
	dep offset_2 = r35, zero, 3, 3	// offset_2 = (ref2 & 7) << 3
	;;
	add ref1_ah = 8, ref1_al
	add ref2_ah = 8, ref2_al
	sub aoffset_1 = 64, offset_1
	sub aoffset_2 = 64, offset_2
	;;

	// *** Allocating the register frame (96 registers, all rotating) ***
	// The stop above makes alloc the first instruction of its instruction
	// group.
	alloc r9 = ar.pfs, 5, 91, 0, 96

	// *** init loop: set loop counter, epilog counter, predicates ***
	mov ar.lc = 7						// 8 rows
	mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1		// number of pipeline stages
	mov pr.rot = 1 << 16					// only p16 (first stage) is set
	;;

	// *** define register arrays and predicate array for software pipeline ***
	.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1]
	.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1]


	// software pipelined loop:
	// ld_stage:   The values of ref1, ref2, cur are loaded
	// sh_stage:   The misaligned values of ref1/2 are shifted...
	// or_stage:   ...and joined together.
	// pavg_stage: The average of ref1 and ref2 is computed.
	// up_stage:   The result and the cur-values are converted to 16-bit.
	// psub_stage: Those values are subtracted...
	// st_stage:   ...and stored at the dct-addresses.


.Loop_8to16sub2:
	{.mii
		(ld_stage[0])	ld8 c[0] = [cur], stride
		(sh_stage[0])	shr.u ref1_l[0] = ref1_vl[LL], offset_1
		(sh_stage[0])	shl ref1_h[0] = ref1_vh[LL], aoffset_1
	}
	{.mii
		(ld_stage[0])	ld8 ref1_vl[0] = [ref1_al], stride
		(sh_stage[0])	shr.u ref2_l[0] = ref2_vl[LL], offset_2
		(sh_stage[0])	shl ref2_h[0] = ref2_vh[LL], aoffset_2
	}
	{.mii
		(ld_stage[0])	ld8 ref1_vh[0] = [ref1_ah], stride
		(or_stage[0])	or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL]
		(or_stage[0])	or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL]
	}
	{.mii
		(ld_stage[0])	ld8 ref2_vl[0] = [ref2_al], stride
		(pavg_stage[0])	pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL]
		(up_stage[0])	unpack1.l r16_r[0] = zero, r[PAVGL]
	}
	{.mii
		(ld_stage[0])	ld8 ref2_vh[0] = [ref2_ah], stride
		(up_stage[0])	unpack1.h r16_l[0] = zero, r[PAVGL]
		(up_stage[0])	unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL]
	}
	{.mii
		(st_stage[0])	st8 [dct_ar] = dct16_r[PSL], 16
		(up_stage[0])	unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL]
		(psub_stage[0])	psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL]
	}
	{.mib
		(st_stage[0])	st8 [dct_al] = dct16_l[PSL], 16
		(psub_stage[0])	psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL]
		br.ctop.sptk.few .Loop_8to16sub2
		;;
	}

	// *** Restore old LC and PRs ***
	mov ar.lc = oldLC
	mov pr = oldPR, -1

	br.ret.sptk.many b0
	.endp transfer_8to16sub2_ia64#
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |