Parent Directory
|
Revision Log
Revision 1.4 - (view) (download)
1 : | ia64p | 1.3 | /////////////////////////////////////////////////////////////////////////////// |
2 : | // | ||
3 : | // mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, | ||
4 : | // University of Karlsruhe, Germany, 03.06.2002, during the laboratory | ||
5 : | // "IA-64 Video Codec Assembler Praktikum" at IPD Goos. | ||
6 : | // | ||
7 : | // | ||
8 : | ///// legal header taken from original C-file /////////////////////////////////////// | ||
9 : | // | ||
10 : | edgomez | 1.4 | // * XVID MPEG-4 VIDEO CODEC |
11 : | // * - 8bit<->16bit transfer - | ||
12 : | // * | ||
13 : | // * This file is part of XviD, a free MPEG-4 video encoder/decoder | ||
14 : | // * | ||
15 : | // * XviD is free software; you can redistribute it and/or modify it | ||
16 : | // * under the terms of the GNU General Public License as published by | ||
17 : | // * the Free Software Foundation; either version 2 of the License, or | ||
18 : | // * (at your option) any later version. | ||
19 : | // * | ||
20 : | // * This program is distributed in the hope that it will be useful, | ||
21 : | // * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
22 : | // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
23 : | // * GNU General Public License for more details. | ||
24 : | // * | ||
25 : | // * You should have received a copy of the GNU General Public License | ||
26 : | // * along with this program; if not, write to the Free Software | ||
27 : | // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
28 : | // * | ||
29 : | // * Under section 8 of the GNU General Public License, the copyright | ||
30 : | // * holders of XVID explicitly forbid distribution in the following | ||
31 : | // * countries: | ||
32 : | // * | ||
33 : | // * - Japan | ||
34 : | // * - United States of America | ||
35 : | // * | ||
36 : | // * Linking XviD statically or dynamically with other modules is making a | ||
37 : | // * combined work based on XviD. Thus, the terms and conditions of the | ||
38 : | // * GNU General Public License cover the whole combination. | ||
39 : | // * | ||
40 : | // * As a special exception, the copyright holders of XviD give you | ||
41 : | // * permission to link XviD with independent modules that communicate with | ||
42 : | // * XviD solely through the VFW1.1 and DShow interfaces, regardless of the | ||
43 : | // * license terms of these independent modules, and to copy and distribute | ||
44 : | // * the resulting combined work under terms of your choice, provided that | ||
45 : | // * every copy of the combined work is accompanied by a complete copy of | ||
46 : | // * the source code of XviD (the version of XviD used to produce the | ||
47 : | // * combined work), being distributed under the terms of the GNU General | ||
48 : | // * Public License plus this exception. An independent module is a module | ||
49 : | // * which is not derived from or based on XviD. | ||
50 : | // * | ||
51 : | // * Note that people who make modified versions of XviD are not obligated | ||
52 : | // * to grant this special exception for their modified versions; it is | ||
53 : | // * their choice whether to do so. The GNU General Public License gives | ||
54 : | // * permission to release a modified version without this exception; this | ||
55 : | // * exception also makes it possible to release a modified version which | ||
56 : | // * carries forward this exception. | ||
57 : | // * | ||
58 : | // * $Id$ | ||
59 : | ia64p | 1.3 | // |
60 : | ///// History ///////////////////////////////////////////////////////////////// | ||
61 : | // | ||
62 : | // - 16.07.2002: several minor changes for ecc-conformity | ||
63 : | // - 03.06.2002: initial version | ||
64 : | // | ||
65 : | /////////////////////////////////////////////////////////////////////////////// | ||
66 : | // | ||
67 : | // Annotations: | ||
68 : | // =========== | ||
69 : | // | ||
70 : | // - All functions work on 8x8-matrices. While the C-code-functions treat each | ||
71 : | // element separately, the functions in this assembler-code treat a whole line | ||
72 : | // simultaneously. So one loop is saved. | ||
73 : | // The remaining loop is realized by using software pipelining with rotating | ||
74 : | // registers. | ||
75 : | // - Register renaming is used for better readability | ||
76 : | // - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both | ||
77 : | // parts are shifted and joined together with an "OR"-Instruction. | ||
78 : | // - First parameter is stored in GR 32, next in GR 33, and so on. They must be | ||
79 : | // saved, as these GRs are used for register-rotation. | ||
80 : | // - Some of the original, German comments used during development are left | ||
81 : | // in the code. They shouldn't bother anyone. | ||
82 : | // | ||
83 : | // Anmerkungen: | ||
84 : | // ============ | ||
85 : | // | ||
86 : | // - Alle Funtionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code | ||
87 : | // jedes Element einzeln bearbeiten, bearbeiten die Funtionen dieses Assembler- | ||
88 : | // Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. | ||
89 : | // Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit | ||
90 : | // rotierenden Registern realisiert. | ||
91 : | // - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. | ||
92 : | // - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke | ||
93 : | // geladen, beide Teile mit "shift"-Operationen zurechterückt und mit einem | ||
94 : | // logischen Oder zusammenkopiert. | ||
95 : | // - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- | ||
96 : | // sichert werden, da die Register für die register-Rotation benötigt werden. | ||
97 : | // - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase | ||
98 : | // sind im Code verblieben. Sie sollten niemanden stören. | ||
99 : | // | ||
100 : | /////////////////////////////////////////////////////////////////////////////// | ||
101 : | ia64p | 1.2 | |
102 : | |||
103 : | // *** Latencies, counted in pipeline stages; they size the rotating register/predicate arrays and the epilogue counts of the loops below *** | ||
104 : | |||
105 : | LL = 3 // Load | ||
106 : | SL = 3 // Store (not referenced by the code shown here) | ||
107 : | PL = 1 // Pack | ||
108 : | SHL = 1 // Shift | ||
109 : | OL = 1 // Or | ||
110 : | UL = 1 // Unpack | ||
111 : | PAL = 1 // Parallel Add | ||
112 : | PSL = 1 // Parallel Subtract | ||
113 : | PAVGL = 1 // Parallel Average | ||
114 : | |||
115 : | .text | ||
116 : | |||
117 : | |||
118 : | ia64p | 1.3 | /////////////////////////////////////////////////////////////////////////////// |
119 : | // | ||
120 : | // transfer8x8_copy_ia64 | ||
121 : | // | ||
122 : | // SRC may be misaligned. To align it, two 8-byte words are loaded, shifted | ||
123 : | // and joined, and the aligned result is stored at the destination address. | ||
124 : | // | ||
125 : | /////////////////////////////////////////////////////////////////////////////// | ||
126 : | ia64p | 1.2 | |
127 : | .align 16 | ||
128 : | .global transfer8x8_copy_ia64# | ||
129 : | .proc transfer8x8_copy_ia64# | ||
130 : | |||
131 : | transfer8x8_copy_ia64: | ||
132 : | .prologue | ||
133 : | |||
134 : | // *** register renaming *** | ||
135 : | zero = r0 | ||
136 : | |||
137 : | oldLC = r2 | ||
138 : | oldPR = r3 | ||
139 : | |||
140 : | src_1 = r14 // address of the aligned 8-byte block containing the first source byte (src & -8) | ||
141 : | src_2 = r15 // address of the next aligned 8-byte block (src_1 + 8) | ||
142 : | dst = r16 // destination address | ||
143 : | stride = r17 | ||
144 : | |||
145 : | offset = r18 // shift-right count in bits: (src & 7) * 8 | ||
146 : | aoffset = r19 // shift-left count in bits: 64 - offset | ||
147 : | |||
148 : | // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** | ||
149 : | .save ar.lc, oldLC | ||
150 : | mov oldLC = ar.lc | ||
151 : | mov oldPR = pr | ||
152 : | ia64p | 1.3 |
153 : | .body | ||
154 : | |||
155 : | ia64p | 1.2 | // *** Allocating new stackframe: 3 inputs, 29 locals, 0 outputs, 32 rotating registers *** |
156 : | alloc r9 = ar.pfs, 3, 29, 0, 32 | ||
157 : | |||
158 : | // *** Saving Parameters: r32 = dst, r33 = src (may be misaligned), r34 = stride *** | ||
159 : | mov dst = r32 | ||
160 : | mov stride = r34 | ||
161 : | |||
162 : | // *** Misalignment-Treatment *** | ||
163 : | and src_1 = -8, r33 // Computing address of first aligned block containing src-values | ||
164 : | dep offset = r33, zero, 3, 3 // offset = (src & 7) << 3: misalignment of src in bits, used as the shr.u count | ||
165 : | ;; | ||
166 : | sub aoffset = 64, offset // Computing counterpart of offset ("anti-offset"), used for shl | ||
167 : | add src_2 = 8, src_1 // Computing address of second aligned block containing src-values | ||
168 : | |||
169 : | // *** init loop: LC = 7, EC = number of pipeline stages, only the first stage predicate (p16) set *** | ||
170 : | mov ar.lc = 7 | ||
171 : | mov ar.ec = LL + SHL + OL + 1 | ||
172 : | mov pr.rot = 1 << 16 | ||
173 : | ;; | ||
174 : | |||
175 : | // *** define register arrays and predicate array for software pipeline *** | ||
176 : | // src_v1 = source value 1, shd_r = shifted right, shd_l = shifted left | ||
177 : | .rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] | ||
178 : | .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] | ||
179 : | ia64p | 1.3 |
180 : | |||
181 : | // Software pipelined loop: | ||
182 : | // Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2 | ||
183 : | // Stage 2: Shift both values of source to SHD_R and SHD_L | ||
184 : | // Stage 3: Join both parts together with OR | ||
185 : | // Stage 4: Store aligned data to destination and add stride to destination address | ||
186 : | |||
187 : | |||
188 : | ia64p | 1.2 | .Loop_8x8copy: |
189 : | {.mii | ||
190 : | (ld_stage[0]) ld8 src_v1[0] = [src_1], stride | ||
191 : | (sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset | ||
192 : | } | ||
193 : | {.mii | ||
194 : | (ld_stage[0]) ld8 src_v2[0] = [src_2], stride | ||
195 : | (sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset | ||
196 : | (or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL] | ||
197 : | } | ||
198 : | {.mib | ||
199 : | (st_stage[0]) st8 [dst] = value[OL] | ||
200 : | (st_stage[0]) add dst = dst, stride | ||
201 : | br.ctop.sptk.few .Loop_8x8copy | ||
202 : | ;; | ||
203 : | } | ||
204 : | |||
205 : | // *** Restore old LC and PRs *** | ||
206 : | mov ar.lc = oldLC | ||
207 : | mov pr = oldPR, -1 | ||
208 : | |||
209 : | br.ret.sptk.many b0 | ||
210 : | |||
211 : | .endp transfer8x8_copy_ia64# | ||
212 : | |||
213 : | |||
214 : | |||
215 : | |||
216 : | ia64p | 1.3 | /////////////////////////////////////////////////////////////////////////////// |
217 : | // | ||
218 : | // transfer_8to16copy_ia64 | ||
219 : | // | ||
220 : | // SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, | ||
221 : | // UNPACK is used. So 8 bytes are loaded from source, unpacked to two | ||
222 : | // 4 x 16 bit values and stored to the destination. Destination is a continuous | ||
223 : | // array of 64 x 16 bit signed data. To store the next line, only 16 must be | ||
224 : | // added to the destination address. | ||
225 : | /////////////////////////////////////////////////////////////////////////////// | ||
226 : | ia64p | 1.2 | |
227 : | ia64p | 1.1 | .align 16 |
228 : | .global transfer_8to16copy_ia64# | ||
229 : | .proc transfer_8to16copy_ia64# | ||
230 : | ia64p | 1.2 |
231 : | |||
232 : | ia64p | 1.1 | transfer_8to16copy_ia64: |
233 : | .prologue | ||
234 : | ia64p | 1.2 |
235 : | // *** register renaming *** | ||
236 : | oldLC = r2 | ||
237 : | oldPR = r3 | ||
238 : | |||
239 : | zero = r0 // the constant 0; used as the zero operand of the unpack instructions | ||
240 : | |||
241 : | dst_1 = r14 // destination address for first 4 x 16 bit values | ||
242 : | dst_2 = r15 // destination address for second 4 x 16 bit values | ||
243 : | src = r16 | ||
244 : | stride = r17 | ||
245 : | |||
246 : | // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** | ||
247 : | .save ar.lc, oldLC | ||
248 : | mov oldLC = ar.lc | ||
249 : | mov oldPR = pr | ||
250 : | |||
251 : | ia64p | 1.3 |
252 : | .body | ||
253 : | |||
254 : | ia64p | 1.2 | // *** Allocating new stackframe: 4 inputs, 92 locals, 0 outputs, 96 rotating registers *** |
255 : | alloc r9 = ar.pfs, 4, 92, 0, 96 | ||
256 : | |||
257 : | // *** Saving Parameters: r32 = dst, r33 = src, r34 = stride *** | ||
258 : | mov dst_1 = r32 // first 4 x 16 bit values | ||
259 : | add dst_2 = 8, r32 // second 4 x 16 bit values | ||
260 : | mov src = r33 | ||
261 : | mov stride = r34 | ||
262 : | |||
263 : | // *** init loop: LC = 7, EC = number of pipeline stages, only the first stage predicate (p16) set *** | ||
264 : | mov ar.lc = 7 | ||
265 : | mov ar.ec = LL + UL + 1 | ||
266 : | mov pr.rot = 1 << 16 | ||
267 : | ;; | ||
268 : | |||
269 : | // *** define register arrays and predicate array for software pipeline *** | ||
270 : | // src_v = source value, dst_v1 = destination value 1 | ||
271 : | .rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] | ||
272 : | .rotp ld_stage[LL], upack_stage[UL], st_stage[1] | ||
273 : | ia64p | 1.3 |
274 : | |||
275 : | // Software pipelined loop: | ||
276 : | // Stage 1: Load 8 bytes from SRC and advance SRC by stride | ||
277 : | // Stage 2: Unpack SRC_V into two words of 4 x 16 bit data (zero bytes interleaved) | ||
278 : | // Stage 3: Store both 8-byte words of 16 bit data; each destination pointer advances by 16 | ||
279 : | |||
280 : | ia64p | 1.2 |
281 : | .Loop_8to16copy: | ||
282 : | {.mii | ||
283 : | (ld_stage[0]) ld8 src_v[0] = [src], stride | ||
284 : | (upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL] | ||
285 : | (upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL] | ||
286 : | } | ||
287 : | {.mmb | ||
288 : | (st_stage[0]) st8 [dst_1] = dst_v1[UL], 16 | ||
289 : | (st_stage[0]) st8 [dst_2] = dst_v2[UL], 16 | ||
290 : | br.ctop.sptk.few .Loop_8to16copy | ||
291 : | ;; | ||
292 : | } | ||
293 : | |||
294 : | // *** Restore old LC and PRs *** | ||
295 : | mov ar.lc = oldLC | ||
296 : | mov pr = oldPR, -1 | ||
297 : | |||
298 : | ia64p | 1.1 | br.ret.sptk.many b0 |
299 : | .endp transfer_8to16copy_ia64# | ||
300 : | ia64p | 1.2 | |
301 : | |||
302 : | |||
303 : | |||
304 : | ia64p | 1.3 | /////////////////////////////////////////////////////////////////////////////// |
305 : | // | ||
306 : | // transfer_16to8copy_ia64 | ||
307 : | // | ||
308 : | // src is a 64 x 16 bit signed continuous array. To convert the 16 bit | ||
309 : | // values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of | ||
310 : | // 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word | ||
311 : | // of 8 x 8 unsigned data to the destination. | ||
312 : | /////////////////////////////////////////////////////////////////////////////// | ||
313 : | ia64p | 1.2 | |
314 : | ia64p | 1.1 | .align 16 |
315 : | .global transfer_16to8copy_ia64# | ||
316 : | .proc transfer_16to8copy_ia64# | ||
317 : | transfer_16to8copy_ia64: | ||
318 : | .prologue | ||
319 : | ia64p | 1.2 |
320 : | // *** register renaming; NOTE(review): oldLC and oldPR are not assigned in this procedure, it relies on the oldLC = r2 / oldPR = r3 assignments made by the preceding procedures *** | ||
321 : | dst = r14 | ||
322 : | src_1 = r15 | ||
323 : | src_2 = r17 | ||
324 : | stride = r16 | ||
325 : | |||
326 : | // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** | ||
327 : | .save ar.lc, oldLC | ||
328 : | mov oldLC = ar.lc | ||
329 : | mov oldPR = pr | ||
330 : | |||
331 : | ia64p | 1.3 |
332 : | .body | ||
333 : | |||
334 : | ia64p | 1.2 | // *** Allocating new stackframe: 4 inputs, 92 locals, 0 outputs, 96 rotating registers *** |
335 : | alloc r9 = ar.pfs, 4, 92, 0, 96 | ||
336 : | |||
337 : | // *** Saving Parameters: r32 = dst, r33 = src (src_2 = src + 8), r34 = stride *** | ||
338 : | mov dst = r32 | ||
339 : | mov src_1 = r33 | ||
340 : | add src_2 = 8, r33 | ||
341 : | mov stride = r34 | ||
342 : | |||
343 : | // *** init loop: LC = 7, EC = number of pipeline stages, only the first stage predicate (p16) set *** | ||
344 : | mov ar.lc = 7 | ||
345 : | mov ar.ec = LL + PL + 1 | ||
346 : | mov pr.rot = 1 << 16 | ||
347 : | ;; | ||
348 : | |||
349 : | // *** define register arrays and predicate array for software pipeline *** | ||
350 : | // src_v1 / src_v2 = first / second loaded source word, dst_v = packed destination value | ||
351 : | .rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1] | ||
352 : | .rotp ld_stage[LL], pack_stage[PL], st_stage[1] | ||
353 : | |||
354 : | |||
355 : | ia64p | 1.3 | // Software pipelined loop: |
356 : | // Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data | ||
357 : | // Stage 2: Pack them together into one 8-byte word of 8 x 8 bit unsigned data | ||
358 : | // Stage 3: Store the 8 bytes to the destination address and add stride to | ||
359 : | // destination address (to get the next 8 byte line of destination) | ||
360 : | |||
361 : | |||
362 : | ia64p | 1.2 | .Loop_16to8copy: |
363 : | {.mmi | ||
364 : | (ld_stage[0]) ld8 src_v1[0] = [src_1], 16 | ||
365 : | (ld_stage[0]) ld8 src_v2[0] = [src_2], 16 | ||
366 : | (pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL] | ||
367 : | } | ||
368 : | {.mib | ||
369 : | (st_stage[0]) st8 [dst] = dst_v[PL] | ||
370 : | (st_stage[0]) add dst = dst, stride | ||
371 : | br.ctop.sptk.few .Loop_16to8copy | ||
372 : | ;; | ||
373 : | } | ||
374 : | |||
375 : | // *** Restore old LC and PRs *** | ||
376 : | mov ar.lc = oldLC | ||
377 : | mov pr = oldPR, -1 | ||
378 : | |||
379 : | ia64p | 1.1 | br.ret.sptk.many b0 |
380 : | .endp transfer_16to8copy_ia64# | ||
381 : | ia64p | 1.2 | |
382 : | |||
383 : | |||
384 : | ia64p | 1.3 | /////////////////////////////////////////////////////////////////////////////// |
385 : | // | ||
386 : | // transfer_16to8add_ia64 | ||
387 : | // | ||
388 : | // The 8-bit values of dst are "unpacked" into two 8-byte blocks containing | ||
389 : | // 16-bit values. These are "parallel-added" to the values of src. The result is | ||
390 : | // converted into 8-bit values using "PACK" and stored at the address of dst. | ||
391 : | // We assume that there is no misalignment. | ||
392 : | // | ||
393 : | /////////////////////////////////////////////////////////////////////////////// | ||
394 : | ia64p | 1.2 | |
395 : | .align 16 | ||
396 : | .global transfer_16to8add_ia64# | ||
397 : | .proc transfer_16to8add_ia64# | ||
398 : | |||
399 : | transfer_16to8add_ia64: | ||
400 : | .prologue | ||
401 : | |||
402 : | // *** register renaming; NOTE(review): oldLC and oldPR are not assigned in this procedure, it relies on the oldLC = r2 / oldPR = r3 assignments made by the preceding procedures *** | ||
403 : | dst = r14 | ||
404 : | src = r15 | ||
405 : | stride = r16 | ||
406 : | |||
407 : | _src = r17 | ||
408 : | |||
409 : | // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** | ||
410 : | .save ar.lc, r2 | ||
411 : | mov oldLC = ar.lc | ||
412 : | mov oldPR = pr | ||
413 : | |||
414 : | ia64p | 1.3 |
415 : | .body | ||
416 : | |||
417 : | ia64p | 1.2 | // *** Allocating new stackframe: 4 inputs, 92 locals, 0 outputs, 96 rotating registers *** |
418 : | alloc r9 = ar.pfs, 4, 92, 0, 96 | ||
419 : | |||
420 : | // *** Saving Parameters: r32 = dst, r33 = src (_src = src + 8), r34 = stride *** | ||
421 : | mov dst = r32 | ||
422 : | mov src = r33 | ||
423 : | mov stride = r34 | ||
424 : | add _src = 8, r33 | ||
425 : | |||
426 : | // *** init loop: LC = 7, EC = number of pipeline stages, only the first stage predicate (p16) set *** | ||
427 : | mov ar.lc = 7 | ||
428 : | mov ar.ec = LL + UL + PAL + PL + 1 | ||
429 : | mov pr.rot = 1 << 16 | ||
430 : | ;; | ||
431 : | |||
432 : | // *** define register arrays and predicate array for software pipeline *** | ||
433 : | .rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1] | ||
434 : | .rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] | ||
435 : | |||
436 : | |||
437 : | ia64p | 1.3 | // Software pipelined loop: |
438 : | // s1_p: The values of src and dst are loaded; the dst row address is kept in _dst | ||
439 : | // s2_p: The dst-values are converted to 16-bit-values | ||
440 : | // s3_p: The values of src and dst are added | ||
441 : | // s4_p: The results are packed into 8-bit-values | ||
442 : | // s5_p: The 8-bit-values are stored at the saved dst-addresses | ||
443 : | |||
444 : | ia64p | 1.2 |
445 : | .Loop_16to8add: | ||
446 : | {.mii | ||
447 : | (s1_p[0]) ld8 w_src_1[0] = [src], 16 // loads the 1st half of row j of src (i = 0..3) | ||
448 : | (s1_p[0]) mov _dst[0] = dst // keeps the address of row j of dst for the store in stage s5_p (dst itself is advanced by the load below) | ||
449 : | (s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL] // parallel addition of src and dst | ||
450 : | } | ||
451 : | {.mii | ||
452 : | (s1_p[0]) ld8 w_dst8[0] = [dst], stride // loads row j of dst and advances dst by stride | ||
453 : | (s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]; // dst is converted to 16 bit for i = 0..3 | ||
454 : | (s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]; // dst is converted to 16 bit for i = 4..7 | ||
455 : | } | ||
456 : | {.mii | ||
457 : | (s1_p[0]) ld8 w_src_2[0] = [_src], 16 // loads the 2nd half of row j of src (i = 4..7) | ||
458 : | (s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL] // parallel addition of src and dst | ||
459 : | (s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL] // converts the sums (pixel) into 8-bit values; the range check (saturation) is done by the instruction itself | ||
460 : | } | ||
461 : | {.mmb | ||
462 : | (s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL] // stores the result row at the saved dst address | ||
463 : | (s1_p[0]) nop.m 0 | ||
464 : | br.ctop.sptk.few .Loop_16to8add | ||
465 : | ;; | ||
466 : | } | ||
467 : | |||
468 : | // *** Restore old LC and PRs *** | ||
469 : | mov ar.lc = oldLC | ||
470 : | mov pr = oldPR, -1 | ||
471 : | |||
472 : | br.ret.sptk.many b0 | ||
473 : | .endp transfer_16to8add_ia64# | ||
474 : | |||
475 : | |||
476 : | |||
477 : | ia64p | 1.3 | /////////////////////////////////////////////////////////////////////////////// |
478 : | // | ||
479 : | // transfer_8to16sub_ia64 | ||
480 : | // | ||
481 : | // The 8-bit-values of ref and cur are loaded and both are converted to 16-bit. | ||
482 : | // The difference of cur and ref is stored at the dct-addresses and the ref row | ||
483 : | // is copied into the cur-array. | ||
484 : | // | ||
485 : | // You must assume that the data addressed by 'ref' are misaligned in memory. | ||
486 : | // But you can assume that the other data are aligned (at least I hope so). | ||
487 : | // | ||
488 : | /////////////////////////////////////////////////////////////////////////////// | ||
489 : | ia64p | 1.2 | |
490 : | ia64p | 1.1 | .align 16 |
491 : | .global transfer_8to16sub_ia64# | ||
492 : | .proc transfer_8to16sub_ia64# | ||
493 : | ia64p | 1.2 |
494 : | |||
495 : | ia64p | 1.1 | transfer_8to16sub_ia64: |
496 : | .prologue | ||
497 : | ia64p | 1.2 |
498 : | // *** register renaming *** | ||
499 : | oldLC = r2 | ||
500 : | oldPR = r3 | ||
501 : | |||
502 : | zero = r0 // the constant 0; used as the zero operand of the unpack instructions | ||
503 : | |||
504 : | // The following registers get the same names as the variables in the C template | ||
505 : | dct = r14 | ||
506 : | cur = r15 | ||
507 : | ref = r34 // is only read before the loop starts, so the parameter register itself stays in this list and is not copied | ||
508 : | stride = r16 | ||
509 : | |||
510 : | offset = r17 // misalignment of ref in bits, shift-right count used to move the data into place | ||
511 : | aoffset = r18 // counterpart of offset (64 - offset), shift-left count | ||
512 : | ref_a1 = r19 // address of the first aligned 64-bit block of ref | ||
513 : | ref_a2 = r20 // address of the second aligned 64-bit block of ref | ||
514 : | |||
515 : | _dct = r21 // destination address of the 2nd dct block (dct + 8) | ||
516 : | |||
517 : | // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** | ||
518 : | .save ar.lc, r2 | ||
519 : | mov oldLC = ar.lc | ||
520 : | mov oldPR = pr | ||
521 : | |||
522 : | ia64p | 1.3 |
523 : | .body | ||
524 : | |||
525 : | ia64p | 1.2 | // *** Allocating new stackframe: 4 inputs, 92 locals, 0 outputs, 96 rotating registers *** |
526 : | alloc r9 = ar.pfs, 4, 92, 0, 96 | ||
527 : | |||
528 : | // *** Saving Parameters: r32 = dct, r33 = cur, r34 = ref (may be misaligned), r35 = stride *** | ||
529 : | mov dct = r32 | ||
530 : | mov cur = r33 | ||
531 : | // mov ref = r34: ref is unaligned, get aligned ref below... | ||
532 : | mov stride = r35 | ||
533 : | |||
534 : | and ref_a1 = -8, ref // address of the first aligned 64-bit block that contains ref (ref rounded down to a multiple of 8) | ||
535 : | dep offset = ref, zero, 3, 3 // offset = (ref & 7) << 3: misalignment of ref in bits | ||
536 : | ;; | ||
537 : | add ref_a2 = 8, ref_a1 // address of the following aligned 64-bit block | ||
538 : | sub aoffset = 64, offset // counterpart of offset is computed | ||
539 : | add _dct = 8, dct // address of the 2nd dct block: 8 bytes (= 64 bit) above the 1st block | ||
540 : | |||
541 : | // *** init loop: LC = 7, EC = number of pipeline stages, only the first stage predicate (p16) set *** | ||
542 : | mov ar.lc = 7 | ||
543 : | mov ar.ec = LL + SHL + OL + UL + PSL + 1 | ||
544 : | mov pr.rot = 1 << 16 | ||
545 : | ;; | ||
546 : | |||
547 : | // *** define register arrays and predicate array for software pipeline *** | ||
548 : | .rotr c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1], dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1] | ||
549 : | .rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] | ||
550 : | |||
551 : | |||
552 : | ia64p | 1.3 | // Software pipelined loop: |
553 : | // s1_p: The values of ref and cur are loaded, a copy of the cur address is kept. | ||
554 : | // s2_p: cur is converted to 16-bit and the misaligned values of ref are | ||
555 : | // shifted... | ||
556 : | // s3_p: ... and joined together with OR. | ||
557 : | // s4_p: This ref-value is converted to 16-bit. The aligned ref-value is stored | ||
558 : | // at the saved cur-address. | ||
559 : | // s5_p: the ref-values are subtracted from the cur-values... | ||
560 : | // s6_p: ...and the result is stored at the dct-addresses. | ||
561 : | |||
562 : | ia64p | 1.2 |
563 : | .Loop_8to16sub: | ||
564 : | {.mii | ||
565 : | (s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride // loads the 1st 64-bit block that contains part of the ref data | ||
566 : | (s1_p[0]) mov _cur[0] = cur // the cur row address is saved for the store in stage s4_p | ||
567 : | (s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset // the part taken from the 1st block is shifted into place | ||
568 : | } | ||
569 : | {.mii | ||
570 : | (s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride // loads the 2nd 64-bit block | ||
571 : | (s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset // the part taken from the 2nd block is shifted into place | ||
572 : | (s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL] // the shifted parts are joined in r | ||
573 : | } | ||
574 : | {.mii | ||
575 : | (s1_p[0]) ld8 c[0] = [cur], stride // loads the complete row j of cur and advances cur by stride | ||
576 : | (s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]; // c is converted to 16 bit for i = 0..3 | ||
577 : | (s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]; // c is converted to 16 bit for i = 4..7 | ||
578 : | } | ||
579 : | {.mii | ||
580 : | (s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL] // the cur row is overwritten with the aligned ref row r | ||
581 : | // convert the 8-bit r values to 16-bit values (c was already converted in stage s2_p) | ||
582 : | (s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]; // r is converted to 16 bit for i = 0..3 | ||
583 : | (s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]; // r is converted to 16 bit for i = 4..7 | ||
584 : | } | ||
585 : | {.mii | ||
586 : | (s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL] // subtraction for the 1st half of row j | ||
587 : | (s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL] // subtraction for the 2nd half | ||
588 : | } | ||
589 : | {.mmb | ||
590 : | (s6_p[0]) st8 [dct] = dct_1[PSL], 16 // stores the 1st 64-bit block, then advances the address by 16 bytes for the next row | ||
591 : | (s6_p[0]) st8 [_dct] = dct_2[PSL], 16 // stores the 2nd 64-bit block, then advances the address by 16 bytes for the next row | ||
592 : | br.ctop.sptk.few .Loop_8to16sub // loop back; label uses the same local .Loop_ prefix as the other loops in this file | ||
593 : | ;; | ||
594 : | } | ||
595 : | |||
596 : | // *** Restore old LC and PRs *** | ||
597 : | mov ar.lc = oldLC | ||
598 : | mov pr = oldPR, -1 | ||
599 : | |||
600 : | ia64p | 1.1 | br.ret.sptk.many b0 |
601 : | .endp transfer_8to16sub_ia64# | ||
602 : | ia64p | 1.2 | |
603 : | |||
604 : | |||
605 : | |||
606 : | |||
607 : | ia64p | 1.3 | /////////////////////////////////////////////////////////////////////////////// |
608 : | // | ||
609 : | // transfer_8to16sub2_ia64 | ||
610 : | // | ||
611 : | // At the time this function was written, it was not yet in use. | ||
612 : | // We assume that the values of ref1/2 are misaligned. | ||
613 : | // | ||
614 : | // The values of ref1/2 and cur are loaded; the ref-values need misalignment- | ||
615 : | // treatment. The average of ref1 and ref2 is computed with pavg1 on the 8-bit | ||
616 : | // values. The average and cur are then converted to 16-bit using unpack, the | ||
617 : | // average is subtracted from cur and the results are stored at the dct-addresses. | ||
618 : | // pavg1.raz is used to get the same results as the C-code-function. | ||
619 : | // | ||
620 : | /////////////////////////////////////////////////////////////////////////////// | ||
621 : | ia64p | 1.2 | |
622 : | .text | ||
623 : | ia64p | 1.1 | .align 16 |
624 : | .global transfer_8to16sub2_ia64# | ||
625 : | .proc transfer_8to16sub2_ia64# | ||
626 : | ia64p | 1.2 |
627 : | ia64p | 1.1 | transfer_8to16sub2_ia64: |
628 : | .prologue | ||
629 : | ia64p | 1.2 |
630 : | // *** register renaming *** | ||
631 : | // We've tried to keep the C-Code names as often as possible, at least as | ||
632 : | // part of register-names | ||
633 : | oldLC = r2 | ||
634 : | oldPR = r3 | ||
635 : | |||
636 : | zero = r0 | ||
637 : | |||
638 : | dct_al = r14 // dct: address of the block that receives the unpack1.h halves (set to dct + 8 below) | ||
639 : | dct_ar = r15 // dct: address of the block that receives the unpack1.l halves (set to dct below) | ||
640 : | cur = r16 | ||
641 : | ref1_al = r17 // ref1: aligned address of lower part | ||
642 : | ref1_ah = r18 // ref1: aligned address of higher part | ||
643 : | ref2_al = r19 // ref2: aligned address of lower part | ||
644 : | ref2_ah = r20 // ref2: aligned address of higher part | ||
645 : | stride = r21 | ||
646 : | |||
647 : | offset_1 = r22 | ||
648 : | offset_2 = r23 | ||
649 : | aoffset_1 = r24 | ||
650 : | aoffset_2 = r25 | ||
651 : | |||
652 : | // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** | ||
653 : | ia64p | 1.1 | .save ar.lc, r2 |
654 : | ia64p | 1.2 | mov oldLC = ar.lc |
655 : | mov oldPR = pr | ||
656 : | |||
657 : | ia64p | 1.3 |
658 : | .body | ||
659 : | |||
660 : | ia64p | 1.2 | // *** Saving Parameters: r32 = dct, r33 = cur, r34 = ref1, r35 = ref2, r36 = stride *** |
661 : | // *** (as input registers r32+ are needed for register-rotation; NOTE(review): unlike the other procedures they are read here before the alloc below) *** | ||
662 : | mov dct_ar = r32 | ||
663 : | add dct_al = 8, r32 | ||
664 : | mov cur = r33 | ||
665 : | |||
666 : | and ref1_al = -8, r34 | ||
667 : | and ref2_al = -8, r35 // ref2 aligned address of lower part | ||
668 : | |||
669 : | mov stride = r36 | ||
670 : | |||
671 : | // *** Calculations for Misalignment-Handling: offset_n = (refn & 7) << 3 in bits, aoffset_n = 64 - offset_n *** | ||
672 : | dep offset_1 = r34, zero, 3, 3 | ||
673 : | dep offset_2 = r35, zero, 3, 3 | ||
674 : | ;; | ||
675 : | add ref1_ah = 8, ref1_al | ||
676 : | add ref2_ah = 8, ref2_al | ||
677 : | sub aoffset_1 = 64, offset_1 | ||
678 : | sub aoffset_2 = 64, offset_2 | ||
679 : | ;; | ||
680 : | |||
681 : | // *** Allocating new stackframe: 5 inputs, 91 locals, 0 outputs, 96 rotating registers *** | ||
682 : | alloc r9 = ar.pfs, 5, 91, 0, 96 | ||
683 : | |||
684 : | // *** init loop: LC = 7, EC = number of pipeline stages, only the first stage predicate (p16) set *** | ||
685 : | mov ar.lc = 7 | ||
686 : | mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1 | ||
687 : | mov pr.rot = 1 << 16 | ||
688 : | ;; | ||
689 : | |||
690 : | // *** define register arrays and predicate array for software pipeline *** | ||
691 : | .rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] | ||
692 : | .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] | ||
693 : | ia64p | 1.3 |
694 : | ia64p | 1.2 |
695 : | ia64p | 1.3 | // software pipelined loop: |
696 : | // ld_stage: The values of ref1, ref2, cur are loaded | ||
697 : | // sh_stage: The misaligned values of ref1/2 are shifted... | ||
698 : | // or_stage: ...and joined together with OR. | ||
699 : | // pavg_stage: The average of ref1 and ref2 is computed. | ||
700 : | // up_stage: The result and the cur-values are converted to 16-bit. | ||
701 : | // psub_stage: The averaged ref-values are subtracted from the cur-values... | ||
702 : | // st_stage: ...and stored at the dct-addresses. | ||
703 : | |||
704 : | ia64p | 1.2 |
705 : | .Loop_8to16sub2: | ||
706 : | {.mii | ||
707 : | (ld_stage[0]) ld8 c[0] = [cur], stride | ||
708 : | (sh_stage[0]) shr.u ref1_l[0] = ref1_vl[LL], offset_1 | ||
709 : | (sh_stage[0]) shl ref1_h[0] = ref1_vh[LL], aoffset_1 | ||
710 : | } | ||
711 : | {.mii | ||
712 : | (ld_stage[0]) ld8 ref1_vl[0] = [ref1_al], stride | ||
713 : | (sh_stage[0]) shr.u ref2_l[0] = ref2_vl[LL], offset_2 | ||
714 : | (sh_stage[0]) shl ref2_h[0] = ref2_vh[LL], aoffset_2 | ||
715 : | } | ||
716 : | {.mii | ||
717 : | (ld_stage[0]) ld8 ref1_vh[0] = [ref1_ah], stride | ||
718 : | (or_stage[0]) or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL] | ||
719 : | (or_stage[0]) or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL] | ||
720 : | } | ||
721 : | {.mii | ||
722 : | (ld_stage[0]) ld8 ref2_vl[0] = [ref2_al], stride | ||
723 : | (pavg_stage[0]) pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL] | ||
724 : | (up_stage[0]) unpack1.l r16_r[0] = zero, r[PAVGL] | ||
725 : | } | ||
726 : | {.mii | ||
727 : | (ld_stage[0]) ld8 ref2_vh[0] = [ref2_ah], stride | ||
728 : | (up_stage[0]) unpack1.h r16_l[0] = zero, r[PAVGL] | ||
729 : | (up_stage[0]) unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL] | ||
730 : | } | ||
731 : | {.mii | ||
732 : | (st_stage[0]) st8 [dct_ar] = dct16_r[PSL], 16 | ||
733 : | (up_stage[0]) unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL] | ||
734 : | (psub_stage[0]) psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL] | ||
735 : | } | ||
736 : | {.mib | ||
737 : | (st_stage[0]) st8 [dct_al] = dct16_l[PSL], 16 | ||
738 : | (psub_stage[0]) psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL] | ||
739 : | br.ctop.sptk.few .Loop_8to16sub2 // loop back for the next row | ||
740 : | ;; | ||
741 : | } | ||
742 : | |||
743 : | // *** Restore old LC and PRs *** | ||
744 : | mov ar.lc = oldLC | ||
745 : | mov pr = oldPR, -1 | ||
746 : | |||
747 : | ia64p | 1.1 | br.ret.sptk.many b0 |
748 : | .endp transfer_8to16sub2_ia64# |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |