1 |
/**************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
2 |
* |
// |
3 |
* mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, |
// mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, |
4 |
* University of Karlsruhe, Germany, 03.06.2002, during the laboratory |
// University of Karlsruhe, Germany, 03.06.2002, during the laboratory |
5 |
* "IA-64 Video Codec Assembler Praktikum" at IPD Goos. |
// "IA-64 Video Codec Assembler Praktikum" at IPD Goos. |
6 |
* |
// |
7 |
* Annotations: |
// |
8 |
* =========== |
///// legal header taken from original C-file /////////////////////////////////////// |
9 |
* |
// |
10 |
* - All functions work on 8x8-matrices. While the C-code-functions treat each |
// * XVID MPEG-4 VIDEO CODEC |
11 |
* element separately, the functions in this assembler-code treat a whole line |
// * - 8bit<->16bit transfer - |
12 |
* simultaneously. So one loop is saved. |
// * |
13 |
* The remaining loop is realized by using software pipelining with rotating |
// * This file is part of XviD, a free MPEG-4 video encoder/decoder |
14 |
* registers. |
// * |
15 |
* - Register renaming is used for better readability |
// * XviD is free software; you can redistribute it and/or modify it |
16 |
* - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both |
// * under the terms of the GNU General Public License as published by |
17 |
* parts are shifted and joined together with an "OR"-Instruction. |
// * the Free Software Foundation; either version 2 of the License, or |
18 |
* - First parameter is stored in GR 32, next in GR 33, and so on. They must be |
// * (at your option) any later version. |
19 |
* saved, as these GRs are used for register-rotation. |
// * |
20 |
* - Some of the original German comments used during development are left |
// * This program is distributed in the hope that it will be useful, |
21 |
* in the code. They shouldn't bother anyone. |
// * but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 |
* |
// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 |
* Anmerkungen: |
// * GNU General Public License for more details. |
24 |
* ============ |
// * |
25 |
* |
// * You should have received a copy of the GNU General Public License |
26 |
* - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code |
// * along with this program; if not, write to the Free Software |
27 |
* jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler- |
// * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
28 |
* Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. |
// * |
29 |
* Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit |
// * Under section 8 of the GNU General Public License, the copyright |
30 |
* rotierenden Registern realisiert. |
// * holders of XVID explicitly forbid distribution in the following |
31 |
* - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. |
// * countries: |
32 |
* - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke |
// * |
33 |
* geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem |
// * - Japan |
34 |
* logischen Oder zusammenkopiert. |
// * - United States of America |
35 |
* - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- |
// * |
36 |
* sichert werden, da die Register für die register-Rotation benötigt werden. |
// * Linking XviD statically or dynamically with other modules is making a |
37 |
* - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase |
// * combined work based on XviD. Thus, the terms and conditions of the |
38 |
* sind im Code verblieben. Sie sollten niemanden stören. |
// * GNU General Public License cover the whole combination. |
39 |
* |
// * |
40 |
****************************************************************************/ |
// * As a special exception, the copyright holders of XviD give you |
41 |
|
// * permission to link XviD with independent modules that communicate with |
42 |
|
// * XviD solely through the VFW1.1 and DShow interfaces, regardless of the |
43 |
|
// * license terms of these independent modules, and to copy and distribute |
44 |
|
// * the resulting combined work under terms of your choice, provided that |
45 |
|
// * every copy of the combined work is accompanied by a complete copy of |
46 |
|
// * the source code of XviD (the version of XviD used to produce the |
47 |
|
// * combined work), being distributed under the terms of the GNU General |
48 |
|
// * Public License plus this exception. An independent module is a module |
49 |
|
// * which is not derived from or based on XviD. |
50 |
|
// * |
51 |
|
// * Note that people who make modified versions of XviD are not obligated |
52 |
|
// * to grant this special exception for their modified versions; it is |
53 |
|
// * their choice whether to do so. The GNU General Public License gives |
54 |
|
// * permission to release a modified version without this exception; this |
55 |
|
// * exception also makes it possible to release a modified version which |
56 |
|
// * carries forward this exception. |
57 |
|
// * |
58 |
|
// * $Id$ |
59 |
|
// |
60 |
|
///// History ///////////////////////////////////////////////////////////////// |
61 |
|
// |
62 |
|
// - 16.07.2002: several minor changes for ecc-conformity |
63 |
|
// - 03.06.2002: initial version |
64 |
|
// |
65 |
|
/////////////////////////////////////////////////////////////////////////////// |
66 |
|
// |
67 |
|
// Annotations: |
68 |
|
// =========== |
69 |
|
// |
70 |
|
// - All functions work on 8x8-matrices. While the C-code-functions treat each |
71 |
|
// element separately, the functions in this assembler-code treat a whole line |
72 |
|
// simultaneously. So one loop is saved. |
73 |
|
// The remaining loop is realized by using software pipelining with rotating |
74 |
|
// registers. |
75 |
|
// - Register renaming is used for better readability |
76 |
|
// - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both |
77 |
|
// parts are shifted and joined together with an "OR"-Instruction. |
78 |
|
// - First parameter is stored in GR 32, next in GR 33, and so on. They must be |
79 |
|
// saved, as these GRs are used for register-rotation. |
80 |
|
// - Some of the original German comments used during development are left |
81 |
|
// in the code. They shouldn't bother anyone. |
82 |
|
// |
83 |
|
// Anmerkungen: |
84 |
|
// ============ |
85 |
|
// |
86 |
|
// - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code |
87 |
|
// jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler- |
88 |
|
// Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. |
89 |
|
// Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit |
90 |
|
// rotierenden Registern realisiert. |
91 |
|
// - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. |
92 |
|
// - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke |
93 |
|
// geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem |
94 |
|
// logischen Oder zusammenkopiert. |
95 |
|
// - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- |
96 |
|
// sichert werden, da die Register für die register-Rotation benötigt werden. |
97 |
|
// - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase |
98 |
|
// sind im Code verblieben. Sie sollten niemanden stören. |
99 |
|
// |
100 |
|
/////////////////////////////////////////////////////////////////////////////// |
101 |
|
|
102 |
|
|
103 |
// *** Define latencies for software pipelines *** |
// *** Define latencies for software pipelines *** |
115 |
.text |
.text |
116 |
|
|
117 |
|
|
118 |
/**************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
119 |
* |
// |
120 |
* transfer8x8_copy_ia64 |
// transfer8x8_copy_ia64 |
121 |
* |
// |
122 |
* SRC is misaligned; to align the source, load two 8-byte words, shift them, |
// SRC is misaligned; to align the source, load two 8-byte words, shift them, |
123 |
* join them and store the aligned source into the destination address. |
// join them and store the aligned source into the destination address. |
124 |
* |
// |
125 |
****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
126 |
|
|
127 |
.align 16 |
.align 16 |
128 |
.global transfer8x8_copy_ia64# |
.global transfer8x8_copy_ia64# |
145 |
offset = r18 // shift right offset |
offset = r18 // shift right offset |
146 |
aoffset = r19 // shift left offset |
aoffset = r19 // shift left offset |
147 |
|
|
|
|
|
|
.body |
|
|
|
|
148 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
149 |
.save ar.lc, oldLC |
.save ar.lc, oldLC |
150 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
151 |
mov oldPR = pr |
mov oldPR = pr |
152 |
|
|
153 |
|
.body |
154 |
|
|
155 |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
156 |
alloc r9 = ar.pfs, 3, 29, 0, 32 |
alloc r9 = ar.pfs, 3, 29, 0, 32 |
157 |
|
|
177 |
.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] |
.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] |
178 |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] |
179 |
|
|
180 |
/* Software pipelined loop: |
|
181 |
* Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2 |
// Software pipelined loop: |
182 |
* Stage 2: Shift both values of source to SHD_R and SHD_L |
// Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2 |
183 |
* Stage 3: Join both parts together with OR |
// Stage 2: Shift both values of source to SHD_R and SHD_L |
184 |
* Stage 4: Store aligned data to destination and add stride to destination address */ |
// Stage 3: Join both parts together with OR |
185 |
|
// Stage 4: Store aligned data to destination and add stride to destination address |
186 |
|
|
187 |
|
|
188 |
.Loop_8x8copy: |
.Loop_8x8copy: |
189 |
{.mii |
{.mii |
190 |
(ld_stage[0]) ld8 src_v1[0] = [src_1], stride |
(ld_stage[0]) ld8 src_v1[0] = [src_1], stride |
213 |
|
|
214 |
|
|
215 |
|
|
216 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
217 |
* |
// |
218 |
* transfer_8to16copy_ia64 |
// transfer_8to16copy_ia64 |
219 |
* |
// |
220 |
* SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, |
// SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, |
221 |
* UNPACK is used. So 8 bytes are loaded from source, unpacked to two |
// UNPACK is used. So 8 bytes are loaded from source, unpacked to two |
222 |
* 4 x 16 bit values and stored to the destination. Destination is a continuous |
// 4 x 16 bit values and stored to the destination. Destination is a continuous |
223 |
* array of 64 x 16 bit signed data. To store the next line, only 16 must be |
// array of 64 x 16 bit signed data. To store the next line, only 16 must be |
224 |
* added to the destination address. |
// added to the destination address. |
225 |
*****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
226 |
|
|
227 |
.align 16 |
.align 16 |
228 |
.global transfer_8to16copy_ia64# |
.global transfer_8to16copy_ia64# |
243 |
src = r16 |
src = r16 |
244 |
stride = r17 |
stride = r17 |
245 |
|
|
|
|
|
|
.body |
|
|
|
|
246 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
247 |
.save ar.lc, oldLC |
.save ar.lc, oldLC |
248 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
249 |
mov oldPR = pr |
mov oldPR = pr |
250 |
|
|
251 |
|
|
252 |
|
.body |
253 |
|
|
254 |
// *** Allocating new stackframe, define rotating registers *** |
// *** Allocating new stackframe, define rotating registers *** |
255 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
256 |
|
|
271 |
.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] |
.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] |
272 |
.rotp ld_stage[LL], upack_stage[UL], st_stage[1] |
.rotp ld_stage[LL], upack_stage[UL], st_stage[1] |
273 |
|
|
274 |
/* Software pipelined loop: |
|
275 |
* Stage 1: Load value of SRC |
// Software pipelined loop: |
276 |
* Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data |
// Stage 1: Load value of SRC |
277 |
* Stage 3: Store both 8 byte of 16 bit data */ |
// Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data |
278 |
|
// Stage 3: Store both 8 byte of 16 bit data |
279 |
|
|
280 |
|
|
281 |
.Loop_8to16copy: |
.Loop_8to16copy: |
282 |
{.mii |
{.mii |
283 |
(ld_stage[0]) ld8 src_v[0] = [src], stride |
(ld_stage[0]) ld8 src_v[0] = [src], stride |
301 |
|
|
302 |
|
|
303 |
|
|
304 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
305 |
* |
// |
306 |
* transfer_16to8copy_ia64 |
// transfer_16to8copy_ia64 |
307 |
* |
// |
308 |
* src is a 64 x 16 bit signed continuous array. To convert the 16 bit |
// src is a 64 x 16 bit signed continuous array. To convert the 16 bit |
309 |
* values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of |
// values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of |
310 |
* 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word |
// 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word |
311 |
* of 8 x 8 unsigned data to the destination. |
// of 8 x 8 unsigned data to the destination. |
312 |
****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
313 |
|
|
314 |
.align 16 |
.align 16 |
315 |
.global transfer_16to8copy_ia64# |
.global transfer_16to8copy_ia64# |
323 |
src_2 = r17 |
src_2 = r17 |
324 |
stride = r16 |
stride = r16 |
325 |
|
|
|
|
|
|
.body |
|
|
|
|
326 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
327 |
.save ar.lc, oldLC |
.save ar.lc, oldLC |
328 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
329 |
mov oldPR = pr |
mov oldPR = pr |
330 |
|
|
331 |
|
|
332 |
|
.body |
333 |
|
|
334 |
// *** Allocating new stackframe, define rotating registers *** |
// *** Allocating new stackframe, define rotating registers *** |
335 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
336 |
|
|
352 |
.rotp ld_stage[LL], pack_stage[PL], st_stage[1] |
.rotp ld_stage[LL], pack_stage[PL], st_stage[1] |
353 |
|
|
354 |
|
|
355 |
/* Software pipelined loop: |
// Software pipelined loop: |
356 |
* Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data |
// Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data |
357 |
* Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data |
// Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data |
358 |
* Stage 3: Store the 8 byte to the destination address and add stride to |
// Stage 3: Store the 8 byte to the destination address and add stride to |
359 |
* destination address (to get the next 8 byte line of destination)*/ |
// destination address (to get the next 8 byte line of destination) |
360 |
|
|
361 |
|
|
362 |
.Loop_16to8copy: |
.Loop_16to8copy: |
363 |
{.mmi |
{.mmi |
364 |
(ld_stage[0]) ld8 src_v1[0] = [src_1], 16 |
(ld_stage[0]) ld8 src_v1[0] = [src_1], 16 |
381 |
|
|
382 |
|
|
383 |
|
|
384 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
385 |
* |
// |
386 |
* transfer_16to8add_ia64 |
// transfer_16to8add_ia64 |
387 |
* |
// |
388 |
* The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- |
// The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- |
389 |
* bit-values. These are "parallel-added" to the values of src. The result is |
// bit-values. These are "parallel-added" to the values of src. The result is |
390 |
* converted into 8-bit-values using "PACK" and stored at the address of dst. |
// converted into 8-bit-values using "PACK" and stored at the address of dst. |
391 |
* We assume that there is no misalignment. |
// We assume that there is no misalignment. |
392 |
* |
// |
393 |
*****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
394 |
|
|
395 |
.align 16 |
.align 16 |
396 |
.global transfer_16to8add_ia64# |
.global transfer_16to8add_ia64# |
406 |
|
|
407 |
_src = r17 |
_src = r17 |
408 |
|
|
|
|
|
|
.body |
|
|
|
|
409 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
410 |
.save ar.lc, r2 |
.save ar.lc, r2 |
411 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
412 |
mov oldPR = pr |
mov oldPR = pr |
413 |
|
|
414 |
|
|
415 |
|
.body |
416 |
|
|
417 |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
418 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
419 |
|
|
434 |
.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] |
.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] |
435 |
|
|
436 |
|
|
437 |
/* Software pipelined loop: |
// Software pipelined loop: |
438 |
* s1_p: The values of src and dst are loaded |
// s1_p: The values of src and dst are loaded |
439 |
* s2_p: The dst-values are converted to 16-bit-values |
// s2_p: The dst-values are converted to 16-bit-values |
440 |
* s3_p: The values of src and dst are added |
// s3_p: The values of src and dst are added |
441 |
* s4_p: The Results are packed into 8-bit-values |
// s4_p: The Results are packed into 8-bit-values |
442 |
* s5_p: The 8-bit-values are stored at the dst-addresses |
// s5_p: The 8-bit-values are stored at the dst-addresses |
443 |
*/ |
|
444 |
|
|
445 |
.Loop_16to8add: |
.Loop_16to8add: |
446 |
{.mii |
{.mii |
474 |
|
|
475 |
|
|
476 |
|
|
477 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
478 |
* |
// |
479 |
* transfer_8to16sub_ia64 |
// transfer_8to16sub_ia64 |
480 |
* |
// |
481 |
* The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The |
// The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The |
482 |
* difference of cur and ref is stored at the dct-addresses and cur is copied |
// difference of cur and ref is stored at the dct-addresses and cur is copied |
483 |
* into the ref-array. |
// into the ref-array. |
484 |
* |
// |
485 |
* You must assume that the data addressed by 'ref' are misaligned in memory. |
// You must assume that the data addressed by 'ref' are misaligned in memory. |
486 |
* But you can assume, that the other data are aligned (at least I hope so). |
// But you can assume, that the other data are aligned (at least I hope so). |
487 |
* |
// |
488 |
****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
489 |
|
|
490 |
.align 16 |
.align 16 |
491 |
.global transfer_8to16sub_ia64# |
.global transfer_8to16sub_ia64# |
514 |
|
|
515 |
_dct = r21 // Register für die Zieladressen des 2. dct-Blocks |
_dct = r21 // Register für die Zieladressen des 2. dct-Blocks |
516 |
|
|
|
|
|
|
.body |
|
|
|
|
517 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
518 |
.save ar.lc, r2 |
.save ar.lc, r2 |
519 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
520 |
mov oldPR = pr |
mov oldPR = pr |
521 |
|
|
522 |
|
|
523 |
|
.body |
524 |
|
|
525 |
// *** Allocating new stackframe, define rotating registers *** |
// *** Allocating new stackframe, define rotating registers *** |
526 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
527 |
|
|
549 |
.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] |
.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] |
550 |
|
|
551 |
|
|
552 |
/* Software pipelined loop: |
// Software pipelined loop: |
553 |
* s1_p: The values of ref and cur are loaded, a copy of cur is made. |
// s1_p: The values of ref and cur are loaded, a copy of cur is made. |
554 |
* s2_p: cur is converted to 16-bit and the misaligned values of ref are |
// s2_p: cur is converted to 16-bit and the misaligned values of ref are |
555 |
* shifted... |
// shifted... |
556 |
* s3_p: ... and copied together. |
// s3_p: ... and copied together. |
557 |
* s4_p: This ref-value is converted to 16-bit. The values of cur are stored |
// s4_p: This ref-value is converted to 16-bit. The values of cur are stored |
558 |
* at the ref-addresses. |
// at the ref-addresses. |
559 |
* s5_p: the ref- and cur-values are subtracted... |
// s5_p: the ref- and cur-values are subtracted... |
560 |
* s6_p: ...and the result is stored at the dct-addresses. |
// s6_p: ...and the result is stored at the dct-addresses. |
561 |
*/ |
|
562 |
|
|
563 |
loop_8to16sub: |
loop_8to16sub: |
564 |
{.mii |
{.mii |
604 |
|
|
605 |
|
|
606 |
|
|
607 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
608 |
* |
// |
609 |
* transfer_8to16sub2_ia64 |
// transfer_8to16sub2_ia64 |
610 |
* |
// |
611 |
* At the time, this function was written, it was not yet in use. |
// At the time, this function was written, it was not yet in use. |
612 |
* We assume that the values of ref1/2 are misaligned. |
// We assume that the values of ref1/2 are misaligned. |
613 |
* |
// |
614 |
* The values of ref1/2 and cur are loaded, the ref-values need misalignment- |
// The values of ref1/2 and cur are loaded, the ref-values need misalignment- |
615 |
* treatment. The values are converted to 16-bit using unpack. The average of |
// treatment. The values are converted to 16-bit using unpack. The average of |
616 |
* ref1 and ref2 is computed with pavg and subtracted from cur. The results are |
// ref1 and ref2 is computed with pavg and subtracted from cur. The results are |
617 |
* stored at the dct-addresses. |
// stored at the dct-addresses. |
618 |
* pavg1.raz is used to get the same results as the C-code-function. |
// pavg1.raz is used to get the same results as the C-code-function. |
619 |
* |
// |
620 |
*****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
621 |
|
|
622 |
.text |
.text |
623 |
.align 16 |
.align 16 |
649 |
aoffset_1 = r24 |
aoffset_1 = r24 |
650 |
aoffset_2 = r25 |
aoffset_2 = r25 |
651 |
|
|
|
|
|
|
.body |
|
|
|
|
652 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
653 |
.save ar.lc, r2 |
.save ar.lc, r2 |
654 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
655 |
mov oldPR = pr |
mov oldPR = pr |
656 |
|
|
657 |
|
|
658 |
|
.body |
659 |
|
|
660 |
// *** Saving Parameters *** |
// *** Saving Parameters *** |
661 |
// *** (as input registers r32+ are needed for register rotation) *** |
// *** (as input registers r32+ are needed for register rotation) *** |
662 |
mov dct_ar = r32 |
mov dct_ar = r32 |
691 |
.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] |
.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] |
692 |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] |
693 |
|
|
694 |
/* software pipelined loop: |
|
695 |
* ld_stage: The values of ref1, ref2, cur are loaded |
// software pipelined loop: |
696 |
* sh_stage: The misaligned values of ref1/2 are shifted... |
// ld_stage: The values of ref1, ref2, cur are loaded |
697 |
* or_stage: ...and copied together. |
// sh_stage: The misaligned values of ref1/2 are shifted... |
698 |
* pavg_stage: The average of ref1 and ref2 is computed. |
// or_stage: ...and copied together. |
699 |
* up_stage: The result and the cur-values are converted to 16-bit. |
// pavg_stage: The average of ref1 and ref2 is computed. |
700 |
* psub_stage: Those values are subtracted... |
// up_stage: The result and the cur-values are converted to 16-bit. |
701 |
* st_stage: ...and stored at the dct-addresses. |
// psub_stage: Those values are subtracted... |
702 |
*/ |
// st_stage: ...and stored at the dct-addresses. |
703 |
|
|
704 |
|
|
705 |
.Loop_8to16sub2: |
.Loop_8to16sub2: |
706 |
{.mii |
{.mii |