1 |
/**************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
2 |
* |
// |
3 |
* mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, |
// mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, |
4 |
* University of Karlsruhe, Germany, 03.06.2002, during the laboratory |
// University of Karlsruhe, Germany, 03.06.2002, during the laboratory |
5 |
* "IA-64 Video Codec Assember Parktikum" at IPD Goos. |
// "IA-64 Video Codec Assember Parktikum" at IPD Goos. |
6 |
* |
// |
7 |
* Annotations: |
// |
8 |
* =========== |
///// legal header taken from original C-file /////////////////////////////////////// |
9 |
* |
// |
10 |
* - All functions work on 8x8-matrices. While the C-code-functions treat each |
// XVID MPEG-4 VIDEO CODEC |
11 |
* element separately, the functions in this assembler-code treat a whole line |
// - 8bit<->16bit transfer - |
12 |
* simultaneously. So one loop is saved. |
// |
13 |
* The remaining loop is realized by using software pipelining with rotating |
// This program is an implementation of a part of one or more MPEG-4 |
14 |
* registers. |
// Video tools as specified in ISO/IEC 14496-2 standard. Those intending |
15 |
* - Register renaming is used for better readability |
// to use this software module in hardware or software products are |
16 |
* - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both |
// advised that its use may infringe existing patents or copyrights, and |
17 |
* parts are shifted and joined together with an "OR"-Instruction. |
// any such use would be at such party's own risk. The original |
18 |
* - First parameter is stored in GR 32, next in GR 33, and so on. They must be |
// developer of this software module and his/her company, and subsequent |
19 |
* saved, as these GRs are used for register-rotation. |
// editors and their companies, will have no liability for use of this |
20 |
* - Some of the original, German comments used during development are left |
// software or modifications or derivatives thereof. |
21 |
* in the code. They shouldn't bother anyone. |
// |
22 |
* |
// This program is free software ; you can redistribute it and/or modify |
23 |
* Anmerkungen: |
// it under the terms of the GNU General Public License as published by |
24 |
* ============ |
// the Free Software Foundation ; either version 2 of the License, or |
25 |
* |
// (at your option) any later version. |
26 |
* - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code |
// |
27 |
* jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler- |
// This program is distributed in the hope that it will be useful, |
28 |
* Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. |
// but WITHOUT ANY WARRANTY ; without even the implied warranty of |
29 |
* Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
30 |
* rotierenden Registern realisiert. |
// GNU General Public License for more details. |
31 |
* - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. |
// |
32 |
* - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke |
// You should have received a copy of the GNU General Public License |
33 |
* geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem |
// along with this program ; if not, write to the Free Software |
34 |
* logischen Oder zusammenkopiert. |
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
35 |
* - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- |
// |
36 |
* sichert werden, da die Register für die register-Rotation benötigt werden. |
///// History ///////////////////////////////////////////////////////////////// |
37 |
* - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase |
// |
38 |
* sind im Code verblieben. Sie sollten niemanden stören. |
// - 16.07.2002: several minor changes for ecc-conformity |
39 |
* |
// - 03.06.2002: initial version |
40 |
****************************************************************************/ |
// |
41 |
|
/////////////////////////////////////////////////////////////////////////////// |
42 |
|
// |
43 |
|
// Annotations: |
44 |
|
// =========== |
45 |
|
// |
46 |
|
// - All functions work on 8x8-matrices. While the C-code-functions treat each |
47 |
|
// element separately, the functions in this assembler-code treat a whole line |
48 |
|
// simultaneously. So one loop is saved. |
49 |
|
// The remaining loop is realized by using software pipelining with rotating |
50 |
|
// registers. |
51 |
|
// - Register renaming is used for better readability |
52 |
|
// - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both |
53 |
|
// parts are shifted and joined together with an "OR"-Instruction. |
54 |
|
// - First parameter is stored in GR 32, next in GR 33, and so on. They must be |
55 |
|
// saved, as these GRs are used for register-rotation. |
56 |
|
// - Some of the original, German comments used during development are left |
57 |
|
// in the code. They shouldn't bother anyone. |
58 |
|
// |
59 |
|
// Anmerkungen: |
60 |
|
// ============ |
61 |
|
// |
62 |
|
// - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code |
63 |
|
// jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler- |
64 |
|
// Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. |
65 |
|
// Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit |
66 |
|
// rotierenden Registern realisiert. |
67 |
|
// - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. |
68 |
|
// - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke |
69 |
|
// geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem |
70 |
|
// logischen Oder zusammenkopiert. |
71 |
|
// - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- |
72 |
|
// sichert werden, da die Register für die register-Rotation benötigt werden. |
73 |
|
// - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase |
74 |
|
// sind im Code verblieben. Sie sollten niemanden stören. |
75 |
|
// |
76 |
|
/////////////////////////////////////////////////////////////////////////////// |
77 |
|
|
78 |
|
|
79 |
// *** define Latencies for software pipelines *** |
// *** define Latencies for software pipelines *** |
91 |
.text |
.text |
92 |
|
|
93 |
|
|
94 |
/**************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
95 |
* |
// |
96 |
* transfer8x8_copy_ia64 |
// transfer8x8_copy_ia64 |
97 |
* |
// |
98 |
* SRC is misaligned; to align the source, load two 8-byte words, shift them, |
// SRC is misaligned; to align the source, load two 8-byte words, shift them, |
99 |
* join them and store the aligned source into the destination address. |
// join them and store the aligned source into the destination address. |
100 |
* |
// |
101 |
****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
102 |
|
|
103 |
.align 16 |
.align 16 |
104 |
.global transfer8x8_copy_ia64# |
.global transfer8x8_copy_ia64# |
121 |
offset = r18 // shift right offset |
offset = r18 // shift right offset |
122 |
aoffset = r19 // shift left offset |
aoffset = r19 // shift left offset |
123 |
|
|
|
|
|
|
.body |
|
|
|
|
124 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
125 |
.save ar.lc, oldLC |
.save ar.lc, oldLC |
126 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
127 |
mov oldPR = pr |
mov oldPR = pr |
128 |
|
|
129 |
|
.body |
130 |
|
|
131 |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
132 |
alloc r9 = ar.pfs, 3, 29, 0, 32 |
alloc r9 = ar.pfs, 3, 29, 0, 32 |
133 |
|
|
153 |
.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] |
.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] |
154 |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] |
155 |
|
|
156 |
/* Software pipelined loop: |
|
157 |
* Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2 |
// Software pipelined loop: |
158 |
* Stage 2: Shift both values of source to SHD_R and SHD_L |
// Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2 |
159 |
* Stage 3: Join both parts together with OR |
// Stage 2: Shift both values of source to SHD_R and SHD_L |
160 |
* Stage 4: Store aligned data to destination and add stride to destination address */ |
// Stage 3: Join both parts together with OR |
161 |
|
// Stage 4: Store aligned data to destination and add stride to destination address |
162 |
|
|
163 |
|
|
164 |
.Loop_8x8copy: |
.Loop_8x8copy: |
165 |
{.mii |
{.mii |
166 |
(ld_stage[0]) ld8 src_v1[0] = [src_1], stride |
(ld_stage[0]) ld8 src_v1[0] = [src_1], stride |
189 |
|
|
190 |
|
|
191 |
|
|
192 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
193 |
* |
// |
194 |
* transfer_8to16copy_ia64 |
// transfer_8to16copy_ia64 |
195 |
* |
// |
196 |
* SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, |
// SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, |
197 |
* UNPACK is used. So 8 bytes are loaded from source, unpacked to two |
// UNPACK is used. So 8 bytes are loaded from source, unpacked to two |
198 |
* 4 x 16 bit values and stored to the destination. Destination is a continuous |
// 4 x 16 bit values and stored to the destination. Destination is a continuous |
199 |
* array of 64 x 16 bit signed data. To store the next line, only 16 must be |
// array of 64 x 16 bit signed data. To store the next line, only 16 must be |
200 |
* added to the destination address. |
// added to the destination address. |
201 |
*****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
202 |
|
|
203 |
.align 16 |
.align 16 |
204 |
.global transfer_8to16copy_ia64# |
.global transfer_8to16copy_ia64# |
219 |
src = r16 |
src = r16 |
220 |
stride = r17 |
stride = r17 |
221 |
|
|
|
|
|
|
.body |
|
|
|
|
222 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
223 |
.save ar.lc, oldLC |
.save ar.lc, oldLC |
224 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
225 |
mov oldPR = pr |
mov oldPR = pr |
226 |
|
|
227 |
|
|
228 |
|
.body |
229 |
|
|
230 |
// *** Allocating new stackframe, define rotating registers *** |
// *** Allocating new stackframe, define rotating registers *** |
231 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
232 |
|
|
247 |
.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] |
.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] |
248 |
.rotp ld_stage[LL], upack_stage[UL], st_stage[1] |
.rotp ld_stage[LL], upack_stage[UL], st_stage[1] |
249 |
|
|
250 |
/* Software pipelined loop: |
|
251 |
* Stage 1: Load value of SRC |
// Software pipelined loop: |
252 |
* Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data |
// Stage 1: Load value of SRC |
253 |
* Stage 3: Store both 8 byte of 16 bit data */ |
// Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data |
254 |
|
// Stage 3: Store both 8 byte of 16 bit data |
255 |
|
|
256 |
|
|
257 |
.Loop_8to16copy: |
.Loop_8to16copy: |
258 |
{.mii |
{.mii |
259 |
(ld_stage[0]) ld8 src_v[0] = [src], stride |
(ld_stage[0]) ld8 src_v[0] = [src], stride |
277 |
|
|
278 |
|
|
279 |
|
|
280 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
281 |
* |
// |
282 |
* transfer_16to8copy_ia64 |
// transfer_16to8copy_ia64 |
283 |
* |
// |
284 |
* src is a 64 x 16 bit signed continuous array. To convert the 16 bit |
// src is a 64 x 16 bit signed continuous array. To convert the 16 bit |
285 |
* values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of |
// values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of |
286 |
* 4 x 16 bit signed data are loaded, packed together and stored as an 8-byte-word |
// 4 x 16 bit signed data are loaded, packed together and stored as an 8-byte-word |
287 |
* of 8 x 8 unsigned data to the destination. |
// of 8 x 8 unsigned data to the destination. |
288 |
****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
289 |
|
|
290 |
.align 16 |
.align 16 |
291 |
.global transfer_16to8copy_ia64# |
.global transfer_16to8copy_ia64# |
299 |
src_2 = r17 |
src_2 = r17 |
300 |
stride = r16 |
stride = r16 |
301 |
|
|
|
|
|
|
.body |
|
|
|
|
302 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
303 |
.save ar.lc, oldLC |
.save ar.lc, oldLC |
304 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
305 |
mov oldPR = pr |
mov oldPR = pr |
306 |
|
|
307 |
|
|
308 |
|
.body |
309 |
|
|
310 |
// *** Allocating new stackframe, define rotating registers *** |
// *** Allocating new stackframe, define rotating registers *** |
311 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
312 |
|
|
328 |
.rotp ld_stage[LL], pack_stage[PL], st_stage[1] |
.rotp ld_stage[LL], pack_stage[PL], st_stage[1] |
329 |
|
|
330 |
|
|
331 |
/* Software pipelined loop: |
// Software pipelined loop: |
332 |
* Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data |
// Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data |
333 |
* Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data |
// Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data |
334 |
* Stage 3: Store the 8 byte to the destination address and add stride to |
// Stage 3: Store the 8 byte to the destination address and add stride to |
335 |
* destination address (to get the next 8 byte line of destination)*/ |
// destination address (to get the next 8 byte line of destination) |
336 |
|
|
337 |
|
|
338 |
.Loop_16to8copy: |
.Loop_16to8copy: |
339 |
{.mmi |
{.mmi |
340 |
(ld_stage[0]) ld8 src_v1[0] = [src_1], 16 |
(ld_stage[0]) ld8 src_v1[0] = [src_1], 16 |
357 |
|
|
358 |
|
|
359 |
|
|
360 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
361 |
* |
// |
362 |
* transfer_16to8add_ia64 |
// transfer_16to8add_ia64 |
363 |
* |
// |
364 |
* The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- |
// The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- |
365 |
* bit-values. These are "parallel-added" to the values of src. The result is |
// bit-values. These are "parallel-added" to the values of src. The result is |
366 |
* converted into 8-bit-values using "PACK" and stored at the address of dst. |
// converted into 8-bit-values using "PACK" and stored at the address of dst. |
367 |
* We assume that there is no misalignment. |
// We assume that there is no misalignment. |
368 |
* |
// |
369 |
*****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
370 |
|
|
371 |
.align 16 |
.align 16 |
372 |
.global transfer_16to8add_ia64# |
.global transfer_16to8add_ia64# |
382 |
|
|
383 |
_src = r17 |
_src = r17 |
384 |
|
|
|
|
|
|
.body |
|
|
|
|
385 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
386 |
.save ar.lc, r2 |
.save ar.lc, r2 |
387 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
388 |
mov oldPR = pr |
mov oldPR = pr |
389 |
|
|
390 |
|
|
391 |
|
.body |
392 |
|
|
393 |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
394 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
395 |
|
|
410 |
.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] |
.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] |
411 |
|
|
412 |
|
|
413 |
/* Software pipelined loop: |
// Software pipelined loop: |
414 |
* s1_p: The values of src and dst are loaded |
// s1_p: The values of src and dst are loaded |
415 |
* s2_p: The dst-values are converted to 16-bit-values |
// s2_p: The dst-values are converted to 16-bit-values |
416 |
* s3_p: The values of src and dst are added |
// s3_p: The values of src and dst are added |
417 |
* s4_p: The Results are packed into 8-bit-values |
// s4_p: The Results are packed into 8-bit-values |
418 |
* s5_p: The 8-bit-values are stored at the dst-addresses |
// s5_p: The 8-bit-values are stored at the dst-addresses |
419 |
*/ |
|
420 |
|
|
421 |
.Loop_16to8add: |
.Loop_16to8add: |
422 |
{.mii |
{.mii |
450 |
|
|
451 |
|
|
452 |
|
|
453 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
454 |
* |
// |
455 |
* transfer_8to16sub_ia64 |
// transfer_8to16sub_ia64 |
456 |
* |
// |
457 |
* The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The |
// The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The |
458 |
* difference of cur and ref is stored at the dct-addresses and cur is copied |
// difference of cur and ref is stored at the dct-addresses and cur is copied |
459 |
* into the ref-array. |
// into the ref-array. |
460 |
* |
// |
461 |
* You must assume that the data addressed by 'ref' are misaligned in memory. |
// You must assume that the data addressed by 'ref' are misaligned in memory. |
462 |
* But you can assume, that the other data are aligned (at least I hope so). |
// But you can assume, that the other data are aligned (at least I hope so). |
463 |
* |
// |
464 |
****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
465 |
|
|
466 |
.align 16 |
.align 16 |
467 |
.global transfer_8to16sub_ia64# |
.global transfer_8to16sub_ia64# |
490 |
|
|
491 |
_dct = r21 // Register für die Zieladressen des 2. dct-Blocks |
_dct = r21 // Register für die Zieladressen des 2. dct-Blocks |
492 |
|
|
|
|
|
|
.body |
|
|
|
|
493 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
494 |
.save ar.lc, r2 |
.save ar.lc, r2 |
495 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
496 |
mov oldPR = pr |
mov oldPR = pr |
497 |
|
|
498 |
|
|
499 |
|
.body |
500 |
|
|
501 |
// *** Allocating new stackframe, define rotating registers *** |
// *** Allocating new stackframe, define rotating registers *** |
502 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
503 |
|
|
525 |
.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] |
.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] |
526 |
|
|
527 |
|
|
528 |
/* Software pipelined loop: |
// Software pipelined loop: |
529 |
* s1_p: The values of ref and cur are loaded, a copy of cur is made. |
// s1_p: The values of ref and cur are loaded, a copy of cur is made. |
530 |
* s2_p: cur is converted to 16-bit and the misaligned values of ref are |
// s2_p: cur is converted to 16-bit and the misaligned values of ref are |
531 |
* shifted... |
// shifted... |
532 |
* s3_p: ... and copied together. |
// s3_p: ... and copied together. |
533 |
* s4_p: This ref-value is converted to 16-bit. The values of cur are stored |
// s4_p: This ref-value is converted to 16-bit. The values of cur are stored |
534 |
* at the ref-addresses. |
// at the ref-addresses. |
535 |
* s5_p: the ref- and cur-values are subtracted... |
// s5_p: the ref- and cur-values are subtracted... |
536 |
* s6_p: ...and the result is stored at the dct-addresses. |
// s6_p: ...and the result is stored at the dct-addresses. |
537 |
*/ |
|
538 |
|
|
539 |
loop_8to16sub: |
loop_8to16sub: |
540 |
{.mii |
{.mii |
580 |
|
|
581 |
|
|
582 |
|
|
583 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
584 |
* |
// |
585 |
* transfer_8to16sub2_ia64 |
// transfer_8to16sub2_ia64 |
586 |
* |
// |
587 |
* At the time, this function was written, it was not yet in use. |
// At the time, this function was written, it was not yet in use. |
588 |
* We assume that the values of ref1/2 are misaligned. |
// We assume that the values of ref1/2 are misaligned. |
589 |
* |
// |
590 |
* The values of ref1/2 and cur are loaded, the ref-values need misalignment- |
// The values of ref1/2 and cur are loaded, the ref-values need misalignment- |
591 |
* treatment. The values are converted to 16-bit using unpack. The average of |
// treatment. The values are converted to 16-bit using unpack. The average of |
592 |
* ref1 and ref2 is computed with pavg and subtracted from cur. The results are |
// ref1 and ref2 is computed with pavg and subtracted from cur. The results are |
593 |
* stored at the dct-addresses. |
// stored at the dct-addresses. |
594 |
* pavg1.raz is used to get the same results as the C-code-function. |
// pavg1.raz is used to get the same results as the C-code-function. |
595 |
* |
// |
596 |
*****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
597 |
|
|
598 |
.text |
.text |
599 |
.align 16 |
.align 16 |
625 |
aoffset_1 = r24 |
aoffset_1 = r24 |
626 |
aoffset_2 = r25 |
aoffset_2 = r25 |
627 |
|
|
|
|
|
|
.body |
|
|
|
|
628 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
629 |
.save ar.lc, r2 |
.save ar.lc, r2 |
630 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
631 |
mov oldPR = pr |
mov oldPR = pr |
632 |
|
|
633 |
|
|
634 |
|
.body |
635 |
|
|
636 |
// *** Saving Parameters *** |
// *** Saving Parameters *** |
637 |
// *** (as input registers r32+ are needed for register rotation) *** |
// *** (as input registers r32+ are needed for register rotation) *** |
638 |
mov dct_ar = r32 |
mov dct_ar = r32 |
667 |
.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] |
.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] |
668 |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] |
669 |
|
|
670 |
/* software pipelined loop: |
|
671 |
* ld_stage: The values of ref1, ref2, cur are loaded |
// software pipelined loop: |
672 |
* sh_stage: The misaligned values of ref1/2 are shifted... |
// ld_stage: The values of ref1, ref2, cur are loaded |
673 |
* or_stage: ...and copied together. |
// sh_stage: The misaligned values of ref1/2 are shifted... |
674 |
* pavg_stage: The average of ref1 and ref2 is computed. |
// or_stage: ...and copied together. |
675 |
* up_stage: The result and the cur-values are converted to 16-bit. |
// pavg_stage: The average of ref1 and ref2 is computed. |
676 |
* psub_stage: Those values are subtracted... |
// up_stage: The result and the cur-values are converted to 16-bit. |
677 |
* st_stage: ...and stored at the dct-addresses. |
// psub_stage: Those values are subtracted... |
678 |
*/ |
// st_stage: ...and stored at the dct-addresses. |
679 |
|
|
680 |
|
|
681 |
.Loop_8to16sub2: |
.Loop_8to16sub2: |
682 |
{.mii |
{.mii |