--- mem_transfer_ia64.s 2002/07/05 14:01:18 1.2 +++ mem_transfer_ia64.s 2002/07/16 17:55:18 1.3 @@ -1,43 +1,79 @@ -/**************************************************************************** -* -* mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, -* University of Karlsruhe, Germany, 03.06.2002, during the laboratory -* "IA-64 Video Codec Assember Parktikum" at IPD Goos. -* -* Annotations: -* =========== -* -* - All functions work on 8x8-matrices. While the C-code-functions treat each -* element seperatly, the functions in this assembler-code treat a whole line -* simultaneously. So one loop is saved. -* The remaining loop is relized by using softwarepipelining with rotating -* rregisters. -* - Register renaming is used for better readability -* - To load 8 bytes of missaligned data, two 8-byte-blocks are loaded, both -* parts are shifted and joined together with an "OR"-Instruction. -* - First parameter is stored in GR 32, next in GR 33, and so on. They must be -* saved, as these GRs are used for register-rotation. -* - Some of the orininal, German comments used during development are left in -* in the code. They shouldn't bother anyone. -* -* Anmerkungen: -* ============ -* -* - Alle Funtionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code -* jedes Element einzeln bearbeiten, bearbeiten die Funtionen dieses Assembler- -* Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. -* Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit -* rotierenden Registern realisiert. -* - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. -* - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke -* geladen, beide Teile mit "shift"-Operationen zurechterückt und mit einem -* logischen Oder zusammenkopiert. -* - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- -* sichert werden, da die Register für die register-Rotation benötigt werden. 
-* - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase -* sind im Code verblieben. Sie sollten niemanden stören. -* -****************************************************************************/ +/////////////////////////////////////////////////////////////////////////////// +// +// mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, +// University of Karlsruhe, Germany, 03.06.2002, during the laboratory +// "IA-64 Video Codec Assember Parktikum" at IPD Goos. +// +// +///// legal header taken from original C-file /////////////////////////////////////// +// +// XVID MPEG-4 VIDEO CODEC +// - 8bit<->16bit transfer - +// +// This program is an implementation of a part of one or more MPEG-4 +// Video tools as specified in ISO/IEC 14496-2 standard. Those intending +// to use this software module in hardware or software products are +// advised that its use may infringe existing patents or copyrights, and +// any such use would be at such party's own risk. The original +// developer of this software module and his/her company, and subsequent +// editors and their companies, will have no liability for use of this +// software or modifications or derivatives thereof. +// +// This program is free software ; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation ; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY ; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program ; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +///// History ///////////////////////////////////////////////////////////////// +// +// - 16.07.2002: several minor changes for ecc-conformity +// - 03.06.2002: initial version +// +/////////////////////////////////////////////////////////////////////////////// +// +// Annotations: +// =========== +// +// - All functions work on 8x8-matrices. While the C-code-functions treat each +// element seperatly, the functions in this assembler-code treat a whole line +// simultaneously. So one loop is saved. +// The remaining loop is relized by using softwarepipelining with rotating +// rregisters. +// - Register renaming is used for better readability +// - To load 8 bytes of missaligned data, two 8-byte-blocks are loaded, both +// parts are shifted and joined together with an "OR"-Instruction. +// - First parameter is stored in GR 32, next in GR 33, and so on. They must be +// saved, as these GRs are used for register-rotation. +// - Some of the orininal, German comments used during development are left in +// in the code. They shouldn't bother anyone. +// +// Anmerkungen: +// ============ +// +// - Alle Funtionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code +// jedes Element einzeln bearbeiten, bearbeiten die Funtionen dieses Assembler- +// Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. +// Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit +// rotierenden Registern realisiert. +// - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. +// - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke +// geladen, beide Teile mit "shift"-Operationen zurechterückt und mit einem +// logischen Oder zusammenkopiert. 
+// - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- +// sichert werden, da die Register für die register-Rotation benötigt werden. +// - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase +// sind im Code verblieben. Sie sollten niemanden stören. +// +/////////////////////////////////////////////////////////////////////////////// // *** define Latencies for software pipilines *** @@ -55,14 +91,14 @@ .text -/**************************************************************************** -* -* transfer8x8_copy_ia64 -* -* SRC is missaligned, to align the source load two 8-bytes-words, shift it, -* join them and store the aligned source into the destination address. -* -****************************************************************************/ +/////////////////////////////////////////////////////////////////////////////// +// +// transfer8x8_copy_ia64 +// +// SRC is missaligned, to align the source load two 8-bytes-words, shift it, +// join them and store the aligned source into the destination address. 
+// +/////////////////////////////////////////////////////////////////////////////// .align 16 .global transfer8x8_copy_ia64# @@ -85,14 +121,13 @@ offset = r18 // shift right offset aoffset = r19 // shift left offset - - .body - // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** .save ar.lc, oldLC mov oldLC = ar.lc mov oldPR = pr - + + .body + // *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** alloc r9 = ar.pfs, 3, 29, 0, 32 @@ -117,12 +152,15 @@ // src_v1 = source value 1, shd_r = shifted right, shd_l = shifted left .rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] - -/* Software pipelined loop: -* Stage 1: Load two 2 bytes from SRC_1, SRC_2 into SRC_v1 and SRC_v2 -* Stage 2: Shift both values of source to SHD_R and SHD_L -* Stage 3: Join both parts together with OR -* Stage 4: Store aligned date to destination and add stride to destination address */ + + +// Software pipelined loop: +// Stage 1: Load two 2 bytes from SRC_1, SRC_2 into SRC_v1 and SRC_v2 +// Stage 2: Shift both values of source to SHD_R and SHD_L +// Stage 3: Join both parts together with OR +// Stage 4: Store aligned date to destination and add stride to destination address + + .Loop_8x8copy: {.mii (ld_stage[0]) ld8 src_v1[0] = [src_1], stride @@ -151,16 +189,16 @@ -/***************************************************************************** -* -* transfer_8to16copy_ia64 -* -* SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, -* UNPACK is used. So 8 bytes are loaded from source, unpacked to two -* 4 x 16 bit values and stored to the destination. Destination is a continuous -* array of 64 x 16 bit signed data. To store the next line, only 16 must be -* added to the destination address. 
-*****************************************************************************/ +/////////////////////////////////////////////////////////////////////////////// +// +// transfer_8to16copy_ia64 +// +// SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, +// UNPACK is used. So 8 bytes are loaded from source, unpacked to two +// 4 x 16 bit values and stored to the destination. Destination is a continuous +// array of 64 x 16 bit signed data. To store the next line, only 16 must be +// added to the destination address. +/////////////////////////////////////////////////////////////////////////////// .align 16 .global transfer_8to16copy_ia64# @@ -181,14 +219,14 @@ src = r16 stride = r17 - - .body - // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** .save ar.lc, oldLC mov oldLC = ar.lc mov oldPR = pr + + .body + // *** Allocating new stackframe, define rotating registers *** alloc r9 = ar.pfs, 4, 92, 0, 96 @@ -208,11 +246,14 @@ // src_v = source value, dst_v1 = destination value 1 .rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] .rotp ld_stage[LL], upack_stage[UL], st_stage[1] + + +// Software pipelined loop: +// Stage 1: Load value of SRC +// Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data +// Stage 3: Store both 8 byte of 16 bit data + -/* Software pipelined loop: -* Stage 1: Load value of SRC -* Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data -* Stage 3: Store both 8 byte of 16 bit data */ .Loop_8to16copy: {.mii (ld_stage[0]) ld8 src_v[0] = [src], stride @@ -236,15 +277,15 @@ -/***************************************************************************** -* -* transfer_16to8copy_ia64 -* -* src is a 64 x 16 bit signed continuous array. To convert the 16 bit -* values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of -* 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word -* of 8 x 8 unsigned data to the destination. 
-****************************************************************************/ +/////////////////////////////////////////////////////////////////////////////// +// +// transfer_16to8copy_ia64 +// +// src is a 64 x 16 bit signed continuous array. To convert the 16 bit +// values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of +// 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word +// of 8 x 8 unsigned data to the destination. +/////////////////////////////////////////////////////////////////////////////// .align 16 .global transfer_16to8copy_ia64# @@ -258,14 +299,14 @@ src_2 = r17 stride = r16 - - .body - // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** .save ar.lc, oldLC mov oldLC = ar.lc mov oldPR = pr + + .body + // *** Allocating new stackframe, define rotating registers *** alloc r9 = ar.pfs, 4, 92, 0, 96 @@ -287,11 +328,13 @@ .rotp ld_stage[LL], pack_stage[PL], st_stage[1] -/* Software pipelined loop: -* Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data -* Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data -* Stage 3: Store the 8 byte to the destination address and add stride to -* destination address (to get the next 8 byte line of destination)*/ +// Software pipelined loop: +// Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data +// Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data +// Stage 3: Store the 8 byte to the destination address and add stride to +// destination address (to get the next 8 byte line of destination) + + .Loop_16to8copy: {.mmi (ld_stage[0]) ld8 src_v1[0] = [src_1], 16 @@ -314,16 +357,16 @@ -/***************************************************************************** -* -* transfer_16to8add_ia64 -* -* The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- -* bit-values. These are "parallel-added" to the values of src. 
The result is -* converted into 8-bit-values using "PACK" and stored at the adress of dst. -* We assume that there is no misalignment. -* -*****************************************************************************/ +/////////////////////////////////////////////////////////////////////////////// +// +// transfer_16to8add_ia64 +// +// The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- +// bit-values. These are "parallel-added" to the values of src. The result is +// converted into 8-bit-values using "PACK" and stored at the address of dst. +// We assume that there is no misalignment. +// +/////////////////////////////////////////////////////////////////////////////// .align 16 .global transfer_16to8add_ia64# @@ -339,14 +382,14 @@ _src = r17 - - .body - // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** .save ar.lc, r2 mov oldLC = ar.lc mov oldPR = pr + + .body + // *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** alloc r9 = ar.pfs, 4, 92, 0, 96 @@ -367,13 +410,13 @@ .rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] -/* Software pipelined loop: - * s1_p: The values of src and dst are loaded - * s2_p: The dst-values are converted to 16-bit-values - * s3_p: The values of src and dst are added - * s4_p: The Results are packed into 8-bit-values - * s5_p: The 8-bit-values are stored at the dst-adresses - */ +// Software pipelined loop: +// s1_p: The values of src and dst are loaded +// s2_p: The dst-values are converted to 16-bit-values +// s3_p: The values of src and dst are added +// s4_p: The Results are packed into 8-bit-values +// s5_p: The 8-bit-values are stored at the dst-addresses + .Loop_16to8add: {.mii @@ -407,18 +450,18 @@ -/***************************************************************************** -* -* transfer_8to16sub_ia64 -* -* The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. 
The -* Difference of cur and ref ist stored at the dct-adresses and cur is copied -* into the ref-array. -* -* You must assume, that the data adressed by 'ref' are misaligned in memory. -* But you can assume, that the other data are aligned (at least I hope so). -* -****************************************************************************/ +/////////////////////////////////////////////////////////////////////////////// +// +// transfer_8to16sub_ia64 +// +// The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The +// Difference of cur and ref is stored at the dct-addresses and cur is copied +// into the ref-array. +// +// You must assume, that the data addressed by 'ref' are misaligned in memory. +// But you can assume, that the other data are aligned (at least I hope so). +// +/////////////////////////////////////////////////////////////////////////////// .align 16 .global transfer_8to16sub_ia64# @@ -447,14 +490,14 @@ _dct = r21 // Register für die Zieladressen des 2. dct-Blocks - - .body - // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** .save ar.lc, r2 mov oldLC = ar.lc mov oldPR = pr + + .body + // *** Allocating new stackframe, define rotating registers *** alloc r9 = ar.pfs, 4, 92, 0, 96 @@ -482,16 +525,16 @@ .rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] -/* Software pipelined loop: - * s1_p: The values of ref and cur ale loaded, a copy of cur is made. - * s2_p: cur is converted to 16-bit and thehe misaligned values of ref are - * shifted... - * s3_p: ... and copied together. - * s4_p: This ref-value is converted to 16-bit. The values of cur are stored - * at the ref-adresses. - * s5_p: the ref- abd cur-values are substracted... - * s6_p: ...and the result is stored at the dct-adresses. - */ +// Software pipelined loop: +// s1_p: The values of ref and cur are loaded, a copy of cur is made. +// s2_p: cur is converted to 16-bit and the misaligned values of ref are +// shifted... +// s3_p: ... 
and copied together. +// s4_p: This ref-value is converted to 16-bit. The values of cur are stored +// at the ref-addresses. +// s5_p: the ref- and cur-values are subtracted... +// s6_p: ...and the result is stored at the dct-addresses. + loop_8to16sub: {.mii @@ -537,20 +580,20 @@ -/***************************************************************************** -* -* transfer_8to16sub2_ia64 -* -* At the time, this function was written, it was not yet in use. -* We assume that the values of ref1/2 are misaligned. -* -* The values of ref1/2 and cur are loaded, the ref-values need misalignment- -* treatment. The values are converted to 16-bit using unpack. The average of -* ref1 and ref2 is computed with pavg and substacted from cur. The results are -* stored at the dct-adresses. -* pavg1.raz is used to get the same results as the C-code-function. 
+// +/////////////////////////////////////////////////////////////////////////////// .text .align 16 @@ -582,14 +625,14 @@ aoffset_1 = r24 aoffset_2 = r25 - - .body - // *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** .save ar.lc, r2 mov oldLC = ar.lc mov oldPR = pr + + .body + // *** Saving Paramters *** // *** (as inputregisters r32 + are needed for register-rotation) *** mov dct_ar = r32 @@ -623,16 +666,17 @@ // *** define register arrays and predicate array for software pipeline *** .rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] + -/* software pipelined loop: - * ld_stage: The values of ref1, ref2, cur are loaded - * sh_stage: The misaligned values of ref1/2 are shifted... - * or_stage: ...and copied together. - * pavg_stage: The average of ref1 and ref2 is computed. - * up_stage: The result and the cur-values are converted to 16-bit. - * psub_stage: Those values are substracted... - * st_stage: ...and stored at the dct-adresses. - */ +// software pipelined loop: +// ld_stage: The values of ref1, ref2, cur are loaded +// sh_stage: The misaligned values of ref1/2 are shifted... +// or_stage: ...and copied together. +// pavg_stage: The average of ref1 and ref2 is computed. +// up_stage: The result and the cur-values are converted to 16-bit. +// psub_stage: Those values are subtracted... +// st_stage: ...and stored at the dct-addresses. + .Loop_8to16sub2: {.mii