Diff of /xvidcore/src/utils/ia64_asm/mem_transfer_ia64.s

-revision 1.1, Fri Jun 14 08:22:39 2002 UTC
+revision 1.4, Sun Nov 17 00:51:11 2002 UTC
 Line 1
-         .file   "mem_transfer.c"
+ ///////////////////////////////////////////////////////////////////////////////
-         .pred.safe_across_calls p1-p5,p16-p63
+ //
-         .common transfer_8to16copy#,8,8
+ // mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel,
+ // University of Karlsruhe, Germany, 03.06.2002, during the laboratory
+ // "IA-64 Video Codec Assembler Praktikum" at IPD Goos.
+ //
+ //
+ ///// legal header taken from original C-file ///////////////////////////////////////
+ //
+ // * XVID MPEG-4 VIDEO CODEC
+ // * - 8bit<->16bit transfer  -
+ // *
+ // *  This file is part of XviD, a free MPEG-4 video encoder/decoder
+ // *
+ // *  XviD is free software; you can redistribute it and/or modify it
+ // *  under the terms of the GNU General Public License as published by
+ // *  the Free Software Foundation; either version 2 of the License, or
+ // *  (at your option) any later version.
+ // *
+ // *  This program is distributed in the hope that it will be useful,
+ // *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ // *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ // *  GNU General Public License for more details.
+ // *
+ // *  You should have received a copy of the GNU General Public License
+ // *  along with this program; if not, write to the Free Software
+ // *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ // *
+ // *  Under section 8 of the GNU General Public License, the copyright
+ // *  holders of XVID explicitly forbid distribution in the following
+ // *  countries:
+ // *
+ // *    - Japan
+ // *    - United States of America
+ // *
+ // *  Linking XviD statically or dynamically with other modules is making a
+ // *  combined work based on XviD.  Thus, the terms and conditions of the
+ // *  GNU General Public License cover the whole combination.
+ // *
+ // *  As a special exception, the copyright holders of XviD give you
+ // *  permission to link XviD with independent modules that communicate with
+ // *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
+ // *  license terms of these independent modules, and to copy and distribute
+ // *  the resulting combined work under terms of your choice, provided that
+ // *  every copy of the combined work is accompanied by a complete copy of
+ // *  the source code of XviD (the version of XviD used to produce the
+ // *  combined work), being distributed under the terms of the GNU General
+ // *  Public License plus this exception.  An independent module is a module
+ // *  which is not derived from or based on XviD.
+ // *
+ // *  Note that people who make modified versions of XviD are not obligated
+ // *  to grant this special exception for their modified versions; it is
+ // *  their choice whether to do so.  The GNU General Public License gives
+ // *  permission to release a modified version without this exception; this
+ // *  exception also makes it possible to release a modified version which
+ // *  carries forward this exception.
+ // *
+ // * $Id$
+ //
+ ///// History /////////////////////////////////////////////////////////////////
+ //
+ // - 16.07.2002: several minor changes for ecc-conformity
+ // - 03.06.2002: initial version
+ //
+ ///////////////////////////////////////////////////////////////////////////////
+ //
+ // Annotations:
+ // ===========
+ //
+ // - All functions work on 8x8-matrices. While the C-code-functions treat each
+ //   element separately, the functions in this assembler-code treat a whole line
+ //   simultaneously. So one loop is saved.
+ //   The remaining loop is realized by using software pipelining with rotating
+ //   registers.
+ // - Register renaming is used for better readability
+ // - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both
+ //   parts are shifted and joined together with an "OR"-Instruction.
+ // - First parameter is stored in GR 32, next in GR 33, and so on. They must be
+ //   saved, as these GRs are used for register-rotation.
+ // - Some of the original, German comments used during development are left
+ //   in the code. They shouldn't bother anyone.
+ //
+ // Anmerkungen:
+ // ============
+ //
+ // - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code
+ //   jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler-
+ //   Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden.
+ //   Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit
+ //   rotierenden Registern realisiert.
+ // - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
+ // - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke
+ //   geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem
+ //   logischen Oder zusammenkopiert.
+ // - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge-
+ //   sichert werden, da die Register für die Register-Rotation benötigt werden.
+ // - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase
+ //   sind im Code verblieben. Sie sollten niemanden stören.
+ //
+ ///////////////////////////////////////////////////////////////////////////////
+ //      *** Latencies for the software pipelines: each value is the number of pipeline stages (loop iterations) reserved for that operation ***
+         LL  = 3 // Load
+         SL  = 3 // Store
+         PL  = 1 // Pack
+         SHL = 1 // Shift
+         OL  = 1 // Or
+         UL  = 1 // Unpack
+         PAL = 1 // Parallel Add
+         PSL = 1 // Parallel Subtract
+         PAVGL = 1 // Parallel Average
  .text
+ ///////////////////////////////////////////////////////////////////////////////
+ //
+ // transfer8x8_copy_ia64 -- parameters as read below: r32 = dst, r33 = src, r34 = stride
+ //
+ // Copies 8 rows of 8 bytes from src to dst. src may be misaligned, so for every row the
+ // two aligned 8-byte words around it are loaded, shifted, ORed together and stored to dst.
+ // src and dst both advance by stride per row (dst is presumably 8-byte aligned -- confirm).
+ ///////////////////////////////////////////////////////////////////////////////
+         .align 16
+         .global transfer8x8_copy_ia64#
+         .proc transfer8x8_copy_ia64#
+ transfer8x8_copy_ia64:
+         .prologue
+ //      *** register renaming ***
+         zero = r0
+         oldLC = r2
+         oldPR = r3
+         src_1 = r14 // address of the aligned 8-byte word that contains the first src byte
+         src_2 = r15 // address of the following aligned 8-byte word (src_1 + 8)
+         dst = r16  // destination address
+         stride = r17 // byte distance between rows; added to src_1, src_2 and dst
+         offset = r18 // shift-right count in bits: (src & 7) << 3
+         aoffset = r19 // shift-left count in bits: 64 - offset
+ //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
+         .save ar.lc, oldLC
+         mov oldLC = ar.lc
+         mov oldPR = pr
+         .body
+ //      *** Allocate the register frame: 3 inputs + 29 locals, all 32 of them rotating ***
+         alloc r9 = ar.pfs, 3, 29, 0, 32
+ //      *** Saving parameters (r32.. are reused by the rotating register arrays below) ***
+         mov dst = r32
+         mov stride = r34
+ //      *** Misalignment treatment ***
+         and src_1 = -8, r33 // round src down to the aligned 8-byte word that contains it
+         dep offset = r33, zero, 3, 3 // low 3 address bits moved to bit 3: byte offset * 8 = bit count for shr.u
+         ;;
+         sub aoffset = 64, offset // complementary bit count for shl; NOTE(review): this is 64 when src is already aligned -- confirm that shl by 64 yields 0
+         add src_2 = 8, src_1 // address of the second aligned word, which holds the remaining src bytes
+ //      *** init loop: set loop counter, epilog counter, predicates ***
+         mov ar.lc = 7 // 8 rows
+         mov ar.ec = LL + SHL + OL + 1 // same total as the number of stage predicates declared by .rotp below
+         mov pr.rot = 1 << 16 // of the rotating predicates only p16 starts out set
+         ;;
+ //      *** define register arrays and predicate array for software pipeline ***
+         // src_v1/src_v2 = the two loaded words, shd_r = shifted right, shd_l = shifted left, value = ORed result
+         .rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
+         .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]
+ //      Software pipelined loop:
+ //      Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2
+ //      Stage 2: Shift both values of source to SHD_R and SHD_L
+ //      Stage 3: Join both parts together with OR
+ //      Stage 4: Store aligned data to destination and add stride to destination address
+ .Loop_8x8copy:
+         {.mii
+                 (ld_stage[0]) ld8 src_v1[0] = [src_1], stride // first aligned word of the row; src_1 += stride
+                 (sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset // uses the word loaded LL stages earlier
+         }
+         {.mii
+                 (ld_stage[0]) ld8 src_v2[0] = [src_2], stride // second aligned word of the row; src_2 += stride
+                 (sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset // uses the word loaded LL stages earlier
+                 (or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL] // combine both shifted parts into one 8-byte row
+         }
+         {.mib
+                 (st_stage[0]) st8 [dst] = value[OL] // write the assembled row
+                 (st_stage[0]) add dst = dst, stride // advance dst to the next row
+                 br.ctop.sptk.few .Loop_8x8copy
+                 ;;
+         }
+ //      *** Restore old LC and PRs ***
+         mov ar.lc = oldLC
+         mov pr = oldPR, -1
+         br.ret.sptk.many b0
+         .endp transfer8x8_copy_ia64#
+ ///////////////////////////////////////////////////////////////////////////////
+ //
+ // transfer_8to16copy_ia64
+ //
+ // SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values,
+ // UNPACK is used. So 8 bytes are loaded from source, unpacked to two
+ // 4 x 16 bit values and stored to the destination. Destination is a continuous
+ // array of 64 x 16 bit signed data. To store the next line, only 16 must be
+ // added to the destination address.
+ ///////////////////////////////////////////////////////////////////////////////
          .align 16
          .global transfer_8to16copy_ia64#
          .proc transfer_8to16copy_ia64#
  transfer_8to16copy_ia64:
          .prologue
-         .save ar.lc, r2
-         mov r2 = ar.lc
+ //      *** register renaming ***
+         oldLC = r2
+         oldPR = r3
+         zero = r0 // damit ist die Zahl "zero" = 0 gemeint
+         dst_1 = r14 // destination address for first 4 x 16 bit values
+         dst_2 = r15 // destination address for second 4 x 16 bit values
+         src = r16
+         stride = r17
+ //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
+         .save ar.lc, oldLC
+         mov oldLC = ar.lc
+         mov oldPR = pr
          .body
-         addl r14 = 7, r0
-         mov r21 = r0
+ //      *** Allocating new stackframe, define rotating registers ***
-         mov r20 = r0
+         alloc r9 = ar.pfs, 4, 92, 0, 96
-         ;;
-         mov ar.lc = r14
+ //      *** Saving Paramters ***
-         ;;
+         mov dst_1 = r32 // fist 4 x 16 bit values
- .L101:
+         add dst_2 = 8, r32 // second 4 x 16 bit values
-         addl r19 = 1, r0
+         mov src = r33
-         zxt4 r14 = r21
+         mov stride = r34
-         dep.z r15 = r20, 1, 32
-         ;;
+ //      *** init loop: set loop counter, epilog counter, predicates ***
-         add r16 = r21, r19
+         mov ar.lc = 7
-         add r14 = r33, r14
+         mov ar.ec = LL + UL + 1
-         add r17 = r20, r19
+         mov pr.rot = 1 << 16
          ;;
-         ld1 r18 = [r14]
-         add r15 = r15, r32
+ //      *** define register arrays and predicate array for software pipeline ***
-         zxt4 r16 = r16
+         // src_v = source value, dst_v1 = destination value 1
-         ;;
+         .rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1]
-         st2 [r15] = r18
+         .rotp ld_stage[LL], upack_stage[UL], st_stage[1]
-         addl r19 = 2, r0
-         add r16 = r33, r16
-         dep.z r17 = r17, 1, 32
+ //      Software pipelined loop:
-         ;;
+ //      Stage 1: Load value of SRC
-         ld1 r15 = [r16]
+ //      Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data
-         add r14 = r21, r19
+ //      Stage 3: Store both 8 byte of 16 bit data
-         add r18 = r20, r19
-         add r17 = r17, r32
-         ;;
+ .Loop_8to16copy:
-         zxt4 r14 = r14
+         {.mii
-         st2 [r17] = r15
+                 (ld_stage[0]) ld8 src_v[0] = [src], stride
-         addl r19 = 3, r0
+                 (upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL]
-         ;;
+                 (upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL]
-         add r14 = r33, r14
+         }
-         add r15 = r21, r19
+         {.mmb
-         dep.z r18 = r18, 1, 32
+                 (st_stage[0]) st8 [dst_1] = dst_v1[UL], 16
-         ;;
+                 (st_stage[0]) st8 [dst_2] = dst_v2[UL], 16
-         ld1 r17 = [r14]
+                 br.ctop.sptk.few .Loop_8to16copy
-         add r16 = r20, r19
+                 ;;
-         add r18 = r18, r32
+         }
-         zxt4 r15 = r15
-         ;;
+ //      *** Restore old LC and PRs ***
-         st2 [r18] = r17
+         mov ar.lc = oldLC
-         addl r19 = 4, r0
+         mov pr = oldPR, -1
-         add r15 = r33, r15
-         dep.z r16 = r16, 1, 32
-         ;;
-         ld1 r18 = [r15]
-         add r14 = r21, r19
-         add r17 = r20, r19
-         add r16 = r16, r32
-         ;;
-         zxt4 r14 = r14
-         st2 [r16] = r18
-         addl r19 = 5, r0
-         ;;
-         add r14 = r33, r14
-         add r15 = r21, r19
-         add r16 = r20, r19
-         dep.z r17 = r17, 1, 32
-         ;;
-         ld1 r18 = [r14]
-         addl r19 = 6, r0
-         add r17 = r17, r32
-         zxt4 r15 = r15
-         ;;
-         st2 [r17] = r18
-         add r14 = r21, r19
-         add r15 = r33, r15
-         dep.z r16 = r16, 1, 32
-         add r17 = r20, r19
-         ;;
-         ld1 r18 = [r15]
-         add r16 = r16, r32
-         zxt4 r14 = r14
-         ;;
-         st2 [r16] = r18
-         addl r19 = 7, r0
-         add r14 = r33, r14
-         ;;
-         ld1 r15 = [r14]
-         add r16 = r21, r19
-         dep.z r17 = r17, 1, 32
-         add r14 = r20, r19
-         ;;
-         add r17 = r17, r32
-         zxt4 r16 = r16
-         ;;
-         st2 [r17] = r15
-         dep.z r14 = r14, 1, 32
-         add r16 = r33, r16
-         ;;
-         add r14 = r14, r32
-         ld1 r15 = [r16]
-         add r21 = r21, r34
-         ;;
-         st2 [r14] = r15
-         adds r20 = 8, r20
-         br.cloop.sptk.few .L101
-         ;;
-         mov ar.lc = r2
          br.ret.sptk.many b0
          .endp transfer_8to16copy_ia64#
-         .common transfer_16to8copy#,8,8
+ ///////////////////////////////////////////////////////////////////////////////
+ //
+ // transfer_16to8copy_ia64
+ //
+ // src is a contiguous array of 64 x 16 bit signed values. To convert the 16 bit
+ // values to 8 bit unsigned data, PACK is used: two 8-byte words holding
+ // 4 x 16 bit signed values each are loaded, packed together and stored as one
+ // 8-byte word of 8 x 8 bit unsigned data to the destination.
+ ///////////////////////////////////////////////////////////////////////////////
          .align 16
          .global transfer_16to8copy_ia64#
          .proc transfer_16to8copy_ia64#
  transfer_16to8copy_ia64:
          .prologue
+ //      *** register renaming ***
+         dst = r14
+         src_1 = r15
+         src_2 = r17
+         stride = r16
+ //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
+         .save ar.lc, oldLC
+         mov oldLC = ar.lc
+         mov oldPR = pr
          .body
-         mov r22 = r0
-         addl r21 = 255, r0
+ //      *** Allocating new stackframe, define rotating registers ***
-         mov r20 = r0
+         alloc r9 = ar.pfs, 4, 92, 0, 96
-         mov r19 = r0
- .L25:
+ //      *** Saving Paramters ***
-         mov r18 = r0
+         mov dst = r32
-         ;;
+         mov src_1 = r33
- .L29:
+         add src_2 = 8, r33
-         add r14 = r19, r18
+         mov stride = r34
-         ;;
-         dep.z r14 = r14, 1, 32
+ //      *** init loop: set loop counter, epilog counter, predicates ***
-         ;;
+         mov ar.lc = 7
-         add r14 = r14, r33
+         mov ar.ec = LL + PL + 1
-         ;;
+         mov pr.rot = 1 << 16
-         ld2 r15 = [r14]
+         ;;
-         ;;
-         sxt2 r15 = r15
+ //      *** define register arrays and predicate array for software pipeline ***
-         ;;
+         // src_v1 = source value 1, dst_v = destination value
-         mov r16 = r15
+         .rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1]
-         ;;
+         .rotp ld_stage[LL], pack_stage[PL], st_stage[1]
-         cmp4.le p6, p7 = r0, r16
-         ;;
-         (p7) mov r16 = r0
+ //      Software pipelined loop:
-         (p7) br.cond.dpnt .L106
+ //      Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data
-         ;;
+ //      Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data
-         cmp4.ge p6, p7 = r21, r16
+ //      Stage 3: Store the 8 byte to the destination address and add stride to
-         ;;
+ //               destination address (to get the next 8 byte line of destination)
-         (p7) addl r16 = 255, r0
- .L106:
-         add r14 = r20, r18
+ .Loop_16to8copy:
-         adds r17 = 1, r18
+         {.mmi
-         ;;
+                 (ld_stage[0]) ld8 src_v1[0] = [src_1], 16
-         zxt4 r14 = r14
+                 (ld_stage[0]) ld8 src_v2[0] = [src_2], 16
-         add r15 = r19, r17
+                 (pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL]
-         ;;
+         }
-         add r14 = r32, r14
+         {.mib
-         dep.z r15 = r15, 1, 32
+                 (st_stage[0]) st8 [dst] = dst_v[PL]
-         ;;
+                 (st_stage[0]) add dst = dst, stride
-         st1 [r14] = r16
+                 br.ctop.sptk.few .Loop_16to8copy
-         add r15 = r15, r33
+                 ;;
-         ;;
+         }
-         ld2 r14 = [r15]
-         ;;
+ //      *** Restore old LC and PRs ***
-         sxt2 r14 = r14
+         mov ar.lc = oldLC
-         ;;
+         mov pr = oldPR, -1
-         mov r16 = r14
-         ;;
-         cmp4.le p6, p7 = r0, r16
-         ;;
-         (p7) mov r16 = r0
-         (p7) br.cond.dpnt .L110
-         ;;
-         cmp4.ge p6, p7 = r21, r16
-         ;;
-         (p7) addl r16 = 255, r0
- .L110:
-         add r14 = r20, r17
-         adds r17 = 2, r18
-         ;;
-         zxt4 r14 = r14
-         add r15 = r19, r17
-         ;;
-         add r14 = r32, r14
-         dep.z r15 = r15, 1, 32
-         ;;
-         st1 [r14] = r16
-         add r15 = r15, r33
-         ;;
-         ld2 r14 = [r15]
-         ;;
-         sxt2 r14 = r14
-         ;;
-         mov r16 = r14
-         ;;
-         cmp4.le p6, p7 = r0, r16
-         ;;
-         (p7) mov r16 = r0
-         (p7) br.cond.dpnt .L114
-         ;;
-         cmp4.ge p6, p7 = r21, r16
-         ;;
-         (p7) addl r16 = 255, r0
- .L114:
-         add r14 = r20, r17
-         adds r17 = 3, r18
-         ;;
-         zxt4 r14 = r14
-         add r15 = r19, r17
-         ;;
-         add r14 = r32, r14
-         dep.z r15 = r15, 1, 32
-         ;;
-         st1 [r14] = r16
-         add r15 = r15, r33
-         ;;
-         ld2 r14 = [r15]
-         ;;
-         sxt2 r14 = r14
-         ;;
-         mov r15 = r14
-         ;;
-         cmp4.le p6, p7 = r0, r15
-         ;;
-         (p7) mov r15 = r0
-         (p7) br.cond.dpnt .L118
-         ;;
-         cmp4.ge p6, p7 = r21, r15
-         ;;
-         (p7) addl r15 = 255, r0
- .L118:
-         add r14 = r20, r17
-         adds r18 = 4, r18
-         ;;
-         zxt4 r14 = r14
-         cmp4.geu p6, p7 = 7, r18
-         ;;
-         add r14 = r32, r14
-         ;;
-         st1 [r14] = r15
-         (p6) br.cond.dptk .L29
-         adds r22 = 1, r22
-         add r20 = r20, r34
-         adds r19 = 8, r19
-         ;;
-         cmp4.geu p6, p7 = 7, r22
-         (p6) br.cond.dptk .L25
          br.ret.sptk.many b0
          .endp transfer_16to8copy_ia64#
-         .common transfer_8to16sub#,8,8
+ ///////////////////////////////////////////////////////////////////////////////
+ //
+ // transfer_16to8add_ia64 -- parameters as read below: r32 = dst, r33 = src, r34 = stride
+ //
+ // For each of 8 rows, the 8-bit values of dst are "unpacked" into two 8-byte blocks of
+ // 16-bit values. These are "parallel-added" to the 16-bit values of src. The sums are
+ // converted back to 8-bit values using "PACK" and stored at the row's dst address.
+ // We assume that there is no misalignment. src advances by 16 bytes per row,
+ // dst advances by stride.
+ ///////////////////////////////////////////////////////////////////////////////
+         .align 16
+         .global transfer_16to8add_ia64#
+         .proc transfer_16to8add_ia64#
+ transfer_16to8add_ia64:
+         .prologue
+ //      *** register renaming (oldLC/oldPR are not assigned here; they keep the r2/r3 assignments made earlier in this file) ***
+         dst = r14
+         src = r15
+         stride = r16
+         _src = r17
+ //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
+         .save ar.lc, r2
+         mov oldLC = ar.lc
+         mov oldPR = pr
+         .body
+ //      *** Allocate the register frame: 4 inputs + 92 locals, all 96 of them rotating ***
+         alloc r9 = ar.pfs, 4, 92, 0, 96
+ //      *** Saving parameters ***
+         mov dst = r32
+         mov src = r33
+         mov stride = r34
+         add _src = 8, r33
+ //      *** init loop: set loop counter, epilog counter, predicates ***
+         mov ar.lc = 7
+         mov ar.ec = LL + UL + PAL + PL + 1
+         mov pr.rot = 1 << 16
+         ;;
+ //      *** define register arrays and predicate array for software pipeline ***
+         .rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1]
+         .rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1]
+ //      Software pipelined loop:
+ //      s1_p: The values of src and dst are loaded; the row's dst address is remembered
+ //      s2_p: The dst-values are converted to 16-bit-values
+ //      s3_p: The values of src and dst are added
+ //      s4_p: The results are packed into 8-bit-values
+ //      s5_p: The 8-bit-values are stored at the remembered dst-address
+ .Loop_16to8add:
+         {.mii
+                 (s1_p[0]) ld8 w_src_1[0] = [src], 16 // load the first half of row j of src (i = 0..3)
+                 (s1_p[0]) mov _dst[0] = dst // remember this row's dst address for the store in the last stage
+                 (s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL] // parallel addition of src and dst
+         }
+         {.mii
+                 (s1_p[0]) ld8 w_dst8[0] = [dst], stride // load row j of dst, then advance dst by stride
+                 (s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]; // dst is converted to 16-bit for i = 0..3
+                 (s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]; // dst is converted to 16-bit for i = 4..7
+         }
+         {.mii
+                 (s1_p[0]) ld8 w_src_2[0] = [_src], 16 // load the second half of row j of src (i = 4..7)
+                 (s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL] // parallel addition of src and dst
+                 (s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL] // pack the sums (pixel) into 8-bit values; the range check is done by the pack instruction itself
+         }
+         {.mmb
+                 (s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL] // store the result row at the dst address remembered in the first stage
+                 (s1_p[0]) nop.m 0
+                 br.ctop.sptk.few .Loop_16to8add
+                 ;;
+         }
+ //      *** Restore old LC and PRs ***
+         mov ar.lc = oldLC
+         mov pr = oldPR, -1
+         br.ret.sptk.many b0
+         .endp transfer_16to8add_ia64#
+ ///////////////////////////////////////////////////////////////////////////////
+ //
+ // transfer_8to16sub_ia64
+ //
+ // The 8-bit values of ref and cur are loaded and both are converted to 16-bit.
+ // The difference of cur and ref is stored at the dct addresses, and the ref
+ // values are copied into the cur array.
+ //
+ // You must assume that the data addressed by 'ref' are misaligned in memory.
+ // But you can assume that the other data are aligned (at least I hope so).
+ //
+ ///////////////////////////////////////////////////////////////////////////////
          .align 16
          .global transfer_8to16sub_ia64#
          .proc transfer_8to16sub_ia64#
  transfer_8to16sub_ia64:
          .prologue
+ //      *** register renaming ***
+         oldLC = r2
+         oldPR = r3
+         zero = r0 // damit ist die Zahl "zero" = 0 gemeint
+         //Die folgenden Register erhalten die gleichen Namen, wie die Variablen in der C-Vorlage
+         dct = r14
+         cur = r15
+         ref = r34 // muss nicht extra gesichert werden, deswegen bleibt das �bergabeRegister in dieser Liste
+         stride = r16
+         offset = r17 // Offset der falsch ausgerichteten Daten zum zurechtr�cken
+         aoffset = r18 // Gegenst�ck zum Offset,
+         ref_a1 = r19 // Adresse des ersten 64-Bit Blocks von ref
+         ref_a2 = r20 // Adresse des zweiten 64-Bit Blocks von ref
+         _dct = r21 // Register f�r die Zieladressen des 2. dct-Blocks
+ //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
+         .save ar.lc, r2
+         mov oldLC = ar.lc
+         mov oldPR = pr
          .body
-         mov r25 = r0
-         mov r24 = r0
+ //      *** Allocating new stackframe, define rotating registers ***
-         mov r23 = r0
+         alloc r9 = ar.pfs, 4, 92, 0, 96
- .L39:
-         mov r22 = r0
+ //      *** Saving Paramters ***
-         ;;
+         mov dct = r32
- .L43:
+         mov cur = r33
-         add r15 = r23, r22
+         // mov ref = r34: ref is unaligned, get aligned ref below...
-         adds r20 = 1, r22
+         mov stride = r35
-         add r16 = r24, r22
-         ;;
+         and ref_a1 = -8, ref // Die Adresse des ersten 64-Bit Blocks, in dem ref liegt, wird berechnet (entspricht mod 8)
-         zxt4 r15 = r15
+         dep offset = ref, zero, 3, 3
-         add r18 = r23, r20
+         ;;
-         dep.z r16 = r16, 1, 32
+         add ref_a2 = 8, ref_a1
-         ;;
+         sub aoffset = 64, offset // Gegenst�ck zum Offset wird berechnet
-         add r19 = r34, r15
+         add _dct = 8, dct // Die Adresse f�r den 2. dct-Block wird berechnet, um 8 Byte (= 64 Bit) h�her als beim 1. Block
-         zxt4 r18 = r18
-         add r16 = r16, r32
+ //      *** init loop: set loop counter, epilog counter, predicates ***
-         add r15 = r33, r15
+         mov ar.lc = 7
-         ;;
+         mov ar.ec = LL + SHL + OL + UL + PSL + 1
-         ld1 r14 = [r19]
+         mov pr.rot = 1 << 16
-         add r21 = r34, r18
+         ;;
-         ld1 r17 = [r15]
-         adds r19 = 2, r22
+ //      *** define register arrays and predicate array for software pipeline ***
-         add r18 = r33, r18
+         .rotr  c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1],  dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1]
-         ;;
+         .rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1]
-         st1 [r15] = r14
-         sub r17 = r17, r14
-         add r20 = r24, r20
+ //      Software pipelined loop:
-         ;;
+ //      s1_p: The values of ref and cur are loaded, a copy of the cur address is made.
-         st2 [r16] = r17
+ //      s2_p: cur is converted to 16-bit and the misaligned values of ref are
-         dep.z r20 = r20, 1, 32
+ //            shifted...
-         ld1 r14 = [r21]
+ //      s3_p: ... and copied together.
-         ld1 r15 = [r18]
+ //      s4_p: This ref-value is converted to 16-bit. The values of ref are stored
-         add r16 = r23, r19
+ //            at the cur-addresses.
-         ;;
+ //      s5_p: the ref-values are subtracted from the cur-values...
-         st1 [r18] = r14
+ //      s6_p: ...and the result is stored at the dct-adresses.
-         sub r15 = r15, r14
-         zxt4 r16 = r16
-         add r20 = r20, r32
+ loop_8to16sub:
-         ;;
+         {.mii
-         add r18 = r34, r16
+                 (s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride // l�d den 1. 64-Bit-Block, der einen Teil der ref-Daten enth�lt
-         adds r17 = 3, r22
+                 (s1_p[0]) mov _cur[0] = cur // cur wird f�r sp�tere Verwendung gesichert
-         st2 [r20] = r15
+                 (s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset // Die rechte H�lfte wird zurechtger�ckt
-         add r16 = r33, r16
+         }
-         add r19 = r24, r19
+         {.mii
-         ;;
+                 (s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride // l�d den 2. 64-Bit-Block
-         ld1 r14 = [r18]
+                 (s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset // Die linke H�lfte wird zurechtger�ckt
-         add r15 = r23, r17
+                 (s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL] // Die zurechtger�ckten Daten werden in r zusammenkopiert
-         dep.z r19 = r19, 1, 32
+         }
-         ld1 r18 = [r16]
+         {.mii
-         ;;
+                 (s1_p[0]) ld8 c[0] = [cur], stride //l�d die j. Zeile von cur komplett
-         zxt4 r15 = r15
+                 (s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]; // c wird f�r i = 0..3 in 16-Bit umgewandelt
-         add r19 = r19, r32
+                 (s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]; // c wird f�r i = 4..7 in 16-Bit umgewandelt
-         st1 [r16] = r14
+         }
-         sub r18 = r18, r14
+         {.mii
-         ;;
+                 (s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL] // cur wird auf den Wert von r gesetzt
-         add r20 = r34, r15
+                 //Umwandeln der 8-Bit r und c -Werte in 16-bit Werte
-         st2 [r19] = r18
+                 (s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]; // r wird f�r i = 0..3 in 16-Bit umgewandelt
-         add r15 = r33, r15
+                 (s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]; // r wird f�r i = 4..7 in 16-Bit umgewandelt
-         add r17 = r24, r17
+         }
-         ;;
+         {.mii
-         ld1 r14 = [r20]
+                 (s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL] // Subtraktion der 1. H�fte der j. Zeile
-         ld1 r16 = [r15]
+                 (s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL] // Subtraktion der 2. H�lfte
-         dep.z r17 = r17, 1, 32
+         }
-         ;;
+         {.mmb
-         add r17 = r17, r32
+                 (s6_p[0]) st8 [dct] = dct_1[PSL], 16 // speichert den 1. 64-Bit-Block an der vorgesehenen Adresse, erh�hen der Adresse um 16 Byte f�r den n�chsten Wert
-         adds r22 = 4, r22
+                 (s6_p[0]) st8 [_dct] = dct_2[PSL], 16 // speichert den 2. 64-Bit-Block an der vorgesehenen Adresse, erh�hen der Adresse um 16 Byte f�r den n�chsten Wert
-         st1 [r15] = r14
+                 br.ctop.sptk.few loop_8to16sub // Und hopp
-         sub r16 = r16, r14
+                 ;;
-         ;;
+         }
-         cmp4.geu p6, p7 = 7, r22
-         st2 [r17] = r16
+ //      *** Restore old LC and PRs ***
-         (p6) br.cond.dptk .L43
+         mov ar.lc = oldLC
-         adds r25 = 1, r25
+         mov pr = oldPR, -1
-         adds r24 = 8, r24
-         add r23 = r23, r35
-         ;;
-         cmp4.geu p6, p7 = 7, r25
-         (p6) br.cond.dptk .L39
          br.ret.sptk.many b0
          .endp transfer_8to16sub_ia64#
-         .common transfer_8to16sub2#,8,8
+ ///////////////////////////////////////////////////////////////////////////////
+ //
+ // transfer_8to16sub2_ia64
+ //
+ // At the time, this function was written, it was not yet in use.
+ // We assume that the values of ref1/2 are misaligned.
+ //
+ // The values of ref1/2 and cur are loaded, the ref-values need misalignment-
+ // treatment. The values are converted to 16-bit using unpack. The average of
+ // ref1 and ref2 is computed with pavg and subtracted from cur. The results are
+ // stored at the dct-addresses.
+ // pavg1.raz is used to get the same results as the C-code-function.
+ //
+ ///////////////////////////////////////////////////////////////////////////////
+         .text
          .align 16
          .global transfer_8to16sub2_ia64#
          .proc transfer_8to16sub2_ia64#
  transfer_8to16sub2_ia64:
          .prologue
+ //      *** register renaming ***
+         //      We've tried to keep the C-Code names as often as possible, at least as
+         //      part of register-names
+         oldLC = r2
+         oldPR = r3
+         zero = r0
+         dct_al = r14 // dct: adress of left block in one line
+         dct_ar = r15 // dct: adress of right block in one line
+         cur = r16
+         ref1_al = r17 // ref1: aligned adress of lower part
+         ref1_ah = r18 // ref1: aligned adress of higher part
+         ref2_al = r19 // ref2: aligned adress of lower part
+         ref2_ah = r20 // ref2: aligned adress of higher part
+         stride = r21
+         offset_1 = r22
+         offset_2 = r23
+         aoffset_1 = r24
+         aoffset_2 = r25
+ //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
          .save ar.lc, r2
-         mov r2 = ar.lc
+         mov oldLC = ar.lc
+         mov oldPR = pr
          .body
-         mov r28 = r0
-         addl r27 = 255, r0
+ //      *** Saving Paramters ***
-         mov r26 = r0
+ //      *** (as inputregisters r32 + are needed for register-rotation) ***
-         mov r25 = r0
+         mov dct_ar = r32
- .L50:
+         add dct_al = 8, r32
-         addl r14 = 3, r0
+         mov cur = r33
-         mov r21 = r0
-         ;;
+         and ref1_al = -8, r34
-         mov ar.lc = r14
+         and ref2_al = -8, r35   // ref2 aligned adrress of lower part
-         ;;
- .L138:
+         mov stride = r36
-         add r14 = r26, r21
-         add r17 = r25, r21
+ //      ***     Calculations for Misaligment-Handling ***
-         adds r19 = 1, r21
+         dep offset_1 = r34, zero, 3, 3
-         ;;
+         dep offset_2 = r35, zero, 3, 3
-         zxt4 r17 = r17
+         ;;
-         dep.z r14 = r14, 1, 32
+         add ref1_ah = 8, ref1_al
-         add r18 = r25, r19
+         add ref2_ah = 8, ref2_al
-         ;;
+         sub aoffset_1 = 64, offset_1
-         add r15 = r34, r17
+         sub aoffset_2 = 64, offset_2
-         add r23 = r14, r32
+         ;;
-         add r20 = r35, r17
-         ;;
+ //      *** Allocating new stackframe, define rotating registers ***
-         ld1 r14 = [r15]
+         alloc r9 = ar.pfs, 5, 91, 0, 96
-         ld1 r16 = [r20]
-         add r17 = r33, r17
+ //      *** init loop: set loop counter, epilog counter, predicates ***
-         ;;
+         mov ar.lc = 7
-         add r14 = r14, r16
+         mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1
-         ld1 r15 = [r17]
+         mov pr.rot = 1 << 16
-         zxt4 r18 = r18
+         ;;
-         ;;
-         adds r14 = 1, r14
+ //      *** define register arrays and predicate array for software pipeline ***
-         add r24 = r35, r18
+         .rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1]
-         add r22 = r34, r18
+         .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1]
-         ;;
-         shr.u r14 = r14, 1
-         add r19 = r26, r19
+ //      software pipelined loop:
-         add r16 = r33, r18
+ //      ld_stage:   The values of ref1, ref2, cur are loaded
-         ;;
+ //      sh_stage:   The misaligned values of ref1/2 are shifted...
-         cmp4.ge p6, p7 = r27, r14
+ //      or_stage:   ...and copied together.
-         dep.z r19 = r19, 1, 32
+ //      pavg_stage: The average of ref1 and ref2 is computed.
-         adds r21 = 2, r21
+ //      up_stage:   The result and the cur-values are converted to 16-bit.
-         ;;
+ //      psub_stage: Those values are substracted...
-         (p7) addl r14 = 255, r0
+ //      st_stage:   ...and stored at the dct-adresses.
-         add r19 = r19, r32
-         ;;
-         sub r14 = r15, r14
+ .Loop_8to16sub2:
-         ;;
+         {.mii
-         st2 [r23] = r14
+                 (ld_stage[0])   ld8 c[0] = [cur], stride
-         ld1 r14 = [r24]
+                 (sh_stage[0])   shr.u ref1_l[0] = ref1_vl[LL], offset_1
-         ld1 r15 = [r22]
+                 (sh_stage[0])   shl ref1_h[0] = ref1_vh[LL], aoffset_1
-         ld1 r16 = [r16]
+         }
-         ;;
+         {.mii
-         add r15 = r15, r14
+                 (ld_stage[0])   ld8 ref1_vl[0] = [ref1_al], stride
-         ;;
+                 (sh_stage[0])   shr.u ref2_l[0] = ref2_vl[LL], offset_2
-         adds r15 = 1, r15
+                 (sh_stage[0])   shl ref2_h[0] = ref2_vh[LL], aoffset_2
-         ;;
+         }
-         shr.u r14 = r15, 1
+         {.mii
-         ;;
+                 (ld_stage[0])   ld8 ref1_vh[0] = [ref1_ah], stride
-         cmp4.ge p6, p7 = r27, r14
+                 (or_stage[0])   or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL]
-         ;;
+                 (or_stage[0])   or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL]
-         (p7) addl r14 = 255, r0
+         }
-         ;;
+         {.mii
-         sub r14 = r16, r14
+                 (ld_stage[0])   ld8 ref2_vl[0] = [ref2_al], stride
-         ;;
+                 (pavg_stage[0]) pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL]
-         st2 [r19] = r14
+                 (up_stage[0])   unpack1.l r16_r[0] = zero, r[PAVGL]
-         br.cloop.sptk.few .L138
+         }
-         adds r28 = 1, r28
+         {.mii
-         adds r26 = 8, r26
+                 (ld_stage[0])   ld8 ref2_vh[0] = [ref2_ah], stride
-         add r25 = r25, r36
+                 (up_stage[0])   unpack1.h r16_l[0] = zero, r[PAVGL]
-         ;;
+                 (up_stage[0])   unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL]
-         cmp4.geu p6, p7 = 7, r28
+         }
-         (p6) br.cond.dptk .L50
+         {.mii
-         mov ar.lc = r2
+                 (st_stage[0])   st8 [dct_ar] = dct16_r[PSL], 16
+                 (up_stage[0])   unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL]
+                 (psub_stage[0]) psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL]
+         }
+         {.mib
+                 (st_stage[0])   st8 [dct_al] = dct16_l[PSL], 16
+                 (psub_stage[0]) psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL]
+                 br.ctop.sptk.few .Loop_8to16sub2 // Und hopp
+                 ;;
+         }
+ //      *** Restore old LC and PRs ***
+         mov ar.lc = oldLC
+         mov pr = oldPR, -1
          br.ret.sptk.many b0
          .endp transfer_8to16sub2_ia64#
-         .common transfer_16to8add#,8,8
-         .align 16
-         .global transfer_16to8add_ia64#
-         .proc transfer_16to8add_ia64#
- transfer_16to8add_ia64: // removed v.1.1 compiler output: adds the 16-bit values at r33 to the bytes at r32, clamps each sum to 0..255 and stores it back as a byte; r34 is the per-row byte step
-         .prologue
-         .save ar.lc, r2
-         mov r2 = ar.lc // keep the caller's loop counter in r2
-         .body
-         mov r26 = r0 // r26: outer (row) counter
-         addl r25 = 255, r0 // r25: upper clamp bound 255
-         mov r24 = r0 // r24: element offset into the 16-bit array, +8 per row
-         mov r21 = r0 // r21: byte offset into the 8-bit array, +r34 per row
- .L62:
-         addl r14 = 3, r0
-         mov r20 = r0 // r20: column index, +2 per inner iteration
-         ;;
-         mov ar.lc = r14 // inner loop runs 4 times, two elements each
-         ;;
- .L149:
-         adds r17 = 1, r20
-         add r14 = r21, r20
-         add r15 = r24, r20
-         ;;
-         zxt4 r14 = r14
-         add r18 = r21, r17
-         dep.z r15 = r15, 1, 32 // 16-bit element index * 2 = byte offset
-         ;;
-         add r23 = r32, r14 // address of the first byte of this pair
-         zxt4 r18 = r18
-         add r15 = r15, r33
-         ;;
-         mov r16 = r23
-         add r22 = r32, r18 // address of the second byte of this pair
-         ld2 r14 = [r15] // load the 16-bit value
-         ;;
-         ld1 r18 = [r16] // load the byte it is added to
-         add r19 = r24, r17
-         adds r20 = 2, r20
-         ;;
-         add r14 = r14, r18
-         dep.z r19 = r19, 1, 32
-         mov r16 = r22
-         ;;
-         sxt2 r14 = r14 // treat the sum as a signed 16-bit value
-         add r19 = r19, r33
-         ;;
-         cmp4.le p6, p7 = r0, r14 // p7: sum is negative
-         cmp4.ge p8, p9 = r25, r14 // p9: sum is above 255
-         ;;
-         (p7) mov r14 = r0 // negative sum becomes 0
-         (p7) br.cond.dpnt .L143
-         ;;
-         (p9) addl r14 = 255, r0 // sum above 255 becomes 255
-         ;;
- .L143:
-         st1 [r23] = r14 // store the clamped byte
-         ld1 r14 = [r22] // second element of the pair: same add, clamp, store
-         ld2 r15 = [r19]
-         ;;
-         add r15 = r15, r14
-         ;;
-         sxt2 r15 = r15
-         ;;
-         cmp4.le p6, p7 = r0, r15
-         cmp4.ge p8, p9 = r25, r15
-         ;;
-         (p7) mov r15 = r0
-         (p7) br.cond.dpnt .L147
-         ;;
-         (p9) addl r15 = 255, r0
-         ;;
- .L147:
-         st1 [r16] = r15
-         br.cloop.sptk.few .L149
-         adds r26 = 1, r26 // next row
-         adds r24 = 8, r24
-         add r21 = r21, r34
-         ;;
-         cmp4.geu p6, p7 = 7, r26 // continue while the row counter is <= 7
-         (p6) br.cond.dptk .L62
-         mov ar.lc = r2 // restore the caller's loop counter
-         br.ret.sptk.many b0
-         .endp transfer_16to8add_ia64#
-         .common transfer8x8_copy#,8,8
-         .align 16
-         .global transfer8x8_copy_ia64#
-         .proc transfer8x8_copy_ia64#
- transfer8x8_copy_ia64: // removed v.1.1 compiler output: copies 8 bytes per iteration from r33 to r32, one byte at a time, for 8 iterations; r34 is the per-row byte step
-         .prologue
-         .save ar.lc, r2
-         mov r2 = ar.lc // keep the caller's loop counter in r2
-         .body
-         addl r14 = 7, r0
-         mov r21 = r0 // r21: byte offset of the current row
-         ;;
-         mov ar.lc = r14 // loop runs 8 times
-         ;;
- .L168:
-         zxt4 r14 = r21 // offset of byte 0 in this row
-         adds r15 = 1, r21 // offset of byte 1
-         adds r18 = 2, r21 // offset of byte 2
-         ;;
-         add r16 = r33, r14
-         zxt4 r15 = r15
-         zxt4 r18 = r18
-         ;;
-         ld1 r17 = [r16]
-         add r14 = r32, r14
-         add r19 = r33, r15
-         ;;
-         st1 [r14] = r17 // byte 0 copied
-         add r15 = r32, r15
-         add r20 = r33, r18
-         ld1 r16 = [r19]
-         adds r14 = 3, r21 // offset of byte 3
-         add r18 = r32, r18
-         ;;
-         st1 [r15] = r16 // byte 1 copied
-         zxt4 r14 = r14
-         adds r17 = 4, r21 // offset of byte 4
-         ld1 r15 = [r20]
-         ;;
-         add r19 = r33, r14
-         zxt4 r17 = r17
-         st1 [r18] = r15 // byte 2 copied
-         add r14 = r32, r14
-         ;;
-         add r20 = r33, r17
-         ld1 r15 = [r19]
-         adds r16 = 5, r21 // offset of byte 5
-         add r17 = r32, r17
-         ;;
-         st1 [r14] = r15 // byte 3 copied
-         zxt4 r16 = r16
-         adds r18 = 6, r21 // offset of byte 6
-         ld1 r14 = [r20]
-         ;;
-         add r19 = r33, r16
-         zxt4 r18 = r18
-         st1 [r17] = r14 // byte 4 copied
-         add r16 = r32, r16
-         ;;
-         add r20 = r33, r18
-         ld1 r14 = [r19]
-         adds r15 = 7, r21 // offset of byte 7
-         add r18 = r32, r18
-         ;;
-         st1 [r16] = r14 // byte 5 copied
-         zxt4 r15 = r15
-         add r21 = r21, r34 // advance the row offset for the next iteration
-         ld1 r16 = [r20]
-         ;;
-         add r17 = r33, r15
-         st1 [r18] = r16 // byte 6 copied
-         add r15 = r32, r15
-         ;;
-         ld1 r14 = [r17]
-         ;;
-         st1 [r15] = r14 // byte 7 copied
-         br.cloop.sptk.few .L168
-         ;;
-         mov ar.lc = r2 // restore the caller's loop counter
-         br.ret.sptk.many b0
-         .endp transfer8x8_copy_ia64#
-         .ident  "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)"

 Legend:



Removed from v.1.1
 


changed lines


 
Added in v.1.4
 Legend:



Removed from v.1.1
 


changed lines


 
Added in v.1.4
-Removed from v.1.1
+Added in v.1.4

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4