[cvs] / xvidcore / src / image / ia64_asm / interpolate8x8_ia64.s Repository:
ViewVC logotype

Diff of /xvidcore/src/image/ia64_asm/interpolate8x8_ia64.s

Parent Directory | Revision Log | View Patch

revision 1.2, Wed Jun 26 15:26:03 2002 UTC | revision 1.6, Thu Feb 19 17:07:29 2009 UTC
# Line 1  Line 1 
1    // ****************************************************************************
2    // *
3    // *  XVID MPEG-4 VIDEO CODEC
4    // *  - IA64 halfpel interpolation -
5    // *
6    // *  Copyright(C) 2002 Kai Kühn, Alexander Viehl
7    // *
8    // *  This program is free software; you can redistribute it and/or modify it
9    // *  under the terms of the GNU General Public License as published by
10    // *  the Free Software Foundation; either version 2 of the License, or
11    // *  (at your option) any later version.
12    // *
13    // *  This program is distributed in the hope that it will be useful,
14    // *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15    // *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    // *  GNU General Public License for more details.
17    // *
18    // *  You should have received a copy of the GNU General Public License
19    // *  along with this program; if not, write to the Free Software
20    // *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21    // *
22    // * $Id$
23    // *
24    // ***************************************************************************/
25    //
26    // ****************************************************************************
27    // *
28    // *  interpolate8x8_ia64.s, IA-64 halfpel interpolation
29    // *
30    // *  This version was implemented during an IA-64 practical training at
31    // *  the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/)
32    // *
33    // ****************************************************************************
34    
35          .file   "interpolate8x8_ia64.s"          .file   "interpolate8x8_ia64.s"
36          .pred.safe_across_calls p1-p5,p16-p63          .pred.safe_across_calls p1-p5,p16-p63
# Line 25  Line 58 
58          and r14 = -8,r33                        // align src          and r14 = -8,r33                        // align src
59          mov r15 = r32                   // get dest          mov r15 = r32                   // get dest
60          mov r16 = r34                   // stride          mov r16 = r34                   // stride
61          sub r17 = 1,r35                 // 1-rounding  //      sub r17 = 0,r0                  // 1-rounding
62    
63          ;;          ;;
64    
65          add r18 = 8,r14          add r18 = 8,r14
66          mux1 r17 = r17, @brcst          // broadcast 1-rounding  //      mux1 r17 = r17, @brcst          // broadcast 1-rounding
67    
68          sub r24 = 64,r22                        // lshift of src          sub r24 = 64,r22                        // lshift of src
69          add r26 = 8,r22                 // rshift of src+1          add r26 = 8,r22                 // rshift of src+1
70          sub r27 = 56,r22                        // lshift of src+1          sub r27 = 56,r22                        // lshift of src+1
71    
72          mov ar.lc = 7                                           // loopcounter          mov ar.lc = 7                                           // loopcounter
73          mov ar.ec = LL + SL +OL + AVL + AL + STL                // sum of latencies          mov ar.ec = LL + SL +OL + AVL  + STL            // sum of latencies
74          mov pr.rot = 1 << 16                                    // init pr regs for sw-pipeling          mov pr.rot = 1 << 16                                    // init pr regs for sw-pipeling
75    
76          ;;          ;;
77          .rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1]          .rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1]
78          .rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL]          .rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL]
79    
80    
81  loop_interpolate:  .Lloop_interpolate:
82          (aldp[0]) ld8 ald1[0] = [r14],r16               // load aligned src          (aldp[0]) ld8 ald1[0] = [r14],r16               // load aligned src
83          (aldp[0]) ld8 ald2[0] = [r18],r16               // and aligned src+8          (aldp[0]) ld8 ald2[0] = [r18],r16               // and aligned src+8
84    
# Line 56  Line 90 
90          (or1p[0]) or or1[0] = shru1[SL],shl2[SL]                // merge things          (or1p[0]) or or1[0] = shru1[SL],shl2[SL]                // merge things
91          (or1p[0]) or or2[0] = shru2[SL],shl1[SL]          (or1p[0]) or or2[0] = shru2[SL],shl1[SL]
92    
93          (addp[0]) padd1.uus add1[0] = or1[OL],r17               // add 1-rounding  //      (addp[0]) padd1.uus add1[0] = or1[OL],r17               // add 1-rounding
94    
95          (pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL]  // parallel average          (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL]      // parallel average
96    
97           (stp[0]) st8 [r15] = avg[AVL]                  // store results           (stp[0]) st8 [r15] = avg[AVL]                  // store results
98           (stp[0]) add r15 = r15,r16           (stp[0]) add r15 = r15,r16
# Line 66  Line 100 
100    
101    
102    
103          br.ctop.sptk.few loop_interpolate          br.ctop.sptk.few .Lloop_interpolate
104          ;;          ;;
105          mov ar.lc = r20          mov ar.lc = r20
106          mov pr = r21,-1          mov pr = r21,-1
# Line 96  Line 130 
130          and r14 = -8,r33          and r14 = -8,r33
131          mov r15 = r32          mov r15 = r32
132          mov r16 = r34          mov r16 = r34
133          sub r17 = 1,r35  //      sub r17 = 0,r0
134          ;;          ;;
135    
136          add r18 = 8,r14          add r18 = 8,r14
137          add r19 = r14,r16                       // src + stride          add r19 = r14,r16                       // src + stride
138          mux1 r17 = r17, @brcst  //      mux1 r17 = r17, @brcst
139    
140          sub r24 = 64,r22          sub r24 = 64,r22
141          ;;          ;;
142          add r26 = 8,r19                 // src + stride + 8          add r26 = 8,r19                 // src + stride + 8
143    
144          mov ar.lc = 7          mov ar.lc = 7
145          mov ar.ec = LL + SL +OL + AVL + AL + STL          mov ar.ec = LL + SL +OL + AVL + STL
146          mov pr.rot = 1 << 16          mov pr.rot = 1 << 16
147    
148          ;;          ;;
149          .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1]          .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1]
150          .rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL]          .rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL]
151    
152    
153  loop_interpolate2:  .Lloop_interpolate2:
154          (aldp[0]) ld8 ald1[0] = [r14],r16          (aldp[0]) ld8 ald1[0] = [r14],r16
155          (aldp[0]) ld8 ald2[0] = [r18],r16          (aldp[0]) ld8 ald2[0] = [r18],r16
156          (aldp[0]) ld8 ald3[0] = [r19],r16          (aldp[0]) ld8 ald3[0] = [r19],r16
# Line 130  Line 164 
164          (or1p[0]) or or1[0] = shru1[SL],shl1[SL]          (or1p[0]) or or1[0] = shru1[SL],shl1[SL]
165          (or1p[0]) or or2[0] = shru2[SL],shl2[SL]          (or1p[0]) or or2[0] = shru2[SL],shl2[SL]
166    
167          (addp[0]) padd1.uus add1[0] = or1[OL],r17  //      (addp[0]) padd1.uus add1[0] = or1[OL],r17
168    
169          (pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL]          (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL]
170    
171           (stp[0]) st8 [r15] = avg[AVL]           (stp[0]) st8 [r15] = avg[AVL]
172           (stp[0]) add r15 = r15,r16           (stp[0]) add r15 = r15,r16
# Line 140  Line 174 
174    
175    
176    
177          br.ctop.sptk.few loop_interpolate2          br.ctop.sptk.few .Lloop_interpolate2
178          ;;          ;;
179          mov ar.lc = r20          mov ar.lc = r20
180          mov pr = r21,-1          mov pr = r21,-1
# Line 170  Line 204 
204          and r14 = -8,r33          and r14 = -8,r33
205          mov r15 = r32          mov r15 = r32
206          mov r16 = r34          mov r16 = r34
207          sub r17 = 1,r35  //      sub r17 = 0,r0
208          ;;          ;;
209    
210          add r18 = 8,r14          add r18 = 8,r14
211          add r19 = r14,r16          add r19 = r14,r16
212          mux1 r17 = r17, @brcst  //      mux1 r17 = r17, @brcst
213    
214          add r27 = 8,r22          add r27 = 8,r22
215          sub r28 = 56,r22          sub r28 = 56,r22
# Line 184  Line 218 
218          add r26 = 8,r19          add r26 = 8,r19
219    
220          mov ar.lc = 7          mov ar.lc = 7
221          mov ar.ec = LL + SL +OL + 2*AVL + AL + STL          mov ar.ec = LL + SL +OL + 2*AVL  + STL
222          mov pr.rot = 1 << 16          mov pr.rot = 1 << 16
223    
224          ;;          ;;
225          .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],add1[AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1]          .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1]
226          .rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL],pavg1p[AVL],pavg2p[AVL],stp[STL]          .rotp aldp[LL], sh1p[SL], or1p[OL],pavg1p[AVL],pavg2p[AVL],stp[STL]
227    
228    
229  loop_interpolate3:  .Lloop_interpolate3:
230          (aldp[0]) ld8 ald1[0] = [r14],r16          (aldp[0]) ld8 ald1[0] = [r14],r16
231          (aldp[0]) ld8 ald2[0] = [r18],r16          (aldp[0]) ld8 ald2[0] = [r18],r16
232          (aldp[0]) ld8 ald3[0] = [r19],r16          (aldp[0]) ld8 ald3[0] = [r19],r16
# Line 213  Line 247 
247          (or1p[0]) or or3[0] = shru3[SL],shl3[SL]          (or1p[0]) or or3[0] = shru3[SL],shl3[SL]
248          (or1p[0]) or or4[0] = shru4[SL],shl4[SL]          (or1p[0]) or or4[0] = shru4[SL],shl4[SL]
249    
250          (addp[0]) padd1.uus add1[0] = or1[OL],r17  //      (addp[0]) padd1.uus add1[0] = or1[OL],r17
251    
252          (pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL]          (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL]
253          (pavg1p[0]) pavg1 avg1[0] = or3[OL+AL],or4[OL+AL]          (pavg1p[0]) pavg1 avg1[0] = or3[OL],or4[OL]
254    
255          (pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL]          (pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL]
256    
# Line 226  Line 260 
260    
261    
262    
263          br.ctop.sptk.few loop_interpolate3          br.ctop.sptk.few .Lloop_interpolate3
264          ;;          ;;
265          mov ar.lc = r20          mov ar.lc = r20
266          mov pr = r21,-1          mov pr = r21,-1

Legend:
Removed from v.1.2
Changed lines
Added in v.1.6

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4