1 |
|
// **************************************************************************** |
2 |
|
// * |
3 |
|
// * XVID MPEG-4 VIDEO CODEC |
4 |
|
// * - IA64 halfpel interpolation - |
5 |
|
// * |
6 |
|
// * Copyright(C) 2002 Kai Kühn, Alexander Viehl
7 |
|
// * |
8 |
|
// * This program is free software; you can redistribute it and/or modify it |
9 |
|
// * under the terms of the GNU General Public License as published by |
10 |
|
// * the Free Software Foundation; either version 2 of the License, or |
11 |
|
// * (at your option) any later version. |
12 |
|
// * |
13 |
|
// * This program is distributed in the hope that it will be useful, |
14 |
|
// * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
|
// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
|
// * GNU General Public License for more details. |
17 |
|
// * |
18 |
|
// * You should have received a copy of the GNU General Public License |
19 |
|
// * along with this program; if not, write to the Free Software |
20 |
|
// * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
|
// * |
22 |
|
// * $Id$ |
23 |
|
// * |
24 |
|
// ***************************************************************************/ |
25 |
|
// |
26 |
|
// **************************************************************************** |
27 |
|
// * |
28 |
|
// * interpolate8x8_ia64.s, IA-64 halfpel interpolation |
29 |
|
// * |
30 |
|
// * This version was implemented during an IA-64 practical training at |
31 |
|
// * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) |
32 |
|
// * |
33 |
|
// **************************************************************************** |
34 |
|
|
35 |
.file "interpolate8x8_ia64.s" |
.file "interpolate8x8_ia64.s" |
36 |
.pred.safe_across_calls p1-p5,p16-p63 |
.pred.safe_across_calls p1-p5,p16-p63 |
58 |
and r14 = -8,r33 // align src |
and r14 = -8,r33 // align src |
59 |
mov r15 = r32 // get dest |
mov r15 = r32 // get dest |
60 |
mov r16 = r34 // stride |
mov r16 = r34 // stride |
61 |
sub r17 = 1,r35 // 1-rounding |
// sub r17 = 0,r0 // 1-rounding |
62 |
|
|
63 |
;; |
;; |
64 |
|
|
65 |
add r18 = 8,r14 |
add r18 = 8,r14 |
66 |
mux1 r17 = r17, @brcst // broadcast 1-rounding |
// mux1 r17 = r17, @brcst // broadcast 1-rounding |
67 |
|
|
68 |
sub r24 = 64,r22 // lshift of src |
sub r24 = 64,r22 // lshift of src |
69 |
add r26 = 8,r22 // rshift of src+1 |
add r26 = 8,r22 // rshift of src+1 |
70 |
sub r27 = 56,r22 // lshift of src+1 |
sub r27 = 56,r22 // lshift of src+1 |
71 |
|
|
72 |
mov ar.lc = 7 // loopcounter |
mov ar.lc = 7 // loopcounter |
73 |
mov ar.ec = LL + SL +OL + AVL + AL + STL // sum of latencies |
mov ar.ec = LL + SL +OL + AVL + STL // sum of latencies |
74 |
mov pr.rot = 1 << 16 // init pr regs for sw-pipeling |
mov pr.rot = 1 << 16 // init pr regs for sw-pipeling |
75 |
|
|
76 |
;; |
;; |
77 |
.rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1] |
.rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] |
78 |
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL] |
.rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] |
79 |
|
|
80 |
|
|
81 |
.Lloop_interpolate: |
.Lloop_interpolate: |
90 |
(or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things |
(or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things |
91 |
(or1p[0]) or or2[0] = shru2[SL],shl1[SL] |
(or1p[0]) or or2[0] = shru2[SL],shl1[SL] |
92 |
|
|
93 |
(addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding |
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding |
94 |
|
|
95 |
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] // parallel average |
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // parallel average |
96 |
|
|
97 |
(stp[0]) st8 [r15] = avg[AVL] // store results |
(stp[0]) st8 [r15] = avg[AVL] // store results |
98 |
(stp[0]) add r15 = r15,r16 |
(stp[0]) add r15 = r15,r16 |
130 |
and r14 = -8,r33 |
and r14 = -8,r33 |
131 |
mov r15 = r32 |
mov r15 = r32 |
132 |
mov r16 = r34 |
mov r16 = r34 |
133 |
sub r17 = 1,r35 |
// sub r17 = 0,r0 |
134 |
;; |
;; |
135 |
|
|
136 |
add r18 = 8,r14 |
add r18 = 8,r14 |
137 |
add r19 = r14,r16 // src + stride |
add r19 = r14,r16 // src + stride |
138 |
mux1 r17 = r17, @brcst |
// mux1 r17 = r17, @brcst |
139 |
|
|
140 |
sub r24 = 64,r22 |
sub r24 = 64,r22 |
141 |
;; |
;; |
142 |
add r26 = 8,r19 // src + stride + 8 |
add r26 = 8,r19 // src + stride + 8 |
143 |
|
|
144 |
mov ar.lc = 7 |
mov ar.lc = 7 |
145 |
mov ar.ec = LL + SL +OL + AVL + AL + STL |
mov ar.ec = LL + SL +OL + AVL + STL |
146 |
mov pr.rot = 1 << 16 |
mov pr.rot = 1 << 16 |
147 |
|
|
148 |
;; |
;; |
149 |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1] |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] |
150 |
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL] |
.rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] |
151 |
|
|
152 |
|
|
153 |
.Lloop_interpolate2: |
.Lloop_interpolate2: |
164 |
(or1p[0]) or or1[0] = shru1[SL],shl1[SL] |
(or1p[0]) or or1[0] = shru1[SL],shl1[SL] |
165 |
(or1p[0]) or or2[0] = shru2[SL],shl2[SL] |
(or1p[0]) or or2[0] = shru2[SL],shl2[SL] |
166 |
|
|
167 |
(addp[0]) padd1.uus add1[0] = or1[OL],r17 |
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 |
168 |
|
|
169 |
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] |
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] |
170 |
|
|
171 |
(stp[0]) st8 [r15] = avg[AVL] |
(stp[0]) st8 [r15] = avg[AVL] |
172 |
(stp[0]) add r15 = r15,r16 |
(stp[0]) add r15 = r15,r16 |
204 |
and r14 = -8,r33 |
and r14 = -8,r33 |
205 |
mov r15 = r32 |
mov r15 = r32 |
206 |
mov r16 = r34 |
mov r16 = r34 |
207 |
sub r17 = 1,r35 |
// sub r17 = 0,r0 |
208 |
;; |
;; |
209 |
|
|
210 |
add r18 = 8,r14 |
add r18 = 8,r14 |
211 |
add r19 = r14,r16 |
add r19 = r14,r16 |
212 |
mux1 r17 = r17, @brcst |
// mux1 r17 = r17, @brcst |
213 |
|
|
214 |
add r27 = 8,r22 |
add r27 = 8,r22 |
215 |
sub r28 = 56,r22 |
sub r28 = 56,r22 |
218 |
add r26 = 8,r19 |
add r26 = 8,r19 |
219 |
|
|
220 |
mov ar.lc = 7 |
mov ar.lc = 7 |
221 |
mov ar.ec = LL + SL +OL + 2*AVL + AL + STL |
mov ar.ec = LL + SL +OL + 2*AVL + STL |
222 |
mov pr.rot = 1 << 16 |
mov pr.rot = 1 << 16 |
223 |
|
|
224 |
;; |
;; |
225 |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],add1[AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] |
226 |
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL],pavg1p[AVL],pavg2p[AVL],stp[STL] |
.rotp aldp[LL], sh1p[SL], or1p[OL],pavg1p[AVL],pavg2p[AVL],stp[STL] |
227 |
|
|
228 |
|
|
229 |
.Lloop_interpolate3: |
.Lloop_interpolate3: |
247 |
(or1p[0]) or or3[0] = shru3[SL],shl3[SL] |
(or1p[0]) or or3[0] = shru3[SL],shl3[SL] |
248 |
(or1p[0]) or or4[0] = shru4[SL],shl4[SL] |
(or1p[0]) or or4[0] = shru4[SL],shl4[SL] |
249 |
|
|
250 |
(addp[0]) padd1.uus add1[0] = or1[OL],r17 |
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 |
251 |
|
|
252 |
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] |
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] |
253 |
(pavg1p[0]) pavg1 avg1[0] = or3[OL+AL],or4[OL+AL] |
(pavg1p[0]) pavg1 avg1[0] = or3[OL],or4[OL] |
254 |
|
|
255 |
(pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] |
(pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] |
256 |
|
|