Parent Directory | Revision Log
Revision 1.5 - (view) (download)
1 : | ia64p | 1.3 | |
2 : | .file "interpolate8x8_ia64.s" | ||
3 : | .pred.safe_across_calls p1-p5,p16-p63 | ||
4 : | .text | ||
// ---------------------------------------------------------------------------
// interpolate8x8_halfpel_h_ia64
//
// Software-pipelined loop. For each row it assembles two 8-byte values from
// two aligned quadword loads -- the bytes at src and the bytes at src+1 --
// combines them with pavg1 and stores the 8-byte result through the dest
// pointer. Source and dest pointers both advance by stride per row.
//
// Inputs as used by the code (alloc declares 4 input registers):
//   r32  dest   (copied to r15, used as the st8 address)
//   r33  src    (aligned down into r14; low 3 bits become the shift count)
//   r34  stride (copied to r16, post-increment for loads and store)
//   r35  never read here; the "1-rounding" instructions are commented out.
//        NOTE(review): presumably the rounding argument of the C prototype
//        -- confirm against the caller.
//
// Registers written: r9, r14-r16, r18, r20-r22, r24, r26, r27 plus the
// rotating registers/predicates named by .rotr/.rotp.
// ar.lc and pr are saved in r20/r21 and restored before the return.
//
// NOTE(review): the shift directions assume little-endian loads, and st8
// presumably expects an 8-byte aligned dest -- confirm both.
// ---------------------------------------------------------------------------
5 : | .align 16 | ||
6 : | .global interpolate8x8_halfpel_h_ia64# | ||
7 : | .proc interpolate8x8_halfpel_h_ia64# | ||
8 : | interpolate8x8_halfpel_h_ia64: | ||
// Pipeline stage distances, in loop rotations, between a producer and its
// consumer: LL load->shift, SL shift->or, OL or->average, AVL average->store.
// STL sizes the store predicate group. AL only pads the or2 register array.
// SL2 and OL2 are not referenced in this procedure.
9 : | LL=3 | ||
10 : | SL=1 | ||
11 : | SL2=1 | ||
12 : | OL=1 | ||
13 : | OL2=1 | ||
14 : | AVL=1 | ||
15 : | AL=1 | ||
16 : | STL=3 | ||
17 : | |||
// Frame: 4 inputs, 60 locals, 0 outputs, 64 rotating registers. The input
// registers therefore sit inside the rotating region, which is why they are
// copied to static registers below before the loop starts rotating.
18 : | alloc r9=ar.pfs,4, 60,0,64 | ||
19 : | |||
// Save the loop counter and all predicates; both are overwritten below.
20 : | mov r20 = ar.lc | ||
21 : | mov r21 = pr | ||
22 : | |||
// r22 = (r33 & 7) << 3 : bit offset of src inside its aligned quadword.
23 : | dep.z r22 = r33,3,3 // rshift of src | ||
24 : | |||
25 : | and r14 = -8,r33 // align src | ||
26 : | mov r15 = r32 // get dest | ||
27 : | mov r16 = r34 // stride | ||
28 : | ia64p | 1.5 | // sub r17 = 0,r0 // 1-rounding |
29 : | |||
30 : | ia64p | 1.3 | ;; |
31 : | |||
// r18 addresses the quadword following the one at r14.
32 : | add r18 = 8,r14 | ||
33 : | ia64p | 1.5 | // mux1 r17 = r17, @brcst // broadcast 1-rounding |
34 : | ia64p | 1.3 | |
// Shift counts. r22/r24 extract the bytes at src, r26/r27 the bytes at
// src+1 (one byte = 8 bits further). A count of 64 can occur (r24 when
// r22 = 0, r26 when r22 = 56); shl/shr.u produce zero for counts above 63,
// so the corresponding half simply contributes nothing.
35 : | sub r24 = 64,r22 // lshift of src | ||
36 : | add r26 = 8,r22 // rshift of src+1 | ||
37 : | sub r27 = 56,r22 // lshift of src+1 | ||
38 : | |||
// ar.lc = 7: br.ctop starts the first stage for 8 rows in total.
// ar.ec equals the number of rotating predicates named by .rotp, which is
// enough epilog iterations for the last row to reach the store.
// pr.rot = 1 << 16 sets p16 (the first stage predicate) and clears the
// other rotating predicates.
39 : | mov ar.lc = 7 // loopcounter | ||
40 : | ia64p | 1.5 | mov ar.ec = LL + SL +OL + AVL + STL // sum of latencies |
41 : | ia64p | 1.3 | mov pr.rot = 1 << 16 // init pr regs for sw-pipeling |
42 : | |||
43 : | ;; | ||
// Symbolic names for rotating registers (.rotr) and rotating predicates
// (.rotp). name[0] is written in the producing stage; name[k] is the same
// value k rotations later.
44 : | ia64p | 1.5 | .rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] |
45 : | .rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] | ||
46 : | ia64p | 1.3 | |
47 : | |||
// The loop body contains no stop: all stages below issue in one instruction
// group and work on different rows of the pipeline.
48 : | ia64p | 1.4 | .Lloop_interpolate: |
// Load stage: fetch the two aligned quadwords and step both pointers by stride.
49 : | ia64p | 1.3 | (aldp[0]) ld8 ald1[0] = [r14],r16 // load aligned src |
50 : | (aldp[0]) ld8 ald2[0] = [r18],r16 // and aligned src+8 | ||
51 : | |||
// Shift stage. Mind the numbering: shl1 uses r27 (the src+1 count) and
// shl2 uses r24 (the src count), so shl2 pairs with shru1 and shl1 with shru2.
52 : | (sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // get src | ||
53 : | (sh1p[0]) shl shl1[0] = ald2[LL],r27 | ||
54 : | (sh1p[0]) shr.u shru2[0] = ald1[LL],r26 // get src+1 | ||
55 : | (sh1p[0]) shl shl2[0] = ald2[LL],r24 | ||
56 : | |||
// Merge stage: or1 = bytes at src, or2 = bytes at src+1.
57 : | (or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things | ||
58 : | (or1p[0]) or or2[0] = shru2[SL],shl1[SL] | ||
59 : | |||
60 : | ia64p | 1.5 | // (addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding |
61 : | ia64p | 1.3 | |
// Average stage: pavg1 works on the eight 1-byte lanes in parallel.
62 : | ia64p | 1.5 | (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // parallel average |
63 : | ia64p | 1.3 | |
// Store stage: write the row and advance dest. The add writes r15 in the
// same group in which st8 reads it; the store uses the old value.
64 : | (stp[0]) st8 [r15] = avg[AVL] // store results | ||
65 : | (stp[0]) add r15 = r15,r16 | ||
66 : | |||
67 : | |||
68 : | |||
69 : | |||
// br.ctop rotates the registers/predicates and loops until ar.lc and ar.ec
// are exhausted.
70 : | ia64p | 1.4 | br.ctop.sptk.few .Lloop_interpolate |
71 : | ia64p | 1.3 | ;; |
// Restore the caller's ar.lc and predicate registers, then return.
72 : | mov ar.lc = r20 | ||
73 : | mov pr = r21,-1 | ||
74 : | br.ret.sptk.many b0 | ||
75 : | .endp interpolate8x8_halfpel_h_ia64# | ||
76 : | |||
// ---------------------------------------------------------------------------
// interpolate8x8_halfpel_v_ia64
//
// Software-pipelined loop. For each row it assembles the 8 bytes at the
// current source address and the 8 bytes one stride further on, each from
// two aligned quadword loads, combines the two values with pavg1 and stores
// the 8-byte result through the dest pointer. All pointers advance by stride
// per row.
//
// Inputs as used by the code (alloc declares 4 input registers):
//   r32  copied to r15 and used as the st8 address (dest)
//   r33  source address: aligned down into r14, low 3 bits give the shift
//   r34  copied to r16: post-increment for all loads and for the store
//   r35  never read here; the instructions that would set up r17 are
//        commented out. NOTE(review): presumably the rounding argument of
//        the C prototype -- confirm against the caller.
//
// Registers written: r9, r14-r16, r18-r22, r24, r26 plus the rotating
// registers/predicates named by .rotr/.rotp.
// ar.lc and pr are saved in r20/r21 and restored before the return.
//
// NOTE(review): the shift directions assume little-endian loads, and st8
// presumably expects an 8-byte aligned dest -- confirm both.
// ---------------------------------------------------------------------------
77 : | .align 16 | ||
78 : | .global interpolate8x8_halfpel_v_ia64# | ||
79 : | .proc interpolate8x8_halfpel_v_ia64# | ||
80 : | interpolate8x8_halfpel_v_ia64: | ||
// Pipeline stage distances, in loop rotations, between a producer and its
// consumer: LL load->shift, SL shift->or, OL or->average, AVL average->store.
// STL sizes the store predicate group. AL only pads the or2 register array.
// SL2 and OL2 are not referenced in this procedure.
81 : | LL=3 | ||
82 : | SL=1 | ||
83 : | SL2=1 | ||
84 : | OL=1 | ||
85 : | OL2=1 | ||
86 : | AVL=1 | ||
87 : | AL=1 | ||
88 : | STL=3 | ||
89 : | |||
// Frame: 4 inputs, 60 locals, 0 outputs, 64 rotating registers. The input
// registers sit inside the rotating region, so they are copied to static
// registers below before the loop starts rotating.
90 : | alloc r9=ar.pfs,4, 60,0,64 | ||
91 : | |||
// Save the loop counter and all predicates; both are overwritten below.
92 : | mov r20 = ar.lc | ||
93 : | mov r21 = pr | ||
94 : | |||
// r22 = (r33 & 7) << 3 : bit offset of the source inside its aligned quadword.
95 : | dep.z r22 = r33,3,3 | ||
96 : | |||
// r14 = source rounded down to 8 bytes, r15 = store pointer, r16 = stride.
97 : | and r14 = -8,r33 | ||
98 : | mov r15 = r32 | ||
99 : | mov r16 = r34 | ||
100 : | ia64p | 1.5 | // sub r17 = 0,r0 |
101 : | ia64p | 1.3 | ;; |
102 : | |||
// r18 = second quadword of the current row, r19 = first quadword of the
// row one stride below.
103 : | add r18 = 8,r14 | ||
104 : | add r19 = r14,r16 // src + stride | ||
105 : | ia64p | 1.5 | // mux1 r17 = r17, @brcst |
106 : | ia64p | 1.3 | |
// r24 = 64 - r22 is the left-shift count for the second quadword. It is 64
// when r22 = 0; shl produces zero for counts above 63, so an aligned source
// takes everything from the first quadword.
107 : | sub r24 = 64,r22 | ||
// Stop needed: r26 depends on r19 computed in the group above.
108 : | ;; | ||
109 : | add r26 = 8,r19 // src + stride + 8 | ||
110 : | |||
// ar.lc = 7: br.ctop starts the first stage for 8 rows in total.
// ar.ec equals the number of rotating predicates named by .rotp, which is
// enough epilog iterations for the last row to reach the store.
// pr.rot = 1 << 16 sets p16 (the first stage predicate) and clears the
// other rotating predicates.
111 : | mov ar.lc = 7 | ||
112 : | ia64p | 1.5 | mov ar.ec = LL + SL +OL + AVL + STL |
113 : | ia64p | 1.3 | mov pr.rot = 1 << 16 |
114 : | |||
115 : | ;; | ||
// Symbolic names for rotating registers (.rotr) and rotating predicates
// (.rotp). name[0] is written in the producing stage; name[k] is the same
// value k rotations later.
116 : | ia64p | 1.5 | .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] |
117 : | .rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] | ||
118 : | ia64p | 1.3 | |
119 : | |||
// The loop body contains no stop: all stages below issue in one instruction
// group and work on different rows of the pipeline.
120 : | ia64p | 1.4 | .Lloop_interpolate2: |
// Load stage: ald1/ald2 cover the current row, ald3/ald4 the row one stride
// below. Because every pointer steps by stride, each lower row is fetched
// again as the current row of the following iteration.
121 : | ia64p | 1.3 | (aldp[0]) ld8 ald1[0] = [r14],r16 |
122 : | (aldp[0]) ld8 ald2[0] = [r18],r16 | ||
123 : | (aldp[0]) ld8 ald3[0] = [r19],r16 | ||
124 : | (aldp[0]) ld8 ald4[0] = [r26],r16 | ||
125 : | |||
// Shift stage: the same pair of counts (r22 right, r24 left) is applied to
// both rows.
126 : | (sh1p[0]) shr.u shru1[0] = ald1[LL],r22 | ||
127 : | (sh1p[0]) shl shl1[0] = ald2[LL],r24 | ||
128 : | (sh1p[0]) shr.u shru2[0] = ald3[LL],r22 | ||
129 : | (sh1p[0]) shl shl2[0] = ald4[LL],r24 | ||
130 : | |||
// Merge stage: or1 = 8 bytes of the current row, or2 = 8 bytes of the row below.
131 : | (or1p[0]) or or1[0] = shru1[SL],shl1[SL] | ||
132 : | (or1p[0]) or or2[0] = shru2[SL],shl2[SL] | ||
133 : | |||
134 : | ia64p | 1.5 | // (addp[0]) padd1.uus add1[0] = or1[OL],r17 |
135 : | ia64p | 1.3 | |
// Average stage: pavg1 works on the eight 1-byte lanes in parallel.
136 : | ia64p | 1.5 | (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] |
137 : | ia64p | 1.3 | |
// Store stage: write the row and advance the store pointer. The add writes
// r15 in the same group in which st8 reads it; the store uses the old value.
138 : | (stp[0]) st8 [r15] = avg[AVL] | ||
139 : | (stp[0]) add r15 = r15,r16 | ||
140 : | |||
141 : | |||
142 : | |||
143 : | |||
// br.ctop rotates the registers/predicates and loops until ar.lc and ar.ec
// are exhausted.
144 : | ia64p | 1.4 | br.ctop.sptk.few .Lloop_interpolate2 |
145 : | ia64p | 1.3 | ;; |
// Restore the caller's ar.lc and predicate registers, then return.
146 : | mov ar.lc = r20 | ||
147 : | mov pr = r21,-1 | ||
148 : | br.ret.sptk.many b0 | ||
149 : | .endp interpolate8x8_halfpel_v_ia64# | ||
150 : | |||
// ---------------------------------------------------------------------------
// interpolate8x8_halfpel_hv_ia64
//
// Software-pipelined loop. For each row it assembles four 8-byte values from
// four aligned quadword loads: the bytes at the source address, the bytes one
// byte further, and the same two positions one stride below. The four values
// are reduced by two cascaded pairwise pavg1 operations (vertical pairs
// first, then the two intermediate results) -- not by a single four-input
// sum -- and the 8-byte result is stored through the dest pointer. All
// pointers advance by stride per row.
//
// Inputs as used by the code (alloc declares 4 input registers):
//   r32  copied to r15 and used as the st8 address (dest)
//   r33  source address: aligned down into r14, low 3 bits give the shift
//   r34  copied to r16: post-increment for all loads and for the store
//   r35  never read here; the instructions that would set up r17 are
//        commented out. NOTE(review): presumably the rounding argument of
//        the C prototype -- confirm against the caller.
//
// Registers written: r9, r14-r16, r18-r22, r24, r26-r28 plus the rotating
// registers/predicates named by .rotr/.rotp.
// ar.lc and pr are saved in r20/r21 and restored before the return.
//
// NOTE(review): the shift directions assume little-endian loads, and st8
// presumably expects an 8-byte aligned dest -- confirm both.
// ---------------------------------------------------------------------------
151 : | .align 16 | ||
152 : | .global interpolate8x8_halfpel_hv_ia64# | ||
153 : | .proc interpolate8x8_halfpel_hv_ia64# | ||
154 : | interpolate8x8_halfpel_hv_ia64: | ||
// Pipeline stage distances, in loop rotations, between a producer and its
// consumer: LL load->shift, SL shift->or, OL or->average, AVL for each of
// the two average stages. STL sizes the store predicate group. AL only pads
// the or2/or3/or4 register arrays. SL2 and OL2 are not referenced in this
// procedure.
155 : | LL=3 | ||
156 : | SL=1 | ||
157 : | SL2=1 | ||
158 : | OL=1 | ||
159 : | OL2=1 | ||
160 : | AVL=1 | ||
161 : | AL=1 | ||
162 : | STL=3 | ||
163 : | |||
// Frame: 4 inputs, 60 locals, 0 outputs, 64 rotating registers. The input
// registers sit inside the rotating region, so they are copied to static
// registers below before the loop starts rotating.
164 : | alloc r9=ar.pfs,4, 60,0,64 | ||
165 : | |||
// Save the loop counter and all predicates; both are overwritten below.
166 : | mov r20 = ar.lc | ||
167 : | mov r21 = pr | ||
168 : | |||
// r22 = (r33 & 7) << 3 : bit offset of the source inside its aligned quadword.
169 : | dep.z r22 = r33,3,3 | ||
170 : | |||
// r14 = source rounded down to 8 bytes, r15 = store pointer, r16 = stride.
171 : | and r14 = -8,r33 | ||
172 : | mov r15 = r32 | ||
173 : | mov r16 = r34 | ||
174 : | ia64p | 1.5 | // sub r17 = 0,r0 |
175 : | ia64p | 1.3 | ;; |
176 : | |||
// r18 = second quadword of the current row, r19 = first quadword of the
// row one stride below.
177 : | add r18 = 8,r14 | ||
178 : | add r19 = r14,r16 | ||
179 : | ia64p | 1.5 | // mux1 r17 = r17, @brcst |
180 : | ia64p | 1.3 | |
// Shift counts. r22 (right) / r24 (left) extract the bytes at the source
// position; r27 (right) / r28 (left) extract the bytes one byte further.
// A count of 64 can occur (r24 when r22 = 0, r27 when r22 = 56); shl/shr.u
// produce zero for counts above 63, so that half contributes nothing.
181 : | add r27 = 8,r22 | ||
182 : | sub r28 = 56,r22 | ||
183 : | sub r24 = 64,r22 | ||
// Stop needed: r26 depends on r19 computed in the group above.
184 : | ;; | ||
// r26 = second quadword of the row one stride below.
185 : | add r26 = 8,r19 | ||
186 : | |||
// ar.lc = 7: br.ctop starts the first stage for 8 rows in total.
// ar.ec counts AVL twice because there are two average stages; it equals
// the number of rotating predicates named by .rotp.
// pr.rot = 1 << 16 sets p16 (the first stage predicate) and clears the
// other rotating predicates.
187 : | mov ar.lc = 7 | ||
188 : | ia64p | 1.5 | mov ar.ec = LL + SL +OL + 2*AVL + STL |
189 : | ia64p | 1.3 | mov pr.rot = 1 << 16 |
190 : | |||
191 : | ;; | ||
// Symbolic names for rotating registers (.rotr) and rotating predicates
// (.rotp). name[0] is written in the producing stage; name[k] is the same
// value k rotations later.
192 : | ia64p | 1.5 | .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] |
193 : | .rotp aldp[LL], sh1p[SL], or1p[OL],pavg1p[AVL],pavg2p[AVL],stp[STL] | ||
194 : | ia64p | 1.3 | |
195 : | |||
// The loop body contains no stop: all stages below issue in one instruction
// group and work on different rows of the pipeline.
196 : | ia64p | 1.4 | .Lloop_interpolate3: |
// Load stage: ald1/ald2 cover the current row, ald3/ald4 the row one stride
// below; every pointer steps by stride.
197 : | ia64p | 1.3 | (aldp[0]) ld8 ald1[0] = [r14],r16 |
198 : | (aldp[0]) ld8 ald2[0] = [r18],r16 | ||
199 : | (aldp[0]) ld8 ald3[0] = [r19],r16 | ||
200 : | (aldp[0]) ld8 ald4[0] = [r26],r16 | ||
201 : | |||
// Shift stage: shru1/shl1 and shru2/shl2 use the counts for the source
// position (current row / row below); shru3/shl3 and shru4/shl4 use the
// counts for one byte further (current row / row below).
202 : | (sh1p[0]) shr.u shru1[0] = ald1[LL],r22 | ||
203 : | (sh1p[0]) shl shl1[0] = ald2[LL],r24 | ||
204 : | (sh1p[0]) shr.u shru2[0] = ald3[LL],r22 | ||
205 : | (sh1p[0]) shl shl2[0] = ald4[LL],r24 | ||
206 : | (sh1p[0]) shr.u shru3[0] = ald1[LL],r27 | ||
207 : | (sh1p[0]) shl shl3[0] = ald2[LL],r28 | ||
208 : | (sh1p[0]) shr.u shru4[0] = ald3[LL],r27 | ||
209 : | (sh1p[0]) shl shl4[0] = ald4[LL],r28 | ||
210 : | |||
211 : | |||
// Merge stage: or1 = current row, or2 = row below, or3 = current row one
// byte further, or4 = row below one byte further.
212 : | (or1p[0]) or or1[0] = shru1[SL],shl1[SL] | ||
213 : | (or1p[0]) or or2[0] = shru2[SL],shl2[SL] | ||
214 : | (or1p[0]) or or3[0] = shru3[SL],shl3[SL] | ||
215 : | (or1p[0]) or or4[0] = shru4[SL],shl4[SL] | ||
216 : | |||
217 : | ia64p | 1.5 | // (addp[0]) padd1.uus add1[0] = or1[OL],r17 |
218 : | ia64p | 1.3 | |
// First average stage: one vertical pair per horizontal position, eight
// 1-byte lanes each.
219 : | ia64p | 1.5 | (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] |
220 : | (pavg1p[0]) pavg1 avg1[0] = or3[OL],or4[OL] | ||
221 : | ia64p | 1.3 | |
// Second average stage: combine the two intermediate results.
222 : | (pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] | ||
223 : | |||
// Store stage: write the row and advance the store pointer. The add writes
// r15 in the same group in which st8 reads it; the store uses the old value.
224 : | (stp[0]) st8 [r15] = avg2[AVL] | ||
225 : | (stp[0]) add r15 = r15,r16 | ||
226 : | |||
227 : | |||
228 : | |||
229 : | |||
// br.ctop rotates the registers/predicates and loops until ar.lc and ar.ec
// are exhausted.
230 : | ia64p | 1.4 | br.ctop.sptk.few .Lloop_interpolate3 |
231 : | ia64p | 1.3 | ;; |
// Restore the caller's ar.lc and predicate registers, then return.
232 : | mov ar.lc = r20 | ||
233 : | mov pr = r21,-1 | ||
234 : | br.ret.sptk.many b0 | ||
235 : | .endp interpolate8x8_halfpel_hv_ia64# | ||
236 : | |||
237 : |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |