1 |
.file "interpolate8x8.c" |
|
2 |
|
.file "interpolate8x8_ia64.s" |
3 |
.pred.safe_across_calls p1-p5,p16-p63 |
.pred.safe_across_calls p1-p5,p16-p63 |
|
.common interpolate8x8_halfpel_h#,8,8 |
|
|
.common interpolate8x8_halfpel_v#,8,8 |
|
|
.common interpolate8x8_halfpel_hv#,8,8 |
|
4 |
.text |
.text |
5 |
.align 16 |
.align 16 |
6 |
.global interpolate8x8_halfpel_h_ia64# |
.global interpolate8x8_halfpel_h_ia64# |
7 |
.proc interpolate8x8_halfpel_h_ia64# |
.proc interpolate8x8_halfpel_h_ia64# |
8 |
interpolate8x8_halfpel_h_ia64: |
interpolate8x8_halfpel_h_ia64: |
9 |
.prologue |
LL=3 |
10 |
.body |
SL=1 |
11 |
mov r26 = r0 |
SL2=1 |
12 |
mov r25 = r0 |
OL=1 |
13 |
.L15: |
OL2=1 |
14 |
mov r24 = r0 |
AVL=1 |
15 |
|
AL=1 |
16 |
|
STL=3 |
17 |
|
|
18 |
|
alloc r9=ar.pfs,4, 60,0,64 |
19 |
|
|
20 |
|
mov r20 = ar.lc |
21 |
|
mov r21 = pr |
22 |
|
|
23 |
|
dep.z r22 = r33,3,3 // rshift of src |
24 |
|
|
25 |
|
and r14 = -8,r33 // align src |
26 |
|
mov r15 = r32 // get dest |
27 |
|
mov r16 = r34 // stride |
28 |
|
sub r17 = 1,r35 // 1-rounding |
29 |
|
;; |
30 |
|
|
31 |
|
add r18 = 8,r14 |
32 |
|
mux1 r17 = r17, @brcst // broadcast 1-rounding |
33 |
|
|
34 |
|
sub r24 = 64,r22 // lshift of src |
35 |
|
add r26 = 8,r22 // rshift of src+1 |
36 |
|
sub r27 = 56,r22 // lshift of src+1 |
37 |
|
|
38 |
|
mov ar.lc = 7 // loopcounter |
39 |
|
mov ar.ec = LL + SL +OL + AVL + AL + STL // sum of latencies |
40 |
|
mov pr.rot = 1 << 16 // init pr regs for sw-pipeling |
41 |
|
|
42 |
|
;; |
43 |
|
.rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1] |
44 |
|
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL] |
45 |
|
|
46 |
|
|
47 |
|
loop_interpolate: |
48 |
|
(aldp[0]) ld8 ald1[0] = [r14],r16 // load aligned src |
49 |
|
(aldp[0]) ld8 ald2[0] = [r18],r16 // and aligned src+8 |
50 |
|
|
51 |
|
(sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // get src |
52 |
|
(sh1p[0]) shl shl1[0] = ald2[LL],r27 |
53 |
|
(sh1p[0]) shr.u shru2[0] = ald1[LL],r26 // get src+1 |
54 |
|
(sh1p[0]) shl shl2[0] = ald2[LL],r24 |
55 |
|
|
56 |
|
(or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things |
57 |
|
(or1p[0]) or or2[0] = shru2[SL],shl1[SL] |
58 |
|
|
59 |
|
(addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding |
60 |
|
|
61 |
|
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] // parallel average |
62 |
|
|
63 |
|
(stp[0]) st8 [r15] = avg[AVL] // store results |
64 |
|
(stp[0]) add r15 = r15,r16 |
65 |
|
|
66 |
|
|
67 |
|
|
68 |
|
|
69 |
|
br.ctop.sptk.few loop_interpolate |
70 |
;; |
;; |
71 |
adds r23 = 1, r25 |
mov ar.lc = r20 |
72 |
.L19: |
mov pr = r21,-1 |
|
add r18 = r25, r24 |
|
|
;; |
|
|
zxt4 r15 = r23 |
|
|
adds r21 = 1, r24 |
|
|
zxt4 r18 = r18 |
|
|
;; |
|
|
add r15 = r33, r15 |
|
|
adds r17 = 1, r23 |
|
|
;; |
|
|
ld1 r14 = [r15] |
|
|
add r16 = r33, r18 |
|
|
add r21 = r25, r21 |
|
|
;; |
|
|
ld1 r15 = [r16] |
|
|
zxt4 r21 = r21 |
|
|
add r18 = r32, r18 |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
zxt4 r17 = r17 |
|
|
add r16 = r33, r21 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
add r17 = r33, r17 |
|
|
adds r19 = 2, r24 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
adds r20 = 2, r23 |
|
|
add r19 = r25, r19 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
zxt4 r19 = r19 |
|
|
add r21 = r32, r21 |
|
|
;; |
|
|
st1 [r18] = r14 |
|
|
zxt4 r20 = r20 |
|
|
add r22 = r33, r19 |
|
|
ld1 r15 = [r16] |
|
|
ld1 r14 = [r17] |
|
|
;; |
|
|
add r20 = r33, r20 |
|
|
add r14 = r14, r15 |
|
|
adds r16 = 3, r24 |
|
|
adds r17 = 3, r23 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
add r16 = r25, r16 |
|
|
add r19 = r32, r19 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
zxt4 r16 = r16 |
|
|
zxt4 r17 = r17 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
add r18 = r33, r16 |
|
|
add r17 = r33, r17 |
|
|
;; |
|
|
st1 [r21] = r14 |
|
|
add r16 = r32, r16 |
|
|
adds r24 = 4, r24 |
|
|
ld1 r15 = [r22] |
|
|
ld1 r14 = [r20] |
|
|
adds r23 = 4, r23 |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
cmp4.geu p6, p7 = 7, r24 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
;; |
|
|
st1 [r19] = r14 |
|
|
ld1 r15 = [r18] |
|
|
ld1 r14 = [r17] |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
;; |
|
|
st1 [r16] = r14 |
|
|
(p6) br.cond.dptk .L19 |
|
|
adds r26 = 1, r26 |
|
|
add r25 = r25, r34 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r26 |
|
|
(p6) br.cond.dptk .L15 |
|
73 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
74 |
.endp interpolate8x8_halfpel_h_ia64# |
.endp interpolate8x8_halfpel_h_ia64# |
75 |
|
|
76 |
.align 16 |
.align 16 |
77 |
.global interpolate8x8_halfpel_v_ia64# |
.global interpolate8x8_halfpel_v_ia64# |
78 |
.proc interpolate8x8_halfpel_v_ia64# |
.proc interpolate8x8_halfpel_v_ia64# |
79 |
interpolate8x8_halfpel_v_ia64: |
interpolate8x8_halfpel_v_ia64: |
80 |
.prologue |
LL=3 |
81 |
.body |
SL=1 |
82 |
mov r26 = r0 |
SL2=1 |
83 |
mov r25 = r0 |
OL=1 |
84 |
.L26: |
OL2=1 |
85 |
mov r24 = r0 |
AVL=1 |
86 |
;; |
AL=1 |
87 |
add r23 = r25, r34 |
STL=3 |
88 |
.L30: |
|
89 |
add r18 = r25, r24 |
alloc r9=ar.pfs,4, 60,0,64 |
90 |
|
|
91 |
|
mov r20 = ar.lc |
92 |
|
mov r21 = pr |
93 |
|
|
94 |
|
dep.z r22 = r33,3,3 |
95 |
|
|
96 |
|
and r14 = -8,r33 |
97 |
|
mov r15 = r32 |
98 |
|
mov r16 = r34 |
99 |
|
sub r17 = 1,r35 |
100 |
|
;; |
101 |
|
|
102 |
|
add r18 = 8,r14 |
103 |
|
add r19 = r14,r16 // src + stride |
104 |
|
mux1 r17 = r17, @brcst |
105 |
|
|
106 |
|
sub r24 = 64,r22 |
107 |
|
;; |
108 |
|
add r26 = 8,r19 // src + stride + 8 |
109 |
|
|
110 |
|
mov ar.lc = 7 |
111 |
|
mov ar.ec = LL + SL +OL + AVL + AL + STL |
112 |
|
mov pr.rot = 1 << 16 |
113 |
|
|
114 |
|
;; |
115 |
|
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1] |
116 |
|
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL] |
117 |
|
|
118 |
|
|
119 |
|
loop_interpolate2: |
120 |
|
(aldp[0]) ld8 ald1[0] = [r14],r16 |
121 |
|
(aldp[0]) ld8 ald2[0] = [r18],r16 |
122 |
|
(aldp[0]) ld8 ald3[0] = [r19],r16 |
123 |
|
(aldp[0]) ld8 ald4[0] = [r26],r16 |
124 |
|
|
125 |
|
(sh1p[0]) shr.u shru1[0] = ald1[LL],r22 |
126 |
|
(sh1p[0]) shl shl1[0] = ald2[LL],r24 |
127 |
|
(sh1p[0]) shr.u shru2[0] = ald3[LL],r22 |
128 |
|
(sh1p[0]) shl shl2[0] = ald4[LL],r24 |
129 |
|
|
130 |
|
(or1p[0]) or or1[0] = shru1[SL],shl1[SL] |
131 |
|
(or1p[0]) or or2[0] = shru2[SL],shl2[SL] |
132 |
|
|
133 |
|
(addp[0]) padd1.uus add1[0] = or1[OL],r17 |
134 |
|
|
135 |
|
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] |
136 |
|
|
137 |
|
(stp[0]) st8 [r15] = avg[AVL] |
138 |
|
(stp[0]) add r15 = r15,r16 |
139 |
|
|
140 |
|
|
141 |
|
|
142 |
|
|
143 |
|
br.ctop.sptk.few loop_interpolate2 |
144 |
;; |
;; |
145 |
zxt4 r15 = r23 |
mov ar.lc = r20 |
146 |
adds r21 = 1, r24 |
mov pr = r21,-1 |
|
zxt4 r18 = r18 |
|
|
;; |
|
|
add r15 = r33, r15 |
|
|
adds r17 = 1, r23 |
|
|
;; |
|
|
ld1 r14 = [r15] |
|
|
add r16 = r33, r18 |
|
|
add r21 = r25, r21 |
|
|
;; |
|
|
ld1 r15 = [r16] |
|
|
zxt4 r21 = r21 |
|
|
add r18 = r32, r18 |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
zxt4 r17 = r17 |
|
|
add r16 = r33, r21 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
add r17 = r33, r17 |
|
|
adds r19 = 2, r24 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
adds r20 = 2, r23 |
|
|
add r19 = r25, r19 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
zxt4 r19 = r19 |
|
|
add r21 = r32, r21 |
|
|
;; |
|
|
st1 [r18] = r14 |
|
|
zxt4 r20 = r20 |
|
|
add r22 = r33, r19 |
|
|
ld1 r15 = [r16] |
|
|
ld1 r14 = [r17] |
|
|
;; |
|
|
add r20 = r33, r20 |
|
|
add r14 = r14, r15 |
|
|
adds r16 = 3, r24 |
|
|
adds r17 = 3, r23 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
add r16 = r25, r16 |
|
|
add r19 = r32, r19 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
zxt4 r16 = r16 |
|
|
zxt4 r17 = r17 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
add r18 = r33, r16 |
|
|
add r17 = r33, r17 |
|
|
;; |
|
|
st1 [r21] = r14 |
|
|
add r16 = r32, r16 |
|
|
adds r24 = 4, r24 |
|
|
ld1 r15 = [r22] |
|
|
ld1 r14 = [r20] |
|
|
adds r23 = 4, r23 |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
cmp4.geu p6, p7 = 7, r24 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
;; |
|
|
st1 [r19] = r14 |
|
|
ld1 r15 = [r18] |
|
|
ld1 r14 = [r17] |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
;; |
|
|
st1 [r16] = r14 |
|
|
(p6) br.cond.dptk .L30 |
|
|
adds r26 = 1, r26 |
|
|
add r25 = r25, r34 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r26 |
|
|
(p6) br.cond.dptk .L26 |
|
147 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
148 |
.endp interpolate8x8_halfpel_v_ia64# |
.endp interpolate8x8_halfpel_v_ia64# |
149 |
|
|
150 |
.align 16 |
.align 16 |
151 |
.global interpolate8x8_halfpel_hv_ia64# |
.global interpolate8x8_halfpel_hv_ia64# |
152 |
.proc interpolate8x8_halfpel_hv_ia64# |
.proc interpolate8x8_halfpel_hv_ia64# |
153 |
interpolate8x8_halfpel_hv_ia64: |
interpolate8x8_halfpel_hv_ia64: |
154 |
.prologue |
LL=3 |
155 |
.save ar.lc, r2 |
SL=1 |
156 |
mov r2 = ar.lc |
SL2=1 |
157 |
.body |
OL=1 |
158 |
mov r27 = r0 |
OL2=1 |
159 |
mov r26 = r0 |
AVL=1 |
160 |
;; |
AL=1 |
161 |
.L37: |
STL=3 |
162 |
add r14 = r26, r34 |
|
163 |
mov r25 = r0 |
alloc r9=ar.pfs,4, 60,0,64 |
164 |
adds r24 = 1, r26 |
|
165 |
;; |
mov r20 = ar.lc |
166 |
mov r23 = r14 |
mov r21 = pr |
167 |
adds r22 = 1, r14 |
|
168 |
addl r14 = 3, r0 |
dep.z r22 = r33,3,3 |
169 |
;; |
|
170 |
mov ar.lc = r14 |
and r14 = -8,r33 |
171 |
;; |
mov r15 = r32 |
172 |
.L70: |
mov r16 = r34 |
173 |
add r21 = r26, r25 |
sub r17 = 1,r35 |
174 |
zxt4 r15 = r24 |
;; |
175 |
zxt4 r16 = r23 |
|
176 |
;; |
add r18 = 8,r14 |
177 |
zxt4 r21 = r21 |
add r19 = r14,r16 |
178 |
add r15 = r33, r15 |
mux1 r17 = r17, @brcst |
179 |
add r16 = r33, r16 |
|
180 |
;; |
add r27 = 8,r22 |
181 |
add r19 = r33, r21 |
sub r28 = 56,r22 |
182 |
ld1 r17 = [r15] |
sub r24 = 64,r22 |
183 |
zxt4 r14 = r22 |
;; |
184 |
;; |
add r26 = 8,r19 |
185 |
ld1 r20 = [r19] |
|
186 |
ld1 r18 = [r16] |
mov ar.lc = 7 |
187 |
add r14 = r33, r14 |
mov ar.ec = LL + SL +OL + 2*AVL + AL + STL |
188 |
;; |
mov pr.rot = 1 << 16 |
189 |
add r17 = r17, r20 |
|
190 |
ld1 r15 = [r14] |
;; |
191 |
adds r19 = 1, r24 |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],add1[AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] |
192 |
;; |
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL],pavg1p[AVL],pavg2p[AVL],stp[STL] |
193 |
add r18 = r18, r17 |
|
194 |
adds r20 = 1, r25 |
|
195 |
adds r14 = 1, r23 |
loop_interpolate3: |
196 |
;; |
(aldp[0]) ld8 ald1[0] = [r14],r16 |
197 |
add r15 = r15, r18 |
(aldp[0]) ld8 ald2[0] = [r18],r16 |
198 |
add r20 = r26, r20 |
(aldp[0]) ld8 ald3[0] = [r19],r16 |
199 |
add r21 = r32, r21 |
(aldp[0]) ld8 ald4[0] = [r26],r16 |
200 |
;; |
|
201 |
sub r15 = r15, r35 |
(sh1p[0]) shr.u shru1[0] = ald1[LL],r22 |
202 |
zxt4 r20 = r20 |
(sh1p[0]) shl shl1[0] = ald2[LL],r24 |
203 |
zxt4 r19 = r19 |
(sh1p[0]) shr.u shru2[0] = ald3[LL],r22 |
204 |
;; |
(sh1p[0]) shl shl2[0] = ald4[LL],r24 |
205 |
adds r15 = 2, r15 |
(sh1p[0]) shr.u shru3[0] = ald1[LL],r27 |
206 |
add r17 = r33, r20 |
(sh1p[0]) shl shl3[0] = ald2[LL],r28 |
207 |
adds r16 = 1, r22 |
(sh1p[0]) shr.u shru4[0] = ald3[LL],r27 |
208 |
;; |
(sh1p[0]) shl shl4[0] = ald4[LL],r28 |
209 |
extr r15 = r15, 2, 16 |
|
210 |
add r19 = r33, r19 |
|
211 |
zxt4 r14 = r14 |
(or1p[0]) or or1[0] = shru1[SL],shl1[SL] |
212 |
;; |
(or1p[0]) or or2[0] = shru2[SL],shl2[SL] |
213 |
st1 [r21] = r15 |
(or1p[0]) or or3[0] = shru3[SL],shl3[SL] |
214 |
add r14 = r33, r14 |
(or1p[0]) or or4[0] = shru4[SL],shl4[SL] |
215 |
zxt4 r16 = r16 |
|
216 |
ld1 r18 = [r17] |
(addp[0]) padd1.uus add1[0] = or1[OL],r17 |
217 |
ld1 r15 = [r19] |
|
218 |
;; |
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] |
219 |
add r16 = r33, r16 |
(pavg1p[0]) pavg1 avg1[0] = or3[OL+AL],or4[OL+AL] |
220 |
ld1 r17 = [r14] |
|
221 |
add r15 = r15, r18 |
(pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] |
222 |
add r20 = r32, r20 |
|
223 |
;; |
(stp[0]) st8 [r15] = avg2[AVL] |
224 |
ld1 r14 = [r16] |
(stp[0]) add r15 = r15,r16 |
225 |
add r17 = r17, r15 |
|
226 |
adds r22 = 2, r22 |
|
227 |
;; |
|
228 |
add r14 = r14, r17 |
|
229 |
adds r23 = 2, r23 |
br.ctop.sptk.few loop_interpolate3 |
230 |
adds r24 = 2, r24 |
;; |
231 |
;; |
mov ar.lc = r20 |
232 |
sub r14 = r14, r35 |
mov pr = r21,-1 |
|
adds r25 = 2, r25 |
|
|
;; |
|
|
adds r14 = 2, r14 |
|
|
;; |
|
|
extr r14 = r14, 2, 16 |
|
|
;; |
|
|
st1 [r20] = r14 |
|
|
br.cloop.sptk.few .L70 |
|
|
adds r27 = 1, r27 |
|
|
add r26 = r26, r34 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r27 |
|
|
(p6) br.cond.dptk .L37 |
|
|
mov ar.lc = r2 |
|
233 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
234 |
.endp interpolate8x8_halfpel_hv_ia64# |
.endp interpolate8x8_halfpel_hv_ia64# |
235 |
.ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" |
|
236 |
|
|