1 |
.file "interpolate8x8.c" |
|
2 |
|
.file "interpolate8x8_ia64.s" |
3 |
.pred.safe_across_calls p1-p5,p16-p63 |
.pred.safe_across_calls p1-p5,p16-p63 |
|
.common interpolate8x8_halfpel_h#,8,8 |
|
|
.common interpolate8x8_halfpel_v#,8,8 |
|
|
.common interpolate8x8_halfpel_hv#,8,8 |
|
4 |
.text |
.text |
5 |
.align 16 |
.align 16 |
6 |
.global interpolate8x8_halfpel_h_ia64# |
.global interpolate8x8_halfpel_h_ia64# |
7 |
.proc interpolate8x8_halfpel_h_ia64# |
.proc interpolate8x8_halfpel_h_ia64# |
8 |
interpolate8x8_halfpel_h_ia64: |
interpolate8x8_halfpel_h_ia64: |
9 |
.prologue |
LL=3 |
10 |
.body |
SL=1 |
11 |
mov r26 = r0 |
SL2=1 |
12 |
mov r25 = r0 |
OL=1 |
13 |
.L15: |
OL2=1 |
14 |
mov r24 = r0 |
AVL=1 |
15 |
|
AL=1 |
16 |
|
STL=3 |
17 |
|
|
18 |
|
alloc r9=ar.pfs,4, 60,0,64 |
19 |
|
|
20 |
|
mov r20 = ar.lc |
21 |
|
mov r21 = pr |
22 |
|
|
23 |
|
dep.z r22 = r33,3,3 // rshift of src |
24 |
|
|
25 |
|
and r14 = -8,r33 // align src |
26 |
|
mov r15 = r32 // get dest |
27 |
|
mov r16 = r34 // stride |
28 |
|
// sub r17 = 0,r0 // 1-rounding |
29 |
|
|
30 |
|
;; |
31 |
|
|
32 |
|
add r18 = 8,r14 |
33 |
|
// mux1 r17 = r17, @brcst // broadcast 1-rounding |
34 |
|
|
35 |
|
sub r24 = 64,r22 // lshift of src |
36 |
|
add r26 = 8,r22 // rshift of src+1 |
37 |
|
sub r27 = 56,r22 // lshift of src+1 |
38 |
|
|
39 |
|
mov ar.lc = 7 // loopcounter |
40 |
|
mov ar.ec = LL + SL +OL + AVL + STL // sum of latencies |
41 |
|
mov pr.rot = 1 << 16 // init pr regs for sw-pipeling |
42 |
|
|
43 |
|
;; |
44 |
|
.rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] |
45 |
|
.rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] |
46 |
|
|
47 |
|
|
48 |
|
.Lloop_interpolate: |
49 |
|
(aldp[0]) ld8 ald1[0] = [r14],r16 // load aligned src |
50 |
|
(aldp[0]) ld8 ald2[0] = [r18],r16 // and aligned src+8 |
51 |
|
|
52 |
|
(sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // get src |
53 |
|
(sh1p[0]) shl shl1[0] = ald2[LL],r27 |
54 |
|
(sh1p[0]) shr.u shru2[0] = ald1[LL],r26 // get src+1 |
55 |
|
(sh1p[0]) shl shl2[0] = ald2[LL],r24 |
56 |
|
|
57 |
|
(or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things |
58 |
|
(or1p[0]) or or2[0] = shru2[SL],shl1[SL] |
59 |
|
|
60 |
|
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding |
61 |
|
|
62 |
|
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // parallel average |
63 |
|
|
64 |
|
(stp[0]) st8 [r15] = avg[AVL] // store results |
65 |
|
(stp[0]) add r15 = r15,r16 |
66 |
|
|
67 |
|
|
68 |
|
|
69 |
|
|
70 |
|
br.ctop.sptk.few .Lloop_interpolate |
71 |
;; |
;; |
72 |
adds r23 = 1, r25 |
mov ar.lc = r20 |
73 |
.L19: |
mov pr = r21,-1 |
|
add r18 = r25, r24 |
|
|
;; |
|
|
zxt4 r15 = r23 |
|
|
adds r21 = 1, r24 |
|
|
zxt4 r18 = r18 |
|
|
;; |
|
|
add r15 = r33, r15 |
|
|
adds r17 = 1, r23 |
|
|
;; |
|
|
ld1 r14 = [r15] |
|
|
add r16 = r33, r18 |
|
|
add r21 = r25, r21 |
|
|
;; |
|
|
ld1 r15 = [r16] |
|
|
zxt4 r21 = r21 |
|
|
add r18 = r32, r18 |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
zxt4 r17 = r17 |
|
|
add r16 = r33, r21 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
add r17 = r33, r17 |
|
|
adds r19 = 2, r24 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
adds r20 = 2, r23 |
|
|
add r19 = r25, r19 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
zxt4 r19 = r19 |
|
|
add r21 = r32, r21 |
|
|
;; |
|
|
st1 [r18] = r14 |
|
|
zxt4 r20 = r20 |
|
|
add r22 = r33, r19 |
|
|
ld1 r15 = [r16] |
|
|
ld1 r14 = [r17] |
|
|
;; |
|
|
add r20 = r33, r20 |
|
|
add r14 = r14, r15 |
|
|
adds r16 = 3, r24 |
|
|
adds r17 = 3, r23 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
add r16 = r25, r16 |
|
|
add r19 = r32, r19 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
zxt4 r16 = r16 |
|
|
zxt4 r17 = r17 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
add r18 = r33, r16 |
|
|
add r17 = r33, r17 |
|
|
;; |
|
|
st1 [r21] = r14 |
|
|
add r16 = r32, r16 |
|
|
adds r24 = 4, r24 |
|
|
ld1 r15 = [r22] |
|
|
ld1 r14 = [r20] |
|
|
adds r23 = 4, r23 |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
cmp4.geu p6, p7 = 7, r24 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
;; |
|
|
st1 [r19] = r14 |
|
|
ld1 r15 = [r18] |
|
|
ld1 r14 = [r17] |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
;; |
|
|
st1 [r16] = r14 |
|
|
(p6) br.cond.dptk .L19 |
|
|
adds r26 = 1, r26 |
|
|
add r25 = r25, r34 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r26 |
|
|
(p6) br.cond.dptk .L15 |
|
74 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
75 |
.endp interpolate8x8_halfpel_h_ia64# |
.endp interpolate8x8_halfpel_h_ia64# |
76 |
|
|
77 |
.align 16 |
.align 16 |
78 |
.global interpolate8x8_halfpel_v_ia64# |
.global interpolate8x8_halfpel_v_ia64# |
79 |
.proc interpolate8x8_halfpel_v_ia64# |
.proc interpolate8x8_halfpel_v_ia64# |
80 |
interpolate8x8_halfpel_v_ia64: |
interpolate8x8_halfpel_v_ia64: |
81 |
.prologue |
LL=3 |
82 |
.body |
SL=1 |
83 |
mov r26 = r0 |
SL2=1 |
84 |
mov r25 = r0 |
OL=1 |
85 |
.L26: |
OL2=1 |
86 |
mov r24 = r0 |
AVL=1 |
87 |
;; |
AL=1 |
88 |
add r23 = r25, r34 |
STL=3 |
89 |
.L30: |
|
90 |
add r18 = r25, r24 |
alloc r9=ar.pfs,4, 60,0,64 |
91 |
|
|
92 |
|
mov r20 = ar.lc |
93 |
|
mov r21 = pr |
94 |
|
|
95 |
|
dep.z r22 = r33,3,3 |
96 |
|
|
97 |
|
and r14 = -8,r33 |
98 |
|
mov r15 = r32 |
99 |
|
mov r16 = r34 |
100 |
|
// sub r17 = 0,r0 |
101 |
|
;; |
102 |
|
|
103 |
|
add r18 = 8,r14 |
104 |
|
add r19 = r14,r16 // src + stride |
105 |
|
// mux1 r17 = r17, @brcst |
106 |
|
|
107 |
|
sub r24 = 64,r22 |
108 |
|
;; |
109 |
|
add r26 = 8,r19 // src + stride + 8 |
110 |
|
|
111 |
|
mov ar.lc = 7 |
112 |
|
mov ar.ec = LL + SL +OL + AVL + STL |
113 |
|
mov pr.rot = 1 << 16 |
114 |
|
|
115 |
|
;; |
116 |
|
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] |
117 |
|
.rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] |
118 |
|
|
119 |
|
|
120 |
|
.Lloop_interpolate2: |
121 |
|
(aldp[0]) ld8 ald1[0] = [r14],r16 |
122 |
|
(aldp[0]) ld8 ald2[0] = [r18],r16 |
123 |
|
(aldp[0]) ld8 ald3[0] = [r19],r16 |
124 |
|
(aldp[0]) ld8 ald4[0] = [r26],r16 |
125 |
|
|
126 |
|
(sh1p[0]) shr.u shru1[0] = ald1[LL],r22 |
127 |
|
(sh1p[0]) shl shl1[0] = ald2[LL],r24 |
128 |
|
(sh1p[0]) shr.u shru2[0] = ald3[LL],r22 |
129 |
|
(sh1p[0]) shl shl2[0] = ald4[LL],r24 |
130 |
|
|
131 |
|
(or1p[0]) or or1[0] = shru1[SL],shl1[SL] |
132 |
|
(or1p[0]) or or2[0] = shru2[SL],shl2[SL] |
133 |
|
|
134 |
|
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 |
135 |
|
|
136 |
|
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] |
137 |
|
|
138 |
|
(stp[0]) st8 [r15] = avg[AVL] |
139 |
|
(stp[0]) add r15 = r15,r16 |
140 |
|
|
141 |
|
|
142 |
|
|
143 |
|
|
144 |
|
br.ctop.sptk.few .Lloop_interpolate2 |
145 |
;; |
;; |
146 |
zxt4 r15 = r23 |
mov ar.lc = r20 |
147 |
adds r21 = 1, r24 |
mov pr = r21,-1 |
|
zxt4 r18 = r18 |
|
|
;; |
|
|
add r15 = r33, r15 |
|
|
adds r17 = 1, r23 |
|
|
;; |
|
|
ld1 r14 = [r15] |
|
|
add r16 = r33, r18 |
|
|
add r21 = r25, r21 |
|
|
;; |
|
|
ld1 r15 = [r16] |
|
|
zxt4 r21 = r21 |
|
|
add r18 = r32, r18 |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
zxt4 r17 = r17 |
|
|
add r16 = r33, r21 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
add r17 = r33, r17 |
|
|
adds r19 = 2, r24 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
adds r20 = 2, r23 |
|
|
add r19 = r25, r19 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
zxt4 r19 = r19 |
|
|
add r21 = r32, r21 |
|
|
;; |
|
|
st1 [r18] = r14 |
|
|
zxt4 r20 = r20 |
|
|
add r22 = r33, r19 |
|
|
ld1 r15 = [r16] |
|
|
ld1 r14 = [r17] |
|
|
;; |
|
|
add r20 = r33, r20 |
|
|
add r14 = r14, r15 |
|
|
adds r16 = 3, r24 |
|
|
adds r17 = 3, r23 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
add r16 = r25, r16 |
|
|
add r19 = r32, r19 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
zxt4 r16 = r16 |
|
|
zxt4 r17 = r17 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
add r18 = r33, r16 |
|
|
add r17 = r33, r17 |
|
|
;; |
|
|
st1 [r21] = r14 |
|
|
add r16 = r32, r16 |
|
|
adds r24 = 4, r24 |
|
|
ld1 r15 = [r22] |
|
|
ld1 r14 = [r20] |
|
|
adds r23 = 4, r23 |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
cmp4.geu p6, p7 = 7, r24 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
;; |
|
|
st1 [r19] = r14 |
|
|
ld1 r15 = [r18] |
|
|
ld1 r14 = [r17] |
|
|
;; |
|
|
add r14 = r14, r15 |
|
|
;; |
|
|
sub r14 = r14, r35 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
;; |
|
|
extr r14 = r14, 1, 16 |
|
|
;; |
|
|
st1 [r16] = r14 |
|
|
(p6) br.cond.dptk .L30 |
|
|
adds r26 = 1, r26 |
|
|
add r25 = r25, r34 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r26 |
|
|
(p6) br.cond.dptk .L26 |
|
148 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
149 |
.endp interpolate8x8_halfpel_v_ia64# |
.endp interpolate8x8_halfpel_v_ia64# |
150 |
|
|
151 |
.align 16 |
.align 16 |
152 |
.global interpolate8x8_halfpel_hv_ia64# |
.global interpolate8x8_halfpel_hv_ia64# |
153 |
.proc interpolate8x8_halfpel_hv_ia64# |
.proc interpolate8x8_halfpel_hv_ia64# |
154 |
interpolate8x8_halfpel_hv_ia64: |
interpolate8x8_halfpel_hv_ia64: |
155 |
.prologue |
LL=3 |
156 |
.save ar.lc, r2 |
SL=1 |
157 |
mov r2 = ar.lc |
SL2=1 |
158 |
.body |
OL=1 |
159 |
mov r27 = r0 |
OL2=1 |
160 |
mov r26 = r0 |
AVL=1 |
161 |
;; |
AL=1 |
162 |
.L37: |
STL=3 |
163 |
add r14 = r26, r34 |
|
164 |
mov r25 = r0 |
alloc r9=ar.pfs,4, 60,0,64 |
165 |
adds r24 = 1, r26 |
|
166 |
;; |
mov r20 = ar.lc |
167 |
mov r23 = r14 |
mov r21 = pr |
168 |
adds r22 = 1, r14 |
|
169 |
addl r14 = 3, r0 |
dep.z r22 = r33,3,3 |
170 |
;; |
|
171 |
mov ar.lc = r14 |
and r14 = -8,r33 |
172 |
;; |
mov r15 = r32 |
173 |
.L70: |
mov r16 = r34 |
174 |
add r21 = r26, r25 |
// sub r17 = 0,r0 |
175 |
zxt4 r15 = r24 |
;; |
176 |
zxt4 r16 = r23 |
|
177 |
;; |
add r18 = 8,r14 |
178 |
zxt4 r21 = r21 |
add r19 = r14,r16 |
179 |
add r15 = r33, r15 |
// mux1 r17 = r17, @brcst |
180 |
add r16 = r33, r16 |
|
181 |
;; |
add r27 = 8,r22 |
182 |
add r19 = r33, r21 |
sub r28 = 56,r22 |
183 |
ld1 r17 = [r15] |
sub r24 = 64,r22 |
184 |
zxt4 r14 = r22 |
;; |
185 |
;; |
add r26 = 8,r19 |
186 |
ld1 r20 = [r19] |
|
187 |
ld1 r18 = [r16] |
mov ar.lc = 7 |
188 |
add r14 = r33, r14 |
mov ar.ec = LL + SL +OL + 2*AVL + STL |
189 |
;; |
mov pr.rot = 1 << 16 |
190 |
add r17 = r17, r20 |
|
191 |
ld1 r15 = [r14] |
;; |
192 |
adds r19 = 1, r24 |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] |
193 |
;; |
.rotp aldp[LL], sh1p[SL], or1p[OL],pavg1p[AVL],pavg2p[AVL],stp[STL] |
194 |
add r18 = r18, r17 |
|
195 |
adds r20 = 1, r25 |
|
196 |
adds r14 = 1, r23 |
.Lloop_interpolate3: |
197 |
;; |
(aldp[0]) ld8 ald1[0] = [r14],r16 |
198 |
add r15 = r15, r18 |
(aldp[0]) ld8 ald2[0] = [r18],r16 |
199 |
add r20 = r26, r20 |
(aldp[0]) ld8 ald3[0] = [r19],r16 |
200 |
add r21 = r32, r21 |
(aldp[0]) ld8 ald4[0] = [r26],r16 |
201 |
;; |
|
202 |
sub r15 = r15, r35 |
(sh1p[0]) shr.u shru1[0] = ald1[LL],r22 |
203 |
zxt4 r20 = r20 |
(sh1p[0]) shl shl1[0] = ald2[LL],r24 |
204 |
zxt4 r19 = r19 |
(sh1p[0]) shr.u shru2[0] = ald3[LL],r22 |
205 |
;; |
(sh1p[0]) shl shl2[0] = ald4[LL],r24 |
206 |
adds r15 = 2, r15 |
(sh1p[0]) shr.u shru3[0] = ald1[LL],r27 |
207 |
add r17 = r33, r20 |
(sh1p[0]) shl shl3[0] = ald2[LL],r28 |
208 |
adds r16 = 1, r22 |
(sh1p[0]) shr.u shru4[0] = ald3[LL],r27 |
209 |
;; |
(sh1p[0]) shl shl4[0] = ald4[LL],r28 |
210 |
extr r15 = r15, 2, 16 |
|
211 |
add r19 = r33, r19 |
|
212 |
zxt4 r14 = r14 |
(or1p[0]) or or1[0] = shru1[SL],shl1[SL] |
213 |
;; |
(or1p[0]) or or2[0] = shru2[SL],shl2[SL] |
214 |
st1 [r21] = r15 |
(or1p[0]) or or3[0] = shru3[SL],shl3[SL] |
215 |
add r14 = r33, r14 |
(or1p[0]) or or4[0] = shru4[SL],shl4[SL] |
216 |
zxt4 r16 = r16 |
|
217 |
ld1 r18 = [r17] |
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 |
218 |
ld1 r15 = [r19] |
|
219 |
;; |
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] |
220 |
add r16 = r33, r16 |
(pavg1p[0]) pavg1 avg1[0] = or3[OL],or4[OL] |
221 |
ld1 r17 = [r14] |
|
222 |
add r15 = r15, r18 |
(pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] |
223 |
add r20 = r32, r20 |
|
224 |
;; |
(stp[0]) st8 [r15] = avg2[AVL] |
225 |
ld1 r14 = [r16] |
(stp[0]) add r15 = r15,r16 |
226 |
add r17 = r17, r15 |
|
227 |
adds r22 = 2, r22 |
|
228 |
;; |
|
229 |
add r14 = r14, r17 |
|
230 |
adds r23 = 2, r23 |
br.ctop.sptk.few .Lloop_interpolate3 |
231 |
adds r24 = 2, r24 |
;; |
232 |
;; |
mov ar.lc = r20 |
233 |
sub r14 = r14, r35 |
mov pr = r21,-1 |
|
adds r25 = 2, r25 |
|
|
;; |
|
|
adds r14 = 2, r14 |
|
|
;; |
|
|
extr r14 = r14, 2, 16 |
|
|
;; |
|
|
st1 [r20] = r14 |
|
|
br.cloop.sptk.few .L70 |
|
|
adds r27 = 1, r27 |
|
|
add r26 = r26, r34 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r27 |
|
|
(p6) br.cond.dptk .L37 |
|
|
mov ar.lc = r2 |
|
234 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
235 |
.endp interpolate8x8_halfpel_hv_ia64# |
.endp interpolate8x8_halfpel_hv_ia64# |
236 |
.ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" |
|
237 |
|
|