1 |
.file "mem_transfer.c" |
/////////////////////////////////////////////////////////////////////////////// |
2 |
.pred.safe_across_calls p1-p5,p16-p63 |
// |
3 |
.common transfer_8to16copy#,8,8 |
// mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, |
4 |
|
// University of Karlsruhe, Germany, 03.06.2002, during the laboratory |
5 |
|
// "IA-64 Video Codec Assembler Praktikum" at IPD Goos. |
6 |
|
// |
7 |
|
// |
8 |
|
///// legal header taken from original C-file /////////////////////////////////////// |
9 |
|
// |
10 |
|
// XVID MPEG-4 VIDEO CODEC |
11 |
|
// - 8bit<->16bit transfer - |
12 |
|
// |
13 |
|
// This program is an implementation of a part of one or more MPEG-4 |
14 |
|
// Video tools as specified in ISO/IEC 14496-2 standard. Those intending |
15 |
|
// to use this software module in hardware or software products are |
16 |
|
// advised that its use may infringe existing patents or copyrights, and |
17 |
|
// any such use would be at such party's own risk. The original |
18 |
|
// developer of this software module and his/her company, and subsequent |
19 |
|
// editors and their companies, will have no liability for use of this |
20 |
|
// software or modifications or derivatives thereof. |
21 |
|
// |
22 |
|
// This program is free software ; you can redistribute it and/or modify |
23 |
|
// it under the terms of the GNU General Public License as published by |
24 |
|
// the Free Software Foundation ; either version 2 of the License, or |
25 |
|
// (at your option) any later version. |
26 |
|
// |
27 |
|
// This program is distributed in the hope that it will be useful, |
28 |
|
// but WITHOUT ANY WARRANTY ; without even the implied warranty of |
29 |
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
30 |
|
// GNU General Public License for more details. |
31 |
|
// |
32 |
|
// You should have received a copy of the GNU General Public License |
33 |
|
// along with this program ; if not, write to the Free Software |
34 |
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
35 |
|
// |
36 |
|
///// History ///////////////////////////////////////////////////////////////// |
37 |
|
// |
38 |
|
// - 16.07.2002: several minor changes for ecc-conformity |
39 |
|
// - 03.06.2002: initial version |
40 |
|
// |
41 |
|
/////////////////////////////////////////////////////////////////////////////// |
42 |
|
// |
43 |
|
// Annotations: |
44 |
|
// =========== |
45 |
|
// |
46 |
|
// - All functions work on 8x8-matrices. While the C-code-functions treat each |
47 |
|
// element separately, the functions in this assembler-code treat a whole line |
48 |
|
// simultaneously. So one loop is saved. |
49 |
|
// The remaining loop is realized by using software pipelining with rotating |
50 |
|
// registers. |
51 |
|
// - Register renaming is used for better readability |
52 |
|
// - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both |
53 |
|
// parts are shifted and joined together with an "OR"-Instruction. |
54 |
|
// - First parameter is stored in GR 32, next in GR 33, and so on. They must be |
55 |
|
// saved, as these GRs are used for register-rotation. |
56 |
|
// - Some of the original German comments used during development are left in |
57 |
|
// the code. They shouldn't bother anyone. |
58 |
|
// |
59 |
|
// Anmerkungen: |
60 |
|
// ============ |
61 |
|
// |
62 |
|
// - Alle Funtionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code |
63 |
|
// jedes Element einzeln bearbeiten, bearbeiten die Funtionen dieses Assembler- |
64 |
|
// Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. |
65 |
|
// Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit |
66 |
|
// rotierenden Registern realisiert. |
67 |
|
// - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. |
68 |
|
// - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke |
69 |
|
// geladen, beide Teile mit "shift"-Operationen zurechterückt und mit einem |
70 |
|
// logischen Oder zusammenkopiert. |
71 |
|
// - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- |
72 |
|
// sichert werden, da die Register für die register-Rotation benötigt werden. |
73 |
|
// - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase |
74 |
|
// sind im Code verblieben. Sie sollten niemanden stören. |
75 |
|
// |
76 |
|
/////////////////////////////////////////////////////////////////////////////// |
77 |
|
|
78 |
|
|
79 |
|
// *** define latencies for software pipelines *** |
80 |
|
|
81 |
|
LL = 3 // Load: stages reserved for a load; a loaded value is read as name[LL] |
82 |
|
SL = 3 // Store |
83 |
|
PL = 1 // Pack: stages reserved for pack2 |
84 |
|
SHL = 1 // Shift: stages reserved for shl / shr.u |
85 |
|
OL = 1 // Or: stages reserved for or |
86 |
|
UL = 1 // Unpack: stages reserved for unpack1 |
87 |
|
PAL = 1 // Parallel Add: stages reserved for padd2 |
88 |
|
PSL = 1 // Parallel Subtract: stages reserved for psub2 |
89 |
|
PAVGL = 1 // Parallel Average |
90 |
|
|
91 |
.text |
.text |
92 |
|
|
93 |
|
|
94 |
|
/////////////////////////////////////////////////////////////////////////////// |
95 |
|
// |
96 |
|
// transfer8x8_copy_ia64 |
97 |
|
// |
98 |
|
// SRC is misaligned. To align the source, load two 8-byte words, shift them, |
99 |
|
// join them and store the aligned source into the destination address. |
100 |
|
// |
101 |
|
/////////////////////////////////////////////////////////////////////////////// |
102 |
|
|
103 |
|
.align 16 |
104 |
|
.global transfer8x8_copy_ia64# |
105 |
|
.proc transfer8x8_copy_ia64# |
106 |
|
|
107 |
|
transfer8x8_copy_ia64: |
108 |
|
.prologue |
109 |
|
|
110 |
|
// Copies 8 rows of 8 bytes from src (any byte alignment) to dst. |
// In: r32 = dst, r33 = src, r34 = stride (added to the src and dst addresses after every row). |
// Uses r2 (saved ar.lc), r3 (saved pr), r9 (ar.pfs from alloc) and r14-r19. The inputs are copied |
// out of r32-r34 first because the rotating registers used by the loop start at r32. |
// *** register renaming *** |
111 |
|
zero = r0 |
112 |
|
|
113 |
|
oldLC = r2 |
114 |
|
oldPR = r3 |
115 |
|
|
116 |
|
src_1 = r14 // address of the aligned 8-byte block that contains the first src byte (src & -8) |
117 |
|
src_2 = r15 // address of the following aligned 8-byte block (src_1 + 8) |
118 |
|
dst = r16 // destination address |
119 |
|
stride = r17 |
120 |
|
|
121 |
|
offset = r18 // shift-right count in bits: (src & 7) << 3 |
122 |
|
aoffset = r19 // shift-left count in bits: 64 - offset |
123 |
|
|
124 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
125 |
|
.save ar.lc, oldLC |
126 |
|
mov oldLC = ar.lc |
127 |
|
mov oldPR = pr |
128 |
|
|
129 |
|
.body |
130 |
|
|
131 |
|
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
132 |
|
alloc r9 = ar.pfs, 3, 29, 0, 32 |
133 |
|
|
134 |
|
// *** Saving Parameters *** |
135 |
|
mov dst = r32 |
136 |
|
mov stride = r34 |
137 |
|
|
138 |
|
// *** Misalignment treatment *** |
139 |
|
and src_1 = -8, r33 // Computing address of first aligned block containing src-values |
140 |
|
dep offset = r33, zero, 3, 3 // offset = (src & 7) << 3: the bit count for shr.u, taken from the low 3 bits of the src address |
141 |
|
;; |
142 |
|
sub aoffset = 64, offset // Computing counterpart of offset ("anti-offset"), used for shl |
143 |
|
add src_2 = 8, src_1 // Computing address of second aligned block containing src-values |
144 |
|
|
145 |
|
// *** init loop: set loop counter, epilog counter, predicates *** |
146 |
|
mov ar.lc = 7 |
147 |
|
mov ar.ec = LL + SHL + OL + 1 |
148 |
|
mov pr.rot = 1 << 16 |
149 |
|
;; |
150 |
|
|
151 |
|
// *** define register arrays and predicate array for software pipeline *** |
152 |
|
// src_v1 = source value 1, shd_r = shifted right, shd_l = shifted left |
153 |
|
.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] |
154 |
|
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] |
155 |
|
|
156 |
|
|
157 |
|
// Software pipelined loop: |
158 |
|
// Stage 1: Load two 8-byte words from SRC_1 and SRC_2 into SRC_v1 and SRC_v2, advancing both addresses by stride |
159 |
|
// Stage 2: Shift both values of source to SHD_R and SHD_L |
160 |
|
// Stage 3: Join both parts together with OR |
161 |
|
// Stage 4: Store aligned data to destination and add stride to destination address |
162 |
|
|
163 |
|
|
164 |
|
.Loop_8x8copy: |
165 |
|
{.mii |
166 |
|
(ld_stage[0]) ld8 src_v1[0] = [src_1], stride |
167 |
|
(sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset |
168 |
|
} |
169 |
|
{.mii |
170 |
|
(ld_stage[0]) ld8 src_v2[0] = [src_2], stride |
171 |
|
(sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset |
172 |
|
(or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL] |
173 |
|
} |
174 |
|
{.mib |
175 |
|
(st_stage[0]) st8 [dst] = value[OL] |
176 |
|
(st_stage[0]) add dst = dst, stride |
177 |
|
br.ctop.sptk.few .Loop_8x8copy |
178 |
|
;; |
179 |
|
} |
180 |
|
|
181 |
|
// *** Restore old LC and PRs *** |
182 |
|
mov ar.lc = oldLC |
183 |
|
mov pr = oldPR, -1 |
184 |
|
|
185 |
|
br.ret.sptk.many b0 |
186 |
|
|
187 |
|
.endp transfer8x8_copy_ia64# |
188 |
|
|
189 |
|
|
190 |
|
|
191 |
|
|
192 |
|
/////////////////////////////////////////////////////////////////////////////// |
193 |
|
// |
194 |
|
// transfer_8to16copy_ia64 |
195 |
|
// |
196 |
|
// SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, |
197 |
|
// UNPACK is used. So 8 bytes are loaded from source, unpacked to two |
198 |
|
// 4 x 16 bit values and stored to the destination. Destination is a continuous |
199 |
|
// array of 64 x 16 bit signed data. To store the next line, only 16 must be |
200 |
|
// added to the destination address. |
201 |
|
/////////////////////////////////////////////////////////////////////////////// |
202 |
|
|
203 |
// NOTE(review): this span appears to interleave two implementations line by line, separated by bare line numbers: |
// an unrolled ld1/st2 byte loop (.L101) and the software-pipelined .Loop_8to16copy. Untangle before assembling. |
.align 16 |
.align 16 |
204 |
.global transfer_8to16copy_ia64# |
.global transfer_8to16copy_ia64# |
205 |
.proc transfer_8to16copy_ia64# |
.proc transfer_8to16copy_ia64# |
206 |
|
|
207 |
|
|
208 |
transfer_8to16copy_ia64: |
transfer_8to16copy_ia64: |
209 |
.prologue |
.prologue |
210 |
.save ar.lc, r2 |
|
211 |
mov r2 = ar.lc |
// *** register renaming *** |
212 |
|
oldLC = r2 |
213 |
|
oldPR = r3 |
214 |
|
|
215 |
|
zero = r0 // "zero" denotes the constant 0 |
216 |
|
|
217 |
|
dst_1 = r14 // destination address for first 4 x 16 bit values |
218 |
|
dst_2 = r15 // destination address for second 4 x 16 bit values |
219 |
|
src = r16 |
220 |
|
stride = r17 |
221 |
|
|
222 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
223 |
|
.save ar.lc, oldLC |
224 |
|
mov oldLC = ar.lc |
225 |
|
mov oldPR = pr |
226 |
|
|
227 |
|
|
228 |
.body |
.body |
229 |
addl r14 = 7, r0 |
|
230 |
mov r21 = r0 |
// *** Allocating new stackframe, define rotating registers *** |
231 |
mov r20 = r0 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
232 |
;; |
|
233 |
mov ar.lc = r14 |
// *** Saving Parameters *** |
234 |
;; |
mov dst_1 = r32 // first 4 x 16 bit values |
235 |
.L101: |
add dst_2 = 8, r32 // second 4 x 16 bit values |
236 |
addl r19 = 1, r0 |
mov src = r33 |
237 |
zxt4 r14 = r21 |
mov stride = r34 |
238 |
dep.z r15 = r20, 1, 32 |
|
239 |
;; |
// *** init loop: set loop counter, epilog counter, predicates *** |
240 |
add r16 = r21, r19 |
mov ar.lc = 7 |
241 |
add r14 = r33, r14 |
mov ar.ec = LL + UL + 1 |
242 |
add r17 = r20, r19 |
mov pr.rot = 1 << 16 |
243 |
;; |
;; |
244 |
ld1 r18 = [r14] |
|
245 |
add r15 = r15, r32 |
// *** define register arrays and predicate array for software pipeline *** |
246 |
zxt4 r16 = r16 |
// src_v = source value, dst_v1 = destination value 1 |
247 |
;; |
.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] |
248 |
st2 [r15] = r18 |
.rotp ld_stage[LL], upack_stage[UL], st_stage[1] |
249 |
addl r19 = 2, r0 |
|
250 |
add r16 = r33, r16 |
|
251 |
dep.z r17 = r17, 1, 32 |
// Software pipelined loop: |
252 |
;; |
// Stage 1: Load value of SRC |
253 |
ld1 r15 = [r16] |
// Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data |
254 |
add r14 = r21, r19 |
// Stage 3: Store both 8 byte of 16 bit data |
255 |
add r18 = r20, r19 |
|
256 |
add r17 = r17, r32 |
|
257 |
;; |
.Loop_8to16copy: |
258 |
zxt4 r14 = r14 |
{.mii |
259 |
st2 [r17] = r15 |
(ld_stage[0]) ld8 src_v[0] = [src], stride |
260 |
addl r19 = 3, r0 |
(upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL] |
261 |
;; |
(upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL] |
262 |
add r14 = r33, r14 |
} |
263 |
add r15 = r21, r19 |
{.mmb |
264 |
dep.z r18 = r18, 1, 32 |
(st_stage[0]) st8 [dst_1] = dst_v1[UL], 16 |
265 |
;; |
(st_stage[0]) st8 [dst_2] = dst_v2[UL], 16 |
266 |
ld1 r17 = [r14] |
br.ctop.sptk.few .Loop_8to16copy |
267 |
add r16 = r20, r19 |
;; |
268 |
add r18 = r18, r32 |
} |
269 |
zxt4 r15 = r15 |
|
270 |
;; |
// *** Restore old LC and PRs *** |
271 |
st2 [r18] = r17 |
mov ar.lc = oldLC |
272 |
addl r19 = 4, r0 |
mov pr = oldPR, -1 |
273 |
add r15 = r33, r15 |
|

dep.z r16 = r16, 1, 32 |
|

;; |
|

ld1 r18 = [r15] |
|

add r14 = r21, r19 |
|

add r17 = r20, r19 |
|

add r16 = r16, r32 |
|

;; |
|

zxt4 r14 = r14 |
|

st2 [r16] = r18 |
|

addl r19 = 5, r0 |
|

;; |
|

add r14 = r33, r14 |
|

add r15 = r21, r19 |
|

add r16 = r20, r19 |
|

dep.z r17 = r17, 1, 32 |
|

;; |
|

ld1 r18 = [r14] |
|

addl r19 = 6, r0 |
|

add r17 = r17, r32 |
|

zxt4 r15 = r15 |
|

;; |
|

st2 [r17] = r18 |
|

add r14 = r21, r19 |
|

add r15 = r33, r15 |
|

dep.z r16 = r16, 1, 32 |
|

add r17 = r20, r19 |
|

;; |
|

ld1 r18 = [r15] |
|

add r16 = r16, r32 |
|

zxt4 r14 = r14 |
|

;; |
|

st2 [r16] = r18 |
|

addl r19 = 7, r0 |
|

add r14 = r33, r14 |
|

;; |
|

ld1 r15 = [r14] |
|

add r16 = r21, r19 |
|

dep.z r17 = r17, 1, 32 |
|

add r14 = r20, r19 |
|

;; |
|

add r17 = r17, r32 |
|

zxt4 r16 = r16 |
|

;; |
|

st2 [r17] = r15 |
|

dep.z r14 = r14, 1, 32 |
|

add r16 = r33, r16 |
|

;; |
|

add r14 = r14, r32 |
|

ld1 r15 = [r16] |
|

add r21 = r21, r34 |
|

;; |
|

st2 [r14] = r15 |
|

adds r20 = 8, r20 |
|

br.cloop.sptk.few .L101 |
|

;; |
|

mov ar.lc = r2 |
|
274 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
275 |
.endp transfer_8to16copy_ia64# |
.endp transfer_8to16copy_ia64# |
276 |
.common transfer_16to8copy#,8,8 |
|
277 |
|
|
278 |
|
|
279 |
|
|
280 |
|
/////////////////////////////////////////////////////////////////////////////// |
281 |
|
// |
282 |
|
// transfer_16to8copy_ia64 |
283 |
|
// |
284 |
|
// src is a 64 x 16 bit signed continuous array. To convert the 16 bit |
285 |
|
// values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of |
286 |
|
// 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word |
287 |
|
// of 8 x 8 unsigned data to the destination. |
288 |
|
/////////////////////////////////////////////////////////////////////////////// |
289 |
|
|
290 |
// NOTE(review): this span appears to interleave two implementations line by line, separated by bare line numbers: |
// a byte loop (.L25/.L29) that clamps each value to 0..255, and the software-pipelined .Loop_16to8copy. Untangle before assembling. |
.align 16 |
.align 16 |
291 |
.global transfer_16to8copy_ia64# |
.global transfer_16to8copy_ia64# |
292 |
.proc transfer_16to8copy_ia64# |
.proc transfer_16to8copy_ia64# |
293 |
transfer_16to8copy_ia64: |
transfer_16to8copy_ia64: |
294 |
.prologue |
.prologue |
295 |
|
|
296 |
|
// *** register renaming *** |
297 |
|
dst = r14 |
298 |
|
src_1 = r15 |
299 |
|
src_2 = r17 |
300 |
|
stride = r16 |
301 |
|
|
302 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
303 |
|
.save ar.lc, oldLC |
304 |
|
mov oldLC = ar.lc |
305 |
|
mov oldPR = pr |
306 |
|
|
307 |
|
|
308 |
.body |
.body |
309 |
mov r22 = r0 |
|
310 |
addl r21 = 255, r0 |
// *** Allocating new stackframe, define rotating registers *** |
311 |
mov r20 = r0 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
312 |
mov r19 = r0 |
|
313 |
.L25: |
// *** Saving Parameters *** |
314 |
mov r18 = r0 |
mov dst = r32 |
315 |
;; |
mov src_1 = r33 |
316 |
.L29: |
add src_2 = 8, r33 |
317 |
add r14 = r19, r18 |
mov stride = r34 |
318 |
;; |
|
319 |
dep.z r14 = r14, 1, 32 |
// *** init loop: set loop counter, epilog counter, predicates *** |
320 |
;; |
mov ar.lc = 7 |
321 |
add r14 = r14, r33 |
mov ar.ec = LL + PL + 1 |
322 |
;; |
mov pr.rot = 1 << 16 |
323 |
ld2 r15 = [r14] |
;; |
324 |
;; |
|
325 |
sxt2 r15 = r15 |
// *** define register arrays and predicate array for software pipeline *** |
326 |
;; |
// src_v1 = source value 1, dst_v = destination value |
327 |
mov r16 = r15 |
.rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1] |
328 |
;; |
.rotp ld_stage[LL], pack_stage[PL], st_stage[1] |
329 |
cmp4.le p6, p7 = r0, r16 |
|
330 |
;; |
|
331 |
(p7) mov r16 = r0 |
// Software pipelined loop: |
332 |
(p7) br.cond.dpnt .L106 |
// Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data |
333 |
;; |
// Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data |
334 |
cmp4.ge p6, p7 = r21, r16 |
// Stage 3: Store the 8 byte to the destination address and add stride to |
335 |
;; |
// destination address (to get the next 8 byte line of destination) |
336 |
(p7) addl r16 = 255, r0 |
|
337 |
.L106: |
|
338 |
add r14 = r20, r18 |
.Loop_16to8copy: |
339 |
adds r17 = 1, r18 |
{.mmi |
340 |
;; |
(ld_stage[0]) ld8 src_v1[0] = [src_1], 16 |
341 |
zxt4 r14 = r14 |
(ld_stage[0]) ld8 src_v2[0] = [src_2], 16 |
342 |
add r15 = r19, r17 |
(pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL] |
343 |
;; |
} |
344 |
add r14 = r32, r14 |
{.mib |
345 |
dep.z r15 = r15, 1, 32 |
(st_stage[0]) st8 [dst] = dst_v[PL] |
346 |
;; |
(st_stage[0]) add dst = dst, stride |
347 |
st1 [r14] = r16 |
br.ctop.sptk.few .Loop_16to8copy |
348 |
add r15 = r15, r33 |
;; |
349 |
;; |
} |
350 |
ld2 r14 = [r15] |
|
351 |
;; |
// *** Restore old LC and PRs *** |
352 |
sxt2 r14 = r14 |
mov ar.lc = oldLC |
353 |
;; |
mov pr = oldPR, -1 |
354 |
mov r16 = r14 |
|

;; |
|

cmp4.le p6, p7 = r0, r16 |
|

;; |
|

(p7) mov r16 = r0 |
|

(p7) br.cond.dpnt .L110 |
|

;; |
|

cmp4.ge p6, p7 = r21, r16 |
|

;; |
|

(p7) addl r16 = 255, r0 |
|

.L110: |
|

add r14 = r20, r17 |
|

adds r17 = 2, r18 |
|

;; |
|

zxt4 r14 = r14 |
|

add r15 = r19, r17 |
|

;; |
|

add r14 = r32, r14 |
|

dep.z r15 = r15, 1, 32 |
|

;; |
|

st1 [r14] = r16 |
|

add r15 = r15, r33 |
|

;; |
|

ld2 r14 = [r15] |
|

;; |
|

sxt2 r14 = r14 |
|

;; |
|

mov r16 = r14 |
|

;; |
|

cmp4.le p6, p7 = r0, r16 |
|

;; |
|

(p7) mov r16 = r0 |
|

(p7) br.cond.dpnt .L114 |
|

;; |
|

cmp4.ge p6, p7 = r21, r16 |
|

;; |
|

(p7) addl r16 = 255, r0 |
|

.L114: |
|

add r14 = r20, r17 |
|

adds r17 = 3, r18 |
|

;; |
|

zxt4 r14 = r14 |
|

add r15 = r19, r17 |
|

;; |
|

add r14 = r32, r14 |
|

dep.z r15 = r15, 1, 32 |
|

;; |
|

st1 [r14] = r16 |
|

add r15 = r15, r33 |
|

;; |
|

ld2 r14 = [r15] |
|

;; |
|

sxt2 r14 = r14 |
|

;; |
|

mov r15 = r14 |
|

;; |
|

cmp4.le p6, p7 = r0, r15 |
|

;; |
|

(p7) mov r15 = r0 |
|

(p7) br.cond.dpnt .L118 |
|

;; |
|

cmp4.ge p6, p7 = r21, r15 |
|

;; |
|

(p7) addl r15 = 255, r0 |
|

.L118: |
|

add r14 = r20, r17 |
|

adds r18 = 4, r18 |
|

;; |
|

zxt4 r14 = r14 |
|

cmp4.geu p6, p7 = 7, r18 |
|

;; |
|

add r14 = r32, r14 |
|

;; |
|

st1 [r14] = r15 |
|

(p6) br.cond.dptk .L29 |
|

adds r22 = 1, r22 |
|

add r20 = r20, r34 |
|

adds r19 = 8, r19 |
|

;; |
|

cmp4.geu p6, p7 = 7, r22 |
|

(p6) br.cond.dptk .L25 |
|
355 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
356 |
.endp transfer_16to8copy_ia64# |
.endp transfer_16to8copy_ia64# |
357 |
.common transfer_8to16sub#,8,8 |
|
358 |
|
|
359 |
|
|
360 |
|
/////////////////////////////////////////////////////////////////////////////// |
361 |
|
// |
362 |
|
// transfer_16to8add_ia64 |
363 |
|
// |
364 |
|
// The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- |
365 |
|
// bit-values. These are "parallel-added" to the values of src. The result is |
366 |
|
// converted into 8-bit-values using "PACK" and stored at the address of dst. |
367 |
|
// We assume that there is no misalignment. |
368 |
|
// |
369 |
|
/////////////////////////////////////////////////////////////////////////////// |
370 |
|
|
371 |
|
.align 16 |
372 |
|
.global transfer_16to8add_ia64# |
373 |
|
.proc transfer_16to8add_ia64# |
374 |
|
|
375 |
|
transfer_16to8add_ia64: |
376 |
|
.prologue |
377 |
|
|
378 |
|
// Adds 8 rows of 8 16-bit src values to the 8-bit dst rows and writes the packed 8-bit sums back to dst. |
// In: r32 = dst (advanced by stride per row), r33 = src (advanced by 16 bytes per row), r34 = stride. |
// oldLC/oldPR are not renamed in this block; the names come from assignments elsewhere in the file. |
// *** register renaming *** |
379 |
|
dst = r14 |
380 |
|
src = r15 |
381 |
|
stride = r16 |
382 |
|
|
383 |
|
_src = r17 |
384 |
|
|
385 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
386 |
|
.save ar.lc, r2 |
387 |
|
mov oldLC = ar.lc |
388 |
|
mov oldPR = pr |
389 |
|
|
390 |
|
|
391 |
|
.body |
392 |
|
|
393 |
|
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
394 |
|
alloc r9 = ar.pfs, 4, 92, 0, 96 |
395 |
|
|
396 |
|
// *** Saving Parameters *** |
397 |
|
mov dst = r32 |
398 |
|
mov src = r33 |
399 |
|
mov stride = r34 |
400 |
|
add _src = 8, r33 |
401 |
|
|
402 |
|
// *** init loop: set loop counter, epilog counter, predicates *** |
403 |
|
mov ar.lc = 7 |
404 |
|
mov ar.ec = LL + UL + PAL + PL + 1 |
405 |
|
mov pr.rot = 1 << 16 |
406 |
|
;; |
407 |
|
|
408 |
|
// *** define register arrays and predicate array for software pipeline *** |
409 |
|
.rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1] |
410 |
|
.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] |
411 |
|
|
412 |
|
|
413 |
|
// Software pipelined loop: |
414 |
|
// s1_p: The values of src and dst are loaded |
415 |
|
// s2_p: The dst-values are converted to 16-bit-values |
416 |
|
// s3_p: The values of src and dst are added |
417 |
|
// s4_p: The Results are packed into 8-bit-values |
418 |
|
// s5_p: The 8-bit-values are stored at the dst-addresses |
419 |
|
|
420 |
|
|
421 |
|
.Loop_16to8add: |
422 |
|
{.mii |
423 |
|
(s1_p[0]) ld8 w_src_1[0] = [src], 16 // load the 1st half of row j of src (i = 0..3) |
424 |
|
(s1_p[0]) mov _dst[0] = dst // keep this row's dst address for the store in stage s5_p |
425 |
|
(s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL] // parallel add of src and dst |
426 |
|
} |
427 |
|
{.mii |
428 |
|
(s1_p[0]) ld8 w_dst8[0] = [dst], stride // load row j of dst; the post-increment adds stride to dst |
429 |
|
(s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]; // dst is converted to 16 bit for i = 0..3 |
430 |
|
(s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]; // dst is converted to 16 bit for i = 4..7 |
431 |
|
} |
432 |
|
{.mii |
433 |
|
(s1_p[0]) ld8 w_src_2[0] = [_src], 16 // load the 2nd half of row j of src (i = 4..7) |
434 |
|
(s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL] // parallel add of src and dst |
435 |
|
(s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL] // converts the sums (pixel) to 8-bit values; the value-range check happens automatically |
436 |
|
} |
437 |
|
{.mmb |
438 |
|
(s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL] // store the dst row |
439 |
|
(s1_p[0]) nop.m 0 |
440 |
|
br.ctop.sptk.few .Loop_16to8add |
441 |
|
;; |
442 |
|
} |
443 |
|
|
444 |
|
// *** Restore old LC and PRs *** |
445 |
|
mov ar.lc = oldLC |
446 |
|
mov pr = oldPR, -1 |
447 |
|
|
448 |
|
br.ret.sptk.many b0 |
449 |
|
.endp transfer_16to8add_ia64# |
450 |
|
|
451 |
|
|
452 |
|
|
453 |
|
/////////////////////////////////////////////////////////////////////////////// |
454 |
|
// |
455 |
|
// transfer_8to16sub_ia64 |
456 |
|
// |
457 |
|
// The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The |
458 |
|
// difference of cur and ref is stored at the dct-addresses and the aligned ref |
459 |
|
// values are copied into the cur-array. |
460 |
|
// |
461 |
|
// You must assume that the data addressed by 'ref' are misaligned in memory. |
462 |
|
// But you can assume, that the other data are aligned (at least I hope so). |
463 |
|
// |
464 |
|
/////////////////////////////////////////////////////////////////////////////// |
465 |
|
|
466 |
// NOTE(review): this span appears to interleave two implementations line by line, separated by bare line numbers: |
// a byte loop (.L39/.L43) using ld1/st1/st2, and the software-pipelined loop_8to16sub. Untangle before assembling. |
.align 16 |
.align 16 |
467 |
.global transfer_8to16sub_ia64# |
.global transfer_8to16sub_ia64# |
468 |
.proc transfer_8to16sub_ia64# |
.proc transfer_8to16sub_ia64# |
469 |
|
|
470 |
|
|
471 |
transfer_8to16sub_ia64: |
transfer_8to16sub_ia64: |
472 |
.prologue |
.prologue |
473 |
|
|
474 |
|
// *** register renaming *** |
475 |
|
oldLC = r2 |
476 |
|
oldPR = r3 |
477 |
|
|
478 |
|
zero = r0 // "zero" denotes the constant 0 |
479 |
|
|
480 |
|
// The following registers get the same names as the variables in the C template |
481 |
|
dct = r14 |
482 |
|
cur = r15 |
483 |
|
ref = r34 // needs no separate copy, so the incoming argument register stays in this list |
484 |
|
stride = r16 |
485 |
|
|
486 |
|
offset = r17 // offset of the misaligned data, used to shift it into place |
487 |
|
aoffset = r18 // counterpart of offset (64 - offset) |
488 |
|
ref_a1 = r19 // address of the first 64-bit block of ref |
489 |
|
ref_a2 = r20 // address of the second 64-bit block of ref |
490 |
|
|
491 |
|
_dct = r21 // register for the destination addresses of the 2nd dct block |
492 |
|
|
493 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
494 |
|
.save ar.lc, r2 |
495 |
|
mov oldLC = ar.lc |
496 |
|
mov oldPR = pr |
497 |
|
|
498 |
|
|
499 |
.body |
.body |
500 |
mov r25 = r0 |
|
501 |
mov r24 = r0 |
// *** Allocating new stackframe, define rotating registers *** |
502 |
mov r23 = r0 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
503 |
.L39: |
|
504 |
mov r22 = r0 |
// *** Saving Parameters *** |
505 |
;; |
mov dct = r32 |
506 |
.L43: |
mov cur = r33 |
507 |
add r15 = r23, r22 |
// mov ref = r34: ref is unaligned, get aligned ref below... |
508 |
adds r20 = 1, r22 |
mov stride = r35 |
509 |
add r16 = r24, r22 |
|
510 |
;; |
and ref_a1 = -8, ref // compute the address of the first 64-bit block that contains ref (address rounded down to a multiple of 8) |
511 |
zxt4 r15 = r15 |
dep offset = ref, zero, 3, 3 |
512 |
add r18 = r23, r20 |
;; |
513 |
dep.z r16 = r16, 1, 32 |
add ref_a2 = 8, ref_a1 |
514 |
;; |
sub aoffset = 64, offset // compute the counterpart of offset |
515 |
add r19 = r34, r15 |
add _dct = 8, dct // compute the address for the 2nd dct block, 8 bytes (= 64 bits) above the 1st block |
516 |
zxt4 r18 = r18 |
|
517 |
add r16 = r16, r32 |
// *** init loop: set loop counter, epilog counter, predicates *** |
518 |
add r15 = r33, r15 |
mov ar.lc = 7 |
519 |
;; |
mov ar.ec = LL + SHL + OL + UL + PSL + 1 |
520 |
ld1 r14 = [r19] |
mov pr.rot = 1 << 16 |
521 |
add r21 = r34, r18 |
;; |
522 |
ld1 r17 = [r15] |
|
523 |
adds r19 = 2, r22 |
// *** define register arrays and predicate array for software pipeline *** |
524 |
add r18 = r33, r18 |
.rotr c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1], dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1] |
525 |
;; |
.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] |
526 |
st1 [r15] = r14 |
|
527 |
sub r17 = r17, r14 |
|
528 |
add r20 = r24, r20 |
// Software pipelined loop: |
529 |
;; |
// s1_p: The values of ref and cur are loaded, a copy of the cur address is made. |
530 |
st2 [r16] = r17 |
// s2_p: cur is converted to 16-bit and the misaligned values of ref are |
531 |
dep.z r20 = r20, 1, 32 |
// shifted... |
532 |
ld1 r14 = [r21] |
// s3_p: ... and copied together. |
533 |
ld1 r15 = [r18] |
// s4_p: This ref-value is converted to 16-bit. The joined ref value (r) is stored |
534 |
add r16 = r23, r19 |
// at the saved cur-address. |
535 |
;; |
// s5_p: the ref- and cur-values are subtracted... |
536 |
st1 [r18] = r14 |
// s6_p: ...and the result is stored at the dct-addresses. |
537 |
sub r15 = r15, r14 |
|
538 |
zxt4 r16 = r16 |
|
539 |
add r20 = r20, r32 |
loop_8to16sub: |
540 |
;; |
{.mii |
541 |
add r18 = r34, r16 |
(s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride // load the 1st 64-bit block that holds part of the ref data |
542 |
adds r17 = 3, r22 |
(s1_p[0]) mov _cur[0] = cur // keep cur for later use |
543 |
st2 [r20] = r15 |
(s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset // shift the right half into place |
544 |
add r16 = r33, r16 |
} |
545 |
add r19 = r24, r19 |
{.mii |
546 |
;; |
(s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride // load the 2nd 64-bit block |
547 |
ld1 r14 = [r18] |
(s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset // shift the left half into place |
548 |
add r15 = r23, r17 |
(s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL] // merge the shifted parts into r |
549 |
dep.z r19 = r19, 1, 32 |
} |
550 |
ld1 r18 = [r16] |
{.mii |
551 |
;; |
(s1_p[0]) ld8 c[0] = [cur], stride // load the complete row j of cur |
552 |
zxt4 r15 = r15 |
(s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]; // c is converted to 16 bit for i = 0..3 |
553 |
add r19 = r19, r32 |
(s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]; // c is converted to 16 bit for i = 4..7 |
554 |
st1 [r16] = r14 |
} |
555 |
sub r18 = r18, r14 |
{.mii |
556 |
;; |
(s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL] // cur is set to the value of r |
557 |
add r20 = r34, r15 |
// convert the 8-bit r and c values to 16-bit values |
558 |
st2 [r19] = r18 |
(s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]; // r is converted to 16 bit for i = 0..3 |
559 |
add r15 = r33, r15 |
(s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]; // r is converted to 16 bit for i = 4..7 |
560 |
add r17 = r24, r17 |
} |
561 |
;; |
{.mii |
562 |
ld1 r14 = [r20] |
(s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL] // subtract the 1st half of row j |
563 |
ld1 r16 = [r15] |
(s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL] // subtract the 2nd half |
564 |
dep.z r17 = r17, 1, 32 |
} |
565 |
;; |
{.mmb |
566 |
add r17 = r17, r32 |
(s6_p[0]) st8 [dct] = dct_1[PSL], 16 // store the 1st 64-bit block at its address, then advance the address by 16 bytes for the next value |
567 |
adds r22 = 4, r22 |
(s6_p[0]) st8 [_dct] = dct_2[PSL], 16 // store the 2nd 64-bit block at its address, then advance the address by 16 bytes for the next value |
568 |
st1 [r15] = r14 |
br.ctop.sptk.few loop_8to16sub // and around again |
569 |
sub r16 = r16, r14 |
;; |
570 |
;; |
} |
571 |
cmp4.geu p6, p7 = 7, r22 |
|
572 |
st2 [r17] = r16 |
// *** Restore old LC and PRs *** |
573 |
(p6) br.cond.dptk .L43 |
mov ar.lc = oldLC |
574 |
adds r25 = 1, r25 |
mov pr = oldPR, -1 |
575 |
adds r24 = 8, r24 |
|

add r23 = r23, r35 |
|

;; |
|

cmp4.geu p6, p7 = 7, r25 |
|

(p6) br.cond.dptk .L39 |
|
576 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
577 |
.endp transfer_8to16sub_ia64# |
.endp transfer_8to16sub_ia64# |
578 |
.common transfer_8to16sub2#,8,8 |
|
579 |
|
|
580 |
|
|
581 |
|
|
582 |
|
|
583 |
|
/////////////////////////////////////////////////////////////////////////////// |
584 |
|
// |
585 |
|
// transfer_8to16sub2_ia64 |
586 |
|
// |
587 |
|
// At the time, this function was written, it was not yet in use. |
588 |
|
// We assume that the values of ref1/2 are misaligned. |
589 |
|
// |
590 |
|
// The values of ref1/2 and cur are loaded, the ref-values need misalignment- |
591 |
|
// treatment. The values are converted to 16-bit using unpack. The average of |
592 |
|
// ref1 and ref2 is computed with pavg and subtracted from cur. The results are |
593 |
|
// stored at the dct-addresses. |
594 |
|
// pavg1.raz is used to get the same results as the C-code-function. |
595 |
|
// |
596 |
|
/////////////////////////////////////////////////////////////////////////////// |
597 |
|
|
598 |
|
.text |
599 |
.align 16 |
.align 16 |
600 |
.global transfer_8to16sub2_ia64# |
.global transfer_8to16sub2_ia64# |
601 |
.proc transfer_8to16sub2_ia64# |
.proc transfer_8to16sub2_ia64# |
602 |
|
|
603 |
transfer_8to16sub2_ia64: |
transfer_8to16sub2_ia64: |
604 |
.prologue |
.prologue |
605 |
|
|
606 |
|
// *** register renaming *** |
607 |
|
// We've tried to keep the C-Code names as often as possible, at least as |
608 |
|
// part of register-names |
609 |
|
oldLC = r2 |
610 |
|
oldPR = r3 |
611 |
|
|
612 |
|
zero = r0 |
613 |
|
|
614 |
|
dct_al = r14 // dct: adress of left block in one line |
615 |
|
dct_ar = r15 // dct: adress of right block in one line |
616 |
|
cur = r16 |
617 |
|
ref1_al = r17 // ref1: aligned adress of lower part |
618 |
|
ref1_ah = r18 // ref1: aligned adress of higher part |
619 |
|
ref2_al = r19 // ref2: aligned adress of lower part |
620 |
|
ref2_ah = r20 // ref2: aligned adress of higher part |
621 |
|
stride = r21 |
622 |
|
|
623 |
|
offset_1 = r22 |
624 |
|
offset_2 = r23 |
625 |
|
aoffset_1 = r24 |
626 |
|
aoffset_2 = r25 |
627 |
|
|
628 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
629 |
.save ar.lc, r2 |
.save ar.lc, r2 |
630 |
mov r2 = ar.lc |
mov oldLC = ar.lc |
631 |
|
mov oldPR = pr |
632 |
|
|
633 |
|
|
634 |
.body |
.body |
635 |
mov r28 = r0 |
|
636 |
addl r27 = 255, r0 |
// *** Saving Paramters *** |
637 |
mov r26 = r0 |
// *** (as inputregisters r32 + are needed for register-rotation) *** |
638 |
mov r25 = r0 |
mov dct_ar = r32 |
639 |
.L50: |
add dct_al = 8, r32 |
640 |
addl r14 = 3, r0 |
mov cur = r33 |
641 |
mov r21 = r0 |
|
642 |
;; |
and ref1_al = -8, r34 |
643 |
mov ar.lc = r14 |
and ref2_al = -8, r35 // ref2 aligned adrress of lower part |
644 |
;; |
|
645 |
.L138: |
mov stride = r36 |
646 |
add r14 = r26, r21 |
|
647 |
add r17 = r25, r21 |
// *** Calculations for Misaligment-Handling *** |
648 |
adds r19 = 1, r21 |
dep offset_1 = r34, zero, 3, 3 |
649 |
;; |
dep offset_2 = r35, zero, 3, 3 |
650 |
zxt4 r17 = r17 |
;; |
651 |
dep.z r14 = r14, 1, 32 |
add ref1_ah = 8, ref1_al |
652 |
add r18 = r25, r19 |
add ref2_ah = 8, ref2_al |
653 |
;; |
sub aoffset_1 = 64, offset_1 |
654 |
add r15 = r34, r17 |
sub aoffset_2 = 64, offset_2 |
655 |
add r23 = r14, r32 |
;; |
656 |
add r20 = r35, r17 |
|
657 |
;; |
// *** Allocating new stackframe, define rotating registers *** |
658 |
ld1 r14 = [r15] |
alloc r9 = ar.pfs, 5, 91, 0, 96 |
659 |
ld1 r16 = [r20] |
|
660 |
add r17 = r33, r17 |
// *** init loop: set loop counter, epilog counter, predicates *** |
661 |
;; |
mov ar.lc = 7 |
662 |
add r14 = r14, r16 |
mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1 |
663 |
ld1 r15 = [r17] |
mov pr.rot = 1 << 16 |
664 |
zxt4 r18 = r18 |
;; |
665 |
;; |
|
666 |
adds r14 = 1, r14 |
// *** define register arrays and predicate array for software pipeline *** |
667 |
add r24 = r35, r18 |
.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] |
668 |
add r22 = r34, r18 |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] |
669 |
;; |
|
670 |
shr.u r14 = r14, 1 |
|
671 |
add r19 = r26, r19 |
// software pipelined loop: |
672 |
add r16 = r33, r18 |
// ld_stage: The values of ref1, ref2, cur are loaded |
673 |
;; |
// sh_stage: The misaligned values of ref1/2 are shifted... |
674 |
cmp4.ge p6, p7 = r27, r14 |
// or_stage: ...and copied together. |
675 |
dep.z r19 = r19, 1, 32 |
// pavg_stage: The average of ref1 and ref2 is computed. |
676 |
adds r21 = 2, r21 |
// up_stage: The result and the cur-values are converted to 16-bit. |
677 |
;; |
// psub_stage: Those values are substracted... |
678 |
(p7) addl r14 = 255, r0 |
// st_stage: ...and stored at the dct-adresses. |
679 |
add r19 = r19, r32 |
|
680 |
;; |
|
681 |
sub r14 = r15, r14 |
.Loop_8to16sub2: |
682 |
;; |
{.mii |
683 |
st2 [r23] = r14 |
(ld_stage[0]) ld8 c[0] = [cur], stride |
684 |
ld1 r14 = [r24] |
(sh_stage[0]) shr.u ref1_l[0] = ref1_vl[LL], offset_1 |
685 |
ld1 r15 = [r22] |
(sh_stage[0]) shl ref1_h[0] = ref1_vh[LL], aoffset_1 |
686 |
ld1 r16 = [r16] |
} |
687 |
;; |
{.mii |
688 |
add r15 = r15, r14 |
(ld_stage[0]) ld8 ref1_vl[0] = [ref1_al], stride |
689 |
;; |
(sh_stage[0]) shr.u ref2_l[0] = ref2_vl[LL], offset_2 |
690 |
adds r15 = 1, r15 |
(sh_stage[0]) shl ref2_h[0] = ref2_vh[LL], aoffset_2 |
691 |
;; |
} |
692 |
shr.u r14 = r15, 1 |
{.mii |
693 |
;; |
(ld_stage[0]) ld8 ref1_vh[0] = [ref1_ah], stride |
694 |
cmp4.ge p6, p7 = r27, r14 |
(or_stage[0]) or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL] |
695 |
;; |
(or_stage[0]) or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL] |
696 |
(p7) addl r14 = 255, r0 |
} |
697 |
;; |
{.mii |
698 |
sub r14 = r16, r14 |
(ld_stage[0]) ld8 ref2_vl[0] = [ref2_al], stride |
699 |
;; |
(pavg_stage[0]) pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL] |
700 |
st2 [r19] = r14 |
(up_stage[0]) unpack1.l r16_r[0] = zero, r[PAVGL] |
701 |
br.cloop.sptk.few .L138 |
} |
702 |
adds r28 = 1, r28 |
{.mii |
703 |
adds r26 = 8, r26 |
(ld_stage[0]) ld8 ref2_vh[0] = [ref2_ah], stride |
704 |
add r25 = r25, r36 |
(up_stage[0]) unpack1.h r16_l[0] = zero, r[PAVGL] |
705 |
;; |
(up_stage[0]) unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL] |
706 |
cmp4.geu p6, p7 = 7, r28 |
} |
707 |
(p6) br.cond.dptk .L50 |
{.mii |
708 |
mov ar.lc = r2 |
(st_stage[0]) st8 [dct_ar] = dct16_r[PSL], 16 |
709 |
|
(up_stage[0]) unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL] |
710 |
|
(psub_stage[0]) psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL] |
711 |
|
} |
712 |
|
{.mib |
713 |
|
(st_stage[0]) st8 [dct_al] = dct16_l[PSL], 16 |
714 |
|
(psub_stage[0]) psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL] |
715 |
|
br.ctop.sptk.few .Loop_8to16sub2 // Und hopp |
716 |
|
;; |
717 |
|
} |
718 |
|
|
719 |
|
// *** Restore old LC and PRs *** |
720 |
|
mov ar.lc = oldLC |
721 |
|
mov pr = oldPR, -1 |
722 |
|
|
723 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
724 |
.endp transfer_8to16sub2_ia64# |
.endp transfer_8to16sub2_ia64# |
|
.common transfer_16to8add#,8,8

///////////////////////////////////////////////////////////////////////////////
//
// transfer_16to8add_ia64
//
// Adds an 8x8 block of 16-bit values to an 8x8 block of bytes in place and
// clamps every result to 0..255.
//
// Arguments:
//     r32   dst      byte block, read and written; rows are r34 bytes apart
//     r33   src      16-bit values, 8 per row, rows stored back to back
//     r34   stride   byte distance between two rows of dst
//
// Per element (j = row, i = column):
//     t = (16-bit signed) (dst[j*stride + i] + src[j*8 + i])
//     dst[j*stride + i] = t < 0 ? 0 : (t > 255 ? 255 : t)
//
// The outer loop (.L62) runs over the 8 rows; the inner counted loop (.L149)
// runs 4 times per row and handles two neighbouring columns per pass.
//
// Registers written: r2, r14-r26, p6-p9.  ar.lc is saved in r2 and restored.
//
///////////////////////////////////////////////////////////////////////////////

.align 16
.global transfer_16to8add_ia64#
.proc transfer_16to8add_ia64#

transfer_16to8add_ia64:
	.prologue
	.save ar.lc, r2
	mov r2 = ar.lc
	.body
	mov r26 = r0			// r26 = row counter j
	addl r25 = 255, r0		// r25 = upper clamp bound
	mov r24 = r0			// r24 = j * 8 (element index of row start in src)
	mov r21 = r0			// r21 = j * stride (offset of row start in dst)
.L62:
	// --- start of one row ---
	addl r14 = 3, r0
	mov r20 = r0			// r20 = column index i
	;;
	mov ar.lc = r14			// LC = 3: four passes of the inner loop
	;;
.L149:
	// --- one pass: columns i and i+1 ---
	adds r17 = 1, r20		// i + 1
	add r14 = r21, r20		// dst offset of column i
	add r15 = r24, r20		// src index of column i
	;;
	zxt4 r14 = r14			// offsets are 32-bit unsigned values
	add r18 = r21, r17		// dst offset of column i+1
	dep.z r15 = r15, 1, 32		// 32-bit index * 2 = byte offset into src
	;;
	add r23 = r32, r14		// r23 = &dst[column i]
	zxt4 r18 = r18
	add r15 = r15, r33		// r15 = &src[column i]
	;;
	mov r16 = r23
	add r22 = r32, r18		// r22 = &dst[column i+1]
	ld2 r14 = [r15]			// src value, column i
	;;
	ld1 r18 = [r16]			// dst byte, column i
	add r19 = r24, r17		// src index of column i+1
	adds r20 = 2, r20		// i += 2
	;;
	add r14 = r14, r18
	dep.z r19 = r19, 1, 32
	mov r16 = r22			// r16 = store address for column i+1
	;;
	sxt2 r14 = r14			// keep the sum as a signed 16-bit value
	add r19 = r19, r33		// r19 = &src[column i+1]
	;;
	cmp4.le p6, p7 = r0, r14	// p7: sum < 0
	cmp4.ge p8, p9 = r25, r14	// p9: sum > 255
	;;
	(p7) mov r14 = r0		// clamp to 0 ...
	(p7) br.cond.dpnt .L143		// ... and skip the upper clamp
	;;
	(p9) addl r14 = 255, r0		// clamp to 255
	;;
.L143:
	st1 [r23] = r14			// write back column i
	ld1 r14 = [r22]			// dst byte, column i+1
	ld2 r15 = [r19]			// src value, column i+1
	;;
	add r15 = r15, r14
	;;
	sxt2 r15 = r15
	;;
	cmp4.le p6, p7 = r0, r15	// p7: sum < 0
	cmp4.ge p8, p9 = r25, r15	// p9: sum > 255
	;;
	(p7) mov r15 = r0
	(p7) br.cond.dpnt .L147
	;;
	(p9) addl r15 = 255, r0
	;;
.L147:
	st1 [r16] = r15			// write back column i+1
	br.cloop.sptk.few .L149
	// --- end of row: advance to the next one ---
	adds r26 = 1, r26		// j++
	adds r24 = 8, r24		// next row of src (8 elements)
	add r21 = r21, r34		// next row of dst (stride bytes)
	;;
	cmp4.geu p6, p7 = 7, r26	// p6: j <= 7
	(p6) br.cond.dptk .L62
	mov ar.lc = r2			// restore the caller's loop counter
	br.ret.sptk.many b0
.endp transfer_16to8add_ia64#
|
|
.common transfer8x8_copy#,8,8

///////////////////////////////////////////////////////////////////////////////
//
// transfer8x8_copy_ia64
//
// Copies an 8x8 block of bytes, one byte at a time.
//
// Arguments:
//     r32   dst      destination block
//     r33   src      source block
//     r34   stride   byte distance between two rows; the same offset is used
//                    for dst and src
//
// The counted loop (.L168) runs once per row (8 times) and is unrolled over
// the 8 columns.  Every column offset is formed as a 32-bit unsigned value
// (zxt4) from the row offset in r21 before it is added to the two base
// pointers.  The load of one column is interleaved with the store of the
// previous one.
//
// Registers written: r2, r14-r21.  ar.lc is saved in r2 and restored.
//
///////////////////////////////////////////////////////////////////////////////

.align 16
.global transfer8x8_copy_ia64#
.proc transfer8x8_copy_ia64#

transfer8x8_copy_ia64:
	.prologue
	.save ar.lc, r2
	mov r2 = ar.lc
	.body
	addl r14 = 7, r0
	mov r21 = r0			// r21 = offset of the current row
	;;
	mov ar.lc = r14			// LC = 7: eight rows
	;;
.L168:
	zxt4 r14 = r21			// offset of column 0
	adds r15 = 1, r21
	adds r18 = 2, r21
	;;
	add r16 = r33, r14		// &src[col 0]
	zxt4 r15 = r15			// offset of column 1
	zxt4 r18 = r18			// offset of column 2
	;;
	ld1 r17 = [r16]			// load  col 0
	add r14 = r32, r14		// &dst[col 0]
	add r19 = r33, r15		// &src[col 1]
	;;
	st1 [r14] = r17			// store col 0
	add r15 = r32, r15		// &dst[col 1]
	add r20 = r33, r18		// &src[col 2]
	ld1 r16 = [r19]			// load  col 1
	adds r14 = 3, r21
	add r18 = r32, r18		// &dst[col 2]
	;;
	st1 [r15] = r16			// store col 1
	zxt4 r14 = r14			// offset of column 3
	adds r17 = 4, r21
	ld1 r15 = [r20]			// load  col 2
	;;
	add r19 = r33, r14		// &src[col 3]
	zxt4 r17 = r17			// offset of column 4
	st1 [r18] = r15			// store col 2
	add r14 = r32, r14		// &dst[col 3]
	;;
	add r20 = r33, r17		// &src[col 4]
	ld1 r15 = [r19]			// load  col 3
	adds r16 = 5, r21
	add r17 = r32, r17		// &dst[col 4]
	;;
	st1 [r14] = r15			// store col 3
	zxt4 r16 = r16			// offset of column 5
	adds r18 = 6, r21
	ld1 r14 = [r20]			// load  col 4
	;;
	add r19 = r33, r16		// &src[col 5]
	zxt4 r18 = r18			// offset of column 6
	st1 [r17] = r14			// store col 4
	add r16 = r32, r16		// &dst[col 5]
	;;
	add r20 = r33, r18		// &src[col 6]
	ld1 r14 = [r19]			// load  col 5
	adds r15 = 7, r21
	add r18 = r32, r18		// &dst[col 6]
	;;
	st1 [r16] = r14			// store col 5
	zxt4 r15 = r15			// offset of column 7
	add r21 = r21, r34		// advance row offset by stride
	ld1 r16 = [r20]			// load  col 6
	;;
	add r17 = r33, r15		// &src[col 7]
	st1 [r18] = r16			// store col 6
	add r15 = r32, r15		// &dst[col 7]
	;;
	ld1 r14 = [r17]			// load  col 7
	;;
	st1 [r15] = r14			// store col 7
	br.cloop.sptk.few .L168
	;;
	mov ar.lc = r2			// restore the caller's loop counter
	br.ret.sptk.many b0
.endp transfer8x8_copy_ia64#
|
|
.ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" |
|