1 |
.file "mem_transfer.c" |
// **************************************************************************** |
2 |
.pred.safe_across_calls p1-p5,p16-p63 |
// * |
3 |
.common transfer_8to16copy#,8,8 |
// * XVID MPEG-4 VIDEO CODEC |
4 |
|
// * - IA64 8bit<->16bit transfer - |
5 |
|
// * |
6 |
|
// * Copyright(C) 2002 Sebastian Felis, Max Stengel |
7 |
|
// * |
8 |
|
// * This program is free software; you can redistribute it and/or modify it |
9 |
|
// * under the terms of the GNU General Public License as published by |
10 |
|
// * the Free Software Foundation; either version 2 of the License, or |
11 |
|
// * (at your option) any later version. |
12 |
|
// * |
13 |
|
// * This program is distributed in the hope that it will be useful, |
14 |
|
// * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
|
// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
|
// * GNU General Public License for more details. |
17 |
|
// * |
18 |
|
// * You should have received a copy of the GNU General Public License |
19 |
|
// * along with this program; if not, write to the Free Software |
20 |
|
// * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
|
// * |
22 |
|
// * $Id$ |
23 |
|
// * |
24 |
|
// ***************************************************************************/ |
25 |
|
// |
26 |
|
// **************************************************************************** |
27 |
|
// * |
28 |
|
// * mem_transfer_ia64.s, IA-64 8bit<->16bit transfer |
29 |
|
// * |
30 |
|
// * This version was implemented during an IA-64 practical training at |
31 |
|
// * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) |
32 |
|
// * |
33 |
|
// **************************************************************************** |
34 |
|
|
35 |
|
/////////////////////////////////////////////////////////////////////////////// |
36 |
|
// |
37 |
|
// mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, |
38 |
|
// University of Karlsruhe, Germany, 03.06.2002, during the laboratory |
39 |
|
// "IA-64 Video Codec Assember Parktikum" at IPD Goos. |
40 |
|
|
41 |
|
///// History ///////////////////////////////////////////////////////////////// |
42 |
|
// |
43 |
|
// - 16.07.2002: several minor changes for ecc-conformity |
44 |
|
// - 03.06.2002: initial version |
45 |
|
// |
46 |
|
|
47 |
|
/////////////////////////////////////////////////////////////////////////////// |
48 |
|
// |
49 |
|
// Annotations: |
50 |
|
// =========== |
51 |
|
// |
52 |
|
// - All functions work on 8x8-matrices. While the C-code-functions treat each |
53 |
|
// element separately, the functions in this assembler-code treat a whole line |
54 |
|
// simultaneously. So one loop is saved. |
55 |
|
// The remaining loop is realized by using software pipelining with rotating |
56 |
|
// registers. |
57 |
|
// - Register renaming is used for better readability |
58 |
|
// - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both |
59 |
|
// parts are shifted and joined together with an "OR"-Instruction. |
60 |
|
// - First parameter is stored in GR 32, next in GR 33, and so on. They must be |
61 |
|
// saved, as these GRs are used for register-rotation. |
62 |
|
// - Some of the original German comments used during development are left in |
63 |
|
// the code. They shouldn't bother anyone. |
64 |
|
// |
65 |
|
// Anmerkungen: |
66 |
|
// ============ |
67 |
|
// |
68 |
|
// - Alle Funtionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code |
69 |
|
// jedes Element einzeln bearbeiten, bearbeiten die Funtionen dieses Assembler- |
70 |
|
// Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. |
71 |
|
// Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit |
72 |
|
// rotierenden Registern realisiert. |
73 |
|
// - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. |
74 |
|
// - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke |
75 |
|
// geladen, beide Teile mit "shift"-Operationen zurechterückt und mit einem |
76 |
|
// logischen Oder zusammenkopiert. |
77 |
|
// - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- |
78 |
|
// sichert werden, da die Register für die register-Rotation benötigt werden. |
79 |
|
// - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase |
80 |
|
// sind im Code verblieben. Sie sollten niemanden stören. |
81 |
|
// |
82 |
|
/////////////////////////////////////////////////////////////////////////////// |
83 |
|
|
84 |
|
|
85 |
|
// *** define latencies for software pipelines *** |
// Each value is a number of pipeline stages. The procedures in this file use
// them to size the rotating register arrays (.rotr name[X+1]) and the stage
// predicates (.rotp stage[X]), and to index the value produced X stages ago
// (e.g. src_v1[LL]).
86 |
|
|
87 |
|
LL = 3 // Load |
88 |
|
SL = 3 // Store (not referenced in the part of the file visible here) |
89 |
|
PL = 1 // Pack |
90 |
|
SHL = 1 // Shift |
91 |
|
OL = 1 // Or |
92 |
|
UL = 1 // Unpack |
93 |
|
PAL = 1 // Parallel Add |
94 |
|
PSL = 1 // Parallel Subtract |
95 |
|
PAVGL = 1 // Parallel Average |
96 |
|
|
97 |
.text |
.text |
98 |
|
|
99 |
|
|
100 |
|
/////////////////////////////////////////////////////////////////////////////// |
101 |
|
// |
102 |
|
// transfer8x8_copy_ia64 |
103 |
|
// |
104 |
|
// SRC is misaligned. To align the source, load two 8-byte words, shift them, |
105 |
|
// join them and store the aligned source into the destination address. |
106 |
|
// |
107 |
|
/////////////////////////////////////////////////////////////////////////////// |
108 |
|
|
109 |
|
.align 16 |
110 |
|
.global transfer8x8_copy_ia64# |
111 |
|
.proc transfer8x8_copy_ia64# |
112 |
|
|
113 |
|
transfer8x8_copy_ia64: |
// Copies rows of 8 bytes from a possibly misaligned source to dst.
// In (as read below): r32 = dst, r33 = src, r34 = stride. The stride is
// added to both source addresses and to the destination address after
// every row.
// ar.lc and pr are saved in oldLC/oldPR (r2/r3) on entry and restored
// before the return.
114 |
|
.prologue |
115 |
|
|
116 |
|
// *** register renaming *** |
117 |
|
zero = r0 |
118 |
|
|
119 |
|
oldLC = r2 |
120 |
|
oldPR = r3 |
121 |
|
|
122 |
|
src_1 = r14 // 8-byte-aligned address at or below src (src & -8) |
123 |
|
src_2 = r15 // following 8-byte-aligned address (src_1 + 8) |
124 |
|
dst = r16 // destination address |
125 |
|
stride = r17 |
126 |
|
|
127 |
|
offset = r18 // shift-right count in bits: (src & 7) * 8 |
128 |
|
aoffset = r19 // shift-left count in bits: 64 - offset |
129 |
|
|
130 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
131 |
|
.save ar.lc, oldLC |
132 |
|
mov oldLC = ar.lc |
133 |
|
mov oldPR = pr |
134 |
|
|
135 |
|
.body |
136 |
|
|
137 |
|
// *** Allocating new stackframe: 3 inputs, 29 locals, 0 outputs, 32 rotating registers *** |
138 |
|
alloc r9 = ar.pfs, 3, 29, 0, 32 |
139 |
|
|
140 |
|
// *** Saving parameters (r32 and up are used for register rotation) *** |
141 |
|
mov dst = r32 |
142 |
|
mov stride = r34 |
143 |
|
|
144 |
|
// *** Misalignment treatment *** |
145 |
|
and src_1 = -8, r33 // address of the first aligned 8-byte block containing src values |
146 |
|
dep offset = r33, zero, 3, 3 // offset = (src & 7) << 3: misalignment in bits, used as shr.u count |
147 |
|
;; |
148 |
|
// NOTE(review): for an 8-byte-aligned src, offset is 0 and aoffset is 64;
// confirm that shl with a count of 64 yields 0 on the target so that the
// OR below leaves the first word unchanged.
sub aoffset = 64, offset // counterpart of offset ("anti-offset"), used as shl count |
149 |
|
add src_2 = 8, src_1 // address of the second aligned 8-byte block containing src values |
150 |
|
|
151 |
|
// *** init loop: set loop counter, epilog counter, predicates *** |
// ar.lc = 7 gives eight rows (8x8 block); ar.ec covers the load, shift
// and or stages plus the final store stage.
152 |
|
mov ar.lc = 7 |
153 |
|
mov ar.ec = LL + SHL + OL + 1 |
154 |
|
mov pr.rot = 1 << 16 |
155 |
|
;; |
156 |
|
|
157 |
|
// *** define register arrays and predicate array for software pipeline *** |
158 |
|
// src_v1 = source value 1, shd_r = shifted right, shd_l = shifted left |
159 |
|
.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] |
160 |
|
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] |
161 |
|
|
162 |
|
|
163 |
|
// Software pipelined loop: |
164 |
|
// Stage 1: Load two 8-byte words from src_1, src_2 into src_v1 and src_v2 |
165 |
|
// Stage 2: Shift both source words into shd_r and shd_l |
166 |
|
// Stage 3: Join both parts together with OR |
167 |
|
// Stage 4: Store aligned data to destination and add stride to destination address |
168 |
|
|
169 |
|
|
170 |
|
.Loop_8x8copy: |
171 |
|
{.mii |
172 |
|
(ld_stage[0]) ld8 src_v1[0] = [src_1], stride |
173 |
|
(sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset |
174 |
|
} |
175 |
|
{.mii |
176 |
|
(ld_stage[0]) ld8 src_v2[0] = [src_2], stride |
177 |
|
(sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset |
178 |
|
(or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL] |
179 |
|
} |
180 |
|
{.mib |
181 |
|
(st_stage[0]) st8 [dst] = value[OL] |
182 |
|
(st_stage[0]) add dst = dst, stride |
183 |
|
br.ctop.sptk.few .Loop_8x8copy |
184 |
|
;; |
185 |
|
} |
186 |
|
|
187 |
|
// *** Restore old LC and PRs *** |
188 |
|
mov ar.lc = oldLC |
189 |
|
mov pr = oldPR, -1 |
190 |
|
|
191 |
|
br.ret.sptk.many b0 |
192 |
|
|
193 |
|
.endp transfer8x8_copy_ia64# |
194 |
|
|
195 |
|
|
196 |
|
|
197 |
|
|
198 |
|
/////////////////////////////////////////////////////////////////////////////// |
199 |
|
// |
200 |
|
// transfer_8to16copy_ia64 |
201 |
|
// |
202 |
|
// SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, |
203 |
|
// UNPACK is used. So 8 bytes are loaded from source, unpacked to two |
204 |
|
// 4 x 16 bit values and stored to the destination. Destination is a continuous |
205 |
|
// array of 64 x 16 bit signed data. To store the next line, only 16 must be |
206 |
|
// added to the destination address. |
207 |
|
/////////////////////////////////////////////////////////////////////////////// |
208 |
|
|
209 |
.align 16 |
.align 16 |
210 |
.global transfer_8to16copy_ia64# |
.global transfer_8to16copy_ia64# |
211 |
.proc transfer_8to16copy_ia64# |
.proc transfer_8to16copy_ia64# |
212 |
|
|
213 |
|
|
214 |
transfer_8to16copy_ia64: |
transfer_8to16copy_ia64: |
215 |
.prologue |
.prologue |
216 |
.save ar.lc, r2 |
|
217 |
mov r2 = ar.lc |
// *** register renaming *** |
218 |
|
oldLC = r2 |
219 |
|
oldPR = r3 |
220 |
|
|
221 |
|
zero = r0 // damit ist die Zahl "zero" = 0 gemeint |
222 |
|
|
223 |
|
dst_1 = r14 // destination address for first 4 x 16 bit values |
224 |
|
dst_2 = r15 // destination address for second 4 x 16 bit values |
225 |
|
src = r16 |
226 |
|
stride = r17 |
227 |
|
|
228 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
229 |
|
.save ar.lc, oldLC |
230 |
|
mov oldLC = ar.lc |
231 |
|
mov oldPR = pr |
232 |
|
|
233 |
|
|
234 |
.body |
.body |
235 |
addl r14 = 7, r0 |
|
236 |
mov r21 = r0 |
// *** Allocating new stackframe, define rotating registers *** |
237 |
mov r20 = r0 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
238 |
;; |
|
239 |
mov ar.lc = r14 |
// *** Saving Paramters *** |
240 |
;; |
mov dst_1 = r32 // fist 4 x 16 bit values |
241 |
.L101: |
add dst_2 = 8, r32 // second 4 x 16 bit values |
242 |
addl r19 = 1, r0 |
mov src = r33 |
243 |
zxt4 r14 = r21 |
mov stride = r34 |
244 |
dep.z r15 = r20, 1, 32 |
|
245 |
;; |
// *** init loop: set loop counter, epilog counter, predicates *** |
246 |
add r16 = r21, r19 |
mov ar.lc = 7 |
247 |
add r14 = r33, r14 |
mov ar.ec = LL + UL + 1 |
248 |
add r17 = r20, r19 |
mov pr.rot = 1 << 16 |
249 |
;; |
;; |
250 |
ld1 r18 = [r14] |
|
251 |
add r15 = r15, r32 |
// *** define register arrays and predicate array for software pipeline *** |
252 |
zxt4 r16 = r16 |
// src_v = source value, dst_v1 = destination value 1 |
253 |
;; |
.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] |
254 |
st2 [r15] = r18 |
.rotp ld_stage[LL], upack_stage[UL], st_stage[1] |
255 |
addl r19 = 2, r0 |
|
256 |
add r16 = r33, r16 |
|
257 |
dep.z r17 = r17, 1, 32 |
// Software pipelined loop: |
258 |
;; |
// Stage 1: Load value of SRC |
259 |
ld1 r15 = [r16] |
// Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data |
260 |
add r14 = r21, r19 |
// Stage 3: Store both 8 byte of 16 bit data |
261 |
add r18 = r20, r19 |
|
262 |
add r17 = r17, r32 |
|
263 |
;; |
.Loop_8to16copy: |
264 |
zxt4 r14 = r14 |
{.mii |
265 |
st2 [r17] = r15 |
(ld_stage[0]) ld8 src_v[0] = [src], stride |
266 |
addl r19 = 3, r0 |
(upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL] |
267 |
;; |
(upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL] |
268 |
add r14 = r33, r14 |
} |
269 |
add r15 = r21, r19 |
{.mmb |
270 |
dep.z r18 = r18, 1, 32 |
(st_stage[0]) st8 [dst_1] = dst_v1[UL], 16 |
271 |
;; |
(st_stage[0]) st8 [dst_2] = dst_v2[UL], 16 |
272 |
ld1 r17 = [r14] |
br.ctop.sptk.few .Loop_8to16copy |
273 |
add r16 = r20, r19 |
;; |
274 |
add r18 = r18, r32 |
} |
275 |
zxt4 r15 = r15 |
|
276 |
;; |
// *** Restore old LC and PRs *** |
277 |
st2 [r18] = r17 |
mov ar.lc = oldLC |
278 |
addl r19 = 4, r0 |
mov pr = oldPR, -1 |
279 |
add r15 = r33, r15 |
|
|
dep.z r16 = r16, 1, 32 |
|
|
;; |
|
|
ld1 r18 = [r15] |
|
|
add r14 = r21, r19 |
|
|
add r17 = r20, r19 |
|
|
add r16 = r16, r32 |
|
|
;; |
|
|
zxt4 r14 = r14 |
|
|
st2 [r16] = r18 |
|
|
addl r19 = 5, r0 |
|
|
;; |
|
|
add r14 = r33, r14 |
|
|
add r15 = r21, r19 |
|
|
add r16 = r20, r19 |
|
|
dep.z r17 = r17, 1, 32 |
|
|
;; |
|
|
ld1 r18 = [r14] |
|
|
addl r19 = 6, r0 |
|
|
add r17 = r17, r32 |
|
|
zxt4 r15 = r15 |
|
|
;; |
|
|
st2 [r17] = r18 |
|
|
add r14 = r21, r19 |
|
|
add r15 = r33, r15 |
|
|
dep.z r16 = r16, 1, 32 |
|
|
add r17 = r20, r19 |
|
|
;; |
|
|
ld1 r18 = [r15] |
|
|
add r16 = r16, r32 |
|
|
zxt4 r14 = r14 |
|
|
;; |
|
|
st2 [r16] = r18 |
|
|
addl r19 = 7, r0 |
|
|
add r14 = r33, r14 |
|
|
;; |
|
|
ld1 r15 = [r14] |
|
|
add r16 = r21, r19 |
|
|
dep.z r17 = r17, 1, 32 |
|
|
add r14 = r20, r19 |
|
|
;; |
|
|
add r17 = r17, r32 |
|
|
zxt4 r16 = r16 |
|
|
;; |
|
|
st2 [r17] = r15 |
|
|
dep.z r14 = r14, 1, 32 |
|
|
add r16 = r33, r16 |
|
|
;; |
|
|
add r14 = r14, r32 |
|
|
ld1 r15 = [r16] |
|
|
add r21 = r21, r34 |
|
|
;; |
|
|
st2 [r14] = r15 |
|
|
adds r20 = 8, r20 |
|
|
br.cloop.sptk.few .L101 |
|
|
;; |
|
|
mov ar.lc = r2 |
|
280 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
281 |
.endp transfer_8to16copy_ia64# |
.endp transfer_8to16copy_ia64# |
282 |
.common transfer_16to8copy#,8,8 |
|
283 |
|
|
284 |
|
|
285 |
|
|
286 |
|
/////////////////////////////////////////////////////////////////////////////// |
287 |
|
// |
288 |
|
// transfer_16to8copy_ia64 |
289 |
|
// |
290 |
|
// src is a 64 x 16 bit signed continuous array. To convert the 16 bit |
291 |
|
// values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of |
292 |
|
// 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word |
293 |
|
// of 8 x 8 unsigned data to the destination. |
294 |
|
/////////////////////////////////////////////////////////////////////////////// |
295 |
|
|
296 |
.align 16 |
.align 16 |
297 |
.global transfer_16to8copy_ia64# |
.global transfer_16to8copy_ia64# |
298 |
.proc transfer_16to8copy_ia64# |
.proc transfer_16to8copy_ia64# |
299 |
transfer_16to8copy_ia64: |
transfer_16to8copy_ia64: |
300 |
.prologue |
.prologue |
301 |
|
|
302 |
|
// *** register renaming *** |
303 |
|
dst = r14 |
304 |
|
src_1 = r15 |
305 |
|
src_2 = r17 |
306 |
|
stride = r16 |
307 |
|
|
308 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
309 |
|
.save ar.lc, oldLC |
310 |
|
mov oldLC = ar.lc |
311 |
|
mov oldPR = pr |
312 |
|
|
313 |
|
|
314 |
.body |
.body |
315 |
mov r22 = r0 |
|
316 |
addl r21 = 255, r0 |
// *** Allocating new stackframe, define rotating registers *** |
317 |
mov r20 = r0 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
318 |
mov r19 = r0 |
|
319 |
.L25: |
// *** Saving Paramters *** |
320 |
mov r18 = r0 |
mov dst = r32 |
321 |
;; |
mov src_1 = r33 |
322 |
.L29: |
add src_2 = 8, r33 |
323 |
add r14 = r19, r18 |
mov stride = r34 |
324 |
;; |
|
325 |
dep.z r14 = r14, 1, 32 |
// *** init loop: set loop counter, epilog counter, predicates *** |
326 |
;; |
mov ar.lc = 7 |
327 |
add r14 = r14, r33 |
mov ar.ec = LL + PL + 1 |
328 |
;; |
mov pr.rot = 1 << 16 |
329 |
ld2 r15 = [r14] |
;; |
330 |
;; |
|
331 |
sxt2 r15 = r15 |
// *** define register arrays and predicate array for software pipeline *** |
332 |
;; |
// src_v1 = source value 1, dst_v = destination value |
333 |
mov r16 = r15 |
.rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1] |
334 |
;; |
.rotp ld_stage[LL], pack_stage[PL], st_stage[1] |
335 |
cmp4.le p6, p7 = r0, r16 |
|
336 |
;; |
|
337 |
(p7) mov r16 = r0 |
// Software pipelined loop: |
338 |
(p7) br.cond.dpnt .L106 |
// Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data |
339 |
;; |
// Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data |
340 |
cmp4.ge p6, p7 = r21, r16 |
// Stage 3: Store the 8 byte to the destination address and add stride to |
341 |
;; |
// destination address (to get the next 8 byte line of destination) |
342 |
(p7) addl r16 = 255, r0 |
|
343 |
.L106: |
|
344 |
add r14 = r20, r18 |
.Loop_16to8copy: |
345 |
adds r17 = 1, r18 |
{.mmi |
346 |
;; |
(ld_stage[0]) ld8 src_v1[0] = [src_1], 16 |
347 |
zxt4 r14 = r14 |
(ld_stage[0]) ld8 src_v2[0] = [src_2], 16 |
348 |
add r15 = r19, r17 |
(pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL] |
349 |
;; |
} |
350 |
add r14 = r32, r14 |
{.mib |
351 |
dep.z r15 = r15, 1, 32 |
(st_stage[0]) st8 [dst] = dst_v[PL] |
352 |
;; |
(st_stage[0]) add dst = dst, stride |
353 |
st1 [r14] = r16 |
br.ctop.sptk.few .Loop_16to8copy |
354 |
add r15 = r15, r33 |
;; |
355 |
;; |
} |
356 |
ld2 r14 = [r15] |
|
357 |
;; |
// *** Restore old LC and PRs *** |
358 |
sxt2 r14 = r14 |
mov ar.lc = oldLC |
359 |
;; |
mov pr = oldPR, -1 |
360 |
mov r16 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r16 |
|
|
;; |
|
|
(p7) mov r16 = r0 |
|
|
(p7) br.cond.dpnt .L110 |
|
|
;; |
|
|
cmp4.ge p6, p7 = r21, r16 |
|
|
;; |
|
|
(p7) addl r16 = 255, r0 |
|
|
.L110: |
|
|
add r14 = r20, r17 |
|
|
adds r17 = 2, r18 |
|
|
;; |
|
|
zxt4 r14 = r14 |
|
|
add r15 = r19, r17 |
|
|
;; |
|
|
add r14 = r32, r14 |
|
|
dep.z r15 = r15, 1, 32 |
|
|
;; |
|
|
st1 [r14] = r16 |
|
|
add r15 = r15, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r16 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r16 |
|
|
;; |
|
|
(p7) mov r16 = r0 |
|
|
(p7) br.cond.dpnt .L114 |
|
|
;; |
|
|
cmp4.ge p6, p7 = r21, r16 |
|
|
;; |
|
|
(p7) addl r16 = 255, r0 |
|
|
.L114: |
|
|
add r14 = r20, r17 |
|
|
adds r17 = 3, r18 |
|
|
;; |
|
|
zxt4 r14 = r14 |
|
|
add r15 = r19, r17 |
|
|
;; |
|
|
add r14 = r32, r14 |
|
|
dep.z r15 = r15, 1, 32 |
|
|
;; |
|
|
st1 [r14] = r16 |
|
|
add r15 = r15, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r15 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r15 |
|
|
;; |
|
|
(p7) mov r15 = r0 |
|
|
(p7) br.cond.dpnt .L118 |
|
|
;; |
|
|
cmp4.ge p6, p7 = r21, r15 |
|
|
;; |
|
|
(p7) addl r15 = 255, r0 |
|
|
.L118: |
|
|
add r14 = r20, r17 |
|
|
adds r18 = 4, r18 |
|
|
;; |
|
|
zxt4 r14 = r14 |
|
|
cmp4.geu p6, p7 = 7, r18 |
|
|
;; |
|
|
add r14 = r32, r14 |
|
|
;; |
|
|
st1 [r14] = r15 |
|
|
(p6) br.cond.dptk .L29 |
|
|
adds r22 = 1, r22 |
|
|
add r20 = r20, r34 |
|
|
adds r19 = 8, r19 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r22 |
|
|
(p6) br.cond.dptk .L25 |
|
361 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
362 |
.endp transfer_16to8copy_ia64# |
.endp transfer_16to8copy_ia64# |
363 |
.common transfer_8to16sub#,8,8 |
|
364 |
|
|
365 |
|
|
366 |
|
/////////////////////////////////////////////////////////////////////////////// |
367 |
|
// |
368 |
|
// transfer_16to8add_ia64 |
369 |
|
// |
370 |
|
// The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- |
371 |
|
// bit-values. These are "parallel-added" to the values of src. The result is |
372 |
|
// converted into 8-bit-values using "PACK" and stored at the address of dst. |
373 |
|
// We assume that there is no misalignment. |
374 |
|
// |
375 |
|
/////////////////////////////////////////////////////////////////////////////// |
376 |
|
|
377 |
|
.align 16 |
378 |
|
.global transfer_16to8add_ia64# |
379 |
|
.proc transfer_16to8add_ia64# |
380 |
|
|
381 |
|
transfer_16to8add_ia64: |
// In (as read below): r32 = dst, r33 = src, r34 = stride.
// Per row: 8 bytes are loaded from dst and widened to 2 x 4 16-bit values,
// added to the two 8-byte words loaded from src and src + 8, packed back to
// 8 bytes and stored to the row address that was saved when the row was
// loaded. dst advances by stride, both src pointers advance by 16 per row.
// ar.lc and pr are saved on entry and restored before the return.
382 |
|
.prologue |
383 |
|
|
384 |
|
// *** register renaming *** |
// oldLC / oldPR are not assigned in this renaming block; the symbols come
// from the assignments made earlier in this file (oldLC = r2, oldPR = r3).
385 |
|
dst = r14 |
386 |
|
src = r15 |
387 |
|
stride = r16 |
388 |
|
|
389 |
|
// _src: address of the second 8 bytes of a src row (set to src + 8 below)
_src = r17 |
390 |
|
|
391 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
392 |
|
.save ar.lc, r2 |
393 |
|
mov oldLC = ar.lc |
394 |
|
mov oldPR = pr |
395 |
|
|
396 |
|
|
397 |
|
.body |
398 |
|
|
399 |
|
// *** Allocating new stackframe: 4 inputs, 92 locals, 0 outputs, 96 rotating registers *** |
400 |
|
alloc r9 = ar.pfs, 4, 92, 0, 96 |
401 |
|
|
402 |
|
// *** Saving parameters *** |
403 |
|
mov dst = r32 |
404 |
|
mov src = r33 |
405 |
|
mov stride = r34 |
406 |
|
add _src = 8, r33 |
407 |
|
|
408 |
|
// *** init loop: set loop counter, epilog counter, predicates *** |
409 |
|
mov ar.lc = 7 |
410 |
|
mov ar.ec = LL + UL + PAL + PL + 1 |
411 |
|
mov pr.rot = 1 << 16 |
412 |
|
;; |
413 |
|
|
414 |
|
// *** define register arrays and predicate array for software pipeline *** |
415 |
|
.rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1] |
416 |
|
.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] |
417 |
|
|
418 |
|
|
419 |
|
// Software pipelined loop: |
420 |
|
// s1_p: The values of src and dst are loaded |
421 |
|
// s2_p: The dst-values are converted to 16-bit-values |
422 |
|
// s3_p: The values of src and dst are added |
423 |
|
// s4_p: The results are packed into 8-bit-values |
424 |
|
// s5_p: The 8-bit-values are stored at the dst-addresses |
425 |
|
|
426 |
|
|
427 |
|
.Loop_16to8add: |
428 |
|
{.mii |
429 |
|
(s1_p[0]) ld8 w_src_1[0] = [src], 16 // load the 1st half of row j of src (i = 0..3); src advances by 16 |
430 |
|
(s1_p[0]) mov _dst[0] = dst // keep the address of this dst row for the store in stage s5_p |
431 |
|
(s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL] // parallel addition of src and dst (1st half) |
432 |
|
} |
433 |
|
{.mii |
434 |
|
(s1_p[0]) ld8 w_dst8[0] = [dst], stride // load row j of dst; dst advances by stride |
435 |
|
(s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]; // dst is converted to 16 bit for i = 0..3 |
436 |
|
(s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]; // dst is converted to 16 bit for i = 4..7 |
437 |
|
} |
438 |
|
{.mii |
439 |
|
(s1_p[0]) ld8 w_src_2[0] = [_src], 16 // load the 2nd half of row j of src (i = 4..7); _src advances by 16 |
440 |
|
(s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL] // parallel addition of src and dst (2nd half) |
441 |
|
(s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL] // converts the sums (pixel) to 8-bit values; the range check is done by the instruction itself |
442 |
|
} |
443 |
|
{.mmb |
444 |
|
(s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL] // store the result row at the saved dst address |
445 |
|
(s1_p[0]) nop.m 0 |
446 |
|
br.ctop.sptk.few .Loop_16to8add |
447 |
|
;; |
448 |
|
} |
449 |
|
|
450 |
|
// *** Restore old LC and PRs *** |
451 |
|
mov ar.lc = oldLC |
452 |
|
mov pr = oldPR, -1 |
453 |
|
|
454 |
|
br.ret.sptk.many b0 |
455 |
|
.endp transfer_16to8add_ia64# |
456 |
|
|
457 |
|
|
458 |
|
|
459 |
|
/////////////////////////////////////////////////////////////////////////////// |
460 |
|
// |
461 |
|
// transfer_8to16sub_ia64 |
462 |
|
// |
463 |
|
// The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The |
464 |
|
// Difference of cur and ref is stored at the dct-addresses and cur is copied |
465 |
|
// into the ref-array. |
466 |
|
// |
467 |
|
// You must assume that the data addressed by 'ref' are misaligned in memory. |
468 |
|
// But you can assume, that the other data are aligned (at least I hope so). |
469 |
|
// |
470 |
|
/////////////////////////////////////////////////////////////////////////////// |
471 |
|
|
472 |
.align 16 |
.align 16 |
473 |
.global transfer_8to16sub_ia64# |
.global transfer_8to16sub_ia64# |
474 |
.proc transfer_8to16sub_ia64# |
.proc transfer_8to16sub_ia64# |
475 |
|
|
476 |
|
|
477 |
transfer_8to16sub_ia64: |
transfer_8to16sub_ia64: |
478 |
.prologue |
.prologue |
479 |
|
|
480 |
|
// *** register renaming *** |
481 |
|
oldLC = r2 |
482 |
|
oldPR = r3 |
483 |
|
|
484 |
|
zero = r0 // damit ist die Zahl "zero" = 0 gemeint |
485 |
|
|
486 |
|
//Die folgenden Register erhalten die gleichen Namen, wie die Variablen in der C-Vorlage |
487 |
|
dct = r14 |
488 |
|
cur = r15 |
489 |
|
ref = r34 // muss nicht extra gesichert werden, deswegen bleibt das ÜbergabeRegister in dieser Liste |
490 |
|
stride = r16 |
491 |
|
|
492 |
|
offset = r17 // Offset der falsch ausgerichteten Daten zum zurechtrücken |
493 |
|
aoffset = r18 // Gegenstück zum Offset, |
494 |
|
ref_a1 = r19 // Adresse des ersten 64-Bit Blocks von ref |
495 |
|
ref_a2 = r20 // Adresse des zweiten 64-Bit Blocks von ref |
496 |
|
|
497 |
|
_dct = r21 // Register für die Zieladressen des 2. dct-Blocks |
498 |
|
|
499 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
500 |
|
.save ar.lc, r2 |
501 |
|
mov oldLC = ar.lc |
502 |
|
mov oldPR = pr |
503 |
|
|
504 |
|
|
505 |
.body |
.body |
506 |
mov r25 = r0 |
|
507 |
mov r24 = r0 |
// *** Allocating new stackframe, define rotating registers *** |
508 |
mov r23 = r0 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
509 |
.L39: |
|
510 |
mov r22 = r0 |
// *** Saving Paramters *** |
511 |
;; |
mov dct = r32 |
512 |
.L43: |
mov cur = r33 |
513 |
add r15 = r23, r22 |
// mov ref = r34: ref is unaligned, get aligned ref below... |
514 |
adds r20 = 1, r22 |
mov stride = r35 |
515 |
add r16 = r24, r22 |
|
516 |
;; |
and ref_a1 = -8, ref // Die Adresse des ersten 64-Bit Blocks, in dem ref liegt, wird berechnet (entspricht mod 8) |
517 |
zxt4 r15 = r15 |
dep offset = ref, zero, 3, 3 |
518 |
add r18 = r23, r20 |
;; |
519 |
dep.z r16 = r16, 1, 32 |
add ref_a2 = 8, ref_a1 |
520 |
;; |
sub aoffset = 64, offset // Gegenstück zum Offset wird berechnet |
521 |
add r19 = r34, r15 |
add _dct = 8, dct // Die Adresse für den 2. dct-Block wird berechnet, um 8 Byte (= 64 Bit) höher als beim 1. Block |
522 |
zxt4 r18 = r18 |
|
523 |
add r16 = r16, r32 |
// *** init loop: set loop counter, epilog counter, predicates *** |
524 |
add r15 = r33, r15 |
mov ar.lc = 7 |
525 |
;; |
mov ar.ec = LL + SHL + OL + UL + PSL + 1 |
526 |
ld1 r14 = [r19] |
mov pr.rot = 1 << 16 |
527 |
add r21 = r34, r18 |
;; |
528 |
ld1 r17 = [r15] |
|
529 |
adds r19 = 2, r22 |
// *** define register arrays and predicate array for software pipeline *** |
530 |
add r18 = r33, r18 |
.rotr c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1], dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1] |
531 |
;; |
.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] |
532 |
st1 [r15] = r14 |
|
533 |
sub r17 = r17, r14 |
|
534 |
add r20 = r24, r20 |
// Software pipelined loop: |
535 |
;; |
// s1_p: The values of ref and cur ale loaded, a copy of cur is made. |
536 |
st2 [r16] = r17 |
// s2_p: cur is converted to 16-bit and thehe misaligned values of ref are |
537 |
dep.z r20 = r20, 1, 32 |
// shifted... |
538 |
ld1 r14 = [r21] |
// s3_p: ... and copied together. |
539 |
ld1 r15 = [r18] |
// s4_p: This ref-value is converted to 16-bit. The values of cur are stored |
540 |
add r16 = r23, r19 |
// at the ref-adresses. |
541 |
;; |
// s5_p: the ref- abd cur-values are substracted... |
542 |
st1 [r18] = r14 |
// s6_p: ...and the result is stored at the dct-adresses. |
543 |
sub r15 = r15, r14 |
|
544 |
zxt4 r16 = r16 |
|
545 |
add r20 = r20, r32 |
loop_8to16sub: |
546 |
;; |
{.mii |
547 |
add r18 = r34, r16 |
(s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride // läd den 1. 64-Bit-Block, der einen Teil der ref-Daten enthält |
548 |
adds r17 = 3, r22 |
(s1_p[0]) mov _cur[0] = cur // cur wird für spätere Verwendung gesichert |
549 |
st2 [r20] = r15 |
(s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset // Die rechte Hälfte wird zurechtgerückt |
550 |
add r16 = r33, r16 |
} |
551 |
add r19 = r24, r19 |
{.mii |
552 |
;; |
(s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride // läd den 2. 64-Bit-Block |
553 |
ld1 r14 = [r18] |
(s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset // Die linke Hälfte wird zurechtgerückt |
554 |
add r15 = r23, r17 |
(s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL] // Die zurechtgerückten Daten werden in r zusammenkopiert |
555 |
dep.z r19 = r19, 1, 32 |
} |
556 |
ld1 r18 = [r16] |
{.mii |
557 |
;; |
(s1_p[0]) ld8 c[0] = [cur], stride //läd die j. Zeile von cur komplett |
558 |
zxt4 r15 = r15 |
(s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]; // c wird für i = 0..3 in 16-Bit umgewandelt |
559 |
add r19 = r19, r32 |
(s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]; // c wird für i = 4..7 in 16-Bit umgewandelt |
560 |
st1 [r16] = r14 |
} |
561 |
sub r18 = r18, r14 |
{.mii |
562 |
;; |
(s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL] // cur wird auf den Wert von r gesetzt |
563 |
add r20 = r34, r15 |
//Umwandeln der 8-Bit r und c -Werte in 16-bit Werte |
564 |
st2 [r19] = r18 |
(s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]; // r wird für i = 0..3 in 16-Bit umgewandelt |
565 |
add r15 = r33, r15 |
(s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]; // r wird für i = 4..7 in 16-Bit umgewandelt |
566 |
add r17 = r24, r17 |
} |
567 |
;; |
{.mii |
568 |
ld1 r14 = [r20] |
(s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL] // Subtraktion der 1. Häfte der j. Zeile |
569 |
ld1 r16 = [r15] |
(s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL] // Subtraktion der 2. Hälfte |
570 |
dep.z r17 = r17, 1, 32 |
} |
571 |
;; |
{.mmb |
572 |
add r17 = r17, r32 |
(s6_p[0]) st8 [dct] = dct_1[PSL], 16 // speichert den 1. 64-Bit-Block an der vorgesehenen Adresse, erhöhen der Adresse um 16 Byte für den nächsten Wert |
573 |
adds r22 = 4, r22 |
(s6_p[0]) st8 [_dct] = dct_2[PSL], 16 // speichert den 2. 64-Bit-Block an der vorgesehenen Adresse, erhöhen der Adresse um 16 Byte für den nächsten Wert |
574 |
st1 [r15] = r14 |
br.ctop.sptk.few loop_8to16sub // Und hopp |
575 |
sub r16 = r16, r14 |
;; |
576 |
;; |
} |
577 |
cmp4.geu p6, p7 = 7, r22 |
|
578 |
st2 [r17] = r16 |
// *** Restore old LC and PRs *** |
579 |
(p6) br.cond.dptk .L43 |
mov ar.lc = oldLC |
580 |
adds r25 = 1, r25 |
mov pr = oldPR, -1 |
581 |
adds r24 = 8, r24 |
|
|
add r23 = r23, r35 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r25 |
|
|
(p6) br.cond.dptk .L39 |
|
582 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
583 |
.endp transfer_8to16sub_ia64# |
.endp transfer_8to16sub_ia64# |
584 |
.common transfer_8to16sub2#,8,8 |
|
585 |
|
|
586 |
|
|
587 |
|
|
588 |
|
|
589 |
|
/////////////////////////////////////////////////////////////////////////////// |
590 |
|
// |
591 |
|
// transfer_8to16sub2_ia64 |
592 |
|
// |
593 |
|
// At the time, this function was written, it was not yet in use. |
594 |
|
// We assume that the values of ref1/2 are misaligned. |
595 |
|
// |
596 |
|
// The values of ref1/2 and cur are loaded, the ref-values need misalignment- |
597 |
|
// treatment. The values are converted to 16-bit using unpack. The average of |
598 |
|
// ref1 and ref2 is computed with pavg and subtracted from cur. The results are |
599 |
|
// stored at the dct-addresses. |
600 |
|
// pavg1.raz is used to get the same results as the C-code-function. |
601 |
|
// |
602 |
|
/////////////////////////////////////////////////////////////////////////////// |
603 |
|
|
604 |
|
.text |
605 |
.align 16 |
.align 16 |
606 |
.global transfer_8to16sub2_ia64# |
.global transfer_8to16sub2_ia64# |
607 |
.proc transfer_8to16sub2_ia64# |
.proc transfer_8to16sub2_ia64# |
608 |
|
|
609 |
transfer_8to16sub2_ia64: |
transfer_8to16sub2_ia64: |
610 |
.prologue |
.prologue |
611 |
|
|
612 |
|
// *** register renaming *** |
613 |
|
// We've tried to keep the C-Code names as often as possible, at least as |
614 |
|
// part of register-names |
615 |
|
oldLC = r2 |
616 |
|
oldPR = r3 |
617 |
|
|
618 |
|
zero = r0 |
619 |
|
|
620 |
|
dct_al = r14 // dct: adress of left block in one line |
621 |
|
dct_ar = r15 // dct: adress of right block in one line |
622 |
|
cur = r16 |
623 |
|
ref1_al = r17 // ref1: aligned adress of lower part |
624 |
|
ref1_ah = r18 // ref1: aligned adress of higher part |
625 |
|
ref2_al = r19 // ref2: aligned adress of lower part |
626 |
|
ref2_ah = r20 // ref2: aligned adress of higher part |
627 |
|
stride = r21 |
628 |
|
|
629 |
|
offset_1 = r22 |
630 |
|
offset_2 = r23 |
631 |
|
aoffset_1 = r24 |
632 |
|
aoffset_2 = r25 |
633 |
|
|
634 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
635 |
.save ar.lc, r2 |
.save ar.lc, r2 |
636 |
mov r2 = ar.lc |
mov oldLC = ar.lc |
637 |
|
mov oldPR = pr |
638 |
|
|
639 |
|
|
640 |
.body |
.body |
641 |
mov r28 = r0 |
|
642 |
addl r27 = 255, r0 |
// *** Saving Parameters *** |
643 |
mov r26 = r0 |
// *** (as inputregisters r32 + are needed for register-rotation) *** |
644 |
mov r25 = r0 |
mov dct_ar = r32 |
645 |
.L50: |
add dct_al = 8, r32 |
646 |
addl r14 = 3, r0 |
mov cur = r33 |
647 |
mov r21 = r0 |
|
648 |
;; |
and ref1_al = -8, r34 |
649 |
mov ar.lc = r14 |
and ref2_al = -8, r35 // ref2 aligned adrress of lower part |
650 |
;; |
|
651 |
.L138: |
mov stride = r36 |
652 |
add r14 = r26, r21 |
|
653 |
add r17 = r25, r21 |
// *** Calculations for Misalignment-Handling *** |
654 |
adds r19 = 1, r21 |
dep offset_1 = r34, zero, 3, 3 |
655 |
;; |
dep offset_2 = r35, zero, 3, 3 |
656 |
zxt4 r17 = r17 |
;; |
657 |
dep.z r14 = r14, 1, 32 |
add ref1_ah = 8, ref1_al |
658 |
add r18 = r25, r19 |
add ref2_ah = 8, ref2_al |
659 |
;; |
sub aoffset_1 = 64, offset_1 |
660 |
add r15 = r34, r17 |
sub aoffset_2 = 64, offset_2 |
661 |
add r23 = r14, r32 |
;; |
662 |
add r20 = r35, r17 |
|
663 |
;; |
// *** Allocating new stackframe, define rotating registers *** |
664 |
ld1 r14 = [r15] |
alloc r9 = ar.pfs, 5, 91, 0, 96 |
665 |
ld1 r16 = [r20] |
|
666 |
add r17 = r33, r17 |
// *** init loop: set loop counter, epilog counter, predicates *** |
667 |
;; |
mov ar.lc = 7 |
668 |
add r14 = r14, r16 |
mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1 |
669 |
ld1 r15 = [r17] |
mov pr.rot = 1 << 16 |
670 |
zxt4 r18 = r18 |
;; |
671 |
;; |
|
672 |
adds r14 = 1, r14 |
// *** define register arrays and predicate array for software pipeline *** |
673 |
add r24 = r35, r18 |
.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] |
674 |
add r22 = r34, r18 |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] |
675 |
;; |
|
676 |
shr.u r14 = r14, 1 |
|
677 |
add r19 = r26, r19 |
// software pipelined loop: |
678 |
add r16 = r33, r18 |
// ld_stage: The values of ref1, ref2, cur are loaded |
679 |
;; |
// sh_stage: The misaligned values of ref1/2 are shifted... |
680 |
cmp4.ge p6, p7 = r27, r14 |
// or_stage: ...and copied together. |
681 |
dep.z r19 = r19, 1, 32 |
// pavg_stage: The average of ref1 and ref2 is computed. |
682 |
adds r21 = 2, r21 |
// up_stage: The result and the cur-values are converted to 16-bit. |
683 |
;; |
// psub_stage: Those values are subtracted... |
684 |
(p7) addl r14 = 255, r0 |
// st_stage: ...and stored at the dct-addresses. |
685 |
add r19 = r19, r32 |
|
686 |
;; |
|
687 |
sub r14 = r15, r14 |
.Loop_8to16sub2: |
688 |
;; |
{.mii |
689 |
st2 [r23] = r14 |
(ld_stage[0]) ld8 c[0] = [cur], stride |
690 |
ld1 r14 = [r24] |
(sh_stage[0]) shr.u ref1_l[0] = ref1_vl[LL], offset_1 |
691 |
ld1 r15 = [r22] |
(sh_stage[0]) shl ref1_h[0] = ref1_vh[LL], aoffset_1 |
692 |
ld1 r16 = [r16] |
} |
693 |
;; |
{.mii |
694 |
add r15 = r15, r14 |
(ld_stage[0]) ld8 ref1_vl[0] = [ref1_al], stride |
695 |
;; |
(sh_stage[0]) shr.u ref2_l[0] = ref2_vl[LL], offset_2 |
696 |
adds r15 = 1, r15 |
(sh_stage[0]) shl ref2_h[0] = ref2_vh[LL], aoffset_2 |
697 |
;; |
} |
698 |
shr.u r14 = r15, 1 |
{.mii |
699 |
;; |
(ld_stage[0]) ld8 ref1_vh[0] = [ref1_ah], stride |
700 |
cmp4.ge p6, p7 = r27, r14 |
(or_stage[0]) or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL] |
701 |
;; |
(or_stage[0]) or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL] |
702 |
(p7) addl r14 = 255, r0 |
} |
703 |
;; |
{.mii |
704 |
sub r14 = r16, r14 |
(ld_stage[0]) ld8 ref2_vl[0] = [ref2_al], stride |
705 |
;; |
(pavg_stage[0]) pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL] |
706 |
st2 [r19] = r14 |
(up_stage[0]) unpack1.l r16_r[0] = zero, r[PAVGL] |
707 |
br.cloop.sptk.few .L138 |
} |
708 |
adds r28 = 1, r28 |
{.mii |
709 |
adds r26 = 8, r26 |
(ld_stage[0]) ld8 ref2_vh[0] = [ref2_ah], stride |
710 |
add r25 = r25, r36 |
(up_stage[0]) unpack1.h r16_l[0] = zero, r[PAVGL] |
711 |
;; |
(up_stage[0]) unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL] |
712 |
cmp4.geu p6, p7 = 7, r28 |
} |
713 |
(p6) br.cond.dptk .L50 |
{.mii |
714 |
mov ar.lc = r2 |
(st_stage[0]) st8 [dct_ar] = dct16_r[PSL], 16 |
715 |
|
(up_stage[0]) unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL] |
716 |
|
(psub_stage[0]) psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL] |
717 |
|
} |
718 |
|
{.mib |
719 |
|
(st_stage[0]) st8 [dct_al] = dct16_l[PSL], 16 |
720 |
|
(psub_stage[0]) psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL] |
721 |
|
br.ctop.sptk.few .Loop_8to16sub2 // Und hopp |
722 |
|
;; |
723 |
|
} |
724 |
|
|
725 |
|
// *** Restore old LC and PRs *** |
726 |
|
mov ar.lc = oldLC |
727 |
|
mov pr = oldPR, -1 |
728 |
|
|
729 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
730 |
.endp transfer_8to16sub2_ia64# |
.endp transfer_8to16sub2_ia64# |
|
.common transfer_16to8add#,8,8 |
|
|
.align 16 |
|
|
.global transfer_16to8add_ia64# |
|
|
.proc transfer_16to8add_ia64# |
|
|
transfer_16to8add_ia64: |
|
|
.prologue |
|
|
.save ar.lc, r2 |
|
|
mov r2 = ar.lc |
|
|
.body |
|
|
mov r26 = r0 |
|
|
addl r25 = 255, r0 |
|
|
mov r24 = r0 |
|
|
mov r21 = r0 |
|
|
.L62: |
|
|
addl r14 = 3, r0 |
|
|
mov r20 = r0 |
|
|
;; |
|
|
mov ar.lc = r14 |
|
|
;; |
|
|
.L149: |
|
|
adds r17 = 1, r20 |
|
|
add r14 = r21, r20 |
|
|
add r15 = r24, r20 |
|
|
;; |
|
|
zxt4 r14 = r14 |
|
|
add r18 = r21, r17 |
|
|
dep.z r15 = r15, 1, 32 |
|
|
;; |
|
|
add r23 = r32, r14 |
|
|
zxt4 r18 = r18 |
|
|
add r15 = r15, r33 |
|
|
;; |
|
|
mov r16 = r23 |
|
|
add r22 = r32, r18 |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
ld1 r18 = [r16] |
|
|
add r19 = r24, r17 |
|
|
adds r20 = 2, r20 |
|
|
;; |
|
|
add r14 = r14, r18 |
|
|
dep.z r19 = r19, 1, 32 |
|
|
mov r16 = r22 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
add r19 = r19, r33 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
cmp4.ge p8, p9 = r25, r14 |
|
|
;; |
|
|
(p7) mov r14 = r0 |
|
|
(p7) br.cond.dpnt .L143 |
|
|
;; |
|
|
(p9) addl r14 = 255, r0 |
|
|
;; |
|
|
.L143: |
|
|
st1 [r23] = r14 |
|
|
ld1 r14 = [r22] |
|
|
ld2 r15 = [r19] |
|
|
;; |
|
|
add r15 = r15, r14 |
|
|
;; |
|
|
sxt2 r15 = r15 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r15 |
|
|
cmp4.ge p8, p9 = r25, r15 |
|
|
;; |
|
|
(p7) mov r15 = r0 |
|
|
(p7) br.cond.dpnt .L147 |
|
|
;; |
|
|
(p9) addl r15 = 255, r0 |
|
|
;; |
|
|
.L147: |
|
|
st1 [r16] = r15 |
|
|
br.cloop.sptk.few .L149 |
|
|
adds r26 = 1, r26 |
|
|
adds r24 = 8, r24 |
|
|
add r21 = r21, r34 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r26 |
|
|
(p6) br.cond.dptk .L62 |
|
|
mov ar.lc = r2 |
|
|
br.ret.sptk.many b0 |
|
|
.endp transfer_16to8add_ia64# |
|
|
.common transfer8x8_copy#,8,8 |
|
|
.align 16 |
|
|
.global transfer8x8_copy_ia64# |
|
|
.proc transfer8x8_copy_ia64# |
|
|
transfer8x8_copy_ia64: |
|
|
.prologue |
|
|
.save ar.lc, r2 |
|
|
mov r2 = ar.lc |
|
|
.body |
|
|
addl r14 = 7, r0 |
|
|
mov r21 = r0 |
|
|
;; |
|
|
mov ar.lc = r14 |
|
|
;; |
|
|
.L168: |
|
|
zxt4 r14 = r21 |
|
|
adds r15 = 1, r21 |
|
|
adds r18 = 2, r21 |
|
|
;; |
|
|
add r16 = r33, r14 |
|
|
zxt4 r15 = r15 |
|
|
zxt4 r18 = r18 |
|
|
;; |
|
|
ld1 r17 = [r16] |
|
|
add r14 = r32, r14 |
|
|
add r19 = r33, r15 |
|
|
;; |
|
|
st1 [r14] = r17 |
|
|
add r15 = r32, r15 |
|
|
add r20 = r33, r18 |
|
|
ld1 r16 = [r19] |
|
|
adds r14 = 3, r21 |
|
|
add r18 = r32, r18 |
|
|
;; |
|
|
st1 [r15] = r16 |
|
|
zxt4 r14 = r14 |
|
|
adds r17 = 4, r21 |
|
|
ld1 r15 = [r20] |
|
|
;; |
|
|
add r19 = r33, r14 |
|
|
zxt4 r17 = r17 |
|
|
st1 [r18] = r15 |
|
|
add r14 = r32, r14 |
|
|
;; |
|
|
add r20 = r33, r17 |
|
|
ld1 r15 = [r19] |
|
|
adds r16 = 5, r21 |
|
|
add r17 = r32, r17 |
|
|
;; |
|
|
st1 [r14] = r15 |
|
|
zxt4 r16 = r16 |
|
|
adds r18 = 6, r21 |
|
|
ld1 r14 = [r20] |
|
|
;; |
|
|
add r19 = r33, r16 |
|
|
zxt4 r18 = r18 |
|
|
st1 [r17] = r14 |
|
|
add r16 = r32, r16 |
|
|
;; |
|
|
add r20 = r33, r18 |
|
|
ld1 r14 = [r19] |
|
|
adds r15 = 7, r21 |
|
|
add r18 = r32, r18 |
|
|
;; |
|
|
st1 [r16] = r14 |
|
|
zxt4 r15 = r15 |
|
|
add r21 = r21, r34 |
|
|
ld1 r16 = [r20] |
|
|
;; |
|
|
add r17 = r33, r15 |
|
|
st1 [r18] = r16 |
|
|
add r15 = r32, r15 |
|
|
;; |
|
|
ld1 r14 = [r17] |
|
|
;; |
|
|
st1 [r15] = r14 |
|
|
br.cloop.sptk.few .L168 |
|
|
;; |
|
|
mov ar.lc = r2 |
|
|
br.ret.sptk.many b0 |
|
|
.endp transfer8x8_copy_ia64# |
|
|
.ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" |
|