ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvs/xvidcore/src/dct/ia64_asm/fdct_ia64.s
Revision: 1.5
Committed: Sat Feb 15 15:22:18 2003 UTC (21 years, 7 months ago) by edgomez
Branch: MAIN
CVS Tags: release-1_2_0, tag-branching-1_2_0, release-1_1_3-final, release-1_1_3, release-1_1_2, release-1_1_1-final, release-1_1_0_final, release-1_1_0, release-1_0_3, release-1_0_2, release-1_0_1, release-1_0_0, tag-merging-20040322, cvs-head, merged-dev-api-3
Branch point for: release-1_2-branch, release-1_1-branch, release-1_0-branch, Isibaar, dev-api-4
Changes since 1.4: +36 -36 lines
Error occurred while calculating annotation data.
Log Message:
Moved dev-api-3 to HEAD -- Nasty but efficient -- Merging work has been done too

File Contents

# Content
1 // ******************************************************************************
2 // * *
3 // * This file is part of XviD, a free MPEG-4 video encoder/decoder *
4 // * *
5 // * *
6 // * XviD is free software; you can redistribute it and/or modify it *
7 // * under the terms of the GNU General Public License as published by *
8 // * the Free Software Foundation; either version 2 of the License, or *
9 // * (at your option) any later version. *
10 // * *
11 // * XviD is distributed in the hope that it will be useful, but *
12 // * WITHOUT ANY WARRANTY; without even the implied warranty of *
13 // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 // * GNU General Public License for more details. *
15 // * *
16 // * You should have received a copy of the GNU General Public License *
17 // * along with this program; if not, write to the Free Software *
18 // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
19 // * *
20 // ******************************************************************************
21 //
22 // ******************************************************************************
23 // * *
24 // * fdct_ia64.s, IA-64 optimized forward DCT *
25 // * *
26 // * Completed version provided by Intel at AppNote AP-922 *
27 // * http://developer.intel.com/software/products/college/ia32/strmsimd/ *
28 // * Copyright (C) 1999 Intel Corporation, *
29 // * *
30 // * This version was implemented during an IA-64 practical training at *
31 // * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) *
32 // * Copyright (C) 2002 - Stephan Krause, Ingo-Marc Weber, Daniel Kallfass *
33 // * *
34 // * For more information visit the XviD homepage: http://www.xvid.org *
35 // * *
36 // ******************************************************************************
37 //
38 // ******************************************************************************
39 // * *
40 // * Revision history: *
41 // * *
42 // * 24.07.2002 Initial Version *
43 // * *
44 // ******************************************************************************
45
46
47 // This is a fast, precise implementation of the 8x8 Discrete Cosine Transform
48 // published in Intel Application Note 922 (1999) and optimized for IA-64.
49 //
50 // An unoptimized, "straightforward" version (commented out) can be found at the end of this file.
51
52
53 .pred.safe_across_calls p1-p5,p16-p63
54 .text
55 .align 16
56 .global fdct_ia64#
57 .proc fdct_ia64#
58 fdct_ia64: // in-place forward DCT of the 128-byte block addressed by r32 (handled as 8x8 16-bit values, four per 64-bit register)
59 .prologue
60 alloc r14 = ar.pfs, 1, 56, 0, 0 // 1 input (r32), 56 locals (loc0-loc55); previous ar.pfs is kept in r14 and written back before the return
61 // Constants: 16-bit fixed-point factors (value * 2^16); the mux2 instructions further down copy each one into all four 16-bit lanes
62 mov r31 = 0x32ec // c0 = tan(1pi/16)
63 mov r30 = 0x6a0a // c1 = tan(2pi/16)
64 mov r29 = 0xab0e // c2 = tan(3pi/16)
65 mov r28 = 0xb505 // g4 = cos(4pi/16)
66 mov r27 = 0xd4db // g3 = cos(3pi/16)
67 mov r26 = 0xec83 // g2 = cos(2pi/16)
68 mov r25 = 0xfb15 // g1 = cos(1pi/16)
69 mov r24 = 0x0002 // rounding term added before the ">> 2" descale after the column pass
70 mov r23 = 0x0004 // rounding term added before the final ">> 3" descale
71 // NOTE(review): every product with c2 or g1-g4 (all >= 0x8000) has its multiplicand added back afterwards, products with c0/c1 do not -- consistent with pmpyshr2 treating the factor as signed (factor - 1.0); confirm against the ISA manual
72 // Load matrix into registers: loc0-loc15 hold the 16 addresses (8-byte steps), loc16-loc31 receive the data
73
74 add loc0 = r0, r32 // loc0, loc2, ..., loc14 = r32 + 0, 16, ..., 112
75 add loc2 = 16, r32
76 add loc4 = 32, r32
77 add loc6 = 48, r32
78 add loc8 = 64, r32
79 add loc10 = 80, r32
80 add loc12 = 96, r32
81 add loc14 = 112, r32
82 add loc1 = 8, r32 // loc1, loc3, ..., loc15 = r32 + 8, 24, ..., 120
83 add loc3 = 24, r32
84 add loc5 = 40, r32
85 add loc7 = 56, r32
86 add loc9 = 72, r32
87 add loc11 = 88, r32
88 add loc13 = 104, r32
89 add loc15 = 120, r32
90 ;;
91 ld8 loc16 = [loc0] // loc16-loc23 = 8-byte words at offsets 0, 16, ..., 112 (x0-x7 of the "1st half")
92 ld8 loc17 = [loc2]
93 ld8 loc18 = [loc4]
94 ld8 loc19 = [loc6]
95 ld8 loc20 = [loc8]
96 ld8 loc21 = [loc10]
97 ld8 loc22 = [loc12]
98 ld8 loc23 = [loc14]
99 ld8 loc24 = [loc1] // loc24-loc31 = 8-byte words at offsets 8, 24, ..., 120 (x0.2-x7.2 of the "2nd half")
100 ld8 loc25 = [loc3]
101 ld8 loc26 = [loc5]
102 ld8 loc27 = [loc7]
103 mux2 r26 = r26, 0x00 // mux2 with 0x00: replicate the low 16-bit element into all four lanes (same for the mux2 lines below)
104 ld8 loc28 = [loc9]
105 mux2 r31 = r31, 0x00
106 mux2 r25 = r25, 0x00
107 ld8 loc29 = [loc11]
108 mux2 r30 = r30, 0x00
109 mux2 r29 = r29, 0x00
110 ld8 loc30 = [loc13]
111 mux2 r28 = r28, 0x00
112 mux2 r27 = r27, 0x00
113 ld8 loc31 = [loc15]
114 mux2 r24 = r24, 0x00
115 mux2 r23 = r23, 0x00
116 ;;
117 pshl2 loc16 = loc16, 3 // scale the input by 8 (<< 3 in every 16-bit lane)
118 pshl2 loc17 = loc17, 3
119 pshl2 loc18 = loc18, 3
120 pshl2 loc19 = loc19, 3
121 pshl2 loc20 = loc20, 3
122 pshl2 loc21 = loc21, 3
123 pshl2 loc22 = loc22, 3
124 pshl2 loc23 = loc23, 3
125 ;;
126 pshl2 loc24 = loc24, 3 // scaling of loc24-loc31 is interleaved with the arithmetic below
127
128 // *******************
129 // column-DCT 1st half
130 // *******************
131
132 psub2 loc37 = loc17, loc22 // t5 = x1 - x6
133 pshl2 loc25 = loc25, 3
134 pshl2 loc26 = loc26, 3
135 psub2 loc38 = loc18, loc21 // t6 = x2 - x5
136 pshl2 loc27 = loc27, 3
137 pshl2 loc28 = loc28, 3
138 ;;
139 padd2 loc32 = loc16, loc23 // t0 = x0 + x7
140 pshl2 loc29 = loc29, 3
141 pshl2 loc30 = loc30, 3
142 padd2 loc33 = loc17, loc22 // t1 = x1 + x6
143 padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
144 psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
145 ;;
146 padd2 loc34 = loc18, loc21 // t2 = x2 + x5
147 pshl2 loc31 = loc31, 3
148 padd2 loc35 = loc19, loc20 // t3 = x3 + x4
149 psub2 loc36 = loc16, loc23 // t4 = x0 - x7
150 pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
151 pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
152 ;;
153 psub2 loc39 = loc19, loc20 // t7 = x3 - x4
154 padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 (add-back of the multiplicand)
155 padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 (add-back of the multiplicand)
156
157 padd2 loc16 = loc32, loc35 // x0 = t0 + t3
158 padd2 loc17 = loc33, loc34 // x1 = t1 + t2
159 psub2 loc18 = loc32, loc35 // x2 = t0 - t3
160 ;;
161 psub2 loc19 = loc33, loc34 // x3 = t1 - t2
162 padd2 loc20 = loc36, loc37 // x4 = t4 + t5
163 padd2 loc21 = loc38, loc39 // x5 = t6 + t7
164 psub2 loc22 = loc36, loc37 // x6 = t4 - t5
165 psub2 loc23 = loc38, loc39 // x7 = t6 - t7
166 ;;
167 pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
168 padd2 loc32 = loc16, loc17 // t0 = x0 + x1
169 pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
170 pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
171 psub2 loc33 = loc16, loc17 // t1 = x0 - x1
172 pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
173 pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
174 ;;
175 padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
176 pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
177 padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
178 padd2 loc34 = loc18, loc43 // t2 = x2 + buf3 (x3 * c1)
179 psub2 loc35 = loc42, loc19 // t3 = buf2 (x2 * c1) - x3
180 psub2 loc37 = loc44, loc21 // t5 = buf4 (x4 * c0) - x5
181 ;;
182 padd2 loc36 = loc20, loc45 // t4 = x4 + buf5 (x5 * c0)
183 padd2 loc38 = loc22, loc47 // t6 = x6 + buf7
184 pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
185 pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
186 psub2 loc39 = loc46, loc23 // t7 = buf6 - x7
187 ;;
188 padd2 loc48 = loc16, loc32 // y0 = x0 + t0
189 pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
190 pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
191 padd2 loc52 = loc17, loc33 // y4 = x1 + t1
192 pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
193 pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
194 ;;
195 padd2 loc50 = loc18, loc34 // y2 = x2 + t2
196 pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
197 pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
198 padd2 loc55 = loc21, loc37 // y7 = x5 + t5
199 padd2 loc49 = loc20, loc36 // y1 = x4 + t4
200 padd2 loc54 = loc19, loc35 // y6 = x3 + t3
201 ;;
202 padd2 loc51 = loc22, loc38 // y3 = x6 + t6
203 padd2 loc53 = loc23, loc39 // y5 = x7 + t7
204
205 // divide y0-y7 by 4 with rounding: add 2 (r24), then shift right by 2
206
207 padd2 loc48 = loc48, r24
208 padd2 loc49 = loc49, r24
209 padd2 loc50 = loc50, r24
210 padd2 loc52 = loc52, r24
211 ;;
212 padd2 loc51 = loc51, r24
213 pshr2 loc48 = loc48, 2
214 padd2 loc53 = loc53, r24
215 pshr2 loc49 = loc49, 2
216 padd2 loc54 = loc54, r24
217 pshr2 loc50 = loc50, 2
218 padd2 loc55 = loc55, r24
219 pshr2 loc52 = loc52, 2
220 ;;
221 pshr2 loc51 = loc51, 2
222 pshr2 loc53 = loc53, 2
223 pshr2 loc54 = loc54, 2
224 pshr2 loc55 = loc55, 2
225
226
227 // *******************
228 // column-DCT 2nd half
229 // *******************
230
231 psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2
232 psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2
233 padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2
234 padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2
235 ;;
236 padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2
237 psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
238 padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
239 padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2
240 ;;
241 psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2
242 pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
243 pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
244 ;;
245 psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2
246 padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 (add-back of the multiplicand)
247 padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 (add-back of the multiplicand)
248
249 padd2 loc16 = loc32, loc35 // x0 = t0 + t3
250 padd2 loc17 = loc33, loc34 // x1 = t1 + t2
251 psub2 loc18 = loc32, loc35 // x2 = t0 - t3
252 ;;
253 psub2 loc19 = loc33, loc34 // x3 = t1 - t2
254 padd2 loc20 = loc36, loc37 // x4 = t4 + t5
255 padd2 loc21 = loc38, loc39 // x5 = t6 + t7
256 psub2 loc22 = loc36, loc37 // x6 = t4 - t5
257 psub2 loc23 = loc38, loc39 // x7 = t6 - t7
258 ;;
259 pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
260 padd2 loc32 = loc16, loc17 // t0 = x0 + x1
261 pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
262 pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
263 psub2 loc33 = loc16, loc17 // t1 = x0 - x1
264 pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
265 pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
266 pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
267 ;;
268 padd2 loc34 = loc18, loc43 // t2 = x2 + buf3
269 padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
270 padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
271 psub2 loc35 = loc42, loc19 // t3 = buf2 - x3
272 padd2 loc36 = loc20, loc45 // t4 = x4 + buf5
273 pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
274 ;;
275 psub2 loc37 = loc44, loc21 // t5 = buf4 - x5
276 padd2 loc38 = loc22, loc47 // t6 = x6 + buf7
277 psub2 loc39 = loc46, loc23 // t7 = buf6 - x7
278 pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
279 ;;
280 padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0
281 pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
282 pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
283 padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1
284 pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
285 pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
286 ;;
287 padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2
288 pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
289 pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
290 padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5
291 padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4
292 padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3
293 ;;
294 padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6
295
296 // *******************
297 // transpose matrix (interleaved with the tail of the column pass and the rounding of y0.2-y7.2)
298 // *******************
299
300 mix2.r loc32 = loc48, loc49 // tmp0 = mixr y0, y1
301 mix2.l loc33 = loc48, loc49 // tmp1 = mixl y0, y1
302 padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7
303 mix2.r loc34 = loc50, loc51 // tmp2 = mixr y2, y3
304 mix2.l loc35 = loc50, loc51 // tmp3 = mixl y2, y3
305 ;;
306
307 // divide y0.2-y7.2 by 4 with rounding: add 2 (r24), then shift right by 2
308
309 padd2 loc40 = loc40, r24
310 padd2 loc41 = loc41, r24
311 mix4.r loc16 = loc32, loc34 // x0 = mixr tmp0, tmp2
312 padd2 loc42 = loc42, r24
313 padd2 loc43 = loc43, r24
314 mix4.r loc17 = loc33, loc35 // x1 = mixr tmp1, tmp3
315 padd2 loc44 = loc44, r24
316 padd2 loc45 = loc45, r24
317 mix4.l loc18 = loc32, loc34 // x2 = mixl tmp0, tmp2
318 padd2 loc46 = loc46, r24
319 padd2 loc47 = loc47, r24
320 mix4.l loc19 = loc33, loc35 // x3 = mixl tmp1, tmp3
321 ;;
322 pshr2 loc40 = loc40, 2
323 pshr2 loc41 = loc41, 2
324 pshr2 loc42 = loc42, 2
325 pshr2 loc43 = loc43, 2
326 mix2.r loc32 = loc52, loc53 // tmp0 = mixr y4, y5
327 mix2.l loc33 = loc52, loc53 // tmp1 = mixl y4, y5
328 mix2.r loc34 = loc54, loc55 // tmp2 = mixr y6, y7
329 mix2.l loc35 = loc54, loc55 // tmp3 = mixl y6, y7
330 ;;
331 pshr2 loc44 = loc44, 2
332 pshr2 loc45 = loc45, 2
333 pshr2 loc46 = loc46, 2
334 pshr2 loc47 = loc47, 2
335 mix4.r loc24 = loc32, loc34 // x0.2 = mixr tmp0, tmp2
336 mix4.r loc25 = loc33, loc35 // x1.2 = mixr tmp1, tmp3
337 mix4.l loc26 = loc32, loc34 // x2.2 = mixl tmp0, tmp2
338 mix4.l loc27 = loc33, loc35 // x3.2 = mixl tmp1, tmp3
339 ;;
340 mix2.r loc32 = loc40, loc41 // tmp0 = mixr y0.2, y1.2
341 mix2.l loc33 = loc40, loc41 // tmp1 = mixl y0.2, y1.2
342 mix2.r loc34 = loc42, loc43 // tmp2 = mixr y2.2, y3.2
343 mix2.l loc35 = loc42, loc43 // tmp3 = mixl y2.2, y3.2
344 ;;
345 mix4.r loc20 = loc32, loc34 // x4 = mixr tmp0, tmp2
346 mix4.r loc21 = loc33, loc35 // x5 = mixr tmp1, tmp3
347 mix4.l loc22 = loc32, loc34 // x6 = mixl tmp0, tmp2
348 mix4.l loc23 = loc33, loc35 // x7 = mixl tmp1, tmp3
349 ;;
350 mix2.r loc32 = loc44, loc45 // tmp0 = mixr y4.2, y5.2
351 mix2.l loc33 = loc44, loc45 // tmp1 = mixl y4.2, y5.2
352 mix2.r loc34 = loc46, loc47 // tmp2 = mixr y6.2, y7.2
353 mix2.l loc35 = loc46, loc47 // tmp3 = mixl y6.2, y7.2
354 ;;
355 mix4.r loc28 = loc32, loc34 // x4.2 = mixr tmp0, tmp2
356 mix4.r loc29 = loc33, loc35 // x5.2 = mixr tmp1, tmp3
357 mix4.l loc30 = loc32, loc34 // x6.2 = mixl tmp0, tmp2
358 mix4.l loc31 = loc33, loc35 // x7.2 = mixl tmp1, tmp3
359
360 // *******************
361 // row-DCT 1st half (same butterfly as the column pass, applied to the transposed data)
362 // *******************
363
364 psub2 loc37 = loc17, loc22 // t5 = x1 - x6
365 psub2 loc38 = loc18, loc21 // t6 = x2 - x5
366 ;;
367 padd2 loc32 = loc16, loc23 // t0 = x0 + x7
368 padd2 loc33 = loc17, loc22 // t1 = x1 + x6
369 padd2 loc34 = loc18, loc21 // t2 = x2 + x5
370 psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
371 padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
372 padd2 loc35 = loc19, loc20 // t3 = x3 + x4
373 ;;
374 psub2 loc36 = loc16, loc23 // t4 = x0 - x7
375 pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
376 pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
377 ;;
378 psub2 loc39 = loc19, loc20 // t7 = x3 - x4
379 padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 (add-back of the multiplicand)
380 padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 (add-back of the multiplicand)
381
382 padd2 loc16 = loc32, loc35 // x0 = t0 + t3
383 padd2 loc17 = loc33, loc34 // x1 = t1 + t2
384 psub2 loc18 = loc32, loc35 // x2 = t0 - t3
385 ;;
386 psub2 loc19 = loc33, loc34 // x3 = t1 - t2
387 padd2 loc20 = loc36, loc37 // x4 = t4 + t5
388 padd2 loc21 = loc38, loc39 // x5 = t6 + t7
389 psub2 loc22 = loc36, loc37 // x6 = t4 - t5
390 psub2 loc23 = loc38, loc39 // x7 = t6 - t7
391 ;;
392 pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
393 padd2 loc32 = loc16, loc17 // t0 = x0 + x1
394 pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
395 pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
396 psub2 loc33 = loc16, loc17 // t1 = x0 - x1
397 pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
398 pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
399 pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
400 ;;
401 padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
402 padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
403 padd2 loc34 = loc18, loc43 // t2 = x2 + buf3 (x3 * c1)
404 ;;
405 psub2 loc35 = loc42, loc19 // t3 = buf2 (x2 * c1) - x3
406 padd2 loc36 = loc20, loc45 // t4 = x4 + buf5 (x5 * c0)
407 psub2 loc37 = loc44, loc21 // t5 = buf4 (x4 * c0) - x5
408 padd2 loc38 = loc22, loc47 // t6 = x6 + buf7
409 psub2 loc39 = loc46, loc23 // t7 = buf6 - x7
410 pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
411 pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
412 ;;
413 padd2 loc48 = loc16, loc32 // y0 = x0 + t0
414 pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
415 pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
416 padd2 loc52 = loc17, loc33 // y4 = x1 + t1
417 pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
418 pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
419 ;;
420 padd2 loc50 = loc18, loc34 // y2 = x2 + t2
421 pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
422 pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
423 padd2 loc55 = loc21, loc37 // y7 = x5 + t5
424 padd2 loc49 = loc20, loc36 // y1 = x4 + t4
425 padd2 loc54 = loc19, loc35 // y6 = x3 + t3
426 ;;
427 padd2 loc51 = loc22, loc38 // y3 = x6 + t6
428 padd2 loc53 = loc23, loc39 // y5 = x7 + t7
429
430 // *******************
431 // row-DCT 2nd half
432 // *******************
433
434 psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2
435 psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2
436 padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2
437 padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2
438 ;;
439 padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2
440 psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
441 padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
442 padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2
443 ;;
444 psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2
445 pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
446 pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
447 ;;
448 psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2
449 padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 (add-back of the multiplicand)
450 padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 (add-back of the multiplicand)
451
452 padd2 loc16 = loc32, loc35 // x0 = t0 + t3
453 padd2 loc17 = loc33, loc34 // x1 = t1 + t2
454 psub2 loc18 = loc32, loc35 // x2 = t0 - t3
455 ;;
456 psub2 loc19 = loc33, loc34 // x3 = t1 - t2
457 padd2 loc20 = loc36, loc37 // x4 = t4 + t5
458 padd2 loc21 = loc38, loc39 // x5 = t6 + t7
459 psub2 loc22 = loc36, loc37 // x6 = t4 - t5
460 psub2 loc23 = loc38, loc39 // x7 = t6 - t7
461 ;;
462 pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
463 padd2 loc32 = loc16, loc17 // t0 = x0 + x1
464 pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
465 pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
466 psub2 loc33 = loc16, loc17 // t1 = x0 - x1
467 pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
468 pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
469 pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
470 ;;
471 padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
472 padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
473 padd2 loc34 = loc18, loc43 // t2 = x2 + buf3 (x3 * c1)
474 ;;
475 psub2 loc35 = loc42, loc19 // t3 = buf2 (x2 * c1) - x3
476 padd2 loc36 = loc20, loc45 // t4 = x4 + buf5 (x5 * c0)
477 psub2 loc37 = loc44, loc21 // t5 = buf4 (x4 * c0) - x5
478 padd2 loc38 = loc22, loc47 // t6 = x6 + buf7
479 psub2 loc39 = loc46, loc23 // t7 = buf6 - x7
480 pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
481 pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
482 ;;
483 padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0
484 pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
485 pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
486 padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1
487 pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
488 pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
489 ;;
490 padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2
491 pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
492 pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
493 padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3
494 nop.i 0x0 // explicit no-ops (presumably bundle padding)
495 nop.i 0x0
496 ;;
497
498 // *******************
499 // Transpose matrix (note: .l/.r forms and operand order are swapped relative to the first transpose; the remaining y*.2 sums and the +4 rounding are interleaved)
500 // *******************
501 padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4
502 mix2.l loc32 = loc49, loc48 // tmp0 = mixl y1, y0
503 mix2.r loc33 = loc49, loc48 // tmp1 = mixr y1, y0
504 padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5
505 mix2.l loc34 = loc51, loc50 // tmp2 = mixl y3, y2
506 mix2.r loc35 = loc51, loc50 // tmp3 = mixr y3, y2
507 ;;
508 padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6
509 mix4.l loc16 = loc34, loc32 // x0 = mixl tmp2, tmp0
510 mix4.l loc17 = loc35, loc33 // x1 = mixl tmp3, tmp1
511 padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7
512 mix4.r loc18 = loc34, loc32 // x2 = mixr tmp2, tmp0
513 mix4.r loc19 = loc35, loc33 // x3 = mixr tmp3, tmp1
514 ;;
515 padd2 loc16 = loc16, r23 // + 4: rounding for the final >> 3 (same for the other "padd2 ..., r23" lines)
516 mix2.l loc32 = loc41, loc40 // tmp0 = mixl y1.2, y0.2
517 mix2.r loc33 = loc41, loc40 // tmp1 = mixr y1.2, y0.2
518 padd2 loc17 = loc17, r23
519 mix2.l loc34 = loc43, loc42 // tmp2 = mixl y3.2, y2.2
520 mix2.r loc35 = loc43, loc42 // tmp3 = mixr y3.2, y2.2
521 ;;
522 padd2 loc18 = loc18, r23
523 mix4.l loc20 = loc34, loc32 // x4 = mixl tmp2, tmp0
524 mix4.l loc21 = loc35, loc33 // x5 = mixl tmp3, tmp1
525 padd2 loc19 = loc19, r23
526 mix4.r loc22 = loc34, loc32 // x6 = mixr tmp2, tmp0
527 mix4.r loc23 = loc35, loc33 // x7 = mixr tmp3, tmp1
528 ;;
529 padd2 loc20 = loc20, r23
530 mix2.l loc32 = loc53, loc52 // tmp0 = mixl y5, y4
531 mix2.r loc33 = loc53, loc52 // tmp1 = mixr y5, y4
532 padd2 loc21 = loc21, r23
533 mix2.l loc34 = loc55, loc54 // tmp2 = mixl y7, y6
534 mix2.r loc35 = loc55, loc54 // tmp3 = mixr y7, y6
535 ;;
536 padd2 loc22 = loc22, r23
537 mix4.l loc24 = loc34, loc32 // x0.2 = mixl tmp2, tmp0
538 mix4.l loc25 = loc35, loc33 // x1.2 = mixl tmp3, tmp1
539 padd2 loc23 = loc23, r23
540 mix4.r loc26 = loc34, loc32 // x2.2 = mixr tmp2, tmp0
541 mix4.r loc27 = loc35, loc33 // x3.2 = mixr tmp3, tmp1
542 ;;
543 padd2 loc24 = loc24, r23
544 mix2.l loc32 = loc45, loc44 // tmp0 = mixl y5.2, y4.2
545 mix2.r loc33 = loc45, loc44 // tmp1 = mixr y5.2, y4.2
546 padd2 loc25 = loc25, r23
547 mix2.l loc34 = loc47, loc46 // tmp2 = mixl y7.2, y6.2
548 mix2.r loc35 = loc47, loc46 // tmp3 = mixr y7.2, y6.2
549 ;;
550 padd2 loc26 = loc26, r23
551 mix4.l loc28 = loc34, loc32 // x4.2 = mixl tmp2, tmp0
552 mix4.l loc29 = loc35, loc33 // x5.2 = mixl tmp3, tmp1
553 padd2 loc27 = loc27, r23
554 mix4.r loc30 = loc34, loc32 // x6.2 = mixr tmp2, tmp0
555 mix4.r loc31 = loc35, loc33 // x7.2 = mixr tmp3, tmp1
556 ;;
557 // *******************
558 // Descale: finish the +4 rounding (r23) for loc28-loc31, then shift every result right by 3
559 // *******************
560 padd2 loc28 = loc28, r23
561 pshr2 loc16 = loc16, 3
562 pshr2 loc17 = loc17, 3
563 padd2 loc29 = loc29, r23
564 pshr2 loc18 = loc18, 3
565 pshr2 loc19 = loc19, 3
566 padd2 loc30 = loc30, r23
567 pshr2 loc20 = loc20, 3
568 pshr2 loc21 = loc21, 3
569 padd2 loc31 = loc31, r23
570 pshr2 loc22 = loc22, 3
571 pshr2 loc23 = loc23, 3
572 ;;
573 pshr2 loc24 = loc24, 3
574 pshr2 loc25 = loc25, 3
575 pshr2 loc26 = loc26, 3
576 pshr2 loc27 = loc27, 3
577 pshr2 loc28 = loc28, 3
578 pshr2 loc29 = loc29, 3
579 pshr2 loc30 = loc30, 3
580 pshr2 loc31 = loc31, 3
581 ;;
582 // *******************
583 // Store matrix (back to the addresses it was loaded from: loc0-loc15 = r32 + 0, 8, ..., 120)
584 // *******************
585 st8 [loc0] = loc16 // offset 0
586 st8 [loc1] = loc24 // offset 8
587 st8 [loc2] = loc17 // offset 16
588 st8 [loc3] = loc25 // offset 24
589 st8 [loc4] = loc18 // offset 32
590 st8 [loc5] = loc26 // offset 40
591 st8 [loc6] = loc19 // offset 48
592 st8 [loc7] = loc27 // offset 56
593 st8 [loc8] = loc20 // offset 64
594 st8 [loc9] = loc28 // offset 72
595 st8 [loc10] = loc21 // offset 80
596 st8 [loc11] = loc29 // offset 88
597 st8 [loc12] = loc22 // offset 96
598 st8 [loc13] = loc30 // offset 104
599 st8 [loc14] = loc23 // offset 112
600 st8 [loc15] = loc31 // offset 120
601
602 mov ar.pfs = r14 // write back the ar.pfs value captured by the alloc at entry
603 br.ret.sptk.many b0
604 .endp fdct_ia64#
605 .common fdct#,8,8 // NOTE(review): declares a common symbol "fdct" (presumably size 8, alignment 8) that nothing in this file references -- confirm it is still needed
606
607
608
609
610
611
612
613
614 //***********************************************
615 //* Here is a version of the DCT implementation *
616 //* unoptimized in terms of command ordering. *
617 //* This version is about 30% slower but *
618 //* easier to understand. *
619 //***********************************************
620 //
621 // .pred.safe_across_calls p1-p5,p16-p63
622 //.text
623 // .align 16
624 // .global fdct_ia64#
625 // .proc fdct_ia64#
626 //fdct_ia64:
627 // .prologue
628 // alloc r14 = ar.pfs, 1, 56, 0, 0
629 //
630 // // *******************
631 // // Save constants
632 // // *******************
633 // mov r31 = 0x32ec // c0 = tan(1pi/16)
634 // mov r30 = 0x6a0a // c1 = tan(2pi/16)
635 // mov r29 = 0xab0e // c2 = tan(3pi/16)
636 // mov r28 = 0xb505 // g4 = cos(4pi/16)
637 // mov r27 = 0xd4db // g3 = cos(3pi/16)
638 // mov r26 = 0xec83 // g2 = cos(2pi/16)
639 // mov r25 = 0xfb15 // g1 = cos(1pi/16)
640 // mov r24 = 0x0002 // correction bit for descaling
641 // mov r23 = 0x0004 // correction bit for descaling
642 //
643 // // **************************
644 // // Load Matrix into registers
645 // // **************************
646 //
647 // add loc0 = r0, r32
648 // ;;
649 // mux2 r31 = r31, 0x00
650 // mux2 r30 = r30, 0x00
651 // mux2 r29 = r29, 0x00
652 // mux2 r28 = r28, 0x00
653 // mux2 r27 = r27, 0x00
654 // mux2 r26 = r26, 0x00
655 // mux2 r25 = r25, 0x00
656 // mux2 r24 = r24, 0x00
657 // mux2 r23 = r23, 0x00
658 // ld8 loc16 = [loc0]
659 // add loc2 = 16, r32
660 // add loc4 = 32, r32
661 // add loc6 = 48, r32
662 // add loc8 = 64, r32
663 // add loc10 = 80, r32
664 // ;;
665 // ld8 loc17 = [loc2]
666 // ld8 loc18 = [loc4]
667 // add loc12 = 96, r32
668 // ld8 loc19 = [loc6]
669 // ld8 loc20 = [loc8]
670 // add loc14 = 112, r32
671 // ;;
672 // ld8 loc21 = [loc10]
673 // ld8 loc22 = [loc12]
674 // add loc1 = 8, r32
675 // ld8 loc23 = [loc14]
676 // add loc3 = 24, r32
677 // add loc5 = 40, r32
678 // ;;
679 // ld8 loc24 = [loc1]
680 // ld8 loc25 = [loc3]
681 // add loc7 = 56, r32
682 // ld8 loc26 = [loc5]
683 // add loc9 = 72, r32
684 // add loc11 = 88, r32
685 // ;;
686 // ld8 loc27 = [loc7]
687 // ld8 loc28 = [loc9]
688 // add loc13 = 104, r32
689 // ld8 loc29 = [loc11]
690 // add loc15 = 120, r32
691 // ;;
692 // ld8 loc30 = [loc13]
693 // ld8 loc31 = [loc15]
694 // ;;
695 // // ******
696 // // Scale
697 // // ******
698 // pshl2 loc16 = loc16, 3
699 // pshl2 loc17 = loc17, 3
700 // pshl2 loc18 = loc18, 3
701 // pshl2 loc19 = loc19, 3
702 // pshl2 loc20 = loc20, 3
703 // pshl2 loc21 = loc21, 3
704 // pshl2 loc22 = loc22, 3
705 // pshl2 loc23 = loc23, 3
706 // pshl2 loc24 = loc24, 3
707 // pshl2 loc25 = loc25, 3
708 // pshl2 loc26 = loc26, 3
709 // pshl2 loc27 = loc27, 3
710 // pshl2 loc28 = loc28, 3
711 // pshl2 loc29 = loc29, 3
712 // pshl2 loc30 = loc30, 3
713 // pshl2 loc31 = loc31, 3
714 // ;;
715 //
716 // // *******************
717 // // column-DCT 1st half
718 // // *******************
719 //
720 // padd2 loc32 = loc16, loc23 // t0 = x0 + x7
721 // padd2 loc33 = loc17, loc22 // t1 = x1 + x6
722 // padd2 loc34 = loc18, loc21 // t2 = x2 + x5
723 // padd2 loc35 = loc19, loc20 // t3 = x3 + x4
724 // psub2 loc36 = loc16, loc23 // t4 = x0 - x7
725 // psub2 loc37 = loc17, loc22 // t5 = x1 - x6
726 // psub2 loc38 = loc18, loc21 // t6 = x2 - x5
727 // psub2 loc39 = loc19, loc20 // t7 = x3 - x4
728 // ;;
729 // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
730 // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
731 // ;;
732 // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
733 // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
734 // ;;
735 // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
736 // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
737 // ;;
738 // padd2 loc16 = loc32, loc35 // x0 = t0 + t3
739 // padd2 loc17 = loc33, loc34 // x1 = t1 + t2
740 // psub2 loc18 = loc32, loc35 // x2 = t0 - t3
741 // psub2 loc19 = loc33, loc34 // x3 = t1 - t2
742 // padd2 loc20 = loc36, loc37 // x4 = t4 + t5
743 // padd2 loc21 = loc38, loc39 // x5 = t6 + t7
744 // psub2 loc22 = loc36, loc37 // x6 = t4 - t5
745 // psub2 loc23 = loc38, loc39 // x7 = t6 - t7
746 // ;;
747 //
748 // padd2 loc32 = loc16, loc17 // t0 = x0 + x1
749 // psub2 loc33 = loc16, loc17 // t1 = x0 - x1
750 // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
751 // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
752 // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
753 // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
754 // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
755 // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
756 // ;;
757 // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
758 // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
759 // ;;
760 // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
761 // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
762 // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
763 // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
764 // padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2)
765 // psub2 loc39 = loc46, loc23 // t7 = (c2 * x6) - x7
766 // ;;
767 // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
768 // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
769 // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
770 // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
771 // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
772 // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
773 // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
774 // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
775 // ;;
776 // padd2 loc48 = loc16, loc32 // y0 = x0 + t0
777 // padd2 loc49 = loc20, loc36 // y1 = x4 + t4
778 // padd2 loc50 = loc18, loc34 // y2 = x2 + t2
779 // padd2 loc51 = loc22, loc38 // y3 = x6 + t6
780 // padd2 loc52 = loc17, loc33 // y4 = x1 + t1
781 // padd2 loc53 = loc23, loc39 // y5 = x7 + t7
782 // padd2 loc54 = loc19, loc35 // y6 = x3 + t3
783 // padd2 loc55 = loc21, loc37 // y7 = x5 + t5
784 // ;;
785 //
786 // // *******************
787 // // column-DCT 2nd half
788 // // *******************
789 //
790 // padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2
791 // padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2
792 // padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2
793 // padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2
794 // psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2
795 // psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2
796 // psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2
797 // psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2
798 // ;;
799 // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
800 // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
801 // ;;
802 // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
803 // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
804 // ;;
805 // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
806 // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
807 // ;;
808 // padd2 loc16 = loc32, loc35 // x0 = t0 + t3
809 // padd2 loc17 = loc33, loc34 // x1 = t1 + t2
810 // psub2 loc18 = loc32, loc35 // x2 = t0 - t3
811 // psub2 loc19 = loc33, loc34 // x3 = t1 - t2
812 // padd2 loc20 = loc36, loc37 // x4 = t4 + t5
813 // padd2 loc21 = loc38, loc39 // x5 = t6 + t7
814 // psub2 loc22 = loc36, loc37 // x6 = t4 - t5
815 // psub2 loc23 = loc38, loc39 // x7 = t6 - t7
816 // ;;
817 // padd2 loc32 = loc16, loc17 // t0 = x0 + x1
818 // psub2 loc33 = loc16, loc17 // t1 = x0 - x1
819 // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
820 // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
821 // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
822 // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
823 // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
824 // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
825 // ;;
826 // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
827 // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
828 // ;;
829 // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
830 // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
831 // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
832 // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
833 // padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2)
834 // psub2 loc39 = loc46, loc23 // t7 = (c2 * x6) - x7
835 // ;;
836 // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
837 // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
838 // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
839 // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
840 // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
841 // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
842 // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
843 // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
844 // ;;
845 // padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0
846 // padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4
847 // padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2
848 // padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6
849 // padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1
850 // padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7
851 // padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3
852 // padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5
853 // ;;
854 // padd2 loc40 = loc40, r24 // add r24 to correct rounding
855 // padd2 loc41 = loc41, r24
856 // padd2 loc42 = loc42, r24
857 // padd2 loc43 = loc43, r24
858 // padd2 loc44 = loc44, r24
859 // padd2 loc45 = loc45, r24
860 // padd2 loc46 = loc46, r24
861 // padd2 loc47 = loc47, r24
862 // padd2 loc48 = loc48, r24
863 // padd2 loc49 = loc49, r24
864 // padd2 loc50 = loc50, r24
865 // padd2 loc51 = loc51, r24
866 // padd2 loc52 = loc52, r24
867 // padd2 loc53 = loc53, r24
868 // padd2 loc54 = loc54, r24
869 // padd2 loc55 = loc55, r24
870 // ;;
871 // pshr2 loc40 = loc40, 2 // Divide all matrix elements by 4
872 // pshr2 loc41 = loc41, 2
873 // pshr2 loc42 = loc42, 2
874 // pshr2 loc43 = loc43, 2
875 // pshr2 loc44 = loc44, 2
876 // pshr2 loc45 = loc45, 2
877 // pshr2 loc46 = loc46, 2
878 // pshr2 loc47 = loc47, 2
879 // pshr2 loc48 = loc48, 2
880 // pshr2 loc49 = loc49, 2
881 // pshr2 loc50 = loc50, 2
882 // pshr2 loc51 = loc51, 2
883 // pshr2 loc52 = loc52, 2
884 // pshr2 loc53 = loc53, 2
885 // pshr2 loc54 = loc54, 2
886 // pshr2 loc55 = loc55, 2
887 // ;;
888 //
889 // // *****************
890 // // Transpose matrix
891 // // *****************
892 //
893 // mix2.r loc32 = loc48, loc49 // tmp0 = mixr y0, y1
894 // mix2.l loc33 = loc48, loc49 // tmp1 = mixl y0, y1
895 // mix2.r loc34 = loc50, loc51 // tmp2 = mixr y2, y3
896 // mix2.l loc35 = loc50, loc51 // tmp3 = mixl y2, y3
897 // ;;
898 // mix4.r loc16 = loc32, loc34 // x0 = mixr tmp0, tmp2
899 // mix4.r loc17 = loc33, loc35 // x1 = mixr tmp1, tmp3
900 // mix4.l loc18 = loc32, loc34 // x2 = mixl tmp0, tmp2
901 // mix4.l loc19 = loc33, loc35 // x3 = mixl tmp1, tmp3
902 // ;;
903 // mix2.r loc32 = loc40, loc41 // tmp0 = mixr y0.2, y1.2
904 // mix2.l loc33 = loc40, loc41 // tmp1 = mixl y0.2, y1.2
905 // mix2.r loc34 = loc42, loc43 // tmp2 = mixr y2.2, y3.2
906 // mix2.l loc35 = loc42, loc43 // tmp3 = mixl y2.2, y3.2
907 // ;;
908 // mix4.r loc20 = loc32, loc34 // x4 = mixr tmp0, tmp2
909 // mix4.r loc21 = loc33, loc35 // x5 = mixr tmp1, tmp3
910 // mix4.l loc22 = loc32, loc34 // x6 = mixl tmp0, tmp2
911 // mix4.l loc23 = loc33, loc35 // x7 = mixl tmp1, tmp3
912 // ;;
913 // mix2.r loc32 = loc52, loc53 // tmp0 = mixr y4, y5
914 // mix2.l loc33 = loc52, loc53 // tmp1 = mixl y4, y5
915 // mix2.r loc34 = loc54, loc55 // tmp2 = mixr y6, y7
916 // mix2.l loc35 = loc54, loc55 // tmp3 = mixl y6, y7
917 // ;;
918 // mix4.r loc24 = loc32, loc34 // x0.2 = mixr tmp0, tmp2
919 // mix4.r loc25 = loc33, loc35 // x1.2 = mixr tmp1, tmp3
920 // mix4.l loc26 = loc32, loc34 // x2.2 = mixl tmp0, tmp2
921 // mix4.l loc27 = loc33, loc35 // x3.2 = mixl tmp1, tmp3
922 // ;;
923 // mix2.r loc32 = loc44, loc45 // tmp0 = mixr y4.2, y5.2
924 // mix2.l loc33 = loc44, loc45 // tmp1 = mixl y4.2, y5.2
925 // mix2.r loc34 = loc46, loc47 // tmp2 = mixr y6.2, y7.2
926 // mix2.l loc35 = loc46, loc47 // tmp3 = mixl y6.2, y7.2
927 // ;;
928 // mix4.r loc28 = loc32, loc34 // x4.2 = mixr tmp0, tmp2
929 // mix4.r loc29 = loc33, loc35 // x5.2 = mixr tmp1, tmp3
930 // mix4.l loc30 = loc32, loc34 // x6.2 = mixl tmp0, tmp2
931 // mix4.l loc31 = loc33, loc35 // x7.2 = mixl tmp1, tmp3
932 // ;;
933 //
934 // // *******************
935 // // row-DCT 1st half
936 // // *******************
937 //
938 // padd2 loc32 = loc16, loc23 // t0 = x0 + x7
939 // padd2 loc33 = loc17, loc22 // t1 = x1 + x6
940 // padd2 loc34 = loc18, loc21 // t2 = x2 + x5
941 // padd2 loc35 = loc19, loc20 // t3 = x3 + x4
942 // psub2 loc36 = loc16, loc23 // t4 = x0 - x7
943 // psub2 loc37 = loc17, loc22 // t5 = x1 - x6
944 // psub2 loc38 = loc18, loc21 // t6 = x2 - x5
945 // psub2 loc39 = loc19, loc20 // t7 = x3 - x4
946 // ;;
947 // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
948 // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
949 // ;;
950 // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
951 // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
952 // ;;
953 // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
954 // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
955 // ;;
956 // padd2 loc16 = loc32, loc35 // x0 = t0 + t3
957 // padd2 loc17 = loc33, loc34 // x1 = t1 + t2
958 // psub2 loc18 = loc32, loc35 // x2 = t0 - t3
959 // psub2 loc19 = loc33, loc34 // x3 = t1 - t2
960 // padd2 loc20 = loc36, loc37 // x4 = t4 + t5
961 // padd2 loc21 = loc38, loc39 // x5 = t6 + t7
962 // psub2 loc22 = loc36, loc37 // x6 = t4 - t5
963 // psub2 loc23 = loc38, loc39 // x7 = t6 - t7
964 // ;;
965 // padd2 loc32 = loc16, loc17 // t0 = x0 + x1
966 // psub2 loc33 = loc16, loc17 // t1 = x0 - x1
967 // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
968 // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
969 // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
970 // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
971 // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
972 // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
973 // ;;
974 // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
975 // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
976 // ;;
977 // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
978 // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
979 // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
980 // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
981 // padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2 + x7)
982 // psub2 loc39 = loc46, loc23 // t7 = (c2 * x6 + x6) - x7
983 // ;;
984 // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
985 // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
986 // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
987 // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
988 // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
989 // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
990 // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
991 // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
992 // ;;
993 // padd2 loc48 = loc16, loc32 // y0 = x0 + t0
994 // padd2 loc49 = loc20, loc36 // y1 = x4 + t4
995 // padd2 loc50 = loc18, loc34 // y2 = x2 + t2
996 // padd2 loc51 = loc22, loc38 // y3 = x6 + t6
997 // padd2 loc52 = loc17, loc33 // y4 = x1 + t1
998 // padd2 loc53 = loc23, loc39 // y5 = x7 + t7
999 // padd2 loc54 = loc19, loc35 // y6 = x3 + t3
1000 // padd2 loc55 = loc21, loc37 // y7 = x5 + t5
1001 // ;;
1002 //
1003 // // *******************
1004 // // row-DCT 2nd half
1005 // // *******************
1006 //
1007 // padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2
1008 // padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2
1009 // padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2
1010 // padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2
1011 // psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2
1012 // psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2
1013 // psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2
1014 // psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2
1015 // ;;
1016 // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
1017 // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
1018 // ;;
1019 // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
1020 // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
1021 // ;;
1022 // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
1023 // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
1024 // ;;
1025 // padd2 loc16 = loc32, loc35 // x0 = t0 + t3
1026 // padd2 loc17 = loc33, loc34 // x1 = t1 + t2
1027 // psub2 loc18 = loc32, loc35 // x2 = t0 - t3
1028 // psub2 loc19 = loc33, loc34 // x3 = t1 - t2
1029 // padd2 loc20 = loc36, loc37 // x4 = t4 + t5
1030 // padd2 loc21 = loc38, loc39 // x5 = t6 + t7
1031 // psub2 loc22 = loc36, loc37 // x6 = t4 - t5
1032 // psub2 loc23 = loc38, loc39 // x7 = t6 - t7
1033 // ;;
1034 // padd2 loc32 = loc16, loc17 // t0 = x0 + x1
1035 // psub2 loc33 = loc16, loc17 // t1 = x0 - x1
1036 // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
1037 // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
1038 // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
1039 // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
1040 // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
1041 // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
1042 // ;;
1043 // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
1044 // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
1045 // ;;
1046 // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
1047 // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
1048 // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
1049 // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
1050 // padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2 + x7)
1051 // psub2 loc39 = loc46, loc23 // t7 = (c2 * x6 + x6) - x7
1052 // ;;
1053 // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
1054 // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
1055 // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
1056 // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
1057 // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
1058 // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
1059 // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
1060 // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
1061 // ;;
1062 // padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0
1063 // padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4
1064 // padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2
1065 // padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6
1066 // padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1
1067 // padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7
1068 // padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3
1069 // padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5
1070 // ;;
1071 // // *******************
1072 // // Transpose matrix
1073 // // *******************
1074 //
1075 // mix2.l loc32 = loc49, loc48 // tmp0 = mixl y1, y0
1076 // mix2.r loc33 = loc49, loc48 // tmp1 = mixr y1, y0
1077 // mix2.l loc34 = loc51, loc50 // tmp2 = mixl y3, y2
1078 // mix2.r loc35 = loc51, loc50 // tmp3 = mixr y3, y2
1079 // ;;
1080 // mix4.l loc16 = loc34, loc32 // x0 = mixl tmp2, tmp0
1081 // mix4.l loc17 = loc35, loc33 // x1 = mixl tmp3, tmp1
1082 // mix4.r loc18 = loc34, loc32 // x2 = mixr tmp2, tmp0
1083 // mix4.r loc19 = loc35, loc33 // x3 = mixr tmp3, tmp1
1084 // ;;
1085 // mix2.l loc32 = loc41, loc40 // tmp0 = mixl y1.2, y0.2
1086 // mix2.r loc33 = loc41, loc40 // tmp1 = mixr y1.2, y0.2
1087 // mix2.l loc34 = loc43, loc42 // tmp2 = mixl y3.2, y2.2
1088 // mix2.r loc35 = loc43, loc42 // tmp3 = mixr y3.2, y2.2
1089 // ;;
1090 // mix4.l loc20 = loc34, loc32 // x4 = mixl tmp2, tmp0
1091 // mix4.l loc21 = loc35, loc33 // x5 = mixl tmp3, tmp1
1092 // mix4.r loc22 = loc34, loc32 // x6 = mixr tmp2, tmp0
1093 // mix4.r loc23 = loc35, loc33 // x7 = mixr tmp3, tmp1
1094 // ;;
1095 // mix2.l loc32 = loc53, loc52 // tmp0 = mixl y5, y4
1096 // mix2.r loc33 = loc53, loc52 // tmp1 = mixr y5, y4
1097 // mix2.l loc34 = loc55, loc54 // tmp2 = mixl y7, y6
1098 // mix2.r loc35 = loc55, loc54 // tmp3 = mixr y7, y6
1099 // ;;
1100 // mix4.l loc24 = loc34, loc32 // x0.2 = mixl tmp2, tmp0
1101 // mix4.l loc25 = loc35, loc33 // x1.2 = mixl tmp3, tmp1
1102 // mix4.r loc26 = loc34, loc32 // x2.2 = mixr tmp2, tmp0
1103 // mix4.r loc27 = loc35, loc33 // x3.2 = mixr tmp3, tmp1
1104 // ;;
1105 // mix2.l loc32 = loc45, loc44 // tmp0 = mixl y5.2, y4.2
1106 // mix2.r loc33 = loc45, loc44 // tmp1 = mixr y5.2, y4.2
1107 // mix2.l loc34 = loc47, loc46 // tmp2 = mixl y7.2, y6.2
1108 // mix2.r loc35 = loc47, loc46 // tmp3 = mixr y7.2, y6.2
1109 // ;;
1110 // mix4.l loc28 = loc34, loc32 // x4.2 = mixl tmp2, tmp0
1111 // mix4.l loc29 = loc35, loc33 // x5.2 = mixl tmp3, tmp1
1112 // mix4.r loc30 = loc34, loc32 // x6.2 = mixr tmp2, tmp0
1113 // mix4.r loc31 = loc35, loc33 // x7.2 = mixr tmp3, tmp1
1114 // ;;
1115 //
1116 // // ********
1117 // // descale
1118 // // ********
1119 //
1120 // padd2 loc16 = loc16, r23
1121 // padd2 loc17 = loc17, r23
1122 // padd2 loc18 = loc18, r23
1123 // padd2 loc19 = loc19, r23
1124 // padd2 loc20 = loc20, r23
1125 // padd2 loc21 = loc21, r23
1126 // padd2 loc22 = loc22, r23
1127 // padd2 loc23 = loc23, r23
1128 // padd2 loc24 = loc24, r23
1129 // padd2 loc25 = loc25, r23
1130 // padd2 loc26 = loc26, r23
1131 // padd2 loc27 = loc27, r23
1132 // padd2 loc28 = loc28, r23
1133 // padd2 loc29 = loc29, r23
1134 // padd2 loc30 = loc30, r23
1135 // padd2 loc31 = loc31, r23
1136 // ;;
1137 // pshr2 loc16 = loc16, 3
1138 // pshr2 loc17 = loc17, 3
1139 // pshr2 loc18 = loc18, 3
1140 // pshr2 loc19 = loc19, 3
1141 // pshr2 loc20 = loc20, 3
1142 // pshr2 loc21 = loc21, 3
1143 // pshr2 loc22 = loc22, 3
1144 // pshr2 loc23 = loc23, 3
1145 // pshr2 loc24 = loc24, 3
1146 // pshr2 loc25 = loc25, 3
1147 // pshr2 loc26 = loc26, 3
1148 // pshr2 loc27 = loc27, 3
1149 // pshr2 loc28 = loc28, 3
1150 // pshr2 loc29 = loc29, 3
1151 // pshr2 loc30 = loc30, 3
1152 // pshr2 loc31 = loc31, 3
1153 // ;;
1154 // // ************
1155 // // Store Matrix
1156 // // ************
1157 // st8 [loc0] = loc16
1158 // st8 [loc1] = loc24
1159 // st8 [loc2] = loc17
1160 // st8 [loc3] = loc25
1161 // st8 [loc4] = loc18
1162 // st8 [loc5] = loc26
1163 // st8 [loc6] = loc19
1164 // st8 [loc7] = loc27
1165 // st8 [loc8] = loc20
1166 // st8 [loc9] = loc28
1167 // st8 [loc10] = loc21
1168 // st8 [loc11] = loc29
1169 // st8 [loc12] = loc22
1170 // st8 [loc13] = loc30
1171 // st8 [loc14] = loc23
1172 // st8 [loc15] = loc31
1173 //
1174 // mov ar.pfs = r14
1175 // br.ret.sptk.many b0
1176 // .endp fdct_ia64#
1177 // .common fdct#,8,8
1178 //